Take the BostonHousing data from the mlbench package and randomly introduce missing values (NA).

# load the data
data("BostonHousing", package="mlbench")
original <- BostonHousing  # keep a copy of the original data

# introduce missing values
set.seed(100)
BostonHousing[sample(1:nrow(BostonHousing), 40), "rad"] <- NA
BostonHousing[sample(1:nrow(BostonHousing), 40), "ptratio"] <- NA
head(BostonHousing)
#> crim zn indus chas nox rm age dis rad tax ptratio b lstat medv
#> 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 24.0
#> 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 21.6
#> 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 34.7
#> 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 33.4
#> 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33 36.2
#> 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21 28.7

Check where the values are missing with mice::md.pattern.

library(mice)
md.pattern(BostonHousing)  # pattern of missing values in the data
#> crim zn indus chas nox rm age dis tax b lstat medv rad ptratio
#> 431 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0
#> 35 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1
#> 35 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
#> 5 1 1 1 1 1 1 1 1 1 1 1 1 0 0 2
#> 0 0 0 0 0 0 0 0 0 0 0 0 40 40 80

Rows with missing values can simply be dropped when fitting a model (for example, with na.action=na.omit).
Make sure that after deleting the data you still have enough observations for the model and have not introduced bias:

# example
lm(medv ~ ptratio + rad, data=BostonHousing, na.action=na.omit)

library(Hmisc)
impute(BostonHousing$ptratio, mean)  # replace with the mean
impute(BostonHousing$ptratio, median)  # replace with the median
impute(BostonHousing$ptratio, 20)  # replace with a specific number
# or replace manually (not run here: the later steps need the NAs to still be in place)
BostonHousing$ptratio[is.na(BostonHousing$ptratio)] <- mean(BostonHousing$ptratio, na.rm = T)

library(DMwR)
actuals <- original$ptratio[is.na(BostonHousing$ptratio)]
predicteds <- rep(mean(BostonHousing$ptratio, na.rm=T), length(actuals))
regr.eval(actuals, predicteds)
#> mae mse rmse mape
#> 1.62324034 4.19306071 2.04769644 0.09545664

library(DMwR)
knnOutput <- knnImputation(BostonHousing[, !names(BostonHousing) %in% "medv"])  # perform kNN imputation
anyNA(knnOutput)
#> FALSE

actuals <- original$ptratio[is.na(BostonHousing$ptratio)]
predicteds <- knnOutput[is.na(BostonHousing$ptratio), "ptratio"]
regr.eval(actuals, predicteds)
#> mae mse rmse mape
#> 1.00188715 1.97910183 1.40680554 0.05859526

A limitation of DMwR::knnImputation is that it sometimes cannot be used when values of a factor variable are missing. Both rpart and mice are suitable for such a case. The advantage of rpart is that a single variable that does not contain NA is sufficient. We will now use rpart to replace the missing values instead of kNN. In order to process a factor variable, you need to set method=class when calling rpart(). For numeric values, we will use method=anova. In this case, you also need to make sure that the output variable (medv) is not used in rpart training.

library(rpart)
class_mod <- rpart(rad ~ . - medv, data=BostonHousing[!is.na(BostonHousing$rad), ], method="class", na.action=na.omit)  # method="class" because rad is treated as a factor
anova_mod <- rpart(ptratio ~ . - medv, data=BostonHousing[!is.na(BostonHousing$ptratio), ], method="anova", na.action=na.omit)  # method="anova" because ptratio is numeric
rad_pred <- predict(class_mod, BostonHousing[is.na(BostonHousing$rad), ])
ptratio_pred <- predict(anova_mod, BostonHousing[is.na(BostonHousing$ptratio), ])

actuals <- original$ptratio[is.na(BostonHousing$ptratio)]
predicteds <- ptratio_pred
regr.eval(actuals, predicteds)
#> mae mse rmse mape
#> 0.71061673 0.99693845 0.99846805 0.04099908

actuals <- original$rad[is.na(BostonHousing$rad)]
predicteds <- as.numeric(colnames(rad_pred)[apply(rad_pred, 1, which.max)])
mean(actuals != predicteds)  # misclassification error
#> 0.25

mice, short for Multivariate Imputation by Chained Equations, is an R package that provides advanced functions for working with missing values. It uses a slightly unusual two-step approach: mice() builds the model and complete() generates the completed data. The mice(df) function creates several full copies of df, each with its own estimate of the missing data. The complete() function returns one or several of these data sets; by default it returns the first. Let's see how to replace rad and ptratio:

library(mice)
miceMod <- mice(BostonHousing[, !names(BostonHousing) %in% "medv"], method="rf")  # perform mice imputation based on random forests
miceOutput <- complete(miceMod)  # generate the completed data
anyNA(miceOutput)
#> FALSE

actuals <- original$ptratio[is.na(BostonHousing$ptratio)]
predicteds <- miceOutput[is.na(BostonHousing$ptratio), "ptratio"]
regr.eval(actuals, predicteds)
#> mae mse rmse mape
#> 0.36500000 0.78100000 0.88374204 0.02121326

actuals <- original$rad[is.na(BostonHousing$rad)]
predicteds <- miceOutput[is.na(BostonHousing$rad), "rad"]
mean(actuals != predicteds)  # misclassification error
#> 0.15

Source: https://habr.com/ru/post/283168/
All Articles