As an example, we take the BostonHousing data set from the mlbench
package and randomly insert
missing values (NA).
# Load the BostonHousing data set shipped with the mlbench package.
data("BostonHousing", package = "mlbench")

# Keep an untouched copy so imputed values can later be compared with the
# true ones.
original <- BostonHousing

# Blank out 40 randomly chosen rows in each of `rad` and `ptratio`.
# The seed makes the choice of rows reproducible.
set.seed(100)
BostonHousing[sample(seq_len(nrow(BostonHousing)), 40), "rad"] <- NA
BostonHousing[sample(seq_len(nrow(BostonHousing)), 40), "ptratio"] <- NA

# Show the first rows of the data set.
head(BostonHousing)
#> crim zn indus chas nox rm age dis rad tax ptratio b lstat medv #> 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 24.0 #> 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 21.6 #> 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 34.7 #> 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 33.4 #> 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33 36.2 #> 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21 28.7
The pattern of missing values can be inspected with mice::md.pattern
.
library(mice)

# Tabulate which combinations of columns are missing together.
md.pattern(BostonHousing)
#> crim zn indus chas nox rm age dis tax b lstat medv rad ptratio #> 431 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 #> 35 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 #> 35 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 #> 5 1 1 1 1 1 1 1 1 1 1 1 1 0 0 2 #> 0 0 0 0 0 0 0 0 0 0 0 0 40 40 80
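One option is simply to drop the rows that contain missing values when fitting a model (for example by setting na.action=na.omit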
). Make sure that after deleting the data you have:
# Fit a linear model of medv on ptratio and rad; na.omit drops incomplete rows.
lm(
  medv ~ ptratio + rad,
  data = BostonHousing,
  na.action = na.omit
)
library(Hmisc)

# Each call returns a copy of ptratio with its NAs filled in; BostonHousing
# itself is not modified.
impute(BostonHousing$ptratio, mean)    # fill NAs with the mean
impute(BostonHousing$ptratio, median)  # fill NAs with the median
impute(BostonHousing$ptratio, 20)      # fill NAs with a fixed value

# Manual equivalent of mean imputation without Hmisc. Deliberately left
# commented out: it overwrites the NAs in place, and the later evaluation
# snippets locate the held-out rows with is.na(BostonHousing$ptratio).
# BostonHousing$ptratio[is.na(BostonHousing$ptratio)] <-
#   mean(BostonHousing$ptratio, na.rm = TRUE)
library(DMwR)

# True ptratio values for the rows that were blanked out.
actuals <- original$ptratio[is.na(BostonHousing$ptratio)]

# Mean imputation predicts the same value (mean of the observed ptratio)
# for every missing entry.
predicteds <- rep(mean(BostonHousing$ptratio, na.rm = TRUE), length(actuals))

# Report the error metrics of the imputed values against the true ones.
regr.eval(actuals, predicteds)
#> mae mse rmse mape #> 1.62324034 4.19306071 2.04769644 0.09545664
library(DMwR)

# Impute using every column except the response variable medv.
predictor_cols <- !names(BostonHousing) %in% "medv"
knnOutput <- knnImputation(BostonHousing[, predictor_cols])

# Check whether any NA remains after imputation.
anyNA(knnOutput)
#> FALSE
# Rows whose ptratio was blanked out.
ptratio_missing <- is.na(BostonHousing$ptratio)

# Compare the kNN-imputed values with the true values for those rows.
actuals <- original$ptratio[ptratio_missing]
predicteds <- knnOutput[ptratio_missing, "ptratio"]
regr.eval(actuals, predicteds)
#> mae mse rmse mape #> 1.00188715 1.97910183 1.40680554 0.05859526
A limitation of DMwR::knnImputation
is that it sometimes cannot be used when the missing values belong to a factor variable. Both rpart
and mice
are suitable for such a case. The advantage of rpart
is that it is enough for at least one predictor variable not to contain NA
. The idea is to use rpart
to replace the missing values instead of kNN
. To handle a factor variable, set method="class"
when calling rpart()
. For numeric variables, we will use method="anova"
. In this case, you also need to make sure that the output variable ( medv
) is not used in rpart
training.
library(rpart)

# Split the rows by whether each target column is observed or missing.
rad_is_na <- is.na(BostonHousing$rad)
ptratio_is_na <- is.na(BostonHousing$ptratio)

# rad is treated as categorical: fit a classification tree on the rows where
# rad is known, excluding medv from the predictors.
class_mod <- rpart(
  rad ~ . - medv,
  data = BostonHousing[!rad_is_na, ],
  method = "class",
  na.action = na.omit
)

# ptratio is numeric: fit a regression tree on the rows where ptratio is
# known, again excluding medv.
anova_mod <- rpart(
  ptratio ~ . - medv,
  data = BostonHousing[!ptratio_is_na, ],
  method = "anova",
  na.action = na.omit
)

# Predict the missing entries from the fitted trees.
rad_pred <- predict(class_mod, BostonHousing[rad_is_na, ])
ptratio_pred <- predict(anova_mod, BostonHousing[ptratio_is_na, ])
# Accuracy of the regression-tree imputation of ptratio: compare the true
# values of the blanked-out rows with the tree's predictions.
ptratio_missing <- is.na(BostonHousing$ptratio)
actuals <- original$ptratio[ptratio_missing]
predicteds <- ptratio_pred
regr.eval(actuals, predicteds)
#> mae mse rmse mape #> 0.71061673 0.99693845 0.99846805 0.04099908
# True rad values for the rows that were blanked out.
actuals <- original$rad[is.na(BostonHousing$rad)]

# For each row of rad_pred, take the column with the largest value and use
# that column's name (converted to numeric) as the predicted rad.
best_col <- apply(rad_pred, 1, which.max)
predicteds <- as.numeric(colnames(rad_pred)[best_col])

# Misclassification rate: share of rows where the prediction differs.
mean(actuals != predicteds)
#> 0.25
mice
is short for Multivariate Imputation by Chained Equations, an R package that provides advanced functions for handling missing values. It uses a slightly unusual two-step approach: mice()
for building the model and complete()
for generating data. The mice(df)
function creates several complete copies of df, each with its own imputation of the missing data. The complete()
function
returns one or several data sets, the default set will be the first. Let's see how to replace rad and ptratio:
library(mice)

# Build the imputation model on every column except the response medv,
# using the "rf" method.
predictor_cols <- !names(BostonHousing) %in% "medv"
miceMod <- mice(BostonHousing[, predictor_cols], method = "rf")

# Generate the completed data set from the fitted model.
miceOutput <- complete(miceMod)

# Check whether any NA remains after imputation.
anyNA(miceOutput)
#> FALSE
# Accuracy of the mice imputation of ptratio on the blanked-out rows.
ptratio_missing <- is.na(BostonHousing$ptratio)
actuals <- original$ptratio[ptratio_missing]
predicteds <- miceOutput[ptratio_missing, "ptratio"]
regr.eval(actuals, predicteds)
#> mae mse rmse mape #> 0.36500000 0.78100000 0.88374204 0.02121326
# Misclassification rate of the mice imputation of rad on the blanked-out rows.
rad_missing <- is.na(BostonHousing$rad)
actuals <- original$rad[rad_missing]
predicteds <- miceOutput[rad_missing, "rad"]
mean(actuals != predicteds)
#> 0.15
Source: https://habr.com/ru/post/283168/
All Articles