# Read the data set; read.csv() defaults apply (header row, comma separator).
dat <- read.csv("dataset.txt")

# Fill missing mileage values with the median of the observed ones.
dat$mileage <- replace(
  dat$mileage,
  is.na(dat$mileage),
  median(dat$mileage, na.rm = TRUE)
)

# Correlation matrix over every column of `dat`. Coefficients that come back
# as NA are set to 0 so the matrix contains no missing cells.
dat_cor <- cor(dat)
dat_cor[is.na(dat_cor)] <- 0
# corrplot provides the correlation-matrix plot drawn below.
library(corrplot)

# Colour ramp running from dark red through yellow and cyan to dark blue.
# NOTE(review): this object is named `palette`, which shadows
# grDevices::palette() in the workspace; the name is kept for compatibility.
palette <- colorRampPalette(c(
  "#7F0000", "red", "#FF7F00", "yellow", "#7FFF7F",
  "cyan", "#007FFF", "blue", "#00007F"
))

# Draw the matrix as coloured cells using 20 colours taken from the ramp,
# variables ordered with the "AOE" method and coefficients printed in green.
corrplot(
  dat_cor,
  method = "color",
  col = palette(20),
  cl.length = 21,
  order = "AOE",
  addCoef.col = "green"
)
# Model specification: price regressed on every predictor used in this script.
# Defined once so the initial fit and the refit cannot drift apart.
price.formula <- price ~ year + mileage + diesel + hybrid + mt +
  front.drive + rear.drive + engine.power + sedan + hatchback + wagon +
  coupe + cabriolet + minivan + pickup

# Initial fit on the complete data set.
model <- lm(price.formula, data = dat)

# Influence diagnostics for the initial fit. DFFITS is signed, so the cut-off
# is applied to its absolute value; a one-sided `< 0.42` test would keep
# observations with a large negative influence. which() drops any NA/NaN
# diagnostics instead of turning them into NA row names.
# The 0.42 and 0.13 cut-offs are the ones hard-coded by the original author.
model.dffits <- dffits(model)
model.dffits.we <- model.dffits[which(abs(model.dffits) < 0.42)]
model.covratio <- covratio(model)
model.covratio.we <- model.covratio[which(abs(model.covratio - 1) < 0.13)]

# Keep only the rows that pass both filters. The diagnostic vectors are named
# by the row names of `dat`, so the names can index the data frame directly.
dat.we <- dat[intersect(names(model.dffits.we), names(model.covratio.we)), ]

# Refit on the filtered rows.
# NOTE(review): the original script refitted this identical model a second
# time after the diagnostic plots; that duplicate call was removed. The console
# output pasted after coef() in the original lists only seven coefficients
# (intercept, year, mileage, diesel, rear.drive, engine.power, sedan), which
# suggests a reduced formula may have been intended there - confirm.
model.we <- lm(price.formula, data = dat.we)

# Residuals against the continuous predictors.
plot(dat.we$year, resid(model.we))
plot(dat.we$mileage, resid(model.we))
plot(dat.we$engine.power, resid(model.we))

# Normal Q-Q plot of the residuals with its reference line.
qqnorm(resid(model.we))
qqline(resid(model.we))

# Show the fitted coefficients.
coef(model.we)
# Pasted console output (kept as a comment so the script parses):
# (Intercept) year mileage diesel rear.drive engine.power sedan
# -1.76e+08 8.79e+04 -1.4e+00 2.5e+04 4.14e+04 2.11e+03 -2.866407e+04
# Model-based price for every row of the full data set (including the rows
# that were excluded when fitting model.we).
predicted.price <- predict(model.we, newdata = dat)

# Observed price from the data set.
real.price <- dat$price

# Difference between the model's price and the observed price; positive
# values mean the model's price is above the observed one.
profit <- predicted.price - real.price

# Scatter of the difference against the observed price, with a horizontal
# reference line at zero.
plot(real.price, profit)
abline(a = 0, b = 0)

# Ratio of predicted to observed price, largest first; show the first ten.
sorted <- sort(
  predicted.price / real.price,
  decreasing = TRUE
)
sorted[1:10]
# Pasted console output (row names on the first line, ratios on the second):
# 69 42 122 248 168 15 244 271 109 219
# 1.590489 1.507614 1.386353 1.279716 1.279380 1.248001 1.227829 1.209341 1.209232 1.204062
# Fix the random seed so the train/test split is reproducible.
set.seed(1)

# Random hold-out split: a row goes to the training set when its uniform draw
# exceeds 0.2, otherwise to the test set.
split <- runif(nrow(dat)) > 0.2
train <- dat[split, ]
test <- dat[!split, ]

# Model specification shared by the initial training fit and the refit.
price.formula <- price ~ year + mileage + diesel + hybrid + mt +
  front.drive + rear.drive + engine.power + sedan + hatchback + wagon +
  coupe + cabriolet + minivan + pickup

# Initial fit on the training rows only.
train.model <- lm(price.formula, data = train)

# Influence diagnostics of the TRAINING fit. The original filtered on
# `model.covratio` (the full-data model), which left train.covratio unused and
# let test rows influence which training rows were kept. DFFITS is signed, so
# its cut-off is applied to the absolute value. which() drops NA/NaN values.
train.dffits <- dffits(train.model)
train.dffits.we <- train.dffits[which(abs(train.dffits) < 0.42)]
train.covratio <- covratio(train.model)
train.covratio.we <- train.covratio[which(abs(train.covratio - 1) < 0.13)]

# Training rows that pass both filters, selected by row name from `train`.
train.we <- train[
  intersect(names(train.dffits.we), names(train.covratio.we)),
]

# Refit on the filtered training rows and predict the held-out rows.
train.model.we <- lm(price.formula, data = train.we)
predictions <- predict(train.model.we, newdata = test)

# Root-mean-square error of the predictions on the test set.
print(sqrt(mean((predictions - test$price)^2)))
# Pasted console output: [1] 121231.5
# Source: https://habr.com/ru/post/302788/