The Imputer class provides basic strategies for imputing missing values, using either the mean, the median, or the most frequent value of the column or row that contains the missing data. Even though I understood that the result would not be useful, I still decided to try this class, and this is what happened:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

try:
    # scikit-learn >= 0.20 ships SimpleImputer; the old preprocessing.Imputer
    # was deprecated in 0.20 and removed in 0.22.
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    # Fallback for older scikit-learn releases that only have Imputer.
    from sklearn.preprocessing import Imputer

# Load the UCI wine data set; the file has no header row.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
df = pd.read_csv(url, header=None)

# Column labels follow the UCI attribute list (class label first, then the
# 13 measurements). The labels in the pasted snippet had lost their non-ASCII
# text and collapsed into duplicate empty strings.
df.columns = [
    'Class',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

# Mean imputation: every missing entry is replaced by its column mean.
# np.nan is used as the missing-value marker because it is accepted by both
# the old Imputer and SimpleImputer (the 'NaN' string is not).
imp = Imputer(missing_values=np.nan, strategy='mean')
imp.fit(df)

# Only the class (3) is known; every other feature is missing and gets imputed.
imp.transform([[3] + [np.nan] * (df.shape[1] - 1)])
array([[3.00000000e+00, 1.30006180e+01, 2.33634831e+00, 2.36651685e+00, 1.94949438e+01, 9.97415730e+01, 2.29511236e+00, 2.02926966e+00, 3.61853933e-01, 1.59089888e+00, 5.05808988e+00, 9.57449438e-01, 2.61168539e+00, 7.46893258e+02]])
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression  # used below but was never imported
from sklearn.multioutput import MultiOutputRegressor

# Synthetic regression problem: one input feature, ten target columns.
# NOTE: make_regression is called without random_state, so the generated data
# (and therefore the printed scores) can differ from run to run.
X, y = make_regression(n_features=1, n_targets=10)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

# MultiOutputRegressor wraps the base estimator to handle all ten targets.
multioutput = MultiOutputRegressor(LinearRegression()).fit(X_train, y_train)

# First line: score on the held-out test split; second line: score on the training split.
print(" : {:.2f}".format(multioutput.score(X_test, y_test)))
print(" : {:.2f}".format(multioutput.score(X_train, y_train)))
: 0.82 : 0.83
class MultiOutputRegressor__:
    """Illustrative multi-output regressor: one clone of a base estimator per target.

    ``fit`` trains an independent clone of ``est`` on each column of ``y``;
    ``predict`` stacks the per-target predictions side by side.
    """

    def __init__(self, est):
        """Store the base estimator that will be cloned for every target column."""
        self.est = est

    def fit(self, X, y):
        """Fit one clone of the base estimator per target column of ``y``.

        ``y`` may be any array-like; a 1-D ``y`` is treated as a single target
        column. Returns the list of fitted per-target estimators (not ``self``),
        which is the behaviour existing callers of this class see.
        """
        # Local imports: the surrounding file never binds `np` or `sklearn`.
        import numpy as np
        from sklearn.base import clone

        targets = np.asarray(y)
        if targets.ndim == 1:
            # Single target: promote to one column so the loop below still applies.
            targets = targets[:, np.newaxis]

        n_targets = targets.shape[1]
        self.estimators_ = [
            clone(self.est).fit(X, targets[:, i]) for i in range(n_targets)
        ]
        return self.estimators_

    def predict(self, X):
        """Return predictions as one column per fitted estimator, in fit order."""
        import numpy as np

        # Each estimator's prediction is turned into a column before stacking.
        columns = [est.predict(X)[:, np.newaxis] for est in self.estimators_]
        return np.hstack(columns)
from sklearn.ensemble import RandomForestRegressor  # used below but was never imported

# NOTE(review): the column labels in this snippet are empty strings. They look
# like names whose non-ASCII text was lost when the article was extracted, and
# the intended columns cannot be determined from this file. Fill in the real
# labels before running.
df = df.drop([''], axis=1)

# Two selected columns are the inputs; all remaining columns are the targets.
X, y = df[['', '']], df.drop(['', ''], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

forest = RandomForestRegressor(n_estimators=30, random_state=13)
forest.fit(X_train, y_train)

# First line: score on the held-out test split; second line: score on the training split.
print(" : {:.2f}".format(forest.score(X_test, y_test)))
print(" :{:.2f}".format(forest.score(X_train, y_train)))
: 0.65 :0.87
Source: https://habr.com/ru/post/358954/
All Articles