m×k and k×n on a test computing system. Participants were told how long this problem took to solve on other systems, along with the matrix sizes and the system parameters.

# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.cross_validation import KFold
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import random

random.seed(1)

def mean_absolute_percentage_error(y_true, y_pred):
    ind = y_true > -1
    return np.mean(np.abs((y_true[ind] - y_pred[ind]) / y_true[ind]))

def loss_func(y_true, y_pred):
    return mean_absolute_percentage_error(y_true, y_pred)

all_train = pd.read_csv('~/Projects/DataMining/Bimbo/data/train1.csv')
all_target = pd.read_csv('~/Projects/DataMining/Bimbo/data/y_train.csv')
all_train['TARGET'] = all_target['time']

cols_to_drop = ['ID', 'TARGET']
cols = list(set(all_train.columns) - set(cols_to_drop))
print(len(cols))

def hyperopt_train_test(hpparams):
    all_results = []
    kf = KFold(len(all_train['TARGET'].values), n_folds=5, random_state=1, shuffle=True)
    for train_index, test_index in kf:
        train = all_train.ix[train_index, :]
        test = all_train.ix[test_index, :]
        X_train = train[cols].values
        y_train_c = train['n'].values * train['m'].values * train['k'].values
        y_train = train['TARGET'].values
        X_test = test[cols].values
        y_test_c = test['n'].values * test['m'].values * test['k'].values
        y_test = test['TARGET'].values
        # hyperopt's quniform returns floats, so the integer parameters are cast explicitly
        params_est = {'n_estimators': int(hpparams['n_estimators']),
                      'learning_rate': hpparams['eta'],
                      'max_depth': hpparams['max_depth'],
                      'min_samples_split': int(hpparams['min_samples_split']),
                      'min_samples_leaf': int(hpparams['min_samples_leaf']),
                      'loss': hpparams['loss'],
                      'alpha': hpparams['alpha'],
                      'subsample': hpparams['subsample'],
                      'random_state': 1}
        bst = GradientBoostingRegressor(**params_est)
        # the model predicts log(time / (m*n*k)) rather than the raw time
        bst.fit(X_train, np.log(y_train / y_train_c))
        y_test_pred = np.exp(bst.predict(X_test)) * y_test_c
        current_res = loss_func(y_test, y_test_pred)
        all_results.append(current_res)
    return np.mean(all_results)

space4dt = {
    'min_samples_split': hp.quniform('min_samples_split', 3, 14, 1),
    'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 7, 1),
    'subsample': hp.quniform('subsample', 0.6, 0.99, 0.001),
    'eta': hp.quniform('eta', 0.07, 0.2, 0.001),
    'n_estimators': hp.quniform('n_estimators', 10, 1000, 10),
    'max_depth': hp.choice('max_depth', (4, 5, 6, 7, 8, 9, 10)),
    'alpha': hp.quniform('alpha', 0.01, 0.99, 0.01),
    'loss': hp.choice('loss', ('ls', 'lad', 'huber', 'quantile')),
}

def f(params):
    acc = hyperopt_train_test(params)
    print(acc)
    print(params)
    return {'loss': acc, 'status': STATUS_OK}

trials = Trials()
best = fmin(f, space4dt, algo=tpe.suggest, max_evals=2000, trials=trials)
print('best:')
print(best)
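fmin returns the best point of the search space, with hp.choice parameters encoded as indices rather than values, and the post stops at printing best. The following is only a sketch (not the author's code) of how those parameters could be decoded with hyperopt's space_eval and used to fit a final model on the full training set:

# Sketch, not from the original post: decode the tuned parameters and fit a final model.
from hyperopt import space_eval

best_params = space_eval(space4dt, best)  # maps hp.choice indices back to actual values
params_est = {'n_estimators': int(best_params['n_estimators']),
              'learning_rate': best_params['eta'],
              'max_depth': best_params['max_depth'],
              'min_samples_split': int(best_params['min_samples_split']),
              'min_samples_leaf': int(best_params['min_samples_leaf']),
              'loss': best_params['loss'],
              'alpha': best_params['alpha'],
              'subsample': best_params['subsample'],
              'random_state': 1}

X_all = all_train[cols].values
y_all = all_train['TARGET'].values
y_all_c = all_train['n'].values * all_train['m'].values * all_train['k'].values

final_bst = GradientBoostingRegressor(**params_est)
# same target transform as in the cross-validation loop: log(time / (m*n*k))
final_bst.fit(X_all, np.log(y_all / y_all_c))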
# -*- coding: utf-8 -*-
from sklearn.feature_selection import RFECV
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np

# params_est and the regression target come from the previous step
bst = GradientBoostingRegressor(**params_est)
selector = RFECV(bst, step=50, cv=5)
selector.fit(all_train[cols], target)
print(list(selector.ranking_))
print(np.asarray(cols)[selector.support_])
df.ix[:, 'cpuExtra1'] = 0
df.ix[df['cpuFull'] == 'Intel(R) Core(TM) i3-2310M CPU @ 2.10GHz', 'cpuExtra1'] = 1
df.ix[:, 'cpuExtra2'] = 0
df.ix[df['cpuFull'] == 'Intel(R) Atom(TM) CPU N550 @ 1.50GHz', 'cpuExtra2'] = 1

df.ix[:, 'm_div_n'] = df['m'] / df['n']
df.ix[:, 'magic'] = df['k'] * df['m'] * df['n'] / (df['cpuCount'] * df['cpuCount'])

cols = [
    'n', 'Sequential_read_128B_by128', 'k', 'Random_write_3MB_by128',
    'cpuCount', 'Sequential_write_32kB_by128', 'Random_read_9MB_by128', 'm',
    'SeqRead_20kB_by256', 'cpuCores', 'Sequential_read_48MB_by128',
    'Random_read_4MB_by32', 'Random_write_32MB_by128', 'Random_read_2MB_by32',
    'SeqCopy10MB_by128', 'BMI', 'm_div_n', 'magic', 'cpuExtra1', 'cpuExtra2',
    'Random_write_bypassing_cache_6kB_by128', 'Sequential_read_192kB_by32',
]
params_est = {'n_estimators': 370, 'subsample': 0.961, 'learning_rate': 0.076,
              'min_samples_split': 18, 'max_depth': 6, 'min_samples_leaf': 8,
              'random_state': 1, 'loss': 'lad'}
bst1 = GradientBoostingRegressor(**params_est)
bst1.fit(X_train, y_train / y_train_c1)

params_est = {'n_estimators': 680, 'subsample': 0.902, 'learning_rate': 0.076,
              'min_samples_split': 14, 'alpha': 0.29, 'max_depth': 9,
              'min_samples_leaf': 5, 'loss': 'quantile', 'random_state': 1}
bst2 = GradientBoostingRegressor(**params_est)
bst2.fit(X_train, y_train / y_train_c1)

params_est = {'n_estimators': 430, 'subsample': 0.978, 'learning_rate': 0.086,
              'min_samples_split': 19, 'max_depth': 6, 'min_samples_leaf': 10,
              'loss': 'lad', 'random_state': 1}
bst3 = GradientBoostingRegressor(**params_est)
bst3.fit(X_train, y_train / y_train_c1)
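The post does not show how the predictions of bst1, bst2, and bst3 are combined. Since all three are fit on the ratio time/(m*n*k), a simple averaging step (an assumption, not the author's exact blending; test and X_test stand for the held-out frame and feature matrix as in the earlier cross-validation code) would look like this:

# Assumed blending step, not from the original post: average the three models'
# predictions of time/(m*n*k) and scale back to a time estimate.
y_test_c1 = test['n'].values * test['m'].values * test['k'].values
pred_ratio = (bst1.predict(X_test) + bst2.predict(X_test) + bst3.predict(X_test)) / 3.0
y_test_pred = pred_ratio * y_test_c1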
all_train['w'] = 1
all_train.loc[all_train['os'] == 15, 'w'] = 4  # os == 15

params_est = {'n_estimators': 480, 'subsample': 0.881, 'learning_rate': 0.197,
              'min_samples_split': 3, 'max_depth': 7, 'min_samples_leaf': 2,
              'loss': 'lad', 'random_state': 1}
bst4 = GradientBoostingRegressor(**params_est)
bst4.fit(X_train, np.log(y_train / y_train_c), sample_weight=train['w'])
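Because bst4 is trained on log(time/(m*n*k)), its predictions have to be inverted the same way as in the cross-validation loop above. A short sketch of that step (not shown in the post; test and X_test again denote the held-out data):

# Sketch, not from the original post: undo the log transform and scale by m*n*k.
y_test_c = test['n'].values * test['m'].values * test['k'].values
y_test_pred4 = np.exp(bst4.predict(X_test)) * y_test_c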
all_train.ix[:, 'c1'] = all_train['TARGET'] / (all_train['m'] * all_train['n'] * all_train['k'])
all_train_median = all_train[['c1', 'os', 'cpuFull']].groupby(['os', 'cpuFull'], as_index=False).median()

def preprocess_data(df):
    df = pd.merge(df, all_train_median, on=['os', 'cpuFull'], how='left', suffixes=('', '_med'))
    df.ix[:, 'test_median'] = df['c1_med'] * df['m'] * df['n'] * df['k']
    return df
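Note that the '_med' suffix only takes effect when the incoming frame already has a 'c1' column of its own; for a frame without it (e.g. test data), the merged median column keeps the name 'c1'. A slightly more explicit variant (not the author's code) renames the column up front so the function works either way:

# Variant sketch, not from the original post: rename the group median explicitly
# so the merge produces 'c1_med' regardless of the incoming frame's columns.
all_train_median = all_train_median.rename(columns={'c1': 'c1_med'})

def preprocess_data(df):
    df = pd.merge(df, all_train_median, on=['os', 'cpuFull'], how='left')
    df['test_median'] = df['c1_med'] * df['m'] * df['n'] * df['k']
    return df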
Source: https://habr.com/ru/post/321016/