from sklearn.externals import joblib from sklearn import cross_validation from sklearn import svm from sklearn import neighbors from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LogisticRegression import numpy as np def avg(x): s = 0.0 for t in x: s += t return (s/len(x))*100.0 dataset = joblib.load('kredit.pkl') # , target = [x[0] for x in dataset] target = np.array(target) train = [x[1:] for x in dataset] numcv = 10 # glm = LogisticRegression(penalty='l1', tol=1) scores = cross_validation.cross_val_score(glm, train, target, cv = numcv) print("Logistic Regression with L1 metric - " + ' avg = ' + ('%2.1f'%avg(scores))) linSVM = svm.SVC(kernel='linear', C=1) scores = cross_validation.cross_val_score(linSVM, train, target, cv = numcv) print("SVM with linear kernel - " + ' avg = ' + ('%2.1f'%avg(scores))) poly2SVM = svm.SVC(kernel='poly', degree=2, C=1) scores = cross_validation.cross_val_score(poly2SVM, train, target, cv = numcv) print("SVM with polynomial kernel degree 2 - " + ' avg = ' + ('%2.1f' % avg(scores))) rbfSVM = svm.SVC(kernel='rbf', C=1) scores = cross_validation.cross_val_score(rbfSVM, train, target, cv = numcv) print("SVM with rbf kernel - " + ' avg = ' + ('%2.1f'%avg(scores))) knn = neighbors.KNeighborsClassifier(n_neighbors = 1, weights='uniform') scores = cross_validation.cross_val_score(knn, train, target, cv = numcv) print("kNN 1 neighbour - " + ' avg = ' + ('%2.1f'%avg(scores))) knn = neighbors.KNeighborsClassifier(n_neighbors = 5, weights='uniform') scores = cross_validation.cross_val_score(knn, train, target, cv = numcv) print("kNN 5 neighbours - " + ' avg = ' + ('%2.1f'%avg(scores))) knn = neighbors.KNeighborsClassifier(n_neighbors = 11, weights='uniform') scores = cross_validation.cross_val_score(knn, train, target, cv = numcv) print("kNN 11 neighbours - " + ' avg = ' + ('%2.1f'%avg(scores))) gbm = GradientBoostingClassifier(learning_rate = 0.001, n_estimators = 5000) scores = cross_validation.cross_val_score(gbm, train, target, cv = numcv) print("Gradient Boosting 5000 trees, shrinkage 0.001 - " + ' avg = ' + ('%2.1f'%avg(scores))) gbm = GradientBoostingClassifier(learning_rate = 0.001, n_estimators = 10000) scores = cross_validation.cross_val_score(gbm, train, target, cv = numcv) print("Gradient Boosting 10000 trees, shrinkage 0.001 - " + ' avg = ' + ('%2.1f'%avg(scores))) gbm = GradientBoostingClassifier(learning_rate = 0.001, n_estimators = 15000) scores = cross_validation.cross_val_score(gbm, train, target, cv = numcv) print("Gradient Boosting 15000 trees, shrinkage 0.001 - " + ' avg = ' + ('%2.1f'%avg(scores))) # - forest = RandomForestClassifier(n_estimators = 10, n_jobs = 1) scores = cross_validation.cross_val_score(forest, train, target, cv=numcv) print("Random Forest 10 - " +' avg = ' + ('%2.1f'%avg(scores))) forest = RandomForestClassifier(n_estimators = 50, n_jobs = 1) scores = cross_validation.cross_val_score(forest, train, target, cv=numcv) print("Random Forest 50 - " +' avg = ' + ('%2.1f'%avg(scores))) forest = RandomForestClassifier(n_estimators = 100, n_jobs = 1) scores = cross_validation.cross_val_score(forest, train, target, cv=numcv) print("Random Forest 100 - " +' avg = '+ ('%2.1f'%avg(scores))) forest = RandomForestClassifier(n_estimators = 200, n_jobs = 1) scores = cross_validation.cross_val_score(forest, train, target, cv=numcv) print("Random Forest 200 - " +' avg = ' + ('%2.1f'%avg(scores))) forest = RandomForestClassifier(n_estimators = 300, n_jobs = 1) scores = cross_validation.cross_val_score(forest, train, target, cv=numcv) print("Random Forest 300 - " +' avg = '+ ('%2.1f'%avg(scores))) forest = RandomForestClassifier(n_estimators = 400, n_jobs = 1) scores = cross_validation.cross_val_score(forest, train, target, cv=numcv) print("Random Forest 400 - " +' avg = '+ ('%2.1f'%avg(scores))) forest = RandomForestClassifier(n_estimators = 500, n_jobs = 1) scores = cross_validation.cross_val_score(forest, train, target, cv=numcv) print("Random Forest 500 - " +' avg = '+ ('%2.1f'%avg(scores)))
Method with parameters | Average accuracy on 4 variables | Average accuracy on 20 variables |
---|---|---|
Logistic Regression, L1 metric | 75.5 | 75.2 |
SVM with linear kernel | 73.9 | 74.4 |
SVM with polynomial kernel | 72.6 | 74.9 |
SVM with rbf kernel | 74.3 | 74.7 |
kNN 1 neighbor | 68.8 | 61.4 |
kNN 5 neighbors | 72.1 | 65.1 |
kNN 11 neighbors | 72.3 | 68.7 |
Gradient Boosting 5000 trees shrinkage 0.001 | 75.0 | 77.6 |
Gradient Boosting 10000 trees shrinkage 0.001 | 73.8 | 77.2 |
Gradient Boosting 15000 trees shrinkage 0.001 | 73.7 | 76.5 |
Random Forest 10 | 72.0 | 71.2 |
Random Forest 50 | 72.1 | 75.5 |
Random Forest 100 | 71.6 | 75.9 |
Random Forest 200 | 71.8 | 76.1 |
Radom Forest 300 | 72.4 | 75.9 |
Random Forest 400 | 71.9 | 76.7 |
Random Forest 500 | 72.6 | 76.2 |
Source: https://habr.com/ru/post/173049/
All Articles