# Creating MySQL engine
# NOTE(review): the credentials are hard-coded in the connection URL; consider
# loading them from configuration instead.
engine = create_engine('mysql://login:pass@localhost:3306/db')

# Member profile attributes joined with account data and (optionally) payment
# totals. The LEFT JOIN keeps members that have no row in `pays`, so `p.sum`
# can be NULL; the WHERE clause keeps only rows with a date of birth and
# either public photos or a payment record.
sql = """
SELECT md.pnum, p.sum, am.gender, am.photos_public, md.profile_weight,
       md.profile_height, md.eye_color, md.hair_color, md.dob,
       md.profile_smoke, md.profile_ethnicity, md.profile_bodytype,
       md.profile_initially_seeking
FROM `member_details` AS md
JOIN `aminno_member` AS am ON md.pnum = am.pnum
LEFT JOIN pays AS p ON md.pnum = p.id
WHERE md.dob is not null
  AND (am.photos_public > 0 OR p.sum is not NULL)
"""

# Reading data from mysql DB to pandas dataframe.
# NULLs (e.g. `sum` for members without a `pays` row) are replaced with 0 and
# the frame is indexed by `pnum`.
df = pd.read_sql_query(sql, engine).fillna(0).set_index('pnum')
# Split the date of birth into separate month and year columns; both are used
# as model features further down.
df['month_of_birth'] = df['dob'].apply(lambda x: x.month)
df['year_of_birth'] = df['dob'].apply(lambda x: x.year)
# Payment total above which a row counts as belonging to the "paid" group.
THRESHOLD = 0.0001

# df0: rows whose `sum` exceeds the threshold; df1: all remaining rows.
# `<=` (rather than `<`) makes df1 the exact complement of df0, so a row whose
# `sum` equals THRESHOLD is not silently dropped from both groups.
df0 = df[df['sum'] > THRESHOLD]
df1 = df[df['sum'] <= THRESHOLD]
# Columns whose distributions are compared between the two groups.
cols = [
    'profile_weight', 'profile_height', 'year_of_birth', 'month_of_birth',
    'eye_color', 'hair_color', 'profile_smoke', 'profile_ethnicity',
    'profile_bodytype', 'profile_initially_seeking', 'gender',
]

# For every column draw one figure with two overlaid, normalised histograms:
# df0 in red and df1 in the default colour (slightly more transparent).
# NOTE(review): `normed` is the older Matplotlib spelling of `density`; switch
# if the installed Matplotlib no longer accepts it.
for col in cols:
    plt.figure(figsize=(10, 10))
    df0[col].hist(bins=50, alpha=0.9, color='red', normed=1)
    df1[col].hist(bins=50, alpha=0.7, normed=1)
    plt.title(col)
    plt.show()
# Binary target: 1 where the row's `sum` is above THRESHOLD, 0 otherwise.
is_above_threshold = df['sum'] > THRESHOLD
y = is_above_threshold.astype(np.int32)
# Columns treated as categorical codes and expanded into one-hot indicators.
categorical = [
    'month_of_birth', 'eye_color', 'hair_color', 'profile_smoke',
    'profile_ethnicity', 'profile_bodytype', 'profile_initially_seeking',
]
ohe = preprocessing.OneHotEncoder(dtype=np.float32)

# The encoder output is converted to a dense array. `.toarray()` returns a
# plain ndarray, whereas `.todense()` would return the discouraged `np.matrix`
# type and propagate it into `X` through `np.hstack`.
Xcategories = ohe.fit_transform(df[categorical]).toarray()
# Columns passed to the model as-is (no one-hot expansion).
numeric = ['gender', 'profile_weight', 'profile_height', 'year_of_birth']

# `.values` yields the same NumPy array as the deprecated `as_matrix()`.
Xnumeric = df[numeric].values

# Final feature matrix: one-hot columns first, numeric columns last.
X = np.hstack((Xcategories, Xnumeric))
# Hold out 10% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X,
    y,
    random_state=7,
    test_size=0.1,
)
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import decomposition, pipeline, metrics, grid_search

# Pipeline: standardise the features, then fit a random forest.
rf = RandomForestClassifier(random_state=7, n_jobs=4)
scl = StandardScaler()
clf = pipeline.Pipeline([('scl', scl), ('rf', rf)])

# Hyper-parameters to search; the `rf__` prefix routes each one to the 'rf'
# step of the pipeline.
param_grid = {
    'rf__n_estimators': (100, 200),
    'rf__max_depth': (10, 20),
}

# Exhaustive search over the grid with 3-fold cross-validation, scored by
# ROC AUC.
model = grid_search.GridSearchCV(estimator=clf, param_grid=param_grid,
                                 scoring='roc_auc', verbose=10, cv=3)
model.fit(X_train, y_train)

# Report the best cross-validated score and the winning parameter values.
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
Best score: 0.802
Best parameters set:
	rf__max_depth: 20
	rf__n_estimators: 200
# Pipeline refit with the best parameters found by the grid search.
best = model.best_estimator_

# steps[1] is the second pipeline step; [1] picks the estimator out of its
# (name, estimator) pair. Print that estimator's per-feature importances.
# A single-argument print(...) behaves the same on Python 2 and Python 3.
print(best.steps[1][1].feature_importances_)
[ 0.01083346 0.00745737 0.00754652 0.00764087 0.0075468 0.00769951 0.00780227 0.0076059 0.00747405 0.00733789 0.00720822 0.00720196 0.01067164 0.00229657 0.00271315 0.00403617 0.00453246 0.00420906 0.01227852 0.00166965 0.00060406 0.00293115 0.00347255 0.00581456 0.00176878 0.00060611 0.00129565 0.06303697 0.00526695 0.00408359 0.04618295 0.03014204 0.00401634 0.00312768 0.0041792 0.00073294 0.00260749 0.00137382 0.00385419 0.03020433 0.00788376 0.01423438 0.00953692 0.01218361 0.00685376 0.00812187 0.00433835 0.00294894 0.01210143 0.00806778 0.00458055 0.01323813 0.01434638 0.0120177 0.03383968 0.1623351 0.11347244 0.2088358 ]
from sklearn.metrics import roc_curve, roc_auc_score

# Second column of predict_proba: the predicted probability used as the
# ranking score for the ROC analysis.
y_pred = best.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, y_pred))

# ROC curve on the held-out test set, with the diagonal drawn as a dashed
# reference line.
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
plt.figure(figsize=(10, 10))
plt.plot(fpr, tpr, label='ROC curve')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
import joblib
import pandas as pd

# Load a previously saved object from disk and display it.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code
# while loading; only use it on files from a trusted source.
# This rebinds `df`, replacing any frame of the same name defined earlier.
df = joblib.load("1.pkl")
print(df)
Source: https://habr.com/ru/post/266639/
All Articles