import pandas as pd
import pymysql

def con():
    conn = pymysql.connect(
        host='10.100.10.100',
        port=3306,
        user='*******',
        password='*****',
        db='mysql')
    return conn

df = pd.read_sql('''SELECT * FROM idf_ru.data_for_scoring''', con=con())
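Note that con() opens a fresh connection on every call and nothing closes it. A minimal sketch (not from the article) that releases the connection explicitly:

connection = con()
try:
    df = pd.read_sql('SELECT * FROM idf_ru.data_for_scoring', con=connection)
finally:
    connection.close()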
import numpy as np
from sklearn.model_selection import train_test_split

def filling(df):
    cat_vars = df.select_dtypes(include=[object]).columns
    num_vars = df.select_dtypes(include=[np.number]).columns
    df[cat_vars] = df[cat_vars].fillna('_MISSING_')
    df[num_vars] = df[num_vars].fillna(np.nan)  # numeric gaps deliberately stay NaN
    return df

def replace_not_frequent(df, cols, perc_min=5, value_to_replace='_ELSE_'):
    else_df = pd.DataFrame(columns=['var', 'list'])
    for i in cols:
        if i != 'date_requested' and i != 'credit_id':
            t = df[i].value_counts(normalize=True)
            q = list(t[t.values < perc_min / 100].index)
            if q:
                # DataFrame.append was removed in recent pandas; pd.concat does the same job
                else_df = pd.concat([else_df, pd.DataFrame([[i, q]], columns=['var', 'list'])])
            df.loc[df[i].value_counts(normalize=True)[df[i]].values < perc_min / 100, i] = value_to_replace
    else_df = else_df.set_index('var')
    return df, else_df

cat_vars = df.select_dtypes(include=[object]).columns
df = filling(df)
df, else_df = replace_not_frequent(df, cat_vars)
df.drop(['credit_id', 'target_value', 'bor_credit_id', 'bchg_credit_id',
         'last_credit_id', 'bcacr_credit_id', 'bor_bonuses_got'], axis=1, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.33,
                                                    stratify=y, random_state=42)
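A toy illustration (data invented here, not from the article) of what replace_not_frequent does: categories rarer than perc_min percent collapse into '_ELSE_'.

toy = pd.DataFrame({'loan_purpose': ['repair'] * 60 + ['medicine'] * 38 + ['helpFriend'] * 2})
toy, toy_else = replace_not_frequent(toy, ['loan_purpose'], perc_min=5)
print(toy['loan_purpose'].value_counts())
# repair      60
# medicine    38
# _ELSE_       2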
import matplotlib.pyplot as plt

def plot_bin(ev, for_excel=False):
    ind = np.arange(len(ev.index))
    width = 0.35
    fig, ax1 = plt.subplots(figsize=(10, 7))
    ax2 = ax1.twinx()
    p1 = ax1.bar(ind, ev['NONEVENT'], width, color=(24/254, 192/254, 196/254))
    p2 = ax1.bar(ind, ev['EVENT'], width, bottom=ev['NONEVENT'],
                 color=(246/254, 115/254, 109/254))
    ax1.set_ylabel('Event Distribution', fontsize=15)
    ax2.set_ylabel('WOE', fontsize=15)
    plt.title(list(ev.VAR_NAME)[0], fontsize=20)
    ax2.plot(ind, ev['WOE'], marker='o', color='blue')
    # Legend
    plt.legend((p2[0], p1[0]), ('bad', 'good'), loc='best', fontsize=10)
    # Set xticklabels
    q = list()
    for i in range(len(ev)):
        try:
            mn = str(round(ev.MIN_VALUE[i], 2))
            mx = str(round(ev.MAX_VALUE[i], 2))
        except TypeError:  # categorical bins have non-numeric min/max
            mn = str(ev.MIN_VALUE[i])
            mx = str(ev.MAX_VALUE[i])
        q.append(mn + '-' + mx)
    plt.xticks(ind, q, rotation='vertical')
    for tick in ax1.get_xticklabels():
        tick.set_rotation(60)
    plt.savefig('{}.png'.format(ev.VAR_NAME[0]), dpi=500, bbox_inches='tight')
    plt.show()

def plot_all_bins(iv_df):
    # relies on the global X_train built above
    for i in [x.replace('WOE_', '') for x in X_train.columns]:
        ev = iv_df[iv_df.VAR_NAME == i]
        ev.reset_index(inplace=True)
        plot_bin(ev)
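iv_df here is assumed to come from the earlier binning step, one row per bin with EVENT/NONEVENT counts and WOE. As a self-contained reminder of the formula, WOE = ln(share of non-events in the bin / share of events in the bin); a sketch on invented counts:

ev_counts = pd.DataFrame({'EVENT': [20, 50, 30], 'NONEVENT': [300, 400, 200]})
ev_counts['WOE'] = np.log((ev_counts['NONEVENT'] / ev_counts['NONEVENT'].sum())
                          / (ev_counts['EVENT'] / ev_counts['EVENT'].sum()))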
def adjust_binning(df, bins_dict):
    for key, value in bins_dict.items():
        if isinstance(value, dict):
            # Dict value: a direct mapping of categories to groups
            df[key] = df[key].map(value)
        else:
            # List value: cut points for a numeric variable; build the labels
            categories = list()
            for j in range(len(value)):
                if j == 0:
                    categories.append('<' + str(value[j]))
                    try:
                        categories.append('(' + str(value[j]) + '; ' + str(value[j+1]) + ']')
                    except IndexError:  # only one cut point
                        categories.append('(' + str(value[j]))
                elif j == len(value) - 1:
                    categories.append(str(value[j]) + '>')
                else:
                    categories.append('(' + str(value[j]) + '; ' + str(value[j+1]) + ']')
            values = [df[key].min()] + value + [df[key].max()]
            df[key + '_bins'] = pd.cut(df[key], values, include_lowest=True,
                                       labels=categories).astype(object).fillna('_MISSING_').astype(str)
            df[key] = df[key + '_bins']  # .map(df.groupby(key + '_bins')[key].agg('median'))
            df.drop([key + '_bins'], axis=1, inplace=True)
    return df

bins_dict = {
    'equi_delinquencyDays': [200, 400, 600],
    'loan_purpose': {'medicine': '1_group', 'repair': '1_group', 'helpFriend': '2_group'}
}
df = adjust_binning(df, bins_dict)
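A toy check (data invented here) of the label construction: cut points [200, 400, 600] produce the four buckets '<200', '(200; 400]', '(400; 600]', '600>'.

toy = pd.DataFrame({'equi_delinquencyDays': [0, 250, 450, 700]})
toy = adjust_binning(toy, {'equi_delinquencyDays': [200, 400, 600]})
print(toy['equi_delinquencyDays'].tolist())
# ['<200', '(200; 400]', '(400; 600]', '600>']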
import seaborn as sns

def delete_correlated_features(df, cut_off=0.75, exclude=[]):
    # Create correlation matrix
    corr_matrix = df.corr().abs()
    # Select upper triangle of correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    # Plotting all correlations
    f, ax = plt.subplots(figsize=(15, 10))
    plt.title('All correlations', fontsize=20)
    sns.heatmap(df.corr(), annot=True)
    # Plotting highly correlated pairs
    try:
        f, ax = plt.subplots(figsize=(15, 10))
        plt.title('Highly correlated', fontsize=20)
        sns.heatmap(corr_matrix[(corr_matrix > cut_off) & (corr_matrix != 1)]
                    .dropna(axis=0, how='all').dropna(axis=1, how='all'),
                    annot=True, linewidths=.5)
    except:
        print('No highly correlated features found')
    # Find feature columns with correlation greater than cut_off
    to_drop = [column for column in upper.columns if any(upper[column] > cut_off)]
    to_drop = [column for column in to_drop if column not in exclude]
    print('Dropped columns:', to_drop, '\n')
    df2 = df.drop(to_drop, axis=1)
    print('Features left after correlation check: {}'.format(len(df.columns) - len(to_drop)), '\n')
    print('Not dropped columns:', list(df2.columns), '\n')
    # Plotting final correlations
    f, ax = plt.subplots(figsize=(15, 10))
    plt.title('Final correlations', fontsize=20)
    sns.heatmap(df2.corr(), annot=True)
    plt.show()
    return df2
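A typical call (a sketch, not shown in the article): run it on the WOE-transformed train set, protecting any feature the business insists on via exclude ('WOE_age' below is a hypothetical name).

X_train = delete_correlated_features(X_train, cut_off=0.75, exclude=['WOE_age'])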
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

def RFE_feature_selection(clf_lr, X, y):
    rfecv = RFECV(estimator=clf_lr, step=1, cv=StratifiedKFold(5),
                  verbose=0, scoring='roc_auc')
    rfecv.fit(X, y)
    print("Optimal number of features : %d" % rfecv.n_features_)
    # Plot number of features vs. cross-validation scores
    f, ax = plt.subplots(figsize=(14, 9))
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross-validation score (ROC AUC)")
    # on scikit-learn >= 1.2 use rfecv.cv_results_['mean_test_score'] instead of grid_scores_
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
    mask = rfecv.get_support()
    X = X.loc[:, mask]  # .ix was removed from pandas; .loc with a boolean mask does the same
    return X
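A sketch of a typical call, assuming a plain logistic regression as the estimator; keeping the test set aligned with the selected columns is added here for illustration.

from sklearn.linear_model import LogisticRegression

estimator = LogisticRegression()
X_train = RFE_feature_selection(estimator, X_train, y_train)
X_test = X_test[X_train.columns]  # keep the test set aligned with the selected features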
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score, f1_score, log_loss, classification_report
from IPython.display import display
import scikitplot as skplt
import eli5

def plot_score(clf, X_test, y_test, feat_to_show=30, is_normalize=False, cut_off=0.5):
    probs = clf.predict_proba(X_test)[:, 1]
    preds = pd.Series(probs).apply(lambda x: 1 if x > cut_off else 0)
    print('ROC_AUC: ', round(roc_auc_score(y_test, probs), 3))
    print('Gini: ', round(2 * roc_auc_score(y_test, probs) - 1, 3))
    print('F1_score: ', round(f1_score(y_test, clf.predict(X_test)), 3))
    print('Log_loss: ', round(log_loss(y_test, probs), 3))  # log loss needs probabilities, not hard labels
    print('\n')
    print('Classification_report: \n', classification_report(y_test, preds))
    skplt.metrics.plot_confusion_matrix(y_test, preds, title="Confusion Matrix",
                                        normalize=is_normalize, figsize=(8, 8),
                                        text_fontsize='large')
    display(eli5.show_weights(clf, top=20, feature_names=list(X_test.columns)))

clf_lr = LogisticRegressionCV(random_state=1, cv=7)
clf_lr.fit(X_train, y_train)
plot_score(clf_lr, X_test, y_test, cut_off=0.5)
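The Excel report below writes out scores (cross-validated Gini values), which this section never defines. A plausible sketch, assuming 5-fold cross-validation on the train set:

from sklearn.model_selection import cross_val_score

# Gini = 2 * AUC - 1, computed fold by fold
scores = 2 * cross_val_score(clf_lr, X_train, y_train, cv=5, scoring='roc_auc') - 1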
# WRITING
writer = pd.ExcelWriter('PDL_Score_20180815-3.xlsx', engine='xlsxwriter')
workbook = writer.book
worksheet = workbook.add_worksheet('Sample information')
bold = workbook.add_format({'bold': True})
percent_fmt = workbook.add_format({'num_format': '0.00%'})

worksheet.set_column('A:A', 20)
worksheet.set_column('B:B', 15)
worksheet.set_column('C:C', 10)

# Sample
worksheet.write('A2', 'Sample conditions', bold)
worksheet.write('A3', 1)
worksheet.write('A4', 2)
worksheet.write('A5', 3)
worksheet.write('A6', 4)

# Model
worksheet.write('A8', 'Model development', bold)
worksheet.write('A9', 1)
# Labels
worksheet.write('C8', 'Bads')
worksheet.write('D8', 'Goods')
worksheet.write('B9', 'Train')
worksheet.write('B10', 'Valid')
worksheet.write('B11', 'Total')
# Goods and bads
worksheet.write('C9', y_train.value_counts()[1])
worksheet.write('C10', y_test.value_counts()[1])
worksheet.write('D9', y_train.value_counts()[0])
worksheet.write('D10', y_test.value_counts()[0])
worksheet.write('C11', y.value_counts()[1])
worksheet.write('D11', y.value_counts()[0])

# NPL
worksheet.write('A13', 2)
worksheet.write('B13', 'NPL')
worksheet.write('C13', (y.value_counts()[1] / (y.value_counts()[1] + y.value_counts()[0])), percent_fmt)

# Gini
worksheet.write('A16', 3)
worksheet.write('C15', 'Gini')
worksheet.write('B16', 'Train')
worksheet.write('B17', 'Valid')
worksheet.write('B18', 'CV Scores')
worksheet.write('C18', str([round(sc, 2) for sc in scores]))
worksheet.write('C16', round(2 * roc_auc_score(y_train, clf_lr.predict_proba(X_train)[:, 1]) - 1, 3))
worksheet.write('C17', round(2 * roc_auc_score(y_test, clf_lr.predict_proba(X_test)[:, 1]) - 1, 3))

# Regression coefficients
feat.to_excel(writer, sheet_name='Regression coefficients', index=False)
worksheet2 = writer.sheets['Regression coefficients']
worksheet2.set_column('A:A', 15)
worksheet2.set_column('B:B', 50)

# WOE
ivs[['VAR_NAME', 'Variable range', 'WOE', 'COUNT', 'WOE_group']].to_excel(writer, sheet_name='WOE', index=False)
worksheet3 = writer.sheets['WOE']
worksheet3.set_column('A:A', 50)
worksheet3.set_column('B:B', 60)
worksheet3.set_column('C:C', 30)
worksheet3.set_column('D:D', 20)
worksheet3.set_column('E:E', 12)
for num, i in enumerate([x.replace('WOE_', '') for x in X_train.columns]):
    ev = iv_df[iv_df.VAR_NAME == i]
    ev.reset_index(inplace=True)
    worksheet3.insert_image('G{}'.format(num * 34 + 1), '{}.png'.format(i))

df3.to_excel(writer, sheet_name='Data', index=False)

table.to_excel(writer, sheet_name='Scores by buckets', header=True, index=True)
worksheet4 = writer.sheets['Scores by buckets']
worksheet4.set_column('A:A', 20)
worksheet4.insert_image('J1', 'score_distribution.png')

Ginis.to_excel(writer, sheet_name='Gini distribution', header=True, index=True)
worksheet5 = writer.sheets['Gini distribution']
worksheet5.insert_image('E1', 'gini_stability.png')

writer.close()  # on older pandas this was writer.save()
Source: https://habr.com/ru/post/421091/