import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import numpy as np from sklearn.preprocessing import MinMaxScaler from sklearn.model_selection import train_test_split from sklearn import linear_model import warnings warnings.filterwarnings('ignore') %matplotlib inline
#download df = pd.read_csv('https://op.mos.ru/EHDWSREST/catalog/export/get?id=230308', compression='zip', header=0, encoding='cp1251', sep=';', quotechar='"') #look at the data df.head(12)
ID | global_id | Year | Month | StateRegistrationOfBirth | StateRegistrationOfDeath | StateRegistrationOfMarriage | StateRegistrationOfDivorce | StateRegistrationOfPaternityExamination | StateRegistrationOfAdoption | StateRegistrationOfNameChange | Totalnumber |
---|---|---|---|---|---|---|---|---|---|---|---|
one | 37591658 | 2010 | January | 9206 | 10430 | 4997 | 3302 | 1241 | 95 | 491 | 29762 |
2 | 37591659 | 2010 | February | 9060 | 9573 | 4873 | 2937 | 1326 | 97 | 639 | 28505 |
3 | 37591660 | 2010 | March | 10934 | 10528 | 3642 | 4361 | 1644 | 147 | 717 | 31973 |
four | 37591661 | 2010 | April | 10140 | 9501 | 9698 | 3943 | 1530 | 128 | 642 | 35572 |
five | 37591662 | 2010 | May | 9457 | 9482 | 3726 | 3554 | 1397 | 96 | 492 | 28204 |
6 | 62353812 | 2010 | June | 11253 | 9529 | 9148 | 3666 | 1570 | 130 | 556 | 35852 |
7 | 62353813 | 2010 | July | 11477 | 14340 | 12473 | 3675 | 1568 | 123 | 564 | 44220 |
eight | 62353814 | 2010 | August | 10302 | 15016 | 10882 | 3496 | 1512 | 134 | 578 | 41920 |
9 | 62353816 | 2010 | September | 10140 | 9573 | 10736 | 3738 | 1480 | 101 | 686 | 36454 |
ten | 62353817 | 2010 | October | 10776 | 9350 | 8862 | 3899 | 1504 | 89 | 687 | 35167 |
eleven | 62353818 | 2010 | November | 10293 | 9091 | 6080 | 3923 | 1355 | 97 | 568 | 31407 |
12 | 62353819 | 2010 | December | 10600 | 9664 | 6023 | 4145 | 1556 | 124 | 681 | 32793 |
#code months d={'':1, '':2, '':3, '':4, '':5, '':6, '':7, '':8, '':9, '':10, '':11, '':12} df.Month=df.Month.map(d) #delete some unuseful columns df.drop(['ID','global_id','Unnamed: 12'],axis=1,inplace=True) #look at the data df.head(12)
columns_to_show = ['StateRegistrationOfBirth', 'StateRegistrationOfMarriage', 'StateRegistrationOfPaternityExamination', 'StateRegistrationOfDivorce','StateRegistrationOfDeath'] data=df[columns_to_show]
grid = sns.pairplot(data)
# change scale of features scaler = MinMaxScaler() df2=pd.DataFrame(scaler.fit_transform(df)) df2.columns=df.columns data2=df2[columns_to_show] grid2 = sns.pairplot(data2)
X = data2['StateRegistrationOfBirth'].values y = data2['StateRegistrationOfPaternityExamination'].values X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42) X_train=np.reshape(X_train,[X_train.shape[0],1]) y_train=np.reshape(y_train,[y_train.shape[0],1]) X_test=np.reshape(X_test,[X_test.shape[0],1]) y_test=np.reshape(y_test,[y_test.shape[0],1])
#teach model and get predictions lr = linear_model.LinearRegression() lr.fit(X_train, y_train) print('Coefficients:', lr.coef_) print('Score:', lr.score(X_test,y_test))
Coefficients: [[ 0.78600258]]
Score: 0.611493944197
plt.scatter(X_train, y_train, color='black') plt.plot(X_train, lr.predict(X_train), color='blue', linewidth=3) plt.xlabel('StateRegistrationOfBirth') plt.ylabel('State Registration OfPaternity Examination') plt.title="Regression on train data"
plt.scatter(X_test, y_test, color='black') plt.plot(X_test, lr.predict(X_test), color='green', linewidth=3) plt.xlabel('StateRegistrationOfBirth') plt.ylabel('State Registration OfPaternity Examination') plt.title="Regression on test data"
#get main data columns_to_show2=columns_to_show.copy() columns_to_show2.remove("StateRegistrationOfMarriage") #get data for a model X = data2[columns_to_show2].values y = data2['StateRegistrationOfMarriage'].values X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42) y_train=np.reshape(y_train,[y_train.shape[0],1]) y_test=np.reshape(y_test,[y_test.shape[0],1])
lr = linear_model.LinearRegression() lr.fit(X_train, y_train) print('Coefficients:', lr.coef_) print('Score:', lr.score(X_test,y_test))
Coefficients: [[-0.03475475 0.97143632 -0.44298685 -0.18245718]]
Score: 0.38137432065
#let's look at the different alpha parameter: #large Rid=linear_model.Lasso (alpha = 0.01) Rid.fit(X_train, y_train) print(' Appha:', Rid.alpha) print(' Coefficients:', Rid.coef_) print(' Score:', Rid.score(X_test,y_test)) #Small Rid=linear_model.Lasso (alpha = 0.000000001) Rid.fit(X_train, y_train) print('\n Appha:', Rid.alpha) print(' Coefficients:', Rid.coef_) print(' Score:', Rid.score(X_test,y_test)) #Optimal (for these test data) Rid=linear_model.Lasso (alpha = 0.00025) Rid.fit(X_train, y_train) print('\n Appha:', Rid.alpha) print(' Coefficients:', Rid.coef_) print(' Score:', Rid.score(X_test,y_test))
Appha: 0.01
Coefficients: [ 0. 0.46642996 -0. -0. ]
Score: 0.222071102783
Appha: 1e-09
Coefficients: [-0.03475462 0.97143616 -0.44298679 -0.18245715]
Score: 0.38137433837
Appha: 0.00025
Coefficients: [-0.00387233 0.92989507 -0.42590052 -0.17411615]
Score: 0.385551648602
columns_to_show3=columns_to_show2.copy() columns_to_show3.append("TotalNumber") columns_to_show3 X = df2[columns_to_show3].values # y hasn't changed X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42) y_train=np.reshape(y_train,[y_train.shape[0],1]) y_test=np.reshape(y_test,[y_test.shape[0],1])
lr = linear_model.LinearRegression() lr.fit(X_train, y_train) print('Coefficients:', lr.coef_) print('Score:', lr.score(X_test,y_test))
Coefficients: [[-0.45286477 -0.08625204 -0.19375198 -0.63079401 1.57467774]]
Score: 0.999173764473
#Optimal (for these test data) Rid=linear_model.Lasso (alpha = 0.00015) Rid.fit(X_train, y_train) print('\n Appha:', Rid.alpha) print(' Coefficients:', Rid.coef_) print(' Score:', Rid.score(X_test,y_test))
Appha: 0.00015
Coefficients: [-0.44718703 -0.07491507 -0.1944702 -0.62034146 1.55890505]
Score: 0.999266251287
#large Rid=linear_model.Lasso (alpha = 0.01) Rid.fit(X_train, y_train) print('\n Appha:', Rid.alpha) print(' Coefficients:', Rid.coef_) print(' Score:', Rid.score(X_test,y_test))
Appha: 0.01
Coefficients: [-0. -0. -0. -0.05177979 0.87991931]
Score: 0.802210158982
X_train=np.reshape(X_train[:,4],[X_train.shape[0],1]) X_test=np.reshape(X_test[:,4],[X_test.shape[0],1]) lr = linear_model.LinearRegression() lr.fit(X_train, y_train) print('Coefficients:', lr.coef_) print('Score:', lr.score(X_train,y_train))
Coefficients: [ 1.0571131]
Score: 0.788270672692
# plot for train data plt.figure(figsize=(8,10)) plt.subplot(211) plt.scatter(X_train, y_train, color='black') plt.plot(X_train, lr.predict(X_train), color='blue', linewidth=3) plt.xlabel('Total Number of Registration') plt.ylabel('State Registration Of Marriage') plt.title="Regression on train data" # plot for test data plt.subplot(212) plt.scatter(X_test, y_test, color='black') plt.plot(X_test, lr.predict(X_test), '--', color='green', linewidth=3) plt.xlabel('Total Number of Registration') plt.ylabel('State Registration Of Marriage') plt.title="Regression on test data"
columns_to_show4=columns_to_show2.copy() columns_to_show4.append("StateRegistrationOfNameChange") X = df2[columns_to_show4].values # y hasn't changed X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42) y_train=np.reshape(y_train,[y_train.shape[0],1]) y_test=np.reshape(y_test,[y_test.shape[0],1]) lr = linear_model.LinearRegression() lr.fit(X_train, y_train) print('Coefficients:', lr.coef_) print('Score:', lr.score(X_test,y_test))
Coefficients: [[ 0.06583714 1.1080889 -0.35025999 -0.24473705 -0.4513887 ]]
Score: 0.285094398157
#get data columns_to_show5=columns_to_show2.copy() columns_to_show5.append("Month") #get data for model X = df2[columns_to_show5].values # y hasn't changed X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42) y_train=np.reshape(y_train,[y_train.shape[0],1]) y_test=np.reshape(y_test,[y_test.shape[0],1]) #teach model and get predictions lr = linear_model.LinearRegression() lr.fit(X_train, y_train) print('Coefficients:', lr.coef_) print('Score:', lr.score(X_test,y_test))
Coefficients: [[-0.10613428 0.91315175 -0.55413198 -0.13253367 0.28536285]]
Score: 0.472057997208
#get data df3=df.copy() #get new column df3.Year=df.Year.map(lambda x: (x-2010)*12)+df.Month df3.rename(columns={'Year': 'Months'}, inplace=True) #get data for model X=df3[columns_to_show5].values y=df3['StateRegistrationOfMarriage'].values train=[df3.Months<=72] test=[df3.Months>72] X_train=X[train] y_train=y[train] X_test=X[test] y_test=y[test] y_train=np.reshape(y_train,[y_train.shape[0],1]) y_test=np.reshape(y_test,[y_test.shape[0],1]) #teach model and get predictions lr = linear_model.LinearRegression() lr.fit(X_train, y_train) print('Coefficients:', lr.coef_[0]) print('Score:', lr.score(X_test,y_test))
Coefficients: [ 2.60708376e-01 1.30751121e+01 -3.31447168e+00 -2.34368684e-01
2.88096512e+02]
Score: 0.383195050367
plt.figure(figsize=(9,23)) # plot for train data plt.subplot(311) plt.scatter(df3.Months.values[train], y_train, color='black') plt.plot(df3.Months.values[train], lr.predict(X_train), color='blue', linewidth=2) plt.xlabel('Months (from 01.2010)') plt.ylabel('State Registration Of Marriage') plt.title="Regression on train data" # plot for test data plt.subplot(312) plt.scatter(df3.Months.values[test], y_test, color='black') plt.plot(df3.Months.values[test], lr.predict(X_test), color='green', linewidth=2) plt.xlabel('Months (from 01.2010)') plt.ylabel('State Registration Of Marriage') plt.title="Regression (prediction) on test data" # plot for all data plt.subplot(313) plt.scatter(df3.Months.values[train], y_train, color='black') plt.plot(df3.Months.values[train], lr.predict(X_train), color='blue', label='train', linewidth=2) plt.scatter(df3.Months.values[test], y_test, color='black') plt.plot(df3.Months.values[test], lr.predict(X_test), color='green', label='test', linewidth=2) plt.title="Regression (prediction) on all data" plt.xlabel('Months (from 01.2010)') plt.ylabel('State Registration Of Marriage') #plot line for link train to test plt.plot([72,73], lr.predict([X_train[-1],X_test[0]]) , color='magenta',linewidth=2, label='train to test') plt.legend()
df_base = pd.read_csv('https://op.mos.ru/EHDWSREST/catalog/export/get?id=230308', compression='zip', header=0, encoding='cp1251', sep=';', quotechar='"')
#get data for model df4=df_base.copy() df4.drop(['Year','StateRegistrationOfMarriage','ID','global_id','Unnamed: 12','TotalNumber','StateRegistrationOfNameChange','StateRegistrationOfAdoption'],axis=1,inplace=True) df4=pd.get_dummies(df4,prefix=['Month']) X=df4.values X_train=X[train] X_test=X[test] #teach model and get predictions lr = linear_model.LinearRegression() lr.fit(X_train, y_train) print('Coefficients:', lr.coef_[0]) print('Score:', lr.score(X_test,y_test)) # plot for all data plt.scatter(df3.Months.values[train], y_train, color='black') plt.plot(df3.Months.values[train], lr.predict(X_train), color='blue', label='train', linewidth=2) plt.scatter(df3.Months.values[test], y_test, color='black') plt.plot(df3.Months.values[test], lr.predict(X_test), color='green', label='test', linewidth=2) plt.title="Regression (prediction) on all data" plt.xlabel('Months (from 01.2010)') plt.ylabel('State Registration Of Marriage') #plot line for link train to test plt.plot([72,73], lr.predict([X_train[-1],X_test[0]]) , color='magenta',linewidth=2, label='train to test')
Coefficients: [ 2.18633008e-01 -1.41397731e-01 4.56991414e-02 -5.17558633e-01
4.48131002e+03 -2.94754108e+02 -1.14429758e+03 3.61201946e+03
2.41208054e+03 -3.23415050e+03 -2.73587261e+03 -1.31020899e+03
4.84757208e+02 3.37280689e+03 -2.40539320e+03 -3.23829714e+03]
Score: 0.869208071831
#get data for pandas data frame df5=df_base.copy() d=dict() #get we obtain the mean value of Registration Of Marriages by months on the training data for mon in df5.Month.unique(): d[mon]=df5.StateRegistrationOfMarriage[df5.Month.values[train]==mon].mean() #d+={} df5['MeanMarriagePerMonth']=df5.Month.map(d) df5.drop(['Month','Year','StateRegistrationOfMarriage','ID','global_id','Unnamed: 12','TotalNumber', 'StateRegistrationOfNameChange','StateRegistrationOfAdoption'],axis=1,inplace=True) #get data for model X=df5.values X_train=X[train] X_test=X[test] #teach model and get predictions lr = linear_model.LinearRegression() lr.fit(X_train, y_train) print('Coefficients:', lr.coef_[0]) print('Score:', lr.score(X_test,y_test)) # plot for all data plt.scatter(df3.Months.values[train], y_train, color='black') plt.plot(df3.Months.values[train], lr.predict(X_train), color='blue', label='train', linewidth=2) plt.scatter(df3.Months.values[test], y_test, color='black') plt.plot(df3.Months.values[test], lr.predict(X_test), color='green', label='test', linewidth=2) plt.title="Regression (prediction) on all data" plt.xlabel('Months (from 01.2010)') plt.ylabel('State Registration Of Marriage') #plot line for link train to test plt.plot([72,73], lr.predict([X_train[-1],X_test[0]]) , color='magenta',linewidth=2, label='train to test')-
Coefficients: [ 0.16556761 -0.12746446 -0.03652408 -0.21649349 0.96971467]
Score: 0.875882918435
Source: https://habr.com/ru/post/340698/
All Articles