1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81
from sklearn.preprocessing import StandardScaler


def train_model(model, param_grid=[], X=[], y=[], splits=5, repeats=5):
    """Cross-validate a model (optionally with a grid search), fit it, and plot diagnostics.

    The list defaults act purely as "not provided" markers: they are only
    length-checked or rebound, never mutated, so sharing them across calls
    is harmless.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``score``.
        param_grid: Parameter grid handed to ``GridSearchCV``. When empty, no
            search is run and ``model`` is cross-validated as given.
        X: Feature data. Ignored (replaced) when ``y`` is empty.
        y: Target values. Must expose ``.index`` (e.g. a pandas Series)
            because predictions are re-indexed onto it. When empty, both
            ``X`` and ``y`` are loaded via ``get_trainning_data_omitoutliers()``.
        splits: ``n_splits`` passed to ``RepeatedKFold``.
        repeats: ``n_repeats`` passed to ``RepeatedKFold``.

    Returns:
        A tuple ``(model, cv_score, grid_results)``:
            * ``model`` -- the grid search's ``best_estimator_`` or, without
              a grid, the given ``model`` fitted on ``X, y``.
            * ``cv_score`` -- ``pd.Series`` with keys ``'mean'`` (absolute
              mean of the negative-MSE CV scores) and ``'std'``.
            * ``grid_results`` -- ``pd.DataFrame`` of ``cv_results_`` when a
              grid search ran, otherwise an empty list.

    Side effects:
        Prints a short report and draws a three-panel matplotlib figure
        (prediction scatter, residual scatter, histogram of standardized
        residuals).
    """
    # No target supplied: fall back to the default training set.
    if len(y) == 0:
        X, y = get_trainning_data_omitoutliers()

    rkfold = RepeatedKFold(n_splits=splits, n_repeats=repeats)

    if len(param_grid) > 0:
        gsearch = GridSearchCV(
            model,
            param_grid,
            cv=rkfold,
            scoring="neg_mean_squared_error",
            verbose=1,
            return_train_score=True,
        )
        gsearch.fit(X, y)

        # Continue with the best estimator found by the search.
        model = gsearch.best_estimator_
        best_idx = gsearch.best_index_

        grid_results = pd.DataFrame(gsearch.cv_results_)
        # Scores are negative MSE; abs() reports them as positive MSE.
        cv_mean = abs(grid_results.loc[best_idx, 'mean_test_score'])
        cv_std = grid_results.loc[best_idx, 'std_test_score']
    else:
        grid_results = []
        cv_results = cross_val_score(
            model, X, y, scoring="neg_mean_squared_error", cv=rkfold
        )
        cv_mean = abs(np.mean(cv_results))
        cv_std = np.std(cv_results)
        # cross_val_score only fits clones of the estimator, so `model`
        # itself is still unfitted here. Fit it on the full data so the
        # predict()/score() calls below (and the caller) get a usable model.
        model.fit(X, y)

    cv_score = pd.Series({'mean': cv_mean, 'std': cv_std})

    y_pred = model.predict(X)

    print('----------------------')
    print(model)
    print('----------------------')
    print('score=', model.score(X, y))
    print('rmse=', rmse(y, y_pred))
    print('mse=', mse(y, y_pred))
    print('cross_val: mean=', cv_mean, ', std=', cv_std)

    # Align predictions with y's index so the residual arithmetic matches rows.
    y_pred = pd.Series(y_pred, index=y.index)
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()
    z = (resid - mean_resid) / std_resid
    # Number of samples whose standardized residual exceeds 3 in magnitude.
    n_outliers = int((z.abs() > 3).sum())

    plt.figure(figsize=(15, 5))

    # Panel 1: predicted vs. actual.
    plt.subplot(1, 3, 1)
    plt.plot(y, y_pred, '.')
    plt.xlabel('y')
    plt.ylabel('y_pred')
    plt.title('corr = {:.3f}'.format(np.corrcoef(y, y_pred)[0][1]))

    # Panel 2: residual vs. actual.
    plt.subplot(1, 3, 2)
    plt.plot(y, resid, '.')
    plt.xlabel('y')
    plt.ylabel('y - y_pred')
    plt.title('std resid = {:.3f}'.format(std_resid))

    # Panel 3: distribution of standardized residuals.
    ax_hist = plt.subplot(1, 3, 3)
    z.plot.hist(bins=50, ax=ax_hist)
    plt.xlabel('z')
    plt.title('{:.0f} samples with z>3'.format(n_outliers))

    return model, cv_score, grid_results
|