Industrial Steam Prediction: A Complete Walkthrough from Data Exploration to Model Stacking

import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
import seaborn as sns

# modelling
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score,cross_val_predict,KFold
from sklearn.metrics import make_scorer,mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import LinearSVR, SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures,MinMaxScaler,StandardScaler
# load the dataset
with open("/kaggle/input/industrial-steam/zhengqi_train.txt") as fr:
    data_train = pd.read_table(fr, sep="\t")
with open("/kaggle/input/industrial-steam/zhengqi_test.txt") as fr_test:
    data_test = pd.read_table(fr_test, sep="\t")
#merge train_set and test_set
data_train["oringin"]="train"
data_test["oringin"]="test"
data_all=pd.concat([data_train,data_test],axis=0,ignore_index=True)
#View data
data_all.head()
# Explore feature distributions: compare train vs test for each feature
#fig = plt.figure(figsize=(6, 6))
for column in data_all.columns[0:-2]:
    g = sns.kdeplot(data_all[column][(data_all["oringin"] == "train")], color="Red", shade=True)
    g = sns.kdeplot(data_all[column][(data_all["oringin"] == "test")], ax=g, color="Blue", shade=True)
    g.set_xlabel(column)
    g.set_ylabel("Frequency")
    g = g.legend(["train", "test"])
    plt.show()
fig = plt.figure(figsize=(10, 10))
for i in range(len(data_all.columns)-2):
    g = sns.FacetGrid(data_all, col='oringin')
    g = g.map(sns.distplot, data_all.columns[i])
# Drop features "V5","V9","V11","V17","V22","V28": their train/test distributions differ noticeably
for column in ["V5","V9","V11","V17","V22","V28"]:
    g = sns.kdeplot(data_all[column][(data_all["oringin"] == "train")], color="Red", shade=True)
    g = sns.kdeplot(data_all[column][(data_all["oringin"] == "test")], ax=g, color="Blue", shade=True)
    g.set_xlabel(column)
    g.set_ylabel("Frequency")
    g = g.legend(["train", "test"])
    plt.show()

data_all.drop(["V5","V9","V11","V17","V22","V28"], axis=1, inplace=True)
# figure parameters
data_train1 = data_all[data_all["oringin"]=="train"].drop("oringin", axis=1)

fcols = 2
frows = len(data_train.columns)
plt.figure(figsize=(5*fcols, 4*frows))

i = 0
for col in data_train1.columns:
    i += 1
    ax = plt.subplot(frows, fcols, i)
    sns.regplot(x=col, y='target', data=data_train, ax=ax,
                scatter_kws={'marker':'.', 's':3, 'alpha':0.3},
                line_kws={'color':'k'})
    plt.xlabel(col)
    plt.ylabel('target')

    i += 1
    ax = plt.subplot(frows, fcols, i)
    sns.distplot(data_train[col].dropna(), fit=stats.norm)
    plt.xlabel(col)
# Examine pairwise correlations
plt.figure(figsize=(20, 16))                          # figure width and height
colnm = data_train1.columns.tolist()                  # column names
mcorr = data_train1[colnm].corr(method="spearman")    # correlation matrix: Spearman correlation between any two variables
mask = np.zeros_like(mcorr, dtype=bool)               # boolean matrix with the same shape as mcorr (np.bool was removed from recent NumPy)
mask[np.triu_indices_from(mask)] = True               # mask the upper triangle
cmap = sns.diverging_palette(220, 10, as_cmap=True)   # matplotlib colormap
g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')  # heatmap of pairwise correlations
plt.show()
# Threshold for removing correlated variables
threshold = 0.1

# Absolute value correlation matrix
corr_matrix = data_train1.corr().abs()
drop_col=corr_matrix[corr_matrix["target"]<threshold].index
data_all.drop(drop_col,axis=1,inplace=True)
# normalise numeric columns
cols_numeric = list(data_all.columns)
cols_numeric.remove("oringin")

def scale_minmax(col):
    return (col-col.min())/(col.max()-col.min())

scale_cols = [col for col in cols_numeric if col != 'target']
data_all[scale_cols] = data_all[scale_cols].apply(scale_minmax, axis=0)
data_all[scale_cols].describe()
# Check effect of Box-Cox transforms on distributions of continuous variables

fcols = 6
frows = len(cols_numeric)-1
plt.figure(figsize=(4*fcols, 4*frows))
i = 0

for var in cols_numeric:
    if var != 'target':
        dat = data_all[[var, 'target']].dropna()

        i += 1
        plt.subplot(frows, fcols, i)
        sns.distplot(dat[var], fit=stats.norm)
        plt.title(var+' Original')
        plt.xlabel('')

        i += 1
        plt.subplot(frows, fcols, i)
        _ = stats.probplot(dat[var], plot=plt)
        plt.title('skew='+'{:.4f}'.format(stats.skew(dat[var])))
        plt.xlabel('')
        plt.ylabel('')

        i += 1
        plt.subplot(frows, fcols, i)
        plt.plot(dat[var], dat['target'], '.', alpha=0.5)
        plt.title('corr='+'{:.2f}'.format(np.corrcoef(dat[var], dat['target'])[0][1]))

        i += 1
        plt.subplot(frows, fcols, i)
        trans_var, lambda_var = stats.boxcox(dat[var].dropna()+1)
        trans_var = scale_minmax(trans_var)
        sns.distplot(trans_var, fit=stats.norm)
        plt.title(var+' Transformed')
        plt.xlabel('')

        i += 1
        plt.subplot(frows, fcols, i)
        _ = stats.probplot(trans_var, plot=plt)
        plt.title('skew='+'{:.4f}'.format(stats.skew(trans_var)))
        plt.xlabel('')
        plt.ylabel('')

        i += 1
        plt.subplot(frows, fcols, i)
        plt.plot(trans_var, dat['target'], '.', alpha=0.5)
        plt.title('corr='+'{:.2f}'.format(np.corrcoef(trans_var, dat['target'])[0][1]))
cols_transform = data_all.columns[0:-2]
for col in cols_transform:
    # transform column (shift by +1 so all min-max scaled values are strictly positive for Box-Cox)
    data_all.loc[:, col], _ = stats.boxcox(data_all.loc[:, col]+1)
print(data_all.target.describe())

plt.figure(figsize=(12,4))
plt.subplot(1,2,1)
sns.distplot(data_all.target.dropna() , fit=stats.norm);
plt.subplot(1,2,2)
_=stats.probplot(data_all.target.dropna(), plot=plt)
# Transform the target (target1 = 1.5**target) to improve normality
sp = data_train.target
data_train['target1'] = np.power(1.5, sp)
print(data_train.target1.describe())

plt.figure(figsize=(12,4))
plt.subplot(1,2,1)
sns.distplot(data_train.target1.dropna(), fit=stats.norm)
plt.subplot(1,2,2)
_ = stats.probplot(data_train.target1.dropna(), plot=plt)
# Re-check the train/test feature distributions after the transforms
for column in data_all.columns[0:-2]:
    g = sns.kdeplot(data_all[column][(data_all["oringin"] == "train")], color="Red", shade=True)
    g = sns.kdeplot(data_all[column][(data_all["oringin"] == "test")], ax=g, color="Blue", shade=True)
    g.set_xlabel(column)
    g.set_ylabel("Frequency")
    g = g.legend(["train", "test"])
    plt.show()
# function to get training samples
def get_training_data():
    # extract training samples
    from sklearn.model_selection import train_test_split
    df_train = data_all[data_all["oringin"]=="train"]
    df_train["label"] = data_train.target1
    # split target and features
    y = df_train.target
    X = df_train.drop(["oringin","target","label"], axis=1)
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=100)
    return X_train, X_valid, y_train, y_valid

# extract test data (without the target)
def get_test_data():
    df_test = data_all[data_all["oringin"]=="test"].reset_index(drop=True)
    return df_test.drop(["oringin","target"], axis=1)
from sklearn.metrics import make_scorer

# metrics for evaluation
def rmse(y_true, y_pred):
    diff = y_pred - y_true
    sum_sq = sum(diff**2)
    n = len(y_pred)
    return np.sqrt(sum_sq/n)

def mse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred)

# scorers to be used in sklearn model fitting
rmse_scorer = make_scorer(rmse, greater_is_better=False)
mse_scorer = make_scorer(mse, greater_is_better=False)
# function to detect outliers based on the predictions of a model
def find_outliers(model, X, y, sigma=3):

    # predict y values using model
    try:
        y_pred = pd.Series(model.predict(X), index=y.index)
    # if predicting fails, try fitting the model first
    except:
        model.fit(X, y)
        y_pred = pd.Series(model.predict(X), index=y.index)

    # calculate residuals between the model prediction and true y values
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()

    # calculate z statistic, define outliers to be where |z|>sigma
    z = (resid - mean_resid)/std_resid
    outliers = z[abs(z) > sigma].index

    # print and plot the results
    print('R2=', model.score(X, y))
    print('rmse=', rmse(y, y_pred))
    print("mse=", mean_squared_error(y, y_pred))
    print('---------------------------------------')

    print('mean of residuals:', mean_resid)
    print('std of residuals:', std_resid)
    print('---------------------------------------')

    print(len(outliers), 'outliers:')
    print(outliers.tolist())

    plt.figure(figsize=(15, 5))
    ax_131 = plt.subplot(1, 3, 1)
    plt.plot(y, y_pred, '.')
    plt.plot(y.loc[outliers], y_pred.loc[outliers], 'ro')
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('y')
    plt.ylabel('y_pred')

    ax_132 = plt.subplot(1, 3, 2)
    plt.plot(y, y-y_pred, '.')
    plt.plot(y.loc[outliers], y.loc[outliers]-y_pred.loc[outliers], 'ro')
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('y')
    plt.ylabel('y - y_pred')

    ax_133 = plt.subplot(1, 3, 3)
    z.plot.hist(bins=50, ax=ax_133)
    z.loc[outliers].plot.hist(color='r', bins=50, ax=ax_133)
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('z')

    plt.savefig('outliers.png')

    return outliers
# get training data
from sklearn.linear_model import Ridge
X_train, X_valid,y_train,y_valid = get_training_data()
test=get_test_data()

# find and remove outliers using a Ridge model
outliers = find_outliers(Ridge(), X_train, y_train)

# permanently remove these outliers from the data
#df_train = data_all[data_all["oringin"]=="train"]
#df_train["label"]=data_train.target1
#df_train=df_train.drop(outliers)
X_outliers=X_train.loc[outliers]
y_outliers=y_train.loc[outliers]
X_t=X_train.drop(outliers)
y_t=y_train.drop(outliers)
def get_trainning_data_omitoutliers():
    y1 = y_t.copy()
    X1 = X_t.copy()
    return X1, y1
from sklearn.preprocessing import StandardScaler

def train_model(model, param_grid=[], X=[], y=[],
                splits=5, repeats=5):

    # get unmodified training data, unless data to use already specified
    if len(y) == 0:
        X, y = get_trainning_data_omitoutliers()
        #poly_trans=PolynomialFeatures(degree=2)
        #X=poly_trans.fit_transform(X)
        #X=MinMaxScaler().fit_transform(X)

    # create cross-validation method
    rkfold = RepeatedKFold(n_splits=splits, n_repeats=repeats)

    # perform a grid search if param_grid given
    if len(param_grid) > 0:
        # setup grid search parameters
        gsearch = GridSearchCV(model, param_grid, cv=rkfold,
                               scoring="neg_mean_squared_error",
                               verbose=1, return_train_score=True)

        # search the grid
        gsearch.fit(X, y)

        # extract best model from the grid
        model = gsearch.best_estimator_
        best_idx = gsearch.best_index_

        # get cv-scores for best model
        grid_results = pd.DataFrame(gsearch.cv_results_)
        cv_mean = abs(grid_results.loc[best_idx, 'mean_test_score'])
        cv_std = grid_results.loc[best_idx, 'std_test_score']

    # no grid search, just cross-val score for given model
    else:
        grid_results = []
        cv_results = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=rkfold)
        cv_mean = abs(np.mean(cv_results))
        cv_std = np.std(cv_results)
        model.fit(X, y)  # fit on the full data so the model can be used for prediction below

    # combine mean and std cv-score into a pandas series
    cv_score = pd.Series({'mean': cv_mean, 'std': cv_std})

    # predict y using the fitted model
    y_pred = model.predict(X)

    # print stats on model performance
    print('----------------------')
    print(model)
    print('----------------------')
    print('score=', model.score(X, y))
    print('rmse=', rmse(y, y_pred))
    print('mse=', mse(y, y_pred))
    print('cross_val: mean=', cv_mean, ', std=', cv_std)

    # residual plots
    y_pred = pd.Series(y_pred, index=y.index)
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()
    z = (resid - mean_resid)/std_resid
    n_outliers = sum(abs(z) > 3)

    plt.figure(figsize=(15, 5))
    ax_131 = plt.subplot(1, 3, 1)
    plt.plot(y, y_pred, '.')
    plt.xlabel('y')
    plt.ylabel('y_pred')
    plt.title('corr = {:.3f}'.format(np.corrcoef(y, y_pred)[0][1]))

    ax_132 = plt.subplot(1, 3, 2)
    plt.plot(y, y-y_pred, '.')
    plt.xlabel('y')
    plt.ylabel('y - y_pred')
    plt.title('std resid = {:.3f}'.format(std_resid))

    ax_133 = plt.subplot(1, 3, 3)
    z.plot.hist(bins=50, ax=ax_133)
    plt.xlabel('z')
    plt.title('{:.0f} samples with z>3'.format(n_outliers))

    return model, cv_score, grid_results
# places to store optimal models and scores
opt_models = dict()
score_models = pd.DataFrame(columns=['mean','std'])

# no. k-fold splits
splits=5
# no. k-fold iterations
repeats=5
model = 'Ridge'

opt_models[model] = Ridge()
alph_range = np.arange(0.25,6,0.25)
param_grid = {'alpha': alph_range}

opt_models[model],cv_score,grid_results = train_model(opt_models[model], param_grid=param_grid,
splits=splits, repeats=repeats)

cv_score.name = model
score_models = score_models.append(cv_score)

plt.figure()
plt.errorbar(alph_range, abs(grid_results['mean_test_score']),
abs(grid_results['std_test_score'])/np.sqrt(splits*repeats))
plt.xlabel('alpha')
plt.ylabel('score')
model = 'Lasso'

opt_models[model] = Lasso()
alph_range = np.arange(1e-4,1e-3,4e-5)
param_grid = {'alpha': alph_range}

opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid,
splits=splits, repeats=repeats)

cv_score.name = model
score_models = score_models.append(cv_score)

plt.figure()
plt.errorbar(alph_range, abs(grid_results['mean_test_score']),abs(grid_results['std_test_score'])/np.sqrt(splits*repeats))
plt.xlabel('alpha')
plt.ylabel('score')
model ='ElasticNet'
opt_models[model] = ElasticNet()

param_grid = {'alpha': np.arange(1e-4,1e-3,1e-4),
'l1_ratio': np.arange(0.1,1.0,0.1),
'max_iter':[100000]}

opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid,
splits=splits, repeats=1)

cv_score.name = model
score_models = score_models.append(cv_score)
model='LinearSVR'
opt_models[model] = LinearSVR()

crange = np.arange(0.1,1.0,0.1)
param_grid = {'C':crange,
'max_iter':[1000]}

opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid,
splits=splits, repeats=repeats)


cv_score.name = model
score_models = score_models.append(cv_score)

plt.figure()
plt.errorbar(crange, abs(grid_results['mean_test_score']),abs(grid_results['std_test_score'])/np.sqrt(splits*repeats))
plt.xlabel('C')
plt.ylabel('score')

model = 'KNeighbors'
opt_models[model] = KNeighborsRegressor()

param_grid = {'n_neighbors':np.arange(3,11,1)}

opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid,
splits=splits, repeats=1)

cv_score.name = model
score_models = score_models.append(cv_score)

plt.figure()
plt.errorbar(np.arange(3,11,1), abs(grid_results['mean_test_score']),abs(grid_results['std_test_score'])/np.sqrt(splits*1))
plt.xlabel('n_neighbors')
plt.ylabel('score')
model = 'GradientBoosting'
opt_models[model] = GradientBoostingRegressor()

param_grid = {'n_estimators':[150,250,350],
'max_depth':[1,2,3],
'min_samples_split':[5,6,7]}

opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid,
splits=splits, repeats=1)

cv_score.name = model
score_models = score_models.append(cv_score)
model = 'XGB'
opt_models[model] = XGBRegressor()

param_grid = {'n_estimators':[100,200,300,400,500],
'max_depth':[1,2,3],
}

opt_models[model], cv_score,grid_results = train_model(opt_models[model], param_grid=param_grid,
splits=splits, repeats=1)

cv_score.name = model
score_models = score_models.append(cv_score)
model = 'RandomForest'
opt_models[model] = RandomForestRegressor()

param_grid = {'n_estimators':[100,150,200],
'max_features':[8,12,16,20,24],
'min_samples_split':[2,4,6]}

opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid,
splits=5, repeats=1)

cv_score.name = model
score_models = score_models.append(cv_score)
def model_predict(test_data, test_y=[], stack=False):
    #poly_trans=PolynomialFeatures(degree=2)
    #test_data1=poly_trans.fit_transform(test_data)
    #test_data=MinMaxScaler().fit_transform(test_data)
    i = 0
    y_predict_total = np.zeros((test_data.shape[0],))
    if stack:
        # average the predictions of the second-level ("metal") models
        for model in metal_models.keys():
            y_predict = metal_models[model].predict(test_data)
            y_predict_total += y_predict
            i += 1
            if len(test_y) > 0:
                print("{}_mse:".format(model), mean_squared_error(y_predict, test_y))
        y_predict_mean = np.round(y_predict_total/i, 3)
        if len(test_y) > 0:
            print("mean_mse:", mean_squared_error(y_predict_mean, test_y))
        else:
            y_metal_mean = pd.Series(y_predict_mean)
            return y_metal_mean
    else:
        # average the predictions of the base models (excluding LinearSVR and KNeighbors)
        for model in opt_models.keys():
            if model != "LinearSVR" and model != "KNeighbors":
                y_predict = opt_models[model].predict(test_data)
                y_predict_total += y_predict
                i += 1
                if len(test_y) > 0:
                    print("{}_mse:".format(model), mean_squared_error(y_predict, test_y))
        y_predict_mean = np.round(y_predict_total/i, 3)
        if len(test_y) > 0:
            print("mean_mse:", mean_squared_error(y_predict_mean, test_y))
        else:
            y_predict_mean = pd.Series(y_predict_mean)
            return y_predict_mean
model_predict(X_valid,y_valid)
def create_stack_features(test_data):
    features = {}
    columns = []
    for model in opt_models.keys():
        columns.append(model)
        features[model] = opt_models[model].predict(test_data)
    stack_feature = pd.DataFrame(features, columns=columns)
    return stack_feature
# build the meta ("metal") features: base-model predictions become the inputs of the second-level models
metal_x_train=create_stack_features(X_t)
metal_y_train=pd.Series(y_t.values)
metal_x_valid=create_stack_features(X_valid)
metal_y_valid=pd.Series(y_valid.values)
metal_x_test=create_stack_features(test)
# places to store metal models and scores
metal_models = dict()
score_models = pd.DataFrame(columns=['mean','std'])

# no. k-fold splits
splits=5
# no. k-fold iterations
repeats=5
model = 'Ridge'

metal_models[model] = Ridge()
alph_range = np.arange(0.25,6,0.25)
param_grid = {'alpha': alph_range}

metal_models[model],cv_score,grid_results = train_model(metal_models[model], param_grid=param_grid, X=metal_x_train,y=metal_y_train,
splits=splits, repeats=repeats)

cv_score.name = model
score_models = score_models.append(cv_score)

plt.figure()
plt.errorbar(alph_range, abs(grid_results['mean_test_score']),
abs(grid_results['std_test_score'])/np.sqrt(splits*repeats))
plt.xlabel('alpha')
plt.ylabel('score')

If you hit: AttributeError: 'DataFrame' object has no attribute 'append'

Fix: your pandas version is newer. DataFrame.append was removed in pandas 2.0, but the private _append method still works as a drop-in replacement, e.g.

score_models = score_models._append(cv_score)
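Alternatively, a minimal sketch (not from the original post) of the documented replacement, pd.concat; the helper name record_score is hypothetical and simply mirrors the "cv_score.name = model; score_models = score_models.append(cv_score)" pattern used throughout this post:

# Drop-in pattern using pd.concat, the documented replacement for DataFrame.append.
# The helper name record_score is my own.
def record_score(score_models, cv_score, name):
    cv_score = cv_score.rename(name)                          # cv_score is a Series with 'mean' and 'std'
    return pd.concat([score_models, cv_score.to_frame().T])   # append a one-row frame via concat

# usage, replacing: cv_score.name = model; score_models = score_models.append(cv_score)
# score_models = record_score(score_models, cv_score, model)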

model = 'Lasso'

metal_models[model] = Lasso()
alph_range = np.arange(1e-4,1e-3,4e-5)
param_grid = {'alpha': alph_range}

metal_models[model], cv_score, grid_results = train_model(metal_models[model], param_grid=param_grid, X=metal_x_train,y=metal_y_train,
splits=splits, repeats=repeats)

cv_score.name = model
score_models = score_models.append(cv_score)

plt.figure()
plt.errorbar(alph_range, abs(grid_results['mean_test_score']),abs(grid_results['std_test_score'])/np.sqrt(splits*repeats))
plt.xlabel('alpha')
plt.ylabel('score')
model ='ElasticNet'
metal_models[model] = ElasticNet()

param_grid = {'alpha': np.arange(1e-4,1e-3,1e-4),
'l1_ratio': np.arange(0.1,1.0,0.1),
'max_iter':[100000]}

metal_models[model], cv_score, grid_results = train_model(metal_models[model], param_grid=param_grid, X=metal_x_train,y=metal_y_train,
splits=splits, repeats=1)

cv_score.name = model
score_models = score_models.append(cv_score)
model = 'XGB'
metal_models[model] = XGBRegressor()

param_grid = {'n_estimators':[100,200,300,400,500],
'max_depth':[1,2,3],
}

metal_models[model], cv_score, grid_results = train_model(metal_models[model], param_grid=param_grid, X=metal_x_train, y=metal_y_train,
splits=splits, repeats=1)

cv_score.name = model
score_models = score_models.append(cv_score)
model = 'GradientBoosting'
metal_models[model] = GradientBoostingRegressor()

param_grid = {'n_estimators':[150,250,350],
'max_depth':[1,2,3],
'min_samples_split':[5,6,7]}

metal_models[model], cv_score, grid_results = train_model(metal_models[model], param_grid=param_grid, X=metal_x_train,y=metal_y_train,
splits=splits, repeats=1)

cv_score.name = model
score_models = score_models.append(cv_score)
model = 'RandomForest'
metal_models[model] = RandomForestRegressor()

param_grid = {'n_estimators':[100,150,200],
'max_features':[2,3,4,5],
'min_samples_split':[2,4,6]}

metal_models[model], cv_score, grid_results = train_model(metal_models[model], param_grid=param_grid, X=metal_x_train,y=metal_y_train,
splits=5, repeats=1)

cv_score.name = model
score_models = score_models.append(cv_score)
model_predict(metal_x_valid,metal_y_valid,stack=True)

Source: https://blog.966677.xyz/2023/09/28/工业蒸汽预测-赛题完整代码分享/ (author: Zhou1317fe5, published 2023-09-28)