1 Basic Code

1.1 Import the Required Libraries

import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")
1.2 Read the Data

train_data = pd.read_csv('./data/train_all.csv', nrows=10000)
test_data = pd.read_csv('./data/test_all.csv', nrows=100)
1.3 Build the Training and Test Sets

features_columns = [col for col in train_data.columns if col not in ['user_id', 'label']]
train = train_data[features_columns].values
test = test_data[features_columns].values
target = train_data['label'].values
2 Missing-Value Imputation

There are many ways to handle missing values; the most common are the following:

Deletion. This is an option when the dataset is large or when the proportion of missing values is small.

Filling. The usual approach is to fill with the mean or median; interpolation or model-based prediction can also be used to complete the missing values.

Doing nothing. Tree-based models are not sensitive to missing values.
Imputing with the mean:

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(train)
train_imputer = imputer.transform(train)
test_imputer = imputer.transform(test)
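The list above also mentions the median; a minimal sketch of the same fit/transform pattern with strategy='median', which is more robust to outliers (the variable names here are illustrative, and interpolation or model-based imputation would follow the same pattern):

from sklearn.impute import SimpleImputer

median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
# fit on the training data only, then apply the same statistics to the test data
train_imputer_med = median_imputer.fit(train).transform(train)
test_imputer_med = median_imputer.transform(test)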
3 Feature Selection

Below we apply the methods introduced earlier to perform feature selection, and use the following helper to compare model performance before and after selection.
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

def feature_selection(train, train_sel, target):
    clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0, n_jobs=-1)
    scores = cross_val_score(clf, train, target, cv=5)
    scores_sel = cross_val_score(clf, train_sel, target, cv=5)
    print("No Select Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    print("Features Select Accuracy: %0.2f (+/- %0.2f)" % (scores_sel.mean(), scores_sel.std() * 2))
3.1 Removing Features with Low Variance

VarianceThreshold is a simple baseline approach to feature selection. It removes all features whose variance does not meet a given threshold. By default, it removes all zero-variance features, i.e. features that have the same value in every sample.
from sklearn.feature_selection import VarianceThreshold

# .8 * (1 - .8) is the variance of a Boolean feature that takes one value
# in 80% of the samples
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
sel = sel.fit(train)
train_sel = sel.transform(train)
test_sel = sel.transform(test)
print('Train shape before feature selection', train.shape)
print('Train shape after feature selection', train_sel.shape)
Train shape before feature selection (8455, 229)
Train shape after feature selection (8455, 24)
Performance before and after feature selection:

feature_selection(train, train_sel, target)
No Select Accuracy: 0.93 (+/- 0.00)
Features Select Accuracy: 0.93 (+/- 0.00)
3.2 Univariate Feature Selection

Univariate feature selection picks the best features based on univariate statistical tests.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

# keep the 2 features with the highest mutual information with the target
sel = SelectKBest(mutual_info_classif, k=2)
sel = sel.fit(train, target)
train_sel = sel.transform(train)
test_sel = sel.transform(test)
print('Train shape before feature selection', train.shape)
print('Train shape after feature selection', train_sel.shape)
Train shape before feature selection (8455, 229)
Train shape after feature selection (8455, 2)
# keep the top 10 features instead
sel = SelectKBest(mutual_info_classif, k=10)
sel = sel.fit(train, target)
train_sel = sel.transform(train)
test_sel = sel.transform(test)
print('Train shape before feature selection', train.shape)
print('Train shape after feature selection', train_sel.shape)
Train shape before feature selection (8455, 229)
Train shape after feature selection (8455, 10)
Performance before and after feature selection:

feature_selection(train, train_sel, target)
No Select Accuracy: 0.93 (+/- 0.00)
Features Select Accuracy: 0.93 (+/- 0.00)
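mutual_info_classif is only one of the available scoring functions for SelectKBest; a brief sketch using the ANOVA F-test scorer f_classif instead (chi2 is another option when all features are non-negative):

from sklearn.feature_selection import SelectKBest, f_classif

sel_f = SelectKBest(f_classif, k=10)
train_sel_f = sel_f.fit_transform(train, target)  # same shape as the mutual-information variant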
3.3 Recursive Feature Elimination

Recursive feature elimination selects features by recursively training models: it first trains a model on the full feature set, then repeatedly eliminates the lowest-scoring features until a target number of features remains. The RFECV variant used below additionally chooses that number via cross-validation.
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=10, max_depth=2, random_state=0, n_jobs=-1)
# eliminate one feature per step; the optimal feature count is picked by 2-fold CV
selector = RFECV(clf, step=1, cv=2)
selector = selector.fit(train, target)
print(selector.support_)   # boolean mask of the selected features
print(selector.ranking_)   # rank 1 marks a selected feature
[False False False ... False  True  True]
[228 227 226 ...  23   1   1]

(Output abridged: of the 229 features, only the last two are selected, i.e. have rank 1 in ranking_.)
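The description above, eliminating features until a preset count remains, corresponds to plain RFE; RFECV adds cross-validation on top to choose that count automatically. A minimal sketch of the fixed-count variant, where n_features_to_select=20 is an arbitrary illustrative value:

from sklearn.feature_selection import RFE

# reuse the random forest defined above as the scoring estimator
rfe = RFE(clf, n_features_to_select=20, step=1)
rfe = rfe.fit(train, target)
train_sel_rfe = rfe.transform(train)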
3.4 Model-Based Feature Selection

Select variables using the coefficients of a fitted logistic regression with an L2 penalty: features whose fitted weights indicate a large influence on the target are kept.
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Normalizer

# Normalizer rescales each sample (row) to unit norm before fitting
normalizer = Normalizer()
normalizer = normalizer.fit(train)
train_norm = normalizer.transform(train)
test_norm = normalizer.transform(test)

LR = LogisticRegression(penalty='l2', C=5)
LR = LR.fit(train_norm, target)
# by default, features whose coefficient magnitude exceeds the mean are kept
model = SelectFromModel(LR, prefit=True)
train_sel = model.transform(train)
test_sel = model.transform(test)
print('Train shape before feature selection', train.shape)
print('Train shape after feature selection', train_sel.shape)
Train shape before feature selection (8455, 229)
Train shape after feature selection (8455, 19)
Coefficients fitted under the L2 penalty (first 10 shown):
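The array below presumably comes from inspecting the first ten fitted weights; the exact command is not shown in the original, so this line is an assumption:

LR.coef_[0][:10]  # first 10 L2-regularized coefficients (assumed source of the output below)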
array([ 0.23210864, 0.03214927, -0.00939419, 0.85088717, -0.91507123,
-0.26081965, -0.86681364, 0.57445561, 0.73849952, 0.00342517])
Performance before and after feature selection:

feature_selection(train, train_sel, target)
No Select Accuracy: 0.93 (+/- 0.00)
Features Select Accuracy: 0.93 (+/- 0.00)
Variables can also be selected using the coefficients of an L1-penalized logistic regression: the L1 penalty drives many coefficients exactly to zero, so only the features with non-zero weights, those with the largest influence on the target, are kept.
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Normalizer

normalizer = Normalizer()
normalizer = normalizer.fit(train)
train_norm = normalizer.transform(train)
test_norm = normalizer.transform(test)

# the liblinear solver supports the L1 penalty, which zeroes out weak coefficients
LR = LogisticRegression(penalty='l1', C=5, solver='liblinear')
LR = LR.fit(train_norm, target)
model = SelectFromModel(LR, prefit=True)
train_sel = model.transform(train)
test_sel = model.transform(test)
print('Train shape before feature selection', train.shape)
print('Train shape after feature selection', train_sel.shape)
Train shape before feature selection (8455, 229)
Train shape after feature selection (8455, 12)
Coefficients fitted under the L1 penalty (first 10 shown). With a well-chosen regularization strength α and under certain conditions, the Lasso can exactly recover the full set of non-zero variables from relatively few observations.
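As above, the array below presumably comes from inspecting the fitted weights; the indexing is an assumption:

LR.coef_[0][:10]  # first 10 L1-regularized coefficients; most are exactly zero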
array([0.16879959, 0. , 0. , 0.56714802, 0. ,
0. , 0. , 0.78078353, 0. , 0. ])
Performance before and after feature selection:

feature_selection(train, train_sel, target)
No Select Accuracy: 0.93 (+/- 0.00)
Features Select Accuracy: 0.93 (+/- 0.00)
3.5 Tree-Based Feature Selection

Tree models rank features by the total score each feature accumulates under the split criterion, and features are then filtered according to this ranking.
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

clf = ExtraTreesClassifier(n_estimators=50)
clf = clf.fit(train, target)
# by default, features whose importance exceeds the mean importance are kept
model = SelectFromModel(clf, prefit=True)
train_sel = model.transform(train)
test_sel = model.transform(test)
print('Train shape before feature selection', train.shape)
print('Train shape after feature selection', train_sel.shape)
Train shape before feature selection (8455, 229)
Train shape after feature selection (8455, 72)
Tree feature importances:

clf.feature_importances_[:10]
array([0.08131766, 0.01536015, 0.00893797, 0.01597656, 0.01636607,
0.01680214, 0.01653297, 0.01548492, 0.01723172, 0.00725235])
df_features_import = pd.DataFrame()
df_features_import['features_import'] = clf.feature_importances_
df_features_import['features_name'] = features_columns
df_features_import.sort_values(['features_import'], ascending=False).head(30)
     features_import  features_name
0           0.081318  merchant_id
228         0.075728  xgb_clf
227         0.067072  lgb_clf
20          0.018029  brand_most_1_cnt
18          0.017616  seller_most_1_cnt
14          0.017317  seller_most_1
8           0.017232  time_stamp_nunique
21          0.017172  action_type_1_cnt
15          0.017125  cat_most_1
26          0.016890  seller_nunique_0
5           0.016802  cat_nunique
12          0.016614  time_stamp_std
6           0.016533  brand_nunique
4           0.016366  seller_nunique
3           0.015977  user_cnt
16          0.015876  brand_most_1
22          0.015715  user_cnt_0
24          0.015566  user_cnt_2
7           0.015485  item_nunique
1           0.015360  age_range
19          0.015145  cat_most_1_cnt
25          0.014939  user_cnt_3
23          0.014691  user_cnt_1
87          0.009245  tfidf_60
2           0.008938  gender
86          0.008607  tfidf_59
9           0.007252  action_type_nunique
42          0.006752  tfidf_15
30          0.006479  tfidf_3
37          0.006460  tfidf_10
Performance before and after feature selection:

feature_selection(train, train_sel, target)
No Select Accuracy: 0.93 (+/- 0.00)
Features Select Accuracy: 0.93 (+/- 0.00)
3.6 LightGBM Feature Importance

Use a LightGBM model for feature selection:
import lightgbm
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.4, random_state=0)

clf = lightgbm
train_matrix = clf.Dataset(X_train, label=y_train)
test_matrix = clf.Dataset(X_test, label=y_test)
params = {
    'boosting_type': 'gbdt',
    # the binary label is treated here as a 2-class multiclass problem
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'num_class': 2,
    'min_child_weight': 1.5,
    'num_leaves': 2 ** 5,
    'lambda_l2': 10,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'learning_rate': 0.03,
    'seed': 2017,
}
num_round = 10000
early_stopping_rounds = 100
model = clf.train(params, train_matrix, num_round, valid_sets=test_matrix,
                  callbacks=[lightgbm.early_stopping(stopping_rounds=early_stopping_rounds)])
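Since the label is binary, an equivalent and more conventional configuration would use LightGBM's binary objective; a sketch, not part of the original pipeline:

# num_class is only valid for multiclass objectives, so it is dropped here
params_binary = dict(params, objective='binary', metric='binary_logloss')
params_binary.pop('num_class', None)
model_binary = clf.train(params_binary, train_matrix, num_round, valid_sets=test_matrix,
                         callbacks=[lightgbm.early_stopping(stopping_rounds=early_stopping_rounds)])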
def lgb_transform(train, test, model, topK):
    train_df = pd.DataFrame(train)
    train_df.columns = range(train.shape[1])
    test_df = pd.DataFrame(test)
    test_df.columns = range(test.shape[1])

    # rank the columns by LightGBM feature importance and keep the top K
    features_import = pd.DataFrame()
    features_import['importance'] = model.feature_importance()
    features_import['col'] = range(train.shape[1])
    features_import = features_import.sort_values(['importance'], ascending=False).head(topK)
    sel_col = list(features_import.col)

    train_sel = train_df[sel_col]
    test_sel = test_df[sel_col]
    return train_sel, test_sel
train_sel, test_sel = lgb_transform(train, test, model, 20)
print('Train shape before feature selection', train.shape)
print('Train shape after feature selection', train_sel.shape)
Train shape before feature selection (8455, 229)
Train shape after feature selection (8455, 20)
LightGBM feature importances:

model.feature_importance()[:10]
array([ 85, 29, 7, 69, 103, 98, 68, 40, 124, 2])
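By default, feature_importance() counts how many times each feature is used in a split; the Booster API also exposes gain-based importance, which weights each split by the loss reduction it achieves and is often more informative:

gain_importance = model.feature_importance(importance_type='gain')
print(gain_importance[:10])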
Performance before and after feature selection:

feature_selection(train, train_sel, target)
No Select Accuracy: 0.93 (+/- 0.00)
Features Select Accuracy: 0.93 (+/- 0.00)