# Import the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from sklearn.feature_selection import mutual_info_regression as MIC
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Read the data files (space-separated)
train = pd.read_csv('used_car_train_20200313.csv', sep=' ')
test = pd.read_csv('used_car_testB_20200421.csv', sep=' ')
print('train:{}'.format(train.shape))
print('test:{}'.format(test.shape))
train:(150000, 31)
test:(50000, 30)
# Inspect the training data
train.info()
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 31 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 SaleID 150000 non-null int64
1 name 150000 non-null int64
2 regDate 150000 non-null int64
3 model 149999 non-null float64
4 brand 150000 non-null int64
5 bodyType 145494 non-null float64
6 fuelType 141320 non-null float64
7 gearbox 144019 non-null float64
8 power 150000 non-null int64
9 kilometer 150000 non-null float64
10 notRepairedDamage 150000 non-null object
11 regionCode 150000 non-null int64
12 seller 150000 non-null int64
13 offerType 150000 non-null int64
14 creatDate 150000 non-null int64
15 price 150000 non-null int64
16 v_0 150000 non-null float64
17 v_1 150000 non-null float64
18 v_2 150000 non-null float64
19 v_3 150000 non-null float64
20 v_4 150000 non-null float64
21 v_5 150000 non-null float64
22 v_6 150000 non-null float64
23 v_7 150000 non-null float64
24 v_8 150000 non-null float64
25 v_9 150000 non-null float64
26 v_10 150000 non-null float64
27 v_11 150000 non-null float64
28 v_12 150000 non-null float64
29 v_13 150000 non-null float64
30 v_14 150000 non-null float64
dtypes: float64(20), int64(10), object(1)
memory usage: 35.5+ MB
# Preview the first five rows
train.head()
|   | SaleID | name | regDate | model | brand | bodyType | fuelType | gearbox | power | kilometer | ... | v_5 | v_6 | v_7 | v_8 | v_9 | v_10 | v_11 | v_12 | v_13 | v_14 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 736 | 20040402 | 30.0 | 6 | 1.0 | 0.0 | 0.0 | 60 | 12.5 | ... | 0.235676 | 0.101988 | 0.129549 | 0.022816 | 0.097462 | -2.881803 | 2.804097 | -2.420821 | 0.795292 | 0.914762 |
| 1 | 1 | 2262 | 20030301 | 40.0 | 1 | 2.0 | 0.0 | 0.0 | 0 | 15.0 | ... | 0.264777 | 0.121004 | 0.135731 | 0.026597 | 0.020582 | -4.900482 | 2.096338 | -1.030483 | -1.722674 | 0.245522 |
| 2 | 2 | 14874 | 20040403 | 115.0 | 15 | 1.0 | 0.0 | 0.0 | 163 | 12.5 | ... | 0.251410 | 0.114912 | 0.165147 | 0.062173 | 0.027075 | -4.846749 | 1.803559 | 1.565330 | -0.832687 | -0.229963 |
| 3 | 3 | 71865 | 19960908 | 109.0 | 10 | 0.0 | 0.0 | 1.0 | 193 | 15.0 | ... | 0.274293 | 0.110300 | 0.121964 | 0.033395 | 0.000000 | -4.509599 | 1.285940 | -0.501868 | -2.438353 | -0.478699 |
| 4 | 4 | 111080 | 20120103 | 110.0 | 5 | 1.0 | 0.0 | 0.0 | 68 | 5.0 | ... | 0.228036 | 0.073205 | 0.091880 | 0.078819 | 0.121534 | -1.896240 | 0.910783 | 0.931110 | 2.834518 | 1.923482 |
5 rows × 31 columns
# Inspect the test data
test.info()
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 30 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 SaleID 50000 non-null int64
1 name 50000 non-null int64
2 regDate 50000 non-null int64
3 model 50000 non-null float64
4 brand 50000 non-null int64
5 bodyType 48496 non-null float64
6 fuelType 47076 non-null float64
7 gearbox 48032 non-null float64
8 power 50000 non-null int64
9 kilometer 50000 non-null float64
10 notRepairedDamage 50000 non-null object
11 regionCode 50000 non-null int64
12 seller 50000 non-null int64
13 offerType 50000 non-null int64
14 creatDate 50000 non-null int64
15 v_0 50000 non-null float64
16 v_1 50000 non-null float64
17 v_2 50000 non-null float64
18 v_3 50000 non-null float64
19 v_4 50000 non-null float64
20 v_5 50000 non-null float64
21 v_6 50000 non-null float64
22 v_7 50000 non-null float64
23 v_8 50000 non-null float64
24 v_9 50000 non-null float64
25 v_10 50000 non-null float64
26 v_11 50000 non-null float64
27 v_12 50000 non-null float64
28 v_13 50000 non-null float64
29 v_14 50000 non-null float64
dtypes: float64(20), int64(9), object(1)
memory usage: 11.4+ MB
train['notRepairedDamage'].value_counts()
0.0 111361
- 24324
1.0 14315
Name: notRepairedDamage, dtype: int64
# Replace '-' with NaN and convert notRepairedDamage to a numeric feature
train['notRepairedDamage'] = train['notRepairedDamage'].replace('-', np.nan).astype('float')
train['notRepairedDamage'].value_counts()
0.0 111361
1.0 14315
Name: notRepairedDamage, dtype: int64
test['notRepairedDamage'] = test['notRepairedDamage'].replace('-', np.nan).astype('float')
test['notRepairedDamage'].value_counts()
0.0 37224
1.0 4707
Name: notRepairedDamage, dtype: int64
# The features with missing values are all categorical; inspect their class counts
print(train['model'].value_counts())
print(train['bodyType'].value_counts())
print(train['fuelType'].value_counts())
print(train['gearbox'].value_counts())
0.0 11762
19.0 9573
4.0 8445
1.0 6038
29.0 5186
...
240.0 2
209.0 2
245.0 2
242.0 2
247.0 1
Name: model, Length: 248, dtype: int64
0.0 41420
1.0 35272
2.0 30324
3.0 13491
4.0 9609
5.0 7607
6.0 6482
7.0 1289
Name: bodyType, dtype: int64
0.0 91656
1.0 46991
2.0 2212
3.0 262
4.0 118
5.0 45
6.0 36
Name: fuelType, dtype: int64
0.0 111623
1.0 32396
Name: gearbox, dtype: int64
# Collect the columns that contain missing values
col_train_null = train.columns[train.isnull().any()].to_list()
col_test_null = test.columns[test.isnull().any()].to_list()
print(col_train_null)
print(col_test_null)
['model', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage']
['bodyType', 'fuelType', 'gearbox', 'notRepairedDamage']
# Fill missing values with SimpleImputer (column mode)
imp = SimpleImputer(strategy='most_frequent')
train[col_train_null] = imp.fit_transform(train[col_train_null])
# Reuse the imputer fitted on train so test is filled with the same modes
test[col_train_null] = imp.transform(test[col_train_null])
# Confirm no missing values remain
train.isnull().any().sum()
0
test.isnull().any().sum()
0
# Check the date columns for invalid values
train['regDate'].astype('str').str[4:6].value_counts()
03 14949
06 13809
04 12798
05 12614
07 11937
10 11490
00 11347
11 10687
12 10637
09 10522
01 9943
08 9936
02 9331
Name: regDate, dtype: int64
# Some dates have month '00'; map it to January so the date can be parsed
def tran_date(x):
    month = int(x[4:6])
    if month == 0:
        month = 1
    return x[0:4] + '-' + str(month) + '-' + x[6:]
# Parse the date columns
train['regDate'] = pd.to_datetime(train['regDate'].astype('str').apply(tran_date))
test['regDate'] = pd.to_datetime(test['regDate'].astype('str').apply(tran_date))
train['creatDate'] = pd.to_datetime(train['creatDate'].astype('str').apply(tran_date))
test['creatDate'] = pd.to_datetime(test['creatDate'].astype('str').apply(tran_date))
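# Aside (not run here): the same fix without the row-wise apply — a minimal
# vectorized sketch, assuming regDate is still the raw 8-digit YYYYMMDD string
raw = pd.read_csv('used_car_train_20200313.csv', sep=' ')['regDate'].astype(str)
fixed = raw.str[:4] + raw.str[4:6].replace('00', '01') + raw.str[6:]
reg_date = pd.to_datetime(fixed, format='%Y%m%d')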
# Distribution of price (sns.distplot is deprecated; histplot is its modern replacement)
sns.histplot(train['price'], kde=True)
# Log-transform to correct the right skew
sns.histplot(np.log(train['price']), kde=True)
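# Aside: the log transform above is only used for visualization; the models below
# fit the raw price with an L1 objective. A common alternative (a sketch, with a
# deliberately tiny feature set just for illustration) is to train on
# log1p(price) and map predictions back with expm1:
X_demo = train[['power', 'kilometer']]
m = lgb.LGBMRegressor(n_estimators=100, random_state=42).fit(X_demo, np.log1p(train['price']))
price_demo = np.expm1(m.predict(X_demo))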
# Summary statistics of price
train['price'].describe([0.01,0.25,0.5,0.75,0.99])
count 150000.000000
mean 5923.327333
std 7501.998477
min 11.000000
1% 150.000000
25% 1300.000000
50% 3250.000000
75% 7700.000000
99% 34950.000000
max 99999.000000
Name: price, dtype: float64
# Mean price by registration year
train.resample('Y', on='regDate')['price'].mean().to_period('Y').plot(kind='bar')
# Feature engineering: car age and date components
train['diff_day'] = (train['creatDate'] - train['regDate']).dt.days
train['diff_year'] = round(train['diff_day'] / 365, 1)
train['regDate_year'] = train['regDate'].dt.year
train['regDate_month'] = train['regDate'].dt.month
train['regDate_day'] = train['regDate'].dt.day
train['creatDate_year'] = train['creatDate'].dt.year
train['creatDate_month'] = train['creatDate'].dt.month
train['creatDate_day'] = train['creatDate'].dt.day
test['diff_day'] = (test['creatDate'] - test['regDate']).dt.days
test['diff_year'] = round(test['diff_day'] / 365, 1)
test['regDate_year'] = test['regDate'].dt.year
test['regDate_month'] = test['regDate'].dt.month
test['regDate_day'] = test['regDate'].dt.day
test['creatDate_year'] = test['creatDate'].dt.year
test['creatDate_month'] = test['creatDate'].dt.month
test['creatDate_day'] = test['creatDate'].dt.day
# Check how many distinct values name has (too many to one-hot encode; it is converted to a count feature later)
train['name'].unique().shape
(99662,)
# Descriptive statistics for power
train['power'].describe()
count 150000.000000
mean 119.316547
std 177.168419
min 0.000000
25% 75.000000
50% 110.000000
75% 150.000000
max 19312.000000
Name: power, dtype: float64
# Descriptive statistics for model
train['model'].describe()
count 150000.000000
mean 47.128707
std 49.536024
min 0.000000
25% 10.000000
50% 30.000000
75% 66.000000
max 247.000000
Name: model, dtype: float64
# Bin power into 20-unit buckets ('bins' rather than 'bin' avoids shadowing the builtin);
# pd.cut returns NaN outside (0, 600], so power == 0 and power > 600 land in a catch-all bin 31
bins = [i * 20 for i in range(0, 31)]
train['power_bin'] = pd.cut(train['power'], bins, labels=False).fillna(31)
test['power_bin'] = pd.cut(test['power'], bins, labels=False).fillna(31)
# Bin model into 10-unit buckets; model == 0 falls outside (0, 250] and stays NaN,
# which the tree models below handle natively
bins_model = [i * 10 for i in range(0, 26)]
train['model_bin'] = pd.cut(train['model'], bins_model, labels=False)
test['model_bin'] = pd.cut(test['model'], bins_model, labels=False)
train['model_bin'].value_counts()
0.0 25963
1.0 21123
2.0 18095
4.0 14872
3.0 11069
6.0 8748
7.0 5193
5.0 4629
8.0 3879
10.0 3818
11.0 3376
9.0 2550
12.0 2417
16.0 2096
15.0 1993
17.0 1699
13.0 1623
14.0 1162
18.0 988
19.0 860
21.0 771
20.0 640
22.0 473
23.0 171
24.0 29
Name: model_bin, dtype: int64
# Identify the categorical features
col_clf = ['brand', 'bodyType', 'fuelType', 'gearbox', 'kilometer', 'notRepairedDamage', 'seller', 'offerType']
train[col_clf]
|   | brand | bodyType | fuelType | gearbox | kilometer | notRepairedDamage | seller | offerType |
|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 1.0 | 0.0 | 0.0 | 12.5 | 0.0 | 0 | 0 |
| 1 | 1 | 2.0 | 0.0 | 0.0 | 15.0 | 0.0 | 0 | 0 |
| 2 | 15 | 1.0 | 0.0 | 0.0 | 12.5 | 0.0 | 0 | 0 |
| 3 | 10 | 0.0 | 0.0 | 1.0 | 15.0 | 0.0 | 0 | 0 |
| 4 | 5 | 1.0 | 0.0 | 0.0 | 5.0 | 0.0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 149995 | 10 | 4.0 | 0.0 | 1.0 | 15.0 | 0.0 | 0 | 0 |
| 149996 | 11 | 0.0 | 0.0 | 0.0 | 10.0 | 0.0 | 0 | 0 |
| 149997 | 11 | 1.0 | 1.0 | 0.0 | 6.0 | 0.0 | 0 | 0 |
| 149998 | 10 | 3.0 | 1.0 | 0.0 | 15.0 | 0.0 | 0 | 0 |
| 149999 | 28 | 6.0 | 0.0 | 1.0 | 12.5 | 0.0 | 0 | 0 |
150000 rows × 8 columns
# Compare the categorical feature distributions between train and test
plt.figure(figsize=(18,10))
for i in range(len(col_clf)):
    plt.subplot(2, 4, i+1)
    train[col_clf[i]].value_counts().plot(kind='bar', color='yellow')
    test[col_clf[i]].value_counts().plot(kind='bar', color='blue')
    plt.title(col_clf[i])
plt.tight_layout()
# Two of the features are effectively single-valued, so drop seller and offerType
train = train.drop(['seller', 'offerType'], axis=1)
test = test.drop(['seller', 'offerType'], axis=1)
col_clf = ['brand', 'bodyType', 'fuelType', 'gearbox', 'kilometer', 'notRepairedDamage']
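# Aside: VarianceThreshold (imported at the top but otherwise unused) can flag
# such near-constant columns automatically — a sketch, where the 1e-4 threshold
# is an arbitrary choice and NaN-containing columns are dropped first because
# VarianceThreshold rejects NaN
selector = VarianceThreshold(threshold=1e-4)
num_cols = train.select_dtypes('number').dropna(axis=1)
selector.fit(num_cols)
print(num_cols.columns[~selector.get_support()].tolist())  # near-constant columns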
# Mean price within each category
plt.figure(figsize=(18,10))
for i in range(len(col_clf)):
    plt.subplot(2, 3, i+1)
    train.groupby(col_clf[i])['price'].mean().plot(kind='bar')
    plt.title(col_clf[i])
plt.tight_layout()
# Drop the ID column
train = train.drop(['SaleID'], axis=1)
test = test.drop(['SaleID'], axis=1)
# Correlation heatmap of the features
plt.figure(figsize=(10,10))
corr = train.corr(numeric_only=True)  # exclude the datetime columns
sns.heatmap(corr)
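# The shortlist of anonymous features combined later (v_0, v_3, v_8, v_12) is
# presumably read off this matrix — a sketch ranking features by |correlation|
# with price:
print(corr['price'].abs().sort_values(ascending=False).head(10))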
# Encode name by its frequency; transform broadcasts each group's count back to every row
train['name_count'] = train.groupby('name')['name'].transform('count')
test['name_count'] = test.groupby('name')['name'].transform('count')
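# Aside: counting separately on train and test gives the two sets different
# encodings for the same name. A sketch that builds one frequency table over
# the combined data instead:
name_freq = pd.concat([train['name'], test['name']]).value_counts()
train['name_count'] = train['name'].map(name_freq)
test['name_count'] = test['name'].map(name_freq)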
# Drop the raw name column
train = train.drop('name', axis=1)
test = test.drop('name', axis=1)
# Cross each categorical feature with price statistics (these are computed on
# the full training set — see the note after the shape check below)
col_clf = ['brand', 'model', 'kilometer', 'fuelType', 'bodyType']
for col in col_clf:
    train_gb = train.groupby(col)
    all_info = {}
    for kind, kind_data in train_gb:
        info = {}
        info[col + '_amount'] = len(kind_data)
        info[col + '_price_max'] = kind_data.price.max()
        info[col + '_price_median'] = kind_data.price.median()
        info[col + '_price_min'] = kind_data.price.min()
        info[col + '_price_sum'] = kind_data.price.sum()
        info[col + '_price_std'] = kind_data.price.std()
        # the +1 in the denominator lightly smooths the mean for rare categories
        info[col + '_price_average'] = round(kind_data.price.sum() / (len(kind_data) + 1), 2)
        all_info[kind] = info
    fe = pd.DataFrame(all_info).T.reset_index().rename(columns={'index': col})
    train = train.merge(fe, how='left', on=col)
    test = test.merge(fe, how='left', on=col)
print(train.shape)
print(test.shape)
(150000, 73)
(50000, 72)
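# Note: because the statistics above include each row's own price, some target
# information leaks into the features and the CV scores below will look slightly
# optimistic. An out-of-fold variant — a sketch for a single column ('brand')
# with a plain mean encoding:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof = pd.Series(index=train.index, dtype=float)
for tr_idx, val_idx in kf.split(train):
    fold_means = train.iloc[tr_idx].groupby('brand')['price'].mean()
    oof.iloc[val_idx] = train['brand'].iloc[val_idx].map(fold_means).values
train['brand_price_mean_oof'] = oof.fillna(train['price'].mean())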
# Aggregate power statistics within each kilometer value
# (passing a {new_name: func} dict to a SeriesGroupBy.agg was removed in modern
# pandas, so aggregate with a list of functions and rename afterwards)
col_kp = ['kilometer', 'power']
agg_funcs = ['count', 'max', 'median', 'min', 'sum', 'std', 'mean']
t1 = train.groupby(col_kp[0])[col_kp[1]].agg(agg_funcs)
t1.columns = [col_kp[0] + '_' + col_kp[1] + '_' + f for f in agg_funcs]
t1 = t1.reset_index()
train = train.merge(t1, how='left', on=col_kp[0])
test = test.merge(t1, how='left', on=col_kp[0])
print(train.shape)
print(test.shape)
(150000, 80)
(50000, 79)
# Combine the anonymous features most correlated with price into new features.
# The loops create some redundant columns — '3*0' mirrors '0*3', 'i+i' is just
# 2*v_i, and 'i-i' is identically zero; see the deduplicated sketch after the
# shape check below
col_v = [0, 3, 8, 12]
for i in col_v:
    for j in col_v:
        train[str(i)+'*'+str(j)] = train['v_'+str(i)] * train['v_'+str(j)]
        test[str(i)+'*'+str(j)] = test['v_'+str(i)] * test['v_'+str(j)]
for i in col_v:
    for j in col_v:
        train[str(i)+'+'+str(j)] = train['v_'+str(i)] + train['v_'+str(j)]
        test[str(i)+'+'+str(j)] = test['v_'+str(i)] + test['v_'+str(j)]
for i in col_v:
    for j in col_v:
        train[str(i)+'-'+str(j)] = train['v_'+str(i)] - train['v_'+str(j)]
        test[str(i)+'-'+str(j)] = test['v_'+str(i)] - test['v_'+str(j)]
for i in col_v:
    train[str(i)+'*diff_year'] = train['v_'+str(i)] * train['diff_year']
    test[str(i)+'*diff_year'] = test['v_'+str(i)] * test['diff_year']
print(train.shape)
print(test.shape)
(150000, 132)
(50000, 131)
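# Deduplicated sketch of the same idea: itertools.combinations visits each
# unordered pair exactly once, avoiding the mirrored and all-zero columns
from itertools import combinations
for i, j in combinations(col_v, 2):
    for name, op in [('*', np.multiply), ('+', np.add), ('-', np.subtract)]:
        train[f'{i}{name}{j}'] = op(train[f'v_{i}'], train[f'v_{j}'])
        test[f'{i}{name}{j}'] = op(test[f'v_{i}'], test[f'v_{j}'])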
# Deep copies, so the raw frames stay intact
train_new = train.copy(deep=True)
test_new = test.copy(deep=True)
# Drop features that will not be fed to the models
X_train = train_new.drop(['price', 'regDate', 'creatDate', 'regionCode'], axis=1)
X_test = test_new.drop(['regDate', 'creatDate', 'regionCode'], axis=1)
y_train = train_new['price']
# Checkpoint the matrices to disk and reload them
X_train.to_csv('X_train.csv')
X_test.to_csv('X_test.csv')
y_train.to_csv('y_train.csv')
X_train = pd.read_csv('X_train.csv', index_col=0)
X_test = pd.read_csv('X_test.csv', index_col=0)
y_train = pd.read_csv('y_train.csv', index_col=0)
y_train = np.ravel(y_train)  # read_csv returns a one-column DataFrame; flatten to 1-D
from sklearn.model_selection import GridSearchCV
from lightgbm.sklearn import LGBMRegressor
from time import time
import datetime
# LightGBM tuning: n_estimators
lgbm_scores = []
time0 = time()
for i in np.arange(200, 2001, 100):
    reg_lgbm = LGBMRegressor(learning_rate=0.1, n_estimators=i, objective='regression_l1', random_state=42)
    lgbm_score = cross_val_score(reg_lgbm, X_train, y_train, cv=3, scoring='neg_mean_absolute_error').mean()
    lgbm_scores.append(lgbm_score)
    print(time() - time0, lgbm_score)
print(max(lgbm_scores))
print(np.arange(200, 2001, 100)[np.argmax(lgbm_scores)])
plt.figure(figsize=(8,6))
plt.plot(np.arange(200, 2001, 100), lgbm_scores)
8.246474504470825 -594.8424586944835
19.095320463180542 -568.7034240842332
32.98492646217346 -552.3891983733455
48.85633111000061 -542.3964373457885
67.14593052864075 -535.7482170534481
87.76364278793335 -529.8703107609151
111.7964539527893 -525.7724715224499
137.1214382648468 -522.2536456032711
164.56334352493286 -519.7242720183268
194.57858514785767 -517.0013123928143
226.66670727729797 -515.3243156345435
261.38421607017517 -513.8374464322388
298.36878204345703 -512.2929007437376
337.6414248943329 -511.0992392114774
379.26119804382324 -510.01377926737086
423.065954208374 -508.88194129037237
468.8004615306854 -507.9469756382571
516.7109439373016 -506.9312361319216
567.1637334823608 -505.91707028368864
-505.91707028368864
2000
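# Aside: the MAE is still improving at 2000 trees. Instead of sweeping
# n_estimators by brute force, LightGBM's early stopping can pick the tree count
# on a held-out split in a single run — a sketch, where the 5000 cap and the
# 100-round patience are arbitrary choices:
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
reg_es = LGBMRegressor(learning_rate=0.1, n_estimators=5000, objective='regression_l1', random_state=42)
reg_es.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(stopping_rounds=100)])
print(reg_es.best_iteration_)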
# LightGBM tuning: max_depth and num_leaves
parameters = {
    'max_depth': [4, 5, 6, 7],
    'num_leaves': np.arange(5, 100, 5)
}
reg_lgbm = LGBMRegressor(learning_rate=0.1, n_estimators=2000, objective='regression_l1', random_state=42)
gs = GridSearchCV(reg_lgbm, param_grid=parameters, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)
gs_model = gs.fit(X_train, y_train)
print('Best score: {}'.format(gs_model.best_score_))
print('Best params: {}'.format(gs_model.best_params_))
print('Best estimator: {}'.format(gs_model.best_estimator_))
Best score: -500.9400071901773
Best params: {'max_depth': 7, 'num_leaves': 45}
Best estimator: LGBMRegressor(max_depth=7, n_estimators=2000, num_leaves=45,
objective='regression_l1', random_state=42)
# XGBoost tuning: learning_rate
xgb_scores = []
time0 = time()
for i in np.arange(0.05, 0.31, 0.05):
    reg_xgb = xgb.XGBRegressor(n_estimators=200, learning_rate=i)
    xgb_score = cross_val_score(reg_xgb, X_train, y_train, cv=3, scoring='neg_mean_absolute_error').mean()
    xgb_scores.append(xgb_score)
    print(time() - time0)
print(max(xgb_scores))
print(np.arange(0.05, 0.31, 0.05)[np.argmax(xgb_scores)])
plt.figure(figsize=(8,6))
plt.plot(np.arange(0.05, 0.31, 0.05), xgb_scores)
132.54270577430725
268.7969219684601
402.8747355937958
543.0700986385345
673.9736497402191
807.1916081905365
-559.3930964745617
0.15000000000000002
# XGBoost tuning: max_depth
xgb_scores = []
time0 = time()
for i in np.arange(5, 12, 1):
    reg_xgb = xgb.XGBRegressor(n_estimators=200, learning_rate=0.15, max_depth=i)
    xgb_score = cross_val_score(reg_xgb, X_train, y_train, cv=3, scoring='neg_mean_absolute_error').mean()
    xgb_scores.append(xgb_score)
    print(time() - time0, xgb_score)
print(max(xgb_scores))
print(np.arange(5, 12, 1)[np.argmax(xgb_scores)])
plt.figure(figsize=(8,6))
plt.plot(np.arange(5, 12, 1), xgb_scores)
106.13332343101501 -581.7825425249935
233.66106414794922 -559.3930964745617
386.93016719818115 -545.8423732084185
577.3439819812775 -540.0337358052888
789.385425567627 -535.3663493749481
1027.0641367435455 -537.2171228026253
1293.055543422699 -540.2481429747703
-535.3663493749481
9
# XGBoost tuning: colsample_bytree
xgb_scores = []
time0 = time()
for i in np.arange(0.4, 0.8, 0.1):
    reg_xgb = xgb.XGBRegressor(n_estimators=200, learning_rate=0.15, max_depth=9, colsample_bytree=i)
    xgb_score = cross_val_score(reg_xgb, X_train, y_train, cv=3, scoring='neg_mean_absolute_error').mean()
    xgb_scores.append(xgb_score)
    print(time() - time0, xgb_score)
print(max(xgb_scores))
print(np.arange(0.4, 0.8, 0.1)[np.argmax(xgb_scores)])
plt.figure(figsize=(8,6))
plt.plot(np.arange(0.4, 0.8, 0.1), xgb_scores)
101.24388527870178 -540.0945292119669
219.8485279083252 -536.1881194847441
358.25814485549927 -534.7100199133007
509.0920376777649 -534.7369636623599
-534.7100199133007
0.6
# XGBoost tuning: colsample_bylevel
# Note: np.arange(0.5, 1.1, 0.1) overshoots to ~1.1 because of the float step;
# colsample_bylevel must lie in (0, 1], so the last fit fails and scores nan
xgb_scores = []
time0 = time()
for i in np.arange(0.5, 1.1, 0.1):
    reg_xgb = xgb.XGBRegressor(n_estimators=200, learning_rate=0.15, max_depth=9, colsample_bytree=0.6, colsample_bylevel=i)
    xgb_score = cross_val_score(reg_xgb, X_train, y_train, cv=3, scoring='neg_mean_absolute_error').mean()
    xgb_scores.append(xgb_score)
    print(time() - time0, xgb_score)
print(max(xgb_scores))
print(np.arange(0.5, 1.1, 0.1)[np.argmax(xgb_scores)])
plt.figure(figsize=(8,6))
plt.plot(np.arange(0.5, 1.1, 0.1), xgb_scores)
82.85519623756409 -534.1242236725466
176.23594546318054 -535.4707890065283
279.6718213558197 -534.5832091972042
391.7265202999115 -533.988677477093
518.4175012111664 -533.3711266578522
656.7450432777405 -534.7100199133007
657.538763999939 nan
-533.3711266578522
1.0999999999999999
# sklearn's built-in stacking ensemble
from sklearn.ensemble import StackingRegressor
# Instantiate the tuned base models; for colsample_bylevel we take 0.9, the
# setting behind the best valid score above (np.argmax reported ~1.1 only
# because the nan from the out-of-range fit propagates through it)
reg_lgbm = LGBMRegressor(max_depth=7, n_estimators=2000, num_leaves=45, objective='regression_l1', random_state=42)
reg_xgb = xgb.XGBRegressor(n_estimators=200, learning_rate=0.15, max_depth=9, colsample_bytree=0.6, colsample_bylevel=0.9)
# Stack them; StackingRegressor's default final_estimator is RidgeCV
estimators = [('lgbm', reg_lgbm), ('xgb', reg_xgb)]
sr = StackingRegressor(estimators, verbose=True)
sr_scores = cross_val_score(sr, X_train, y_train, cv=3, scoring='neg_mean_absolute_error')
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 1.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 2.9min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 1.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 2.8min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 1.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 2.8min finished
# Stacking CV scores: mean MAE ≈ 491, an improvement over the best single LightGBM (≈ 501)
sr_scores
array([-491.53751775, -494.7186037 , -486.69418657])
# Fit on the full training set and export the predictions
sr.fit(X_train, y_train)
sr_predict = sr.predict(X_test)
pd.DataFrame(sr_predict).to_csv('stack_submit.csv')
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 1.8min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 4.1min finished
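# Aside: the dump above writes no SaleID column, while a competition submission
# normally pairs each prediction with its ID. A sketch that re-reads the IDs
# from the raw test file (assuming the usual SaleID/price submission format):
ids = pd.read_csv('used_car_testB_20200421.csv', sep=' ')['SaleID']
pd.DataFrame({'SaleID': ids, 'price': sr_predict}).to_csv('stack_submit_with_id.csv', index=False)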