阿里云天池竞赛——二手车价格预测项目(个人练习+源代码)

# 导入需要的库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from sklearn.feature_selection import mutual_info_regression as MIC

import xgboost as xgb
import lightgbm as lgb

from sklearn.metrics import mean_squared_error, mean_absolute_error
# Load the space-separated training and test files, then report their shapes.
train, test = (
    pd.read_csv(path, sep=' ')
    for path in ('used_car_train_20200313.csv', 'used_car_testB_20200421.csv')
)

print('train:{}'.format(train.shape))
print('test:{}'.format(test.shape))
train:(150000, 31)
test:(50000, 30)
# Inspect the training set: column names, non-null counts and dtypes.
train.info()

RangeIndex: 150000 entries, 0 to 149999
Data columns (total 31 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   SaleID             150000 non-null  int64  
 1   name               150000 non-null  int64  
 2   regDate            150000 non-null  int64  
 3   model              149999 non-null  float64
 4   brand              150000 non-null  int64  
 5   bodyType           145494 non-null  float64
 6   fuelType           141320 non-null  float64
 7   gearbox            144019 non-null  float64
 8   power              150000 non-null  int64  
 9   kilometer          150000 non-null  float64
 10  notRepairedDamage  150000 non-null  object 
 11  regionCode         150000 non-null  int64  
 12  seller             150000 non-null  int64  
 13  offerType          150000 non-null  int64  
 14  creatDate          150000 non-null  int64  
 15  price              150000 non-null  int64  
 16  v_0                150000 non-null  float64
 17  v_1                150000 non-null  float64
 18  v_2                150000 non-null  float64
 19  v_3                150000 non-null  float64
 20  v_4                150000 non-null  float64
 21  v_5                150000 non-null  float64
 22  v_6                150000 non-null  float64
 23  v_7                150000 non-null  float64
 24  v_8                150000 non-null  float64
 25  v_9                150000 non-null  float64
 26  v_10               150000 non-null  float64
 27  v_11               150000 non-null  float64
 28  v_12               150000 non-null  float64
 29  v_13               150000 non-null  float64
 30  v_14               150000 non-null  float64
dtypes: float64(20), int64(10), object(1)
memory usage: 35.5+ MB
# Show the first five rows of the training set.
train.head()
SaleID name regDate model brand bodyType fuelType gearbox power kilometer ... v_5 v_6 v_7 v_8 v_9 v_10 v_11 v_12 v_13 v_14
0 0 736 20040402 30.0 6 1.0 0.0 0.0 60 12.5 ... 0.235676 0.101988 0.129549 0.022816 0.097462 -2.881803 2.804097 -2.420821 0.795292 0.914762
1 1 2262 20030301 40.0 1 2.0 0.0 0.0 0 15.0 ... 0.264777 0.121004 0.135731 0.026597 0.020582 -4.900482 2.096338 -1.030483 -1.722674 0.245522
2 2 14874 20040403 115.0 15 1.0 0.0 0.0 163 12.5 ... 0.251410 0.114912 0.165147 0.062173 0.027075 -4.846749 1.803559 1.565330 -0.832687 -0.229963
3 3 71865 19960908 109.0 10 0.0 0.0 1.0 193 15.0 ... 0.274293 0.110300 0.121964 0.033395 0.000000 -4.509599 1.285940 -0.501868 -2.438353 -0.478699
4 4 111080 20120103 110.0 5 1.0 0.0 0.0 68 5.0 ... 0.228036 0.073205 0.091880 0.078819 0.121534 -1.896240 0.910783 0.931110 2.834518 1.923482

5 rows × 31 columns

# Inspect the test set: column names, non-null counts and dtypes.
test.info()

RangeIndex: 50000 entries, 0 to 49999
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   SaleID             50000 non-null  int64  
 1   name               50000 non-null  int64  
 2   regDate            50000 non-null  int64  
 3   model              50000 non-null  float64
 4   brand              50000 non-null  int64  
 5   bodyType           48496 non-null  float64
 6   fuelType           47076 non-null  float64
 7   gearbox            48032 non-null  float64
 8   power              50000 non-null  int64  
 9   kilometer          50000 non-null  float64
 10  notRepairedDamage  50000 non-null  object 
 11  regionCode         50000 non-null  int64  
 12  seller             50000 non-null  int64  
 13  offerType          50000 non-null  int64  
 14  creatDate          50000 non-null  int64  
 15  v_0                50000 non-null  float64
 16  v_1                50000 non-null  float64
 17  v_2                50000 non-null  float64
 18  v_3                50000 non-null  float64
 19  v_4                50000 non-null  float64
 20  v_5                50000 non-null  float64
 21  v_6                50000 non-null  float64
 22  v_7                50000 non-null  float64
 23  v_8                50000 non-null  float64
 24  v_9                50000 non-null  float64
 25  v_10               50000 non-null  float64
 26  v_11               50000 non-null  float64
 27  v_12               50000 non-null  float64
 28  v_13               50000 non-null  float64
 29  v_14               50000 non-null  float64
dtypes: float64(20), int64(9), object(1)
memory usage: 11.4+ MB
# Count the distinct raw values of notRepairedDamage in the training set.
train['notRepairedDamage'].value_counts()
0.0    111361
-       24324
1.0     14315
Name: notRepairedDamage, dtype: int64
# Same value count again, taken right before the column is cleaned.
train['notRepairedDamage'].value_counts()
0.0    111361
-       24324
1.0     14315
Name: notRepairedDamage, dtype: int64
# Replace the '-' placeholder with NaN and cast notRepairedDamage to float,
# then re-count the remaining values.
train['notRepairedDamage'] = train['notRepairedDamage'].replace('-', np.nan).astype('float')
train['notRepairedDamage'].value_counts()
0.0    111361
1.0     14315
Name: notRepairedDamage, dtype: int64
# Apply the same '-' -> NaN replacement and float cast to the test set.
test['notRepairedDamage'] = test['notRepairedDamage'].replace('-', np.nan).astype('float')
test['notRepairedDamage'].value_counts()
0.0    37224
1.0     4707
Name: notRepairedDamage, dtype: int64
# Print the value distribution of each feature that has missing values;
# they all turn out to be categorical.
for feature in ('model', 'bodyType', 'fuelType', 'gearbox'):
    print(train[feature].value_counts())
0.0      11762
19.0      9573
4.0       8445
1.0       6038
29.0      5186
         ...  
240.0        2
209.0        2
245.0        2
242.0        2
247.0        1
Name: model, Length: 248, dtype: int64
0.0    41420
1.0    35272
2.0    30324
3.0    13491
4.0     9609
5.0     7607
6.0     6482
7.0     1289
Name: bodyType, dtype: int64
0.0    91656
1.0    46991
2.0     2212
3.0      262
4.0      118
5.0       45
6.0       36
Name: fuelType, dtype: int64
0.0    111623
1.0     32396
Name: gearbox, dtype: int64
# Collect the names of the columns that contain at least one missing value.
train_has_null = train.isnull().any()
test_has_null = test.isnull().any()
col_train_null = train.columns[train_has_null].to_list()
col_test_null = test.columns[test_has_null].to_list()

print(col_train_null)
print(col_test_null)
['model', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage']
['bodyType', 'fuelType', 'gearbox', 'notRepairedDamage']
# Fill missing values with the most frequent value of each column.
imp = SimpleImputer(strategy='most_frequent')

# Learn the fill values from the training set only ...
train[col_train_null] = imp.fit_transform(train[col_train_null])
# ... and reuse them for the test set. Re-fitting on the test set would let
# the two sets be filled with different modes for the same column.
test[col_train_null] = imp.transform(test[col_train_null])
# Count the training-set columns that still contain missing values.
train.isnull().any().sum()
0
# Count the test-set columns that still contain missing values.
test.isnull().any().sum()
0
# Check regDate for anomalies: count rows by the two characters at positions
# 4:6 of the date string (the month part when the value is YYYYMMDD).
train['regDate'].astype('str').str[4:6].value_counts()
03    14949
06    13809
04    12798
05    12614
07    11937
10    11490
00    11347
11    10687
12    10637
09    10522
01     9943
08     9936
02     9331
Name: regDate, dtype: int64
# Helper that repairs date strings whose month part is zero.
def tran_date(x):
    """Convert a ``YYYYMMDD`` string to ``YYYY-M-DD``, mapping month 0 to 1.

    The month is emitted without zero padding; the year and day parts are
    copied through unchanged.
    """
    year_part, day_part = x[0:4], x[6:]
    # A month of 0 is falsy and falls back to 1; any other value is kept.
    month = int(x[4:6]) or 1
    return '-'.join([year_part, str(month), day_part])
# Convert both date columns of both frames to datetime, repairing
# zero months through tran_date first.
for frame in (train, test):
    for date_col in ('regDate', 'creatDate'):
        as_text = frame[date_col].astype('str')
        frame[date_col] = pd.to_datetime(as_text.apply(tran_date))
# Density plot of the price column.
# NOTE(review): distplot is deprecated in newer seaborn releases - confirm the
# installed version still provides it.
sns.distplot(train['price'])

阿里云天池竞赛——二手车价格预测项目(个人练习+源代码)_第1张图片

# Plot the natural log of price to reduce the skew of the distribution.
sns.distplot(np.log(train['price']))

阿里云天池竞赛——二手车价格预测项目(个人练习+源代码)_第2张图片

# Summary statistics of price at the 1%, 25%, 50%, 75% and 99% percentiles.
train['price'].describe([0.01,0.25,0.5,0.75,0.99])
count    150000.000000
mean       5923.327333
std        7501.998477
min          11.000000
1%          150.000000
25%        1300.000000
50%        3250.000000
75%        7700.000000
99%       34950.000000
max       99999.000000
Name: price, dtype: float64
# Mean price per registration year (regDate resampled yearly), as a bar chart.
train.resample('Y', on='regDate')['price'].mean().to_period('Y').plot(kind='bar')

阿里云天池竞赛——二手车价格预测项目(个人练习+源代码)_第3张图片

# Derive date features identically for the training and test sets.
for frame in (train, test):
    # Days (and rounded years) between registration and listing creation.
    frame['diff_day'] = (frame['creatDate'] - frame['regDate']).dt.days
    frame['diff_year'] = round(frame['diff_day'] / 365, 1)
    # Calendar components of both dates, regDate first, then creatDate.
    for date_col in ('regDate', 'creatDate'):
        parts = frame[date_col].dt
        frame[date_col + '_year'] = parts.year
        frame[date_col + '_month'] = parts.month
        frame[date_col + '_day'] = parts.day
# Number of distinct values in the name feature.
train['name'].unique().shape
(99662,)
# Descriptive statistics of power.
train['power'].describe()
count    150000.000000
mean        119.316547
std         177.168419
min           0.000000
25%          75.000000
50%         110.000000
75%         150.000000
max       19312.000000
Name: power, dtype: float64
# Descriptive statistics of model.
train['model'].describe()
count    150000.000000
mean         47.128707
std          49.536024
min           0.000000
25%          10.000000
50%          30.000000
75%          66.000000
max         247.000000
Name: model, dtype: float64
# Bin power into 20-wide intervals with edges 0, 20, ..., 600.
# The edge list used to be called `bin`, which shadowed the builtin bin().
power_bins = [i*20 for i in range(0,31)]

# Values that fall outside every interval get NaN from pd.cut and are mapped
# to the extra code 31.
# NOTE(review): intervals are right-closed, so power == 0 is also outside the
# first bin (0, 20] and ends up in code 31 together with power > 600 -
# confirm that grouping is intended.
train['power_bin'] = pd.cut(train['power'], power_bins, labels=False).fillna(31)
test['power_bin'] = pd.cut(test['power'], power_bins, labels=False).fillna(31)
# Bin model into 10-wide intervals with edges 0, 10, ..., 250.
# NOTE(review): intervals are right-closed, so model == 0 falls outside the
# first bin and stays NaN in model_bin - confirm this is intended.
bin_model = list(range(0, 251, 10))
for frame in (train, test):
    frame['model_bin'] = pd.cut(frame['model'], bin_model, labels=False)
train['model_bin'].value_counts()
0.0     25963
1.0     21123
2.0     18095
4.0     14872
3.0     11069
6.0      8748
7.0      5193
5.0      4629
8.0      3879
10.0     3818
11.0     3376
9.0      2550
12.0     2417
16.0     2096
15.0     1993
17.0     1699
13.0     1623
14.0     1162
18.0      988
19.0      860
21.0      771
20.0      640
22.0      473
23.0      171
24.0       29
Name: model_bin, dtype: int64
# Features treated as categorical, followed by a preview of those columns.
col_clf = ['brand', 'bodyType', 'fuelType', 'gearbox', 'kilometer', 'notRepairedDamage', 'seller', 'offerType']
train[col_clf]
brand bodyType fuelType gearbox kilometer notRepairedDamage seller offerType
0 6 1.0 0.0 0.0 12.5 0.0 0 0
1 1 2.0 0.0 0.0 15.0 0.0 0 0
2 15 1.0 0.0 0.0 12.5 0.0 0 0
3 10 0.0 0.0 1.0 15.0 0.0 0 0
4 5 1.0 0.0 0.0 5.0 0.0 0 0
... ... ... ... ... ... ... ... ...
149995 10 4.0 0.0 1.0 15.0 0.0 0 0
149996 11 0.0 0.0 0.0 10.0 0.0 0 0
149997 11 1.0 1.0 0.0 6.0 0.0 0 0
149998 10 3.0 1.0 0.0 15.0 0.0 0 0
149999 28 6.0 0.0 1.0 12.5 0.0 0 0

150000 rows × 8 columns

# Compare the distribution of each categorical feature in train (yellow)
# and test (blue), one subplot per feature.
plt.figure(figsize=(18,10))
for position, feature in enumerate(col_clf, start=1):
    plt.subplot(2, 4, position)
    # value_counts() orders by frequency, which can differ between the two
    # sets; bar plots are positional, so unsorted series would overlay
    # different categories at the same x position. Sort train by category
    # and align test to the same categories (0 where a category is absent).
    train_counts = train[feature].value_counts().sort_index()
    test_counts = test[feature].value_counts().reindex(train_counts.index, fill_value=0)
    train_counts.plot(kind='bar', color='yellow')
    test_counts.plot(kind='bar', color='blue')
    plt.title(feature)
plt.tight_layout()

阿里云天池竞赛——二手车价格预测项目(个人练习+源代码)_第4张图片

# seller and offerType contain a single category each, so they are removed
# from both frames and from the categorical feature list.
single_valued = ['seller', 'offerType']
train, test = (frame.drop(columns=single_valued) for frame in (train, test))
col_clf = ['brand', 'bodyType', 'fuelType', 'gearbox', 'kilometer', 'notRepairedDamage']
# Mean price per category, one bar chart per categorical feature.
plt.figure(figsize=(18,10))
for position, feature in enumerate(col_clf, start=1):
    plt.subplot(2, 3, position)
    mean_price = train.groupby(feature)['price'].mean()
    mean_price.plot(kind='bar')
    plt.title(feature)
plt.tight_layout()

阿里云天池竞赛——二手车价格预测项目(个人练习+源代码)_第5张图片

# Remove the SaleID identifier column from both frames.
train, test = (frame.drop(columns=['SaleID']) for frame in (train, test))
# Draw a heatmap of the pairwise correlations between the training columns.
plt.figure(figsize=(10,10))
corr = train.corr()
sns.heatmap(corr)

阿里云天池竞赛——二手车价格预测项目(个人练习+源代码)_第6张图片

# Replace name by the number of rows that share the same name.
# transform('count') returns one value per original row. The previous
# agg(['count']) result was indexed by name value, so assigning it aligned
# those name values against row labels and row i received the count of
# name == i (or NaN) instead of the count of its own name.
train['name_count'] = train.groupby('name')['brand'].transform('count')
test['name_count'] = test.groupby('name')['brand'].transform('count')
# Remove the raw name column from both frames.
train, test = (frame.drop(columns='name') for frame in (train, test))
# Build per-category price statistics from the training set and attach them
# to both frames as new features.
# NOTE(review): the statistics are computed from the training target and
# merged back onto the same training rows - confirm this leakage is acceptable.
col_clf = ['brand', 'model', 'kilometer', 'fuelType', 'bodyType']
for col in col_clf:
    train_gb = train.groupby(col)
    # Maps each category value to its dict of statistics.
    all_info = {}
    for kind, kind_data in train_gb:
        info = {}
        # Number of training rows in this category.
        info[col + '_amount'] = len(kind_data)
        info[col + '_price_max'] = kind_data.price.max()
        info[col + '_price_median'] = kind_data.price.median()
        info[col + '_price_min'] = kind_data.price.min()
        info[col + '_price_sum'] = kind_data.price.sum()
        info[col + '_price_std'] = kind_data.price.std()
        # Average with n + 1 in the denominator, rounded to two decimals.
        info[col+'_price_average'] = round(kind_data.price.sum() / (len(kind_data) + 1), 2)
        all_info[kind] = info
    # One row per category value, with the category restored as a column named col.
    fe = pd.DataFrame(all_info).T.reset_index().rename(columns={'index':col})
    # Left merge keeps every row; categories missing from fe get NaN statistics.
    train = train.copy().merge(fe, how='left', on=col)
    test = test.copy().merge(fe, how='left', on=col)

print(train.shape)
print(test.shape)
(150000, 73)
(50000, 72)
# Combine kilometer and power: statistics of power within each kilometer value.
col_kp = ['kilometer', 'power']
stat_names = ['count', 'max', 'median', 'min', 'sum', 'std', 'mean']
# Output column name -> aggregation, e.g. 'kilometer_power_count': 'count'.
named_aggs = {col_kp[0] + '_' + col_kp[1] + '_' + stat: stat for stat in stat_names}
# Named aggregation replaces the dict-renaming form agg({'new_name': 'func'})
# on a single column, which pandas >= 1.0 rejects with SpecificationError.
t1 = train.groupby(col_kp[0])[col_kp[1]].agg(**named_aggs).reset_index()
# merge already returns a new frame, so no explicit copy is needed.
train = train.merge(t1, how='left', on=col_kp[0])
test = test.merge(t1, how='left', on=col_kp[0])

print(train.shape)
print(test.shape)
(150000, 80)
(50000, 79)
# Build interaction features from the selected anonymous features v_0, v_3,
# v_8 and v_12: every ordered pair is multiplied, added and subtracted, and
# each feature is also multiplied by diff_year.
col_v = [0,3,8,12]
pairwise_ops = [
    ('*', lambda left, right: left * right),
    ('+', lambda left, right: left + right),
    ('-', lambda left, right: left - right),
]
for symbol, combine in pairwise_ops:
    for i in col_v:
        for j in col_v:
            new_col = str(i) + symbol + str(j)
            for frame in (train, test):
                frame[new_col] = combine(frame['v_'+str(i)], frame['v_'+str(j)])

for i in col_v:
    new_col = str(i) + '*diff_year'
    for frame in (train, test):
        frame[new_col] = frame['v_'+str(i)] * frame['diff_year']

print(train.shape)
print(test.shape)
(150000, 132)
(50000, 131)
# Work on deep copies so the engineered frames stay untouched.
train_new = train.copy(deep=True)
test_new = test.copy(deep=True)

# Drop the features that are not used for modelling; price is the target.
dropped_features = ['regDate', 'creatDate', 'regionCode']
X_train = train_new.drop(columns=['price'] + dropped_features)
X_test = test_new.drop(columns=dropped_features)
y_train = train_new['price']

# Persist the model inputs to CSV ...
X_train.to_csv('X_train.csv')
X_test.to_csv('X_test.csv')
y_train.to_csv('y_train.csv')

# ... and load them back, using the first CSV column as the index.
X_train = pd.read_csv('X_train.csv', index_col=0)
X_test = pd.read_csv('X_test.csv', index_col=0)
y_train = pd.read_csv('y_train.csv', index_col=0)
# Flatten the single-column target frame into a 1-D array.
y_train = np.ravel(y_train)
from sklearn.model_selection import GridSearchCV
from lightgbm.sklearn import LGBMRegressor
from time import time
import datetime
# LightGBM tuning: sweep n_estimators from 200 to 2000 in steps of 100 and
# score each setting with 3-fold cross-validated negative MAE.
n_estimators_grid = np.arange(200, 2001, 100)
lgbm_scores = []
time0 = time()
for n_trees in n_estimators_grid:
    reg_lgbm = LGBMRegressor(learning_rate=0.1, n_estimators=n_trees, objective='regression_l1', random_state=42)

    fold_scores = cross_val_score(reg_lgbm, X_train, y_train, cv=3, scoring='neg_mean_absolute_error')
    lgbm_score = fold_scores.mean()
    lgbm_scores.append(lgbm_score)
    # Elapsed seconds since the sweep started, and the mean score.
    print(time() - time0, lgbm_score)

# Best score and the n_estimators value that produced it.
print(max(lgbm_scores))
print(n_estimators_grid[np.argmax(lgbm_scores)])

plt.figure(figsize=(8,6))
plt.plot(n_estimators_grid, lgbm_scores)
8.246474504470825 -594.8424586944835
19.095320463180542 -568.7034240842332
32.98492646217346 -552.3891983733455
48.85633111000061 -542.3964373457885
67.14593052864075 -535.7482170534481
87.76364278793335 -529.8703107609151
111.7964539527893 -525.7724715224499
137.1214382648468 -522.2536456032711
164.56334352493286 -519.7242720183268
194.57858514785767 -517.0013123928143
226.66670727729797 -515.3243156345435
261.38421607017517 -513.8374464322388
298.36878204345703 -512.2929007437376
337.6414248943329 -511.0992392114774
379.26119804382324 -510.01377926737086
423.065954208374 -508.88194129037237
468.8004615306854 -507.9469756382571
516.7109439373016 -506.9312361319216
567.1637334823608 -505.91707028368864
-505.91707028368864
2000

阿里云天池竞赛——二手车价格预测项目(个人练习+源代码)_第7张图片

# LightGBM tuning: grid search over max_depth and num_leaves.
parameters = {
    'max_depth':[4,5,6,7],
    'num_leaves':np.arange(5,100,5)
}
# n_estimators is fixed at 2000, the best value printed by the sweep above.
reg_lgbm = LGBMRegressor(learning_rate=0.1, n_estimators=2000, objective='regression_l1', random_state=42)

# 3-fold CV scored by negative MAE, using all available cores.
gs = GridSearchCV(reg_lgbm, param_grid=parameters, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)
gs_model = gs.fit(X_train, y_train)
# Report the best score, parameters and estimator.
print('最优分数:{}'.format(gs_model.best_score_))
print('最优参数:{}'.format(gs_model.best_params_))
print('最优模型:{}'.format(gs_model.best_estimator_))
最优分数:-500.9400071901773
最优参数:{'max_depth': 7, 'num_leaves': 45}
最优模型:LGBMRegressor(max_depth=7, n_estimators=2000, num_leaves=45,
              objective='regression_l1', random_state=42)
# XGBoost tuning: sweep learning_rate and score each value with 3-fold
# cross-validated negative MAE.
learning_rates = np.arange(0.05,0.31,0.05)
xgb_scores = []
time0 = time()
# The original `for` line had no trailing colon, which is a SyntaxError.
for rate in learning_rates:
    reg_xgb = xgb.XGBRegressor(n_estimators=200, learning_rate=rate)
    xgb_score = cross_val_score(reg_xgb, X_train, y_train, cv=3, scoring='neg_mean_absolute_error').mean()
    xgb_scores.append(xgb_score)
    # Elapsed seconds since the sweep started.
    print(time() - time0)

# Best score and the learning_rate that produced it.
print(max(xgb_scores))
print(learning_rates[np.argmax(xgb_scores)])

plt.figure(figsize=(8,6))
plt.plot(learning_rates, xgb_scores)
132.54270577430725
268.7969219684601
402.8747355937958
543.0700986385345
673.9736497402191
807.1916081905365
-559.3930964745617
0.15000000000000002

阿里云天池竞赛——二手车价格预测项目(个人练习+源代码)_第8张图片

# XGBoost tuning: sweep max_depth from 5 to 11 with learning_rate fixed at 0.15.
depth_grid = np.arange(5,12,1)
xgb_scores = []
time0 = time()
for depth in depth_grid:
    reg_xgb = xgb.XGBRegressor(n_estimators=200, learning_rate=0.15, max_depth=depth)
    fold_scores = cross_val_score(reg_xgb, X_train, y_train, cv=3, scoring='neg_mean_absolute_error')
    xgb_score = fold_scores.mean()
    xgb_scores.append(xgb_score)
    # Elapsed seconds since the sweep started, and the mean score.
    print(time() - time0, xgb_score)

# Best score and the max_depth that produced it.
print(max(xgb_scores))
print(depth_grid[np.argmax(xgb_scores)])

plt.figure(figsize=(8,6))
plt.plot(depth_grid, xgb_scores)
106.13332343101501 -581.7825425249935
233.66106414794922 -559.3930964745617
386.93016719818115 -545.8423732084185
577.3439819812775 -540.0337358052888
789.385425567627 -535.3663493749481
1027.0641367435455 -537.2171228026253
1293.055543422699 -540.2481429747703
-535.3663493749481
9

阿里云天池竞赛——二手车价格预测项目(个人练习+源代码)_第9张图片

# XGBoost tuning: sweep colsample_bytree with learning_rate=0.15 and max_depth=9.
bytree_grid = np.arange(0.4,0.8,0.1)
xgb_scores = []
time0 = time()
for fraction in bytree_grid:
    reg_xgb = xgb.XGBRegressor(n_estimators=200, learning_rate=0.15, max_depth=9, colsample_bytree=fraction)
    fold_scores = cross_val_score(reg_xgb, X_train, y_train, cv=3, scoring='neg_mean_absolute_error')
    xgb_score = fold_scores.mean()
    xgb_scores.append(xgb_score)
    # Elapsed seconds since the sweep started, and the mean score.
    print(time() - time0, xgb_score)

# Best score and the colsample_bytree value that produced it.
print(max(xgb_scores))
print(bytree_grid[np.argmax(xgb_scores)])

plt.figure(figsize=(8,6))
plt.plot(bytree_grid, xgb_scores)
101.24388527870178 -540.0945292119669
219.8485279083252 -536.1881194847441
358.25814485549927 -534.7100199133007
509.0920376777649 -534.7369636623599
-534.7100199133007
0.6

阿里云天池竞赛——二手车价格预测项目(个人练习+源代码)_第10张图片

# XGBoost tuning: sweep colsample_bylevel over 0.5, 0.6, ..., 1.0 with the
# other parameters fixed at the values chosen in the earlier sweeps.
# np.arange(0.5, 1.1, 0.1) produced a seventh value of about 1.1 through
# floating-point accumulation. That value is outside the valid range, scored
# NaN, and np.argmax then reported it as the best setting. linspace yields
# exactly six values ending at 1.0.
bylevel_grid = np.linspace(0.5, 1.0, 6)
xgb_scores = []
time0 = time()
for fraction in bylevel_grid:
    reg_xgb = xgb.XGBRegressor(n_estimators=200, learning_rate=0.15, max_depth=9, colsample_bytree=0.6, colsample_bylevel=fraction)
    xgb_score = cross_val_score(reg_xgb, X_train, y_train, cv=3, scoring='neg_mean_absolute_error').mean()
    xgb_scores.append(xgb_score)
    # Elapsed seconds since the sweep started, and the mean score.
    print(time() - time0, xgb_score)

# Best score and the colsample_bylevel value that produced it.
print(max(xgb_scores))
print(bylevel_grid[np.argmax(xgb_scores)])

plt.figure(figsize=(8,6))
plt.plot(bylevel_grid, xgb_scores)
82.85519623756409 -534.1242236725466
176.23594546318054 -535.4707890065283
279.6718213558197 -534.5832091972042
391.7265202999115 -533.988677477093
518.4175012111664 -533.3711266578522
656.7450432777405 -534.7100199133007


657.538763999939 nan
-533.3711266578522
1.0999999999999999

阿里云天池竞赛——二手车价格预测项目(个人练习+源代码)_第11张图片

# 导入sklearn自带的模型融合库
from sklearn.ensemble import StackingRegressor
# Instantiate both base models with the hyperparameters selected in the
# tuning cells above.
reg_lgbm = LGBMRegressor(max_depth=7, n_estimators=2000, num_leaves=45, objective='regression_l1', random_state=42)
reg_xgb = xgb.XGBRegressor(n_estimators=200, learning_rate=0.15, max_depth=9, colsample_bytree=0.6, colsample_bylevel=0.9)
# Stack the two models; no final estimator is passed, so StackingRegressor
# uses its default.
estimators=[('lgbm',reg_lgbm), ('xgb',reg_xgb)]

sr = StackingRegressor(estimators, verbose=True)
# Score the stacked model with 3-fold cross-validated negative MAE.
sr_scores = cross_val_score(sr, X_train, y_train, cv=3, scoring='neg_mean_absolute_error')
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.9min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.8min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.8min finished
# Show the per-fold scores of the stacked model.
sr_scores
array([-491.53751775, -494.7186037 , -486.69418657])
# Fit the stacked model on the full training set, predict the test set and
# write the predictions to stack_submit.csv.
# NOTE(review): the file is built from the bare prediction array, so it has no
# SaleID column (SaleID was dropped from test earlier) - confirm against the
# required submission format.
sr.fit(X_train, y_train)
sr_predict = sr.predict(X_test)
pd.DataFrame(sr_predict).to_csv('stack_submit.csv')
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.8min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.1min finished

你可能感兴趣的:(机器学习个人练习项目,python,机器学习)