赛题以预测二手车的交易价格为任务,数据来自某交易平台的二手车交易记录,总数据量超过40万条,包含31列变量信息,其中15列为匿名变量。从中抽取15万条作为训练集,5万条作为测试集,同时对name、model、brand和regionCode等信息进行脱敏。评价标准为MAE(平均绝对误差)。详情请看天池。
EDA(探索性数据分析)是数据挖掘最基础也是最重要的一步:通过统计量分析、关联度分析等方法分析数据特征,挖掘与预测值强相关的特征,并处理无用值、缺失值与异常值。
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Load the data; both CSV files are space-separated.
train_data = pd.read_csv('../data/used_car_train_20200313.csv', sep=' ')
test_data = pd.read_csv('../data/used_car_testB_20200421.csv', sep=' ')

# Quick look at the data (head + tail + shape).
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
pd.concat([train_data.head(), train_data.tail()])
train_data.shape
# The test preview pairs the test head with the *test* tail (the original
# mistakenly appended the training tail here).
pd.concat([test_data.head(), test_data.tail()])
test_data.shape

# describe(): summary statistics of every numeric column.
train_data.describe()
test_data.describe()

# info(): column dtypes and non-null counts.
train_data.info()
test_data.info()

# isnull().sum(): number of NaN values per column.
train_data.isnull().sum()
test_data.isnull().sum()

# value_counts(): look for abnormal values.
# 'notRepairedDamage' uses '-' as a placeholder; turn it into NaN so it is
# counted as missing. The result is assigned back to the column instead of
# calling replace(inplace=True) on the selected Series, which is chained
# in-place mutation and does not update the frame under copy-on-write.
train_data['notRepairedDamage'].value_counts()
train_data['notRepairedDamage'] = train_data['notRepairedDamage'].replace('-', np.nan)
train_data['notRepairedDamage'].value_counts()
train_data.isnull().sum()

test_data['notRepairedDamage'].value_counts()
test_data['notRepairedDamage'] = test_data['notRepairedDamage'].replace('-', np.nan)
test_data['notRepairedDamage'].value_counts()
test_data.isnull().sum()
# Remove the columns judged useless for modelling from both data sets.
for _frame in (train_data, test_data):
    for _useless_col in ('seller', 'offerType'):
        del _frame[_useless_col]
# Inspect the distribution of the prediction target.
train_data['price']
train_data['price'].value_counts()

# Overall shape of the distribution: fit three candidate distributions
# (Johnson SU, normal, log-normal) to the price histogram.
import scipy.stats as st
y = train_data['price']
plt.figure(1)
plt.title('johnson su')
sns.distplot(y, kde=False, fit=st.johnsonsu)
plt.figure(2)
plt.title('normal')
sns.distplot(y, kde=False, fit=st.norm)
plt.figure(3)
plt.title('log normal')
sns.distplot(y, kde=False, fit=st.lognorm)

# Skewness and kurtosis of the target.
# Each plot below gets its own figure; without plt.figure() they would all be
# drawn on top of the log-normal plot when this runs as a plain script.
plt.figure()
sns.distplot(train_data['price'])
print('skewness:%f' % train_data['price'].skew())
print('kurtosis:%f' % train_data['price'].kurt())

# Skewness / kurtosis across all columns. numeric_only=True is required
# because the frame still contains object columns (e.g. 'notRepairedDamage'),
# which make skew()/kurt() raise on pandas >= 2.0.
plt.figure()
sns.distplot(train_data.skew(numeric_only=True), color='blue', axlabel='skewness')
plt.figure()
sns.distplot(train_data.kurt(numeric_only=True), color='orange', axlabel='kurtosis')

# Histogram of the log-transformed target.
plt.figure()
plt.hist(np.log(train_data['price']), orientation='vertical', histtype='bar', color='red')
plt.show()
# Splitting columns by dtype (kept below for reference) only works on data
# that has not been label-encoded. It does not apply here, so the two feature
# lists are written out by hand according to what each column means.
#   numeric_features = train_data.select_dtypes(include=[np.number])
#   numeric_features.columns
#   categorical_features = train_data.select_dtypes(include=[np.object])
#   categorical_features.columns
numeric_features = ['power', 'kilometer'] + ['v_{}'.format(i) for i in range(15)]
categorical_features = [
    'name', 'model', 'brand', 'bodyType',
    'fuelType', 'gearbox', 'notRepairedDamage', 'regionCode',
]

# --- Numeric feature analysis ---
# The target is added to the list so it takes part in the analysis below.
numeric_features.append('price')
numeric_features

# Correlation of every numeric feature with the target, strongest first.
price_numeric = train_data[numeric_features]
correlation = price_numeric.corr()
print(correlation['price'].sort_values(ascending=False), '\n')

# Heatmap of the full correlation matrix.
f, ax = plt.subplots(figsize=(7, 7))
plt.title('correlation of numeric features with price', y=1, size=16)
sns.heatmap(correlation, square=True, vmax=0.8)
price_numeric = price_numeric.drop(columns='price')

# Skewness and kurtosis of each numeric feature (target included).
for col in numeric_features:
    col_skew = train_data[col].skew()
    col_kurt = train_data[col].kurt()
    print(
        '{:15}'.format(col),
        'skewness:{:05.2f}'.format(col_skew),
        ' ',
        'kurtosis:{:06.2f}'.format(col_kurt),
    )

# Distribution plot of each numeric feature, two panels per row.
f = pd.melt(train_data, value_vars=numeric_features)
g = sns.FacetGrid(f, col='variable', col_wrap=2, sharex=False, sharey=False)
g = g.map(sns.distplot, 'value')
# --- Categorical feature analysis ---
# Number of distinct values of each categorical feature.
for feat in categorical_features:
    print('{}特征有{}不同的值'.format(feat, train_data[feat].nunique()))

# Distribution plots of the categorical features; 'name' and 'regionCode' are
# too sparse to be worth plotting, so they are left out of the list.
categorical_features = ['model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage']

# Convert to the category dtype and give missing values an explicit
# 'MISSING' level so that they get their own box in the plots.
for col in categorical_features:
    train_data[col] = train_data[col].astype('category')
    if train_data[col].isnull().any():
        train_data[col] = train_data[col].cat.add_categories(['MISSING'])
        train_data[col] = train_data[col].fillna('MISSING')


def boxplot(x, y, **kwargs):
    """Draw a box plot of ``y`` grouped by ``x`` with vertical tick labels.

    Used as the plotting callback for ``FacetGrid.map``; the extra keyword
    arguments supplied by the grid are accepted and ignored.
    """
    sns.boxplot(x=x, y=y)
    plt.xticks(rotation=90)


f = pd.melt(train_data, id_vars='price', value_vars=categorical_features)
# 'height' replaces the former 'size' keyword of FacetGrid, which current
# seaborn releases no longer accept.
g = sns.FacetGrid(f, col='variable', col_wrap=2, sharex=False, sharey=False, height=5)
g = g.map(boxplot, 'value', 'price')
def _merge_group_stats(data, group_col, value_col, prefix, lower_bound):
    """Attach per-group summary statistics of ``value_col`` to ``data``.

    For every group of ``group_col``, the rows with ``value_col`` strictly
    greater than ``lower_bound`` are kept (NaN never passes the comparison)
    and their max, min, mean, std, sum and median are computed. The results
    are left-merged back onto ``data`` as ``<prefix>_max``, ``<prefix>_min``,
    ``<prefix>_mean``, ``<prefix>_std``, ``<prefix>_sum`` and
    ``<prefix>_median``.

    Returns the merged DataFrame; ``data`` itself is not modified.
    """
    stats_by_key = {}
    for key, group in data.groupby(group_col):
        values = group.loc[group[value_col] > lower_bound, value_col]
        stats_by_key[key] = {
            prefix + '_max': values.max(),
            prefix + '_min': values.min(),
            prefix + '_mean': values.mean(),
            prefix + '_std': values.std(),
            prefix + '_sum': values.sum(),
            prefix + '_median': values.median(),
        }
    # One row per group key, one column per statistic.
    stats = (
        pd.DataFrame(stats_by_key)
        .T.reset_index()
        .rename(columns={'index': group_col})
    )
    return data.merge(stats, how='left', on=group_col)


def feature_engineering(data, train_data):
    """Build the engineered feature table from the raw columns of ``data``.

    Steps, in order:

    1. Date features: ``regDate`` / ``creatDate`` are converted with
       ``date_process`` (defined elsewhere; assumed to return datetime-like
       values, since ``.dt`` is used on the result), then split into year,
       month and day, and the car age is derived in days and in years.
    2. Region features: number of rows per ``regionCode`` and a ``city`` code
       made of the first two characters of ``regionCode`` as a string.
    3. Binning of ``power`` (edges 0, 10, ..., 300) and ``model``
       (edges 0, 10, ..., 230) into integer bin indices.
    4. ``feature_merge`` (defined elsewhere) is applied for several
       categorical columns together with ``train_data``.
    5. Per-``regionCode`` statistics of ``car_age_day``, ``power``, ``v_3``
       and ``v_0``.
    6. Pairwise products and sums of the anonymous features ``v_0``..``v_14``
       and their products with ``power``, ``car_age_day`` and
       ``car_age_year``.

    Args:
        data: DataFrame to transform. The columns created in steps 1-3 are
            written directly into this object before it is rebound.
        train_data: DataFrame forwarded unchanged to ``feature_merge``.

    Returns:
        The DataFrame with all engineered columns added.

    Note:
        The per-region ``power`` statistics are stored as
        ``regionCode_power_*``. They were previously written under the same
        ``regionCode_days_*`` names as the car-age statistics, so the merge
        produced ambiguous ``*_x`` / ``*_y`` columns.
    """
    # Date features
    for date_col in ('regDate', 'creatDate'):
        data[date_col] = data[date_col].apply(date_process)
    for date_col in ('regDate', 'creatDate'):
        data[date_col + '_year'] = data[date_col].dt.year
        data[date_col + '_month'] = data[date_col].dt.month
        data[date_col + '_day'] = data[date_col].dt.day
    data['car_age_day'] = (data['creatDate'] - data['regDate']).dt.days
    data['car_age_year'] = round(data['car_age_day'] / 365, 1)

    # Region features
    data['regionCode_count'] = data.groupby(['regionCode'])['SaleID'].transform('count')
    data['city'] = data['regionCode'].apply(lambda x: str(x)[:2])

    # Bucket the features that can be treated as categorical
    power_edges = [i * 10 for i in range(31)]
    data['power_bin'] = pd.cut(data['power'], power_edges, labels=False)
    model_edges = [i * 10 for i in range(24)]
    data['model_bin'] = pd.cut(data['model'], model_edges, labels=False)

    # Combine categorical features with the target feature price
    for cat_col in ('regionCode', 'brand', 'model', 'kilometer', 'bodyType', 'fuelType'):
        data = feature_merge(data, train_data, cat_col)

    # Per-region statistics of the other features. Car age and power only use
    # strictly positive values; the huge negative bound for the anonymous
    # features keeps every non-NaN value.
    feat1 = 'regionCode'
    data = _merge_group_stats(data, feat1, 'car_age_day', feat1 + '_days', 0)
    data = _merge_group_stats(data, feat1, 'power', feat1 + '_power', 0)
    for anon_col in ('v_3', 'v_0'):
        data = _merge_group_stats(data, feat1, anon_col, feat1 + '_' + anon_col, -10000000)

    # Feature crosses between the anonymous features and the important
    # numeric features
    for i in range(15):
        for j in range(15):
            data['new' + str(i) + '*' + str(j)] = data['v_' + str(i)] * data['v_' + str(j)]
    for i in range(15):
        for j in range(15):
            data['new' + str(i) + '+' + str(j)] = data['v_' + str(i)] + data['v_' + str(j)]
    for i in range(15):
        data['new' + str(i) + '*power'] = data['v_' + str(i)] * data['power']
    for i in range(15):
        data['new' + str(i) + '*day'] = data['v_' + str(i)] * data['car_age_day']
    for i in range(15):
        data['new' + str(i) + '*year'] = data['v_' + str(i)] * data['car_age_year']
    return data
本项目最终选择lgb+catb+nn的融合方案,训练模型时采用正则化、早停法、10折交叉验证等方法防止过拟合。
def final_model(x_train_tree, x_test_tree, y_train_tree, x_train_nn, x_test_nn, y_train_nn):
    """Train the LightGBM, CatBoost and NN models and blend their predictions.

    The two tree models are trained on the tree feature set and combined by
    ``stack_model``; the stacked result is then averaged 50/50 with the output
    of ``nn_model``. The MAE of the blended out-of-fold predictions against
    ``np.expm1(y_train_nn)`` is printed (this presumes ``y_train_nn`` holds
    log1p-transformed prices - TODO confirm against the caller).

    All four model helpers are defined elsewhere; each is expected to return
    a ``(test_predictions, out_of_fold_predictions)`` pair.

    Returns:
        The blended test-set predictions.
    """
    # Tree models share the same inputs
    tree_inputs = (x_train_tree, x_test_tree, y_train_tree)
    lgbm_pred, lgbm_oof = lgbm_model(*tree_inputs)
    catb_pred, catb_oof = catb_model(*tree_inputs)

    # Stack the two tree models
    tree_pred, tree_oof = stack_model(lgbm_pred, catb_pred, lgbm_oof, catb_oof, y_train_tree)

    # Neural network
    nn_pred, nn_oof = nn_model(x_train_nn, x_test_nn, y_train_nn)

    # Simple average of the stacked tree output and the NN output
    blended_pred = (tree_pred + nn_pred) / 2
    blended_oof = (tree_oof + nn_oof) / 2

    mae = mean_absolute_error(blended_oof, np.expm1(y_train_nn))
    print("final model mae:{:<8.8f}".format(mae))
    return blended_pred
线上MAE为405.67,与SOTA仍有一定差距。个人认为可以从特征工程入手进行优化,挖掘更多与价格强相关的特征。
Github