这个比赛是天池的一个数据挖掘入门赛,要求根据提供的数据预测二手车的交易价格,属于回归问题。本篇主要分享特征工程和基础模型方面的思路。
# Column groups used for the per-column overview table below.
date_cols = ['regDate', 'creatDate']
cate_cols = ['name', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage', 'regionCode', 'seller', 'offerType']
num_cols = ['power', 'kilometer'] + ['v_{}'.format(i) for i in range(15)]
cols = date_cols + cate_cols + num_cols

# Build one summary row per column: non-null count, missing rate, number of
# distinct values, and the most frequent value with its count and share.
n_rows = data.shape[0]
# value_counts() is sorted by descending frequency and skips NaN; compute it
# once per column and reuse it for both the top count and the top value.
top_counts = [data[f].value_counts() for f in cols]

df = pd.DataFrame(index=cols)
df['count'] = data[cols].count().values
df['missing_rate'] = (n_rows - df['count']) / n_rows
df['nunique'] = data[cols].nunique().values
# A column with no non-null values has an empty value_counts(); report a
# count of 0 and a missing top value instead of raising IndexError.
df['max_value_counts'] = [vc.values[0] if len(vc) else 0 for vc in top_counts]
df['max_value_counts_prop'] = df['max_value_counts'] / n_rows
df['max_value_counts_value'] = [vc.index[0] if len(vc) else float('nan') for vc in top_counts]
df
# Handle outliers
def smooth_cols(group, cols=('power',), out_value=600):
    """Replace outlying values in ``cols`` with the column median of ``group``.

    For each column, every entry that is not strictly below ``out_value`` and
    every entry equal to 0 is replaced by the median of that column. The
    median is taken from the values as they were before smoothing, so the
    outliers themselves contribute to it. NaN entries stay NaN.

    Args:
        group: DataFrame to smooth; it is modified in place.
        cols: Names of the columns to smooth.
        out_value: Threshold at or above which a value counts as an outlier.

    Returns:
        The same ``group`` object, with the listed columns smoothed.
    """
    # The default for ``cols`` is a tuple so no mutable object is shared
    # between calls.
    for col in cols:
        original = group[col]
        median = original.median()
        # 1 where the value is kept, 0 where it is an outlier; multiplying
        # zeroes the outliers so one replace() handles them along with
        # values that were already 0.
        keep = (original < out_value).astype('int')
        group[col] = (keep * original).replace(0, median)
    return group
# Smooth 'power' separately inside each brand: smooth_cols is called once per
# brand group with a 600 threshold.
# NOTE(review): groupby() skips rows whose 'brand' is NaN by default, so such
# rows would presumably be missing from the result — confirm none exist.
# NOTE(review): the index and row order returned by groupby().apply() appear
# to depend on the pandas version (group_keys handling) — verify that rows
# still line up with whatever later splits train and test.
data = data.groupby('brand').apply(smooth_cols,cols = ['power'],out_value = 600)
# Overwrite whatever index apply() produced with a plain 0..n-1 range.
data.index = range(len(data))
from tqdm import tqdm
# Extract date information
date_cols = ['regDate', 'creatDate']
# Usage time = creatDate - regDate. It reflects how long the car has been in
# use; price is generally expected to fall as usage time grows.
# Some dates in the data are malformed (the author's probe looked at a regDate
# of 20070009), so parsing uses errors='coerce' and such rows become NaT.
creat_dt = pd.to_datetime(data['creatDate'], format='%Y%m%d', errors='coerce')
reg_dt = pd.to_datetime(data['regDate'], format='%Y%m%d', errors='coerce')
data['used_time_days'] = (creat_dt - reg_dt).dt.days
# Approximate months and years with fixed 30-day / 365-day lengths.
data['used_time_month'] = (data['used_time_days'] / 30).round(3)
data['used_time_year'] = (data['used_time_days'] / 365).round(3)
def date_proc(x):
    """Turn a 'YYYYMMDD' string into a 'YYYY-M-DD' string, repairing zero parts.

    A month of 00 is replaced by 1. A day of 00 is replaced by 01 in the same
    way, because a zero day would otherwise yield a string that cannot be
    parsed as a date.

    Args:
        x: Date as a string of digits, year first, then two-digit month,
            then the day.

    Returns:
        The date as 'year-month-day'; the month is written without zero
        padding, the day keeps its original text unless it was zero.
    """
    month = int(x[4:6])
    if month == 0:
        month = 1
    day = x[6:]
    if day.isdigit() and int(day) == 0:
        day = '01'
    return x[:4] + '-' + str(month) + '-' + day
# Convert each date column to datetime (after date_proc has repaired the
# string form) and split it into calendar parts.
for col in tqdm(date_cols):
    data[col] = pd.to_datetime(data[col].astype('str').apply(date_proc))
    parts = data[col].dt
    for part in ('year', 'month', 'day', 'dayofweek'):
        data[col + '_' + part] = getattr(parts, part)
# New feature: 1 when the creation date falls on a weekend (dayofweek 5 or 6),
# otherwise 0.
data['is_weekend'] = data['creatDate_dayofweek'].isin([5, 6]).astype('int64')
# Depreciation by years of use
def depreciation_year(year):
    """Return the residual-value factor of a car after ``year`` years of use.

    The factor drops by 0.15 per year for the first 3 years, by 0.10 per year
    from year 3 to year 7, and by 0.05 per year from year 7 to year 10; after
    10 years it is 0. Each segment starts where the previous one ended
    (0.55 at year 3, 0.15 at year 7), so the factor never increases with age.

    Args:
        year: Years of use (may be fractional).

    Returns:
        The residual factor. Values that compare false against every
        threshold (for example NaN) fall through to 0.
    """
    if year <= 3:
        return 1 - year * 0.15
    if year <= 7:
        return 0.55 - (year - 3) * 0.1
    if year <= 10:
        # Continue from the 0.15 left at year 7. Starting from 0.25 would make
        # the factor jump up at year 7 and drop from 0.10 to 0 at year 10.
        # max() guards against a tiny negative float at exactly year 10.
        return max(0.15 - (year - 7) * 0.05, 0.0)
    return 0
# Residual-value factor derived from the years of use.
data['depreciation_year'] = data['used_time_year'].apply(depreciation_year)
# Collapse the fuelType codes 3, 4, 5 and 6 into code 2.
data.loc[data['fuelType'].isin([3, 4, 5, 6]), 'fuelType'] = 2
data['fuelType'].value_counts()
# Map the '-' placeholder in notRepairedDamage to the string '2.0'.
# The result is assigned back to the column instead of calling replace() with
# inplace=True on a column selection: that chained form acts on an
# intermediate object and is not guaranteed to update `data` (it warns on
# recent pandas and has no effect under Copy-on-Write).
data['notRepairedDamage'] = data['notRepairedDamage'].replace('-', '2.0')
print('concat data shape:',data.shape)
# Fill missing values of these categorical columns with the most frequent
# value found in the combined `data`; the same value is used for train and
# test.
features = ['model','bodyType','fuelType','gearbox']
for fe in features:
    # Filling with the mode does not change the mode, so compute it once.
    fill_value = data[fe].mode()[0]
    # Assign back rather than fillna(inplace=True) on a column selection,
    # which is not guaranteed to modify the parent frame.
    data[fe] = data[fe].fillna(fill_value)
    train[fe] = train[fe].fillna(fill_value)
    test[fe] = test[fe].fillna(fill_value)
# Derive a city code from the region (postal) code by dropping its last three
# characters. According to the author, this feature later ranks low in
# LightGBM's feature importance.
data['city'] = data['regionCode'].apply(lambda x : str(x)[:-3])
# Codes with three characters or fewer leave an empty string; map those to 0.
# Assigned back instead of replace(inplace=True) on a column selection, which
# is not guaranteed to modify `data`.
# NOTE(review): the column then mixes strings with the integer 0 — confirm
# that downstream encoding handles the mixed types.
data['city'] = data['city'].replace('', 0)
# Average mileage per year: kilometer divided by the years of use.
features = ['used_time_days','used_time_month','used_time_year']
for fe in features:
    # Missing usage times are filled with the column median. Assign back
    # rather than fillna(inplace=True) on a column selection, which is not
    # guaranteed to modify `data`.
    data[fe] = data[fe].fillna(data[fe].median())
# NOTE(review): a used_time_year of 0 would make this division produce inf —
# confirm whether such rows can occur.
data['kilometer_everyear'] = (1000 * data['kilometer'] / data['used_time_year']).round(3)
from tqdm import tqdm
# Count encoding
def count_features(df, feat_cols):
    """Add a '<column>_count' frequency column for every column in ``feat_cols``.

    Each new column holds, per row, how many times that row's value occurs in
    the source column of ``df``.

    Args:
        df: DataFrame to extend; the new columns are added to it in place.
        feat_cols: Names of the columns to count-encode.

    Returns:
        The same ``df`` object with the added columns.
    """
    for feat in tqdm(feat_cols):
        frequencies = df[feat].value_counts()
        df[feat + '_count'] = df[feat].map(frequencies)
    return df
# Columns that receive a '<column>_count' frequency feature.
feature_list = [
    'regDate', 'creatDate', 'regDate_year',
    'model', 'brand', 'regionCode', 'bodyType',
    'fuelType', 'gearbox', 'notRepairedDamage',
]
data = count_features(df=data, feat_cols=feature_list)
# Per-brand price statistics, computed from the training rows only and then
# joined onto `data` by brand.
Train_gb = train.groupby("brand")
all_info = {}
for kind, kind_data in Train_gb:
    info = {}
    # Only rows with a positive price take part in the statistics.
    kind_data = kind_data[kind_data['price'] > 0]
    prices = kind_data.price
    price_sum = prices.sum()
    info['brand_amount'] = len(kind_data)
    info['brand_price_max'] = prices.max()
    info['brand_price_median'] = prices.median()
    info['brand_price_min'] = prices.min()
    info['brand_price_sum'] = price_sum
    # Peak-to-peak range. Series.ptp() no longer exists in pandas >= 1.0, so
    # compute max - min directly.
    info['brand_price_ptp'] = info['brand_price_max'] - info['brand_price_min']
    info['brand_price_std'] = prices.std()
    # The divisor is the row count plus one, as in the original formula.
    info['brand_price_average'] = round(price_sum / (len(kind_data) + 1), 2)
    all_info[kind] = info
# One row per brand, with the brand moved from the index into a column.
brand_fe = pd.DataFrame(all_info).T.reset_index().rename(columns={"index": "brand"})
# Left join: brands absent from the statistics table get NaN statistics.
data = data.merge(brand_fe, how='left', on='brand')
内容有点多,感兴趣的可以去我的 GitHub 下载。不过从后续实际跑模型的结果来看,特征数量与模型效果并不成正比(并不是特征数目越多,模型效果就越好)。比如有些特征(如 regionCode 的统计量),加进去后反而会使 MAE 升高。经过多次试验,最终确定下来 180 维左右的特征。
https://github.com/SunnyFei-78/Recommendation/tree/master/天池_二手车交易价格预测