大家好,我是小泽
预测分析·民宿价格预测比赛是和鲸社区与ChallengeHub联合举办的一场新手赛,本文旨在从多个角度构建特征工程,帮助选手快速上手比赛。
比赛链接
话不多说,直接开!
import time
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor, Pool
# Load the training and test sets and stack them into one frame so that every
# cleaning / encoding step below is applied to both in the same way.
train = pd.read_csv('./训练集.csv')
test = pd.read_csv('./测试集.csv')
# DataFrame.append was removed in pandas 2.0. pd.concat with default arguments
# is the drop-in replacement: like append, it keeps each frame's own row
# labels, so the stacked index is NOT unique.
df_features = pd.concat([train, test])

# Sentinel used for each column's missing values. The review score is filled
# with the mean computed over the stacked (train + test) frame.
mean_score = df_features['民宿评分'].mean()
fill_values = {
    '洗手间数量': -1,
    '床的数量': -1,
    '卧室数量': -1,
    '房主是否有个人资料图片': 'na',
    '房主身份是否验证': 'na',
    '房主回复率': '-1',
    '民宿周边': 'na',
    '民宿评分': mean_score,
    '邮编': 'na',
}
# Assign the filled column back instead of calling fillna(..., inplace=True)
# on df_features[col]: that form is chained assignment, which recent pandas
# versions warn about and which no longer updates the frame under
# Copy-on-Write.
for col, fill in fill_values.items():
    df_features[col] = df_features[col].fillna(fill)

# Response rate: drop the '%' sign and store the value as an integer
# (missing values were filled with '-1' above, so they become -1).
df_features['房主回复率'] = (
    df_features['房主回复率'].astype(str).str.replace('%', '', regex=False).astype(int)
)

# Replace each categorical value with an integer label.
for feat in ['房主是否有个人资料图片', '房主身份是否验证', '民宿周边', '邮编']:
    lbl = LabelEncoder()
    df_features[feat] = lbl.fit_transform(df_features[feat])
def freq_enc(df, col):
    """Add a ``{col}_freq`` column with each value's relative frequency.

    Args:
        df: DataFrame to encode; it is modified in place.
        col: Name of the column whose values are frequency-encoded.

    Returns:
        The same ``df`` object, with the new ``{col}_freq`` column. Rows whose
        value in ``col`` is missing get NaN, because missing values are
        excluded from the frequency table.
    """
    share_by_value = df[col].value_counts(normalize=True, dropna=True)
    lookup = share_by_value.to_dict()
    df[f'{col}_freq'] = df[col].map(lookup)
    return df
# Columns that receive a companion `<column>_freq` frequency-encoded feature.
freq_encoded_cols = [
    '容纳人数',
    '洗手间数量',
    '床的数量',
    '床的类型',
    '卧室数量',
    '取消条款',
    '所在城市',
    '清洁费',
    '房主是否有个人资料图片',
    '房主回复率',
    '是否支持随即预订',
    '民宿周边',
    '房产类型',
    '房型',
    '邮编',
]
for feat in freq_encoded_cols:
    df_features = freq_enc(df_features, feat)
# Time features.
from tqdm import tqdm

# Replace each date column with an integer: the datetime's int64 value
# floor-divided by 10**9 (epoch seconds, assuming nanosecond-resolution
# datetimes - TODO confirm for the pandas version in use).
# NOTE(review): missing dates become NaT, which presumably converts to a huge
# negative integer here, so the differences below are not meaningful for such
# rows - confirm whether these columns contain missing values.
for date_col in ['首次评论日期', '何时成为房主', '最近评论日期']:
    df_features[date_col] = (
        pd.to_datetime(df_features[date_col]).values.astype(np.int64) // 10 ** 9
    )

# Pairwise differences between the three timestamps: (new column, minuend,
# subtrahend).
for diff_col, later_col, earlier_col in [
    ('timestamp_diff1', '首次评论日期', '何时成为房主'),
    ('timestamp_diff2', '最近评论日期', '首次评论日期'),
    ('timestamp_diff3', '最近评论日期', '何时成为房主'),
]:
    df_features[diff_col] = df_features[later_col] - df_features[earlier_col]
def brute_force(df, features, groups):
    """Add group-level statistics of numeric columns as new features.

    For every statistic in (max, min, mean, median, std), every column in
    ``features`` and every column in ``groups``, a column named
    ``{group}_{feature}_{statistic}`` is added holding that statistic of the
    feature within the row's group.

    Args:
        df: DataFrame to extend; it is modified in place.
        features: Names of the columns to aggregate.
        groups: Names of the columns to group by.

    Returns:
        The same ``df`` object with the new columns added.
    """
    stat_names = ['max', 'min', 'mean', 'median', 'std']
    # Statistic is the outermost loop, so the progress bar advances once per
    # statistic and new columns are appended statistic by statistic.
    for stat in tqdm(stat_names):
        for value_col in features:
            for key_col in groups:
                per_group = df.groupby(key_col)[value_col]
                df[f'{key_col}_{value_col}_{stat}'] = per_group.transform(stat)
    return df
# Aggregate the three timestamp differences within each room type.
dense_feats = [
    'timestamp_diff1',
    'timestamp_diff2',
    'timestamp_diff3',
]
cate_feats = ['房型']
df_features = brute_force(df_features, features=dense_feats, groups=cate_feats)
def f(x):
    """Return 1 when ``x`` is strictly positive, otherwise 0.

    Anything for which ``x > 0`` is false (zero, negatives, NaN) maps to 0.
    """
    return 1 if x > 0 else 0
# Binary "has at least one" flags for beds, bedrooms and bathrooms.
# They are derived from df_features itself, not from `train`: a Series built
# from `train` is aligned to df_features by row label, and the stacked frame
# reuses training row labels for its test rows (assuming the default index
# from read_csv), so test rows would receive the flags of unrelated training
# rows. Missing counts were filled with -1 earlier and therefore map to 0.
df_features['if_bed'] = df_features['床的数量'].apply(f)
df_features['if_bedroom'] = df_features['卧室数量'].apply(f)
df_features['if_wc'] = df_features['洗手间数量'].apply(f)
# Cross-derived features: three ratios between the capacity / bed / bedroom
# counts, plus the Euclidean norm of the two coordinate columns.
# 1e-3 is added to every denominator to avoid division by zero.
df_features = df_features.assign(**{
    '人均床数量': df_features['容纳人数'] / (df_features['床的数量'] + 1e-3),
    '人均卧室量': df_features['容纳人数'] / (df_features['卧室数量'] + 1e-3),
    '卧室床均量': df_features['床的数量'] / (df_features['卧室数量'] + 1e-3),
    '经纬度平方根': (
        df_features['维度'] * df_features['维度']
        + df_features['经度'] * df_features['经度']
    ) ** .5,
})
def get_features(df):
    """Add row-wise statistics for two fixed pairs of ratio columns.

    For each pair ``(a, b)`` the columns ``{a}_{b}_std``, ``{a}_{b}_max``,
    ``{a}_{b}_min`` (computed across the two columns of each row) and
    ``{a}_{b}_sub`` (``a - b``) are added.

    Args:
        df: DataFrame containing the columns '人均床数量', '人均卧室量' and
            '卧室床均量'; it is modified in place.

    Returns:
        The same ``df`` object with the eight new columns added.
    """
    column_pairs = (
        ('人均床数量', '人均卧室量'),
        ('卧室床均量', '人均卧室量'),
    )
    for left, right in column_pairs:
        both = df[[left, right]]
        prefix = f'{left}_{right}'
        df[f'{prefix}_std'] = both.std(axis=1)
        df[f'{prefix}_max'] = both.max(axis=1)
        df[f'{prefix}_min'] = both.min(axis=1)
        df[f'{prefix}_sub'] = df[left] - df[right]
    return df
df_features = get_features(df_features)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD


def _count_amenities(raw):
    """Return the number of comma-separated entries in a '{a,b,c}' string."""
    inner = raw.lstrip('{').rstrip('}')
    # An empty set ('{}') has no amenities; ''.split(',') would yield one
    # empty entry and report a count of 1.
    if not inner.strip():
        return 0
    return len(inner.split(','))


# Single-pass cleanup table: drop braces, double quotes and colons, and turn
# commas into spaces so the column becomes a whitespace-separated document.
_AMENITY_CLEANUP = str.maketrans({'{': '', '}': '', '"': '', ':': '', ',': ' '})

df_features['便利设施数量'] = df_features['便利设施'].apply(_count_amenities)
df_features['便利设施'] = df_features['便利设施'].apply(
    lambda x: x.translate(_AMENITY_CLEANUP))

# TF-IDF over unigrams of the amenity text, reduced to a few dense components.
n_components = 12
X = list(df_features['便利设施'].values)
tfv = TfidfVectorizer(ngram_range=(1, 1), max_features=10000)
X_tfidf = tfv.fit_transform(X)
# Fix the seed: TruncatedSVD's default solver is randomized, and the rest of
# the pipeline (KFold, CatBoost) is already seeded for reproducibility.
svd = TruncatedSVD(n_components=n_components, random_state=42)
svd.fit(X_tfidf)
X_svd = svd.transform(X_tfidf)
for i in range(n_components):
    df_features[f'便利设施_tfidf_{i}'] = X_svd[:, i]
# Rows with a price are training data; rows without one form the test set.
has_price = df_features['价格'].notnull()
df_train = df_features[has_price].reset_index(drop=True)
df_test = df_features[~has_price]

# Columns excluded from the model inputs: the row id, the target, and the raw
# amenity text.
no_features = ['数据ID', '价格', '便利设施']
# Model input columns.
features = [name for name in df_train.columns if name not in no_features]

X = df_train[features]       # training inputs
y = df_train['价格']          # training target
X_test = df_test[features]   # test inputs

# Shuffled 5-fold cross-validation with a fixed seed.
n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True, random_state=1314)
# Accumulators for the out-of-fold predictions and the summed test predictions.
oof = np.zeros(X.shape[0])
prediction = np.zeros(X_test.shape[0])
# Columns CatBoost treats as categorical. The list does not change between
# folds, so it is built once outside the loop.
cate_features = ['房主是否有个人资料图片', '房主身份是否验证', '是否支持随即预订', '房产类型',
                 '房型', 'if_bed', 'if_bedroom', 'if_wc']

# Train one CatBoost model per fold, record its out-of-fold predictions and
# accumulate its test-set predictions.
for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
    # KFold yields positional indices, so select with .iloc on both X and y
    # instead of relying on y's labels happening to equal positions.
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    train_pool = Pool(X_train, y_train, cat_features=cate_features)
    eval_pool = Pool(X_valid, y_valid, cat_features=cate_features)
    cbt_model = CatBoostRegressor(iterations=10000,  # Note: the baseline's reported score used iterations=60000, but that takes a while to run
                                  learning_rate=0.1,  # Note: in practice several properties converge very slowly at lr=0.1; consider raising it later
                                  eval_metric='SMAPE',
                                  use_best_model=True,
                                  random_seed=42,
                                  logging_level='Verbose',
                                  # task_type='GPU',
                                  devices='0',
                                  gpu_ram_part=0.5,
                                  early_stopping_rounds=400)
    cbt_model.fit(train_pool,
                  eval_set=eval_pool,
                  verbose=1000)
    y_pred_valid = cbt_model.predict(X_valid)
    y_pred = cbt_model.predict(X_test)
    oof[valid_index] = y_pred_valid.reshape(-1, )
    prediction += y_pred
# Average the per-fold test predictions.
prediction /= n_fold
from sklearn.metrics import mean_squared_error

# Out-of-fold RMSE. The square root is taken explicitly because the
# `squared=False` argument of mean_squared_error is deprecated in recent
# scikit-learn releases; this form works on old and new versions alike.
score = np.sqrt(mean_squared_error(df_train['价格'].values, oof))
print(score)

# Write the averaged test predictions as the submission file. The original
# called .format(score) on a path with no placeholder, which had no effect, so
# the file name is unchanged.
test['价格'] = prediction
test[['数据ID', '价格']].to_csv('./sub_cat.csv', index=False)
最后线上RMSE可以达到5.3以内,目前可以排到top10左右。
本文主要参考了官方的baseline以及恒哥的代码思路
如果本文可以帮助到大家,欢迎点个关注!