预测分析·民宿价格预测baseline

大家好,我是小泽
预测分析·民宿价格预测比赛是和鲸社区与ChallengeHub联合举办的一场新手赛,本文旨在从多个角度构建特征工程,帮助选手快速上手比赛。
比赛链接
话不多说,直接开!

导入相关库

import time
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor, Pool

# Load the training and test sets, then stack them into a single frame so the
# feature engineering is computed over both at once.
train = pd.read_csv('./训练集.csv')
test = pd.read_csv('./测试集.csv')
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# pd.concat builds the same stacked frame; the original row labels are kept.
df_features = pd.concat([train, test])

填充缺失值并进行相关Encoding操作

# Fill missing values. Every result is assigned back to its column instead of
# calling fillna(inplace=True) on the selected column: that chained in-place
# form operates on an intermediate object, raises a FutureWarning on
# pandas 2.x and no longer updates df_features once copy-on-write is enabled.
for count_col in ['洗手间数量', '床的数量', '卧室数量']:
    df_features[count_col] = df_features[count_col].fillna(-1)
for flag_col in ['房主是否有个人资料图片', '房主身份是否验证']:
    df_features[flag_col] = df_features[flag_col].fillna('na')

# Host response rate: missing -> '-1', then drop the '%' sign and cast to int.
df_features['房主回复率'] = df_features['房主回复率'].fillna('-1')
df_features['房主回复率'] = df_features['房主回复率'].astype(str).apply(lambda x: x.replace('%', ''))
df_features['房主回复率'] = df_features['房主回复率'].astype(int)

df_features['民宿周边'] = df_features['民宿周边'].fillna('na')
# Missing review scores are replaced by the mean over all rows of the frame.
mean_score = df_features['民宿评分'].mean()
df_features['民宿评分'] = df_features['民宿评分'].fillna(mean_score)
df_features['邮编'] = df_features['邮编'].fillna('na')

# Replace each listed column with the integer codes of a LabelEncoder that is
# fitted on that same column.
label_encoded_cols = ['房主是否有个人资料图片', '房主身份是否验证', '民宿周边', '邮编']
for feat in label_encoded_cols:
    lbl = LabelEncoder()
    df_features[feat] = lbl.fit_transform(df_features[feat])

def freq_enc(df, col):
    """Add a relative-frequency encoding of ``col`` to ``df``.

    The new column ``f'{col}_freq'`` holds, for every row, the share of
    non-null rows that carry the same value in ``col``. Rows whose value is
    null receive NaN. ``df`` is modified in place and also returned.
    """
    share_by_value = df[col].value_counts(normalize=True, dropna=True).to_dict()
    encoded = df[col].map(share_by_value)
    df[f'{col}_freq'] = encoded
    return df

# Frequency-encode each column below; every pass adds one '<column>_freq'
# column to df_features.
freq_encoded_cols = [
    '容纳人数', '洗手间数量', '床的数量', '床的类型',
    '卧室数量', '取消条款', '所在城市', '清洁费',
    '房主是否有个人资料图片', '房主回复率', '是否支持随即预订',
    '民宿周边', '房产类型', '房型', '邮编',
]
for feat in freq_encoded_cols:
    df_features = freq_enc(df_features, feat)

对时间特征进行处理

# Date/time feature processing
from tqdm import tqdm

# Turn each date column into an integer timestamp: the int64 view of the
# parsed datetimes floor-divided by 10**9 (seconds, assuming the parsed values
# have nanosecond resolution -- TODO confirm for the pandas version in use).
# NOTE(review): a missing date would come out of the integer cast as a
# sentinel number rather than NaN -- confirm that is acceptable.
for date_col in ['首次评论日期', '何时成为房主', '最近评论日期']:
    df_features[date_col] = pd.to_datetime(df_features[date_col]).values.astype(np.int64) // 10 ** 9

# Pairwise gaps between the three timestamps.
first_review_ts = df_features['首次评论日期']
host_since_ts = df_features['何时成为房主']
last_review_ts = df_features['最近评论日期']
df_features['timestamp_diff1'] = first_review_ts - host_since_ts
df_features['timestamp_diff2'] = last_review_ts - first_review_ts
df_features['timestamp_diff3'] = last_review_ts - host_since_ts

def brute_force(df, features, groups):
    """Add group-wise aggregate columns for every feature/group combination.

    For each statistic in max, min, mean, median and std, each column in
    ``features`` and each column in ``groups``, a column named
    ``f'{group}_{feature}_{method}'`` is added holding that statistic of the
    feature within the row's group. ``df`` is modified in place and returned.
    """
    stat_names = ['max', 'min', 'mean', 'median', 'std']
    for stat in tqdm(stat_names):
        for value_col in features:
            for key_col in groups:
                grouped_values = df.groupby(key_col)[value_col]
                df[f'{key_col}_{value_col}_{stat}'] = grouped_values.transform(stat)
    return df


# Aggregate the three timestamp gaps within each value of the '房型' column.
dense_feats = [f'timestamp_diff{k}' for k in (1, 2, 3)]
cate_feats = ['房型']

df_features = brute_force(df_features, features=dense_feats, groups=cate_feats)

其他简单业务特征

def f(x):
    """Return 1 when ``x`` is strictly greater than zero, otherwise 0."""
    return 1 if x > 0 else 0
# 0/1 flags telling whether a listing has at least one bed / bedroom / bathroom.
# They are computed from df_features itself so every row, test rows included,
# is flagged from its own values. Taking the values from `train` would align on
# row labels and hand test rows the flags of unrelated training rows.
df_features['if_bed'] = df_features['床的数量'].apply(f)
df_features['if_bedroom'] = df_features['卧室数量'].apply(f)
df_features['if_wc'] = df_features['洗手间数量'].apply(f)

# Derived cross features
eps = 1e-3  # added to every denominator to avoid division by zero
capacity = df_features['容纳人数']
bed_count = df_features['床的数量']
bedroom_count = df_features['卧室数量']

df_features['人均床数量'] = capacity / (bed_count + eps)
df_features['人均卧室量'] = capacity / (bedroom_count + eps)
df_features['卧室床均量'] = bed_count / (bedroom_count + eps)

# Square root of the sum of the squared coordinate columns.
latitude = df_features['维度']
longitude = df_features['经度']
df_features['经纬度平方根'] = (latitude * latitude + longitude * longitude) ** .5

def get_features(df):
    """Add row-wise statistics for two fixed pairs of ratio columns.

    For each pair (left, right) the columns ``{left}_{right}_std``,
    ``{left}_{right}_max``, ``{left}_{right}_min`` (computed across the two
    columns of the pair) and ``{left}_{right}_sub`` (left minus right) are
    added. ``df`` is modified in place and also returned.
    """
    column_pairs = [('人均床数量', '人均卧室量'), ('卧室床均量', '人均卧室量')]
    for left, right in column_pairs:
        prefix = f'{left}_{right}'
        pair = df[[left, right]]
        df[f'{prefix}_std'] = pair.std(axis=1)
        df[f'{prefix}_max'] = pair.max(axis=1)
        df[f'{prefix}_min'] = pair.min(axis=1)
        df[f'{prefix}_sub'] = df[left] - df[right]
    return df

# Add the row-wise std/max/min/difference columns for the ratio-feature pairs.
df_features = get_features(df_features)

对“便利设施”特征进行挖掘

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
# Number of amenities listed in the '{a,b,c}' string. Empty entries are
# skipped so that an empty set '{}' counts as 0; a bare split(',') would
# report 1 for it.
df_features['便利设施数量'] = df_features['便利设施'].apply(
    lambda x: len([item for item in x.lstrip('{').rstrip('}').split(',') if item]))
# Reduce the amenity string to whitespace-separated text for the vectorizer:
# braces, double quotes and colons are removed, commas become spaces.
df_features['便利设施'] = df_features['便利设施'].apply(
    lambda x: x.replace('{', '').replace('}', '').replace('"', '').replace(':', '').replace(',', ' '))

n_components = 12

# TF-IDF over unigrams of the amenity text, then projection onto n_components
# SVD components.
X = list(df_features['便利设施'].values)
tfv = TfidfVectorizer(ngram_range=(1, 1), max_features=10000)
X_tfidf = tfv.fit_transform(X)
# The default SVD solver is randomized; a fixed seed keeps the derived
# features identical from one run to the next.
svd = TruncatedSVD(n_components=n_components, random_state=42)
svd.fit(X_tfidf)
X_svd = svd.transform(X_tfidf)

for i in range(n_components):
    df_features[f'便利设施_tfidf_{i}'] = X_svd[:, i]

获取特征和标签数据

# Rows with a known price form the training set; rows without one are the
# test set. Only the training rows get a fresh 0..n-1 index.
has_price = df_features['价格'].notna()
df_train = df_features[has_price].reset_index(drop=True)
df_test = df_features[~has_price]

# Columns that are not used as model inputs.
no_features = ['数据ID', '价格', '便利设施']
# Model input columns: everything else.
features = [col for col in df_train.columns if col not in no_features]

X = df_train[features]       # training inputs
y = df_train['价格']          # training labels
X_test = df_test[features]   # test inputs

五折Catboost模型

n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True, random_state=1314)

# Columns handed to CatBoost as categorical features (same list for every fold).
cate_features = ['房主是否有个人资料图片', '房主身份是否验证', '是否支持随即预订', '房产类型',
                 '房型', 'if_bed', 'if_bedroom', 'if_wc']

# NOTE(review): models are selected and early-stopped on SMAPE while the final
# score printed for the out-of-fold predictions is RMSE -- confirm the
# mismatch is intended.
cbt_params = dict(
    # NOTE: the score quoted for the baseline was obtained with
    # iterations=60000, which takes rather long to run.
    iterations=10000,
    # NOTE: several properties converge very slowly at lr=0.1; consider
    # raising it later.
    learning_rate=0.1,
    eval_metric='SMAPE',
    use_best_model=True,
    random_seed=42,
    logging_level='Verbose',
    # task_type='GPU' is left disabled here
    devices='0',
    gpu_ram_part=0.5,
    early_stopping_rounds=400,
)

oof = np.zeros(len(X))                # out-of-fold predictions for the training rows
prediction = np.zeros(len(X_test))    # running sum of per-fold test predictions
for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
    X_train = X.iloc[train_index]
    X_valid = X.iloc[valid_index]
    y_train = y.iloc[train_index]
    y_valid = y.iloc[valid_index]

    train_pool = Pool(X_train, y_train, cat_features=cate_features)
    eval_pool = Pool(X_valid, y_valid, cat_features=cate_features)

    cbt_model = CatBoostRegressor(**cbt_params)
    cbt_model.fit(train_pool, eval_set=eval_pool, verbose=1000)

    y_pred_valid = cbt_model.predict(X_valid)
    y_pred = cbt_model.predict(X_test)
    oof[valid_index] = y_pred_valid.reshape(-1)
    prediction += y_pred

# Average the test predictions over the folds.
prediction /= n_fold

from sklearn.metrics import mean_squared_error

# Out-of-fold RMSE over the training rows. The square root is taken explicitly
# because the `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and later removed. Arguments are in (y_true, y_pred) order.
score = np.sqrt(mean_squared_error(df_train['价格'].values, oof))
print(score)


# Attach the fold-averaged predictions to the test rows and write the
# submission file (ID and price only, without the row-label column).
test['价格'] = prediction
test[['数据ID', '价格']].to_csv('./sub_cat.csv', index=False)

最后线上RMSE可以达到5.3以内,目前可以排到top10左右。
本文主要参考了官方的baseline以及恒哥的代码思路

如果本文可以帮助到大家,欢迎点个关注!

你可能感兴趣的:(回归预测,机器学习,数据分析)