Code
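The full script below loads the DataWhale used-car training and test data, runs the exploratory analysis and feature engineering (date features, brand/power price aggregates, outlier removal, log transform of the price) and then fits XGBoost and LightGBM regressors.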
#!/usr/bin/env python
# coding: utf-8
import os
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import scipy
from scipy import stats
import statsmodels.api as sm
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error
import xgboost as xgb
from xgboost import plot_importance
import lightgbm as lgb
class DataSearch(object):
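    """Exploratory analysis, feature engineering and modelling helpers for the used-car price data."""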
def load_data(self,train_data_path,test_data_path):
        # Initial statistics on the training data
        pd.set_option('display.max_columns', None)
        train_data_df = pd.read_csv(train_data_path, sep=' ')
        test_data_df = pd.read_csv(test_data_path, sep=' ')
train_data_df['train'] = 1
test_data_df['train'] = 0
data = pd.concat([train_data_df, test_data_df], ignore_index=True)
# test_data_path = r"C:\Users\ccs\Documents\dataWhale\used_car_testA_20200313\used_car_testA_20200313.csv"
# test_data_df = pd.read_csv(test_data_path, sep=' ')
# train_data_df = train_data_df.append(test_data_df)
print("训练数据的数量:\n",train_data_df.count())
print("测试数据的数量:\n",test_data_df.count())
print("训练数据合并上测试数据后的数量:\n",data.count())
print("合并后数据去重后的数量:\n",data.drop_duplicates().count())
        print(data.describe(include='all'))
        # Null-value counts per column
        print(data.isnull().sum())
print(data.columns)
return data
def categorial_statistus(self,train_data_df,category_columns):
"""
总体特征或者字符特征数据统计
"""
print(train_data_df.columns)
# train_data_df.loc[train_data_df['regDate'][4:6],'C']=train_data_df['regDate'][4:6]
# train_data_df
for i in category_columns:
# print(train_data_df.groupby(i).size())
total = pd.DataFrame({'count': train_data_df.groupby(i).size()})
total = total.sort_values(['count'], ascending=False)
print(total, '\n', total.count())
return train_data_df
def categorial_extend(self,train_data_df):
"""
字符特征扩展
"""
def fun(x):
if str(x)[4:6] == '00':
rst = str(x)[0:4] + '03' + str(x)[6:]
return rst
else:
return str(x)
train_data_df['regDate'] = train_data_df['regDate'].apply(lambda x: fun(x))
train_data_df["year_regDate"] = train_data_df['regDate'].astype("str").str[0:4]
train_data_df["month_regDate"] = train_data_df['regDate'].astype("str").str[4:6]
        # Build the used_time feature: days between creatDate and regDate
train_data_df['used_time'] = (pd.to_datetime(train_data_df['creatDate'], format='%Y%m%d', errors='coerce') -
pd.to_datetime(train_data_df['regDate'], format='%Y%m%d', errors='coerce')).dt.days
        # Build brand/price related features.
        # Only the rows that came from the training file are used to compute the price aggregates
        train_data_df_actual = train_data_df[train_data_df['train'] == 1]
def combine_data(train_data_df,column):
Train_gb = train_data_df_actual.groupby(column)
all_info = {}
for kind, kind_data in Train_gb:
print("kind, kind_data is ",kind, kind_data)
info = {}
kind_data = kind_data[kind_data['price'] > 0]
info[column + 'amount'] = len(kind_data)
info[column + 'price_max'] = kind_data.price.max()
info[column + 'price_median'] = kind_data.price.median()
info[column + 'price_min'] = kind_data.price.min()
info[column + 'price_sum'] = kind_data.price.sum()
info[column + 'price_std'] = kind_data.price.std()
info[column + 'price_average'] = round(kind_data.price.sum() / (len(kind_data) + 1), 2)
all_info[kind] = info
print("all_info[kind] is ",all_info)
brand_fe = pd.DataFrame(all_info).T.reset_index().rename(columns={"index": column})
            train_data_df = train_data_df.merge(brand_fe, how='left', on=column)
return train_data_df
train_data_df = combine_data(train_data_df,"brand")
train_data_df = combine_data(train_data_df,"power")
        print(train_data_df.describe(include='all'))
return train_data_df
    # Visualisation of the numeric features
def plot_nemurical(self,train_data_df,numerical_columns):
        ## 3) Distribution plot for each numeric (continuous) feature
        ## (the categorical variables are excluded via numerical_columns)
# numerical_columns = ['regDate', 'power', 'kilometer', 'creatDate', 'price', 'v_0', 'v_1', 'v_2', 'v_3',
# 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12',
# 'v_13', 'v_14']
f = pd.melt(train_data_df, value_vars=numerical_columns)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False)
g = g.map(sns.distplot, "value")
        ## 4) Pairwise relationships between selected numeric features
sns.set()
columns = ['price', 'v_12', 'v_8', 'v_0', 'power', 'v_5', 'v_2', 'v_6', 'v_1', 'v_14']
sns.pairplot(train_data_df[columns], size=2, kind='scatter', diag_kind='kde')
plt.show()
def normal_test(self,train_data_df):
        # Normality test for the continuous features.
        # A small p-value means the sample is unlikely to come from a normal distribution.
        # In practice none of the columns pass, so the important power and kilometer columns need to be transformed.
numerical_columns = ['regDate', 'power', 'kilometer', 'creatDate', 'price', 'v_0', 'v_1', 'v_2', 'v_3',
'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12',
'v_13', 'v_14']
train_data_df['regDate'] = train_data_df['regDate'].astype('int')
        train_data_df['notRepairedDamage'] = train_data_df['notRepairedDamage'].replace('-', np.nan)
train_data_df = train_data_df.fillna(0)
train_data_df.info()
        print('Normality-test p-values (a small value means the column is unlikely to be normal):',
              list(map(lambda x: scipy.stats.normaltest(train_data_df[x])[1], numerical_columns)))
    # Plot each indicator next to its log-transformed version
def log_plot(self,train_data_df):
        ## 3) Histogram of the target price and of kilometer
plt.subplot(2,2,1)
plt.hist(train_data_df['price'], orientation='vertical', histtype='bar', color='red',label='price')
plt.subplot(2, 2, 2)
        plt.hist(train_data_df['kilometer'], orientation='vertical', histtype='bar', color='green', label='kilometer')
# plt.subplot(2, 3, 3)
# plt.hist(train_data_df['power'], orientation='vertical', histtype='bar', color='yellow',label='price')
        # After the log transform the price distribution is much more even; log-transforming the target is a common trick in regression problems
plt.subplot(2, 2, 3)
plt.hist(np.log(train_data_df['price']), orientation='vertical', histtype='bar', color='red',label='price')
plt.subplot(2, 2, 4)
        # For a discrete variable like kilometer the log transform does little to normalise the distribution
        # plt.hist(np.log(train_data_df['kilometer']), orientation='vertical', histtype='bar', color='red', label='kilometer')
        # power cannot be log-transformed directly; it raises ValueError: supplied range of [-inf, 9.868481943337313] is not finite
# plt.subplot(2, 3, 6)
# plt.hist(np.log(train_data_df['power']), orientation='vertical', histtype='bar', color='red',label='power')
plt.show()
def change_to_nomal(self,train_data_df):
"""
转换为正态分布
"""
train_data_df[train_data_df['train']==1]['price'] = train_data_df[train_data_df['train']==1].apply(lambda x: np.log(x))
# train_data_df['log_kilometer'] = train_data_df['kilometer'].apply(lambda x: np.log(x))
# train_data_df['log_power'] = train_data_df['power'].apply(lambda x: np.log(x))
# train_data_df['log_model'] = train_data_df['model'].apply(lambda x: np.log(x))
return train_data_df
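    # Because price is modelled in log space from here on, predictions have to be mapped back with
    # np.exp() before they can be compared with real prices; the MAE values in the results below are
    # therefore in log-price units.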
    # Outlier detection
def detect_outliers(self,df, n, features):
"""
"""
outlier_indices = []
df_raw = df
        # Outlier detection is only run on the rows that came from the training file (train == 1)
df = df[df['train']==1]
# iterate over features(columns)
for col in features:
# 1st quartile (25%)
Q1 = np.percentile(df[col], 25)
# 3rd quartile (75%)
Q3 = np.percentile(df[col], 75)
# quartile spacing (IQR)
IQR = Q3 - Q1
# outlier step
outlier_step = 1.5 * IQR
# Determine a list of indices of outliers for feature col
outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step)].index
# append the found outlier indices for col to the list of outlier indices
outlier_indices.extend(outlier_list_col)
# select observations containing more than n outliers
outlier_indices = Counter(outlier_indices)
print("outlier_indices is ", outlier_indices)
print("outlier_indices length is ", outlier_indices.__len__())
multiple_outliers = list(k for k, v in outlier_indices.items() if v > n)
print("multiple_outliers is ",multiple_outliers)
        # Drop the samples whose index was flagged as an outlier in more than n features
df_raw = df_raw.drop(multiple_outliers)
print("set(multiple_outliers) & set(df_raw.index) should be empty ",set(multiple_outliers) & set(df_raw.index))
return df_raw
def person_corr(self,train_data_df):
numerical_columns = ['regDate', 'power', 'kilometer', 'creatDate', 'price', 'v_0', 'v_1', 'v_2', 'v_3',
'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12',
'v_13', 'v_14']
        ## 1) Correlation of the numeric features with the price
price_numeric = train_data_df[numerical_columns]
correlation = price_numeric.corr()
print(correlation['price'].sort_values(ascending=False), '\n')
f, ax = plt.subplots(figsize=(7, 7))
plt.title('Correlation of Numeric Features with Price', y=1, size=16)
sns.heatmap(correlation, square=True, vmax=0.8)
        # Correlation among the explanatory variables themselves
        # Set the working directory (a no-op here: os.getcwd() returns the current path) for the saved figure below
        os.chdir(os.getcwd())
        # Columns that go into the correlation matrix
columns = ['bodyType', 'brand', 'creatDate', 'fuelType', 'gearbox',
'kilometer', 'model', 'name', 'notRepairedDamage', 'offerType', 'power',
'regDate', 'regionCode', 'seller', 'v_0', 'v_1', 'v_10',
'v_11', 'v_12', 'v_13', 'v_14', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6',
'v_7', 'v_8', 'v_9', "year_regDate", "month_regDate", 'price']
data = train_data_df[columns]
        # Compute the correlation matrix
        correlations = data.corr()
        correction = abs(correlations)  # absolute value: only the strength of the correlation matters, not the sign
        # Plot the correlation matrix
        fig = plt.figure(figsize=(60, 60))
        ax = fig.add_subplot(111)
        ax = sns.heatmap(correction, cmap=plt.cm.Oranges, linewidths=0.05, vmax=1, vmin=0, annot=True,
                         annot_kws={'size': 12, 'weight': 'bold'})
        # Heatmap settings: correlation matrix, colour map, line width between cells, value annotations
        # ticks = np.arange(0, 16, 1)
        plt.xticks(np.arange(len(columns)) + 0.5, columns)  # x-axis tick labels
        plt.yticks(np.arange(len(columns)) + 0.5, columns)  # y-axis tick labels
        # ax.set_xticks(ticks)
        # ax.set_yticks(ticks)
        # ax.set_xticklabels(columns)
        # ax.set_yticklabels(columns)
        ax.set_title('Characteristic correlation')
plt.savefig('cluster.tif', dpi=300)
plt.show()
def ridge_cv(self,train_data_df,feature_columns):
"""
注意此时价格为正态
"""
# 使用岭回归处理共线性 ;逐步回归法(Stepwise Regression);
from sklearn import linear_model
        # Initialise a cross-validated ridge regression model
# train_data_df = train_data_df.fillna(0)
data = train_data_df[feature_columns]
clf = linear_model.RidgeCV(fit_intercept=False)
        # Fit the ridge regression model
        clf.fit(data, train_data_df['price'])
        print('Selected alpha :', clf.alpha_)
        coef_abs = ['{:.5f}'.format(abs(x)) for x in clf.coef_]
        print(sorted(coef_abs))
        print(len(coef_abs), len(feature_columns))
        print('Coefficient magnitude per feature:', dict(zip(feature_columns, coef_abs)))
"""
        Sample output:
参数的数值: {'name': '0.00000', 'regDate': '0.07543', 'model': '0.20068', 'brand': '1.91918', 'bodyType': '12.22141', 'fuelType': '15027.09136', 'gearbox': '1506.17263', 'power': '154331.09559', 'kilometer': '17103.78850', 'notRepairedDamage': '18457.71267', 'regionCode': '194831.93107', 'v_0': '20013.22587', 'v_1': '20882.49239', 'v_2': '2249.08699', 'v_3': '22750.43400', 'v_4': '236965.73075', 'v_5': '24.47606', 'v_6': '241.11729', 'v_7': '2456.38493', 'v_8': '2465.45254', 'v_9': '319.47281', 'v_10': '32640.53892', 'v_11': '333.91531', 'v_12': '38188.50573', 'v_13': '43.12500', 'v_14': '43445.24262', 'year_regDate': '451.61198', 'month_regDate': '9.24321', 'price': '987.79713'}
['0.00000', '0.07543', '0.20068', '1.91918', '12.22141', '15027.09136', '1506.17263', '154331.09559', '17103.78850', '18457.71267', '194831.93107', '20013.22587', '20882.49239', '2249.08699', '22750.43400', '236965.73075', '24.47606', '241.11729', '2456.38493', '2465.45254', '319.47281', '32640.53892', '333.91531', '38188.50573', '43.12500', '43445.24262', '451.61198', '9.24321', '987.79713']
"""
def stepwise_selection(self,X, y,
initial_list=[],
threshold_in=0.01,
threshold_out=0.05,
verbose=True):
"""
逐步回归,筛选特征
"""
included = list(initial_list)
while True:
changed = False
# forward step
excluded = list(set(X.columns) - set(included))
            new_pval = pd.Series(index=excluded, dtype=float)
for new_column in excluded:
model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
new_pval[new_column] = model.pvalues[new_column]
best_pval = new_pval.min()
if best_pval < threshold_in:
                best_feature = new_pval.idxmin()
included.append(best_feature)
changed = True
if verbose:
print('Add {:30} with p-value {:.6}'.format(best_feature, best_pval))
# backward step
model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
# use all coefs except intercept
pvalues = model.pvalues.iloc[1:]
worst_pval = pvalues.max() # null if pvalues is empty
if worst_pval > threshold_out:
changed = True
                worst_feature = pvalues.idxmax()
included.remove(worst_feature)
if verbose:
print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
if not changed:
break
return included
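    # A minimal usage sketch for stepwise_selection (hypothetical names: X is a numeric feature
    # DataFrame without the target, y is the log-transformed price series):
    #   selected = data_search.stepwise_selection(X, y)
    #   final_model = sm.OLS(y, sm.add_constant(X[selected])).fit()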
def xgb_model_fit(self,
X_train, X_test, y_train, y_test,alg, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
if useTrainCV:
"""训练集训练数据"""
xgb_param = alg.get_xgb_params()
xgtrain = xgb.DMatrix(X_train, label=y_train)
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
metrics='mae', early_stopping_rounds=early_stopping_rounds)
alg.set_params(n_estimators=cvresult.shape[0])
        # Fit the model on the training set
        alg.fit(X_train, y_train, eval_metric='mae')
        # Predict on the training and test sets
        train_data_df_predictions = alg.predict(X_train)
        test_data_df_predictions = alg.predict(X_test)
        # Regression metric on the training set
        print("training mean_absolute_error is : ")
        print(mean_absolute_error(y_train, train_data_df_predictions))
        # ... and on the test set
        print("test mean_absolute_error is : ")
        print(mean_absolute_error(y_test, test_data_df_predictions))
        # Feature importance
        plot_importance(alg)
        plt.ylabel('Feature Importance')
        plt.show()
def light_gbm_model_fit(self,X_train, X_test, y_train, y_test):
gbm = lgb.LGBMRegressor(objective='regression', num_leaves=31, learning_rate=0.05, n_estimators=20)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric='l1', early_stopping_rounds=5)
print('Start lightgbm predicting...')
        # Predict on the training and test sets
        y_train_pred = gbm.predict(X_train, num_iteration=gbm.best_iteration_)
        y_test_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
        # Evaluate the model
        print('The y_train mae of test prediction is:', mean_absolute_error(y_train, y_train_pred))
        print('The y_test mae of test prediction is:', mean_absolute_error(y_test, y_test_pred))
        # feature importances
        print('Feature importances:', list(gbm.feature_importances_))
        # Grid search over the learning rate and the number of estimators
estimator = lgb.LGBMRegressor(num_leaves=64, metrics='mae', max_depth=7, min_child_samples=1000)
param_grid = {
'learning_rate': [0.01, 0.1, 1],
'n_estimators': [20, 40]
}
gbm_grid = GridSearchCV(estimator, param_grid)
gbm_grid = gbm_grid.fit(X_train, y_train)
print("用网格搜索的方式开始进行预测")
print('Best parameters found by grid search are:', gbm_grid.best_params_)
# 训练集与测试集预测
y_train_pred = gbm_grid.predict(X_train)
y_test_pred = gbm_grid.predict(X_test)
# 模型评估
print('grid search cv The y_train mae of test prediction is:', mean_absolute_error(y_train, y_train_pred))
print('grid search cv The y_test mae of test prediction is:', mean_absolute_error(y_test, y_test_pred))
# feature importances
print('Feature importances:', list(gbm_grid.feature_importances_))
def data_process(data_path,test_data_path):
data_search = DataSearch()
    # Load and concatenate the train/test data
    data_df = data_search.load_data(data_path, test_data_path)
    # Frequency statistics for the enumerated (categorical) features
category_columns = ['name', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage', 'regionCode',
'seller', 'offerType']
# data_df = data_search.categorial_statistus(data_df, category_columns)
    # Visualise the numeric features
numerical_columns = ['power', 'kilometer', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9',
'v_10', 'v_11', 'v_12', 'v_13', 'v_14']
# data_search.plot_nemurical(data_df,numerical_columns)
    # Expand and denoise the categorical features
    data_df = data_search.categorial_extend(data_df)
    # Outlier detection on the numeric features: the raw price values give 10353 outliers; power could be filtered the same way
    data_df = data_search.detect_outliers(data_df, 0, ['price'])
    # Normality test
    # data_search.normal_test(data_df)
    # Compare the raw distributions with their log-transformed versions
    # data_search.log_plot(data_df)
    # Normalise: log-transform the price
    data_df = data_search.change_to_nomal(data_df)
    # Outlier detection on the normalised price (number of outliers: )
    # data_search.detect_outliers(data_df, 1, ['power'])
    # Feature selection: based on the data distributions, three features are dropped by hand: 'seller', 'offerType', 'creatDate'
all_col = ['name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType',
'gearbox', 'power', 'kilometer', 'notRepairedDamage', 'regionCode',
'v_0', 'v_1', 'v_2', 'v_3',
'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12',
'v_13', 'v_14', 'year_regDate', 'month_regDate', 'price']
print(data_df.columns)
data_df[all_col] = data_df[all_col].apply(pd.to_numeric, errors='coerce').fillna(0.0)
    print(data_df.describe(include='all'))
return data_df,all_col
if __name__ == '__main__':
data_search = DataSearch()
    # Load the training and test data
train_data_path = r"C:\Users\ccs\Documents\dataWhale\used_car_train_20200313\used_car_train_20200313.csv"
test_data_path = r"C:\Users\ccs\Documents\dataWhale\used_car_testA_20200313\used_car_testA_20200313.csv"
train_data_df,all_col = data_process(train_data_path,test_data_path)
    # # Ridge regression
# feature_cols = list(all_col)
# feature_cols.remove("price")
# data_search.ridge_cv(train_data_df,feature_cols)
#
    # # Stepwise regression
    # result = data_search.stepwise_selection(train_data_df[feature_cols], y_train)
    # Train tree models on the features
    predictors = [x for x in all_col if x not in ['price', 'SaleID']]
    # Keep only the rows from the training file, then split them into train and test sets
train_data_df = train_data_df[train_data_df['train'] == 1]
X_train, X_test, y_train, y_test = train_test_split(train_data_df[predictors], train_data_df['price'], test_size=0.3)
xgb_model = xgb.XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=160, silent=False, objective='reg:gamma')
data_search.xgb_model_fit(X_train, X_test, y_train, y_test,xgb_model)
data_search.light_gbm_model_fit(X_train, X_test, y_train, y_test)
Results
“”"
#xgboost的方法进行预测
training mean_absolute_error is :
0.14024175116320706
test mean_absolute_error is :
0.14639476706968618
#lightGBM的方法预测
Start lightgbm predicting...
The y_train mae of test prediction is: 0.39791827020196974
The y_test mae of test prediction is: 0.39712612190553775
#lightGBM的方法用网格搜索的方式开始进行预测
Best parameters found by grid search are: {'learning_rate': 1, 'n_estimators': 40}
grid search cv The y_train mae of test prediction is: 0.1658337832367841
grid search cv The y_test mae of test prediction is: 0.17248839882027442
"""