图片来源:特征工程全过程
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from operator import itemgetter
%matplotlib inline
# Load the used-car train/test splits; the raw files are space-separated.
Train_data= pd.read_csv(r'D:\ershouche\used_car_train_20200313.csv', sep=' ')
Test_data = pd.read_csv(r'D:\ershouche\used_car_testA_20200313.csv', sep=' ')
# notRepairedDamage uses '-' as its missing marker; normalize it to NaN.
Train_data['notRepairedDamage'].replace('-', np.nan, inplace=True)
# Sanity check: the column should now contain only 0.0 / 1.0 (plus NaN).
Train_data['notRepairedDamage'].value_counts()
0.0 111361
1.0 14315
Name: notRepairedDamage, dtype: int64
# 'seller' and 'offerType' are heavily skewed (near-constant) categoricals,
# so they carry essentially no signal — remove them from both splits.
for frame in (Train_data, Test_data):
    del frame["seller"]
    del frame["offerType"]
def out_proc(data, col_name, scale=3):
    """Remove box-plot (IQR-rule) outliers from one column of a DataFrame.

    Parameters:
        data: pandas DataFrame to clean (not modified; a copy is returned).
        col_name: name of the numeric column to filter on.
        scale: IQR multiplier for the whiskers (3 = extreme, 1.5 = standard).

    Returns:
        A copy of `data` with the outlier rows dropped and the index reset.

    Side effects: prints summary statistics and draws before/after boxplots.
    NOTE(review): `data_n.drop(index)` assumes a default RangeIndex (0..n-1),
    which holds for the freshly loaded data here — confirm before reuse.
    """
    def box(data_ser, box_scale):
        # Whiskers at Q1 - k*IQR and Q3 + k*IQR; rows outside are outliers.
        q1 = data_ser.quantile(0.25)
        q3 = data_ser.quantile(0.75)
        iqr = box_scale * (q3 - q1)
        val_low = q1 - iqr
        val_up = q3 + iqr
        rule_low = (data_ser < val_low)
        rule_up = (data_ser > val_up)
        return (rule_low, rule_up), (val_low, val_up)

    data_n = data.copy()
    data_series = data_n[col_name]
    rule, value = box(data_series, box_scale=scale)
    # Positional indices of rows violating either bound.
    index = np.arange(data_series.shape[0])[rule[0] | rule[1]]
    print('delete number is: {}'.format(len(index)))
    data_n = data_n.drop(index)
    data_n.reset_index(drop=True, inplace=True)
    print('now column number is :{}'.format(data_n.shape[0]))
    # BUG FIX: was `index_low` — rule[1] selects values ABOVE the upper bound.
    index_up = np.arange(data_series.shape[0])[rule[1]]
    out = data_series.iloc[index_up]
    # BUG FIX: the original printed a literal "{}" (missing .format call).
    print("description of data larger than the upper bound is :")
    print(pd.Series(out).describe())
    # BUG FIX: was `fix, ax` — typo for the conventional `fig, ax`.
    fig, ax = plt.subplots(1, 2)
    sns.boxplot(y=data[col_name], data=data, palette='Set1', ax=ax[0])
    sns.boxplot(y=data_n[col_name], data=data_n, palette='Set1', ax=ax[1])
    return data_n
通过EDA数据分析,我们不对匿名特征做任何处理,仅对数值数据中的power、kilometer进行异常值处理。
# Preview the IQR outlier removal (scale=1.5) on the two wide-spread numeric
# columns before deciding which one to actually clean.
dcol = ['power', 'kilometer']
for col in dcol:
    print(col + "异常值处理:")
    print(out_proc(Train_data, col, scale=1.5))
power异常值处理:
delete number is: 4878
now column number is :145122
description of data larger than the upper bound is :{}
count 4878.000000
mean 410.132021
std 884.219933
min 264.000000
25% 286.000000
50% 306.000000
75% 349.000000
max 19312.000000
Name: power, dtype: float64
SaleID name regDate model brand bodyType fuelType gearbox \
0 0 736 20040402 30.0 6 1.0 0.0 0.0
1 1 2262 20030301 40.0 1 2.0 0.0 0.0
2 2 14874 20040403 115.0 15 1.0 0.0 0.0
3 3 71865 19960908 109.0 10 0.0 0.0 1.0
4 4 111080 20120103 110.0 5 1.0 0.0 0.0
... ... ... ... ... ... ... ... ...
145117 149995 163978 20000607 121.0 10 4.0 0.0 1.0
145118 149996 184535 20091102 116.0 11 0.0 0.0 0.0
145119 149997 147587 20101003 60.0 11 1.0 1.0 0.0
145120 149998 45907 20060312 34.0 10 3.0 1.0 0.0
145121 149999 177672 19990204 19.0 28 6.0 0.0 1.0
power kilometer ... v_5 v_6 v_7 v_8 \
0 60 12.5 ... 0.235676 0.101988 0.129549 0.022816
1 0 15.0 ... 0.264777 0.121004 0.135731 0.026597
2 163 12.5 ... 0.251410 0.114912 0.165147 0.062173
3 193 15.0 ... 0.274293 0.110300 0.121964 0.033395
4 68 5.0 ... 0.228036 0.073205 0.091880 0.078819
... ... ... ... ... ... ... ...
145117 163 15.0 ... 0.280264 0.000310 0.048441 0.071158
145118 125 10.0 ... 0.253217 0.000777 0.084079 0.099681
145119 90 6.0 ... 0.233353 0.000705 0.118872 0.100118
145120 156 15.0 ... 0.256369 0.000252 0.081479 0.083558
145121 193 12.5 ... 0.284475 0.000000 0.040072 0.062543
v_9 v_10 v_11 v_12 v_13 v_14
0 0.097462 -2.881803 2.804097 -2.420821 0.795292 0.914762
1 0.020582 -4.900482 2.096338 -1.030483 -1.722674 0.245522
2 0.027075 -4.846749 1.803559 1.565330 -0.832687 -0.229963
3 0.000000 -4.509599 1.285940 -0.501868 -2.438353 -0.478699
4 0.121534 -1.896240 0.910783 0.931110 2.834518 1.923482
... ... ... ... ... ... ...
145117 0.019174 1.988114 -2.983973 0.589167 -1.304370 -0.302592
145118 0.079371 1.839166 -2.774615 2.553994 0.924196 -0.272160
145119 0.097914 2.439812 -1.630677 2.290197 1.891922 0.414931
145120 0.081498 2.075380 -2.633719 1.414937 0.431981 -1.659014
145121 0.025819 1.978453 -3.179913 0.031724 -1.483350 -0.342674
[145122 rows x 29 columns]
kilometer异常值处理:
delete number is: 25685
now column number is :124315
description of data larger than the upper bound is :{}
count 0.0
mean NaN
std NaN
min NaN
25% NaN
50% NaN
75% NaN
max NaN
Name: kilometer, dtype: float64
SaleID name regDate model brand bodyType fuelType gearbox \
0 0 736 20040402 30.0 6 1.0 0.0 0.0
1 1 2262 20030301 40.0 1 2.0 0.0 0.0
2 2 14874 20040403 115.0 15 1.0 0.0 0.0
3 3 71865 19960908 109.0 10 0.0 0.0 1.0
4 5 137642 20090602 24.0 10 0.0 1.0 0.0
... ... ... ... ... ... ... ... ...
124310 149992 183499 20001206 32.0 8 1.0 0.0 0.0
124311 149995 163978 20000607 121.0 10 4.0 0.0 1.0
124312 149996 184535 20091102 116.0 11 0.0 0.0 0.0
124313 149998 45907 20060312 34.0 10 3.0 1.0 0.0
124314 149999 177672 19990204 19.0 28 6.0 0.0 1.0
power kilometer ... v_5 v_6 v_7 v_8 \
0 60 12.5 ... 0.235676 0.101988 0.129549 0.022816
1 0 15.0 ... 0.264777 0.121004 0.135731 0.026597
2 163 12.5 ... 0.251410 0.114912 0.165147 0.062173
3 193 15.0 ... 0.274293 0.110300 0.121964 0.033395
4 109 10.0 ... 0.260246 0.000518 0.119838 0.090922
... ... ... ... ... ... ... ...
124310 82 15.0 ... 0.234736 0.000000 0.105834 0.042096
124311 163 15.0 ... 0.280264 0.000310 0.048441 0.071158
124312 125 10.0 ... 0.253217 0.000777 0.084079 0.099681
124313 156 15.0 ... 0.256369 0.000252 0.081479 0.083558
124314 193 12.5 ... 0.284475 0.000000 0.040072 0.062543
v_9 v_10 v_11 v_12 v_13 v_14
0 0.097462 -2.881803 2.804097 -2.420821 0.795292 0.914762
1 0.020582 -4.900482 2.096338 -1.030483 -1.722674 0.245522
2 0.027075 -4.846749 1.803559 1.565330 -0.832687 -0.229963
3 0.000000 -4.509599 1.285940 -0.501868 -2.438353 -0.478699
4 0.048769 1.885526 -2.721943 2.457660 -0.286973 0.206573
... ... ... ... ... ... ...
124310 0.102435 3.735963 -0.176973 -2.353203 0.998859 -0.085879
124311 0.019174 1.988114 -2.983973 0.589167 -1.304370 -0.302592
124312 0.079371 1.839166 -2.774615 2.553994 0.924196 -0.272160
124313 0.081498 2.075380 -2.633719 1.414937 0.431981 -1.659014
124314 0.025819 1.978453 -3.179913 0.031724 -1.483350 -0.342674
[124315 rows x 29 columns]
由上图异常值处理过后,我们发现对于kilometer的效果并不完美,所以我们不对此进行异常值处理
# Only 'power' benefits from outlier removal (kilometer's was judged too
# aggressive above), so apply it for real and keep the cleaned frame.
print( "power异常值处理:")
Train_data=(out_proc(Train_data,'power',scale=1.5))
power异常值处理:
delete number is: 4878
now column number is :145122
description of data larger than the upper bound is :{}
count 4878.000000
mean 410.132021
std 884.219933
min 264.000000
25% 286.000000
50% 306.000000
75% 349.000000
max 19312.000000
Name: power, dtype: float64
这样呢我们就得到了处理完异常值之后的数据。
# Stack train and test so feature construction runs on both at once; the
# 'train' flag lets us split them back apart later.
Train_data['train'] = 1
Test_data['train'] = 0
# FIX: pass sort=False to adopt the future pandas default and silence the
# FutureWarning (columns keep concatenation order instead of being sorted
# alphabetically; downstream access is by name, so order does not matter).
data = pd.concat([Train_data, Test_data], ignore_index=True, sort=False)
c:\users\administrator\appdata\local\programs\python\python37\lib\site-packages\ipykernel_launcher.py:4: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
To accept the future behavior, pass 'sort=False'.
To retain the current behavior and silence the warning, pass 'sort=True'.
after removing the cwd from sys.path.
SaleID 0
bodyType 5868
brand 0
creatDate 0
fuelType 11416
gearbox 7832
kilometer 0
model 1
name 0
notRepairedDamage 23903
power 0
price 50000
regDate 0
regionCode 0
train 0
v_0 0
v_1 0
v_10 0
v_11 0
v_12 0
v_13 0
v_14 0
v_2 0
v_3 0
v_4 0
v_5 0
v_6 0
v_7 0
v_8 0
v_9 0
dtype: int64
# used_time = creatDate - regDate, in days. Some raw dates are malformed, so
# errors='coerce' turns them into NaT instead of raising.
created = pd.to_datetime(data['creatDate'], format='%Y%m%d', errors='coerce')
registered = pd.to_datetime(data['regDate'], format='%Y%m%d', errors='coerce')
data['used_time'] = (created - registered).dt.days
# ~15k rows (~7.5%) end up NaN. Too many to delete outright; tree models such
# as XGBoost handle missing values natively, so leave them as-is.
data['used_time'].isnull().sum()
14969
# Extract the city prefix from the region code by dropping its last 3 digits,
# injecting geographic prior knowledge as a categorical feature.
# NOTE(review): assumes regionCode has more than 3 digits; shorter codes yield
# an empty string — confirm against the raw data.
# FIX: removed the no-op statement `data = data` that followed this line.
data['city'] = data['regionCode'].apply(lambda x: str(x)[:-3])
## 计算某品牌的销售统计量,同学们还可以计算其他特征的统计量
# 这里要以 train 的数据计算统计量
# Per-brand price statistics, computed from the TRAINING rows only (test rows
# have no price), then merged onto the combined frame as new features.
# ('prince' is a long-standing typo for 'price' in these column names; it is
# kept unchanged so downstream column references still match.)
Train_gb = Train_data.groupby('brand')
all_info = {}
for brand_id, group in Train_gb:
    group = group[group['price'] > 0]  # ignore non-positive (invalid) prices
    prices = group['price']
    all_info[brand_id] = {
        'brand_amount': len(group),
        'brand_prince_max': prices.max(),
        'brand_prince_min': prices.min(),
        'brand_prince_median': prices.median(),
        'brand_prince_sum': prices.sum(),
        'brand_prince_std': prices.std(),
        # The +1 in the denominator acts as smoothing and also guards against
        # division by zero when a brand has no positive-price rows.
        'brand_prince_averge': round(prices.sum() / (len(group) + 1), 2),
    }
brand_fe = pd.DataFrame(all_info).T.reset_index().rename(columns={'index': 'brand'})
data = data.merge(brand_fe, how='left', on='brand')
这个部分是特征构造中,构造统计量特征,至于意义何在,目前不清楚。
为什么要做数据分桶呢?原因有很多。
当然还有很多原因,LightGBM 在改进 XGBoost 时就增加了数据分桶,增强了模型的泛化性
关于数据分桶,具体可以参考数据分桶
(可以好好看一下该博客后面写的参考文献里的博客)
data.head()
SaleID | bodyType | brand | creatDate | fuelType | gearbox | kilometer | model | name | notRepairedDamage | ... | v_9 | used_time | city | brand_amount | brand_prince_max | brand_prince_min | brand_prince_median | brand_prince_sum | brand_prince_std | brand_prince_averge | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1.0 | 6 | 20160404 | 0.0 | 0.0 | 12.5 | 30.0 | 736 | 0.0 | ... | 0.097462 | 4385.0 | 1 | 10126.0 | 35990.0 | 13.0 | 1799.0 | 35138859.0 | 4353.895183 | 3469.82 |
1 | 1 | 2.0 | 1 | 20160309 | 0.0 | 0.0 | 15.0 | 40.0 | 2262 | NaN | ... | 0.020582 | 4757.0 | 4 | 12938.0 | 84000.0 | 15.0 | 5950.0 | 110453942.0 | 8278.094769 | 8536.51 |
2 | 2 | 1.0 | 15 | 20160402 | 0.0 | 0.0 | 12.5 | 115.0 | 14874 | 0.0 | ... | 0.027075 | 4382.0 | 2 | 1456.0 | 45000.0 | 100.0 | 8500.0 | 14334320.0 | 5416.367362 | 9838.24 |
3 | 3 | 0.0 | 10 | 20160312 | 0.0 | 1.0 | 15.0 | 109.0 | 71865 | 0.0 | ... | 0.000000 | 7125.0 | 13081.0 | 89000.0 | 15.0 | 4900.0 | 98590188.0 | 7789.654875 | 7536.32 | |
4 | 4 | 1.0 | 5 | 20160313 | 0.0 | 0.0 | 5.0 | 110.0 | 111080 | 0.0 | ... | 0.121534 | 1531.0 | 6 | 4660.0 | 29950.0 | 20.0 | 2300.0 | 15379822.0 | 3319.801819 | 3299.68 |
5 rows × 39 columns
通过看上述数据描述,我们可以看到对于匿名特征来讲,其值较为均匀,没有做数据分桶的必要,我们这里只对分布较广的数据做数据 分桶,因此仅选择power做数据分桶。
# Bucket 'power' into equal-width bins of 10 over [0, 300]; values outside the
# edges (e.g. power == 0, which sits on the open left edge) become NaN.
# FIX: renamed `bin` -> `bin_edges` to stop shadowing the builtin `bin()`.
bin_edges = [i * 10 for i in range(31)]
data['power_bin'] = pd.cut(data['power'], bin_edges, labels=False)
data[['power_bin', 'power']].head()
power_bin | power | |
---|---|---|
0 | 5.0 | 60 |
1 | NaN | 0 |
2 | 16.0 | 163 |
3 | 19.0 | 193 |
4 | 6.0 | 68 |
print('power变量中31个桶中的个数:')
# FIX: Series.value_counts() replaces the deprecated top-level
# pd.value_counts() function; the output is identical.
data['power_bin'].value_counts()
power变量中31个桶中的个数:
10.0 24263
7.0 17580
13.0 16023
5.0 15385
14.0 14252
11.0 12903
8.0 11962
16.0 10920
12.0 9108
6.0 7269
17.0 5412
19.0 4602
4.0 4486
9.0 4190
23.0 3196
18.0 3165
15.0 2832
21.0 2688
20.0 1932
22.0 1823
24.0 1403
25.0 586
3.0 287
27.0 227
26.0 152
28.0 142
29.0 135
0.0 113
1.0 56
2.0 25
Name: power_bin, dtype: int64
接下来我们看一下现有数据集中的变量有哪些,并进行一定的筛选
data.columns
Index(['SaleID', 'bodyType', 'brand', 'creatDate', 'fuelType', 'gearbox',
'kilometer', 'model', 'name', 'notRepairedDamage', 'power', 'price',
'regDate', 'regionCode', 'train', 'v_0', 'v_1', 'v_10', 'v_11', 'v_12',
'v_13', 'v_14', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9',
'used_time', 'city', 'brand_amount', 'brand_prince_max',
'brand_prince_min', 'brand_prince_median', 'brand_prince_sum',
'brand_prince_std', 'brand_prince_averge', 'power_bin'],
dtype='object')
因上面已经对 ‘creatDate’‘regDate’, 'regionCode’进行了一定的特征构造,所以,可以将其删除。
# creatDate / regDate / regionCode have already been distilled into the
# used_time and city features, so the raw columns can be dropped.
data = data.drop(['creatDate', 'regDate', 'regionCode'], axis=1)
data.columns
Index(['SaleID', 'bodyType', 'brand', 'fuelType', 'gearbox', 'kilometer',
'model', 'name', 'notRepairedDamage', 'power', 'price', 'train', 'v_0',
'v_1', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14', 'v_2', 'v_3', 'v_4',
'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'used_time', 'city', 'brand_amount',
'brand_prince_max', 'brand_prince_min', 'brand_prince_median',
'brand_prince_sum', 'brand_prince_std', 'brand_prince_averge',
'power_bin'],
dtype='object')
# The data is now usable by tree models, so export it.
# FIX: index=False is the idiomatic (and self-documenting) spelling of the
# original index=0; behavior is identical — the row index is not written.
data.to_csv('D:/ershouche/data_for_tree.csv', index=False)
对于生成树的数据处理
1.对于异常值,我们采用箱线图法进行删除,这里对多个在EDA分析中与目标变量具有较强相关性的变量进行了异常值处理。
2.在特征构造方面,我们针对brand变量构造了其统计量特征,另外修改了使用时间特征,以及其地理信息由邮编改为城市,有助于提高之后模型分析的效率。
3.我们对于通过对31个变量的数据描述可以看出,power变量较适合数据分桶,因此对其进行分桶,以提高之后采样的效率等。
4.删除已被替代的数据。
5.在此过程中,我们并没有对缺失值进行相关处理,也没有进行相关特征的筛选,所以此时生成的数据,更适合XGBoost,以及随机森林进行建模处理。
1.天池二手车
2.特征工程过程
3.数据预处理方法
4.数据分桶
5.箱线图,3标准差