jupyter notebook数据分析

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
## 1.载入训练集和测试集;
path = '/Users/dingyunfei/Downloads/比赛/'
Train_data = pd.read_csv(path+'used_car_train_20200313.csv', sep=' ')
Test_data = pd.read_csv(path+'used_car_testA_20200313.csv', sep=' ')
## 2) 简略观察数据(head()+shape)
Train_data.head().append(Train_data.tail())
SaleID name regDate model brand bodyType fuelType gearbox power kilometer ... v_5 v_6 v_7 v_8 v_9 v_10 v_11 v_12 v_13 v_14
0 0 736 20040402 30.0 6 1.0 0.0 0.0 60 12.5 ... 0.235676 0.101988 0.129549 0.022816 0.097462 -2.881803 2.804097 -2.420821 0.795292 0.914762
1 1 2262 20030301 40.0 1 2.0 0.0 0.0 0 15.0 ... 0.264777 0.121004 0.135731 0.026597 0.020582 -4.900482 2.096338 -1.030483 -1.722674 0.245522
2 2 14874 20040403 115.0 15 1.0 0.0 0.0 163 12.5 ... 0.251410 0.114912 0.165147 0.062173 0.027075 -4.846749 1.803559 1.565330 -0.832687 -0.229963
3 3 71865 19960908 109.0 10 0.0 0.0 1.0 193 15.0 ... 0.274293 0.110300 0.121964 0.033395 0.000000 -4.509599 1.285940 -0.501868 -2.438353 -0.478699
4 4 111080 20120103 110.0 5 1.0 0.0 0.0 68 5.0 ... 0.228036 0.073205 0.091880 0.078819 0.121534 -1.896240 0.910783 0.931110 2.834518 1.923482
149995 149995 163978 20000607 121.0 10 4.0 0.0 1.0 163 15.0 ... 0.280264 0.000310 0.048441 0.071158 0.019174 1.988114 -2.983973 0.589167 -1.304370 -0.302592
149996 149996 184535 20091102 116.0 11 0.0 0.0 0.0 125 10.0 ... 0.253217 0.000777 0.084079 0.099681 0.079371 1.839166 -2.774615 2.553994 0.924196 -0.272160
149997 149997 147587 20101003 60.0 11 1.0 1.0 0.0 90 6.0 ... 0.233353 0.000705 0.118872 0.100118 0.097914 2.439812 -1.630677 2.290197 1.891922 0.414931
149998 149998 45907 20060312 34.0 10 3.0 1.0 0.0 156 15.0 ... 0.256369 0.000252 0.081479 0.083558 0.081498 2.075380 -2.633719 1.414937 0.431981 -1.659014
149999 149999 177672 19990204 19.0 28 6.0 0.0 1.0 193 12.5 ... 0.284475 0.000000 0.040072 0.062543 0.025819 1.978453 -3.179913 0.031724 -1.483350 -0.342674

10 rows × 31 columns

Train_data.shape
(150000, 31)
Test_data.head().append(Test_data.tail())
SaleID name regDate model brand bodyType fuelType gearbox power kilometer ... v_5 v_6 v_7 v_8 v_9 v_10 v_11 v_12 v_13 v_14
0 150000 66932 20111212 222.0 4 5.0 1.0 1.0 313 15.0 ... 0.264405 0.121800 0.070899 0.106558 0.078867 -7.050969 -0.854626 4.800151 0.620011 -3.664654
1 150001 174960 19990211 19.0 21 0.0 0.0 0.0 75 12.5 ... 0.261745 0.000000 0.096733 0.013705 0.052383 3.679418 -0.729039 -3.796107 -1.541230 -0.757055
2 150002 5356 20090304 82.0 21 0.0 0.0 0.0 109 7.0 ... 0.260216 0.112081 0.078082 0.062078 0.050540 -4.926690 1.001106 0.826562 0.138226 0.754033
3 150003 50688 20100405 0.0 0 0.0 0.0 1.0 160 7.0 ... 0.260466 0.106727 0.081146 0.075971 0.048268 -4.864637 0.505493 1.870379 0.366038 1.312775
4 150004 161428 19970703 26.0 14 2.0 0.0 0.0 75 15.0 ... 0.250999 0.000000 0.077806 0.028600 0.081709 3.616475 -0.673236 -3.197685 -0.025678 -0.101290
49995 199995 20903 19960503 4.0 4 4.0 0.0 0.0 116 15.0 ... 0.284664 0.130044 0.049833 0.028807 0.004616 -5.978511 1.303174 -1.207191 -1.981240 -0.357695
49996 199996 708 19991011 0.0 0 0.0 0.0 0.0 75 15.0 ... 0.268101 0.108095 0.066039 0.025468 0.025971 -3.913825 1.759524 -2.075658 -1.154847 0.169073
49997 199997 6693 20040412 49.0 1 0.0 1.0 1.0 224 15.0 ... 0.269432 0.105724 0.117652 0.057479 0.015669 -4.639065 0.654713 1.137756 -1.390531 0.254420
49998 199998 96900 20020008 27.0 1 0.0 0.0 1.0 334 15.0 ... 0.261152 0.000490 0.137366 0.086216 0.051383 1.833504 -2.828687 2.465630 -0.911682 -2.057353
49999 199999 193384 20041109 166.0 6 1.0 NaN 1.0 68 9.0 ... 0.228730 0.000300 0.103534 0.080625 0.124264 2.914571 -1.135270 0.547628 2.094057 -1.552150

10 rows × 30 columns

Test_data.shape
(50000, 30)
## 1) 通过describe()来熟悉数据的相关统计量
Train_data.describe()
SaleID name regDate model brand bodyType fuelType gearbox power kilometer ... v_5 v_6 v_7 v_8 v_9 v_10 v_11 v_12 v_13 v_14
count 150000.000000 150000.000000 1.500000e+05 149999.000000 150000.000000 145494.000000 141320.000000 144019.000000 150000.000000 150000.000000 ... 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000
mean 74999.500000 68349.172873 2.003417e+07 47.129021 8.052733 1.792369 0.375842 0.224943 119.316547 12.597160 ... 0.248204 0.044923 0.124692 0.058144 0.061996 -0.001000 0.009035 0.004813 0.000313 -0.000688
std 43301.414527 61103.875095 5.364988e+04 49.536040 7.864956 1.760640 0.548677 0.417546 177.168419 3.919576 ... 0.045804 0.051743 0.201410 0.029186 0.035692 3.772386 3.286071 2.517478 1.288988 1.038685
min 0.000000 0.000000 1.991000e+07 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.500000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 -9.168192 -5.558207 -9.639552 -4.153899 -6.546556
25% 37499.750000 11156.000000 1.999091e+07 10.000000 1.000000 0.000000 0.000000 0.000000 75.000000 12.500000 ... 0.243615 0.000038 0.062474 0.035334 0.033930 -3.722303 -1.951543 -1.871846 -1.057789 -0.437034
50% 74999.500000 51638.000000 2.003091e+07 30.000000 6.000000 1.000000 0.000000 0.000000 110.000000 15.000000 ... 0.257798 0.000812 0.095866 0.057014 0.058484 1.624076 -0.358053 -0.130753 -0.036245 0.141246
75% 112499.250000 118841.250000 2.007111e+07 66.000000 13.000000 3.000000 1.000000 0.000000 150.000000 15.000000 ... 0.265297 0.102009 0.125243 0.079382 0.087491 2.844357 1.255022 1.776933 0.942813 0.680378
max 149999.000000 196812.000000 2.015121e+07 247.000000 39.000000 7.000000 6.000000 1.000000 19312.000000 15.000000 ... 0.291838 0.151420 1.404936 0.160791 0.222787 12.357011 18.819042 13.847792 11.147669 8.658418

8 rows × 30 columns

Test_data.describe()
SaleID name regDate model brand bodyType fuelType gearbox power kilometer ... v_5 v_6 v_7 v_8 v_9 v_10 v_11 v_12 v_13 v_14
count 50000.000000 50000.000000 5.000000e+04 50000.000000 50000.000000 48587.000000 47107.000000 48090.000000 50000.000000 50000.000000 ... 50000.000000 50000.000000 50000.000000 50000.000000 50000.000000 50000.000000 50000.000000 50000.000000 50000.000000 50000.000000
mean 174999.500000 68542.223280 2.003393e+07 46.844520 8.056240 1.782185 0.373405 0.224350 119.883620 12.595580 ... 0.248669 0.045021 0.122744 0.057997 0.062000 -0.017855 -0.013742 -0.013554 -0.003147 0.001516
std 14433.901067 61052.808133 5.368870e+04 49.469548 7.819477 1.760736 0.546442 0.417158 185.097387 3.908979 ... 0.044601 0.051766 0.195972 0.029211 0.035653 3.747985 3.231258 2.515962 1.286597 1.027360
min 150000.000000 0.000000 1.991000e+07 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.500000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 -9.160049 -5.411964 -8.916949 -4.123333 -6.112667
25% 162499.750000 11203.500000 1.999091e+07 10.000000 1.000000 0.000000 0.000000 0.000000 75.000000 12.500000 ... 0.243762 0.000044 0.062644 0.035084 0.033714 -3.700121 -1.971325 -1.876703 -1.060428 -0.437920
50% 174999.500000 52248.500000 2.003091e+07 29.000000 6.000000 1.000000 0.000000 0.000000 109.000000 15.000000 ... 0.257877 0.000815 0.095828 0.057084 0.058764 1.613212 -0.355843 -0.142779 -0.035956 0.138799
75% 187499.250000 118856.500000 2.007110e+07 65.000000 13.000000 3.000000 1.000000 0.000000 150.000000 15.000000 ... 0.265328 0.102025 0.125438 0.079077 0.087489 2.832708 1.262914 1.764335 0.941469 0.681163
max 199999.000000 196805.000000 2.015121e+07 246.000000 39.000000 7.000000 6.000000 1.000000 20000.000000 15.000000 ... 0.291618 0.153265 1.358813 0.156355 0.214775 12.338872 18.856218 12.950498 5.913273 2.624622

8 rows × 29 columns

## 2) 通过info()来熟悉数据类型
Train_data.info()

RangeIndex: 150000 entries, 0 to 149999
Data columns (total 31 columns):
SaleID               150000 non-null int64
name                 150000 non-null int64
regDate              150000 non-null int64
model                149999 non-null float64
brand                150000 non-null int64
bodyType             145494 non-null float64
fuelType             141320 non-null float64
gearbox              144019 non-null float64
power                150000 non-null int64
kilometer            150000 non-null float64
notRepairedDamage    150000 non-null object
regionCode           150000 non-null int64
seller               150000 non-null int64
offerType            150000 non-null int64
creatDate            150000 non-null int64
price                150000 non-null int64
v_0                  150000 non-null float64
v_1                  150000 non-null float64
v_2                  150000 non-null float64
v_3                  150000 non-null float64
v_4                  150000 non-null float64
v_5                  150000 non-null float64
v_6                  150000 non-null float64
v_7                  150000 non-null float64
v_8                  150000 non-null float64
v_9                  150000 non-null float64
v_10                 150000 non-null float64
v_11                 150000 non-null float64
v_12                 150000 non-null float64
v_13                 150000 non-null float64
v_14                 150000 non-null float64
dtypes: float64(20), int64(10), object(1)
memory usage: 35.5+ MB
Test_data.info()

RangeIndex: 50000 entries, 0 to 49999
Data columns (total 30 columns):
SaleID               50000 non-null int64
name                 50000 non-null int64
regDate              50000 non-null int64
model                50000 non-null float64
brand                50000 non-null int64
bodyType             48587 non-null float64
fuelType             47107 non-null float64
gearbox              48090 non-null float64
power                50000 non-null int64
kilometer            50000 non-null float64
notRepairedDamage    50000 non-null object
regionCode           50000 non-null int64
seller               50000 non-null int64
offerType            50000 non-null int64
creatDate            50000 non-null int64
v_0                  50000 non-null float64
v_1                  50000 non-null float64
v_2                  50000 non-null float64
v_3                  50000 non-null float64
v_4                  50000 non-null float64
v_5                  50000 non-null float64
v_6                  50000 non-null float64
v_7                  50000 non-null float64
v_8                  50000 non-null float64
v_9                  50000 non-null float64
v_10                 50000 non-null float64
v_11                 50000 non-null float64
v_12                 50000 non-null float64
v_13                 50000 non-null float64
v_14                 50000 non-null float64
dtypes: float64(20), int64(9), object(1)
memory usage: 11.4+ MB
## 1) 查看每列的存在nan情况
Train_data.isnull().sum()
SaleID                  0
name                    0
regDate                 0
model                   1
brand                   0
bodyType             4506
fuelType             8680
gearbox              5981
power                   0
kilometer               0
notRepairedDamage       0
regionCode              0
seller                  0
offerType               0
creatDate               0
price                   0
v_0                     0
v_1                     0
v_2                     0
v_3                     0
v_4                     0
v_5                     0
v_6                     0
v_7                     0
v_8                     0
v_9                     0
v_10                    0
v_11                    0
v_12                    0
v_13                    0
v_14                    0
dtype: int64
Test_data.isnull().sum()
SaleID                  0
name                    0
regDate                 0
model                   0
brand                   0
bodyType             1413
fuelType             2893
gearbox              1910
power                   0
kilometer               0
notRepairedDamage       0
regionCode              0
seller                  0
offerType               0
creatDate               0
v_0                     0
v_1                     0
v_2                     0
v_3                     0
v_4                     0
v_5                     0
v_6                     0
v_7                     0
v_8                     0
v_9                     0
v_10                    0
v_11                    0
v_12                    0
v_13                    0
v_14                    0
dtype: int64
# nan可视化
missing = Train_data.isnull().sum()
missing = missing[missing > 0]
missing.sort_values(inplace=True)
missing.plot.bar()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-GXEwRaEw-1585030415044)(output_12_1.png)]

# 可视化看下缺省值
msno.matrix(Train_data.sample(250))

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-driW4Lld-1585030415046)(output_13_1.png)]

msno.bar(Train_data.sample(1000))

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-DevlaRZP-1585030415047)(output_14_1.png)]

# 可视化看下缺省值
msno.matrix(Test_data.sample(250))

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-BpSdpvyr-1585030415047)(output_15_1.png)]

msno.bar(Test_data.sample(1000))

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-qhrwsbks-1585030415048)(output_16_1.png)]

## 2) 查看异常值检测
Train_data.info()

RangeIndex: 150000 entries, 0 to 149999
Data columns (total 31 columns):
SaleID               150000 non-null int64
name                 150000 non-null int64
regDate              150000 non-null int64
model                149999 non-null float64
brand                150000 non-null int64
bodyType             145494 non-null float64
fuelType             141320 non-null float64
gearbox              144019 non-null float64
power                150000 non-null int64
kilometer            150000 non-null float64
notRepairedDamage    150000 non-null object
regionCode           150000 non-null int64
seller               150000 non-null int64
offerType            150000 non-null int64
creatDate            150000 non-null int64
price                150000 non-null int64
v_0                  150000 non-null float64
v_1                  150000 non-null float64
v_2                  150000 non-null float64
v_3                  150000 non-null float64
v_4                  150000 non-null float64
v_5                  150000 non-null float64
v_6                  150000 non-null float64
v_7                  150000 non-null float64
v_8                  150000 non-null float64
v_9                  150000 non-null float64
v_10                 150000 non-null float64
v_11                 150000 non-null float64
v_12                 150000 non-null float64
v_13                 150000 non-null float64
v_14                 150000 non-null float64
dtypes: float64(20), int64(10), object(1)
memory usage: 35.5+ MB
Train_data['notRepairedDamage'].value_counts()
0.0    111361
-       24324
1.0     14315
Name: notRepairedDamage, dtype: int64
Train_data['notRepairedDamage'].replace('-', np.nan, inplace=True)
Train_data['notRepairedDamage'].value_counts()
0.0    111361
1.0     14315
Name: notRepairedDamage, dtype: int64
Train_data.isnull().sum()

SaleID                   0
name                     0
regDate                  0
model                    1
brand                    0
bodyType              4506
fuelType              8680
gearbox               5981
power                    0
kilometer                0
notRepairedDamage    24324
regionCode               0
seller                   0
offerType                0
creatDate                0
price                    0
v_0                      0
v_1                      0
v_2                      0
v_3                      0
v_4                      0
v_5                      0
v_6                      0
v_7                      0
v_8                      0
v_9                      0
v_10                     0
v_11                     0
v_12                     0
v_13                     0
v_14                     0
dtype: int64
Test_data['notRepairedDamage'].value_counts()
0.0    37249
-       8031
1.0     4720
Name: notRepairedDamage, dtype: int64
Test_data['notRepairedDamage'].replace('-', np.nan, inplace=True)
Train_data["seller"].value_counts()
0    149999
1         1
Name: seller, dtype: int64
Train_data["offerType"].value_counts()
0    150000
Name: offerType, dtype: int64
del Train_data["seller"]
del Train_data["offerType"]
del Test_data["seller"]
del Test_data["offerType"]
Train_data['price']
0         1850
1         3600
2         6222
3         2400
4         5200
          ... 
149995    5900
149996    9500
149997    7500
149998    4999
149999    4700
Name: price, Length: 150000, dtype: int64
Train_data['price'].value_counts()
500      2337
1500     2158
1200     1922
1000     1850
2500     1821
         ... 
25321       1
8886        1
8801        1
37920       1
8188        1
Name: price, Length: 3763, dtype: int64
## 1) 总体分布概况(无界约翰逊分布等)
import scipy.stats as st
y = Train_data['price']
plt.figure(1); plt.title('Johnson SU')
sns.distplot(y, kde=False, fit=st.johnsonsu)
plt.figure(2); plt.title('Normal')
sns.distplot(y, kde=False, fit=st.norm)
plt.figure(3); plt.title('Log Normal')
sns.distplot(y, kde=False, fit=st.lognorm)

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-GBJdgFoL-1585030415048)(output_29_1.png)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-HNAWZ1K5-1585030415049)(output_29_2.png)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-WwqRuGgW-1585030415050)(output_29_3.png)]

## 2) 查看skewness and kurtosis
sns.distplot(Train_data['price']);
print("Skewness: %f" % Train_data['price'].skew())
print("Kurtosis: %f" % Train_data['price'].kurt())
Skewness: 3.346487
Kurtosis: 18.995183

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-S4LbJwwN-1585030415050)(output_30_1.png)]

Train_data.skew(), Train_data.kurt()
(SaleID               6.017846e-17
 name                 5.576058e-01
 regDate              2.849508e-02
 model                1.484388e+00
 brand                1.150760e+00
 bodyType             9.915299e-01
 fuelType             1.595486e+00
 gearbox              1.317514e+00
 power                6.586318e+01
 kilometer           -1.525921e+00
 notRepairedDamage    2.430640e+00
 regionCode           6.888812e-01
 creatDate           -7.901331e+01
 price                3.346487e+00
 v_0                 -1.316712e+00
 v_1                  3.594543e-01
 v_2                  4.842556e+00
 v_3                  1.062920e-01
 v_4                  3.679890e-01
 v_5                 -4.737094e+00
 v_6                  3.680730e-01
 v_7                  5.130233e+00
 v_8                  2.046133e-01
 v_9                  4.195007e-01
 v_10                 2.522046e-02
 v_11                 3.029146e+00
 v_12                 3.653576e-01
 v_13                 2.679152e-01
 v_14                -1.186355e+00
 dtype: float64, SaleID                 -1.200000
 name                   -1.039945
 regDate                -0.697308
 model                   1.740483
 brand                   1.076201
 bodyType                0.206937
 fuelType                5.880049
 gearbox                -0.264161
 power                5733.451054
 kilometer               1.141934
 notRepairedDamage       3.908072
 regionCode             -0.340832
 creatDate            6881.080328
 price                  18.995183
 v_0                     3.993841
 v_1                    -1.753017
 v_2                    23.860591
 v_3                    -0.418006
 v_4                    -0.197295
 v_5                    22.934081
 v_6                    -1.742567
 v_7                    25.845489
 v_8                    -0.636225
 v_9                    -0.321491
 v_10                   -0.577935
 v_11                   12.568731
 v_12                    0.268937
 v_13                   -0.438274
 v_14                    2.393526
 dtype: float64)
sns.distplot(Train_data.skew(),color='blue',axlabel ='Skewness')

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-0605nR7W-1585030415051)(output_32_1.png)]

sns.distplot(Train_data.kurt(),color='orange',axlabel ='Kurtness')

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-yptoR4Lm-1585030415051)(output_33_1.png)]

## 3) 查看预测值的具体频数
plt.hist(Train_data['price'], orientation = 'vertical',histtype = 'bar', color ='red')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-mXVgIiQM-1585030415052)(output_34_0.png)]

# log变换 z之后的分布较均匀,可以进行log变换进行预测,这也是预测问题常用的trick
plt.hist(np.log(Train_data['price']), orientation = 'vertical',histtype = 'bar', color ='red') 
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-s8GFqVBd-1585030415052)(output_35_0.png)]

# 分离label即预测值
Y_train = Train_data['price']
numeric_features = ['power', 'kilometer', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13','v_14' ]

categorical_features = ['name', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage', 'regionCode',]
# 特征nunique分布
for cat_fea in categorical_features:
    print(cat_fea + "的特征分布如下:")
    print("{}特征有个{}不同的值".format(cat_fea, Train_data[cat_fea].nunique()))
    print(Train_data[cat_fea].value_counts())
name的特征分布如下:
name特征有个99662不同的值
708       282
387       282
55        280
1541      263
203       233
         ... 
5074        1
7123        1
11221       1
13270       1
174485      1
Name: name, Length: 99662, dtype: int64
model的特征分布如下:
model特征有个248不同的值
0.0      11762
19.0      9573
4.0       8445
1.0       6038
29.0      5186
         ...  
245.0        2
209.0        2
240.0        2
242.0        2
247.0        1
Name: model, Length: 248, dtype: int64
brand的特征分布如下:
brand特征有个40不同的值
0     31480
4     16737
14    16089
10    14249
1     13794
6     10217
9      7306
5      4665
13     3817
11     2945
3      2461
7      2361
16     2223
8      2077
25     2064
27     2053
21     1547
15     1458
19     1388
20     1236
12     1109
22     1085
26      966
30      940
17      913
24      772
28      649
32      592
29      406
37      333
2       321
31      318
18      316
36      228
34      227
33      218
23      186
35      180
38       65
39        9
Name: brand, dtype: int64
bodyType的特征分布如下:
bodyType特征有个8不同的值
0.0    41420
1.0    35272
2.0    30324
3.0    13491
4.0     9609
5.0     7607
6.0     6482
7.0     1289
Name: bodyType, dtype: int64
fuelType的特征分布如下:
fuelType特征有个7不同的值
0.0    91656
1.0    46991
2.0     2212
3.0      262
4.0      118
5.0       45
6.0       36
Name: fuelType, dtype: int64
gearbox的特征分布如下:
gearbox特征有个2不同的值
0.0    111623
1.0     32396
Name: gearbox, dtype: int64
notRepairedDamage的特征分布如下:
notRepairedDamage特征有个2不同的值
0.0    111361
1.0     14315
Name: notRepairedDamage, dtype: int64
regionCode的特征分布如下:
regionCode特征有个7905不同的值
419     369
764     258
125     137
176     136
462     134
       ... 
6414      1
7063      1
4239      1
5931      1
7267      1
Name: regionCode, Length: 7905, dtype: int64
# 特征nunique分布
for cat_fea in categorical_features:
    print(cat_fea + "的特征分布如下:")
    print("{}特征有个{}不同的值".format(cat_fea, Test_data[cat_fea].nunique()))
    print(Test_data[cat_fea].value_counts())
name的特征分布如下:
name特征有个37453不同的值
55       97
708      96
387      95
1541     88
713      74
         ..
22270     1
89855     1
42752     1
48899     1
11808     1
Name: name, Length: 37453, dtype: int64
model的特征分布如下:
model特征有个247不同的值
0.0      3896
19.0     3245
4.0      3007
1.0      1981
29.0     1742
         ... 
242.0       1
240.0       1
244.0       1
243.0       1
246.0       1
Name: model, Length: 247, dtype: int64
brand的特征分布如下:
brand特征有个40不同的值
0     10348
4      5763
14     5314
10     4766
1      4532
6      3502
9      2423
5      1569
13     1245
11      919
7       795
3       773
16      771
8       704
25      695
27      650
21      544
15      511
20      450
19      450
12      389
22      363
30      324
17      317
26      303
24      268
28      225
32      193
29      117
31      115
18      106
2       104
37       92
34       77
33       76
36       67
23       62
35       53
38       23
39        2
Name: brand, dtype: int64
bodyType的特征分布如下:
bodyType特征有个8不同的值
0.0    13985
1.0    11882
2.0     9900
3.0     4433
4.0     3303
5.0     2537
6.0     2116
7.0      431
Name: bodyType, dtype: int64
fuelType的特征分布如下:
fuelType特征有个7不同的值
0.0    30656
1.0    15544
2.0      774
3.0       72
4.0       37
6.0       14
5.0       10
Name: fuelType, dtype: int64
gearbox的特征分布如下:
gearbox特征有个2不同的值
0.0    37301
1.0    10789
Name: gearbox, dtype: int64
notRepairedDamage的特征分布如下:
notRepairedDamage特征有个2不同的值
0.0    37249
1.0     4720
Name: notRepairedDamage, dtype: int64
regionCode的特征分布如下:
regionCode特征有个6971不同的值
419     146
764      78
188      52
125      51
759      51
       ... 
7753      1
7463      1
7230      1
826       1
112       1
Name: regionCode, Length: 6971, dtype: int64
numeric_features.append('price')
numeric_features
['power',
 'kilometer',
 'v_0',
 'v_1',
 'v_2',
 'v_3',
 'v_4',
 'v_5',
 'v_6',
 'v_7',
 'v_8',
 'v_9',
 'v_10',
 'v_11',
 'v_12',
 'v_13',
 'v_14',
 'price']
Train_data.head()
SaleID name regDate model brand bodyType fuelType gearbox power kilometer ... v_5 v_6 v_7 v_8 v_9 v_10 v_11 v_12 v_13 v_14
0 0 736 20040402 30.0 6 1.0 0.0 0.0 60 12.5 ... 0.235676 0.101988 0.129549 0.022816 0.097462 -2.881803 2.804097 -2.420821 0.795292 0.914762
1 1 2262 20030301 40.0 1 2.0 0.0 0.0 0 15.0 ... 0.264777 0.121004 0.135731 0.026597 0.020582 -4.900482 2.096338 -1.030483 -1.722674 0.245522
2 2 14874 20040403 115.0 15 1.0 0.0 0.0 163 12.5 ... 0.251410 0.114912 0.165147 0.062173 0.027075 -4.846749 1.803559 1.565330 -0.832687 -0.229963
3 3 71865 19960908 109.0 10 0.0 0.0 1.0 193 15.0 ... 0.274293 0.110300 0.121964 0.033395 0.000000 -4.509599 1.285940 -0.501868 -2.438353 -0.478699
4 4 111080 20120103 110.0 5 1.0 0.0 0.0 68 5.0 ... 0.228036 0.073205 0.091880 0.078819 0.121534 -1.896240 0.910783 0.931110 2.834518 1.923482

5 rows × 29 columns

## 1) 相关性分析
price_numeric = Train_data[numeric_features]
correlation = price_numeric.corr()
print(correlation['price'].sort_values(ascending = False),'\n')
price        1.000000
v_12         0.692823
v_8          0.685798
v_0          0.628397
power        0.219834
v_5          0.164317
v_2          0.085322
v_6          0.068970
v_1          0.060914
v_14         0.035911
v_13        -0.013993
v_7         -0.053024
v_4         -0.147085
v_9         -0.206205
v_10        -0.246175
v_11        -0.275320
kilometer   -0.440519
v_3         -0.730946
Name: price, dtype: float64 
f , ax = plt.subplots(figsize = (7, 7))

plt.title('Correlation of Numeric Features with Price',y=1,size=16)

sns.heatmap(correlation,square = True,  vmax=0.8)

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-QeH1wkm6-1585030415053)(output_44_1.png)]

del price_numeric['price']
## 2) 查看几个特征得 偏度和峰值
for col in numeric_features:
    print('{:15}'.format(col), 
          'Skewness: {:05.2f}'.format(Train_data[col].skew()) , 
          '   ' ,
          'Kurtosis: {:06.2f}'.format(Train_data[col].kurt())  
         )
power           Skewness: 65.86     Kurtosis: 5733.45
kilometer       Skewness: -1.53     Kurtosis: 001.14
v_0             Skewness: -1.32     Kurtosis: 003.99
v_1             Skewness: 00.36     Kurtosis: -01.75
v_2             Skewness: 04.84     Kurtosis: 023.86
v_3             Skewness: 00.11     Kurtosis: -00.42
v_4             Skewness: 00.37     Kurtosis: -00.20
v_5             Skewness: -4.74     Kurtosis: 022.93
v_6             Skewness: 00.37     Kurtosis: -01.74
v_7             Skewness: 05.13     Kurtosis: 025.85
v_8             Skewness: 00.20     Kurtosis: -00.64
v_9             Skewness: 00.42     Kurtosis: -00.32
v_10            Skewness: 00.03     Kurtosis: -00.58
v_11            Skewness: 03.03     Kurtosis: 012.57
v_12            Skewness: 00.37     Kurtosis: 000.27
v_13            Skewness: 00.27     Kurtosis: -00.44
v_14            Skewness: -1.19     Kurtosis: 002.39
price           Skewness: 03.35     Kurtosis: 019.00
## 3) 每个数字特征得分布可视化
f = pd.melt(Train_data, value_vars=numeric_features)
g = sns.FacetGrid(f, col="variable",  col_wrap=2, sharex=False, sharey=False)
g = g.map(sns.distplot, "value")

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-vDfas4is-1585030415054)(output_47_0.png)]

## 4) 数字特征相互之间的关系可视化
sns.set()
columns = ['price', 'v_12', 'v_8' , 'v_0', 'power', 'v_5',  'v_2', 'v_6', 'v_1', 'v_14']
sns.pairplot(Train_data[columns],size = 2 ,kind ='scatter',diag_kind='kde')
plt.show()
Matplotlib is building the font cache using fc-list. This may take a moment.

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-zJNdOBqu-1585030415054)(output_48_1.png)]

Train_data.columns
Index(['SaleID', 'name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType',
       'gearbox', 'power', 'kilometer', 'notRepairedDamage', 'regionCode',
       'creatDate', 'price', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6',
       'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14'],
      dtype='object')
Y_train
0         1850
1         3600
2         6222
3         2400
4         5200
          ... 
149995    5900
149996    9500
149997    7500
149998    4999
149999    4700
Name: price, Length: 150000, dtype: int64
## 5) 多变量互相回归关系可视化
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8), (ax9, ax10)) = plt.subplots(nrows=5, ncols=2, figsize=(24, 20))
# ['v_12', 'v_8' , 'v_0', 'power', 'v_5',  'v_2', 'v_6', 'v_1', 'v_14']
v_12_scatter_plot = pd.concat([Y_train,Train_data['v_12']],axis = 1)
sns.regplot(x='v_12',y = 'price', data = v_12_scatter_plot,scatter= True, fit_reg=True, ax=ax1)

v_8_scatter_plot = pd.concat([Y_train,Train_data['v_8']],axis = 1)
sns.regplot(x='v_8',y = 'price',data = v_8_scatter_plot,scatter= True, fit_reg=True, ax=ax2)

v_0_scatter_plot = pd.concat([Y_train,Train_data['v_0']],axis = 1)
sns.regplot(x='v_0',y = 'price',data = v_0_scatter_plot,scatter= True, fit_reg=True, ax=ax3)

power_scatter_plot = pd.concat([Y_train,Train_data['power']],axis = 1)
sns.regplot(x='power',y = 'price',data = power_scatter_plot,scatter= True, fit_reg=True, ax=ax4)

v_5_scatter_plot = pd.concat([Y_train,Train_data['v_5']],axis = 1)
sns.regplot(x='v_5',y = 'price',data = v_5_scatter_plot,scatter= True, fit_reg=True, ax=ax5)

v_2_scatter_plot = pd.concat([Y_train,Train_data['v_2']],axis = 1)
sns.regplot(x='v_2',y = 'price',data = v_2_scatter_plot,scatter= True, fit_reg=True, ax=ax6)

v_6_scatter_plot = pd.concat([Y_train,Train_data['v_6']],axis = 1)
sns.regplot(x='v_6',y = 'price',data = v_6_scatter_plot,scatter= True, fit_reg=True, ax=ax7)

v_1_scatter_plot = pd.concat([Y_train,Train_data['v_1']],axis = 1)
sns.regplot(x='v_1',y = 'price',data = v_1_scatter_plot,scatter= True, fit_reg=True, ax=ax8)

v_14_scatter_plot = pd.concat([Y_train,Train_data['v_14']],axis = 1)
sns.regplot(x='v_14',y = 'price',data = v_14_scatter_plot,scatter= True, fit_reg=True, ax=ax9)

v_13_scatter_plot = pd.concat([Y_train,Train_data['v_13']],axis = 1)
sns.regplot(x='v_13',y = 'price',data = v_13_scatter_plot,scatter= True, fit_reg=True, ax=ax10)


[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-42C0YSh0-1585030415055)(output_51_1.png)]

## 1) unique分布
for fea in categorical_features:
    print(Train_data[fea].nunique())
99662
248
40
8
7
2
2
7905
categorical_features
['name',
 'model',
 'brand',
 'bodyType',
 'fuelType',
 'gearbox',
 'notRepairedDamage',
 'regionCode']
## 2) 类别特征箱形图可视化

# 因为 name和 regionCode的类别太稀疏了,这里我们把不稀疏的几类画一下
categorical_features = ['model',
 'brand',
 'bodyType',
 'fuelType',
 'gearbox',
 'notRepairedDamage']
for c in categorical_features:
    Train_data[c] = Train_data[c].astype('category')
    if Train_data[c].isnull().any():
        Train_data[c] = Train_data[c].cat.add_categories(['MISSING'])
        Train_data[c] = Train_data[c].fillna('MISSING')

def boxplot(x, y, **kwargs):
    sns.boxplot(x=x, y=y)
    x=plt.xticks(rotation=90)

f = pd.melt(Train_data, id_vars=['price'], value_vars=categorical_features)
g = sns.FacetGrid(f, col="variable",  col_wrap=2, sharex=False, sharey=False, size=5)
g = g.map(boxplot, "value", "price")

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-nAoZcnXO-1585030415056)(output_54_0.png)]

Train_data.columns
Index(['SaleID', 'name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType',
       'gearbox', 'power', 'kilometer', 'notRepairedDamage', 'regionCode',
       'creatDate', 'price', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6',
       'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14'],
      dtype='object')
## 3) 类别特征的小提琴图可视化
catg_list = categorical_features
target = 'price'
for catg in catg_list :
    sns.violinplot(x=catg, y=target, data=Train_data)
    plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-OJP32fGN-1585030415056)(output_56_0.png)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-KuVCutEG-1585030415057)(output_56_1.png)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-qYYZRJpS-1585030415057)(output_56_2.png)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-77xnKQpU-1585030415057)(output_56_3.png)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-2xjBsUd6-1585030415058)(output_56_4.png)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-ZWk4z5qc-1585030415058)(output_56_5.png)]

categorical_features = ['model',
 'brand',
 'bodyType',
 'fuelType',
 'gearbox',
 'notRepairedDamage']
## 4) 类别特征的柱形图可视化
def bar_plot(x, y, **kwargs):
    sns.barplot(x=x, y=y)
    x=plt.xticks(rotation=90)

f = pd.melt(Train_data, id_vars=['price'], value_vars=categorical_features)
g = sns.FacetGrid(f, col="variable",  col_wrap=2, sharex=False, sharey=False, size=5)
g = g.map(bar_plot, "value", "price")

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-XavL2t13-1585030415059)(output_58_0.png)]

pip install pandas_profiling -i https://pypi.tuna.tsinghua.edu.cn/simple
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting pandas_profiling
[?25l  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/e8/b0/bd5e3aaf37302fbe581b6947dc5ec1cda02a0ffe50fc823123def73e4d7a/pandas-profiling-2.5.0.tar.gz (192kB)
[K     |████████████████████████████████| 194kB 938kB/s eta 0:00:01
[?25hRequirement already satisfied: numpy>=1.16.0 in ./opt/anaconda3/lib/python3.7/site-packages (from pandas_profiling) (1.17.2)
Collecting scipy>=1.4.1 (from pandas_profiling)
[?25l  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/85/7a/ae480be23b768910a9327c33517ced4623ba88dc035f9ce0206657c353a9/scipy-1.4.1-cp37-cp37m-macosx_10_6_intel.whl (28.4MB)
[K     |████████████████████████████████| 28.4MB 6.3MB/s eta 0:00:01
[?25hCollecting pandas==0.25.3 (from pandas_profiling)
[33m  WARNING: Retrying (Retry(total=4, connect=None, read=None, redirect=None, status=None)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='pypi.tuna.tsinghua.edu.cn', port=443): Read timed out. (read timeout=15)")': /simple/pandas/[0m
[?25l  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/16/b5/bab3477466a4d9e705d40829ac65683155e7977acbc07f05b06fabded1be/pandas-0.25.3-cp37-cp37m-macosx_10_9_x86_64.whl (10.2MB)
[K     |████████████████████████████████| 10.2MB 5.6MB/s eta 0:00:01
[?25hRequirement already satisfied: matplotlib>=3.0.3 in ./opt/anaconda3/lib/python3.7/site-packages (from pandas_profiling) (3.1.1)
Collecting confuse==1.0.0 (from pandas_profiling)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/4c/6f/90e860cba937c174d8b3775729ccc6377eb91f52ad4eeb008e7252a3646d/confuse-1.0.0.tar.gz
Collecting jinja2==2.11.1 (from pandas_profiling)
[?25l  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/27/24/4f35961e5c669e96f6559760042a55b9bcfcdb82b9bdb3c8753dbe042e35/Jinja2-2.11.1-py2.py3-none-any.whl (126kB)
[K     |████████████████████████████████| 133kB 1.3MB/s eta 0:00:01
[?25hCollecting visions==0.2.2 (from pandas_profiling)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/07/4a/ab37f8bafda516b66c4f475b221a6c170097c0db203750a4aafb01023339/visions-0.2.2.tar.gz
Collecting htmlmin==0.1.12 (from pandas_profiling)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/b3/e7/fcd59e12169de19f0131ff2812077f964c6b960e7c09804d30a7bf2ab461/htmlmin-0.1.12.tar.gz
Requirement already satisfied: missingno==0.4.2 in ./opt/anaconda3/lib/python3.7/site-packages (from pandas_profiling) (0.4.2)
Collecting phik==0.9.9 (from pandas_profiling)
[?25l  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/03/cf/b8cef2778104dc5d319f36dd836efaceb07a037cbf63f27c966b5a193ce9/phik-0.9.9-py3-none-any.whl (607kB)
[K     |████████████████████████████████| 614kB 1.1MB/s eta 0:00:01
[?25hCollecting astropy>=3.2.3 (from pandas_profiling)
[?25l  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/0a/35/36310d2430c9eb8ffa4df4cb5140dc8b51796bbfabc731735832b71a0cf0/astropy-4.0-cp37-cp37m-macosx_10_6_intel.whl (8.3MB)
[K     |████████████████████████████████| 8.3MB 75kB/s eta 0:00:011
[?25hCollecting tangled-up-in-unicode==0.0.3 (from pandas_profiling)
[?25l  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/43/fc/e3c970c5007b405827a4623e70fc4eb966ee49bc1edbd56c33e85e1c6534/tangled_up_in_unicode-0.0.3.tar.gz (1.5MB)
[K     |████████████████████████████████| 1.5MB 6.4MB/s eta 0:00:01
[?25hCollecting tqdm==4.42.0 (from pandas_profiling)
[?25l  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/cc/2e/4307206db63f05ed37e21d4c0d843d0fbcacd62479f8ce99ba0f2c0875e0/tqdm-4.42.0-py2.py3-none-any.whl (59kB)
[K     |████████████████████████████████| 61kB 11.6MB/s eta 0:00:01
[?25hCollecting kaggle==1.5.6 (from pandas_profiling)
[?25l  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/62/ab/bb20f9b9e24f9a6250f95a432f8d9a7d745f8d24039d7a5a6eaadb7783ba/kaggle-1.5.6.tar.gz (58kB)
[K     |████████████████████████████████| 61kB 10.3MB/s eta 0:00:01
[?25hRequirement already satisfied: ipywidgets==7.5.1 in ./opt/anaconda3/lib/python3.7/site-packages (from pandas_profiling) (7.5.1)
Requirement already satisfied: requests==2.22.0 in ./opt/anaconda3/lib/python3.7/site-packages (from pandas_profiling) (2.22.0)
Requirement already satisfied: pytz>=2017.2 in ./opt/anaconda3/lib/python3.7/site-packages (from pandas==0.25.3->pandas_profiling) (2019.3)
Requirement already satisfied: python-dateutil>=2.6.1 in ./opt/anaconda3/lib/python3.7/site-packages (from pandas==0.25.3->pandas_profiling) (2.8.0)
Requirement already satisfied: cycler>=0.10 in ./opt/anaconda3/lib/python3.7/site-packages (from matplotlib>=3.0.3->pandas_profiling) (0.10.0)
Requirement already satisfied: kiwisolver>=1.0.1 in ./opt/anaconda3/lib/python3.7/site-packages (from matplotlib>=3.0.3->pandas_profiling) (1.1.0)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in ./opt/anaconda3/lib/python3.7/site-packages (from matplotlib>=3.0.3->pandas_profiling) (2.4.2)
Requirement already satisfied: pyyaml in ./opt/anaconda3/lib/python3.7/site-packages (from confuse==1.0.0->pandas_profiling) (5.1.2)
Requirement already satisfied: MarkupSafe>=0.23 in ./opt/anaconda3/lib/python3.7/site-packages (from jinja2==2.11.1->pandas_profiling) (1.1.1)
Requirement already satisfied: networkx in ./opt/anaconda3/lib/python3.7/site-packages (from visions==0.2.2->pandas_profiling) (2.3)
Collecting attr (from visions==0.2.2->pandas_profiling)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/de/be/ddc7f84d4e087144472a38a373d3e319f51a6faf6e5fc1ae897173675f21/attr-0.3.1.tar.gz
Requirement already satisfied: seaborn in ./opt/anaconda3/lib/python3.7/site-packages (from missingno==0.4.2->pandas_profiling) (0.9.0)
Collecting joblib>=0.14.1 (from phik==0.9.9->pandas_profiling)
[?25l  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/28/5c/cf6a2b65a321c4a209efcdf64c2689efae2cb62661f8f6f4bb28547cf1bf/joblib-0.14.1-py2.py3-none-any.whl (294kB)
[K     |████████████████████████████████| 296kB 5.9MB/s eta 0:00:01
[?25hRequirement already satisfied: pytest>=4.0.2 in ./opt/anaconda3/lib/python3.7/site-packages (from phik==0.9.9->pandas_profiling) (5.2.1)
Requirement already satisfied: nbconvert>=5.3.1 in ./opt/anaconda3/lib/python3.7/site-packages (from phik==0.9.9->pandas_profiling) (5.6.0)
Requirement already satisfied: numba>=0.38.1 in ./opt/anaconda3/lib/python3.7/site-packages (from phik==0.9.9->pandas_profiling) (0.45.1)
Collecting pytest-pylint>=0.13.0 (from phik==0.9.9->pandas_profiling)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/31/ef/e848f832a596a8a40b32d8aa169788b4df167c2d6a5960c01e83a30ebaa7/pytest_pylint-0.15.1-py3-none-any.whl
Requirement already satisfied: jupyter-client>=5.2.3 in ./opt/anaconda3/lib/python3.7/site-packages (from phik==0.9.9->pandas_profiling) (5.3.3)
Requirement already satisfied: urllib3<1.25,>=1.21.1 in ./opt/anaconda3/lib/python3.7/site-packages (from kaggle==1.5.6->pandas_profiling) (1.24.2)
Requirement already satisfied: six>=1.10 in ./opt/anaconda3/lib/python3.7/site-packages (from kaggle==1.5.6->pandas_profiling) (1.12.0)
Requirement already satisfied: certifi in ./opt/anaconda3/lib/python3.7/site-packages (from kaggle==1.5.6->pandas_profiling) (2019.9.11)
Collecting python-slugify (from kaggle==1.5.6->pandas_profiling)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/92/5f/7b84a0bba8a0fdd50c046f8b57dcf179dc16237ad33446079b7c484de04c/python-slugify-4.0.0.tar.gz
Requirement already satisfied: ipython>=4.0.0; python_version >= "3.3" in ./opt/anaconda3/lib/python3.7/site-packages (from ipywidgets==7.5.1->pandas_profiling) (7.8.0)
Requirement already satisfied: widgetsnbextension~=3.5.0 in ./opt/anaconda3/lib/python3.7/site-packages (from ipywidgets==7.5.1->pandas_profiling) (3.5.1)
Requirement already satisfied: nbformat>=4.2.0 in ./opt/anaconda3/lib/python3.7/site-packages (from ipywidgets==7.5.1->pandas_profiling) (4.4.0)
Requirement already satisfied: ipykernel>=4.5.1 in ./opt/anaconda3/lib/python3.7/site-packages (from ipywidgets==7.5.1->pandas_profiling) (5.1.2)
Requirement already satisfied: traitlets>=4.3.1 in ./opt/anaconda3/lib/python3.7/site-packages (from ipywidgets==7.5.1->pandas_profiling) (4.3.3)
Requirement already satisfied: idna<2.9,>=2.5 in ./opt/anaconda3/lib/python3.7/site-packages (from requests==2.22.0->pandas_profiling) (2.8)
Requirement already satisfied: chardet<3.1.0,>=3.0.2 in ./opt/anaconda3/lib/python3.7/site-packages (from requests==2.22.0->pandas_profiling) (3.0.4)
Requirement already satisfied: setuptools in ./opt/anaconda3/lib/python3.7/site-packages (from kiwisolver>=1.0.1->matplotlib>=3.0.3->pandas_profiling) (41.4.0)
Requirement already satisfied: decorator>=4.3.0 in ./opt/anaconda3/lib/python3.7/site-packages (from networkx->visions==0.2.2->pandas_profiling) (4.4.0)
Requirement already satisfied: py>=1.5.0 in ./opt/anaconda3/lib/python3.7/site-packages (from pytest>=4.0.2->phik==0.9.9->pandas_profiling) (1.8.0)
Requirement already satisfied: packaging in ./opt/anaconda3/lib/python3.7/site-packages (from pytest>=4.0.2->phik==0.9.9->pandas_profiling) (19.2)
Requirement already satisfied: attrs>=17.4.0 in ./opt/anaconda3/lib/python3.7/site-packages (from pytest>=4.0.2->phik==0.9.9->pandas_profiling) (19.2.0)
Requirement already satisfied: more-itertools>=4.0.0 in ./opt/anaconda3/lib/python3.7/site-packages (from pytest>=4.0.2->phik==0.9.9->pandas_profiling) (7.2.0)
Requirement already satisfied: atomicwrites>=1.0 in ./opt/anaconda3/lib/python3.7/site-packages (from pytest>=4.0.2->phik==0.9.9->pandas_profiling) (1.3.0)
Requirement already satisfied: pluggy<1.0,>=0.12 in ./opt/anaconda3/lib/python3.7/site-packages (from pytest>=4.0.2->phik==0.9.9->pandas_profiling) (0.13.0)
Requirement already satisfied: wcwidth in ./opt/anaconda3/lib/python3.7/site-packages (from pytest>=4.0.2->phik==0.9.9->pandas_profiling) (0.1.7)
Requirement already satisfied: importlib-metadata>=0.12 in ./opt/anaconda3/lib/python3.7/site-packages (from pytest>=4.0.2->phik==0.9.9->pandas_profiling) (0.23)
Requirement already satisfied: testpath in ./opt/anaconda3/lib/python3.7/site-packages (from nbconvert>=5.3.1->phik==0.9.9->pandas_profiling) (0.4.2)
Requirement already satisfied: defusedxml in ./opt/anaconda3/lib/python3.7/site-packages (from nbconvert>=5.3.1->phik==0.9.9->pandas_profiling) (0.6.0)
Requirement already satisfied: pygments in ./opt/anaconda3/lib/python3.7/site-packages (from nbconvert>=5.3.1->phik==0.9.9->pandas_profiling) (2.4.2)
Requirement already satisfied: pandocfilters>=1.4.1 in ./opt/anaconda3/lib/python3.7/site-packages (from nbconvert>=5.3.1->phik==0.9.9->pandas_profiling) (1.4.2)
Requirement already satisfied: jupyter-core in ./opt/anaconda3/lib/python3.7/site-packages (from nbconvert>=5.3.1->phik==0.9.9->pandas_profiling) (4.5.0)
Requirement already satisfied: mistune<2,>=0.8.1 in ./opt/anaconda3/lib/python3.7/site-packages (from nbconvert>=5.3.1->phik==0.9.9->pandas_profiling) (0.8.4)
Requirement already satisfied: entrypoints>=0.2.2 in ./opt/anaconda3/lib/python3.7/site-packages (from nbconvert>=5.3.1->phik==0.9.9->pandas_profiling) (0.3)
Requirement already satisfied: bleach in ./opt/anaconda3/lib/python3.7/site-packages (from nbconvert>=5.3.1->phik==0.9.9->pandas_profiling) (3.1.0)
Requirement already satisfied: llvmlite>=0.29.0dev0 in ./opt/anaconda3/lib/python3.7/site-packages (from numba>=0.38.1->phik==0.9.9->pandas_profiling) (0.29.0)
Requirement already satisfied: pylint>=2.0.0 in ./opt/anaconda3/lib/python3.7/site-packages (from pytest-pylint>=0.13.0->phik==0.9.9->pandas_profiling) (2.4.2)
Requirement already satisfied: pyzmq>=13 in ./opt/anaconda3/lib/python3.7/site-packages (from jupyter-client>=5.2.3->phik==0.9.9->pandas_profiling) (18.1.0)
Requirement already satisfied: tornado>=4.1 in ./opt/anaconda3/lib/python3.7/site-packages (from jupyter-client>=5.2.3->phik==0.9.9->pandas_profiling) (6.0.3)
Collecting text-unidecode>=1.3 (from python-slugify->kaggle==1.5.6->pandas_profiling)
[?25l  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/a6/a5/c0b6468d3824fe3fde30dbb5e1f687b291608f9473681bbf7dabbf5a87d7/text_unidecode-1.3-py2.py3-none-any.whl (78kB)
[K     |████████████████████████████████| 81kB 10kB/s  eta 0:00:011
[?25hRequirement already satisfied: prompt-toolkit<2.1.0,>=2.0.0 in ./opt/anaconda3/lib/python3.7/site-packages (from ipython>=4.0.0; python_version >= "3.3"->ipywidgets==7.5.1->pandas_profiling) (2.0.10)
Requirement already satisfied: pickleshare in ./opt/anaconda3/lib/python3.7/site-packages (from ipython>=4.0.0; python_version >= "3.3"->ipywidgets==7.5.1->pandas_profiling) (0.7.5)
Requirement already satisfied: backcall in ./opt/anaconda3/lib/python3.7/site-packages (from ipython>=4.0.0; python_version >= "3.3"->ipywidgets==7.5.1->pandas_profiling) (0.1.0)
Requirement already satisfied: appnope; sys_platform == "darwin" in ./opt/anaconda3/lib/python3.7/site-packages (from ipython>=4.0.0; python_version >= "3.3"->ipywidgets==7.5.1->pandas_profiling) (0.1.0)
Requirement already satisfied: jedi>=0.10 in ./opt/anaconda3/lib/python3.7/site-packages (from ipython>=4.0.0; python_version >= "3.3"->ipywidgets==7.5.1->pandas_profiling) (0.15.1)
Requirement already satisfied: pexpect; sys_platform != "win32" in ./opt/anaconda3/lib/python3.7/site-packages (from ipython>=4.0.0; python_version >= "3.3"->ipywidgets==7.5.1->pandas_profiling) (4.7.0)
Requirement already satisfied: notebook>=4.4.1 in ./opt/anaconda3/lib/python3.7/site-packages (from widgetsnbextension~=3.5.0->ipywidgets==7.5.1->pandas_profiling) (6.0.1)
Requirement already satisfied: ipython-genutils in ./opt/anaconda3/lib/python3.7/site-packages (from nbformat>=4.2.0->ipywidgets==7.5.1->pandas_profiling) (0.2.0)
Requirement already satisfied: jsonschema!=2.5.0,>=2.4 in ./opt/anaconda3/lib/python3.7/site-packages (from nbformat>=4.2.0->ipywidgets==7.5.1->pandas_profiling) (3.0.2)
Requirement already satisfied: zipp>=0.5 in ./opt/anaconda3/lib/python3.7/site-packages (from importlib-metadata>=0.12->pytest>=4.0.2->phik==0.9.9->pandas_profiling) (0.6.0)
Requirement already satisfied: webencodings in ./opt/anaconda3/lib/python3.7/site-packages (from bleach->nbconvert>=5.3.1->phik==0.9.9->pandas_profiling) (0.5.1)
Requirement already satisfied: isort<5,>=4.2.5 in ./opt/anaconda3/lib/python3.7/site-packages (from pylint>=2.0.0->pytest-pylint>=0.13.0->phik==0.9.9->pandas_profiling) (4.3.21)
Requirement already satisfied: mccabe<0.7,>=0.6 in ./opt/anaconda3/lib/python3.7/site-packages (from pylint>=2.0.0->pytest-pylint>=0.13.0->phik==0.9.9->pandas_profiling) (0.6.1)
Requirement already satisfied: astroid<2.4,>=2.3.0 in ./opt/anaconda3/lib/python3.7/site-packages (from pylint>=2.0.0->pytest-pylint>=0.13.0->phik==0.9.9->pandas_profiling) (2.3.1)
Requirement already satisfied: parso>=0.5.0 in ./opt/anaconda3/lib/python3.7/site-packages (from jedi>=0.10->ipython>=4.0.0; python_version >= "3.3"->ipywidgets==7.5.1->pandas_profiling) (0.5.1)
Requirement already satisfied: ptyprocess>=0.5 in ./opt/anaconda3/lib/python3.7/site-packages (from pexpect; sys_platform != "win32"->ipython>=4.0.0; python_version >= "3.3"->ipywidgets==7.5.1->pandas_profiling) (0.6.0)
Requirement already satisfied: Send2Trash in ./opt/anaconda3/lib/python3.7/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets==7.5.1->pandas_profiling) (1.5.0)
Requirement already satisfied: terminado>=0.8.1 in ./opt/anaconda3/lib/python3.7/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets==7.5.1->pandas_profiling) (0.8.2)
Requirement already satisfied: prometheus-client in ./opt/anaconda3/lib/python3.7/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets==7.5.1->pandas_profiling) (0.7.1)
Requirement already satisfied: pyrsistent>=0.14.0 in ./opt/anaconda3/lib/python3.7/site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets==7.5.1->pandas_profiling) (0.15.4)
Requirement already satisfied: lazy-object-proxy==1.4.* in ./opt/anaconda3/lib/python3.7/site-packages (from astroid<2.4,>=2.3.0->pylint>=2.0.0->pytest-pylint>=0.13.0->phik==0.9.9->pandas_profiling) (1.4.2)
Requirement already satisfied: wrapt==1.11.* in ./opt/anaconda3/lib/python3.7/site-packages (from astroid<2.4,>=2.3.0->pylint>=2.0.0->pytest-pylint>=0.13.0->phik==0.9.9->pandas_profiling) (1.11.2)
Collecting typed-ast<1.5,>=1.4.0; implementation_name == "cpython" and python_version < "3.8" (from astroid<2.4,>=2.3.0->pylint>=2.0.0->pytest-pylint>=0.13.0->phik==0.9.9->pandas_profiling)
[?25l  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/0c/f0/c351d6aeb7f5ba00da2e5aac43ef09b71be95aed45bfca943a5016854269/typed_ast-1.4.1-cp37-cp37m-macosx_10_9_x86_64.whl (223kB)
[K     |████████████████████████████████| 225kB 3.5kB/s eta 0:00:03
[?25hBuilding wheels for collected packages: pandas-profiling, confuse, visions, htmlmin, tangled-up-in-unicode, kaggle, attr, python-slugify
  Building wheel for pandas-profiling (setup.py) ... [?25ldone
[?25h  Created wheel for pandas-profiling: filename=pandas_profiling-2.5.0-py2.py3-none-any.whl size=241328 sha256=1e4aaf2446bd27621631ea7fd76937b7e82033bb11441b13ac7105de48050b98
  Stored in directory: /Users/dingyunfei/Library/Caches/pip/wheels/32/e8/98/e0f16fe7ea5db400f066377442575e0b643e30932eb81d305a
  Building wheel for confuse (setup.py) ... [?25ldone
[?25h  Created wheel for confuse: filename=confuse-1.0.0-cp37-none-any.whl size=17486 sha256=b5dfb2775412e4ebca5d2715524428f539894d5b8a73fb28e84809201855d9cb
  Stored in directory: /Users/dingyunfei/Library/Caches/pip/wheels/2e/29/13/3d3b672a7e69a0632566a0eb52bf2b298bb47fcacfde2347e7
  Building wheel for visions (setup.py) ... [?25ldone
[?25h  Created wheel for visions: filename=visions-0.2.2-cp37-none-any.whl size=53059 sha256=430188988bb1a552e04fd4eba8358085611d9ce03da9c2ebd783086952ba1207
  Stored in directory: /Users/dingyunfei/Library/Caches/pip/wheels/30/43/c7/22141fede13513944b7fb3badf662138b0e6ed65940954cb40
  Building wheel for htmlmin (setup.py) ... [?25ldone
[?25h  Created wheel for htmlmin: filename=htmlmin-0.1.12-cp37-none-any.whl size=27086 sha256=f25592857e0db5ed5d523cce439ffc10459e4da00c28b92b70c93949a6bbda82
  Stored in directory: /Users/dingyunfei/Library/Caches/pip/wheels/c6/df/37/df6d2781e8f353ba10238cb793f8ae645a06daecb84a8984e5
  Building wheel for tangled-up-in-unicode (setup.py) ... [?25ldone
[?25h  Created wheel for tangled-up-in-unicode: filename=tangled_up_in_unicode-0.0.3-cp37-none-any.whl size=1554154 sha256=3c71700144eaed3f16d7530f137fb6b58c84509dcea3727c48fead4d02265c28
  Stored in directory: /Users/dingyunfei/Library/Caches/pip/wheels/8c/7d/f8/096ecd4e41dcc1c000ce4ca3cb2a28d6c003d6a2f103e5fc1d
  Building wheel for kaggle (setup.py) ... [?25ldone
[?25h  Created wheel for kaggle: filename=kaggle-1.5.6-cp37-none-any.whl size=72859 sha256=09235edf86d67881a2facb5e98df06bbc82331cede2bedd1df5406c117816a99
  Stored in directory: /Users/dingyunfei/Library/Caches/pip/wheels/9e/24/2c/3f85e90237bf54335b87f3b3bc1bd48e51cd3b414d58b0a3ee
  Building wheel for attr (setup.py) ... [?25ldone
[?25h  Created wheel for attr: filename=attr-0.3.1-cp37-none-any.whl size=2459 sha256=d8f6650725de9ae5f8d552de4122a2bd4b7590e7a2de388812240a130394198b
  Stored in directory: /Users/dingyunfei/Library/Caches/pip/wheels/bc/f3/53/f03c38cdbbef30b91d7067f9c18ce724bc4d06b1aa0e30613c
  Building wheel for python-slugify (setup.py) ... [?25ldone
[?25h  Created wheel for python-slugify: filename=python_slugify-4.0.0-py2.py3-none-any.whl size=5487 sha256=9ed93d119fb92a381716f355933dc289b24e87ea02b03867d1f20624ec3b440c
  Stored in directory: /Users/dingyunfei/Library/Caches/pip/wheels/b5/13/2d/010f4b6155525d36203303a3e5617d2f4bf786aabe5040c7b9
Successfully built pandas-profiling confuse visions htmlmin tangled-up-in-unicode kaggle attr python-slugify
Installing collected packages: scipy, pandas, confuse, jinja2, tangled-up-in-unicode, attr, visions, htmlmin, joblib, pytest-pylint, phik, astropy, tqdm, text-unidecode, python-slugify, kaggle, pandas-profiling, typed-ast
  Found existing installation: scipy 1.3.1
    Uninstalling scipy-1.3.1:
      Successfully uninstalled scipy-1.3.1
  Found existing installation: pandas 0.25.1
    Uninstalling pandas-0.25.1:
      Successfully uninstalled pandas-0.25.1
  Found existing installation: Jinja2 2.10.3
    Uninstalling Jinja2-2.10.3:
      Successfully uninstalled Jinja2-2.10.3
  Found existing installation: joblib 0.13.2
    Uninstalling joblib-0.13.2:
      Successfully uninstalled joblib-0.13.2
  Found existing installation: astropy 3.2.2
    Uninstalling astropy-3.2.2:
      Successfully uninstalled astropy-3.2.2
  Found existing installation: tqdm 4.36.1
    Uninstalling tqdm-4.36.1:
      Successfully uninstalled tqdm-4.36.1
Successfully installed astropy-4.0 attr-0.3.1 confuse-1.0.0 htmlmin-0.1.12 jinja2-2.11.1 joblib-0.14.1 kaggle-1.5.6 pandas-0.25.3 pandas-profiling-2.5.0 phik-0.9.9 pytest-pylint-0.15.1 python-slugify-4.0.0 scipy-1.4.1 tangled-up-in-unicode-0.0.3 text-unidecode-1.3 tqdm-4.42.0 typed-ast-1.4.1 visions-0.2.2
Note: you may need to restart the kernel to use updated packages.
import pandas_profiling
pfr = pandas_profiling.ProfileReport(Train_data)
pfr.to_file("./example.html")
HBox(children=(FloatProgress(value=0.0, description='variables', max=29.0, style=ProgressStyle(description_wid…

你可能感兴趣的:(数据挖掘)