import pandas as pd
# Load the used-car training set. Fields in the file are separated by single
# spaces, and the first column is used as the row index.
df = pd.read_csv(
    "used_car_train_20200313.csv",
    sep=" ",
    index_col=0,
)
# Bare expression: in a notebook cell this renders the loaded frame.
df
name regDate model brand bodyType fuelType gearbox power kilometer notRepairedDamage ... v_5 v_6 v_7 v_8 v_9 v_10 v_11 v_12 v_13 v_14
SaleID
0 736 20040402 30.0 6 1.0 0.0 0.0 60 12.5 0.0 ... 0.235676 0.101988 0.129549 0.022816 0.097462 -2.881803 2.804097 -2.420821 0.795292 0.914762
1 2262 20030301 40.0 1 2.0 0.0 0.0 0 15.0 - ... 0.264777 0.121004 0.135731 0.026597 0.020582 -4.900482 2.096338 -1.030483 -1.722674 0.245522
2 14874 20040403 115.0 15 1.0 0.0 0.0 163 12.5 0.0 ... 0.251410 0.114912 0.165147 0.062173 0.027075 -4.846749 1.803559 1.565330 -0.832687 -0.229963
3 71865 19960908 109.0 10 0.0 0.0 1.0 193 15.0 0.0 ... 0.274293 0.110300 0.121964 0.033395 0.000000 -4.509599 1.285940 -0.501868 -2.438353 -0.478699
4 111080 20120103 110.0 5 1.0 0.0 0.0 68 5.0 0.0 ... 0.228036 0.073205 0.091880 0.078819 0.121534 -1.896240 0.910783 0.931110 2.834518 1.923482
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
149995 163978 20000607 121.0 10 4.0 0.0 1.0 163 15.0 0.0 ... 0.280264 0.000310 0.048441 0.071158 0.019174 1.988114 -2.983973 0.589167 -1.304370 -0.302592
149996 184535 20091102 116.0 11 0.0 0.0 0.0 125 10.0 0.0 ... 0.253217 0.000777 0.084079 0.099681 0.079371 1.839166 -2.774615 2.553994 0.924196 -0.272160
149997 147587 20101003 60.0 11 1.0 1.0 0.0 90 6.0 0.0 ... 0.233353 0.000705 0.118872 0.100118 0.097914 2.439812 -1.630677 2.290197 1.891922 0.414931
149998 45907 20060312 34.0 10 3.0 1.0 0.0 156 15.0 0.0 ... 0.256369 0.000252 0.081479 0.083558 0.081498 2.075380 -2.633719 1.414937 0.431981 -1.659014
149999 177672 19990204 19.0 28 6.0 0.0 1.0 193 12.5 0.0 ... 0.284475 0.000000 0.040072 0.062543 0.025819 1.978453 -3.179913 0.031724 -1.483350 -0.342674
150000 rows × 30 columns
# Print a summary of df: index range, per-column non-null counts, dtypes and memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 150000 entries, 0 to 149999
Data columns (total 30 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 name 150000 non-null int64
1 regDate 150000 non-null int64
2 model 149999 non-null float64
3 brand 150000 non-null int64
4 bodyType 145494 non-null float64
5 fuelType 141320 non-null float64
6 gearbox 144019 non-null float64
7 power 150000 non-null int64
8 kilometer 150000 non-null float64
9 notRepairedDamage 150000 non-null object
10 regionCode 150000 non-null int64
11 seller 150000 non-null int64
12 offerType 150000 non-null int64
13 creatDate 150000 non-null int64
14 price 150000 non-null int64
15 v_0 150000 non-null float64
16 v_1 150000 non-null float64
17 v_2 150000 non-null float64
18 v_3 150000 non-null float64
19 v_4 150000 non-null float64
...
28 v_13 150000 non-null float64
29 v_14 150000 non-null float64
dtypes: float64(20), int64(9), object(1)
memory usage: 35.5+ MB
Output is truncated. View as a scrollable element or open in a text editor. Adjust cell output settings...
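数据存在缺失值:根据上面 `df.info()` 的输出,`model`(缺 1 条)、`bodyType`(缺 4506 条)、`fuelType`(缺 8680 条)和 `gearbox`(缺 5981 条)这几列的非空计数均小于总行数 150000。
另外,`notRepairedDamage` 列中出现的 `-` 是以字符串形式读入的(该列 dtype 为 object,非空计数为 150000),因此不会被 `dropna` 视为缺失值。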
# Generic patterns for removing missing data.
# NOTE(review): 'col' looks like a placeholder column name; substitute a real column before running.
df=df[~(df['col'].isnull())] # keep only the rows whose 'col' value is not null
df=df.dropna(axis=0) # drop every row that contains a null value (axis=0)
df=df.dropna(axis=1) # drop every column that contains a null value (axis=1)
`dropna` 的参数说明:

- `axis`:默认为 0,表示逢空值剔除整行;如果设置 `axis=1`,表示逢空值去掉整列。
- `how`:默认为 `'any'`,即一行(或一列)里只要有任何一个数据为 NA 就去掉整行(或整列);如果设置 `how='all'`,则只有一行(或一列)全部是 NA 时才去掉这一整行(或整列)。
- `thresh`:设置至少需要多少个非空值,该行(或列)才可以保留下来。
- `subset`:设置想要检查的列。如果是多个列,可以使用列名的 list 作为参数。
- `inplace`:如果设置为 `True`,将计算得到的结果直接覆盖之前的值并返回 `None`,修改的是源数据。

以 `any` 或 `all` 方式删除空行

df.dropna(how='all') #可见df没有完全的空行
name regDate model brand bodyType fuelType gearbox power kilometer notRepairedDamage ... v_5 v_6 v_7 v_8 v_9 v_10 v_11 v_12 v_13 v_14
SaleID
0 736 20040402 30.0 6 1.0 0.0 0.0 60 12.5 0.0 ... 0.235676 0.101988 0.129549 0.022816 0.097462 -2.881803 2.804097 -2.420821 0.795292 0.914762
1 2262 20030301 40.0 1 2.0 0.0 0.0 0 15.0 - ... 0.264777 0.121004 0.135731 0.026597 0.020582 -4.900482 2.096338 -1.030483 -1.722674 0.245522
2 14874 20040403 115.0 15 1.0 0.0 0.0 163 12.5 0.0 ... 0.251410 0.114912 0.165147 0.062173 0.027075 -4.846749 1.803559 1.565330 -0.832687 -0.229963
3 71865 19960908 109.0 10 0.0 0.0 1.0 193 15.0 0.0 ... 0.274293 0.110300 0.121964 0.033395 0.000000 -4.509599 1.285940 -0.501868 -2.438353 -0.478699
4 111080 20120103 110.0 5 1.0 0.0 0.0 68 5.0 0.0 ... 0.228036 0.073205 0.091880 0.078819 0.121534 -1.896240 0.910783 0.931110 2.834518 1.923482
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
149995 163978 20000607 121.0 10 4.0 0.0 1.0 163 15.0 0.0 ... 0.280264 0.000310 0.048441 0.071158 0.019174 1.988114 -2.983973 0.589167 -1.304370 -0.302592
149996 184535 20091102 116.0 11 0.0 0.0 0.0 125 10.0 0.0 ... 0.253217 0.000777 0.084079 0.099681 0.079371 1.839166 -2.774615 2.553994 0.924196 -0.272160
149997 147587 20101003 60.0 11 1.0 1.0 0.0 90 6.0 0.0 ... 0.233353 0.000705 0.118872 0.100118 0.097914 2.439812 -1.630677 2.290197 1.891922 0.414931
149998 45907 20060312 34.0 10 3.0 1.0 0.0 156 15.0 0.0 ... 0.256369 0.000252 0.081479 0.083558 0.081498 2.075380 -2.633719 1.414937 0.431981 -1.659014
149999 177672 19990204 19.0 28 6.0 0.0 1.0 193 12.5 0.0 ... 0.284475 0.000000 0.040072 0.062543 0.025819 1.978453 -3.179913 0.031724 -1.483350 -0.342674
150000 rows × 30 columns
df.dropna(how='any') # drop every row that has at least one missing value; the result is a new frame and df itself is left unchanged
name regDate model brand bodyType fuelType gearbox power kilometer notRepairedDamage ... v_5 v_6 v_7 v_8 v_9 v_10 v_11 v_12 v_13 v_14
SaleID
0 736 20040402 30.0 6 1.0 0.0 0.0 60 12.5 0.0 ... 0.235676 0.101988 0.129549 0.022816 0.097462 -2.881803 2.804097 -2.420821 0.795292 0.914762
1 2262 20030301 40.0 1 2.0 0.0 0.0 0 15.0 - ... 0.264777 0.121004 0.135731 0.026597 0.020582 -4.900482 2.096338 -1.030483 -1.722674 0.245522
2 14874 20040403 115.0 15 1.0 0.0 0.0 163 12.5 0.0 ... 0.251410 0.114912 0.165147 0.062173 0.027075 -4.846749 1.803559 1.565330 -0.832687 -0.229963
3 71865 19960908 109.0 10 0.0 0.0 1.0 193 15.0 0.0 ... 0.274293 0.110300 0.121964 0.033395 0.000000 -4.509599 1.285940 -0.501868 -2.438353 -0.478699
4 111080 20120103 110.0 5 1.0 0.0 0.0 68 5.0 0.0 ... 0.228036 0.073205 0.091880 0.078819 0.121534 -1.896240 0.910783 0.931110 2.834518 1.923482
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
149995 163978 20000607 121.0 10 4.0 0.0 1.0 163 15.0 0.0 ... 0.280264 0.000310 0.048441 0.071158 0.019174 1.988114 -2.983973 0.589167 -1.304370 -0.302592
149996 184535 20091102 116.0 11 0.0 0.0 0.0 125 10.0 0.0 ... 0.253217 0.000777 0.084079 0.099681 0.079371 1.839166 -2.774615 2.553994 0.924196 -0.272160
149997 147587 20101003 60.0 11 1.0 1.0 0.0 90 6.0 0.0 ... 0.233353 0.000705 0.118872 0.100118 0.097914 2.439812 -1.630677 2.290197 1.891922 0.414931
149998 45907 20060312 34.0 10 3.0 1.0 0.0 156 15.0 0.0 ... 0.256369 0.000252 0.081479 0.083558 0.081498 2.075380 -2.633719 1.414937 0.431981 -1.659014
149999 177672 19990204 19.0 28 6.0 0.0 1.0 193 12.5 0.0 ... 0.284475 0.000000 0.040072 0.062543 0.025819 1.978453 -3.179913 0.031724 -1.483350 -0.342674
135884 rows × 30 columns
使用 `subset` 删除特定列空缺的行

df.dropna(subset=['model','bodyType']) #model或者bodyType有空缺即可删除
name regDate model brand bodyType fuelType gearbox power kilometer notRepairedDamage ... v_5 v_6 v_7 v_8 v_9 v_10 v_11 v_12 v_13 v_14
SaleID
0 736 20040402 30.0 6 1.0 0.0 0.0 60 12.5 0.0 ... 0.235676 0.101988 0.129549 0.022816 0.097462 -2.881803 2.804097 -2.420821 0.795292 0.914762
1 2262 20030301 40.0 1 2.0 0.0 0.0 0 15.0 - ... 0.264777 0.121004 0.135731 0.026597 0.020582 -4.900482 2.096338 -1.030483 -1.722674 0.245522
2 14874 20040403 115.0 15 1.0 0.0 0.0 163 12.5 0.0 ... 0.251410 0.114912 0.165147 0.062173 0.027075 -4.846749 1.803559 1.565330 -0.832687 -0.229963
3 71865 19960908 109.0 10 0.0 0.0 1.0 193 15.0 0.0 ... 0.274293 0.110300 0.121964 0.033395 0.000000 -4.509599 1.285940 -0.501868 -2.438353 -0.478699
4 111080 20120103 110.0 5 1.0 0.0 0.0 68 5.0 0.0 ... 0.228036 0.073205 0.091880 0.078819 0.121534 -1.896240 0.910783 0.931110 2.834518 1.923482
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
149995 163978 20000607 121.0 10 4.0 0.0 1.0 163 15.0 0.0 ... 0.280264 0.000310 0.048441 0.071158 0.019174 1.988114 -2.983973 0.589167 -1.304370 -0.302592
149996 184535 20091102 116.0 11 0.0 0.0 0.0 125 10.0 0.0 ... 0.253217 0.000777 0.084079 0.099681 0.079371 1.839166 -2.774615 2.553994 0.924196 -0.272160
149997 147587 20101003 60.0 11 1.0 1.0 0.0 90 6.0 0.0 ... 0.233353 0.000705 0.118872 0.100118 0.097914 2.439812 -1.630677 2.290197 1.891922 0.414931
149998 45907 20060312 34.0 10 3.0 1.0 0.0 156 15.0 0.0 ... 0.256369 0.000252 0.081479 0.083558 0.081498 2.075380 -2.633719 1.414937 0.431981 -1.659014
149999 177672 19990204 19.0 28 6.0 0.0 1.0 193 12.5 0.0 ... 0.284475 0.000000 0.040072 0.062543 0.025819 1.978453 -3.179913 0.031724 -1.483350 -0.342674
145493 rows × 30 columns
使用 `inplace` 将删除行作用在源数据

df.dropna(inplace = True)