import numpy as np
import pandas as pd
# Load the training CSV into the working DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer='datalab/74955/train.csv')
df.head(n=3)
(1)查看每个特征缺失值个数
# Print the per-column non-null counts and dtypes; columns whose non-null
# count is below the row count contain missing values.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
# Count the missing entries in each column (isna is the same check as isnull).
df.isna().sum()
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
# Display only the three columns that reported missing values above.
df.loc[:, ['Age', 'Cabin', 'Embarked']]
Age Cabin Embarked
0 22.0 NaN S
1 38.0 C85 C
2 26.0 NaN S
3 35.0 C123 S
# Three attempts at replacing missing Age values with 0.
#
# Attempt 1: compare against None. The comparison selects no rows here (the
# missing count printed below is still 177), so nothing is changed.
df[df['Age']==None]=0
df['Age'].isnull().sum()
## 177
# Attempt 2: isnull() does select the rows whose Age is missing.
# NOTE(review): a boolean mask used as df[mask] selects whole rows, so 0 is
# written into EVERY column of those 177 rows, not only Age. This is why Sex,
# Cabin and Embarked each report a value 0 with count 177 further down.
# To fill Age alone, df['Age'] = df['Age'].fillna(0) would be the usual form.
df[df['Age'].isnull()]=0
df['Age'].isnull().sum()
## 0
# Attempt 3: compare against np.nan. NaN never compares equal to anything
# (itself included), so this mask is all False and the line has no effect.
df[df['Age']==np.nan]=0
df.head(3)
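思考:缺失值在 float64 列中是以 NaN(np.nan)而不是 None 存储的,所以用 `df['Age']==None` 索引不到。但要注意,NaN 与任何值(包括它自身)比较都不相等,因此 `df['Age']==np.nan` 同样匹配不到任何行;检索缺失值应当使用 `isnull()`/`isna()`(或 `np.isnan`)。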
# Drop every row that contains a NaN. The result is only displayed, not
# assigned, so the original df is left untouched.
df.dropna().head(n=3)
# Alternative: replace NaN with 0 instead of dropping (again display only).
df.fillna(value=0).head(n=3)
# Inspect duplicated rows, then preview the frame with duplicates removed.
df.loc[df.duplicated()]
df.drop_duplicates().head(5)
# Bin Age into 5 equal-width intervals, labelled '1'..'5'.
df['AgeBand_ave'] = pd.cut(
    df['Age'],
    bins=5,
    labels=['1', '2', '3', '4', '5'],
)
df.head(n=3)
# Bin Age using explicit edges (0, 5], (5, 15], (15, 30], (30, 50], (50, 80].
df['AgeBand_cut'] = pd.cut(
    df['Age'],
    bins=[0, 5, 15, 30, 50, 80],
    labels=['1', '2', '3', '4', '5'],
)
df.head(n=3)
# Inspect the text columns: list each distinct value and how often it occurs.
# The 0 entry in the output comes from the rows overwritten with 0 earlier.
df['Sex'].value_counts()
male 453
female 261
0 177
Name: Sex, dtype: int64
# Frequency of each distinct Cabin value.
df['Cabin'].value_counts()
0 177
C23 C25 C27 4
G6 4
B96 B98 4
F2 3
C22 C26 3
F33 3
D 3
E8 2
C2 2
B22 2
B28 2
C125 2
D26 2
C83 2
B57 B59 B63 B66 2
C78 2
D35 2
C123 2
B18 2
F G73 2
E25 2
D33 2
B20 2
B5 2
B35 2
E24 2
D20 2
B58 B60 2
D36 2
...
C99 1
C90 1
B50 1
C148 1
A5 1
C111 1
D46 1
B79 1
D10 D12 1
B41 1
A26 1
A36 1
D11 1
D47 1
B94 1
E17 1
B4 1
E46 1
D50 1
A24 1
C110 1
A7 1
C92 1
B42 1
A16 1
B101 1
A23 1
C30 1
C54 1
E34 1
Name: Cabin, Length: 135, dtype: int64
# Frequency of each distinct Embarked value.
df['Embarked'].value_counts()
S 554
0 177
C 130
Q 28
Name: Embarked, dtype: int64
# List the distinct values present in Sex.
df['Sex'].unique()
array(['male', 'female', 0], dtype=object)
# Number of distinct values in Sex.
df['Sex'].nunique()
## 3
# Numeric encoding, approach 1: replace() swaps the listed strings for codes.
df['Sex_num'] = df['Sex'].replace(to_replace=['male', 'female'], value=[1, 2])
df.head(n=3)
# Numeric encoding, approach 2: map() through a dict. This overwrites the
# Sex_num column produced by approach 1.
df['Sex_num'] = df['Sex'].map({'male': 1, 'female': 2})
df.head(n=3)
# Label-encode the Cabin and Ticket columns. Two approaches are shown for each
# column: a hand-built value -> code dictionary, then sklearn's LabelEncoder.
from sklearn.preprocessing import LabelEncoder
for feat in ['Cabin','Ticket']:
    lbl=LabelEncoder()
    # Assign codes in order of first appearance. enumerate() over unique()
    # gives every distinct value a code; pairing unique() with
    # range(nunique()) silently dropped the last value whenever the column
    # contained NaN, because unique() keeps NaN while nunique() skips it.
    label_dict={value: code for code, value in enumerate(df[feat].unique())}
    df[feat+"_labelEncode"]=df[feat].map(label_dict)
    # sklearn's encoder on the stringified column; this overwrites the column
    # written by the dictionary approach just above.
    df[feat+"_labelEncode"]=lbl.fit_transform(df[feat].astype(str))
df.head()
# After the loop label_dict holds the mapping of the last column (Ticket).
label_dict
{'A/5 21171': 0,
'PC 17599': 1,
'STON/O2. 3101282': 2,
'113803': 3,
'373450': 4,
0: 5,
'17463': 6,
'349909': 7,
'347742': 8,
'237736': 9,
'PP 9549': 10,
'113783': 11,
'A/5. 2151': 12,
'347082': 13,
'350406': 14,
'248706': 15,
.
.
.
'C.A./SOTON 34068': 537,
'SOTON/OQ 392076': 538,
'211536': 539,
'112053': 540,
'111369': 541,
'370376': 542}
# One-hot encode Age and Embarked: build indicator columns named
# "<feature>_<value>" and append them to the right of df.
for feat in ("Age", 'Embarked'):
    x = pd.get_dummies(data=df[feat], prefix=feat)
    df = pd.concat(objs=[df, x], axis='columns')
df.head()
# Pull the run of letters that directly precedes the first '.' in Name
# (presumably the honorific, e.g. "Mr") into a new Title column.
# The pattern is a raw string so that `\.` is not parsed as an (invalid)
# string escape sequence; the regex itself is unchanged.
df['Title']=df.Name.str.extract(r'([A-Za-z]+)\.',expand=False)
df.head()
# Write the engineered frame to test_fin.csv in the working directory.
df.to_csv('test_fin.csv')