开始之前,导入numpy、pandas包和数据
# Load the Titanic training data used throughout these notes.
import pandas as pd
import numpy as np
df=pd.read_csv("train.csv")  # assumes train.csv sits in the working directory
数据清洗(Data cleaning):对数据进行重新审查和校验的过程,目的在于删除重复信息、纠正存在的错误,并提供数据一致性。
数据清洗原理:利用有关技术如数理统计、数据挖掘或预定义的清理规则将脏数据转化为满足数据质量要求的数据
主要类型:
- 缺失值
- 错误值(异常值)
- 重复值
- 不一致问题
# Overview of the dataset and its missing values (pasted outputs kept verbatim below).
df.info()
'''
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB'''
df.isnull().sum()#count of missing values per column
df.isnull().sum()/df.shape[0]#fraction of missing values per column
'''
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
-----------------------
PassengerId 0.000000
Survived 0.000000
Pclass 0.000000
Name 0.000000
Sex 0.000000
Age 0.198653
SibSp 0.000000
Parch 0.000000
Ticket 0.000000
Fare 0.000000
Cabin 0.771044
Embarked 0.002245
dtype: float64
'''
# Four equivalent ways to select the three columns that contain missing values.
df[['Age','Cabin','Embarked']]  # plain column selection
df.loc[:,['Age','Cabin','Embarked']]  # label-based; the original had an extra [] level, which raises KeyError
df.iloc[:,[5,10,11]]  # position-based
df.loc[:,['Age','Cabin','Embarked']]  # df.ix was removed in pandas 1.0 -- use loc/iloc instead
# API signature reference only -- `DataFrame` stands for an instance; these lines
# are not runnable as written.
DataFrame.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)#drop rows (samples) containing missing values
DataFrame.drop(labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise')#drop columns (feature vectors) containing missing values
DataFrame.fillna(value=None, method=None, axis=None, inplace=False, limit=None, downcast=None)#fill missing values with something (0, mean, median, ...)
fillna()函数详解
# fillna() in detail: replace missing values with a constant or a neighbouring value.
df.fillna(value=0, axis=None, inplace=False, limit=None)  # fill NaN with 0 (defaults spelled out; the method/downcast kwargs are deprecated)
# fill with the constant 0
df.fillna(value=0)
# forward fill: reuse the previous non-null value
df.ffill()  # replaces df.fillna(method='ffill'), deprecated since pandas 2.1
# backward fill: reuse the next non-null value
df.bfill()  # replaces df.fillna(method='bfill'), deprecated since pandas 2.1
pandas numpy处理缺失值,none与nan比较
# DataFrame.duplicated marks duplicated rows.
# subset: column label(s) used to identify duplicates; defaults to all columns
# keep='first': every occurrence except the first is marked as a duplicate
# keep='last': every occurrence except the last is marked as a duplicate
# keep=False: all occurrences of a duplicated row are marked
# (the original used typographic quotes around 'first', a SyntaxError, and
# keep=None, which is not a valid value -- keep must be 'first', 'last' or False)
df.duplicated(subset=None, keep='first')
df.duplicated(subset=None, keep='last')
df.duplicated(subset=None, keep=False)
df[df.duplicated()]  # empty result: this dataset has no duplicated rows
df.to_csv("train_clean.csv")  # persist the cleaned data
# Age binning: 5 equal-width bins
df['AgeBand'] = pd.cut(df['Age'], 5,labels = ['1','2','3','4','5'])
# Age binning with hand-picked edges
df['AgeBand_1'] = pd.cut(df['Age'], [0,5,15,30,50,80],labels = ['1','2','3','4','5'])
# Age binning by quantiles (values above the 0.9 quantile fall outside the last
# edge and become NaN -- presumably intentional; verify)
df['AgeBand_2'] = pd.qcut(df['Age'],[0,0.1,0.3,0.5,0.7,0.9],labels = ['1','2','3','4','5'])
# Inspect the distinct values of Sex, then encode them numerically.
# Method 1
set(df['Sex'])
#{'female', 'male'}
# Method 2
df['Sex'].value_counts()
#male 577
#female 314
#Name: Sex, dtype: int64
# Method 3
df['Sex'].unique()
#array(['male', 'female'], dtype=object)
# Three equivalent encodings: male -> 1, female -> 2.
# (fixed: the original lambda mapped female -> 1, contradicting the two lines below)
df['Sex_num']=df['Sex'].map(lambda x: 1 if x=='male' else 2)
df['Sex_num'] = df['Sex'].replace(['male','female'],[1,2])
df['Sex_num']=df['Sex'].map({'male':1,'female':2})
# Method 3: label-encode the high-cardinality text columns with sklearn's LabelEncoder.
# (fixed: the loop body was not indented, an IndentationError)
from sklearn.preprocessing import LabelEncoder
for feat in ['Cabin', 'Ticket']:
    lbl = LabelEncoder()
    # astype(str) turns NaN into the string 'nan' so the encoder accepts it
    df[feat + "_labelEncode"] = lbl.fit_transform(df[feat].astype(str))
df.head()
# Method 4: manual label encoding via a value -> integer mapping.
# (fixed: the loop body was not indented, an IndentationError)
for feat in ['Cabin', 'Ticket']:
    # NOTE(review): unique() includes NaN but nunique() excludes it, so the last
    # unique value gets no code when the column has NaN -- verify this is intended.
    label_dict = dict(zip(df[feat].unique(), range(df[feat].nunique())))
    df[feat + "_labelEncode"] = df[feat].map(label_dict)
# Method 1: one-hot encode with pd.get_dummies.
# (fixed: the loop body was not indented, an IndentationError)
for feat in ["Age", "Embarked"]:
    x = pd.get_dummies(df[feat], prefix=feat)
    df = pd.concat([df, x], axis=1)
    #df[feat] = pd.get_dummies(df[feat], prefix=feat)
df.head()
# Method 2: one-hot encode with sklearn's OneHotEncoder.
# (fixed: the loop body was not indented, an IndentationError)
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
for feat in ["Age", "Embarked"]:
    enc.fit(df[[feat]])
    enc.categories_  # inspect the learned categories
    # NOTE(review): sorted() on a column that mixes NaN with strings (Embarked)
    # raises TypeError -- confirm the intent before relying on this line.
    pd.DataFrame(enc.transform(df[[feat]]).toarray(),columns=[feat+'_'+str(i) for i in sorted(df[feat].unique())])
数据预处理之将类别数据数字化的方法 —— LabelEncoder VS OneHotEncoder
OneHotEncoder独热编码和LabelEncoder标签编码
One Hot Encoder 常用方法
# Extract the honorific title (Mr, Mrs, Miss, ...) from the Name column.
# (raw string added: '\.' in a plain literal is an invalid escape and warns on Python 3.12+)
df['Name'].str.extract(r'([A-Za-z]+)\.',expand=False)
# str.extract() matches a regex and returns the first capture group.
# In r'([A-Za-z]+)\.': () delimits the capture group; [A-Za-z] matches a single
# letter; + means "one or more", equivalent to {1,}; \. matches a literal '.' that
# is not part of the returned text. Equivalent patterns: r'([A-Za-z]{1,})\.' or r'(\w+)\.'
#进入data路径的上级路径,输入tree data/
tree data/
data/
├── train-left-down.csv
├── train-left-up.csv
├── train-right-down.csv
└── train-right-up.csv
可以看到data文件夹中有四个文件分别是:train-left-down.csv、train-left-up.csv、train-right-down.csv、train-right-up.csv。
将其导入:
# Load the four quadrants that together form the original train.csv.
train_left_up=pd.read_csv("train-left-up.csv")
train_left_down=pd.read_csv("train-left-down.csv")
train_right_up=pd.read_csv("train-right-up.csv")
train_right_down=pd.read_csv("train-right-down.csv")
经过查看四个DataFrame,我们发现四个DataFrame是之前train.csv的子集,通过某种方式合并可以得到之前的名为train的DataFrame。
# Method 1: concat the left|right halves column-wise, then stack top/bottom row-wise.
result_up=pd.concat([train_left_up,train_right_up],axis=1,join='inner')#inner join keeps the index intersection
result_down=pd.concat([train_left_down,train_right_down],join='inner',axis=1)
result=pd.concat([result_up,result_down],join='outer',axis=0)#outer join keeps the column union
# Method 2: join() for the horizontal merge, concat() for the vertical one.
train_up_1=train_left_up.join(train_right_up)
train_down_1=train_left_down.join(train_right_down)
result_1=pd.concat([train_up_1,train_down_1])  # DataFrame.append was removed in pandas 2.0
# Method 3: merge() on the index for the horizontal merge.
train_up_2=pd.merge(train_left_up,train_right_up,left_index=True,right_index=True)
train_down_2=pd.merge(train_left_down,train_right_down,left_index=True,right_index=True)
result_2=pd.concat([train_up_2,train_down_2])  # likewise: append -> concat
上述各种关联方法参考本链接
result.to_csv("result.csv")  # save the reassembled table
result.stack()  # pivot the columns into an inner row-index level (long format)
Pandas之stack()和unstack()用法
用途: 对数据进行分组及分组后组内运算!类似于数据透视和sql里面的聚合函数
可用方法:
常用: count()、sum()、cumsum()、mean()等
pandas官方文档
# Grouped aggregation examples. The original used an undefined name `text` and
# rebound `df` to a GroupBy object, which would break every later use of `df`;
# both are fixed here.
grouped_fare = df['Fare'].groupby(df['Sex'])
means = grouped_fare.mean()  # mean fare per sex
# NOTE(review): pandas >= 2.0 requires mean(numeric_only=True) here because the
# frame contains text columns -- confirm the target pandas version.
df.groupby(['Sex']).mean()['Fare']
survived_sex = df['Survived'].groupby(df['Sex']).sum()  # survivors per sex
df.groupby(['Sex']).sum()['Survived']
df.groupby(['Pclass']).sum()['Survived']  # survivors per passenger class
df['Survived'].groupby(df['Pclass']).sum()
# agg(): the same aggregations through the generic interface
df.groupby(['Sex']).agg('mean')['Fare']
df.groupby(['Sex']).agg('sum')['Survived']
df.groupby(['Pclass']).agg('sum')['Survived']
### 任务五:统计在不同等级的票中的不同年龄的船票花费的平均值
```python
df.groupby(['Pclass','Age'])['Fare'].mean()
pd.DataFrame([df.groupby(['Sex']).mean()['Fare'],df.groupby(['Sex']).sum()['Survived']]).to_csv("sex_fare_survived.csv")
df.groupby(['Age'])['Survived'].sum()#各年龄段存活人数
df.groupby(['Age']).count()['Embarked']#各年龄段总人数
df.groupby(['Age'])['Survived'].sum()/df.groupby(['Age']).count()['Embarked']#各年龄段存活率
df.groupby(['Age'])['Survived'].sum()[df.groupby(['Age'])['Survived'].sum()==max(df.groupby(['Age'])['Survived'].sum())]#存活人数最多的年龄段
max(df.groupby(['Age'])['Survived'].sum())/df['Survived'].sum()#存活人数最多的年龄段的存活率
max(df.groupby(['Age'])['Survived'].sum())/sum(df.groupby(['Age'])['Survived'].sum())#本写法不对因为age列存在nan,groupby结果中会默认删除nan的数据
# Bar charts of survival counts by sex.
# NOTE(review): assumes matplotlib.pyplot was imported as plt elsewhere -- not visible in this file.
sex = df.groupby('Sex')['Survived'].sum()
sex.plot.bar()
plt.title('survived_count')
plt.show()
# Side-by-side bars: not-survived vs survived per sex
a=pd.DataFrame([df.groupby('Sex')['Survived'].count()-sex,sex]).T
a.columns=['NoSurvived','Survived']
a.plot.bar()
plt.title('survived_count')
plt.show()
# Stacked bar straight from a two-level groupby
df.groupby(['Sex','Survived'])['Survived'].count().unstack().plot(kind='bar',stacked=True)
# Survival counts per fare value, sorted and unsorted.
#sorted
fig = plt.figure(figsize=(20, 18))
df.groupby(['Fare'])['Survived'].value_counts().sort_values(ascending=False).plot(grid=True)
plt.legend()#legend
plt.show()
#unsorted
fig = plt.figure(figsize=(20, 18))
df.groupby(['Fare'])['Survived'].value_counts().plot(grid=True)
plt.legend()#legend
plt.show()
# Grouped (non-stacked) bars of survival counts per class
df.groupby(['Pclass','Survived'])['Survived'].count().unstack().plot(kind='bar')
对于数据,人往往没有直观的概念,而经过绘图展示后人往往对数据有一个清晰的认知。这就是数据可视化的价值所在。
而良好的数据可视化往往需要一些技巧。
# Age distribution by survival, as kernel density estimates.
# NOTE(review): assumes seaborn was imported as sns and matplotlib.pyplot as plt -- not visible here.
plt.figure(figsize=(16,10), dpi= 80)
sns.kdeplot(df.loc[df['Survived'] == 0, "Age"], shade=True, color="g", label="Survived=0", alpha=.7)
sns.kdeplot(df.loc[df['Survived'] == 1, "Age"], shade=True, color="deeppink", label="Survived=1", alpha=.7)
# Same comparison via a FacetGrid with hue
facet = sns.FacetGrid(df, hue="Survived",aspect=3)
facet.map(sns.kdeplot,'Age',shade= True)
facet.set(xlim=(0, df['Age'].max()))
facet.add_legend()
# Per-age counts for each survival outcome.
# NOTE(review): 'Unnamed: 0' is presumably the index column written by the earlier
# to_csv/read_csv round-trip -- confirm it exists in this frame.
pd.DataFrame(df.groupby(['Survived','Age'])['Unnamed: 0'].count().unstack().iloc[0,:]).plot(kind='bar',)
pd.DataFrame(df.groupby(['Survived','Age'])['Unnamed: 0'].count().unstack().iloc[1,:]).plot(kind='bar',)
# Age histograms: survivors vs non-survivors, side by side then stacked.
fig,subs=plt.subplots(1,2,figsize=(16,6), dpi= 80)
subs[0].hist(df[df['Survived']==1]['Age'])
subs[0].set_title('Survived',fontsize=12)
subs[1].hist(df[df['Survived']==0]['Age'])
subs[1].set_title('NoSurvived',fontsize=12)
plt.show()
# Stacked histogram of the same two groups in a single axes.
df_sex1=df['Age'][df['Survived']==1]
df_sex0=df['Age'][df['Survived']==0]
plt.hist([df_sex1,df_sex0],
stacked=True,
label=['Survived','NoSurvived'])
plt.legend()
plt.title('Age_Survived')  # removed the redundant plt.title('title') that this call immediately overwrote
从上面可视化结果可以大致看出年龄小的存活可能性更高。为了更清楚的看一下,对年龄进行经验分箱。
# Empirical age bins. (fixed: the original first line was missing `if` before
# x<=30, a SyntaxError.)
# NOTE: NaN ages fail every <= comparison and therefore fall through to the last branch.
df['Ageban']=df['Age'].map(lambda x: '儿童' if x<=10 else '青少年' if x<=18 else '青年' if x<=30 else '中年' if x<60 else '老年')
# Same binning with numeric codes (overwrites the column above)
df['Ageban']=df['Age'].map(lambda x: '0' if x<=10 else '1'if x<=18 else '2'if x<=30 else '3' if x<60 else '4')
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
df.groupby(['Ageban','Survived'])['Survived'].count().unstack().plot(kind='bar',stacked=True)
# Age density (KDE) curves for each passenger class on one axes.
df[df['Pclass']==1]['Age'].plot(kind='kde')
df.Age[df.Pclass == 2].plot(kind='kde')
df.Age[df.Pclass == 3].plot(kind='kde')
plt.xlabel("age")
plt.legend((1,2,3),loc="best")  # one legend entry per class