导入pandas库和numpy库
import pandas as pd
import numpy as np
我们以一个csv文件来展示pandas是如何来进行数据预处理的:titanic_train.csv
读入文件titanic_train.csv
titanic_survival = pd.read_csv("titanic_train.csv")
1、求平均值
①通过求和和求长函数来计算平均值
#去除缺失值
age_is_null = pd.isnull(titanic_survival["Age"])
good_ages = titanic_survival["Age"][age_is_null == False]
#求和后的数据除以总数据长度
correct_mean_age = sum(good_ages) / len(good_ages)
print(correct_mean_age)
OUT:
29.6991176471
②通过pandas函数来求平均值
correct_mean_age_pn = titanic_survival["Age"].mean()
print(correct_mean_age_pn)
OUT:
29.69911764705882
③计算不同等级船舱对应的平均消费
passenger_classes = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_classes:
#取出Pclass值为this_class的行
pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
#取出Pclass值为this_class的行的Fare值
pclass_fare = pclass_rows["Fare"]
#计算取出的Fare值的平均数
fare_to_class = pclass_fare.mean()
#将计算出的平均数传入fares_by_class字典
fares_by_class[this_class] = fare_to_class
print(fares_by_class)
OUT:
{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}
2、透视表
①统计船舱等级和获救几率之间的关系
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc=np.mean)
print(passenger_survival)
参数解释:
index:统计基准
values:被统计数据
aggfunc:统计方式(默认值是mean)
OUT:
Pclass Survived
1 0.629630
2 0.472826
3 0.242363
②统计性别和获救几率的关系
sex_survival = titanic_survival.pivot_table(index="Sex", values="Survived", aggfunc=np.mean)
print(sex_survival)
OUT:
Sex Survived
female 0.742038
male 0.188908
③统计登船地点与消费和获救人数的关系
port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc=np.sum)
print(port_stats)
OUT:
Embarked Fare Survived
C 10072.2962 93
Q 1022.2543 30
S 17439.3988 217
3、去除缺失值
①去除有缺失值的列
drop_na_columns = titanic_survival.dropna(axis=1)
OUT:
PassengerId Survived Pclass \
0 1 0 3
1 2 1 1
2 3 1 3
3 4 1 1
4 5 0 3
5 6 0 3
6 7 0 1
7 8 0 3
8 9 1 3
9 10 1 2
Name Sex SibSp Parch \
0 Braund, Mr. Owen Harris male 1 0
1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 1 0
2 Heikkinen, Miss. Laina female 0 0
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 1 0
4 Allen, Mr. William Henry male 0 0
5 Moran, Mr. James male 0 0
6 McCarthy, Mr. Timothy J male 0 0
7 Palsson, Master. Gosta Leonard male 3 1
8 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 0 2
9 Nasser, Mrs. Nicholas (Adele Achem) female 1 0
Ticket Fare
0 A/5 21171 7.2500
1 PC 17599 71.2833
2 STON/O2. 3101282 7.9250
3 113803 53.1000
4 373450 8.0500
5 330877 8.4583
6 17463 51.8625
7 349909 21.0750
8 347742 11.1333
9 237736 30.0708
②去除有缺失值的行
new_titanic_survival = titanic_survival.dropna(axis=0, subset=["Age", "Sex"])
print(new_titanic_survival)
OUT:
PassengerId Survived Pclass \
0 1 0 3
1 2 1 1
2 3 1 3
3 4 1 1
4 5 0 3
6 7 0 1
7 8 0 3
8 9 1 3
9 10 1 2
10 11 1 3
Name Sex Age SibSp \
0 Braund, Mr. Owen Harris male 22.0 1
1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1
2 Heikkinen, Miss. Laina female 26.0 0
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1
4 Allen, Mr. William Henry male 35.0 0
6 McCarthy, Mr. Timothy J male 54.0 0
7 Palsson, Master. Gosta Leonard male 2.0 3
8 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0
9 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1
10 Sandstrom, Miss. Marguerite Rut female 4.0 1
Parch Ticket Fare Cabin Embarked
0 0 A/5 21171 7.2500 NaN S
1 0 PC 17599 71.2833 C85 C
2 0 STON/O2. 3101282 7.9250 NaN S
3 0 113803 53.1000 C123 S
4 0 373450 8.0500 NaN S
6 0 17463 51.8625 E46 S
7 1 349909 21.0750 NaN S
8 2 347742 11.1333 NaN S
9 0 237736 30.0708 NaN C
10 1 PP 9549 16.7000 G6 S
4、定位
row_index_age = titanic_survival.loc[83, "Age"]
print(row_index_age)
row_index_pclass = titanic_survival.loc[766, "Pclass"]
print(row_index_pclass)
OUT:
28.0
1
5、排序
new_titanic_survival = titanic_survival.sort_values("Age", ascending=False)
print(new_titanic_survival[:5])
print("----------")
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print(titanic_reindexed[:5])
OUT:
PassengerId Survived Pclass Name \
630 631 1 1 Barkworth, Mr. Algernon Henry Wilson
851 852 0 3 Svensson, Mr. Johan
493 494 0 1 Artagaveytia, Mr. Ramon
96 97 0 1 Goldschmidt, Mr. George B
116 117 0 3 Connors, Mr. Patrick
Sex Age SibSp Parch Ticket Fare Cabin Embarked
630 male 80.0 0 0 27042 30.0000 A23 S
851 male 74.0 0 0 347060 7.7750 NaN S
493 male 71.0 0 0 PC 17609 49.5042 NaN C
96 male 71.0 0 0 PC 17754 34.6542 A5 C
116 male 70.5 0 0 370369 7.7500 NaN Q
----------
PassengerId Survived Pclass Name Sex \
0 631 1 1 Barkworth, Mr. Algernon Henry Wilson male
1 852 0 3 Svensson, Mr. Johan male
2 494 0 1 Artagaveytia, Mr. Ramon male
3 97 0 1 Goldschmidt, Mr. George B male
4 117 0 3 Connors, Mr. Patrick male
Age SibSp Parch Ticket Fare Cabin Embarked
0 80.0 0 0 27042 30.0000 A23 S
1 74.0 0 0 347060 7.7750 NaN S
2 71.0 0 0 PC 17609 49.5042 NaN C
3 71.0 0 0 PC 17754 34.6542 A5 C
4 70.5 0 0 370369 7.7500 NaN Q
6、自定义函数apply()
①查看第100个样本的数据
def hundredth_row(column):
hundredth_item = column.loc[99]
return hundredth_item
hundredth_row = titanic_survival.apply(hundredth_row)
print(hundredth_row)
OUT:
PassengerId 100
Survived 0
Pclass 2
Name Kantor, Mr. Sinai
Sex male
Age 34
SibSp 1
Parch 0
Ticket 244367
Fare 26
Cabin NaN
Embarked S
dtype: object
②统计每一列的缺失值情况
def not_null_count(column):
column_null = pd.isnull(column)
null = column[column_null]
return len(null)
column_null_count = titanic_survival.apply(not_null_count)
print(column_null_count)
OUT:
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
③替换船舱等级称呼
def which_class(row):
pclass = row['Pclass']
if pd.isnull(pclass):
return "Unknow"
elif pclass == 1:
return "First Class"
elif pclass == 2:
return "Second Class"
elif pclass == 3:
return "Third Class"
classes = titanic_survival.apply(which_class, axis=1)
print(classes[:10])
OUT:
0 Third Class
1 First Class
2 Third Class
3 First Class
4 Third Class
5 Third Class
6 First Class
7 Third Class
8 Third Class
9 Second Class
dtype: object
7、综合:查看获救几率与年龄之间的关系
#年龄分段
def generate_age_label(row):
age = row["Age"]
if pd.isnull(age):
return "Unknow"
elif age < 18:
return "Minor"
else:
return "Adult"
age_label = titanic_survival.apply(generate_age_label, axis=1)
#分析获救几率与年龄之间的关系
titanic_survival['age_label'] = age_label
age_group_survival = titanic_survival.pivot_table(index="age_label", values="Survived")
print(age_group_survival)
OUT:
age_label Survived
Adult 0.381032
Minor 0.539823
Unknow 0.293785
附上:
数据分析处理库Pandas-数据读取
数据分析处理库Pandas-数据预处理
数据分析处理库Pandas-Series结构