使用泰坦尼克号乘客获救信息的 CSV 表格(titanic_train.csv),演示常见的数据预处理操作
import pandas as pd
import numpy as np
# Load the Titanic training set; the relative path is resolved against the
# current working directory.
titanic_survival = pd.read_csv("titanic_train.csv")
# Preview the first rows (only rendered in an interactive session/notebook).
titanic_survival.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# NaN in the table marks a missing value.
# isnull() flags every entry as missing (True) or present (False).
age = titanic_survival["Age"]
# Label-based access to specific entries goes through the .loc[] indexer.
#print(age.loc[0:10])
age_is_null = age.isnull()  # boolean Series, True where Age is missing
#print(age_is_null)
# A boolean Series works as a mask: only the entries flagged True are kept,
# which yields a new Series.
age_null_true = age.loc[age_is_null]
print(age_null_true)
age_null_count = len(age_null_true)
print(age_null_count)
5 NaN
17 NaN
19 NaN
26 NaN
28 NaN
..
859 NaN
863 NaN
868 NaN
878 NaN
888 NaN
Name: Age, Length: 177, dtype: float64
177
# Demonstration: averaging WITHOUT handling missing values first.
# NaN propagates through the built-in sum(), so the printed result is nan.
mean_age = sum(titanic_survival["Age"]) / titanic_survival.shape[0]
print(mean_age)
nan
对于含有缺失值的数据进行均值的计算:
# Missing values have to be dealt with before averaging.
# Option 1: Series.mean() skips NaN entries by default.
print(titanic_survival["Age"].mean())
# Axis convention for reductions such as mean(): axis=0 yields one result per
# column, axis=1 yields one result per row.
# Option 2: filter out the missing entries first, then average by hand.
true_age = titanic_survival["Age"][~age_is_null]
#print(true_age)
correct_mean_age = sum(true_age) / len(true_age)
print(correct_mean_age)
29.69911764705882
29.69911764705882
passenger_classes = [1, 2, 3]
# Dictionary that collects the final result: class number -> mean fare.
fares_by_class = {}
for this in passenger_classes:
    # Select every row whose passenger class equals the current class.
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this]
    # Pull out the fares paid by that class.
    pclass_fares = pclass_rows["Fare"]
    # Average them.
    fare_for_class = pclass_fares.mean()
    # Store the mean under the class number.
    fares_by_class[this] = fare_for_class
# Printed once, after all three classes have been processed.
print(fares_by_class)
{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}
函数的详细操作可参考该链接:https://www.cnblogs.com/huangchenggener/p/10983516.html
# Survival rate for first, second and third class. "Survived" holds 0/1, so its
# mean per class is the fraction of passengers who survived.
# The aggregation is named by string: recent pandas versions emit a
# FutureWarning when the NumPy callable np.mean is passed instead.
passenger_survival = titanic_survival.pivot_table(
    index="Pclass", values="Survived", aggfunc="mean"
)
print(passenger_survival)
Survived
Pclass
1 0.629630
2 0.472826
3 0.242363
# Mean age per passenger class.
# "mean" is pivot_table's default aggregation; it is spelled out for clarity.
passenger_age = titanic_survival.pivot_table(
    values="Age", index="Pclass", aggfunc="mean"
)
print(passenger_age)
Age
Pclass
1 38.233441
2 29.877630
3 25.140620
# Relate one grouping key to two value columns at once: mean fare and survival
# rate for each port of embarkation.
# The aggregation is named by string: recent pandas versions emit a
# FutureWarning when the NumPy callable np.mean is passed instead.
port_states = titanic_survival.pivot_table(
    index="Embarked", values=["Fare", "Survived"], aggfunc="mean"
)
print(port_states)
Fare Survived
Embarked
C 59.954144 0.553571
Q 13.276030 0.389610
S 27.079812 0.336957
# Dropping missing values.
# axis="columns" (same as axis=1): drop every column that contains a NaN.
drop_na_columns = titanic_survival.dropna(axis="columns")
#print(drop_na_columns)
# axis="index" (same as axis=0): drop every row with a NaN in "Age" or "Sex".
new_titanic_survival = titanic_survival.dropna(subset=["Age", "Sex"], axis="index")
new_titanic_survival.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# Scalar lookups addressed by (row label, column name).
row_766_pclass = titanic_survival.loc[766, "Pclass"]
row_83_age = titanic_survival.loc[83, "Age"]
# One value per line, age first.
print(row_83_age, row_766_pclass, sep="\n")
28.0
1
# Sort by age; ascending=False requests descending order (oldest first).
new_titanic_survival = titanic_survival.sort_values(by="Age", ascending=False)
# First ten rows by position; they still carry their original index labels.
print(new_titanic_survival.iloc[0:10])
# Renumber the index from 0. drop=True discards the old index instead of
# keeping it as an extra column; a new DataFrame is returned and the sorted
# frame itself is not modified.
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print("-----------------------------------------")
# .loc slices are label-based and include the end label, so this shows 11 rows.
print(titanic_reindexed.loc[0:10])
#print(titanic_survival.loc[0:10])
PassengerId Survived Pclass Name \
630 631 1 1 Barkworth, Mr. Algernon Henry Wilson
851 852 0 3 Svensson, Mr. Johan
493 494 0 1 Artagaveytia, Mr. Ramon
96 97 0 1 Goldschmidt, Mr. George B
116 117 0 3 Connors, Mr. Patrick
672 673 0 2 Mitchell, Mr. Henry Michael
745 746 0 1 Crosby, Capt. Edward Gifford
33 34 0 2 Wheadon, Mr. Edward H
54 55 0 1 Ostby, Mr. Engelhart Cornelius
280 281 0 3 Duane, Mr. Frank
Sex Age SibSp Parch Ticket Fare Cabin Embarked
630 male 80.0 0 0 27042 30.0000 A23 S
851 male 74.0 0 0 347060 7.7750 NaN S
493 male 71.0 0 0 PC 17609 49.5042 NaN C
96 male 71.0 0 0 PC 17754 34.6542 A5 C
116 male 70.5 0 0 370369 7.7500 NaN Q
672 male 70.0 0 0 C.A. 24580 10.5000 NaN S
745 male 70.0 1 1 WE/P 5735 71.0000 B22 S
33 male 66.0 0 0 C.A. 24579 10.5000 NaN S
54 male 65.0 0 1 113509 61.9792 B30 C
280 male 65.0 0 0 336439 7.7500 NaN Q
-----------------------------------------
PassengerId Survived Pclass Name Sex \
0 631 1 1 Barkworth, Mr. Algernon Henry Wilson male
1 852 0 3 Svensson, Mr. Johan male
2 494 0 1 Artagaveytia, Mr. Ramon male
3 97 0 1 Goldschmidt, Mr. George B male
4 117 0 3 Connors, Mr. Patrick male
5 673 0 2 Mitchell, Mr. Henry Michael male
6 746 0 1 Crosby, Capt. Edward Gifford male
7 34 0 2 Wheadon, Mr. Edward H male
8 55 0 1 Ostby, Mr. Engelhart Cornelius male
9 281 0 3 Duane, Mr. Frank male
10 457 0 1 Millet, Mr. Francis Davis male
Age SibSp Parch Ticket Fare Cabin Embarked
0 80.0 0 0 27042 30.0000 A23 S
1 74.0 0 0 347060 7.7750 NaN S
2 71.0 0 0 PC 17609 49.5042 NaN C
3 71.0 0 0 PC 17754 34.6542 A5 C
4 70.5 0 0 370369 7.7500 NaN Q
5 70.0 0 0 C.A. 24580 10.5000 NaN S
6 70.0 1 1 WE/P 5735 71.0000 B22 S
7 66.0 0 0 C.A. 24579 10.5000 NaN S
8 65.0 0 1 113509 61.9792 B30 C
9 65.0 0 0 336439 7.7500 NaN Q
10 65.0 0 0 13509 26.5500 E38 S
def hundreds_row(column):
    """Return the hundredth item of a Series.

    Args:
        column: pandas Series with at least 100 entries (for example one
            column handed over by ``DataFrame.apply``).

    Returns:
        The value stored at zero-based position 99.

    Raises:
        IndexError: if the Series has fewer than 100 entries.
    """
    # Positional access on purpose: plain ``column[99]`` is a *label* lookup on
    # an integer index, so it would pick a different row as soon as the data
    # has been sorted or filtered.
    return column.iloc[99]
# Return the hundredth item from each column.
# The result gets its own name: rebinding `hundreds_row` would shadow the
# function, so running this a second time would fail because a Series is not
# callable.
hundredth_items = titanic_survival.apply(hundreds_row)
print(hundredth_items)
PassengerId 100
Survived 0
Pclass 2
Name Kantor, Mr. Sinai
Sex male
Age 34
SibSp 1
Parch 0
Ticket 244367
Fare 26
Cabin NaN
Embarked S
dtype: object
# Count how many missing values each attribute has.
def not_null_count(column):
    """Return the number of missing (null/NaN) entries in ``column``.

    NOTE: despite its name, this function counts the entries that ARE null.

    Args:
        column: pandas Series to inspect (for example one column handed over
            by ``DataFrame.apply``).

    Returns:
        int: how many entries of ``column`` are flagged by ``pd.isnull``.
    """
    missing_mask = pd.isnull(column)
    # Keep only the missing entries and count them.
    return len(column[missing_mask])
# Apply the counter to every column of the table. Calling
# not_null_count(titanic_survival.columns) instead would only inspect the
# column labels and throw the result away, leaving column_null_count undefined
# for the print below.
column_null_count = titanic_survival.apply(not_null_count)
print(column_null_count)
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
# Load the passing-event table; the relative path is resolved against the
# current working directory.
fullevent = pd.read_csv("passingevents.csv")
# Preview the first rows (only rendered in an interactive session/notebook).
fullevent.head()
# print(fullevent.loc[0,"OriginPlayerID"].endswith("ID"))
MatchID | TeamID | OriginPlayerID | DestinationPlayerID | MatchPeriod | EventTime | EventSubType | EventOrigin_x | EventOrigin_y | EventDestination_x | EventDestination_y | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Huskies | Huskies_D1 | Huskies_F1 | 1H | 46.323501 | Head pass | 34 | 97 | 59.0 | 95.0 |
1 | 1 | Huskies | Huskies_M1 | Huskies_F2 | 1H | 51.022546 | Simple pass | 53 | 89 | 69.0 | 91.0 |
2 | 1 | Opponent1 | Opponent1_D2 | Opponent1_G1 | 1H | 89.008721 | Simple pass | 19 | 16 | 5.0 | 50.0 |
3 | 1 | Opponent1 | Opponent1_G1 | Opponent1_F1 | 1H | 92.216160 | Launch | 5 | 50 | 67.0 | 44.0 |
4 | 1 | Huskies | Huskies_M2 | Huskies_M3 | 1H | 98.265191 | Simple pass | 42 | 55 | 36.0 | 54.0 |
# Lookup table: player-code suffix (the part of a player ID after the team
# name, e.g. "D1") -> serial number, stored as a string.
myNumberDict = {
    # D codes
    "D1": "1", "D2": "2", "D3": "3", "D4": "4", "D5": "5",
    "D6": "6", "D7": "7", "D8": "8", "D9": "9", "D10": "10",
    # F codes
    "F1": "11", "F2": "12", "F3": "13", "F4": "14", "F5": "15", "F6": "16",
    # G codes (G2 was assigned the last number, 31)
    "G1": "17", "G2": "31",
    # M codes
    "M1": "18", "M2": "19", "M3": "20", "M4": "21", "M5": "22",
    "M6": "23", "M7": "24", "M8": "25", "M9": "26", "M10": "27",
    "M11": "28", "M12": "29", "M13": "30",
}
def cerial_number(column):
    """Map a row's origin player ID to its serial number.

    Args:
        column: one row of the events table (despite the parameter name, the
            function is applied with ``axis=1``); must provide an
            "OriginPlayerID" string such as "Huskies_D1".

    Returns:
        The serial-number string stored in ``myNumberDict`` for the second
        underscore-separated segment of the ID (e.g. "D1").

    Raises:
        IndexError: if the ID contains no underscore.
        KeyError: if the extracted code is not a key of ``myNumberDict``.
    """
    string = column["OriginPlayerID"].split("_")[1]
    return myNumberDict[string]
def num2(column):
    """Map a row's destination player ID to its serial number.

    Args:
        column: one row of the events table (despite the parameter name, the
            function is applied with ``axis=1``); must provide a
            "DestinationPlayerID" string such as "Huskies_F1".

    Returns:
        The serial-number string stored in ``myNumberDict`` for the second
        underscore-separated segment of the ID (e.g. "F1").

    Raises:
        IndexError: if the ID contains no underscore.
        KeyError: if the extracted code is not a key of ``myNumberDict``.
    """
    string2 = column["DestinationPlayerID"].split("_")[1]
    return myNumberDict[string2]
# Translate both player-ID columns row by row; axis="columns" (same as axis=1)
# hands each row to the function. Both results are computed before either
# column is overwritten.
a = fullevent.apply(cerial_number, axis="columns")
b = fullevent.apply(num2, axis="columns")
fullevent["OriginPlayerID"] = a
fullevent["DestinationPlayerID"] = b
#fullevent[]
# Preview the converted table (only rendered in an interactive session).
fullevent.head()
# Export the converted table to an Excel workbook.
# The context manager saves and closes the file on exit; the explicit
# ExcelWriter.save() call was removed in pandas 2.0.
with pd.ExcelWriter("test.xlsx", engine="xlsxwriter") as writer:
    fullevent.to_excel(writer, sheet_name="Sheet1")