"""Exploratory analysis of the Titanic training set (``train.csv``).

The script prints a few summaries of the data, fills the missing ``Age``
values with the median age, and then plots how sex, age, adulthood and fare
relate to survival.  The console output recorded by the original author is
kept next to each step as comments.
"""

import pandas as pd


def fill_missing_age(titanic):
    """Replace missing ``Age`` values in *titanic* with the median age.

    The frame is modified in place and nothing is returned.  The column is
    assigned explicitly instead of calling
    ``titanic.Age.fillna(..., inplace=True)``: that chained in-place call
    acts on a temporary object under pandas Copy-on-Write and would leave
    *titanic* unchanged.
    """
    titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())


def survival_by_sex(titanic):
    """Return survivor/death counts and rates per sex.

    The result is indexed by the values of ``Sex`` and has the columns
    ``survived``, ``dead``, ``p_survived`` and ``p_dead``.
    """
    died = titanic[titanic["Survived"] == 0]["Sex"].value_counts()
    lived = titanic[titanic["Survived"] == 1]["Sex"].value_counts()
    # One row per outcome, then transpose so that each sex becomes a row.
    table = pd.DataFrame([lived, died], index=["survived", "dead"]).T
    total = table["survived"] + table["dead"]
    table["p_survived"] = table["survived"] / total
    table["p_dead"] = table["dead"] / total
    return table


def age_by_outcome(titanic):
    """Return passenger ages split into ``survived`` and ``dead`` columns.

    Rows keep the passengers' original index labels; a passenger's age
    appears in exactly one of the two columns and is NaN in the other.
    """
    died = titanic[titanic["Survived"] == 0]["Age"]
    lived = titanic[titanic["Survived"] == 1]["Age"]
    return pd.DataFrame([lived, died], index=["survived", "dead"]).T


def survival_by_age_group(titanic):
    """Return dead/survived counts for adults (age >= 18) and children.

    The counts are taken per outcome explicitly.  Relabelling the columns
    of a ``value_counts()`` result by position would silently swap "dead"
    and "survived" whenever survivors outnumber deaths in the first group.
    Passengers whose age is missing fall into neither group.
    """

    def count_outcomes(flags):
        return {
            "dead": int((flags == 0).sum()),
            "survived": int((flags == 1).sum()),
        }

    adult = titanic[titanic["Age"] >= 18]["Survived"]
    child = titanic[titanic["Age"] < 18]["Survived"]
    return pd.DataFrame(
        [count_outcomes(adult), count_outcomes(child)],
        index=["adult", "child"],
    )


def main():
    """Run the whole analysis: print the summaries and show the plots."""
    # Imported here rather than at module level so the data helpers above
    # can be imported on machines without a plotting backend.
    import matplotlib.pyplot as plt

    # Read and parse the data from train.csv.
    titanic = pd.read_csv("train.csv")

    # Median age.
    median_age = titanic["Age"].median()
    print(median_age)
    print("\n")
    # Recorded output:
    #   28.0

    # -----------------------------------------------
    # Fill the missing ages with the median WITHOUT changing the source
    # data (fillna returns a new Series).
    print(titanic["Age"].fillna(median_age))
    print("\n")

    # Print the first five rows.
    print(titanic.head())
    print("\n")
    # Recorded output:
    #      PassengerId  Survived  Pclass  ...     Fare  Cabin  Embarked
    #   0            1         0       3  ...   7.2500    NaN         S
    #   1            2         1       1  ...  71.2833    C85         C
    #   2            3         1       3  ...   7.9250    NaN         S
    #   3            4         1       1  ...  53.1000   C123         S
    #   4            5         0       3  ...   8.0500    NaN         S
    #
    #   [5 rows x 12 columns]

    # -----------------------------------------------
    # Inspect the column types.  info() prints the report itself and
    # returns None, so it is not wrapped in print() (the original run did
    # that and therefore showed an extra "None" line after the report).
    titanic.info()
    print("\n")
    # Recorded output:
    #   <class 'pandas.core.frame.DataFrame'>
    #   RangeIndex: 891 entries, 0 to 890
    #   Data columns (total 12 columns):
    #   PassengerId    891 non-null int64
    #   Survived       891 non-null int64
    #   Pclass         891 non-null int64
    #   Name           891 non-null object
    #   Sex            891 non-null object
    #   Age            714 non-null float64
    #   SibSp          891 non-null int64
    #   Parch          891 non-null int64
    #   Ticket         891 non-null object
    #   Fare           891 non-null float64
    #   Cabin          204 non-null object
    #   Embarked       889 non-null object
    #   dtypes: float64(2), int64(5), object(5)
    #   memory usage: 83.6+ KB

    # -----------------------------------------------
    # Summary statistics of the numeric columns.
    print(titanic.describe())
    print("\n")
    # Recorded output:
    #          PassengerId    Survived      Pclass  ...       SibSp       Parch        Fare
    #   count   891.000000  891.000000  891.000000  ...  891.000000  891.000000  891.000000
    #   mean    446.000000    0.383838    2.308642  ...    0.523008    0.381594   32.204208
    #   std     257.353842    0.486592    0.836071  ...    1.102743    0.806057   49.693429
    #   min       1.000000    0.000000    1.000000  ...    0.000000    0.000000    0.000000
    #   25%     223.500000    0.000000    2.000000  ...    0.000000    0.000000    7.910400
    #   50%     446.000000    0.000000    3.000000  ...    0.000000    0.000000   14.454200
    #   75%     668.500000    1.000000    3.000000  ...    1.000000    0.000000   31.000000
    #   max     891.000000    1.000000    3.000000  ...    8.000000    6.000000  512.329200
    #
    #   [8 rows x 7 columns]

    # -----------------------------------------------
    # Fill the missing ages with the median, this time changing the
    # source data.
    fill_missing_age(titanic)

    # Count the missing values per column.
    print(titanic.isnull().sum())
    print("\n")
    # Recorded output:
    #   PassengerId      0
    #   Survived         0
    #   Pclass           0
    #   Name             0
    #   Sex              0
    #   Age              0
    #   SibSp            0
    #   Parch            0
    #   Ticket           0
    #   Fare             0
    #   Cabin          687
    #   Embarked         2
    #   dtype: int64

    # -----------------------------------------------
    # Survivors and deaths per sex, with their share within each sex.
    by_sex = survival_by_sex(titanic)
    print(by_sex)
    print("\n")
    # Recorded output:
    #           survived  dead  p_survived    p_dead
    #   female       233    81    0.742038  0.257962
    #   male         109   468    0.188908  0.811092

    # -----------------------------------------------
    # Effect of sex on the survival rate.
    by_sex[["p_survived", "p_dead"]].plot.bar()
    plt.show()

    # Effect of age on survival.
    ages = age_by_outcome(titanic)
    print(ages)
    ages.plot.hist(stacked=True, bins=30)
    plt.show()
    print("\n")

    # Effect of being an adult on survival.
    survival_by_age_group(titanic).plot.bar()
    plt.show()

    # Which fare / age combinations had the most deaths.
    victims = titanic[titanic["Survived"] == 0]
    ax = plt.subplot()
    ax.scatter(victims["Age"], victims["Fare"], s=10)
    ax.set_xlabel("age")
    ax.set_ylabel("fare")
    plt.show()


if __name__ == "__main__":
    main()