使用 Python 和 pandas 分析泰坦尼克号乘客数据

先导入工具包模块
import pandas as pd
import matplotlib.pyplot as plt

-----------------------------------------------

# Load the passenger records from train.csv into a DataFrame.
titanic = pd.read_csv("train.csv")

# Median of the Age column.
print(titanic["Age"].median())
print("\n")

运行结果:
28.0
-----------------------------------------------

# Show the Age column with its missing values replaced by the median.
# The filled Series is only printed, never assigned back, so `titanic`
# itself is left unchanged at this point.
print(titanic["Age"].fillna(titanic["Age"].median()))
print("\n")


# Preview the first five rows.
print(titanic.head())
print("\n")


运行结果:
   PassengerId  Survived  Pclass  ...     Fare Cabin  Embarked
0            1         0       3  ...   7.2500   NaN         S
1            2         1       1  ...  71.2833   C85         C
2            3         1       3  ...   7.9250   NaN         S
3            4         1       1  ...  53.1000  C123         S
4            5         0       3  ...   8.0500   NaN         S

[5 rows x 12 columns]

-----------------------------------------------

# Inspect column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout by itself and returns None;
# printing that return value is what produces the trailing "None" line.
info_result = titanic.info()
print(info_result)
print("\n")

运行结果:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None

-----------------------------------------------

# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns.
summary = titanic.describe()
print(summary)
print("\n")
运行结果:
       PassengerId    Survived      Pclass  ...       SibSp       Parch        Fare
count   891.000000  891.000000  891.000000  ...  891.000000  891.000000  891.000000
mean    446.000000    0.383838    2.308642  ...    0.523008    0.381594   32.204208
std     257.353842    0.486592    0.836071  ...    1.102743    0.806057   49.693429
min       1.000000    0.000000    1.000000  ...    0.000000    0.000000    0.000000
25%     223.500000    0.000000    2.000000  ...    0.000000    0.000000    7.910400
50%     446.000000    0.000000    3.000000  ...    0.000000    0.000000   14.454200
75%     668.500000    1.000000    3.000000  ...    1.000000    0.000000   31.000000
max     891.000000    1.000000    3.000000  ...    8.000000    6.000000  512.329200

[8 rows x 7 columns]

-----------------------------------------------

# Replace the missing Age values with the median and store the result back
# in the DataFrame.
# The column is assigned explicitly instead of calling
# `titanic.Age.fillna(..., inplace=True)`: that chained in-place form acts on
# an intermediate Series, is deprecated in pandas 2.x, and leaves `titanic`
# unmodified when Copy-on-Write is enabled.
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())


# Count the missing values in every column.
print(titanic.isnull().sum())
print("\n")
运行结果:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

-----------------------------------------------

# Count victims and survivors separately, broken down by sex.
dead = titanic.loc[titanic["Survived"] == 0, "Sex"].value_counts()
survived = titanic.loc[titanic["Survived"] == 1, "Sex"].value_counts()


# Build a table with one row per outcome (`index` supplies the row labels),
# then transpose it so each sex is a row and each outcome is a column.
df = pd.DataFrame([survived, dead], index=["survived", "dead"]).T
# Add the share of survivors and of victims within each sex.
group_size = df["survived"] + df["dead"]
df["p_survived"] = df["survived"] / group_size
df["p_dead"] = df["dead"] / group_size
print(df)
print("\n")

运行结果:
        survived  dead  p_survived    p_dead
female       233    81    0.742038  0.257962
male         109   468    0.188908  0.811092

-----------------------------------------------

# Effect of sex on survival: bar chart of the two ratio columns.
ratio_columns = ["p_survived", "p_dead"]
df[ratio_columns].plot(kind="bar")
plt.show()


# Effect of age on survival.
dead = titanic.loc[titanic["Survived"] == 0, "Age"]
survived = titanic.loc[titanic["Survived"] == 1, "Age"]
# One row per outcome, then transpose so each outcome becomes a column.
df = pd.DataFrame([survived, dead], index=["survived", "dead"]).T
print(df)
# Stacked histogram of both age distributions using 30 bins.
df.plot(kind="hist", stacked=True, bins=30)
plt.show()
print("\n")

# Effect of being an adult (Age >= 18) versus a minor (Age < 18) on survival.
adult = titanic.loc[titanic["Age"] >= 18, "Survived"].value_counts()
child = titanic.loc[titanic["Age"] < 18, "Survived"].value_counts()
df = pd.DataFrame([adult, child], index=["adult", "child"])
# value_counts() orders its result by frequency, so the column order of `df`
# depends on the data. Rename by the actual Survived value (0 / 1) instead of
# assigning labels positionally, which could silently swap "dead" and
# "survived"; then fix the column order for a stable legend.
df = df.rename(columns={0: "dead", 1: "survived"})[["dead", "survived"]]
df.plot.bar()
plt.show()

# Which combinations of age and fare account for the most victims?
# Scatter plot of age against fare for passengers who did not survive.
victims = titanic[titanic["Survived"] == 0]
age = victims["Age"]
fare = victims["Fare"]
ax = plt.subplot()
plt.scatter(age, fare, s=10)
ax.set_xlabel("age")
ax.set_ylabel("fare")
plt.show()

 

你可能感兴趣的:(python,大数据)