#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/8/10 10:09
# @Author : limingyu
# @Site :
# @File : Test_pandas_titanic_train.py
# @Software: PyCharm
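#Case study: who survived the sinking of the Titanic
#Columns: PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
#Meaning: passenger id, survival label, ticket class, passenger name, sex, age, number of siblings/spouses aboard, number of parents/children aboard, ticket number, ticket fare, cabin number, port of embarkation
#NaN marks a missing value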
import pandas as pd
import numpy as np
from pandas import Series
#Load the Titanic training set from CSV
titanic_survival = pd.read_csv("titanic_train.csv")
print(titanic_survival.head(5)) #first 5 rows of the CSV
#PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
# 1 0 3 Braund male 22 1 0 A/5 21171 7.25 S
# 2 1 1 Cumings female 38 1 0 PC 17599 71.2833 C85 C
# 3 1 3 Heikkinen female 26 0 0 STON/O2.3101282 7.925 S
#[5 rows x 12 columns]

#Pull out the Age column
age = titanic_survival["Age"]
print(age.loc[:5]) #label-based slice, so rows 0 through 5 inclusive
#0 22.0
#1 38.0
#2 26.0
#3 35.0
#4 35.0
#5 NaN
#Name: Age, dtype: float64

#Boolean mask: True where the age was never recorded
age_is_null = age.isnull()
print(age_is_null)
#0 False
#1 False
#2 False
#3 False
#4 False
#5 True
#6 False ... and so on
#Name: Age, Length: 891, dtype: bool

#Use the mask as an index to keep only the missing entries
age_null_true = age.loc[age_is_null]
print(age_null_true)
#5 NaN
#17 NaN
#19 NaN ... and so on
#Name: Age, Length: 177, dtype: float64
age_null_count = age_null_true.shape[0]
print(age_null_count)
#177

#Unhandled missing values poison plain arithmetic, e.g. a hand-rolled mean:
#the built-in sum() propagates NaN
mean_age = sum(age) / len(age)
print(mean_age) #nan

#Same computation after filtering the missing values out
good_ages = age[~age_is_null]
print(good_ages) #every age that is actually present
#0 22.0
#1 38.0
#2 26.0
#3 35.0
#4 35.0
#6 54.0 ... and so on
correct_mean_age = sum(good_ages) / len(good_ages)
print(correct_mean_age) #29.69911764705882

#The built-in Series.mean() skips missing values on its own
correct_mean_age = age.mean()
print(correct_mean_age) #29.69911764705882
#Average ticket fare for each passenger class
#Approach 1: hand-written loop
passenger_classes = [1, 2, 3]
fares_by_class = {}  #maps passenger class -> mean fare
for this_class in passenger_classes:
    #Keep only the rows whose Pclass equals the class being processed
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
    print(pclass_rows)
    # PassengerId Survived Pclass ... Fare Cabin Embarked
    #1 2 1 1 ... 71.2833 C85 C
    #3 4 1 1 ... 53.1000 C123 S ... and so on
    pclass_fares = pclass_rows["Fare"]
    fare_mean = pclass_fares.mean()
    fares_by_class[this_class] = fare_mean
#Printed once, after the loop, so the dict holds all three classes
print(fares_by_class)  #class: mean fare
#{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}
#Approach 2: let pandas do the grouping
#pivot_table groups rows by `index` and aggregates `values` with `aggfunc`.
#The aggregations are given by name ("mean"/"sum") rather than as NumPy
#callables: recent pandas versions warn when np.mean/np.sum are passed here,
#while the string form yields the same result without the warning.
fares_by_class = titanic_survival.pivot_table(index="Pclass", values="Fare", aggfunc="mean")
print(fares_by_class)
#Pclass
#1 84.154687
#2 20.662183
#3 13.675550

#Survival rate per class (mean of the 0/1 Survived column)
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc="mean")
print(passenger_survival)
#Pclass
#1 0.629630
#2 0.472826
#3 0.242363

#Average age per class; with no aggfunc given, pivot_table computes the mean
passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age")
print(passenger_age)
#Pclass
# 1 38.233441
# 2 29.877630
# 3 25.140620

#Relating one column to two others at once:
#total fare and total number of survivors for each port of embarkation
port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc="sum")
print(port_stats)
#Embarked
# C 10072.2962 93
# Q 1022.2543 30
# S 17439.3988 217
print("------------------------")
#Handling missing values by discarding them
#Drop every column that contains at least one missing value
drop_na_columns = titanic_survival.dropna(axis="columns")
print(drop_na_columns)
# PassengerId Survived Pclass ... Parch Ticket Fare
#0 1 0 3 ... 0 A/5 21171 7.2500

#Drop every row that is missing a value in Fare or in Cabin
new_titanic_survival = titanic_survival.dropna(subset=["Fare", "Cabin"])
print(new_titanic_survival)
# PassengerId Survived Pclass ... Fare Cabin Embarked
#1 2 1 1 ... 71.2833 C85 C
#3 4 1 1 ... 53.1000 C123 S
#6 7 0 1 ... 51.8625 E46 S

#Look up a single cell by row label and column name
row_index_83_age = titanic_survival["Age"].loc[83]
#NOTE: despite the "1000" in its name, this variable reads row label 766
row_index_1000_pclass = titanic_survival["Pclass"].loc[766]
print(row_index_83_age) #28.0
print(row_index_1000_pclass) #1

#Sort by age, oldest first (this rebinds new_titanic_survival)
new_titanic_survival = titanic_survival.sort_values(by="Age", ascending=False)
print(new_titanic_survival.head(5))
# PassengerId Survived Pclass Age ... Fare Cabin Embarked
#630 631 1 1 80.0 ... 30.0000 A23 S
#851 852 0 3 74.0 ... 7.7750 NaN S
#493 494 0 1 71.0 ... 49.5042 NaN C

#Renumber the sorted rows from 0, discarding the old index labels
titanic_reindex = new_titanic_survival.reset_index(drop=True)
print(titanic_reindex.head(5))
# PassengerId Survived Pclass Age ... Fare Cabin Embarked
#0 631 1 1 80.0 ... 30.0000 A23 S
#1 852 0 3 74.0 ... 7.7750 NaN S
#2 494 0 1 71.0 ... 49.5042 NaN C
#Custom function: fetch the item stored under index label 99
def hundredth_row(column):
    """Return the element of ``column`` whose index label is 99.

    With a default 0-based integer index this is the 100th entry.

    Args:
        column: An object supporting label lookup through ``.loc``
            (for example a pandas Series).

    Returns:
        Whatever ``column.loc[99]`` yields.

    Raises:
        KeyError: Propagated from ``.loc`` when label 99 is absent.
    """
    return column.loc[99]
#DataFrame.apply() calls the custom function once per column (axis="index")
#NOTE: the result is bound to the same name as the function, so from here on
#`hundredth_row` refers to the resulting Series, not to the function.
hundredth_row = titanic_survival.apply(hundredth_row, axis="index")
print(hundredth_row)
#PassengerId 100
#Survived 0
#Pclass 2
#Name Kantor, Mr. Sinai
#Sex male
#Age 34
#SibSp 1
#Parch 0
#Ticket 244367
#Fare 26
#Cabin NaN
#Embarked S
#dtype: object
#Count the missing values in a column
def not_null_count(column):
    """Return how many entries of ``column`` are missing (null).

    Despite the name, this counts the null entries, not the non-null ones.
    As a side effect it prints the boolean null mask and then the selected
    null entries (prefixed with "--").

    Args:
        column: A pandas Series (or similar object that accepts a boolean
            mask as an index).

    Returns:
        int: Number of null entries in ``column``.
    """
    column_null = pd.isnull(column)  #boolean mask: True where the value is missing
    print(column_null)
    null = column[column_null]  #mask as index keeps only the missing entries
    print("--", null)
    #Sample of what the line above prints for the Titanic columns:
    #Series([], Name: PassengerId, dtype: object)
    #Series([], Name: Survived, dtype: object)
    #Series([], Name: Pclass, dtype: object)
    #Series([], Name: Name, dtype: object)
    #Series([], Name: Sex, dtype: object)
    #5 NaN
    #17 NaN
    #19 NaN
    #26 NaN
    #28 NaN
    #29 NaN
    #Series([], Name: SibSp, dtype: object)
    #Series([], Name: Parch, dtype: object)
    #Series([], Name: Ticket, dtype: object)
    #Series([], Name: Fare, dtype: object)
    #0 NaN
    #2 NaN
    #4 NaN
    #5 NaN ...
    #61 NaN
    #829 NaN
    return len(null)  #number of missing entries
#Run the counter once per column (axis="index") and collect the counts
column_null_count = titanic_survival.apply(not_null_count, axis="index")
print(column_null_count) #column name -> number of missing values
#(the function's own debug prints come first, ending with
# "Name: Embarked, dtype: object")
#PassengerId 0
#Survived 0
#Pclass 0
#Name 0
#Sex 0
#Age 177
#SibSp 0
#Parch 0
#Ticket 0
#Fare 0
#Cabin 687
#Embarked 2
#dtype: int64
#Turn passenger class 1/2/3 into First Class / Second Class / Third Class
def which_class(row):
    """Map the ``Pclass`` value of ``row`` to a human-readable label.

    Args:
        row: A mapping-like object indexable by ``'Pclass'`` (for example
            one DataFrame row passed by ``apply(..., axis=1)``).

    Returns:
        "Unknow" when the class is missing, "First Class", "Second Class"
        or "Third Class" for 1, 2 or 3, and ``None`` for any other value.
    """
    pclass = row['Pclass']
    if pd.isnull(pclass):
        return "Unknow"
    if pclass == 1:
        return "First Class"
    if pclass == 2:
        return "Second Class"
    if pclass == 3:
        return "Third Class"
    #No label exists for other values; the fall-through result is made explicit
    return None
#axis="columns" hands the function one row at a time
classes = titanic_survival.apply(which_class, axis="columns")
print(classes)
#0 Third Class
#1 First Class
#2 Third Class
#Bucket a continuous value into discrete labels: age -> minor / adult / missing
def generate_age_label(row):
    """Classify the ``Age`` value of ``row`` as adult, minor or unknown.

    Args:
        row: A mapping-like object indexable by ``"Age"`` (for example one
            DataFrame row passed by ``apply(..., axis=1)``).

    Returns:
        "Unknow" when the age is missing, "adult" when it is strictly
        greater than 18, otherwise "minor".
    """
    age = row["Age"]
    if pd.isnull(age):
        return "Unknow"
    #NOTE: the comparison is strict, so an age of exactly 18 counts as "minor"
    if age > 18:
        return "adult"
    return "minor"
#Label every passenger row by row
age_labels = titanic_survival.apply(generate_age_label, axis="columns")
print(age_labels)
#0 adult
#1 adult

#Relate survival to the age label: store the labels as a new column,
#then take the mean of Survived within each label
titanic_survival["age_labels"] = age_labels
age_grouo_survival = titanic_survival.pivot_table(
    index="age_labels",
    values="Survived",
    aggfunc="mean",
)
print(age_grouo_survival)
#age_labels
#Unknow 0.293785
#adult 0.382609
#minor 0.503597
#A DataFrame is built from Series, and a Series wraps an ndarray:
#DataFrame ~ the whole table that was read, Series ~ a single row or column,
#ndarray ~ the raw values held by a Series
fandango = pd.read_csv('fandango_score_comparison.csv')
print(type(fandango)) #expected: <class 'pandas.core.frame.DataFrame'>

series_film = fandango["FILM"]
series_rt = fandango['RottenTomatoes'] #Rotten Tomatoes critic score
print(type(series_film)) #expected: <class 'pandas.core.series.Series'>
print(series_film.head(5)) #first five film titles
#0 Avengers: Age of Ultron (2015)
#1 Cinderella (2015)
#2 Ant-Man (2015)
#3 Do You Believe? (2015)
#4 Hot Tub Time Machine 2 (2015)
#Name: FILM, dtype: object
print(series_rt.head(5)) #first five Rotten Tomatoes scores
#0 74
#1 85
#2 80
#3 18
#4 14
#Name: RottenTomatoes, dtype: int64

#.values strips the index and exposes the underlying array
film_names = series_film.values
rt_scores = series_rt.values
print(series_film) #index together with the film titles
#0 Avengers: Age of Ultron (2015)
#1 Cinderella (2015)...
print(film_names) #film titles only
#['Avengers: Age of Ultron (2015)' 'Cinderella (2015)' 'Ant-Man (2015)'...
print(type(film_names)) #expected: <class 'numpy.ndarray'>
print(rt_scores) #[ 74 85 80 18 14 63 ...]
#Build a custom Series: each film title (a string index label) maps to its score
series_custom = pd.Series(data=rt_scores, index=film_names)
print("--", series_custom)
#Avengers: Age of Ultron (2015) 74
#Cinderella (2015) 85
#Selecting by label also works, e.g.:
#series_custom[['Minions (2015)','Leviathan (2014)']]

#Positional slice: entries 5 through 9
fiveten = series_custom.iloc[5:10]
print(fiveten)
#The Water Diviner (2015) 63
#Irrational Man (2015) 42
#Top Five (2014) 86
#Shaun the Sheep Movie (2015) 99
#Love & Mercy (2015) 89
#dtype: int64
#Sorting a Series by hand: sort the labels, then reindex with them
original_index = list(series_custom.index) #film titles as a plain list
print(original_index)
sorted_index = list(original_index)
sorted_index.sort() #lexicographic order
sorted_by_index = series_custom.reindex(index=sorted_index)
print(sorted_by_index)

#Built-in sorting, by key (index) or by value
sc2 = series_custom.sort_index(ascending=True)
print(sc2.head(5))
#'71 (2015) 97
#5 Flights Up (2015) 52
#A Little Chaos (2015) 40
#A Most Violent Year (2014) 90
#About Elly (2015) 97...
sc3 = series_custom.sort_values(ascending=True)
print(sc3.head(5))
#Paul Blart: Mall Cop 2 (2015) 5
#Hitman: Agent 47 (2015) 7
#Hot Pursu (2015) 8
#Fantastic Four (2015) 9
#Taken 3 (2015) 9...
#NumPy functions accept a Series directly
#Element-wise addition of the Series with itself
doubled_scores = np.add(series_custom, series_custom)
print(doubled_scores)
#Avengers: Age of Ultron (2015) 148
#Cinderella (2015) 170
#Element-wise sine
sine_of_scores = np.sin(series_custom)
print(sine_of_scores)
#Largest score
highest_score = np.max(series_custom)
print(highest_score)