Pandas库数据基本处理

Pandas常用预处理办法

使用关于泰坦尼克号的获救人信息csv表格进行常见的数据预处理

  • 缺失值的剔除
  • pivot_table()函数对数据进行透视处理
  • dropna()方法对缺失值进行丢弃
  • .loc[]索引器按标签对元素进行定位
import pandas as pd
import numpy as np
# Load the Titanic training data from a CSV file given by a relative path.
titanic_survival=pd.read_csv("titanic_train.csv")
# Notebook-style preview of the first five rows (the return value is unused in a plain script).
titanic_survival.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
# NaN in the preview above marks a missing value.
# pandas.isnull() returns True/False per element to flag which values are missing.
age=titanic_survival["Age"]
# Individual elements can be read by label with the .loc[] indexer.

#print(age.loc[0:10])
age_is_null=pd.isnull(age)     # boolean Series: True where Age is missing
#print(age_is_null)
# A boolean mask can be used as an index: it keeps only the entries where the mask is True and returns a new Series.

age_null_true=age[age_is_null]
print(age_null_true)
age_null_count=len(age_null_true)
print(age_null_count)
5     NaN
17    NaN
19    NaN
26    NaN
28    NaN
       ..
859   NaN
863   NaN
868   NaN
878   NaN
888   NaN
Name: Age, Length: 177, dtype: float64
177

需求一:对缺失值进行处理以进行后续计算

# Demonstration: what happens when the mean is computed without handling missing values first.
# The built-in sum() propagates NaN, so the value printed below is nan.
mean_age=sum(titanic_survival["Age"])/len(titanic_survival["Age"])
print(mean_age)
nan

对于含有缺失值的数据进行均值的计算:

  • 法①直接使用mean()函数可以跳过缺失值
  • 法②使用age_is_null数组进行处理
# Missing values must be handled before averaging.
# Method 1: Series.mean() skips NaN by default (skipna=True).
print(titanic_survival["Age"].mean())
# Method 2: filter out the missing entries first, then average by hand.
# ~age_is_null inverts the boolean mask, keeping only the rows whose Age is present.
true_age=titanic_survival["Age"][~age_is_null]
correct_mean_age=sum(true_age)/len(true_age)
print(correct_mean_age)
29.69911764705882
29.69911764705882

需求二:求各个舱位等级的平均船票价格

passenger_classes=[1,2,3]
# Map each passenger class to the mean fare of the rows belonging to that class.
# The boolean mask selects the rows, "Fare" selects the column, mean() averages it.
fares_by_class={
    pclass: titanic_survival.loc[titanic_survival["Pclass"]==pclass,"Fare"].mean()
    for pclass in passenger_classes
}
print(fares_by_class)
{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}

pivot_table()函数对上述过程化简

函数的详细操作可参考该链接:https://www.cnblogs.com/huangchenggener/p/10983516.html

# Survival rate for first, second and third class: the mean of the 0/1 Survived column per Pclass.
# The aggregation is passed by name; pandas 2.1+ emits a FutureWarning when given the np.mean callable.
passenger_survival=titanic_survival.pivot_table(index="Pclass",values="Survived",aggfunc="mean")
print(passenger_survival)
        Survived
Pclass          
1       0.629630
2       0.472826
3       0.242363
# Mean age per passenger class.
passenger_age=titanic_survival.pivot_table(index="Pclass",values="Age")
print(passenger_age)
# When aggfunc is omitted, pivot_table aggregates with the mean.
              Age
Pclass           
1       38.233441
2       29.877630
3       25.140620
# Relate one index column to two value columns at once:
# mean fare and survival rate per port of embarkation.
# The aggregation is passed by name; pandas 2.1+ emits a FutureWarning when given the np.mean callable.
port_states=titanic_survival.pivot_table(index="Embarked",values=["Fare","Survived"],aggfunc="mean")
print(port_states)
               Fare  Survived
Embarked                     
C         59.954144  0.553571
Q         13.276030  0.389610
S         27.079812  0.336957
# Dropping missing values:
# axis=1 drops every column that contains a missing value.
drop_na_columns=titanic_survival.dropna(axis=1)
#print(drop_na_columns)
# axis=0 with subset drops the rows in which "Age" or "Sex" is missing.
new_titanic_survival=titanic_survival.dropna(axis=0,subset=["Age","Sex"])
# axis=0 refers to rows, axis=1 to columns.
new_titanic_survival.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

需求三:使用.loc[]索引器获取元素

# .loc[row_label, column_label] reads a single cell by its labels.
row_83_age=titanic_survival.loc[83,"Age"]
row_766_pclass=titanic_survival.loc[766,"Pclass"]
print(row_83_age)
print(row_766_pclass)
28.0
1

需求四:排序和索引值的修改

排序-sort_values函数

new_titanic_survival=titanic_survival.sort_values("Age",ascending=False)
# ascending=False sorts in descending order, so the oldest passengers come first.
print(new_titanic_survival[0:10])

# Renumber the index after sorting.
titanic_reindexed=new_titanic_survival.reset_index(drop=True)
# drop=True discards the old index instead of inserting it as a new column.
# reset_index returns a new DataFrame here; new_titanic_survival itself is not modified.
print("-----------------------------------------")
# .loc slicing is label-based and includes both endpoints, so this prints rows 0 through 10.
print(titanic_reindexed.loc[0:10])
#print(titanic_survival.loc[0:10])
     PassengerId  Survived  Pclass                                  Name  \
630          631         1       1  Barkworth, Mr. Algernon Henry Wilson   
851          852         0       3                   Svensson, Mr. Johan   
493          494         0       1               Artagaveytia, Mr. Ramon   
96            97         0       1             Goldschmidt, Mr. George B   
116          117         0       3                  Connors, Mr. Patrick   
672          673         0       2           Mitchell, Mr. Henry Michael   
745          746         0       1          Crosby, Capt. Edward Gifford   
33            34         0       2                 Wheadon, Mr. Edward H   
54            55         0       1        Ostby, Mr. Engelhart Cornelius   
280          281         0       3                      Duane, Mr. Frank   

      Sex   Age  SibSp  Parch      Ticket     Fare Cabin Embarked  
630  male  80.0      0      0       27042  30.0000   A23        S  
851  male  74.0      0      0      347060   7.7750   NaN        S  
493  male  71.0      0      0    PC 17609  49.5042   NaN        C  
96   male  71.0      0      0    PC 17754  34.6542    A5        C  
116  male  70.5      0      0      370369   7.7500   NaN        Q  
672  male  70.0      0      0  C.A. 24580  10.5000   NaN        S  
745  male  70.0      1      1   WE/P 5735  71.0000   B22        S  
33   male  66.0      0      0  C.A. 24579  10.5000   NaN        S  
54   male  65.0      0      1      113509  61.9792   B30        C  
280  male  65.0      0      0      336439   7.7500   NaN        Q  
-----------------------------------------
    PassengerId  Survived  Pclass                                  Name   Sex  \
0           631         1       1  Barkworth, Mr. Algernon Henry Wilson  male   
1           852         0       3                   Svensson, Mr. Johan  male   
2           494         0       1               Artagaveytia, Mr. Ramon  male   
3            97         0       1             Goldschmidt, Mr. George B  male   
4           117         0       3                  Connors, Mr. Patrick  male   
5           673         0       2           Mitchell, Mr. Henry Michael  male   
6           746         0       1          Crosby, Capt. Edward Gifford  male   
7            34         0       2                 Wheadon, Mr. Edward H  male   
8            55         0       1        Ostby, Mr. Engelhart Cornelius  male   
9           281         0       3                      Duane, Mr. Frank  male   
10          457         0       1             Millet, Mr. Francis Davis  male   

     Age  SibSp  Parch      Ticket     Fare Cabin Embarked  
0   80.0      0      0       27042  30.0000   A23        S  
1   74.0      0      0      347060   7.7750   NaN        S  
2   71.0      0      0    PC 17609  49.5042   NaN        C  
3   71.0      0      0    PC 17754  34.6542    A5        C  
4   70.5      0      0      370369   7.7500   NaN        Q  
5   70.0      0      0  C.A. 24580  10.5000   NaN        S  
6   70.0      1      1   WE/P 5735  71.0000   B22        S  
7   66.0      0      0  C.A. 24579  10.5000   NaN        S  
8   65.0      0      1      113509  61.9792   B30        C  
9   65.0      0      0      336439   7.7500   NaN        Q  
10  65.0      0      0       13509  26.5500   E38        S  

需求五:Pandas自定义函数

apply()函数

def hundreds_row(column):
    """Return the hundredth item (position 99) of a Series.

    The lookup is positional (``.iloc``), so the result does not depend on
    the index labels. Plain ``column[99]`` is a label lookup on an integer
    index and a deprecated positional fallback on other indexes.

    Args:
        column: a pandas Series with at least 100 elements.

    Returns:
        The element stored at position 99.

    Raises:
        IndexError: if the Series has fewer than 100 elements.
    """
    return column.iloc[99]

# Return the hundredth item from each column.
# The result gets its own name so that the hundreds_row function is not overwritten
# and can still be called afterwards.
hundredth_row=titanic_survival.apply(hundreds_row)
print(hundredth_row)

PassengerId                  100
Survived                       0
Pclass                         2
Name           Kantor, Mr. Sinai
Sex                         male
Age                           34
SibSp                          1
Parch                          0
Ticket                    244367
Fare                          26
Cabin                        NaN
Embarked                       S
dtype: object

缺失值的计算:

# Count how many values are missing in a column.
def not_null_count(column):
    """Return the number of missing entries in ``column``.

    Despite its name, this counts the values that ARE null: it builds a
    boolean mask with ``pd.isnull`` and returns how many elements the mask
    selects.
    """
    missing_mask=pd.isnull(column)
    return len(column[missing_mask])
# Count the missing values in every column: apply() calls not_null_count once per column
# and collects the results into a Series indexed by column name.
column_null_count=titanic_survival.apply(not_null_count)
print(column_null_count)
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

对球员进行重新编号

# Load the passing-event table from a CSV file given by a relative path.
fullevent=pd.read_csv("passingevents.csv")
# Notebook-style preview of the first five rows.
fullevent.head()
# print(fullevent.loc[0,"OriginPlayerID"].endswith("ID"))
MatchID TeamID OriginPlayerID DestinationPlayerID MatchPeriod EventTime EventSubType EventOrigin_x EventOrigin_y EventDestination_x EventDestination_y
0 1 Huskies Huskies_D1 Huskies_F1 1H 46.323501 Head pass 34 97 59.0 95.0
1 1 Huskies Huskies_M1 Huskies_F2 1H 51.022546 Simple pass 53 89 69.0 91.0
2 1 Opponent1 Opponent1_D2 Opponent1_G1 1H 89.008721 Simple pass 19 16 5.0 50.0
3 1 Opponent1 Opponent1_G1 Opponent1_F1 1H 92.216160 Launch 5 50 67.0 44.0
4 1 Huskies Huskies_M2 Huskies_M3 1H 98.265191 Simple pass 42 55 36.0 54.0
# Lookup table from a player's position code (the part of the ID after the
# team prefix, e.g. "D1") to the new number, stored as a string.
myNumberDict={
    "D1":"1","D2":"2","D3":"3","D4":"4","D5":"5",
    "D6":"6","D7":"7","D8":"8","D9":"9","D10":"10",
    "F1":"11","F2":"12","F3":"13","F4":"14","F5":"15","F6":"16",
    "G1":"17","G2":"31",
    "M1":"18","M2":"19","M3":"20","M4":"21","M5":"22","M6":"23","M7":"24",
    "M8":"25","M9":"26","M10":"27","M11":"28","M12":"29","M13":"30",
}


def _player_number(player_id):
    """Translate a player ID such as "Huskies_D1" into its number string.

    Raises:
        KeyError: if the position code is not present in myNumberDict.
    """
    # Split on the LAST underscore so team names that contain "_" still resolve.
    position_code=player_id.rsplit("_",1)[-1]
    return myNumberDict[position_code]


def cerial_number(column):
    """Return the new number for the passing player of one event.

    Args:
        column: one event record indexable by column name. Despite the
            parameter name, this is a row when used with
            ``DataFrame.apply(..., axis=1)``.

    Returns:
        The number string mapped from the record's "OriginPlayerID".
    """
    return _player_number(column["OriginPlayerID"])


def num2(column):
    """Return the new number for the receiving player of one event.

    Args:
        column: one event record indexable by column name (a row when used
            with ``DataFrame.apply(..., axis=1)``).

    Returns:
        The number string mapped from the record's "DestinationPlayerID".
    """
    return _player_number(column["DestinationPlayerID"])
# Build both replacement columns from the original IDs first (axis=1 passes one row per call) ...
origin_numbers=fullevent.apply(cerial_number,axis=1)
destination_numbers=fullevent.apply(num2,axis=1)
# ... then overwrite the ID columns in place with the new numbers.
fullevent["DestinationPlayerID"]=destination_numbers
fullevent["OriginPlayerID"]=origin_numbers
# Notebook-style preview of the renumbered table.
fullevent.head()
# Export the renumbered events to an Excel workbook.
# The context manager saves and closes the file on exit; the explicit
# ExcelWriter.save() call was removed in pandas 2.0.
with pd.ExcelWriter('test.xlsx', engine='xlsxwriter') as writer:
    fullevent.to_excel(writer, sheet_name='Sheet1')

你可能感兴趣的:(python,python,数据分析,大数据)