pandas是Python中高性能的数据分析库,为数据的统计分析带来了极大的便利。
本文以pandas中最常用的数据结构DataFrame为主,总结常用知识点,如增删改查,分组统计等等。
创建空DataFrame
df=pd.DataFrame()
判断DataFrame是否为空
df.empty
DataFrame去重
df.drop_duplicates()
替换DataFrame中指定字符串
df.replace({"2003-05-10": "2008-08-08"}, regex=True, inplace=True)
填充DataFrame中的NaN
import pandas as pd

# The last record has no score, so pandas stores NaN in that cell.
data = [[101, 90], [102, 99], [103]]
df = pd.DataFrame(data=data, columns=['id', 'score'])
# Fill the missing values with a placeholder. The result is reassigned
# instead of using inplace=True: filling a numeric column in place with a
# string relies on silent upcasting, which pandas deprecated in 2.1.
df = df.fillna("未知")
重塑DataFrame
import pandas as pd

data = [['101', 90, '2003-05-10'], ['102', 99, '2003-05-12'], ['103', 105, '2003-05-11']]
df = pd.DataFrame(data=data, columns=['id', 'score', 'dt'])
# Reshape: dt becomes the row index, each id value becomes a column, and the
# cells hold the matching score; reset_index() turns dt back into a column.
pivoted = df.pivot(index='dt', columns='id', values='score').reset_index()
# print() with a single argument works on both Python 2 and Python 3.
print(pivoted)
DataFrame每列数据类型
df.dtypes
DataFrame行数列数
df.shape
DataFrame列名
df.columns.values
修改列名
df.rename(columns={'id':'studentID', 'score':'studentScore'}, inplace=True)
删除列
df.drop(['id'], axis=1)
增加列
# Assign the row numbers directly instead of wrapping them in a pd.Series:
# a Series is aligned on the index, so any row whose label is not in
# 0..n-1 would silently receive NaN.
df['id2'] = range(len(df))
某列转换成int类型
df['id'] = df['id'].astype('int')
或
df['id']=df['id'].map(lambda x:int(x))
选出某列转List
df["id"].values.tolist()
对每列求和
df.loc['合 计'] = df.apply(lambda x: x.sum())
注意:map()与apply()有区别:Series.map()中的函数作用于Series中的每一个元素,而DataFrame.apply()中的函数作用于DataFrame的一列或一行。
按多列排序
df.sort_values(['colNameA','colNameB'], ascending=False)
某列最小值
# Select the column as a Series (single brackets) so min() returns a scalar;
# int() on the one-element Series produced by df[['score']].min() is deprecated.
int(df['score'].min())
Union ALL
pd.concat([df1,df2])
Left/Right/Inner/Outer Join
import pandas as pd

# Left table: one score record per student id.
data = [
    ['101', 90, '2003-05-10'],
    ['102', 99, '2003-05-10'],
    ['103', 105, '2003-05-11'],
]
df = pd.DataFrame(data, columns=['id', 'score', 'dt'])
# Right table: sex per student id; id 103 has no entry here.
data2 = [('101', '1'), ('102', '0')]
df2 = pd.DataFrame(data2, columns=['id', 'sex'])

# Left join: every row of df, with NaN in sex where df2 has no match.
left_join_df = pd.merge(df, df2, how="left", on="id")
# Right join: every row of df2. The key is spelled out per side here;
# since both sides call it "id", on="id" is the shorter equivalent.
right_join_df = pd.merge(df, df2, how="right", left_on="id", right_on="id")
# Inner join: only the ids present in both frames.
inner_join_df = pd.merge(df, df2, how="inner", left_on="id", right_on="id")
# Outer join: the ids present in either frame.
outer_join_df = pd.merge(df, df2, how="outer", on="id")
总结:left join类似于SQL的left join。如A left join B,结果包含A的全部行,B中没有匹配上的列以NaN填充。right join、inner join、outer join同理。
pandas中DataFrame中merge,默认是inner join。
筛选数据
# These operations are the counterpart of a WHERE clause in SQL.
import pandas as pd

# The last record has no dt, so that cell is missing.
data = [['101', 90, '2003-05-10'], ['102', 99, '2003-05-10'], ['103', 105]]
df = pd.DataFrame(data=data, columns=['id', 'score', 'dt'])
# AND: id is one of 101/102 and score equals 90
print(df[df['id'].isin(['101', '102']) & (df['score'] == 90)])
# NOT: id is neither 101 nor 102
print(df[~df['id'].isin(['101', '102'])])
# Inequality: score is at least 99
print(df[df['score'] >= 99])
# Drop the rows whose dt is missing (i.e. keep only the non-null ones)
print(df[df['dt'].notnull()])
分组排序聚合
import pandas as pd

# Columns: id, language score, math score, grade, class
data = [
    (1, 80, 90, 1, 2),
    (2, 60, 80, 1, 2),
    (3, 70, 90, 1, 3),
    (4, 90, 80, 1, 3),
    (5, 60, 90, 1, 3),
    (6, 50, 80, 2, 3),
    (7, 80, 90, 2, 3),
    (8, 70, 70, 2, 1),
    (9, 90, 90, 2, 1),
    (10, 80, 90, 2, 1),
]
df = pd.DataFrame(data=data, columns=['id', 'language', 'math', 'grade', 'class'])

# Grouped sum: group by grade and class, then sum language and math.
# Several columns must be selected with a list (double brackets); the tuple
# form ['language', 'math'] is no longer accepted by pandas.
grouped_sum = df.groupby(['grade', 'class'])[['language', 'math']].sum().reset_index()
print(grouped_sum)

# Top-N per group: within every (grade, class) group keep the 2 rows with
# the highest math score, then stack the per-group results back together.
top_n = pd.concat([
    subGroup.sort_values(['math'], ascending=False).head(2)
    for subGroupName, subGroup in df.groupby(['grade', 'class'])
])
print(top_n)
List转DataFrame
List-Tuple转DataFrame
data=[(101,90),(102,99),(103,99)]
df=pd.DataFrame(data=data,columns=['id','score'])
List-List转DataFrame
data=[[101,90],[102,99],[103,99]]
df=pd.DataFrame(data=data,columns=['id','score'])
List-Dict转DataFrame
data=[{'id':101,'score':90},{'id':102,'score':99},{'id':103,'score':99}]
df=pd.DataFrame(data=data)
excel转DataFrame
# The keyword is sheet_name; the older spelling "sheetname" is no longer accepted.
df = pd.read_excel(io="excelPath.xlsx", sheet_name="sheetName")
Json、Csv与DataFrame互转
Csv文件转DataFrame
# Note: to read a TSV file, just change the separator to '\t'.
# names takes one entry per column; extend the list to match the file.
df = pd.read_csv(filePath, sep=',', names=['colNameA', 'colNameB'])
Json文件转DataFrame
df=pd.read_json(filePath, lines=True)
DataFrame保存成Csv文件
df.to_csv("csvResult.csv",index=False,header=True,sep=',',encoding='utf-8-sig')
DataFrame保存成Json文件
df.to_json(path_or_buf="jsonData.json",orient='records', lines=True)
Sql执行结果转DataFrame
df=pd.read_sql(sql=sql,con=conn)