Pandas uses a two-dimensional data structure, the DataFrame, to represent tabular data.
First, import pandas and numpy:
import pandas as pd
import numpy as np
CSV and xlsx files are read with read_csv() and read_excel() respectively:
df = pd.read_csv('./data/HR.csv')
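A hypothetical Excel counterpart of the line above (the path and sheet name here are placeholders, not files from this tutorial):

df = pd.read_excel('./data/HR.xlsx', sheet_name=0)  # placeholder path; sheet_name=0 reads the first sheet

The examples below instead build a small DataFrame by hand: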
df = pd.DataFrame({
"id": [1001,1002,1003,1004,1005,1006],
"date": pd.date_range('20130102', periods=6),
"city": ['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
"age": [23,44,54,32,34,32],
"category": ['100-A','100-B','110-A','110-C','210-A','130-F'],
"price": [1200,np.nan,2133,5433,np.nan,4432]},
columns = ['id','date','city','category','age','price'])
Output:
id date city category age price
0 1001 2013-01-02 Beijing 100-A 23 1200.0
1 1002 2013-01-03 SH 100-B 44 NaN
2 1003 2013-01-04 guangzhou 110-A 54 2133.0
3 1004 2013-01-05 Shenzhen 110-C 32 5433.0
4 1005 2013-01-06 shanghai 210-A 34 NaN
5 1006 2013-01-07 BEIJING 130-F 32 4432.0
df.shape   # (6, 6): 6 rows, 6 columns
df.info()  # summary of the table: index, column dtypes, non-null counts, memory usage
df.dtypes  # dtype of every column
Output:
id int64
date datetime64[ns]
city object
category object
age int64
price float64
df['date'].dtypes  # dtype of a single column
df.isnull()          # null flags for the whole table
df['date'].isnull()  # null flags for a single column
df['date'].unique()  # unique values of a column
df.values            # the underlying values as a NumPy array
df.head()  # first 5 rows by default; a custom number of rows can be passed
df.tail()  # last 5 rows by default; a custom number of rows can be passed
For example: max_time and min_time are two existing columns, and the business now needs a new column gs, where gs = max_time - min_time:
df['gs'] = df['max_time'] - df['min_time']
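A minimal sketch of this pattern on a small throwaway DataFrame (max_time and min_time are hypothetical columns, not part of the HR data above):

df_t = pd.DataFrame({'max_time': [10, 20, 30], 'min_time': [1, 5, 7]})  # hypothetical example data
df_t['gs'] = df_t['max_time'] - df_t['min_time']                        # gs becomes 9, 15, 23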
df.describe()  # summary statistics for the numeric columns
Output (the sample output below comes from a different DataFrame with numeric columns a, b, c, d):
a b c d
count 4.000000 4.000000 4.000000 4.000000
mean -0.058927 -0.474549 1.019342 -0.750464
std 0.595253 0.530539 0.753136 1.022685
min -0.640585 -0.997408 0.160999 -1.855990
25% -0.532082 -0.812058 0.509721 -1.489673
50% -0.065873 -0.561149 1.077771 -0.708147
75% 0.407282 -0.223640 1.587391 0.031062
max 0.536626 0.221508 1.760826 0.270427
df.head(1)['date']     # the date column of the first row
df.head(1)['date'][0]  # the element value in that column
sum(df['ability'])  # sum of an entire column ('ability' is an illustrative column name); the Series method df['ability'].sum() is more idiomatic
df[df['data'] == '20161111']           # rows matching this condition
df[df['data'] == '20161111'].index[0]  # index value of the first matching row
df.index      # the row index
df.index[0]   # first index value
df.index[-1]  # last index value (only the label, not the row itself)
df.columns    # column labels
df[0:2]  # rows 1-2; positions start at 0 and the end is exclusive
df.fillna(value=0)                       # fill missing values with 0
df['price'].fillna(df['price'].mean())   # fill missing prices with the column mean
df['city'] = df['city'].map(str.strip)   # strip leading/trailing whitespace
df['city'] = df['city'].str.lower()      # convert to lowercase
df['price'].astype('int')                # change dtype (fill NaN first, otherwise this raises)
df.rename(columns={'category': 'category-size'})  # rename a column
df['city'].drop_duplicates()             # drop later duplicates (keep the first occurrence)
df['city'].drop_duplicates(keep='last')  # drop earlier duplicates (keep the last occurrence)
df['city'].replace('sh', 'shanghai')     # replace a value
df1=pd.DataFrame({
"id":[1001,1002,1003,1004,1005,1006,1007,1008],
"gender":['male','female','male','female','male','female','male','female'],
"pay":['Y','N','Y','Y','N','Y','N','Y',],
"m-point":[10,12,20,40,40,40,30,20]})
df_inner = pd.merge(df, df1, how='inner')  # intersection of keys
df_left = pd.merge(df, df1, how='left')    # keep all rows of df
df_right = pd.merge(df, df1, how='right')  # keep all rows of df1
df_outer = pd.merge(df, df1, how='outer')  # union of keys
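The four merges above join on the only shared column, id; the key can also be stated explicitly, for example:

pd.merge(df, df1, how='inner', on='id')  # same result, with the join key spelled out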
df_left.set_index('id')          # use the id column as the index
df_left.sort_values(by=['age'])  # sort by the age column
df_left.sort_index()             # sort by the index
df_left['group'] = np.where(df_left['price'] > 3000, 'high', 'low')  # label rows by price
df_left.loc[(df_left['city'] == 'beijing') & (df_left['price'] >= 4000), 'sign'] = 1  # flag rows matching both conditions
split = pd.DataFrame((x.split('-') for x in df_left['category']), index=df_left.index, columns=['category', 'size'])  # split category into two columns
df_left = pd.merge(df_left, split, right_index=True, left_index=True)  # join the split columns back onto df_left
Three functions are used for selecting data: loc, iloc, and ix (ix is deprecated and was removed in pandas 1.0).
df_left.loc[3]     # the row whose index label is 3, not the third row
df_left.iloc[0:5]  # rows at positions 0, 1, 2, 3, 4
df_left.reset_index()                # reset to the default integer index
df_left = df_left.set_index('date')  # use the date column as the index
df_left[:'2013-01-04']               # rows up to and including 2013-01-04 (label slicing is inclusive)
df_left.iloc[:3, :2]  # the numbers before and after the colon are positions, not labels; starting from 0: first three rows, first two columns
df_left.iloc[[0,2,5], [4,5]]  # rows at positions 0, 2 and 5; columns at positions 4 and 5
df_left.ix[:'2013-01-03', :4]  # rows up to 2013-01-03, first four columns (ix no longer works in current pandas)
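Since ix was removed in pandas 1.0, the same mixed label/position selection can be written by chaining loc and iloc, for example:

df_left.loc[:'2013-01-03'].iloc[:, :4]  # rows up to 2013-01-03 by label, then the first four columns by position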
df_left['city'].isin(['beijing'])  # boolean mask: is city in the given list
df_left.loc[df_left['city'].isin(['beijing','shanghai'])]  # rows whose city is beijing or shanghai
pd.DataFrame(split['category'].str[:3])  # first three characters of each category value (split is the table created above)
# AND
df_left.loc[(df_left['age'] > 25) & (df_left['city'] == 'beijing'), ['id','city','age','category','gender']]
# OR
df_left.loc[(df_left['age'] > 25) | (df_left['city'] == 'beijing'), ['id','city','age','category','gender']]
# NOT
df_left.loc[(df_left['city'] != 'beijing'), ['id','city','age','category','gender']]
df_left.loc[(df_left['age'] > 25) & (df_left['city'] == 'shanghai'), ['id','city','age','category','gender']].city.count()  # count of matching rows
df_left.query('city == ["Beijing", "shanghai"]')
df_left.query('city == ["beijing", "Shenzhen"]').price.sum()
The main functions are groupby and pivot_table.
df_left.groupby('city').count()                 # row counts per city, for every column
df_left.groupby('city')['id'].count()           # count of ids per city
df_left.groupby(['city','size'])['id'].count()  # count of ids per city and size
df_left.groupby('city')['price'].agg([len, np.sum, np.mean])  # several aggregates of price per city
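pivot_table can produce a similar summary; a minimal sketch aggregating price by city (the choice of aggregations here is illustrative):

pd.pivot_table(df_left, index='city', values='price', aggfunc=['count', 'sum', 'mean'])  # one row per city, one column per aggregate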
Data sampling, and computing the standard deviation, covariance and correlation coefficient:
df_left.sample(n=3)                   # randomly sample 3 rows
weights = [0, 0, 0, 0, 0.5, 0.5]
df_left.sample(n=2, weights=weights)  # weighted sampling
df_left.sample(n=6, replace=True)     # sample with replacement
df_left.sample(n=6, replace=False)    # sample without replacement
df_left['price'].std()                     # standard deviation of the price column
df_inner.cov()                             # covariance between all numeric columns of the table
df_left['price'].cov(df_inner['m-point'])  # covariance between two columns
# correlation analysis of the whole table
df_inner.corr()
# correlation between two columns
df_left['price'].corr(df_inner['m-point'])  # the coefficient ranges from -1 to 1: near 1 is positive correlation, near -1 negative, 0 uncorrelated
The analyzed data can be exported to xlsx or csv format:
df_left.to_excel('excel_to_python.xlsx', sheet_name='bluewhale_cc')
df_left.to_csv('excel_to_python.csv')