Pandas
1. Reading Data
import pandas as pd

fpath = './ratings.csv'
xlsPath = './test.xlsx'

def test():
    # the three common loaders are pd.read_csv, pd.read_excel and pd.read_sql;
    # all three are demonstrated below
    df = pd.read_csv(fpath)
    df.head()
    df.shape
    df.columns
    df.index
    df.dtypes
    # a tab-separated file with no header row, supplying the column names
    pd.read_csv(fpath,
                sep='\t',
                header=None,
                names=['pdata', 'pv', 'uv']
                )
    pd.read_excel(xlsPath)
    # reading from MySQL through a pymysql connection
    import pymysql
    conn = pymysql.connect(
        host='127.0.0.1',
        port=3306,
        user='root',
        password='123456',
        database='test',
        charset='utf8'
    )
    sql = 'select * from user'
    mysql_page = pd.read_sql(sql, con=conn)
if __name__ == '__main__':
    test()
2. DataFrame and Series
import pandas as pd

def test():
    '''
    Series: a one-dimensional labeled array; values can be of mixed type
    '''
    s1 = pd.Series([1, 'a', 5.2, 7])
    print(s1)
    s1.index
    s1.values
    # a Series with an explicit index
    s2 = pd.Series([1, 'a', 5.2, 7], index=['d', 'b', 'a', 'c'])
    # a Series built from a dict: the keys become the index
    d = {'a': 1, 'b': 2, 'c': 3}
    s3 = pd.Series(d)
    s2['a']
    s2[['a', 'b']]
    '''
    DataFrame: like a 2-D array, but each column may have a different dtype;
    it has both a row index (index) and a column index (columns),
    and can be viewed as a dict of Series
    '''
    data = {
        'state': ['a', 'b', 'c', 'd', 'e'],
        'year': [2017, 2018, 2019, 2020, 2021]
    }
    df = pd.DataFrame(data)
    df.dtypes
    df.columns
    df.index
    df['year']              # a single column -> Series
    df[['state', 'year']]   # a list of columns -> DataFrame
    df.loc[1]               # a single row -> Series
    type(df.loc[1])
    df.loc[1:3]             # a row range -> DataFrame (loc slices are inclusive)
    type(df.loc[1:3])
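    # A minimal sketch (assumed illustration, not from the original) of the
    # "dict of Series" view stated above: each column of a DataFrame is a Series.
    df2 = pd.DataFrame({'x': pd.Series([1, 2, 3]), 'y': pd.Series(['a', 'b', 'c'])})
    type(df2['x'])   # pandas.core.series.Series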
if __name__ == '__main__':
    test()
3. Querying Data
import pandas as pd

fpath = './ratings.csv'

def test():
    '''
    1. df.loc   query by row/column label
    2. df.iloc  query by row/column integer position
    3. df.where
    4. df.query
    .loc can both query and write back, so it is strongly recommended. Usage:
    1. query with a single label
    2. query with a list of labels
    3. query with a label range (slice)
    4. query with a boolean condition
    5. query with a function
    Notes
    - all of the above work for rows and columns alike
    - watch the dimensionality drop: DataFrame -> Series -> scalar
    :return:
    '''
    df = pd.read_csv(fpath)
    df.head()
    df.set_index('ymd', inplace=True)
    df.head()
    # strip the ℃ suffix so the temperature columns become int
    df.loc[:, 'bWendu'] = df['bWendu'].str.replace('℃', '').astype('int32')
    df.loc[:, 'yWendu'] = df['yWendu'].str.replace('℃', '').astype('int32')
    df.head()
    df.loc['2018-01-03', 'bWendu']                      # single label -> scalar
    df.loc['2018-01-03', ['bWendu', 'yWendu']]          # label list -> Series
    df.loc[['2018-01-03', '2018-01-04', '2018-01-05'], ['bWendu', 'yWendu']]
    df.loc['2018-01-03':'2018-01-06', 'bWendu']         # label ranges are inclusive
    df.loc['2018-01-03':'2018-01-06', 'bWendu':'yWendu']
    df.loc[df['yWendu'] < -10, :]                       # boolean condition
    print(df['yWendu'] < -10)
    df.loc[(df['bWendu'] <= 30) & (df['yWendu'] >= 15) & (df['aqiLevel'] == 1), :]
    df.loc[lambda df: (df['bWendu'] <= 30) & (df['yWendu'] >= 15), :]
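    # The list above also names df.iloc, df.where and df.query, which the
    # snippet never demonstrates; a minimal sketch, assuming the same df:
    df.iloc[0:3, 0:2]                          # first 3 rows, first 2 columns, by position
    df['yWendu'].where(df['yWendu'] < -10)     # keeps the index; non-matching values become NaN
    df.query('bWendu <= 30 and yWendu >= 15')  # condition as a string expression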
if __name__ == '__main__':
    test()
4. Adding Columns
import pandas as pd

fpath = './ratings.csv'

def test():
    df = pd.read_csv(fpath)
    # the temperature columns arrive as strings like '3℃'; convert them first
    df.loc[:, 'bWendu'] = df['bWendu'].str.replace('℃', '').astype('int32')
    df.loc[:, 'yWendu'] = df['yWendu'].str.replace('℃', '').astype('int32')
    # 1. direct assignment of a derived column
    df.loc[:, 'wencha'] = df['bWendu'] - df['yWendu']

    # 2. df.apply with axis=1: the function receives one row at a time
    def getWenduType(row):
        if row['bWendu'] > 33:
            return '高温'
        if row['yWendu'] < -10:
            return '低温'
        return '常温'

    df.loc[:, 'wendu_type'] = df.apply(getWenduType, axis=1)
    df['wendu_type'].value_counts()
    # 3. df.assign returns a new DataFrame; df itself is unchanged
    df.assign(
        yWendu_huashi=lambda x: x['yWendu'] * 9 / 5 + 32,
        bWendu_huashi=lambda x: x['bWendu'] * 9 / 5 + 32,
    )
    # 4. conditional assignment on a subset of rows
    df['wencha_type'] = ''
    df.loc[df['bWendu'] - df['yWendu'] > 10, 'wencha_type'] = '温差大'
    df.loc[df['bWendu'] - df['yWendu'] <= 10, 'wencha_type'] = '温差正常'
    df['wencha_type'].value_counts()

if __name__ == '__main__':
    test()
5. Statistics
import pandas as pd

fpath = './ratings.csv'

def test():
    df = pd.read_csv(fpath)
    df.loc[:, 'bWendu'] = df['bWendu'].str.replace('℃', '').astype('int32')
    df.loc[:, 'yWendu'] = df['yWendu'].str.replace('℃', '').astype('int32')
    df.head(3)
    '''
    1. summary statistics
    2. unique values and value counts
    3. correlation and covariance
    '''
    df.describe()
    df['bWendu'].mean()
    df['bWendu'].max()
    df['yWendu'].min()
    df['tianqi'].unique()
    df['fengli'].unique()
    df['tianqi'].value_counts()
    df['fengli'].value_counts()
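    # Item 3 above (correlation and covariance) has no example in the snippet;
    # a minimal sketch, assuming aqi is a numeric column in this dataset:
    df['aqi'].cov(df['bWendu'])    # covariance between two Series
    df['aqi'].corr(df['bWendu'])   # Pearson correlation coefficient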
if __name__ == '__main__':
    test()
6. Handling Missing Values
import pandas as pd

fpath = './ratings.csv'

def test():
    '''
    How to clean irregular data and missing values.
    1. isnull / notnull: detect missing values; works on both DataFrame and Series
    2. dropna: drop rows/columns with missing values
       - axis: drop rows or columns, 0 or 'index', 1 or 'columns', default 0
       - how: 'any' drops if any value is missing, 'all' only if all values are missing
       - inplace: if True, modify the current df, otherwise return a new df
    3. fillna: fill missing values
       - value: the fill value, either a scalar or a dict (key = column name, value = fill value)
       - method: 'ffill' fills with the previous non-null value (forward fill),
         'bfill' fills with the next non-null value (backward fill)
       - axis: fill along rows or columns, 0 or 'index', 1 or 'columns'
       - inplace: if True, modify the current df, otherwise return a new df
    '''
    origin_df = pd.read_csv(fpath, skiprows=2)
    print(origin_df)
    origin_df.isnull()
    origin_df['分数'].isnull()
    origin_df['分数'].notnull()
    origin_df.loc[origin_df['分数'].notnull(), :]
    origin_df.dropna(axis='columns', how='all', inplace=True)
    origin_df.dropna(axis='index', how='all', inplace=True)
    origin_df.loc[:, '分数'] = origin_df['分数'].fillna(0)
    origin_df.loc[:, '姓名'] = origin_df['姓名'].fillna(method='ffill')  # or .ffill() in newer pandas
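    # The notes above also mention dict-valued fills and backward fill, which
    # the snippet never demonstrates; a minimal sketch on the same origin_df:
    origin_df.fillna({'分数': 0})   # per-column fill values via a dict
    origin_df['分数'].bfill()       # backward fill (fillna(method='bfill') in older pandas)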
    origin_df.to_excel('./xxx.xlsx', index=False)

if __name__ == '__main__':
    test()
7. Sorting Data
import pandas as pd

fpath = './ratings.csv'

def test():
    df = pd.read_csv(fpath)
    '''
    1. Sorting a Series
       Series.sort_values(ascending=True, inplace=False)
       - ascending: default True for ascending order, False for descending
       - inplace: whether to modify the original Series
    2. Sorting a DataFrame
       DataFrame.sort_values(by, ascending=True, inplace=False)
       - by: a string or list of strings, for single- or multi-column sorting
       - ascending: bool or list; a list matches the columns in by one-to-one
       - inplace: whether to modify the original DataFrame
    '''
    df['aqi'].sort_values()
    df['aqi'].sort_values(ascending=False)
    df.sort_values(by=['aqiLevel', 'bWendu'])
    df.sort_values(by=['aqiLevel', 'bWendu'], ascending=False)
    df.sort_values(by=['aqiLevel', 'bWendu'], ascending=[True, False])

if __name__ == '__main__':
    test()
8. String Handling
import pandas as pd

fpath = './ratings.csv'

def test():
    df = pd.read_csv(fpath)
    '''
    Rules
    1. usage: get the Series' str attribute first, then call methods on it
    2. only works on string columns, not on numeric columns
    3. DataFrame has no str attribute or string-handling methods
    4. Series.str is not Python's native str; it has its own set of methods,
       most of which resemble the native ones
    5. see the docs: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.html
    '''
    '''
    Common patterns
    1. string-processing functions
    2. bool Series from startswith, contains, etc. used as query conditions
    3. chained calls when multiple str operations are needed
    4. regular expressions
    '''
    df.dtypes
    df['bWendu'].str
    df['bWendu'].str.replace('℃', '')
    df['bWendu'].str.isnumeric()
    # df['aqi'].str.len()  # raises AttributeError: aqi is numeric and .str needs strings (rule 2)
    condition = df['ymd'].str.startswith('2018-03')
    df[condition].head()
    df['ymd'].str.replace('-', '')
    # df['ymd'].str.replace('-', '').slice(0, 6)  # fails: slice lives on .str, not on the Series
    df['ymd'].str.replace('-', '').str.slice(0, 6)   # each str call returns a Series, so chain .str again
    df['ymd'].str.replace('-', '').str[0:6]

    def get_date(x):
        year, month, day = x['ymd'].split('-')
        return f'{year}年{month}月{day}日'

    df['中文日期'] = df.apply(get_date, axis=1)
    df['中文日期'].str.replace('年', '').str.replace('月', '').str.replace('日', '')
    df['中文日期'].str.replace('[年月日]', '', regex=True)
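    # Item 2 above mentions contains as a query condition, but only startswith
    # is demonstrated; a minimal sketch (contains also accepts a regex), same df:
    rainy = df['tianqi'].str.contains('雨')   # bool Series: weather description mentions rain
    df[rainy].head()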
if __name__ == '__main__':
    test()
9. The axis Parameter
import pandas as pd
import numpy as np

def test():
    '''
    1. axis=0 or 'index'
       - for a single-row operation, it means that row
       - for an aggregation, it means across rows
    2. axis=1 or 'columns'
       - for a single-column operation, it means that column
       - for an aggregation, it means across columns
    Rule of thumb: the named axis is the one that moves (gets collapsed);
    the other axis stays fixed.
    '''
    df = pd.DataFrame(
        np.arange(12).reshape(3, 4),
        columns=['A', 'B', 'C', 'D']
    )
    df.drop('A', axis=1)   # drop the column named 'A'
    df.drop(1, axis=0)     # drop the row with index 1
    df.mean(axis=0)        # aggregate across rows: one mean per column

    def get_sum(x):
        return x['A'] + x['B'] + x['C'] + x['D']

    df['sum'] = df.apply(get_sum, axis=1)   # across columns: one value per row
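    # A small contrast (assumed illustration) with the rule of thumb above:
    # axis=1 collapses the columns, leaving one aggregated value per row.
    df.mean(axis=1)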
if __name__ == '__main__':
    test()
10. index
import pandas as pd

fpath = './ratings.csv'

def test():
    df = pd.read_csv(fpath)
    '''
    What an index is for
    1. convenient lookups
    2. performance
       - unique index: hash-based lookup, O(1)
       - non-unique but sorted index: binary search, O(logN)
       - unsorted index: full scan, O(N)
    3. automatic data alignment
    4. richer data-structure support
    '''
    df.set_index('userId', inplace=True, drop=False)
    df.index
    df.loc[500].head(5)                  # query by index
    df.loc[df['userId'] == 500].head()   # equivalent query by column value
    # automatic alignment: values are added by matching index labels
    s1 = pd.Series([1, 2, 3], index=list('abc'))
    s2 = pd.Series([2, 3, 4], index=list('bcd'))
    print(s1 + s2)   # 'a' and 'd' have no counterpart, so they become NaN
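    # A rough timing sketch (assumed illustration) of the performance claims in
    # point 2 above; absolute numbers depend entirely on machine and data size.
    from timeit import timeit
    df_sorted = df.sort_index()
    timeit(lambda: df.loc[500], number=100)          # lookup on an unsorted index
    timeit(lambda: df_sorted.loc[500], number=100)   # sorted index: binary search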
    '''
    CategoricalIndex: an index backed by categorical data
    MultiIndex: a hierarchical index, e.g. the result of a multi-key groupby
    DatetimeIndex: a datetime index with rich date/time methods
    '''
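    # A minimal sketch (assumed illustration) of the DatetimeIndex mentioned
    # above: it enables partial-string date slicing.
    ts = pd.Series([1, 2, 3], index=pd.to_datetime(['2018-01-01', '2018-01-02', '2018-02-01']))
    ts.loc['2018-01']   # selects every January 2018 entry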
if __name__ == '__main__':
    test()
11. Merging DataFrames
import pandas as pd

ratings = './ratings.dat'
users = './users.dat'
movies = './movies.dat'

def test():
    '''
    Equivalent to a SQL join: relate different tables on a key.
    pd.merge(left, right, how='inner', on=None,
             left_on=None, right_on=None,
             left_index=False, right_index=False,
             sort=False, suffixes=('_x', '_y'),
             copy=True,
             indicator=False,
             validate=None)
    - left, right: the DataFrames (or named Series) to merge
    - how: join type, e.g. 'left', 'right', 'outer', 'inner'
    - on: the join key; it must exist in both left and right
    - left_on: the key column of the left df or Series
    - right_on: the key column of the right df or Series
    - left_index, right_index: join on the index instead of a regular column
    - suffixes: a 2-tuple of suffixes appended automatically to overlapping
      column names, default ('_x', '_y')
    '''
    df_ratings = pd.read_csv(
        ratings,
        sep='::',
        engine='python',
        names='UserID::MovieID::Rating::Timestamp'.split('::')
    )
    # users.dat carries the MovieLens user attributes, not the rating columns
    df_users = pd.read_csv(
        users,
        sep='::',
        engine='python',
        names='UserID::Gender::Age::Occupation::Zip-code'.split('::')
    )
    df_movies = pd.read_csv(
        movies,
        sep='::',
        engine='python',
        names='MovieID::Title::Genres'.split('::')
    )
    df_ratings_users = pd.merge(df_ratings, df_users, left_on='UserID', right_on='UserID', how='inner')
    df_ratings_users_movies = pd.merge(df_ratings_users, df_movies, left_on='MovieID', right_on='MovieID', how='inner')
    df_ratings_users_movies.head(10)
    # 1:1 merge
    left = pd.DataFrame(
        {'sno': [11, 12, 13, 14],
         'name': ['A', 'B', 'C', 'D']
         }
    )
    right = pd.DataFrame(
        {'sno': [11, 12, 13, 14],
         'age': ['21', '22', '23', '24']
         }
    )
    pd.merge(left, right, on='sno')
    # 1:N merge: left rows are duplicated to match
    right2 = pd.DataFrame(
        {'sno': [11, 11, 11, 12, 12, 13],
         'grade': ['语文88', '数学90', '英语100', '语文87', '数学80', '英语95']
         }
    )
    pd.merge(left, right2, on='sno')
    '''
    left join: keeps every left row (unmatched right side becomes null)
    right join: keeps every right row (unmatched left side becomes null)
    inner join (default): keeps only keys present on both sides
    outer join: keeps everything (unmatched sides become null)
    '''
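    # A minimal sketch (assumed illustration) of the join types above and of
    # the suffixes behaviour, reusing the small frames defined earlier:
    pd.merge(left, right2, on='sno', how='left')   # sno 14 appears with grade = NaN
    left2 = left.rename(columns={'name': 'value'})
    right3 = right.rename(columns={'age': 'value'})
    pd.merge(left2, right3, on='sno')              # overlapping 'value' -> value_x, value_y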
if __name__ == '__main__':
    test()
12. Merging Data with concat
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

def test():
    '''
    Use cases
    batch-merging Excel files of the same format; adding rows or columns to a dataframe
    concat syntax
    - pick a join strategy
    - pick an axis (axis=0/1)
    - combine several pandas objects (DataFrame/Series) into one
    pandas.concat(objs, axis=0, join='outer', ignore_index=False)
    - objs: a list of DataFrames and/or Series (they can be mixed)
    - axis: default 0; 0 concatenates rows, 1 concatenates columns
    - join: how indexes are aligned, default 'outer', can also be 'inner'
    - ignore_index: whether to discard the original indexes
    append syntax (removed in pandas 2.0; use pd.concat instead)
    DataFrame.append(other, ignore_index=False)
    - other: a single dataframe, series, dict, or a list of them
    see the official API docs for the remaining parameters
    '''
    df1 = pd.DataFrame(
        {
            'A': ['A0', 'A1', 'A2'],
            'B': ['B0', 'B1', 'B2'],
            'C': ['C0', 'C1', 'C2'],
            'D': ['D0', 'D1', 'D2'],
            'E': ['E0', 'E1', 'E2'],
        }
    )
    df2 = pd.DataFrame(
        {
            'A': ['A4', 'A5', 'A6'],
            'B': ['B4', 'B5', 'B6'],
            'C': ['C4', 'C5', 'C6'],
            'D': ['D4', 'D5', 'D6'],
            'F': ['F4', 'F5', 'F6'],
        }
    )
    pd.concat([df1, df2])                                    # outer join: E and F kept, filled with NaN
    pd.concat([df1, df2], ignore_index=True)
    pd.concat([df1, df2], ignore_index=True, join='inner')   # inner join: only common columns survive
    s1 = pd.Series(list(range(4)), name='F')
    pd.concat([df1, s1], axis=1)                             # add a Series as a new column
    s2 = df1.apply(lambda x: x['A'] + '_GG', axis=1)
    s2.name = 'G'
    pd.concat([df1, s1, s2], axis=1)
    pd.concat([s1, s2], axis=1)
    pd.concat([s1, df1, s2], axis=1)
    df3 = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
    df4 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))
    df3.append(df4)                       # append was removed in pandas 2.0
    df3.append(df4, ignore_index=True)
    # appending row by row is slow: every append copies the whole frame
    df = pd.DataFrame(columns=['A'])
    for i in range(5):
        df = df.append({'A': i}, ignore_index=True)
    # the efficient pattern: build the pieces first, concat once
    pd.concat(
        [pd.DataFrame([i], columns=['A']) for i in range(5)],
        ignore_index=True
    )

if __name__ == '__main__':
    test()
13. Splitting and Merging Excel Files
import pandas as pd
import os

work_dir = './excel_split_merge'
splits_dir = f'{work_dir}/splits'
if not os.path.exists(splits_dir):
    os.mkdir(splits_dir)

def test():
    '''
    1. split one large Excel file into several smaller ones
       - use df.iloc to slice one large dataframe into several small ones
       - save each with dataframe.to_excel
    2. merge the small Excel files back into one large file, tagging each row's source
       - list the files in the folder
       - read each into a df and tag it with its source
       - batch-merge the dfs with pd.concat
       - write the merged df to Excel
    '''
    df_source = pd.read_excel(f'{work_dir}/articles_source.xlsx')
    df_source.head()
    df_source.index
    df_source.shape
    row_count = df_source.shape[0]
    col_count = df_source.shape[1]
    # split the rows evenly across the users, rounding the chunk size up
    user_list = ['A', 'B', 'C', 'D', 'E', 'F']
    split_size = row_count // len(user_list)
    if row_count % len(user_list) != 0:
        split_size += 1
    df_subs = []
    for idx, user in enumerate(user_list):
        begin = idx * split_size
        end = begin + split_size
        sub = df_source.iloc[begin:end]
        df_subs.append((idx, user, sub))
    for idx, user, sub in df_subs:
        file_name = f'{splits_dir}/articles_{idx}_{user}.xlsx'
        sub.to_excel(file_name, index=False)
    # merge: list the split files, read and tag each, then concat
    excel_names = []
    for excel_name in os.listdir(splits_dir):
        excel_names.append(excel_name)
    df_list = []
    for excel_name in excel_names:
        excel_path = f'{splits_dir}/{excel_name}'
        df_split = pd.read_excel(excel_path)
        # 'articles_{idx}_{user}.xlsx' -> strip prefix and suffix, then drop '{idx}_'
        user_name = excel_name.replace('articles_', '').replace('.xlsx', '')[2:]
        df_split['username'] = user_name
        df_list.append(df_split)
    df_merged = pd.concat(df_list)
    df_merged.shape
    df_merged['username'].value_counts()
    df_merged.to_excel(f'{work_dir}/articles_merged.xlsx', index=False)

if __name__ == '__main__':
    test()