那年夏天抓住了蝉的尾巴
gitbook
前言
学习 pandas 要抓住 Series (带索引的有序字典) 和 DataFrame (共用同一行索引的多个 Series) 这两个对象, 就如同学习 numpy 要抓住 ndarray 多维数组一样
可是人的精力始终是有限的,没有过目不忘的本领,那就记住 API 以及常用参数, 其他的交给字典吧
下面学习 示例 可能会用到的 两个函数
def pretty_print(text='', star_cnt=20):
    """Print ``text`` framed by a run of asterisks on each side.

    Args:
        text: Label shown between the two star runs.
        star_cnt: Number of ``*`` characters printed on each side.
    """
    stars = '*' * star_cnt
    print('{stars} {text} {stars}'.format(text=text, stars=stars))
def generate_df(rows, cols):
    """Build a ``rows`` x ``cols`` DataFrame holding 0 .. rows*cols-1.

    Values are laid out row by row. Columns are labelled ``col_<i>`` and
    index entries ``row_<j>``.
    """
    data = np.arange(rows * cols).reshape((rows, cols))
    columns = ['col_' + str(i) for i in range(cols)]
    indices = ['row_' + str(j) for j in range(rows)]
    return DataFrame(data, columns=columns, index=indices)


# Example call; the returned frame is not kept.
generate_df(3, 4)
修改 dataframe 中数据
from pandas import DataFrame
import numpy as np
import pandas as pd
import logging
logging.basicConfig(level=logging.DEBUG)
def pretty_print(text='', star_cnt=20):
    """Print ``text`` with ``star_cnt`` asterisks directly before and after it.

    Args:
        text: Label shown between the two star runs (no padding is added).
        star_cnt: Number of ``*`` characters printed on each side.
    """
    stars = '*' * star_cnt
    print('{stars}{text}{stars}'.format(text=text, stars=stars))
# Four trading days of OHLC quotes, one list per price column.
data = {
    'open': [8.08, 7.93, 7.97, 8.00],
    'close': [7.93, 8.05, 7.97, 8.05],
    'high': [8.10, 8.12, 8.00, 8.09],
    'low': [7.88, 7.92, 7.91, 8.00],
}
trading_days = pd.date_range('02/01/2016', '02/04/2016')
df = DataFrame(data, index=trading_days)
print(df.head(10))
pretty_print('华丽的分隔符')
# Keep only the cells equal to 7.93 and overwrite every other cell with 0.
is_target = df.isin([7.93])
df[~is_target] = 0
print(df)
"""
open close high low
2016-02-01 8.08 7.93 8.10 7.88
2016-02-02 7.93 8.05 8.12 7.92
2016-02-03 7.97 7.97 8.00 7.91
2016-02-04 8.00 8.05 8.09 8.00
********************华丽的分隔符********************
open close high low
2016-02-01 0.00 7.93 0.0 0.0
2016-02-02 7.93 0.00 0.0 0.0
2016-02-03 0.00 0.00 0.0 0.0
2016-02-04 0.00 0.00 0.0 0.0
"""
"""
df[col] 返回 Series, df[[col1, col2]] 或布尔/切片选择返回 DataFrame; df.loc , df.iloc 根据行列选择器的不同可返回标量、Series 或 DataFrame (df.ix 已在 pandas 1.0 中移除)
"""
apply + map == applymap
from pandas import DataFrame
import numpy as np
import pandas as pd
import logging
logging.basicConfig(level=logging.DEBUG)
from functools import reduce
def pretty_print(text='', star_cnt=20):
    """Print ``text`` with ``star_cnt`` asterisks directly before and after it.

    Args:
        text: Label shown between the two star runs (no padding is added).
        star_cnt: Number of ``*`` characters printed on each side.
    """
    stars = '*' * star_cnt
    print('{stars}{text}{stars}'.format(text=text, stars=stars))
data = {'open': [8.08, 7.93, 7.97, 8.00],
        'close': [7.93, 8.05, 7.97, 8.05],
        'high': [8.10, 8.12, 8.00, 8.09],
        'low': [7.88, 7.92, 7.91, 8.00]}
df = DataFrame(data, index=pd.date_range('02/01/2016', '02/04/2016'))
print(df.head(10))
pretty_print('华丽的分隔线')
# Add a row-total column in two equivalent ways. Both totals are taken over
# the original price columns only: summing the whole frame for the second one
# would also include the freshly added 'new' column and double the result.
price_cols = ['open', 'close', 'high', 'low']
df['new'] = df[price_cols].apply(lambda row: reduce(lambda x, y: x + y, row), axis=1)
df['new_2'] = df[price_cols].apply(sum, axis=1)
print(df.head(10))
"""
open close high low
2016-02-01 8.08 7.93 8.10 7.88
2016-02-02 7.93 8.05 8.12 7.92
2016-02-03 7.97 7.97 8.00 7.91
2016-02-04 8.00 8.05 8.09 8.00
********************华丽的分隔线********************
open close high low new new_2
2016-02-01 8.08 7.93 8.10 7.88 31.99 31.99
2016-02-02 7.93 8.05 8.12 7.92 32.02 32.02
2016-02-03 7.97 7.97 8.00 7.91 31.85 31.85
2016-02-04 8.00 8.05 8.09 8.00 32.14 32.14
"""
分组后 分组col 会被作为 key 索引
from pandas import Series,DataFrame
# One record per (student, course): name, sex, course, score.
a = [
    ['Li', '男', 'PE', 98.],
    ['Li', '男', 'MATH', 60.],
    ['liu', '男', 'MATH', 60.],
    ['yu', '男', 'PE', 100.],
]
af = DataFrame(a, columns=['name', 'sex', 'course', 'score'])
divider = '*' * 50
print(af.head(10))
print(divider)
# Grouping by two columns turns both of them into levels of the result index.
score_totals = af.groupby(['name', 'course'])['score'].sum()
print(score_totals)
print(divider)
# Selecting on the outer level leaves a Series indexed by course.
print(score_totals['Li'])
"""
name sex course score
0 Li 男 PE 98.0
1 Li 男 MATH 60.0
2 liu 男 MATH 60.0
3 yu 男 PE 100.0
**************************************************
name course
Li MATH 60.0
PE 98.0
liu MATH 60.0
yu PE 100.0
Name: score, dtype: float64
**************************************************
course
MATH 60.0
PE 98.0
Name: score, dtype: float64
"""
骚操作
# -*- coding: utf-8 -*-
__author__ = 'Frank Li'
from pandas import Series,DataFrame
import pandas as pd
import pdb
addr = pd.Series([
    'Washington, D.C. 20003',
    'Brooklyn, NY 11211-1755',
    'Omaha, NE 68154',
    'Pittsburgh, PA 15211',
])
# Series exposes three accessors: str, cat and dt.
addr.str.upper()  # upper-cases every element; the result is not kept here
print(addr.str.count(r'\d'))  # number of digits in each element
# Each named group becomes a column of the frame returned by extract().
# The <name> part is mandatory: "(?P[" is rejected by the re module.
regex = (r'(?P<city>[A-Za-z ]+), '      # one or more letters or spaces
         r'(?P<state>[A-Z]{2}) '        # two upper-case letters
         r'(?P<zip>\d{5}(?:-\d{4})?)')  # five digits plus optional 4-digit extension
# regex=False removes literal dots; as a pattern '.' would mean "any character".
print(addr.str.replace('.', '', regex=False).str.extract(regex))
print([i for i in dir(pd.Series.str) if not i.startswith('_')])
"""
0 5
1 9
2 5
3 5
dtype: int64
city state zip
0 Washington DC 20003
1 Brooklyn NY 11211-1755
2 Omaha NE 68154
3 Pittsburgh PA 15211
['capitalize', 'cat', 'center', 'contains', 'count', 'decode', 'encode', 'endswith', 'extract', 'extractall', 'find', 'findall', 'get', 'get_dummies', 'index', 'isalnum', 'isalpha', 'isdecimal', 'isdigit', 'islower', 'isnumeric', 'isspace', 'istitle', 'isupper', 'join', 'len', 'ljust', 'lower', 'lstrip', 'match', 'normalize', 'pad', 'partition', 'repeat', 'replace', 'rfind', 'rindex', 'rjust', 'rpartition', 'rsplit', 'rstrip', 'slice', 'slice_replace', 'split', 'startswith', 'strip', 'swapcase', 'title', 'translate', 'upper', 'wrap', 'zfill']
"""
# Nine consecutive quarter-end timestamps starting in 2017.
quarter_ends = pd.date_range('2017', periods=9, freq='Q')
daterng = pd.Series(quarter_ends)
print(daterng)
print(daterng.dt.day_name())
# Dates in the second half of the year (third and fourth quarter).
in_second_half = daterng.dt.quarter > 2
print(daterng[in_second_half])
# Dates that are the last day of their year.
closes_year = daterng.dt.is_year_end
print(daterng[closes_year])
"""
Series.dt.day_name():从日期判断出是星期几;
Series.dt.quarter:从日期判断所处季度;
Series.dt.is_year_end:从日期判断是否为一年的最后一天
"""
colors = pd.Series([
    'periwinkle', 'mint green', 'burnt orange', 'periwinkle', 'burnt orange',
    'rose', 'rose', 'mint green', 'rose', 'navy',
])
import sys
# Size of each individual Python string object.
print(colors.apply(sys.getsizeof))
# Encode every distinct colour as an integer, numbered by first appearance.
distinct = colors.unique()
mapper = dict(zip(distinct, range(len(distinct))))
as_int = colors.map(mapper)
print(as_int)
print(as_int.apply(sys.getsizeof))
# Compare the memory footprint of the plain Series with its categorical form.
primary_usage = colors.memory_usage(index=False, deep=True)
as_category = colors.astype('category')
category_usage = as_category.memory_usage(index=False, deep=True)
print('primary: {}\ncategory_usage: {}'.format(primary_usage, category_usage))
"""
primary: 370
category_usage: 291
这样看起来似乎并没有什么很大区别
但是我们可以 repeat 多次试试
"""
# Repeat every element ten times so duplicates dominate the data.
manycolors = colors.repeat(10)
rows_per_value = len(manycolors) / manycolors.nunique()
print(rows_per_value)
print(manycolors.memory_usage(index=False, deep=True))
manycolors_cat = manycolors.astype('category')
print(manycolors_cat.memory_usage(index=False, deep=True))
数据清洗与准备
# 处理缺失数据
API
dropna
fillna
isnull
notnull
# 过滤缺失值
data.dropna() 等价于 data[data.notnull()]
对于 DataFrame 还有 axis = 0 or 1 , how = 'all' 等可选参数: how='all' 表示仅当某行 (或列) 全为 NaN 时才删除; thresh=2 表示至少含有 2 个非 NA 值的行 (或列) 才会被保留
df.dropna(axis=1,how='all')
df.fillna({1: 0.5, 2: 0}, inplace=True) 为 1 列 2 列 分别填充不同的默认值 (value 与 method 不能同时指定; 前向填充需单独写成 df.fillna(method='ffill', limit=2))
fillna 参数 有:
value 标量值或字典型对象用于填充缺失值
method 插值方法, 如果没有其他参数, 默认是 ‘ffill’
axis 需要填充的轴, 默认 axis = 0
inplace
limit 用于前向或后向填充时最大的填充范围
### 查询与删除重复值
data.duplicated()
data.drop_duplicates(['col1','col2'], keep='last' or 'first')
### 使用函数或映射进行数据转换
data = {
    'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef',
             'Bacon', 'pastrami', 'honey ham', 'nova lox'],
    'ounces': [4, 5, 12, 6, 7.5, 8, 3, 5, 6],
}
df = DataFrame(data)
print(df.head(10))
print('*' * 50)
# Lookup table from (lower-case) food name to the animal it comes from.
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}
# Lower-case each food first so that 'Bacon' and 'Pastrami' still match.
df['meat_to_animall'] = [
    meat_to_animal.get(food.lower(), 'unknown') for food in df['food']
]
print(df.head(10))
### 替代值
data.replace(-999, np.nan)
### 重命名轴索引
data.index.map(lambda x: x[:4].upper())
data.rename(index=str.title, columns=str.upper)
data.rename(index={'old_idx':'new_idx'}, columns={'old_col':'new_col'}, inplace=True)
### 离散化和分箱 cut , qcut
pd.cut(ages, bins)
from pandas import Series,DataFrame
import pandas as pd
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 35, 50, 70, 90, 120]
# Default bins are closed on the right: (18, 35], (35, 50], ...
cats = pd.cut(ages, bins)
print(cats.codes)
df = DataFrame({'ages': ages})
# right=False closes the bins on the left instead: [18, 35), [35, 50), ...
left_closed = pd.cut(ages, bins, right=False)
df['ages_dicretes'] = left_closed.codes
print(df.head(10))
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
data = DataFrame(np.random.randn(1000, 4))
# Cap every value whose magnitude exceeds 3 at -3 or +3, keeping its sign.
too_large = np.abs(data) > 3
data[too_large] = np.sign(data) * 3
print(data.head(10))
# 置换和随机抽样
numpy.random.permutation
df.sample , series.sample
df = DataFrame(np.arange(20).reshape((5, 4)))
print(df)
print('*' * 50)
# A random ordering of the five row positions, used to reorder the rows.
sampler = np.random.permutation(5)
shuffled = df.take(sampler)
print(shuffled)
# 组合使用
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
def pretty_print(text='', star_cnt=20):
    """Print ``text`` framed by a run of asterisks on each side.

    Args:
        text: Label shown between the two star runs.
        star_cnt: Number of ``*`` characters printed on each side.
    """
    stars = '*' * star_cnt
    print('{stars} {text} {stars}'.format(text=text, stars=stars))
# Fixed seed so the ten uniform samples are reproducible.
np.random.seed(12345)
values = np.random.rand(10)
print(values)
pretty_print('离散分箱')
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
# Bin the samples, then expand the bins into one indicator column each.
binned = pd.cut(values, bins)
df = pd.get_dummies(binned)
print(df.head(10))
"""
[0.92961609 0.31637555 0.18391881 0.20456028 0.56772503 0.5955447
0.96451452 0.6531771 0.74890664 0.65356987]
******************** 离散分箱 ********************
(0.0, 0.2] (0.2, 0.4] (0.4, 0.6] (0.6, 0.8] (0.8, 1.0]
0 0 0 0 0 1
1 0 1 0 0 0
2 1 0 0 0 0
3 0 1 0 0 0
4 0 0 1 0 0
5 0 0 1 0 0
6 0 0 0 0 1
7 0 0 0 1 0
8 0 0 0 1 0
9 0 0 0 1 0
"""
### 向量化 字符串函数
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re
def pretty_print(text='', star_cnt=20):
    """Print ``text`` framed by a run of asterisks on each side.

    Args:
        text: Label shown between the two star runs.
        star_cnt: Number of ``*`` characters printed on each side.
    """
    stars = '*' * star_cnt
    print('{stars} {text} {stars}'.format(text=text, stars=stars))
data = {'col1': ['001100110111'],
        'col2': ['001100110111'],
        'col3': ['001100110111']}
df = DataFrame(data)
print(df.head(10))
pretty_print('华丽的分割线')
# Split each 12-digit string into four 3-digit groups. Every named group
# becomes a column of the extracted frame; the <name> part is mandatory
# because "(?P\d{3})" is rejected by the re module.
pattern = r'(?P<nums_1>\d{3})(?P<nums_2>\d{3})(?P<nums_3>\d{3})(?P<nums_4>\d{3})'
df2 = df.apply(lambda s: s.str.extract(pattern), axis=1)
print(df2.values)
"""
col1 col2 col3
0 001100110111 001100110111 001100110111
******************** 华丽的分割线 ********************
[ nums_1 nums_2 nums_3 nums_4
col1 001 100 110 111
col2 001 100 110 111
col3 001 100 110 111]
"""
# 部分向量化字符串方法列表
cat 根据可选的分隔符按元素粘合字符串
contains 返回是否含有某个模式 / 正则表达式的 布尔值数组
count 模式出现次数的计数
extract 使用正则表达式从字符串Series 中分组抽取一个 或多个字符串, 返回的结果是 每个分组形成一列的 DataFrame
endswith 等价于对每个元素使用 x.endswith
startswith 等价于 对每个元素使用 x.startswith
findall 找出字符串中所有的 模式 / 正则表达式 匹配项 ,以列表返回
get 对每个元素进行索引 (获得第 i 个元素)
isalnum
isalpha
isdecimal
isdigit
islower
isnumeric
isupper
join
len
lower, upper
match
pad
center
repeat
replace
slice
split
strip
rstrip
lstrip
第八章 数据规整 , 连接,联合 重塑
# 分层索引 部分索引
分层索引允许你在一个轴向上拥有多个 (两个或两个以上) 索引层级, 笼统地说 分层索引提供了一种在更低维度的形式中处理更高维度 数据的方法。
# -*- coding: utf-8 -*-
__author__ = 'Frank Li'
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re
def pretty_print(text='', star_cnt=20):
    """Print ``text`` framed by a run of asterisks on each side.

    Args:
        text: Label shown between the two star runs.
        star_cnt: Number of ``*`` characters printed on each side.
    """
    stars = '*' * star_cnt
    print('{stars} {text} {stars}'.format(text=text, stars=stars))
# Two parallel label lists give the Series a two-level index.
outer_labels = ['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd']
inner_labels = [1, 2, 3, 1, 3, 1, 2, 2, 3]
data = pd.Series(np.random.randn(9), index=[outer_labels, inner_labels])
print(data.head(10))
pretty_print('华丽的分割线')
# Partial indexing: select everything under outer label 'b'.
print(data['b'])
pretty_print('华丽的分割线')
# unstack() moves the inner level into columns; stack() reverses that.
wide = data.unstack()
print(wide)
pretty_print('华丽的分割线')
print(wide.stack())
# 重排序 和 层级排序
swaplevel sort_index
# 按 层级进行汇总统计
# 使用 DataFrame 的列 进行索引
set_index() 将一个或多个列转换为 (层级) 索引
reset_index() 反操作 分层索引的层级会被移动到 列中
# 联合与合并数据集
类似于 SQL 表关联操作 merge
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re
def pretty_print(text='', star_cnt=20):
    """Print ``text`` framed by a run of asterisks on each side.

    Args:
        text: Label shown between the two star runs.
        star_cnt: Number of ``*`` characters printed on each side.
    """
    stars = '*' * star_cnt
    print('{stars} {text} {stars}'.format(text=text, stars=stars))
df1 = pd.DataFrame({
    'key1': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
    'data1': range(7),
})
df2 = pd.DataFrame({
    'key1': ['a', 'b', 'd'],
    'data2': range(3),
})
# how may be 'inner', 'left', 'right' or 'outer'. To relate on several
# columns pass a list, e.g. on=['key1', 'key2']. suffixes renames columns
# that share a name on both sides. To relate on the indexes instead, pass
# left_index=True, right_index=True.
df3 = pd.merge(
    df1,
    df2,
    left_on='key1',
    right_on='key1',
    how='inner',
    suffixes=('_left', '_right'),
)
print(df3.head(10))
# join() combines the two frames by index rather than by column values.
df4 = df1.join(df2, how='inner', lsuffix='_left', rsuffix='_right')
pretty_print('华丽的分割线')
print(df4.head(10))
left.join([right1, right2,right3], how='outer') how 默认 left
concat 类似于 union all 其实又不仅仅是
arr = np.arange(12).reshape((3, 4))
print(arr)
pretty_print('华丽的分割线')
# Glue the array to itself: axis=1 widens it, axis=0 lengthens it.
pair = [arr, arr]
result = np.concatenate(pair, axis=1)
result_2 = np.concatenate(pair, axis=0)
print(result)
pretty_print('华丽的分割线')
print(result_2)
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re
def pretty_print(text='', star_cnt=20):
    """Print ``text`` framed by a run of asterisks on each side.

    Args:
        text: Label shown between the two star runs.
        star_cnt: Number of ``*`` characters printed on each side.
    """
    stars = '*' * star_cnt
    print('{stars} {text} {stars}'.format(text=text, stars=stars))
def generate_df(rows, cols):
    """Build a ``rows`` x ``cols`` DataFrame holding 0 .. rows*cols-1.

    Values are laid out row by row. Columns are labelled ``col_<i>`` and
    index entries ``row_<j>``.
    """
    data = np.arange(rows * cols).reshape((rows, cols))
    columns = ['col_' + str(i) for i in range(cols)]
    indices = ['row_' + str(j) for j in range(rows)]
    return DataFrame(data, columns=columns, index=indices)
arr = generate_df(3, 4)
# Three Series whose index labels do not overlap.
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])
pieces = [s1, s2, s3]
# axis=0 glues the pieces end to end into one longer Series.
print(pd.concat(pieces, axis=0, sort=False))
pretty_print('华丽的分割线')
# axis=1 places each piece in its own column of a DataFrame.
print(pd.concat(pieces, axis=1, sort=False))
"""
a 0
b 1
c 2
d 3
e 4
f 5
g 6
dtype: int64
******************** 华丽的分割线 ********************
0 1 2
a 0.0 NaN NaN
b 1.0 NaN NaN
c NaN 2.0 NaN
d NaN 3.0 NaN
e NaN 4.0 NaN
f NaN NaN 5.0
g NaN NaN 6.0
"""
s4 = pd.concat([s1, s3])
# join accepts 'outer' or 'inner'; 'inner' is used here.
shared = pd.concat([s1, s4], axis=1, join='inner', sort=False)
print(shared)
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re
def pretty_print(text='', star_cnt=20):
    """Print ``text`` framed by a run of asterisks on each side.

    Args:
        text: Label shown between the two star runs.
        star_cnt: Number of ``*`` characters printed on each side.
    """
    stars = '*' * star_cnt
    print('{stars} {text} {stars}'.format(text=text, stars=stars))
def generate_df(rows, cols):
    """Build a ``rows`` x ``cols`` DataFrame holding 0 .. rows*cols-1.

    Values are laid out row by row. Columns are labelled ``col_<i>`` and
    index entries ``row_<j>``.
    """
    data = np.arange(rows * cols).reshape((rows, cols))
    columns = ['col_' + str(i) for i in range(cols)]
    indices = ['row_' + str(j) for j in range(rows)]
    return DataFrame(data, columns=columns, index=indices)
arr = generate_df(3, 4)
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])
# keys adds an outer index level that records which input each value came from.
s5 = pd.concat([s1, s2, s3], axis=0, keys=['one', 'two', 'three'], sort=False)
print(s5)
pretty_print('华丽的分割线')
# The recorded output below this snippet shows the unstacked table after the
# divider, so print it: the inner labels become columns, one row per key.
print(s5.unstack())
"""
one a 0
b 1
two c 2
d 3
e 4
three f 5
g 6
dtype: int64
******************** 华丽的分割线 ********************
a b c d e f g
one 0.0 1.0 NaN NaN NaN NaN NaN
two NaN NaN 2.0 3.0 4.0 NaN NaN
three NaN NaN NaN NaN NaN 5.0 6.0
"""
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re
def pretty_print(text='', star_cnt=20):
    """Print ``text`` framed by a run of asterisks on each side.

    Args:
        text: Label shown between the two star runs.
        star_cnt: Number of ``*`` characters printed on each side.
    """
    stars = '*' * star_cnt
    print('{stars} {text} {stars}'.format(text=text, stars=stars))
def generate_df(rows, cols):
    """Build a ``rows`` x ``cols`` DataFrame holding 0 .. rows*cols-1.

    Values are laid out row by row. Columns are labelled ``col_<i>`` and
    index entries ``row_<j>``.
    """
    data = np.arange(rows * cols).reshape((rows, cols))
    columns = ['col_' + str(i) for i in range(cols)]
    indices = ['row_' + str(j) for j in range(rows)]
    return DataFrame(data, columns=columns, index=indices)
df1 = generate_df(3, 2)
# Second frame: one row fewer, every value shifted up by 5.
df2 = generate_df(2, 2).applymap(lambda x: x + 5)
print(df1)
pretty_print('华丽的分割线')
print(df2)
pretty_print('华丽的分割线')
# Side-by-side concat; keys labels each input as an outer column level.
side_by_side = pd.concat([df1, df2], axis=1, sort=False, keys=('lvl1', 'lvl2'))
print(side_by_side)
"""
col_0 col_1
row_0 0 1
row_1 2 3
row_2 4 5
******************** 华丽的分割线 ********************
col_0 col_1
row_0 5 6
row_1 7 8
******************** 华丽的分割线 ********************
lvl1 lvl2
col_0 col_1 col_0 col_1
row_0 0 1 5.0 6.0
row_1 2 3 7.0 8.0
row_2 4 5 NaN NaN
"""
联合重叠数据
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re
def pretty_print(text='', star_cnt=20):
    """Print ``text`` framed by a run of asterisks on each side.

    Args:
        text: Label shown between the two star runs.
        star_cnt: Number of ``*`` characters printed on each side.
    """
    stars = '*' * star_cnt
    print('{stars} {text} {stars}'.format(text=text, stars=stars))
def generate_df(rows, cols):
    """Build a ``rows`` x ``cols`` DataFrame holding 0 .. rows*cols-1.

    Values are laid out row by row. Columns are labelled ``col_<i>`` and
    index entries ``row_<j>``.
    """
    data = np.arange(rows * cols).reshape((rows, cols))
    columns = ['col_' + str(i) for i in range(cols)]
    indices = ['row_' + str(j) for j in range(rows)]
    return DataFrame(data, columns=columns, index=indices)
# Both Series share the same labels; only `a` has missing values.
labels = ['f', 'e', 'd', 'c', 'b', 'a']
a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan], index=labels)
b = pd.Series(np.arange(len(a), dtype=np.float64), index=labels)
print(a)
pretty_print('华丽的分割线')
print(b)
pretty_print('华丽的分割线')
# Take the value from b wherever a is missing, otherwise keep a's value.
a_is_missing = pd.isnull(a)
print(np.where(a_is_missing, b, a))
"""
f NaN
e 2.5
d NaN
c 3.5
b 4.5
a NaN
dtype: float64
******************** 华丽的分割线 ********************
f 0.0
e 1.0
d 2.0
c 3.0
b 4.0
a 5.0
dtype: float64
******************** 华丽的分割线 ********************
[0. 2.5 2. 3.5 4.5 5. ]
"""
# combine_first keeps the caller's values and fills its missing entries from
# the argument. Calling it on `a` patches a's NaNs from b; calling it on `b`
# would just return b unchanged, because b has no missing values.
print(a.combine_first(b))
8.3 重中之重 重塑或透视
stack
unstack
往往结合 分层索引来做
pivot <==> 等价于 set_index 创建分层索引, 然后调用 unstack 拆堆
pivot 反过来操作 就是 pd.melt
import pandas as pd
import numpy as np
"""
对比 某一行 第二三列的 差值 == 下一行 第一列的值,找出这样的行
"""
data = np.zeros((20, 3))
df = pd.DataFrame(
    data,
    columns=['col_' + str(i) for i in range(3)],
    index=['row_' + str(i) for i in range(20)],
)
# Every column except the first is set to 1.
df.iloc[:, 1:] = 1
print(df.head(10))
print('{stars} {text} {stars}'.format(stars='*'*20, text='华丽的分割线'))


def func(row):
    """Return the absolute difference between the row's 2nd and 3rd values."""
    # .iloc makes the positional access explicit; integer keys on a
    # label-indexed Series otherwise rely on a deprecated positional fallback.
    return abs(row.iloc[1] - row.iloc[2])


df['col_3'] = df.apply(func, axis=1)
# shift(-1) lines each row up with the NEXT row's col_0, which is the stated
# comparison; shift(1) would compare against the previous row instead. The
# last row has no successor, so it compares against NaN and is dropped.
df['col_4'] = df['col_3'] == df['col_0'].shift(-1)
df = df[df['col_4']]
print(df.head(20))
判断类似 gene_1|gene_2 ==> gene_1 = 1 , gene_2 = 1 gene_3 = 0....
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re
def pretty_print(text='', star_cnt=20):
    """Print ``text`` framed by a run of asterisks on each side.

    Args:
        text: Label shown between the two star runs.
        star_cnt: Number of ``*`` characters printed on each side.
    """
    stars = '*' * star_cnt
    print('{stars} {text} {stars}'.format(text=text, stars=stars))
def generate_df(rows, cols):
    """Build a ``rows`` x ``cols`` DataFrame holding 0 .. rows*cols-1.

    Values are laid out row by row. Columns are labelled ``col_<i>`` and
    index entries ``row_<j>``.
    """
    data = np.arange(rows * cols).reshape((rows, cols))
    columns = ['col_' + str(i) for i in range(cols)]
    indices = ['row_' + str(j) for j in range(rows)]
    return DataFrame(data, columns=columns, index=indices)
df = generate_df(3, 6)
print(df.head(10))
# 'col_1' holds a '|'-separated list naming the columns to flag for that row.
df['col_1'] = 'col_2|col_4'
# DataFrame.ix was removed in pandas 1.0. On this string-labelled index,
# ix[1:2] sliced by position, so the positional equivalent is used here.
df.iloc[1:2, df.columns.get_loc('col_1')] = 'col_3|col_5'
# Flag columns: everything except the id column and the list column itself.
cols = [col for col in df.columns if col not in ('col_0', 'col_1')]


def do_apply(row):
    """Return 1/0 flags telling which of ``cols`` are named in ``row['col_1']``.

    The flags come back as a Series indexed by ``cols``. The frame is no
    longer modified from inside ``apply``, and columns that are not named
    get an explicit 0 instead of keeping their previous value.
    """
    named = set(row['col_1'].split('|'))
    return Series([int(col in named) for col in cols], index=cols)


df[cols] = df.apply(do_apply, axis=1)
print('df shape: {}'.format(df.shape))
print('df 前十行: ',df.head(10))
SQLpd