pandas 常规操作大全

那年夏天抓住了蝉的尾巴
gitbook

前言

pandas 抓住 Series (排序的字典), DataFrame (row + 多个 Series) 对象 , 就如同 numpy 里抓住 ndarray  多维数组一样
可是人的精力始终是有限的,没有过目不忘的本领,那就记住 API 以及常用参数, 其他的交给字典吧

下面学习 示例 可能会用到的 两个函数

def pretty_print(text='', star_cnt=20):
    """Print ``text`` framed by ``star_cnt`` asterisks on each side.

    The banner, the text and the closing banner are separated by single
    spaces.
    """
    banner = '*' * star_cnt
    # print() joins its arguments with one space, giving "<banner> <text> <banner>".
    print(banner, text, banner)

def generate_df(rows, cols):
    """Build a ``rows`` x ``cols`` DataFrame filled with 0 .. rows*cols-1.

    Values are laid out row by row; rows are labelled ``row_0``, ``row_1``, ...
    and columns ``col_0``, ``col_1``, ...
    """
    row_labels = ['row_{}'.format(r) for r in range(rows)]
    col_labels = ['col_{}'.format(c) for c in range(cols)]
    grid = np.arange(rows * cols).reshape(rows, cols)
    return DataFrame(grid, index=row_labels, columns=col_labels)

generate_df(3,4)

修改 dataframe 中数据

from pandas import DataFrame
import numpy as np
import pandas as pd
import logging
logging.basicConfig(level=logging.DEBUG)

def pretty_print(text='', star_cnt=20):
    """Print ``text`` wrapped directly (no spaces) in ``star_cnt`` asterisks."""
    border = '*' * star_cnt
    # sep='' keeps the border glued to the text, as in "***text***".
    print(border, text, border, sep='')

# Four days of sample price data, one list per column.
data = {'open':[8.08, 7.93, 7.97, 8.00],
        'close':[7.93,8.05,7.97,8.05],
        'high':[8.10,8.12,8.00,8.09],
        'low':[7.88,7.92,7.91,8.00]}

# Index the rows by calendar day, 2016-02-01 .. 2016-02-04.
df = DataFrame(data,index=pd.date_range('02/01/2016','02/04/2016'))
print(df.head(10))
pretty_print('华丽的分隔符')
# isin() yields a boolean frame; ~ inverts it so the assignment hits every
# cell whose value is NOT 7.93.
df[~df.isin([7.93])] = 0 # set every cell of df that is not 7.93 to 0
print(df)
"""
            open  close  high   low
2016-02-01  8.08   7.93  8.10  7.88
2016-02-02  7.93   8.05  8.12  7.92
2016-02-03  7.97   7.97  8.00  7.91
2016-02-04  8.00   8.05  8.09  8.00
********************华丽的分隔符********************
            open  close  high  low
2016-02-01  0.00   7.93   0.0  0.0
2016-02-02  7.93   0.00   0.0  0.0
2016-02-03  0.00   0.00   0.0  0.0
2016-02-04  0.00   0.00   0.0  0.0
"""

"""
df[布尔掩码] 或 df[[列名列表]] 返回的是 DataFrame , df['列名'] 返回的是 Series ; df.loc , df.iloc 取单行或单列时返回 Series , 取多行多列时返回 DataFrame (df.ix 已在 pandas 1.0 中移除)
"""

apply + map == applymap

from pandas import DataFrame
import numpy as np
import pandas as pd
import logging
logging.basicConfig(level=logging.DEBUG)
from functools import reduce

def pretty_print(text='', star_cnt=20):
    """Print ``text`` with ``star_cnt`` asterisks attached before and after it."""
    edge = '*' * star_cnt
    line = '{0}{1}{0}'.format(edge, text)
    print(line)

# Four days of sample price data, one list per column.
data = {'open':[8.08, 7.93, 7.97, 8.00],
        'close':[7.93,8.05,7.97,8.05],
        'high':[8.10,8.12,8.00,8.09],
        'low':[7.88,7.92,7.91,8.00]}

df = DataFrame(data,index=pd.date_range('02/01/2016','02/04/2016'))
print(df.head(10))
pretty_print('华丽的分隔线')
# Add a row-total column in two equivalent ways.
# Both totals are taken over the original price columns only.  Summing the
# whole frame for the second column would also pick up the freshly added
# 'new' column and double the result.
price_cols = list(df.columns)
df['new'] = df[price_cols].apply(lambda row: reduce(lambda x, y: x + y, row), axis=1)
df['new_2'] = df[price_cols].apply(sum, axis=1)
print(df.head(10))
"""
            open  close  high   low
2016-02-01  8.08   7.93  8.10  7.88
2016-02-02  7.93   8.05  8.12  7.92
2016-02-03  7.97   7.97  8.00  7.91
2016-02-04  8.00   8.05  8.09  8.00
********************华丽的分隔线********************
            open  close  high   low    new  new_2
2016-02-01  8.08   7.93  8.10  7.88  31.99  31.99
2016-02-02  7.93   8.05  8.12  7.92  32.02  32.02
2016-02-03  7.97   7.97  8.00  7.91  31.85  31.85
2016-02-04  8.00   8.05  8.09  8.00  32.14  32.14
"""

分组后 分组col 会被作为 key 索引

from pandas import Series,DataFrame
# One record per (name, sex, course, score).
a = [
    ['Li', '男', 'PE', 98.],
    ['Li', '男', 'MATH', 60.],
    ['liu', '男', 'MATH', 60.],
    ['yu', '男', 'PE', 100.],
]

af = DataFrame(a, columns=['name', 'sex', 'course', 'score'])
separator = '*' * 50
# Grouping by two columns turns them into a two-level index on the result.
score_totals = af.groupby(['name', 'course'])['score'].sum()

print(af.head(10))
print(separator)
print(score_totals)
print(separator)
# Selecting the outer level 'Li' leaves a Series indexed by course.
print(score_totals['Li'])
"""
name sex course  score
0   Li   男     PE   98.0
1   Li   男   MATH   60.0
2  liu   男   MATH   60.0
3   yu   男     PE  100.0
**************************************************
name  course
Li    MATH       60.0
      PE         98.0
liu   MATH       60.0
yu    PE        100.0
Name: score, dtype: float64
**************************************************
course
MATH    60.0
PE      98.0
Name: score, dtype: float64
"""

骚操作

# -*- coding: utf-8 -*-
__author__ = 'Frank Li'
from pandas import Series,DataFrame
import pandas as pd
import pdb
addr = pd.Series([
 'Washington, D.C. 20003',
 'Brooklyn, NY 11211-1755',
 'Omaha, NE 68154',
 'Pittsburgh, PA 15211' ])
# Series._accessors holds three accessor objects: str, cat and dt.
addr.str.upper() # upper-case every string (returns a new Series, discarded here)
print(addr.str.count(r'\d')) # number of digits in each cell

# Each named group becomes a column of the extracted DataFrame.
regex = (r'(?P<city>[A-Za-z ]+), '      # one or more letters / spaces
         r'(?P<state>[A-Z]{2}) '        # two upper-case letters
         r'(?P<zip>\d{5}(?:-\d{4})?)')  # five digits plus optional 4-digit extension
# regex=False: remove literal dots ("D.C." -> "DC"); as a regular expression
# '.' would match any character.
print(addr.str.replace('.', '', regex=False).str.extract(regex))

print([i for i in dir(pd.Series.str) if not i.startswith('_')])
"""
0    5
1    9
2    5
3    5
dtype: int64
         city state         zip
0  Washington    DC       20003
1    Brooklyn    NY  11211-1755
2       Omaha    NE       68154
3  Pittsburgh    PA       15211
['capitalize', 'cat', 'center', 'contains', 'count', 'decode', 'encode', 'endswith', 'extract', 'extractall', 'find', 'findall', 'get', 'get_dummies', 'index', 'isalnum', 'isalpha', 'isdecimal', 'isdigit', 'islower', 'isnumeric', 'isspace', 'istitle', 'isupper', 'join', 'len', 'ljust', 'lower', 'lstrip', 'match', 'normalize', 'pad', 'partition', 'repeat', 'replace', 'rfind', 'rindex', 'rjust', 'rpartition', 'rsplit', 'rstrip', 'slice', 'slice_replace', 'split', 'startswith', 'strip', 'swapcase', 'title', 'translate', 'upper', 'wrap', 'zfill']
"""

# Nine consecutive quarterly timestamps starting in 2017.
daterng = pd.Series(pd.date_range('2017', periods=9, freq='Q'))
print(daterng)
# Weekday name of each date.
print(daterng.dt.day_name())
# Keep only dates in the second half of the year (quarter 3 or 4).
print(daterng[daterng.dt.quarter > 2])
# Keep only dates that fall on the last day of a year.
print(daterng[daterng.dt.is_year_end])
"""
Series.dt.day_name():从日期判断出所处星期数;

Series.dt.quarter:从日期判断所处季度 (1-4);

Series.dt.is_year_end:从日期判断是否处在年底
"""

# Ten colour names drawn from only five distinct values.
colors = pd.Series([
'periwinkle',
'mint green',
'burnt orange',
'periwinkle',
'burnt orange',
'rose',
'rose',
'mint green',
'rose',
'navy'])
import sys
# Size of each individual Python string object.
print(colors.apply(sys.getsizeof))
# Map every distinct colour to an integer code, numbered in order of first appearance.
mapper = {v: k for k, v in enumerate(colors.unique())}
as_int = colors.map(mapper)
print(as_int)
print(as_int.apply(sys.getsizeof))
# Saving memory: compare the plain Series with its categorical version.
primary_usage = colors.memory_usage(index=False, deep=True)
category_usage = colors.astype('category').memory_usage(index=False, deep=True)
print('primary: {}\ncategory_usage: {}'.format(primary_usage,category_usage))
"""
primary: 370
category_usage: 291
这样看起来似乎并没有什么很大区别
但是我们可以 repeat 多次试试
"""
# Repeat every element ten times so the same few values occur many times.
manycolors = colors.repeat(10)
# Average number of rows per distinct value.
print(len(manycolors) / manycolors.nunique())

# Memory of the plain Series versus its categorical version, for comparison.
print(manycolors.memory_usage(index=False, deep=True))
# pdb.set_trace()
print(manycolors.astype('category').memory_usage(index=False, deep=True))

数据清洗与准备

# 处理缺失数据
API 
dropna
fillna
isnull
notnull

# 过滤缺失值
data.dropna()   等价于 data[data.notnull()]

对于 DataFrame 还有  axis= 0 or 1 , how = 'all' 等可选 ; how='all' 表示仅当某行 (axis=1 时为某列) 全为 NaN 时才删除 ; thresh=2 表示至少含有 2 个非 NA 值的行 (列) 才会被保留
df.dropna(axis=1,how='all')
df.fillna({1: 0.5, 2: 0}, inplace=True)  为 1 列 2 列 分别填充不同的默认值 (value 与 method 不能同时指定 ; 前向填充请单独使用 df.ffill(limit=2) , 旧版本写作 df.fillna(method='ffill', limit=2))
fillna 参数 有:
value 标量值或字典型对象用于填充缺失值
method 插值方法 ('ffill' / 'bfill') , 默认为 None , 不能与 value 同时使用
axis  需要填充的轴, 默认 axis = 0
inplace 
limit 用于前向或后向填充时最大的填充范围

### 查询与删除重复值
data.duplicated()
data.drop_duplicates(['col1','col2'], keep='last')  # keep 可取 'first' (默认) , 'last' 或 False

### 使用函数或映射进行数据转换
# Raw observations; note the inconsistent capitalisation of the food names.
data = {
    'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef',
             'Bacon', 'pastrami', 'honey ham', 'nova lox'],
    'ounces': [4, 5, 12, 6, 7.5, 8, 3, 5, 6],
}
df = DataFrame(data)
print(df.head(10))
print('*' * 50)

# Lookup table keyed by the lower-cased food name.
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}

# Normalise the case before the lookup; foods missing from the table become 'unknown'.
df['meat_to_animall'] = [
    meat_to_animal.get(food.lower(), 'unknown') for food in df['food']
]
print(df.head(10))

### 替代值
data.replace(-999, np.nan)

### 重命名轴索引
data.index.map(lambda x: x[:4].upper())
data.rename(index=str.title, columns=str.upper)
data.rename(index={'old_idx':'new_idx'}, columns={'old_col':'new_col'}, inplace=True)


### 离散化和分箱  cut , qcut
pd.cut(ages, bins)

from pandas import Series,DataFrame
import pandas as pd

# Sample ages and the bin edges used to group them.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18,35,50,70,90,120]
# pd.cut assigns each age to the interval between two consecutive edges.
cats = pd.cut(ages, bins)
# .codes gives the integer position of the bin each age fell into.
print(cats.codes)

df = DataFrame({'ages':ages})
# right=False makes the intervals closed on the left edge instead of the right.
df['ages_dicretes'] = pd.cut(ages, bins,right=False).codes
print(df.head(10))
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
# 1000 x 4 frame of standard-normal samples.
data = DataFrame(np.random.randn(1000,4))
# np.sign(data) * 3 is +3 or -3 per cell, so outliers keep their sign.
data[np.abs(data)>3] = np.sign(data) * 3 # clamp values to the range -3 to +3
print(data.head(10))
# 置换和随机抽样
numpy.random.permutation
df.sample , series.sample

# 5 x 4 frame holding 0..19.
df = DataFrame(np.arange(20).reshape((5,4)))
print(df)
print('*'*50)
# Random ordering of the five row positions.
sampler = np.random.permutation(5)
# take() selects rows by position, so this prints the rows in shuffled order.
print(df.take(sampler))


# 组合使用
from pandas import Series,DataFrame
import pandas as pd
import numpy as np

def pretty_print(text='', star_cnt=20):
    """Print a separator line: ``star_cnt`` stars, a space, ``text``, a space, stars."""
    rule = '*' * star_cnt
    print('%s %s %s' % (rule, text, rule))

# Fixed seed so the ten random values are reproducible.
np.random.seed(12345)
values = np.random.rand(10)
print(values)
pretty_print('离散分箱')
# Five equal-width bins covering (0, 1].
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
# Bin the values, then build one indicator column per bin.
df = pd.get_dummies(pd.cut(values,bins))
print(df.head(10))

"""
[0.92961609 0.31637555 0.18391881 0.20456028 0.56772503 0.5955447
 0.96451452 0.6531771  0.74890664 0.65356987]
******************** 离散分箱 ********************
   (0.0, 0.2]  (0.2, 0.4]  (0.4, 0.6]  (0.6, 0.8]  (0.8, 1.0]
0           0           0           0           0           1
1           0           1           0           0           0
2           1           0           0           0           0
3           0           1           0           0           0
4           0           0           1           0           0
5           0           0           1           0           0
6           0           0           0           0           1
7           0           0           0           1           0
8           0           0           0           1           0
9           0           0           0           1           0
"""

### 向量化 字符串函数
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re

def pretty_print(text='', star_cnt=20):
    """Print ``text`` between two runs of ``star_cnt`` asterisks, space separated."""
    stars = star_cnt * '*'
    print('{0} {1} {0}'.format(stars, text))

# One row, three columns, each holding the same 12-digit string.
data = {'col1':['001100110111'],
        'col2': ['001100110111'],
        'col3': ['001100110111']}
df = DataFrame(data)
print(df.head(10))
pretty_print('华丽的分割线')
# Split each 12-digit string into four 3-digit pieces.  The group names
# become the column names of the extracted frame; without the <name> part
# "(?P" is not a valid regular expression.
pattern = (r'(?P<nums_1>\d{3})(?P<nums_2>\d{3})'
           r'(?P<nums_3>\d{3})(?P<nums_4>\d{3})')
# axis=1 hands each row to the lambda as a Series indexed by column name.
df2 = df.apply(lambda s: s.str.extract(pattern), axis=1)
print(df2.values)
"""
           col1          col2          col3
0  001100110111  001100110111  001100110111
******************** 华丽的分割线 ********************
[     nums_1 nums_2 nums_3 nums_4
col1    001    100    110    111
col2    001    100    110    111
col3    001    100    110    111]
"""

# 部分向量化字符串方法列表
cat 根据可选的分隔符按元素粘合字符串
contains 返回是否含有某个模式 / 正则表达式的 布尔值数组
count 模式出现次数的计数
extract 使用正则表达式从字符串Series 中分组抽取一个 或多个字符串, 返回的结果是 每个分组形成一列的 DataFrame
endswith  等价于对每个元素使用 x.endswith 
startswith 等价于 对每个元素使用 x.startswith
findall  找出字符串中所有的 模式 / 正则表达式 匹配项 ,以列表返回
get 对每个元素进行索引 (获得第 i 个元素)
isalnum
isalpha
isdecimal
isdigit
islower
isnumeric
isupper
join
len
lower, upper
match
pad
center
repeat
replace
slice

split
strip
rstrip
lstrip

第八章 数据规整 , 连接,联合 重塑

# 分层索引  部分索引
分层索引允许你在一个轴向上拥有多个 (两个或两个以上) 索引层级, 笼统地说 分层索引提供了一种在更低维度的形式中处理更高维度 数据的方法。

# -*- coding: utf-8 -*-
__author__ = 'Frank Li'
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re

def pretty_print(text='', star_cnt=20):
    """Emit one banner line of the form ``**** text ****``.

    ``star_cnt`` is the number of asterisks on each side of ``text``.
    """
    side = '*' * star_cnt
    template = '{left} {middle} {right}'
    print(template.format(left=side, middle=text, right=side))


# Series with a two-level index: outer level letters, inner level integers.
data = pd.Series(np.random.randn(9),
                 index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
                        [1, 2, 3, 1, 3, 1, 2, 2, 3]])
print(data.head(10))
pretty_print('华丽的分割线')
# Partial indexing: select everything under outer label 'b'.
print(data['b'])
pretty_print('华丽的分割线')
# unstack() pivots the inner level into columns.
print(data.unstack())
pretty_print('华丽的分割线')
# stack() is the inverse operation of unstack().
print(data.unstack().stack())  

#  重排序 和 层级排序
swaplevel   sort_index
# 按 层级进行汇总统计

# 使用 DataFrame 的列 进行索引

set_index()   将一个或多个列设置为 (分层) 索引
reset_index()  反操作 分层索引会被移动到 列中

# 联合与合并数据集 
类似于 SQL 表关联操作  merge
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re

def pretty_print(text='', star_cnt=20):
    """Print ``text`` with a run of ``star_cnt`` asterisks and a space on both sides."""
    fence = '*' * star_cnt
    pieces = (fence, text, fence)
    print(*pieces)


# Two frames sharing the column 'key1'.
df1 = pd.DataFrame({'key1':['b','b','a','c','a','a','b'],
                    'data1':range(7)})
df2 = pd.DataFrame({'key1':['a','b','d'],
                   'data2':range(3)})

df3 = pd.merge(df1,df2,left_on='key1',right_on='key1',how='inner', suffixes=('_left','_right')) 
# how may be inner, left, right or outer.  To join on several columns pass
# on=['key1','key2'].  If both frames have other columns with the same name,
# the suffixes parameter renames them.  To merge (join) on the index instead,
# use left_index=True, right_index=True.

print(df3.head(10))

# join merges on the index
df4 = df1.join(df2,how='inner', lsuffix='_left', rsuffix='_right')
pretty_print('华丽的分割线')
print(df4.head(10))

left.join([right1, right2,right3], how='outer')  how 默认为 'left' 

concat  类似于  union all 其实又不仅仅是 

# 3 x 4 array holding 0..11.
arr = np.arange(12).reshape((3,4))
print(arr)
pretty_print('华丽的分割线')
# axis=1 glues the arrays side by side (3 x 8).
result = np.concatenate([arr,arr],axis=1)
# axis=0 stacks them on top of each other (6 x 4).
result_2 = np.concatenate([arr,arr],axis=0)
print(result)
pretty_print('华丽的分割线')
print(result_2)

from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re

def pretty_print(text='', star_cnt=20):
    """Print a visual divider: asterisks, ``text``, asterisks (space separated)."""
    print(
        '{s} {t} {s}'.format(
            s='*' * star_cnt,
            t=text,
        )
    )

def generate_df(rows, cols):
    """Return a DataFrame of shape (rows, cols) counting up from 0, row by row.

    Index labels are ``row_<i>`` and column labels are ``col_<j>``.
    """
    counting = np.arange(rows * cols)
    return DataFrame(
        counting.reshape((rows, cols)),
        columns=list(map('col_{}'.format, range(cols))),
        index=list(map('row_{}'.format, range(rows))),
    )

arr = generate_df(3,4)
# Three Series with pairwise disjoint index labels.
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])

# axis=0 appends the Series end to end into one longer Series.
print(pd.concat([s1,s2,s3],axis=0,sort=False))
pretty_print('华丽的分割线')
# axis=1 places each Series in its own column; missing labels become NaN.
print(pd.concat([s1,s2,s3],axis=1,sort=False))
"""
a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64
******************** 华丽的分割线 ********************
     0    1    2
a  0.0  NaN  NaN
b  1.0  NaN  NaN
c  NaN  2.0  NaN
d  NaN  3.0  NaN
e  NaN  4.0  NaN
f  NaN  NaN  5.0
g  NaN  NaN  6.0
"""

# s4 holds the labels of s1 followed by those of s3.
s4 = pd.concat([s1,s3])
# join='inner' keeps only index labels present in both inputs.
print(pd.concat([s1,s4],axis=1,join='inner',sort=False))  # join = outer or inner
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re

def pretty_print(text='', star_cnt=20):
    """Print ``text`` flanked by ``star_cnt`` asterisks, with one space of padding."""
    stars = '*' * star_cnt
    padded = ' {} '.format(text)
    print(stars + padded + stars)

def generate_df(rows, cols):
    """Create a demo DataFrame whose cells count 0, 1, 2, ... across each row.

    The frame has ``rows`` rows labelled ``row_N`` and ``cols`` columns
    labelled ``col_N``.
    """
    cells = np.arange(rows * cols)
    cells = cells.reshape((rows, cols))
    header = ['col_%d' % c for c in range(cols)]
    labels = ['row_%d' % r for r in range(rows)]
    frame = DataFrame(cells, columns=header, index=labels)
    return frame

arr = generate_df(3,4)
# Three Series with pairwise disjoint index labels.
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])

# keys adds an outer index level so each source Series stays identifiable.
s5 = pd.concat([s1,s2,s3],axis=0,keys=['one', 'two', 'three'],sort=False)
print(s5)
pretty_print('华丽的分割线')
# Pivot the inner level into columns: one row per key, NaN where a key has
# no value for a label.  This is the second table in the recorded output.
print(s5.unstack())
"""
one    a    0
       b    1
two    c    2
       d    3
       e    4
three  f    5
       g    6
dtype: int64
******************** 华丽的分割线 ********************
         a    b    c    d    e    f    g
one    0.0  1.0  NaN  NaN  NaN  NaN  NaN
two    NaN  NaN  2.0  3.0  4.0  NaN  NaN
three  NaN  NaN  NaN  NaN  NaN  5.0  6.0

"""
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re

def pretty_print(text='', star_cnt=20):
    """Write a divider line to stdout: ``<stars> <text> <stars>``."""
    bar = '*' * star_cnt
    print(bar, text, bar, sep=' ')

def generate_df(rows, cols):
    """Return a labelled ``rows`` x ``cols`` DataFrame filled with 0 .. rows*cols-1."""
    shape = (rows, cols)
    values = np.arange(rows * cols).reshape(shape)
    index = []
    for j in range(rows):
        index.append('row_' + str(j))
    columns = []
    for i in range(cols):
        columns.append('col_' + str(i))
    return DataFrame(values, columns=columns, index=index)

# 3 x 2 frame holding 0..5.
df1 = generate_df(3,2)
# 2 x 2 frame holding 0..3, with 5 added to every cell.
df2 = generate_df(2,2).applymap(lambda x:x+5)
print(df1)
pretty_print('华丽的分割线')
print(df2)
pretty_print('华丽的分割线')
# axis=1 places the frames side by side; keys adds an outer column level
# naming the frame each column came from.
print(pd.concat([df1,df2],axis=1,sort=False,keys=('lvl1','lvl2')))
"""
       col_0  col_1
row_0      0      1
row_1      2      3
row_2      4      5
******************** 华丽的分割线 ********************
       col_0  col_1
row_0      5      6
row_1      7      8
******************** 华丽的分割线 ********************
       lvl1        lvl2      
      col_0 col_1 col_0 col_1
row_0     0     1   5.0   6.0
row_1     2     3   7.0   8.0
row_2     4     5   NaN   NaN
"""

联合重叠数据

from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re

def pretty_print(text='', star_cnt=20):
    """Print ``text`` surrounded by ``star_cnt`` asterisks on each side.

    A single space separates the text from each run of asterisks.
    """
    asterisks = ''.join('*' for _ in range(star_cnt))
    print('{} {} {}'.format(asterisks, text, asterisks))

def generate_df(rows, cols):
    """Make a small test DataFrame.

    Cell values run from 0 to ``rows * cols - 1`` in row-major order; the
    index is ``row_0 ..`` and the columns are ``col_0 ..``.
    """
    def labels(prefix, count):
        # e.g. labels('col_', 2) -> ['col_0', 'col_1']
        return [prefix + str(n) for n in range(count)]

    body = np.arange(rows * cols).reshape((rows, cols))
    return DataFrame(body, columns=labels('col_', cols), index=labels('row_', rows))

# a has gaps (NaN); b is fully populated and shares a's index.
a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
              index=['f', 'e', 'd', 'c', 'b', 'a'])
b = pd.Series(np.arange(len(a), dtype=np.float64),
              index=['f', 'e', 'd', 'c', 'b', 'a'])
print(a)
pretty_print('华丽的分割线')
print(b)
pretty_print('华丽的分割线')
# Take b where a is missing, otherwise keep a (returns a plain ndarray).
print(np.where(pd.isnull(a),b,a))
"""
f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64
******************** 华丽的分割线 ********************
f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    5.0
dtype: float64
******************** 华丽的分割线 ********************
[0.  2.5 2.  3.5 4.5 5. ]
"""
# Same patching with index alignment: keep a's values and fill its NaN gaps
# from b.  The caller must be the Series that has the gaps; b has no NaN, so
# b.combine_first(a) would simply return b unchanged.
print(a.combine_first(b))

8.3 重中之重 重塑或透视

stack 
unstack
往往结合 分层索引来做

pivot <==> 等价于  set_index 创建分层索引, 然后调用 unstack 拆堆
pivot 反过来操作 就是 pd.melt 
import pandas as pd
import numpy as np

"""
对比 某一行 第二三列的 差值 == 下一行 第一列的值,找出这样的行
"""

# 20 x 3 frame of zeros with labelled rows and columns.
data = np.zeros((20,3))
df = pd.DataFrame(data, columns=['col_'+str(i) for i in range(3)], index=['row_'+str(i) for i in range(20)])
# Set every column except the first to 1.
df.iloc[:, 1:] = 1

print(df.head(10))

# Separator line between the two printed tables.
print('{stars} {text} {stars}'.format(stars='*'*20,text='华丽的分割线'))
def func(row):
    """Return the absolute difference between the 2nd and 3rd entries of ``row``.

    ``row`` is normally the Series that ``DataFrame.apply(..., axis=1)``
    passes in.  Entries are picked by position through ``.iloc`` when it is
    available.  Plain ``row[1]`` on a label-indexed Series only works through
    a deprecated positional fallback, and looks up the *label* 1 when the
    index holds integers.  Plain sequences and arrays are indexed directly.
    """
    by_position = getattr(row, 'iloc', row)
    return abs(by_position[1] - by_position[2])

# col_3: per-row result of func (absolute difference of the 2nd and 3rd columns).
df['col_3'] = df.apply(func, axis=1)
# col_4: True where col_3 equals col_0 shifted down by one row.
# NOTE(review): shift(1) aligns each row with the PREVIOUS row's col_0, while
# the description above speaks of the next row (that would be shift(-1)) --
# confirm which is intended.
df['col_4'] = np.where(df['col_3'] == df['col_0'].shift(1), True, False)

# Keep only the rows flagged True.
df = df[df['col_4']]
print(df.head(20))

判断类似 gene_1|gene_2 ==> gene_1 = 1 , gene_2 = 1 gene_3 = 0....

from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re

def pretty_print(text='', star_cnt=20):
    """Print a banner consisting of stars, a space, ``text``, a space, and stars."""
    stars = '*' * star_cnt
    banner = '{} {}'.format(stars, text)
    banner = '{} {}'.format(banner, stars)
    print(banner)

def generate_df(rows, cols):
    """Produce a ``rows`` by ``cols`` DataFrame of consecutive integers from 0.

    Rows are named ``row_0 ..`` and columns ``col_0 ..``.
    """
    total = rows * cols
    matrix = np.arange(total).reshape((rows, cols))
    col_names = ['col_' + str(i) for i in range(cols)]
    row_names = ['row_' + str(j) for j in range(rows)]
    return DataFrame(data=matrix, index=row_names, columns=col_names)

df = generate_df(3,6)
print(df.head(10))
# col_1 holds a '|'-separated list of the column names to flag for that row.
df['col_1'] = 'col_2|col_4'
# Give the second row (position 1) a different list.  The .ix indexer was
# removed in pandas 1.0; on this string index its integer slice was
# positional, so .iloc with the column's position is the equivalent.
df.iloc[1:2, df.columns.get_loc('col_1')] = 'col_3|col_5'

# Every column except the key column col_0.
cols = [col for col in df.columns if col!='col_0']

def do_apply(row):
    """Flag, in the global ``df``, the columns named in this row's ``col_1``.

    ``row['col_1']`` is split on '|'.  For every name in the global ``cols``
    that appears in that list, the matching column is set to 1 on all ``df``
    rows whose ``col_0`` equals this row's ``col_0``.  Returns None; the
    effect is the in-place change to ``df``.
    """
    for col in cols:
        # Prints a 100-star line on every iteration.
        print('*'*100)
        if col in row['col_1'].split(r'|'):
            tmp = row['col_0']
            # Writes to the global frame, not to the row passed in.
            df.loc[df['col_0'] == tmp,col] = 1

# Run do_apply once per row; its return value is ignored because it updates df in place.
df.apply(do_apply, axis=1)
# df.apply(lambda row:do_apply(row) ,axis=1)
print('df shape: {}'.format(df.shape))
print('df 前十行: ',df.head(10))

SQLpd

转载于:https://www.cnblogs.com/Frank99/p/10972238.html

你可能感兴趣的:(pandas 常规操作大全)