1.离散化和面元划分
已知实验数据的范围为0-100,我们把数据范围均分,比如分为四个部分,也就是四个面元(bin):第一个面元包含0-25,第二个26-50,第三个51-75,最后一个76-100
>>> results=[12,34,67,55,28,90,99,12,3,56,74,44,87,23,49,89,87] #一组离散的实验数据需要被划分到好几个类bin中
>>> bins=[0,25,50,75,100]#用bins存储划分类的数据节点
>>> cat=pd.cut(results,bins)#利用pandas的cut函数将results数据按照bins进行划分
>>> cat
[(0, 25], (25, 50], (50, 75], (50, 75], (25, 50], ..., (75, 100], (0, 25], (25, 50], (75, 100], (75, 100]]
Length: 17
Categories (4, interval[int64]): [(0, 25] < (25, 50] < (50, 75] < (75, 100]] #cat的每个元素与results的每个元素一一对应,表示results中每个元素的类别
>>> pd.value_counts(cat)#查看cat中每个类别有多少元素落入
(75, 100] 5
(50, 75] 4
(25, 50] 4
(0, 25] 4
dtype: int64
>>> bin_names=['unlikely','less likely','likely','highly likely']#bin_names存储字符串形式的类名
>>> pd.cut(results,bins,labels=bin_names)#利用bins对results进行切分落袋,不过bins类名取自bin_names中
[unlikely, less likely, likely, likely, less likely, ..., highly likely, unlikely, less likely, highly likely, highly likely]
Length: 17
Categories (4, object): [unlikely < less likely < likely < highly likely]
>>> pd.cut(results,5)#当没有bin和bin_names时候,指定5表示将results分割成5类,每一类跨度差不多,元素量不一样
[(2.904, 22.2], (22.2, 41.4], (60.6, 79.8], (41.4, 60.6], (22.2, 41.4], ..., (79.8, 99.0], (22.2, 41.4], (41.4, 60.6], (79.8, 99.0], (79.8, 99.0]]
Length: 17
Categories (5, interval[float64]): [(2.904, 22.2] < (22.2, 41.4] < (41.4, 60.6] < (60.6, 79.8] <
(79.8, 99.0]]
>>> quintiles=pd.qcut(results,5)#利用pandas的qcut函数,指定将results划分成5类,每类含有元素量差不多
>>> pd.value_counts(quintiles)
(62.6, 87.0] 4
(2.999, 24.0] 4
(87.0, 99.0] 3
(46.0, 62.6] 3
(24.0, 46.0] 3
dtype: int64
2.异常值检测和过滤
>>> randframe=pd.DataFrame(np.random.rand(1000,3))
>>> randframe.describe()
0 1 2
count 1000.000000 1000.000000 1000.000000
mean 0.508710 0.507959 0.489647
std 0.284606 0.287100 0.289212
min 0.001098 0.002748 0.002282
25% 0.260409 0.255102 0.236675
50% 0.517665 0.514616 0.473680
75% 0.754937 0.754878 0.746290
max 0.999385 0.999262 0.999661
>>> randframe.std()
0 0.284606
1 0.287100
2 0.289212
dtype: float64
>>> randframe[(np.abs(randframe)>(3*randframe.std())).any(axis=1)]#过滤出任一列元素的绝对值大于该列标准差三倍的行
3.排序
>>> nframe=pd.DataFrame(np.arange(25).reshape(5,5))
>>> nframe
0 1 2 3 4
0 0 1 2 3 4
1 5 6 7 8 9
2 10 11 12 13 14
3 15 16 17 18 19
4 20 21 22 23 24
>>> new_order=np.random.permutation(5)#利用这个函数创建一个随机的order
>>> new_order
array([1, 3, 2, 4, 0])
>>> nframe.take(new_order)#利用take函数对nframe进行排序重构
0 1 2 3 4
1 5 6 7 8 9
3 15 16 17 18 19
2 10 11 12 13 14
4 20 21 22 23 24
0 0 1 2 3 4
>>> new_order=[3,4,2]#只对部分行进行排序
>>> nframe.take(new_order)#利用take函数对部分行重构
0 1 2 3 4
3 15 16 17 18 19
4 20 21 22 23 24
2 10 11 12 13 14
4.随机取样
>>> len(nframe)
5
>>> sample=np.random.randint(0,len(nframe),size=3)
>>> nframe.take(sample)
0 1 2 3 4
3 15 16 17 18 19
0 0 1 2 3 4
3 15 16 17 18 19
5.字符串处理
1)内置的字符串处理办法
利用split函数切分字符串
>>> text='BoltonAvenue ,Boston'
>>> text.split(',')
['BoltonAvenue ', 'Boston']
#切分后有个空格
继续利用strip函数删除多余空格
>>> token=[s.strip() for s in text.split(',')]
>>> token
['BoltonAvenue', 'Boston']
有意思的切分后赋值
>>> address,city=[s.strip() for s in text.split(',')]
>>> address
'BoltonAvenue'
>>> city
'Boston'
文本拼接利用join
>>> string=['a','b','c','d']
>>> ';'.join(string)
'a;b;c;d'
查找子串
>>> text
'BoltonAvenue ,Boston'
>>> 'Boston'in text
True
>>> text.index('Boston')
14
>>> text.find('Boston')
14
>>> text.index('Bo')
0
>>> text.find('Bo')
0
>>> text.count('e')
2
替换和删除子串
>>> text
'BoltonAvenue ,Boston'
>>> text.replace('Boston','street')
'BoltonAvenue ,street'
>>> text.replace('Boston','')#如果用空字符串替换子串,效果等同于删除子串
'BoltonAvenue ,'
2)正则表达式
导入re模块
>>> import re
使用正则分割字符串
>>> text='This is an \t odd \n text'
>>> text
'This is an \t odd \n text'
>>> re.split('\s+',text) #'\s+'匹配一个或多个连续的空白字符(空格、制表符、换行符等)
['This', 'is', 'an', 'odd', 'text']
findall函数:匹配字符串中所有符合正则表达式的子串
>>> text='This is my address :16 Bolton Avenue,Boston'
>>> re.findall('A\w+',text)
['Avenue']
>>> re.findall('[A,a]\w+',text)
['address', 'Avenue', ',Boston']
search函数:返回第一处符合模式的子串所在的位置,可以通过start和end函数获取位置
>>> re.search('[A,a]\w+',text)
<_sre.SRE_Match object; span=(11, 18), match='address'>
>>> search=re.search('[A,a]\w+',text)
>>> search.start
>>> search.start()
11
>>> search.end()
18
>>> text[search.start():search.end()]
'address'
match函数:从字符串的第一个字符开始匹配,如果匹配不上就返回None
6.数据聚合
1)按照某一列分组
>>> frame=pd.DataFrame({'item':['ball','mug','pen','pencil','pen'],'color':['white','red','freen','black','yellow'],'price':[1,2,3,4,5]})
>>> group=frame['price'].groupby(frame['color'])
>>> group
>>> group.groups
{'white': Int64Index([0], dtype='int64'), 'freen': Int64Index([2], dtype='int64'), 'black': Int64Index([3], dtype='int64'), 'red': Int64Index([1], dtype='int64'), 'yellow': Int64Index([4], dtype='int64')}
>>> group.mean()
color
black 4
freen 3
red 2
white 1
yellow 5
Name: price, dtype: int64
>>> group.sum()
color
black 4
freen 3
red 2
white 1
yellow 5
Name: price, dtype: int64
2)等级分组
7.组迭代
1)链式转换
>>> frame.groupby(frame['color'])['price'].mean()
color
black 4
freen 3
red 2
white 1
yellow 5
Name: price, dtype: int64
2)分组函数
使用quantile函数计算每个分组的分位数
使用自定义函数
使用多个聚合函数
>>> group=frame.groupby(['color'])
>>> group['price'].quantile(0.6)
color
black 4.0
freen 3.0
red 2.0
white 1.0
yellow 5.0
Name: price, dtype: float64
>>> def agg(series):
... return series.max()
...
>>> group=frame.groupby('color')
>>> group['price'].agg(agg)
color
black 4
freen 3
red 2
white 1
yellow 5
Name: price, dtype: int64
>>> group['price'].agg(['sum','max'])
sum max
color
black 4 4
freen 3 3
red 2 2
white 1 1
yellow 5 5
8.高级数据聚合
transform函数
>>> frame=pd.DataFrame({'color':['white','red','green','red','green'],'price1':[5.56,4.20,1.30,0.56,2.75],'price2':[4.75,4.12,1.60,0.75,3.15]})
>>> frame
color price1 price2
0 white 5.56 4.75
1 red 4.20 4.12
2 green 1.30 1.60
3 red 0.56 0.75
4 green 2.75 3.15
>>> sums=frame.groupby('color').sum().add_prefix('tot_')
>>> sums
tot_price1 tot_price2
color
green 4.05 4.75
red 4.76 4.87
white 5.56 4.75
>>> frame.groupby('color').transform(np.sum).add_prefix('tot_')
tot_price1 tot_price2
0 5.56 4.75
1 4.76 4.87
2 4.05 4.75
3 4.76 4.87
4 4.05 4.75
apply函数
>>> frame=pd.DataFrame({'color':['white','black','white','white','black','black'],'status':['up','up','down','down','down','up'],'values1':[12.33,14.55,22.34,27.84,23.40,18.33],'values2':[11.23,31.80,29.99,31.18,18.25,22.44]})
>>> frame
color status values1 values2
0 white up 12.33 11.23
1 black up 14.55 31.80
2 white down 22.34 29.99
3 white down 27.84 31.18
4 black down 23.40 18.25
5 black up 18.33 22.44
>>> frame.groupby(['color','status']).apply(lambda x:x.max())
color status values1 values2
color status
black down black down 23.40 18.25
up black up 18.33 31.80
white down white down 27.84 31.18
up white up 12.33 11.23