Python Data Analysis Notes - 10

1. Discretization and Binning

Suppose the experimental data fall in the range 0-100 and we split that range into equal parts, say four bins: the first covers 0-25, the second 26-50, the third 51-75, and the last 76-100.

>>> import numpy as np

>>> import pandas as pd

>>> results=[12,34,67,55,28,90,99,12,3,56,74,44,87,23,49,89,87] #a set of experimental readings to be assigned to bins

>>> bins=[0,25,50,75,100]#bins holds the edges that delimit the bins

>>> cat=pd.cut(results,bins)#use pandas' cut function to split results according to bins

>>> cat

[(0, 25], (25, 50], (50, 75], (50, 75], (25, 50], ..., (75, 100], (0, 25], (25, 50], (75, 100], (75, 100]]

Length: 17

Categories (4, interval[int64]): [(0, 25] < (25, 50] < (50, 75] < (75, 100]] #each element of cat corresponds to one element of results and tells which bin it falls into; by default the intervals are open on the left and closed on the right

>>> pd.value_counts(cat)#count how many elements fall into each bin

(75, 100]   5

(50, 75]    4

(25, 50]    4

(0, 25]     4

dtype: int64

>>> bin_names=['unlikely','less likely','likely','highly likely']#bin_names holds string labels for the bins

>>> pd.cut(results,bins,labels=bin_names)#cut results with the same bins, but label the categories with bin_names

[unlikely, less likely, likely, likely, less likely, ..., highly likely, unlikely, less likely, highly likely, highly likely]

Length: 17

Categories (4, object): [unlikely < less likely < likely < highly likely]

>>> pd.cut(results,5)#without explicit bin edges, passing 5 splits results into 5 bins of roughly equal width; the number of elements per bin can differ

[(2.904, 22.2], (22.2, 41.4], (60.6, 79.8], (41.4, 60.6], (22.2, 41.4], ..., (79.8, 99.0], (22.2, 41.4], (41.4, 60.6], (79.8, 99.0], (79.8, 99.0]]

Length: 17

Categories (5, interval[float64]): [(2.904, 22.2] < (22.2, 41.4] < (41.4, 60.6] < (60.6, 79.8] <

                                   (79.8, 99.0]]

>>> quintiles=pd.qcut(results,5)#pandas' qcut splits results into 5 bins, each containing roughly the same number of elements

>>> pd.value_counts(quintiles)

(62.6, 87.0]    4

(2.999, 24.0]   4

(87.0, 99.0]    3

(46.0, 62.6]    3

(24.0, 46.0]    3

dtype: int64
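
qcut also accepts an explicit list of quantiles instead of a number of bins. A minimal sketch (the 0.1/0.9 cut points are just an illustrative choice):

# split results at the 0%, 10%, 50%, 90% and 100% quantiles;
# the four resulting bins hold roughly 10%, 40%, 40% and 10% of the values
custom = pd.qcut(results, [0, 0.1, 0.5, 0.9, 1.0])
pd.value_counts(custom)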

2. Outlier Detection and Filtering

>>> randframe=pd.DataFrame(np.random.rand(1000,3))

>>> randframe.describe()

                0           1           2

count 1000.000000 1000.000000 1000.000000

mean     0.508710    0.507959    0.489647

std      0.284606    0.287100    0.289212

min      0.001098    0.002748    0.002282

25%      0.260409    0.255102    0.236675

50%      0.517665    0.514616    0.473680

75%      0.754937    0.754878    0.746290

max      0.999385    0.999262    0.999661

>>> randframe.std()

0   0.284606

1   0.287100

2   0.289212

dtype: float64

>>> randframe[(np.abs(randframe)>(3*randframe.std())).any(axis=1)]#keep the rows in which at least one column holds a value more than three times that column's standard deviation
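
For reference, a self-contained sketch of the same filtering idea (the names mask, outlier_rows and clean are my own):

import numpy as np
import pandas as pd

randframe = pd.DataFrame(np.random.rand(1000, 3))

# True where a value exceeds three times its column's standard deviation
mask = np.abs(randframe) > 3 * randframe.std()

# rows in which at least one column exceeds the threshold
outlier_rows = randframe[mask.any(axis=1)]

# invert the condition to drop those rows instead
clean = randframe[~mask.any(axis=1)]

For uniform data on [0, 1) the standard deviation is about 0.29, so the threshold 3*std is roughly 0.87; the "outliers" here are simply the largest values, which is enough to illustrate the pattern.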

3. Permutation

>>> nframe=pd.DataFrame(np.arange(25).reshape(5,5))

>>> nframe

   0  1  2  3  4

0  0  1  2  3  4

1  5  6  7  8  9

2 10 11 12 13 14

3 15 16 17 18 19

4 20 21 22 23 24

>>> new_order=np.random.permutation(5)#create a random ordering of the row indices

>>> new_order

array([1, 3, 2, 4, 0])

>>> nframe.take(new_order)#use take to reorder the rows of nframe according to new_order

   0  1  2  3  4

1  5  6  7  8  9

3 15 16 17 18 19

2 10 11 12 13 14

4 20 21 22 23 24

0  0  1  2  3  4

>>> new_order=[3,4,2]#reorder only a subset of the rows

>>> nframe.take(new_order)#take returns just those rows, in the given order

   0  1  2  3  4

3 15 16 17 18 19

4 20 21 22 23 24

2 10 11 12 13 14

4. Random Sampling

>>> len(nframe)

5

>>> sample=np.random.randint(0,len(nframe),size=3)

>>> nframe.take(sample)

   0  1  2  3  4

3 15 16 17 18 19

0  0  1  2  3  4

3 15 16 17 18 19
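
Because randint samples with replacement, the same row can show up more than once (row 3 above appears twice). In recent pandas versions DataFrame.sample offers the same thing more directly; a minimal sketch:

nframe.sample(n=3)                 # three distinct rows (without replacement by default)
nframe.sample(n=3, replace=True)   # with replacement, like the randint + take approach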

5. String Manipulation

1) Built-in string methods

Split a string with split:

>>> text='BoltonAvenue ,Boston'

>>> text.split(',')

['BoltonAvenue ', 'Boston']

#note the stray space left in the first piece after the split

Use strip to remove the extra whitespace:

>>> token=[s.strip() for s in text.split(',')]

>>> token

['BoltonAvenue', 'Boston']

A handy variation: unpack the stripped pieces directly into variables:

>>> address,city=[s.strip() for s in text.split(',')]

>>> address

'BoltonAvenue'

>>> city

'Boston'

Concatenate strings with join:

>>> string=['a','b','c','d']

>>> ';'.join(string)

'a;b;c;d'

Search for substrings:

>>> text

'BoltonAvenue ,Boston'

>>> 'Boston' in text

True

>>> text.index('Boston')

14

>>> text.find('Boston')

14

>>> text.index('Bo')

0

>>> text.find('Bo')

0

>>> text.count('e')

2

Replace and remove substrings:

>>> text

'BoltonAvenue ,Boston'

>>> text.replace('Boston','street')

'BoltonAvenue ,street'

>>> text.replace('Boston','')#replacing a substring with the empty string effectively deletes it

'BoltonAvenue ,'

2) Regular expressions

Import the re module:

>>> import re

Split a string with a regular expression:

>>> text='This is    an \t odd \n text'

>>> text

'This is    an \t odd \n text'

>>> re.split(r'\s+',text) #r'\s+' matches one or more whitespace characters (spaces, tabs, newlines); the r prefix keeps the backslash from being treated as a string escape

['This', 'is', 'an', 'odd', 'text']

The findall function matches every substring that satisfies the pattern:

>>> text='This is my address :16 Bolton Avenue,Boston'

>>> re.findall(r'A\w+',text)

['Avenue']

>>> re.findall(r'[A,a]\w+',text)

['address', 'Avenue', ',Boston']
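
The comma inside the character class [A,a] is matched literally, which is why ',Boston' appears in the result. To match only the words that start with 'A' or 'a', drop the comma:

re.findall(r'[Aa]\w+', text)   # ['address', 'Avenue']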

The search function returns a match object for the first occurrence of the pattern; its start and end methods give the match position:

>>> re.search(r'[A,a]\w+',text)

<_sre.SRE_Match object; span=(11, 18), match='address'>

>>> search=re.search(r'[A,a]\w+',text)

>>> search.start

<built-in method start of _sre.SRE_Match object at 0x...>

>>> search.start()

11

>>> search.end()

18

>>> text[search.start():search.end()]

'address'

The match function only tries to match at the very beginning of the string, and returns None if nothing matches there.
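
A minimal sketch of the difference from search, reusing the same text variable:

re.match(r'[Aa]\w+', text)   # None: the string starts with 'This', not with 'A' or 'a'
re.match(r'[Tt]\w+', text)   # matches 'This', because it sits at position 0
re.search(r'[Aa]\w+', text)  # search scans the whole string and still finds 'address'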

6. Data Aggregation

1) Grouping by a column

>>> frame=pd.DataFrame({'item':['ball','mug','pen','pencil','pen'],'color':['white','red','green','black','yellow'],'price':[1,2,3,4,5]})

>>> group=frame['price'].groupby(frame['color'])

>>> group

<pandas.core.groupby.SeriesGroupBy object at 0x...>

>>> group.groups

{'white': Int64Index([0], dtype='int64'), 'green': Int64Index([2], dtype='int64'), 'black': Int64Index([3], dtype='int64'), 'red': Int64Index([1], dtype='int64'), 'yellow': Int64Index([4], dtype='int64')}

>>> group.mean()

color

black    4.0

green    3.0

red      2.0

white    1.0

yellow   5.0

Name: price, dtype: float64

>>> group.sum()

color

black    4

green    3

red      2

white    1

yellow   5

Name: price, dtype: int64

2) Hierarchical grouping
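
As a sketch (reusing the item/color/price frame defined above), groups can be built from several columns at once, which yields a result with a hierarchical (multi-level) index, one level per grouping key:

group = frame['price'].groupby([frame['color'], frame['item']])
group.mean()   # one mean per (color, item) pair

# equivalent, more compact spelling
frame.groupby(['color', 'item'])['price'].mean()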

7. Group Iteration

1) Chained transformations

>>> frame.groupby(frame['color'])['price'].mean()

color

black    4.0

green    3.0

red      2.0

white    1.0

yellow   5.0

Name: price, dtype: float64

2) Functions on groups

Use the quantile function to compute a given quantile for each group.

Use a user-defined aggregation function.

Apply several aggregation functions at once.

>>> group=frame.groupby(['color'])

>>> group['price'].quantile(0.6)

color

black    4.0

green    3.0

red      2.0

white    1.0

yellow   5.0

Name: price, dtype: float64

>>> def agg(series):

...    return series.max()

...

>>> group=frame.groupby('color')

>>> group['price'].agg(agg)

color

black    4

green    3

red      2

white    1

yellow   5

Name: price, dtype: int64

>>> group['price'].agg(['sum','max'])

       sum max

color

black    4   4

green    3   3

red      2   2

white    1   1

yellow   5   5
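
agg also accepts a dict that maps column names to the function (or list of functions) to apply, which is handy when different columns need different aggregations; a minimal sketch on the same grouping:

group.agg({'price': ['sum', 'max'], 'item': 'count'})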

8. Advanced Data Aggregation

The transform function

>>> frame=pd.DataFrame({'color':['white','red','green','red','green'],'price1':[5.56,4.20,1.30,0.56,2.75],'price2':[4.75,4.12,1.60,0.75,3.15]})

>>> frame

  color price1 price2

0 white   5.56   4.75

1   red   4.20   4.12

2 green   1.30   1.60

3   red   0.56   0.75

4 green   2.75   3.15

>>> sums=frame.groupby('color').sum().add_prefix('tot_')

>>> sums

      tot_price1 tot_price2

color

green       4.05       4.75

red         4.76       4.87

white       5.56       4.75

>>> frame.groupby('color').transform(np.sum).add_prefix('tot_')

  tot_price1 tot_price2

0       5.56       4.75

1       4.76       4.87

2       4.05       4.75

3       4.76       4.87

4       4.05       4.75
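
transform returns a result aligned with the original rows, so every row receives the total of its own group. A more verbose way to get the same effect is to merge the per-group sums computed above back onto the frame; a sketch:

pd.merge(frame, sums, left_on='color', right_index=True)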

The apply function

>>> frame=pd.DataFrame({'color':['white','black','white','white','black','black'],'status':['up','up','down','down','down','up'],'values1':[12.33,14.55,22.34,27.84,23.40,18.33],'values2':[11.23,31.80,29.99,31.18,18.25,22.44]})

>>> frame

  color status values1 values2

0 white    up   12.33   11.23

1 black    up   14.55   31.80

2 white  down   22.34   29.99

3 white  down   27.84   31.18

4 black  down   23.40   18.25

5 black    up   18.33   22.44

>>> frame.groupby(['color','status']).apply(lambda x:x.max())

              color status values1 values2

color status

black down    black   down   23.40   18.25

      up      black     up   18.33   31.80

white down    white   down   27.84   31.18

      up      white     up   12.33   11.23
