9-Python 科学计算_pandas 篇

课程概要:
  1、pandas 库之数据筛选及过滤
  2、pandas 库之字符串提取与操作
  3、pandas 库之散点图
  4、pandas 库之直方图

1、pandas 库之数据筛选及过滤

>>> import numpy as np
>>> import pandas as pd
>>> dates = pd.date_range('20160101',periods=6)     #  从 2016-01-01 起按天生成6个连续的日期元素
>>> dates
<class 'pandas.tseries.index.DatetimeIndex'>
[2016-01-01, ..., 2016-01-06]
Length: 6, Freq: D, Timezone: None
>>> df = pd.DataFrame(np.random.rand(6,4),index=dates,columns=list('ABCD'))
        #   DataFrame 不可以小写
        #   np.random.rand(6,4),随机生成一个6*4的矩阵,其元素介于0-1之间
        #   index=dates 索引按照dates的日期元素作为索引
        #   columns=list('ABCD'),列名为A,B,C,D
>>> df
                   A         B         C         D
2016-01-01  0.144196  0.647273  0.085791  0.754298
2016-01-02  0.256549  0.141528  0.045407  0.925604
2016-01-03  0.090722  0.565770  0.077992  0.370326
2016-01-04  0.569605  0.011851  0.844495  0.059195
2016-01-05  0.125042  0.436778  0.911231  0.133522
2016-01-06  0.028161  0.169375  0.637185  0.271154

[6 rows x 4 columns]
>>> df2 = pd.DataFrame({'A':np.random.rand(6)})
>>> df2
          A
0  0.942403
1  0.849205
2  0.745943
3  0.741686
4  0.916450
5  0.480425

[6 rows x 1 columns]
>>> df3 = pd.DataFrame({'A':np.random.rand(6)},index=dates)
>>> df3
                   A
2016-01-01  0.954572
2016-01-02  0.673110
2016-01-03  0.991391
2016-01-04  0.662053
2016-01-05  0.191929
2016-01-06  0.896591

[6 rows x 1 columns]
>>> df['A']         
2016-01-01    0.144196
2016-01-02    0.256549
2016-01-03    0.090722
2016-01-04    0.569605
2016-01-05    0.125042
2016-01-06    0.028161
Freq: D, Name: A, dtype: float64
>>> df[1:3]                 
                   A         B         C         D
2016-01-02  0.256549  0.141528  0.045407  0.925604
2016-01-03  0.090722  0.565770  0.077992  0.370326

[2 rows x 4 columns]
>>> df['20160102':'20160104']       #   按标签做行切片,注意标签切片包含右端点(与上面 df[1:3] 的位置切片不同)
                   A         B         C         D
2016-01-02  0.256549  0.141528  0.045407  0.925604
2016-01-03  0.090722  0.565770  0.077992  0.370326
2016-01-04  0.569605  0.011851  0.844495  0.059195

[3 rows x 4 columns]
>>> df.loc['20160101':'20160104',['A','B']]
                   A         B
2016-01-01  0.144196  0.647273
2016-01-02  0.256549  0.141528
2016-01-03  0.090722  0.565770
2016-01-04  0.569605  0.011851

[4 rows x 2 columns]
>>> df.at[dates[0],'A']
0.14419598649708365
>>> df.dtypes
A    float64
B    float64
C    float64
D    float64
dtype: object
>>> df.head()           #   查看前5行,默认的就是 n=5
                   A         B         C         D
2016-01-01  0.144196  0.647273  0.085791  0.754298
2016-01-02  0.256549  0.141528  0.045407  0.925604
2016-01-03  0.090722  0.565770  0.077992  0.370326
2016-01-04  0.569605  0.011851  0.844495  0.059195
2016-01-05  0.125042  0.436778  0.911231  0.133522

[5 rows x 4 columns]
>>> df.head(3)      #   查看前3行
                   A         B         C         D
2016-01-01  0.144196  0.647273  0.085791  0.754298
2016-01-02  0.256549  0.141528  0.045407  0.925604
2016-01-03  0.090722  0.565770  0.077992  0.370326

[3 rows x 4 columns]
>>> df.tail()           #   查看后5行
                   A         B         C         D
2016-01-02  0.256549  0.141528  0.045407  0.925604
2016-01-03  0.090722  0.565770  0.077992  0.370326
2016-01-04  0.569605  0.011851  0.844495  0.059195
2016-01-05  0.125042  0.436778  0.911231  0.133522
2016-01-06  0.028161  0.169375  0.637185  0.271154

[5 rows x 4 columns]
>>> df.tail(3)
                   A         B         C         D
2016-01-04  0.569605  0.011851  0.844495  0.059195
2016-01-05  0.125042  0.436778  0.911231  0.133522
2016-01-06  0.028161  0.169375  0.637185  0.271154

[3 rows x 4 columns]
>>> df.index
<class 'pandas.tseries.index.DatetimeIndex'>
[2016-01-01, ..., 2016-01-06]
Length: 6, Freq: D, Timezone: None
>>> df.columns
Index([u'A', u'B', u'C', u'D'], dtype='object')
>>> df.values
array([[ 0.14419599,  0.6472727 ,  0.08579066,  0.75429817],
       [ 0.25654929,  0.1415283 ,  0.04540702,  0.92560391],
       [ 0.09072181,  0.56576979,  0.07799159,  0.37032625],
       [ 0.56960508,  0.01185102,  0.84449454,  0.05919541],
       [ 0.1250417 ,  0.43677787,  0.91123057,  0.13352195],
       [ 0.02816118,  0.16937545,  0.63718452,  0.27115381]])
>>> df.describe()
              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean   0.202379  0.328763  0.433683  0.419017
std    0.194923  0.256976  0.409032  0.347637
min    0.028161  0.011851  0.045407  0.059195
25%    0.099302  0.148490  0.079941  0.167930
50%    0.134619  0.303077  0.361488  0.320740
75%    0.228461  0.533522  0.792667  0.658305
max    0.569605  0.647273  0.911231  0.925604

[8 rows x 4 columns]
>>> df.T
   2016-01-01  2016-01-02  2016-01-03  2016-01-04  2016-01-05  2016-01-06
A    0.144196    0.256549    0.090722    0.569605    0.125042    0.028161
B    0.647273    0.141528    0.565770    0.011851    0.436778    0.169375
C    0.085791    0.045407    0.077992    0.844495    0.911231    0.637185
D    0.754298    0.925604    0.370326    0.059195    0.133522    0.271154

[4 rows x 6 columns]
>>> df.sort(columns='C')        #   按照C列进行排序(旧版 pandas 写法;新版 pandas 中 df.sort 已被移除,应使用 df.sort_values(by='C'))
                   A         B         C         D
2016-01-02  0.256549  0.141528  0.045407  0.925604
2016-01-03  0.090722  0.565770  0.077992  0.370326
2016-01-01  0.144196  0.647273  0.085791  0.754298
2016-01-06  0.028161  0.169375  0.637185  0.271154
2016-01-04  0.569605  0.011851  0.844495  0.059195
2016-01-05  0.125042  0.436778  0.911231  0.133522

[6 rows x 4 columns]
>>> df = pd.DataFrame(np.random.randn(6,4),columns=list('abcd'))
>>> df
          a         b         c         d
0  1.198543 -1.868721  1.745448 -1.036422
1 -0.529202 -0.012269  0.969534  0.023551
2  1.630468 -1.562662  0.396634  0.483880
3  0.113079 -0.791460  1.127796  1.232607
4  0.012088  0.848480 -1.202130 -0.066336
5 -0.937329 -0.798681 -1.716528 -0.528337

[6 rows x 4 columns]
>>> df[df.d > 0]        #   选出d列大于0的所有行
          a         b         c         d
1 -0.529202 -0.012269  0.969534  0.023551
2  1.630468 -1.562662  0.396634  0.483880
3  0.113079 -0.791460  1.127796  1.232607

[3 rows x 4 columns]
>>> df[df.d > 0] & df[df.c < 0]     #   错误写法:直接对两个筛选结果做 & 运算得不到交集(结果如下全为 NaN);要同时满足两个条件应写成 df[(df.d > 0) & (df.c < 0)]
    a   b   c   d
1 NaN NaN NaN NaN
2 NaN NaN NaN NaN
3 NaN NaN NaN NaN
4 NaN NaN NaN NaN
5 NaN NaN NaN NaN

[5 rows x 4 columns]
>>> df[df.c < 0][['a','b']]
          a         b
4  0.012088  0.848480
5 -0.937329 -0.798681

[2 rows x 2 columns]

2、pandas 库之字符串提取与操作

>>> import pandas as pd
>>> s = pd.Series(list('ABCDEF'))       #   产生的是一个字符串的序列(n*1)
>>> s
0    A
1    B
2    C
3    D
4    E
5    F
dtype: object
>>> s.str.lower()       #   将字符串的字母小写
0    a
1    b
2    c
3    d
4    e
5    f
dtype: object
>>> s.str.upper()       #   将字符串的字母大写
0    A
1    B
2    C
3    D
4    E
5    F
dtype: object
>>> s.str.replace('A','B')          #   替换,将s序列中所有的字母A替换成B
0    B
1    B
2    C
3    D
4    E
5    F
dtype: object
>>> s = pd.Series(['a1','a2','a3','a4'])
>>> s
0    a1
1    a2
2    a3
3    a4
dtype: object
#   提取字符串
>>> s.str.extract('[ab](\d)')       #   正则表达式
0    1                      #   [ab] 表示匹配内容是a,b字符中其中一个
1    2                      #   (\d)::\d表示匹配一个数字字符(0-9)
2    3                      #         :( ) 表示返回的内容
3    4
dtype: object
>>> s.str.extract('([abc])(\d)')        #   返回的是两列
   0  1
0  a  1
1  a  2
2  a  3
3  a  4

[4 rows x 2 columns]
>>> s.str.extract('([abc]\d)')
0    a1
1    a2
2    a3
3    a4
dtype: object
>>> s.str.extract('(?P<str>[abc])(?P<digit>(\d))')      #   (?P<name>...):命名分组,匹配结果以 name 作为列名
  str digit  2      #   <digit> 分组内部的 (\d) 是未命名的嵌套分组,因此多出编号为 2 的一列
0   a     1  1
1   a     2  2
2   a     3  3
3   a     4  4

[4 rows x 3 columns]
>>> s = pd.Series(['a','B','c','d'])
>>> pattern = r'[a-z]'
>>> s.str.contains(pattern)
0     True
1    False
2     True
3     True
dtype: bool
>>> pattern = r'A-Za-z]'        #   注意:此处漏写了左方括号 '[',整个模式按字面量匹配,所以下面全为 False;正确写法 r'[A-Za-z]' 会全部返回 True
>>> s.str.contains(pattern)
0    False
1    False
2    False
3    False
dtype: bool
>>> s = pd.Series(['ab','Ba','c','d'])
>>> s.str.contains('^a')            #   ^a 表示匹配以a开头的字符串
0     True  
1    False
2    False
3    False
dtype: bool
>>> s.str.startswith('a')
0     True
1    False
2    False
3    False
dtype: bool
>>> s.str.endswith('a')
0    False
1     True
2    False
3    False
dtype: bool
>>> s.str.contains('a$')        #   a$ 表示匹配以a 结尾
0    False
1     True
2    False
3    False
dtype: bool

3、pandas 库之散点图

#   如何读取Excel中的数据
>>> df = pd.read_excel(r'D:\Doc_Py\test.xls','Sheet1',index_col=0)
        #   三个参数:①文件路径;②工作表名称;③index_col=0 表示把第一列(id)作为行索引,而不是另外生成默认的 0,1,2... 索引
>>> df
    child  parent
id               
1      10      30
2      11      31
3      12      32
4      13      33
5      14      34
6      15      35
7      16      36
8      17      37
9      18      38
10     19      39
11     20      40

[11 rows x 2 columns]
>>> df1 = pd.read_excel(r'D:\Doc_Py\test.xls','Sheet1')
>>> df1
    id  child  parent
0    1     10      30
1    2     11      31
2    3     12      32
3    4     13      33
4    5     14      34
5    6     15      35
6    7     16      36
7    8     17      37
8    9     18      38
9   10     19      39
10  11     20      40

[11 rows x 3 columns]
>>> pl = df.plot(kind='scatter',x='child',y='parent').get_figure()
>>> pl.savefig(r'D:\1.jpg')
import pandas as pd
import numpy as np

def func(ser,factor):
    """Return a copy of ``ser`` with uniform random jitter added to each value.

    The jitter half-width is ``a = factor * (ser.max() - ser.min()) / 50``;
    every element receives an independent draw from ``[-a, a)``.

    Args:
        ser: numeric pandas Series to jitter.
        factor: multiplier for the jitter amplitude (converted with ``float``).

    Returns:
        A new Series carrying the same index as ``ser``; ``ser`` itself is
        not modified.
    """
    if len(ser) == 0:
        # Nothing to jitter; this also avoids drawing with a NaN range.
        return ser.copy()
    spread = float(ser.max()) - float(ser.min())
    half_width = float(factor) * spread / 50
    # Draw all offsets in one vectorized call instead of one Python-level
    # lambda invocation per element.
    noise = np.random.uniform(-half_width, half_width, size=len(ser))
    return ser + noise
    
# Load the sheet, jitter both columns with func, and save a scatter plot of
# the jittered data.
df = pd.read_excel(r'D:\Doc_Py\test.xls', 'Sheet1', index_col=0)
for column in ('child', 'parent'):
    # Same order as before: 'child' is jittered first, then 'parent'.
    df[column] = func(df[column], 1)

scatter_axes = df.plot(kind='scatter', x='child', y='parent')
# NOTE: `plt` here is the Figure returned by get_figure(), not the
# matplotlib.pyplot module.
plt = scatter_axes.get_figure()
plt.savefig(r'D:\2.jpg')
from statsmodels.formula.api import ols     
#   statsmodels.formula.api是统计建模的工具包,ols 是建立一个线性回归方程
#   使用statsmodels时要求有patsy这个库
import matplotlib.pyplot as  plt    
import numpy as np
import pandas as pd

# Fit child ~ parent with ordinary least squares and draw the observations
# together with the fitted line.
path = r'D:\Doc_Py\test.xls'
df = pd.read_excel(path, 'Sheet1', index_col=0)
l = ols('child~parent', df).fit()  # `parent` is the explanatory variable

parent_values = df['parent']
# 'ob': blue circle markers for the raw (parent, child) points.
plt.plot(parent_values, df['child'], 'ob')
# 'r': red line of width 2 through the model's fitted values.
plt.plot(parent_values, l.fittedvalues, 'r', linewidth=2)
plt.show()

4、pandas 库之直方图

import numpy as np
import pandas as pd

# Plot a 10x4 frame of random values as a grouped bar chart and save it.
df = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
bar_axes = df.plot(kind='bar')  # kind='bar' selects a vertical bar chart
# NOTE: `plt` here is the Figure returned by get_figure(), not pyplot.
plt = bar_axes.get_figure()
plt.savefig(r'D:\4.jpg')
import numpy as np
import pandas as pd

# Same bar chart as before, but with the pandas 'mpl_style' option set first.
df = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
# NOTE(review): the 'mpl_style' option belongs to older pandas releases;
# confirm it exists in the pandas version in use.
pd.set_option('mpl_style', 'default')
styled_axes = df.plot(kind='bar')
plt = styled_axes.get_figure()
plt.savefig(r'D:\4-2.jpg')
# Stacked variant of the bar chart.
df = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
pd.set_option('mpl_style', 'default')
stacked_axes = df.plot(kind='bar', stacked=True)  # stacked=True: stack the columns
plt = stacked_axes.get_figure()
# Writes to the same path as the first bar-chart snippet, replacing that image.
plt.savefig(r'D:\4.jpg')
# Horizontal stacked bar chart.
df = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
pd.set_option('mpl_style', 'default')
horizontal_axes = df.plot(kind='barh', stacked=True)  # 'barh': horizontal bars
plt = horizontal_axes.get_figure()
plt.savefig(r'D:\4.jpg')
# Histogram of column 'a' from a 100x4 frame of random values.
df = pd.DataFrame(np.random.rand(100, 4), columns=list('abcd'))
hist_axes = df['a'].hist()
d = hist_axes.get_figure()
d.savefig(r'D:\4.jpg')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Draw a box plot of two random columns on an explicitly created Axes and
# save the owning Figure.
# NOTE(review): the 'mpl_style' option belongs to older pandas releases;
# confirm it exists in the pandas version in use.
pd.set_option('mpl_style','default')
fig,ax = plt.subplots()
df = pd.DataFrame(np.random.rand(100,2),columns=list('ab'))
df.boxplot(ax=ax)
# Save the figure that owns `ax`. The original saved `d`, which is the
# histogram figure from an earlier snippet (or undefined when this snippet is
# run on its own), so the box plot itself was never written.
# Original author's note: show() cannot be used here.
fig.savefig(r'D:\4.jpg')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas.util.testing as tm

# Box plots of columns 'a' and 'b', grouped by a random categorical column 'x'.
# NOTE(review): the 'mpl_style' option belongs to older pandas releases;
# confirm it exists in the pandas version in use.
pd.set_option('mpl_style','default')
fig,ax = plt.subplots()
df = pd.DataFrame(np.random.rand(100,2),columns=list('ab'))
# np.random.choice replaces the test-internal helper tm.choice from
# pandas.util.testing; it draws 100 labels from {'M', 'F'} with replacement.
df['x'] = np.random.choice(['M','F'],size=100)
df.boxplot(ax=ax,by='x')
# Save through the Figure created above rather than pyplot's implicit
# "current figure".
fig.savefig(r'D:\5.jpg')

你可能感兴趣的:(9-Python 科学计算_pandas 篇)