课程概要:
1、pandas 库之数据筛选及过滤
2、pandas 库之字符串提取与操作
3、pandas库之散点图
4、pandas 库之直方图
1、pandas 库之数据筛选及过滤
>>> import numpy as np
>>> import pandas as pd
>>> dates = pd.date_range('20160101',periods=6) # 从 2016-01-01 起按天生成 6 个连续日期
>>> dates
[2016-01-01, ..., 2016-01-06]
Length: 6, Freq: D, Timezone: None
>>> df = pd.DataFrame(np.random.rand(6,4),index=dates,columns=list('ABCD'))
# DataFrame 不可以小写
# np.random.rand(6,4),随机生成一个6*4的矩阵,其元素介于0-1之间
# index=dates 索引按照dates的日期元素作为索引
# columns=list('ABCD'),列名为A,B,C,D
>>> df
A B C D
2016-01-01 0.144196 0.647273 0.085791 0.754298
2016-01-02 0.256549 0.141528 0.045407 0.925604
2016-01-03 0.090722 0.565770 0.077992 0.370326
2016-01-04 0.569605 0.011851 0.844495 0.059195
2016-01-05 0.125042 0.436778 0.911231 0.133522
2016-01-06 0.028161 0.169375 0.637185 0.271154
[6 rows x 4 columns]
>>> df2 = pd.DataFrame({'A':np.random.rand(6)})
>>> df2
A
0 0.942403
1 0.849205
2 0.745943
3 0.741686
4 0.916450
5 0.480425
[6 rows x 1 columns]
>>> df3 = pd.DataFrame({'A':np.random.rand(6)},index=dates)
>>> df3
A
2016-01-01 0.954572
2016-01-02 0.673110
2016-01-03 0.991391
2016-01-04 0.662053
2016-01-05 0.191929
2016-01-06 0.896591
[6 rows x 1 columns]
>>> df['A']
2016-01-01 0.144196
2016-01-02 0.256549
2016-01-03 0.090722
2016-01-04 0.569605
2016-01-05 0.125042
2016-01-06 0.028161
Freq: D, Name: A, dtype: float64
>>> df[1:3]
A B C D
2016-01-02 0.256549 0.141528 0.045407 0.925604
2016-01-03 0.090722 0.565770 0.077992 0.370326
[2 rows x 4 columns]
>>> df['20160102':'20160104'] # 按标签做行切片,注意标签切片包含右端点(不同于上面 df[1:3] 的位置切片)
A B C D
2016-01-02 0.256549 0.141528 0.045407 0.925604
2016-01-03 0.090722 0.565770 0.077992 0.370326
2016-01-04 0.569605 0.011851 0.844495 0.059195
[3 rows x 4 columns]
>>> df.loc['20160101':'20160104',['A','B']]
A B
2016-01-01 0.144196 0.647273
2016-01-02 0.256549 0.141528
2016-01-03 0.090722 0.565770
2016-01-04 0.569605 0.011851
[4 rows x 2 columns]
>>> df.at[dates[0],'A']
0.14419598649708365
>>> df.dtypes
A float64
B float64
C float64
D float64
dtype: object
>>> df.head() # 查看前5行,默认的就是 n=5
A B C D
2016-01-01 0.144196 0.647273 0.085791 0.754298
2016-01-02 0.256549 0.141528 0.045407 0.925604
2016-01-03 0.090722 0.565770 0.077992 0.370326
2016-01-04 0.569605 0.011851 0.844495 0.059195
2016-01-05 0.125042 0.436778 0.911231 0.133522
[5 rows x 4 columns]
>>> df.head(3) # 查看前3行
A B C D
2016-01-01 0.144196 0.647273 0.085791 0.754298
2016-01-02 0.256549 0.141528 0.045407 0.925604
2016-01-03 0.090722 0.565770 0.077992 0.370326
[3 rows x 4 columns]
>>> df.tail() # 查看后5行
A B C D
2016-01-02 0.256549 0.141528 0.045407 0.925604
2016-01-03 0.090722 0.565770 0.077992 0.370326
2016-01-04 0.569605 0.011851 0.844495 0.059195
2016-01-05 0.125042 0.436778 0.911231 0.133522
2016-01-06 0.028161 0.169375 0.637185 0.271154
[5 rows x 4 columns]
>>> df.tail(3)
A B C D
2016-01-04 0.569605 0.011851 0.844495 0.059195
2016-01-05 0.125042 0.436778 0.911231 0.133522
2016-01-06 0.028161 0.169375 0.637185 0.271154
[3 rows x 4 columns]
>>> df.index
[2016-01-01, ..., 2016-01-06]
Length: 6, Freq: D, Timezone: None
>>> df.columns
Index([u'A', u'B', u'C', u'D'], dtype='object')
>>> df.values
array([[ 0.14419599, 0.6472727 , 0.08579066, 0.75429817],
[ 0.25654929, 0.1415283 , 0.04540702, 0.92560391],
[ 0.09072181, 0.56576979, 0.07799159, 0.37032625],
[ 0.56960508, 0.01185102, 0.84449454, 0.05919541],
[ 0.1250417 , 0.43677787, 0.91123057, 0.13352195],
[ 0.02816118, 0.16937545, 0.63718452, 0.27115381]])
>>> df.describe()
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean 0.202379 0.328763 0.433683 0.419017
std 0.194923 0.256976 0.409032 0.347637
min 0.028161 0.011851 0.045407 0.059195
25% 0.099302 0.148490 0.079941 0.167930
50% 0.134619 0.303077 0.361488 0.320740
75% 0.228461 0.533522 0.792667 0.658305
max 0.569605 0.647273 0.911231 0.925604
[8 rows x 4 columns]
>>> df.T
2016-01-01 2016-01-02 2016-01-03 2016-01-04 2016-01-05 2016-01-06
A 0.144196 0.256549 0.090722 0.569605 0.125042 0.028161
B 0.647273 0.141528 0.565770 0.011851 0.436778 0.169375
C 0.085791 0.045407 0.077992 0.844495 0.911231 0.637185
D 0.754298 0.925604 0.370326 0.059195 0.133522 0.271154
[4 rows x 6 columns]
>>> df.sort(columns='C') # 按照C列进行排序;注意:新版 pandas 已移除 df.sort,应改用 df.sort_values(by='C')
A B C D
2016-01-02 0.256549 0.141528 0.045407 0.925604
2016-01-03 0.090722 0.565770 0.077992 0.370326
2016-01-01 0.144196 0.647273 0.085791 0.754298
2016-01-06 0.028161 0.169375 0.637185 0.271154
2016-01-04 0.569605 0.011851 0.844495 0.059195
2016-01-05 0.125042 0.436778 0.911231 0.133522
[6 rows x 4 columns]
>>> df = pd.DataFrame(np.random.randn(6,4),columns=list('abcd'))
>>> df
a b c d
0 1.198543 -1.868721 1.745448 -1.036422
1 -0.529202 -0.012269 0.969534 0.023551
2 1.630468 -1.562662 0.396634 0.483880
3 0.113079 -0.791460 1.127796 1.232607
4 0.012088 0.848480 -1.202130 -0.066336
5 -0.937329 -0.798681 -1.716528 -0.528337
[6 rows x 4 columns]
>>> df[df.d > 0] # 选出 d 列大于 0 的所有行
a b c d
1 -0.529202 -0.012269 0.969534 0.023551
2 1.630468 -1.562662 0.396634 0.483880
3 0.113079 -0.791460 1.127796 1.232607
[3 rows x 4 columns]
>>> df[df.d > 0] & df[df.c < 0] # 错误写法:对两个已筛选的 DataFrame 直接做 &,得不到同时满足两个条件的行(这里输出全为 NaN);正确写法是 df[(df.d > 0) & (df.c < 0)]
a b c d
1 NaN NaN NaN NaN
2 NaN NaN NaN NaN
3 NaN NaN NaN NaN
4 NaN NaN NaN NaN
5 NaN NaN NaN NaN
[5 rows x 4 columns]
>>> df[df.c < 0][['a','b']]
a b
4 0.012088 0.848480
5 -0.937329 -0.798681
[2 rows x 2 columns]
2、pandas 库之字符串提取与操作
>>> import pandas as pd
>>> s = pd.Series(list('ABCDEF')) # 产生的是一个字符串的序列(n*1)
>>> s
0 A
1 B
2 C
3 D
4 E
5 F
dtype: object
>>> s.str.lower() # 将字符串的字母小写
0 a
1 b
2 c
3 d
4 e
5 f
dtype: object
>>> s.str.upper() # 将字符串的字母大写
0 A
1 B
2 C
3 D
4 E
5 F
dtype: object
>>> s.str.replace('A','B') # 替换,将s序列中所有的字母A替换成B
0 B
1 B
2 C
3 D
4 E
5 F
dtype: object
>>> s = pd.Series(['a1','a2','a3','a4'])
>>> s
0 a1
1 a2
2 a3
3 a4
dtype: object
# 提取字符串
>>> s.str.extract('[ab](\d)') # 正则表达式
0 1 # [ab] 表示匹配内容是a,b字符中其中一个
1 2 # \d 表示匹配一个数字字符(0-9)
2 3 # ( ) 表示捕获分组,即 extract 返回的内容
3 4
dtype: object
>>> s.str.extract('([abc])(\d)') # 返回的是两列
0 1
0 a 1
1 a 2
2 a 3
3 a 4
[4 rows x 2 columns]
>>> s.str.extract('([abc]\d)')
0 a1
1 a2
2 a3
3 a4
dtype: object
>>> s.str.extract('(?P<str>[abc])(?P<digit>(\d))') # (?P<name>...) 是命名分组,匹配结果以 name 作为列名
str digit 2 # 内层的 (\d) 是未命名分组,因此多出第 3 列(列名为 2)
0 a 1 1
1 a 2 2
2 a 3 3
3 a 4 4
[4 rows x 3 columns]
>>> s = pd.Series(['a','B','c','d'])
>>> pattern = r'[a-z]'
>>> s.str.contains(pattern)
0 True
1 False
2 True
3 True
dtype: bool
>>> pattern = r'[A-Za-z]'
>>> s.str.contains(pattern)
0 True
1 True
2 True
3 True
dtype: bool
>>> s = pd.Series(['ab','Ba','c','d'])
>>> s.str.contains('^a') # ^a 表示匹配以a开头的字符串
0 True
1 False
2 False
3 False
dtype: bool
>>> s.str.startswith('a')
0 True
1 False
2 False
3 False
dtype: bool
>>> s.str.endswith('a')
0 False
1 True
2 False
3 False
dtype: bool
>>> s.str.contains('a$') # a$ 表示匹配以a 结尾
0 False
1 True
2 False
3 False
dtype: bool
3、pandas 库之散点图
# 如何读取Excel中的数据
>>> df = pd.read_excel(r'D:\Doc_Py\test.xls','Sheet1',index_col=0)
# 三个参数:①文件路径;②工作表(sheet)名称;③index_col=0 表示用第 0 列(这里是 id)作为行索引,而不是另外生成默认的 0,1,2… 索引
>>> df
child parent
id
1 10 30
2 11 31
3 12 32
4 13 33
5 14 34
6 15 35
7 16 36
8 17 37
9 18 38
10 19 39
11 20 40
[11 rows x 2 columns]
>>> df1 = pd.read_excel(r'D:\Doc_Py\test.xls','Sheet1')
>>> df1
id child parent
0 1 10 30
1 2 11 31
2 3 12 32
3 4 13 33
4 5 14 34
5 6 15 35
6 7 16 36
7 8 17 37
8 9 18 38
9 10 19 39
10 11 20 40
[11 rows x 3 columns]
>>> pl = df.plot(kind='scatter',x='child',y='parent').get_figure()
>>> pl.savefig(r'D:\1.jpg')
import pandas as pd
import numpy as np
def func(ser, factor):
    """Return a copy of ``ser`` with uniform random jitter added to each value.

    The jitter half-width is ``factor * (ser.max() - ser.min()) / 50``, so
    ``factor=1`` moves every point by at most 2% of the data range.

    Args:
        ser: numeric pandas Series to jitter.
        factor: multiplier for the jitter half-width; ``0`` adds no jitter.

    Returns:
        A new Series with the same index as ``ser``; ``ser`` is not modified.
    """
    if ser.empty:
        # max()/min() of an empty Series are NaN, which cannot bound a draw.
        return ser.copy()
    spread = float(ser.max()) - float(ser.min())
    half_width = float(factor) * spread / 50
    # One vectorized draw between -half_width and half_width per element,
    # instead of calling the RNG from a Python-level lambda for every row.
    return ser + np.random.uniform(-half_width, half_width, size=len(ser))
# Load the sample sheet, jitter both columns so overlapping points separate,
# then save the scatter plot.
df = pd.read_excel(r'D:\Doc_Py\test.xls', 'Sheet1', index_col=0)
for _column in ('child', 'parent'):
    df[_column] = func(df[_column], 1)
_scatter_axes = df.plot(kind='scatter', x='child', y='parent')
plt = _scatter_axes.get_figure()  # NOTE: `plt` is a Figure here, not pyplot
plt.savefig(r'D:\2.jpg')
from statsmodels.formula.api import ols
# statsmodels.formula.api是统计建模的工具包,ols 是建立一个线性回归方程
# 使用statsmodels时要求有patsy这个库
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Regress child on parent with ordinary least squares, then overlay the
# fitted line on a scatter plot of the raw data.
path = r'D:\Doc_Py\test.xls'
df = pd.read_excel(path, 'Sheet1', index_col=0)
l = ols('child~parent', df).fit()  # parent is the explanatory variable
_parent = df['parent']
# 'ob': blue circle markers for the observed (parent, child) pairs.
plt.plot(_parent, df['child'], 'ob')
# 'r': red line through the model's fitted values, drawn after the scatter.
plt.plot(_parent, l.fittedvalues, 'r', linewidth=2)
plt.show()
4、pandas 库之直方图
import numpy as np
import pandas as pd
# Grouped bar chart of 10 random rows x 4 columns, saved to disk.
df = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
_bar_axes = df.plot(kind='bar')  # kind='bar' selects a vertical bar chart
plt = _bar_axes.get_figure()  # NOTE: `plt` is a Figure here, not pyplot
plt.savefig(r'D:\4.jpg')
import numpy as np
import pandas as pd
# Bar chart drawn with the ggplot look that the old pandas 'mpl_style'
# option used to switch on.
import matplotlib.style

df = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
# pd.set_option('mpl_style', 'default') was deprecated in pandas 0.18 and
# later removed; matplotlib's own style sheets are the replacement.
matplotlib.style.use('ggplot')
plt = df.plot(kind='bar').get_figure()  # NOTE: `plt` is a Figure here, not pyplot
plt.savefig(r'D:\4-2.jpg')
# Stacked bar chart of 10 random rows x 4 columns.
import matplotlib.style

df = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
# pd.set_option('mpl_style', 'default') was deprecated in pandas 0.18 and
# later removed; matplotlib's own style sheets are the replacement.
matplotlib.style.use('ggplot')
# stacked=True stacks the four columns within each bar.
plt = df.plot(kind='bar', stacked=True).get_figure()  # NOTE: `plt` is a Figure here
plt.savefig(r'D:\4.jpg')  # overwrites the image written by the earlier snippet
# Horizontal stacked bar chart of 10 random rows x 4 columns.
import matplotlib.style

df = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
# pd.set_option('mpl_style', 'default') was deprecated in pandas 0.18 and
# later removed; matplotlib's own style sheets are the replacement.
matplotlib.style.use('ggplot')
# kind='barh' draws the bars horizontally.
plt = df.plot(kind='barh', stacked=True).get_figure()  # NOTE: `plt` is a Figure here
plt.savefig(r'D:\4.jpg')
# Histogram of column 'a' from 100 random rows, saved to disk.
df = pd.DataFrame(np.random.rand(100, 4), columns=list('abcd'))
_hist_axes = df['a'].hist()
d = _hist_axes.get_figure()
d.savefig(r'D:\4.jpg')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Box plot of two random columns drawn on an explicitly created Axes.
# pd.set_option('mpl_style', 'default') was deprecated in pandas 0.18 and
# later removed; matplotlib's own style sheets are the replacement.
plt.style.use('ggplot')
fig, ax = plt.subplots()
df = pd.DataFrame(np.random.rand(100, 2), columns=list('ab'))
df.boxplot(ax=ax)
# Save the figure that owns `ax`. The original called d.savefig(), but `d` is
# the histogram figure from the earlier snippet (or undefined when this
# snippet runs on its own), so the box plot itself was never written.
fig.savefig(r'D:\4.jpg')  # original author's note: show() cannot be used here
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas.util.testing as tm
# Box plots of columns 'a' and 'b', grouped by a random M/F label column.
# pd.set_option('mpl_style', 'default') was deprecated in pandas 0.18 and
# later removed; matplotlib's own style sheets are the replacement.
plt.style.use('ggplot')
fig, ax = plt.subplots()
df = pd.DataFrame(np.random.rand(100, 2), columns=list('ab'))
# np.random.choice replaces `tm.choice`, which current pandas no longer
# provides; this snippet therefore no longer needs the `tm` import.
df['x'] = np.random.choice(['M', 'F'], size=100)
df.boxplot(ax=ax, by='x')
plt.savefig(r'D:\5.jpg')