注意: 教程内容来自 https://nbviewer.jupyter.org/github/twiecki/financial-analysis-python-tutorial/tree/master/ 这不是完整的系统的pandas教程,此外源教程测试demo老旧,新版本pandas可能无法兼容源程序,本教程是在原教程基础上进行的修正. 使用Spyder IDE进行测试
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 29 17:58:13 2020
@author: CHERN
"""
import datetime
import pandas as pd
from pandas import Series, DataFrame
print(pd.__version__)
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rc('figure', figsize=(8, 7))
print(mpl.__version__)
测试程序使用的pandas版本为1.0.1,matplotlib版本为3.1.3
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 29 17:58:13 2020
@author: CHERN
"""
import datetime
import pandas as pd
from pandas import Series, DataFrame
# print(pd.__version__)
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rc('figure', figsize=(8, 7))
# print(mpl.__version__)
# Build a labelled one-dimensional Series from plain Python lists.
labels = list('abcde')
values = [1, 2, 3, 4, 5]
s = Series(values, index=labels)
print(s)

# Membership test against the Series, followed by a lookup by label.
print(r"'b' in s?")
print('b' in s)
print(s['b'])

# Round trip: Series -> dict -> Series.
print("Series.to_dict() can convert Series to dict")
mapping = s.to_dict()
print(mapping)
print("Series(dict) can convert dict to Series")
restored = Series(mapping)
print(restored)
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-eKV3iS94-1586606244035)(assets/2020-03-29-18-12-01.png)]
用法总结
- `Series` 对象的构造函数可以传递 `dict` 字典类型变量,也可以使用 python 中的 `list` 列表.
- `Series.to_dict()` 方法能够把 `Series` 对象转化为 python 内置的 `dict` 对象.
- 通过 `'b' in s` 可以判断键 `'b'` 是否存在于 `Series` 对象的 `index` 中,返回值为 True/False.
- `s['b']` 能够获取 Series 中标签 `'b'` 对应的数据.
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 29 17:58:13 2020
@author: CHERN
"""
import datetime
import pandas as pd
from pandas import Series, DataFrame
# print(pd.__version__)
import numpy as np
from pandas_datareader import data, wb # 需要安装 pip install pandas_datareader
import matplotlib.pyplot as plt
import matplotlib
import matplotlib as mpl
mpl.rc('figure', figsize=(8, 7))
# print(mpl.__version__)
# Date range of the price history to download
start = datetime.datetime(2010, 1, 1)
end = datetime.datetime(2016,5,20)
sh = data.DataReader("000001.SS", 'yahoo', start, end)
print(sh.head(3)) # print the first three rows
# Save the pandas DataFrame fetched from Yahoo to a .csv file
sh.to_csv('sh.csv')
注意:
- 版本有更新,之前的 `pd.io.data.get_data_yahoo()` 方法已经被弃用,当前使用 `DataReader` 方法进行实验操作.
- 国内访问数据可能会比较慢,需要开启VPN. 为简单起见(代码直接复用,而不需要其他文件),下面的案例将按照从网上获取数据进行
用法小结:
- `from pandas_datareader import data` 之后,`data.DataReader()` 方法能够从网络获取金融数据(股票历史数据).
- `DataFrame` 对象如果行数太多,可以使用 `.head()` 方法输出其前5行,`.head(n)` 输出前 `n` 行.
注意:这里 Date 为对象 `sh` 的索引
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 29 17:58:13 2020
@author: CHERN
"""
import datetime
import pandas as pd
from pandas import Series, DataFrame
# print(pd.__version__)
import numpy as np
from pandas_datareader import data, wb # 需要安装 pip install pandas_datareader
import matplotlib.pyplot as plt
import matplotlib
import matplotlib as mpl
mpl.rc('figure', figsize=(8, 7))
# print(mpl.__version__)
# Load the saved CSV; the 'Date' column becomes the index and is parsed as dates
df = pd.read_csv('sh.csv', index_col='Date', parse_dates=True)
print(df)
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 29 17:58:13 2020
@author: CHERN
"""
import datetime
import pandas as pd
from pandas import Series, DataFrame
# print(pd.__version__)
import numpy as np
from pandas_datareader import data, wb # 需要安装 pip install pandas_datareader
import matplotlib.pyplot as plt
import matplotlib
import matplotlib as mpl
mpl.rc('figure', figsize=(8, 7))
# print(mpl.__version__)
# Load the saved CSV; the 'Date' column becomes the index and is parsed as dates
df = pd.read_csv('sh.csv', index_col='Date', parse_dates=True)
# print(df)
# Last 10 entries of the 'Close' column; type(ts) = pandas.core.series.Series
ts = df['Close'][-10:]
print(ts)
print(type(ts))
在IPython中继续执行:
date = ts.index[5]
date
输出:
Timestamp('2016-05-16 00:00:00')
输入:
ts[date]
输出:
2850.862060546875
输入:
ts[5]
输出:
2850.862060546875
输入:
df[['Open', 'Close']].head()
输出:
Open Close
Date
2010-01-04 3289.750000 3243.760010
2010-01-05 3254.468018 3282.178955
2010-01-06 3277.517090 3254.215088
2010-01-07 3253.990967 3192.775879
2010-01-08 3177.259033 3195.997070
输入:
df['diff'] = df.Open - df.Close
df.head()
输出:
High Low ... Adj Close diff
Date ...
2010-01-04 3295.279053 3243.319092 ... 3243.760010 45.989990
2010-01-05 3290.511963 3221.461914 ... 3282.178955 -27.710938
2010-01-06 3295.867920 3253.043945 ... 3254.215088 23.302002
2010-01-07 3268.819092 3176.707031 ... 3192.775879 61.215088
2010-01-08 3198.919922 3149.017090 ... 3195.997070 -18.738037
[5 rows x 7 columns]
输入:
del df['diff']
df.head()
输出:
High Low ... Volume Adj Close
Date ...
2010-01-04 3295.279053 3243.319092 ... 109400 3243.760010
2010-01-05 3290.511963 3221.461914 ... 126200 3282.178955
2010-01-06 3295.867920 3253.043945 ... 123600 3254.215088
2010-01-07 3268.819092 3176.707031 ... 128600 3192.775879
2010-01-08 3198.919922 3149.017090 ... 98400 3195.997070
[5 rows x 6 columns]
用法小结:
- 指令 `del df['diff']` 可以删除数据 `df` 中的 `'diff'` 列
close_px = df['Adj Close']
mavg = pd.rolling_mean(close_px, 40)
mavg[-10:]
输出:
File "", line 2, in
mavg = pd.rolling_mean(close_px, 40)
File "C:\Python\lib\site-packages\pandas\__init__.py", line 262, in __getattr__
raise AttributeError(f"module 'pandas' has no attribute '{name}'")
AttributeError: module 'pandas' has no attribute 'rolling_mean'
pandas版本更新,`pd.rolling_mean()` 方法已被弃用并移除,应改用 `Series.rolling(window).mean()`:
close_px = df['Adj Close']
mavg = close_px.rolling(40).mean()
mavg
输出:
Date
2010-01-04 NaN
2010-01-05 NaN
2010-01-06 NaN
2010-01-07 NaN
2010-01-08 NaN
2016-05-16 2970.439978
2016-05-17 2967.653333
2016-05-18 2962.371130
2016-05-19 2957.559705
2016-05-20 2952.947778
Name: Adj Close, Length: 1550, dtype: float64
输入:
rets = close_px / close_px.shift(1) - 1
# rets = close_px.pct_change()
rets.head()
输出:
Date
2010-01-04 NaN
2010-01-05 0.011844
2010-01-06 -0.008520
2010-01-07 -0.018880
2010-01-08 0.001009
Name: Adj Close, dtype: float64
# close_px comes from sh.csv, i.e. the 000001.SS download, so the legend
# entry names that symbol instead of the leftover 'AAPL' label.
close_px.plot(label='000001.SS')
# Overlay the 40-row rolling mean computed above.
mavg.plot(label='mavg')
plt.legend()
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-VS5moCnW-1586606244038)(assets/2020-03-29-20-40-41.png)]
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 29 17:58:13 2020
@author: CHERN
"""
import datetime
import pandas as pd
from pandas import Series, DataFrame
# print(pd.__version__)
import numpy as np
from pandas_datareader import data, wb # 需要安装 pip install pandas_datareader
import matplotlib.pyplot as plt
import matplotlib
import matplotlib as mpl
mpl.rc('figure', figsize=(8, 7))
# print(mpl.__version__)
# 定义获取数据的时间段
start = datetime.datetime(2010, 1, 1)
end = datetime.datetime(2016,5,20)
sh = data.DataReader(['AAPL','GE','GOOG','IBM','KO', 'MSFT', 'PEP'],'yahoo', start, end)['Adj Close']
print(sh.head(3)) # 输出数据前三行
Symbols AAPL GE GOOG ... KO MSFT PEP
Date ...
2009-12-31 26.131752 10.526512 308.832428 ... 19.278732 23.925440 44.622261
2010-01-04 26.538483 10.749147 312.204773 ... 19.292267 24.294369 44.945187
2010-01-05 26.584366 10.804806 310.829926 ... 19.058893 24.302216 45.488274
[3 rows x 7 columns]
输入:
# Row-over-row percentage change of the 'Adj Close' prices. The multi-ticker
# DataFrame downloaded in the script above is named `sh`; the frame bound to
# `df` earlier in the tutorial has no PEP/KO columns.
rets = sh.pct_change()
# Scatter the PEP returns against the KO returns.
plt.scatter(rets.PEP, rets.KO)
plt.xlabel('Returns PEP')
plt.ylabel('Returns KO')
pd.scatter_matrix(rets, diagonal='kde', figsize=(10, 10));
输出:
File "C:\Python\lib\site-packages\pandas\__init__.py", line 262, in __getattr__
raise AttributeError(f"module 'pandas' has no attribute '{name}'")
AttributeError: module 'pandas' has no attribute 'scatter_matrix'
pandas的scatter_matrix用法已经发生变化了,变成了pandas.plotting.scatter_matrix
重新输入:
pd.plotting.scatter_matrix(rets, diagonal='kde', figsize=(10, 10));
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-aAdqCmPm-1586606244040)(assets/2020-03-29-21-18-44.png)]
知识小结:
- 散点图是用来判断两个变量之间的相互关系的工具,一般情况下,散点图用两组数据构成多个坐标点,通过观察坐标点的分布,判断变量间是否存在关联关系,以及相关关系的强度。此外,如果不存在相关关系,可以使用散点图总结特征点的分布模式,即矩阵图(象限图)
- `pd.plotting.scatter_matrix()` 和 `plt.scatter()` 分别用于绘制散点图矩阵和散点图
输入:
corr = rets.corr()
corr
输出:
Symbols AAPL GE GOOG IBM KO MSFT PEP
Symbols
AAPL 1.000000 0.387574 0.406971 0.387261 0.298461 0.393892 0.273217
GE 0.387574 1.000000 0.423675 0.532942 0.491217 0.478202 0.485198
GOOG 0.406971 0.423675 1.000000 0.402424 0.329096 0.463922 0.322701
IBM 0.387261 0.532942 0.402424 1.000000 0.449300 0.495341 0.412432
KO 0.298461 0.491217 0.329096 0.449300 1.000000 0.402174 0.643624
MSFT 0.393892 0.478202 0.463922 0.495341 0.402174 1.000000 0.414073
PEP 0.273217 0.485198 0.322701 0.412432 0.643624 0.414073 1.000000
输入:
plt.imshow(corr, cmap='hot', interpolation='none')
plt.colorbar()
plt.xticks(range(len(corr)), corr.columns)
plt.yticks(range(len(corr)), corr.columns);
用法小结:
- `DataFrame.corr()` 方法用于计算各列两两之间的相关系数,返回相关系数矩阵
我们经常感兴趣的一件事是预期回报(通常是回报率的均值)与我们承担的风险之间(回报率的方差)的关系。这两者之间往往存在一种权衡。
这里我们使用plt.annotate
在散点图上标注标签。
# Per-column mean and standard deviation of the returns, computed once and
# reused for both the scatter plot and the annotations.
expected_returns = rets.mean()
risk = rets.std()

# One point per column: mean return on x, standard deviation on y.
plt.scatter(expected_returns, risk)
plt.xlabel('Expected returns')
plt.ylabel('Risk')

# Attach each column name to its point; the loop body must be indented.
for label, x, y in zip(rets.columns, expected_returns, risk):
    plt.annotate(
        label,
        xy=(x, y), xytext=(20, -20),
        textcoords='offset points', ha='right', va='bottom',
        bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
        arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
程序:
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 29 17:58:13 2020
@author: CHERN
"""
import datetime
import pandas as pd
from pandas import Series, DataFrame
# print(pd.__version__)
import numpy as np
from pandas_datareader import data, wb # 需要安装 pip install pandas_datareader
import matplotlib.pyplot as plt
import matplotlib
import matplotlib as mpl
mpl.rc('figure', figsize=(8, 7))
# print(mpl.__version__)
# Download the 'Adj Close' column of each security and collect the Series.
series_list = []
securities = ['AAPL', 'GOOG', 'IBM', 'MSFT']
for security in securities:
    s = data.DataReader(security, 'yahoo',
                        start=datetime.datetime(2011, 10, 1),
                        end=datetime.datetime(2013, 1, 1))['Adj Close']
    s.name = security  # Rename series to match security name
    series_list.append(s)
# Place the Series side by side: one column per security.
df = pd.concat(series_list, axis=1)
print(df.head())
AAPL GOOG IBM MSFT
Date
2011-09-30 47.285904 256.558350 130.822800 20.321293
2011-10-03 46.452591 246.834808 129.640747 20.027370
2011-10-04 46.192177 250.012894 130.725555 20.688694
2011-10-05 46.905209 251.407669 132.304108 21.137737
2011-10-06 46.796078 256.393982 135.924973 21.505136
输入:
df.ix[0, 'AAPL'] = np.nan
df.ix[1, ['GOOG', 'IBM']] = np.nan
df.ix[[1, 2, 3], 'MSFT'] = np.nan
df.head()
输出:
AttributeError: 'DataFrame' object has no attribute 'ix'
输入:
# Replacement for the removed `.ix` calls: look up the row labels by position
# through df.index, then assign with the label-based .loc indexer.
df.loc[df.index[0], ['AAPL']] = np.nan
df.loc[df.index[1], ['GOOG', 'IBM']] = np.nan
# Rows 1, 2 and 3 (the slice end is exclusive), matching
# df.ix[[1, 2, 3], 'MSFT'] and the output shown below.
df.loc[df.index[1:4], 'MSFT'] = np.nan
df.head()
输出:
AAPL GOOG IBM MSFT
Date
2011-09-30 NaN 256.558350 130.822800 20.321293
2011-10-03 46.452591 NaN NaN NaN
2011-10-04 46.192177 250.012894 130.725555 NaN
2011-10-05 46.905209 251.407669 132.304108 NaN
2011-10-06 46.796078 256.393982 135.924973 21.505136
输入:
(df.AAPL + df.GOOG).head()
输出:
Date
2011-09-30 NaN
2011-10-03 NaN
2011-10-04 296.205070
2011-10-05 298.312878
2011-10-06 303.190060
dtype: float64
输入:
df.ffill().head()
输出:
AAPL GOOG IBM MSFT
Date
2011-09-30 NaN 256.558350 130.822800 20.321293
2011-10-03 46.452591 256.558350 130.822800 20.321293
2011-10-04 46.192177 250.012894 130.725555 20.321293
2011-10-05 46.905209 251.407669 132.304108 20.321293
2011-10-06 46.796078 256.393982 135.924973 21.505136
NaN
2011-10-04 296.205070
2011-10-05 298.312878
2011-10-06 303.190060
dtype: float64
输入:
```python
df.ffill().head()
输出:
AAPL GOOG IBM MSFT
Date
2011-09-30 NaN 256.558350 130.822800 20.321293
2011-10-03 46.452591 256.558350 130.822800 20.321293
2011-10-04 46.192177 250.012894 130.725555 20.321293
2011-10-05 46.905209 251.407669 132.304108 20.321293
2011-10-06 46.796078 256.393982 135.924973 21.505136