import pandas as pd
import numpy as np
df = pd.DataFrame([10, 20, 30, 40], columns=['numbers'], index=['a', 'b', 'c', 'd'])
df
# numbers
# a 10
# b 20
# c 30
# d 40
df.index
# Index(['a', 'b', 'c', 'd'], dtype='object')
df.columns
# Index(['numbers'], dtype='object')
df.loc['c']
# numbers 30
# Name: c, dtype: int64
df.loc[['a', 'b']]
# numbers
# a 10
# b 20
df.loc[df.index[1:3]]
# numbers
# b 20
# c 30
df.sum()
# numbers 100
# dtype: int64
df.apply(lambda x: x ** 2)
# numbers
# a 100
# b 400
# c 900
# d 1600
df ** 2
# numbers
# a 100
# b 400
# c 900
# d 1600
df['floats'] = (1.5, 2.5, 3.5, 4.5)
df
# numbers floats
# a 10 1.5
# b 20 2.5
# c 30 3.5
# d 40 4.5
df['floats']
# a 1.5
# b 2.5
# c 3.5
# d 4.5
# Name: floats, dtype: float64
df.floats
# a 1.5
# b 2.5
# c 3.5
# d 4.5
# Name: floats, dtype: float64
df['names'] = pd.DataFrame(['Yves', 'Guido', 'Felix', 'Francesc'], index=['d', 'a', 'b', 'c'])
df
# numbers floats names
# a 10 1.5 Guido
# b 20 2.5 Felix
# c 30 3.5 Francesc
# d 40 4.5 Yves
df.append({'numbers': 100, 'floats': 5.75, 'names': 'Henry'}, ignore_index=True)
# numbers floats names
# 0 10 1.50 Guido
# 1 20 2.50 Felix
# 2 30 3.50 Francesc
# 3 40 4.50 Yves
# 4 100 5.75 Henry
df = df.append(pd.DataFrame({'numbers': 100, 'floats': 5.75, 'names': 'Henry'}, index=['z', ]))
df
# floats names numbers
# a 1.50 Guido 10
# b 2.50 Felix 20
# c 3.50 Francesc 30
# d 4.50 Yves 40
# z 5.75 Henry 100
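Note: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0. On current pandas the two append calls above can be expressed with pd.concat; a minimal, illustrative sketch:
# pd.concat replaces the removed DataFrame.append (pandas >= 2.0)
new_row = pd.DataFrame({'numbers': 100, 'floats': 5.75, 'names': 'Henry'}, index=['z'])
pd.concat([df, new_row])  # label-preserving, like df.append(new_row)
pd.concat([df, new_row], ignore_index=True)  # like append(..., ignore_index=True)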
df.join(pd.DataFrame([1, 4, 9, 16, 25], index=['a', 'b', 'c', 'd', 'y'], columns=['squares', ]))
# floats names numbers squares
# a 1.50 Guido 10 1.0
# b 2.50 Felix 20 4.0
# c 3.50 Francesc 30 9.0
# d 4.50 Yves 40 16.0
# z 5.75 Henry 100 NaN
df = df.join(pd.DataFrame([1, 4, 9, 16, 25],
index=['a', 'b', 'c', 'd', 'y'],
columns=['squares', ]),
how='outer')
df
# floats names numbers squares
# a 1.50 Guido 10.0 1.0
# b 2.50 Felix 20.0 4.0
# c 3.50 Francesc 30.0 9.0
# d 4.50 Yves 40.0 16.0
# y NaN NaN NaN 25.0
# z 5.75 Henry 100.0 NaN
df[['numbers', 'squares']].mean()
# numbers 40.0
# squares 11.0
# dtype: float64
df[['numbers', 'squares']].std()
# numbers 35.355339
# squares 9.669540
# dtype: float64
a = np.random.standard_normal((9, 4))
a.round(6)
# array([[ 0.109076, -1.05275 , 1.253471, 0.39846 ],
# [-1.561175, -1.997425, 1.158739, -2.030734],
# [ 0.764723, 0.760368, 0.864103, -0.174079],
# [ 2.429043, 0.281962, -0.496606, 0.009445],
# [-1.679758, -1.02374 , -1.135922, 0.077649],
# [-0.247692, 0.301198, 2.156474, 1.537902],
# [ 1.162934, 2.102327, -0.4501 , 0.812529],
# [-0.374749, -0.818229, -1.013962, -0.476855],
# [ 0.626347, 2.294829, -1.29531 , -0.031501]])
df = pd.DataFrame(a)
df
# 0 1 2 3
# 0 0.109076 -1.052750 1.253471 0.398460
# 1 -1.561175 -1.997425 1.158739 -2.030734
# 2 0.764723 0.760368 0.864103 -0.174079
# 3 2.429043 0.281962 -0.496606 0.009445
# 4 -1.679758 -1.023740 -1.135922 0.077649
# 5 -0.247692 0.301198 2.156474 1.537902
# 6 1.162934 2.102327 -0.450100 0.812529
# 7 -0.374749 -0.818229 -1.013962 -0.476855
# 8 0.626347 2.294829 -1.295310 -0.031501
df.columns = ['No1', 'No2', 'No3', 'No4']
df
# No1 No2 No3 No4
# 0 0.109076 -1.052750 1.253471 0.398460
# 1 -1.561175 -1.997425 1.158739 -2.030734
# 2 0.764723 0.760368 0.864103 -0.174079
# 3 2.429043 0.281962 -0.496606 0.009445
# 4 -1.679758 -1.023740 -1.135922 0.077649
# 5 -0.247692 0.301198 2.156474 1.537902
# 6 1.162934 2.102327 -0.450100 0.812529
# 7 -0.374749 -0.818229 -1.013962 -0.476855
# 8 0.626347 2.294829 -1.295310 -0.031501
df['No2'][3]
# 0.2819621128403918
dates = pd.date_range('2018-01-01', periods=9, freq='M')
dates
# DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30',
# '2018-05-31', '2018-06-30', '2018-07-31', '2018-08-31',
# '2018-09-30'],
# dtype='datetime64[ns]', freq='M')
Parameters of the date_range function
Parameter | Format | Description |
---|---|---|
start | string/datetime | left bound for generating dates |
end | string/datetime | right bound for generating dates |
periods | integer/None | number of periods (if start or end is omitted) |
freq | string/DateOffset | frequency string, e.g. 5D (5 days) |
tz | string/None | time zone name for a localized index |
normalize | bool, default None | normalize start and end to midnight |
name | string, default None | name of the resulting index |
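A small sketch exercising a few of these parameters (the values shown are arbitrary):
# five business days, normalized to midnight, with a named index
pd.date_range(start='2018-01-01', periods=5, freq='B', normalize=True, name='bdays')
# an explicit start/end pair with a 12-hour frequency
pd.date_range(start='2018-01-01', end='2018-01-03', freq='12H')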
df.index = dates
df
# No1 No2 No3 No4
# 2018-01-31 0.109076 -1.052750 1.253471 0.398460
# 2018-02-28 -1.561175 -1.997425 1.158739 -2.030734
# 2018-03-31 0.764723 0.760368 0.864103 -0.174079
# 2018-04-30 2.429043 0.281962 -0.496606 0.009445
# 2018-05-31 -1.679758 -1.023740 -1.135922 0.077649
# 2018-06-30 -0.247692 0.301198 2.156474 1.537902
# 2018-07-31 1.162934 2.102327 -0.450100 0.812529
# 2018-08-31 -0.374749 -0.818229 -1.013962 -0.476855
# 2018-09-30 0.626347 2.294829 -1.295310 -0.031501
Frequency parameter values of the date_range function
Alias | Description |
---|---|
B | business day |
C | custom business day (experimental) |
D | calendar day |
W | weekly |
M | month end |
BM | last business day of the month |
MS | month start |
BMS | first business day of the month |
Q | quarter end |
BQ | last business day of the quarter |
QS | quarter start |
BQS | first business day of the quarter |
A | year end |
BA | last business day of the year |
AS | year start |
BAS | first business day of the year |
H | hourly |
T | minutely |
S | secondly |
L | milliseconds |
U | microseconds |
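A brief sketch of some of these aliases (output omitted):
# quarter-end dates vs. last business day of each quarter
pd.date_range('2018-01-01', periods=4, freq='Q')
pd.date_range('2018-01-01', periods=4, freq='BQ')
# hourly and per-minute timestamps
pd.date_range('2018-01-01', periods=3, freq='H')
pd.date_range('2018-01-01', periods=3, freq='T')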
A DataFrame object is typically generated from an ndarray object, but an ndarray can just as easily be generated from a DataFrame using NumPy's array function.
np.array(df).round(6)
# array([[ 0.109076, -1.05275 , 1.253471, 0.39846 ],
# [-1.561175, -1.997425, 1.158739, -2.030734],
# [ 0.764723, 0.760368, 0.864103, -0.174079],
# [ 2.429043, 0.281962, -0.496606, 0.009445],
# [-1.679758, -1.02374 , -1.135922, 0.077649],
# [-0.247692, 0.301198, 2.156474, 1.537902],
# [ 1.162934, 2.102327, -0.4501 , 0.812529],
# [-0.374749, -0.818229, -1.013962, -0.476855],
# [ 0.626347, 2.294829, -1.29531 , -0.031501]])
df.sum()
# No1 1.228750
# No2 0.848540
# No3 1.040888
# No4 0.122816
# dtype: float64
df.mean()
# No1 0.136528
# No2 0.094282
# No3 0.115654
# No4 0.013646
# dtype: float64
df.cumsum()
# No1 No2 No3 No4
# 2018-01-31 0.109076 -1.052750 1.253471 0.398460
# 2018-02-28 -1.452099 -3.050176 2.412210 -1.632274
# 2018-03-31 -0.687376 -2.289807 3.276313 -1.806353
# 2018-04-30 1.741667 -2.007845 2.779707 -1.796908
# 2018-05-31 0.061909 -3.031585 1.643786 -1.719259
# 2018-06-30 -0.185783 -2.730387 3.800259 -0.181357
# 2018-07-31 0.977152 -0.628061 3.350160 0.631172
# 2018-08-31 0.602403 -1.446289 2.336198 0.154317
# 2018-09-30 1.228750 0.848540 1.040888 0.122816
df.describe()
# No1 No2 No3 No4
# count 9.000000 9.000000 9.000000 9.000000
# mean 0.136528 0.094282 0.115654 0.013646
# std 1.300700 1.465006 1.256782 0.972826
# min -1.679758 -1.997425 -1.295310 -2.030734
# 25% -0.374749 -1.023740 -1.013962 -0.174079
# 50% 0.109076 0.281962 -0.450100 0.009445
# 75% 0.764723 0.760368 1.158739 0.398460
# max 2.429043 2.294829 2.156474 1.537902
np.sqrt(df)
# No1 No2 No3 No4
# 2018-01-31 0.330266 NaN 1.119585 0.631236
# 2018-02-28 NaN NaN 1.076447 NaN
# 2018-03-31 0.874484 0.871991 0.929571 NaN
# 2018-04-30 1.558539 0.531001 NaN 0.097184
# 2018-05-31 NaN NaN NaN 0.278656
# 2018-06-30 NaN 0.548815 1.468494 1.240122
# 2018-07-31 1.078394 1.449940 NaN 0.901404
# 2018-08-31 NaN NaN NaN NaN
# 2018-09-30 0.791421 1.514869 NaN NaN
np.sqrt(df).sum()
# No1 4.633104
# No2 4.916616
# No3 4.594098
# No4 3.148602
# dtype: float64
df.cumsum().plot(lw=2.0)
Parameters of the plot method
Parameter | Format | Description |
---|---|---|
x | label/position, default None | only used when column values are x-ticks |
y | label/position, default None | only used when column values are y-ticks |
subplots | bool, default False | plot the columns in subplots |
sharex | bool, default True | share the x-axis |
sharey | bool, default False | share the y-axis |
use_index | bool, default True | use DataFrame.index as x-ticks |
stacked | bool, default False | stacked plot (bar plots only) |
sort_columns | bool, default False | sort column names alphabetically before plotting |
title | string, default None | title of the plot |
grid | bool, default False | horizontal and vertical grid lines |
legend | bool, default True | legend of labels |
ax | matplotlib axes object | matplotlib axes to use for plotting |
style | string or list/dict | line plotting style (per column) |
kind | string, default 'line' | plot type: 'line', 'bar', 'barh', 'hist', 'box', 'kde'/'density', 'area', 'pie', 'scatter', 'hexbin' |
logx | bool, default False | logarithmic scaling of the x-axis |
logy | bool, default False | logarithmic scaling of the y-axis |
xticks | sequence, default index | x-ticks of the plot |
yticks | sequence, default values | y-ticks of the plot |
xlim | 2-tuple, list | boundaries of the x-axis |
ylim | 2-tuple, list | boundaries of the y-axis |
rot | integer, default None | rotation of the x-ticks |
secondary_y | bool/sequence, default False | secondary y-axis |
mark_right | bool, default True | automatic labeling of the secondary y-axis |
colormap | string/colormap object, default None | colormap to use for plotting |
kwds | keywords | options to pass to matplotlib |
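A quick sketch combining a few of these parameters with the cumulative-sum plot from above (the keyword values are illustrative):
# cumulative sums with a title, grid, per-column line styles and No4 on a secondary y-axis
df.cumsum().plot(lw=2.0, title='cumulative sums', grid=True,
                 style=['-', '--', '-.', ':'], secondary_y='No4')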
type(df)
# pandas.core.frame.DataFrame
df['No1']
# 2018-01-31 0.109076
# 2018-02-28 -1.561175
# 2018-03-31 0.764723
# 2018-04-30 2.429043
# 2018-05-31 -1.679758
# 2018-06-30 -0.247692
# 2018-07-31 1.162934
# 2018-08-31 -0.374749
# 2018-09-30 0.626347
# Freq: M, Name: No1, dtype: float64
type(df['No1'])
# pandas.core.series.Series
# The key DataFrame methods are also available on Series objects
# Figure 6-2: line plot of a Series object
import matplotlib.pyplot as plt
df['No1'].cumsum().plot(style='r', lw=2)
plt.xlabel('date')
plt.ylabel('value')
df['Quarter'] = ['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q2', 'Q3', 'Q3', 'Q3']
df
# No1 No2 No3 No4 Quarter
# 2018-01-31 0.109076 -1.052750 1.253471 0.398460 Q1
# 2018-02-28 -1.561175 -1.997425 1.158739 -2.030734 Q1
# 2018-03-31 0.764723 0.760368 0.864103 -0.174079 Q1
# 2018-04-30 2.429043 0.281962 -0.496606 0.009445 Q2
# 2018-05-31 -1.679758 -1.023740 -1.135922 0.077649 Q2
# 2018-06-30 -0.247692 0.301198 2.156474 1.537902 Q2
# 2018-07-31 1.162934 2.102327 -0.450100 0.812529 Q3
# 2018-08-31 -0.374749 -0.818229 -1.013962 -0.476855 Q3
# 2018-09-30 0.626347 2.294829 -1.295310 -0.031501 Q3
groups = df.groupby('Quarter')
groups.mean()
# No1 No2 No3 No4
# Quarter
# Q1 -0.229125 -0.763269 1.092104 -0.602118
# Q2 0.167198 -0.146860 0.174649 0.541665
# Q3 0.471511 1.192976 -0.919790 0.101391
groups.max()
# No1 No2 No3 No4
# Quarter
# Q1 0.764723 0.760368 1.253471 0.398460
# Q2 2.429043 0.301198 2.156474 1.537902
# Q3 1.162934 2.294829 -0.450100 0.812529
groups.size()
# Quarter
# Q1 3
# Q2 3
# Q3 3
# dtype: int64
df['Odd_Even'] = ['Odd', 'Even', 'Odd', 'Even', 'Odd', 'Even', 'Odd', 'Even', 'Odd']
groups = df.groupby(['Quarter', 'Odd_Even'])
groups.size()
# Quarter Odd_Even
# Q1 Even 1
# Odd 2
# Q2 Even 2
# Odd 1
# Q3 Even 1
# Odd 2
# dtype: int64
groups.mean()
# No1 No2 No3 No4
# Quarter Odd_Even
# Q1 Even -1.561175 -1.997425 1.158739 -2.030734
# Odd 0.436899 -0.146191 1.058787 0.112190
# Q2 Even 1.090676 0.291580 0.829934 0.773673
# Odd -1.679758 -1.023740 -1.135922 0.077649
# Q3 Even -0.374749 -0.818229 -1.013962 -0.476855
# Odd 0.894640 2.198578 -0.872705 0.390514
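Several statistics can also be computed in a single pass with aggregate; a small sketch (output omitted):
# min and max per (Quarter, Odd_Even) group for every numeric column
groups.aggregate(['min', 'max']).round(2)
# different statistics per column
groups.agg({'No1': 'mean', 'No2': ['min', 'max']})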
import datetime
import pandas_datareader.data as web
start = datetime.datetime(2016, 1, 1) # or start = '1/1/2016' or '2016-1-1'
end = datetime.date.today()
prices = web.DataReader('AAPL', 'yahoo', start, end)
# Apple share prices pulled from Yahoo! Finance
prices.head()
Date | Open | High | Low | Close | Adj Close | Volume |
---|---|---|---|---|---|---|
2015-12-31 | 107.010002 | 107.029999 | 104.820000 | 105.260002 | 100.540207 | 40635300 |
2016-01-04 | 102.610001 | 105.370003 | 102.000000 | 105.349998 | 100.626175 | 67649400 |
2016-01-05 | 105.750000 | 105.849998 | 102.410004 | 102.709999 | 98.104546 | 55791000 |
2016-01-06 | 100.559998 | 102.370003 | 99.870003 | 100.699997 | 96.184654 | 68457400 |
2016-01-07 | 98.680000 | 100.129997 | 96.430000 | 96.449997 | 92.125244 | 81094400 |
Parameters of the DataReader function (the parameter descriptions in the source code are already quite detailed)
Parameter | Format | Description |
---|---|---|
name | string | name of the dataset, typically the ticker symbol |
data_source | e.g. "yahoo" | "yahoo": Yahoo! Finance, "google": Google Finance, "fred": St. Louis FED (FRED), "famafrench": Kenneth French's data library, "edgar-index": the SEC's EDGAR Index |
start | string/datetime/None | left bound of the range (default "2010/1/1") |
end | string/datetime/None | right bound of the range (default today) |
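The call signature is the same for the other data sources listed in the table; for example, a sketch pulling the 'GDP' series from FRED (the series name is chosen for illustration and the request needs network access):
# same signature, different data_source; 'GDP' is a FRED series identifier
gdp = web.DataReader('GDP', 'fred', start, end)
gdp.head()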
Historical index levels
prices['Close'].plot(figsize=(8, 5))
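The following plots reference 'Return', '42d' and '252d' columns that are never created above; a minimal sketch to add them, assuming log returns on the close price and simple moving averages:
# daily log returns of the closing price (used as 'Return' below)
prices['Return'] = np.log(prices['Close'] / prices['Close'].shift(1))
# 42-day and 252-day moving averages of the closing price
prices['42d'] = prices['Close'].rolling(window=42).mean()
prices['252d'] = prices['Close'].rolling(window=252).mean()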
The index level and daily index returns
prices[['Close', 'Return']].plot(subplots=True, style='b', figsize=(8, 5))
The index level and moving averages
prices[['Close', '42d', '252d']].plot(figsize=(8, 5))
The index level and the moving annualized volatility
import math
prices['Mov_Vol'] = prices['Return'].rolling(window=252).std() * math.sqrt(252)
prices[['Close', 'Mov_Vol', 'Return']].plot(subplots=True, style='b', figsize=(8, 7))
import pandas as pd
import urllib.request
es_url = 'https://www.stoxx.com/document/Indices/Current/HistoricalData/hbrbcpe.txt'
vs_url = 'https://www.stoxx.com/document/Indices/Current/HistoricalData/h_vstoxx.txt'
es_txt = 'E:/data/es.txt'
vs_txt = 'E:/data/vs.txt'
urllib.request.urlretrieve(es_url, es_txt)
urllib.request.urlretrieve(vs_url, vs_txt)
# Data processing
lines = open(es_txt, 'r').readlines()
lines = [line.replace(' ', '') for line in lines]
# Generate a new text file
es50_txt = 'E:/data/es50.txt'
new_file = open(es50_txt, 'w')
new_file.writelines('date' + lines[3][:-1] + ';DEL' + lines[3][-1]) # DEL serves as a placeholder
new_file.writelines(lines[4:])
new_file.close()
new_lines = open(es50_txt, 'r').readlines()
new_lines[:5]
# ['date;SX5P;SX5E;SXXP;SXXE;SXXF;SXXA;DK5F;DKXF;DEL\n',
# '31.12.1986;775.00;900.82;82.76;98.58;98.06;69.06;645.26;65.56\n',
# '01.01.1987;775.00;900.82;82.76;98.58;98.06;69.06;645.26;65.56\n',
# '02.01.1987;770.89;891.78;82.57;97.80;97.43;69.37;647.62;65.81\n',
# '05.01.1987;771.89;898.33;82.82;98.60;98.19;69.16;649.94;65.82\n']
es = pd.read_csv(es50_txt, index_col=0, parse_dates=True, sep=';', dayfirst=True)
es.head()
# SX5P SX5E SXXP SXXE SXXF SXXA DK5F DKXF DEL
# date
# 1986-12-31 775.00 900.82 82.76 98.58 98.06 69.06 645.26 65.56 NaN
# 1987-01-01 775.00 900.82 82.76 98.58 98.06 69.06 645.26 65.56 NaN
# 1987-01-02 770.89 891.78 82.57 97.80 97.43 69.37 647.62 65.81 NaN
# 1987-01-05 771.89 898.33 82.82 98.60 98.19 69.16 649.94 65.82 NaN
# 1987-01-06 775.92 902.32 83.28 99.19 98.83 69.50 652.49 66.06 NaN
# The helper column has served its purpose and can be deleted
del es['DEL']
es.info()
# <class 'pandas.core.frame.DataFrame'>
# DatetimeIndex: 7673 entries, 1986-12-31 to 2016-10-04
# Data columns (total 8 columns):
# SX5P 7673 non-null float64
# SX5E 7673 non-null float64
# SXXP 7673 non-null float64
# SXXE 7673 non-null float64
# SXXF 7673 non-null float64
# SXXA 7673 non-null float64
# DK5F 7673 non-null float64
# DKXF 7673 non-null float64
# dtypes: float64(8)
# memory usage: 539.5 KB
# Use the advanced features of read_csv to make the import more compact and efficient:
cols = ['SX5P', 'SX5E', 'SXXP', 'SXXE', 'SXXF', 'SXXA', 'DK5F', 'DKXF']
es = pd.read_csv(es_url, index_col=0, parse_dates=True, sep=';', dayfirst=True, header=None, skiprows=4, names=cols)
es.tail()
# SX5P SX5E SXXP SXXE SXXF SXXA DK5F DKXF
# 2016-09-28 2846.55 2991.11 342.57 324.24 407.97 350.45 9072.09 581.27
# 2016-09-29 2848.93 2991.58 342.72 324.08 407.65 350.90 9112.09 582.60
# 2016-09-30 2843.17 3002.24 342.92 325.31 408.27 350.09 9115.81 583.26
# 2016-10-03 2845.43 2998.50 343.23 325.08 408.44 350.92 9131.24 584.32
# 2016-10-04 2871.06 3029.50 346.10 327.73 411.41 353.92 9212.05 588.71
vs = pd.read_csv(vs_txt, index_col=0, header=2, parse_dates=True, sep=',', dayfirst=True)
vs.info()
# <class 'pandas.core.frame.DataFrame'>
# DatetimeIndex: 4357 entries, 1999-01-04 to 2016-02-12
# Data columns (total 9 columns):
# V2TX 4357 non-null float64
# V6I1 3906 non-null float64
# V6I2 4357 non-null float64
# V6I3 4296 non-null float64
# V6I4 4357 non-null float64
# V6I5 4357 non-null float64
# V6I6 4340 non-null float64
# V6I7 4357 non-null float64
# V6I8 4343 non-null float64
# dtypes: float64(9)
# memory usage: 340.4 KB
Parameters of pd.read_csv
pd.read_csv(filepath_or_buffer, sep=',', delimiter=None, header='infer', names=None, index_col=None, usecols=None, squeeze=False, prefix=None, mangle_dupe_cols=True, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, skip_blank_lines=True, parse_dates=False, infer_datetime_format=False, keep_date_col=False, date_parser=None, dayfirst=False, iterator=False, chunksize=None, compression='infer', thousands=None, decimal=b'.', lineterminator=None, quotechar='"', quoting=0, escapechar=None, comment=None, encoding=None, dialect=None, tupleize_cols=False, error_bad_lines=True, warn_bad_lines=True, skipfooter=0, skip_footer=0, doublequote=True, delim_whitespace=False, as_recarray=False, compact_ints=False, use_unsigned=False, low_memory=True, buffer_lines=None, memory_map=False, float_precision=None)
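A brief sketch of a few of these options applied to the EURO STOXX file created above (the column selection and row count are arbitrary):
# read only the first 10 rows and a subset of columns, parsing the date index
pd.read_csv(es50_txt, sep=';', index_col='date', parse_dates=True, dayfirst=True,
            usecols=['date', 'SX5E', 'SXXE'], nrows=10)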
import datetime as dt
data = pd.DataFrame({'EUROSTOXX': es['SX5E'][es.index > dt.datetime(1999, 1, 1)]})
data = data.join(pd.DataFrame({'VSTOXX': vs['V2TX'][vs.index > dt.datetime(1999, 1, 1)]}))
data = data.fillna(method='ffill')
data.info()
# <class 'pandas.core.frame.DataFrame'>
# DatetimeIndex: 4554 entries, 1999-01-04 to 2016-10-04
# Data columns (total 2 columns):
# EUROSTOXX 4554 non-null float64
# VSTOXX 4554 non-null float64
# dtypes: float64(2)
# memory usage: 266.7 KB
data.tail()
# EUROSTOXX VSTOXX
# 2016-09-28 2991.11 35.6846
# 2016-09-29 2991.58 35.6846
# 2016-09-30 3002.24 35.6846
# 2016-10-03 2998.50 35.6846
# 2016-10-04 3029.50 35.6846
# The EURO STOXX 50 index and the VSTOXX volatility index
data.plot(subplots=True,grid=True,style='b',figsize=(8,6))
Log returns of EURO STOXX 50 and VSTOXX
rets=np.log(data/data.shift(1))
rets.head()
# EUROSTOXX VSTOXX
# 1999-01-04 NaN NaN
# 1999-01-05 0.017228 0.489248
# 1999-01-06 0.022138 -0.165317
# 1999-01-07 -0.015723 0.256337
# 1999-01-08 -0.003120 0.021570
# Log returns of EURO STOXX 50 and VSTOXX
rets.plot(subplots=True,grid=True,style='b',figsize=(8,6))
Scatter plot of the log returns with a regression line
y = rets.VSTOXX  # dependent variable
X = rets.EUROSTOXX  # explanatory variable
p_inf = float("inf")  # positive infinity
n_inf = float("-inf")  # negative infinity
# replace infinities and NaN values with the median of the respective series
y = y.map(lambda x: y.median() if x == p_inf or x == n_inf or np.isnan(x) else x)
X = X.map(lambda x: X.median() if x == p_inf or x == n_inf or np.isnan(x) else x)
import matplotlib.pyplot as plt
import statsmodels.api as sm
model = sm.OLS(y, X)
model = model.fit()
model.summary()
# <class 'statsmodels.iolib.summary.Summary'>
# """
# OLS Regression Results
# ==============================================================================
# Dep. Variable: VSTOXX R-squared: 0.526
# Model: OLS Adj. R-squared: 0.525
# Method: Least Squares F-statistic: 5043.
# Date: Wed, 27 Jun 2018 Prob (F-statistic): 0.00
# Time: 19:55:10 Log-Likelihood: 8271.0
# No. Observations: 4554 AIC: -1.654e+04
# Df Residuals: 4553 BIC: -1.653e+04
# Df Model: 1
# Covariance Type: nonrobust
# ==============================================================================
# coef std err t P>|t| [0.025 0.975]
# ------------------------------------------------------------------------------
# EUROSTOXX -2.7538 0.039 -71.015 0.000 -2.830 -2.678
# ==============================================================================
# Omnibus: 1298.794 Durbin-Watson: 2.085
# Prob(Omnibus): 0.000 Jarque-Bera (JB): 24719.733
# Skew: 0.876 Prob(JB): 0.00
# Kurtosis: 14.279 Cond. No. 1.00
# ==============================================================================
# Warnings:
# [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# """
model.params
# EUROSTOXX -2.753815
# dtype: float64
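Note that sm.OLS does not add an intercept by default, which is why the summary above shows a single EUROSTOXX coefficient. If an intercept is wanted, the regressors can be extended with sm.add_constant; a sketch, not used in the rest of this section:
# regression with an explicit intercept term
model_c = sm.OLS(y, sm.add_constant(X)).fit()
model_c.params  # contains 'const' and 'EUROSTOXX'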
# Select 100 equally spaced data points between the minimum and the maximum
X_prime = np.linspace(X.min(), X.max(), 100)[:, np.newaxis]
# Compute the predicted values
y_hat = model.predict(X_prime)
plt.scatter(X, y, alpha=0.3) # plot the raw data
plt.xlabel("EURO STOXX 50 returns")
plt.ylabel("VSTOXX returns")
plt.plot(X_prime, y_hat, 'r', alpha=0.9) # add the regression line in red
Rolling correlation between EURO STOXX 50 and VSTOXX
rets.corr()
# EUROSTOXX VSTOXX
# EUROSTOXX 1.000000 -0.724945
# VSTOXX -0.724945 1.000000
rets['EUROSTOXX'].rolling(window=252).corr(rets['VSTOXX']).plot(grid=True, style='b')
Intraday price and trading volume data of a stock for one trading day
import numpy as np
import datetime as dt
import tushare as ts
import datetime
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']  # display Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False  # display the minus sign correctly
# Use tushare to fetch one week of 5-minute bar data for 600118 (China Spacesat)
k_5 = ts.get_k_data(code="600118", start="2018-06-18", end="2018-06-22", ktype='5')
k_5.head()
# The data returned by this interface does not quite match the query conditions, but it works as test data
# date open close high low volume code
# 0 2018-06-12 14:55 20.57 20.57 20.58 20.54 489.0 600118
# 1 2018-06-12 15:00 20.56 20.56 20.57 20.54 1163.0 600118
# 2 2018-06-13 09:35 20.48 20.30 20.48 20.26 2895.0 600118
# 3 2018-06-13 09:40 20.30 20.34 20.39 20.30 1258.0 600118
# 4 2018-06-13 09:45 20.35 20.40 20.45 20.35 535.0 600118
k_5['date'] = k_5['date'].map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M'))
# Select the data for a single day
plot_data = k_5[['date','close', 'volume']][(k_5['date'] > dt.datetime(2018, 6, 20)) & (k_5['date'] < dt.datetime(2018, 6, 21))]
# x-axis positions and tick labels
l = plot_data.index
lx = plot_data['date'].map(lambda x: datetime.datetime.strftime(x,'%m-%d %H:%M'))
# Plotting
fig, (ax1, ax2) = plt.subplots(2, sharex=True, figsize=(8, 6))
# Line plot
ax1.plot(l, plot_data['close'].values)
ax1.set_title('中国卫星')
ax1.set_ylabel('index level')
# Bar plot
ax2.bar(l, plot_data['volume'].values.astype('int'), width=0.5)
ax2.set_ylabel('volume')
plt.xticks(l, lx, rotation=270)
fig.subplots_adjust(bottom=0.3)
plt.show()