7.8.1 导入数据
# Tushare is a free, open-source Python package providing a financial data
# interface. It mainly covers the pipeline for stock and other financial data,
# from data collection and cleaning/processing through to data storage.
import tushare as ts

cons = ts.get_apis()
# Request bars for code '000300' as an INDEX asset starting 2010-01-01
# (end_date is left empty), then discard every row that contains a NaN.
df = ts.bar(
    '000300',
    conn=cons,
    asset='INDEX',
    start_date='2010-01-01',
    end_date='',
).dropna()
Index(['code', 'open', 'close', 'high', 'low', 'vol', 'amount', 'p_change'], dtype='object')
7.8.2 数据概览
| open | close | high | low | vol | amount | p_change |
count | 2751.000000 | 2751.000000 | 2751.000000 | 2751.000000 | 2.751000e+03 | 2.751000e+03 | 2751.000000 |
mean | 3312.708859 | 3315.500174 | 3341.218680 | 3284.866252 | 1.142116e+06 | 1.474558e+11 | 0.024391 |
std | 782.131796 | 782.340288 | 788.871807 | 773.029955 | 8.836562e+05 | 1.300980e+11 | 1.454752 |
min | 2079.870000 | 2086.970000 | 2118.790000 | 2023.170000 | 2.190120e+05 | 2.120044e+10 | -8.750000 |
25% | 2611.760000 | 2613.520000 | 2632.355000 | 2591.375000 | 6.063705e+05 | 6.562710e+10 | -0.640000 |
50% | 3273.890000 | 3276.670000 | 3304.260000 | 3247.690000 | 8.833630e+05 | 1.065559e+11 | 0.040000 |
75% | 3822.735000 | 3827.870000 | 3847.855000 | 3790.325000 | 1.329321e+06 | 1.751813e+11 | 0.720000 |
max | 5922.070000 | 5807.720000 | 5930.910000 | 5747.660000 | 6.864391e+06 | 9.494980e+11 | 6.710000 |
7.8.3 预处理数据
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torchvision
import torchvision.transforms as transforms
%matplotlib inline
# Hyperparameters and run configuration for the LSTM price-prediction example.
n = 30  # window length: how many preceding values form one input sample
LR = 0.001  # learning rate handed to the Adam optimizer
EPOCH = 200  # number of passes over the training DataLoader
# Mini-batch size consumed by the training DataLoader. The DataLoader call uses
# this name, but it was never defined, which raised NameError.
# NOTE(review): 20 is an assumed value -- confirm against the original notebook.
batch_size = 20
train_end = -600  # negative slice bound: the last 600 rows are kept out of training
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def generate_data_by_n_days(series, n, index=False):
    """Turn a 1-D series into a sliding-window supervised-learning table.

    Every output row holds ``n`` consecutive values of ``series`` in the
    columns ``c0`` .. ``c{n-1}`` and the value that immediately follows them
    in the column ``y``.

    Args:
        series: pandas Series (it must provide ``tolist()`` and, when
            ``index`` is true, ``.index``).
        n: window length, i.e. the number of feature columns.
        index: when true, each row is labelled with the index entry of its
            ``y`` value; otherwise the default integer index is kept.

    Returns:
        A DataFrame with ``len(series) - n`` rows and ``n + 1`` columns.

    Raises:
        ValueError: if ``series`` has ``n`` or fewer elements, so not even one
            complete window plus target exists.
    """
    if len(series) <= n:
        raise ValueError("The Length of series is %d, while affect by (n=%d)." % (len(series), n))

    # Convert once; the original re-ran tolist() for every column.
    values = series.tolist()

    # Column i is the series shifted by i and trimmed so that all columns share
    # the same length. i - n is always negative here, so the slice end is never
    # the ambiguous 0.
    columns = {'c%d' % i: values[i:i - n] for i in range(n)}
    columns['y'] = values[n:]

    # Build the frame in one go instead of inserting columns one at a time.
    df = pd.DataFrame(columns)
    if index:
        df.index = series.index[n:]
    return df
def readData(column='high', n=30, all_too=True, index=False, train_end=-500):
    """Load ``sh300.csv`` and build the sliding-window training table.

    The CSV is read with its first column as the index, and the index strings
    are parsed as ``%Y-%m-%d`` dates. Only the rows before ``train_end`` of the
    chosen column are turned into training windows.

    Args:
        column: name of the CSV column to model.
        n: window length forwarded to ``generate_data_by_n_days``.
        all_too: when true, also return the full column and the date index.
        index: forwarded to ``generate_data_by_n_days``.
        train_end: slice bound marking the end of the training portion.

    Returns:
        ``(train_frame, full_column, index_list)`` when ``all_too`` is true,
        otherwise just ``train_frame``.
    """
    df = pd.read_csv("sh300.csv", index_col=0)
    df.index = [datetime.datetime.strptime(day, "%Y-%m-%d") for day in df.index]
    df_column = df[column].copy()

    # Only the training slice is windowed. The original also sliced a test
    # portion but never used or returned it, so that dead local is dropped;
    # callers obtain test data from the full column returned below.
    df_generate_train = generate_data_by_n_days(df_column[:train_end], n, index=index)

    if all_too:
        return df_generate_train, df_column, df.index.tolist()
    return df_generate_train
7.8.4 定义模型
class RNN(nn.Module):
    """LSTM followed by a linear layer that maps each time step to one value."""

    def __init__(self, input_size):
        """Create the network.

        Args:
            input_size: number of features per time step fed to the LSTM.
        """
        super(RNN, self).__init__()
        # NOTE(review): the LSTM arguments were missing from the source (the
        # call was left unclosed) and have been reconstructed -- please confirm:
        #   * hidden_size=64 is dictated by the nn.Linear(64, 1) head below;
        #   * batch_first=True is inferred from the callers, which insert the
        #     length-1 sequence axis at dim=1, giving (batch, 1, input_size).
        self.rnn = nn.LSTM(
            input_size=input_size,
            hidden_size=64,
            num_layers=1,
            batch_first=True,
        )
        self.out = nn.Sequential(
            nn.Linear(64, 1)
        )

    def forward(self, x):
        """Run ``x`` through the LSTM and project every step to a scalar.

        Args:
            x: tensor laid out as (batch, seq_len, input_size).

        Returns:
            Tensor of shape (batch, seq_len, 1).
        """
        # Passing None means the hidden and cell states start as zeros.
        r_out, _ = self.rnn(x, None)
        out = self.out(r_out)
        return out
class mytrainset(Dataset):
    """Dataset over a 2-D tensor whose last column is the regression target."""

    def __init__(self, data):
        """Split ``data`` into features and labels.

        Args:
            data: 2-D tensor; all columns but the last are features, and the
                last column is the label. Both parts are cast to float32.
        """
        self.data, self.label = data[:, :-1].float(), data[:, -1].float()

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.data[index], self.label[index]

    def __len__(self):
        """Return the number of samples (rows of the feature tensor)."""
        return len(self.data)
7.8.5 训练模型
from pandas.plotting import register_matplotlib_converters

# The converter was imported but never registered; registering it lets
# matplotlib plot the datetime index below.
register_matplotlib_converters()

# Fetch the training windows, the full raw series and the date index.
df, df_all, df_index = readData('high', n=n, train_end=train_end)
df_all = np.array(df_all.tolist())
plt.plot(df_index, df_all, label='real-data')
plt.legend(loc='upper right')

# Standardize the training table with one global mean and standard deviation.
# Both statistics are kept so predictions can be mapped back to prices later.
df_numpy = np.array(df)
df_numpy_mean = np.mean(df_numpy)
df_numpy_std = np.std(df_numpy)
df_numpy = (df_numpy - df_numpy_mean) / df_numpy_std
df_tensor = torch.Tensor(df_numpy)

trainset = mytrainset(df_tensor)
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=False)

from tensorboardX import SummaryWriter
writer = SummaryWriter(log_dir='logs')

rnn = RNN(n).to(device)
optimizer = torch.optim.Adam(rnn.parameters(), lr=LR)
loss_func = nn.MSELoss()

for step in range(EPOCH):
    for tx, ty in trainloader:
        # The model lives on `device`, so the batch has to be moved there too.
        tx, ty = tx.to(device), ty.to(device)
        # Add a length-1 sequence axis: (batch, n) -> (batch, 1, n).
        output = rnn(torch.unsqueeze(tx, dim=1))
        # reshape(-1) always yields shape (batch,), even for a batch of one.
        # A bare squeeze() collapses that case to a 0-d tensor and triggers
        # the MSELoss size-mismatch warning.
        loss = loss_func(output.reshape(-1), ty)
        # These three calls were missing, so the weights were never updated.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Log once per epoch: `step` is the epoch index, so logging every batch
    # would write many values under the same step.
    writer.add_scalar('sh300_loss', loss.item(), step)
D:\sofewore\anaconda\lib\site-packages\torch\nn\modules\loss.py:432: UserWarning: Using a target size (torch.Size([1])) that is different to the input size (torch.Size([])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
return F.mse_loss(input, target, reduction=self.reduction)
7.8.6 测试模型
# Predict every point of the full series from its n preceding values and split
# the predictions into the training range and the held-out test range.
generate_data_train = []
generate_data_test = []
# First position (in the full series) that belongs to the test range.
test_index = len(df_all) + train_end

# Normalize with the statistics computed on the training table.
df_all_normal = (df_all - df_numpy_mean) / df_numpy_std
df_all_normal_tensor = torch.Tensor(df_all_normal)

# Inference only: no gradients are needed.
with torch.no_grad():
    for i in range(n, len(df_all)):
        x = df_all_normal_tensor[i - n:i].to(device)
        # (n,) -> (1, 1, n): one batch entry with one time step of n features.
        x = torch.unsqueeze(torch.unsqueeze(x, dim=0), dim=0)
        y = rnn(x)
        # Undo the normalization to get back to the price scale.
        prediction = y.item() * df_numpy_std + df_numpy_mean
        if i < test_index:
            generate_data_train.append(prediction)
        else:
            # This branch was missing: every prediction was also appended to
            # the test list, so its length no longer matched the test dates.
            generate_data_test.append(prediction)

# Overview: predictions on the training and test ranges against the real data.
plt.figure()
plt.plot(df_index[n:train_end], generate_data_train, label='generate_train')
plt.plot(df_index[train_end:], generate_data_test, label='generate_test')
plt.plot(df_index[train_end:], df_all[train_end:], label='real-data')
plt.legend()
plt.show()

# Zoom: only the first part of the test range (from train_end up to -500).
plt.figure()
plt.plot(df_index[train_end:-500], df_all[train_end:-500], label='real-data')
# generate_data_test covers exactly the test range, so slicing it from
# train_end selects the same dates as the x axis (equals [-600:-500] here).
plt.plot(df_index[train_end:-500], generate_data_test[train_end:-500], label='generate_test')
plt.legend()
plt.show()