# Reference: https://zhuanlan.zhihu.com/p/128927771
"""
https://github.com/yhannahwang/stock-prediction-on-lstm
https://zhuanlan.zhihu.com/p/128927771
"""
# 回归
# 导入相关包
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
from pandas import datetime
import math,time
import itertools
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
import datetime
from operator import itemgetter
from sklearn.metrics import mean_squared_error
from math import sqrt
import torch
import torch.nn as nn
from torch.autograd import Variable
class LSTM(nn.Module):
    """Stacked LSTM followed by a linear layer applied at every time step.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_dim: Number of values produced per time step.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super(LSTM, self).__init__()
        # Kept as attributes because forward() needs them to size the
        # initial hidden and cell states.
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        # batch_first=True makes input/output tensors
        # (batch, seq, feature) instead of PyTorch's default
        # (seq, batch, feature).
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        # Linear read-out applied to the LSTM output of every time step.
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Run the network on a batch of sequences.

        Args:
            x: Tensor of shape (batch, seq, input_dim).

        Returns:
            Tensor of shape (batch, seq, output_dim).
        """
        # Fresh zero states for every call, so no state (and no gradient)
        # is carried over from one batch to the next. They are created on
        # the same device and with the same dtype as the input so the model
        # also works after .to(device) / .double().
        h0 = torch.zeros(
            self.num_layers, x.size(0), self.hidden_dim,
            device=x.device, dtype=x.dtype,
        )
        c0 = torch.zeros_like(h0)
        out, _ = self.lstm(x, (h0, c0))
        return self.fc(out)
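# 4. Build the dataset in the layout the LSTM expects: a 3-D array [x, y, z], where x is the number of samples, y is the sequence length (how many days each sample looks back over) and z is the number of features.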
def create_seq_data(data_raw, seq):
    """Cut a frame into overlapping fixed-length windows.

    Window ``i`` covers rows ``i .. i + seq - 1``; ``len(data_raw) - seq``
    windows are produced.

    Args:
        data_raw: DataFrame containing the columns 'Open', 'High', 'Low',
            'Close' and 'target'.
        seq: Number of consecutive rows in each window.

    Returns:
        A tuple ``(data_feat, data_target)`` of NumPy arrays with shapes
        ``(n_windows, seq, 4)`` and ``(n_windows, seq, 1)``. When the frame
        has no more than ``seq`` rows both arrays have zero windows.
    """
    # Select the columns once and work on plain arrays; slicing a DataFrame
    # inside the loop repeats the column selection for every window.
    features = data_raw[['Open', 'High', 'Low', 'Close']].to_numpy()
    targets = data_raw['target'].to_numpy()

    n_windows = len(data_raw) - seq
    if n_windows <= 0:
        # Too few rows for a single window: return correctly shaped empty
        # arrays instead of failing while adding the trailing axis.
        return (
            np.empty((0, seq, features.shape[1]), dtype=features.dtype),
            np.empty((0, seq, 1), dtype=targets.dtype),
        )

    data_feat = np.stack([features[i:i + seq] for i in range(n_windows)])
    data_target = np.stack([targets[i:i + seq] for i in range(n_windows)])
    # Add a trailing axis so targets are (n_windows, seq, 1).
    return data_feat, data_target[:, :, np.newaxis]
# 5. Split the dataset into training and test sets (8:2)
def train_test(data_feat, data_target, test_set_size, seq, n_features=4):
    """Split windowed data chronologically and convert it to tensors.

    The first ``len(data_feat) - test_set_size`` windows become the training
    set and the remaining ones the test set.

    Args:
        data_feat: NumPy array of feature windows; reshaped to
            ``(-1, seq, n_features)``.
        data_target: NumPy array of target windows; reshaped to
            ``(-1, seq, 1)``.
        test_set_size: Number of trailing windows held out for testing.
        seq: Window length.
        n_features: Number of features per time step (defaults to 4, the
            value that used to be hard-coded).

    Returns:
        ``(trainX, trainY, testX, testY)`` as tensors of the default
        floating-point tensor type.

    Raises:
        ValueError: If ``test_set_size`` exceeds the number of windows.
    """
    train_size = data_feat.shape[0] - test_set_size
    if train_size < 0:
        # A negative bound would be read as "count from the end" by the
        # slices below and silently yield a wrong split.
        raise ValueError(
            "test_set_size (%d) exceeds the number of samples (%d)"
            % (test_set_size, data_feat.shape[0])
        )

    def _to_tensor(array, width):
        # The reshape enforces a uniform (samples, seq, width) layout before
        # the conversion.
        return torch.from_numpy(array.reshape(-1, seq, width)).type(torch.Tensor)

    trainX = _to_tensor(data_feat[:train_size], n_features)
    trainY = _to_tensor(data_target[:train_size], 1)
    testX = _to_tensor(data_feat[train_size:], n_features)
    testY = _to_tensor(data_target[train_size:], 1)
    return trainX, trainY, testX, testY
def TransformerToDataloader(trainX, trainY, testX, testY,batch_size):
    """Wrap the train and test tensors in DataLoaders.

    Features and targets are paired into a TensorDataset, which is then
    served in order (no shuffling) in batches of ``batch_size``. The number
    of batches per loader is the sample count divided by ``batch_size``,
    rounded up.

    Returns:
        ``(train_loader, test_loader)``.
    """
    data_utils = torch.utils.data

    def _ordered_loader(features, targets):
        # Pair each feature window with its target window.
        paired = data_utils.TensorDataset(features, targets)
        return data_utils.DataLoader(
            dataset=paired,
            batch_size=batch_size,
            shuffle=False,
        )

    return _ordered_loader(trainX, trainY), _ordered_loader(testX, testY)
if __name__ == '__main__':
    # End-to-end regression demo: load ETF prices, scale them, build
    # sliding windows, train the LSTM on the whole training set at once,
    # then evaluate and plot the results.

    # 1. Load the data
    # freq='B' keeps business days only (other codes: 'Q' quarter, 'Y' year).
    dates = pd.date_range('2010-10-11', '2017-10-11', freq='B')
    df_main = pd.DataFrame(index=dates)
    # Raw string so the backslashes of the Windows path are never read as
    # escape sequences; the resulting path is identical to the previous one.
    df_aaxj = pd.read_csv(r"C:\WORK\xuxiu\learn\AI\stock-prediction-on-lstm-master\stock-prediction-on-lstm-master\data_stock\ETFs\aaxj.us.txt", parse_dates=True, index_col=0)
    # Joining onto the date-indexed frame keeps only rows inside the range.
    df_main = df_main.join(df_aaxj)
    sel_col = ['Open', 'High', 'Low', 'Close']
    df_main = df_main[sel_col]

    # 2. Normalisation
    # Forward-fill: each missing value takes the previous non-missing one.
    df_main = df_main.ffill()
    scaler = MinMaxScaler(feature_range=(-1, 1))
    # Scaling column by column keeps df_main a DataFrame. The scaler is
    # re-fitted on every pass, so after the loop it holds the parameters of
    # the last column, 'Close', which is what the inverse transforms of the
    # predictions below rely on.
    for col in sel_col:
        df_main[col] = scaler.fit_transform(df_main[col].values.reshape(-1, 1))

    # 3. Target: the next row's close, i.e. 'Close' shifted up by one row.
    df_main['target'] = df_main['Close'].shift(-1)
    # dropna() returns a new frame; the result must be assigned, otherwise
    # the rows containing NaN (e.g. the last one, which has no next close)
    # stay in the data.
    df_main = df_main.dropna()
    df_main = df_main.astype(np.float32)
    seq = 20
    test_set_size = int(np.round(0.2*df_main.shape[0]))

    # 4. Windows of `seq` rows built from the four price columns.
    data_feat, data_target = create_seq_data(df_main, seq)

    # 5. Chronological train/test split.
    trainX, trainY, testX, testY = train_test(data_feat, data_target, test_set_size, seq)

    # 6. DataLoaders. They are built here, but the training loop below
    #    feeds the full training tensor directly and does not use them.
    n_steps = seq
    batch_size = 32
    num_epochs = 100
    train_loader, test_loader = TransformerToDataloader(trainX, trainY, testX, testY, batch_size)

    # 7. Build the LSTM model
    # input_dim is the number of input features (4 here), hidden_dim the
    # hidden-state size, num_layers the number of stacked LSTM layers, and
    # output_dim the number of outputs (1, because the target is a single
    # value).
    input_dim = 4
    hidden_dim = 32
    num_layers = 2
    output_dim = 1
    model = LSTM(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, num_layers=num_layers)
    print(model)

    # 8. Loss function and optimiser
    # reduction='mean' is the current spelling of the deprecated
    # size_average=True.
    loss_fn = torch.nn.MSELoss(reduction='mean')
    optimiser = torch.optim.Adam(model.parameters(), lr=0.01)

    # 9. Train the model (one full-batch update per epoch)
    hist = np.zeros(num_epochs)
    seq_dim = seq
    for t in range(num_epochs):
        y_train_pred = model(trainX)
        loss = loss_fn(y_train_pred, trainY)
        if t % 10 == 0 and t != 0:
            print('Epoch', t, "MSE", loss.item())
        hist[t] = loss.item()
        # Zero out gradients, else they accumulate between epochs
        optimiser.zero_grad()
        # Backward pass
        loss.backward()
        # Update parameters
        optimiser.step()

    # 10. Predict
    # Inference needs no autograd graph.
    model.eval()
    with torch.no_grad():
        y_test_pred = model(testX)

    # Predictions have shape (num_samples, seq, 1), but only the prediction
    # for the last step of each window is compared with the real target,
    # hence the [:, -1, 0] indexing. `scaler` was last fitted on 'Close', so
    # it can undo the scaling of the close-price predictions directly.
    # y_train_pred is the output of the final training iteration's forward
    # pass.
    y_train_pred = scaler.inverse_transform(y_train_pred.detach().numpy()[:, -1, 0].reshape(-1, 1))
    y_train = scaler.inverse_transform(trainY.detach().numpy()[:, -1, 0].reshape(-1, 1))
    y_test_pred = scaler.inverse_transform(y_test_pred.detach().numpy()[:, -1, 0].reshape(-1, 1))
    y_test = scaler.inverse_transform(testY.detach().numpy()[:, -1, 0].reshape(-1, 1))

    # Root mean squared error
    trainScore = math.sqrt(mean_squared_error(y_train, y_train_pred))
    print('Train Score: %.2f RMSE' % (trainScore))
    testScore = math.sqrt(mean_squared_error(y_test, y_test_pred))
    print('Test Score: %.2f RMSE' % (testScore))

    # Plot: training predictions against training data
    plt.plot(y_train_pred, label="Preds")
    plt.plot(y_train, label="Data")
    plt.legend()
    plt.show()

    # Plot: training loss per epoch
    plt.plot(hist, label="Training loss")
    plt.legend()
    plt.show()

    # Plot: test predictions against test data
    df_test_final = pd.DataFrame(y_test, columns=['y_test']).join(pd.DataFrame(y_test_pred, columns=['y_test_pred']))
    df_test_final[['y_test', 'y_test_pred']].plot()
    plt.ylabel("ETFs_price")
    plt.show()

    # Shift train predictions for plotting: the prediction of window i
    # refers to the close of row i + look_back.
    look_back = seq
    trainPredictPlot = np.empty_like(df_main)
    trainPredictPlot[:, :] = np.nan
    train_end = len(y_train_pred) + look_back
    trainPredictPlot[look_back:train_end, :] = y_train_pred

    # Shift test predictions for plotting: test windows follow the training
    # windows directly, so their predictions start at the row right after
    # the last training prediction (starting one row earlier would draw the
    # whole test curve a day too early).
    testPredictPlot = np.empty_like(df_main)
    testPredictPlot[:, :] = np.nan
    testPredictPlot[train_end:train_end + len(y_test_pred), :] = y_test_pred

    # Plot baseline and predictions
    # NOTE(review): `scaler` holds the 'Close' parameters only, yet it is
    # applied to every column of df_main here, so the Open/High/Low baseline
    # curves are rescaled with Close's range - confirm whether only 'Close'
    # should be drawn.
    plt.figure(figsize=(15, 8))
    plt.plot(scaler.inverse_transform(df_main))
    plt.plot(trainPredictPlot)
    plt.plot(testPredictPlot)
    plt.show()
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
# Define the LSTM classification model
class LSTMClassifier(nn.Module):
    """Single-layer LSTM whose last time-step output feeds a linear classifier.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of classes (size of the returned logits).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        # batch_first=True so inputs are read as (batch, seq, feature).
        # With the default layout (seq, batch, feature) the batch axis would
        # be treated as time, and the [:, -1, :] indexing in forward() would
        # pick a fixed time step instead of the end of each sequence.
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        # Fully connected layer mapping the hidden state to class logits.
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return class logits of shape (batch, output_dim).

        Args:
            x: Tensor of shape (batch, seq, input_dim).
        """
        lstm_out, _ = self.lstm(x)
        # Index -1 on the sequence axis selects the output of the last time
        # step, which summarises the whole sequence.
        return self.fc(lstm_out[:, -1, :])
# Demo of LSTMClassifier on random data. These statements run at module
# level, i.e. whenever the file is executed or imported.

# Hyper-parameters
input_dim = 10  # number of features per time step (not the sequence length)
hidden_dim = 20
output_dim = 2  # binary classification (labels 0/1), so two output logits
num_epochs = 10
batch_size = 32

# Random data: 100 sequences of 10 time steps with input_dim features each,
# and one 0/1 label per sequence.
data = torch.randn(100, 10, input_dim)
labels = torch.randint(0, 2, (100,))

# Data loader
dataset = torch.utils.data.TensorDataset(data, labels)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Model and optimiser
model = LSTMClassifier(input_dim, hidden_dim, output_dim)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
for epoch in range(num_epochs):
    for batch_data, batch_labels in dataloader:
        optimizer.zero_grad()
        output = model(batch_data)
        loss = F.cross_entropy(output, batch_labels)
        loss.backward()
        optimizer.step()
    # The loss reported is that of the last batch of the epoch.
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}")

# Predict with the trained model
test_data = torch.randn(10, 10, input_dim)
# Inference: switch to evaluation mode and skip building the autograd graph.
model.eval()
with torch.no_grad():
    predictions = model(test_data)
# The predicted class is the index of the largest logit.
predicted_labels = torch.argmax(predictions, dim=1)
print("Predicted Labels:", predicted_labels)