)
# -*- coding: utf-8 -*-
# In[]
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the training data and keep only the closing-price column.
data_train = pd.read_csv('task1_data_train.csv')
price_close = data_train.loc[:, 'close']
# In[]
# Visualize the raw closing-price series.
plt1 = plt.figure(figsize=(8, 5))
plt.plot(price_close)
plt.title('gzmt price close')
plt.xlabel('time series')
plt.ylabel('price')
plt.show()
# In[]
# Normalize by the largest training price. Series.max() performs the
# reduction in vectorized form and skips missing values, unlike the builtin
# max(), whose result depends on where a NaN sits in the series.
price_n = price_close / price_close.max()
print(price_n)
# In[]
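# input_shape = (samples, time_steps, features)
# samples: number of samples (may be omitted)
# time_steps: sequence length, i.e. how many consecutive samples are used to predict one output
# features: number of features per sample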
# In[]
#提取出符合要求的序列数据
def extract_data(data, time_step):
    """Build sliding-window samples for sequence prediction.

    Args:
        data: One-dimensional sequence of values (list, NumPy array or
            pandas Series).
        time_step: Number of consecutive values that form one input sample.

    Returns:
        A tuple ``(X, y)``. ``X`` is a NumPy array of shape
        ``(max(len(data) - time_step, 0), time_step, 1)`` holding the
        windows, and ``y`` is a list with the value that follows each window.
    """
    # Work on positional values so a pandas Series whose index is not
    # 0..n-1 is still read by position rather than by label.
    values = np.asarray(data)
    # Example: 11 values (0..10) with time_step=10 give a single sample,
    # the window 0..9 with target 10.
    sample_count = max(len(values) - time_step, 0)
    X = np.array([values[i:i + time_step] for i in range(sample_count)])
    y = [values[i + time_step] for i in range(sample_count)]
    # Explicit target shape (samples, time_steps, features=1); this also
    # works when there are no samples at all.
    X = X.reshape(sample_count, time_step, 1)
    return X, y
# In[]
#函数功能确认
# test_data=[i for i in range(1,10)]
# test_step=5
# X,y=extract_data(test_data,test_step)
# print(test_data)
# print(X,y)
# In[]
#股票价格数据处理
# Number of consecutive prices used to predict the next one.
time_step = 10
# Build the training windows and targets from the normalized prices and make
# sure both are NumPy arrays.
X, y = (np.array(part) for part in extract_data(price_n, time_step))
print(X[:2])
print(y)
print(X.shape, len(y))
# In[]
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN

# Single recurrent layer followed by a linear output unit that predicts the
# next (normalized) price.
model = Sequential()
# The window length comes from `time_step` so the network input always
# matches the samples produced by extract_data(); one feature per step.
model.add(SimpleRNN(units=5, input_shape=(time_step, 1), activation='relu'))
model.add(Dense(units=1, activation='linear'))
model.summary()
# In[]
# Model configuration
model.compile(optimizer='adam', loss='mean_squared_error')
# In[]
# Model training
model.fit(X, y, batch_size=30, epochs=200)
# In[]
#结果预测
# Predict on the training windows, then undo the max-normalization so the
# prediction and the targets are back on the original price scale.
price_scale = price_close.max()
y_train_predict = model.predict(X) * price_scale
print(y_train_predict)
y = y * price_scale
# In[]
# Visualize real vs. predicted training prices.
plt1 = plt.figure(figsize=(8, 5))
plt.plot(y, label='real price')
plt.plot(y_train_predict, label='predict price')
plt.title('gzmt price close')
plt.xlabel('time series')
plt.ylabel('price')
plt.legend()
plt.show()
# In[]
# Model evaluation on the training data
from sklearn.metrics import r2_score
r2_train = r2_score(y, y_train_predict)
print(r2_train)
# In[]
#测试集
# Test set: load the closing prices.
data_test = pd.read_csv('task1_data_test.csv')
price_test = data_test.loc[:, 'close']
# In[]
# Normalize with the maximum of the *training* prices so both sets share
# one scale.
train_max = max(price_close)
price_test_n = price_test / train_max
# In[]
# Sliding-window extraction for the test data
X_test, y_test = extract_data(price_test_n, time_step)
print(X_test.shape, len(y_test))
# In[]
# Predict on the test windows and map predictions and targets back to the
# original price scale.
y_test_predict = model.predict(X_test) * train_max
y_test = np.array(y_test) * train_max
# In[]
# Visualize real vs. predicted test prices.
plt1 = plt.figure(figsize=(8, 5))
plt.plot(y_test, label='real price')
plt.plot(y_test_predict, label='predict price')
plt.title('gzmt price close')
plt.xlabel('time series')
plt.ylabel('price')
plt.legend()
plt.show()
# In[]
# The test-set score is stored under its own name so the training score in
# `r2_train` stays available for comparison.
r2_test = r2_score(y_test, y_test_predict)
print(r2_test)
# In[]
# Save real and predicted test prices side by side.
y_test_r = np.array(y_test).reshape(-1, 1)
print(y_test_r.shape)
print(y_test_predict.shape)
final_result = np.concatenate((y_test_r, y_test_predict), axis=1)
final_result_df = pd.DataFrame(final_result, columns=["real price", "predict price"])
final_result_df.to_csv('predict.csv')
# In[]
# -*- coding: utf-8 -*-
# In[]
# In[]
#文本数据加载
# Read the training text; the context manager closes the file handle as soon
# as the content has been read.
# NOTE(review): the platform default encoding is used - confirm the
# encoding of 'LSTM_data' and pass it explicitly if it is known.
with open('LSTM_data') as text_file:
    data = text_file.read()
# Remove line breaks so the text is one continuous character stream.
data = data.replace('\n', '').replace('\r', '')
print(data)
# In[]
# Unique characters of the text. Sorting gives a fixed order, so the
# character <-> integer mappings below are the same on every run (the
# iteration order of a set of strings can change between interpreter runs).
letters = sorted(set(data))
print(letters)
features = len(letters)
print(features)
# In[]
# Lookup tables
# int to char
int_to_char = dict(enumerate(letters))
print(int_to_char)
# char to int
char_to_int = {char: index for index, char in enumerate(letters)}
print(char_to_int)
# In[]
# Number of characters per input window.
time_step = 30
import numpy as np
from keras.utils import to_categorical
#滑动窗口提取数据
def extract_data(data, slide):
    """Split a sequence into sliding windows and the element after each one.

    Args:
        data: Indexable, sliceable sequence (for example a string).
        slide: Window length.

    Returns:
        A tuple ``(x, y)`` where ``x[i]`` is ``list(data[i:i + slide])`` and
        ``y[i]`` is ``data[i + slide]``. Both lists are empty when
        ``len(data) <= slide``.
    """
    window_count = max(len(data) - slide, 0)
    x = [list(data[i:i + slide]) for i in range(window_count)]
    y = [data[i + slide] for i in range(window_count)]
    return x, y
#字符到数字的批量转化
def char_to_int_Data(x, y, char_to_int):
    """Convert character windows and their targets to integer codes.

    Args:
        x: List of windows, each an iterable of characters.
        y: List of targets, one per window. Each target is iterated
            character by character, so a single character becomes a
            one-element list of codes.
        char_to_int: Mapping from character to integer code.

    Returns:
        A tuple ``(x_to_int, y_to_int)`` of lists of lists of integer codes,
        one inner list per window / target.

    Raises:
        KeyError: If a character is missing from ``char_to_int``.
        IndexError: If ``y`` has fewer entries than ``x``.
    """
    if len(y) < len(x):
        raise IndexError("y must provide one target per window in x")
    x_to_int = []
    y_to_int = []
    for window, target in zip(x, y):
        x_to_int.append([char_to_int[char] for char in window])
        y_to_int.append([char_to_int[char] for char in target])
    return x_to_int, y_to_int
#实现输入字符文章的批量处理,输入整个字符、滑动窗口大小、转化字典
def data_preprocessing(data, slide, num_letters, char_to_int):
    """Turn a text into one-hot encoded sliding windows plus integer targets.

    Args:
        data: Text to process.
        slide: Window length (characters per input sample).
        num_letters: Width of the one-hot encoding; every code in
            ``char_to_int`` must be smaller than this value.
        char_to_int: Mapping from character to integer code.

    Returns:
        A tuple ``(new, Output)``. ``new`` is an integer array of shape
        ``(number_of_windows, slide, num_letters)`` containing the one-hot
        encoded windows, and ``Output`` is a list with the integer code of
        the character that follows each window.
    """
    windows, targets = extract_data(data, slide)
    window_codes, target_codes = char_to_int_Data(windows, targets, char_to_int)
    Output = list(np.array(target_codes).flatten())
    # dtype=int keeps the array usable as an index even when there are no
    # windows at all.
    codes = np.array(window_codes, dtype=int).reshape(len(window_codes), slide)
    # Row k of the identity matrix is the one-hot vector for code k, so
    # indexing with the whole code array encodes every window in one step.
    new = np.eye(num_letters, dtype=int)[codes]
    return new, Output
# In[]
#完成字符串预处理
# Preprocess the whole text.
#   data        - string to process
#   time_step   - length of each input sequence
#   features    - number of features per sample
#   char_to_int - character -> integer dictionary
# Results:
#   X - array in one-hot format
#   y - list of integer codes of the target characters
X, y = data_preprocessing(
    data,
    time_step,
    features,
    char_to_int,
)
# In[]
# Check the dimensions
print(X.shape)
print(len(y))
# Split into training and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)
print(X.shape, X_train.shape, X_test.shape)
# In[]
# Convert the training targets to one-hot format
y_train_c = to_categorical(y_train, num_classes=features)
print(y_train_c)
print(y_train_c.shape)
# In[]
#建立LSTM模型
# Build the LSTM model
from keras.models import Sequential
from keras.layers import Dense, LSTM

model = Sequential([
    # One LSTM layer reading (time steps, features) windows.
    LSTM(
        units=30,
        input_shape=(X_train.shape[1], X_train.shape[2]),
        activation='relu',
    ),
    # One softmax output unit per character class.
    Dense(units=features, activation='softmax'),
])
model.summary()
# In[]
# Configuration and training
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(X_train, y_train_c, batch_size=1000, epochs=50)
# In[]
# Second training run: continues from the weights reached above for another
# 50 epochs.
model.fit(X_train, y_train_c, batch_size=1000, epochs=50)
# In[]
#新字符的预测
# Predicted class index for every training window: the position of the
# largest softmax output. This works on Keras versions with and without
# Sequential.predict_classes.
y_train_predict = np.argmax(model.predict(X_train), axis=-1)
print(y_train_predict)
# In[]
# Prediction for a new string
# NOTE(review): data_preprocessing raises KeyError for characters that are
# missing from char_to_int - confirm the training text covers this string.
new_letters='Artficial intelligence(AI),sometimes called machine intelligence,is intelligence demonstrated by machines'
X_new, y_new = data_preprocessing(new_letters, time_step, features, char_to_int)
y_new_predict = np.argmax(model.predict(X_new), axis=-1)
# In[]
# Convert the predicted class indices back to characters
y_train_predict_char = [int_to_char[i] for i in y_train_predict]
# In[]
# Training accuracy
from sklearn.metrics import accuracy_score
accuracy_train = accuracy_score(y_train, y_train_predict)
print(accuracy_train)
#In[]
#测试数据的预测
# Predicted class index for every test window (position of the largest
# softmax output).
y_test_predict = np.argmax(model.predict(X_test), axis=-1)
# Characters for the *test* predictions.
y_test_predict_char = [int_to_char[i] for i in y_test_predict]
# In[]
# Test accuracy
accuracy_test = accuracy_score(y_test, y_test_predict)
print(accuracy_test)
# In[]
#新字符串的预测
# NOTE(review): data_preprocessing raises KeyError for characters that are
# missing from char_to_int - confirm the training text covers this string.
new_letters='Artificial intelligence (AI), sometimes called machine intelligence, is intelligence demonstrated by machines'
X_new, y_new = data_preprocessing(new_letters, time_step, features, char_to_int)
print(X_new.shape, len(y_new))
# In[]
# Predicted class index per window (position of the largest softmax output).
y_new_predict = np.argmax(model.predict(X_new), axis=-1)
print(y_new_predict)
# In[]
y_new_predict_char = [int_to_char[i] for i in y_new_predict]
print(y_new_predict_char)
# In[]
# Window i covers new_letters[i:i + time_step] and there is exactly one
# prediction per window, so iterate over all predictions.
for i, predicted_char in enumerate(y_new_predict_char):
    print(new_letters[i:i + time_step], '--predict new letters is >>>', predicted_char)