尝试使用线性回归模型,根据已有的数据来预测股票收盘价。
https://pan.baidu.com/s/1gVb99g7LXyicrv2RPbHQCQ
提取码: uq76
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import datetime as dt1
from datetime import datetime as dt
# Load the raw price table. The path is a machine-specific absolute location;
# engine='python' is kept from the original (presumably needed for the
# non-ASCII path on the author's setup -- TODO confirm).
df = pd.read_csv(r'E:/桌面/个人文件/数据挖掘/prices.csv', engine='python')
# Show what was loaded. `display` is the IPython/Jupyter rich-output helper, so
# this script is expected to run inside a notebook session.
print(type(df))
display(df)
# Keep only the rows whose symbol is CSCO (all of them, not just the last five).
# The explicit .copy() makes the subset an independent frame, so the in-place
# fillna and the column assignments performed later in the script do not act on
# a selection of the full table and do not raise SettingWithCopyWarning.
df = df.loc[df['symbol'] == 'CSCO'].copy()
display(df)
# scikit-learn pieces used by this script:
#   * preprocessing    - feature scaling
#   * model_selection  - train/test splitting
#   * LinearRegression - the regression model that is fitted
# svm is imported as well but is not used in the visible part of the script.
import math

import sklearn
from sklearn import preprocessing
from sklearn import svm
from sklearn import model_selection
from sklearn.linear_model import LinearRegression

print(sklearn.__version__)

# The value to predict is the closing price.
forecast_col = 'close'

# Replace every NA/NaN in the frame with 0, modifying the frame in place.
df.fillna(0, inplace=True)
row_count = len(df)
print(row_count)  # the original run printed 1762

# Forecast horizon: 1% of the available rows, rounded up.
forecast_out = int(math.ceil(row_count * 0.01))
print('预测长度:', forecast_out)  # the original run printed 18

# Label of each row = the close `forecast_out` rows later; the final
# `forecast_out` rows therefore have no label (NaN) at this point.
df['label'] = df[forecast_col].shift(periods=-forecast_out)
df.head()
# Build the feature matrix from every column except the label, the ticker
# symbol and the trading date.
X = np.array(df.drop(columns=['label', 'symbol', 'date']))
print('pre', X)
# Recorded output of the original run (first / last row):
#   [2.4110001e+01 2.4690001e+01 2.4010000e+01 2.4840000e+01 5.9853700e+07]
#   ...
#   [3.0559999e+01 3.0219999e+01 3.0129999e+01 3.0600000e+01 2.0190000e+07]
print(X.shape)  # recorded: (1762, 5)
print('----------------')

# Standardize the input features (mean / variance scaling).
X = preprocessing.scale(X)
print('pro', X)
# Recorded output of the original run (first / last row):
#   [ 0.20065804  0.33166515  0.22464105  0.31929195  0.57852431]
#   ...
#   [ 1.68599669  1.60220541  1.6372609   1.63992605 -0.70572499]
print(X.shape)  # recorded: (1762, 5)

# Split the scaled rows: the final `forecast_out` rows have no label and are
# kept aside as the inputs to forecast from; everything before them is the
# labelled data used for training and testing.
X, X_lately = X[:-forecast_out], X[-forecast_out:]
print(X_lately)
# Recorded: 18 rows, from
#   [ 1.42347187  1.39772441  1.43183109  1.40835653 -0.47382726]
# to
#   [ 1.68599669  1.60220541  1.6372609   1.63992605 -0.70572499]
print(X_lately.shape)  # recorded: (18, 5)
print(X)
# Recorded: first row as above, last row
#   [ 1.40274621  1.44367548  1.44337234  1.4106493  -0.68384363]
print(X.shape)  # recorded: (1744, 5)

# Fill remaining missing values (the unlabeled tail of 'label') with 0.
df.fillna(0, inplace=True)

# Labels aligned with X: drop the final `forecast_out` entries, which had no
# real label.
y = np.array(df['label'])[:-forecast_out]
print(X.shape)  # recorded: (1744, 5)
print(y.shape)  # recorded: (1744,)
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation. The original run recorded
# 1395 training rows and 349 test rows.
# random_state pins the shuffle so the split, the score and the forecast are
# reproducible between runs; without it every execution prints different numbers.
# NOTE(review): the rows are time-ordered and the split still shuffles them, so
# test rows are interleaved with training rows; consider shuffle=False if an
# out-of-time estimate is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(X_train.shape)  # recorded: (1395, 5)
print(X_test.shape)   # recorded: (349, 5)
print(y_train.shape)  # recorded: (1395,)
print(y_test.shape)   # recorded: (349,)

# Linear regression model; n_jobs=-1 asks scikit-learn to parallelize where it can.
clf = LinearRegression(n_jobs=-1)
# Train the model on the training portion.
clf.fit(X_train, y_train)
# Evaluate on the held-out portion; score() returns the model's evaluation
# result on the test set (the original, unseeded run recorded 0.881276125509551).
confidence = clf.score(X_test, y_test)
print(confidence)
# Predict the close for the most recent, unlabeled rows.
forecast_set = clf.predict(X_lately)
print(forecast_set)
# The original, unseeded run recorded 18 values ranging from about 29.01 to 30.45.
# Column that will hold the predicted closes; historical rows stay NaN.
df['Forecast'] = np.nan

# Raw date value of the last historical row (the original run printed 2016/12/30).
print(df['date'].iloc[-1])
display(df)

# Index the frame by parsed trading date. The frame previously carried the
# integer row numbers from the CSV, so appending datetime-labelled forecast rows
# produced a mixed int/datetime index and a meaningless "Date" axis in the plot.
# Each value is parsed on its own so that differing date spellings in the column
# are tolerated.
df.index = pd.DatetimeIndex(df['date'].map(pd.Timestamp).to_numpy())

# Start from the last date actually present in the data instead of a hard-coded
# literal. NOTE(review): this assumes the rows are in chronological order, as the
# original did when it read the last row.
last_date = df.index[-1]
print(last_date)

# Step in whole calendar days with Timedelta arithmetic; this avoids the round
# trip through local-time Unix timestamps and the duplicated 86400 literal.
one_day = pd.Timedelta(days=1)
next_date = last_date + one_day

# Every appended row is NaN in all columns except the last one ('Forecast',
# which was added just above and is therefore the final column).
nan_padding = [np.nan] * (len(df.columns) - 1)
for predicted_close in forecast_set:
    df.loc[next_date] = nan_padding + [predicted_close]
    next_date += one_day
print(df.tail())

# Plot the historical close and the forecast on the same date axis.
plt.figure(figsize=(20, 10))
df['close'].plot()
df['Forecast'].plot()
plt.legend()
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()