机器学习 线性回归案例实战(python)

一、分析问题

	尝试使用线性回归模型,根据已有的历史数据来预测股票的收盘价。

二、获取数据

https://pan.baidu.com/s/1gVb99g7LXyicrv2RPbHQCQ
提取码: uq76

三、实战操作

1、导入数据
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from matplotlib import  style
import datetime as dt1
from datetime import datetime as dt
# Load the price table from a local CSV file.
df = pd.read_csv(r'E:/桌面/个人文件/数据挖掘/prices.csv',engine='python')
# Show the container type and the full table.
# NOTE(review): display() is not imported anywhere in this file; presumably
# this is run inside a Jupyter/IPython session where it is a builtin.
print(type(df))
display(df)
# Keep only the rows whose 'symbol' column equals 'CSCO'.
# .copy() makes the subset an independent frame, so the later column
# assignments on df (e.g. df['label'] = ...) do not raise
# SettingWithCopyWarning or act on a temporary slice of the full table.
df = df.loc[df['symbol']=='CSCO'].copy()
display(df)
2、数据处理
# scikit-learn is the machine-learning library used for the rest of the script.
# From sklearn: preprocessing (feature scaling), svm (imported but not used in the code shown here), model_selection (train/test splitting).
# From sklearn.linear_model: LinearRegression, the estimator fitted below.
import sklearn
from sklearn import  preprocessing,svm,model_selection
from sklearn.linear_model import  LinearRegression
import math
# Print the installed scikit-learn version for reference.
print(sklearn.__version__)
# Column whose future value the model is trained to predict: the close price.
forecast_col = 'close'
# Replace every NA/NaN in the frame with 0, modifying df in place.
df.fillna(value=0, inplace=True)
n_rows = len(df)
print(n_rows)  # 1762 in the original run
# Forecast horizon: 1% of the row count, rounded up.
forecast_out = int(math.ceil(0.01 * n_rows))
print(f'预测长度: {forecast_out}')  # 18 in the original run
# Label for each row = the close price forecast_out rows later.
# The last forecast_out rows have no future value and therefore get NaN.
df['label'] = df[forecast_col].shift(periods=-forecast_out)
df.head()
# Feature matrix: every column except the label, the ticker symbol and the
# trade date.
feature_frame = df.drop(columns=['label', 'symbol', 'date'])
X = np.array(feature_frame)
print('pre', X)
# Original run printed:
# pre [[2.4110001e+01 2.4690001e+01 2.4010000e+01 2.4840000e+01 5.9853700e+07]
#  [2.4600000e+01 2.4580000e+01 2.4379999e+01 2.4730000e+01 4.5124500e+07]
#  [2.4540001e+01 2.4420000e+01 2.4340000e+01 2.4740000e+01 3.5715700e+07]
#  ...
#  [3.0700001e+01 3.0420000e+01 3.0350000e+01 3.0770000e+01 1.2022200e+07]
#  [3.0370001e+01 3.0459999e+01 3.0330000e+01 3.0549999e+01 1.0995600e+07]
#  [3.0559999e+01 3.0219999e+01 3.0129999e+01 3.0600000e+01 2.0190000e+07]]
print(X.shape)  # (1762, 5)
print('----------------')
# Standardise each feature column to zero mean and unit variance.
X = preprocessing.scale(X)
print('pro', X)
# Original run printed:
# pro [[ 0.20065804  0.33166515  0.22464105  0.31929195  0.57852431]
#  [ 0.31349753  0.30639196  0.31004432  0.29407151  0.10161558]
#  [ 0.29968065  0.26963129  0.30081174  0.29636427 -0.20302682]
#  ...
#  [ 1.71823707  1.64815648  1.6880416   1.6789031  -0.97018573]
#  [ 1.64224298  1.65734642  1.68342519  1.62846199 -1.00342545]
#  [ 1.68599669  1.60220541  1.6372609   1.63992605 -0.70572499]]
print(X.shape)  # (1762, 5)
# The most recent forecast_out rows have no label; set them aside as the
# inputs to forecast from later.
X_lately = X[-forecast_out:]
print(X_lately)
# Original run printed an 18-row array, beginning and ending with:
# [[ 1.42347187  1.39772441  1.43183109  1.40835653 -0.47382726]
#  [ 1.43728898  1.54017224  1.45029671  1.5115313  -0.38226099]
#  [ 1.58236884  1.54017224  1.58878888  1.57802133 -0.66752165]
#  ...
#  [ 1.71823707  1.64815648  1.6880416   1.6789031  -0.97018573]
#  [ 1.64224298  1.65734642  1.68342519  1.62846199 -1.00342545]
#  [ 1.68599669  1.60220541  1.6372609   1.63992605 -0.70572499]]
print(X_lately.shape)  # (18, 5)
# Everything before those rows is the labelled training/testing data.
X = X[:-forecast_out]
print(X)
# Original run printed:
# [[ 0.20065804  0.33166515  0.22464105  0.31929195  0.57852431]
#  [ 0.31349753  0.30639196  0.31004432  0.29407151  0.10161558]
#  [ 0.29968065  0.26963129  0.30081174  0.29636427 -0.20302682]
#  ...
#  [ 1.52019163  1.42529514  1.46876234  1.46796849 -0.51772921]
#  [ 1.39814027  1.37934407  1.40413289  1.401478   -0.48297741]
#  [ 1.40274621  1.44367548  1.44337234  1.4106493  -0.68384363]]
print(X.shape)  # (1744, 5)

# Fill the NaN labels created by the shift with 0; those rows are sliced off
# just below, so the zeros never reach the model.
df.fillna(value=0, inplace=True)
# Targets aligned with X: all labels except the trailing forecast_out rows.
y = np.array(df['label'])[:-forecast_out]
print(X.shape)  # (1744, 5)
print(y.shape)  # (1744,)
3、模型构建
from sklearn.model_selection import train_test_split
# Randomly hold out 20% of the labelled samples as a test set.
# random_state pins the shuffle so the split, and therefore the score printed
# below, is the same on every run; without it each run reports a different
# number.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
# Shapes recorded in the original run:
# (1395, 5)
# (349, 5)
# (1395,)
# (349,)
# Ordinary least-squares linear regression; n_jobs=-1 asks scikit-learn to
# use all available processors.
clf = LinearRegression(n_jobs=-1)
# Fit the model on the training portion.
clf.fit(X_train, y_train)
# Evaluate on the held-out portion; for regressors score() returns R^2.
confidence = clf.score(X_test, y_test)
print(confidence)  # the original (unseeded) run printed 0.881276125509551
4、模型预测
# Predict a close price for each of the held-back most recent rows.
forecast_set = clf.predict(X_lately)
print(forecast_set)
# Original run printed:
# [29.01489553 29.81355237 29.72397749 29.63729188 29.61532773 30.19002157
#  30.44898265 30.17253188 30.12247329 30.31882204 30.30561733 30.06491368
#  29.96730369 30.04939818 30.19005909 30.23193704 29.98774384 30.05285623]
# New column for the predictions; existing rows have no forecast.
df['Forecast'] = np.nan
# Date of the last row, as stored in the 'date' column.
last_raw_date = df['date'].iloc[-1]
print(last_raw_date)  # 2016/12/30 in the original run
display(df)
# Parse the date from the data instead of hard-coding it, so the forecast
# rows stay correct when the input file covers a different period.
last_date = pd.to_datetime(last_raw_date)
print(last_date)
# Step by calendar days with timedelta rather than adding 86400 seconds to a
# local-time epoch value, which drifts by an hour across DST changes.
one_day = dt1.timedelta(days=1)

# Append one row per prediction, labelled with consecutive days after the
# last known date.  Every column is NaN except 'Forecast', which is the last
# column because it was added just above.
# NOTE(review): df appears to still carry the integer index from read_csv,
# so these date-labelled rows produce a mixed index; confirm this is what
# the plotting step expects.
for offset, value in enumerate(forecast_set, start=1):
    next_date = last_date + offset * one_day
    df.loc[next_date] = [np.nan] * (len(df.columns) - 1) + [value]
print(df.tail())
5、显示图像
# Draw the historical close prices and the forecast values on one 20x10 inch
# figure, then show it.
fig, ax = plt.subplots(figsize=(20, 10))
for column in ('close', 'Forecast'):
    df[column].plot(ax=ax)
ax.set_xlabel('Date')
ax.set_ylabel('Price')
plt.show()
6、结果显示

机器学习 线性回归案例实战(python)_第1张图片

你可能感兴趣的:(机器学习的应用,python,线性回归,sklearn)