----是使用Python实现的一些常见数据挖掘建模;
----对相关的必要的基础代码进行收集与改写后,可作为之后相关的数据工作的baseline;
----代码在Python 2.x上跑通;从事Python相关工作的程序员都了解:Python 2.x和3.x之间的语法差异几乎不会对开发造成影响;
----2.x与3.x的代码迁移工作量较小,注意几个标点符号等书写细节便可以轻松地在两种版本之间转换;两者在函数调用方法以及代码书写上几乎没有差异,因此大家只需简单修改print处的括号等偶尔出现的版本差异即可完成代码转换–>使用2.x的代码作为baseline完全不会对熟悉3.x的工作者造成影响;
----用于提升之后进行相关的数据挖掘的工作效率;
----希望对大家有所帮助;
# ---- Creating arrays ----
# `np` is used throughout this section, so import it here; the file's other
# `import numpy as np` lines only appear much further down.
import numpy as np

arr1 = np.array([2, 3, 4])  # build an array from a list
# result:
# [2 3 4]
arr2 = np.array([(1.3, 9, 2.0), (7, 6, 1)])  # build a 2-D array from tuples
# result:
# [[ 1.3 9. 2. ]
# [ 7. 6. 1. ]]
arr3 = np.zeros((2, 3))  # all-zero matrix whose shape is the tuple (2, 3)
# result:
# [[ 0. 0. 0.]
# [ 0. 0. 0.]]
arr4 = np.identity(3)  # 3x3 identity matrix
# result:
# [[ 1. 0. 0.]
# [ 0. 1. 0.]
# [ 0. 0. 1.]]
arr5 = np.random.random(size=(2, 3))  # random matrix, every element in [0, 1)
# result:
# [[ 0.31654004 0.87056375 0.29050563]
# [ 0.55267505 0.59191276 0.20174988]]
arr6 = np.arange(5, 20, 3)  # evenly spaced values: start, stop, step; start included, stop excluded
# result: [ 5 8 11 14 17]
# np.linspace(start, stop, num): the third argument is the NUMBER of samples,
# not a step size; both start and stop are included.
arr7 = np.linspace(0, 2, 9)
# result: [ 0. 0.25 0.5 0.75 1. 1.25 1.5 1.75 2. ]
# Inspect array attributes.
# print(x) with a single argument behaves the same on Python 2 and Python 3.
print(arr2.shape)  # shape of the matrix
# result: (2, 3)
print(arr2.ndim)  # number of dimensions (axes)
# result: 2
print(arr2.size)  # total number of elements
# result: 6
print(arr2.dtype.name)  # element data type
# result: float64
print(type(arr2))  # type of the array object itself
# result: <class 'numpy.ndarray'>   (<type 'numpy.ndarray'> on Python 2)
# 通过索引和切片访问数组元素
def f(x, y):
    """Return ``10 * x + y``.

    Passed to ``np.fromfunction`` below, where ``x`` and ``y`` are the row and
    column index arrays, so the element at (row, col) becomes ``10*row + col``.
    """
    return 10 * x + y
arr8 = np.fromfunction(f,(4,3),dtype = int)
print arr8
# result:
# [[ 0 1 2]
# [10 11 12]
# [20 21 22]
# [30 31 32]]
print arr8[1,2] #返回矩阵第1行,第2列的元素(注意下标从0开始)
# result: 12
print arr8[0:2,:] #切片,返回矩阵前2行
# result:
# [[ 0 1 2]
# [10 11 12]]
print arr8[:,1] #切片,返回矩阵第1列
# result: [ 1 11 21 31]
print arr8[-1] #切片,返回矩阵最后一行
# result: [30 31 32]
# Access array elements by iteration.
# Iterating a 2-D array yields one row per step.
for row in arr8:
    print(row)
# result:
# [0 1 2]
# [10 11 12]
# [20 21 22]
# [30 31 32]
# .flat iterates over every single element in row-major order.
for element in arr8.flat:
    print(element)
# prints every element of the matrix, one per line
print '-'*70
print '''数组的运算'''
arr9 = np.array([[2,1],[1,2]])
arr10 = np.array([[1,2],[3,4]])
print arr9 - arr10
# result:
# [[ 1 -1]
# [-2 -2]]
print arr9**2
# result:
# [[4 1]
# [1 4]]
print 3*arr10
# result:
# [[ 3 6]
# [ 9 12]]
print arr9*arr10 #普通乘法
# result:
# [[2 2]
# [3 8]]
print np.dot(arr9,arr10) #矩阵乘法
# result:
# [[ 5 8]
# [ 7 10]]
print arr10.T #转置
# result:
# [[1 3]
# [2 4]]
print np.linalg.inv(arr10) #返回逆矩阵
# result:
# [[-2. 1. ]
# [ 1.5 -0.5]]
print arr10.sum() #数组元素求和
# result: 10
print arr10.max() #返回数组最大元素
# result: 4
print arr10.cumsum(axis = 1) #沿行累计总和
# result:
# [[1 3]
# [3 7]]
print '-'*70
print '''NumPy通用函数'''
print np.exp(arr9) #指数函数
# result:
# [[ 7.3890561 2.71828183]
# [ 2.71828183 7.3890561 ]]
print np.sin(arr9) #正弦函数(弧度制)
# result:
# [[ 0.90929743 0.84147098]
# [ 0.84147098 0.90929743]]
print np.sqrt(arr9) #开方函数
# result:
# [[ 1.41421356 1. ]
# [ 1. 1.41421356]]
print np.add(arr9,arr10) #和arr9+arr10效果一样
# result:
# [[3 3]
# [4 6]]
print '-'*70
# 合并
arr11 = np.vstack((arr9,arr10)) #纵向合并数组,由于与堆栈类似,故命名为vstack
print arr11
# result:
# [[2 1]
# [1 2]
# [1 2]
# [3 4]]
arr12 = np.hstack((arr9,arr10)) #横向合并数组
print arr12
# result:
# [[2 1 1 2]
# [1 2 3 4]]
# 分割
print np.hsplit(arr12,2) # 将数组横向分为2部分
# result:
# [array([[2, 1],
# [1, 2]]), array([[1, 2],
# [3, 4]])]
print np.vsplit(arr11,2) # 数组纵向分为2部分
# result:
# [array([[2, 1],
# [1, 2]]), array([[1, 2],
# [3, 4]])]
# -*- coding:utf-8 -*-
import pandas as pd # 为pandas取一个别名pd
data = {'id': ['Jack', 'Sarah', 'Mike'],
'age': [18, 35, 20],
'cash': [10.53, 500.7, 13.6]}
df = pd.DataFrame(data) # 调用构造函数并将结果赋值给df
print df
# result:
# age cash id
# 0 18 10.53 Jack
# 1 35 500.70 Sarah
# 2 20 13.60 Mike
df2 = pd.DataFrame(data, columns=['id', 'age', 'cash'],index=['one', 'two', 'three'])
print df2
# result:
# id age cash
# one Jack 18 10.53
# two Sarah 35 500.70
# three Mike 20 13.60
print df2['id']
# result:
# one Jack
# two Sarah
# three Mike
# Name: id, dtype: object
s = pd.Series({'a': 4, 'b': 9, 'c': 16}, name='number')
print s
# result:
# a 4
# b 9
# c 16
# Name: number, dtype: int64
print s[0]
# result: 4
print s[:3]
# result:
# a 4
# b 9
# c 16
# Name: number, dtype: int64
print s['a']
# result: 4
s['d'] = 25 # 如果系列中本身没有这个键值,则会新增一行
print s
# result:
# a 4
# b 9
# c 16
# d 25
# Name: number, dtype: int64
import numpy as np
print np.sqrt(s)
# result:
# a 2.0
# b 3.0
# c 4.0
# d 5.0
# Name: number, dtype: float64
print s*s
# result:
# a 16
# b 81
# c 256
# d 625
# Name: number, dtype: int64
print df['id'] # 按列名访问(call-by-column)
# result:
# 0 Jack
# 1 Sarah
# 2 Mike
# Name: id, dtype: object
df['rich'] = df['cash'] > 200.0
print df
# result:
# age cash id rich
# 0 18 10.53 Jack False
# 1 35 500.70 Sarah True
# 2 20 13.60 Mike False
del df['rich']
print df
# result:
# age cash id
# 0 18 10.53 Jack
# 1 35 500.70 Sarah
# 2 20 13.60 Mike
# -*- coding:utf-8 -*-
# poly1d is a NumPy class; the `scipy.poly1d` re-export used originally was a
# deprecated alias that newer SciPy releases no longer provide.
from numpy import poly1d
p = poly1d([3, 4, 5])  # p(x) = 3x^2 + 4x + 5 (coefficients, highest power first)
print(p)
# result:
# 2
# 3 x + 4 x + 5
print(p*p)
# result:
# 4 3 2
# 9 x + 24 x + 46 x + 40 x + 25
print(p.integ(k=6))  # indefinite integral of p(x) with integration constant 6
# result:
# 3 2
# 1 x + 2 x + 5 x + 6
print(p.deriv())  # first derivative of p(x)
# result:
# 6 x + 4
# Evaluate p(x) at each value. The original left this as a bare expression,
# which shows nothing when run as a script, so print it.
print(p([4, 5]))
# result:
# [ 69 100]
# -*- coding:utf-8 -*-
import numpy as np
def addsubtract(a, b):
    """Return ``a - b`` when ``a > b``, otherwise ``a + b``.

    As written this is a scalar function: ``if a > b`` needs a single truth
    value, so only comparable numbers are accepted. The script wraps it with
    ``np.vectorize`` right after this definition to apply it element-wise.
    """
    if a > b:
        return a - b
    return a + b
vec_addsubtract = np.vectorize(addsubtract)
print vec_addsubtract([0, 3, 6, 9], [1, 3, 5, 7])
# result:
# [1 6 1 2]
from sklearn import datasets
# 数据集类似字典对象,包括了所有的数据和关于数据的元数据(metadata)。
# 数据被存储在.data成员内,是一个n_samples*n_features的数组。
# 在有监督问题的情形下,一个或多个因变量(response variables)被储存在.target成员中
digits = datasets.load_digits()
# 例如在digits数据集中,digits.data是可以用来分类数字样本的特征
print digits.data
# result:
# [[ 0. 0. 5. ..., 0. 0. 0.]
# [ 0. 0. 0. ..., 10. 0. 0.]
# [ 0. 0. 0. ..., 16. 9. 0.]
# ...,
# [ 0. 0. 1. ..., 6. 0. 0.]
# [ 0. 0. 2. ..., 12. 0. 0.]
# [ 0. 0. 10. ..., 12. 1. 0.]]
# digits.target给出了digits数据集的目标变量,即每个数字图案对应的我们想预测的真实数字
print digits.target
# result:
# [0 1 2 ..., 8 9 8]
print('''训练和预测''')
from sklearn import svm
# Choose the model hyper-parameters.
clf = svm.SVC(gamma=0.0001, C=100)
# The estimator is called clf. It learns through fit(), which receives the
# training samples and their targets. Every sample except the last one is
# used for training.
clf.fit(digits.data[:-1], digits.target[:-1])
# Predict the held-out last sample. predict() expects a 2-D
# (n_samples, n_features) array, so slice with [-1:] to keep the row
# dimension; indexing with [-1] would pass a 1-D vector.
print(clf.predict(digits.data[-1:]))
# result: [8]
# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
x = np.linspace(0, 10, 1000)
y = np.sin(x)
z = np.cos(x**2)
plt.figure(figsize=(8,4))
plt.plot(x,y,label="$sin(x)$",color="red",linewidth=2)
plt.plot(x,z,"b--",label="$cos(x^2)$")
plt.xlabel("Time(s)")
plt.ylabel("Volt")
plt.title("PyPlot First Example")
plt.ylim(-1.2,1.2)
plt.legend()
plt.show()
# -*- coding:utf-8 -*-
import matplotlib.pylab as plt
import numpy as np
# 第一部分
plt.subplot(2,1,1) # 参数依次为:行,列,第几项
# 第二部分
plt.subplot(2,2,3)
# 第三部分
plt.subplot(2,2,4)
plt.show()
# -*- coding:utf-8 -*-
import matplotlib.pylab as plt
import numpy as np
# 第一部分
plt.subplot(2,1,1) # 参数依次为:行,列,第几项
n = 12
X = np.arange(n)
Y1 = (1-X/float(n)) * np.random.uniform(0.5,1.0,n)
Y2 = (1-X/float(n)) * np.random.uniform(0.5,1.0,n)
# 利用plt.bar(x, y)绘制柱状图,并指定柱状图颜色,柱子边框颜色
plt.bar(X, +Y1, facecolor='#9999ff', edgecolor='white')
plt.bar(X, -Y2, facecolor='#ff9999', edgecolor='white')
for x, y in zip(X, Y1):
    # plt.text() draws the given string at the given data coordinates; here
    # the bar height (two decimals) is written slightly above each upper bar.
    # NOTE(review): the +0.4 horizontal offset looks tuned for left-aligned
    # bars - confirm it still centres the label with the matplotlib in use.
    plt.text(x+0.4, y+0.05, '%.2f' % y, ha='center', va='bottom')
# 利用plt.ylim(y1, y2)限制图形打印时对应的纵坐标范围
plt.ylim(-1.25,+1.25)
# 第二部分
plt.subplot(2,2,3)
n = 20
Z = np.random.uniform(0,1,n)
plt.pie(Z)
# 第三部分
plt.subplot(2,2,4)
X = np.linspace(-np.pi, np.pi, 256,endpoint=True)
Y_C, Y_S = np.cos(X), np.sin(X)
plt.plot(X, Y_C, color="blue", linewidth=2.5, linestyle="-")
plt.plot(X, Y_S, color="red", linewidth=2.5, linestyle="-")
plt.xlim(X.min()*1.1, X.max()*1.1)
plt.xticks([-np.pi, -np.pi/2, 0, np.pi/2, np.pi],
[r'$-\pi$', r'$-\pi/2$', r'$0$', r'$+\pi/2$', r'$+\pi$'])
plt.ylim(Y_C.min()*1.1, Y_C.max()*1.1)
plt.yticks([-1, 0, +1],
[r'$-1$', r'$0$', r'$+1$'])
plt.show()
# -*- coding:utf-8 -*-
from bokeh.plotting import figure, output_file, show
x = [1, 2, 3, 4, 5]
y = [6, 7, 2, 4, 5]
# 输出为静态文件
output_file("../tmp/lines.html", title="line plot example")
# 创建一个figure对象,附带标题和坐标轴标记
p = figure(title="simple line example", x_axis_label='x', y_axis_label='y')
# 添加一条线,设置图例
p.line(x, y, legend="Line A.", line_width=2)
show(p)
# -*- coding:utf-8 -*-
from bokeh.plotting import figure, output_file, show
x = [1, 2, 3, 4, 5]
y = [6, 7, 2, 4, 5]
# 输出为静态文件
output_file("../tmp/lines.html", title="line plot example")
# 创建一个figure对象,附带标题和坐标轴标记
p = figure(title="simple line example", x_axis_label='x', y_axis_label='y')
# 添加一条线,设置图例
p.line(x, y, legend="Line A.", line_width=2)
show(p)
# -*- coding:utf8 -*-
import pandas as pd
from sklearn.linear_model import LogisticRegression
try:
    # Dropped from newer scikit-learn releases. Nothing in this script uses
    # it, so a missing class must not stop the script from running.
    from sklearn.linear_model import RandomizedLogisticRegression
except ImportError:
    RandomizedLogisticRegression = None
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn that still ships cross_validation
    from sklearn.cross_validation import train_test_split
# Load the data and take a look at it.
data = pd.read_csv('../data/LogisticRegression.csv', encoding='utf-8')
# print(data.head(5))  # first five rows of the data frame
# One-hot encode the categorical 'rank' column (first level dropped).
data_dum = pd.get_dummies(data, prefix='rank', columns=['rank'], drop_first=True)
print(data_dum.tail(5))  # last five rows of the data frame
# result:
# admit gre gpa rank_2 rank_3 rank_4
# 395 0 620 4.00 1.0 0.0 0.0
# 396 0 560 3.04 0.0 1.0 0.0
# 397 0 460 2.63 1.0 0.0 0.0
# 398 0 700 3.65 1.0 0.0 0.0
# 399 0 600 3.89 0.0 1.0 0.0
# Split into training and test sets: column 0 is the target, the remaining
# columns are the features. `.ix` no longer exists in pandas; these slices are
# purely positional, so `.iloc` is the direct replacement.
X_train, X_test, y_train, y_test = train_test_split(
    data_dum.iloc[:, 1:], data_dum.iloc[:, 0], test_size=.1, random_state=520)
lr = LogisticRegression()  # build the LR model
lr.fit(X_train, y_train)  # train it on the prepared data
print('逻辑回归的准确率为:{0:.2f}%'.format(lr.score(X_test, y_test) *100))
# -*- coding:utf-8 -*-
# 使用ID3算法进行分类
import pandas as pd
from sklearn.tree import DecisionTreeClassifier as DTC, export_graphviz
data = pd.read_csv('../data/titanic_data.csv', encoding='utf-8')
data.drop(['PassengerId'], axis=1, inplace=True) # 舍弃ID列,不适合作为特征
# 数据是类别标签,将其转换为数,用1表示男,0表示女。
data.loc[data['Sex'] == 'male', 'Sex'] = 1
data.loc[data['Sex'] == 'female', 'Sex'] = 0
data.fillna(int(data.Age.mean()), inplace=True)
print data.head(5) # 查看数据
X = data.iloc[:, 1:3] # 为便于展示,未考虑年龄(最后一列)
y = data.iloc[:, 0]
dtc = DTC(criterion='entropy') # 初始化决策树对象,基于信息熵
dtc.fit(X, y) # 训练模型
print '输出准确率:', dtc.score(X,y)
# Visualise the decision tree. The export is a .dot file; Graphviz must be
# installed to convert it to .pdf or .png.
with open('../tmp/tree.dot', 'w') as f:
    # export_graphviz writes into the open handle. Its return value is not
    # needed; the original assigned it back to `f`, which replaced the file
    # handle name inside its own `with` block.
    export_graphviz(dtc, feature_names=X.columns, out_file=f)
from sklearn import ensemble
from sklearn import tree
from sklearn import datasets
from sklearn.model_selection import train_test_split
# Load the wine dataset shipped with scikit-learn and hold out 30% for testing.
wine = datasets.load_wine()
X_data = wine.data
y_data = wine.target
X_train,X_test,y_train,y_test = train_test_split(X_data,y_data,test_size = 0.3)
# Fit a single decision tree and a random forest on the same training split.
clf = tree.DecisionTreeClassifier(criterion='gini')
rfc = ensemble.RandomForestClassifier(random_state=10) # random_state seeds the forest's randomness; the number of trees is n_estimators, left at its default here
clf = clf.fit(X_train,y_train)
rfc = rfc.fit(X_train,y_train)
### Repeated cross-validation
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
rfc_l = []  # mean 10-fold CV score of the random forest, one entry per repetition
clf_l = []  # mean 10-fold CV score of the single decision tree, one entry per repetition
for _ in range(10):
    # Use separate names for the throw-away estimators: this loop never calls
    # fit() on them itself, so reusing `rfc`/`clf` would replace the fitted
    # models above with unfitted ones.
    rfc_cv = ensemble.RandomForestClassifier(n_estimators=25)
    rfc_l.append(cross_val_score(rfc_cv, X_train, y_train, cv=10).mean())
    clf_cv = tree.DecisionTreeClassifier(criterion='entropy')
    clf_l.append(cross_val_score(clf_cv, X_train, y_train, cv=10).mean())
plt.plot(range(1, 11), rfc_l, label='RandomForest')
plt.plot(range(1, 11), clf_l, label="DecisionTree")
plt.legend()
plt.show()
# The statements below were bare notebook expressions followed by pasted
# `>> ...` output, which is not valid Python. The expressions are now printed
# and the pasted output is kept as comments (shown in notebook repr form).
# All of them need `rfc` to be a fitted forest at this point.

# feature_importances_ gives each feature's share of the total importance.
print(rfc.feature_importances_)
# sample output:
# array([0.22856109, 0.03452478, 0.01369924, 0.00668104, 0.01473769,
#        0.09992003, 0.08866531, 0.00509946, 0.02717142, 0.14646766,
#        0.07152866, 0.13577966, 0.12716396])

# apply() returns, for every test sample, the index of the leaf it ends up in.
# NOTE(review): the original called clf.apply (the single tree) under a
# comment copied from the block above, but the pasted output is 2-D with many
# columns, i.e. one column per tree of a forest, so rfc.apply is used here.
print(rfc.apply(X_test))
# sample output:
# array([[ 9, 10, 2, ..., 6, 17, 10],
#        [14, 18, 10, ..., 10, 23, 16],
#        [14, 18, 10, ..., 10, 23, 16],
#        ...,
#        [ 5, 14, 7, ..., 9, 24, 5],
#        [ 9, 16, 5, ..., 6, 16, 10],
#        [14, 18, 10, ..., 10, 23, 16]], dtype=int64)

# predict() returns the predicted label of every test sample; compare it with
# the true labels in y_test.
print(rfc.predict(X_test))
# NOTE(review): the output originally pasted here was a duplicate of the
# feature_importances_ array above, not a prediction. Re-run to capture the
# real output (one class label per test sample, same form as y_test below).

print(y_test)
# sample output:
# array([2, 0, 0, 1, 1, 2, 1, 2, 2, 1, 2, 2, 0, 1, 1, 2, 0, 2, 1, 0, 2, 1,
#        1, 0, 1, 0, 0, 2, 2, 0, 0, 1, 0, 2, 0, 0, 2, 1, 1, 1, 1, 2, 0, 1,
#        0, 1, 1, 0, 2, 1, 1, 1, 2, 0])
# BP神经网络Python实现
import numpy as np
from numpy import random
import math
import copy
import sklearn.datasets
from sklearn.preprocessing import scale
import matplotlib.pyplot as plt
# 获取数据并分为训练集与测试集
trainingSet, trainingLabels = sklearn.datasets.make_moons(400, noise=0.20)
plt.scatter(trainingSet[trainingLabels==1][:,0], trainingSet[trainingLabels==1][:,1], s=40, c='r', marker='x',cmap=plt.cm.Spectral)
plt.scatter(trainingSet[trainingLabels==0][:,0], trainingSet[trainingLabels==0][:,1], s=40, c='y', marker='+',cmap=plt.cm.Spectral)
plt.show()
testSet = trainingSet[320:]
testLabels = trainingLabels[320:]
trainingSet = trainingSet[:320]
trainingLabels = trainingLabels[:320]
# 设置网络参数
layer =[2,3,1] # 设置层数和节点数
Lambda = 0.005 # 正则化系数
alpha = 0.2 # 学习速率
num_passes = 10000 # 迭代次数
m = len(trainingSet) # 样本数量
# Build the network.
# Each layer's parameters are kept in plain Python lists, so the number of
# layers and the number of nodes per layer can be chosen freely via `layer`.
b = []  # biases: len(layer)-1 entries, b[0] is the bias vector of the first hidden layer
W = []  # weights: W[i] maps layer i to layer i+1 (the input layer is layer 0)
for i in range(len(layer) - 1):
    # W[i] has shape (nodes in layer i+1, nodes in layer i).
    W.append(random.random(size=(layer[i+1], layer[i])))
    # b[i] is a 1-D vector with one entry per node of layer i+1.
    b.append(np.array([0.1] * layer[i+1]))
a = [np.array(0)] * (len(W) + 1)  # a[0] = x (the input), a[i+1] = f(z[i]), a[-1] = final output
z = [np.array(0)] * len(W)  # z[i] is the not-yet-activated input of layer i+1
# W deliberately stays a list. The weight matrices have different shapes, so
# np.array(W) would build a ragged array (an error on current NumPy), and the
# rest of the script only ever uses len(W) and W[i].
def costfunction(predict, labels):
    """Return the squared-error cost ``sum((predict - labels) ** 2)``.

    No regularisation term is included. Both arguments are 1-D vectors of
    equal length; NumPy arrays or plain sequences are accepted.
    """
    residual = np.asarray(predict) - np.asarray(labels)
    # np.sum reduces in native code and always yields a scalar.
    return np.sum(residual ** 2)
def error_rate(predict, labels):
    """Return the fraction of misclassified samples for a 0/1 problem.

    ``predict`` holds real-valued network outputs that are rounded to the
    nearest integer before being compared with ``labels``. Both are 1-D
    vectors; only the first ``len(predict)`` labels are compared.

    Unlike the original loop, the caller's ``predict`` array is NOT modified:
    rounding happens on a copy. Exact .5 values round to the nearest even
    integer (np.rint).

    Raises ZeroDivisionError when ``predict`` is empty.
    """
    rounded = np.rint(np.asarray(predict, dtype=float))
    expected = np.asarray(labels)[:len(rounded)]
    errors = int(np.count_nonzero(rounded != expected))
    return errors / float(len(rounded))
def sigmoid(z):
    """Logistic activation ``1 / (1 + exp(-z))``, applied element-wise."""
    return 1.0 / (1.0 + np.exp(-z))
def diff_sigmoid(z):
    """Derivative of the sigmoid: ``sigmoid(z) * (1 - sigmoid(z))``."""
    s = sigmoid(z)  # evaluate once and reuse
    return s * (1 - s)
activation_function = sigmoid # 设置激活函数
diff_activation_function = diff_sigmoid # 设置激活函数的导数
# Train the BP neural network.
a[0] = np.array(trainingSet).T  # transpose: one column per sample, one row per feature
y = np.array(trainingLabels)
for v in range(num_passes):
    # Forward pass
    for i in range(len(W)):
        # b[i] has one entry per node of layer i+1. The added axis broadcasts
        # it over all sample columns, replacing the original per-sample
        # Python loop `z[i][:, j] += b[i]`.
        z[i] = np.dot(W[i], a[i]) + b[i][:, np.newaxis]
        a[i+1] = activation_function(z[i])  # activate the nodes
    predict = a[-1][0]  # a[-1] is the output layer; its first row is the prediction
    # Backward pass
    delta = [np.array(0)] * len(W)  # delta[0]: residual of the first hidden layer, delta[-1]: of the output layer
    # Residual of the output layer
    delta[-1] = -(y - a[-1]) * diff_activation_function(z[-1])
    # Residuals of the remaining layers, walking backwards from the output.
    # With index -i-2 standing for layer L: W[-i-1] maps layer L to L+1,
    # delta[-i-1] is the residual of layer L+1 and z[-i-2] is the
    # not-yet-activated layer L.
    for i in range(len(delta) - 1):
        delta[-i-2] = np.dot(W[-i-1].T, delta[-i-1]) * diff_activation_function(z[-i-2])
    # Partial derivatives. The matrix products below already sum the
    # contribution of every sample.
    delta_w = [np.array(0)] * len(W)
    delta_b = [np.array(0)] * len(W)
    for i in range(len(W)):
        delta_w[i] = np.dot(delta[i], a[i].T)
        delta_b[i] = np.sum(delta[i], axis=1)
    # Gradient step; the Lambda term is the weight-decay (regularisation) part.
    for i in range(len(W)):
        W[i] -= alpha * (Lambda * W[i] + delta_w[i] / m)
        b[i] -= alpha / m * delta_b[i]
# Report once after training; '%s %s' prints "label value" on Python 2 and 3 alike.
print('%s %s' % ('训练样本的未正则化代代函数值:', costfunction(predict, np.array(trainingLabels))))
print('%s %s' % ('训练样本错误率:', error_rate(predict, np.array(trainingLabels))))
# Evaluate the network on the test set.
a[0] = np.array(testSet).T  # transpose: one column per sample, one row per feature
# Forward pass
m = len(testSet)
for i in range(len(W)):
    # Add the full bias vector of the layer, exactly as the training loop
    # does. The original used `b[i].T[0]` here, which is only the FIRST bias
    # entry and was therefore added to every node of the layer.
    z[i] = np.dot(W[i], a[i]) + b[i][:, np.newaxis]
    a[i+1] = activation_function(z[i])
predict = a[-1][0]
print('%s %s' % ('测试样本的未正则化代代函数值:', costfunction(predict, np.array(testLabels))))
print('%s %s' % ('测试样本错误率:', error_rate(predict, np.array(testLabels))))
# -*- coding:utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
iris = load_iris() # 加载数据
X = iris.data[:, :2] # 为方便画图,仅采用数据的其中两个特征
y = iris.target
print iris.DESCR
print iris.feature_names
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
clf = KNeighborsClassifier(n_neighbors=15, weights='uniform') # 初始化分类器对象
clf.fit(X, y)
# 画出决策边界,用不同颜色表示
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
np.arange(y_min, y_max, 0.02))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light) # 绘制预测结果图
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold) # 补充训练数据点
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("3-Class classification (k = 15, weights = 'uniform')")
plt.show()
from sklearn.naive_bayes import GaussianNB #高斯朴素贝叶斯
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
datas = load_iris()
# print(datas)
iris_x = datas.data
iris_y = datas.target
# print(iris_x)
# print(iris_y)
iris_x0 = iris_x[ :, 0:2]
# print(iris_x0)
X_train,X_test,y_train,y_test =train_test_split(iris_x0, iris_y, test_size=0.3)
clf = GaussianNB( )
'''
GaussianNB 参数只有一个:先验概率priors
MultinomialNB参数有三个:alpha是常量,一般取值1,fit_prior是否考虑先验概率,class_prior自行输入先验概率
BernoulliNB参数有四个:前三个与MultinomialNB一样,第四个binarize 标签二值化
这里的参数的意义主要参考https://www.cnblogs.com/pinard/p/6074222.html'''
clf.fit(X_train,y_train)
per = clf.predict(X_test)
print(per)
print(y_test)
# -*- coding:utf-8 -*-
# k-means实验
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
plt.figure(figsize=(12, 12))
# 选取样本数量
n_samples = 1500
# 选取随机因子
random_state = 170
# 获取数据集
X, y = make_blobs(n_samples=n_samples, random_state=random_state)
# 聚类数量不正确时的效果
y_pred = KMeans(n_clusters=2, random_state=random_state).fit_predict(X)
plt.subplot(221)
plt.scatter(X[y_pred==0][:, 0], X[y_pred==0][:, 1], marker='x',color='b')
plt.scatter(X[y_pred==1][:, 0], X[y_pred==1][:, 1], marker='+',color='r')
plt.title("Incorrect Number of Blobs")
# 聚类数量正确时的效果
y_pred = KMeans(n_clusters=3, random_state=random_state).fit_predict(X)
plt.subplot(222)
plt.scatter(X[y_pred==0][:, 0], X[y_pred==0][:, 1], marker='x',color='b')
plt.scatter(X[y_pred==1][:, 0], X[y_pred==1][:, 1], marker='+',color='r')
plt.scatter(X[y_pred==2][:, 0], X[y_pred==2][:, 1], marker='1',color='m')
plt.title("Correct Number of Blobs")
# 类间的方差存在差异的效果
X_varied, y_varied = make_blobs(n_samples=n_samples,
cluster_std=[1.0, 2.5, 0.5],
random_state=random_state)
y_pred = KMeans(n_clusters=3, random_state=random_state).fit_predict(X_varied)
plt.subplot(223)
plt.scatter(X_varied[y_pred==0][:, 0], X_varied[y_pred==0][:, 1], marker='x',color='b')
plt.scatter(X_varied[y_pred==1][:, 0], X_varied[y_pred==1][:, 1], marker='+',color='r')
plt.scatter(X_varied[y_pred==2][:, 0], X_varied[y_pred==2][:, 1], marker='1',color='m')
plt.title("Unequal Variance")
# 类的规模差异较大的效果
X_filtered = np.vstack((X[y == 0][:500], X[y == 1][:100], X[y == 2][:10]))
y_pred = KMeans(n_clusters=3, random_state=random_state).fit_predict(X_filtered)
plt.subplot(224)
plt.scatter(X_filtered[y_pred==0][:, 0], X_filtered[y_pred==0][:, 1], marker='x',color='b')
plt.scatter(X_filtered[y_pred==1][:, 0], X_filtered[y_pred==1][:, 1], marker='+',color='r')
plt.scatter(X_filtered[y_pred==2][:, 0], X_filtered[y_pred==2][:, 1], marker='1',color='m')
plt.title("Unevenly Sized Blobs")
plt.show()
# -*- coding:utf-8 -*-
# 系统聚类实验
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import make_blobs
plt.figure(figsize=(12, 12))
# 选取样本数量
n_samples = 1500
# 选取随机因子
random_state = 170
# 获取数据集
X, y = make_blobs(n_samples=n_samples, random_state=random_state)
# 聚类数量不正确时的效果
y_pred = AgglomerativeClustering(affinity='euclidean',linkage='ward',n_clusters=2).fit_predict(X)
# 选取欧几里德距离和离差平方和法(Ward法)
plt.subplot(221)
plt.scatter(X[y_pred==0][:, 0], X[y_pred==0][:, 1], marker='x',color='b')
plt.scatter(X[y_pred==1][:, 0], X[y_pred==1][:, 1], marker='+',color='r')
plt.title("Incorrect Number of Blobs")
# 聚类数量正确时的效果
y_pred = AgglomerativeClustering(affinity='euclidean',linkage='ward',n_clusters=3).fit_predict(X)
plt.subplot(222)
plt.scatter(X[y_pred==0][:, 0], X[y_pred==0][:, 1], marker='x',color='b')
plt.scatter(X[y_pred==1][:, 0], X[y_pred==1][:, 1], marker='+',color='r')
plt.scatter(X[y_pred==2][:, 0], X[y_pred==2][:, 1], marker='1',color='m')
plt.title("Correct Number of Blobs")
# 类间的方差存在差异的效果
X_varied, y_varied = make_blobs(n_samples=n_samples,
cluster_std=[1.0, 2.5, 0.5],
random_state=random_state)
y_pred = AgglomerativeClustering(affinity='euclidean',linkage='ward',n_clusters=3).fit_predict(X_varied)
plt.subplot(223)
plt.scatter(X_varied[y_pred==0][:, 0], X_varied[y_pred==0][:, 1], marker='x',color='b')
plt.scatter(X_varied[y_pred==1][:, 0], X_varied[y_pred==1][:, 1], marker='+',color='r')
plt.scatter(X_varied[y_pred==2][:, 0], X_varied[y_pred==2][:, 1], marker='1',color='m')
plt.title("Unequal Variance")
# 类的规模差异较大的效果
X_filtered = np.vstack((X[y == 0][:500], X[y == 1][:100], X[y == 2][:10]))
y_pred = AgglomerativeClustering(affinity='euclidean',linkage='ward',n_clusters=3).fit_predict(X_filtered)
plt.subplot(224)
plt.scatter(X_filtered[y_pred==0][:, 0], X_filtered[y_pred==0][:, 1], marker='x',color='b')
plt.scatter(X_filtered[y_pred==1][:, 0], X_filtered[y_pred==1][:, 1], marker='+',color='r')
plt.scatter(X_filtered[y_pred==2][:, 0], X_filtered[y_pred==2][:, 1], marker='1',color='m')
plt.title("Unevenly Sized Blobs")
plt.show()
# -*- coding:utf-8 -*-
# Density-based clustering (DBSCAN)
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
# make_blobs is part of the public sklearn.datasets namespace (the other
# sections of this file import it from there too); the
# sklearn.datasets.samples_generator module path is gone from newer releases.
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
##############################################################################
# Generate the make_blobs data around three fixed centres.
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
                            random_state=0)
# Pre-processing: standardise the features.
X = StandardScaler().fit_transform(X)
##############################################################################
# 执行DBSCAN算法
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
# 标记核心对象,后面作图需要用到
core_samples_mask[db.core_sample_indices_] = True
# 算法得出的聚类标签,-1代表样本点是噪声点,其余值表示样本点所属的类
labels = db.labels_
# 获取聚类数量
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
# 输出算法性能的信息
print('Estimated number of clusters: %d' % n_clusters_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f"
% metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f"
% metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f"
% metrics.silhouette_score(X, labels))
##############################################################################
# Plotting
import matplotlib.pyplot as plt
unique_labels = set(labels)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
# Marker styles: the last entry ('x') is reserved for noise points, the others
# are cycled over the clusters.
marker = ['v', '^', 'o', 'x']
cluster_markers = marker[:-1]
# Iterate in a fixed order (clusters ascending, noise last) rather than
# relying on set iteration order, so the noise label always gets 'x'.
ordered_labels = sorted(unique_labels, key=lambda lab: (lab == -1, lab))
for k, col in zip(ordered_labels, colors):
    if k == -1:
        # Noise points are drawn in black with the 'x' marker.
        col = 'k'
        style = marker[-1]
    else:
        # The modulo keeps the index valid for any number of clusters; the
        # original wrapped on len(unique_labels), which can exceed len(marker).
        style = cluster_markers[k % len(cluster_markers)]
    class_member_mask = (labels == k)
    # Core samples: large markers
    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], style, markerfacecolor=col,
             markeredgecolor='k', markersize=14)
    # Non-core samples: small markers
    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], style, markerfacecolor=col,
             markeredgecolor='k', markersize=6)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn import linear_model
df=pd.read_csv('result.csv')
sns.set(style='whitegrid', context='notebook') #style控制默认样式,context控制着默认的画幅大小
cols = ['man', 'woman', 'money','workyears']
sns.pairplot(df[cols], size=2.5)
plt.tight_layout()
plt.show()
# Build the model
model = linear_model.LinearRegression()
# Train: three feature columns, 'money' as the target
model.fit(df[['man', 'woman', 'workyears']], df['money'])
print("coefficients: ", model.coef_)
# One coefficient per feature, in the column order passed to fit().
w1 = model.coef_[0]
w2 = model.coef_[1]
w3 = model.coef_[2]  # was assigned to w2 again, silently overwriting the second coefficient
print("intercept: ", model.intercept_)
b = model.intercept_
# Predict one sample given as [man, woman, workyears]
x_test = [[1, 0, 6]]
predict = model.predict(x_test)
print("predict: ", predict)
#-*- coding: utf-8 -*-
#使用Apriori算法挖掘菜品订单关联规则
from __future__ import print_function
import pandas as pd
from apriori import * #导入自行编写的apriori函数
inputfile = '../data/menu_orders.xls'
outputfile = '../tmp/apriori_rules.xls'  # result file
data = pd.read_excel(inputfile, header = None)
print(u'\n转换原始数据至0-1矩阵...')


def ct(x):
    """Turn one order row into a Series of 1s indexed by the items it contains."""
    return pd.Series(1, index = x[pd.notnull(x)])


# DataFrame.as_matrix() no longer exists in pandas; `.values` yields the same
# row-wise array on old and new versions.
b = map(ct, data.values)
data = pd.DataFrame(list(b)).fillna(0)  # assemble the 0-1 matrix; items absent from an order become 0
print(u'\n转换完毕。')
del b  # drop the intermediate object to save memory
support = 0.2  # minimum support
confidence = 0.5  # minimum confidence
# Separator placed between item names in a rule (e.g. A---B). The raw table
# must not contain this string. Note that '---' is passed explicitly here and
# overrides find_rule's own default separator.
ms = '---'
find_rule(data, support, confidence, ms).to_excel(outputfile)  # save the result
#-*- coding: utf-8 -*-
from __future__ import print_function
import pandas as pd
#自定义连接函数,用于实现L_{k-1}到C_k的连接
def connect_string(x, ms):
    """Join the frequent itemsets L_{k-1} into the candidate itemsets C_k.

    Parameters
    ----------
    x : iterable of str
        Itemset names; the items inside each name are separated by ``ms``.
    ms : str
        Separator used inside the itemset names.

    Returns
    -------
    list of list of str
        One sorted item list per candidate. Two itemsets are joined when they
        agree on everything but their last item. An empty ``x`` yields ``[]``
        (the original raised IndexError).
    """
    itemsets = [sorted(name.split(ms)) for name in x]
    if not itemsets:
        return []
    last = len(itemsets[0]) - 1  # position of the last item; every itemset has the same size
    candidates = []
    for pos, left in enumerate(itemsets):
        # Only later itemsets are visited: an itemset never joins with itself.
        for right in itemsets[pos + 1:]:
            if left[:last] == right[:last] and left[last] != right[last]:
                candidates.append(left[:last] + sorted([right[last], left[last]]))
    return candidates
#寻找关联规则的函数
def find_rule(d, support, confidence, ms = u'--'):
    """Mine association rules from a 0-1 transaction matrix (Apriori).

    Parameters
    ----------
    d : pandas.DataFrame
        One row per transaction, one 0/1 column per item.
    support : float
        Minimum support; only itemsets strictly above it are kept.
    confidence : float
        Minimum confidence; only rules strictly above it are kept.
    ms : str
        Separator used to build itemset/rule names. In a rule name the last
        item is the consequent and the items before it are the antecedent.

    Returns
    -------
    pandas.DataFrame
        One row per rule with columns 'support' and 'confidence', sorted by
        confidence then support, descending. It is also printed.
    """
    result = pd.DataFrame(index=['support', 'confidence'])  # output container, one column per rule

    support_series = 1.0 * d.sum() / len(d)  # support of every single item
    column = list(support_series[support_series > support].index)  # first support filter
    k = 0

    while len(column) > 1:
        k = k + 1
        print(u'\n正在进行第%s次搜索...' %k)
        column = connect_string(column, ms)
        print(u'数目:%s...' %len(column))
        if not column:
            # No candidates left; nothing more can be derived.
            break
        names = [ms.join(items) for items in column]

        # A transaction contains an itemset exactly when the product of its
        # 0/1 item columns is 1. Building this table is the most time- and
        # memory-hungry step; consider parallelising it for large data sets.
        d_2 = pd.DataFrame(
            [d[items].prod(axis=1, numeric_only=True) for items in column],
            index=names).T

        support_series_2 = 1.0 * d_2[names].sum() / len(d)  # support of the joined itemsets
        column = list(support_series_2[support_series_2 > support].index)  # next support filter
        # Series.append no longer exists in pandas; pd.concat is the equivalent.
        support_series = pd.concat([support_series, support_series_2])

        # Enumerate every possible rule: for {A,B,C} that is A+B-->C,
        # B+C-->A and C+A-->B. Each item takes the consequent (last) place once.
        column2 = []
        for i in column:
            i = i.split(ms)
            for j in range(len(i)):
                column2.append(i[:j] + i[j+1:] + i[j:j+1])

        # Explicit dtype: an index-only Series would otherwise have no float dtype.
        cofidence_series = pd.Series(index=[ms.join(i) for i in column2], dtype=float)
        for i in column2:
            # confidence = support(whole itemset) / support(antecedent)
            cofidence_series[ms.join(i)] = (
                support_series[ms.join(sorted(i))] / support_series[ms.join(i[:len(i)-1])])

        for i in cofidence_series[cofidence_series > confidence].index:  # confidence filter
            # Assign the whole column at once, in the order of result.index
            # ('support', 'confidence'), instead of chained result[i][...] = ...
            # writes, which are not guaranteed to reach the frame.
            result[i] = [support_series[ms.join(sorted(i.split(ms)))], cofidence_series[i]]

    result = result.T.sort_values(['confidence','support'], ascending = False)  # tidy up for output
    print(u'\n结果为:')
    print(result)
    return result
#-*- coding: utf-8 -*-
#使用基于UBCF算法对电影进行推荐
from __future__ import print_function
import pandas as pd
############ Main program ##############
if __name__ == "__main__":
    print("\n--------------使用基于UBCF算法对电影进行推荐 运行中... -----------\n")
    # NOTE(review): recomm() is called below but this script section never
    # imports it, and the only visible definition comes later in the file, so
    # the name is not yet bound when this block runs - confirm where it is
    # meant to be imported from.
    # NOTE(review): the data paths are absolute and machine-specific.
    traindata = pd.read_csv('/media/dp_zhou/Knowledge/Learning data/Python books/数据与代码/数据与代码/示例程序/data/u1.base',sep='\t', header=None,index_col=None)
    testdata = pd.read_csv('/media/dp_zhou/Knowledge/Learning data/Python books/数据与代码/数据与代码/示例程序/data/u1.test',sep='\t', header=None,index_col=None)
    # Drop the timestamp column (column 3).
    traindata.drop(3, axis=1, inplace=True)
    testdata.drop(3, axis=1, inplace=True)
    # Give the remaining columns readable names.
    traindata.rename(columns={0:'userid',1:'movid',2:'rat'}, inplace=True)
    testdata.rename(columns={0:'userid',1:'movid',2:'rat'}, inplace=True)
    # Reshape to user x movie rating matrices.
    traindf = traindata.pivot(index='userid', columns='movid', values='rat')
    testdf = testdata.pivot(index='userid', columns='movid', values='rat')
    # Prefix the labels: users become 'usr<N>', movies become 'mov<N>'.
    traindf.rename(index={i:'usr%d'%(i) for i in traindf.index} , inplace=True)
    traindf.rename(columns={i:'mov%d'%(i) for i in traindf.columns} , inplace=True)
    testdf.rename(index={i:'usr%d'%(i) for i in testdf.index} , inplace=True)
    testdf.rename(columns={i:'mov%d'%(i) for i in testdf.columns} , inplace=True)
    # Training-matrix rows of the users that occur in the test set.
    userdf = traindf.loc[testdf.index]
    # Predicted ratings and recommendation lists.
    trainrats, trainrecomm = recomm(traindf, userdf)
#-*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import math
def prediction(df, userdf, Nn=15):
    """Predict the missing ratings of the users in ``userdf`` (user-based CF).

    Parameters
    ----------
    df : pandas.DataFrame
        Rating matrix, one row per user and one column per item; NaN marks
        "not rated".
    userdf : pandas.DataFrame
        Rows of the users to predict for; its index labels must exist in ``df``.
    Nn : int
        Maximum number of neighbours used per prediction.

    Returns
    -------
    pandas.DataFrame
        A copy of ``userdf`` in which every item the user has not rated in
        ``df`` holds the prediction: the user's mean rating plus the
        correlation-weighted mean of the neighbours' deviations from their own
        means. NaN is stored when no usable neighbour exists.
    """
    corr = df.T.corr()  # user-user correlation matrix
    rats = userdf.copy()
    for usrid in userdf.index:
        user_row = df.loc[usrid]
        unrated = user_row[user_row.isnull()].index
        usrv = user_row.mean()  # the user's mean rating
        for item in unrated:
            item_col = df[item]
            # Neighbours: the first Nn users, in index order, that rated the
            # item. A positional slice already stops at the available number,
            # so the original if/else on len() was redundant.
            nlist = item_col[item_col.notnull()].iloc[:Nn]
            # Keep only neighbours whose correlation with this user is defined.
            nlist = nlist[corr.loc[usrid, nlist.index].notnull()]
            if nlist.size == 0:
                rats.at[usrid, item] = float('nan')
                continue
            nv = df.loc[nlist.index, :].T.mean()  # each neighbour's mean rating
            nratsum = 0
            corsum = 0
            for index in nlist.index:
                ncor = corr.loc[usrid, index]
                nratsum += ncor * (nlist[index] - nv[index])
                corsum += abs(ncor)
            if corsum != 0:
                rats.at[usrid, item] = usrv + nratsum / corsum
            else:
                # All correlations are zero: fall back to the user's mean.
                rats.at[usrid, item] = usrv
    return rats
def recomm(df, userdf, Nn=15, TopN=3):
    """Return predicted ratings and the top-N recommendations per user.

    Parameters
    ----------
    df, userdf, Nn
        Passed straight to ``prediction``.
    TopN : int
        Maximum number of items recommended per user.

    Returns
    -------
    (ratings, recommendations)
        ``ratings`` is the frame returned by ``prediction``;
        ``recommendations`` holds, for every user of ``userdf`` in order, the
        index labels of that user's unrated items with the highest predicted
        ratings.
    """
    ratings = prediction(df, userdf, Nn)  # predicted ratings
    recommendations = []  # the original local was named `recomm`, shadowing this function
    for usrid in userdf.index:
        # Items the user has not rated are the NaN entries of their row.
        unrated = userdf.loc[usrid].isnull()
        candidates = ratings.loc[usrid][unrated]
        # Sort by predicted rating, best first. Slicing stops at the available
        # number, so no separate branch is needed for "fewer than TopN".
        recommendations.append(candidates.sort_values(ascending=False).index[:TopN])
    return ratings, recommendations
#_*_coding:utf-8_*_
import pandas as pd
import numpy as np
header = ['user_id', 'item_id', 'rating', 'timestamp']
dataset = pd.read_csv('../data/u.data',sep='\t',names=header)
#计算唯一用户和电影的数量
# unique对以为数组去重 shape[0] shape为矩阵的长度
users = dataset.user_id.unique().shape[0]
items = dataset.item_id.unique().shape[0]
from sklearn.model_selection import train_test_split
train_data,test_data = train_test_split(dataset,test_size=0.25)
'''
创建user-item矩阵
itertuples pandas dataframe 建立索引的方式
结果为: Pandas(Index=77054, user_id=650, item_id=528, rating=3, timestamp=891370998)
'''
# Build the user-item matrices.
# NOTE(review): indexing with id - 1 assumes user and item ids are contiguous
# and start at 1, so that the unique counts `users`/`items` bound them -
# confirm against the data file.
train_data_matrix = np.zeros((users, items))
for line in train_data.itertuples():
    # itertuples() rows are (Index, user_id, item_id, rating, timestamp)
    train_data_matrix[line[1] - 1, line[2] - 1] = line[3]
test_data_matrix = np.zeros((users, items))
for line in test_data.itertuples():
    test_data_matrix[line[1] - 1, line[2] - 1] = line[3]
# Similarity, used below as the weight w.
from sklearn.metrics.pairwise import pairwise_distances
# pairwise_distances(metric='cosine') returns the cosine DISTANCE, i.e.
# 1 - cosine similarity. Convert it, otherwise the least similar users would
# receive the largest weights.
user_similarity = 1 - pairwise_distances(train_data_matrix, metric='cosine')
# train_data_matrix.T is the transpose, which gives item-item similarity.
items_similarity = 1 - pairwise_distances(train_data_matrix.T, metric='cosine')
'''
基于用户相似矩阵 -> 基于用户的推荐
mean函数求取均值 axis=1 对各行求取均值,返回一个m*1的矩阵
np.newaxis 给矩阵增加一个列 一维矩阵变为多维矩阵 mean_user_rating(n*1)
train_data_matrix所有行都减去mean_user_rating对应行的数 此为规范化评分,使其在统一的范围内
numpy a.dot(b) -> 两个矩阵的点积
np.abs(a) ->计算矩阵a各元素的绝对值
np.sum() -> 无参数 矩阵全部元素相加
-> axis=0 按列相加
-> axis=1 按行相加
b /a 矩阵对应为相除
'''
mean_user_rating = train_data_matrix.mean(axis = 1) #计算每行的平均数
rating_diff = train_data_matrix - mean_user_rating[:,np.newaxis] #评分规范化
pred = mean_user_rating[:, np.newaxis] \
+ user_similarity.dot(rating_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T #权重w*平均化的评分
'''
评估指标 均方差误差
'''
from sklearn.metrics import mean_squared_error
from math import sqrt
pred = pred[test_data_matrix.nonzero()].flatten()
test_data_matrix = test_data_matrix[test_data_matrix.nonzero()].flatten()
result = sqrt(mean_squared_error(pred,test_data_matrix))
print(result)
#-*- coding: utf-8 -*-
#arima时序模型
from __future__ import print_function
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.graphics.api import qqplot
from statsmodels.graphics.tsaplots import plot_acf
# 参数初始化
discfile = '../data/arima_data.xls'
# 读取数据,指定日期列为指标,Pandas自动将“日期”列识别为Datetime格式
data = pd.read_excel(discfile,index_col=0)
print(data.head())
print('\n Data Types:')
print(data.dtypes)
# 时序图
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False #用来正常显示负号
data.plot()
plt.show()
#自相关图
from statsmodels.graphics.tsaplots import plot_acf
plot_acf(data).show()
#平稳性检测
from statsmodels.tsa.stattools import adfuller as ADF
print(u'原始序列的ADF检验结果为:', ADF(data[u'销量']))
#返回值依次为adf、pvalue、usedlag、nobs、critical values、icbest、regresults、resstore
#差分后的时序图
D_data = data.diff().dropna()
D_data.columns = [u'销量差分']
D_data.plot() #时序图
plt.show()
#自相关图
plot_acf(D_data).show()
#偏自相关图
from statsmodels.graphics.tsaplots import plot_pacf
plot_pacf(D_data).show()
#平稳性检测
print(u'差分序列的ADF检验结果为:', ADF(D_data[u'销量差分']))
#白噪声检验
from statsmodels.stats.diagnostic import acorr_ljungbox
print(u'差分序列的白噪声检验结果为:', acorr_ljungbox(D_data, lags=1))
#返回统计量和p值
# 一阶差分
fig = plt.figure(figsize=(12,8))
ax1= fig.add_subplot(111)
diff1 = data.diff(1)
diff1.plot(ax=ax1)
# 二阶差分
fig = plt.figure(figsize=(12,8))
ax2= fig.add_subplot(111)
diff2 = data.diff(2)
diff2.plot(ax=ax2)
# 合适的p,q
dta = data.diff(1)[1:]
fig = plt.figure(figsize=(12,8))
ax1=fig.add_subplot(211)
fig1 = sm.graphics.tsa.plot_acf(dta[u'销量'],lags=10,ax=ax1)
ax2 = fig.add_subplot(212)
fig2 = sm.graphics.tsa.plot_pacf(dta[u'销量'],lags=10,ax=ax2)
# Candidate models.
# The sm.tsa.ARMA class is gone from current statsmodels. An ARIMA model with
# d = 0 is the same ARMA(p, q) specification, and the fitted result still
# offers aic/bic/hqic, resid and predict().
arma_mod20 = sm.tsa.ARIMA(dta, order=(2, 0, 0)).fit()
print(arma_mod20.aic,arma_mod20.bic,arma_mod20.hqic)
arma_mod01 = sm.tsa.ARIMA(dta, order=(0, 0, 1)).fit()
print(arma_mod01.aic,arma_mod01.bic,arma_mod01.hqic)
arma_mod10 = sm.tsa.ARIMA(dta, order=(1, 0, 0)).fit()
print(arma_mod10.aic,arma_mod10.bic,arma_mod10.hqic)
#残差QQ图
resid = arma_mod01.resid
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
fig = qqplot(resid, line='q', ax=ax, fit=True)
#残差自相关检验
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(arma_mod01.resid.values.squeeze(), lags=10, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(arma_mod01.resid, lags=10, ax=ax2)
#D-W检验
print(sm.stats.durbin_watson(arma_mod01.resid.values))
# Ljung-Box test
import numpy as np
r, q, p = sm.tsa.acf(resid.values.squeeze(), qstat=True)
# Number the rows after the lags actually returned instead of the hard-coded
# range(1, 36), which only fits one particular series length. r[0] is lag 0
# and has no Q statistic, hence r[1:].
datap = np.c_[np.arange(1, len(q) + 1), r[1:], q, p]
table = pd.DataFrame(datap, columns=['lag', "AC", "Q", "Prob(>Q)"])
print(table.set_index('lag'))
# Forecast
predict_sunspots = arma_mod01.predict('2015-2-07', '2015-2-15', dynamic=True)
fig, ax = plt.subplots(figsize=(12, 8))
print(predict_sunspots)
# The model was fitted on first differences, so the forecasts are differences
# too. Rebuild levels: add the observed value at 2015-02-06 to the first
# forecast, then accumulate.
# NOTE(review): assumes 2015-02-06 is the last observed row; .iloc[0] takes
# the first row on or after that date as a scalar (the original added the
# whole sliced Series to a single element).
predict_sunspots.iloc[0] += data['2015-02-06':][u'销量'].iloc[0]
data = pd.DataFrame(data)
# Running total; equivalent to the original element-by-element loop
# p[i+1] = p[i] + p[i+1].
predict_sunspots = predict_sunspots.cumsum()
print(predict_sunspots)
# `.ix` no longer exists in pandas; label-based `.loc` supports the same
# partial-date slice.
ax = data.loc['2015':].plot(ax=ax)
predict_sunspots.plot(ax=ax)
plt.show()