Leon's Data Mining Toolbox

Preface

----A collection of common data mining models implemented in Python;
----The essential baseline code has been collected and adapted, so it can serve as a baseline for future data work;
----The code runs under Python 2.x. As anyone who works with Python knows, the syntax differences between Python 2.x and 3.x rarely affect this kind of code;
----Migrating between 2.x and 3.x is a small job: for these scripts it mostly comes down to punctuation details such as adding parentheses to print statements (see the sketch after this list), so using 2.x code as a baseline should not hinder anyone who works in 3.x;
----The goal is to improve the efficiency of future data mining work;
----I hope it helps.
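For example, converting the print calls is usually the whole migration. A minimal illustration with a made-up variable x:

x = (2, 3)
print x     # Python 2.x statement form, as used in the snippets below
print(x)    # Python 3.x function form; with a single argument this line also runs under 2.x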

Data Analysis Libraries

NumPy

Creating Arrays

# -*- coding:utf-8 -*-
import numpy as np

#  ---- Creating arrays ----
arr1 = np.array([2,3,4])    # create an array from a list
# result:
# [2 3 4]
arr2 = np.array([(1.3,9,2.0),(7,6,1)])    # create a 2-D array from a list of tuples
# result:
# [[ 1.3  9.   2. ]
#  [ 7.   6.   1. ]]


arr3 = np.zeros((2,3))    # all-zeros matrix, with the shape given by the tuple (2, 3)
# result:
# [[ 0.  0.  0.]
#  [ 0.  0.  0.]]


arr4 = np.identity(3)    # 3x3 identity matrix
# result:
# [[ 1.  0.  0.]
#  [ 0.  1.  0.]
#  [ 0.  0.  1.]]


arr5 = np.random.random(size = (2,3)) # random matrix with every element drawn from [0, 1)
# result:
# [[ 0.31654004  0.87056375  0.29050563]
#  [ 0.55267505  0.59191276  0.20174988]]

arr6 = np.arange(5,20,3)  # evenly spaced values; the arguments are start, stop, step; start is included, stop is excluded
# result: [ 5  8 11 14 17]
arr7 = np.linspace(0,2,9)  # evenly spaced values; the arguments are start, stop and the NUMBER of points; both endpoints are included
# result: [ 0.    0.25  0.5   0.75  1.    1.25  1.5   1.75  2.  ]

Accessing Arrays

# Inspect the array's attributes
print arr2.shape # shape of the array
# result: (2, 3)
print arr2.ndim  # number of dimensions (axes)
# result: 2
print arr2.size  # total number of elements
# result: 6
print arr2.dtype.name   # data type of the elements
# result: float64
print type(arr2) # type of the array object itself
# result: <type 'numpy.ndarray'>

# Access elements by indexing and slicing
def f(x,y):
    return 10*x+y
arr8 = np.fromfunction(f,(4,3),dtype = int)
print arr8
# result:
# [[ 0  1  2]
# [10 11 12]
# [20 21 22]
# [30 31 32]]
print arr8[1,2] # element in row 1, column 2 (indices start at 0)
# result: 12
print arr8[0:2,:]  # slice: the first two rows
# result:
# [[ 0  1  2]
#  [10 11 12]]
print arr8[:,1]    # slice: column 1
# result: [ 1 11 21 31]
print arr8[-1]     # slice: the last row
# result: [30 31 32]

# Access elements through an iterator
for row in arr8:
    print row
# result:
# [0 1 2]
# [10 11 12]
# [20 21 22]
# [30 31 32]
for element in arr8.flat:
    print element
# prints every element of the matrix
print '-'*70

Array Operations

Between Arrays

print '''Array operations'''
arr9 = np.array([[2,1],[1,2]])
arr10 = np.array([[1,2],[3,4]])
print arr9 - arr10  
# result:
# [[ 1 -1]
#  [-2 -2]]
print arr9**2
# result:
# [[4 1]
#  [1 4]]
print 3*arr10
# result:
# [[ 3  6]
#  [ 9 12]]
print arr9*arr10  # element-wise multiplication
# result:
# [[2 2]
#  [3 8]]
print np.dot(arr9,arr10)  # matrix multiplication
# result:
# [[ 5  8]
#  [ 7 10]]
print arr10.T  # transpose
# result:
# [[1 3]
#  [2 4]]
print np.linalg.inv(arr10) # inverse of the matrix
# result:
# [[-2.   1. ]
#  [ 1.5 -0.5]]
print arr10.sum()  # sum of all elements
# result: 10
print arr10.max()  # largest element
# result: 4
print arr10.cumsum(axis = 1)  # cumulative sum along each row
# result: 
# [[1 3]
#  [3 7]]
print '-'*70

Arrays and Functions

print '''NumPy universal functions'''
print np.exp(arr9)     # exponential
# result:
# [[ 7.3890561   2.71828183]
#  [ 2.71828183  7.3890561 ]]
print np.sin(arr9)      # sine (in radians)
# result:
# [[ 0.90929743  0.84147098]
#  [ 0.84147098  0.90929743]]
print np.sqrt(arr9)     # square root
# result:
# [[ 1.41421356  1.        ]
#  [ 1.          1.41421356]]
print np.add(arr9,arr10)  # same as arr9 + arr10
# result:
# [[3 3]
#  [4 6]]
print '-'*70

Splitting and Joining Arrays

# Joining
arr11 = np.vstack((arr9,arr10))  # stack arrays vertically; the name vstack is short for "vertical stack"
print arr11
# result:
# [[2 1]
#  [1 2]
#  [1 2]
#  [3 4]]
arr12 = np.hstack((arr9,arr10))  # stack arrays horizontally
print arr12
# result:
# [[2 1 1 2]
#  [1 2 3 4]]
# Splitting
print np.hsplit(arr12,2)  # split the array horizontally into 2 parts
# result:
# [array([[2, 1],
#        [1, 2]]), array([[1, 2],
#        [3, 4]])]
print np.vsplit(arr11,2)   # split the array vertically into 2 parts
# result:
# [array([[2, 1],
#        [1, 2]]), array([[1, 2],
#        [3, 4]])]

Pandas

DataFrame

# -*- coding:utf-8 -*-
import pandas as pd    # import pandas under the alias pd
data = {'id': ['Jack', 'Sarah', 'Mike'],
        'age': [18, 35, 20],
        'cash': [10.53, 500.7, 13.6]}
df = pd.DataFrame(data)    # call the constructor and assign the result to df
print df
# result:
#    age    cash     id
# 0   18   10.53   Jack
# 1   35  500.70  Sarah
# 2   20   13.60   Mike


df2 = pd.DataFrame(data, columns=['id', 'age', 'cash'],index=['one', 'two', 'three'])
print df2
# result:
#          id   age    cash
# one     Jack   18   10.53
# two    Sarah   35  500.70
# three   Mike   20   13.60


print df2['id']
# result:
# one       Jack
# two      Sarah
# three     Mike
# Name: id, dtype: object


s = pd.Series({'a': 4, 'b': 9, 'c': 16}, name='number')
print s
# result:
# a     4
# b     9
# c    16
# Name: number, dtype: int64

Series

print s[0]
# result: 4
print s[:3]
# result:
# a     4
# b     9
# c    16
# Name: number, dtype: int64


print s['a']
# result: 4
s['d'] = 25    # if the key is not already in the Series, a new row is appended
print s
# result:
# a     4
# b     9
# c    16
# d    25
# Name: number, dtype: int64


import numpy as np
print np.sqrt(s)
# result:
# a    2.0
# b    3.0
# c    4.0
# d    5.0
# Name: number, dtype: float64
print s*s
# result:
# a     16
# b     81
# c    256
# d    625
# Name: number, dtype: int64


print df['id']    # access by column name
# result:
# 0     Jack
# 1    Sarah
# 2     Mike
# Name: id, dtype: object

df['rich'] = df['cash'] > 200.0
print df
# result:
#    age    cash     id   rich
# 0   18   10.53   Jack  False
# 1   35  500.70  Sarah   True
# 2   20   13.60   Mike  False

del df['rich']
print df
# result:
#    age    cash     id
# 0   18   10.53   Jack
# 1   35  500.70  Sarah
# 2   20   13.60   Mike

SciPy

Vectorized Computation

# -*- coding:utf-8 -*-
from scipy import poly1d
p = poly1d([3, 4, 5])
print p
# result:
#    2
# 3 x + 4 x + 5

print p*p
# result:
#   4      3      2
# 9 x + 24 x + 46 x + 40 x + 25

print p.integ(k=6)    # indefinite integral of p(x), with the constant of integration set to 6
# result:
#   3   2
# 1 x + 2 x + 5 x + 6
print p.deriv()    # first derivative of p(x)
# result:
# 6 x + 4

p([4, 5])    # evaluate p(x) at each of the given values
# result:
# array([ 69, 100])



# -*- coding:utf-8 -*-
import numpy as np

def addsubtract(a, b):    # as written, this only accepts comparable scalars as arguments
    if a > b:
        return a - b
    else:
        return a + b

vec_addsubtract = np.vectorize(addsubtract)
print vec_addsubtract([0, 3, 6, 9], [1, 3, 5, 7])
# result:
# [1 6 1 2]

scikit-learn

Implementing and Calling Basic Machine Learning Functions

from sklearn import datasets


# A dataset is a dictionary-like object that holds all the data and some metadata about it.
# The data itself is stored in the .data member, an n_samples * n_features array.
# For supervised problems, one or more response variables are stored in the .target member.

digits = datasets.load_digits()

# For example, in the digits dataset, digits.data holds the features that can be used to classify the digit samples
print digits.data
# result:
# [[  0.   0.   5. ...,   0.   0.   0.]
#  [  0.   0.   0. ...,  10.   0.   0.]
#  [  0.   0.   0. ...,  16.   9.   0.]
#  ..., 
#  [  0.   0.   1. ...,   6.   0.   0.]
#  [  0.   0.   2. ...,  12.   0.   0.]
#  [  0.   0.  10. ...,  12.   1.   0.]]


# digits.target holds the target variable of the digits dataset: the true digit each image represents, which is what we want to predict
print digits.target
# result:
# [0 1 2 ..., 8 9 8]
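# A quick sanity check of the layout described above:
print digits.data.shape     # (1797, 64): 1797 samples with 64 features (8x8 pixel images)
print digits.target.shape   # (1797,): one label per sample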



print '''Training and predicting'''

from sklearn import svm

# choose the model parameters
clf = svm.SVC(gamma=0.0001,C=100)

# Our estimator is named clf. It now has to learn from the data; this is done by
# passing the training set to the fit method. Here every sample except the last
# one is used as the training set.

# train
clf.fit(digits.data[:-1], digits.target[:-1])

# predict (the [-1:] slice keeps the input 2-D, which is what predict expects)
print clf.predict(digits.data[-1:])
# result: [8]

Plotting

matplotlib - Common Statistical Visualizations

# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0, 10, 1000)
y = np.sin(x)
z = np.cos(x**2)

plt.figure(figsize=(8,4))
plt.plot(x,y,label="$sin(x)$",color="red",linewidth=2)
plt.plot(x,z,"b--",label="$cos(x^2)$")
plt.xlabel("Time(s)")
plt.ylabel("Volt")
plt.title("PyPlot First Example")
plt.ylim(-1.2,1.2)
plt.legend()
plt.show()



# -*- coding:utf-8 -*-
import matplotlib.pylab as plt
import numpy as np
# Panel 1
plt.subplot(2,1,1)    # arguments: number of rows, number of columns, panel index
# Panel 2
plt.subplot(2,2,3)
# Panel 3
plt.subplot(2,2,4)
plt.show()



# -*- coding:utf-8 -*-
import matplotlib.pylab as plt
import numpy as np
# Panel 1
plt.subplot(2,1,1)    # arguments: number of rows, number of columns, panel index
n = 12
X = np.arange(n)
Y1 = (1-X/float(n)) * np.random.uniform(0.5,1.0,n)
Y2 = (1-X/float(n)) * np.random.uniform(0.5,1.0,n)

# plt.bar(x, y) draws a bar chart; facecolor and edgecolor set the fill and border colors of the bars
plt.bar(X, +Y1, facecolor='#9999ff', edgecolor='white')
plt.bar(X, -Y2, facecolor='#ff9999', edgecolor='white')

for x, y in zip(X,Y1):
    # plt.text() places the given text at the given coordinates
    plt.text(x+0.4, y+0.05, '%.2f' % y, ha='center', va='bottom')

# plt.ylim(y1, y2) limits the y-axis range of the plot
plt.ylim(-1.25,+1.25)


# Panel 2
plt.subplot(2,2,3)
n = 20
Z = np.random.uniform(0,1,n)
plt.pie(Z)

# Panel 3
plt.subplot(2,2,4)
X = np.linspace(-np.pi, np.pi, 256,endpoint=True)
Y_C, Y_S = np.cos(X), np.sin(X)

plt.plot(X, Y_C, color="blue", linewidth=2.5, linestyle="-")
plt.plot(X, Y_S, color="red", linewidth=2.5, linestyle="-")

plt.xlim(X.min()*1.1, X.max()*1.1)
plt.xticks([-np.pi, -np.pi/2, 0, np.pi/2, np.pi],
       [r'$-\pi$', r'$-\pi/2$', r'$0$', r'$+\pi/2$', r'$+\pi$'])

plt.ylim(Y_C.min()*1.1, Y_C.max()*1.1)
plt.yticks([-1, 0, +1],
       [r'$-1$', r'$0$', r'$+1$'])
plt.show()


bokeh - Web-based Data Visualization

# -*- coding:utf-8 -*-
from bokeh.plotting import figure, output_file, show
x = [1, 2, 3, 4, 5]
y = [6, 7, 2, 4, 5]
# output to a static HTML file
output_file("../tmp/lines.html", title="line plot example")
# create a figure object with a title and axis labels
p = figure(title="simple line example", x_axis_label='x', y_axis_label='y')
# add a line with a legend entry
p.line(x, y, legend="Line A.", line_width=2)
show(p)

Classification and Prediction

Linear Regression

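A minimal sketch of fitting an ordinary least squares model with scikit-learn; the toy data below (y roughly 3x + 4 plus noise) is made up for illustration:

# -*- coding:utf-8 -*-
import numpy as np
from sklearn.linear_model import LinearRegression

X = np.array([[1], [2], [3], [4], [5]], dtype=float)    # one feature per sample
y = np.array([7.1, 9.8, 13.2, 16.1, 18.9])              # noisy values of 3x + 4

lr = LinearRegression()
lr.fit(X, y)                  # fit the model
print(lr.coef_)               # fitted slope, roughly [ 2.99]
print(lr.intercept_)          # fitted intercept, roughly 4.05
print(lr.predict([[6.0]]))    # prediction for a new sample, roughly [ 21.99]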

Logistic Regression

# -*- coding:utf8 -*-
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# load the data and take a look
data = pd.read_csv('../data/LogisticRegression.csv', encoding='utf-8')
# print data.head(5)    # first five rows of the data frame

# one-hot encode the categorical variable
data_dum = pd.get_dummies(data, prefix='rank', columns=['rank'], drop_first=True)
print data_dum.tail(5)    # last five rows of the data frame
# result:
#     admit  gre   gpa  rank_2  rank_3  rank_4
# 395      0  620  4.00     1.0     0.0     0.0
# 396      0  560  3.04     0.0     1.0     0.0
# 397      0  460  2.63     1.0     0.0     0.0
# 398      0  700  3.65     1.0     0.0     0.0
# 399      0  600  3.89     0.0     1.0     0.0

# split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(data_dum.iloc[:, 1:], data_dum.iloc[:, 0], test_size=.1, random_state=520)

lr = LogisticRegression()    # build the logistic regression model
lr.fit(X_train, y_train)    # train the model on the processed data
print 'Accuracy of the logistic regression: {0:.2f}%'.format(lr.score(X_test, y_test) *100)
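As a quick illustration of what drop_first does, here is get_dummies on a toy frame (illustrative data, not the admissions set): the first level rank_1 is dropped and the remaining levels become indicator columns.

demo = pd.DataFrame({'rank': [1, 2, 3]})
print pd.get_dummies(demo, prefix='rank', columns=['rank'], drop_first=True)
# result:
#    rank_2  rank_3
# 0       0       0
# 1       1       0
# 2       0       1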

Decision Tree

Classification with the ID3 Algorithm

# -*- coding:utf-8 -*-
# Classification with the ID3 algorithm
import pandas as pd
from sklearn.tree import DecisionTreeClassifier as DTC, export_graphviz

data = pd.read_csv('../data/titanic_data.csv', encoding='utf-8')
data.drop(['PassengerId'], axis=1, inplace=True)    # drop the ID column; it is not a usable feature

# Sex is a categorical label; convert it to numbers: 1 for male, 0 for female.
data.loc[data['Sex'] == 'male', 'Sex'] = 1
data.loc[data['Sex'] == 'female', 'Sex'] = 0
data.fillna(int(data.Age.mean()), inplace=True)    # fill missing values with the mean age
print data.head(5)   # inspect the data

X = data.iloc[:, 1:3]    # for ease of presentation, age (the last column) is not used
y = data.iloc[:, 0]

dtc = DTC(criterion='entropy')    # decision tree with the information-entropy criterion (ID3-style; sklearn's tree is CART-based)
dtc.fit(X, y)    # train the model
print 'Accuracy:', dtc.score(X,y)

# Visualize the tree. The export is a .dot file; Graphviz is needed to convert it to .pdf or .png.
with open('../tmp/tree.dot', 'w') as f:
    f = export_graphviz(dtc, feature_names=X.columns, out_file=f)

Random Forest

from sklearn import ensemble
from sklearn import tree
from sklearn import datasets
from sklearn.model_selection import train_test_split

wine = datasets.load_wine()
X_data = wine.data
y_data = wine.target

X_train,X_test,y_train,y_test = train_test_split(X_data,y_data,test_size = 0.3)
clf = tree.DecisionTreeClassifier(criterion='gini')
rfc = ensemble.RandomForestClassifier(random_state=10)    # fix the random seed; the number of trees is set with n_estimators
clf = clf.fit(X_train,y_train)
rfc  = rfc.fit(X_train,y_train)

### Repeated cross-validation
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
rfc_l = []
clf_l = []
for i in range(10):
    rfc = ensemble.RandomForestClassifier(n_estimators=25)
    rfc_s = cross_val_score(rfc,X_train,y_train,cv = 10).mean()
    rfc_l.append(rfc_s)
    
    clf = tree.DecisionTreeClassifier(criterion='entropy')
    clf_s = cross_val_score(clf,X_train,y_train,cv  = 10).mean()
    clf_l.append(clf_s)
    
plt.plot(range(1,11),rfc_l,label='RandomForest')
plt.plot(range(1,11),clf_l,label = "DecisionTree")

plt.legend()
plt.show()

# The .feature_importances_ attribute of a random forest gives each feature's share of the total importance
rfc.feature_importances_
>> array([0.22856109, 0.03452478, 0.01369924, 0.00668104, 0.01473769,
       0.09992003, 0.08866531, 0.00509946, 0.02717142, 0.14646766,
       0.07152866, 0.13577966, 0.12716396])

# The forest's .apply method returns, for each sample, the index of the leaf it lands in, one column per tree
rfc.apply(X_test)
>> array([[ 9, 10,  2, ...,  6, 17, 10],
       [14, 18, 10, ..., 10, 23, 16],
       [14, 18, 10, ..., 10, 23, 16],
       ...,
       [ 5, 14,  7, ...,  9, 24,  5],
       [ 9, 16,  5, ...,  6, 16, 10],
       [14, 18, 10, ..., 10, 23, 16]], dtype=int64)
       
# .predict returns the predicted label for each test sample; compare with the true labels in y_test below.
rfc.predict(X_test)
>> # an array of predicted class labels (0/1/2), one entry per sample in X_test
       
y_test
>> array([2, 0, 0, 1, 1, 2, 1, 2, 2, 1, 2, 2, 0, 1, 1, 2, 0, 2, 1, 0, 2, 1,
       1, 0, 1, 0, 0, 2, 2, 0, 0, 1, 0, 2, 0, 0, 2, 1, 1, 1, 1, 2, 0, 1,
       0, 1, 1, 0, 2, 1, 1, 1, 2, 0])
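The ensemble also supports out-of-bag evaluation, which reuses the samples each tree did not see during bootstrapping (a standard scikit-learn option, shown here as a sketch):

rfc_oob = ensemble.RandomForestClassifier(n_estimators=25, oob_score=True, random_state=10)
rfc_oob = rfc_oob.fit(X_train, y_train)
rfc_oob.oob_score_    # accuracy estimated on the out-of-bag samples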



BP Neural Network

# A Python implementation of a BP (back-propagation) neural network

import numpy as np
from numpy import random
import math
import copy
import sklearn.datasets
from sklearn.preprocessing import scale
import matplotlib.pyplot as plt

# Generate the data and split it into training and test sets
trainingSet, trainingLabels = sklearn.datasets.make_moons(400, noise=0.20)
plt.scatter(trainingSet[trainingLabels==1][:,0], trainingSet[trainingLabels==1][:,1], s=40, c='r', marker='x',cmap=plt.cm.Spectral)
plt.scatter(trainingSet[trainingLabels==0][:,0], trainingSet[trainingLabels==0][:,1], s=40, c='y', marker='+',cmap=plt.cm.Spectral)
plt.show()
testSet = trainingSet[320:]
testLabels = trainingLabels[320:]
trainingSet = trainingSet[:320]
trainingLabels = trainingLabels[:320]

# Network hyper-parameters
layer =[2,3,1] # number of layers and nodes per layer
Lambda = 0.005 # regularization coefficient
alpha = 0.2 # learning rate
num_passes = 10000 # number of iterations
m = len(trainingSet) # number of samples

# Build the network
# The structure is stored layer by layer in lists, so the number of layers and
# the nodes per layer can be chosen freely.
b = [] # biases: len(layer)-1 entries; b[0] is the bias (a vector) of the first hidden layer
W = []
for i in range(len(layer)-1):
    W.append(random.random(size = (layer[i+1],layer[i]))) # W[i] is the transfer matrix (a NumPy array) from layer i to layer i+1; the input layer is layer 0
    b.append(np.array([0.1]*layer[i+1]))  # the bias b[i] has shape 1 * (number of nodes in layer i+1)
a = [np.array(0)]*(len(W)+1) # a[0] = x (the input), a[1] = f(z[0]), ..., a[len(W)] is the final output
z = [np.array(0)]*len(W) # z[0] is the not-yet-activated first hidden layer, i.e. the pre-activation fed by the input layer

def costfunction(predict,labels):
    # Cost function without the regularization term.
    # Both arguments are NumPy vectors.
    return sum((predict - labels)**2)
def error_rate(predict,labels):
    # Error rate for a binary classification problem with labels 0 or 1.
    # Both arguments are NumPy vectors.
    error =0.0
    for i in range(len(predict)):
        predict[i] = round(predict[i])
        if predict[i]!=labels[i]:
            error+=1
    return error/len(predict)
def sigmoid(z):  # sigmoid activation function
    return 1/(1+np.exp(-z))
def diff_sigmoid(z): # derivative of the sigmoid activation
    return sigmoid(z)*(1-sigmoid(z))

activation_function = sigmoid  # choose the activation function
diff_activation_function = diff_sigmoid # and its derivative


# Train the BP network
a[0] = np.array(trainingSet).T # transpose so that each column is a sample and each row a feature
y = np.array(trainingLabels)

for v in range(num_passes):
    # Forward propagation
    for i in range(len(W)):
        z[i] = np.dot(W[i],a[i])
        for j in range(m):
            z[i][:,j]+=b[i] # add the bias
        a[i+1] = activation_function(z[i]) # activate the nodes

    predict = a[-1][0] # a[-1] is the output of the final layer, i.e. the predictions

    # Back-propagation
    delta = [np.array(0)]*len(W) # delta[0] is the residual of the first hidden layer, delta[-1] that of the output layer

    # Residual of the output layer
    delta[-1] = -(y-a[-1])*diff_activation_function(z[-1])

    # Residuals of the other layers (all but the output layer), traversed backwards
    for i in range(len(delta)-1):
        delta[-i-2] = np.dot(W[-i-1].T,delta[-i-1])*diff_activation_function(z[-i-2])
        # if index -i-2 denotes layer L, then W[-i-1] is the transfer matrix from layer L to L+1,
        # delta[-i-1] is the residual of layer L+1, and z[-i-2] is the pre-activation of layer L

    # Compute the partial derivatives we actually need
    delta_w = [np.array(0)]*len(W)
    delta_b = [np.array(0)]*len(W)
    for i in range(len(W)):
        # Matrix form: the two lines below already sum the contributions of all samples
        delta_w[i] = np.dot(delta[i],a[i].T)
        delta_b[i] = np.sum(delta[i],axis=1)

    # Update the weights
    for i in range(len(W)):
        W[i] -= alpha*(Lambda*W[i]+delta_w[i]/m)
        b[i] -= alpha/m*delta_b[i]

print 'Unregularized cost on the training set:',costfunction(predict,np.array(trainingLabels))
print 'Error rate on the training set:',error_rate(predict,np.array(trainingLabels))

# Evaluate the network on the test set
a[0] = np.array(testSet).T # each column is a sample, each row a feature
# Forward propagation
m = len(testSet)
for i in range(len(W)):
    z[i] = np.dot(W[i],a[i])
    for j in range(m):
        z[i][:,j]+=b[i]
    a[i+1] = activation_function(z[i])
predict = a[-1][0]

print 'Unregularized cost on the test set:',costfunction(predict,np.array(testLabels))
print 'Error rate on the test set:',error_rate(predict,np.array(testLabels))
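For reference, the rules implemented above are standard back-propagation with an L2 penalty. In the code's notation (f is the sigmoid, λ = Lambda, α = alpha, m the sample count, and delta[l] holding the residual of layer l+1):

\delta^{(L)} = -(y - a^{(L)}) \circ f'(z^{(L)}), \qquad \delta^{(l)} = (W^{(l)})^{T}\,\delta^{(l+1)} \circ f'(z^{(l)})

W^{(l)} \leftarrow W^{(l)} - \alpha\Big(\lambda W^{(l)} + \tfrac{1}{m}\,\delta^{(l+1)}\,(a^{(l)})^{T}\Big), \qquad b^{(l)} \leftarrow b^{(l)} - \tfrac{\alpha}{m}\sum_{j=1}^{m}\delta^{(l+1)}_{\cdot j}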

The KNN Algorithm

# -*- coding:utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris

iris = load_iris()     # load the data
X = iris.data[:, :2]    # for ease of plotting, use only two of the features
y = iris.target
print iris.DESCR
print iris.feature_names
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

clf = KNeighborsClassifier(n_neighbors=15, weights='uniform')    # initialize the classifier
clf.fit(X, y)

# Plot the decision boundaries, one color per class
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                     np.arange(y_min, y_max, 0.02))

Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)    # plot the predicted regions

plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)    # overlay the training points
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("3-Class classification (k = 15, weights = 'uniform')")
plt.show()

Naive Bayes Classifier

from sklearn.naive_bayes import GaussianNB  # Gaussian naive Bayes

from sklearn.datasets import load_iris

from sklearn.model_selection import train_test_split

datas = load_iris()
# print(datas)

iris_x = datas.data
iris_y = datas.target

# print(iris_x)
# print(iris_y)

iris_x0 = iris_x[ :, 0:2]
# print(iris_x0)

X_train,X_test,y_train,y_test =train_test_split(iris_x0, iris_y, test_size=0.3)


clf = GaussianNB( )

'''
GaussianNB has a single parameter: the prior probabilities, priors.

MultinomialNB has three parameters: alpha, a smoothing constant usually set to 1; fit_prior, whether to learn the class priors; and class_prior, user-supplied prior probabilities.

BernoulliNB has four parameters: the first three are the same as MultinomialNB's; the fourth, binarize, is the threshold for binarizing the features.


The meaning of these parameters mainly follows https://www.cnblogs.com/pinard/p/6074222.html'''

clf.fit(X_train,y_train)

per = clf.predict(X_test)
print(per)
print(y_test)
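For reference, a minimal sketch of how the other two variants mentioned above are instantiated (the parameter values shown are just the common defaults):

from sklearn.naive_bayes import MultinomialNB, BernoulliNB

mnb = MultinomialNB(alpha=1.0, fit_prior=True)    # suited to count-style features
bnb = BernoulliNB(alpha=1.0, binarize=0.0)        # features are thresholded at binarize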

Cluster Analysis

K-means

# -*- coding:utf-8 -*-
# k-means experiment

import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

plt.figure(figsize=(12, 12))

# number of samples
n_samples = 1500
# random seed
random_state = 170
# generate the dataset
X, y = make_blobs(n_samples=n_samples, random_state=random_state)

# Effect of an incorrect number of clusters
y_pred = KMeans(n_clusters=2, random_state=random_state).fit_predict(X)

plt.subplot(221)
plt.scatter(X[y_pred==0][:, 0], X[y_pred==0][:, 1], marker='x',color='b')
plt.scatter(X[y_pred==1][:, 0], X[y_pred==1][:, 1], marker='+',color='r')
plt.title("Incorrect Number of Blobs")

# Effect of the correct number of clusters
y_pred = KMeans(n_clusters=3, random_state=random_state).fit_predict(X)

plt.subplot(222)
plt.scatter(X[y_pred==0][:, 0], X[y_pred==0][:, 1], marker='x',color='b')
plt.scatter(X[y_pred==1][:, 0], X[y_pred==1][:, 1], marker='+',color='r')
plt.scatter(X[y_pred==2][:, 0], X[y_pred==2][:, 1], marker='1',color='m')
plt.title("Correct Number of Blobs")

# Effect of clusters with unequal variance
X_varied, y_varied = make_blobs(n_samples=n_samples,
                                cluster_std=[1.0, 2.5, 0.5],
                                random_state=random_state)
y_pred = KMeans(n_clusters=3, random_state=random_state).fit_predict(X_varied)

plt.subplot(223)
plt.scatter(X_varied[y_pred==0][:, 0], X_varied[y_pred==0][:, 1], marker='x',color='b')
plt.scatter(X_varied[y_pred==1][:, 0], X_varied[y_pred==1][:, 1], marker='+',color='r')
plt.scatter(X_varied[y_pred==2][:, 0], X_varied[y_pred==2][:, 1], marker='1',color='m')
plt.title("Unequal Variance")

# Effect of clusters of very different sizes
X_filtered = np.vstack((X[y == 0][:500], X[y == 1][:100], X[y == 2][:10]))
y_pred = KMeans(n_clusters=3, random_state=random_state).fit_predict(X_filtered)

plt.subplot(224)
plt.scatter(X_filtered[y_pred==0][:, 0], X_filtered[y_pred==0][:, 1], marker='x',color='b')
plt.scatter(X_filtered[y_pred==1][:, 0], X_filtered[y_pred==1][:, 1], marker='+',color='r')
plt.scatter(X_filtered[y_pred==2][:, 0], X_filtered[y_pred==2][:, 1], marker='1',color='m')
plt.title("Unevenly Sized Blobs")

plt.show()
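When diagnosing cases like the four above, it can also help to inspect the fitted model itself; cluster_centers_ and inertia_ are standard KMeans attributes (a small addendum to the script above):

km = KMeans(n_clusters=3, random_state=random_state).fit(X)
print(km.cluster_centers_)    # coordinates of the three centroids
print(km.inertia_)            # within-cluster sum of squared distances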

Hierarchical Clustering

# -*- coding:utf-8 -*-
# Hierarchical (agglomerative) clustering experiment

import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import make_blobs

plt.figure(figsize=(12, 12))

# number of samples
n_samples = 1500
# random seed
random_state = 170
# generate the dataset
X, y = make_blobs(n_samples=n_samples, random_state=random_state)

# Effect of an incorrect number of clusters
y_pred = AgglomerativeClustering(affinity='euclidean',linkage='ward',n_clusters=2).fit_predict(X)
# Euclidean distance with Ward (minimum-variance) linkage

plt.subplot(221)
plt.scatter(X[y_pred==0][:, 0], X[y_pred==0][:, 1], marker='x',color='b')
plt.scatter(X[y_pred==1][:, 0], X[y_pred==1][:, 1], marker='+',color='r')
plt.title("Incorrect Number of Blobs")

# Effect of the correct number of clusters
y_pred = AgglomerativeClustering(affinity='euclidean',linkage='ward',n_clusters=3).fit_predict(X)

plt.subplot(222)
plt.scatter(X[y_pred==0][:, 0], X[y_pred==0][:, 1], marker='x',color='b')
plt.scatter(X[y_pred==1][:, 0], X[y_pred==1][:, 1], marker='+',color='r')
plt.scatter(X[y_pred==2][:, 0], X[y_pred==2][:, 1], marker='1',color='m')
plt.title("Correct Number of Blobs")

# Effect of clusters with unequal variance
X_varied, y_varied = make_blobs(n_samples=n_samples,
                                cluster_std=[1.0, 2.5, 0.5],
                                random_state=random_state)
y_pred = AgglomerativeClustering(affinity='euclidean',linkage='ward',n_clusters=3).fit_predict(X_varied)

plt.subplot(223)
plt.scatter(X_varied[y_pred==0][:, 0], X_varied[y_pred==0][:, 1], marker='x',color='b')
plt.scatter(X_varied[y_pred==1][:, 0], X_varied[y_pred==1][:, 1], marker='+',color='r')
plt.scatter(X_varied[y_pred==2][:, 0], X_varied[y_pred==2][:, 1], marker='1',color='m')
plt.title("Unequal Variance")

# Effect of clusters of very different sizes
X_filtered = np.vstack((X[y == 0][:500], X[y == 1][:100], X[y == 2][:10]))
y_pred = AgglomerativeClustering(affinity='euclidean',linkage='ward',n_clusters=3).fit_predict(X_filtered)

plt.subplot(224)
plt.scatter(X_filtered[y_pred==0][:, 0], X_filtered[y_pred==0][:, 1], marker='x',color='b')
plt.scatter(X_filtered[y_pred==1][:, 0], X_filtered[y_pred==1][:, 1], marker='+',color='r')
plt.scatter(X_filtered[y_pred==2][:, 0], X_filtered[y_pred==2][:, 1], marker='1',color='m')
plt.title("Unevenly Sized Blobs")

plt.show()

DBSCAN Clustering

# -*- coding:utf-8 -*-
# Density-based clustering model

import numpy as np

from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler


##############################################################################
# Generate data with make_blobs
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
                            random_state=0)
# Preprocess: standardize the data
X = StandardScaler().fit_transform(X)

##############################################################################
# Run DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
# mark the core samples; needed for the plot later
core_samples_mask[db.core_sample_indices_] = True
# cluster labels found by the algorithm; -1 marks noise points, other values are cluster ids
labels = db.labels_

# Number of clusters found
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

# Print information about the algorithm's performance
print('Estimated number of clusters: %d' % n_clusters_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels))

##############################################################################
# Plotting
import matplotlib.pyplot as plt

# black is reserved for noise points
unique_labels = set(labels)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))

i = -1
# marker styles; 'x' marks the noise points
marker = ['v','^','o','x']
for k, col in zip(unique_labels, colors):
    if k == -1:
        # draw noise points in black
        col = 'k'

    class_member_mask = (labels == k)

    i += 1
    if (i>=len(unique_labels)):
        i = 0

    # plot the core samples
    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], marker[i], markerfacecolor=col,
             markeredgecolor='k', markersize=14)
    # plot the non-core samples
    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], marker[i], markerfacecolor=col,
             markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

Association Analysis

Correlation Computation

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn import linear_model
 
 
df=pd.read_csv('result.csv')
sns.set(style='whitegrid', context='notebook')   # style sets the default look; context scales the default figure elements
cols = ['man', 'woman', 'money','workyears']
sns.pairplot(df[cols], size=2.5)
plt.tight_layout()
plt.show()
# Build the model
model =linear_model.LinearRegression()
# Train
model.fit(df[['man', 'woman','workyears']], df['money'])
print("coefficients: ", model.coef_)
w1 = model.coef_[0]
w2 = model.coef_[1]
w3 = model.coef_[2]
print("intercept: ", model.intercept_)
b = model.intercept_
x_test = [[1,0,6]]
predict = model.predict(x_test)
print("predict: ", predict)
 
 
 

The Apriori Algorithm

main

#-*- coding: utf-8 -*-
# Mine association rules from dish orders with the Apriori algorithm
from __future__ import print_function
import pandas as pd
from apriori import * # import the hand-written apriori functions below

inputfile = '../data/menu_orders.xls'
outputfile = '../tmp/apriori_rules.xls' # results file
data = pd.read_excel(inputfile, header = None)

print(u'\nConverting the raw data into a 0-1 matrix...')
ct = lambda x : pd.Series(1, index = x[pd.notnull(x)]) # helper for the 0-1 matrix conversion
b = map(ct, data.values) # apply it with map
data = pd.DataFrame(list(b)).fillna(0) # build the matrix, filling empty cells with 0
print(u'\nConversion done.')
del b # delete the intermediate variable b to save memory

support = 0.2 # minimum support
confidence = 0.5 # minimum confidence
ms = '---' # separator between items, e.g. A---B; the default is '--'. It must not occur anywhere in the raw table.

find_rule(data, support, confidence, ms).to_excel(outputfile) # save the results

apriori

#-*- coding: utf-8 -*-
from __future__ import print_function
import pandas as pd

# Custom join function, implementing the join from L_{k-1} to C_k
def connect_string(x, ms):
  x = list(map(lambda i:sorted(i.split(ms)), x))
  l = len(x[0])
  r = []
  for i in range(len(x)):
    for j in range(i,len(x)):
      if x[i][:l-1] == x[j][:l-1] and x[i][l-1] != x[j][l-1]:
        r.append(x[i][:l-1]+sorted([x[j][l-1],x[i][l-1]]))
  return r
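# For example, joining the frequent 1-itemsets ['A', 'B', 'C'] yields every
# candidate 2-itemset (a quick check of the join step; items chosen for illustration):
#   connect_string(['A', 'B', 'C'], '--')  ->  [['A', 'B'], ['A', 'C'], ['B', 'C']]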

# Function that searches for the association rules
def find_rule(d, support, confidence, ms = u'--'):
  result = pd.DataFrame(index=['support', 'confidence']) # holds the output

  support_series = 1.0*d.sum()/len(d) # support of each item
  column = list(support_series[support_series > support].index) # first filter by support
  k = 0
  
  while len(column) > 1:
    k = k+1
    print(u'\nSearch pass %s...' %k)
    column = connect_string(column, ms)
    print(u'Number of candidates: %s...' %len(column))
    sf = lambda i: d[i].prod(axis=1, numeric_only = True) # support computation for the new candidates

    # Build the joined data. This step is the most expensive in time and memory;
    # for larger datasets, consider parallelizing it.
    d_2 = pd.DataFrame(list(map(sf,column)), index = [ms.join(i) for i in column]).T

    support_series_2 = 1.0*d_2[[ms.join(i) for i in column]].sum()/len(d) # support after the join
    column = list(support_series_2[support_series_2 > support].index) # filter by support again
    support_series = support_series.append(support_series_2)
    column2 = []
    
    for i in column: # enumerate the possible rules: for {A,B,C}, is it A+B-->C, B+C-->A or C+A-->B?
      i = i.split(ms)
      for j in range(len(i)):
        column2.append(i[:j]+i[j+1:]+i[j:j+1])

    confidence_series = pd.Series(index=[ms.join(i) for i in column2]) # the confidence series

    for i in column2: # compute the confidences
      confidence_series[ms.join(i)] = support_series[ms.join(sorted(i))]/support_series[ms.join(i[:len(i)-1])]

    for i in confidence_series[confidence_series > confidence].index: # filter by confidence
      result[i] = 0.0
      result[i]['confidence'] = confidence_series[i]
      result[i]['support'] = support_series[ms.join(sorted(i.split(ms)))]
  
  result = result.T.sort_values(['confidence','support'], ascending = False) # tidy up and sort the results
  print(u'\nResults:')
  print(result)
  
  return result

Recommender Systems

The UBCF Algorithm

#-*- coding: utf-8 -*-
# Movie recommendation with user-based collaborative filtering (UBCF)
from __future__ import print_function
import pandas as pd

############    Main program   ##############
if __name__ == "__main__":
    print("\n-------------- Recommending movies with UBCF, running... -----------\n")
    traindata = pd.read_csv('/media/dp_zhou/Knowledge/Learning data/Python books/数据与代码/数据与代码/示例程序/data/u1.base',sep='\t', header=None,index_col=None)
    testdata = pd.read_csv('/media/dp_zhou/Knowledge/Learning data/Python books/数据与代码/数据与代码/示例程序/data/u1.test',sep='\t', header=None,index_col=None)
    # drop the timestamp column
    traindata.drop(3,axis=1, inplace=True)
    testdata.drop(3,axis=1, inplace=True)
    # rename the rows and columns
    traindata.rename(columns={0:'userid',1:'movid',2:'rat'}, inplace=True)
    testdata.rename(columns={0:'userid',1:'movid',2:'rat'}, inplace=True)
    traindf=traindata.pivot(index='userid', columns='movid', values='rat')
    testdf=testdata.pivot(index='userid', columns='movid', values='rat')
    traindf.rename(index={i:'usr%d'%(i) for i in traindf.index} , inplace=True)
    traindf.rename(columns={i:'mov%d'%(i) for i in traindf.columns} , inplace=True)
    testdf.rename(index={i:'usr%d'%(i) for i in testdf.index} , inplace=True)
    testdf.rename(columns={i:'mov%d'%(i) for i in testdf.columns} , inplace=True)
    userdf=traindf.loc[testdf.index]
    # get the predicted ratings and the recommendation lists
    trainrats,trainrecomm=recomm(traindf,userdf)
#-*- coding: utf-8 -*-
# Companion module: the prediction and recommendation functions used above
import numpy as np
import pandas as pd
import math
def prediction(df,userdf,Nn=15):  # Nn is the number of neighbors
    corr=df.T.corr()
    rats=userdf.copy()
    for usrid in userdf.index:
        dfnull=df.loc[usrid][df.loc[usrid].isnull()]
        usrv=df.loc[usrid].mean()  # the user's mean rating
        for i in range(len(dfnull)):
            nft=(df[dfnull.index[i]]).notnull()
            # build the list of neighbors
            if(Nn<=len(nft)):
                nlist=df[dfnull.index[i]][nft][:Nn]
            else:
                nlist=df[dfnull.index[i]][nft][:len(nft)]
            nlist=nlist[corr.loc[usrid,nlist.index].notnull()]
            nratsum=0
            corsum=0
            if(0!=nlist.size):
                nv=df.loc[nlist.index,:].T.mean()  # the neighbors' mean ratings
                for index in nlist.index:
                    ncor=corr.loc[usrid,index]
                    nratsum+=ncor*(df[dfnull.index[i]][index]-nv[index])
                    corsum+=abs(ncor)
                if(corsum!=0):
                    rats.at[usrid,dfnull.index[i]]= usrv + nratsum/corsum
                else:
                    rats.at[usrid,dfnull.index[i]]= usrv
            else:
                rats.at[usrid,dfnull.index[i]]= None
    return rats
def recomm(df,userdf,Nn=15,TopN=3):
    ratings=prediction(df,userdf,Nn)  # get the predicted ratings
    recomm=[]  # holds the recommendation lists
    for usrid in userdf.index:
        # find the unrated items via their NA values
        ratft=userdf.loc[usrid].isnull()
        ratnull=ratings.loc[usrid][ratft]
        # sort the predicted ratings
        if(len(ratnull)>=TopN):
            sortlist=(ratnull.sort_values(ascending=False)).index[:TopN]
        else:
            sortlist=ratnull.sort_values(ascending=False).index[:len(ratnull)]
        recomm.append(sortlist)
    return ratings,recomm


Collaborative Filtering

#_*_coding:utf-8_*_

import pandas as pd
import numpy as np

header = ['user_id', 'item_id', 'rating', 'timestamp']
dataset = pd.read_csv('../data/u.data',sep='\t',names=header)

# Count the unique users and movies
# unique() deduplicates a 1-D array; shape[0] is the length of the result
users = dataset.user_id.unique().shape[0]
items = dataset.item_id.unique().shape[0]
from sklearn.model_selection import train_test_split
train_data,test_data = train_test_split(dataset,test_size=0.25)

'''
Build the user-item matrix.
itertuples is pandas' way of iterating over rows together with their index;
each row looks like: Pandas(Index=77054, user_id=650, item_id=528, rating=3, timestamp=891370998)
'''
train_data_matrix = np.zeros((users,items))
for line in train_data.itertuples():
    train_data_matrix[line[1] - 1, line[2] - 1] = line[3]

test_data_matrix = np.zeros((users,items))
for line in test_data.itertuples():
    test_data_matrix[line[1] - 1, line[2] - 1] = line[3]
# Compute similarities
from sklearn.metrics.pairwise import pairwise_distances
# the similarity plays the role of the weight w (note that pairwise_distances returns cosine distances; strictly, similarity = 1 - distance)
user_similarity = pairwise_distances(train_data_matrix,metric='cosine')
# train_data_matrix.T transposes the matrix
items_similarity = pairwise_distances(train_data_matrix.T,metric='cosine')

'''
User-based recommendation from the user similarity matrix.
mean(axis=1) takes the mean of each row, returning an m*1 result.
np.newaxis adds an axis, turning the 1-D mean vector into a column, so that
subtracting mean_user_rating from train_data_matrix removes each user's mean
from their row; this normalizes the ratings to a common range.
numpy reference:
      a.dot(b)  -> matrix product of a and b
      np.abs(a) -> element-wise absolute value of a
      np.sum()  -> no arguments: sum of all elements
                -> axis=0: column sums
                -> axis=1: row sums
      b / a     -> element-wise division
'''
mean_user_rating = train_data_matrix.mean(axis = 1) # mean rating of each user (row means)
rating_diff = train_data_matrix - mean_user_rating[:,np.newaxis]  # normalize the ratings
pred = mean_user_rating[:, np.newaxis] \
       + user_similarity.dot(rating_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T  # weights w applied to the normalized ratings

'''
Evaluation metric: root mean squared error (RMSE)
'''
from sklearn.metrics import mean_squared_error
from math import sqrt

pred = pred[test_data_matrix.nonzero()].flatten()
test_data_matrix = test_data_matrix[test_data_matrix.nonzero()].flatten()
result = sqrt(mean_squared_error(pred,test_data_matrix))
print(result)
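The pred matrix computed above is the standard user-based prediction formula, with the cosine weights playing the role of sim(u, v) and \bar{r}_u denoting user u's mean rating:

\hat{r}_{u,i} = \bar{r}_u + \frac{\sum_{v}\mathrm{sim}(u,v)\,(r_{v,i} - \bar{r}_v)}{\sum_{v}\lvert \mathrm{sim}(u,v)\rvert}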

Time Series Analysis

The ARIMA Model

#-*- coding: utf-8 -*-
# ARIMA time series model
from __future__ import print_function
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.graphics.api import qqplot
from statsmodels.graphics.tsaplots import plot_acf

# Initialize parameters
discfile = '../data/arima_data.xls'

# Read the data with the date column as the index; pandas automatically parses the date column as Datetime
data = pd.read_excel(discfile,index_col=0)
print(data.head())
print('\n Data Types:')
print(data.dtypes)


# Time series plot
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei'] # so that Chinese labels display correctly
plt.rcParams['axes.unicode_minus'] = False # so that the minus sign displays correctly
data.plot()
plt.show()


# Autocorrelation plot
from statsmodels.graphics.tsaplots import plot_acf
plot_acf(data).show()


# Stationarity test
from statsmodels.tsa.stattools import adfuller as ADF
print(u'ADF test on the original series:', ADF(data[u'销量']))
# the return values are adf, pvalue, usedlag, nobs, critical values, icbest, regresults and resstore


# Time series plot of the differenced series
D_data = data.diff().dropna()
D_data.columns = [u'销量差分']
D_data.plot() # time series plot
plt.show()


# Autocorrelation plot of the differenced series
plot_acf(D_data).show()


# Partial autocorrelation plot
from statsmodels.graphics.tsaplots import plot_pacf
plot_pacf(D_data).show() 


# Stationarity test
print(u'ADF test on the differenced series:', ADF(D_data[u'销量差分']))


# White-noise test
from statsmodels.stats.diagnostic import acorr_ljungbox
print(u'White-noise (Ljung-Box) test on the differenced series:', acorr_ljungbox(D_data, lags=1))
# returns the test statistic and the p-value


# First-order difference
fig = plt.figure(figsize=(12,8))
ax1= fig.add_subplot(111)
diff1 = data.diff(1)
diff1.plot(ax=ax1)


# Second-order difference
fig = plt.figure(figsize=(12,8))
ax2= fig.add_subplot(111)
diff2 = data.diff(2)
diff2.plot(ax=ax2)


# Choose suitable p and q
dta = data.diff(1)[1:]
fig = plt.figure(figsize=(12,8))
ax1=fig.add_subplot(211)
fig1 = sm.graphics.tsa.plot_acf(dta[u'销量'],lags=10,ax=ax1)
ax2 = fig.add_subplot(212)
fig2 = sm.graphics.tsa.plot_pacf(dta[u'销量'],lags=10,ax=ax2)


# Fit candidate models and compare their information criteria
arma_mod20 = sm.tsa.ARMA(dta,(2,0)).fit()
print(arma_mod20.aic,arma_mod20.bic,arma_mod20.hqic)
arma_mod01 = sm.tsa.ARMA(dta,(0,1)).fit()
print(arma_mod01.aic,arma_mod01.bic,arma_mod01.hqic)
arma_mod10 = sm.tsa.ARMA(dta,(1,0)).fit()
print(arma_mod10.aic,arma_mod10.bic,arma_mod10.hqic)


# Q-Q plot of the residuals
resid = arma_mod01.resid
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
fig = qqplot(resid, line='q', ax=ax, fit=True)


# Autocorrelation check of the residuals
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(arma_mod01.resid.values.squeeze(), lags=10, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(arma_mod01.resid, lags=10, ax=ax2)


# Durbin-Watson test
print(sm.stats.durbin_watson(arma_mod01.resid.values))


# Ljung-Box test
import numpy as np
r,q,p = sm.tsa.acf(resid.values.squeeze(), qstat=True)
datap = np.c_[range(1,36), r[1:], q, p]
table = pd.DataFrame(datap, columns=['lag', "AC", "Q", "Prob(>Q)"])
print(table.set_index('lag'))


# Forecast
predict_sunspots = arma_mod01.predict('2015-2-07', '2015-2-15', dynamic=True)
fig, ax = plt.subplots(figsize=(12, 8))
print(predict_sunspots)
predict_sunspots[0] += data['2015-02-06':][u'销量']
data=pd.DataFrame(data)
for i in range(len(predict_sunspots)-1):
    predict_sunspots[i+1]=predict_sunspots[i]+predict_sunspots[i+1]
print(predict_sunspots)
ax = data.loc['2015':].plot(ax=ax)
predict_sunspots.plot(ax=ax)
plt.show()
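Note that fitting an ARMA(p, q) model to the first-differenced series, as done above, is equivalent to fitting an ARIMA(p, 1, q) model to the original series; the cumulative-sum loop before plotting simply undoes the differencing:

\hat{y}_{t} = \hat{y}_{t-1} + \widehat{\Delta y}_{t}, \qquad \Delta y_{t} = y_{t} - y_{t-1}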
