Leon's Data Mining Toolbox

Preface

----A collection of common data mining models implemented in Python;
----The essential baseline code has been collected and adapted, so it can serve as a baseline for future data work;
----The code runs under Python 2.x. As anyone who works with Python knows, the syntax differences between Python 2.x and 3.x rarely affect this kind of code;
----Migrating between 2.x and 3.x is a small job: for these scripts it mostly comes down to punctuation details such as adding parentheses to print statements (see the sketch after this list), so using 2.x code as a baseline should not hinder anyone who works in 3.x;
----The goal is to improve the efficiency of future data mining work;
----I hope it helps.
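For example, converting the print calls is usually the whole migration. A minimal illustration with a made-up variable x:

x = (2, 3)
print x     # Python 2.x statement form, as used in the snippets below
print(x)    # Python 3.x function form; with a single argument this line also runs under 2.x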

Data Analysis Libraries

NumPy

Creating Arrays

# -*- coding:utf-8 -*-
import numpy as np

#  ---- Creating arrays ----
arr1 = np.array([2,3,4])    # create an array from a list
# result:
# [2 3 4]
arr2 = np.array([(1.3,9,2.0),(7,6,1)])    # create a 2-D array from a list of tuples
# result:
# [[ 1.3  9.   2. ]
#  [ 7.   6.   1. ]]


arr3 = np.zeros((2,3))    # all-zeros matrix, with the shape given by the tuple (2, 3)
# result:
# [[ 0.  0.  0.]
#  [ 0.  0.  0.]]


arr4 = np.identity(3)    # 3x3 identity matrix
# result:
# [[ 1.  0.  0.]
#  [ 0.  1.  0.]
#  [ 0.  0.  1.]]


arr5 = np.random.random(size = (2,3)) # random matrix with every element drawn from [0, 1)
# result:
# [[ 0.31654004  0.87056375  0.29050563]
#  [ 0.55267505  0.59191276  0.20174988]]

arr6 = np.arange(5,20,3)  # evenly spaced values; the arguments are start, stop, step; start is included, stop is excluded
# result: [ 5  8 11 14 17]
arr7 = np.linspace(0,2,9)  # evenly spaced values; the arguments are start, stop and the NUMBER of points; both endpoints are included
# result: [ 0.    0.25  0.5   0.75  1.    1.25  1.5   1.75  2.  ]

Accessing Arrays

# Inspect the array's attributes
print arr2.shape # shape of the array
# result: (2, 3)
print arr2.ndim  # number of dimensions (axes)
# result: 2
print arr2.size  # total number of elements
# result: 6
print arr2.dtype.name   # data type of the elements
# result: float64
print type(arr2) # type of the array object itself
# result: <type 'numpy.ndarray'>

# Access elements by indexing and slicing
def f(x,y):
    return 10*x+y
arr8 = np.fromfunction(f,(4,3),dtype = int)
print arr8
# result:
# [[ 0  1  2]
# [10 11 12]
# [20 21 22]
# [30 31 32]]
print arr8[1,2] # element in row 1, column 2 (indices start at 0)
# result: 12
print arr8[0:2,:]  # slice: the first two rows
# result:
# [[ 0  1  2]
#  [10 11 12]]
print arr8[:,1]    # slice: column 1
# result: [ 1 11 21 31]
print arr8[-1]     # slice: the last row
# result: [30 31 32]

# Access elements through an iterator
for row in arr8:
    print row
# result:
# [0 1 2]
# [10 11 12]
# [20 21 22]
# [30 31 32]
for element in arr8.flat:
    print element
# prints every element of the matrix
print '-'*70

Array Operations

Between Arrays

print '''Array operations'''
arr9 = np.array([[2,1],[1,2]])
arr10 = np.array([[1,2],[3,4]])
print arr9 - arr10  
# result:
# [[ 1 -1]
#  [-2 -2]]
print arr9**2
# result:
# [[4 1]
#  [1 4]]
print 3*arr10
# result:
# [[ 3  6]
#  [ 9 12]]
print arr9*arr10  # element-wise multiplication
# result:
# [[2 2]
#  [3 8]]
print np.dot(arr9,arr10)  # matrix multiplication
# result:
# [[ 5  8]
#  [ 7 10]]
print arr10.T  # transpose
# result:
# [[1 3]
#  [2 4]]
print np.linalg.inv(arr10) # inverse of the matrix
# result:
# [[-2.   1. ]
#  [ 1.5 -0.5]]
print arr10.sum()  # sum of all elements
# result: 10
print arr10.max()  # largest element
# result: 4
print arr10.cumsum(axis = 1)  # cumulative sum along each row
# result: 
# [[1 3]
#  [3 7]]
print '-'*70

Arrays and Functions

print '''NumPy universal functions'''
print np.exp(arr9)     # exponential
# result:
# [[ 7.3890561   2.71828183]
#  [ 2.71828183  7.3890561 ]]
print np.sin(arr9)      # sine (in radians)
# result:
# [[ 0.90929743  0.84147098]
#  [ 0.84147098  0.90929743]]
print np.sqrt(arr9)     # square root
# result:
# [[ 1.41421356  1.        ]
#  [ 1.          1.41421356]]
print np.add(arr9,arr10)  # same as arr9 + arr10
# result:
# [[3 3]
#  [4 6]]
print '-'*70

Splitting and Joining Arrays

# Joining
arr11 = np.vstack((arr9,arr10))  # stack arrays vertically; the name vstack is short for "vertical stack"
print arr11
# result:
# [[2 1]
#  [1 2]
#  [1 2]
#  [3 4]]
arr12 = np.hstack((arr9,arr10))  # stack arrays horizontally
print arr12
# result:
# [[2 1 1 2]
#  [1 2 3 4]]
# Splitting
print np.hsplit(arr12,2)  # split the array horizontally into 2 parts
# result:
# [array([[2, 1],
#        [1, 2]]), array([[1, 2],
#        [3, 4]])]
print np.vsplit(arr11,2)   # split the array vertically into 2 parts
# result:
# [array([[2, 1],
#        [1, 2]]), array([[1, 2],
#        [3, 4]])]

Pandas

DataFrame

# -*- coding:utf-8 -*-
import pandas as pd    # import pandas under the alias pd
data = {'id': ['Jack', 'Sarah', 'Mike'],
        'age': [18, 35, 20],
        'cash': [10.53, 500.7, 13.6]}
df = pd.DataFrame(data)    # call the constructor and assign the result to df
print df
# result:
#    age    cash     id
# 0   18   10.53   Jack
# 1   35  500.70  Sarah
# 2   20   13.60   Mike


df2 = pd.DataFrame(data, columns=['id', 'age', 'cash'],index=['one', 'two', 'three'])
print df2
# result:
#          id   age    cash
# one     Jack   18   10.53
# two    Sarah   35  500.70
# three   Mike   20   13.60


print df2['id']
# result:
# one       Jack
# two      Sarah
# three     Mike
# Name: id, dtype: object


s = pd.Series({'a': 4, 'b': 9, 'c': 16}, name='number')
print s
# result:
# a     4
# b     9
# c    16
# Name: number, dtype: int64

Series

print s[0]
# result: 4
print s[:3]
# result:
# a     4
# b     9
# c    16
# Name: number, dtype: int64


print s['a']
# result: 4
s['d'] = 25    # if the key is not already in the Series, a new row is appended
print s
# result:
# a     4
# b     9
# c    16
# d    25
# Name: number, dtype: int64


import numpy as np
print np.sqrt(s)
# result:
# a    2.0
# b    3.0
# c    4.0
# d    5.0
# Name: number, dtype: float64
print s*s
# result:
# a     16
# b     81
# c    256
# d    625
# Name: number, dtype: int64


print df['id']    # access by column name
# result:
# 0     Jack
# 1    Sarah
# 2     Mike
# Name: id, dtype: object

df['rich'] = df['cash'] > 200.0
print df
# result:
#    age    cash     id   rich
# 0   18   10.53   Jack  False
# 1   35  500.70  Sarah   True
# 2   20   13.60   Mike  False

del df['rich']
print df
# result:
#    age    cash     id
# 0   18   10.53   Jack
# 1   35  500.70  Sarah
# 2   20   13.60   Mike

SciPy

Vectorized Computation

# -*- coding:utf-8 -*-
from scipy import poly1d
p = poly1d([3, 4, 5])
print p
# result:
#    2
# 3 x + 4 x + 5

print p*p
# result:
#   4      3      2
# 9 x + 24 x + 46 x + 40 x + 25

print p.integ(k=6)    # indefinite integral of p(x), with the constant of integration set to 6
# result:
#   3   2
# 1 x + 2 x + 5 x + 6
print p.deriv()    # first derivative of p(x)
# result:
# 6 x + 4

p([4, 5])    # evaluate p(x) at each of the given values
# result:
# array([ 69, 100])



# -*- coding:utf-8 -*-
import numpy as np

def addsubtract(a, b):    # as written, this only accepts comparable scalars as arguments
    if a > b:
        return a - b
    else:
        return a + b

vec_addsubtract = np.vectorize(addsubtract)
print vec_addsubtract([0, 3, 6, 9], [1, 3, 5, 7])
# result:
# [1 6 1 2]

scikit-learn

Implementing and Calling Basic Machine Learning Functions

from sklearn import datasets


# A dataset is a dictionary-like object that holds all the data and some metadata about it.
# The data itself is stored in the .data member, an n_samples * n_features array.
# For supervised problems, one or more response variables are stored in the .target member.

digits = datasets.load_digits()

# For example, in the digits dataset, digits.data holds the features that can be used to classify the digit samples
print digits.data
# result:
# [[  0.   0.   5. ...,   0.   0.   0.]
#  [  0.   0.   0. ...,  10.   0.   0.]
#  [  0.   0.   0. ...,  16.   9.   0.]
#  ..., 
#  [  0.   0.   1. ...,   6.   0.   0.]
#  [  0.   0.   2. ...,  12.   0.   0.]
#  [  0.   0.  10. ...,  12.   1.   0.]]


# digits.target holds the target variable of the digits dataset: the true digit each image represents, which is what we want to predict
print digits.target
# result:
# [0 1 2 ..., 8 9 8]
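# A quick sanity check of the layout described above:
print digits.data.shape     # (1797, 64): 1797 samples with 64 features (8x8 pixel images)
print digits.target.shape   # (1797,): one label per sample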



print '''Training and predicting'''

from sklearn import svm

# choose the model parameters
clf = svm.SVC(gamma=0.0001,C=100)

# Our estimator is named clf. It now has to learn from the data; this is done by
# passing the training set to the fit method. Here every sample except the last
# one is used as the training set.

# train
clf.fit(digits.data[:-1], digits.target[:-1])

# predict (the [-1:] slice keeps the input 2-D, which is what predict expects)
print clf.predict(digits.data[-1:])
# result: [8]

Plotting

matplotlib - Common Statistical Visualizations

# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0, 10, 1000)
y = np.sin(x)
z = np.cos(x**2)

plt.figure(figsize=(8,4))
plt.plot(x,y,label="$sin(x)$",color="red",linewidth=2)
plt.plot(x,z,"b--",label="$cos(x^2)$")
plt.xlabel("Time(s)")
plt.ylabel("Volt")
plt.title("PyPlot First Example")
plt.ylim(-1.2,1.2)
plt.legend()
plt.show()



# -*- coding:utf-8 -*-
import matplotlib.pylab as plt
import numpy as np
# Panel 1
plt.subplot(2,1,1)    # arguments: number of rows, number of columns, panel index
# Panel 2
plt.subplot(2,2,3)
# Panel 3
plt.subplot(2,2,4)
plt.show()



# -*- coding:utf-8 -*-
import matplotlib.pylab as plt
import numpy as np
# Panel 1
plt.subplot(2,1,1)    # arguments: number of rows, number of columns, panel index
n = 12
X = np.arange(n)
Y1 = (1-X/float(n)) * np.random.uniform(0.5,1.0,n)
Y2 = (1-X/float(n)) * np.random.uniform(0.5,1.0,n)

# plt.bar(x, y) draws a bar chart; facecolor and edgecolor set the fill and border colors of the bars
plt.bar(X, +Y1, facecolor='#9999ff', edgecolor='white')
plt.bar(X, -Y2, facecolor='#ff9999', edgecolor='white')

for x, y in zip(X,Y1):
    # plt.text() places the given text at the given coordinates
    plt.text(x+0.4, y+0.05, '%.2f' % y, ha='center', va='bottom')

# plt.ylim(y1, y2) limits the y-axis range of the plot
plt.ylim(-1.25,+1.25)


# Panel 2
plt.subplot(2,2,3)
n = 20
Z = np.random.uniform(0,1,n)
plt.pie(Z)

# Panel 3
plt.subplot(2,2,4)
X = np.linspace(-np.pi, np.pi, 256,endpoint=True)
Y_C, Y_S = np.cos(X), np.sin(X)

plt.plot(X, Y_C, color="blue", linewidth=2.5, linestyle="-")
plt.plot(X, Y_S, color="red", linewidth=2.5, linestyle="-")

plt.xlim(X.min()*1.1, X.max()*1.1)
plt.xticks([-np.pi, -np.pi/2, 0, np.pi/2, np.pi],
       [r'$-\pi$', r'$-\pi/2$', r'$0$', r'$+\pi/2$', r'$+\pi$'])

plt.ylim(Y_C.min()*1.1, Y_C.max()*1.1)
plt.yticks([-1, 0, +1],
       [r'$-1$', r'$0$', r'$+1$'])
plt.show()


bokeh - Web-based Data Visualization

# -*- coding:utf-8 -*-
from bokeh.plotting import figure, output_file, show
x = [1, 2, 3, 4, 5]
y = [6, 7, 2, 4, 5]
# output to a static HTML file
output_file("../tmp/lines.html", title="line plot example")
# create a figure object with a title and axis labels
p = figure(title="simple line example", x_axis_label='x', y_axis_label='y')
# add a line with a legend entry
p.line(x, y, legend="Line A.", line_width=2)
show(p)

Classification and Prediction

Linear Regression

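A minimal sketch of fitting an ordinary least squares model with scikit-learn; the toy data below (y roughly 3x + 4 plus noise) is made up for illustration:

# -*- coding:utf-8 -*-
import numpy as np
from sklearn.linear_model import LinearRegression

X = np.array([[1], [2], [3], [4], [5]], dtype=float)    # one feature per sample
y = np.array([7.1, 9.8, 13.2, 16.1, 18.9])              # noisy values of 3x + 4

lr = LinearRegression()
lr.fit(X, y)                  # fit the model
print(lr.coef_)               # fitted slope, roughly [ 2.99]
print(lr.intercept_)          # fitted intercept, roughly 4.05
print(lr.predict([[6.0]]))    # prediction for a new sample, roughly [ 21.99]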

Logistic Regression

# -*- coding:utf8 -*-
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# load the data and take a look
data = pd.read_csv('../data/LogisticRegression.csv', encoding='utf-8')
# print data.head(5)    # first five rows of the data frame

# one-hot encode the categorical variable
data_dum = pd.get_dummies(data, prefix='rank', columns=['rank'], drop_first=True)
print data_dum.tail(5)    # last five rows of the data frame
# result:
#     admit  gre   gpa  rank_2  rank_3  rank_4
# 395      0  620  4.00     1.0     0.0     0.0
# 396      0  560  3.04     0.0     1.0     0.0
# 397      0  460  2.63     1.0     0.0     0.0
# 398      0  700  3.65     1.0     0.0     0.0
# 399      0  600  3.89     0.0     1.0     0.0

# split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(data_dum.iloc[:, 1:], data_dum.iloc[:, 0], test_size=.1, random_state=520)

lr = LogisticRegression()    # build the logistic regression model
lr.fit(X_train, y_train)    # train the model on the processed data
print 'Accuracy of the logistic regression: {0:.2f}%'.format(lr.score(X_test, y_test) *100)
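As a quick illustration of what drop_first does, here is get_dummies on a toy frame (illustrative data, not the admissions set): the first level rank_1 is dropped and the remaining levels become indicator columns.

demo = pd.DataFrame({'rank': [1, 2, 3]})
print pd.get_dummies(demo, prefix='rank', columns=['rank'], drop_first=True)
# result:
#    rank_2  rank_3
# 0       0       0
# 1       1       0
# 2       0       1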

Decision Tree

Classification with the ID3 Algorithm

# -*- coding:utf-8 -*-
# Classification with the ID3 algorithm
import pandas as pd
from sklearn.tree import DecisionTreeClassifier as DTC, export_graphviz

data = pd.read_csv('../data/titanic_data.csv', encoding='utf-8')
data.drop(['PassengerId'], axis=1, inplace=True)    # drop the ID column; it is not a usable feature

# Sex is a categorical label; convert it to numbers: 1 for male, 0 for female.
data.loc[data['Sex'] == 'male', 'Sex'] = 1
data.loc[data['Sex'] == 'female', 'Sex'] = 0
data.fillna(int(data.Age.mean()), inplace=True)    # fill missing values with the mean age
print data.head(5)   # inspect the data

X = data.iloc[:, 1:3]    # for ease of presentation, age (the last column) is not used
y = data.iloc[:, 0]

dtc = DTC(criterion='entropy')    # decision tree with the information-entropy criterion (ID3-style; sklearn's tree is CART-based)
dtc.fit(X, y)    # train the model
print 'Accuracy:', dtc.score(X,y)

# Visualize the tree. The export is a .dot file; Graphviz is needed to convert it to .pdf or .png.
with open('../tmp/tree.dot', 'w') as f:
    f = export_graphviz(dtc, feature_names=X.columns, out_file=f)

Random Forest

from sklearn import ensemble
from sklearn import tree
from sklearn import datasets
from sklearn.model_selection import train_test_split

wine = datasets.load_wine()
X_data = wine.data
y_data = wine.target

X_train,X_test,y_train,y_test = train_test_split(X_data,y_data,test_size = 0.3)
clf = tree.DecisionTreeClassifier(criterion='gini')
rfc = ensemble.RandomForestClassifier(random_state=10)    # fix the random seed; the number of trees is set with n_estimators
clf = clf.fit(X_train,y_train)
rfc  = rfc.fit(X_train,y_train)

### Repeated cross-validation
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
rfc_l = []
clf_l = []
for i in range(10):
    rfc = ensemble.RandomForestClassifier(n_estimators=25)
    rfc_s = cross_val_score(rfc,X_train,y_train,cv = 10).mean()
    rfc_l.append(rfc_s)
    
    clf = tree.DecisionTreeClassifier(criterion='entropy')
    clf_s = cross_val_score(clf,X_train,y_train,cv  = 10).mean()
    clf_l.append(clf_s)
    
plt.plot(range(1,11),rfc_l,label='RandomForest')
plt.plot(range(1,11),clf_l,label = "DecisionTree")

plt.legend()
plt.show()

# The .feature_importances_ attribute of a random forest gives each feature's share of the total importance
rfc.feature_importances_
>> array([0.22856109, 0.03452478, 0.01369924, 0.00668104, 0.01473769,
       0.09992003, 0.08866531, 0.00509946, 0.02717142, 0.14646766,
       0.07152866, 0.13577966, 0.12716396])

# The forest's .apply method returns, for each sample, the index of the leaf it lands in, one column per tree
rfc.apply(X_test)
>> array([[ 9, 10,  2, ...,  6, 17, 10],
       [14, 18, 10, ..., 10, 23, 16],
       [14, 18, 10, ..., 10, 23, 16],
       ...,
       [ 5, 14,  7, ...,  9, 24,  5],
       [ 9, 16,  5, ...,  6, 16, 10],
       [14, 18, 10, ..., 10, 23, 16]], dtype=int64)
       
# .predict returns the predicted label for each test sample; compare with the true labels in y_test below.
rfc.predict(X_test)
>> # an array of predicted class labels (0/1/2), one entry per sample in X_test
       
y_test
>> array([2, 0, 0, 1, 1, 2, 1, 2, 2, 1, 2, 2, 0, 1, 1, 2, 0, 2, 1, 0, 2, 1,
       1, 0, 1, 0, 0, 2, 2, 0, 0, 1, 0, 2, 0, 0, 2, 1, 1, 1, 1, 2, 0, 1,
       0, 1, 1, 0, 2, 1, 1, 1, 2, 0])
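The ensemble also supports out-of-bag evaluation, which reuses the samples each tree did not see during bootstrapping (a standard scikit-learn option, shown here as a sketch):

rfc_oob = ensemble.RandomForestClassifier(n_estimators=25, oob_score=True, random_state=10)
rfc_oob = rfc_oob.fit(X_train, y_train)
rfc_oob.oob_score_    # accuracy estimated on the out-of-bag samples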



BP Neural Network

# A Python implementation of a BP (back-propagation) neural network

import numpy as np
from numpy import random
import math
import copy
import sklearn.datasets
from sklearn.preprocessing import scale
import matplotlib.pyplot as plt

# Generate the data and split it into training and test sets
trainingSet, trainingLabels = sklearn.datasets.make_moons(400, noise=0.20)
plt.scatter(trainingSet[trainingLabels==1][:,0], trainingSet[trainingLabels==1][:,1], s=40, c='r', marker='x',cmap=plt.cm.Spectral)
plt.scatter(trainingSet[trainingLabels==0][:,0], trainingSet[trainingLabels==0][:,1], s=40, c='y', marker='+',cmap=plt.cm.Spectral)
plt.show()
testSet = trainingSet[320:]
testLabels = trainingLabels[320:]
trainingSet = trainingSet[:320]
trainingLabels = trainingLabels[:320]

# Network hyper-parameters
layer =[2,3,1] # number of layers and nodes per layer
Lambda = 0.005 # regularization coefficient
alpha = 0.2 # learning rate
num_passes = 10000 # number of iterations
m = len(trainingSet) # number of samples

# Build the network
# The structure is stored layer by layer in lists, so the number of layers and
# the nodes per layer can be chosen freely.
b = [] # biases: len(layer)-1 entries; b[0] is the bias (a vector) of the first hidden layer
W = []
for i in range(len(layer)-1):
    W.append(random.random(size = (layer[i+1],layer[i]))) # W[i] is the transfer matrix (a NumPy array) from layer i to layer i+1; the input layer is layer 0
    b.append(np.array([0.1]*layer[i+1]))  # the bias b[i] has shape 1 * (number of nodes in layer i+1)
a = [np.array(0)]*(len(W)+1) # a[0] = x (the input), a[1] = f(z[0]), ..., a[len(W)] is the final output
z = [np.array(0)]*len(W) # z[0] is the not-yet-activated first hidden layer, i.e. the pre-activation fed by the input layer

def costfunction(predict,labels):
    # Cost function without the regularization term.
    # Both arguments are NumPy vectors.
    return sum((predict - labels)**2)
def error_rate(predict,labels):
    # Error rate for a binary classification problem with labels 0 or 1.
    # Both arguments are NumPy vectors.
    error =0.0
    for i in range(len(predict)):
        predict[i] = round(predict[i])
        if predict[i]!=labels[i]:
            error+=1
    return error/len(predict)
def sigmoid(z):  # sigmoid activation function
    return 1/(1+np.exp(-z))
def diff_sigmoid(z): # derivative of the sigmoid activation
    return sigmoid(z)*(1-sigmoid(z))

activation_function = sigmoid  # choose the activation function
diff_activation_function = diff_sigmoid # and its derivative


# Train the BP network
a[0] = np.array(trainingSet).T # transpose so that each column is a sample and each row a feature
y = np.array(trainingLabels)

for v in range(num_passes):
    # Forward propagation
    for i in range(len(W)):
        z[i] = np.dot(W[i],a[i])
        for j in range(m):
            z[i][:,j]+=b[i] # add the bias
        a[i+1] = activation_function(z[i]) # activate the nodes

    predict = a[-1][0] # a[-1] is the output of the final layer, i.e. the predictions

    # Back-propagation
    delta = [np.array(0)]*len(W) # delta[0] is the residual of the first hidden layer, delta[-1] that of the output layer

    # Residual of the output layer
    delta[-1] = -(y-a[-1])*diff_activation_function(z[-1])

    # Residuals of the other layers (all but the output layer), traversed backwards
    for i in range(len(delta)-1):
        delta[-i-2] = np.dot(W[-i-1].T,delta[-i-1])*diff_activation_function(z[-i-2])
        # if index -i-2 denotes layer L, then W[-i-1] is the transfer matrix from layer L to L+1,
        # delta[-i-1] is the residual of layer L+1, and z[-i-2] is the pre-activation of layer L

    # Compute the partial derivatives we actually need
    delta_w = [np.array(0)]*len(W)
    delta_b = [np.array(0)]*len(W)
    for i in range(len(W)):
        # Matrix form: the two lines below already sum the contributions of all samples
        delta_w[i] = np.dot(delta[i],a[i].T)
        delta_b[i] = np.sum(delta[i],axis=1)

    # Update the weights
    for i in range(len(W)):
        W[i] -= alpha*(Lambda*W[i]+delta_w[i]/m)
        b[i] -= alpha/m*delta_b[i]

print 'Unregularized cost on the training set:',costfunction(predict,np.array(trainingLabels))
print 'Error rate on the training set:',error_rate(predict,np.array(trainingLabels))

# Evaluate the network on the test set
a[0] = np.array(testSet).T # each column is a sample, each row a feature
# Forward propagation
m = len(testSet)
for i in range(len(W)):
    z[i] = np.dot(W[i],a[i])
    for j in range(m):
        z[i][:,j]+=b[i]
    a[i+1] = activation_function(z[i])
predict = a[-1][0]

print 'Unregularized cost on the test set:',costfunction(predict,np.array(testLabels))
print 'Error rate on the test set:',error_rate(predict,np.array(testLabels))
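For reference, the rules implemented above are standard back-propagation with an L2 penalty. In the code's notation (f is the sigmoid, λ = Lambda, α = alpha, m the sample count, and delta[l] holding the residual of layer l+1):

\delta^{(L)} = -(y - a^{(L)}) \circ f'(z^{(L)}), \qquad \delta^{(l)} = (W^{(l)})^{T}\,\delta^{(l+1)} \circ f'(z^{(l)})

W^{(l)} \leftarrow W^{(l)} - \alpha\Big(\lambda W^{(l)} + \tfrac{1}{m}\,\delta^{(l+1)}\,(a^{(l)})^{T}\Big), \qquad b^{(l)} \leftarrow b^{(l)} - \tfrac{\alpha}{m}\sum_{j=1}^{m}\delta^{(l+1)}_{\cdot j}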

The KNN Algorithm

# -*- coding:utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris

iris = load_iris()     # load the data
X = iris.data[:, :2]    # for ease of plotting, use only two of the features
y = iris.target
print iris.DESCR
print iris.feature_names
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

clf = KNeighborsClassifier(n_neighbors=15, weights='uniform')    # initialize the classifier
clf.fit(X, y)

# Plot the decision boundaries, one color per class
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                     np.arange(y_min, y_max, 0.02))

Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)    # plot the predicted regions

plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)    # overlay the training points
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("3-Class classification (k = 15, weights = 'uniform')")
plt.show()

Naive Bayes Classifier

from sklearn.naive_bayes import GaussianNB  # Gaussian naive Bayes

from sklearn.datasets import load_iris

from sklearn.model_selection import train_test_split

datas = load_iris()
# print(datas)

iris_x = datas.data
iris_y = datas.target

# print(iris_x)
# print(iris_y)

iris_x0 = iris_x[ :, 0:2]
# print(iris_x0)

X_train,X_test,y_train,y_test =train_test_split(iris_x0, iris_y, test_size=0.3)


clf = GaussianNB( )

'''
GaussianNB has a single parameter: the prior probabilities, priors.

MultinomialNB has three parameters: alpha, a smoothing constant usually set to 1; fit_prior, whether to learn the class priors; and class_prior, user-supplied prior probabilities.

BernoulliNB has four parameters: the first three are the same as MultinomialNB's; the fourth, binarize, is the threshold for binarizing the features.


The meaning of these parameters mainly follows https://www.cnblogs.com/pinard/p/6074222.html'''

clf.fit(X_train,y_train)

per = clf.predict(X_test)
print(per)
print(y_test)
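For reference, a minimal sketch of how the other two variants mentioned above are instantiated (the parameter values shown are just the common defaults):

from sklearn.naive_bayes import MultinomialNB, BernoulliNB

mnb = MultinomialNB(alpha=1.0, fit_prior=True)    # suited to count-style features
bnb = BernoulliNB(alpha=1.0, binarize=0.0)        # features are thresholded at binarize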

Cluster Analysis

K-means

# -*- coding:utf-8 -*-
# k-means experiment

import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

plt.figure(figsize=(12, 12))

# number of samples
n_samples = 1500
# random seed
random_state = 170
# generate the dataset
X, y = make_blobs(n_samples=n_samples, random_state=random_state)

# Effect of an incorrect number of clusters
y_pred = KMeans(n_clusters=2, random_state=random_state).fit_predict(X)

plt.subplot(221)
plt.scatter(X[y_pred==0][:, 0], X[y_pred==0][:, 1], marker='x',color='b')
plt.scatter(X[y_pred==1][:, 0], X[y_pred==1][:, 1], marker='+',color='r')
plt.title("Incorrect Number of Blobs")

# Effect of the correct number of clusters
y_pred = KMeans(n_clusters=3, random_state=random_state).fit_predict(X)

plt.subplot(222)
plt.scatter(X[y_pred==0][:, 0], X[y_pred==0][:, 1], marker='x',color='b')
plt.scatter(X[y_pred==1][:, 0], X[y_pred==1][:, 1], marker='+',color='r')
plt.scatter(X[y_pred==2][:, 0], X[y_pred==2][:, 1], marker='1',color='m')
plt.title("Correct Number of Blobs")

# Effect of clusters with unequal variance
X_varied, y_varied = make_blobs(n_samples=n_samples,
                                cluster_std=[1.0, 2.5, 0.5],
                                random_state=random_state)
y_pred = KMeans(n_clusters=3, random_state=random_state).fit_predict(X_varied)

plt.subplot(223)
plt.scatter(X_varied[y_pred==0][:, 0], X_varied[y_pred==0][:, 1], marker='x',color='b')
plt.scatter(X_varied[y_pred==1][:, 0], X_varied[y_pred==1][:, 1], marker='+',color='r')
plt.scatter(X_varied[y_pred==2][:, 0], X_varied[y_pred==2][:, 1], marker='1',color='m')
plt.title("Unequal Variance")

# Effect of clusters of very different sizes
X_filtered = np.vstack((X[y == 0][:500], X[y == 1][:100], X[y == 2][:10]))
y_pred = KMeans(n_clusters=3, random_state=random_state).fit_predict(X_filtered)

plt.subplot(224)
plt.scatter(X_filtered[y_pred==0][:, 0], X_filtered[y_pred==0][:, 1], marker='x',color='b')
plt.scatter(X_filtered[y_pred==1][:, 0], X_filtered[y_pred==1][:, 1], marker='+',color='r')
plt.scatter(X_filtered[y_pred==2][:, 0], X_filtered[y_pred==2][:, 1], marker='1',color='m')
plt.title("Unevenly Sized Blobs")

plt.show()
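When diagnosing cases like the four above, it can also help to inspect the fitted model itself; cluster_centers_ and inertia_ are standard KMeans attributes (a small addendum to the script above):

km = KMeans(n_clusters=3, random_state=random_state).fit(X)
print(km.cluster_centers_)    # coordinates of the three centroids
print(km.inertia_)            # within-cluster sum of squared distances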

Hierarchical Clustering

# -*- coding:utf-8 -*-
# Hierarchical (agglomerative) clustering experiment

import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import make_blobs

plt.figure(figsize=(12, 12))

# number of samples
n_samples = 1500
# random seed
random_state = 170
# generate the dataset
X, y = make_blobs(n_samples=n_samples, random_state=random_state)

# Effect of an incorrect number of clusters
y_pred = AgglomerativeClustering(affinity='euclidean',linkage='ward',n_clusters=2).fit_predict(X)
# Euclidean distance with Ward (minimum-variance) linkage

plt.subplot(221)
plt.scatter(X[y_pred==0][:, 0], X[y_pred==0][:, 1], marker='x',color='b')
plt.scatter(X[y_pred==1][:, 0], X[y_pred==1][:, 1], marker='+',color='r')
plt.title("Incorrect Number of Blobs")

# Effect of the correct number of clusters
y_pred = AgglomerativeClustering(affinity='euclidean',linkage='ward',n_clusters=3).fit_predict(X)

plt.subplot(222)
plt.scatter(X[y_pred==0][:, 0], X[y_pred==0][:, 1], marker='x',color='b')
plt.scatter(X[y_pred==1][:, 0], X[y_pred==1][:, 1], marker='+',color='r')
plt.scatter(X[y_pred==2][:, 0], X[y_pred==2][:, 1], marker='1',color='m')
plt.title("Correct Number of Blobs")

# Effect of clusters with unequal variance
X_varied, y_varied = make_blobs(n_samples=n_samples,
                                cluster_std=[1.0, 2.5, 0.5],
                                random_state=random_state)
y_pred = AgglomerativeClustering(affinity='euclidean',linkage='ward',n_clusters=3).fit_predict(X_varied)

plt.subplot(223)
plt.scatter(X_varied[y_pred==0][:, 0], X_varied[y_pred==0][:, 1], marker='x',color='b')
plt.scatter(X_varied[y_pred==1][:, 0], X_varied[y_pred==1][:, 1], marker='+',color='r')
plt.scatter(X_varied[y_pred==2][:, 0], X_varied[y_pred==2][:, 1], marker='1',color='m')
plt.title("Unequal Variance")

# Effect of clusters of very different sizes
X_filtered = np.vstack((X[y == 0][:500], X[y == 1][:100], X[y == 2][:10]))
y_pred = AgglomerativeClustering(affinity='euclidean',linkage='ward',n_clusters=3).fit_predict(X_filtered)

plt.subplot(224)
plt.scatter(X_filtered[y_pred==0][:, 0], X_filtered[y_pred==0][:, 1], marker='x',color='b')
plt.scatter(X_filtered[y_pred==1][:, 0], X_filtered[y_pred==1][:, 1], marker='+',color='r')
plt.scatter(X_filtered[y_pred==2][:, 0], X_filtered[y_pred==2][:, 1], marker='1',color='m')
plt.title("Unevenly Sized Blobs")

plt.show()

DBSCAN Clustering

# -*- coding:utf-8 -*-
# Density-based clustering model

import numpy as np

from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler


##############################################################################
# Generate data with make_blobs
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
                            random_state=0)
# Preprocess: standardize the data
X = StandardScaler().fit_transform(X)

##############################################################################
# Run DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
# mark the core samples; needed for the plot later
core_samples_mask[db.core_sample_indices_] = True
# cluster labels found by the algorithm; -1 marks noise points, other values are cluster ids
labels = db.labels_

# Number of clusters found
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

# Print information about the algorithm's performance
print('Estimated number of clusters: %d' % n_clusters_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels))

##############################################################################
# Plotting
import matplotlib.pyplot as plt

# black is reserved for noise points
unique_labels = set(labels)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))

i = -1
# marker styles; 'x' marks the noise points
marker = ['v','^','o','x']
for k, col in zip(unique_labels, colors):
    if k == -1:
        # draw noise points in black
        col = 'k'

    class_member_mask = (labels == k)

    i += 1
    if (i>=len(unique_labels)):
        i = 0

    # plot the core samples
    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], marker[i], markerfacecolor=col,
             markeredgecolor='k', markersize=14)
    # plot the non-core samples
    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], marker[i], markerfacecolor=col,
             markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

Association Analysis

Correlation Computation

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn import linear_model
 
 
df=pd.read_csv('result.csv')
sns.set(style='whitegrid', context='notebook')   # style sets the default look; context scales the default figure elements
cols = ['man', 'woman', 'money','workyears']
sns.pairplot(df[cols], size=2.5)
plt.tight_layout()
plt.show()
# Build the model
model =linear_model.LinearRegression()
# Train
model.fit(df[['man', 'woman','workyears']], df['money'])
print("coefficients: ", model.coef_)
w1 = model.coef_[0]
w2 = model.coef_[1]
w3 = model.coef_[2]
print("intercept: ", model.intercept_)
b = model.intercept_
x_test = [[1,0,6]]
predict = model.predict(x_test)
print("predict: ", predict)
 
 
 

The Apriori Algorithm

main

#-*- coding: utf-8 -*-
# Mine association rules from dish orders with the Apriori algorithm
from __future__ import print_function
import pandas as pd
from apriori import * # import the hand-written apriori functions below

inputfile = '../data/menu_orders.xls'
outputfile = '../tmp/apriori_rules.xls' # results file
data = pd.read_excel(inputfile, header = None)

print(u'\nConverting the raw data into a 0-1 matrix...')
ct = lambda x : pd.Series(1, index = x[pd.notnull(x)]) # helper for the 0-1 matrix conversion
b = map(ct, data.values) # apply it with map
data = pd.DataFrame(list(b)).fillna(0) # build the matrix, filling empty cells with 0
print(u'\nConversion done.')
del b # delete the intermediate variable b to save memory

support = 0.2 # minimum support
confidence = 0.5 # minimum confidence
ms = '---' # separator between items, e.g. A---B; the default is '--'. It must not occur anywhere in the raw table.

find_rule(data, support, confidence, ms).to_excel(outputfile) # save the results

apriori

#-*- coding: utf-8 -*-
from __future__ import print_function
import pandas as pd

# Custom join function, implementing the join from L_{k-1} to C_k
def connect_string(x, ms):
  x = list(map(lambda i:sorted(i.split(ms)), x))
  l = len(x[0])
  r = []
  for i in range(len(x)):
    for j in range(i,len(x)):
      if x[i][:l-1] == x[j][:l-1] and x[i][l-1] != x[j][l-1]:
        r.append(x[i][:l-1]+sorted([x[j][l-1],x[i][l-1]]))
  return r
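# For example, joining the frequent 1-itemsets ['A', 'B', 'C'] yields every
# candidate 2-itemset (a quick check of the join step; items chosen for illustration):
#   connect_string(['A', 'B', 'C'], '--')  ->  [['A', 'B'], ['A', 'C'], ['B', 'C']]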

# Function that searches for the association rules
def find_rule(d, support, confidence, ms = u'--'):
  result = pd.DataFrame(index=['support', 'confidence']) # holds the output

  support_series = 1.0*d.sum()/len(d) # support of each item
  column = list(support_series[support_series > support].index) # first filter by support
  k = 0
  
  while len(column) > 1:
    k = k+1
    print(u'\nSearch pass %s...' %k)
    column = connect_string(column, ms)
    print(u'Number of candidates: %s...' %len(column))
    sf = lambda i: d[i].prod(axis=1, numeric_only = True) # support computation for the new candidates

    # Build the joined data. This step is the most expensive in time and memory;
    # for larger datasets, consider parallelizing it.
    d_2 = pd.DataFrame(list(map(sf,column)), index = [ms.join(i) for i in column]).T

    support_series_2 = 1.0*d_2[[ms.join(i) for i in column]].sum()/len(d) # support after the join
    column = list(support_series_2[support_series_2 > support].index) # filter by support again
    support_series = support_series.append(support_series_2)
    column2 = []
    
    for i in column: # enumerate the possible rules: for {A,B,C}, is it A+B-->C, B+C-->A or C+A-->B?
      i = i.split(ms)
      for j in range(len(i)):
        column2.append(i[:j]+i[j+1:]+i[j:j+1])

    confidence_series = pd.Series(index=[ms.join(i) for i in column2]) # the confidence series

    for i in column2: # compute the confidences
      confidence_series[ms.join(i)] = support_series[ms.join(sorted(i))]/support_series[ms.join(i[:len(i)-1])]

    for i in confidence_series[confidence_series > confidence].index: # filter by confidence
      result[i] = 0.0
      result[i]['confidence'] = confidence_series[i]
      result[i]['support'] = support_series[ms.join(sorted(i.split(ms)))]
  
  result = result.T.sort_values(['confidence','support'], ascending = False) # tidy up and sort the results
  print(u'\nResults:')
  print(result)
  
  return result

Recommender Systems

The UBCF Algorithm

#-*- coding: utf-8 -*-
# Movie recommendation with user-based collaborative filtering (UBCF)
from __future__ import print_function
import pandas as pd

############    Main program   ##############
if __name__ == "__main__":
    print("\n-------------- Recommending movies with UBCF, running... -----------\n")
    traindata = pd.read_csv('/media/dp_zhou/Knowledge/Learning data/Python books/数据与代码/数据与代码/示例程序/data/u1.base',sep='\t', header=None,index_col=None)
    testdata = pd.read_csv('/media/dp_zhou/Knowledge/Learning data/Python books/数据与代码/数据与代码/示例程序/data/u1.test',sep='\t', header=None,index_col=None)
    # drop the timestamp column
    traindata.drop(3,axis=1, inplace=True)
    testdata.drop(3,axis=1, inplace=True)
    # rename the rows and columns
    traindata.rename(columns={0:'userid',1:'movid',2:'rat'}, inplace=True)
    testdata.rename(columns={0:'userid',1:'movid',2:'rat'}, inplace=True)
    traindf=traindata.pivot(index='userid', columns='movid', values='rat')
    testdf=testdata.pivot(index='userid', columns='movid', values='rat')
    traindf.rename(index={i:'usr%d'%(i) for i in traindf.index} , inplace=True)
    traindf.rename(columns={i:'mov%d'%(i) for i in traindf.columns} , inplace=True)
    testdf.rename(index={i:'usr%d'%(i) for i in testdf.index} , inplace=True)
    testdf.rename(columns={i:'mov%d'%(i) for i in testdf.columns} , inplace=True)
    userdf=traindf.loc[testdf.index]
    # get the predicted ratings and the recommendation lists
    trainrats,trainrecomm=recomm(traindf,userdf)
#-*- coding: utf-8 -*-
# Companion module: the prediction and recommendation functions used above
import numpy as np
import pandas as pd
import math
def prediction(df,userdf,Nn=15):  # Nn is the number of neighbors
    corr=df.T.corr()
    rats=userdf.copy()
    for usrid in userdf.index:
        dfnull=df.loc[usrid][df.loc[usrid].isnull()]
        usrv=df.loc[usrid].mean()  # the user's mean rating
        for i in range(len(dfnull)):
            nft=(df[dfnull.index[i]]).notnull()
            # build the list of neighbors
            if(Nn<=len(nft)):
                nlist=df[dfnull.index[i]][nft][:Nn]
            else:
                nlist=df[dfnull.index[i]][nft][:len(nft)]
            nlist=nlist[corr.loc[usrid,nlist.index].notnull()]
            nratsum=0
            corsum=0
            if(0!=nlist.size):
                nv=df.loc[nlist.index,:].T.mean()  # the neighbors' mean ratings
                for index in nlist.index:
                    ncor=corr.loc[usrid,index]
                    nratsum+=ncor*(df[dfnull.index[i]][index]-nv[index])
                    corsum+=abs(ncor)
                if(corsum!=0):
                    rats.at[usrid,dfnull.index[i]]= usrv + nratsum/corsum
                else:
                    rats.at[usrid,dfnull.index[i]]= usrv
            else:
                rats.at[usrid,dfnull.index[i]]= None
    return rats
def recomm(df,userdf,Nn=15,TopN=3):
    ratings=prediction(df,userdf,Nn)  # get the predicted ratings
    recomm=[]  # holds the recommendation lists
    for usrid in userdf.index:
        # find the unrated items via their NA values
        ratft=userdf.loc[usrid].isnull()
        ratnull=ratings.loc[usrid][ratft]
        # sort the predicted ratings
        if(len(ratnull)>=TopN):
            sortlist=(ratnull.sort_values(ascending=False)).index[:TopN]
        else:
            sortlist=ratnull.sort_values(ascending=False).index[:len(ratnull)]
        recomm.append(sortlist)
    return ratings,recomm


Collaborative Filtering

#_*_coding:utf-8_*_

import pandas as pd
import numpy as np

header = ['user_id', 'item_id', 'rating', 'timestamp']
dataset = pd.read_csv('../data/u.data',sep='\t',names=header)

# Count the unique users and movies
# unique() deduplicates a 1-D array; shape[0] is the length of the result
users = dataset.user_id.unique().shape[0]
items = dataset.item_id.unique().shape[0]
from sklearn.model_selection import train_test_split
train_data,test_data = train_test_split(dataset,test_size=0.25)

'''
Build the user-item matrix.
itertuples is pandas' way of iterating over rows together with their index;
each row looks like: Pandas(Index=77054, user_id=650, item_id=528, rating=3, timestamp=891370998)
'''
train_data_matrix = np.zeros((users,items))
for line in train_data.itertuples():
    train_data_matrix[line[1] - 1, line[2] - 1] = line[3]

test_data_matrix = np.zeros((users,items))
for line in test_data.itertuples():
    test_data_matrix[line[1] - 1, line[2] - 1] = line[3]
# Compute similarities
from sklearn.metrics.pairwise import pairwise_distances
# the similarity plays the role of the weight w (note that pairwise_distances returns cosine distances; strictly, similarity = 1 - distance)
user_similarity = pairwise_distances(train_data_matrix,metric='cosine')
# train_data_matrix.T transposes the matrix
items_similarity = pairwise_distances(train_data_matrix.T,metric='cosine')

'''
User-based recommendation from the user similarity matrix.
mean(axis=1) takes the mean of each row, returning an m*1 result.
np.newaxis adds an axis, turning the 1-D mean vector into a column, so that
subtracting mean_user_rating from train_data_matrix removes each user's mean
from their row; this normalizes the ratings to a common range.
numpy reference:
      a.dot(b)  -> matrix product of a and b
      np.abs(a) -> element-wise absolute value of a
      np.sum()  -> no arguments: sum of all elements
                -> axis=0: column sums
                -> axis=1: row sums
      b / a     -> element-wise division
'''
mean_user_rating = train_data_matrix.mean(axis = 1) # mean rating of each user (row means)
rating_diff = train_data_matrix - mean_user_rating[:,np.newaxis]  # normalize the ratings
pred = mean_user_rating[:, np.newaxis] \
       + user_similarity.dot(rating_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T  # weights w applied to the normalized ratings

'''
Evaluation metric: root mean squared error (RMSE)
'''
from sklearn.metrics import mean_squared_error
from math import sqrt

pred = pred[test_data_matrix.nonzero()].flatten()
test_data_matrix = test_data_matrix[test_data_matrix.nonzero()].flatten()
result = sqrt(mean_squared_error(pred,test_data_matrix))
print(result)
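The pred matrix computed above is the standard user-based prediction formula, with the cosine weights playing the role of sim(u, v) and \bar{r}_u denoting user u's mean rating:

\hat{r}_{u,i} = \bar{r}_u + \frac{\sum_{v}\mathrm{sim}(u,v)\,(r_{v,i} - \bar{r}_v)}{\sum_{v}\lvert \mathrm{sim}(u,v)\rvert}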

Time Series Analysis

The ARIMA Model

#-*- coding: utf-8 -*-
# ARIMA time series model
from __future__ import print_function
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.graphics.api import qqplot
from statsmodels.graphics.tsaplots import plot_acf

# Initialize parameters
discfile = '../data/arima_data.xls'

# Read the data with the date column as the index; pandas automatically parses the date column as Datetime
data = pd.read_excel(discfile,index_col=0)
print(data.head())
print('\n Data Types:')
print(data.dtypes)


# Time series plot
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei'] # so that Chinese labels display correctly
plt.rcParams['axes.unicode_minus'] = False # so that the minus sign displays correctly
data.plot()
plt.show()


# Autocorrelation plot
from statsmodels.graphics.tsaplots import plot_acf
plot_acf(data).show()


# Stationarity test
from statsmodels.tsa.stattools import adfuller as ADF
print(u'ADF test on the original series:', ADF(data[u'销量']))
# the return values are adf, pvalue, usedlag, nobs, critical values, icbest, regresults and resstore


# Time series plot of the differenced series
D_data = data.diff().dropna()
D_data.columns = [u'销量差分']
D_data.plot() # time series plot
plt.show()


# Autocorrelation plot of the differenced series
plot_acf(D_data).show()


# Partial autocorrelation plot
from statsmodels.graphics.tsaplots import plot_pacf
plot_pacf(D_data).show() 


# Stationarity test
print(u'ADF test on the differenced series:', ADF(D_data[u'销量差分']))


# White-noise test
from statsmodels.stats.diagnostic import acorr_ljungbox
print(u'White-noise (Ljung-Box) test on the differenced series:', acorr_ljungbox(D_data, lags=1))
# returns the test statistic and the p-value


# First-order difference
fig = plt.figure(figsize=(12,8))
ax1= fig.add_subplot(111)
diff1 = data.diff(1)
diff1.plot(ax=ax1)


# Second-order difference
fig = plt.figure(figsize=(12,8))
ax2= fig.add_subplot(111)
diff2 = data.diff(2)
diff2.plot(ax=ax2)


# Choose suitable p and q
dta = data.diff(1)[1:]
fig = plt.figure(figsize=(12,8))
ax1=fig.add_subplot(211)
fig1 = sm.graphics.tsa.plot_acf(dta[u'销量'],lags=10,ax=ax1)
ax2 = fig.add_subplot(212)
fig2 = sm.graphics.tsa.plot_pacf(dta[u'销量'],lags=10,ax=ax2)


# Fit candidate models and compare their information criteria
arma_mod20 = sm.tsa.ARMA(dta,(2,0)).fit()
print(arma_mod20.aic,arma_mod20.bic,arma_mod20.hqic)
arma_mod01 = sm.tsa.ARMA(dta,(0,1)).fit()
print(arma_mod01.aic,arma_mod01.bic,arma_mod01.hqic)
arma_mod10 = sm.tsa.ARMA(dta,(1,0)).fit()
print(arma_mod10.aic,arma_mod10.bic,arma_mod10.hqic)


# Q-Q plot of the residuals
resid = arma_mod01.resid
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
fig = qqplot(resid, line='q', ax=ax, fit=True)


# Autocorrelation check of the residuals
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(arma_mod01.resid.values.squeeze(), lags=10, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(arma_mod01.resid, lags=10, ax=ax2)


# Durbin-Watson test
print(sm.stats.durbin_watson(arma_mod01.resid.values))


# Ljung-Box test
import numpy as np
r,q,p = sm.tsa.acf(resid.values.squeeze(), qstat=True)
datap = np.c_[range(1,36), r[1:], q, p]
table = pd.DataFrame(datap, columns=['lag', "AC", "Q", "Prob(>Q)"])
print(table.set_index('lag'))


# Forecast
predict_sunspots = arma_mod01.predict('2015-2-07', '2015-2-15', dynamic=True)
fig, ax = plt.subplots(figsize=(12, 8))
print(predict_sunspots)
predict_sunspots[0] += data['2015-02-06':][u'销量']
data=pd.DataFrame(data)
for i in range(len(predict_sunspots)-1):
    predict_sunspots[i+1]=predict_sunspots[i]+predict_sunspots[i+1]
print(predict_sunspots)
ax = data.loc['2015':].plot(ax=ax)
predict_sunspots.plot(ax=ax)
plt.show()
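Note that fitting an ARMA(p, q) model to the first-differenced series, as done above, is equivalent to fitting an ARIMA(p, 1, q) model to the original series; the cumulative-sum loop before plotting simply undoes the differencing:

\hat{y}_{t} = \hat{y}_{t-1} + \widehat{\Delta y}_{t}, \qquad \Delta y_{t} = y_{t} - y_{t-1}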
