The previous article covered the models most commonly used in machine learning: linear regression, logistic regression, decision trees, bagging, random forest, XGBoost, AdaBoost, SVM, k-means, density-based clustering, spectral clustering, and so on. These are the workhorse algorithms in many competitions. A few days ago I also read an article on why XGBoost performs so well in competitions (it is both fast and accurate) and on how it compares with deep learning; when I find the time I plan to write a separate post on using XGBoost.
Starting with this article I will cover somewhat harder algorithms, including EM, Bayesian networks, HMM, and LDA. I remember teacher Zou Bo saying in class that the EM algorithm is where the harder, deeper part of machine learning begins.
In short, the EM algorithm alternates between two steps, E and M, and its derivation hinges on Jensen's inequality.
E-step: using the initial parameter values (or the parameters from the previous iteration), compute the posterior probability of the latent variables, which amounts to taking their expectation.
M-step: maximize the resulting likelihood to obtain new parameter values.
The two steps are repeated until the likelihood-maximizing parameters are found; the Gaussian mixture model is the classic application, implemented below.
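To make the two steps concrete, these are the standard updates for a two-component Gaussian mixture; they are exactly what the hand-written branch of the code below computes (γ is the variable gamma, π is pi, and μ, Σ are mu1/mu2, sigma1/sigma2). E-step, the responsibility of component 1 for sample x_i:

$$\gamma_i = \frac{\pi\,\mathcal{N}(x_i \mid \mu_1, \Sigma_1)}{\pi\,\mathcal{N}(x_i \mid \mu_1, \Sigma_1) + (1-\pi)\,\mathcal{N}(x_i \mid \mu_2, \Sigma_2)}$$

M-step, weighted re-estimation of the parameters:

$$\mu_1 = \frac{\sum_i \gamma_i x_i}{\sum_i \gamma_i}, \qquad \Sigma_1 = \frac{\sum_i \gamma_i (x_i - \mu_1)(x_i - \mu_1)^T}{\sum_i \gamma_i}, \qquad \pi = \frac{1}{n}\sum_i \gamma_i$$

The updates for μ2 and Σ2 are identical, with weights 1 − γ_i.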
# !/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
from scipy.stats import multivariate_normal
from sklearn.mixture import GaussianMixture
from mpl_toolkits.mplot3d import Axes3D
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import pairwise_distances_argmin
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False
if __name__ == '__main__':
style = 'sklearn'
np.random.seed(0)
mu1_fact = (0, 0, 0)
cov_fact = np.identity(3)
data1 = np.random.multivariate_normal(mu1_fact, cov_fact, 400)
mu2_fact = (2, 2, 1)
cov_fact = np.identity(3)
data2 = np.random.multivariate_normal(mu2_fact, cov_fact, 100)
data = np.vstack((data1, data2))
y = np.array([True] * 400 + [False] * 100)
if style == 'sklearn':
g = GaussianMixture(n_components=2, covariance_type='full', tol=1e-6, max_iter=1000)
g.fit(data)
print '类别概率:\t', g.weights_[0]
print '均值:\n', g.means_, '\n'
print '方差:\n', g.covariances_, '\n'
mu1, mu2 = g.means_
sigma1, sigma2 = g.covariances_
else:
num_iter = 100
n, d = data.shape
        # random initialization of the means (alternative, commented out)
# mu1 = np.random.standard_normal(d)
# print mu1
# mu2 = np.random.standard_normal(d)
# print mu2
mu1 = data.min(axis=0)
mu2 = data.max(axis=0)
sigma1 = np.identity(d)
sigma2 = np.identity(d)
pi = 0.5
# EM
for i in range(num_iter):
# E Step
norm1 = multivariate_normal(mu1, sigma1)
norm2 = multivariate_normal(mu2, sigma2)
tau1 = pi * norm1.pdf(data)
tau2 = (1 - pi) * norm2.pdf(data)
gamma = tau1 / (tau1 + tau2)
# M Step
mu1 = np.dot(gamma, data) / np.sum(gamma)
mu2 = np.dot((1 - gamma), data) / np.sum((1 - gamma))
sigma1 = np.dot(gamma * (data - mu1).T, data - mu1) / np.sum(gamma)
sigma2 = np.dot((1 - gamma) * (data - mu2).T, data - mu2) / np.sum(1 - gamma)
pi = np.sum(gamma) / n
print i, ":\t", mu1, mu2
print '类别概率:\t', pi
print '均值:\t', mu1, mu2
print '方差:\n', sigma1, '\n\n', sigma2, '\n'
    # predict the cluster of each sample
norm1 = multivariate_normal(mu1, sigma1)
norm2 = multivariate_normal(mu2, sigma2)
tau1 = norm1.pdf(data)
tau2 = norm2.pdf(data)
fig = plt.figure(figsize=(13, 7), facecolor='w')
ax = fig.add_subplot(121, projection='3d')
ax.scatter(data[:, 0], data[:, 1], data[:, 2], c='b', s=30, marker='o', depthshade=True)
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z')
ax.set_title(u'原始数据', fontsize=18)
ax = fig.add_subplot(122, projection='3d')
order = pairwise_distances_argmin([mu1_fact, mu2_fact], [mu1, mu2], metric='euclidean')
if order[0] == 0:
c1 = tau1 > tau2
else:
c1 = tau1 < tau2
c2 = ~c1
acc = np.mean(y == c1)
print u'准确率:%.2f%%' % (100*acc)
ax.scatter(data[c1, 0], data[c1, 1], data[c1, 2], c='r', s=30, marker='o', depthshade=True)
ax.scatter(data[c2, 0], data[c2, 1], data[c2, 2], c='g', s=30, marker='^', depthshade=True)
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z')
ax.set_title(u'EM算法分类', fontsize=18)
# plt.suptitle(u'EM算法的实现', fontsize=20)
# plt.subplots_adjust(top=0.92)
plt.tight_layout()
plt.show()
Gaussian mixture model (GMM)
# !/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
import matplotlib as mpl
import matplotlib.colors
import matplotlib.pyplot as plt
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False
# from matplotlib.font_manager import FontProperties
# font_set = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=15)
# fontproperties=font_set
def expand(a, b):
d = (b - a) * 0.05
return a-d, b+d
if __name__ == '__main__':
data = np.loadtxt('18.HeightWeight.csv', dtype=np.float, delimiter=',', skiprows=1)
print data.shape
y, x = np.split(data, [1, ], axis=1)
x, x_test, y, y_test = train_test_split(x, y, train_size=0.6, random_state=0)
gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=0)
x_min = np.min(x, axis=0)
x_max = np.max(x, axis=0)
gmm.fit(x)
print '均值 = \n', gmm.means_
print '方差 = \n', gmm.covariances_
y_hat = gmm.predict(x)
y_test_hat = gmm.predict(x_test)
change = (gmm.means_[0][0] > gmm.means_[1][0])
if change:
z = y_hat == 0
y_hat[z] = 1
y_hat[~z] = 0
z = y_test_hat == 0
y_test_hat[z] = 1
y_test_hat[~z] = 0
acc = np.mean(y_hat.ravel() == y.ravel())
acc_test = np.mean(y_test_hat.ravel() == y_test.ravel())
acc_str = u'训练集准确率:%.2f%%' % (acc * 100)
acc_test_str = u'测试集准确率:%.2f%%' % (acc_test * 100)
print acc_str
print acc_test_str
cm_light = mpl.colors.ListedColormap(['#FF8080', '#77E0A0'])
cm_dark = mpl.colors.ListedColormap(['r', 'g'])
x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
x1_min, x1_max = expand(x1_min, x1_max)
x2_min, x2_max = expand(x2_min, x2_max)
x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j]
grid_test = np.stack((x1.flat, x2.flat), axis=1)
grid_hat = gmm.predict(grid_test)
grid_hat = grid_hat.reshape(x1.shape)
if change:
z = grid_hat == 0
grid_hat[z] = 1
grid_hat[~z] = 0
plt.figure(figsize=(9, 7), facecolor='w')
plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light)
plt.scatter(x[:, 0], x[:, 1], s=50, c=y, marker='o', cmap=cm_dark, edgecolors='k')
plt.scatter(x_test[:, 0], x_test[:, 1], s=60, c=y_test, marker='^', cmap=cm_dark, edgecolors='k')
p = gmm.predict_proba(grid_test)
p = p[:, 0].reshape(x1.shape)
CS = plt.contour(x1, x2, p, levels=(0.2, 0.5, 0.8), colors=list('rgb'), linewidths=2)
plt.clabel(CS, fontsize=15, fmt='%.1f', inline=True)
ax1_min, ax1_max, ax2_min, ax2_max = plt.axis()
xx = 0.9*ax1_min + 0.1*ax1_max
yy = 0.1*ax2_min + 0.9*ax2_max
plt.text(xx, yy, acc_str, fontsize=18)
yy = 0.15*ax2_min + 0.85*ax2_max
plt.text(xx, yy, acc_test_str, fontsize=18)
plt.xlim((x1_min, x1_max))
plt.ylim((x2_min, x2_max))
plt.xlabel(u'身高(cm)', fontsize='large')
plt.ylabel(u'体重(kg)', fontsize='large')
plt.title(u'EM算法估算GMM的参数', fontsize=20)
plt.grid()
plt.show()
DPGMM (Dirichlet-process Gaussian mixture)
# !/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
import scipy as sp
import matplotlib as mpl
import matplotlib.colors
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
def expand(a, b, rate=0.05):
d = (b - a) * rate
return a-d, b+d
matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
matplotlib.rcParams['axes.unicode_minus'] = False
if __name__ == '__main__':
np.random.seed(0)
cov1 = np.diag((1, 2))
N1 = 500
N2 = 300
N = N1 + N2
x1 = np.random.multivariate_normal(mean=(3, 2), cov=cov1, size=N1)
m = np.array(((1, 1), (1, 3)))
x1 = x1.dot(m)
x2 = np.random.multivariate_normal(mean=(-1, 10), cov=cov1, size=N2)
x = np.vstack((x1, x2))
y = np.array([0]*N1 + [1]*N2)
n_components = 3
    # colors used for plotting
colors = '#A0FFA0', '#2090E0', '#FF8080'
cm = mpl.colors.ListedColormap(colors)
x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
x1_min, x1_max = expand(x1_min, x1_max)
x2_min, x2_max = expand(x2_min, x2_max)
x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j]
grid_test = np.stack((x1.flat, x2.flat), axis=1)
plt.figure(figsize=(9, 9), facecolor='w')
plt.suptitle(u'GMM/DPGMM比较', fontsize=23)
ax = plt.subplot(211)
gmm = GaussianMixture(n_components=n_components, covariance_type='full', random_state=0)
gmm.fit(x)
centers = gmm.means_
covs = gmm.covariances_
print 'GMM均值 = \n', centers
print 'GMM方差 = \n', covs
y_hat = gmm.predict(x)
grid_hat = gmm.predict(grid_test)
grid_hat = grid_hat.reshape(x1.shape)
plt.pcolormesh(x1, x2, grid_hat, cmap=cm)
plt.scatter(x[:, 0], x[:, 1], s=30, c=y, cmap=cm, marker='o')
clrs = list('rgbmy')
for i, cc in enumerate(zip(centers, covs)):
center, cov = cc
value, vector = sp.linalg.eigh(cov)
width, height = value[0], value[1]
v = vector[0] / sp.linalg.norm(vector[0])
angle = 180* np.arctan(v[1] / v[0]) / np.pi
e = Ellipse(xy=center, width=width, height=height,
angle=angle, color=clrs[i], alpha=0.5, clip_box = ax.bbox)
ax.add_artist(e)
ax1_min, ax1_max, ax2_min, ax2_max = plt.axis()
plt.xlim((x1_min, x1_max))
plt.ylim((x2_min, x2_max))
plt.title(u'GMM', fontsize=20)
plt.grid(True)
# DPGMM
dpgmm = BayesianGaussianMixture(n_components=n_components, covariance_type='full', max_iter=1000, n_init=5,
weight_concentration_prior_type='dirichlet_process', weight_concentration_prior=10)
dpgmm.fit(x)
centers = dpgmm.means_
covs = dpgmm.covariances_
print 'DPGMM均值 = \n', centers
print 'DPGMM方差 = \n', covs
y_hat = dpgmm.predict(x)
# print y_hat
ax = plt.subplot(212)
grid_hat = dpgmm.predict(grid_test)
grid_hat = grid_hat.reshape(x1.shape)
plt.pcolormesh(x1, x2, grid_hat, cmap=cm)
plt.scatter(x[:, 0], x[:, 1], s=30, c=y, cmap=cm, marker='o')
for i, cc in enumerate(zip(centers, covs)):
if i not in y_hat:
continue
center, cov = cc
value, vector = sp.linalg.eigh(cov)
width, height = value[0], value[1]
v = vector[0] / sp.linalg.norm(vector[0])
angle = 180* np.arctan(v[1] / v[0]) / np.pi
e = Ellipse(xy=center, width=width, height=height,
angle=angle, color='m', alpha=0.5, clip_box = ax.bbox)
ax.add_artist(e)
plt.xlim((x1_min, x1_max))
plt.ylim((x2_min, x2_max))
plt.title('DPGMM', fontsize=20)
plt.grid(True)
plt.tight_layout()
plt.subplots_adjust(top=0.9)
plt.show()
Bayesian networks extend the basic Bayesian approach by encoding the conditional dependencies between variables in a directed graph; their simplest special case is the naive Bayes classifier, which is what the example below uses (GaussianNB on the iris data).
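As a reminder of the principle at work, under the naive Bayes assumption the features are conditionally independent given the class, so

$$p(y \mid x_1, \dots, x_d) \propto p(y)\prod_{j=1}^{d} p(x_j \mid y)$$

and GaussianNB models each p(x_j | y) as a one-dimensional Gaussian whose mean and variance are estimated per class from the training data.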
#!/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
def iris_type(s):
it = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
return it[s]
if __name__ == "__main__":
data = np.loadtxt('..\8.Regression\8.iris.data', dtype=float, delimiter=',', converters={4: iris_type})
print data
x, y = np.split(data, (4,), axis=1)
x = x[:, :2]
print x
print y
gnb = Pipeline([
('sc', StandardScaler()),
('clf', GaussianNB())])
gnb.fit(x, y.ravel())
# gnb = MultinomialNB().fit(x, y.ravel())
# gnb = KNeighborsClassifier(n_neighbors=5).fit(x, y.ravel())
    # plotting
    N, M = 500, 500     # number of sample points along each axis
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()   # range of column 0
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()   # range of column 1
    t1 = np.linspace(x1_min, x1_max, N)
    t2 = np.linspace(x2_min, x2_max, M)
    x1, x2 = np.meshgrid(t1, t2)                    # grid of sample points
    x_test = np.stack((x1.flat, x2.flat), axis=1)   # grid points to classify
    # meaningless values; only needed to fill in the other two feature dimensions
    # x3 = np.ones(x1.size) * np.average(x[:, 2])
    # x4 = np.ones(x1.size) * np.average(x[:, 3])
    # x_test = np.stack((x1.flat, x2.flat, x3, x4), axis=1)  # grid points to classify
mpl.rcParams['font.sans-serif'] = [u'simHei']
mpl.rcParams['axes.unicode_minus'] = False
cm_light = mpl.colors.ListedColormap(['#77E0A0', '#FF8080', '#A0A0FF'])
cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
    y_hat = gnb.predict(x_test)                     # predictions on the grid
    y_hat = y_hat.reshape(x1.shape)                 # reshape to the grid's shape
    plt.figure(facecolor='w')
    plt.pcolormesh(x1, x2, y_hat, cmap=cm_light)    # show the decision regions
    plt.scatter(x[:, 0], x[:, 1], c=y, edgecolors='k', s=50, cmap=cm_dark)  # show the samples
plt.xlabel(u'花萼长度', fontsize=14)
plt.ylabel(u'花萼宽度', fontsize=14)
plt.xlim(x1_min, x1_max)
plt.ylim(x2_min, x2_max)
plt.title(u'GaussianNB对鸢尾花数据的分类结果', fontsize=18)
plt.grid(True)
plt.show()
    # predictions on the training set
y_hat = gnb.predict(x)
y = y.reshape(-1)
result = y_hat == y
print y_hat
print result
acc = np.mean(result)
print '准确度: %.2f%%' % (100 * acc)
Topic models look at documents and topics through the lens of a generative model. By a generative model we mean the assumption that every word in a document is produced by the following process: choose a topic with some probability, then choose a word from that topic with some probability.
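A minimal toy sketch of that generative process; the vocabulary and the probability tables theta and phi below are invented purely for illustration, and recovering such tables from a corpus is exactly what LDA does:
import numpy as np
rng = np.random.RandomState(0)
vocab = ['graph', 'tree', 'user', 'system', 'time']
theta = np.array([0.7, 0.3])                      # P(topic | this document)
phi = np.array([[0.40, 0.40, 0.10, 0.05, 0.05],   # P(word | topic 0)
                [0.05, 0.05, 0.40, 0.30, 0.20]])  # P(word | topic 1)
doc = []
for _ in range(10):
    z = rng.choice(2, p=theta)      # pick a topic with some probability
    w = rng.choice(5, p=phi[z])     # pick a word from that topic with some probability
    doc.append(vocab[w])
print(' '.join(doc))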
# !/usr/bin/python
# -*- coding:utf-8 -*-
from gensim import corpora, models, similarities
from pprint import pprint
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
if __name__ == '__main__':
f = open('22.LDA_test.txt')
stop_list = set('for a of the and to in'.split())
# texts = [line.strip().split() for line in f]
# print(texts)
texts = [[word for word in line.strip().lower().split() if word not in stop_list] for line in f]
print 'Text = '
pprint(texts)
dictionary = corpora.Dictionary(texts)
V = len(dictionary)
corpus = [dictionary.doc2bow(text) for text in texts]
corpus_tfidf = models.TfidfModel(corpus)[corpus]
print 'TF-IDF:'
for c in corpus_tfidf:
print c
print '\nLSI Model:'
lsi = models.LsiModel(corpus_tfidf, num_topics=2, id2word=dictionary)
topic_result = [a for a in lsi[corpus_tfidf]]
pprint(topic_result)
print 'LSI Topics:'
pprint(lsi.print_topics(num_topics=2, num_words=5))
similarity = similarities.MatrixSimilarity(lsi[corpus_tfidf]) # similarities.Similarity()
print 'Similarity:'
pprint(list(similarity))
print '\nLDA Model:'
num_topics = 2
lda = models.LdaModel(corpus_tfidf, num_topics=num_topics, id2word=dictionary,
alpha='auto', eta='auto', minimum_probability=0.001)
doc_topic = [doc_t for doc_t in lda[corpus_tfidf]]
print 'Document-Topic:\n'
pprint(doc_topic)
for doc_topic in lda.get_document_topics(corpus_tfidf):
print doc_topic
for topic_id in range(num_topics):
print 'Topic', topic_id
# pprint(lda.get_topic_terms(topicid=topic_id))
pprint(lda.show_topic(topic_id))
similarity = similarities.MatrixSimilarity(lda[corpus_tfidf])
print 'Similarity:'
pprint(list(similarity))
hda = models.HdpModel(corpus_tfidf, id2word=dictionary)
topic_result = [a for a in hda[corpus_tfidf]]
print '\n\nUSE WITH CARE--\nHDA Model:'
pprint(topic_result)
print 'HDA Topics:'
print hda.print_topics(num_topics=2, num_words=5)
Using the lda package
# !/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import lda
import lda.datasets
from pprint import pprint
if __name__ == "__main__":
# document-term matrix
X = lda.datasets.load_reuters()
print("type(X): {}".format(type(X)))
print("shape: {}\n".format(X.shape))
print(X[:10, :10])
# the vocab
vocab = lda.datasets.load_reuters_vocab()
print("type(vocab): {}".format(type(vocab)))
print("len(vocab): {}\n".format(len(vocab)))
print(vocab[:10])
# titles for each story
titles = lda.datasets.load_reuters_titles()
print("type(titles): {}".format(type(titles)))
print("len(titles): {}\n".format(len(titles)))
pprint(titles[:10])
print 'LDA start ----'
topic_num = 20
model = lda.LDA(n_topics=topic_num, n_iter=500, random_state=1)
model.fit(X)
# topic-word
topic_word = model.topic_word_
print("type(topic_word): {}".format(type(topic_word)))
print("shape: {}".format(topic_word.shape))
print(vocab[:5])
print(topic_word[:, :5])
# Print Topic distribution
n = 7
for i, topic_dist in enumerate(topic_word):
topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n + 1):-1]
print('*Topic {}\n- {}'.format(i, ' '.join(topic_words)))
# Document - topic
doc_topic = model.doc_topic_
print("type(doc_topic): {}".format(type(doc_topic)))
print("shape: {}".format(doc_topic.shape))
for i in range(10):
topic_most_pr = doc_topic[i].argmax()
print(u"文档: {} 主题: {} value: {}".format(i, topic_most_pr, doc_topic[i][topic_most_pr]))
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False
# Topic - word
plt.figure(figsize=(8, 9))
# f, ax = plt.subplots(5, 1, sharex=True)
for i, k in enumerate([0, 5, 9, 14, 19]):
ax = plt.subplot(5, 1, i+1)
ax.plot(topic_word[k, :], 'r-')
ax.set_xlim(-50, 4350) # [0,4258]
ax.set_ylim(0, 0.08)
ax.set_ylabel(u"概率")
ax.set_title(u"主题 {}".format(k))
plt.xlabel(u"词", fontsize=14)
plt.tight_layout()
plt.suptitle(u'主题的词分布', fontsize=18)
plt.subplots_adjust(top=0.9)
plt.show()
# Document - Topic
plt.figure(figsize=(8, 9))
# f, ax= plt.subplots(5, 1, figsize=(8, 6), sharex=True)
for i, k in enumerate([1, 3, 4, 8, 9]):
ax = plt.subplot(5, 1, i+1)
ax.stem(doc_topic[k, :], linefmt='g-', markerfmt='ro')
ax.set_xlim(-1, topic_num+1)
ax.set_ylim(0, 1)
ax.set_ylabel(u"概率")
ax.set_title(u"文档 {}".format(k))
plt.xlabel(u"主题", fontsize=14)
plt.suptitle(u'文档的主题分布', fontsize=18)
plt.tight_layout()
plt.subplots_adjust(top=0.9)
plt.show()
The hidden Markov model (HMM) can be applied to sequence-labeling problems and has proven effective in speech recognition, NLP, bioinformatics, pattern recognition, and other fields.
An HMM is a probabilistic model of sequences: a hidden Markov chain generates an unobservable random sequence of states, and each state in turn generates an observation, yielding the observed random sequence.
The randomly generated sequence of states is called the state sequence; each state emits one observation, and the resulting sequence is called the observation sequence.
An HMM is specified by three pieces of information: the initial state distribution π, the state-transition distribution A, and the observation (emission) distribution B.
The code below learns these parameters with the Baum-Welch (EM) algorithm, using four hidden states B/M/E/S that mark a character's position in a word (begin/middle/end/single), and then applies Viterbi decoding to segment Chinese text.
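Before the full implementation, here is a minimal toy example in which every probability is made up; it shows how π, A and B combine in the forward algorithm to give the probability of an observation sequence, the quantity that calc_alpha below accumulates in log space.
import numpy as np
pi = np.array([0.6, 0.4])                 # initial state distribution
A = np.array([[0.7, 0.3],                 # A[i][j] = P(next state j | current state i)
              [0.4, 0.6]])
B = np.array([[0.9, 0.1],                 # B[i][k] = P(observation k | state i)
              [0.2, 0.8]])
O = [0, 1, 0]                             # an observation sequence
alpha = pi * B[:, O[0]]                   # alpha[i] = P(o1, q1 = i)
for o in O[1:]:
    alpha = alpha.dot(A) * B[:, o]        # sum over previous states, then emit o
print('P(O | model) = %f' % alpha.sum())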
# !/usr/bin/python
# -*- coding:utf-8 -*-
import math
import matplotlib.pyplot as plt
import numpy as np
import codecs
import random
infinite = -(2**31)
def log_normalize(a):
s = 0
for x in a:
s += x
s = math.log(s)
for i in range(len(a)):
if a[i] == 0:
a[i] = infinite
else:
a[i] = math.log(a[i]) - s
def log_sum(a):
    if not a:   # a is empty
return infinite
m = max(a)
s = 0
for t in a:
s += math.exp(t-m)
return m + math.log(s)
def calc_alpha(pi, A, B, o, alpha):
for i in range(4):
alpha[0][i] = pi[i] + B[i][ord(o[0])]
T = len(o)
temp = [0 for i in range(4)]
del i
for t in range(1, T):
for i in range(4):
for j in range(4):
temp[j] = (alpha[t-1][j] + A[j][i])
alpha[t][i] = log_sum(temp)
alpha[t][i] += B[i][ord(o[t])]
def calc_beta(pi, A, B, o, beta):
T = len(o)
for i in range(4):
beta[T-1][i] = 1
temp = [0 for i in range(4)]
del i
for t in range(T-2, -1, -1):
for i in range(4):
beta[t][i] = 0
for j in range(4):
temp[j] = A[i][j] + B[j][ord(o[t+1])] + beta[t+1][j]
beta[t][i] += log_sum(temp)
def calc_gamma(alpha, beta, gamma):
for t in range(len(alpha)):
for i in range(4):
gamma[t][i] = alpha[t][i] + beta[t][i]
s = log_sum(gamma[t])
for i in range(4):
gamma[t][i] -= s
def calc_ksi(alpha, beta, A, B, o, ksi):
T = len(alpha)
temp = [0 for x in range(16)]
for t in range(T-1):
k = 0
for i in range(4):
for j in range(4):
ksi[t][i][j] = alpha[t][i] + A[i][j] + B[j][ord(o[t+1])] + beta[t+1][j]
temp[k] =ksi[t][i][j]
k += 1
s = log_sum(temp)
for i in range(4):
for j in range(4):
ksi[t][i][j] -= s
def bw(pi, A, B, alpha, beta, gamma, ksi, o):
T = len(alpha)
for i in range(4):
pi[i] = gamma[0][i]
s1 = [0 for x in range(T-1)]
s2 = [0 for x in range(T-1)]
for i in range(4):
for j in range(4):
for t in range(T-1):
s1[t] = ksi[t][i][j]
s2[t] = gamma[t][i]
A[i][j] = log_sum(s1) - log_sum(s2)
s1 = [0 for x in range(T)]
s2 = [0 for x in range(T)]
for i in range(4):
for k in range(65536):
if k % 5000 == 0:
print i, k
valid = 0
for t in range(T):
if ord(o[t]) == k:
s1[valid] = gamma[t][i]
valid += 1
s2[t] = gamma[t][i]
if valid == 0:
                B[i][k] = -log_sum(s2)  # smoothing for characters never observed in state i
else:
B[i][k] = log_sum(s1[:valid]) - log_sum(s2)
def baum_welch(pi, A, B):
f = file(".\\2.txt")
    sentence = f.read()[3:].decode('utf-8')  # skip the UTF-8 BOM at the start of the file
    f.close()
    T = len(sentence)  # length of the observation sequence
alpha = [[0 for i in range(4)] for t in range(T)]
beta = [[0 for i in range(4)] for t in range(T)]
gamma = [[0 for i in range(4)] for t in range(T)]
ksi = [[[0 for j in range(4)] for i in range(4)] for t in range(T-1)]
for time in range(100):
print "time:", time
        calc_alpha(pi, A, B, sentence, alpha)   # alpha(t,i): P(o1..ot, state i at time t | lambda)
        calc_beta(pi, A, B, sentence, beta)     # beta(t,i): P(o(t+1)..oT | state i at time t, lambda)
        calc_gamma(alpha, beta, gamma)          # gamma(t,i): P(state i at time t | lambda, O)
        calc_ksi(alpha, beta, A, B, sentence, ksi)  # ksi(t,i,j): P(state i at time t and state j at time t+1 | lambda, O)
        bw(pi, A, B, alpha, beta, gamma, ksi, sentence)  # one Baum-Welch parameter update
save_parameter(pi, A, B, time)
def list_write(f, v):
for a in v:
f.write(str(a))
f.write(' ')
f.write('\n')
def save_parameter(pi, A, B, time):
f_pi = open(".\\pi%d.txt" % time, "w")
list_write(f_pi, pi)
f_pi.close()
f_A = open(".\\A%d.txt" % time, "w")
for a in A:
list_write(f_A, a)
f_A.close()
f_B = open(".\\B%d.txt" % time, "w")
for b in B:
list_write(f_B, b)
f_B.close()
def train():
    # initialize pi, A, B
    pi = [random.random() for x in range(4)]  # initial state distribution
    log_normalize(pi)
    A = [[random.random() for y in range(4)] for x in range(4)]  # transition matrix over the states B/M/E/S
    A[0][0] = A[0][3] = A[1][0] = A[1][3]\
        = A[2][1] = A[2][2] = A[3][1] = A[3][2] = 0  # impossible transitions
B = [[random.random() for y in range(65536)] for x in range(4)]
for i in range(4):
log_normalize(A[i])
log_normalize(B[i])
baum_welch(pi, A, B)
return pi, A, B
def load_train():
f = file(".\\pi.txt", mode="r")
for line in f:
pi = map(float, line.split(' ')[:-1])
f.close()
f = file(".\\A.txt", mode="r")
    A = [[] for x in range(4)]  # transition matrix over the states B/M/E/S
i = 0
for line in f:
A[i] = map(float, line.split(' ')[:-1])
i += 1
f.close()
f = file(".\\B.txt", mode="r")
B = [[] for x in range(4)]
i = 0
for line in f:
B[i] = map(float, line.split(' ')[:-1])
i += 1
f.close()
return pi, A, B
def viterbi(pi, A, B, o):
    T = len(o)  # length of the observation sequence
    delta = [[0 for i in range(4)] for t in range(T)]
    pre = [[0 for i in range(4)] for t in range(T)]  # pre[t][i]: best previous state for state i at time t
for i in range(4):
delta[0][i] = pi[i] + B[i][ord(o[0])]
for t in range(1, T):
for i in range(4):
delta[t][i] = delta[t-1][0] + A[0][i]
for j in range(1,4):
vj = delta[t-1][j] + A[j][i]
if delta[t][i] < vj:
delta[t][i] = vj
pre[t][i] = j
delta[t][i] += B[i][ord(o[t])]
    decode = [-1 for t in range(T)]  # decode by backtracking along the best path
q = 0
for i in range(1, 4):
if delta[T-1][i] > delta[T-1][q]:
q = i
decode[T-1] = q
for t in range(T-2, -1, -1):
q = pre[t+1][q]
decode[t] = q
return decode
def segment(sentence, decode):
N = len(sentence)
i = 0
while i < N: #B/M/E/S
if decode[i] == 0 or decode[i] == 1: # Begin
j = i+1
while j < N:
if decode[j] == 2:
break
j += 1
print sentence[i:j+1], "|",
i = j+1
elif decode[i] == 3 or decode[i] == 2: # single
print sentence[i:i+1], "|",
i += 1
else:
print 'Error:', i, decode[i]
i += 1
if __name__ == "__main__":
pi, A, B = load_train()
f = file(".\\24.mybook.txt")
data = f.read()[3:].decode('utf-8')
f.close()
decode = viterbi(pi, A, B, data)
segment(data, decode)
# !/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
from hmmlearn import hmm
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.metrics.pairwise import pairwise_distances_argmin
import warnings
def expand(a, b):
d = (b - a) * 0.05
return a-d, b+d
if __name__ == "__main__":
warnings.filterwarnings("ignore") # hmmlearn(0.2.0) < sklearn(0.18)
np.random.seed(0)
    n = 5   # number of hidden states
n_samples = 1000
pi = np.random.rand(n)
pi /= pi.sum()
print '初始概率:', pi
A = np.random.rand(n, n)
mask = np.zeros((n, n), dtype=np.bool)
mask[0][1] = mask[0][4] = True
mask[1][0] = mask[1][2] = True
mask[2][1] = mask[2][3] = True
mask[3][2] = mask[3][4] = True
mask[4][0] = mask[4][3] = True
A[mask] = 0
for i in range(n):
A[i] /= A[i].sum()
print '转移概率:\n', A
means = np.array(((30, 30), (0, 50), (-25, 30), (-15, 0), (15, 0)))
print '均值:\n', means
covars = np.empty((n, 2, 2))
for i in range(n):
# covars[i] = np.diag(np.random.randint(1, 5, size=2))
covars[i] = np.diag(np.random.rand(2)+0.001)*10 # np.random.rand ∈[0,1)
print '方差:\n', covars
model = hmm.GaussianHMM(n_components=n, covariance_type='full')
model.startprob_ = pi
model.transmat_ = A
model.means_ = means
model.covars_ = covars
sample, labels = model.sample(n_samples=n_samples, random_state=0)
    # estimate the parameters from the sampled data
model = hmm.GaussianHMM(n_components=n, covariance_type='full', n_iter=10)
model = model.fit(sample)
y = model.predict(sample)
np.set_printoptions(suppress=True)
print '##估计初始概率:', model.startprob_
print '##估计转移概率:\n', model.transmat_
print '##估计均值:\n', model.means_
print '##估计方差:\n', model.covars_
    # match the estimated states to the true ones
order = pairwise_distances_argmin(means, model.means_, metric='euclidean')
print order
pi_hat = model.startprob_[order]
A_hat = model.transmat_[order]
A_hat = A_hat[:, order]
means_hat = model.means_[order]
covars_hat = model.covars_[order]
change = np.empty((n, n_samples), dtype=np.bool)
for i in range(n):
change[i] = y == order[i]
for i in range(n):
y[change[i]] = i
print '估计初始概率:', pi_hat
print '估计转移概率:\n', A_hat
print '估计均值:\n', means_hat
print '估计方差:\n', covars_hat
print labels
print y
acc = np.mean(labels == y) * 100
print '准确率:%.2f%%' % acc
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False
plt.scatter(sample[:, 0], sample[:, 1], s=50, c=labels, cmap=plt.cm.Spectral, marker='o',
label=u'观测值', linewidths=0.5, zorder=20)
plt.plot(sample[:, 0], sample[:, 1], 'r-', zorder=10)
plt.scatter(means[:, 0], means[:, 1], s=100, c=np.random.rand(n), marker='D', label=u'中心', alpha=0.8, zorder=30)
x1_min, x1_max = sample[:, 0].min(), sample[:, 0].max()
x2_min, x2_max = sample[:, 1].min(), sample[:, 1].max()
x1_min, x1_max = expand(x1_min, x1_max)
x2_min, x2_max = expand(x2_min, x2_max)
plt.xlim((x1_min, x1_max))
plt.ylim((x2_min, x2_max))
plt.legend(loc='upper left')
plt.grid(True)
plt.show()
# !/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
from hmmlearn import hmm
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.metrics.pairwise import pairwise_distances_argmin
import warnings
def expand(a, b):
d = (b - a) * 0.05
return a-d, b+d
if __name__ == "__main__":
warnings.filterwarnings("ignore") # hmmlearn(0.2.0) < sklearn(0.18)
    # columns: 0 date, 1 open, 2 high, 3 low, 4 close, 5 volume, 6 amount
x = np.loadtxt('24.SH600000.txt', delimiter='\t', skiprows=2, usecols=(4, 5, 6, 2, 3))
close_price = x[:, 0]
volumn = x[:, 1]
amount = x[:, 2]
    amplitude_price = x[:, 3] - x[:, 4]     # daily high minus daily low
    diff_price = np.diff(close_price)       # daily change in closing price
    volumn = volumn[1:]                     # volume
    amount = amount[1:]                     # turnover
    amplitude_price = amplitude_price[1:]   # daily price range
    sample = np.column_stack((diff_price, volumn, amount, amplitude_price))  # observations
n = 5
model = hmm.GaussianHMM(n_components=n, covariance_type='full')
model.fit(sample)
y = model.predict_proba(sample)
np.set_printoptions(suppress=True)
print y
t = np.arange(len(diff_price))
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False
plt.figure(figsize=(10,8), facecolor='w')
plt.subplot(421)
plt.plot(t, diff_price, 'r-')
plt.grid(True)
plt.title(u'涨跌幅')
plt.subplot(422)
plt.plot(t, volumn, 'g-')
plt.grid(True)
plt.title(u'交易量')
clrs = plt.cm.terrain(np.linspace(0, 0.8, n))
plt.subplot(423)
for i, clr in enumerate(clrs):
plt.plot(t, y[:, i], '-', color=clr, alpha=0.7)
plt.title(u'所有组分')
plt.grid(True)
for i, clr in enumerate(clrs):
axes = plt.subplot(4, 2, i+4)
plt.plot(t, y[:, i], '-', color=clr)
plt.title(u'组分%d' % (i+1))
plt.grid(True)
plt.suptitle(u'SH600000股票:GaussianHMM分解隐变量', fontsize=18)
plt.tight_layout()
plt.subplots_adjust(top=0.9)
plt.show()