Data
Extraction code: sodv
A Hidden Markov Model (HMM) is a statistical model that describes a Markov process with hidden, unobserved parameters. The difficulty is inferring those hidden parameters of the process from the observable output; once obtained, they can be used for further analysis, such as pattern recognition.
The training set looks like this: each line of TrainData.txt is one sentence, with the words separated by spaces.
How HMM-based Chinese word segmentation works: each character of a word is assigned one of four states, B, M, E, or S, where B marks the beginning of a word, M a middle character, E the end, and S a single-character word. So for a single word such as "我爱中国", the hidden state sequence of the four characters is "BMME".
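As a minimal sketch of this tagging rule (the helper name word_to_tags is ours, not part of the original code):

def word_to_tags(word):
    # One character -> S; otherwise B, then M for every interior character, then E.
    if len(word) == 1:
        return 'S'
    return 'B' + 'M' * (len(word) - 2) + 'E'

print(word_to_tags('我爱中国'))                         # BMME
print([word_to_tags(w) for w in ['我', '爱', '中国']])  # ['S', 'S', 'BE']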
To implement Chinese word segmentation with hmmlearn, the core task is to compute three sets of parameters: the initial state probabilities, the transition probability matrix, and the emission probability matrix.
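In standard HMM notation (our formulation; it matches the counting code below), with hidden states $s_i \in \{B, M, E, S\}$ and observed characters $v_k$, the three parameter sets are

$$\pi_i = P(q_1 = s_i), \qquad a_{ij} = P(q_{t+1} = s_j \mid q_t = s_i), \qquad b_i(k) = P(o_t = v_k \mid q_t = s_i).$$

In the code these become pi (length 4), A (4 x 4), and B (4 x 65536, indexed by each character's Unicode code point).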
Concrete steps:
def load_data():
    # Each line of TrainData.txt is one sentence, words separated by spaces.
    data = open('HMM/TrainData.txt', encoding='utf-8')
    file = []
    for line in data.readlines():
        file.append(line.strip().split(' '))
    data.close()
    # Tag every word with its BMES string.
    real_file = []
    for i in range(len(file)):
        x = []
        for j in range(len(file[i])):
            if len(file[i][j]) == 1:
                x.append('S')
            elif len(file[i][j]) == 2:
                x.append('BE')
            else:
                tag = 'B'  # e.g. a 4-character word -> 'BMME'
                for k in range(len(file[i][j]) - 2):
                    tag += 'M'
                tag += 'E'
                x.append(tag)
        real_file.append(x)
    return file, real_file
file holds all the words: it is a two-dimensional list in which each inner list is one sentence, split into words. real_file corresponds to file one-to-one, except that it holds the BMES tag strings instead.
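For example, if TrainData.txt contained the single line 我 爱 中国, the two return values would line up like this (a hypothetical run):

file, real_file = load_data()
print(file[0])       # ['我', '爱', '中国']
print(real_file[0])  # ['S', 'S', 'BE']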
def hMM():
    file, data = load_data()
    states = ['B', 'M', 'E', 'S']  # begin, middle, end of a word, single-character word
    A = np.zeros((4, 4))           # transition counts -> probabilities
    B = np.zeros((4, 65536))       # emissions, indexed by the character's Unicode code point
    pi = np.zeros(4)               # initial state distribution
    # A sentence can only start with B or S.
    for i in range(len(data)):
        if data[i][0][0] == 'B':
            pi[0] += 1
        if data[i][0][0] == 'S':
            pi[3] += 1
    pi /= np.sum(pi)
    # Count emissions and transitions.
    for i in range(len(data)):
        for j in range(len(data[i])):
            for k in range(len(data[i][j])):
                # State data[i][j][k] emits the character file[i][j][k].
                B[states.index(data[i][j][k]), ord(file[i][j][k])] += 1
            if len(data[i][j]) == 1 and j + 1 < len(data[i]):
                if data[i][j + 1][0] == 'B':  # S followed by B
                    A[3, 0] += 1
                if data[i][j + 1][0] == 'S':  # S followed by S
                    A[3, 3] += 1
                continue
            # Transitions inside a word.
            A[0, 1] += data[i][j].count('BM')
            A[1, 1] += data[i][j].count('MM')  # words of four or more characters
            A[0, 2] += data[i][j].count('BE')
            A[1, 2] += data[i][j].count('ME')
            # Transition from the end of this word to the start of the next.
            if j + 1 < len(data[i]) and data[i][j + 1][0] == 'B':
                A[2, 0] += 1
            if j + 1 < len(data[i]) and data[i][j + 1][0] == 'S':
                A[2, 3] += 1
    # Normalize each row of counts into a probability distribution.
    for i in range(4):
        if np.sum(A[i]) != 0:
            A[i] /= np.sum(A[i])
    for i in range(4):
        if np.sum(B[i]) != 0:
            B[i] /= np.sum(B[i])
    # Plug the estimated parameters directly into an hmmlearn model.
    model = hmm.MultinomialHMM(n_components=len(states))
    model.startprob_ = pi
    model.emissionprob_ = B
    model.transmat_ = A
    return model
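Note that fit() is never called: the three parameter sets are estimated by counting the labeled training data, and hmmlearn is only used for decoding via predict(), which runs the Viterbi algorithm by default. One caveat: in hmmlearn 0.3 and later the discrete-observation model used here is named hmm.CategoricalHMM, so on a recent install you may need to substitute that class.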
if __name__ == '__main__':
    print('Please wait...')
    model = hMM()
    dataset = []
    data = open('HMM/TestData.txt', encoding='utf-8')
    for line in data.readlines():
        dataset.append(line.strip().split('\t'))
    data.close()
    for k in range(4):  # four test sentences
        print('Before segmentation:', str(dataset[k]))
        x = ''.join(dataset[k])  # the sentence as one string
        datas = []
        for j in x:
            datas.append(ord(j))  # encode each character as its Unicode code point
        xd = np.asarray(datas).reshape(-1, 1)
        pre = model.predict(xd)  # Viterbi-decode the BMES state sequence
        # Convert the state sequence back into words (0=B, 1=M, 2=E, 3=S).
        final = []
        t = 0
        for p, q in enumerate(pre):
            if q == 0:       # B: the word starts here
                t = p
            elif q == 2:     # E: the word spans x[t..p]
                final.append(x[t:p + 1])
            elif q == 3:     # S: a single-character word
                final.append(x[p])
        print('After segmentation:', '/'.join(final))
        print('\n')
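The final loop is the inverse of the BMES tagging: B marks where a word starts, E closes it, and S emits a single character on its own. Pulled out as a standalone helper (a sketch; the name tags_to_words is ours):

def tags_to_words(sentence, state_ids):
    # state_ids follow the order of `states`: 0=B, 1=M, 2=E, 3=S.
    words, start = [], 0
    for pos, sid in enumerate(state_ids):
        if sid == 0:              # B: a multi-character word starts here
            start = pos
        elif sid == 2:            # E: emit everything since the last B
            words.append(sentence[start:pos + 1])
        elif sid == 3:            # S: emit a single-character word
            words.append(sentence[pos])
    return words

print('/'.join(tags_to_words('他说的确实在理', [3, 3, 3, 0, 2, 0, 2])))
# 他/说/的/确实/在理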
Complete code:
from hmmlearn import hmm
import numpy as np

def load_data():
    # Each line of TrainData.txt is one sentence, words separated by spaces.
    data = open('HMM/TrainData.txt', encoding='utf-8')
    file = []
    for line in data.readlines():
        file.append(line.strip().split(' '))
    data.close()
    # Tag every word with its BMES string.
    real_file = []
    for i in range(len(file)):
        x = []
        for j in range(len(file[i])):
            if len(file[i][j]) == 1:
                x.append('S')
            elif len(file[i][j]) == 2:
                x.append('BE')
            else:
                tag = 'B'  # e.g. a 4-character word -> 'BMME'
                for k in range(len(file[i][j]) - 2):
                    tag += 'M'
                tag += 'E'
                x.append(tag)
        real_file.append(x)
    return file, real_file
# Training
def hMM():
    file, data = load_data()
    states = ['B', 'M', 'E', 'S']  # begin, middle, end of a word, single-character word
    A = np.zeros((4, 4))           # transition counts -> probabilities
    B = np.zeros((4, 65536))       # emissions, indexed by the character's Unicode code point
    pi = np.zeros(4)               # initial state distribution
    # A sentence can only start with B or S.
    for i in range(len(data)):
        if data[i][0][0] == 'B':
            pi[0] += 1
        if data[i][0][0] == 'S':
            pi[3] += 1
    pi /= np.sum(pi)
    # Count emissions and transitions.
    for i in range(len(data)):
        for j in range(len(data[i])):
            for k in range(len(data[i][j])):
                # State data[i][j][k] emits the character file[i][j][k].
                B[states.index(data[i][j][k]), ord(file[i][j][k])] += 1
            if len(data[i][j]) == 1 and j + 1 < len(data[i]):
                if data[i][j + 1][0] == 'B':  # S followed by B
                    A[3, 0] += 1
                if data[i][j + 1][0] == 'S':  # S followed by S
                    A[3, 3] += 1
                continue
            # Transitions inside a word.
            A[0, 1] += data[i][j].count('BM')
            A[1, 1] += data[i][j].count('MM')  # words of four or more characters
            A[0, 2] += data[i][j].count('BE')
            A[1, 2] += data[i][j].count('ME')
            # Transition from the end of this word to the start of the next.
            if j + 1 < len(data[i]) and data[i][j + 1][0] == 'B':
                A[2, 0] += 1
            if j + 1 < len(data[i]) and data[i][j + 1][0] == 'S':
                A[2, 3] += 1
    # Normalize each row of counts into a probability distribution.
    for i in range(4):
        if np.sum(A[i]) != 0:
            A[i] /= np.sum(A[i])
    for i in range(4):
        if np.sum(B[i]) != 0:
            B[i] /= np.sum(B[i])
    # Build the model from the estimated parameters.
    model = hmm.MultinomialHMM(n_components=len(states))
    model.startprob_ = pi
    model.emissionprob_ = B
    model.transmat_ = A
    return model
if __name__ == '__main__':
    print('Please wait...')
    model = hMM()
    dataset = []
    data = open('HMM/TestData.txt', encoding='utf-8')
    for line in data.readlines():
        dataset.append(line.strip().split('\t'))
    data.close()
    for k in range(4):  # four test sentences
        print('Before segmentation:', str(dataset[k]))
        x = ''.join(dataset[k])  # the sentence as one string
        datas = []
        for j in x:
            datas.append(ord(j))  # encode each character as its Unicode code point
        xd = np.asarray(datas).reshape(-1, 1)
        pre = model.predict(xd)  # Viterbi-decode the BMES state sequence
        # Convert the state sequence back into words (0=B, 1=M, 2=E, 3=S).
        final = []
        t = 0
        for p, q in enumerate(pre):
            if q == 0:       # B: the word starts here
                t = p
            elif q == 2:     # E: the word spans x[t..p]
                final.append(x[t:p + 1])
            elif q == 3:     # S: a single-character word
                final.append(x[p])
        print('After segmentation:', '/'.join(final))
        print('\n')
Output:
Before segmentation: ['长春市长春节讲话。']
After segmentation: 长春/市长/春节/讲话/。
Before segmentation: ['他说的确实在理.']
After segmentation: 他/说/的/确实/在理
Before segmentation: ['毛主席万岁。']
After segmentation: 毛主席/万/岁/。
Before segmentation: ['我有一台电脑。']
After segmentation: 我有/一台/电脑/。
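Note that the last two results are imperfect: 万岁 is split into 万/岁, and 我/有 is fused into 我有. This is roughly what one can expect from a first-order HMM trained on a small corpus; more training data would be needed for better word boundaries.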