http://004123.ichengyun.net/thread-1598-1-1.html
Download the word2vec source code.
Use the MSR word-segmentation corpus (http://www.threedweb.cn/thread-1593-1-1.html). The command below trains 200-dimensional skip-gram vectors (-cbow 0) with hierarchical softmax (-hs 1):
./word2vec -train msr.txt -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
./distance vectors.bin
A larger corpus can be used as well.
Alternatively, ready-made word-vector model data can be downloaded as a binary package from http://txt2vec.codeplex.com/.
Reading that format requires https://github.com/zhongkaifu/RNNSharp, a high-accuracy named entity recognition toolkit based on CRF-LSTM.
Building a knowledge base by hand is knowledge-intensive and time-consuming work. Moreover, semantic resources such as HowNet and WordNet are, in practice, severely limited by the domains they cover and do not work well. Researchers have therefore spent many years trying to extract semantic relations, or build taxonomies, automatically.
The BigCilin (《大词林》, http://www.bigcilin.com/browser/) project, led by Liu Ting at Harbin Institute of Technology, set out to solve this problem. After extensive study they observed that word embeddings preserve many linguistic regularities by capturing a large number of syntactic/semantic relations. A famous example is v(king) - v(queen) ≈ v(man) - v(woman), which indicates that the difference between embedding vectors really does represent a semantic relation shared by the two word pairs.
Does this also hold for hypernym-hyponym relations? They designed a simple experiment: for a number of randomly sampled Chinese hypernym-hyponym word pairs, they computed the differences between the embedding vectors and measured how similar those differences were to each other.
To address this challenge, they proposed learning hypernym-hyponym relations with projection matrices:
1. Uniform linear projection: assume every word can be mapped to its hypernym through a single shared transition matrix, y = Qx (see the sketch after this list).
2. Piecewise linear projection: cluster the word pairs first, then learn a separate projection for each cluster.
3. Identifying hypernym-hyponym relations: they designed three models for this final step.
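As an illustration of the uniform linear projection y = Qx, the sketch below fits a single projection matrix by least squares over (hyponym, hypernym) embedding pairs. The toy vectors, variable names, and the closed-form least-squares solution are assumptions for illustration only; this is not the BigCilin implementation.
# -*- coding: utf-8 -*-
# Toy example: learn a uniform projection Q such that Q x ≈ y,
# where x is a hyponym embedding and y the embedding of its hypernym.
import numpy as np

dim, n_pairs = 50, 1000
rng = np.random.RandomState(1)
X = rng.randn(n_pairs, dim)                           # hyponym vectors, one per row
Q_true = 0.1 * rng.randn(dim, dim)                    # hidden "true" projection for the toy data
Y = X.dot(Q_true.T) + 0.01 * rng.randn(n_pairs, dim)  # noisy hypernym vectors

# Least-squares fit of min_Q ||X Q^T - Y||^2 over all training pairs
Q_T, _, _, _ = np.linalg.lstsq(X, Y)
Q = Q_T.T

# Map a new word vector to its predicted hypernym vector and compare by cosine similarity
x_new = rng.randn(dim)
y_pred, y_ref = Q.dot(x_new), Q_true.dot(x_new)
cos = y_pred.dot(y_ref) / (np.linalg.norm(y_pred) * np.linalg.norm(y_ref))
print('cosine similarity between predicted and reference hypernym vector: %.3f' % cos)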
If a neural network is to be applied to NLP, it must be able to handle sequence labeling. By combining backpropagation with a memory mechanism, an RNN can process sequences of arbitrary length, and its architecture is closer to biological neural networks than that of a feed-forward network; it emerged precisely to solve this class of problems.
In principle, RNNs and LSTMs both stem from the "sequential similarity" idea in cognitive linguistics: a symbol together with its context forms an "image", which can be seen either as a combination of symbols (a word) or as a syntactic relation between words (a dependency relation). Training learns, through the forward and backward passes over the training corpus, the ability to recognize these "images" and stores that ability as model parameters; when a new sentence is fed in, the algorithm uses the stored parameters to recognize similar "images" in the new input.
Sequence labeling tasks use the "many to many" configuration.
Because an RNN can take a sequence as input and produce a sequence as output, it has been enormously successful and widely used in NLP.
The basic RNN is the simple recurrent network, Simple-RNN: its recurrence lets information flow from the current time step to the next.
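Concretely, one Simple-RNN step computes h_t = tanh(W_x x_t + W_h h_{t-1} + b). A minimal NumPy sketch with toy dimensions and random weights (purely illustrative):
# -*- coding: utf-8 -*-
# Forward pass of a Simple-RNN over a toy sequence.
import numpy as np

input_dim, hidden_dim, steps = 8, 16, 5
rng = np.random.RandomState(0)
W_x = 0.1 * rng.randn(hidden_dim, input_dim)    # input-to-hidden weights
W_h = 0.1 * rng.randn(hidden_dim, hidden_dim)   # hidden-to-hidden (recurrent) weights
b = np.zeros(hidden_dim)

h = np.zeros(hidden_dim)                        # initial hidden state
for t in range(steps):
    x_t = rng.randn(input_dim)                  # one input vector per time step
    h = np.tanh(W_x.dot(x_t) + W_h.dot(h) + b)  # the state carries information forward
    # In the "many to many" setting, an output layer would map h to a label at every step.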
www.threedweb.cn/thread-1595-1-1.html
Although the Simple-RNN is the original prototype of recurrent networks, it is rarely used in practice, for two main reasons: it has difficulty capturing long-range dependencies, and its gradients tend to vanish (or explode) during training.
LSTM is a variant of the RNN designed specifically to fix these two problems of the Simple-RNN. Hochreiter & Schmidhuber proposed the LSTM network as early as 1997, and Alex Graves later refined and popularized it. LSTMs have been hugely successful on NLP problems and are in very wide use.
The recurrent layer of an LSTM is deliberately designed to avoid the long-term dependency and vanishing-gradient problems: remembering information over long spans is the default behavior, obtained without extra cost.
Internal structure: http://colah.github.io/posts/2015-08-Understanding-LSTMs/
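For reference, the LSTM cell described in that post computes the following at each time step (\sigma is the logistic sigmoid, \odot element-wise multiplication, and [h_{t-1}, x_t] the concatenation of the previous hidden state and the current input):
f_t = \sigma(W_f [h_{t-1}, x_t] + b_f)            (forget gate)
i_t = \sigma(W_i [h_{t-1}, x_t] + b_i)            (input gate)
\tilde{C}_t = \tanh(W_C [h_{t-1}, x_t] + b_C)     (candidate cell state)
C_t = f_t \odot C_{t-1} + i_t \odot \tilde{C}_t   (cell-state update: keep part of the old memory, add new)
o_t = \sigma(W_o [h_{t-1}, x_t] + b_o)            (output gate)
h_t = o_t \odot \tanh(C_t)                        (hidden state / output)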
Introduction to the Keras framework
Two commonly used network architectures are:
1) The stacked LSTM architecture.
2) The stateful recurrent model: a stateful LSTM saves the internal state (memory) produced by one batch of samples and reuses it for the next batch, so it can handle longer sequences while reducing computational cost (a minimal sketch follows this list).
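As a minimal illustration of the stateful architecture (not part of this chapter's segmentation code; the layer sizes and batch shape below are arbitrary assumptions), a stateful LSTM in the Keras Sequential API looks roughly like this:
# -*- coding: utf-8 -*-
# Sketch of a stateful LSTM; shapes are illustrative only.
from keras.models import Sequential
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense

batch_size, timesteps, feat_dim = 32, 10, 16     # assumed toy dimensions
model = Sequential()
# stateful=True: the final state of each sequence in a batch is carried over to the
# sequence at the same index in the next batch, so a fixed batch size must be
# declared via batch_input_shape.
model.add(LSTM(64, stateful=True,
               batch_input_shape=(batch_size, timesteps, feat_dim)))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam')
# After one pass over a set of related batches, clear the carried-over memory:
model.reset_states()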
2. Sequence labeling in Keras: a Chinese word-segmentation example
seqlib.py
# -*- coding: utf-8 -*-
import os
import sys
import numpy as np
from numpy import *
import nltk
import codecs
import pandas as pd
from nltk.probability import FreqDist
from gensim.models import word2vec
from cPickle import load, dump
from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils
from keras.models import Sequential #, Graph
from keras.layers.core import Dense, Dropout, Activation #, TimeDistributedDense
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU, SimpleRNN
from keras.layers.core import Reshape, Flatten, Dropout
from keras.regularizers import l1, l2
from keras.layers.convolutional import Convolution2D, MaxPooling2D, MaxPooling1D
from sklearn.cross_validation import train_test_split
# read a single text file into one unicode string
def load_file(input_file):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    input_text = input_data.read()
    input_data.close()
    return input_text
# train word vectors with gensim's word2vec
def trainW2V(corpus, epochs=20, num_features=100, sg=1,
             min_word_count=1, num_workers=4,
             context=4, sample=1e-5, negative=5):
    w2v = word2vec.Word2Vec(workers=num_workers, sample=sample, sg=sg,
                            size=num_features, min_count=min_word_count,
                            window=context, negative=negative)
np.random.shuffle(corpus)
w2v.build_vocab(corpus)
w2v.train(corpus, total_examples=w2v.corpus_count, epochs=epochs)
'''
for epoch in range(epochs):
print('epoch' + str(epoch))
np.random.shuffle(corpus)
w2v.train(corpus, total_examples=w2v.corpus_count, epochs=epochs)
w2v.alpha *= 0.9
w2v.min_alpha = w2v.alpha
'''
print("word2vec DONE.")
return w2v
# nltk: take a token list and return a frequency table
def freq_func(input_txt):
corpus = nltk.Text(input_txt)
fdist = FreqDist(corpus)
w = fdist.keys()
v = fdist.values()
#print(w[:10], v[:10])
freqdf = pd.DataFrame({'word':w, 'freq':v})
freqdf.sort_values('freq', ascending=False, inplace=True)
freqdf['idx'] = np.arange(len(v))
#print(freqdf)
return freqdf
# initialize the embedding weight matrix
def initweightlist(w2v, idx2word, word2idx):
init_weight_wv = []
for i in range(len(idx2word)):
init_weight_wv.append(w2v[idx2word[i]])
    # define 'U' for unknown (out-of-vocabulary) characters and 'P' for padding, and append the two corresponding vectors
char_num = len(init_weight_wv)
idx2word[char_num] = u'U'
word2idx[u'U'] = char_num
idx2word[char_num+1] = u'P'
word2idx[u'P'] = char_num+1
init_weight_wv.append(np.random.randn(100, ))
init_weight_wv.append(np.zeros(100, ))
return init_weight_wv, idx2word, word2idx
# add the tagging labels: S (single-character word), B (begin), M (middle), E (end)
def character_tagging(input_file, output_file):
input_data = codecs.open(input_file, 'r', 'utf-8')
output_data = codecs.open(output_file, 'w', 'utf-8')
for line in input_data.readlines():
word_list = line.strip().split()
for word in word_list:
if len(word) == 1:
output_data.write(word + "/S ")
else:
output_data.write(word[0] + "/B ")
for w in word[1: len(word) -1]:
output_data.write(w + "/M ")
output_data.write(word[len(word)-1] + "/E ")
output_data.write("\n")
input_data.close()
output_data.close()
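# For illustration (derived from the logic above): the corpus line "我们 是 中国人"
# would be written out as "我/B 们/E 是/S 中/B 国/M 人/E ".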
def featContext(sentence, word2idx = '', context=7):
predict_word_num = []
    # map each character of the text to its index; characters not in the vocabulary map to 'U'
for w in sentence:
if w in word2idx:
predict_word_num.append(word2idx[w])
else:
predict_word_num.append(word2idx[u'U'])
    num = len(predict_word_num)  # pad both ends of the sequence
pad = int( (context-1) * 0.5 )
for i in range(pad):
predict_word_num.insert(0, word2idx[u'P'])
predict_word_num.append(word2idx[u'P'])
train_x = []
for i in range(num):
train_x.append(predict_word_num[i: i+context])
return train_x
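# Example of the windowing (hypothetical indices): with context=7 and pad=3, a two-character
# sentence produces 2 windows of 7 indices each, e.g.
#   featContext([u'中', u'国'], word2idx) -> [[P, P, P, 中, 国, P, P], [P, P, 中, 国, P, P, P]]
# where P stands for word2idx[u'P'] and 中 / 国 for their vocabulary indices.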
# stacked LSTM network for training and tagging
class Lstm_Net(object):
def __init__(self):
self.init_weight = []
self.batch_size = 128
self.word_dim = 100
self.maxlen = 7
self.hidden_units = 100
self.nb_classes = 0
def buildnet(self):
        self.maxfeatures = self.init_weight[0].shape[0]  # vocabulary size
self.model = Sequential()
        print('stacking LSTM')  # uses the stacked LSTM architecture
self.model.add(Embedding(self.maxfeatures, self.word_dim, input_length=self.maxlen))
self.model.add(LSTM(output_dim=self.hidden_units, return_sequences=True))
self.model.add(LSTM(output_dim=self.hidden_units, return_sequences=False))
self.model.add(Dropout(0.5))
self.model.add(Dense(self.nb_classes))
self.model.add(Activation('softmax'))
self.model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
def train(self, modelname):
        result = self.model.fit(self.train_X, self.Y_train,
                                batch_size=self.batch_size, epochs=20,
                                validation_data=(self.test_X, self.Y_test))
self.model.save_weights(modelname)
    def splitset(self, train_word_num, train_label, train_size=0.9, random_state=1):
        self.train_X, self.test_X, train_y, test_y = train_test_split(
            train_word_num, train_label, train_size=train_size, random_state=random_state)
self.Y_train = np_utils.to_categorical(train_y, self.nb_classes)
self.Y_test = np_utils.to_categorical(test_y, self.nb_classes)
    # infer the tag sequence for an input, applying SBME transition constraints
    def predict_num(self, input_num, input_txt, label_dict='', num_dict=''):
        input_num = np.array(input_num)
        predict_prob = self.model.predict_proba(input_num, verbose=False)
        predict_label = self.model.predict_classes(input_num, verbose=False)
        for i, label in enumerate(predict_label[:-1]):
            if i == 0:  # the first character cannot be E or M
                predict_prob[i, label_dict[u'E']] = 0
                predict_prob[i, label_dict[u'M']] = 0
            if label == label_dict[u'B']:  # after B, the next tag cannot be B or S
                predict_prob[i+1, label_dict[u'B']] = 0
                predict_prob[i+1, label_dict[u'S']] = 0
            if label == label_dict[u'E']:  # after E, the next tag cannot be M or E
                predict_prob[i+1, label_dict[u'M']] = 0
                predict_prob[i+1, label_dict[u'E']] = 0
            if label == label_dict[u'M']:  # after M, the next tag cannot be B or S
                predict_prob[i+1, label_dict[u'B']] = 0
                predict_prob[i+1, label_dict[u'S']] = 0
            if label == label_dict[u'S']:  # after S, the next tag cannot be M or E
                predict_prob[i+1, label_dict[u'M']] = 0
                predict_prob[i+1, label_dict[u'E']] = 0
            predict_label[i+1] = predict_prob[i+1].argmax()
        predict_label_new = [num_dict[x] for x in predict_label]
        result = [w + '/' + l for w, l in zip(input_txt, predict_label_new)]
        return ' '.join(result) + '\n'
def getweights(self, wfname):
return self.model.load_weights(wfname)
corpus2vector.py
# -*- coding: utf-8 -*-
from seqlib import *
reload(sys)
sys.setdefaultencoding('utf-8')
corpuspath = "msr.txt"
input_text = load_file(corpuspath)
# txtwv is a list of token lists, one per line (the input format gensim's word2vec expects)
txtwv = [line.split() for line in input_text.split('\n') if line != '']
#print(txtwv[0].encode('utf8'))
#
w2v = trainW2V(txtwv)
w2v.save("wordvector.bin")
preprocess.py
# -*- coding: utf-8 -*-
from seqlib import *
corpuspath = "msr.txt"
input_text = load_file(corpuspath)
# compute word frequencies
txtnltk = [w for w in input_text.split()]  # token list prepared for frequency counting
freqdf = freq_func(txtnltk) # 计算词频表
# build the two mapping dictionaries (word -> index, index -> word)
#print(freqdf)
word2idx = dict( (c, i) for c, i in zip(freqdf.word, freqdf.idx) )
idx2word = dict( (i, c) for c, i in zip(freqdf.word, freqdf.idx) )
w2v = word2vec.Word2Vec.load("wordvector.bin")
#print(idx2word[0])
#print(w2v[idx2word[0]])
# initialize the embedding weights
init_weight_wv, idx2word, word2idx = initweightlist(w2v, idx2word, word2idx)
dump(word2idx, open('word2idx.pickle', 'wb'))
dump(idx2word, open('idx2word.pickle', 'wb'))
dump(init_weight_wv, open('init_weight_wv.pickle', 'wb'))
# read the data and convert it to the 4-tag format: S B M E
output_file = 'msr.tagging.utf8'
character_tagging(corpuspath, output_file)
# separate characters and labels
with open(output_file) as f:
lines = f.readlines()
train_line = [[w[0] for w in line.decode('utf-8').split() ] for line in lines]
train_label = [ w[2] for line in lines for w in line.decode('utf-8').split() ]
# convert all training text into lists of indices
train_word_num = []
for line in train_line:
train_word_num.extend(featContext(line, word2idx))
# persist to disk
dump(train_word_num, open('train_word_num.pickle', 'wb'))
dump(train_label, open('train_label.pickle', 'wb'))
segment_lstm.py
# -*- coding: utf-8 -*-
from seqlib import *
train_word_num = load(open('train_word_num.pickle', 'rb'))
train_label = load(open('train_label.pickle', 'rb'))
nb_classes = len(np.unique(train_label))
print(nb_classes) # 4
# prepare the initial character-vector weights
init_weight_wv = load(open('init_weight_wv.pickle', 'rb'))
print(shape(init_weight_wv))  # (68947, 100)
# build the two label dictionaries
label_dict = dict(zip(np.unique(train_label), range(4)))
num_dict = {n: l for l, n in label_dict.iteritems()}
# map the target labels to integers
train_label = [ label_dict[y] for y in train_label ]
print(shape(train_label))  # (3763026,)
train_word_num = np.array(train_word_num)
print(shape(train_word_num))  # (3763026, 7)
# stacking LSTM
modelname = 'my_model_weights.h5'
net = Lstm_Net()
net.init_weight = [np.array(init_weight_wv)]
net.nb_classes = nb_classes
net.splitset(train_word_num, train_label)
print "Train..."
net.buildnet()
net.train(modelname)
Run the segmentation:
# -*- coding: utf-8 -*-
from seqlib import *
word2idx = load(open('word2idx.pickle', 'rb'))
train_word_num = load(open('train_word_num.pickle', 'rb'))
train_label = load(open('train_label.pickle', 'rb'))
nb_classes = len(np.unique(train_label))
init_weight_wv = load(open('init_weight_wv.pickle', 'rb'))
# build the two label dictionaries
label_dict = dict(zip(np.unique(train_label), range(4)))
num_dict = {n: l for l, n in label_dict.iteritems()}
temp_txt = u'罗马尼亚的首都是布加勒斯特。'
temp_txt = list(temp_txt)
temp_num = featContext(temp_txt, word2idx = word2idx)
net = Lstm_Net()
net.init_weight = [np.array(init_weight_wv)]
net.nb_classes = nb_classes
net.buildnet()
net.getweights('my_model_weights.h5')
temp = net.predict_num(temp_num, temp_txt, label_dict=label_dict, num_dict=num_dict)
print(temp)
This achieves an accuracy of 98.71%.
The most recent dependency parsing methods also draw on dynamic programming (DP) ideas.