# -*- coding:utf-8 -*-
from __future__ import division
''' 使用线性 CRF 实现实体识别的任务
使用 sklearn-crfsuite 中的 CRF '''
from itertools import chain # 迭代器
import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
# Load the CoNLL-2002 Spanish NER corpus as lists of (token, postag, label)
# sentences. Run nltk.download("conll2002") once beforehand.
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))
def word2features(sent, i):
    """Extract the CRF feature dict for the token at position i of sent.

    sent is a sequence of (token, postag, ...) tuples. The returned dict maps
    feature-name strings to values: features of the current token plus context
    features of the previous/next token, or BOS/EOS markers at the sentence
    boundaries.
    """
    word = sent[i][0]    # surface form
    postag = sent[i][1]  # part-of-speech tag
    features = {'bias': 1.0,
                'word.lower()': word.lower(),
                'word[-3:]': word[-3:],
                'word[-2:]': word[-2:],
                'word.isupper()': word.isupper(),
                'word.istitle()': word.istitle(),
                'word.isdigit()': word.isdigit(),
                'postag': postag,
                'postag[:2]': postag[:2]}
    if i > 0:  # features of the previous token
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({'-1:word.lower()': word1.lower(),
                         '-1:word.istitle()': word1.istitle(),
                         '-1:word.isupper()': word1.isupper(),
                         '-1:postag': postag1,
                         '-1:postag[:2]': postag1[:2]})
    else:
        features['BOS'] = True  # beginning of sentence
    if i < len(sent) - 1:  # features of the next token
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({'+1:word.lower()': word1.lower(),
                         '+1:word.istitle()': word1.istitle(),
                         '+1:word.isupper()': word1.isupper(),
                         '+1:postag': postag1,
                         '+1:postag[:2]': postag1[:2]})
    else:
        # BUG FIX: the original set 'BOS' here too. The last token marks the
        # END of the sentence, so the marker must be 'EOS' (as in the
        # sklearn-crfsuite tutorial this code follows); otherwise the model
        # cannot distinguish sentence starts from sentence ends.
        features['EOS'] = True
    return features
def sent2features(sent):
    """Return the list of feature dicts, one per token position in sent.

    Uses range() instead of the Python-2-only xrange(): identical output on
    Python 2, and the module no longer breaks on Python 3 at this call site.
    """
    return [word2features(sent, i) for i in range(len(sent))]
def sent2labels(sent):
    """Return the label sequence of sent.

    Each element of sent is a (token, postag, label) tuple; the third field
    of every tuple is collected in order.
    """
    return [entry[2] for entry in sent]
def sent2tokens(sent):
    """Return the token (word) sequence of sent.

    Each element of sent is a (token, postag, label) tuple; only the first
    field is kept.
    """
    return [tok for tok, _tag, _lbl in sent]
# Vectorize the corpus: one list of feature dicts (X) and one list of label
# strings (Y) per sentence, for the train and test splits respectively.
X_train = [sent2features(s) for s in train_sents]
Y_train = [sent2labels(s) for s in train_sents]
X_test = [sent2features(s) for s in test_sents]
Y_test = [sent2labels(s) for s in test_sents]
from collections import Counter
def print_transitions(trans_features):
    """Print one 'from -> to  weight' line per learned transition feature.

    trans_features is an iterable of ((label_from, label_to), weight) pairs,
    e.g. Counter(crf.transition_features_).most_common(n).
    """
    for (label_from, label_to), weight in trans_features:
        # Parenthesized single-argument print is valid and identical on both
        # Python 2 (statement with a parenthesized expression) and Python 3.
        print("%-6s -> %-7s %.6f" % (label_from, label_to, weight))
def print_state_features(state_features):
    """Print one 'weight label attribute' line per learned state feature.

    state_features is an iterable of ((attr, label), weight) pairs,
    e.g. Counter(crf.state_features_).most_common(n).
    """
    for (attr, label), weight in state_features:
        # Parenthesized single-argument print is valid and identical on both
        # Python 2 (statement with a parenthesized expression) and Python 3.
        print("%.6f %-8s %s" % (weight, label, attr))
if __name__ == "__main__":  # NOTE: model selection below uses n_jobs=-1 worker processes, so training must stay under the main guard
    # Configure the CRF model and its initial hyperparameters.
    crf = sklearn_crfsuite.CRF(
        algorithm = 'lbfgs',
        c1 = 0.1,
        c2 = 0.1,
        max_iterations = 100,
        all_possible_transitions = True)
    # Train the baseline model.
    crf.fit(X_train,Y_train)
    labels = list(crf.classes_)
    print labels # e.g. ['B-LOC','O','B-ORG','B-PER','I-PER','B-MISC','I-ORG','I-LOC','I-MISC']
    labels.remove('O')  # drop the non-entity tag from evaluation
    # Evaluate on the held-out test set.
    Y_pred = crf.predict(X_test)
    # NOTE(review): this weighted F1 score is computed but its value is
    # discarded (neither printed nor assigned) — the report below is what
    # actually gets shown.
    metrics.flat_f1_score(Y_test,Y_pred,average='weighted',labels = labels)
    # Sort labels so the B-/I- variants of each entity type are adjacent.
    sorted_labels = sorted(labels,
        key = lambda x:(x[1:],x[0]))
    print "初始模型效果如下...".decode("utf-8")
    print metrics.flat_classification_report(Y_test,Y_pred,
        labels = sorted_labels,
        digits = 3) # digits = number of decimal places shown in the report
    # Define the model and the hyperparameter search space.
    crf = sklearn_crfsuite.CRF(
        algorithm = 'lbfgs',
        max_iterations = 100,
        all_possible_transitions = True)
    params_space = {'c1':scipy.stats.expon(scale = 0.5),
        'c2':scipy.stats.expon(scale = 0.05)}
    # Use the same metric as the baseline evaluation so scores are comparable.
    f1_scorer = make_scorer(metrics.flat_f1_score,average='weighted',labels = labels)
    # Randomized search over the parameter space for the best model.
    rs = RandomizedSearchCV(crf,params_space,
        cv = 3,
        verbose = 1,
        n_jobs = -1,
        n_iter = 50,
        scoring = f1_scorer)
    rs.fit(X_train,Y_train)
    # Report the best hyperparameters found.
    print "The Best Params:",rs.best_params_
    print "The Best CV score:",rs.best_score_
    print "Model Size:{:.2f}M".format(rs.best_estimator_.size_ / 1000000)
    crf = rs.best_estimator_
    Y_pred = crf.predict(X_test)
    print "最佳模型效果如下...".decode("utf-8")
    print metrics.flat_classification_report(Y_test,Y_pred,
        labels = sorted_labels,
        digits = 3)
    print "\n最大转移概率".decode("utf-8")
    print_transitions(Counter(crf.transition_features_).most_common(20))
    print "\n最低转移概率".decode("utf-8")
    print_transitions(Counter(crf.transition_features_).most_common()[-20:])
    print "\nTop Positive"
    print_state_features(Counter(crf.state_features_).most_common(30))
    print "\nTop Negative"
    print_state_features(Counter(crf.state_features_).most_common()[-30:])
'''
CRF 算法和 HMM 一样用于解决序列标注的问题,HMM一般用来解决有时间序列关系的数据,但是由于HMM 模型的局限性,比如当前状态仅依赖于上一状态这种苛刻的条件,CRF 模型采用特征函数解放了HMM 的假设条件局限性。可作为超越HMM模型的一种改进。CRF算法主要做了四件事情:
1.选择特征模板:抽取文本中的字符组合或具有其他意义的标记组成特征,作为特征函数的参数。
2.构造特征函数:通过一组函数来完成由特征向数值转换的过程,使特征和一个权值对应。
3.进行前向计算:每个状态特征函数(0-1二值特征函数)对应一个多维的向量,最终状态特征函数权值的和即该位置上激活了的状态特征函数对应的多维向量之和。
4.解码:利用维特比算法解码出最优标注序列。
'''