hanlp 训练模型_hanlp学习三:自定义命名实体识别

# Train a custom perceptron NER model with HanLP (via pyhanlp), train the
# matching word-segmentation (CWS) and part-of-speech (POS) models, then
# combine the three into one lexical analyzer and run it on a sample sentence.

import os
import zipfile

from pyhanlp import *
from pyhanlp.static import download, remove_file, HANLP_DATA_PATH

# Java classes from HanLP, resolved through the JVM bridge that pyhanlp exposes.
NERTrainer = JClass('com.hankcs.hanlp.model.perceptron.NERTrainer')
AbstractLexicalAnalyzer = JClass('com.hankcs.hanlp.tokenizer.lexical.AbstractLexicalAnalyzer')
PerceptronSegmenter = JClass('com.hankcs.hanlp.model.perceptron.PerceptronSegmenter')
CWSTrainer = JClass('com.hankcs.hanlp.model.perceptron.CWSTrainer')
PerceptronPOSTagger = JClass('com.hankcs.hanlp.model.perceptron.PerceptronPOSTagger')
PerceptronNERecognizer = JClass('com.hankcs.hanlp.model.perceptron.PerceptronNERecognizer')
POSTrainer = JClass('com.hankcs.hanlp.model.perceptron.POSTrainer')

# Working directory holding the corpora and the trained model files.
PLANE_ROOT = 'C:\\Users\\DELL\\Desktop\\NL'

# NER: training corpus and output model.
PLANE_CORPUS = os.path.join(PLANE_ROOT, 'test_ner.txt')
PLANE_MODEL = os.path.join(PLANE_ROOT, 'model.bin')

# CWS: the segmenter is trained on the same corpus as the NER model, because
# the tokenizer used before NER prediction should come from the same source.
CWS_CORPUS = os.path.join(PLANE_ROOT, 'test_ner.txt')
CWS_MODEL = os.path.join(PLANE_ROOT, 'cws.bin')

# POS: training corpus and output model.
POS_CORPUS = os.path.join(PLANE_ROOT, 'test_pos.txt')
POS_MODEL = os.path.join(PLANE_ROOT, 'pos.bin')


def main():
    """Train the NER, CWS and POS models, then analyze a sample sentence.

    Each ``train(corpus, model_path)`` call is used for its side effect of
    writing the model file; the components are then loaded back from those
    paths, exactly as the original script did.
    """
    # --- Named entity recognition -------------------------------------
    ner_trainer = NERTrainer()
    # Drop the default labels (nr, ns, nt) so only the custom one is learned.
    ner_trainer.tagSet.nerLabels.clear()
    # The only entity type to recognize is "purchaser".
    ner_trainer.tagSet.nerLabels.add("purchaser")
    ner_trainer.train(PLANE_CORPUS, PLANE_MODEL)
    recognizer = PerceptronNERecognizer(PLANE_MODEL)

    # --- Word segmentation --------------------------------------------
    CWSTrainer().train(CWS_CORPUS, CWS_MODEL)

    # --- Part-of-speech tagging ---------------------------------------
    POSTrainer().train(POS_CORPUS, POS_MODEL)

    # --- Combined pipeline: segmenter -> POS tagger -> NER ------------
    analyzer = AbstractLexicalAnalyzer(
        PerceptronSegmenter(CWS_MODEL),
        PerceptronPOSTagger(POS_MODEL),
        recognizer,
    ).enableCustomDictionary(True)

    print(analyzer.analyze("2、采购人名称:;福建福州人民医院;"))
    # Alternative sample sentence kept from the original script:
    # print(analyzer.analyze("F-22横空出世。"))


if __name__ == '__main__':
    main()

你可能感兴趣的:(hanlp,训练模型)