词性标注和命名实体识别

词性标注

import jieba.posseg as psg  # jieba's part-of-speech tagging interface
# Demo sentence; every item produced by psg.cut unpacks into (word, flag).
sent = "去森林公园爬山。"
for pair in psg.cut(sent):
    word, flag = pair
    print(word, "/", flag)

命名实体识别

构建训练集和测试集:

import os
def _char_rows(text, first_label, rest_label):
    """Return one "char seg-tag name-label" line per character of ``text``.

    The segmentation tag is S for a one-character text, otherwise B / M / E
    for the first / middle / last character.  ``first_label`` is attached to
    the first character and ``rest_label`` to all following ones.
    """
    if not text:
        return ''
    if len(text) == 1:
        return text + ' S ' + first_label + '\n'
    rows = [text[0] + ' B ' + first_label + '\n']
    rows.extend(ch + ' M ' + rest_label + '\n' for ch in text[1:-1])
    rows.append(text[-1] + ' E ' + rest_label + '\n')
    return ''.join(rows)


def corpus(corpus_path):
    """Split a tagged corpus into character-level train and test files.

    Reads ``corpus_path`` (UTF-8) line by line.  The first whitespace-separated
    token of every line is dropped; each remaining token is written one
    character per row as ``char seg-tag name-label``, followed by an empty
    line after every input line.  Tokens ending in "/nr" are person names:
    the suffix is removed and the characters are labelled B (first) / I
    (rest).  All other tokens are labelled O.

    Every fifth line (index 0, 5, 10, ...) goes to "test.data", the others to
    "train.data"; both files are created in the current working directory.

    Args:
        corpus_path: path of the tagged corpus file.
    """
    # Context managers guarantee that all three files are closed even when
    # reading or writing fails part-way through.
    with open(corpus_path, encoding='utf-8') as data, \
            open("train.data", 'w+', encoding='utf-8') as train, \
            open("test.data", 'w+', encoding='utf-8') as test:
        for pos, line in enumerate(data):
            # Sample 20% of the lines as the test set.
            save = test if pos % 5 == 0 else train
            for word in line.split()[1:]:
                if word.endswith("/nr"):
                    # Strip the tag BEFORE measuring the length, so that a
                    # one-character name yields a single "S B" row instead of
                    # being emitted twice (once as B, once as E).
                    save.write(_char_rows(word[:-3], 'B', 'I'))
                else:
                    # NOTE(review): non-name tokens keep their "/tag" suffix,
                    # so the tag characters become rows too.  Kept unchanged
                    # because the inference input in this file is built the
                    # same way; confirm whether that is intended.
                    save.write(_char_rows(word, 'O', 'O'))
            save.write('\n')

if __name__ == '__main__':
    # Switch to the working folder first: the corpus path below is relative,
    # and corpus() opens "train.data" / "test.data" by relative name as well.
    work_dir = r'F:\大三下\自然语言处理\测试'
    os.chdir(work_dir)
    corpus('./people-daily.txt')

用 CRF++ 工具训练和测试模型

词性标注和命名实体识别_第1张图片
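在 train.data 和 test.data 所在的目录下打开 cmd,输入 CRF++ 命令进行训练和测试。例如(假设特征模板文件名为 template):先执行 crf_learn template train.data model 训练出模型 model,再执行 crf_test -m model test.data > result 得到测试结果文件 result。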

测试评估:

def Verification(result_path):
    """Print precision, recall and F-score of person-name tagging results.

    Every non-blank line of the file must consist of four
    whitespace-separated columns: character, segmentation tag, gold name
    label and predicted name label.  A predicted 'B' counts as correct only
    if no predicted label has differed from its gold label since the
    previous predicted 'B' (the current line included).

    A score whose denominator is zero (no predicted names, no gold names, or
    P + R == 0) is reported as 0.0 instead of raising ZeroDivisionError.

    Args:
        result_path: path of the result file (UTF-8, optional BOM).

    Raises:
        ValueError: if a non-blank line does not have exactly four columns.
    """
    test_name_tag = 0  # number of name starts in the gold labels
    predict_name_tag = 0  # number of name starts in the predictions
    correct_name_tag = 0  # number of predicted name starts judged correct
    all_tag = 0  # number of words (rows whose segmentation tag is B or S)
    pos = True  # no gold/prediction mismatch since the last predicted 'B'

    # The context manager closes the file even if a malformed line raises.
    with open(result_path, 'r', encoding='utf_8_sig') as test:
        for l in test:
            fields = l.split()
            if not fields:
                # Sentence separator (or whitespace-only line).
                continue
            _, a, g, r = fields
            if a in ('B', 'S'):
                all_tag += 1
            if r != g:
                pos = False
            if r == 'B':
                predict_name_tag += 1
                if pos:
                    correct_name_tag += 1
                pos = True
            if g == 'B':
                test_name_tag += 1

    # Precision
    P = correct_name_tag / float(predict_name_tag) if predict_name_tag else 0.0
    # Recall
    R = correct_name_tag / float(test_name_tag) if test_name_tag else 0.0
    # F-score (harmonic mean of precision and recall)
    F = (2 * P * R) / (P + R) if (P + R) > 0 else 0.0
    print('全部词数有:' + str(all_tag) + '个')
    print('其中人名词有:' + str(test_name_tag) + '个')
    print('准确率为:{}, 召回率为:{}, F值为:{}'.format(P, R, F))

if __name__ == '__main__':
    # Score the result file stored in the working folder.
    result_file = r'F:\大三下\自然语言处理\测试\result'
    Verification(result_file)

实测:

import os
import jieba.posseg as psg

os.chdir(r'F:\大三下\自然语言处理\测试')
# Text in which person names should be recognised.
sent = "法外狂徒老张三"

# Segment and POS-tag the sentence, storing it as space-separated
# "word/tag" tokens.
with open("data.txt", 'w+', encoding='utf-8') as data:
    for word, t in psg.cut(sent):
        data.write(word + "/" + t + " ")

# Read the tagged sentence back; the context manager closes this handle
# (the original closed the already-closed writer instead and leaked it).
with open(r'F:\大三下\自然语言处理\测试\data.txt', encoding='utf-8') as data2:
    line = data2.readline()
words = line.split()

# Write one "char seg-tag name-label" row per character for crf_test.
# Plain UTF-8 is used so that no BOM gets glued to the first character.
with open("sentence.data", 'w+', encoding='utf-8') as save:
    for word in words:
        if word.endswith("/nr"):
            # Person name: drop the "/nr" suffix before measuring the length,
            # otherwise the single-character case can never be reached.
            chars, head_label, tail_label = word[:-3], 'B', 'I'
        else:
            # NOTE(review): non-name tokens keep their "/tag" suffix, so the
            # tag characters become rows too.  Kept unchanged to match how
            # corpus() builds the training data; confirm whether intended.
            chars, head_label, tail_label = word, 'O', 'O'
        if not chars:
            continue
        if len(chars) == 1:
            # Single character: segmentation tag S, name label B or O.
            save.write(chars + ' ' + 'S' + ' ' + head_label + '\n')
        else:
            # Two or more characters, e.g. a three-character name: B, M, E.
            save.write(chars[0] + ' ' + 'B' + ' ' + head_label + '\n')
            for ch in chars[1:-1]:
                save.write(ch + ' ' + 'M' + ' ' + tail_label + '\n')
            save.write(chars[-1] + ' ' + 'E' + ' ' + tail_label + '\n')

# Run name recognition and redirect the tagged output to a file.
os.system("crf_test -m model sentence.data > sent_result")

# Show the recognised name characters.  crf_test appends its prediction as
# the LAST column; the column at index 4 of the raw line is only the label
# copied from the input, so the last column is the one to inspect.
with open("sent_result", 'r', encoding='utf_8_sig') as result:
    for row in result:
        cols = row.split()
        if len(cols) >= 4 and cols[-1] in ('B', 'I'):
            print(cols[0])

你可能感兴趣的:(NLP)