使用jieba加载自定义词典对语料进行BIO词性标注。

本文只提供大致思路,至于结巴(jieba)的具体用法和其他相关知识,不做阐述。

标注前的准备工作:

(1):已经预处理过的需要分词的语料(需要分词的语料要符合结巴分词的要求,每句一行,并且以句号做结尾)
(2):一份手动标注好的自定义词典(包含词性),jieba自定义词典每行的格式为“词语 词频 词性”,如下图所示。(图:自定义词典示例_第1张图片)

进行词性标注处理:

具体代码如下:

import codecs
import jieba.posseg as ps
import jieba
# Corpus to annotate: one sentence per line, each ending with the full stop '。'.
infile = 'network_security_yuliao.txt'
# Destination file for the character-level BIO annotations.
outfile = 'data_open_BIOfenci.txt'
# Custom jieba dictionary that supplies the domain part-of-speech flags.
userdict = 'network_security_dict.txt'

# Part-of-speech flags that mark a domain entity. A token carrying any other
# flag is labelled 'O'.
# NOTE(review): the data-splitting script later in this article checks
# 'nzvir' where this one checks 'vir' -- confirm which spelling the custom
# dictionary really uses.
ENTITY_FLAGS = frozenset([
    'nznet', 'nzatt', 'nzdef', 'nzper', 'nzvul', 'vir',
    'nzsys', 'nzsoft', 'nzhard', 'nzalg', 'nzprot', 'prog',
])

# Character that terminates a sentence in the corpus.
SENTENCE_END = '。'


def bio_lines(word, flag):
    """Return the output lines that annotate one segmented token.

    Args:
        word: Text of the token produced by the segmenter.
        flag: Part-of-speech flag the segmenter assigned to the token.

    Returns:
        A list of strings, one ``"<char> <label>\\n"`` entry per character of
        ``word``. For a flag in ``ENTITY_FLAGS`` the first character is
        labelled ``B-<flag>`` and the remaining ones ``I-<flag>``; otherwise
        every character is labelled ``O``. An extra ``"\\n"`` entry follows
        every sentence-ending ``'。'`` so sentences are separated by a blank
        line.
    """
    if flag in ENTITY_FLAGS:
        return [
            char + ' ' + ('B-' if position == 0 else 'I-') + flag + '\n'
            for position, char in enumerate(word)
        ]

    lines = []
    for char in word:
        lines.append(char + ' ' + 'O' + '\n')
        if char == SENTENCE_END:
            # Blank line between sentences. The previous version wrote '\r'
            # here, which is not a line terminator in text mode and ended up
            # glued to the front of the next token.
            lines.append('\n')
    return lines


def main():
    """Segment every corpus line with jieba and write its BIO annotation."""
    # Load the custom dictionary so jieba reports the domain flags.
    jieba.load_userdict(userdict)
    # Both files are managed by the with-statement so they are always closed;
    # the output file is created if it does not exist yet.
    with codecs.open(infile, 'r', encoding='utf-8') as corpus, \
            open(outfile, 'w', encoding='utf-8') as out:
        for raw_line in corpus:
            for token in ps.cut(raw_line.strip()):
                out.writelines(bio_lines(token.word, token.flag))


if __name__ == '__main__':
    main()

具体步骤:
(1):首先对需要分词的语料进行jieba分词,分词时加载自定义词典。分词完成后,得到的每个词有两个属性:一个是word(词本身),一个是flag(词性)。
若加载了自定义词典,词典中的词的flag会按照你在词典中给出的词性进行标注。
(2):通过比较每个词的flag是否属于自定义词典中的词性来进行标注:若属于,则逐字遍历该词,首字标注为“B-词性”,其余各字标注为“I-词性”;若不属于,则每个字都标注为O。最后写入到txt文件中即可。

如果需要把得到的数据分成三份,分别用作训练集、测试集和偏差,则需把数据随机按照7:2:1的比例分成三份。

代码如下(不做具体解释)

import codecs
import jieba.posseg as ps
import jieba
import numpy as np
# Corpus to annotate: one sentence per line, each ending with the full stop '。'.
infile = 'network_security_yuliao.txt'
outfile1 = 'BIO1.txt'  # receives the training portion
outfile2 = 'BIO2.txt'  # receives the test portion
# Receives the remaining portion.
# NOTE(review): the article calls this part "deviation"; presumably it is a
# dev/validation set -- confirm.
outfile3 = 'BIO3.txt'
# Custom jieba dictionary that supplies the domain part-of-speech flags.
userdict = 'network_security_dict.txt'

# Part-of-speech flags that mark a domain entity. A token carrying any other
# flag is labelled 'O'.
# NOTE(review): the single-file script earlier in this article checks 'vir'
# where this one checks 'nzvir' -- confirm which spelling the custom
# dictionary really uses.
ENTITY_FLAGS = frozenset([
    'nznet', 'nzatt', 'nzdef', 'nzper', 'nzvul', 'nzvir',
    'nzsys', 'nzsoft', 'nzhard', 'nzalg', 'nzprot', 'prog',
])

# Character that terminates a sentence in the corpus.
SENTENCE_END = '。'


def bio_lines(word, flag):
    """Return the output lines that annotate one segmented token.

    Args:
        word: Text of the token produced by the segmenter.
        flag: Part-of-speech flag the segmenter assigned to the token.

    Returns:
        A list of strings, one ``"<char> <label>\\n"`` entry per character of
        ``word``. For a flag in ``ENTITY_FLAGS`` the first character is
        labelled ``B-<flag>`` and the remaining ones ``I-<flag>``; otherwise
        every character is labelled ``O``. An extra ``"\\n"`` entry follows
        every sentence-ending ``'。'`` so sentences are separated by a blank
        line.
    """
    if flag in ENTITY_FLAGS:
        return [
            char + ' ' + ('B-' if position == 0 else 'I-') + flag + '\n'
            for position, char in enumerate(word)
        ]

    lines = []
    for char in word:
        lines.append(char + ' ' + 'O' + '\n')
        if char == SENTENCE_END:
            # Blank line between sentences. The previous version wrote '\r'
            # here, which is not a line terminator in text mode and ended up
            # glued to the front of the next token.
            lines.append('\n')
    return lines


def split_bounds(total):
    """Return the end indices of the first two parts of a 7:2:1 split.

    Args:
        total: Number of items to split.

    Returns:
        A ``(train_end, test_end)`` tuple: items ``[0, train_end)`` form the
        first part, ``[train_end, test_end)`` the second, and the rest the
        third. For ``total == 500`` this is ``(350, 450)``, the thresholds
        the script previously hard-coded.
    """
    return total * 7 // 10, total * 9 // 10


def main():
    """Shuffle the corpus and write its BIO annotation as three 7:2:1 files.

    The previous version always processed exactly 500 shuffled lines, which
    raised IndexError on a smaller corpus and ignored the surplus of a larger
    one; the split is now derived from the actual number of sentences.
    """
    jieba.load_userdict(userdict)

    with codecs.open(infile, 'r', encoding='utf-8') as corpus:
        stripped = (raw_line.strip() for raw_line in corpus)
        # Blank lines produce no tokens, so drop them to keep the split
        # sizes proportional to real sentences.
        sentences = [sentence for sentence in stripped if sentence]

    np.random.shuffle(sentences)
    train_end, test_end = split_bounds(len(sentences))

    with open(outfile1, 'w', encoding='utf-8') as train_out, \
            open(outfile2, 'w', encoding='utf-8') as test_out, \
            open(outfile3, 'w', encoding='utf-8') as rest_out:
        for index, sentence in enumerate(sentences):
            # Pick the destination once per sentence instead of once per
            # written character.
            if index < train_end:
                target = train_out
            elif index < test_end:
                target = test_out
            else:
                target = rest_out
            for token in ps.cut(sentence):
                target.writelines(bio_lines(token.word, token.flag))


if __name__ == '__main__':
    main()

你可能感兴趣的:(自然语言处理)