本文只提供大致思路,至于结巴(jieba)分词的具体用法以及其他相关知识,不做阐述。
(1):已经预处理过的待分词语料(语料需符合结巴分词的要求:每句一行,并且以句号结尾)
(2):一份手动标注好的自定义词典(每个词条都包含词性)。词典每行一个词条,格式为"词语 词频(可省略) 词性",例如:防火墙 nzdef
具体代码如下:
import codecs
import jieba.posseg as ps
import jieba
infile = 'network_security_yuliao.txt'  # corpus to tag (the article expects one sentence per line)
outfile = 'data_open_BIOfenci.txt'  # receives the character-level BIO output
USER_DICT = "network_security_dict.txt"  # custom dictionary loaded into jieba

# POS flags that are treated as entity types; every other flag is tagged 'O'.
# NOTE(review): the train/test split script later in this file checks 'nzvir'
# where this one checks 'vir' -- confirm which tag the dictionary really uses.
ENTITY_FLAGS = frozenset([
    'nznet', 'nzatt', 'nzdef', 'nzper', 'nzvul', 'vir',
    'nzsys', 'nzsoft', 'nzhard', 'nzalg', 'nzprot', 'prog',
])

SENTENCE_END = '。'  # a full stop closes a sentence in the corpus


def bio_lines(word, flag, entity_flags=ENTITY_FLAGS):
    """Return the output lines for one segmented token.

    Args:
        word: the token text produced by the segmenter.
        flag: the POS flag attached to the token.
        entity_flags: flags that count as entity types.

    Returns:
        A list of strings, each already terminated by a newline.  For an
        entity flag the first character is tagged ``B-<flag>`` and every
        following character ``I-<flag>``.  For any other flag each
        character is tagged ``O``, and an empty line is emitted after a
        full stop so that sentences are separated by a blank line.
    """
    if flag in entity_flags:
        return [
            ch + ' ' + ('B-' if pos == 0 else 'I-') + flag + '\n'
            for pos, ch in enumerate(word)
        ]

    lines = []
    for ch in word:
        lines.append(ch + ' ' + 'O' + '\n')
        if ch == SENTENCE_END:
            # The original wrote a lone '\r' here.  That is only a line break
            # for readers using universal-newline handling; other tools would
            # glue it onto the next token.  Emit a real empty line instead.
            lines.append('\n')
    return lines


def tag_corpus():
    """Segment every line of ``infile`` and write BIO tags to ``outfile``.

    Loads the custom dictionary, runs jieba's POS-tagging segmenter on each
    stripped input line, and writes one "character tag" line per character.
    ``outfile`` is created if missing and overwritten otherwise.
    """
    jieba.load_userdict(USER_DICT)
    # Both files are managed by the ``with`` statement so the input handle is
    # closed as well (the original never closed it).
    with codecs.open(infile, 'r', encoding='utf-8') as src, \
            open(outfile, 'w', encoding='utf-8') as dst:
        for raw_line in src:
            for token in ps.cut(raw_line.strip()):
                dst.writelines(bio_lines(token.word, token.flag))


if __name__ == "__main__":
    tag_corpus()
具体步骤:
(1):首先对语料进行 jieba 分词,分词时加载自定义词典。分词完成后,得到的每个词都有两个属性:word(词本身)和 flag(词性)。
加载自定义词典之后,词典中的词的 flag 会按照词典里给出的词性进行标注。
(2):判断每个词的 flag 是否属于自定义词典中的词性:若属于,则逐字遍历该词,首字标注为"B-词性",其余各字标注为"I-词性";若不属于,则每个字都标注为 O。最后把结果写入 txt 文件即可。
下面的代码在此基础上先把语料随机打乱,再把标注结果分别写入三个文件(训练集、测试集等),代码如下(不做具体解释):
import codecs
import jieba.posseg as ps
import jieba
import numpy as np
resultList = []  # not used by the code visible here; kept in case later code reads it
infile = 'network_security_yuliao.txt'  # corpus to tag
outfile1 = 'BIO1.txt'  # training set
outfile2 = 'BIO2.txt'  # test set
outfile3 = 'BIO3.txt'  # third split; presumably the dev/validation set -- TODO confirm
SPLIT_USER_DICT = "network_security_dict.txt"  # custom dictionary loaded into jieba

TRAIN_SIZE = 350   # shuffled sentences 0..349 go to outfile1
TEST_SIZE = 100    # shuffled sentences 350..449 go to outfile2
SAMPLE_SIZE = 500  # at most this many sentences are used; the rest go to outfile3

# POS flags that are treated as entity types; every other flag is tagged 'O'.
# NOTE(review): the single-output script earlier in this file checks 'vir'
# where this one checks 'nzvir' -- confirm which tag the dictionary really uses.
SPLIT_ENTITY_FLAGS = frozenset([
    'nznet', 'nzatt', 'nzdef', 'nzper', 'nzvul', 'nzvir',
    'nzsys', 'nzsoft', 'nzhard', 'nzalg', 'nzprot', 'prog',
])


def tagged_lines(word, flag, entity_flags=SPLIT_ENTITY_FLAGS):
    """Return the output lines for one segmented token.

    Args:
        word: the token text produced by the segmenter.
        flag: the POS flag attached to the token.
        entity_flags: flags that count as entity types.

    Returns:
        A list of newline-terminated strings.  For an entity flag the first
        character is tagged ``B-<flag>`` and the rest ``I-<flag>``.  For any
        other flag each character is tagged ``O``, and an empty line follows
        a full stop so that sentences are separated by a blank line.
    """
    if flag in entity_flags:
        return [
            ch + ' ' + ('B-' if pos == 0 else 'I-') + flag + '\n'
            for pos, ch in enumerate(word)
        ]

    lines = []
    for ch in word:
        lines.append(ch + ' ' + 'O' + '\n')
        if ch == '。':
            # The original wrote a lone '\r' here, which is only a line break
            # for readers using universal-newline handling.  Emit a real
            # empty line instead.
            lines.append('\n')
    return lines


def pick_split(x, train_size=TRAIN_SIZE, test_size=TEST_SIZE):
    """Map a shuffled sentence index to 0 (train), 1 (test) or 2 (third file)."""
    if x < train_size:
        return 0
    if x < train_size + test_size:
        return 1
    return 2


def split_and_tag_corpus():
    """Shuffle the corpus, tag it, and spread the result over three files.

    Reads and strips every line of ``infile``, shuffles the lines in place
    with ``np.random.shuffle``, then tags up to ``SAMPLE_SIZE`` of them and
    routes each sentence to ``outfile1``/``outfile2``/``outfile3`` according
    to its position after shuffling.  All three output files are created or
    overwritten.
    """
    jieba.load_userdict(SPLIT_USER_DICT)
    # Read the corpus inside a ``with`` block so the handle is closed
    # (the original never closed it).
    with codecs.open(infile, 'r', encoding='utf-8') as src:
        sentences = [raw.strip() for raw in src]
    np.random.shuffle(sentences)

    with open(outfile1, 'w', encoding='utf-8') as f, \
            open(outfile2, 'w', encoding='utf-8') as g, \
            open(outfile3, 'w', encoding='utf-8') as h:
        sinks = (f, g, h)
        # Slicing instead of ``range(500)`` keeps the same cap but no longer
        # raises IndexError when the corpus has fewer than 500 lines.
        for x, sentence in enumerate(sentences[:SAMPLE_SIZE]):
            sink = sinks[pick_split(x)]
            for token in ps.cut(sentence):
                sink.writelines(tagged_lines(token.word, token.flag))


if __name__ == "__main__":
    split_and_tag_corpus()