# 记录一次做NER时的数据格式转换过程:
# 1、将BMEWO -> BMESO -> BIOES
# 2、将数据集按8:1:1划分为train、dev、test
# 数据集划分方法借鉴: https://blog.csdn.net/allan2222/article/details/99672868
import random
def split(all_list, shuffle=False, ratio=0.8):
    """Split a list into a head/tail pair by *ratio* (e.g. train vs. dev+test).

    Each element is expected to be one sentence (one line of the dataset).

    :param all_list: sequence of items to split
    :param shuffle: if True, sample the items in random order first
    :param ratio: fraction of items that go into the first part
    :return: (head, tail) — head holds the first int(len * ratio) items
    """
    num = len(all_list)
    offset = int(num * ratio)
    if num == 0 or offset < 1:
        # Nothing to split off: empty head, everything in the tail.
        return [], all_list
    if shuffle:
        # Bug fix: random.shuffle mutated the caller's list in place;
        # random.sample returns a shuffled copy and leaves the input intact.
        all_list = random.sample(all_list, num)
    train = all_list[:offset]
    dev_test = all_list[offset:]
    return train, dev_test
def write_split(film, train, dev_test):
    """Read *film* (one sentence per line) and write a shuffled 80/20 split.

    :param film: path of the source file
    :param train: path that receives 80% of the lines
    :param dev_test: path that receives the remaining 20%
    """
    with open(film, 'r', encoding='utf-8') as source:
        sentences = [raw.replace('\n', '') for raw in source.readlines()]
    train_part, rest_part = split(sentences, shuffle=True, ratio=0.8)
    with open(train, 'w', encoding='utf-8') as train_out:
        for sentence in train_part:
            train_out.write(sentence + '\n')
    with open(dev_test, 'w', encoding='utf-8') as rest_out:
        for sentence in rest_part:
            rest_out.write(sentence + '\n')
def Extract_tag(input_txt, output_txt):
    '''
    Keep only sentences that contain at least one entity label.

    Input format is one 'token label' pair per line (e.g. '菜 O'), with
    sentences separated by blank lines.  A sentence is appended to
    *output_txt* as soon as any of its labels is not 'O'.

    NOTE(review): the check inspects the single character right after the
    first whitespace in each buffered line, so a doubled separator space
    would be misread as a non-'O' label — confirm input uses one space.
    NOTE(review): a final sentence not followed by a blank line is never
    flushed — confirm input files end with a blank line.
    '''
    with open(input_txt,'r' ,encoding='utf-8') as f:
        with open(output_txt,'a',encoding='utf-8') as g:
            list = []  # buffer of 'token label' lines for the current sentence
            flag = 0   # 1 => current sentence holds a non-'O' label
            lines = f.readlines()
            for line in lines:
                if line.isspace() == False:  # still inside a sentence
                    list.append(line.strip())  # buffer this token/label line
                else:  # blank line: sentence ended, decide whether to keep it
                    for words in list:
                        for i,word in enumerate(words):
                            if word.isspace()==True:
                                # char right after the separator: 'O' => no entity
                                if words[i+1]!='O':
                                    flag=1
                                    break  # entity found in this line
                            else:
                                continue
                        if flag == 1:  # entity found; stop scanning the sentence
                            break
                    if flag == 1:  # sentence has an entity: copy it out
                        for words in list:
                            g.write(words+'\n')
                        g.write('\n')
                    flag = 0   # reset state for the next sentence
                    list = []
            g.close()  # redundant: the with-blocks already close both files
            f.close()
def Num_tag(input_txt):
    """Collect the distinct label strings from a 'token label' file.

    Each non-blank line is 'token label' (e.g. '菜 O'); the label is
    everything after the first whitespace run.

    :param input_txt: path of the labelled file
    :return: (labels, count) with labels in first-seen order

    Bug fixes vs. the original: blank lines no longer contribute a bogus
    '' label, and lines with several separator characters no longer spawn
    duplicate/partial labels (only the first split point is used).
    """
    labels = []
    with open(input_txt, 'r', encoding='utf-8') as f:
        for line in f:
            # Split once at the first whitespace run: [token, label].
            parts = line.strip().split(None, 1)
            if len(parts) < 2:
                continue  # blank or malformed line: nothing to record
            tag = parts[1].strip()
            if tag not in labels:
                labels.append(tag)
    return labels, len(labels)
def GetWords_Tags(input_txt, output_words_txt, output_tags_txt):
    """Split a 'token label' file into parallel sentence files.

    Input has one 'token label' pair per line (split at the first
    whitespace), sentences separated by blank lines.  For each sentence,
    one space-joined line of tokens is appended to *output_words_txt* and
    one line of labels to *output_tags_txt* (append mode, as before).
    Prints the total number of token/label pairs processed.

    Bug fix vs. the original: the final flush is now conditional, so an
    input that ends with a blank line no longer produces a spurious empty
    trailing line in both output files.
    """
    def _flush(words, tags):
        # Append one pending sentence (token list + label list) to the outputs.
        with open(output_words_txt, 'a', encoding='utf-8') as g:
            for word in words:
                g.write(word + ' ')
            g.write('\n')
        with open(output_tags_txt, 'a', encoding='utf-8') as z:
            for tag in tags:
                z.write(tag + ' ')
            z.write('\n')

    token_count = 0  # renamed: the original shadowed the builtin `sum`
    words = []
    tags = []
    with open(input_txt, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.isspace():
                # First whitespace char marks the token/label boundary.
                idx = next((i for i, ch in enumerate(line) if ch.isspace()), None)
                if idx is not None:
                    words.append(line[:idx].strip())
                    tags.append(line[idx:].strip())
                    token_count += 1
            else:
                # Blank line ends a sentence; flush even if it is empty,
                # matching the original per-blank-line behaviour.
                _flush(words, tags)
                words = []
                tags = []
    if words or tags:
        # Flush a trailing sentence that had no closing blank line.
        _flush(words, tags)
    print(token_count)
def delete_space(input_txt, output_txt):
    """Copy *input_txt* to *output_txt*, dropping blank/whitespace-only lines.

    Each line is expected to hold one whole sentence.
    """
    with open(input_txt, 'r', encoding='utf-8') as source, \
            open(output_txt, 'w', encoding='utf-8') as target:
        target.writelines(
            row for row in source
            if not row.isspace() and row != '\n'
        )
def NewFileOfWordsAndTags(input_words_txt, input_tags_txt, output_txt):
    """Merge parallel sentence files into one file per line.

    Line i of the output is '<stripped words line i> <stripped tags line i>'.
    Blank word lines are skipped (their tag line at the same index is unused).
    """
    with open(input_words_txt, 'r', encoding='utf-8') as words_file, \
            open(input_tags_txt, 'r', encoding='utf-8') as tags_file, \
            open(output_txt, 'w', encoding='utf-8') as merged:
        tag_lines = tags_file.readlines()
        for idx, word_line in enumerate(words_file.readlines()):
            if word_line.isspace() or word_line == '\n':
                continue
            merged.write(word_line.strip() + ' ' + tag_lines[idx].strip() + '\n')
def SplitSentence(input_txt, output_txt):
    """Turn one-sentence-per-line 'tokens... tags...' into per-token lines.

    Each input line holds all tokens followed by all tags, space-separated.
    Output format is 'token tag' per line (e.g. '菜 O'), with a blank line
    after each sentence.  The token/tag boundary is the first element that
    equals a known tag string — so a literal token 'O' in the text breaks
    the split (a known limitation noted by the original author).
    """
    tag_markers = ('O', 'B-ORGANIZATION', 'B-PERSON', 'B-TIME', 'B-LOCATION')
    with open(input_txt, 'r', encoding='utf-8') as source, \
            open(output_txt, 'w', encoding='utf-8') as target:
        for sentence in source:
            elements = sentence.strip().split(' ')  # separator may also be \t upstream
            for boundary, element in enumerate(elements):
                if element in tag_markers:
                    head = elements[:boundary]   # the tokens
                    tail = elements[boundary:]   # the tags, index-aligned
                    for pos, token in enumerate(head):
                        target.write(token + ' ' + tail[pos] + '\n')
                    target.write('\n')
                    break
def SplitSentence_new(input_txt, output_txt):
    """Tab-separated variant of SplitSentence.

    Each input line is '<tokens...>\t<tags...>' (tokens and tags each
    space-separated).  Writes one 'token tag' line per token to
    *output_txt* (append mode, as before) plus a blank line per sentence.

    :raises ValueError: if a line does not contain exactly one tab
    :return: 0 (kept for backward compatibility with existing callers)

    Bug fix vs. the original: the file handles were never closed (resource
    leak); both files are now managed by a with-statement.
    """
    with open(input_txt, 'r', encoding='utf-8') as source, \
            open(output_txt, 'a', encoding='utf-8') as target:
        for line in source:
            word_part, tag_part = line.strip().split('\t')
            tokens = word_part.split()
            labels = tag_part.split()
            for idx, token in enumerate(tokens):
                target.write(token + ' ' + labels[idx] + '\n')
            target.write('\n')
    return 0
if __name__ == '__main__':
    # Re-split the BIOES-formatted train file into per-token "word tag" lines.
    source_path = './data/RenminNER/BIOES/train.txt'
    target_path = './data/RenminNER/BIOES/new_train.txt'
    SplitSentence(source_path, target_path)