NER data format conversion and dataset splitting

Notes from a data format conversion done for an NER task:
1. Convert the tag scheme BMEWO -> BMESO -> BIOES (a sketch of this step follows below)
2. Split the dataset 8:1:1 into train, dev, and test

The dataset splitting method is adapted from: https://blog.csdn.net/allan2222/article/details/99672868
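
Step 1 is not covered by the code below, so here is a minimal sketch of the tag-scheme conversion. It assumes one 'token TAG' pair per line with blank lines between sentences, and the usual mapping (W -> S for BMEWO -> BMESO, M -> I for BMESO -> BIOES; B, E, S and O unchanged); the function name and paths are placeholders, not part of the original scripts.

def bmewo_to_bioes(input_txt, output_txt):
    # Hypothetical helper: converts BMEWO tags to BIOES by chaining the two
    # mappings W -> S (BMEWO -> BMESO) and M -> I (BMESO -> BIOES).
    prefix_map = {'W': 'S', 'M': 'I'}
    with open(input_txt, 'r', encoding='utf-8') as f:
        with open(output_txt, 'w', encoding='utf-8') as g:
            for line in f:
                if line.isspace():  # blank line = sentence boundary, keep it as-is
                    g.write('\n')
                    continue
                token, tag = line.strip().split()
                prefix, _, entity = tag.partition('-')  # 'B-PERSON' -> ('B', '-', 'PERSON'), 'O' -> ('O', '', '')
                prefix = prefix_map.get(prefix, prefix)
                g.write(token + ' ' + (prefix + '-' + entity if entity else prefix) + '\n')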

import random

def split(all_list, shuffle=False, ratio=0.8):
    # Split the list by the given ratio, optionally shuffling first; each element is one sentence
    num = len(all_list)
    offset = int(num * ratio)
    if num == 0 or offset < 1:
        return [], all_list
    if shuffle:
        random.shuffle(all_list)
    train = all_list[:offset]
    dev_test = all_list[offset:]

    return train, dev_test

def write_split(film, train, dev_test):
    # Split the data file into a train file and a dev_test file
    infilm = open(film, 'r', encoding='utf-8')
    trainfilm = open(train, 'w', encoding='utf-8')
    dev_testfilm = open(dev_test, 'w', encoding='utf-8')
    sentences = []
    for datas in infilm.readlines():
        datas = datas.replace('\n', '')
        sentences.append(datas)
    traindatas, dev_testdatas = split(sentences, shuffle=True, ratio=0.8)
    for traindata in traindatas:
        trainfilm.write(traindata + '\n')
    for dev_testdata in dev_testdatas:
        dev_testfilm.write(dev_testdata + '\n')

    infilm.close()
    trainfilm.close()
    dev_testfilm.close()
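
split() and write_split() above only produce train and dev_test. To get the 8:1:1 train/dev/test split from step 2, the held-out 20% can be split once more with ratio=0.5. A rough usage sketch, assuming one sentence per line and placeholder file paths:

# Placeholder paths: first 80% train vs. 20% dev_test,
# then the 20% is halved into dev and test (0.1 of the full set each).
write_split('./data/all.txt', './data/train.txt', './data/dev_test.txt')

with open('./data/dev_test.txt', 'r', encoding='utf-8') as f:
    rest = [line.rstrip('\n') for line in f]
dev, test = split(rest, shuffle=True, ratio=0.5)

with open('./data/dev.txt', 'w', encoding='utf-8') as f:
    for sentence in dev:
        f.write(sentence + '\n')
with open('./data/test.txt', 'w', encoding='utf-8') as f:
    for sentence in test:
        f.write(sentence + '\n')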

def Extract_tag(input_txt, output_txt):
    '''
        Input format: token--label per line, e.g. '菜 O'.
        Keep a sentence only if it contains at least one non-'O' label.
    '''

    with open(input_txt, 'r', encoding='utf-8') as f:
        with open(output_txt, 'a', encoding='utf-8') as g:
            sentence = []
            flag = 0
            lines = f.readlines()
            for line in lines:
                if line.isspace() == False:  # the current sentence has not ended yet
                    sentence.append(line.strip())  # buffer this token--label line
                else:  # sentence boundary: check whether every label in the buffer is 'O'
                    for words in sentence:
                        for i, word in enumerate(words):
                            if word.isspace() == True:  # the character after the separator starts the label
                                if words[i + 1] != 'O':
                                    flag = 1
                                    break  # found a non-'O' label, keep this sentence
                                else:
                                    continue
                        if flag == 1:  # at least one label is not 'O'
                            break
                    if flag == 1:
                        for words in sentence:
                            g.write(words + '\n')
                        g.write('\n')
                    flag = 0
                    sentence = []

def Num_tag(input_txt):
    '''
        Collect the distinct labels and their count.
        Input format: token--label per line, e.g. '菜 O'
    '''
    with open(input_txt, 'r', encoding='utf-8') as f:
        labels = []
        lines = f.readlines()
        for line in lines:
            for i, ch in enumerate(line):
                if ch.isspace() == True or ch == '\t':
                    label = line[i + 1:].strip()
                    if label and label not in labels:
                        labels.append(label)
                    break  # only the first separator matters; otherwise the trailing '\n' adds an empty label

    return labels, len(labels)

def GetWords_Tags(input_txt, output_words_txt, output_tags_txt):
    '''
        Write the words and the labels to two separate files, one sentence per line.
        Input format: token--label per line, e.g. '菜 O'
    '''
    with open(input_txt, 'r', encoding='utf-8') as f:
        lines = f.readlines()  # read all lines
        count = 0
        words = []
        tags = []
        for line in lines:
            if line.isspace() == False:  # a token--label line

                for i, word in enumerate(line):
                    if word.isspace() == True:  # split at the first separator
                        words.append(line[:i].strip())
                        tags.append(line[i:].strip())
                        count += 1
                        break
            else:  # sentence boundary: flush the buffered words and tags
                with open(output_words_txt, 'a', encoding='utf-8') as g:
                    for word in words:
                        g.write(word + ' ')
                    g.write('\n')
                words = []
                with open(output_tags_txt, 'a', encoding='utf-8') as z:
                    for tag in tags:
                        z.write(tag + ' ')
                    z.write('\n')
                tags = []

        # flush the last sentence in case the file does not end with a blank line
        with open(output_words_txt, 'a', encoding='utf-8') as g:
            for word in words:
                g.write(word + ' ')
            g.write('\n')

        with open(output_tags_txt, 'a', encoding='utf-8') as z:
            for tag in tags:
                z.write(tag + ' ')
            z.write('\n')
        print(count)  # total number of token--label pairs

def delete_space(input_txt, output_txt):
    '''
        Remove extra blank lines; each line is one sentence.
    '''

    f = open(input_txt, 'r', encoding='utf-8')
    g = open(output_txt, 'w', encoding='utf-8')
    lines = f.readlines()
    for line in lines:
        if line.isspace()==False and line!='\n':
            g.write(line)
    f.close()
    g.close()

def NewFileOfWordsAndTags(input_words_txt, input_tags_txt, output_txt):
    '''
        Merge the words file and the tags file into one file,
        where every input and output line corresponds to one sentence.
    '''
    f = open(input_words_txt, 'r', encoding='utf-8')
    g = open(input_tags_txt, 'r', encoding='utf-8')
    z = open(output_txt, 'w', encoding='utf-8')
    f_lines = f.readlines()
    g_lines = g.readlines()
    chars = []
    for i, line in enumerate(f_lines):
        if line.isspace() or line == '\n':
            continue
        else:
            line = line.strip()
            for ch in line:
                chars.append(ch)
            chars.append(' ')  # separator between the words part and the tags part
            line_ = g_lines[i].strip()
            for ch_ in line_:
                chars.append(ch_)

            for ch in chars:
                z.write(ch)
            z.write('\n')
            chars = []
    f.close()
    g.close()
    z.close()

def SplitSentence(input_txt, output_txt):
    '''
        Split each sentence back into token--label lines. The input is one sentence per line
        (all words followed by all labels). The output format is token--label, e.g. '菜 O',
        one token per line, so one sentence spans multiple lines.
        Note this approach is fragile: if 'O' (or another label string) appears as a word
        in the text, it is mistaken for the start of the label part.
    '''
    f = open(input_txt, 'r', encoding='utf-8')
    g = open(output_txt, 'w', encoding='utf-8')
    new_words = []
    new_tags = []
    new_line = []
    for line in f.readlines():  # one sentence per line, containing both the words and the labels

        words = line.strip().split(' ')  # the separator here is a space, but it may also be '\t'
        for i, word in enumerate(words):  # for each token
            if word in ('O', 'B-ORGANIZATION', 'B-PERSON', 'B-TIME', 'B-LOCATION'):
                new_words = words[:i]  # everything before the first label is a word
                new_tags = words[i:]   # everything from the first label onward is a label

                for j, w in enumerate(new_words):
                    new_line.append(w + ' ' + new_tags[j])
                for label in new_line:
                    g.write(label + '\n')
                g.write('\n')
                new_line = []
                break

    f.close()
    g.close()

def SplitSentence_new(input_txt, output_txt):
    '''
        Tab-separated variant of SplitSentence: each input line is the words part and the
        labels part separated by a tab, so a literal 'O' inside the text no longer breaks
        the split. (NewFileOfWordsAndTags above joins the two parts with a space, so the
        separator has to match.)
    '''
    f = open(input_txt, 'r', encoding='utf-8')
    g = open(output_txt, 'a', encoding='utf-8')
    lines = f.readlines()
    for line in lines:
        new_words, new_tags = line.strip().split('\t')
        new_words = new_words.split()
        new_tags = new_tags.split()
        for i, word in enumerate(new_words):
            g.write(word + ' ' + new_tags[i] + '\n')
        g.write('\n')

    f.close()
    g.close()
    return 0


if __name__ == '__main__':
    
    input_txt = './data/RenminNER/BIOES/train.txt'
    output_txt = './data/RenminNER/BIOES/new_train.txt'
    SplitSentence(input_txt, output_txt)

