如何将原始以word为单位进行索引标注的数据集修改为以char为单位

目录

问题描述:

问题解决:


问题描述:

原始数据集中,数据的标注是以word为单位,且计数从1开始的。如下图:

如何将原始按照word为单位索引标识的数据集修改为以char单位_第1张图片

目标是将其修改为以char为单位、从0开始计数的数据格式,如下图:

如何将原始按照word为单位索引标识的数据集修改为以char单位_第2张图片

问题解决:

# Convert an English dataset whose annotations are stored per word into a
# per-character format, so that it matches the format used for Chinese data.
path = '/home/qtxu/Sentiment-SPN/data/Camera-COQE/dev.txt'  # input file, opened for reading below
path_w = '/home/qtxu/Sentiment-SPN/data/Camera-COQE/dev_char.txt'  # output file, opened for writing below

from pdb import set_trace as stop

def obtain_index(cur_ele):
    """Parse one bracketed annotation element into word indices and text.

    ``cur_ele`` looks like ``'[1&&Canon 2&&S100]'``: space-separated
    ``<1-based word index>&&<word>`` tokens wrapped in square brackets.

    Elements that contain a ``' ,'`` separator, for example
    ``'[10&&would 11&&not , 17&&difference]'`` or
    ``"[13&&did 14&&n't , 20&&as 21&&well 22&&as]"``, are special-cased:
    everything up to and including the first comma (and the character right
    after it) is dropped, so only the trailing part is parsed.

    Returns:
        A ``(index_list, span_str)`` tuple. ``index_list`` holds the word
        indices shifted to 0-based; ``span_str`` is the words joined by single
        spaces. An element of length 2 (i.e. ``'[]'``) yields ``([], '')``.
    """
    # The emptiness check is based on the length of the element as passed in,
    # before any comma handling rewrites it.
    original_length = len(cur_ele)

    if ' ,' in cur_ele:
        _, _, after_comma = cur_ele.partition(',')
        # Skip the single character following the comma and restore the
        # opening bracket so the slice below strips a matching pair.
        cur_ele = '[' + after_comma[1:]

    if original_length == 2:
        return [], ''

    pairs = [token.split('&&') for token in cur_ele[1:-1].split(' ')]
    index_list = [int(pair[0]) - 1 for pair in pairs]
    span_str = ' '.join(pair[1] for pair in pairs)
    return index_list, span_str


def word_to_char(sentence, span, span_index):
    """Render a span as per-character ``<offset>&&<char>`` tokens.

    Args:
        sentence: The sentence text; it is split on single spaces to locate
            the span.
        span: The span text. Every character of it, spaces included, gets
            its own token in the output.
        span_index: 0-based word indices of the span; only the first entry
            is used, to find where the span starts.

    Returns:
        ``'[]'`` when ``span`` is empty, otherwise a string such as
        ``'[4&&c 5&&a 6&&m]'`` where each number is the 0-based character
        offset in ``sentence`` and consecutive characters get consecutive
        offsets.
    """
    if len(span) == 0:
        return '[]'

    first_word = span_index[0]
    prefix = ' '.join(sentence.split(' ')[:first_word])
    # A span that does not open the sentence is preceded by one separating
    # space, which has to be counted on top of the prefix length.
    base_offset = len(prefix) + (0 if first_word == 0 else 1)

    tokens = [f"{base_offset + pos}&&{ch}" for pos, ch in enumerate(span)]
    # Trailing whitespace is trimmed before the closing bracket is added.
    return '[' + ' '.join(tokens).rstrip() + ']'
   
# Convert every annotation line of the input file from word-level, 1-based
# indices to character-level, 0-based indices and write the result to path_w.
#
# Line kinds, as handled below:
#   * "sentence<TAB>label" lines are copied unchanged; the sentence is kept so
#     that the annotation lines following it can be resolved against it.
#   * Blank lines and the empty quintuple '[[];[];[];[];[]]' are copied
#     unchanged.
#   * Any other line is parsed as '[subject;object;aspect;opinion;polarity]';
#     the first four elements are rewritten with character offsets and the
#     polarity element is passed through untouched.
#
# Line kinds are told apart with explicit checks rather than a bare `except:`,
# so unrelated errors (e.g. a failed write) are no longer swallowed and
# misrouted into the annotation branch.
with open(path, 'r', encoding='utf-8') as fr, open(path_w, 'w', encoding='utf-8') as fw:
    sent = None  # most recent sentence; annotation lines refer to it
    for line in fr:
        stripped = line.strip()
        fields = stripped.split('\t')

        if len(fields) == 2:
            sent = fields[0]
            fw.write(line)
            continue

        # Blank lines used to crash the quintuple unpacking below.
        if not stripped or '[[];[];[];[];[]]' in line:
            fw.write(line)
            continue

        if sent is None:
            raise ValueError(
                f"annotation line appears before any sentence line: {line!r}"
            )

        # Drop the outer brackets, then split into the five elements.
        sub, obj, asp, op, polarity = stripped[1:-1].split(';')
        converted = []
        for element in (sub, obj, asp, op):
            word_indices, span_text = obtain_index(element)
            converted.append(word_to_char(sent, span_text, word_indices))

        fw.write('[' + ';'.join(converted) + ';' + polarity + ']\n')

你可能感兴趣的:(程序,自然语言处理,人工智能,python)