实体抽取:将 BIO 标签转换为 JSON 文件的代码

# Tag prefixes of the BIESO / BMEO schemes and their BIO equivalents.
_TO_BIO_PREFIX = {"E": "I", "M": "I", "S": "B"}


def _to_bio_tag(tag):
    """Map a BIESO/BMEO tag onto the BIO scheme (E/M -> I, S -> B)."""
    head = tag[:1]
    return _TO_BIO_PREFIX.get(head, head) + tag[1:]


def _read_bio_sentences(input_file):
    """Read a character-per-line tagged file into ``(text, bio_tags)`` pairs.

    Sentences are separated by blank lines. Runs of blank lines do not
    produce empty sentences, and a final sentence that is not followed by a
    blank line is still returned.
    """
    sentences = []
    chars, tags = [], []
    # "utf-8-sig" reads plain UTF-8 unchanged and drops a leading BOM if any.
    with open(input_file, "r", encoding="utf-8-sig") as f:
        for line_no, raw in enumerate(f, start=1):
            fields = raw.split()
            if not fields:
                if chars:
                    sentences.append(("".join(chars), tags))
                    chars, tags = [], []
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{input_file}:{line_no}: expected '<char> <tag>', "
                    f"got {raw!r}"
                )
            char, tag = fields
            if len(char) != 1:
                # Entity offsets are character offsets, so every line must
                # carry exactly one character.
                raise ValueError(
                    f"{input_file}:{line_no}: expected a single character, "
                    f"got {char!r}"
                )
            chars.append(char)
            tags.append(_to_bio_tag(tag))
    if chars:
        # Flush the last sentence when the file has no trailing blank line.
        sentences.append(("".join(chars), tags))
    return sentences


def _extract_entities(text, tags):
    """Collect entity spans from BIO ``tags`` aligned with ``text``.

    A ``B-X`` tag opens a span; following ``I-X`` tags of the same type
    extend it. Any other tag (``O``, an ``I-`` of a different type, or an
    ``I-`` with no open span) closes the current span and is ignored.
    """
    spans = []  # each item is a mutable [start, end, type]
    current = None
    for idx, tag in enumerate(tags):
        # Split on the first hyphen only so types such as "PER-NAME" survive.
        prefix, sep, label = tag.partition("-")
        if sep and prefix == "B":
            current = [idx, idx, label]
            spans.append(current)
        elif sep and prefix == "I" and current is not None \
                and label == current[2]:
            current[1] = idx
        else:
            current = None
    return [
        {
            "start_idx": start,
            "end_idx": end,
            "type": label,
            "entity": text[start:end + 1],
        }
        for start, end, label in spans
    ]


def convert_biotext_to_json(input_file, save_file, format="json"):
    '''
    Convert a BIO-style tagged text file into JSON entity annotations.

    All three CRF tagging schemes (BIO, BIESO, BMEO) are accepted and are
    normalised to BIO before entities are extracted.

    Args:
        input_file: Path of a UTF-8 file with one ``<char> <tag>`` pair per
            line and a blank line between sentences.
        save_file: Path of the output file, written as UTF-8.
        format: ``"json"`` writes a single ``{"data": [...]}`` document; any
            other value writes one JSON object per line.

    Each record has the keys ``text`` and ``entities``; each entity has
    ``start_idx``, ``end_idx`` (inclusive), ``type`` and ``entity``.

    Raises:
        ValueError: If a non-blank line is not exactly ``<char> <tag>`` with
            a single-character token.
    '''
    import json
    import logging

    data = [
        {"text": text, "entities": _extract_entities(text, tags)}
        for text, tags in _read_bio_sentences(input_file)
    ]

    # Explicit UTF-8: non-ASCII text is written verbatim (ensure_ascii=False).
    with open(save_file, "w", encoding="utf-8") as f:
        if format == "json":
            json.dump({"data": data}, f, ensure_ascii=False, indent=4)
        else:
            for record in data:
                f.write(json.dumps(record, ensure_ascii=False) + "\n")

    message = f"*** 转化后的json数据保存在:{save_file} ***"
    try:
        # Prefer the host module's own ``logger`` when one is defined.
        log = logger
    except NameError:
        log = logging.getLogger(__name__)
    log.info(message)
    print(message)

你可能感兴趣的:(信息抽取,pytorch,python,json,python,开发语言)