NER数据扩充脚本例子

NER数据扩充脚本例子

import json
import random

# Sample annotated record: "labels" holds [start, end, tag] triples whose
# `end` is exclusive (text[0:3] is the first name in the sentence).
js = {"id": 10000, "text": "彭于晏,1982年3月24日出生于台湾省澎湖县,彭于晏毕业于不列颠哥伦比亚大学,加拿大籍华裔影视男演员、歌手", "labels": [[0, 3, "姓名"], [4, 14, "时间"], [24, 27, "姓名"]]}

# Candidate strings substituted for spans tagged "姓名".
name_list = ["DXFDXF", "DJHDJH", "DJHH"]


def augment_entity(text, labels, index, replace_text):
    """Replace one labelled span of *text* and re-align the labels.

    Args:
        text: The annotated sentence.
        labels: ``[start, end, tag]`` triples ordered by position, with an
            exclusive ``end``. Spans are assumed not to overlap; overlapping
            spans are shifted purely by list order, as described below.
        index: Position in *labels* of the span to replace.
        replace_text: String substituted for that span.

    Returns:
        A ``(new_text, new_labels)`` tuple. Labels listed before *index* are
        copied unchanged, the replaced label is resized to fit
        *replace_text*, and labels listed after *index* are shifted by the
        change in length. Neither *text* nor *labels* is modified.
    """
    start, end, tag = labels[index]
    # How much longer (or shorter, if negative) the sentence becomes.
    diff = len(replace_text) - (end - start)

    # Copy the inner lists so the result never aliases the caller's data.
    new_labels = [list(label) for label in labels[:index]]
    new_labels.append([start, start + len(replace_text), tag])
    new_labels.extend(
        [s + diff, e + diff, t] for s, e, t in labels[index + 1:]
    )

    # Splice by position rather than str.replace, so that only this one
    # occurrence changes even when the same string appears elsewhere.
    new_text = text[:start] + replace_text + text[end:]
    return new_text, new_labels


def main():
    """Augment the sample record and write one JSON object per line.

    For every "姓名" span in ``js`` a new record is produced in which that
    span is swapped for a random entry of ``name_list``; the records are
    printed and written to ``new.txt``.
    """
    print(js["id"])

    text = js["text"]
    # Sort a copy by end offset so the sample record itself stays untouched.
    labels = sorted(js["labels"], key=lambda label: float(label[1]))
    print("text:", text)
    print("labels:", labels)

    # Each entry is [label, span length, index of the label in `labels`].
    to_augment_entity = [
        [label, label[1] - label[0], i]
        for i, label in enumerate(labels)
        if label[2] == "姓名"
    ]
    print("to_augment_entity:", to_augment_entity)

    # NOTE: input is the in-file sample only. To process a corpus, pass each
    # record's text/labels through augment_entity in the same way.
    with open("new.txt", "w", encoding="utf-8") as new_file:
        for label, _length, index in to_augment_entity:
            replace_text = random.choice(name_list)
            new_text, new_labels = augment_entity(
                text, labels, index, replace_text
            )

            print("replace:", replace_text)
            print("to_replace:", text[label[0]:label[1]])
            print("new_text:", new_text)
            print("new_label:", new_labels)

            # NOTE(review): every output record gets the fixed id 1000,
            # which differs from the sample's id - confirm this is intended.
            new_dict = {"id": 1000, "text": new_text, "labels": new_labels}
            print(new_dict)
            print("===========")
            # Emit real JSON (not a Python repr) so each line can be read
            # back with json.loads; keep non-ASCII text human-readable.
            new_file.write(json.dumps(new_dict, ensure_ascii=False) + "\n")


if __name__ == "__main__":
    main()

你可能感兴趣的:(NLP,深度学习,机器学习,python)