

官方代码:GitHub - google-research/bert: TensorFlow code and pre-trained models for BERT












标注工具还是使用 https://blog.csdn.net/u013066730/article/details/123207945?spm=1001.2014.3001.5502 这篇博客介绍的 doccano 这个软件。

1. 将原始的squad转化为jsonl格式文件

import jsonlines
import json

def convert_squad_to_doccano(input_data):
    """Flatten SQuAD articles into one doccano record per paragraph.

    Args:
        input_data: The list stored under the "data" key of a SQuAD JSON
            file. Each entry is read for its "title" and "paragraphs";
            each paragraph for its "context" and "qas".

    Returns:
        A list of dicts, one per paragraph, holding "title", "text", a
        "label" list of ``[start, end, tag]`` character spans, and one
        ``tag -> [question, question_id]`` entry per question, where
        ``tag`` is ``"qa_<index of the question in the paragraph>"``.

    Raises:
        IndexError: If a question has an empty "answers" list.
    """
    records = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            record = {
                "title": entry["title"],
                "text": paragraph["context"],
                "label": [],
            }
            for i, qa in enumerate(paragraph["qas"]):
                # Only the first reference answer is exported as a span.
                answer = qa["answers"][0]
                answer_char_start = answer["answer_start"]
                answer_char_end = answer_char_start + len(answer["text"])

                # The tag ties the labelled span to its question entry.
                qa_text = "qa" + "_" + str(i)
                record["label"].append(
                    [answer_char_start, answer_char_end, qa_text]
                )
                record[qa_text] = [qa["question"], qa["id"]]
            # Without this append nothing would ever reach the output file.
            records.append(record)
    return records


if __name__ == "__main__":
    rootPath = "/data2/PrivateExperiment/bert-master/squad/train-v1.1.json"
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(rootPath, "r", encoding="utf-8") as reader:
        input_data = json.load(reader)["data"]

    collect_info = convert_squad_to_doccano(input_data)

    # One JSON object per line, the layout imported into doccano.
    with jsonlines.open('data.jsonl', mode='w') as writer:
        for single_info in collect_info:
            writer.write(single_info)


{
	"title": "University_of_Notre_Dame",
	"text": "Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \"Venite Ad Me Omnes\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.",
	"label": [
		[515, 541, "qa_0"],
		[188, 213, "qa_1"],
		[279, 296, "qa_2"],
		[381, 420, "qa_3"],
		[92, 126, "qa_4"]
	],
	"To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?": ["qa_0", "5733be284776f41900661182"],
	"What is in front of the Notre Dame Main Building?": ["qa_1", "5733be284776f4190066117f"],
	"The Basilica of the Sacred heart at Notre Dame is beside to which structure?": ["qa_2", "5733be284776f41900661180"],
	"What is the Grotto at Notre Dame?": ["qa_3", "5733be284776f41900661181"],
	"What sits on top of the Main Building at Notre Dame?": ["qa_4", "5733be284776f4190066117e"]
}

2. jsonl数据导入doccano





3. 在doccano中标注


【Bert】(八)简易问答系统--数据介绍及标注_第7张图片

这里可以看出,我将问题和答案标识通过一个字典映射显示在右侧,标注时只需要根据问题及其对应的标识(qa_0、qa_1 等)来进行NER标注即可。这里只是我的一种解决方法,你也可以按照自己的方法来。

4. 从doccano中导出jsonl数据



5. 将jsonl格式文件转成squad格式文件



{
	"id": 152491,
	"data": "Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \"Venite Ad Me Omnes\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.",
	"label": [
		[515, 541, "qa_0"],
		[188, 213, "qa_1"],
		[279, 296, "qa_2"],
		[381, 420, "qa_3"],
		[92, 126, "qa_4"]
	],
	"title": "University_of_Notre_Dame",
	"To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?": ["qa_0", "5733be284776f41900661182"],
	"What is in front of the Notre Dame Main Building?": ["qa_1", "5733be284776f4190066117f"],
	"The Basilica of the Sacred heart at Notre Dame is beside to which structure?": ["qa_2", "5733be284776f41900661180"],
	"What is the Grotto at Notre Dame?": ["qa_3", "5733be284776f41900661181"],
	"What sits on top of the Main Building at Notre Dame?": ["qa_4", "5733be284776f4190066117e"]
}


import json

import jsonlines

def convert_doccano_to_squad(items):
    """Rebuild a SQuAD-style document from doccano JSONL records.

    Each record is read for "data" (paragraph text), "title", and
    "label" (a list of ``[start, end, tag]`` spans). Every other key
    except "id" is treated as question metadata. Two layouts are accepted:

    * ``tag: [question, question_id]``
    * ``question: [tag, question_id]``

    A metadata entry is used only if it can be matched to a labelled tag;
    unmatched entries are ignored.

    Args:
        items: Iterable of dicts, one per exported record.

    Returns:
        ``{"data": [{"title": ..., "paragraphs": [...]}, ...]}`` with
        paragraphs grouped by title in first-seen order. Each paragraph
        holds "context" and a "qas" list of
        ``{"answers": [{"answer_start", "text"}], "question", "id"}``.

    Raises:
        KeyError: If a labelled tag has no matching question entry.
    """
    reserved_keys = {"id", "data", "title", "label"}
    paragraphs_by_title = {}

    for item in items:
        paragraph_text = item["data"]
        title = item["title"]

        # tag -> (answer_start, answer text sliced from the paragraph).
        # A later span with the same tag replaces an earlier one.
        answers_by_tag = {}
        for label in item["label"]:
            start, end = int(label[0]), int(label[1])
            answers_by_tag[label[2]] = (start, paragraph_text[start:end])

        # tag -> (question text, question id), normalised from either layout.
        questions_by_tag = {}
        for key, value in item.items():
            if key in reserved_keys:
                continue
            if key in answers_by_tag:
                questions_by_tag[key] = (value[0], value[1])
            elif (
                isinstance(value, (list, tuple))
                and len(value) >= 2
                and isinstance(value[0], str)
                and value[0] in answers_by_tag
            ):
                questions_by_tag[value[0]] = (key, value[1])

        qas = []
        for tag, (answer_start, answer_text) in answers_by_tag.items():
            question_text, question_id = questions_by_tag[tag]
            qas.append({
                "answers": [{"answer_start": answer_start, "text": answer_text}],
                "question": question_text,
                "id": question_id,
            })

        paragraphs_by_title.setdefault(title, []).append(
            {"context": paragraph_text, "qas": qas}
        )

    return {
        "data": [
            {"title": title, "paragraphs": paragraphs}
            for title, paragraphs in paragraphs_by_title.items()
        ]
    }


if __name__ == "__main__":
    rootPath = "/data2/PrivateExperiment/bert-master/temp/all.jsonl"
    with open(rootPath, "r", encoding="utf-8") as f:
        # The reader is consumed fully while the file is still open.
        formatDataFinal = convert_doccano_to_squad(jsonlines.Reader(f))

    with open("squad.json", "w") as f:
        json.dump(formatDataFinal, f)
