Python 读取 XML 文件

使用 Python 处理 Sohu 数据集的 XML 文件,并将其转换为 NER(命名实体识别)任务的输入格式:

import os
import xml.dom.minidom as dom

from sklearn import model_selection
def statistics(data_path='../data/sohu/sohu.txt'):
    """Print the longest and the shortest content length found in the corpus.

    Every non-blank line of the file is treated as one record of the form
    ``content<TAB>keywords``; only the text before the first tab is measured.
    A line without any tab is measured in full.

    Args:
        data_path: Path of the tab-separated corpus file. Defaults to the
            location this script has always read from.

    Raises:
        ValueError: If the file holds no non-blank line.
    """
    lengths = []
    with open(data_path, 'r', encoding='utf-8') as fp:
        for line in fp:
            record = line.rstrip('\n')
            if not record:
                # Blank lines (e.g. a trailing one) carry no record.
                continue
            # Split on the first tab only, so stray tabs inside the keyword
            # column cannot break the record.
            content = record.split('\t', 1)[0]
            lengths.append(len(content))
    if not lengths:
        # Same exception type max() would raise, but with a usable message.
        raise ValueError('no records found in {}'.format(data_path))
    print(max(lengths), min(lengths))


def _element_text(root, tag):
    """Return the joined text of the first ``<tag>`` element below *root*.

    All text and CDATA children are concatenated, so an empty element gives
    '' and text that the parser split over several nodes is not truncated.
    Raises IndexError when no such element exists.
    """
    element = root.getElementsByTagName(tag)[0]
    text_types = (dom.Node.TEXT_NODE, dom.Node.CDATA_SECTION_NODE)
    return ''.join(
        node.data for node in element.childNodes if node.nodeType in text_types
    )


def convert_xml2txt(xml_dir='../data/sohu-dataset/', txt_dir='../data/sohu/',
                    num_files=1000):
    """Flatten the numbered XML documents into one tab-separated text file.

    Reads ``00001.xml`` .. ``<num_files>.xml`` (five-digit, zero-padded) from
    *xml_dir* and writes ``sohu.txt`` into *txt_dir*, one document per line as
    ``content<TAB>keywords``.

    Args:
        xml_dir: Directory holding the numbered XML files.
        txt_dir: Directory that receives ``sohu.txt``; created if missing.
        num_files: Number of XML files to read, starting at 1.

    Raises:
        IndexError: If a document lacks a ``keywords`` or ``content`` element.
    """
    # Content must not contain the field/record separators or spaces.
    drop_whitespace = str.maketrans('', '', '\n \t')
    # Keywords stay space-separated; line breaks and tabs inside them would
    # corrupt the one-record-per-line layout, so they become spaces.
    flatten_whitespace = str.maketrans('\n\r\t', '   ')

    if txt_dir:
        os.makedirs(txt_dir, exist_ok=True)
    with open(os.path.join(txt_dir, 'sohu.txt'), 'w', encoding='utf-8') as fp:
        for i in range(1, num_files + 1):
            name = os.path.join(xml_dir, str(i).zfill(5) + '.xml')
            root = dom.parse(name).documentElement
            keywords = _element_text(root, 'keywords').translate(flatten_whitespace)
            content = _element_text(root, 'content').translate(drop_whitespace)
            fp.write(content + '\t' + keywords + '\n')


def _find_all(substr, text):
    """Return the start index of every occurrence of *substr* in *text* (overlaps included)."""
    positions = []
    start = text.find(substr)
    while start != -1:
        positions.append(start)
        start = text.find(substr, start + 1)
    return positions


def _tag_keywords(content, keywords):
    """Return one BMES tag per character of *content* for the given keywords.

    Keywords are applied longest first and an occurrence is skipped when any
    of its characters is already tagged. Nested keywords therefore resolve in
    favour of the longer one, and overlapping matches can never leave a
    dangling B-/M-/E- tag behind.
    """
    tags = ['O'] * len(content)
    unique = {keyword.strip() for keyword in keywords} - {''}
    # Longest first; ties are broken alphabetically so the output is stable.
    for keyword in sorted(unique, key=lambda kw: (-len(kw), kw)):
        size = len(keyword)
        for begin in _find_all(keyword, content):
            end = begin + size
            if any(tag != 'O' for tag in tags[begin:end]):
                continue
            if size == 1:
                tags[begin] = 'S-Sensitive'
            else:
                tags[begin] = 'B-Sensitive'
                tags[begin + 1:end - 1] = ['M-Sensitive'] * (size - 2)
                tags[end - 1] = 'E-Sensitive'
    return tags


def _write_data_for_nlp(path, data):
    """Write samples as ``char tag`` lines, with a blank line after each sample."""
    with open(path, 'w', encoding='utf-8') as writer:
        for sample in data:
            writer.writelines(
                char + ' ' + tag + '\n'
                for char, tag in zip(sample['content'], sample['tag'])
            )
            writer.write('\n')


def deal_dataset_for_nlp(data_path='../data/sohu/sohu.txt', output_path='sohu/'):
    """Turn the tab-separated corpus into character-level NER train/dev/test files.

    Each non-blank line of *data_path* is read as ``content<TAB>keywords``,
    with whitespace-separated keywords. Every character of the content gets a
    tag: ``S-Sensitive`` for a one-character keyword, ``B-``/``M-``/
    ``E-Sensitive`` for the first/inner/last character of a longer keyword,
    and ``O`` otherwise. The samples are split 8:1:1 with a fixed random seed
    and written to ``train.txt``, ``dev.txt`` and ``test.txt``.

    Args:
        data_path: Path of the tab-separated corpus file.
        output_path: Directory for the three output files; created if missing.
    """
    total_data = []
    with open(data_path, 'r', encoding='utf-8') as fp:
        for line in fp:
            record = line.rstrip('\n')
            if not record:
                # Blank lines carry no sample.
                continue
            content, _, keywords = record.partition('\t')
            total_data.append(
                {"content": content, "tag": _tag_keywords(content, keywords.split())}
            )

    # Split 8:1:1 into train, dev and test: hold out 1/10 for test, then 1/9
    # of the remaining 9/10 for dev.
    train_dev, test = model_selection.train_test_split(total_data, test_size=1 / 10, random_state=0)
    train, dev = model_selection.train_test_split(train_dev, test_size=1 / 9, random_state=0)
    print('total size: ' + str(len(total_data)) + '  train size: ' + str(len(train)) + '  dev size: ' + str(
        len(dev)) + ' test size: ' + str(len(test)))

    if output_path:
        os.makedirs(output_path, exist_ok=True)
    _write_data_for_nlp(os.path.join(output_path, 'train.txt'), train)
    _write_data_for_nlp(os.path.join(output_path, 'dev.txt'), dev)
    _write_data_for_nlp(os.path.join(output_path, 'test.txt'), test)


# Script driver: uncomment the steps that should run.
#   convert_xml2txt()      - builds sohu.txt from the numbered XML files
#   deal_dataset_for_nlp() - tags sohu.txt and writes train/dev/test files
#   statistics()           - prints the max and min content length in sohu.txt
# convert_xml2txt()
# deal_dataset_for_nlp()
statistics()

你可能感兴趣的:(中文命名实体识别,python)