Python并行实现XML文件转换为XLSX文件

最近需要读取一批XML文件中的数据。这批XML文件体积很大，虽说用Excel打开很方便，但读取速度特别慢，故使用Python结合pandas、xml.dom.minidom和multiprocessing包，将XML文件并行转换为XLSX文件，实现了数据的快速读取。

# xml2xlsx
import xml.dom.minidom  
import pandas as pd
import os
from multiprocessing import Pool
# Read XML file and convert it to XLSX
# Use multiprocess


def xml2excel(filename):
    """Convert one XML file into an ``.xlsx`` workbook.

    The XML file is read from the module-level ``output_path`` directory
    (despite its name, that global is the *input* directory) and the workbook
    is written to the hard-coded ``save_path`` directory, keeping the base
    name of the input file and replacing its extension with ``.xlsx``.

    Each ``<z>`` element produces one row with three columns, written to
    ``Sheet1`` without header or index:

    * the text of the first ``<s>`` element (repeated on every row),
    * the text of the ``<l>`` element belonging to the row's block,
    * the text of the ``<z>`` element itself.

    Args:
        filename: Bare file name (not a full path) of an XML file located in
            ``output_path``.

    Raises:
        IndexError: If there are ``<z>`` elements but no ``<s>`` element, or
            fewer ``<l>`` elements than blocks of rows.
        AttributeError: If one of the elements read has no child node.
    """
    print(filename + '...')
    save_path = '/home/pc/xadf'
    # Number of consecutive <z> rows that share one <l> label.
    # NOTE(review): presumably a 512 x 128 grid of values per label --
    # confirm against the XML format.
    rows_per_label = 512 * 128

    xml_report = os.path.join(output_path, filename)
    # splitext strips only the final extension, so names such as "run.1.xml"
    # and "run.2.xml" no longer collapse onto the same "run.xlsx" output.
    outputname = os.path.splitext(filename)[0]
    xls_report = os.path.join(save_path, outputname + '.xlsx')

    # Read the XML file. Using the document as a context manager unlinks the
    # DOM on exit, releasing the memory of large files promptly.
    with xml.dom.minidom.parse(xml_report) as dom:
        z = [node.firstChild.data for node in dom.getElementsByTagName('z')]
        n_rows = len(z)

        # Only touch the first <s> element when there is at least one row,
        # so a file without any <z> elements still yields an empty sheet.
        surface_tags = dom.getElementsByTagName('s')
        surface_name = (
            [surface_tags[0].firstChild.data] * n_rows if n_rows else []
        )

        label_tags = dom.getElementsByTagName('l')
        label = [
            label_tags[i // rows_per_label].firstChild.data
            for i in range(n_rows)
        ]

    # Write the XLSX file.
    all_dict = {'surface_name': surface_name, 'label': label, 'z': z}
    df = pd.DataFrame(all_dict)
    with pd.ExcelWriter(xls_report) as writer:
        # sheet_name is passed by keyword: newer pandas releases deprecate
        # positional arguments after the writer.
        df.to_excel(writer, sheet_name='Sheet1', index=False, header=False)

# Directory holding the XML files to convert. It stays at module level
# (outside the __main__ guard) because xml2excel reads it as a global, also
# inside worker processes that re-import this module under the "spawn"
# start method.
output_path = '/home/pc/akjhfkd'

# The guard prevents worker processes from re-running the pool setup when
# they import this module (required on platforms that spawn workers, such as
# Windows and macOS).
if __name__ == '__main__':
    # Skip sub-directories: they cannot be parsed and one failure would
    # abort the whole batch.
    filenames = [
        name for name in os.listdir(output_path)
        if os.path.isfile(os.path.join(output_path, name))
    ]
    # Process the files in parallel. Do not set the worker count too high,
    # otherwise the machine tends to hang.
    # pool.map blocks until every file is done; the context manager then
    # tears the pool down, also when a worker raised.
    with Pool(6) as pool:
        pool.map(xml2excel, filenames)

你可能感兴趣的:(Python,xml,python,excel)