经典 Python 脚本编译为带命令行参数的可执行程序——解析 JSON 并写入 Excel(xls)

不知不觉,当初写的一个小工具已经用了三年多。最近翻修了一下,顺带优化了写入方式,在此留档,方便以后遇到同类需求时直接复制代码。

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File  : 解析数据.py
# Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
# Date  : 2020/4/25
# UpDate  : 2023/03/13
import time

import xlwt
import sys
import json
from lxml import etree
import os
import copy


# 打包编译命令: pyinstaller -F -w 解析数据.py
# 测试搜索: 法格赫
def get_main_func():
    """Command-line entry point: dispatch on ``sys.argv[1]`` using ``sys.argv[2]`` as the file name.

    Sub-commands:

    * ``xls``     -- call ``deal`` on the file and print the JSON text it returns.
    * ``gen_xls`` -- call ``gen_xls`` on the file and print a status message.
    * ``gkl`` / ``gxls`` -- accepted, currently no-ops.

    Any other sub-command is silently ignored.

    :raises SystemExit: when fewer than two arguments are supplied; the exit
        payload is a JSON object with ``ret`` and ``msg`` keys.
    """
    if len(sys.argv) < 3:
        jsd = {"ret": "0", "msg": f"参数数量有误,{len(sys.argv) - 1}!=2,格式为 func filename"}
        jst = json.dumps(jsd, ensure_ascii=False)
        # sys.exit rather than the builtin exit(): the latter is injected by the
        # ``site`` module for interactive use and is not guaranteed to exist in
        # every runtime (this script is meant to be packaged as an executable).
        sys.exit(jst)
    arg1 = sys.argv[1]
    arg2 = sys.argv[2]
    if arg1 == "xls":
        print(deal(arg2))
    elif arg1 == "gen_xls":
        try:
            error = gen_xls(arg2)
        except PermissionError:
            print("待导出的xls文件已经是打开状态,请先关闭再试")
        else:
            # gen_xls returns an error message (e.g. file not found) instead of
            # raising; only report success when it returned nothing.
            print(error if error else "文件已保存!")
    elif arg1 in ("gkl", "gxls"):
        pass


def get_conten(filename):
    """Return the whole content of *filename* as raw bytes."""
    with open(filename, 'rb') as stream:
        return stream.read()


def cr(td):
    """Return every text node under *td* joined into one string, with tab noise removed."""
    pieces = td.xpath('.//text()')
    joined = "".join(pieces)
    # Order matters: strip newline+tab pairs first, then any remaining lone tabs.
    without_breaks = joined.replace("\n\t", "")
    return without_breaks.replace("\t", "")


def hr(td):
    """Return the ``href`` values of the direct ``<a>`` children of *td*, joined, with tab noise removed."""
    targets = td.xpath('./a/@href')
    joined = "".join(targets)
    # Order matters: strip newline+tab pairs first, then any remaining lone tabs.
    without_breaks = joined.replace("\n\t", "")
    return without_breaks.replace("\t", "")


def time2text(ms):
    """Format an integer millisecond timestamp as local time ``YYYY-mm-dd HH:MM:SS``.

    Any value that is not an ``int`` (empty string, ``None``, float, ...) yields ``''``.
    """
    if not isinstance(ms, int):
        return ''
    # time.localtime works in seconds; the input is in milliseconds.
    moment = time.localtime(ms / 1000)
    return time.strftime('%Y-%m-%d %H:%M:%S', moment)


# time2text(1671064641000)

def file2text(files):
    """Build a comma-separated list of ``<download url>#<file name>`` entries.

    Each item of *files* must provide the keys ``objid`` and ``filename``.
    """
    links = []
    for item in files:
        links.append(
            f'http://www.cqccms.com.cn/cqc/sys.SysUploadedFileCtl.querySysUploadedFile.do?objID={item["objid"]}#{item["filename"]}'
        )
    return ','.join(links)


def state2text(code, code_map):
    """Translate a status code into its display text.

    :param code: status code; it is converted with ``str`` before the lookup.
    :param code_map: mapping from string status codes to display text; may be
        empty or ``None``.
    :return: the mapped text, or the code itself as text (``''`` for ``None``)
        when the map has no entry for it, so one unmapped record does not
        abort the whole conversion.
    """
    key = str(code)
    fallback = '' if code is None else key
    return (code_map or {}).get(key, fallback)


def deal(filename):
    """Convert the raw query-result JSON in *filename* into the export row format.

    The file is read as UTF-8 JSON and every entry of its ``rows`` list becomes
    one dict keyed by the spreadsheet column titles.

    :param filename: path of the JSON file to read.
    :return: the list of row dicts serialised as indented, non-ASCII-escaped JSON text.
    """
    payload = json.loads(get_conten(filename).decode('utf-8'))

    # (column title, source key) pairs copied through unchanged.
    plain_columns = (
        ("证书编号", 'certinum'),
        ("申请人", 'appname'),
        ("制造商", 'manuname'),
        ("生产商", 'facname'),
        ("产品名称", 'product'),
        ("型号规格", 'model'),
        ("标准", 'teststandard'),
    )
    # (column title, source key) pairs holding millisecond timestamps.
    date_columns = (
        ("发证日期", 'issuedate'),
        ("首次发证日期", 'originalIssueDate'),
        ("证书截止日期", 'expireddate'),
    )

    records = []
    for number, row in enumerate(payload['rows'], start=1):
        record = {'序号': number}
        record.update((title, row.get(key, '')) for title, key in plain_columns)
        record.update((title, time2text(row.get(key, ''))) for title, key in date_columns)
        record["现状态"] = state2text(row.get('certistatus', '0'), row.get('statusMap', {}))
        record["证书变化时间"] = time2text(row.get('opersj', ''))
        record["原因"] = row.get('operreason', '')
        record["附件"] = file2text(row.get('uploadFiles', '') or [])
        records.append(record)

    return json.dumps(records, ensure_ascii=False, indent=4)


def deal_old(filename):
    """Legacy parser: extract the result table from an HTML file.

    Every ``<tr>`` whose class is ``even`` or ``odd`` becomes one dict; the first
    14 cells are read as text and the 15th as the link target(s) of its anchors.
    The parsed list is printed before being returned.

    :param filename: path of the HTML file to read.
    :return: the list of row dicts serialised as indented, non-ASCII-escaped JSON text.
    """
    # Titles of the cells read as plain text, in cell order.
    text_columns = (
        '序号',
        "证书编号",
        "申请人",
        "制造商",
        "生产商",
        "产品名称",
        "型号规格",
        "标准",
        "发证日期",
        "首次发证日期",
        "证书截止日期",
        "现状态",
        "证书变化时间",
        "原因",
    )
    document = etree.HTML(get_conten(filename))
    rows = []
    for tr in document.xpath('//tr[@class="even" or @class="odd"]'):
        cells = tr.xpath('./td')
        row = {title: cr(cells[pos]) for pos, title in enumerate(text_columns)}
        row["附件"] = hr(cells[14])
        rows.append(row)
    print(rows)

    return json.dumps(rows, ensure_ascii=False, indent=4)


def gen_xls_old(filename="飞利浦.json"):
    """Legacy exporter: write the row JSON in *filename* straight to an ``.xls`` sheet.

    The workbook is saved next to the input, with the input's extension replaced
    by ``.xls``. The last column (attachments) is written as HYPERLINK formula
    text when it contains any ``url#name`` entries.

    :param filename: path of the JSON file (a list of row dicts keyed by column title).
    :return: an error message when the file does not exist, otherwise ``None``.
    """
    if not os.path.exists(filename):
        return f"找不到文件{filename}"

    # splitext only strips the final extension, so dots elsewhere in the path
    # (directories, "./x.json", "a.b.json") do not truncate the output name.
    name = os.path.splitext(filename)[0] + ".xls"

    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('爬取结果')

    head = [
        '序号',
        '证书编号',
        '申请人',
        '制造商',
        '生产商',
        '产品名称',
        '型号规格',
        '标准',
        '发证日期',
        '首次发证日期',
        '证书截止日期',
        '现状态',
        '证书变化时间',
        '原因',
        '附件']
    for col, title in enumerate(head):
        worksheet.write(0, col, label=title)

    with open(filename, mode="r", encoding="utf-8") as f:
        jsd_list = json.load(f)

    last_col = len(head) - 1
    # Row 0 holds the header; enumerate gives each record its own row even when
    # two records compare equal.
    for row, jsd in enumerate(jsd_list, start=1):
        for col, title in enumerate(head):
            wtext = jsd[title]
            # Attachments are written as hyperlinks.
            if col == last_col:
                parts = []
                for file in wtext.split(','):
                    if not file:
                        continue
                    # Entries look like "<url>#<display name>"; fall back to
                    # the url when no display name is present.
                    url, _, label = file.partition('#')
                    parts.append('HYPERLINK("' + url + f'"; "{label or url}");\n')
                link = ''.join(parts)
                if link:
                    wtext = xlwt.Formula(link)

            worksheet.write(row, col, label=wtext)

    workbook.save(name)


def gen_xls(filename="飞利浦.json"):
    """Export the row JSON in *filename* to an ``.xls`` workbook beside it.

    The rows are loaded with ``get_xlsx_data`` (attachments as hyperlinks) and
    written with ``write_excel`` to a sheet named ``爬取结果``.

    :param filename: path of the JSON file to export.
    :return: an error message when the file does not exist, otherwise ``None``.
    """
    if not os.path.exists(filename):
        return f"找不到文件{filename}"

    # splitext only strips the final extension, so dots elsewhere in the path
    # (directories, "./x.json", "a.b.json") do not truncate the output name.
    name = os.path.splitext(filename)[0] + ".xls"
    xlsx_data = get_xlsx_data(filename, with_link=True)
    write_excel(xlsx_data, '爬取结果', name)


def get_xlsx_data(filename, with_link=False):
    """Load the row JSON in *filename* into a list-of-rows table for the sheet writer.

    :param filename: path of a UTF-8 JSON file holding a list of dicts keyed by
        column title.
    :param with_link: when true, a non-empty attachment cell (last column,
        comma-separated ``url#name`` entries) is replaced by an ``xlwt.Formula``
        built from HYPERLINK calls.
    :return: a list whose first item is the header row and whose remaining items
        are the data rows, each with one value per header column. Columns
        missing from a record are filled with ``''``.
    """
    head = [
        '序号',
        '证书编号',
        '申请人',
        '制造商',
        '生产商',
        '产品名称',
        '型号规格',
        '标准',
        '发证日期',
        '首次发证日期',
        '证书截止日期',
        '现状态',
        '证书变化时间',
        '原因',
        '附件']
    link_title = head[-1]
    xlsx_data = [head]
    with open(filename, mode="r", encoding="utf-8") as f:
        jsd_list = json.load(f)
    for jsd in jsd_list:
        col_list = []
        for title in head:
            # A record lacking a column yields an empty cell instead of
            # aborting the whole export.
            wtext = jsd.get(title, '')
            # Attachments are written as hyperlinks; non-text cells are left as-is.
            if with_link and title == link_title and isinstance(wtext, str):
                parts = []
                for file in wtext.split(','):
                    if not file:
                        continue
                    # Entries look like "<url>#<display name>"; fall back to
                    # the url when no display name is present.
                    url, _, label = file.partition('#')
                    parts.append('HYPERLINK("' + url + f'"; "{label or url}");\n')
                link = ''.join(parts)
                if link:
                    wtext = xlwt.Formula(link)
            col_list.append(wtext)

        xlsx_data.append(col_list)

    return xlsx_data


def get_max_col(max_list):
    """Return the largest value found in each column of a table of widths.

    :param max_list: list of rows, each row a list of per-column widths. The
        first row determines how many columns are examined.
    :return: one maximum per column; ``[]`` when *max_list* is empty.
    :raises IndexError: when a later row is shorter than the first one.
    """
    if not max_list:
        return []
    column_count = len(max_list[0])
    return [max(row[col] for row in max_list) for col in range(column_count)]


def write_excel(data, sheet_name, save_name, max_col_words=100):
    """Write a table to a single-sheet workbook with auto-sized columns.

    :param data: list of rows, each row a list of cell values.
    :param sheet_name: name of the sheet to create.
    :param save_name: path the workbook is saved to.
    :param max_col_words: upper bound for a column's width in characters;
        values above 256 are treated as 256.
    :return: None
    """
    max_col_words = min(max_col_words, 256)
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet(sheet_name, cell_overwrite_ok=True)

    # Widest cell seen so far in each column. Tracked per column (not per row)
    # so tables with fewer rows than columns, and ragged rows, work.
    col_widths = []
    for row_idx, row in enumerate(data):
        for col_idx, cell in enumerate(row):
            sheet.write(row_idx, col_idx, cell)
            # Text is measured by its gb18030 byte length (non-ASCII characters
            # count as more than one unit); anything else gets a fixed width of 20.
            width = len(cell.encode('gb18030')) if isinstance(cell, str) else 20
            if col_idx == len(col_widths):
                col_widths.append(width)
            elif width > col_widths[col_idx]:
                col_widths[col_idx] = width

    for col_idx, width in enumerate(col_widths):
        # Column width is 256 units per character; two extra characters are
        # added as padding, and the result is capped by max_col_words.
        sheet.col(col_idx).width = min(256 * (width + 2), 256 * max_col_words - 1)

    book.save(save_name)


if __name__ == '__main__':
    # Run the command-line dispatcher only when this file is executed as a script.
    get_main_func()

    # Manual debugging call, kept for reference:
    # deal('法格赫.html')

你可能感兴趣的:(python,json,开发语言)