不知不觉,当初写的一个小工具已经用了三年多了。最近翻修了一下,顺带优化了写入技术,特此留档,方便以后有同类需求时直接 copy 代码。
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File : 解析数据.py
# Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
# Date : 2020/4/25
# UpDate : 2023/03/13
import time
import xlwt
import sys
import json
from lxml import etree
import os
import copy
# 打包编译命令: pyinstaller -F -w 解析数据.py
# 测试搜索: 法格赫
def get_main_func():
    """Command-line entry point: dispatch on ``sys.argv[1]``, using ``sys.argv[2]`` as the file.

    Supported commands:

    * ``xls``     -- parse the saved JSON response and print the rows as JSON.
    * ``gen_xls`` -- convert a parsed JSON file into an ``.xls`` workbook.
    * ``gkl`` / ``gxls`` -- accepted, but currently do nothing.

    Any other command is silently ignored.

    :raises SystemExit: with a JSON error payload when fewer than two
        arguments were supplied.
    """
    if len(sys.argv) < 3:
        jsd = {"ret": "0", "msg": f"参数数量有误,{len(sys.argv) - 1}!=2,格式为 func filename"}
        jst = json.dumps(jsd, ensure_ascii=False)
        # Use sys.exit rather than the site-provided exit(): the latter is an
        # interactive-shell convenience and is not guaranteed to exist in
        # frozen (PyInstaller) builds.
        sys.exit(jst)

    arg1 = sys.argv[1]
    arg2 = sys.argv[2]
    if arg1 == "xls":
        ret = deal(arg2)
        print(ret)
    elif arg1 == "gen_xls":
        try:
            error = gen_xls(arg2)
        except PermissionError:
            print("待导出的xls文件已经是打开状态,请先关闭再试")
        else:
            # gen_xls returns a message string when the input file is missing
            # and None on success; do not claim success in the former case.
            print(error if error else "文件已保存!")
    elif arg1 in ("gkl", "gxls"):
        pass
def get_conten(filename):
    """Return the complete contents of *filename* as raw bytes.

    :param filename: path of the file to read
    :return: the file's bytes, undecoded
    """
    with open(filename, 'rb') as stream:
        return stream.read()
def cr(td):
    """Return all text beneath *td* as one string with tab noise removed.

    Every descendant text node is concatenated, then each newline+tab pair
    and each remaining tab is stripped.

    :param td: a node exposing an ``xpath`` method
    :return: the cleaned text
    """
    text = "".join(td.xpath('.//text()'))
    for noise in ("\n\t", "\t"):
        text = text.replace(noise, "")
    return text
def hr(td):
    """Return the ``href`` values of the direct ``<a>`` children of *td*, concatenated.

    Each newline+tab pair and each remaining tab is stripped from the result.

    :param td: a node exposing an ``xpath`` method
    :return: the cleaned, concatenated link text
    """
    links = "".join(td.xpath('./a/@href'))
    for noise in ("\n\t", "\t"):
        links = links.replace(noise, "")
    return links
def time2text(ms):
    """Format a millisecond timestamp as local ``YYYY-MM-DD HH:MM:SS`` text.

    :param ms: timestamp in milliseconds; only ``int`` values are converted
    :return: the formatted local time, or ``''`` for any non-int input
    """
    if not isinstance(ms, int):
        return ''
    # time.localtime expects seconds, so scale the milliseconds down first.
    local = time.localtime(ms / 1000)
    return time.strftime('%Y-%m-%d %H:%M:%S', local)
# time2text(1671064641000)
def file2text(files):
    """Build a comma-separated list of attachment download links.

    Each link has the form ``<download-url>?objID=<objid>#<filename>``, so the
    file name travels with the URL after the ``#``.

    :param files: iterable of dicts providing ``objid`` and ``filename`` keys
    :return: the links joined by ``,`` (``''`` when *files* is empty)
    """
    links = []
    for entry in files:
        links.append(
            f'http://www.cqccms.com.cn/cqc/sys.SysUploadedFileCtl.querySysUploadedFile.do?objID={entry["objid"]}#{entry["filename"]}'
        )
    return ','.join(links)
def state2text(code, code_map):
    """Translate a status code into its display text.

    :param code: status code; it is looked up by its ``str()`` form
    :param code_map: mapping from string status codes to display text; may be
        empty or ``None``
    :return: the mapped text, or *code* itself when no mapping is available
    """
    # The caller passes an empty mapping when the record carries no status
    # map; fall back to the raw code instead of raising KeyError so one
    # incomplete record cannot abort the whole export.
    if not code_map:
        return code
    return code_map.get(str(code), code)
def deal(filename):
    """Convert a saved JSON query response into a JSON list of export rows.

    The file is read as UTF-8 JSON; every entry of its ``rows`` list becomes
    one dict keyed by the export column titles, numbered from 1.

    :param filename: path of the saved JSON response
    :return: the rows serialised as an indented, non-ASCII-escaped JSON string
    """
    payload = json.loads(get_conten(filename).decode('utf-8'))

    def build_row(number, item):
        # Map one raw record onto the export columns, in column order.
        return {
            '序号': number,
            "证书编号": item.get('certinum', ''),
            "申请人": item.get('appname', ''),
            "制造商": item.get('manuname', ''),
            "生产商": item.get('facname', ''),
            "产品名称": item.get('product', ''),
            "型号规格": item.get('model', ''),
            "标准": item.get('teststandard', ''),
            "发证日期": time2text(item.get('issuedate', '')),
            "首次发证日期": time2text(item.get('originalIssueDate', '')),
            "证书截止日期": time2text(item.get('expireddate', '')),
            "现状态": state2text(item.get('certistatus', '0'), item.get('statusMap', {})),
            "证书变化时间": time2text(item.get('opersj', '')),
            "原因": item.get('operreason', ''),
            # A missing or null attachment list is treated as empty.
            "附件": file2text(item.get('uploadFiles', '') or []),
        }

    rows = [build_row(number, item)
            for number, item in enumerate(payload['rows'], start=1)]
    return json.dumps(rows, ensure_ascii=False, indent=4)
def deal_old(filename):
    """Legacy parser: extract export rows from a saved HTML result table.

    Every ``<tr>`` whose class is ``even`` or ``odd`` yields one row. The
    first fourteen cells are read as text and the fifteenth as attachment
    links. The collected rows are also printed before being returned.

    :param filename: path of the saved HTML page
    :return: the rows serialised as an indented, non-ASCII-escaped JSON string
    """
    document = etree.HTML(get_conten(filename))
    text_columns = (
        '序号',
        "证书编号",
        "申请人",
        "制造商",
        "生产商",
        "产品名称",
        "型号规格",
        "标准",
        "发证日期",
        "首次发证日期",
        "证书截止日期",
        "现状态",
        "证书变化时间",
        "原因",
    )
    records = []
    for row in document.xpath('//tr[@class="even" or @class="odd"]'):
        cells = row.xpath('./td')
        record = {title: cr(cells[pos]) for pos, title in enumerate(text_columns)}
        # The final cell holds links rather than plain text.
        record["附件"] = hr(cells[14])
        records.append(record)
    print(records)
    return json.dumps(records, ensure_ascii=False, indent=4)
def gen_xls_old(filename="飞利浦.json"):
    """Legacy exporter: write the rows of a parsed JSON file into an ``.xls`` workbook.

    The workbook is saved next to the input, with the input's extension
    replaced by ``.xls``. The last column (attachments) is written as a
    HYPERLINK formula when it is non-empty.

    :param filename: path of the JSON file holding the list of row dicts
    :return: an error message string if *filename* does not exist, else ``None``
    """
    if not os.path.exists(filename):
        return f"找不到文件{filename}"
    # splitext strips only the final extension, so dots elsewhere in the path
    # (e.g. "./out/a.json" or "v1.2/a.json") no longer truncate the name.
    name = os.path.splitext(filename)[0] + ".xls"
    # Create the workbook and its single worksheet.
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('爬取结果')
    head = [
        '序号',
        '证书编号',
        '申请人',
        '制造商',
        '生产商',
        '产品名称',
        '型号规格',
        '标准',
        '发证日期',
        '首次发证日期',
        '证书截止日期',
        '现状态',
        '证书变化时间',
        '原因',
        '附件']
    for col, title in enumerate(head):
        worksheet.write(0, col, label=title)
    with open(filename, mode="r", encoding="utf-8") as f:
        jsd_list = json.load(f)
    last_col = len(head) - 1
    # enumerate supplies the sheet row directly; looking the row up with
    # list.index() cost O(n) per cell and sent equal rows to the same sheet row.
    for row, jsd in enumerate(jsd_list, start=1):
        for col, title in enumerate(head):
            wtext = jsd[title]
            # Write the attachment column as hyperlinks.
            if col == last_col:
                link = ''
                for file in wtext.split(','):
                    if file:
                        link += 'HYPERLINK("' + file.split('#')[0] + f'"; "{file.split("#")[1]}");\n'
                if link:
                    wtext = xlwt.Formula(link)
            worksheet.write(row, col, label=wtext)
    workbook.save(name)
def gen_xls(filename="飞利浦.json"):
    """Export a parsed JSON file to an ``.xls`` workbook with attachment hyperlinks.

    The workbook is saved next to the input, with the input's extension
    replaced by ``.xls``.

    :param filename: path of the JSON file holding the list of row dicts
    :return: an error message string if *filename* does not exist, else ``None``
    """
    if not os.path.exists(filename):
        return f"找不到文件{filename}"
    # splitext strips only the final extension, so dots elsewhere in the path
    # (e.g. "./out/a.json" or "v1.2/a.json") no longer truncate the name.
    name = os.path.splitext(filename)[0] + ".xls"
    xlsx_data = get_xlsx_data(filename, with_link=True)
    write_excel(xlsx_data, '爬取结果', name)
def get_xlsx_data(filename, with_link=False):
    """Build the row-by-row table for the workbook from a parsed JSON file.

    :param filename: path of the UTF-8 JSON file holding a list of row dicts
    :param with_link: when true, a non-empty attachment cell (last column)
        is replaced by an ``xlwt.Formula`` made of HYPERLINK calls
    :return: a list of rows; the first row is the list of column titles
    """
    columns = [
        '序号',
        '证书编号',
        '申请人',
        '制造商',
        '生产商',
        '产品名称',
        '型号规格',
        '标准',
        '发证日期',
        '首次发证日期',
        '证书截止日期',
        '现状态',
        '证书变化时间',
        '原因',
        '附件']
    table = [columns]
    with open(filename, mode="r", encoding="utf-8") as source:
        records = json.load(source)
    attachment_pos = len(columns) - 1
    for record in records:
        row = []
        for pos, title in enumerate(columns):
            cell = record[title]
            if with_link and pos == attachment_pos:
                # Each attachment is stored as "<url>#<file name>".
                formula = ''.join(
                    'HYPERLINK("' + item.split('#')[0] + f'"; "{item.split("#")[1]}");\n'
                    for item in cell.split(',') if item
                )
                if formula:
                    cell = xlwt.Formula(formula)
            row.append(cell)
        table.append(row)
    return table
def get_max_col(max_list):
    """Return the largest value found in each column of a table of widths.

    :param max_list: list of rows, each a list of per-column widths; every
        row must have at least as many entries as the first row
    :return: one maximum per column of the first row, or ``[]`` when
        *max_list* is empty
    """
    # An empty table has no columns; indexing its first row would raise.
    if not max_list:
        return []
    column_count = len(max_list[0])
    return [max(row[col] for row in max_list) for col in range(column_count)]
def write_excel(data, sheet_name, save_name, max_col_words=100):
    """Write a table to an ``.xls`` file, sizing each column to its content.

    :param data: list of rows, each a list of cell values
    :param sheet_name: name of the worksheet to create
    :param save_name: path of the workbook file to write
    :param max_col_words: upper bound on a column's width in characters;
        values above 256 are clamped to 256
    :return: None
    """
    max_col_words = min(max_col_words, 256)
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet(sheet_name, cell_overwrite_ok=True)

    # The width bookkeeping must be sized by the number of COLUMNS. Sizing it
    # by the number of rows raised IndexError whenever the table had fewer
    # rows than columns, and touched non-existent columns when it had more.
    column_count = max((len(row) for row in data), default=0)
    col_list = []  # per-row list of the width needed by each column
    for row_num, row in enumerate(data):
        col_num = [0] * column_count
        for j, value in enumerate(row):
            sheet.write(row_num, j, value)
            # Strings are measured by their gb18030 byte length (presumably
            # so that CJK characters count wider than ASCII); anything else
            # gets a fixed width of 20.
            col_num[j] = len(value.encode('gb18030')) if isinstance(value, str) else 20
        col_list.append(col_num)

    # Largest width needed by each column; nothing to size for an empty table.
    col_max_num = get_max_col(col_list) if col_list else []
    for i, width in enumerate(col_max_num):
        # 256 units per character, plus two characters of padding so the
        # cells do not look cramped, capped at max_col_words characters.
        sheet.col(i).width = min(256 * (width + 2), 256 * max_col_words - 1)
    book.save(save_name)
if __name__ == "__main__":
    # Run the command-line dispatcher only when executed as a script.
    get_main_func()
# deal('法格赫.html')