pandas对比excel数据是否一致

需求:
1、先下载不同环境下的表格
2、对比两个表格数据是否一致,包括:表格的行列总数、单元格内容
3、一致返回True,不一致展示出不一致的原因

导入

import json
import logging
import operator
from core.rest_client import RestClient
# import requests
from common.analysis import *
import os
import time
import copy
from testcases.diffRes.pre_case import *
from testcases.diffRes.post_case import *
from common.deal_data import data
from common.logger import logger
from datetime import datetime, date
from datetime import timedelta

登录环境,获取下载所需的 token 与链接

"""通过Ip登陆"""
ip_39 = "https://xxxxx:443"
ip_67 = "https://xxxxx:443"
ip_address = [ip_39, ip_67]
basePath = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
data_file_path = os.path.join(basePath, "config", "setting.ini")
cur_time = datetime.now().strftime("%Y-%m-%d")
# cur_time = datetime.now().strftime("%Y-%m-%d")

def login_xbb(ipAddress):
    """Log in to the given environment and return the auth token.

    NOTE(review): `requests` is commented out in the imports above —
    presumably it reaches this scope via one of the star imports; confirm.
    """
    login_url = ipAddress + "xxxxxx"
    request_headers = {"host": "xxxxx"}
    payload = {"account": "xxxxx", "password": "xxxxxxxx", "login_style": 1}
    response = requests.post(url=login_url, json=payload, headers=request_headers, verify=False)
    return json.loads(response.content)["data"]["token"]

下载

def customer_settlement_bill_export(ipAddress, exportUrl, body, headers, method):
    """Trigger an export task, poll until it finishes, and download the file.

    Parameters
    ----------
    ipAddress : str
        Base URL of the target environment.
    exportUrl : str
        Path of the export endpoint on that environment.
    body, headers
        Payload and headers forwarded to the export request.
    method : str
        HTTP method for the export request.

    Returns the task name reported by the export list (on success this is
    also the name of the file written to the current directory).
    """
    url = ipAddress + exportUrl
    # BUG FIX: the original called logging.info("链接:", url); with no %s in
    # the format string the extra argument breaks logging's lazy formatting.
    logging.info("链接:%s", url)
    data_list_url = ipAddress + "/export/list.json"

    # Kick off the export task.
    RestClient.request(url=url, method=method, body=body, headers=headers, verify=False)

    data_list_body = {"moduleId": "", "curPage": 0, "pageSize": 10}

    def _latest_task():
        # Poll the export list; the newest task is the first entry.
        res = requests.post(url=data_list_url, data=data_list_body, headers=headers, verify=False)
        return json.loads(res.content)["data"]["dataList"][0]

    dataInfo = _latest_task()
    print(dataInfo)
    while dataInfo["status"] == "导出中":
        time.sleep(3)
        print("导出中")
        dataInfo = _latest_task()

    if dataInfo["status"] == "导出成功":
        dataInfo = _latest_task()
        # The download URL is issued against the public host; rewrite it so
        # the file is fetched from the environment we authenticated against.
        downloadUrl = dataInfo["downloadUrl"].replace("https://xxxxx", ipAddress)
        taskName = dataInfo["taskName"]
        print("downloadUrl:", downloadUrl)
        f = requests.get(url=downloadUrl, headers=headers, params={}, verify=False)
        with open(taskName, "wb") as code:
            code.write(f.content)
    else:
        print("导出失败")

    return dataInfo["taskName"]

对比

import json
import logging
import operator
from deepdiff import DeepDiff
from common.logger import logger
import re
import copy
import time
import pandas as pd

# import feather


def make_df_from_excel(file_name, nrows=1):
    """Read an Excel workbook sheet by sheet, in chunks, into DataFrames.

    Parameters
    ----------
    file_name : str
        Path of the workbook. Files may be too large to read in one go,
        so each sheet is read in chunks of at most 10000 rows.
    nrows : int
        Kept for interface compatibility; the chunk size is recomputed
        per sheet (min(10000, sheet row count)), as in the original.

    Returns
    -------
    (num, df_list)
        num : flat list with [row_count_incl_header, column_count] appended
              for every sheet, in sheet order.
        df_list : one de-duplicated DataFrame per sheet.
    """
    df_file = pd.ExcelFile(file_name)

    num = []
    df_list = []
    # A workbook may contain several sheets; every sheet must be compared.
    for sheet_name in df_file.sheet_names:
        # Column names plus the first data row (nrows counts data rows only).
        df_header = df_file.parse(nrows=1, sheet_name=sheet_name)
        # Parse the sheet once for its shape (the original parsed it twice).
        full_sheet = df_file.parse(sheet_name=sheet_name)
        rownember = full_sheet.shape[0] + 1  # +1 counts the header row
        columnnmber = full_sheet.shape[1]

        # Chunk size: at most 10000 rows per read; small sheets in one read.
        chunk_rows = 10000 if rownember >= 10000 else rownember

        # BUG FIX: the original started chunks at skiprows=1, re-reading the
        # data row already held in df_header (masked only by drop_duplicates).
        skiprows = 2  # skip the header row and the row already in df_header
        i_chunk = 0
        chunks = []
        while True:
            df_chunk = df_file.parse(nrows=chunk_rows, skiprows=skiprows,
                                     header=None, sheet_name=sheet_name)
            skiprows += chunk_rows
            # No rows left means we are past the end of the sheet.
            if not df_chunk.shape[0]:
                break
            print(f"  - chunk {i_chunk} ({df_chunk.shape[0]} rows)")
            chunks.append(df_chunk)
            i_chunk += 1  # BUG FIX: the original never advanced this counter

        if chunks:
            df_chunks = pd.concat(chunks)
            # Chunks were read with header=None; restore the real column names
            # so they align with df_header for concatenation.
            columns = {i: col for i, col in enumerate(df_header.columns.tolist())}
            df_chunks.rename(columns=columns, inplace=True)
            df = pd.concat([df_header, df_chunks])
        else:
            # Sheet with at most one data row: pd.concat([]) would raise.
            df = df_header
        df = df.drop_duplicates()  # merge into one frame, dropping duplicates
        df_list.append(df)
        num.append(rownember)
        num.append(columnnmber)
    return num, df_list


# data = read_csv_feature(filePath)


def count(filename):
    """Return [column_count, row_count] for the first sheet of an Excel file.

    The row count includes the header row (shape[0] + 1), mirroring the
    counting convention used elsewhere in this script.
    """
    # read_excel already returns a DataFrame; the original wrapped it in
    # pd.DataFrame(...) and ran a pointless in-place sort on a local copy.
    data = pd.read_excel(filename)
    rownember = data.shape[0] + 1
    columnnmber = data.shape[1]
    return [columnnmber, rownember]


def check(old_task_name, new_task_name, remarks):
    """Compare two exported Excel files sheet by sheet.

    Parameters
    ----------
    old_task_name, new_task_name : str
        Paths of the two exported workbooks.
    remarks : str
        Text written into the mismatch report for context.

    Returns True when row/column counts and cell contents all match;
    otherwise prints the counts (on a count mismatch) or writes the first
    differing record to a timestamped report file, and returns False.
    """
    timestamp = int(time.time() * 1000)
    new_num, new_data = make_df_from_excel(new_task_name, 10000)
    old_num, old_data = make_df_from_excel(old_task_name, 10000)

    # Guard clause: counts per sheet must match before comparing contents.
    if not operator.eq(new_num, old_num):
        print("新的接口的导出数据的数量:", new_num)
        print("旧的接口的导出数据的数量:", old_num)
        return False

    for j in range(len(new_data)):
        # keep_shape=True keeps rows aligned; rows with no difference become
        # all-NaN and are dropped, leaving only the mismatching rows.
        diff = old_data[j].compare(new_data[j], keep_shape=True).dropna(axis=0, how="all")
        records = diff.to_dict("records")
        if not records:
            continue
        # BUG FIX: the original built "test<timestamp>txt" (missing the dot).
        filename = "test" + str(timestamp) + ".txt"
        with open(filename, 'w', encoding="utf-8") as file:
            file.write("=========================================================")
            file.write(remarks)
            file.write(str(records[0]))
        return False
    return True
# Demo invocation: the .xlsx names are placeholders; `case_info` presumably
# comes from one of the star imports at the top of the file — TODO confirm.
result = check("xxxxx.xlsx", "xxxxxx.xlsx",case_info["remarks"])
# NOTE(review): `assert` is stripped under `python -O`; raise an exception or
# log a failure if this check must always run.
assert result is True

你可能感兴趣的:(pandas,excel,python)