【Python小程序】用python找出Excel文件中有缺漏的行和比较两个Excel文件的一致性

用 Python 写了一个小程序,
它有两个功能:
1. 找出Excel文件中有缺漏的行,并将有缺漏的行生成新的文件
2. 比较两个Excel文件某些属性的一致性(可以自己设置),并将不一致的行生成新的文件

import pandas as pd

import numpy as np

import os


# # Find every row that contains at least one empty cell, in one or more files

data_file_paths = ['data/titanic3.xls', 'data/titanic3-2.xls']

# Each input file gets a same-named output file in this directory.
new_file_dir = "data/null/"

for data_file_path in data_file_paths:
    # Passing the path straight to read_excel lets pandas open and close the
    # workbook itself, so no ExcelFile handle is left open.
    df = pd.read_excel(data_file_path)

    # pandas loads empty cells as NaN/NaT, which never compare equal to None,
    # so missing values have to be detected with isnull(). A row is kept as
    # soon as ANY of its cells is missing; the result is renumbered from 0.
    df_null = df[df.isnull().any(axis=1)].reset_index(drop=True)

    # Reuse the complete input file name so that names containing extra dots
    # (e.g. "report.v2.xls") are not truncated and cannot collide.
    new_file_path = new_file_dir + os.path.basename(data_file_path)
    os.makedirs(new_file_dir, exist_ok=True)

    # NOTE(review): writing the legacy .xls format needs the xlwt engine,
    # which pandas 2.x no longer supports - confirm the installed version or
    # switch the output to .xlsx.
    df_null.to_excel(new_file_path)
    print('save to new_file_path', new_file_path)


# # Check two files for consistency

data_file_paths = ['data/2020利诚签约表(核对表2.28杨美娟.xls', 'data/2020利诚签约表(核对表3.27杨美娟.xls']

df1 = pd.read_excel(data_file_paths[0])
df2 = pd.read_excel(data_file_paths[1])

# Columns to compare between the two files; list one or more column names.
checkCols = ['签约日期']


def _cells_differ(left, right):
    """Return True when two cell values differ, treating two empty cells as equal."""
    # NaN != NaN is True, so without this guard a cell that is empty in both
    # files would be reported as a difference.
    if pd.isnull(left) and pd.isnull(right):
        return False
    return left != right


# The '姓名' (name) column is used as the primary key; change it here to key on
# another column. Only the FIRST row of the second file with a given key is
# compared, and rows whose key is empty never match anything.
first_by_name = {}
for _, row2 in df2.iterrows():
    key = row2['姓名']
    if not pd.isnull(key):
        first_by_name.setdefault(key, row2)

# Collect the mismatching pairs in a list and build the frame once at the end
# (DataFrame.append was removed in pandas 2.0).
diff_rows = []
for _, row1 in df1.iterrows():
    key = row1['姓名']
    if pd.isnull(key):
        continue
    row2 = first_by_name.get(key)
    if row2 is None:
        continue
    if any(_cells_differ(row1[col], row2[col]) for col in checkCols):
        # Store the row from file 1 followed by its counterpart from file 2.
        diff_rows.extend([row1, row2])

# Columns of the first file come first, then any column only the second file has.
diff_columns = list(df1.columns) + [c for c in df2.columns if c not in df1.columns]
df_diff = pd.DataFrame(diff_rows).reindex(columns=diff_columns).reset_index(drop=True)

new_file_dir = "data/different"
os.makedirs(new_file_dir, exist_ok=True)
new_file_path = new_file_dir + "/different.xls"
# NOTE(review): writing the legacy .xls format needs the xlwt engine, which
# pandas 2.x no longer supports - confirm the installed version or switch the
# output to .xlsx.
df_diff.to_excel(new_file_path)
print('save to new_file_path', new_file_path)

重点就是这两个循环了:

for _, row in df.iterrows():
    # Empty cells are loaded as NaN/NaT, never as None, so an `== None` test
    # can never match; isnull() detects every kind of missing value.
    if row.isnull().any():
        # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
        df_null = pd.concat([df_null, row.to_frame().T], ignore_index=True)
for _, row1 in df1.iterrows():
    for _, row2 in df2.iterrows():
        # The '姓名' (name) column is treated as the primary key; change this
        # test to key on other columns (e.g. compare '楼号', '单元' and '房号'
        # of row1 against the same columns of row2).
        if row1['姓名'] == row2['姓名']:
            for col in checkCols:
                # Two empty cells count as equal (NaN != NaN would be True).
                both_empty = pd.isnull(row1[col]) and pd.isnull(row2[col])
                if not both_empty and row1[col] != row2[col]:
                    # DataFrame.append was removed in pandas 2.0; use pd.concat.
                    df_diff = pd.concat(
                        [df_diff, row1.to_frame().T, row2.to_frame().T],
                        ignore_index=True,
                    )
                    break
            # Only the first row of df2 with a matching key is compared.
            break
            

你可能感兴趣的:(Python小程序)