python使用pandas合并excel文件并去重复

话不多说,直接贴代码

import pandas as pd
import os
import time


def drop_duplicates(df):
    """Return ``df`` without fully duplicated rows.

    A row is dropped only when every column matches an earlier row; the
    first occurrence is kept and the input frame is not modified.
    """
    return df.drop_duplicates()

def merge_execl(merge_file_dir):
    """Merge the CSV and Excel files of a directory and drop duplicate rows.

    Every regular file directly inside ``merge_file_dir`` whose name ends
    with ``.csv`` is merged into ``new_file_<timestamp>.csv``; every file
    ending with ``.xls`` or ``.xlsx`` is merged into
    ``new_file_<timestamp>.xlsx``. Both outputs are written into
    ``merge_file_dir``. Rows identical in every column are written once.
    Frames are combined with ``pd.concat``, so columns are aligned by
    header name.

    (The ``execl`` spelling in the function name is kept so existing
    callers keep working.)

    Args:
        merge_file_dir: Path of the directory holding the files to merge.

    Returns:
        The paths of the files that were written (empty when the
        directory has no matching files).
    """
    # One timestamp shared by both outputs of this run.
    timer = time.time()
    csv_frames = []
    excel_frames = []

    # sorted() makes the merge order independent of the OS listing order.
    for file in sorted(os.listdir(merge_file_dir)):
        # os.path.join instead of a hard-coded '\\' so the lookup also
        # works on non-Windows systems.
        file_name = os.path.join(merge_file_dir, file)
        if not os.path.isfile(file_name):
            continue
        if file_name.endswith('.csv'):
            csv_frames.append(pd.read_csv(file_name))
        elif file_name.endswith(('.xls', '.xlsx')):
            excel_frames.append(pd.read_excel(file_name))

    written = []

    # Each output is built and written once, after all inputs are read,
    # instead of being re-read and rewritten for every input file.
    if csv_frames:
        new_file_name = os.path.join(
            merge_file_dir, 'new_file_{}.csv'.format(timer))
        merged = pd.concat(csv_frames, ignore_index=True).drop_duplicates()
        merged.to_csv(new_file_name, index=False)
        written.append(new_file_name)

    if excel_frames:
        # Written as .xlsx: pandas 2.0 removed the xlwt engine, so it can
        # no longer write the legacy .xls format.
        new_file_name = os.path.join(
            merge_file_dir, 'new_file_{}.xlsx'.format(timer))
        merged = pd.concat(excel_frames, ignore_index=True).drop_duplicates()
        merged.to_excel(new_file_name, index=False)
        written.append(new_file_name)

    return written


if __name__ == '__main__':
    # Folder holding the files to merge; a relative path like this one is
    # resolved against the current working directory.
    merge_execl('citibike')

输出结果

python使用pandas合并excel文件并去重复_第1张图片

  • 代码中脚本和需要合并的 Excel 文件夹需放在同一目录下

你可能感兴趣的:(python,excel,csv)