话不多说,直接贴代码:
import pandas as pd
import os
import time
def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` with duplicated rows removed.

    Thin wrapper around ``DataFrame.drop_duplicates`` called with its
    default arguments; the result is a new frame and ``df`` itself is not
    modified.

    Args:
        df: Frame to de-duplicate.

    Returns:
        The de-duplicated frame.
    """
    return df.drop_duplicates()
def merge_execl(merge_file_dir):
    """Merge the CSV and Excel files of a directory, dropping duplicate rows.

    Regular files directly inside ``merge_file_dir`` are processed in sorted
    name order; sub-directories are not visited.

    * ``.csv`` files are combined into ``new_file_<timestamp>.csv``.
    * ``.xls`` / ``.xlsx`` files are combined into ``new_file_<timestamp>.xls``.

    Both outputs are written into ``merge_file_dir``; ``<timestamp>`` is the
    ``time.time()`` value taken when the call starts. An output file is only
    created when at least one input of that kind was found.

    Args:
        merge_file_dir: Path of the directory holding the files to merge.

    Raises:
        ValueError: If an Excel file does not have the same number of
            columns as the first Excel file that was read.
    """
    dir_name = merge_file_dir
    timer = time.time()
    csv_out = os.path.join(dir_name, 'new_file_{}.csv'.format(timer))
    # NOTE(review): writing '.xls' relies on the xlwt engine, which pandas
    # removed in 2.0; on newer pandas the Excel write below raises and this
    # name would have to become '.xlsx'. Confirm the target pandas version.
    excel_out = os.path.join(dir_name, 'new_file_{}.xls'.format(timer))

    csv_found = False
    excel_frames = []
    excel_columns = None

    # The listing is taken before any output exists, so this run's own output
    # files are never picked up as inputs. os.listdir() returns names in
    # arbitrary order; sorting makes the row order of the result reproducible.
    for name in sorted(os.listdir(dir_name)):
        path = os.path.join(dir_name, name)
        if not os.path.isfile(path):
            continue

        if name.endswith('.csv'):
            # Append header + rows; the repeated header lines are collapsed
            # by the single de-duplication pass after the loop.
            pd.read_csv(path).to_csv(csv_out, mode='a', index=False)
            csv_found = True
        elif name.endswith(('.xls', '.xlsx')):
            df = pd.read_excel(path)
            if excel_columns is None:
                excel_columns = list(df.columns)
            else:
                # Workbooks are stacked by position under the column names
                # of the first workbook, whatever their own headers say.
                df.columns = excel_columns
            excel_frames.append(df)

    if csv_found:
        # header=None keeps the header line as an ordinary first row, so it
        # survives the rewrite below, which emits no header of its own.
        combined = pd.read_csv(csv_out, header=None)
        combined.drop_duplicates().to_csv(csv_out, header=False, index=False)

    if excel_frames:
        merged = pd.concat(excel_frames, ignore_index=True)
        merged.drop_duplicates().to_excel(excel_out, index=False)
if __name__ == '__main__':
    # Folder whose CSV/Excel files are merged when this file is run as a
    # script. The path is relative, so it is resolved against the current
    # working directory.
    file_dir_name = 'citibike'
    merge_execl(file_dir_name)
输出结果
- 代码中的脚本和需要合并的 Excel 文件夹需放在同一目录下