# Purpose: merge the specified fields of a specified sheet across multiple workbooks.
import numpy as np
import pandas as pd
import os
import re
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
def main_and_outCSV(f_path):
    """Merge workbook data found under *f_path* and export it to Excel.

    Steps:
      1. Walk *f_path* recursively and split the files into two groups:
         names containing '表格' or '附表', and every other file.
      2. Pass each group to ``concat_sheet`` (group index 0 and 1), which
         writes ``证券数据/data_top_1.csv`` / ``证券数据/data_top_2.csv``.
      3. Read both CSVs back as strings, stack them, and drop fully
         duplicated rows.
      4. Keep only the first run of digits in the ID-number column, strip
         surrounding whitespace from every column, remove separator
         characters from the account-opening date and reformat it as
         ``YYYY/MM/DD`` by slicing.
      5. Write the result to ``证券数据/data_all.xlsx``.

    Args:
        f_path: Root directory to scan.

    Returns:
        None. Results are written to disk.
    """
    # NOTE(review): the original filter was `'' not in name`, which is always
    # False (the empty string is contained in every string), so no file was
    # ever collected. The intended marker is not recoverable from the code;
    # '~$' (the prefix Excel gives its temporary lock files) is assumed here.
    # Confirm against the real data set.
    skip_marker = '~$'

    matched_paths = []
    unmatched_paths = []
    for root, _dirs, files in os.walk(f_path):
        for name in files:
            if skip_marker in name:
                continue
            full_path = os.path.join(root, name)
            if '表格' in name or '附表' in name:
                matched_paths.append(full_path)
            else:
                unmatched_paths.append(full_path)

    # Group 0 = matched names, group 1 = everything else.
    for category, paths in enumerate((matched_paths, unmatched_paths)):
        concat_sheet(paths, category)

    data_r_1 = pd.read_csv("证券数据/data_top_1.csv", encoding='utf8', dtype='str')
    data_r_2 = pd.read_csv("证券数据/data_top_2.csv", encoding='utf8', dtype='str')
    # ignore_index avoids a duplicated row index after stacking the two files.
    data_all = pd.concat([data_r_1, data_r_2], axis=0, ignore_index=True)
    data_all = data_all.drop_duplicates(keep='first')

    id_col = '身份证/证件号码'
    date_col = '资金账户开户日期'
    id_pattern = re.compile(r"\D*(\d*)")

    def extract_digits(value):
        # Missing cells come back from read_csv as NaN (a float); leave them.
        if not isinstance(value, str):
            return value
        return id_pattern.search(value).group(1)

    def format_date(value):
        # Same slicing as before for strings; NaN cells are passed through.
        if not isinstance(value, str):
            return value
        return value[0:4] + "/" + value[4:6] + "/" + value[6:8]

    data_all[id_col] = data_all[id_col].apply(extract_digits)

    # The stripped result must be assigned back; `.str.strip()` is not in-place.
    for col_name in data_all.columns:
        data_all[col_name] = data_all[col_name].str.strip()

    # NOTE(review): several entries look like the same space character; they
    # were presumably different whitespace characters originally - verify.
    list_strip = ['\t', ' ', ':', '-', ".", ' ', ' ']
    for token in list_strip:
        # regex=False: "." must be removed literally, not as "any character".
        data_all[date_col] = data_all[date_col].str.replace(token, "", regex=False)

    data_all[date_col] = data_all[date_col].apply(format_date)

    data_all.to_excel('证券数据/data_all.xlsx', index=False)
    print("--------证券数据/data_all.xlsx 已导出--------")
def concat_sheet(fPath_jymx_list_NO, num):
    """Merge one group of workbooks and write it to an intermediate CSV.

    Args:
        fPath_jymx_list_NO: Workbook paths belonging to the group.
        num: Group index. 0 writes ``证券数据/data_top_1.csv``; 1 writes
            ``证券数据/data_top_2.csv``; any other value does nothing.

    Returns:
        None.
    """
    standard_cols = ['营业部名称', '账户名称', '身份证/证件号码', '资金账户开户日期',
                     '客户身份证联系地址', '客户工作单位(如有)', '客户工作单位地址(如有)',
                     '客户联系电话']
    # Same names as above with an extra '资金账户开户日期2' after the date column.
    # NOTE(review): assigning these nine names to a frame selected with eight
    # labels only works if the selection yields nine columns - presumably the
    # sheets carry the date header twice; confirm against the input files.
    standard_cols_extra_date = standard_cols[:4] + ['资金账户开户日期2'] + standard_cols[4:]
    alternate_cols = ['营业部名称', '帐户名称', '身份证/证件号码', '资金账户开户日期',
                      '客户身份证联系地址', '客户工作单位', '客户工作单位地址', '客户联系电话']

    if num == 0:
        print("--------正在处理分类1--------")
        merged = do_concat_sheet(fPath_jymx_list_NO, standard_cols)
        merged.columns = standard_cols_extra_date
        # Drop the extra date column again before writing.
        merged = merged[standard_cols]
        merged.to_csv('证券数据/data_top_1.csv', index=False)
        print("--------分类1处理完毕--------")
    elif num == 1:
        print("--------正在处理分类2--------")
        merged = do_concat_sheet(fPath_jymx_list_NO, alternate_cols)
        # Rename to the standard header names so both CSVs share one schema.
        merged.columns = standard_cols
        merged.to_csv('证券数据/data_top_2.csv', index=False)
        print("--------分类2处理完毕--------")
def do_concat_sheet(fPath_jymx_list_NO, col):
    """Read sheet "附表1" from each workbook and stack the selected columns.

    For every workbook the first data row is used as the column header, the
    single row that follows it is kept, and the columns listed in *col* are
    selected. Each workbook therefore contributes at most one row.

    Args:
        fPath_jymx_list_NO: Iterable of workbook paths.
        col: Column labels to select from each sheet.

    Returns:
        A DataFrame with the per-workbook rows stacked vertically. When no
        paths are given, an empty DataFrame with columns *col* is returned.

    Raises:
        KeyError: If a sheet lacks one of the labels in *col* (from pandas).
    """
    frames = []
    for file_name in fPath_jymx_list_NO:
        print(file_name)
        sheet = pd.read_excel(file_name, sheet_name="附表1", dtype='str')
        # Row 0 of the sheet holds the real field names.
        sheet.columns = list(sheet.iloc[0])
        frames.append(sheet.iloc[1:2, :][col])

    if not frames:
        # Nothing to merge: return an empty frame instead of failing on [0].
        return pd.DataFrame(columns=list(col))

    # Concatenate once; every workbook, including the first, appears once.
    return pd.concat(frames, axis=0)
if __name__ == '__main__':
    # Script entry point: scan the hard-coded location and run the merge.
    # NOTE(review): 'F:' has no trailing separator - confirm this resolves to
    # the intended folder on the target machine.
    main_and_outCSV('F:')