某证券公司数据清洗工具 20200911

功能:遍历指定目录下的多个 Excel 工作簿,提取每个文件中指定 sheet(附表1)的指定字段,并合并为一张表
某证券公司数据清洗工具 20200911_第1张图片

import numpy as np
import pandas as pd
import os
import re

# Turn on pandas' Unicode-aware display width handling (ambiguous-width and
# East Asian wide characters) for printed frames.
for _display_option in ('display.unicode.ambiguous_as_wide',
                        'display.unicode.east_asian_width'):
    pd.set_option(_display_option, True)


def main_and_outCSV(f_path):
    """Merge the workbooks found under *f_path* and export one cleaned table.

    The directory tree is walked recursively and files are split into two
    groups by name: those containing '表格' or '附表' (category 0) and all
    others (category 1).  Each non-empty group is passed to ``concat_sheet``,
    the CSV that ``concat_sheet`` writes for that category is read back, and
    the combined rows are de-duplicated, cleaned and written to
    '证券数据/data_all.xlsx'.

    Args:
        f_path: Root directory to scan.

    Raises:
        FileNotFoundError: If no candidate file is found under *f_path*.
    """
    named_files = []
    other_files = []
    for root, _dirs, files in os.walk(f_path):
        for name in files:
            # The original filter tested `'' not in name`, which is always
            # False and therefore rejected every file.
            # NOTE(review): assumed the lost literal was '~$' (the prefix of
            # Excel's temporary lock files) - confirm against the real data.
            if '~$' in name:
                continue
            bucket = named_files if ('表格' in name or '附表' in name) else other_files
            bucket.append(os.path.join(root, name))

    # concat_sheet writes category 0 to data_top_1.csv and category 1 to
    # data_top_2.csv; only read back what was produced in this run.
    csv_paths = ("证券数据/data_top_1.csv", "证券数据/data_top_2.csv")
    frames = []
    for category, paths in enumerate((named_files, other_files)):
        if not paths:
            continue
        concat_sheet(paths, category)
        frames.append(pd.read_csv(csv_paths[category], encoding='utf8', dtype='str'))
    if not frames:
        raise FileNotFoundError("no input workbooks found under {!r}".format(f_path))

    data_all = pd.concat(frames, axis=0, ignore_index=True)
    data_all = data_all.drop_duplicates(keep='first')

    # Strip surrounding whitespace in every column (the result must be
    # assigned back; .str.strip() does not modify in place).
    for col_name in data_all.columns:
        data_all[col_name] = data_all[col_name].str.strip()

    # Keep the first run of digits of the ID number; missing cells stay NaN.
    # NOTE(review): a trailing check character such as 'X' is dropped by this
    # pattern - confirm that is intended.
    id_col = '身份证/证件号码'
    data_all[id_col] = data_all[id_col].str.extract(r"\D*(\d*)", expand=False)

    # Remove whitespace and the ':', '-', '.' separators, then rebuild the
    # date as YYYY/MM/DD from the first eight remaining characters.
    date_col = '资金账户开户日期'
    compact = data_all[date_col].str.replace(r"[\s:\-.]", "", regex=True)
    data_all[date_col] = compact.str[0:4].str.cat(
        [compact.str[4:6], compact.str[6:8]], sep="/")

    data_all.to_excel('证券数据/data_all.xlsx', index=False)
    print("--------证券数据/data_all.xlsx 已导出--------")


def concat_sheet(fPath_jymx_list_NO, num):
    """Merge one category of workbooks and write it to its intermediate CSV.

    Args:
        fPath_jymx_list_NO: Paths of the workbooks belonging to the category.
        num: Category index.  0 selects the standard column names and writes
            '证券数据/data_top_1.csv'; 1 selects the alternative column names,
            renames them to the standard ones and writes
            '证券数据/data_top_2.csv'.  Any other value does nothing.
    """
    col_1 = ['营业部名称', '账户名称', '身份证/证件号码', '资金账户开户日期', '客户身份证联系地址', '客户工作单位(如有)', '客户工作单位地址(如有)', '客户联系电话']
    col_3 = ['营业部名称', '帐户名称', '身份证/证件号码', '资金账户开户日期', '客户身份证联系地址', '客户工作单位', '客户工作单位地址', '客户联系电话']
    if num == 0:
        print("--------正在处理分类1--------")
        data_top_1 = do_concat_sheet(fPath_jymx_list_NO, col_1)
        # A sheet may carry the same header twice (the original code expected
        # a second '资金账户开户日期' column).  Keep the first occurrence of
        # every header so the result has exactly the col_1 columns whether or
        # not a duplicate is present.
        data_top_1 = data_top_1.loc[:, ~data_top_1.columns.duplicated()]
        out_path = '证券数据/data_top_1.csv'
        # Make sure the output folder exists before writing.
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        data_top_1.to_csv(out_path, index=False)
        print("--------分类1处理完毕--------")
    elif num == 1:
        print("--------正在处理分类2--------")
        data_top_2 = do_concat_sheet(fPath_jymx_list_NO, col_3)
        data_top_2 = data_top_2.loc[:, ~data_top_2.columns.duplicated()]
        # Rename the alternative headers positionally to the standard ones.
        data_top_2.columns = col_1
        out_path = '证券数据/data_top_2.csv'
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        data_top_2.to_csv(out_path, index=False)
        print("--------分类2处理完毕--------")


def do_concat_sheet(fPath_jymx_list_NO, col):
    """Read sheet '附表1' from every workbook and stack the selected columns.

    For each file the first data row of the sheet is used as the header, the
    row right after it is kept, and the columns named in *col* are selected.
    Each file contributes its row exactly once.

    Args:
        fPath_jymx_list_NO: Paths of the workbooks to read.
        col: Column names to select from each sheet.

    Returns:
        A DataFrame with the selected rows of all files stacked vertically,
        or an empty DataFrame with columns *col* when no paths are given.
    """
    frames = []
    for file_name in fPath_jymx_list_NO:
        print(file_name)
        sheet = pd.read_excel(file_name, sheet_name="附表1", dtype='str')
        # The real header sits in the first data row of the sheet.
        sheet.columns = list(sheet.iloc[0])
        frames.append(sheet.iloc[1:2, :][col])
    if not frames:
        return pd.DataFrame(columns=col)
    # Concatenate once instead of growing the frame inside the loop.
    return pd.concat(frames, axis=0)



if __name__ == '__main__':
    # Set the root folder to scan here before running the script.
    f_path = 'F:'
    main_and_outCSV(f_path=f_path)


你可能感兴趣的:(zkjs)