解压指定目录下的所有压缩文件(zip),并将其中文件名相同的 CSV 数据合并为同一个文件

import os
import zipfile
import pandas as pd

def merge_csv_files(zip_folder, output_folder):
    """Extract every zip archive in ``zip_folder`` and merge same-named CSVs.

    Each ``*.zip`` file directly inside ``zip_folder`` is unpacked into a
    scratch directory created under ``output_folder``. Every ``*.csv`` file at
    the top level of the extracted content is read and passed to
    ``merge_csv`` together with its file name and ``output_folder``.

    Args:
        zip_folder: Directory scanned (non-recursively) for ``.zip`` files.
        output_folder: Directory receiving the merged CSVs; created if missing.
    """
    import tempfile  # local import: the file's import header does not provide it

    os.makedirs(output_folder, exist_ok=True)

    # os.listdir order is arbitrary; sorting makes the merged row order
    # reproducible from run to run.
    for zip_name in sorted(os.listdir(zip_folder)):
        if not zip_name.endswith('.zip'):
            continue
        zip_path = os.path.join(zip_folder, zip_name)

        # A uniquely named scratch directory cannot collide with (and later
        # delete) an existing "temp" folder the user keeps in output_folder.
        temp_folder = tempfile.mkdtemp(prefix='temp', dir=output_folder)
        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(temp_folder)

            for csv_file in sorted(os.listdir(temp_folder)):
                if not csv_file.endswith('.csv'):
                    continue
                csv_path = os.path.join(temp_folder, csv_file)

                try:
                    df = pd.read_csv(csv_path, encoding='utf-8')
                except UnicodeDecodeError:
                    # latin1 accepts every byte value, so this read cannot
                    # fail on decoding; non-latin text will be mis-decoded.
                    df = pd.read_csv(csv_path, encoding='latin1')

                merge_csv(df, csv_file, output_folder)
        finally:
            # Always remove the scratch directory, even when reading or
            # merging fails, so stale files never leak into a later run.
            clean_temp_folder(temp_folder)


def merge_csv(df, csv_file, output_folder):
    """Write ``df`` to ``output_folder``, appending to a same-named CSV.

    The target file name is the base name of ``csv_file``. When that file
    does not exist yet, ``df`` is written as-is; otherwise the existing file
    is loaded, ``df`` is concatenated below it and the file is rewritten.
    The DataFrame index is never written.
    """
    target = os.path.join(output_folder, os.path.basename(csv_file))

    # First occurrence of this file name: nothing to merge with.
    if not os.path.exists(target):
        df.to_csv(target, index=False)
        return

    previous = pd.read_csv(target)
    combined = pd.concat([previous, df], ignore_index=True)
    combined.to_csv(target, index=False)

def clean_temp_folder(temp_folder):
    """Delete ``temp_folder`` together with everything inside it.

    Uses ``shutil.rmtree`` instead of a hand-written recursion. Symbolic
    links found inside the folder are removed as links and are not followed,
    so files outside ``temp_folder`` are never deleted.

    Args:
        temp_folder: Path of the directory to remove.

    Raises:
        FileNotFoundError: If ``temp_folder`` does not exist.
    """
    import shutil  # local import: the file's import header does not provide it

    shutil.rmtree(temp_folder)

# Input folder holding the zip archives, and the folder that receives the merged CSVs
zip_folder_path = '/home/philtell/data/'
output_folder_path = '/home/philtell/data/test'

# Run the merge
merge_csv_files(zip_folder_path, output_folder_path)


功能增强:支持读取 GBK 编码的中文 CSV,并且只保留第七列内容为"离线"的行

import os
import zipfile
import pandas as pd

def merge_csv_files(zip_folder, output_folder, *, encoding='gbk',
                    filter_column=6, filter_value="离线"):
    """Extract zip archives, filter their CSV rows and merge same-named CSVs.

    Each ``*.zip`` file directly inside ``zip_folder`` is unpacked into a
    scratch directory created under ``output_folder``. Every ``*.csv`` file at
    the top level of the extracted content is read with ``encoding``, reduced
    to the rows whose ``filter_column`` equals ``filter_value``, and passed to
    ``merge_csv`` together with its file name and ``output_folder``.

    Args:
        zip_folder: Directory scanned (non-recursively) for ``.zip`` files.
        output_folder: Directory receiving the merged CSVs; created if missing.
        encoding: Text encoding used to read the extracted CSV files.
        filter_column: Zero-based position of the column to test (default 6,
            i.e. the seventh column).
        filter_value: Rows are kept only when the tested column equals this.

    Raises:
        IndexError: If a CSV has no column at position ``filter_column``.
    """
    import tempfile  # local import: the file's import header does not provide it

    os.makedirs(output_folder, exist_ok=True)

    # os.listdir order is arbitrary; sorting makes the merged row order
    # reproducible from run to run.
    for zip_name in sorted(os.listdir(zip_folder)):
        if not zip_name.endswith('.zip'):
            continue
        zip_path = os.path.join(zip_folder, zip_name)

        # A uniquely named scratch directory cannot collide with (and later
        # delete) an existing "temp" folder the user keeps in output_folder.
        temp_folder = tempfile.mkdtemp(prefix='temp', dir=output_folder)
        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(temp_folder)

            for csv_file in sorted(os.listdir(temp_folder)):
                if not csv_file.endswith('.csv'):
                    continue
                csv_path = os.path.join(temp_folder, csv_file)

                df = pd.read_csv(csv_path, encoding=encoding)

                # Positional lookup: the column is selected by index, not name.
                df = df[df.iloc[:, filter_column] == filter_value]

                merge_csv(df, csv_file, output_folder)
        finally:
            # Always remove the scratch directory, even when reading or
            # filtering fails, so stale files never leak into a later run.
            clean_temp_folder(temp_folder)

def merge_csv(df, csv_file, output_folder):
    """Store ``df`` as UTF-8 CSV in ``output_folder``, merging by file name.

    The destination is named after the base name of ``csv_file``. If it
    already exists, its rows are loaded (as UTF-8) and ``df`` is appended
    below them before the file is rewritten; otherwise ``df`` is written
    directly. The DataFrame index is never written.
    """
    destination = os.path.join(output_folder, os.path.basename(csv_file))

    result = (
        pd.concat(
            [pd.read_csv(destination, encoding='utf-8'), df],
            ignore_index=True,
        )
        if os.path.exists(destination)
        else df
    )
    result.to_csv(destination, index=False, encoding='utf-8')

def clean_temp_folder(temp_folder):
    """Recursively delete ``temp_folder`` and everything inside it.

    Only real sub-directories are descended into. Every other entry,
    including a symbolic link that points at a directory, is unlinked in
    place, so nothing outside ``temp_folder`` is ever deleted.

    Args:
        temp_folder: Path of the directory to remove.

    Raises:
        FileNotFoundError: If ``temp_folder`` does not exist.
    """
    # Snapshot the listing first so entries are not deleted while the
    # directory iterator is still open.
    with os.scandir(temp_folder) as iterator:
        entries = list(iterator)

    for entry in entries:
        if entry.is_dir(follow_symlinks=False):
            clean_temp_folder(entry.path)
        else:
            # Regular files, symlinks and any other non-directory entry.
            os.remove(entry.path)

    os.rmdir(temp_folder)

# Input folder holding the zip archives, and the folder that receives the merged CSVs
zip_folder_path = '/home/philtell/data/'
output_folder_path = '/home/philtell/data/test2'
# Run the merge
merge_csv_files(zip_folder_path, output_folder_path)

将 CSV 文件从原有的编码格式转换为 UTF-8 编码格式

import os
import pandas as pd

def convert_csv_files(input_directory, output_directory, output_encoding='utf-8',
                      input_encodings=('utf-8', 'gb18030')):
    """Re-encode every CSV in ``input_directory`` into ``output_directory``.

    Each ``*.csv`` file directly inside ``input_directory`` is read by trying
    ``input_encodings`` in order and then written, without the index, as
    ``<original name without .csv>_utf8.csv`` using ``output_encoding``.

    Args:
        input_directory: Directory scanned (non-recursively) for CSV files.
        output_directory: Directory receiving the results; created if missing.
        output_encoding: Encoding used when writing the converted files.
        input_encodings: Candidate source encodings, tried in order. UTF-8 is
            tried first, so files that were readable before behave the same.

    Raises:
        ValueError: If ``input_encodings`` is empty.
        UnicodeDecodeError: If none of ``input_encodings`` can decode a file.
    """
    if not input_encodings:
        raise ValueError("input_encodings must contain at least one encoding")

    os.makedirs(output_directory, exist_ok=True)

    suffix = ".csv"
    for filename in os.listdir(input_directory):
        if not filename.endswith(suffix):
            continue

        input_filepath = os.path.join(input_directory, filename)
        # Strip only the trailing ".csv" so dotted names such as
        # "report.2023.csv" keep their full stem and do not collide.
        stem = filename[:-len(suffix)]
        output_filepath = os.path.join(output_directory, f"{stem}_utf8.csv")

        # Reading with a single fixed encoding cannot convert anything that
        # is not already in that encoding, so fall back through the list.
        last_error = None
        for candidate in input_encodings:
            try:
                df = pd.read_csv(input_filepath, encoding=candidate)
                break
            except UnicodeDecodeError as err:
                last_error = err
        else:
            raise last_error

        df.to_csv(output_filepath, index=False, encoding=output_encoding)

        print(f"Converted and saved: {output_filepath}")

# Script entry point: runs the conversion only when the file is executed directly.
if __name__ == "__main__":
    # Placeholder paths; replace with the real input and output directories.
    input_directory = "/path/to/your/input/directory"
    output_directory = "/path/to/your/output/directory"

    convert_csv_files(input_directory, output_directory)

你可能感兴趣的:(python,机器学习,数据挖掘)