Python实现均匀拆分大文件

Python实现均匀拆分大文件

业务中有时需要把大文件均匀拆分后分别处理。这里用 Python 实现了均匀拆分:设定拆分的目标文件数量,并指定输入路径(必须是一个目录),脚本会自动把该目录下所有文件的内容按行拆分,输出到 split 目录中。

# -*- coding: utf-8 -*-
import math
import os
import shutil
import sys

# Base directory under which the "split" output directory is created:
# the parent of sys.path[1], with a trailing path separator.
# NOTE(review): the original comment described this as the running script's
# directory, but that is normally sys.path[0] -- confirm sys.path[1] is intended.
ROOT_PATH = os.path.abspath(os.path.join(sys.path[1], "..")) + os.sep
# When the path separator is a backslash (i.e. on Windows), a hard-coded
# Desktop path is used instead; 'yourusername' is presumably a placeholder
# that must be edited before running -- TODO confirm.
if os.sep == '\\':
  ROOT_PATH = 'C:\\Users\\yourusername\\Desktop\\'
FILE_SPLIT_NUM = 10  # total number of files to produce when splitting


def savelist(path, record, seq='\n', mode='w'):
  """Write the strings in ``record`` to ``path``, joined by ``seq``.

  Args:
    path: Destination file path.
    record: Iterable of strings to write.
    seq: Separator inserted between consecutive items (default: newline).
    mode: Mode passed to ``open``. The default ``'w'`` truncates the file;
      pass ``'a'`` to append instead.
  """
  print('saving ' + path)
  # Honor the caller's mode; it used to be ignored in favor of a fixed 'w'.
  with open(path, mode) as f:
    f.write(seq.join(record))


# Streams the file line by line, so large files are never loaded into memory.
def count_file_lines(path):
  """Return the number of lines in the text file at ``path``.

  A final line without a trailing newline still counts as a line; an empty
  file yields 0.

  Args:
    path: Path of the file to count.

  Returns:
    The line count as an int.
  """
  print('count # reading ' + path)
  # Plain text mode replaces the old 'rU' mode, which Python 3.11 rejects
  # with ValueError; the with-block also closes the handle, which the
  # original never did.
  with open(path) as f:
    return sum(1 for _ in f)


def _write_part(output_dir, file_name_prefix, file_index, records):
  """Write one chunk of lines to ``<output_dir>/<file_name_prefix>_<file_index>``."""
  output_file = os.path.join(
      output_dir, '{}_{}'.format(file_name_prefix, file_index))
  # Lines keep their own newline characters, so join with an empty separator.
  savelist(output_file, records, '')


def split_files(input_dir, file_name_prefix):
  """Split the lines of all regular files in ``input_dir`` into equal parts.

  The files are read in sorted name order and treated as one continuous
  stream of lines. Each output file receives ceil(total / FILE_SPLIT_NUM)
  lines, except possibly the last one, which gets the remainder. Output files
  are named ``<file_name_prefix>_<n>`` (n starting at 1) and are written to
  ``ROOT_PATH + "split"``, which is deleted and recreated on every call.

  Args:
    input_dir: Directory whose regular files are split; subdirectories are
      ignored.
    file_name_prefix: Prefix used for the generated file names.

  Raises:
    ValueError: If FILE_SPLIT_NUM is less than 1, or if ``input_dir`` is the
      output directory or lies inside it (it would be deleted before reading).
  """
  # Validate before touching the file system: the steps below are destructive.
  if FILE_SPLIT_NUM < 1:
    raise ValueError('FILE_SPLIT_NUM must be at least 1')
  output_dir = ROOT_PATH + "split"
  input_abs = os.path.abspath(input_dir)
  output_abs = os.path.abspath(output_dir)
  if input_abs == output_abs or input_abs.startswith(output_abs + os.sep):
    raise ValueError('input_dir must not be inside the output directory: '
                     + output_dir)

  if os.path.exists(output_dir):
    shutil.rmtree(output_dir)  # remove output left over from a previous run
  os.makedirs(output_dir)
  print('input_dir:' + input_dir)
  print('output_dir:' + output_dir)

  # List the regular files once and use the same list for counting and for
  # reading. Counting every directory entry crashed on subdirectories,
  # including the freshly created output directory when it lives inside
  # input_dir.
  input_files = []
  for filename in sorted(os.listdir(input_dir)):
    filepath = os.path.join(input_dir, filename)
    if os.path.isfile(filepath):
      input_files.append((filename, filepath))

  total = sum(count_file_lines(filepath) for _, filepath in input_files)
  file_records_num = int(math.ceil(float(total) / FILE_SPLIT_NUM))

  pending = []  # lines collected for the part currently being filled
  file_index = 0
  for filename, filepath in input_files:
    with open(filepath) as f:
      print('reading # reading ' + filename)
      for line in f:
        pending.append(line)
        if len(pending) >= file_records_num:
          file_index += 1
          _write_part(output_dir, file_name_prefix, file_index, pending)
          pending = []

  # Whatever is left over forms the final, shorter part.
  if pending:
    file_index += 1
    _write_part(output_dir, file_name_prefix, file_index, pending)


# Usage: argv[1]: input_path (required)
#        argv[2]: file_split_num (optional, integer)
#        argv[3]: file_name_prefix (optional, text)
if __name__ == '__main__':
  file_name_prefix = 'part'  # default prefix for the generated files
  if len(sys.argv) < 2:
    print('sys.argv length must great than 1')
    sys.exit(-1)
  input_path = sys.argv[1]
  if len(sys.argv) > 2:
    # Rebinds the module-level constant that split_files reads.
    FILE_SPLIT_NUM = int(sys.argv[2])
  if len(sys.argv) > 3:
    # The prefix is text used in output file names, so keep it as a string;
    # converting it with int() crashed on any non-numeric prefix and broke
    # the file-name concatenation for numeric ones.
    file_name_prefix = sys.argv[3]
  split_files(input_path, file_name_prefix)

你可能感兴趣的:(文件读写,Python)