业务中有时需要把大文件均匀拆分成若干份后分别处理。这里用 Python 实现了均匀拆分:设定拆分后的目标文件数量和输入路径(必须是一个目录),脚本会自动完成拆分。
# -*- coding: utf-8 -*-
import math
import os
import shutil
import sys
# Base directory under which the "split" output folder is created: the
# parent of the second sys.path entry, always ending with a path separator.
# NOTE(review): the original comment described this as the directory of the
# running script; sys.path[1] is not guaranteed to be that -- confirm.
ROOT_PATH = os.path.abspath(os.path.join(sys.path[1], os.pardir)) + os.sep
# A backslash separator means Windows, where a fixed Desktop folder is used.
if os.sep == '\\':
    ROOT_PATH = 'C:\\Users\\yourusername\\Desktop\\'
FILE_SPLIT_NUM = 10  # total number of files produced by the split
def savelist(path, record, seq='\n', mode='w'):
    """Write the items of ``record`` to ``path``, joined by ``seq``.

    Args:
        path: Destination file path.
        record: Iterable of strings to write.
        seq: Separator inserted between consecutive items (default newline).
        mode: Mode handed to ``open``; ``'w'`` truncates the file,
            ``'a'`` appends to it.
    """
    print('saving ' + path)
    # Pass ``mode`` through so callers can append instead of always
    # truncating the destination.
    with open(path, mode) as f:
        f.write(seq.join(record))
def count_file_lines(path):
    """Return the number of lines in the text file at ``path``.

    The file is streamed line by line rather than read in one piece, so
    large files do not have to fit in memory.

    Args:
        path: Path of the file whose lines are counted.

    Returns:
        The number of lines; 0 for an empty file. A final line without a
        trailing newline is still counted.
    """
    print('count # reading ' + path)
    # Plain 'r' rather than 'rU': the 'U' flag was removed in Python 3.11 and
    # raises ValueError there. The with-block also guarantees the handle is
    # closed once counting is done.
    with open(path, 'r') as f:
        return sum(1 for _ in f)
def _part_path(output_dir, file_name_prefix, file_index):
    """Return the path of part number ``file_index`` inside ``output_dir``."""
    return os.path.join(output_dir,
                        '{0}_{1}'.format(file_name_prefix, file_index))


def split_files(input_dir, file_name_prefix):
    """Split the lines of all files in ``input_dir`` evenly into part files.

    The regular files directly inside ``input_dir`` are read in sorted name
    order and treated as one continuous stream of lines. The stream is cut
    into chunks of ``ceil(total_lines / FILE_SPLIT_NUM)`` lines; each chunk
    is written to ``<ROOT_PATH>split/<file_name_prefix>_<n>`` with ``n``
    starting at 1. The last part holds whatever lines remain. Any existing
    output directory is deleted first.

    Args:
        input_dir: Directory whose files are split. Subdirectories are
            ignored.
        file_name_prefix: Prefix used for the names of the part files.

    Raises:
        ValueError: If ``FILE_SPLIT_NUM`` is not positive, or if
            ``input_dir`` is the output directory or lies inside it.
    """
    # Validate before anything is deleted so a bad configuration cannot
    # wipe the previous output and then fail.
    if FILE_SPLIT_NUM < 1:
        raise ValueError('FILE_SPLIT_NUM must be a positive integer')
    output_dir = ROOT_PATH + "split"
    input_abs = os.path.normcase(os.path.abspath(input_dir))
    output_abs = os.path.normcase(os.path.abspath(output_dir))
    # The output directory is removed below; refuse to run when that would
    # destroy the input itself.
    if input_abs == output_abs or input_abs.startswith(output_abs + os.sep):
        raise ValueError('input_dir must not be inside the output directory: '
                         + output_dir)
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)  # start from an empty output directory
    os.makedirs(output_dir)
    print('input_dir:' + input_dir)
    print('output_dir:' + output_dir)

    # Resolve the list of regular files once so that the counting pass and
    # the splitting pass work on exactly the same set (subdirectories cannot
    # be opened as files and are skipped in both).
    input_files = [
        name for name in sorted(os.listdir(input_dir))
        if os.path.isfile(os.path.join(input_dir, name))
    ]
    total = sum(count_file_lines(os.path.join(input_dir, name))
                for name in input_files)
    file_records_num = int(math.ceil(float(total) / FILE_SPLIT_NUM))

    cur_records_list = []
    file_index = 0
    for filename in input_files:
        with open(os.path.join(input_dir, filename)) as f:
            print('reading # reading ' + filename)
            for line in f:
                # Records are written back joined by '', so a last line
                # without a newline would otherwise merge with the first
                # line of the next input file.
                if not line.endswith('\n'):
                    line += '\n'
                cur_records_list.append(line)
                if len(cur_records_list) >= file_records_num:
                    file_index += 1
                    savelist(
                        _part_path(output_dir, file_name_prefix, file_index),
                        cur_records_list, '')
                    cur_records_list = []
    # Whatever is still buffered forms the final, shorter part.
    if cur_records_list:
        file_index += 1
        savelist(_part_path(output_dir, file_name_prefix, file_index),
                 cur_records_list, '')
# argv[1]: input_path   argv[2]: file_split_num   argv[3]: file_name_prefix
if __name__ == '__main__':
    file_name_prefix = 'part'  # default prefix of the generated part files
    if len(sys.argv) < 2:
        print('sys.argv length must great than 1')
        sys.exit(-1)
    input_path = sys.argv[1]
    # The input has to be a directory; fail early with a clear message.
    if not os.path.isdir(input_path):
        print('input_path must be an existing directory: ' + input_path)
        sys.exit(-1)
    if len(sys.argv) > 2:
        try:
            FILE_SPLIT_NUM = int(sys.argv[2])
        except ValueError:
            print('file_split_num must be an integer: ' + sys.argv[2])
            sys.exit(-1)
        # A count below 1 cannot be used to divide the input.
        if FILE_SPLIT_NUM < 1:
            print('file_split_num must be at least 1')
            sys.exit(-1)
    if len(sys.argv) > 3:
        # The prefix is used as text in the output file names, so it stays a
        # string; converting it with int() would reject prefixes like 'part'.
        file_name_prefix = sys.argv[3]
    split_files(input_path, file_name_prefix)