def read_large_file(m_fr):
    """
    Generator that reads a (possibly large) file one line at a time.

    :param m_fr: an open file-like object supporting readline()
    :return: yields each line (newline included) until EOF
    """
    line = m_fr.readline()
    while line:
        yield line
        line = m_fr.readline()
# Split a file into chunks with a fixed number of lines.
# Sub-files are stored in a directory, named after the source file.
def file_split_quick(m_filepath, m_num, m_dirpath, m_num_dict):
    """
    Split a large file into sub-files of at most m_num lines each.

    If the file has m_num lines or fewer, it is simply copied into
    m_dirpath unchanged.

    :param m_filepath: path of the source file
    :param m_num: maximum number of lines per sub-file
    :param m_dirpath: directory for the sub-files (created if missing)
    :param m_num_dict: dict updated in place, mapping sub-file path -> line count
    :return: list of paths of the files produced (or the single copied file)
    :raises FileNotFoundError: if m_filepath does not exist
    """
    m_pathlist = []
    if not os.path.exists(m_filepath):
        # Raise instead of `assert 0 == 1`: asserts are stripped under -O.
        raise FileNotFoundError('error: not exist: {}'.format(m_filepath))
    if not os.path.exists(m_dirpath):
        os.makedirs(m_dirpath)
    m_filename = os.path.basename(m_filepath)
    # Count lines in Python instead of shelling out to `wc -l`: portable,
    # and `wc -l` counts newlines, so it undercounts files that lack a
    # trailing newline.
    with open(m_filepath, 'r', encoding='utf-8') as m_fr:
        m_total_num = sum(1 for _ in m_fr)
    if m_total_num > m_num:
        # Hoist the splitext call out of the loop; it is loop-invariant.
        m_root, m_ext = os.path.splitext(m_filename)
        m_count = 0
        with open(m_filepath, 'r', encoding='utf-8') as m_fr:
            while True:
                # File objects already iterate line by line; no helper needed.
                m_lines = list(islice(m_fr, m_num))
                if not m_lines:
                    break
                m_count += 1
                m_subpath = os.path.join(m_dirpath, m_root + '_' + str(m_count).zfill(3) + m_ext)
                # `with` guarantees the handle is closed even on write errors.
                with open(m_subpath, 'w', encoding='utf-8') as m_fw:
                    m_fw.writelines(m_lines)
                m_pathlist.append(m_subpath)
                m_num_dict[m_subpath] = len(m_lines)
                print('done: {} {}'.format(m_num_dict[m_subpath], m_subpath))
    else:
        m_newpath = os.path.join(m_dirpath, m_filename)
        m_pathlist.append(m_newpath)
        m_num_dict[m_newpath] = m_total_num
        shutil.copyfile(m_filepath, m_newpath)
    return m_pathlist
ChatGPT真是个好东西!
用linux命令拆分:
# 命令:split 文件路径 分割出的文件前缀
# 示例:
split test.txt test_
# 文件按行分割
# -l 设置行数
# -a 指定后缀长度(默认为2)
# --numeric-suffixes=1 指定数字起始值(会影响分割效率)
# --additional-suffix=.txt 指定分割出来的文件格式(会影响分割效率)
split -l 10000 -a 3 --numeric-suffixes=1 --additional-suffix=.txt test.txt test_
# 查看文件前10行内容
head -n 10 test_001.txt