#大文件处理
最近需要处理上百G的文件,速度很重要。对于读取有以下一些看法
目前我用的是
#!/usr/bin/python3
import time

# Record the wall-clock start time so the throughput of the run can be measured.
stamp = time.localtime(time.time())
print(time.strftime('%Y-%m-%d %H:%M:%S', stamp))
def ProcessLargeTextFile(filename):
    """Count runs of consecutive lines that share the same first field.

    The input is a tab-separated file with (at least) five columns; the
    first line is assumed to be a header and is skipped.  A "read" is a
    maximal run of adjacent lines whose first column is identical.
    Prints a coarse progress counter every 10**9 lines, prints the final
    count (preserving the original script's output), and returns it.

    :param filename: path to the tab-separated input file
    :return: number of distinct consecutive first-field groups
    """
    head = ''
    reads = 0
    with open(filename, 'r') as r:
        next(r)  # skip the header line
        for i, line in enumerate(r, 1):
            if i % 1000000000 == 0:
                print(i)  # progress indicator for multi-hundred-GB inputs
            # maxsplit=4: a tab inside the last field no longer raises
            # ValueError ("too many values to unpack") as the bare split did
            x, y, z, t, rest = line.split('\t', 4)
            if x != head:
                reads += 1
                head = x
    print(reads)
    return reads
# Time the counting pass over the 27.5 GB dataset, then log the finish time.
ProcessLargeTextFile('27.5G-data.txt')
finish = time.localtime(time.time())
print(time.strftime('%Y-%m-%d %H:%M:%S', finish))
这个脚本的处理速度
大概是每分钟 1.5G。
这应该是中间的字段比较等处理逻辑耽搁了时间。下面我单独测一下纯读写的速度
#!/usr/bin/python3
import time

# Log the start time up front; the end-of-run timestamp gives the elapsed time.
started = time.localtime(time.time())
print(time.strftime('%Y-%m-%d %H:%M:%S', started))
def ProcessLargeTextFile(filename, outfile, bunchsize=1000000000):
    """Reformat a five-column TSV: drop column 2, space-join the rest.

    Skips the header line, then for every input line ``x<TAB>y<TAB>z<TAB>t<TAB>rest``
    writes ``x z t rest`` (``rest`` keeps its original trailing newline).
    Output lines are buffered and flushed with one ``writelines`` call per
    batch so the write syscalls are batched.

    :param filename: path of the tab-separated input file
    :param outfile: path of the space-separated output file
    :param bunchsize: lines to buffer between flushes.  The default keeps
        the original behaviour, but note it means the whole file is held
        in memory on anything under a billion lines — pass something like
        1_000_000 to bound memory use on huge files.
    :return: total number of data lines written
    """
    bunch = []
    written = 0
    with open(filename, 'r') as r, open(outfile, 'w') as w:
        next(r)  # skip the header line
        for line in r:
            # maxsplit=4: tolerate extra tabs inside the last field
            x, y, z, t, rest = line.split('\t', 4)
            bunch.append(' '.join((x, z, t, rest)))
            if len(bunch) == bunchsize:
                w.writelines(bunch)
                written += len(bunch)
                bunch = []
                # identical to the original 'output1000000000' at the default size
                print('output%d' % bunchsize)
        w.writelines(bunch)  # flush the final partial batch
        written += len(bunch)
    return written
# Time the reformat-and-rewrite pass over the 27.5 GB dataset.
ProcessLargeTextFile('27.5G-data.txt', 'test.txt')
done = time.localtime(time.time())
print(time.strftime('%Y-%m-%d %H:%M:%S', done))
这个版本的读写速度大概是每分钟 4.6G,一天能处理 1T 左右
PS.最近拖沓了很多个星期的作业了,我会尽快补上来的。