python多进程处理大数据

主要用到 multiprocessing 库。思路是:先将大数据整体读入内存并切片存盘,再由多个进程分别处理各自的分片。

from multiprocessing import Pool
import math
import os

# Read the whole dataset into memory as a list of lines.
path = os.path.join(os.getcwd(), 'test.txt')
with open(path, 'r') as f:
    data = f.readlines()

processor = 4                          # number of worker processes / chunks
l_data = len(data)
size = math.ceil(l_data / processor)   # lines per chunk (last chunk may be short)

# Split the data into one file per worker: en_wiki_<i>.txt.
for i in range(processor):
    start = size * i
    end = min((i + 1) * size, l_data)  # clamp the final chunk to the data length

    filename = 'en_wiki_' + str(i) + '.txt'
    path = os.path.join(os.getcwd(), filename)
    with open(path, 'w') as f:
        # Write the whole slice at once; the original per-line loop reused
        # the outer loop variable `i`, shadowing the chunk index.
        f.writelines(data[start:end])

# Free the in-memory copy of the large dataset before forking workers.
del data, l_data


# 处理数据
def process(path1, path2, pid):
    """Process one data shard.

    Reads the shard at ``path1``, writes the processed result to ``path2``.
    ``pid`` is the logical worker index (used for logging/naming).

    NOTE: the original was named ``proess`` (typo) while the caller invokes
    ``process``, and its comment-only body was a SyntaxError; both fixed here.
    The actual processing logic is left as a placeholder.
    """
    # do something
    pass

def run(i):
    """Worker entry point: process shard ``i``.

    Builds the input/output paths for shard ``i`` and delegates to
    ``process``. The input filename must match what the splitting step
    actually wrote (``en_wiki_<i>.txt``); the original looked for
    ``en_wiki_piece_<i>.txt``, which never exists.
    """
    filename1 = 'en_wiki_' + str(i) + '.txt'
    path1 = os.path.join(os.getcwd(), filename1)

    filename2 = 'processed_wiki_piece_' + str(i) + '.txt'
    path2 = os.path.join(os.getcwd(), filename2)

    process(path1, path2, i)


# 开启多进程处理数据
p=Pool(processor)
for i in range(processor):
    p.apply_async(run, args=(i,))
    print(str(i) + ' processor started !')
    
p.close()
p.join()
print("Process over!")

你可能感兴趣的:(python,机器学习)