I recently came across a Python implementation of massive-data processing (see http://blog.csdn.net/quicktest/article/details/7453189#comments). The main problem in that article is that the divide-and-conquer step is implemented incorrectly: the "improved" version processes the whole file directly, which will overflow memory on genuinely large inputs. Here I instead use a hash function to partition the original file into small pieces before processing (for more background on massive-data processing, see: http://blog.csdn.net/hackbuteer1/article/details/7622869 and http://blog.csdn.net/v_july_v/article/details/7382693). The usual techniques for big-data problems are divide and conquer, dictionaries, trie trees, and so on. The implementation in this article uses two of these general-purpose methods, divide and conquer plus a dictionary (hash_map in C++), and it is precisely that generality that makes the approach worth studying.
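To see why hashing makes the split safe, note that every occurrence of the same IP string hashes to the same value, so hash(ip) % N always routes it to the same small file and the per-file counts stay exact. A minimal sketch of that property (the bucket count 100 is just for illustration, matching the split count used later):

buckets = 100
# identical strings always hash to the same value within one process,
# so repeated occurrences of one IP land in the same split file
print hash("192.168.0.1") % buckets
print hash("192.168.0.1") % buckets    # prints the same bucket index as the line above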
Problem: given a massive log file, extract the IP that visited Baidu the most times on a given day.
The source code follows; corrections are welcome:
#!/usr/bin/python
#######################################
#author zh01085105
#time 2014 2 21 17:37
#######################################
from random import randint
from time import ctime
import os
# generateRandom: returns a random integer in [rangeFrom, rangeTo] (not actually used below)
def generateRandom(rangeFrom, rangeTo):
    return randint(rangeFrom, rangeTo)
# generateMassiveIp: filelocation is the file the IPs are written to, numberOflines is how many IPs to generate
def generateMassiveIp(filelocation, numberOflines):
    begintime = ctime()
    print numberOflines
    IP = []
    print "begin to generate IP " + begintime
    # 'w' creates the file if needed and truncates stale lines from an earlier run
    # (the original used os.mknod plus 'r+', which left old content behind)
    file_handler = open(filelocation, 'w')
    for i in range(numberOflines):
        #IP.append(str(randint(0,255)) + '.' + str(randint(0,255)) + '.' + str(randint(0,255)) + '.' + str(randint(0,255)) + "\n")    # fully random variant
        # the original concatenated "192" + ". " with a stray space, producing malformed addresses; fixed to a plain '.'
        IP.append("192." + str(randint(0,255)) + '.' + str(randint(0,255)) + '.' + str(randint(0,255)) + "\n")
    file_handler.writelines(IP)
    file_handler.close()
    endtime = ctime()
    print "end to generate IP " + endtime
# mkFile: if file_path already exists return immediately, otherwise create the file
def mkFile(file_path):
    if os.path.exists(file_path):
        return 1
    os.mknod(file_path)    # Linux-specific; the unreachable "return 0" from the original is dropped
    return 1
# splitFile: split the large file into many small ones; inputFile is the input file, splitNumbers is the number of pieces
def splitFile(inputFile, splitNumbers):
    print "begin to split files........"
    file_handler = open(inputFile, 'r')
    for IP in file_handler:
        IPtest = IP.strip("\n")    # strip the trailing '\n' from the IP before hashing
        hashvalue = hash(IPtest) % splitNumbers
        file_path = "./splitfile" + "/" + str(hashvalue) + ".txt"
        if mkFile(file_path):
            filePoint = open(file_path, "a+")
            filePoint.writelines(IP)
            filePoint.close()    # many unnecessary open/close operations per line -- to be improved, see the cached-handle sketch below
    file_handler.close()
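# As flagged in the comment above, opening and closing a bucket file for every
# single line is the main I/O cost of splitFile. One possible improvement (a
# sketch, not the original code; splitFileCached is a hypothetical name): open
# each bucket file once, cache the handles in a dict, and close them all at the
# end. With 100 buckets this stays well under the open-file limit.
def splitFileCached(inputFile, splitNumbers):
    handles = {}    # bucket index -> open file handle
    file_handler = open(inputFile, 'r')
    for IP in file_handler:
        hashvalue = hash(IP.strip("\n")) % splitNumbers
        if hashvalue not in handles:
            # assumes the ./splitfile directory already exists, as the original does
            handles[hashvalue] = open("./splitfile/" + str(hashvalue) + ".txt", "a+")
        handles[hashvalue].write(IP)
    for filePoint in handles.values():
        filePoint.close()
    file_handler.close()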
# ipProcess: dirPath is the directory holding the split files; every file found under it is scanned
def ipProcess(dirPath):
    i = 0
    times = []
    dire = {}
    for root, dirs, files in os.walk(dirPath):
        print "get file"
        print "begin to statistic............."
        while i < len(files):
            path = dirPath + "/" + files[i]
            filePoint = open(path, 'r')
            for cloums in filePoint:
                if cloums in dire:
                    dire[cloums] = dire[cloums] + 1
                else:
                    dire[cloums] = 1
            filePoint.close()    # the original never closed the split files
            # iteritems() instead of items(): items() builds a full list and wastes
            # memory, while iteritems() is an iterator (see my Python study notes)
            maxvalue = 0    # reset per file; the original set this once, skewing the results for later files
            maxip = None
            for k, v in dire.iteritems():
                if maxvalue < v:
                    maxvalue = v
                    maxip = k
            times.append((maxip, maxvalue))    # per-file champion as an (ip, count) pair
            i = i + 1
            dire = {}
    return times
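# For comparison: on Python 2.7+ the per-file counting loop above can be
# written with collections.Counter. This is an alternative sketch, not the
# original implementation; topIpInFile is a hypothetical name.
from collections import Counter
def topIpInFile(path):
    filePoint = open(path, 'r')
    counts = Counter(line for line in filePoint)    # count identical lines
    filePoint.close()
    return counts.most_common(1)[0]    # the (ip, count) pair with the highest count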
# deleteFile: direPath is the directory to clean; running this removes every file in that directory
def deleteFile(direPath):
    for root, dirs, files in os.walk(direPath):
        print "delete files"
        i = 0
        while i < len(files):
            path = direPath + "/" + files[i]
            if os.path.exists(path):
                print "delete " + path
                os.remove(path)
            i = i + 1
if __name__ == "__main__":
    generateMassiveIp("./IPtest.txt", 100000)
    # the split-file directory must exist before splitFile writes into it
    if not os.path.exists("./splitfile"):
        os.mkdir("./splitfile")
    splitFile("./IPtest.txt", 100)
    times = ipProcess("./splitfile")
    # the answer is the best per-file champion; the original summed the
    # per-file maxima, which has no meaning for this problem
    bestip, bestcount = max(times, key=lambda pair: pair[1])
    deleteFile("./splitfile")
    os.remove("./IPtest.txt")
    print "most frequent IP: " + bestip.strip("\n") + " count: " + str(bestcount)
    print len(times)
The program ran correctly across multiple tests, though I have not tested it at the tens-of-millions or hundreds-of-millions scale (mainly because my machine is too slow). That is all for now; corrections are welcome. My thanks to Wally_Yu, whose Python implementation gave me this idea, and to july and Hackbuteer1 for the data-processing ideas they shared.