MapReduce Python programming examples
1 - MapReduce WordCount example in Python
1.1 - The mapper function
vi mapper.py
#!/usr/bin/python
# _*_ coding:utf-8 _*_
# Filename: mapper.py
import sys

for line in sys.stdin:                    # read from standard input
    line = line.strip()                   # strip leading and trailing whitespace
    # split() with no argument splits on whitespace, so each list item is one word
    words = line.split()
    for word in words:
        print '%s\t%s' % (word, 1)
[root@lsn-linux python]# echo "foo foo quux labs foo bar quux" |python /hadoop/hadoop-2.6.0/python/mapper.py
foo 1
foo 1
quux 1
labs 1
foo 1
bar 1
quux 1
1.2 - The reducer function
vim reduce.py
#!/usr/bin/python
# _*_ coding:utf-8 _*_
# Filename: reduce.py
from operator import itemgetter           # used for sorting
import sys

word2count = {}                            # dictionary mapping word -> count
for line in sys.stdin:
    line = line.strip()
    word, count = line.split('\t', 1)
    try:
        count = int(count)
        # word2count.get(word, 0) returns the current count for word, or 0 if unseen
        word2count[word] = word2count.get(word, 0) + count
    except ValueError:
        pass

# sort by the first item of each (word, count) pair, i.e. by word
sorted_word2count = sorted(word2count.items(), key=itemgetter(0))
for word, count in sorted_word2count:
    print '%s\t%s' % (word, count)
[root@lsn-linux python]# echo "foo foo quux labs foo bar quux"|python mapper.py|python reduce.py
bar 1
foo 3
labs 1
quux 2
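This pipe works without any sorting step only because reduce.py accumulates counts in a dictionary. In a real job, Hadoop sorts the mapper output by key before handing it to the reducer; a closer local simulation of that shuffle phase (same test data as above) simply inserts sort between the two scripts:
[root@lsn-linux python]# echo "foo foo quux labs foo bar quux" | python mapper.py | sort | python reduce.py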
1.3 - Running it on MapReduce
Copy ./share/hadoop/tools/lib/hadoop-streaming-2.6.0.jar into the Hadoop directory.
Give the scripts execute permission, otherwise the job fails with: Cannot run program "/hadoop/hadoop-2.6.0/python/mapper.py": error=13, Permission denied
chmod -R +x python
hadoop jar hadoop-streaming-2.6.0.jar -mapper /hadoop/hadoop-2.6.0/python/mapper.py -reducer /hadoop/hadoop-2.6.0/python/reduce.py -input /testin/* -output /testout
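The command above assumes mapper.py and reduce.py exist at that absolute path on every node. If they do not, Hadoop Streaming can ship the scripts with the job via its -file option; a variant of the same command (paths unchanged from this tutorial):
hadoop jar hadoop-streaming-2.6.0.jar -file /hadoop/hadoop-2.6.0/python/mapper.py -mapper mapper.py -file /hadoop/hadoop-2.6.0/python/reduce.py -reducer reduce.py -input /testin/* -output /testout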
----------------------------------------------------------------------------------------------------------------------------------------------------------------------
2 - Web access log analysis
Log format:
175.44.19.36 - - [29/Sep/2013:00:10:57 +0800] "GET /mapreduce-nextgen/client-codes/ HTTP/1.1" 200 25470 "-" "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1;)"
112.111.183.57 - - [29/Sep/2013:00:10:58 +0800] "POST /wp-comments-post.php HTTP/1.1" 302 513 "http://dongxicheng.org/search-engine/scribe-intro/" "Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0"
5.63.145.70 - - [29/Sep/2013:00:11:03 +0800] "HEAD / HTTP/1.1" 200 221 "-" "checks.panopta.com"
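These lines follow the Apache combined log format: client IP, identity, user, timestamp, quoted request line, status code, response size, referer, and user agent. As a reference for the mappers below, a minimal parsing sketch (my own, not part of the original scripts; the field names are illustrative):
#!/usr/bin/python
# _*_ coding:utf-8 _*_
# Sketch only: split one combined-log line into its main fields.
import re
import sys

LOG_RE = re.compile(r'^(\S+) \S+ \S+ \[([^\]]+)\] "(\S+) (\S+) [^"]*" (\d{3}) (\S+)')

for line in sys.stdin:
    m = LOG_RE.match(line)
    if m is None:
        continue                           # skip blank or malformed lines
    ip, timestamp, method, path, status, size = m.groups()
    print '%s %s %s %s' % (ip, method, path, status)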
2.1 - Counting accesses per IP address
Mapper implementation - regular expression
#!/usr/bin/python
# _*_ coding:utf-8 _*_
# Filename: mapper_3_1.py
import re
import sys

for line in sys.stdin:
    line = line.strip()
    # the client IP is the dotted quad at the start of each log line
    match = re.match(r'(\d{1,3}\.){3}\d{1,3}', line)
    if match is None:                      # skip blank or malformed lines
        continue
    words = match.group().split('\n')
    for word in words:
        print '%s\t%s' % (word, 1)
Mapper implementation - string slicing
#!/usr/bin/python
# _*_ coding:utf-8 _*_
# Filename: mapper_3_1_1.py
import sys

for line in sys.stdin:
    line = line.strip()
    # the IP is everything before the first space
    words = line[:line.find(' ')]
    words = words.split('\n')
    for word in words:
        print '%s\t%s' % (word, 1)
The reducer is the same as before.
2.2 - Counting accesses per directory (e.g. /mapreduce-nextgen/client-codes/)
Mapper implementation - filter(lambda) for printing
#!/usr/bin/python
# _*_ coding:utf-8 _*_
# Filename: mapper_3_2.py
import sys

for line in sys.stdin:
    line = line.strip()
    # slice out the request path between the HTTP method and "HTTP"
    if line.find('GET') != -1:
        words = line[line.find('GET')+3:line.find('HTTP')]
    elif line.find('HEAD') != -1:
        words = line[line.find('HEAD')+4:line.find('HTTP')]
    else:
        words = line[line.find('POST')+4:line.find('HTTP')]
    # filter() drops empty strings, e.g. from blank input lines
    words = filter(lambda word: word, words.split('\n'))
    for word in words:
        print '%s\t%s' % (word, 1)
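An alternative mapper sketch for the same directory count (my own variant, not from the original): instead of branching on GET/HEAD/POST, take the path from the quoted request line directly, which also copes with blank or malformed lines:
#!/usr/bin/python
# _*_ coding:utf-8 _*_
# Variant sketch: extract the request path from the quoted request line,
# e.g. "GET /mapreduce-nextgen/client-codes/ HTTP/1.1" -> /mapreduce-nextgen/client-codes/
import sys

for line in sys.stdin:
    parts = line.split('"')                # the request line is the first quoted field
    if len(parts) < 2:
        continue                           # blank or malformed line
    request = parts[1].split()             # e.g. ['GET', '/path/', 'HTTP/1.1']
    if len(request) >= 2:
        print '%s\t%s' % (request[1], 1)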
The reducer is the same as before.
2.3 - For each IP, count how many times it accessed each subdirectory; output such as: 175.44.30.93 /structure/heap/ 8
Emit the IP, the path, and a 1 for every request; whenever the same IP/path pair appears again, add 1.
Approach: join the IP and the directory with \t, use the special marker \@ as the separator before the 1, split on \@ in the reducer, and accumulate the count for each matching IP/directory key.
Mapper implementation
#!/usr/bin/python
# _*_ coding:utf-8 _*_
# Filename: mapper_3_3.py
import sys

for line in sys.stdin:
    line = line.strip()
    # key = "<IP>\t<request path>"
    if line.find('GET') != -1:
        words = line[:line.find(' ')] + '\t' + line[line.find('GET')+3:line.find('HTTP')]
    elif line.find('HEAD') != -1:
        words = line[:line.find(' ')] + '\t' + line[line.find('HEAD')+4:line.find('HTTP')]
    elif line.find('POST') != -1:
        words = line[:line.find(' ')] + '\t' + line[line.find('POST')+4:line.find('HTTP')]
    else:
        words = ''
    words = filter(lambda word: word, words.split('\n'))
    for word in words:
        # the literal marker '\@' separates the key from the count 1
        print '%s\@%s' % (word, 1)
Reducer implementation
#!/usr/bin/python
# _*_ coding:utf-8 _*_
# Filename: reduce.py
from operator import itemgetter
import sys

word2count = {}
for line in sys.stdin:
    line = line.strip()
    # split on the literal '\@' marker emitted by mapper_3_3.py
    word, count = line.split('\@', 1)
    try:
        count = int(count)
        word2count[word] = word2count.get(word, 0) + count
    except ValueError:
        pass

sorted_word2count = sorted(word2count.items(), key=itemgetter(0))
for word, count in sorted_word2count:
    print '%s\t%s' % (word, count)
---------------------------------------------------------------------------------------------------------------------------------
3 - WordCount revisited: improving the mapper and reducer with Python iterators and generators
mapper
#!/usr/bin/python
# _*_ coding:utf-8 _*_
# Filename: mapper_yield.py
import sys

def read_file(file):
    # generator: yield the list of words on each input line
    for line in file:
        yield line.split()

def main(separator='\t'):
    data = read_file(sys.stdin)
    for words in data:
        for word in words:
            print '%s%s%d' % (word, separator, 1)

if __name__ == '__main__':
    main()
reduce
#!/usr/bin/python
# _*_ coding:utf-8 _*_
# Filename: reduce_yield.py
from operator import itemgetter
from itertools import groupby
import sys

def read_file(file, separator):
    # generator: yield (word, count) pairs, one per input line
    for line in file:
        yield line.strip().split(separator, 1)

def main():
    separator = '\t'
    data = read_file(sys.stdin, separator)
    word2count = {}
    for line in data:
        word, count = line
        try:
            count = int(count)
            word2count[word] = word2count.get(word, 0) + count
        except ValueError:
            pass
    sorted_word2count = sorted(word2count.items(), key=itemgetter(0))
    for word, count in sorted_word2count:
        print '%s%s%s' % (word, separator, count)

if __name__ == '__main__':
    main()
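The reducer above still collects everything in a dictionary, and the groupby it imports is never actually used. Since Hadoop Streaming sorts the mapper output by key before the reduce phase, equal keys arrive contiguously, so a fully stream-based reducer can sum each group as it goes. A sketch of that variant (my own; the filename reduce_groupby.py is illustrative):
#!/usr/bin/python
# _*_ coding:utf-8 _*_
# Filename: reduce_groupby.py (illustrative name, variant sketch)
import sys
from itertools import groupby
from operator import itemgetter

def read_pairs(file, separator='\t'):
    # generator: yield (word, count) pairs, one per input line
    for line in file:
        yield line.rstrip('\n').split(separator, 1)

def main(separator='\t'):
    data = read_pairs(sys.stdin, separator)
    # groupby works here only because the input is sorted by key,
    # which the MapReduce shuffle phase (or a local sort) guarantees
    for word, group in groupby(data, itemgetter(0)):
        try:
            total = sum(int(count) for _, count in group)
            print '%s%s%d' % (word, separator, total)
        except ValueError:
            pass

if __name__ == '__main__':
    main()
A local test, mirroring the earlier pipeline (the sort step stands in for the shuffle):
[root@lsn-linux python]# echo "foo foo quux labs foo bar quux" | python mapper_yield.py | sort | python reduce_groupby.py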
-----------------------------------------------------------------------------------------------------------------------------------