mapreduce python编程实例

mapreduce python编程实例

1 - mapreduce使用python  WordCount实例
1.1 - mapper函数使用
vi mapper.py
#!/usr/bin/python
# _*_ coding:utf-8 _*_
# Filename: mapper.py
"""Word-count mapper for Hadoop Streaming: emit "<word><TAB>1" per input word."""

import sys

for line in sys.stdin:  # read records from standard input
    # split() without arguments splits on runs of whitespace and ignores
    # leading/trailing blanks, so each list item is one word of the line.
    for word in line.split():
        # Parenthesised single-argument print runs on Python 2 and 3 alike.
        print('%s\t%s' % (word, 1))
 
[root@lsn-linux python]# echo "foo foo quux labs foo bar quux" |python /hadoop/hadoop-2.6.0/python/mapper.py
foo     1
foo     1
quux    1
labs    1
foo     1
bar     1
quux    1


1.2 - reduce函数使用
vim reduce.py
 #!/usr/bin/python
 #_*_ coding:utf-8 _*_
 #Filename:reduce.py
 
 from operator import itemgetter    //排序
 import sys
 
 word2count = {}   #定义一个字典
 
 for line in sys.stdin:
     line = line.strip()
     word,count = line.split('\t',1)
     try:
         count = int(count)
         word2count[word] = word2count.get(word,0)+count   #word2count.get(word,0),查找word键值,如果不存在返回0,如果存在返回键值
     except ValueError:
         pass
 
 sorted_word2count = sorted(word2count.items(),key=itemgetter(0)) #用word2count.items()的第一个项目进行排序

 for word,count in sorted_word2count:
     print'%s\t%s'%(word,count)
 
 [root@lsn-linux python]# echo "foo foo quux labs foo bar quux"|python mapper.py|python reduce.py 
bar     1
foo     3
labs    1
quux    2




1.3 - 在mapreduce执行
拷贝./share/hadoop/tools/lib/hadoop-streaming-2.6.0.jar到hadoop目录
赋予脚本执行权限,否则会报Cannot run program "/hadoop/hadoop-2.6.0/python/mapper.py": error=13, Permission denied错误
chmod +x -R python


hadoop jar hadoop-streaming-2.6.0.jar -mapper /hadoop/hadoop-2.6.0/python/mapper.py -reducer /hadoop/hadoop-2.6.0/python/reduce.py -input /testin/* -output /testout


----------------------------------------------------------------------------------------------------------------------------------------------------------------------


2 - web访问日志分析
日志类型:
175.44.19.36 - - [29/Sep/2013:00:10:57 +0800] "GET /mapreduce-nextgen/client-codes/ HTTP/1.1" 200 25470 "-" "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1;)"
112.111.183.57 - - [29/Sep/2013:00:10:58 +0800] "POST /wp-comments-post.php HTTP/1.1" 302 513 "http://dongxicheng.org/search-engine/scribe-intro/" "Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0"
5.63.145.70 - - [29/Sep/2013:00:11:03 +0800] "HEAD / HTTP/1.1" 200 221 "-" "checks.panopta.com"


2.1 - 统计访问ip地址数目
mapper实现--正则表达式
#!/usr/bin/python
# _*_ coding:utf-8 _*_
# Filename: mapper_3_1.py
"""Emit "<ip><TAB>1" for each log line that begins with a dotted-quad address."""
import re
import sys

# Dotted quad at the start of the record. Compiled once rather than per
# line; the raw string keeps \d and \. from being read as string escapes.
IP_PATTERN = re.compile(r'(\d{1,3}\.){3}\d{1,3}')

for line in sys.stdin:
    match = IP_PATTERN.match(line.strip())
    if match is None:
        # Blank or malformed record: there is no address to emit, and
        # calling .group() on None would raise AttributeError.
        continue
    # Parenthesised single-argument print runs on Python 2 and 3 alike.
    print('%s\t%s' % (match.group(), 1))
         
mapper实现--字符串
#!/usr/bin/python
# _*_ coding:utf-8 _*_
# Filename: mapper_3_1_1.py
"""Emit "<field><TAB>1" for the text before the first space of each log line."""
import sys

for line in sys.stdin:
    # partition() returns the whole line when it contains no space, whereas
    # slicing with find()'s -1 result would silently drop the last character.
    first_field = line.strip().partition(' ')[0]
    if not first_field:
        continue  # blank line: do not emit an empty key
    # Parenthesised single-argument print runs on Python 2 and 3 alike.
    print('%s\t%s' % (first_field, 1))


reduce与之前一样
                                   
2.2 - 统计目录访问次数(/mapreduce-nextgen/client-codes/)
mapper实现--filter(lambda)打印
#!/usr/bin/python
# _*_ coding:utf-8 _*_
# Filename: mapper_3_2.py
"""Emit "<path><TAB>1" for the request path of every access-log line."""
import re
import sys

# Request field of a log record, e.g. "GET /some/path/ HTTP/1.1".
# Anchoring on the opening quote and the method keeps a "GET" or "HTTP"
# that merely occurs elsewhere in the line from being taken for the request.
REQUEST_PATTERN = re.compile(r'"(?:GET|HEAD|POST)\s+(.+?)\s+HTTP/')


def extract_path(line):
    """Return the requested path of one log line, or '' if there is none.

    Only GET, HEAD and POST requests are recognised. The path is returned
    without the blanks that surround it in the log record.
    """
    match = REQUEST_PATTERN.search(line)
    return match.group(1) if match else ''


if __name__ == '__main__':
    paths = (extract_path(line) for line in sys.stdin)
    # filter(lambda ...) drops the empty results produced by blank or
    # unparsable lines, so they are never emitted as empty keys.
    for path in filter(lambda path: path, paths):
        # Parenthesised single-argument print runs on Python 2 and 3 alike.
        print('%s\t%s' % (path, 1))
        
mapper实现--元组打印 (遇到空行实现不了)
#!/usr/bin/python
# _*_ coding:utf-8 _*_
# Filename: mapper_3_2.py
"""Emit "<path><TAB>1" for the request path of every access-log line."""
# NOTE(review): the heading announces a tuple-printing variant, but this
# listing uses filter(lambda) -- confirm which variant was intended.
import re
import sys

# Request field of a log record, e.g. "GET /some/path/ HTTP/1.1".
# Anchoring on the opening quote and the method keeps a "GET" or "HTTP"
# that merely occurs elsewhere in the line from being taken for the request.
REQUEST_PATTERN = re.compile(r'"(?:GET|HEAD|POST)\s+(.+?)\s+HTTP/')


def extract_path(line):
    """Return the requested path of one log line, or '' if there is none.

    Only GET, HEAD and POST requests are recognised. The path is returned
    without the blanks that surround it in the log record.
    """
    match = REQUEST_PATTERN.search(line)
    return match.group(1) if match else ''


if __name__ == '__main__':
    paths = (extract_path(line) for line in sys.stdin)
    # Blank or unparsable lines give '' and are filtered out here, so they
    # are never emitted as empty keys.
    for path in filter(lambda path: path, paths):
        # Parenthesised single-argument print runs on Python 2 and 3 alike.
        print('%s\t%s' % (path, 1))


reduce与之前一样


2.3 - 统计每个 ip,访问的子目录次数,输出如:175.44.30.93  /structure/heap/  8
取IP 和路径  1
如果一样 +1
思路:mapper输出时,IP和目录之间用\t分隔,再用特殊符号\@把“IP\t目录”与计数1隔开;reduce按\@进行切分,然后把IP和目录都相同的记录的计数累加
mapper实现
#!/usr/bin/python
# _*_ coding:utf-8 _*_
# Filename: mapper_3_3.py
r"""Emit '<ip><TAB><path>\@1' for every GET/HEAD/POST access-log line."""
import re
import sys

# Separator written between the "<ip><TAB><path>" key and the count: the two
# characters backslash and '@'. The backslash is doubled so that it is not
# read as a string escape; the emitted bytes are unchanged.
COUNT_SEPARATOR = '\\@'

# First space-delimited field (the client address) and the request path of
# a log record, e.g.  1.2.3.4 - - [...] "GET /some/path/ HTTP/1.1" ...
# Anchoring on the opening quote and the method keeps a "GET" or "HTTP"
# that merely occurs elsewhere in the line from being taken for the request.
RECORD_PATTERN = re.compile(r'(\S+) .*?"(?:GET|HEAD|POST)\s+(.+?)\s+HTTP/')


def extract_key(line):
    """Return '<ip><TAB><path>' for one log line, or '' if it has no request.

    The path is returned without the blanks that surround it in the record.
    """
    match = RECORD_PATTERN.match(line.strip())
    if match is None:
        return ''
    return '%s\t%s' % match.groups()


if __name__ == '__main__':
    keys = (extract_key(line) for line in sys.stdin)
    # Blank or unparsable lines give '' and are dropped before printing.
    for key in filter(lambda key: key, keys):
        # Parenthesised single-argument print runs on Python 2 and 3 alike.
        print('%s%s%s' % (key, COUNT_SEPARATOR, 1))
        
reduce实现
#!/usr/bin/python
# _*_ coding:utf-8 _*_
# Filename: reduce.py
r"""Sum the counts of input records shaped like '<key>\@<count>'."""

from operator import itemgetter
import sys

# Separator between key and count: the two characters backslash and '@'.
# The backslash is doubled so that it is not read as a string escape; the
# value is the same as the original '\@' literal.
COUNT_SEPARATOR = '\\@'

word2count = {}  # key -> running total

for line in sys.stdin:
    # partition() never raises: a line without the separator yields an empty
    # count, which int() rejects below, so blank or malformed lines are
    # skipped instead of aborting the reducer on a failed tuple unpack.
    word, _, count = line.strip().partition(COUNT_SEPARATOR)
    try:
        count = int(count)
    except ValueError:
        continue
    word2count[word] = word2count.get(word, 0) + count

# Sort by the first item of each (key, count) pair, i.e. by the key.
sorted_word2count = sorted(word2count.items(), key=itemgetter(0))

for word, count in sorted_word2count:
    # Parenthesised single-argument print runs on Python 2 and 3 alike.
    print('%s\t%s' % (word, count))
    


---------------------------------------------------------------------------------------------------------------------------------
3- mapreduce使用python  WordCount实例,使用python的迭代器和生成器改进mapper和reducer代码


mapper
#!/usr/bin/python
# _*_ coding:utf-8 _*_
#Filename:mapper_yield.py


import sys
def read_file(file):
    """Lazily yield the whitespace-separated words of each line of *file*.

    *file* is any iterable of strings, such as an open text stream. Every
    line produces one list, which is empty for a blank line.
    """
    for text in file:
        tokens = text.split()
        yield tokens


def main(separator='\t'):
    """Write one "<word><separator>1" line to stdout per word read from stdin.

    separator: text placed between each word and its count of 1 (a tab by
    default).
    """
    for words in read_file(sys.stdin):
        for word in words:
            # Parenthesised single-argument print runs on Python 2 and 3 alike.
            print('%s%s%d' % (word, separator, 1))


if __name__ == '__main__':
    main()




reduce
#!/usr/bin/python
#_*_ coding:utf-8 _*_
#Filename:reduce_yield.py


from operator import itemgetter
import sys
from itertools import groupby


def read_file(file, separator):
    """Yield each line of *file* split once on *separator*.

    Surrounding whitespace, including the trailing newline, is removed
    before splitting. A "<word><separator><count>" line yields a two-item
    list; a line without the separator yields a one-item list.
    """
    for line in file:
        # strip('') removes nothing; strip() is what drops the newline.
        yield line.strip().split(separator, 1)


def main():
    """Sum the counts per word read from stdin and print them sorted by word.

    Input records are obtained from read_file() with a tab separator.
    Records that do not split into a word and an integer count are skipped.
    Each output line is "<word><TAB><total>".
    """
    separator = '\t'
    word2count = {}
    for fields in read_file(sys.stdin, separator):
        try:
            # The unpack sits inside the try so that a line without the
            # separator (e.g. a blank line) is skipped just like a
            # non-numeric count, instead of raising out of the reducer.
            word, count = fields
            count = int(count)
        except ValueError:
            continue
        word2count[word] = word2count.get(word, 0) + count

    # itemgetter(0) sorts the (word, total) pairs by word.
    for word, count in sorted(word2count.items(), key=itemgetter(0)):
        # Parenthesised single-argument print runs on Python 2 and 3 alike.
        print('%s%s%s' % (word, separator, count))


if __name__ == '__main__':
    main()


-----------------------------------------------------------------------------------------------------------------------------------

你可能感兴趣的:(hadoop)