4.5.4 Python编写MapReduce

# mkdir -p /data/test  && cd /data/test

# vim input.txt

foo foo quux labs foo bar quux abx bar see you by test welcome test abc labs foo me python hadoop ab ac bc bec python

# vim mapper.py    # 刘天斯 著《Python 自动化运维》中的代码

#!/usr/bin/env python
"""Hadoop Streaming mapper for word count.

Reads text lines from standard input and writes one "<word><TAB>1" record
per whitespace-separated word to standard output.
"""
import sys


def map_words(lines):
    """Yield a ``(word, 1)`` pair for every whitespace-separated word.

    ``lines`` is any iterable of strings (for example ``sys.stdin``).
    Blank or whitespace-only lines produce no pairs.
    """
    for line in lines:
        # str.split() with no argument already ignores leading/trailing
        # whitespace, so no separate strip() is needed.
        for word in line.split():
            yield word, 1


def main():
    """Emit the mapper records for everything on standard input."""
    # sys.stdout.write (instead of a print statement) keeps the script
    # runnable under both Python 2 and Python 3.
    for word, count in map_words(sys.stdin):
        sys.stdout.write('%s\t%s\n' % (word, count))


if __name__ == '__main__':
    main()

# vim reducer.py   # 刘天斯 著《Python 自动化运维》中的代码

#!/usr/bin/env python
"""Hadoop Streaming reducer for word count.

Reads "<word><TAB><count>" records from standard input and writes one
"<word><TAB><total>" record per run of consecutive identical words.

Only adjacent records with the same word are merged, so the input must be
sorted by word (Hadoop sorts the mapper output before the reduce phase; when
testing locally, pipe through ``sort -k1,1`` first).
"""
from itertools import groupby
from operator import itemgetter
import sys


def _parse_pairs(lines):
    """Yield ``(word, count)`` for every well-formed input line.

    Lines without a tab separator, or whose count is not an integer, are
    skipped silently instead of aborting the whole reduce task.
    """
    for line in lines:
        word, sep, raw_count = line.strip().partition('\t')
        if not sep:
            # No tab at all (e.g. a blank line): not a mapper record.
            continue
        try:
            count = int(raw_count)
        except ValueError:
            continue
        yield word, count


def reduce_counts(lines):
    """Yield ``(word, total)`` for each run of consecutive equal words.

    ``lines`` is any iterable of "<word><TAB><count>" strings. Empty input
    yields nothing, and a malformed trailing line does not prevent the
    last valid word from being emitted.
    """
    for word, group in groupby(_parse_pairs(lines), key=itemgetter(0)):
        yield word, sum(count for _, count in group)


def main():
    """Write the per-word totals for everything on standard input."""
    # sys.stdout.write (instead of a print statement) keeps the script
    # runnable under both Python 2 and Python 3.
    for word, total in reduce_counts(sys.stdin):
        sys.stdout.write('%s\t%s\n' % (word, total))


if __name__ == '__main__':
    main()

# chmod +x *.py

# cat input.txt |./mapper.py    # 测试py文件

# cat input.txt |./mapper.py |sort -k1,1|./reducer.py

$ hdfs dfs -mkdir /user/hduser/input/word

$ hdfs dfs -put /data/test/input.txt /user/hduser/input/word

$ hadoop jar ../share/hadoop/tools/lib/hadoop-streaming-3.0.0-alpha2.jar -file /data/test/mapper.py -mapper /data/test/mapper.py -file /data/test/reducer.py -reducer /data/test/reducer.py -input /user/hduser/input/word -output /user/hadoop/word

$ hdfs dfs -ls /user/hadoop/word   # 查看生成的文件

Found 2 items

-rw-r--r--   2 hadoop supergroup          0 2017-04-05 09:53 /user/hadoop/word/_SUCCESS

-rw-r--r--   2 hadoop supergroup        117 2017-04-05 09:53 /user/hadoop/word/part-00000

$ hdfs dfs -cat /user/hadoop/word/part-00000

ab      1

abc     1

abx     1

ac      1

bar     2

bc      1

bec     1

by      1

foo     4

hadoop  1

labs    2

me      1

python  2

quux    2

see     1

test    2

welcomne       1

you     1

 ### good  成功了,继续努力……

### 以上python代码也可以这里看到:http://www.michael-noll.com/tutorials/writing-an-hadoop-mapreduce-program-in-python/