itemcf的hadoop实现优化(Python)

原始数据如下:

u1  a,d,b,c
u2  a,a,c
u3  b,d
u4  a,d,c
u5  a,b,c

计算公式使用 Jaccard 相似度:sim(i, j) = |U(i)∩U(j)| / |U(i)∪U(j)|,其中 U(i) 表示拥有 item i 的用户集合。

其中: |U(i)∪U(j)| = |U(i)| + |U(j)| - |U(i)∩U(j)|,|U(i)| 即代码中的 norm(拥有 item i 的用户数),|U(i)∩U(j)| 即代码中的 dot(同时拥有 i 和 j 的用户数)。

原始的Hadoop实现需要5轮MR,优化后只需要两轮就可以完成。

之前的轮数过多,主要在于计算(U(i)∪U(j)) 的时候,需要多次更改key,并非计算量大。只需要修改一下传递的key,就可以两轮实现。

mapper_1.py

#!/usr/bin/python
#-*-coding:utf-8-*-
import sys

def map_line(line):
    """Turn one ``user<whitespace>item,item,...`` record into round-1 records.

    For every distinct item ``i1`` of the user one ``"<i1> 1 norm"`` record is
    produced, and for every pair ``i1 < i2`` (string order) one
    ``"<i1> <i2> 1 dot"`` record.  Records are returned without a trailing
    newline.

    Blank lines yield an empty list.  A non-blank line that does not consist
    of exactly two whitespace-separated fields raises ``ValueError``.
    """
    line = line.strip()
    if not line:
        return []
    _user, item_str = line.split()
    # Deduplicate so a user counts once per item; drop empty ids produced by
    # stray commas ("a,b,"), which would otherwise emit a record with no key.
    items = sorted(set(item for item in item_str.split(',') if item))
    records = []
    for pos, i1 in enumerate(items):
        records.append("%s 1 norm" % i1)
        for i2 in items[pos + 1:]:
            records.append("%s %s 1 dot" % (i1, i2))
    return records


def main(stdin=None, stdout=None):
    """Stream mapper records from *stdin* to *stdout* (default: sys streams).

    Only data records are written to *stdout*: it is the channel consumed by
    the sort/reduce stage, so no diagnostic output may go there.
    """
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    for line in stdin:
        for record in map_line(line):
            # write() instead of print keeps the output identical under
            # both Python 2 and Python 3 interpreters.
            stdout.write(record + "\n")


if __name__ == "__main__":
    main()

reducer_1.py

#!/usr/bin/python
#-*-coding:utf-8-*-
import sys

def _format_group(key, norm, dots):
    """Render the output records for one fully aggregated item *key*.

    Returns ``"<key> <norm> norm"`` followed by one
    ``"<key>-<i2> <dot> <norm> dot-norm_i1"`` record per entry of *dots*
    (a dict mapping ``i2`` to its summed co-occurrence count).
    """
    records = ["%s %d norm" % (key, norm)]
    # sorted() only makes the emission order deterministic; the next stage
    # re-sorts the records anyway.
    for i2 in sorted(dots):
        records.append("%s-%s %d %d dot-norm_i1" % (key, i2, dots[i2], norm))
    return records


def reduce_records(lines):
    """Aggregate round-1 mapper records that are grouped by their first field.

    *lines* is an iterable of ``"<i1> <count> norm"`` and
    ``"<i1> <i2> <count> dot"`` records in which all records sharing the same
    first field are contiguous.  Within a group the ``norm`` and ``dot``
    records may arrive in any order.  Yields output records (no trailing
    newline) as produced by ``_format_group``.

    Blank lines and records whose last field is neither ``norm`` nor ``dot``
    are ignored.  A non-integer count raises ``ValueError``.
    """
    cur_key = None
    norm = 0
    dots = {}
    for line in lines:
        sp = line.split()
        if not sp:
            continue
        tag = sp[-1]
        if tag == 'norm':
            key, value = sp[:2]
            i2 = None
        elif tag == 'dot':
            key, i2, value = sp[:3]
        else:
            continue
        if key != cur_key:
            if cur_key is not None:
                for record in _format_group(cur_key, norm, dots):
                    yield record
            # Start the new group empty; the current record is added below,
            # so a group opened by a 'dot' record neither loses that pair
            # nor mistakes its count for the norm.
            cur_key, norm, dots = key, 0, {}
        if i2 is None:
            norm += int(value)
        else:
            dots[i2] = dots.get(i2, 0) + int(value)
    if cur_key is not None:
        for record in _format_group(cur_key, norm, dots):
            yield record


def main(stdin=None, stdout=None):
    """Run the reducer from *stdin* to *stdout* (default: sys streams)."""
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    for record in reduce_records(stdin):
        # write() instead of print keeps the output identical under
        # both Python 2 and Python 3 interpreters.
        stdout.write(record + "\n")


if __name__ == "__main__":
    main()

mapper_2.py

#!/usr/bin/python
#-*-coding:utf-8-*-
import sys

def map_record(line):
    """Re-key one round-1 reducer record for the second round.

    * ``"<item> <norm> norm"`` records are passed through (stripped).
    * ``"<i1>-<i2> <dot> <norm_i1> dot-norm_i1"`` records are re-keyed on
      ``i2`` and returned as ``"<i2> <i1> <dot> <norm_i1> dot-norm_i1"``.

    Returns ``None`` for blank lines and for records with any other trailing
    tag.  The pair key is split on ``-``, so item ids must not contain that
    character; a key that does not split into exactly two parts raises
    ``ValueError``.
    """
    sp = line.split()
    if not sp:
        return None
    tag = sp[-1]
    if tag == 'norm':
        return line.strip()
    if tag == 'dot-norm_i1':
        key, dot, norm_i1 = sp[:3]
        i1, i2 = key.split('-')
        return " ".join((i2, i1, dot, norm_i1, 'dot-norm_i1'))
    return None


def main(stdin=None, stdout=None):
    """Stream re-keyed records from *stdin* to *stdout* (default: sys streams)."""
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    for line in stdin:
        record = map_record(line)
        if record is not None:
            # write() instead of print keeps the output identical under
            # both Python 2 and Python 3 interpreters.
            stdout.write(record + "\n")


if __name__ == "__main__":
    main()

reducer_2.py

#!/usr/bin/python
#-*-coding:utf-8-*-
import sys

def GenSim(norm_i1,norm_i2,dot):
    """Return the Jaccard similarity dot / (norm_i1 + norm_i2 - dot).

    All three arguments may be ints or decimal strings.  Raises
    ``ZeroDivisionError`` when the denominator is zero.
    """
    return float(dot) / (int(norm_i1) + int(norm_i2) - int(dot))


def _format_group(i2, norm_i2, pairs):
    """Render the similarity records for all pairs whose second item is *i2*.

    *pairs* maps ``i1`` to ``(dot, norm_i1)``.  Each record has the form
    ``"<i1>-<i2> <dot> <norm_i1> <norm_i2> <sim> dot,norm_i1,norm_i2,sim"``.
    When *norm_i2* is ``None`` (no ``norm`` record was seen for the group) the
    similarity cannot be computed; the pairs are skipped and a warning is
    written to stderr.
    """
    if not pairs:
        return []
    if norm_i2 is None:
        sys.stderr.write(
            "reducer_2: no norm record for item %s; skipping %d pair(s)\n"
            % (i2, len(pairs)))
        return []
    records = []
    # sorted() only makes the emission order deterministic.
    for i1 in sorted(pairs):
        dot, norm_i1 = pairs[i1]
        sim = GenSim(norm_i1, norm_i2, dot)
        records.append("%s-%s %s %s %s %s dot,norm_i1,norm_i2,sim"
                       % (i1, i2, dot, norm_i1, norm_i2, sim))
    return records


def reduce_records(lines):
    """Compute pair similarities from round-2 mapper records.

    *lines* is an iterable of ``"<i2> <norm_i2> norm"`` and
    ``"<i2> <i1> <dot> <norm_i1> dot-norm_i1"`` records in which all records
    sharing the same first field are contiguous.  Within a group the records
    may arrive in any order.  Yields output records (no trailing newline) as
    produced by ``_format_group``.

    Blank lines and records with any other trailing tag are ignored.  If a
    group carries several ``norm`` records the last one wins; if it carries
    the same ``i1`` more than once the first one is kept.
    """
    cur_key = None
    norm_i2 = None
    pairs = {}
    for line in lines:
        sp = line.split()
        if not sp:
            continue
        tag = sp[-1]
        if tag == 'norm':
            key, value = sp[:2]
            pair = None
        elif tag == 'dot-norm_i1':
            key, i1, dot, norm_i1 = sp[:4]  # key is i2.
            pair = (i1, (dot, norm_i1))
        else:
            continue
        if key != cur_key:
            if cur_key is not None:
                for record in _format_group(cur_key, norm_i2, pairs):
                    yield record
            # Start the new group empty; the current record is added below,
            # so a group opened by a pair record does not lose that pair.
            cur_key, norm_i2, pairs = key, None, {}
        if pair is None:
            norm_i2 = value
        else:
            pairs.setdefault(pair[0], pair[1])
    if cur_key is not None:
        for record in _format_group(cur_key, norm_i2, pairs):
            yield record


def main(stdin=None, stdout=None):
    """Run the reducer from *stdin* to *stdout* (default: sys streams)."""
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    for record in reduce_records(stdin):
        # write() instead of print keeps the output identical under
        # both Python 2 and Python 3 interpreters.
        stdout.write(record + "\n")


if __name__ == "__main__":
    main()

执行脚本 t.sh:

#!/bin/bash
# Local dry run of the two-round pipeline: each round is
# mapper | sort | reducer, with the intermediate results kept in d.* files.

# Abort on the first failing command (including inside a pipeline) instead of
# feeding a truncated intermediate file to the next stage.
set -euo pipefail

# LC_ALL=C sort -k1,1: key on the first field only and compare bytes, so all
# records sharing a first field end up contiguous. A plain "sort -k1" keys on
# the whole line with locale collation, which can interleave different keys.
./mapper_1.py < user_log.txt | LC_ALL=C sort -k1,1 > d.m.1
./reducer_1.py < d.m.1 > d.r.1

./mapper_2.py < d.r.1 | LC_ALL=C sort -k1,1 > d.m.2
./reducer_2.py < d.m.2 > d.r.2






你可能感兴趣的:(itemcf的hadoop实现优化(Python))