ODPS MapReduce入门 - 推荐商品

用户通过在jar命令中设置-local参数,在本地模拟MapReduce的运行过程,从而进行本地调试。本地运行时, 客户端会从ODPS中下载本地调试所需要的输入表的元信息、数据,所需要的资源以及输出表的元信息,并将这些信息保存到一个名为warehouse的本地目录中。 在程序运行结束后,会将计算结果输出到warehouse目录内的一个文件中。如果本地的warehouse目录下已经下载了输入表及被引用的资源,在下一次运行时, 会直接引用warehouse下的数据及文件,而不会重复下载。

在本地运行过程中,仍然会启动多个Map及Reduce进程处理数据,但这些进程不是并发运行,而是依次串行运行。此外这个模拟运行过程与真正的分布式运行有如下差别:

  1. 输入表行数限制:目前,最多只会下载100行数据;
  2. 资源的使用:在分布式环境中,ODPS会限制引用资源的大小,详情请参考 应用限制 。但在本地运行环境中,不会有资源大小的限制;
  3. 安全限制:ODPS MapReduce及UDF程序在分布式环境中运行时受到 Java沙箱 的限制。但在本地运行时,则没有此限制;

ItemBuyMapReduce

package mydemo;

import java.io.IOException;
import java.util.Iterator;

import com.aliyun.odps.OdpsException;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.data.TableInfo;
import com.aliyun.odps.mapred.JobClient;
import com.aliyun.odps.mapred.MapperBase;
import com.aliyun.odps.mapred.ReducerBase;
import com.aliyun.odps.mapred.RunningJob;
import com.aliyun.odps.mapred.conf.JobConf;
import com.aliyun.odps.mapred.utils.InputUtils;
import com.aliyun.odps.mapred.utils.OutputUtils;
import com.aliyun.odps.mapred.utils.SchemaUtils;

/**
 * Model: (item_id, item_category) -> counts of browse / collect /
 * add-to-cart / purchase behaviors, aggregated from the raw user log.
 *
 * @author wwhhf
 */
public class ItemBuyMapReduce {

    /** Emits (item_id, item_category) as key and the behavior type as value. */
    public static class ItemBuyMapper extends MapperBase {

        private Record key = null;
        private Record value = null;

        @Override
        public void setup(TaskContext context) throws IOException {
            key = context.createMapOutputKeyRecord();
            value = context.createMapOutputValueRecord();
        }

        @Override
        public void map(long recordNum, Record record, TaskContext context)
                throws IOException {
            // Skip malformed rows: the input table has exactly 6 columns.
            if (record.getColumnCount() == 6) {
                Long itemId = record.getBigint(1);
                Long itemCategory = record.getBigint(4);
                Long behaviorType = record.getBigint(2);
                key.setBigint(0, itemId);
                key.setBigint(1, itemCategory);
                value.setBigint(0, behaviorType);
                context.write(key, value);
            }
        }

    }

    /** Counts each behavior type per (item_id, item_category) group. */
    public static class ItemBuyReducer extends ReducerBase {

        private Record output = null;

        @Override
        public void setup(TaskContext context) throws IOException {
            output = context.createOutputRecord();
        }

        @Override
        public void reduce(Record key, Iterator<Record> values,
                TaskContext context) throws IOException {
            // cnt[0..3]: browse / collect / add-to-cart / purchase counts.
            long[] cnt = new long[4];
            while (values.hasNext()) {
                Record val = values.next();
                long behaviorType = val.getBigint(0);
                // Guard against out-of-range behavior types so dirty data
                // cannot trigger an ArrayIndexOutOfBoundsException.
                if (behaviorType >= 1 && behaviorType <= 4) {
                    cnt[(int) (behaviorType - 1)]++;
                }
            }
            // item_id
            output.set(0, key.get(0));
            // item_cat
            output.set(1, key.get(1));
            // browse count
            output.set(2, cnt[0]);
            // collect count
            output.set(3, cnt[1]);
            // add-to-cart count
            output.set(4, cnt[2]);
            // purchase count
            output.set(5, cnt[3]);
            context.write(output);
        }

    }

    public static void main(String[] args) throws OdpsException {
        JobConf job = new JobConf();

        job.setMapOutputKeySchema(SchemaUtils.fromString("item_id:bigint,"
                + "item_category:bigint"));
        // The mapper emits a single bigint (the behavior type). The previous
        // 4-column schema (browse_cnt + three booleans) did not match what
        // the mapper actually writes.
        job.setMapOutputValueSchema(SchemaUtils
                .fromString("behavior_type:bigint"));

        InputUtils.addTable(
                TableInfo.builder().tableName("tianchi_fresh_comp_train_user")
                        .build(), job);
        OutputUtils.addTable(
                TableInfo.builder().tableName("fresh_comp_item_buy").build(),
                job);

        job.setMapperClass(ItemBuyMapper.class);
        job.setReducerClass(ItemBuyReducer.class);

        RunningJob rj = JobClient.runJob(job);
        rj.waitForCompletion();
    }

}

UserItemMapReduce.java

package fresh_comp_offline;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.Iterator;
import java.util.List;

import com.aliyun.odps.OdpsException;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.data.TableInfo;
import com.aliyun.odps.mapred.JobClient;
import com.aliyun.odps.mapred.MapperBase;
import com.aliyun.odps.mapred.ReducerBase;
import com.aliyun.odps.mapred.RunningJob;
import com.aliyun.odps.mapred.conf.JobConf;
import com.aliyun.odps.mapred.utils.InputUtils;
import com.aliyun.odps.mapred.utils.OutputUtils;
import com.aliyun.odps.mapred.utils.SchemaUtils;

/**
 * For each user, collects every (item_id, behavior_type) action, sorts the
 * actions by event time and serialises them into one comma-separated string
 * of "item_id type" entries.
 *
 * @author wwhhf
 */
public class UserItemMapReduce {

    /** One user action: the item, the behavior type and when it happened. */
    private static class Node {
        private Long item_id;
        private Long type;
        private Date time;

        public Long getType() {
            return type;
        }

        public Long getItem_id() {
            return item_id;
        }

        public Date getTime() {
            return time;
        }

        public Node(Long item_id, Long type, Date time) {
            super();
            this.item_id = item_id;
            this.type = type;
            this.time = time;
        }

    }

    /** Orders actions chronologically (ascending event time). */
    private static class MyComparator implements Comparator<Node> {

        @Override
        public int compare(Node o1, Node o2) {
            return o1.getTime().compareTo(o2.getTime());
        }

    }

    /** Emits user_id as key and (item_id, type, time) as value. */
    private static class UserItemMapper extends MapperBase {

        private Record key = null;
        private Record value = null;

        @Override
        public void setup(TaskContext context) throws IOException {
            key = context.createMapOutputKeyRecord();
            value = context.createMapOutputValueRecord();
        }

        @Override
        public void map(long recordNum, Record record, TaskContext context)
                throws IOException {
            // Skip malformed rows: the input table has exactly 6 columns.
            if (record.getColumnCount() == 6) {
                key.set(0, record.getBigint(0));
                value.set(0, record.getBigint(1));
                value.set(1, record.getBigint(2));
                value.set(2, record.getDatetime(5));
                context.write(key, value);
            }
        }

    }

    /** Sorts one user's actions by time and joins them into a string. */
    private static class UserItemReducer extends ReducerBase {

        private Record output = null;

        @Override
        public void setup(TaskContext context) throws IOException {
            output = context.createOutputRecord();
        }

        @Override
        public void reduce(Record key, Iterator<Record> values,
                TaskContext context) throws IOException {
            // StringBuilder instead of StringBuffer: each reduce() call is
            // single-threaded, so synchronisation is pure overhead.
            StringBuilder sb = new StringBuilder();
            List<Node> list = new ArrayList<>();
            while (values.hasNext()) {
                Record val = values.next();
                list.add(new Node(val.getBigint(0), val.getBigint(1), val
                        .getDatetime(2)));
            }
            Collections.sort(list, new MyComparator());
            for (Node item : list) {
                // "item_id type,item_id type,..." — prepend the separator
                // before every entry except the first, so no trailing comma
                // needs stripping (the old substring(0, len - 1) would also
                // have thrown on an empty list).
                if (sb.length() > 0) {
                    sb.append(",");
                }
                sb.append(item.getItem_id()).append(" ").append(item.getType());
            }
            output.set(0, key.getBigint(0));
            output.set(1, sb.toString());
            context.write(output);
        }

    }

    public static void main(String[] args) throws OdpsException {
        JobConf job = new JobConf();

        job.setMapOutputKeySchema(SchemaUtils.fromString("user_id:bigint"));
        job.setMapOutputValueSchema(SchemaUtils
                .fromString("item_id:bigint,type:bigint,time:datetime"));

        InputUtils.addTable(
                TableInfo.builder().tableName("tianchi_fresh_comp_train_user")
                        .build(), job);
        OutputUtils.addTable(
                TableInfo.builder().tableName("fresh_comp_user_item").build(),
                job);

        job.setMapperClass(UserItemMapper.class);
        job.setReducerClass(UserItemReducer.class);

        RunningJob rj = JobClient.runJob(job);
        rj.waitForCompletion();
    }

}

GenGraphMapReduce.java

package fresh_comp_offline;

import java.io.IOException;
import java.util.Iterator;
import com.aliyun.odps.OdpsException;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.data.TableInfo;
import com.aliyun.odps.mapred.JobClient;
import com.aliyun.odps.mapred.MapperBase;
import com.aliyun.odps.mapred.ReducerBase;
import com.aliyun.odps.mapred.RunningJob;
import com.aliyun.odps.mapred.conf.JobConf;
import com.aliyun.odps.mapred.utils.InputUtils;
import com.aliyun.odps.mapred.utils.OutputUtils;
import com.aliyun.odps.mapred.utils.SchemaUtils;

/**
 * Turns each user's time-ordered item string into a directed graph: every
 * consecutive pair (a, b) yields the edge a->b, plus a sentinel edge
 * a->(-1) that accumulates a's total outgoing weight. Edge weight is the
 * behavior type attached to b.
 *
 * @author wwhhf
 */
public class GenGraphMapReduce {

    private static class GenGraphMapper extends MapperBase {

        private Record key = null;
        private Record value = null;

        /** Parses the behavior type string and stores it as the edge weight. */
        private Record returnNum(String type) {
            value.setBigint(0, Long.valueOf(type));
            return value;
        }

        @Override
        public void setup(TaskContext context) throws IOException {
            key = context.createMapOutputKeyRecord();
            value = context.createMapOutputValueRecord();
        }

        @Override
        public void map(long recordNum, Record record, TaskContext context)
                throws IOException {
            // Comma-separated "item_id type" entries, ascending by time.
            String[] entries = record.getString(1).split(",");
            for (int i = entries.length - 1; i > 0; i--) {
                String[] from = entries[i - 1].split(" ");
                String[] to = entries[i].split(" ");

                // Directed edge: earlier item -> later item.
                key.set(0, Long.valueOf(from[0]));
                key.set(1, Long.valueOf(to[0]));
                context.write(key, returnNum(to[1]));

                // Sentinel edge: earlier item -> any item (-1), used later
                // for normalisation.
                key.set(1, -1L);
                context.write(key, returnNum(to[1]));
            }
        }
    }

    private static class GenGraphReducer extends ReducerBase {

        private Record output = null;

        @Override
        public void setup(TaskContext context) throws IOException {
            output = context.createOutputRecord();
        }

        @Override
        public void reduce(Record key, Iterator<Record> values,
                TaskContext context) throws IOException {
            // Total weight of all parallel edges sharing this (a, b) key.
            Long total = 0L;
            while (values.hasNext()) {
                total = total + values.next().getBigint(0);
            }
            output.set(0, key.get(0));
            output.set(1, key.get(1));
            output.set(2, total);
            context.write(output);
        }
    }

    public static void main(String[] args) throws OdpsException {
        JobConf job = new JobConf();

        job.setMapOutputKeySchema(SchemaUtils
                .fromString("item_id_a:bigint,item_id_b:bigint"));
        job.setMapOutputValueSchema(SchemaUtils.fromString("count:bigint"));

        InputUtils.addTable(
                TableInfo.builder().tableName("fresh_comp_user_item").build(),
                job);
        OutputUtils.addTable(
                TableInfo.builder().tableName("fresh_comp_gengraph").build(),
                job);

        job.setMapperClass(GenGraphMapper.class);
        job.setReducerClass(GenGraphReducer.class);

        RunningJob rj = JobClient.runJob(job);
        rj.waitForCompletion();
    }

}

CalcProbMapReduce.java

package fresh_comp_offline;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import com.aliyun.odps.OdpsException;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.data.TableInfo;
import com.aliyun.odps.mapred.JobClient;
import com.aliyun.odps.mapred.MapperBase;
import com.aliyun.odps.mapred.ReducerBase;
import com.aliyun.odps.mapred.RunningJob;
import com.aliyun.odps.mapred.conf.JobConf;
import com.aliyun.odps.mapred.utils.InputUtils;
import com.aliyun.odps.mapred.utils.OutputUtils;
import com.aliyun.odps.mapred.utils.SchemaUtils;

/**
 * Converts accumulated edge weights into transition probabilities:
 * P(a -> b) = weight(a -> b) / weight(a -> -1), where the sentinel
 * destination -1 carries the total outgoing weight of source a
 * (emitted by GenGraphMapReduce).
 */
public class CalcProbMapReduce {

    /** One outgoing edge: destination item and its accumulated weight. */
    private static class Node {
        private Long dest = null;
        private Long count = null;

        public Node(Long dest, Long count) {
            super();
            this.dest = dest;
            this.count = count;
        }

        public Long getDest() {
            return dest;
        }

        public Long getCount() {
            return count;
        }

        @Override
        public String toString() {
            return "Node [dest=" + dest + ", count=" + count + "]";
        }

    }

    /** Re-keys every edge row by its source item a. */
    private static class CalcProbMapper extends MapperBase {

        private Record key = null;
        private Record value = null;

        @Override
        public void setup(TaskContext context) throws IOException {
            key = context.createMapOutputKeyRecord();
            value = context.createMapOutputValueRecord();
        }

        @Override
        public void map(long recordNum, Record record, TaskContext context)
                throws IOException {
            // a
            key.set(0, record.get(0));
            // b
            value.set(0, record.get(1));
            // count
            value.set(1, record.get(2));
            context.write(key, value);
        }
    }

    /** Divides each edge weight by the source's sentinel total. */
    private static class CalcProbReducer extends ReducerBase {

        private Record output = null;

        @Override
        public void setup(TaskContext context) throws IOException {
            output = context.createOutputRecord();
        }

        @Override
        public void reduce(Record key, Iterator<Record> values,
                TaskContext context) throws IOException {
            // Values are copied into Node objects instead of keeping the
            // Record itself — NOTE(review): presumably the framework reuses
            // the Record instance between next() calls; confirm against the
            // ODPS SDK docs.
            List<Node> data = new ArrayList<>();
            Long sum = 0L;
            while (values.hasNext()) {
                Record val = values.next();
                Long dest = val.getBigint(0);
                if (dest == -1L) {
                    // Sentinel row: total outgoing weight of this source.
                    sum = val.getBigint(1);
                } else {
                    data.add(new Node(dest, val.getBigint(1)));
                }
            }
            // No sentinel total seen: the probabilities are undefined.
            // Skip the group instead of dividing by zero, which would have
            // emitted Infinity into the output table.
            if (sum == 0L) {
                return;
            }
            for (Node node : data) {
                output.set(0, key.get(0));
                output.set(1, node.getDest());
                output.set(2, node.getCount() * 1.0 / sum);
                context.write(output);
            }
        }
    }

    public static void main(String[] args) throws OdpsException {
        JobConf job = new JobConf();

        job.setMapOutputKeySchema(SchemaUtils.fromString("a:bigint"));
        job.setMapOutputValueSchema(SchemaUtils
                .fromString("b:bigint,count:bigint"));

        InputUtils.addTable(TableInfo.builder()
                .tableName("fresh_comp_gengraph").build(), job);
        OutputUtils.addTable(
                TableInfo.builder().tableName("fresh_comp_graph_prob").build(),
                job);

        job.setMapperClass(CalcProbMapper.class);
        job.setReducerClass(CalcProbReducer.class);

        RunningJob rj = JobClient.runJob(job);
        rj.waitForCompletion();
    }

}

Input

39937040,56116303,1,9639p90,3544,2014-12-07 16:00:00 000
39937040,207737677,1,\N,5399,2014-12-10 09:00:00 000
39937040,172557089,1,96vulpw,3673,2014-12-15 12:00:00 000
39937040,230120970,1,96vujov,3528,2014-12-18 19:00:00 000
39937040,223424265,1,96vulqh,14079,2014-12-01 13:00:00 000
39937040,376053019,1,96vulsp,1838,2014-12-14 19:00:00 000
39937040,181946551,1,\N,1863,2014-12-06 10:00:00 000
39937040,153238305,1,96vulpe,8650,2014-12-12 12:00:00 000
39937040,126496010,1,96vujog,2953,2014-11-19 18:00:00 000
39937040,57167294,1,96vulqo,1863,2014-12-04 18:00:00 000
39937040,184420624,1,96vulp5,1863,2014-12-04 20:00:00 000
39937040,291670600,1,96vujo2,10894,2014-12-11 15:00:00 000
39937040,24900548,1,96vulp5,9541,2014-12-16 15:00:00 000
39937040,211568769,1,96vujod,9664,2014-12-12 21:00:00 000
39937040,383474182,1,96vujoh,6680,2014-11-20 08:00:00 000
39937040,89070719,1,96vujoa,7440,2014-12-12 21:00:00 000
39937040,264988743,1,96vulpu,1863,2014-12-16 12:00:00 000
39937040,264988743,1,96vulp4,1863,2014-12-16 12:00:00 000
39937040,282823170,1,96vulpj,13230,2014-12-17 17:00:00 000
39937040,63926863,1,96vulql,12560,2014-12-12 21:00:00 000
39937040,38038876,1,96vulph,11279,2014-11-27 10:00:00 000
39937040,178319842,1,96vulqk,2993,2014-12-15 17:00:00 000
39937040,178319842,1,96vulpj,2993,2014-12-10 17:00:00 000
39937040,209733546,1,96vulqr,1863,2014-11-28 20:00:00 000
39937040,225165987,3,96vulpn,1863,2014-11-28 21:00:00 000
39937040,378316358,1,96vulpc,13230,2014-12-09 17:00:00 000
39937040,176579443,1,96vulq7,1863,2014-11-28 20:00:00 000
39937040,323270984,1,96vujog,5232,2014-11-27 08:00:00 000
39937040,89182549,1,96vulp4,11721,2014-11-27 12:00:00 000
39937040,383474182,1,96vul9m,6680,2014-11-20 08:00:00 000
39937040,80991439,1,96vulqw,1863,2014-12-02 10:00:00 000
39937040,232219004,3,96vulpq,437,2014-12-10 17:00:00 000
39937040,235787727,1,96vulpw,6402,2014-12-06 11:00:00 000
39937040,235787727,1,96vul9m,6402,2014-12-03 14:00:00 000
39937040,21472534,1,96vulp5,5399,2014-12-14 19:00:00 000
39937040,138098039,1,\N,630,2014-12-10 09:00:00 000
39937040,171379996,1,96vujo7,11279,2014-11-18 19:00:00 000
39937040,365539187,1,96vulpw,3673,2014-11-27 21:00:00 000
39937040,278607965,1,96vujo4,10894,2014-12-11 15:00:00 000
39937040,390697266,1,96vulph,10894,2014-12-03 20:00:00 000
39937040,349933473,1,96vulpq,8874,2014-12-02 20:00:00 000
39937040,87480270,1,\N,2730,2014-12-06 18:00:00 000
39937040,235787727,1,96vujou,6402,2014-12-02 10:00:00 000
39937040,212790200,1,96vulpc,3673,2014-12-10 17:00:00 000
39937040,235787727,1,96vulp0,6402,2014-12-03 14:00:00 000
39937040,133421819,1,96vulpc,6402,2014-12-15 13:00:00 000
39937040,392416970,1,96vulpr,1863,2014-12-10 20:00:00 000
39937040,388400764,1,96vulqh,6513,2014-12-10 20:00:00 000
39937040,94439719,1,96vulqj,3673,2014-12-10 17:00:00 000
39937040,349516687,1,96vulph,8854,2014-11-23 19:00:00 000
39937040,340369269,1,96vulpg,11623,2014-12-10 17:00:00 000
39937040,84234904,1,96vulp1,1863,2014-11-22 19:00:00 000
39937040,92916363,1,96vulp7,1863,2014-12-10 16:00:00 000
39937040,34102362,1,96vujof,1863,2014-12-15 12:00:00 000
39937040,185782186,1,\N,1863,2014-12-06 09:00:00 000
39937040,30729236,1,96vulq6,5894,2014-11-30 17:00:00 000
39937040,112066266,1,96vulpm,2513,2014-12-15 17:00:00 000
39937040,333525414,1,\N,1863,2014-12-06 10:00:00 000
39937040,333525414,1,96vujou,1863,2014-12-10 20:00:00 000
39937040,283875166,1,96vulsp,12685,2014-12-03 21:00:00 000
39937040,7117934,1,\N,1863,2014-12-06 10:00:00 000
39937040,357386057,1,96vulqh,9752,2014-12-15 13:00:00 000
39937040,253857550,1,96vulpa,8291,2014-12-15 11:00:00 000
39937040,61837156,1,96vulpk,10875,2014-11-26 19:00:00 000
39937040,84234904,1,96vujl0,1863,2014-11-25 18:00:00 000
39937040,390535122,1,96vulqh,7580,2014-12-16 10:00:00 000
39937040,18236155,1,96vulp7,6000,2014-11-27 13:00:00 000
39937040,181364213,1,96vulpe,9516,2014-12-10 10:00:00 000
39937040,247894113,1,96vulpm,5027,2014-12-11 14:00:00 000
39937040,245229122,1,96vulpw,6344,2014-11-24 09:00:00 000
39937040,48989806,1,96vulp4,6680,2014-11-20 16:00:00 000
39937040,107143031,1,96vulq7,6792,2014-11-30 12:00:00 000
39937040,16914890,1,96vulpl,3660,2014-11-27 12:00:00 000
39937040,37572732,1,96vulpc,3472,2014-12-04 12:00:00 000
39937040,108987541,1,96vulph,5399,2014-11-28 19:00:00 000
39937040,57056012,1,96vujof,5232,2014-12-16 21:00:00 000
39937040,226040647,1,96vujjf,6680,2014-11-20 08:00:00 000
39937040,80461423,1,96vulqq,1863,2014-11-28 20:00:00 000
39937040,391325707,1,96vulpn,11623,2014-12-09 18:00:00 000
39937040,340790369,1,96vulpb,8291,2014-12-13 09:00:00 000
39937040,241385690,1,96vujmd,13230,2014-12-04 13:00:00 000
39937040,245495579,1,96vulqj,9517,2014-12-04 18:00:00 000
39937040,24530520,1,96vulpj,7273,2014-12-10 21:00:00 000
39937040,190421334,1,96vulpk,5894,2014-12-11 19:00:00 000
39937040,391325707,1,96vujov,11623,2014-12-10 17:00:00 000
39937040,214839404,1,96vulqm,1863,2014-12-03 11:00:00 000
39937040,347124429,1,96vulp4,5399,2014-12-15 12:00:00 000
39937040,268009377,1,96vulp5,11981,2014-12-03 21:00:00 000
39937040,194203437,1,96vujon,9720,2014-12-01 13:00:00 000
39937040,27434029,1,96vujoc,5232,2014-12-18 13:00:00 000
39937040,259414029,1,\N,1863,2014-12-06 10:00:00 000
39937040,174373414,1,96vujog,5894,2014-12-04 18:00:00 000
39937040,219105682,1,96vulpg,1083,2014-12-03 19:00:00 000
39937040,181777399,1,96vulpf,2825,2014-12-04 18:00:00 000
39937040,85464108,1,96vulqq,5027,2014-12-04 20:00:00 000
39937040,215520590,1,96vulpv,10392,2014-12-09 18:00:00 000
39937040,256722576,1,96vulp4,1863,2014-12-14 19:00:00 000
39937040,112905305,1,96vulp4,1863,2014-11-28 20:00:00 000
39937040,88326756,1,96vulqw,1863,2014-11-28 21:00:00 000
39937040,366042021,1,96vulp1,14079,2014-12-16 15:00:00 000

推荐

''' # Created on 2016/05/05 @author: wwhhf '''
from __init__ import FreshCompItemRes
from py_odps_project import table_fresh_comp_item_res
from py_odps_op import write, execsql, truncate
import re

def genRecord(table, values):
    """Create a new record on *table* and fill its columns positionally.

    Args:
        table: object exposing ``new_record()`` (presumably an ODPS table
            handle — confirm against the py_odps wrappers).
        values: sequence of column values, assigned by index.

    Returns:
        The populated record.
    """
    # The old parameter name ``list`` shadowed the builtin; all callers in
    # this script pass arguments positionally, so the rename is safe.
    record = table.new_record()
    for i, v in enumerate(values):
        record[i] = v
    return record

# Recommendation driver (Python 2: note the `print` statements and ur''
# literal). For every user, take recent items as seeds and recommend the
# most probable next items from the transition-probability graph; fall back
# to the globally top-scored item when a seed has no outgoing edges.
if __name__ == '__main__':
# truncate table fresh_comp_item_res
    truncate(FreshCompItemRes)
# score max
    # Fallback recommendation: the single item with the highest score.
    maxx_score_item_id=execsql(r'select item_id,score as val from fresh_comp_item_buy order by val desc Limit 1')[0]['item_id']
    users=execsql(r'select user_id from fresh_comp_user_item')
    for user in users:
        user_id=user[0]
        # "item_id type,item_id type,..." string built by UserItemMapReduce.
        sql_res=execsql((r'select buy_item_ids from fresh_comp_user_item where user_id = %s')%user_id)[0]['buy_item_ids']
        # Strip the " <behavior-type digit>" suffix from every entry,
        # leaving a comma-separated list of item ids.
        item_ids,count=re.subn(ur'\s\d+?', '', str(sql_res))
        items_types=str(item_ids).split(",")
        # Seeds are the 3rd- and 2nd-last items; slice [-3:-1] excludes the
        # very last item -- NOTE(review): confirm that is intentional.
        for item_type in items_types[-3:-1]:
            item_id=str(item_type)
            # Top-2 most probable successors of item_id, excluding items the
            # user has already interacted with.
            next_items=execsql((r'select item_id_b,probability from fresh_comp_graph_prob where item_id_a = %s and item_id_b not in (%s) order by probability desc ,item_id_b asc Limit 2')%(item_id,str(item_ids)))
            flag=False
            for next_item in next_items:
                flag=True
                next_item_id=next_item[0]
                print [user_id,next_item_id]
                write(table_fresh_comp_item_res,genRecord(table_fresh_comp_item_res,list([user_id,next_item_id])))
            # No outgoing edge for this seed: use the top-scored fallback.
            if flag is False:
                print [user_id,maxx_score_item_id]
                write(table_fresh_comp_item_res,genRecord(table_fresh_comp_item_res,list([user_id,maxx_score_item_id])))

你可能感兴趣的:(ODPS MapReduce入门 - 推荐商品)