用户通过在jar命令中设置-local参数,在本地模拟MapReduce的运行过程,从而进行本地调试。本地运行时, 客户端会从ODPS中下载本地调试所需要的输入表的元信息、数据,所需要的资源以及输出表的元信息,并将这些信息保存到一个名为warehouse的本地目录中。 在程序运行结束后,会将计算结果输出到warehouse目录内的一个文件中。如果本地的warehouse目录下已经下载了输入表及被引用的资源,在下一次运行时, 会直接引用warehouse下的数据及文件,而不会重复下载。
在本地运行过程中,仍然会启动多个Map及Reduce进程处理数据,但这些进程不是并发运行,而是依次串行运行。此外这个模拟运行过程与真正的分布式运行有如下差别:
package mydemo;
import java.io.IOException;
import java.util.Iterator;
import com.aliyun.odps.OdpsException;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.data.TableInfo;
import com.aliyun.odps.mapred.JobClient;
import com.aliyun.odps.mapred.MapperBase;
import com.aliyun.odps.mapred.ReducerBase;
import com.aliyun.odps.mapred.RunningJob;
import com.aliyun.odps.mapred.conf.JobConf;
import com.aliyun.odps.mapred.utils.InputUtils;
import com.aliyun.odps.mapred.utils.OutputUtils;
import com.aliyun.odps.mapred.utils.SchemaUtils;
/**
 * Counts, per (item_id, item_category) pair, how often each behavior type
 * occurred. Model: item_id + item_category -> browse / collect /
 * add-to-cart / purchase counts (behavior types 1..4).
 *
 * @author wwhhf
 */
public class ItemBuyMapReduce {

    public static class ItemBuyMapper extends MapperBase {
        private Record key = null;
        private Record value = null;

        @Override
        public void setup(TaskContext context) throws IOException {
            key = context.createMapOutputKeyRecord();
            value = context.createMapOutputValueRecord();
        }

        @Override
        public void map(long recordNum, Record record, TaskContext context)
                throws IOException {
            // Expect the 6-column user-behavior table; silently skip malformed rows.
            if (record.getColumnCount() == 6) {
                Long item_id = record.getBigint(1);
                Long item_category = record.getBigint(4);
                Long behavior_type = record.getBigint(2);
                key.setBigint(0, item_id);
                key.setBigint(1, item_category);
                value.setBigint(0, behavior_type);
                context.write(key, value);
            }
        }
    }

    public static class ItemBuyReducer extends ReducerBase {
        private Record output = null;

        @Override
        public void setup(TaskContext context) throws IOException {
            output = context.createOutputRecord();
        }

        @Override
        public void reduce(Record key, Iterator<Record> values,
                TaskContext context) throws IOException {
            // cnt[t - 1] = occurrences of behavior type t (1..4).
            // Primitive array avoids the Long autoboxing of the original.
            long[] cnt = new long[4];
            while (values.hasNext()) {
                Record val = values.next();
                Long type = val.getBigint(0);
                // Guard against null / out-of-range types, which previously
                // caused NullPointerException / ArrayIndexOutOfBoundsException.
                if (type != null && type >= 1L && type <= 4L) {
                    cnt[(int) (type - 1)]++;
                }
            }
            // item_id
            output.set(0, key.get(0));
            // item_cat
            output.set(1, key.get(1));
            // browse count
            output.set(2, cnt[0]);
            // collect count
            output.set(3, cnt[1]);
            // add-to-cart count
            output.set(4, cnt[2]);
            // purchase count
            output.set(5, cnt[3]);
            context.write(output);
        }
    }

    public static void main(String[] args) throws OdpsException {
        JobConf job = new JobConf();
        job.setMapOutputKeySchema(SchemaUtils.fromString("item_id:bigint,"
                + "item_category:bigint"));
        // The mapper emits a single bigint (the raw behavior type). The original
        // 4-column bigint/boolean value schema was never populated beyond
        // column 0 and only wasted shuffle space.
        job.setMapOutputValueSchema(SchemaUtils.fromString("type:bigint"));
        InputUtils.addTable(
                TableInfo.builder().tableName("tianchi_fresh_comp_train_user")
                        .build(), job);
        OutputUtils.addTable(
                TableInfo.builder().tableName("fresh_comp_item_buy").build(),
                job);
        job.setMapperClass(ItemBuyMapper.class);
        job.setReducerClass(ItemBuyReducer.class);
        RunningJob rj = JobClient.runJob(job);
        rj.waitForCompletion();
    }
}
package fresh_comp_offline;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import com.aliyun.odps.OdpsException;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.data.TableInfo;
import com.aliyun.odps.mapred.JobClient;
import com.aliyun.odps.mapred.MapperBase;
import com.aliyun.odps.mapred.ReducerBase;
import com.aliyun.odps.mapred.RunningJob;
import com.aliyun.odps.mapred.conf.JobConf;
import com.aliyun.odps.mapred.utils.InputUtils;
import com.aliyun.odps.mapred.utils.OutputUtils;
import com.aliyun.odps.mapred.utils.SchemaUtils;
/**
 * For every user, collects all (item_id, behavior_type, time) events, sorts
 * them by time ascending, and emits one row per user: user_id plus a
 * comma-separated "item_id type" sequence.
 *
 * @author wwhhf
 */
public class UserItemMapReduce {

    /** One behavior event: which item, which behavior type, and when. */
    private static class Node {
        private Long item_id;
        private Long type;
        private Date time;

        public Long getType() {
            return type;
        }

        public Long getItem_id() {
            return item_id;
        }

        public Date getTime() {
            return time;
        }

        public Node(Long item_id, Long type, Date time) {
            super();
            this.item_id = item_id;
            this.type = type;
            this.time = time;
        }
    }

    /** Orders events by timestamp ascending. */
    private static class MyComparator implements Comparator<Node> {
        @Override
        public int compare(Node o1, Node o2) {
            return o1.getTime().compareTo(o2.getTime());
        }
    }

    private static class UserItemMapper extends MapperBase {
        private Record key = null;
        private Record value = null;

        @Override
        public void setup(TaskContext context) throws IOException {
            key = context.createMapOutputKeyRecord();
            value = context.createMapOutputValueRecord();
        }

        @Override
        public void map(long recordNum, Record record, TaskContext context)
                throws IOException {
            // Expect the 6-column user-behavior table; skip malformed rows.
            if (record.getColumnCount() == 6) {
                key.set(0, record.getBigint(0));       // user_id
                value.set(0, record.getBigint(1));     // item_id
                value.set(1, record.getBigint(2));     // behavior type
                value.set(2, record.getDatetime(5));   // event time
                context.write(key, value);
            }
        }
    }

    private static class UserItemReducer extends ReducerBase {
        private Record output = null;

        @Override
        public void setup(TaskContext context) throws IOException {
            output = context.createOutputRecord();
        }

        @Override
        public void reduce(Record key, Iterator<Record> values,
                TaskContext context) throws IOException {
            // StringBuilder: no synchronization needed in a single reduce call
            // (the original used StringBuffer).
            StringBuilder sb = new StringBuilder();
            List<Node> list = new ArrayList<>();
            while (values.hasNext()) {
                Record val = values.next();
                list.add(new Node(val.getBigint(0), val.getBigint(1), val
                        .getDatetime(2)));
            }
            Collections.sort(list, new MyComparator());
            for (Node item : list) {
                // "item_id type," per event, time-ascending
                sb.append(item.getItem_id()).append(" ").append(item.getType())
                        .append(",");
            }
            output.set(0, key.getBigint(0));
            // Drop the trailing comma; the guard keeps an (unexpected) empty
            // value list from throwing StringIndexOutOfBoundsException.
            output.set(1, sb.length() > 0 ? sb.substring(0, sb.length() - 1)
                    : "");
            context.write(output);
        }
    }

    public static void main(String[] args) throws OdpsException {
        JobConf job = new JobConf();
        job.setMapOutputKeySchema(SchemaUtils.fromString("user_id:bigint"));
        job.setMapOutputValueSchema(SchemaUtils
                .fromString("item_id:bigint,type:bigint,time:datetime"));
        InputUtils.addTable(
                TableInfo.builder().tableName("tianchi_fresh_comp_train_user")
                        .build(), job);
        OutputUtils.addTable(
                TableInfo.builder().tableName("fresh_comp_user_item").build(),
                job);
        job.setMapperClass(UserItemMapper.class);
        job.setReducerClass(UserItemReducer.class);
        RunningJob rj = JobClient.runJob(job);
        rj.waitForCompletion();
    }
}
package fresh_comp_offline;
import java.io.IOException;
import java.util.Iterator;
import com.aliyun.odps.OdpsException;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.data.TableInfo;
import com.aliyun.odps.mapred.JobClient;
import com.aliyun.odps.mapred.MapperBase;
import com.aliyun.odps.mapred.ReducerBase;
import com.aliyun.odps.mapred.RunningJob;
import com.aliyun.odps.mapred.conf.JobConf;
import com.aliyun.odps.mapred.utils.InputUtils;
import com.aliyun.odps.mapred.utils.OutputUtils;
import com.aliyun.odps.mapred.utils.SchemaUtils;
/**
 * Builds a directed item graph from each user's time-ordered behavior
 * sequence: for every consecutive pair (a, b) an edge a->b is emitted,
 * plus a sentinel edge a->(-1) accumulating a's total outgoing weight.
 *
 * @author wwhhf
 */
public class GenGraphMapReduce {

    private static class GenGraphMapper extends MapperBase {
        private Record key = null;
        private Record value = null;

        /** Stores the given numeric string into the single-column value record. */
        private Record asWeight(String num) {
            value.setBigint(0, Long.valueOf(num));
            return value;
        }

        @Override
        public void setup(TaskContext context) throws IOException {
            key = context.createMapOutputKeyRecord();
            value = context.createMapOutputValueRecord();
        }

        @Override
        public void map(long recordNum, Record record, TaskContext context)
                throws IOException {
            // Column 1 holds "item_id type,item_id type,..." (time ascending).
            String[] events = record.getString(1).split(",");
            for (int i = events.length - 1; i >= 1; i--) {
                String[] from = events[i - 1].split(" ");
                String[] to = events[i].split(" ");
                // source item a
                key.set(0, Long.valueOf(from[0]));
                // target item b
                key.set(1, Long.valueOf(to[0]));
                // edge a -> b, weighted by b's behavior type
                context.write(key, asWeight(to[1]));
                // sentinel: a -> any item
                key.set(1, -1L);
                context.write(key, asWeight(to[1]));
            }
        }
    }

    private static class GenGraphReducer extends ReducerBase {
        private Record output = null;

        @Override
        public void setup(TaskContext context) throws IOException {
            output = context.createOutputRecord();
        }

        @Override
        public void reduce(Record key, Iterator<Record> values,
                TaskContext context) throws IOException {
            // Total weight of all parallel edges between this (a, b) pair.
            long total = 0L;
            while (values.hasNext()) {
                total += values.next().getBigint(0);
            }
            output.set(0, key.get(0));
            output.set(1, key.get(1));
            output.set(2, total);
            context.write(output);
        }
    }

    public static void main(String[] args) throws OdpsException {
        JobConf conf = new JobConf();
        conf.setMapOutputKeySchema(SchemaUtils
                .fromString("item_id_a:bigint,item_id_b:bigint"));
        conf.setMapOutputValueSchema(SchemaUtils.fromString("count:bigint"));
        InputUtils.addTable(
                TableInfo.builder().tableName("fresh_comp_user_item").build(),
                conf);
        OutputUtils.addTable(
                TableInfo.builder().tableName("fresh_comp_gengraph").build(),
                conf);
        conf.setMapperClass(GenGraphMapper.class);
        conf.setReducerClass(GenGraphReducer.class);
        JobClient.runJob(conf).waitForCompletion();
    }
}
package fresh_comp_offline;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import com.aliyun.odps.OdpsException;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.data.TableInfo;
import com.aliyun.odps.mapred.JobClient;
import com.aliyun.odps.mapred.MapperBase;
import com.aliyun.odps.mapred.ReducerBase;
import com.aliyun.odps.mapred.RunningJob;
import com.aliyun.odps.mapred.conf.JobConf;
import com.aliyun.odps.mapred.utils.InputUtils;
import com.aliyun.odps.mapred.utils.OutputUtils;
import com.aliyun.odps.mapred.utils.SchemaUtils;
/**
 * Normalizes the edge weights produced by the graph-generation job: for each
 * source item a, divides every a->b weight by a's total outgoing weight
 * (carried on the sentinel a->(-1) row) to obtain transition probabilities.
 */
public class CalcProbMapReduce {

    /** An outgoing edge: destination item and its accumulated weight. */
    private static class Node {
        private Long dest = null;
        private Long count = null;

        public Node(Long dest, Long count) {
            super();
            this.dest = dest;
            this.count = count;
        }

        public Long getDest() {
            return dest;
        }

        public Long getCount() {
            return count;
        }

        @Override
        public String toString() {
            return "Node [dest=" + dest + ", count=" + count + "]";
        }
    }

    private static class CalcProbMapper extends MapperBase {
        private Record key = null;
        private Record value = null;

        @Override
        public void setup(TaskContext context) throws IOException {
            key = context.createMapOutputKeyRecord();
            value = context.createMapOutputValueRecord();
        }

        @Override
        public void map(long recordNum, Record record, TaskContext context)
                throws IOException {
            // source item a
            key.set(0, record.get(0));
            // target item b
            value.set(0, record.get(1));
            // accumulated edge weight
            value.set(1, record.get(2));
            context.write(key, value);
        }
    }

    private static class CalcProbReducer extends ReducerBase {
        private Record output = null;

        @Override
        public void setup(TaskContext context) throws IOException {
            output = context.createOutputRecord();
        }

        @Override
        public void reduce(Record key, Iterator<Record> values,
                TaskContext context) throws IOException {
            // NOTE(review): values are copied into Node objects rather than kept
            // as Records — presumably because the framework reuses the Record
            // instance returned by the iterator; confirm against the ODPS SDK.
            List<Node> edges = new ArrayList<>();
            Long sum = 0L;
            while (values.hasNext()) {
                Record val = values.next();
                Long dest = val.getBigint(0);
                if (dest == -1L) {
                    // Sentinel row a->(-1): total outgoing weight of node a.
                    sum = val.getBigint(1);
                } else {
                    edges.add(new Node(dest, val.getBigint(1)));
                }
            }
            // Guard: without the sentinel row, sum stays 0 and the division
            // below would emit Infinity/NaN probabilities. Skip this key.
            if (sum == null || sum == 0L) {
                return;
            }
            for (Node node : edges) {
                output.set(0, key.get(0));
                output.set(1, node.getDest());
                output.set(2, node.getCount() * 1.0 / sum);
                context.write(output);
            }
        }
    }

    public static void main(String[] args) throws OdpsException {
        JobConf job = new JobConf();
        job.setMapOutputKeySchema(SchemaUtils.fromString("a:bigint"));
        job.setMapOutputValueSchema(SchemaUtils
                .fromString("b:bigint,count:bigint"));
        InputUtils.addTable(TableInfo.builder()
                .tableName("fresh_comp_gengraph").build(), job);
        OutputUtils.addTable(
                TableInfo.builder().tableName("fresh_comp_graph_prob").build(),
                job);
        job.setMapperClass(CalcProbMapper.class);
        job.setReducerClass(CalcProbReducer.class);
        RunningJob rj = JobClient.runJob(job);
        rj.waitForCompletion();
    }
}
39937040,56116303,1,9639p90,3544,2014-12-07 16:00:00 000
39937040,207737677,1,\N,5399,2014-12-10 09:00:00 000
39937040,172557089,1,96vulpw,3673,2014-12-15 12:00:00 000
39937040,230120970,1,96vujov,3528,2014-12-18 19:00:00 000
39937040,223424265,1,96vulqh,14079,2014-12-01 13:00:00 000
39937040,376053019,1,96vulsp,1838,2014-12-14 19:00:00 000
39937040,181946551,1,\N,1863,2014-12-06 10:00:00 000
39937040,153238305,1,96vulpe,8650,2014-12-12 12:00:00 000
39937040,126496010,1,96vujog,2953,2014-11-19 18:00:00 000
39937040,57167294,1,96vulqo,1863,2014-12-04 18:00:00 000
39937040,184420624,1,96vulp5,1863,2014-12-04 20:00:00 000
39937040,291670600,1,96vujo2,10894,2014-12-11 15:00:00 000
39937040,24900548,1,96vulp5,9541,2014-12-16 15:00:00 000
39937040,211568769,1,96vujod,9664,2014-12-12 21:00:00 000
39937040,383474182,1,96vujoh,6680,2014-11-20 08:00:00 000
39937040,89070719,1,96vujoa,7440,2014-12-12 21:00:00 000
39937040,264988743,1,96vulpu,1863,2014-12-16 12:00:00 000
39937040,264988743,1,96vulp4,1863,2014-12-16 12:00:00 000
39937040,282823170,1,96vulpj,13230,2014-12-17 17:00:00 000
39937040,63926863,1,96vulql,12560,2014-12-12 21:00:00 000
39937040,38038876,1,96vulph,11279,2014-11-27 10:00:00 000
39937040,178319842,1,96vulqk,2993,2014-12-15 17:00:00 000
39937040,178319842,1,96vulpj,2993,2014-12-10 17:00:00 000
39937040,209733546,1,96vulqr,1863,2014-11-28 20:00:00 000
39937040,225165987,3,96vulpn,1863,2014-11-28 21:00:00 000
39937040,378316358,1,96vulpc,13230,2014-12-09 17:00:00 000
39937040,176579443,1,96vulq7,1863,2014-11-28 20:00:00 000
39937040,323270984,1,96vujog,5232,2014-11-27 08:00:00 000
39937040,89182549,1,96vulp4,11721,2014-11-27 12:00:00 000
39937040,383474182,1,96vul9m,6680,2014-11-20 08:00:00 000
39937040,80991439,1,96vulqw,1863,2014-12-02 10:00:00 000
39937040,232219004,3,96vulpq,437,2014-12-10 17:00:00 000
39937040,235787727,1,96vulpw,6402,2014-12-06 11:00:00 000
39937040,235787727,1,96vul9m,6402,2014-12-03 14:00:00 000
39937040,21472534,1,96vulp5,5399,2014-12-14 19:00:00 000
39937040,138098039,1,\N,630,2014-12-10 09:00:00 000
39937040,171379996,1,96vujo7,11279,2014-11-18 19:00:00 000
39937040,365539187,1,96vulpw,3673,2014-11-27 21:00:00 000
39937040,278607965,1,96vujo4,10894,2014-12-11 15:00:00 000
39937040,390697266,1,96vulph,10894,2014-12-03 20:00:00 000
39937040,349933473,1,96vulpq,8874,2014-12-02 20:00:00 000
39937040,87480270,1,\N,2730,2014-12-06 18:00:00 000
39937040,235787727,1,96vujou,6402,2014-12-02 10:00:00 000
39937040,212790200,1,96vulpc,3673,2014-12-10 17:00:00 000
39937040,235787727,1,96vulp0,6402,2014-12-03 14:00:00 000
39937040,133421819,1,96vulpc,6402,2014-12-15 13:00:00 000
39937040,392416970,1,96vulpr,1863,2014-12-10 20:00:00 000
39937040,388400764,1,96vulqh,6513,2014-12-10 20:00:00 000
39937040,94439719,1,96vulqj,3673,2014-12-10 17:00:00 000
39937040,349516687,1,96vulph,8854,2014-11-23 19:00:00 000
39937040,340369269,1,96vulpg,11623,2014-12-10 17:00:00 000
39937040,84234904,1,96vulp1,1863,2014-11-22 19:00:00 000
39937040,92916363,1,96vulp7,1863,2014-12-10 16:00:00 000
39937040,34102362,1,96vujof,1863,2014-12-15 12:00:00 000
39937040,185782186,1,\N,1863,2014-12-06 09:00:00 000
39937040,30729236,1,96vulq6,5894,2014-11-30 17:00:00 000
39937040,112066266,1,96vulpm,2513,2014-12-15 17:00:00 000
39937040,333525414,1,\N,1863,2014-12-06 10:00:00 000
39937040,333525414,1,96vujou,1863,2014-12-10 20:00:00 000
39937040,283875166,1,96vulsp,12685,2014-12-03 21:00:00 000
39937040,7117934,1,\N,1863,2014-12-06 10:00:00 000
39937040,357386057,1,96vulqh,9752,2014-12-15 13:00:00 000
39937040,253857550,1,96vulpa,8291,2014-12-15 11:00:00 000
39937040,61837156,1,96vulpk,10875,2014-11-26 19:00:00 000
39937040,84234904,1,96vujl0,1863,2014-11-25 18:00:00 000
39937040,390535122,1,96vulqh,7580,2014-12-16 10:00:00 000
39937040,18236155,1,96vulp7,6000,2014-11-27 13:00:00 000
39937040,181364213,1,96vulpe,9516,2014-12-10 10:00:00 000
39937040,247894113,1,96vulpm,5027,2014-12-11 14:00:00 000
39937040,245229122,1,96vulpw,6344,2014-11-24 09:00:00 000
39937040,48989806,1,96vulp4,6680,2014-11-20 16:00:00 000
39937040,107143031,1,96vulq7,6792,2014-11-30 12:00:00 000
39937040,16914890,1,96vulpl,3660,2014-11-27 12:00:00 000
39937040,37572732,1,96vulpc,3472,2014-12-04 12:00:00 000
39937040,108987541,1,96vulph,5399,2014-11-28 19:00:00 000
39937040,57056012,1,96vujof,5232,2014-12-16 21:00:00 000
39937040,226040647,1,96vujjf,6680,2014-11-20 08:00:00 000
39937040,80461423,1,96vulqq,1863,2014-11-28 20:00:00 000
39937040,391325707,1,96vulpn,11623,2014-12-09 18:00:00 000
39937040,340790369,1,96vulpb,8291,2014-12-13 09:00:00 000
39937040,241385690,1,96vujmd,13230,2014-12-04 13:00:00 000
39937040,245495579,1,96vulqj,9517,2014-12-04 18:00:00 000
39937040,24530520,1,96vulpj,7273,2014-12-10 21:00:00 000
39937040,190421334,1,96vulpk,5894,2014-12-11 19:00:00 000
39937040,391325707,1,96vujov,11623,2014-12-10 17:00:00 000
39937040,214839404,1,96vulqm,1863,2014-12-03 11:00:00 000
39937040,347124429,1,96vulp4,5399,2014-12-15 12:00:00 000
39937040,268009377,1,96vulp5,11981,2014-12-03 21:00:00 000
39937040,194203437,1,96vujon,9720,2014-12-01 13:00:00 000
39937040,27434029,1,96vujoc,5232,2014-12-18 13:00:00 000
39937040,259414029,1,\N,1863,2014-12-06 10:00:00 000
39937040,174373414,1,96vujog,5894,2014-12-04 18:00:00 000
39937040,219105682,1,96vulpg,1083,2014-12-03 19:00:00 000
39937040,181777399,1,96vulpf,2825,2014-12-04 18:00:00 000
39937040,85464108,1,96vulqq,5027,2014-12-04 20:00:00 000
39937040,215520590,1,96vulpv,10392,2014-12-09 18:00:00 000
39937040,256722576,1,96vulp4,1863,2014-12-14 19:00:00 000
39937040,112905305,1,96vulp4,1863,2014-11-28 20:00:00 000
39937040,88326756,1,96vulqw,1863,2014-11-28 21:00:00 000
39937040,366042021,1,96vulp1,14079,2014-12-16 15:00:00 000
''' # Created on 2016/05/05 @author: wwhhf '''
from __init__ import FreshCompItemRes
from py_odps_project import table_fresh_comp_item_res
from py_odps_op import write, execsql, truncate
import re
def genRecord(table, list):
    """Create a new record for `table` and fill its leading columns from `list`.

    Columns beyond len(list) keep whatever default new_record() gave them.

    NOTE(review): the parameter name `list` shadows the builtin; it is kept
    unchanged so existing callers are unaffected.
    """
    record = table.new_record()
    # enumerate() replaces the index-by-range anti-pattern of the original.
    for i, value in enumerate(list):
        record[i] = value
    return record
# Recommendation driver (Python 2: `print` statements and ur'' literals below).
# For each user, takes recent purchased/browsed items and recommends the most
# probable successor items from the fresh_comp_graph_prob transition graph,
# falling back to the globally top-scored item when the graph has no candidate.
if __name__ == '__main__':
    # truncate table fresh_comp_item_res
    truncate(FreshCompItemRes)
    # score max
    # Fallback recommendation: the single item with the highest global score.
    maxx_score_item_id=execsql(r'select item_id,score as val from fresh_comp_item_buy order by val desc Limit 1')[0]['item_id']
    users=execsql(r'select user_id from fresh_comp_user_item')
    for user in users:
        user_id=user[0]
        # "item_id type,item_id type,..." string for this user, time-ascending.
        sql_res=execsql((r'select buy_item_ids from fresh_comp_user_item where user_id = %s')%user_id)[0]['buy_item_ids']
        # Strip the " type" suffixes, leaving only comma-separated item ids.
        item_ids,count=re.subn(ur'\s\d+?', '', str(sql_res))
        items_types=str(item_ids).split(",")
        # NOTE(review): [-3:-1] takes the 3rd- and 2nd-to-last items and skips
        # the most recent one -- confirm whether [-3:] was intended.
        for item_type in items_types[-3:-1]:
            item_id=str(item_type)
            # Top-2 most probable successors not already seen by this user.
            # NOTE(review): SQL built via % formatting -- acceptable only while
            # the interpolated ids come from our own tables, not user input.
            next_items=execsql((r'select item_id_b,probability from fresh_comp_graph_prob where item_id_a = %s and item_id_b not in (%s) order by probability desc ,item_id_b asc Limit 2')%(item_id,str(item_ids)))
            flag=False
            for next_item in next_items:
                flag=True
                next_item_id=next_item[0]
                print [user_id,next_item_id]
                write(table_fresh_comp_item_res,genRecord(table_fresh_comp_item_res,list([user_id,next_item_id])))
            # No graph-based candidate: fall back to the global best item.
            if flag is False:
                print [user_id,maxx_score_item_id]
                write(table_fresh_comp_item_res,genRecord(table_fresh_comp_item_res,list([user_id,maxx_score_item_id])))