Requirement:
Group the records by movie id and, for each movie, output its top-20 rating records.
Sample data:
{"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}
{"movie":"661","rate":"3","timeStamp":"978302109","uid":"1"}
{"movie":"914","rate":"3","timeStamp":"978301968","uid":"1"}
{"movie":"3408","rate":"4","timeStamp":"978300275","uid":"1"}
{"movie":"2355","rate":"5","timeStamp":"978824291","uid":"1"}
{"movie":"1197","rate":"3","timeStamp":"978302268","uid":"1"}
{"movie":"1287","rate":"5","timeStamp":"978302039","uid":"1"}
{"movie":"2804","rate":"5","timeStamp":"978300719","uid":"1"}
{"movie":"594","rate":"4","timeStamp":"978302268","uid":"1"}
.....
There are two main ways to implement this. The first is the conventional approach: the map phase emits (Text, MovieBean) pairs keyed by movie id; the reduce phase aggregates the map output, iterates over the values, copies each bean into a list, sorts the list by rate in descending order, and writes out the first twenty records.
The code follows.
MovieBean:
package com.TopN4;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.codehaus.jackson.annotate.JsonIgnoreProperties;
@JsonIgnoreProperties(ignoreUnknown = true)
public class MovieBean implements WritableComparable<MovieBean> {
    // {"movie":"914","rate":"3","timeStamp":"978301968","uid":"1"}
    private String movie;
    private int rate;
    private String timeStamp;
    private String uid;
    @Override
    public int compareTo(MovieBean o) {
        // Primary order: movie id, so all records for one movie are contiguous
        // after the shuffle (required by the grouping comparator in the second
        // approach below).
        if (o.getMovie().compareTo(this.getMovie()) == 0) {
            // Secondary order: rate, descending.
            return o.getRate() - this.getRate();
        } else {
            return o.getMovie().compareTo(this.getMovie());
        }
    }
    @Override
    public void readFields(DataInput in) throws IOException {
        // Deserialize the fields in exactly the order they were written.
        movie = in.readUTF();
        rate = in.readInt();
        timeStamp = in.readUTF();
        uid = in.readUTF();
    }
    @Override
    public void write(DataOutput out) throws IOException {
        // Serialize the fields; the order here must match readFields().
        out.writeUTF(movie);
        out.writeInt(rate);
        out.writeUTF(timeStamp);
        out.writeUTF(uid);
    }
    public String getMovie() {
        return movie;
    }
    public void setMovie(String movie) {
        this.movie = movie;
    }
    public int getRate() {
        return rate;
    }
    public void setRate(int rate) {
        this.rate = rate;
    }
    public String getTimeStamp() {
        return timeStamp;
    }
    public void setTimeStamp(String timeStamp) {
        this.timeStamp = timeStamp;
    }
    public String getUid() {
        return uid;
    }
    public void setUid(String uid) {
        this.uid = uid;
    }
    @Override
    public String toString() {
        return "MovieBean [movie=" + movie + ", rate=" + rate + ", timeStamp=" + timeStamp + ", uid=" + uid + "]";
    }
    public void set(String movie, int rate, String timeStamp, String uid) {
        this.movie = movie;
        this.rate = rate;
        this.timeStamp = timeStamp;
        this.uid = uid;
    }
    // Hadoop needs a public no-arg constructor to instantiate the bean
    // during deserialization.
    public MovieBean() {
        super();
    }
    public MovieBean(String movie, int rate, String timeStamp, String uid) {
        super();
        this.movie = movie;
        this.rate = rate;
        this.timeStamp = timeStamp;
        this.uid = uid;
    }
    public void set(MovieBean movieBean) {
        this.movie = movieBean.getMovie();
        this.rate = movieBean.getRate();
        this.timeStamp = movieBean.getTimeStamp();
        this.uid = movieBean.getUid();
    }
}
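As a quick sanity check of the Jackson mapping (the old org.codehaus.jackson API used throughout this post), a single input line parses into the bean as shown below. This standalone snippet is only illustrative and is not part of either job; note that Jackson coerces the quoted "rate" value into the bean's int field:
ObjectMapper om = new ObjectMapper(); // org.codehaus.jackson.map.ObjectMapper
MovieBean b = om.readValue(
        "{\"movie\":\"1193\",\"rate\":\"5\",\"timeStamp\":\"978300760\",\"uid\":\"1\"}",
        MovieBean.class);
System.out.println(b); // MovieBean [movie=1193, rate=5, timeStamp=978300760, uid=1]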
package com.avg;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.codehaus.jackson.map.ObjectMapper;
import com.TopN4.MovieBean; // MovieBean lives in the com.TopN4 package
/**
 * Top-20 rating records for each movie.
 */
public class TopN2 {
    public static class MapTask extends Mapper<LongWritable, Text, Text, MovieBean> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            try {
                ObjectMapper objectMapper = new ObjectMapper();
                MovieBean bean = objectMapper.readValue(value.toString(), MovieBean.class);
                String movie = bean.getMovie();
                if (movie != null) {
                    context.write(new Text(movie), bean);
                } else {
                    System.err.println("record without a movie field: " + value);
                }
            } catch (Exception e) {
                // Skip lines that are not valid JSON.
            }
        }
    }
    public static class ReduceTask extends Reducer<Text, MovieBean, MovieBean, NullWritable> {
        @Override
        protected void reduce(Text key, Iterable<MovieBean> values, Context context)
                throws IOException, InterruptedException {
            List<MovieBean> list = new ArrayList<>();
            for (MovieBean movieBean : values) {
                // Copy the bean: Hadoop reuses the same object on every step of
                // the iteration, so storing the reference directly would leave
                // the list full of identical entries.
                MovieBean bean = new MovieBean(movieBean.getMovie(), movieBean.getRate(),
                        movieBean.getTimeStamp(), movieBean.getUid());
                list.add(bean);
            }
            // Sort by rate, descending, then emit the first twenty records.
            Collections.sort(list, new Comparator<MovieBean>() {
                @Override
                public int compare(MovieBean o1, MovieBean o2) {
                    return o2.getRate() - o1.getRate();
                }
            });
            for (int i = 0; i < Math.min(20, list.size()); i++) {
                context.write(list.get(i), NullWritable.get());
            }
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(TopN2.class);
        job.setMapperClass(MapTask.class);
        job.setReducerClass(ReduceTask.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(MovieBean.class);
        job.setOutputKeyClass(MovieBean.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path("d://data//rating.json"));
        FileOutputFormat.setOutputPath(job, new Path("d://out//rat2"));
        // Delete the output directory if it already exists (local mode only);
        // the job fails at startup when the output path is already present.
        File file = new File("d://out//rat2");
        if (file.exists()) {
            FileUtils.deleteDirectory(file);
        }
        boolean completion = job.waitForCompletion(true);
        System.out.println(completion ? "job succeeded" : "job failed");
    }
}
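One caveat with this reducer: it buffers every rating for a movie in memory before sorting, which can get expensive for heavily-rated movies. A minimal sketch of a bounded alternative, keeping only the current top 20 in a min-heap; the class name BoundedReduceTask and the drop-in structure are illustrative, not part of the original:
    // Memory-bounded variant (same job wiring as ReduceTask above;
    // add "import java.util.PriorityQueue;" to the imports).
    public static class BoundedReduceTask extends Reducer<Text, MovieBean, MovieBean, NullWritable> {
        @Override
        protected void reduce(Text key, Iterable<MovieBean> values, Context context)
                throws IOException, InterruptedException {
            // Min-heap on rate: the root is always the weakest of the current top 20.
            PriorityQueue<MovieBean> heap =
                    new PriorityQueue<>(Comparator.comparingInt(MovieBean::getRate));
            for (MovieBean movieBean : values) {
                // Copy the bean: Hadoop reuses the object across iterations.
                MovieBean bean = new MovieBean(movieBean.getMovie(), movieBean.getRate(),
                        movieBean.getTimeStamp(), movieBean.getUid());
                heap.offer(bean);
                if (heap.size() > 20) {
                    heap.poll(); // evict the lowest-rated bean
                }
            }
            // Drain the heap and emit in descending rate order.
            List<MovieBean> top = new ArrayList<>(heap);
            top.sort((o1, o2) -> o2.getRate() - o1.getRate());
            for (MovieBean bean : top) {
                context.write(bean, NullWritable.get());
            }
        }
    }
With this shape, memory per movie stays at roughly 20 beans instead of one bean per rating.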
The second approach: the map phase emits the whole MovieBean as the key, and the reduce phase does the grouping, relying on a custom partitioner and grouping comparator so that all beans for one movie arrive at the same reducer and form a single reduce group, already sorted by MovieBean.compareTo().
The code follows.
package com.TopN4;
import java.io.File;
import java.io.IOException;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.codehaus.jackson.map.ObjectMapper;
public class TopN4 {
    public static class MapTask extends Mapper<LongWritable, Text, MovieBean, NullWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            ObjectMapper mapper = new ObjectMapper();
            MovieBean bean = mapper.readValue(value.toString(), MovieBean.class);
            // The whole bean is the key; the shuffle sorts the keys via
            // MovieBean.compareTo() (movie first, then rate descending).
            context.write(bean, NullWritable.get());
        }
    }
    public static class ReduceTask extends Reducer<MovieBean, NullWritable, MovieBean, NullWritable> {
        @Override
        protected void reduce(MovieBean key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            int num = 0;
            // Iterating the values works here because the records are still stored
            // as individual key-value pairs, not as one key followed by many values.
            // Although every value is NullWritable, the framework updates the
            // contents of the key object on each step of the iteration, so writing
            // "key" inside the loop emits a different (rate-ordered) bean each time.
            for (NullWritable nullWritable : values) {
                if (num >= 20) {
                    break;
                }
                num++;
                context.write(key, NullWritable.get());
            }
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "topn4");
        // Configure the mapper, the reducer, and the jar to submit.
        job.setMapperClass(MapTask.class);
        job.setReducerClass(ReduceTask.class);
        job.setJarByClass(TopN4.class);
        job.setNumReduceTasks(2); // number of reduce tasks
        job.setPartitionerClass(MyPartition.class); // partitioning rule
        job.setGroupingComparatorClass(MyGroup.class); // grouping rule
        // Output types.
        job.setMapOutputKeyClass(MovieBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(MovieBean.class);
        job.setOutputValueClass(NullWritable.class);
        // Input and output paths.
        FileInputFormat.addInputPath(job, new Path("d:/data/rating.json"));
        FileOutputFormat.setOutputPath(job, new Path("d:\\data\\out\\topN4"));
        // Delete the output directory if it already exists (local mode only).
        File file = new File("d:\\data\\out\\topN4");
        if (file.exists()) {
            FileUtils.deleteDirectory(file);
        }
        // Submit the job and wait for it to finish.
        boolean completion = job.waitForCompletion(true);
        System.out.println(completion ? "Nice work!!!" : "Go fix that bug!!");
    }
}
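The MyPartition and MyGroup classes wired into the job above are not shown in this post. Below is a minimal sketch of what they need to do, assuming both key off the movie field (the exact originals may differ): the partitioner must route all beans for one movie to the same reducer, and the grouping comparator must make consecutive beans with the same movie id form a single reduce group.
package com.TopN4;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;
// Route every bean for the same movie to the same reduce task; otherwise
// a movie's records would be split across the two reducers.
public class MyPartition extends Partitioner<MovieBean, NullWritable> {
    @Override
    public int getPartition(MovieBean key, NullWritable value, int numPartitions) {
        return (key.getMovie().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}
package com.TopN4;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
// Compare beans by movie only, so that one reduce() call sees all beans
// for a movie, already sorted by rate via MovieBean.compareTo().
public class MyGroup extends WritableComparator {
    public MyGroup() {
        super(MovieBean.class, true); // true: create bean instances for comparison
    }
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        MovieBean m1 = (MovieBean) a;
        MovieBean m2 = (MovieBean) b;
        return m1.getMovie().compareTo(m2.getMovie());
    }
}
Together with MovieBean.compareTo() (movie first, then rate descending), this is the standard secondary-sort pattern: the sort puts each movie's beans together in rate order, the partitioner keeps them on one reducer, and the grouping comparator turns them into a single reduce() call.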