Hadoop中利用自身类实现json格式转换为对象。

假如在大数据处理中我们遇到了下面JSON格式的数据,想统计每部电影rate的总和,比较正规的做法是把每一行JSON解析成对象。虽然也可以通过导入JSON-Simple等jar包的方式处理,但实际上最方便的还是直接利用Hadoop自带的 org.codehaus.jackson.map.ObjectMapper 类来处理。

{"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}
{"movie":"661","rate":"3","timeStamp":"978302109","uid":"1"}
{"movie":"914","rate":"3","timeStamp":"978301968","uid":"1"}
{"movie":"3408","rate":"4","timeStamp":"978300275","uid":"1"}
{"movie":"2355","rate":"5","timeStamp":"978824291","uid":"1"}
{"movie":"1197","rate":"3","timeStamp":"978302268","uid":"1"}

首先你需要定义一个JavaBean用来存储数据。这个bean需要实现Writable接口,因为对象在网络上传输之前必须先进行序列化。

不了解序列化的同学,可以复习一下java序列化的知识。

package com.mre.moviecount;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

/**
 * Value object holding one movie-rating record, serializable through
 * Hadoop's {@link Writable} contract.
 *
 * <p>The four fields carry the same names as the keys of the sample JSON
 * records ({@code movie}, {@code rate}, {@code timeStamp}, {@code uid}).
 * The accessor names are kept exactly as they are because the JSON binding
 * presumably derives property names from them (verify before renaming).
 */
public class MovieBean implements Writable {

	private long movie;
	private long rate;
	private long timeStamp;
	private long uid;

	/** Creates an empty bean; every field keeps its default value of 0. */
	public MovieBean() {
	}

	/**
	 * Creates a bean that carries only a movie id and a rate value.
	 * {@code timeStamp} and {@code uid} stay at 0.
	 *
	 * @param movie the movie id
	 * @param rate  the rate value (a single rating or an aggregated sum)
	 */
	public MovieBean(long movie, long rate) {
		this.movie = movie;
		this.rate = rate;
	}

	/** @return the movie id */
	public long getMovie() {
		return this.movie;
	}

	/** @param movie the movie id to store */
	public void setMovie(long movie) {
		this.movie = movie;
	}

	/** @return the rate value */
	public long getRate() {
		return this.rate;
	}

	/** @param rate the rate value to store */
	public void setRate(long rate) {
		this.rate = rate;
	}

	/** @return the stored timestamp value */
	public long gettimeStamp() {
		return this.timeStamp;
	}

	/** @param timeStamp the timestamp value to store */
	public void settimeStamp(long timeStamp) {
		this.timeStamp = timeStamp;
	}

	/** @return the user id */
	public long getUid() {
		return this.uid;
	}

	/** @param uid the user id to store */
	public void setUid(long uid) {
		this.uid = uid;
	}

	/**
	 * Serializes this bean as four longs in the fixed order
	 * movie, rate, timeStamp, uid.
	 *
	 * @param out the sink to write to
	 * @throws IOException if writing to {@code out} fails
	 */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeLong(this.movie);
		out.writeLong(this.rate);
		out.writeLong(this.timeStamp);
		out.writeLong(this.uid);
	}

	/**
	 * Restores this bean from a stream produced by {@link #write(DataOutput)}.
	 * The fields are read in exactly the order they were written.
	 *
	 * @param in the source to read from
	 * @throws IOException if reading from {@code in} fails
	 */
	@Override
	public void readFields(DataInput in) throws IOException {
		this.movie = in.readLong();
		this.rate = in.readLong();
		this.timeStamp = in.readLong();
		this.uid = in.readLong();
	}

	/**
	 * Renders only the movie id and the rate; timeStamp and uid are omitted.
	 *
	 * @return text of the form {@code movie:<id>, rate=<rate>}
	 */
	@Override
	public String toString() {
		StringBuilder text = new StringBuilder("movie:");
		text.append(this.movie);
		text.append(", rate=");
		text.append(this.rate);
		return text.toString();
	}

}

然后就是按照正常的流程编写mapper和reducer来实现具体的功能。

package com.mre.moviecount;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.codehaus.jackson.map.ObjectMapper;

/**
 * MapReduce job that sums the {@code rate} field per movie.
 *
 * <p>Each input line is expected to be one JSON object that can be bound to
 * {@link MovieBean}. The mapper emits the bean keyed by its movie id; the
 * reducer adds up the rates of all beans sharing a key and writes one
 * {@link MovieBean} (movie id + summed rate) per movie.
 */
public class RateCount {

	/** Input path used when no path is supplied on the command line. */
	private static final String DEFAULT_INPUT_PATH = "hdfs://192.168.32.30:9000/input/movie.json";

	/** Output path used when no path is supplied on the command line. */
	private static final String DEFAULT_OUTPUT_PATH = "hdfs://192.168.32.30:9000/output6";

	/**
	 * Parses one JSON line into a {@link MovieBean} and emits it keyed by movie id.
	 *
	 * <p>The type parameters are required: with a raw {@code Mapper} the
	 * {@code map} method below would merely overload the framework method
	 * instead of overriding it.
	 */
	public static class MyMapper extends Mapper<Object, Text, LongWritable, MovieBean> {

		/** Created once per mapper instance rather than once per input record. */
		private final ObjectMapper om = new ObjectMapper();

		/** Reused output key; its value is overwritten for every record. */
		private final LongWritable movieId = new LongWritable();

		/**
		 * @param key     input key supplied by the input format (not used)
		 * @param value   one line of input text, expected to hold a JSON object
		 * @param context sink for the (movie id, bean) pair
		 * @throws IOException          if the line cannot be parsed or the write fails
		 * @throws InterruptedException if the framework interrupts the write
		 */
		@Override
		protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
			String line = value.toString();
			// A blank line is not a JSON document; skip it instead of failing the task.
			if (line.trim().isEmpty()) {
				return;
			}
			// Bind the JSON text to a bean.
			MovieBean moviebean = om.readValue(line, MovieBean.class);
			// Emit the whole bean keyed by movie id (emitting only the rate would also work).
			movieId.set(moviebean.getMovie());
			context.write(movieId, moviebean);
		}

	}

	/**
	 * Adds up the rates of every bean that shares a movie id.
	 */
	public static class MyReducer extends Reducer<LongWritable, MovieBean, LongWritable, MovieBean> {

		/**
		 * @param key     the movie id
		 * @param values  all beans emitted by the mappers for this movie id
		 * @param context sink for the (movie id, aggregated bean) pair
		 * @throws IOException          if the write fails
		 * @throws InterruptedException if the framework interrupts the write
		 */
		@Override
		protected void reduce(LongWritable key, Iterable<MovieBean> values, Context context)
				throws IOException, InterruptedException {

			// Accumulate the rate of every record belonging to this movie.
			long count = 0;
			for (MovieBean mb : values) {
				count += mb.getRate();
			}

			// Store the movie id and the summed rate in a bean and write it out.
			// With the default text output the value is rendered through
			// MovieBean.toString().
			MovieBean moviebean = new MovieBean(key.get(), count);
			context.write(key, moviebean);
		}

	}

	/**
	 * Configures and runs the job, then exits with status 0 on success and 1 on failure.
	 *
	 * @param args optional: {@code args[0]} is the input path and {@code args[1]}
	 *             the output path; each falls back to its built-in default when absent
	 * @throws Exception if the job cannot be configured or submitted
	 */
	public static void main(String[] args) throws Exception {
		String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT_PATH;
		String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT_PATH;

		Job job = Job.getInstance();
		job.setJarByClass(RateCount.class);
		job.setMapperClass(MyMapper.class);
		job.setReducerClass(MyReducer.class);

		// Declare the intermediate (map output) types explicitly; they match
		// the type parameters of MyMapper.
		job.setMapOutputKeyClass(LongWritable.class);
		job.setMapOutputValueClass(MovieBean.class);

		job.setOutputKeyClass(LongWritable.class);
		job.setOutputValueClass(MovieBean.class);

		FileInputFormat.addInputPath(job, new Path(inputPath));
		FileOutputFormat.setOutputPath(job, new Path(outputPath));

		// Report the job outcome through the process exit status.
		boolean succeeded = job.waitForCompletion(true);
		System.exit(succeeded ? 0 : 1);
	}
}

最后输出的结果

379	movie:379, rate=1260
380	movie:380, rate=5088
381	movie:381, rate=688
382	movie:382, rate=798
383	movie:383, rate=882
384	movie:384, rate=116
385	movie:385, rate=34
386	movie:386, rate=78
387	movie:387, rate=280
388	movie:388, rate=353
389	movie:389, rate=74
390	movie:390, rate=209
391	movie:391, rate=131
392	movie:392, rate=70
393	movie:393, rate=195
394	movie:394, rate=66
396	movie:396, rate=4



你可能感兴趣的:(Hadoop中利用自身类实现json格式转换为对象。)