MapReduce Programming Example (4): Computing the Median and Standard Deviation

This example solves the following problem: for each hour of the day, compute the median length of the comments newly posted to the site, along with the standard deviation of those lengths. The code is as follows:

package mrdp.ch2;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.Map;

import mrdp.utils.MRDPUtils;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class MedianStdDevDriver {

	public static class SOMedianStdDevMapper extends
			Mapper<Object, Text, IntWritable, IntWritable> {

		private IntWritable outHour = new IntWritable();
		private IntWritable outCommentLength = new IntWritable();

		private final static SimpleDateFormat frmt = new SimpleDateFormat(
				"yyyy-MM-dd'T'HH:mm:ss.SSS");

		@SuppressWarnings("deprecation")
		@Override
		public void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {

			// Parse the input string into a nice map
			Map<String, String> parsed = MRDPUtils.transformXmlToMap(value.toString());

			// Grab the "CreationDate" field,
			// since it is what we are grouping by
			String strDate = parsed.get("CreationDate");

			// Grab the comment to find the length
			String text = parsed.get("Text");

			// .get will return null if the key is not there
			if (strDate == null || text == null) {
				// skip this record
				return;
			}

			try {
				// get the hour this comment was posted in
				Date creationDate = frmt.parse(strDate);
				outHour.set(creationDate.getHours());

				// get the comment length
				outCommentLength.set(text.length());

			// write out the hour and the comment length
				context.write(outHour, outCommentLength);

			} catch (ParseException e) {
				System.err.println(e.getMessage());
				return;
			}
		}
	}

	public static class SOMedianStdDevReducer extends
			Reducer<IntWritable, IntWritable, IntWritable, MedianStdDevTuple> {
		private MedianStdDevTuple result = new MedianStdDevTuple();
		private ArrayList<Float> commentLengths = new ArrayList<Float>();

		@Override
		public void reduce(IntWritable key, Iterable<IntWritable> values,
				Context context) throws IOException, InterruptedException {

			float sum = 0;
			float count = 0;
			commentLengths.clear();
			result.setStdDev(0);
			
			// Iterate through all input values for this key
			for (IntWritable val : values) {
				commentLengths.add((float) val.get());
				sum += val.get();
				++count;
			}

			// sort commentLengths to calculate median
			Collections.sort(commentLengths);

			// if the number of values is even, average the two middle elements
			if (count % 2 == 0) {
				result.setMedian((commentLengths.get((int) count / 2 - 1) + commentLengths
						.get((int) count / 2)) / 2.0f);
			} else {
				// else, set median to middle value
				result.setMedian(commentLengths.get((int) count / 2));
			}

			// calculate standard deviation
			float mean = sum / count;

			float sumOfSquares = 0.0f;
			for (Float f : commentLengths) {
				sumOfSquares += (f - mean) * (f - mean);
			}

			result.setStdDev((float) Math.sqrt(sumOfSquares / (count - 1)));

			context.write(key, result);
		}
	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		String[] otherArgs = new GenericOptionsParser(conf, args)
				.getRemainingArgs();
		if (otherArgs.length != 2) {
			System.err.println("Usage: MedianStdDevDriver  ");
			System.exit(2);
		}
		Job job = new Job(conf,
				"StackOverflow Comment Length Median StdDev By Hour");
		job.setJarByClass(MedianStdDevDriver.class);
		job.setMapperClass(SOMedianStdDevMapper.class);
		job.setReducerClass(SOMedianStdDevReducer.class);
		job.setMapOutputKeyClass(IntWritable.class);
		job.setMapOutputValueClass(IntWritable.class);
		job.setOutputKeyClass(IntWritable.class);
		job.setOutputValueClass(MedianStdDevTuple.class);
		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
		FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

	public static class MedianStdDevTuple implements Writable {
		private float median = 0;
		private float stddev = 0f;

		public float getMedian() {
			return median;
		}

		public void setMedian(float median) {
			this.median = median;
		}

		public float getStdDev() {
			return stddev;
		}

		public void setStdDev(float stddev) {
			this.stddev = stddev;
		}

		@Override
		public void readFields(DataInput in) throws IOException {
			median = in.readFloat();
			stddev = in.readFloat();
		}

		@Override
		public void write(DataOutput out) throws IOException {
			out.writeFloat(median);
			out.writeFloat(stddev);
		}

		@Override
		public String toString() {
			return median + "\t" + stddev;
		}
	}
}
There is a small trick in computing the median here: first store all of the comment lengths in a list, then sort the list, and finally take the element at the middle index. Finding that middle element is handled in two separate cases: when the count is even, the median is the average of the two middle elements; when the count is odd, it is simply the middle element.
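
To make the even/odd handling concrete, below is a minimal standalone sketch of the same median logic in plain Java, outside of MapReduce (the class and method names here are made up for illustration):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

public class MedianSketch {

	// Same logic as the reducer: sort the lengths, then take the middle
	// element, or the average of the two middle elements for an even count.
	static float median(List<Float> values) {
		List<Float> sorted = new ArrayList<Float>(values);
		Collections.sort(sorted);
		int n = sorted.size();
		if (n % 2 == 0) {
			// even count: average the two middle elements
			return (sorted.get(n / 2 - 1) + sorted.get(n / 2)) / 2.0f;
		} else {
			// odd count: the middle element is the median
			return sorted.get(n / 2);
		}
	}

	public static void main(String[] args) {
		// odd count: [1, 2, 3] -> 2.0
		System.out.println(median(Arrays.asList(3f, 1f, 2f)));
		// even count: [1, 2, 3, 4] -> (2 + 3) / 2 = 2.5
		System.out.println(median(Arrays.asList(4f, 1f, 3f, 2f)));
	}
}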

The standard deviation is computed directly from its mathematical definition.
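
Concretely, with mean = sum / count, the reducer computes sqrt(sumOfSquares / (count - 1)), where sumOfSquares is the sum of the squared deviations from the mean; note the division by count - 1 rather than count, which makes this the sample standard deviation. A minimal standalone sketch of the same calculation (again, the class and method names are made up for illustration):

import java.util.Arrays;
import java.util.List;

public class StdDevSketch {

	// Sample standard deviation, matching the reducer: sum of squared
	// deviations from the mean, divided by (n - 1), then square-rooted.
	static float stdDev(List<Float> values) {
		float sum = 0.0f;
		for (float v : values) {
			sum += v;
		}
		float mean = sum / values.size();

		float sumOfSquares = 0.0f;
		for (float v : values) {
			sumOfSquares += (v - mean) * (v - mean);
		}
		return (float) Math.sqrt(sumOfSquares / (values.size() - 1));
	}

	public static void main(String[] args) {
		// mean = 5, squared deviations sum to 32, sqrt(32 / 7) = 2.138...
		System.out.println(stdDev(Arrays.asList(2f, 4f, 4f, 4f, 5f, 5f, 7f, 9f)));
	}
}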

The results are as follows (the columns are the hour of the day, the median comment length, and the standard deviation):

jpan@jpan-Beijing:~/Mywork/mapreducepatterns/testdata$ hadoop fs -cat output3/part-r-00000
0	145.5	158.66512
1	218.0	150.04599
2	139.0	148.84734
3	200.0	158.28148
4	139.5	158.62466
5	122.5	167.31377
6	199.5	160.57263
7	238.0	175.86475
8	253.5	164.08226
9	232.0	167.5952
10	200.0	157.11778
11	179.0	144.3936
12	172.0	148.96738
13	229.0	134.17366
14	207.0	147.26193
15	224.0	147.52689
16	143.0	130.6711
17	177.0	158.20508
18	199.0	159.31636
19	175.5	147.4742
20	169.0	138.74756
21	164.0	141.22824
22	152.5	122.51671
23	145.0	160.20476

