package com.hdfsclient;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.File;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.GenericOptionsParser;
public class MedianStdDevTuple implements Writable{
private float median = 0;
private float stdDev=0;
public float getMedian() {
return median;
}
public void setMedian(float median) {
this.median = median;
}
public float getStdDev() {
return stdDev;
}
public void setStdDev(float stdDev) {
this.stdDev = stdDev;
}
public void readFields(DataInput in) throws IOException{
median=in.readFloat();
stdDev=in.readFloat();
}
public void write(DataOutput out) throws IOException {
out.writeFloat(median);
out.writeFloat(stdDev);
}
@Override
public String toString() { // override toString to define the output format: median TAB stdDev
return median+"\t"+stdDev;
}
public static Map<String, String> transformXmlToMap(String xml) {
Map<String, String> map = new HashMap<String, String>();
try {
String[] tokens = xml.trim().substring(5, xml.trim().length() - 3).split("\"");
for (int i = 0; i < tokens.length - 1; i += 2) {
String key = tokens[i].trim();
String val = tokens[i + 1];
map.put(key.substring(0, key.length() - 1), val);
}
} catch (StringIndexOutOfBoundsException e) {
System.err.println(xml);
}
return map;
}
public static class MedianStdDevMapper extends Mapper <Object, Text, IntWritable, IntWritable> {
private IntWritable outHour = new IntWritable();
//private MedianStdDevTuple outCountAverage = new MedianStdDevTuple();
private IntWritable outCommentLength = new IntWritable();
private final static SimpleDateFormat frmt = new SimpleDateFormat ("yyyy-MM-dd'T'HH:mm:ss.SSS");
public void map (Object key, Text value, Context context)
throws IOException, InterruptedException {
Map <String, String> parsed = transformXmlToMap (value.toString());
String strDate = parsed.get("CreationDate");
String text = parsed.get("Text");
Date creationDate=null;
try {
creationDate = frmt.parse(strDate);
} catch (ParseException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
outHour.set(creationDate.getHours());///时间戳
outCommentLength.set(text.length());
context.write(outHour, outCommentLength);
}
}
public static class MedianStdDevReducer
extends Reducer <IntWritable, IntWritable,IntWritable, MedianStdDevTuple> {
private MedianStdDevTuple result = new MedianStdDevTuple();
private ArrayList<Float> commentLengths = new ArrayList<Float>();
public void reduce (IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
float sum = 0;
float count = 0;
commentLengths.clear();
result.setStdDev(0);
for (IntWritable val : values) {
commentLengths.add((float)val.get());
sum += val.get();
count++;
}
Collections.sort(commentLengths);
if(count%2==0){
result.setMedian((commentLengths.get((int)count/2-1)+commentLengths.get((int)count/2))/2.0f);
}else{
result.setMedian(commentLengths.get((int)count/2));
}
float mean = sum/count;
float sumOfSquares = 0.0f;
for(Float f : commentLengths){
sumOfSquares+=(f-mean)*(f-mean);
}
result.setStdDev((float)Math.sqrt(sumOfSquares/(count-1)));
context.write(key,result);
}
}
/*============================================================================================================*/
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
//FileUtil.fullyDelete(new File("output7"));
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: mergesort " );
System.exit(2);
}
Job job = Job.getInstance(conf, "MedianStdDev");
job.setJarByClass(MedianStdDevTuple.class);
job.setMapperClass(MedianStdDevMapper.class);
job.setReducerClass(MedianStdDevReducer.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(MedianStdDevTuple.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
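To make the reducer's arithmetic concrete, here is a minimal stand-alone sketch (not part of the job) that computes the median and sample standard deviation the same way the reducer does; the comment lengths are hypothetical sample values.
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
public class MedianStdDevSketch {
    public static void main(String[] args) {
        // hypothetical comment lengths for one hour-of-day bucket
        List<Float> lengths = new ArrayList<Float>(Arrays.asList(12f, 40f, 25f, 33f));
        Collections.sort(lengths);
        int count = lengths.size();
        // median: average of the two middle values for an even count, the middle value otherwise
        float median = (count % 2 == 0)
                ? (lengths.get(count / 2 - 1) + lengths.get(count / 2)) / 2.0f
                : lengths.get(count / 2);
        float sum = 0;
        for (float f : lengths) sum += f;
        float mean = sum / count;
        float sumOfSquares = 0;
        for (float f : lengths) sumOfSquares += (f - mean) * (f - mean);
        // sample standard deviation, dividing by (count - 1) exactly as the reducer does
        float stdDev = (float) Math.sqrt(sumOfSquares / (count - 1));
        System.out.println(median + "\t" + stdDev); // 29.0 and roughly 12.01 for the sample above
    }
}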
Map implementation
package four;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
public class JoinMapper extends Mapper<LongWritable, Text, IntWritable, Text>{
// file-name tags used to mark which table a record comes from
private static final String LEFT_FILENAME = "product_info.txt";
private static final String RIGHT_FILENAME = "product_quantity.txt";
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// get the file path from the input split
// FileSplit is the concrete subclass of InputSplit that records the split's file information
String filePath = ((FileSplit)context.getInputSplit()).getPath().toString();
// file tag
String fileFlag = null;
// output key (product id)
String outKey = null;
// output value (selected fields from product info or product quantity)
String outValue = null;
// fields of the current line
String[] infos = value.toString().split(",");
// decide which file the line comes from
if (filePath.contains(LEFT_FILENAME)) {
if(infos.length == 22){
String temp = null;
fileFlag = LEFT_FILENAME;
outKey = infos[0];
temp = infos[1]+" "+infos[2]+" "+infos[3];
outValue = temp;
}
}
else if (filePath.contains(RIGHT_FILENAME)) {
if(infos.length == 9 ){
String temp = null;
temp = infos[1]+" "+infos[2]+" "+infos[3];
fileFlag = RIGHT_FILENAME;
outKey = infos[0];
outValue = temp;
}
}
// skip lines that matched neither file or had an unexpected field count
if (outKey == null || outValue == null) {
return;
}
// emit the key/value pair, tagging the value with its source file name
context.write(new IntWritable(Integer.parseInt(outKey)), new Text(outValue + "\t" + fileFlag));
}
}
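For example (hypothetical values): given a product_info.txt line 1000,phone,black,4.5,... with 22 comma-separated fields and a product_quantity.txt line 1000,3,5,7,... with 9 fields, the mapper emits the pairs (1000, "phone black 4.5\tproduct_info.txt") and (1000, "3 5 7\tproduct_quantity.txt"), so the reducer receives both sides of the join grouped under the same product id.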
Reduce implementation
package four;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
public class JoinReducer extends Reducer<IntWritable, Text, Text, Text>{
// file-name tags used to tell which table each value came from
private static final String LEFT_FILENAME = "product_info.txt";
private static final String RIGHT_FILENAME = "product_quantity.txt";
private static int num = 0;
@Override
protected void reduce(IntWritable key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
// count reduce invocations and print the key (debug output)
{
num++;
System.out.println(num + " " +key);
}
// left-table value (product info fields); the variable names are left over from a student/course example
String studentName = null;
// right-table values (product quantity fields)
List<String> studentClassNames = new ArrayList<String>();
// sort the values into the two sides according to the file tag
for (Text value : values) {
String[] infos = value.toString().split("\t");
if(LEFT_FILENAME.equals(infos[1])) {
studentName = infos[0];
}
else if (RIGHT_FILENAME.equals(infos[1])){
studentClassNames.add(infos[0]);
}
}
// drop keys that cannot form an inner join (missing on one side)
if (studentName == null || studentClassNames.size() == 0) {
return;
}
// emit one joined (info, quantity) pair per right-table value
for (int i = 0; i < studentClassNames.size(); i++) {
context.write(new Text(studentName), new Text(studentClassNames.get(i)));
}
}
}
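Continuing the hypothetical example from the mapper: for key 1000 the reducer splits each value on the tab, keeps "phone black 4.5" as the left-table part and "3 5 7" as a right-table entry, and writes the joined line "phone black 4.5\t3 5 7"; keys missing either side produce no output.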
Partitioner implementation:
package four;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
public class MyPartitioneryy extends Partitioner<IntWritable, Text> {
@Override
public int getPartition(IntWritable key, Text value, int numPartitions) {
int result = 0;
/*
 * Note: if the key were Text, the comparison would have to use key.toString().equals(...);
 * at first I did not do that, so every record went to one partition and ended up in a single reduce output file.
 */
if (key.get() <= 1000) {
result = 0;
}
else if (key.get() <= 2000) {
result = 1;
}
else if (key.get() <= 3000) {
result = 2;
} else {
result = 3;
}
return result;
}
}
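A quick sanity check of the range partitioner, as a minimal sketch (the check class and sample keys are hypothetical; it assumes the same four package, and the driver below sets exactly four reduce tasks):
package four;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
public class MyPartitioneryyCheck {
    public static void main(String[] args) {
        MyPartitioneryy p = new MyPartitioneryy();
        Text anyValue = new Text("ignored"); // the partitioner only inspects the key
        // keys <= 1000 go to partition 0, <= 2000 to 1, <= 3000 to 2, everything larger to 3
        System.out.println(p.getPartition(new IntWritable(500), anyValue, 4));  // 0
        System.out.println(p.getPartition(new IntWritable(1500), anyValue, 4)); // 1
        System.out.println(p.getPartition(new IntWritable(2500), anyValue, 4)); // 2
        System.out.println(p.getPartition(new IntWritable(9999), anyValue, 4)); // 3
    }
}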
Driver
package four;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
/*
* Driver class for the reduce-side join job
*/
public class MR_Join {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: MyPartitioner " );
System.exit(2);
}
conf.set("mapred.jar","mp1.jar");
Job job = Job.getInstance(conf, "MR_Join");
job.setNumReduceTasks(4);
job.setJarByClass(MR_Join.class);
job.setMapperClass(JoinMapper.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(Text.class);
job.setPartitionerClass(MyPartitioneryy.class);
job.setReducerClass(JoinReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
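Note that the driver adds only one input path, so it assumes product_info.txt and product_quantity.txt both live in the directory passed as the first argument; JoinMapper then tells them apart by file name.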
package binning;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class shijian5 {
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
Job job = new Job(conf, "binningByTag");
job.setJarByClass(shijian5.class);
job.setMapperClass(BinningByTagsMapper.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
MultipleOutputs.addNamedOutput(job, "bins", TextOutputFormat.class, Text.class, NullWritable.class);
MultipleOutputs.setCountersEnabled(job, true);
job.setNumReduceTasks(0);
job.waitForCompletion(true);
}
public static class BinningByTagsMapper extends Mapper<Object, Text, Text, NullWritable> {
private MultipleOutputs<Text, NullWritable> mos = null;
protected void setup(Context context) {
// Create a new MultipleOutputs using the context object
mos = new MultipleOutputs<Text, NullWritable>(context);
}
public void map(Object text, Text value, Context context) throws IOException, InterruptedException {
String datas[] = value.toString().split("\t");
// skip malformed lines that do not have a tag column
if (datas.length < 3) {
return;
}
String tag = datas[2];
if (tag.equalsIgnoreCase("hadoop")) {
mos.write("bins", value, NullWritable.get(), "hadoop-tag");
}
else if (tag.equalsIgnoreCase("hive")) {
mos.write("bins", value, NullWritable.get(), "hive-tag");
}
else if (tag.equalsIgnoreCase("pig")) {
mos.write("bins", value, NullWritable.get(), "pig-tag");
}
else if (tag.equalsIgnoreCase("hbase")) {
mos.write("bins", value, NullWritable.get(), "hbase-tag");
}
else{
mos.write("bins", value, NullWritable.get(), "other");
}
}
protected void cleanup(Context context) throws IOException, InterruptedException {
// Close multiple outputs otherwise you will not get any values
mos.close();
}
}
}
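Since the binning job is map-only and writes every record through MultipleOutputs, the default TextOutputFormat still produces empty part-m-* files alongside the named "bins" outputs. An optional tweak (not part of the original listing) is to route the default output through LazyOutputFormat in the driver, which only creates a part file when something is actually written to it; add the import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat and, after the output path is set:
// suppress the empty default output; MultipleOutputs still writes the named "bins" outputs
LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);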
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
public class InvertedIndexMapper extends Mapper<LongWritable, Text, Text, Text> {
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// default RecordReader: LineRecordReader; key: line offset; value: line string
FileSplit fileSplit = (FileSplit) context.getInputSplit();
String fileName = fileSplit.getPath().getName();
Text word = new Text();
Text fileName_lineOffset = new Text(fileName + "@" + key.toString());
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, fileName_lineOffset);
}
}
}
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class InvertedIndexReducer extends Reducer<Text, Text, Text, Text> {
@Override
protected void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
// concatenate all "fileName@offset" postings for this word, separated by ';'
Iterator<Text> it = values.iterator();
StringBuilder all = new StringBuilder();
if (it.hasNext()) {
all.append(it.next().toString());
}
while (it.hasNext()) {
all.append(";");
all.append(it.next().toString());
}
context.write(key, new Text(all.toString()));
}
}
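The inverted index mapper and reducer above come without a driver in this listing; a minimal driver sketch (class name and the use of the default TextInputFormat are assumptions) could look like this:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class InvertedIndexDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "InvertedIndex");
        job.setJarByClass(InvertedIndexDriver.class);
        job.setMapperClass(InvertedIndexMapper.class);
        job.setReducerClass(InvertedIndexReducer.class);
        // both map and reduce emit Text/Text pairs
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}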
5. Reduce-side join via Cartesian product
package com.hadoop.reducejoin.test;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
/*
* Reduce-side join of two large tables implemented via a Cartesian product.
* Applicable when the join key is not unique in either table (one-to-many or many-to-many relationships).
*/
public class ReduceJoinByCartesianProduct {
/**
Tags each key/value pair with the file (table) it came from so records from different sources can be told apart.
The join field is used as the key; the remaining fields plus the tag form the value.
*/
public static class ReduceJoinByCartesianProductMapper extends Mapper<Object,Text,Text,Text>{
private Text joinKey=new Text();
private Text combineValue=new Text();
@Override
protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
String pathName=((FileSplit)context.getInputSplit()).getPath().toString();
// data from records.txt gets a "records.txt" tag
if(pathName.endsWith("records.txt")){
String line = value.toString();
String[] valueItems = line.split("\\s+");
// filter out malformed lines
if(valueItems.length!=3){
return;
}
joinKey.set(valueItems[0]);
combineValue.set("records.txt" + valueItems[1] + "\t" + valueItems[2]);
}else if(pathName.endsWith("station.txt")){
// data from station.txt gets a "station.txt" tag
String line = value.toString();
String[] valueItems = line.split("\\s+");
// filter out malformed lines
if(valueItems.length!=2){
return;
}
joinKey.set(valueItems[0]);
combineValue.set("station.txt" + valueItems[1]);
}
context.write(joinKey,combineValue);
}
}
/*
* The reducer builds the Cartesian product of the two sides for each key.
*/
public static class ReduceJoinByCartesianProductReducer extends Reducer<Text,Text,Text,Text>{
private List<String> leftTable=new ArrayList<String>();
private List<String> rightTable=new ArrayList<String>();
private Text result=new Text();
@Override
protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
// the lists must be cleared for every key
leftTable.clear();
rightTable.clear();
// values with the same key are grouped together; separate them by source table, then take the Cartesian product
for(Text value : values){
String val=value.toString();
if(val.startsWith("station.txt")){
leftTable.add(val.replaceFirst("station.txt",""));
}else if(val.startsWith("records.txt")){
rightTable.add(val.replaceFirst("records.txt",""));
}
}
// Cartesian product
for(String leftPart:leftTable){
for(String rightPart:rightTable){
result.set(leftPart+"\t"+rightPart);
context.write(key, result);
}
}
}
}
public static void main(String[] arg0) throws Exception{
Configuration conf = new Configuration();
String[] args = {"hdfs://sparks:9000/middle/reduceJoin/records.txt"
,"hdfs://sparks:9000/middle/reduceJoin/station.txt"
,"hdfs://sparks:9000/middle/reduceJoin/JoinByCartesian-out"
};
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length < 2) {
System.err.println("Usage: reducejoin [...] " );
System.exit(2);
}
// output path
Path mypath = new Path(otherArgs[otherArgs.length - 1]);
FileSystem hdfs = mypath.getFileSystem(conf); // delete the output directory if it already exists
if (hdfs.isDirectory(mypath)) {
hdfs.delete(mypath, true);
}
Job job = Job.getInstance(conf, "ReduceJoinByCartesianProduct");
job.setJarByClass(ReduceJoinByCartesianProduct.class);
job.setMapperClass(ReduceJoinByCartesianProductMapper.class);
job.setReducerClass(ReduceJoinByCartesianProductReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// add the input paths
for (int i = 0; i < otherArgs.length - 1; ++i) {
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
// set the output path
FileOutputFormat.setOutputPath(job,
new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
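As a worked example (station id and field values are hypothetical): if station.txt contains the line "011990 Beijing" and records.txt contains "011990 1950 0" and "011990 1950 22", the mapper emits all three under key 011990 with their file tags; the reducer then puts "Beijing" in the left list, "1950\t0" and "1950\t22" in the right list, and writes the joined lines "011990\tBeijing\t1950\t0" and "011990\tBeijing\t1950\t22".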