================================input.txt=======================================
youth high no fair no
youth high no excellent no
middle high no fair yes
senior medium no fair yes
senior low yes fair yes
senior low yes excellent no
middle low yes excellent yes
youth medium no fair no
youth low yes fair yes
senior medium yes fair yes
youth medium yes excellent yes
middle medium no excellent yes
middle high yes fair yes
senior medium no excellent no
====================================================================
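The file above is a small whitespace-separated training set: each line holds the attributes age, income, student and credit_rating, followed by the class label (yes/no). The job below counts, for every (attribute, class) pair, how often each attribute value occurs; these are exactly the counts needed for the conditional probabilities P(attribute = value | class) of a Naive Bayes classifier. For example, 5 of the 14 records have class no, and 3 of those have age = youth, so the reduce call for the key group "age\tno" prints p(age=youth|class=no)=3/5 to stdout and writes tab-separated output lines like:

age  no  senior  5  2
age  no  youth   5  3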
package com.mahout.bayes;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Partitioner;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import com.mahout.test.FirstGroupingComparator;
import com.mahout.test.StringStringPairAsce;
/**
 * Naive Bayes implementation: a MapReduce job that computes, for every
 * (attribute, class) pair, how often each attribute value occurs.
 * @author clxin
 */
public class Bayes extends Configured implements Tool {
    /**
     * Turns each training record (x1, x2, ..., xn, C) into one pair per attribute:
     *   (A1 \t C, x1)
     *   (A2 \t C, x2)
     *   ...
     * where Ai is the attribute name and C is the class label.
     * @author clxin
     */
    public static class BayesMapper extends MapReduceBase implements
            Mapper<LongWritable, Text, StringStringPairAsce, Text> {

        // Attribute names, in the same order as the input columns.
        private static final String[] ATTRIBUTES =
                {"age", "income", "student", "credit_rating"};

        private StringStringPairAsce tKey = new StringStringPairAsce();
        private Text tValue = new Text();

        public void map(LongWritable key, Text value,
                OutputCollector<StringStringPairAsce, Text> output, Reporter reporter)
                throws IOException {
            // The sample records are whitespace-separated, so split on any run of whitespace.
            String[] fields = value.toString().split("\\s+");
            String label = fields[fields.length - 1];
            // Emit one (attribute "\t" class, attribute value) pair per attribute.
            for (int i = 0; i < ATTRIBUTES.length; i++) {
                tKey.set(ATTRIBUTES[i] + "\t" + label, fields[i]);
                tValue.set(fields[i]);
                output.collect(tKey, tValue);
            }
        }
    }
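    // For example, the first training record "youth high no fair no" (class = no)
    // is emitted as four (composite key, value) pairs:
    //   (("age\tno",           "youth"), "youth")
    //   (("income\tno",        "high"),  "high")
    //   (("student\tno",       "no"),    "no")
    //   (("credit_rating\tno", "fair"),  "fair")
    // The partitioner and the grouping comparator configured in run() look only at
    // the first part of the composite key, so every value for one (attribute, class)
    // pair reaches a single reduce() call; StringStringPairAsce is assumed to sort
    // by its first and then its second string, so those values arrive ordered by
    // attribute value.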
    public static class BayesReducer extends MapReduceBase implements
            Reducer<StringStringPairAsce, Text, Text, Text> {

        private Text tKey = new Text();
        private Text tValue = new Text();

        @Override
        public void reduce(StringStringPairAsce key, Iterator<Text> values,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            // All values in this call share the same (attribute, class) pair and,
            // given the secondary sort on the pair key, arrive ordered by attribute
            // value, so equal values are adjacent and can be counted in one pass.
            int classCount = 1;   // records carrying this class (for this attribute)
            int valueCount = 1;   // records carrying the current attribute value
            Map<String, Integer> valueCounts = new HashMap<String, Integer>();
            String currentValue = values.next().toString();
            while (values.hasNext()) {
                classCount++;
                String nextValue = values.next().toString();
                if (!currentValue.equals(nextValue)) {
                    valueCounts.put(currentValue, valueCount);
                    currentValue = nextValue;
                    valueCount = 1;
                } else {
                    valueCount++;
                }
            }
            valueCounts.put(currentValue, valueCount);

            // Emit "attribute \t class \t value" -> "classCount \t valueCount";
            // P(attribute = value | class) = valueCount / classCount.
            String[] attrAndClass = key.getFirst().split("\t");
            Set<Entry<String, Integer>> entries = valueCounts.entrySet();
            for (Entry<String, Integer> entry : entries) {
                tKey.set(key.getFirst() + "\t" + entry.getKey());
                tValue.set(classCount + "\t" + entry.getValue());
                System.out.println("p(" + attrAndClass[0] + "=" + entry.getKey()
                        + "|class=" + attrAndClass[1] + ")="
                        + entry.getValue() + "/" + classCount);
                output.collect(tKey, tValue);
            }
        }
    }
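    // The counts emitted above are the sufficient statistics of a Naive Bayes model:
    // a new record (x1, ..., xn) would later be assigned to the class C that maximizes
    //   P(C) * product over i of P(Ai = xi | C),
    // where each P(Ai = xi | C) is valueCount / classCount from a reducer output line.
    // That classification step is not part of this job.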
    public static class FirstPartitioner implements
            Partitioner<StringStringPairAsce, Text> {

        @Override
        public int getPartition(StringStringPairAsce key, Text value,
                int numPartitions) {
            // Partition on the first part of the key only, so every record for a
            // given (attribute, class) pair goes to the same reducer; the mask
            // clears the sign bit before the modulo.
            return (key.getFirst().hashCode() & Integer.MAX_VALUE) % numPartitions;
        }

        @Override
        public void configure(JobConf job) {
        }
    }
    @Override
    public int run(String[] args) throws Exception {
        JobConf conf = new JobConf(getConf(), Bayes.class);
        conf.setJobName("Bayes");
        //conf.setNumMapTasks(200);
        // Map output key/value types
        conf.setMapOutputKeyClass(StringStringPairAsce.class);
        conf.setMapOutputValueClass(Text.class);
        // Reduce (final) output key/value types
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);
        // Mapper and Reducer classes
        conf.setMapperClass(BayesMapper.class);
        conf.setReducerClass(BayesReducer.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        // Partition on the first part of the composite key and group reduce input
        // by that first part as well (secondary sort on the attribute value).
        conf.setPartitionerClass(FirstPartitioner.class);
        conf.setOutputValueGroupingComparator(FirstGroupingComparator.class);
        // Input and output paths
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));
        JobClient.runJob(conf);
        return 0;
    }
    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new Bayes(), args);
        System.exit(exitCode);
    }
}
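The helper types FirstGroupingComparator and StringStringPairAsce are imported from com.mahout.test and are not listed here. As a minimal sketch only, assuming StringStringPairAsce is a WritableComparable whose getFirst() returns the "attribute \t class" string (as the code above uses it), a grouping comparator that compares keys by that first part alone could look like the following; the names and details are illustrative, not the actual com.mahout.test implementation.

package com.mahout.test;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Sketch: group reduce input by the first part of the composite key, so that one
// reduce() call sees every value belonging to a given (attribute, class) pair.
public class FirstGroupingComparator extends WritableComparator {

    protected FirstGroupingComparator() {
        // true -> let WritableComparator instantiate keys when deserializing
        super(StringStringPairAsce.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        String first1 = ((StringStringPairAsce) a).getFirst();
        String first2 = ((StringStringPairAsce) b).getFirst();
        return first1.compareTo(first2);
    }
}

With these classes on the classpath and packaged into a jar, the job is launched the usual ToolRunner way, for example: hadoop jar <your-jar> com.mahout.bayes.Bayes <input dir> <output dir>, where the two arguments become args[0] and args[1] in run().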