2. Business requirement: build TF-IDF documents from the given corpus, and compute the TF-IDF value of the keyword 豆浆 (soy milk) together with the weibo posts in which it appears.
3. Hadoop framework design approach
The computation is split into three chained MapReduce jobs: job weibo1 counts each word's term frequency per weibo plus the total number of weibos, job weibo2 counts each word's document frequency (DF), and job weibo3 combines the two to produce the final TF-IDF scores.
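The score computed at the end follows the standard TF-IDF form: tfidf = tf * log(N / df), where tf is how often a word occurs in one weibo, N is the total number of weibos, and df is the number of weibos containing the word. A minimal standalone sketch of that scoring (the class name and the sample numbers are only for illustration, they are not part of the project):

// Minimal sketch of the scoring later used in LastMapper:
// tfidf = tf * log(N / df). The numbers in main() are made up.
public class TfIdfSketch {
    static double score(int tf, int totalWeibos, int df) {
        return tf * Math.log((double) totalWeibos / df); // cast avoids integer division
    }

    public static void main(String[] args) {
        // a word seen twice in one weibo, appearing in 50 of 1000 weibos
        System.out.println(score(2, 1000, 50));
    }
}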
public class FirstJob {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.set("mapreduce.app-submission.cross-platform", "true");
        conf.set("mapreduce.framework.name", "local");
        try {
            FileSystem fs = FileSystem.get(conf);
            Job job = Job.getInstance(conf);
            job.setJarByClass(FirstJob.class);
            job.setJobName("weibo1");
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            // 4 reducers; the custom partitioner pins the global "count" key to the last one
            job.setNumReduceTasks(4);
            job.setPartitionerClass(FirstPartition.class);
            job.setMapperClass(FirstMapper.class);
            job.setCombinerClass(FirstReduce.class);
            job.setReducerClass(FirstReduce.class);
            FileInputFormat.addInputPath(job, new Path("/data/tfidf/input/"));
            // remove any previous output so the job can be rerun
            Path path = new Path("/data/tfidf/output/weibo1");
            if (fs.exists(path)) {
                fs.delete(path, true);
            }
            FileOutputFormat.setOutputPath(job, path);
            boolean f = job.waitForCompletion(true);
            if (f) {
                System.out.println("job weibo1 finished successfully");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
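Before looking at the mapper, it may help to see the shape of the data that job weibo1 produces. The sketch below pushes one input line through the map and reduce logic outside Hadoop; whitespace splitting stands in for the IK Analyzer segmentation, and the spaces in the sample text are inserted only to make that possible, so real tokens will differ.

import java.util.LinkedHashMap;
import java.util.Map;

// Illustration only: simulates FirstMapper + FirstReduce for a single input line.
public class FirstJobFlowDemo {
    public static void main(String[] args) {
        String line = "3823890210294392\t今天 我 约了 豆浆 油条";
        String[] v = line.split("\t");
        Map<String, Integer> counts = new LinkedHashMap<>();
        for (String w : v[1].split("\\s+")) {
            counts.merge(w + "_" + v[0], 1, Integer::sum); // word_id -> tf, as FirstMapper emits
        }
        counts.merge("count", 1, Integer::sum); // one "count" record per weibo
        counts.forEach((k, c) -> System.out.println(k + "\t" + c)); // what FirstReduce would write
    }
}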
public class FirstMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // sample input line: 3823890210294392	今天我约了豆浆,油条
        String[] v = value.toString().trim().split("\t");
        if (v.length >= 2) {
            String id = v[0].trim();      // weibo id
            String content = v[1].trim(); // weibo text
            // tokenize the text with the IK Analyzer (smart mode)
            StringReader sr = new StringReader(content);
            IKSegmenter ikSegmenter = new IKSegmenter(sr, true);
            Lexeme word = null;
            while ((word = ikSegmenter.next()) != null) {
                String w = word.getLexemeText();
                // emit one count per word occurrence, keyed by word_id
                context.write(new Text(w + "_" + id), new IntWritable(1));
                // e.g. 今天_3823890210294392	1
            }
            // emit one "count" record per weibo so the reducer can total the corpus size
            context.write(new Text("count"), new IntWritable(1));
        } else {
            System.out.println(value.toString() + "-------------");
        }
    }
}
/**
 * Custom partitioner for the first MR job.
 * @author root
 */
public class FirstPartition extends HashPartitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int reduceCount) {
        // route the global "count" key to the last partition (part-r-00003);
        // hash every other word_id key over the remaining reduceCount-1 partitions
        if (key.equals(new Text("count")))
            return 3;
        else
            return super.getPartition(key, value, reduceCount - 1);
    }
}
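A quick illustration of how this partitioner routes keys across the 4 reducers of job weibo1. Note that the demo hashes plain String objects while the real job hashes Hadoop Text keys, whose hashCode differs, so the exact bucket of a non-count key may not match a real run; the point is that "count" is pinned to partition 3 while every other key spreads over partitions 0 to 2, which leaves part-r-00003 holding only the corpus total.

// Illustration only: String.hashCode() stands in for Text.hashCode().
public class PartitionRoutingDemo {
    public static void main(String[] args) {
        int reduceCount = 4;
        String[] keys = { "count", "豆浆_3823890201582094", "今天_3823890210294392" };
        for (String k : keys) {
            int p = k.equals("count")
                    ? 3
                    : (k.hashCode() & Integer.MAX_VALUE) % (reduceCount - 1);
            System.out.println(k + " -> partition " + p);
        }
    }
}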
public class FirstReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    protected void reduce(Text key, Iterable<IntWritable> iterable,
            Context context) throws IOException, InterruptedException {
        // sum the occurrences of one word within one weibo (or of the global "count" key)
        int sum = 0;
        for (IntWritable i : iterable) {
            sum = sum + i.get();
        }
        if (key.equals(new Text("count"))) {
            System.out.println(key.toString() + "___________" + sum);
        }
        context.write(key, new IntWritable(sum));
    }
}
public class TwoJob {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.set("mapreduce.app-submission.cross-platform", "true");
        conf.set("mapreduce.framework.name", "local");
        try {
            Job job = Job.getInstance(conf);
            job.setJarByClass(TwoJob.class);
            job.setJobName("weibo2");
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            job.setMapperClass(TwoMapper.class);
            job.setCombinerClass(TwoReduce.class);
            job.setReducerClass(TwoReduce.class);
            // the input of this job is the HDFS output directory of job weibo1
            FileInputFormat.addInputPath(job, new Path("/data/tfidf/output/weibo1"));
            // default single reducer, so the DF table ends up in one file: part-r-00000
            FileOutputFormat.setOutputPath(job, new Path("/data/tfidf/output/weibo2"));
            boolean f = job.waitForCompletion(true);
            if (f) {
                System.out.println("job weibo2 finished successfully");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
// Compute the DF: the number of weibos in which each word appears.
public class TwoMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // get the input split of the current map task to see which file the record comes from
        FileSplit fs = (FileSplit) context.getInputSplit();
        // skip part-r-00003, which holds only the total weibo count, not word counts
        if (!fs.getPath().getName().contains("part-r-00003")) {
            // sample input line: 豆浆_3823890201582094	3
            String[] v = value.toString().trim().split("\t");
            if (v.length >= 2) {
                String[] ss = v[0].split("_");
                if (ss.length >= 2) {
                    String w = ss[0];
                    // emit 1 per (word, weibo) pair; the reducer sums these into the DF
                    context.write(new Text(w), new IntWritable(1));
                }
            } else {
                System.out.println(value.toString() + "-------------");
            }
        }
    }
}
public class TwoReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    protected void reduce(Text key, Iterable<IntWritable> arg1, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable i : arg1) {
            sum = sum + i.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
public class LastJob {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // conf.set("mapred.jar", "C:\\Users\\root\\Desktop\\tfidf.jar");
        conf.set("mapreduce.job.jar", "C:\\Users\\root\\Desktop\\tfidf.jar");
        conf.set("mapreduce.app-submission.cross-platform", "true");
        try {
            FileSystem fs = FileSystem.get(conf);
            Job job = Job.getInstance(conf);
            job.setJarByClass(LastJob.class);
            job.setJobName("weibo3");
            job.setJar("C:\\Users\\root\\Desktop\\tfidf.jar");
            // 2.5
            // put the total weibo count into the distributed cache
            job.addCacheFile(new Path("/data/tfidf/output/weibo1/part-r-00003").toUri());
            // put the DF table into the distributed cache
            job.addCacheFile(new Path("/data/tfidf/output/weibo2/part-r-00000").toUri());
            // output key/value types of the map task
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            job.setMapperClass(LastMapper.class);
            job.setReducerClass(LastReduce.class);
            // the input of this job is the per-weibo TF output of job weibo1 on HDFS
            FileInputFormat.addInputPath(job, new Path("/data/tfidf/output/weibo1"));
            Path outpath = new Path("/data/tfidf/output/weibo3");
            if (fs.exists(outpath)) {
                fs.delete(outpath, true);
            }
            FileOutputFormat.setOutputPath(job, outpath);
            boolean f = job.waitForCompletion(true);
            if (f) {
                System.out.println("job weibo3 finished successfully");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
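LastMapper below opens the two cached files by bare file name with FileReader, which relies on the distributed cache placing (or symlinking) them into the task's working directory; under mapreduce.framework.name=local this effectively reads local files. The commented-out FileSystem lines in its setup() hint at the alternative of opening the files straight from HDFS. A hypothetical helper along those lines (not part of the original code) could look like this:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical helper: reads the single "count<TAB>total" record of part-r-00003
// directly from HDFS instead of relying on the distributed-cache copy.
public class CacheFileReader {
    public static int readTotalCount(Configuration conf, String file) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(file))))) {
            String line = br.readLine(); // expected form: count<TAB>total
            String[] parts = line.split("\t");
            return Integer.parseInt(parts[1].trim());
        }
    }
}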
public class LastMapper extends Mapper<LongWritable, Text, Text, Text> {
    // total number of weibos
    public static Map<String, Integer> cmap = null;
    // DF of each word
    public static Map<String, Integer> df = null;

    // runs once per map task, before map() is called
    protected void setup(Context context) throws IOException,
            InterruptedException {
        System.out.println("******************");
        if (cmap == null || cmap.size() == 0 || df == null || df.size() == 0) {
            URI[] ss = context.getCacheFiles();
            if (ss != null) {
                for (int i = 0; i < ss.length; i++) {
                    URI uri = ss[i];
                    if (uri.getPath().endsWith("part-r-00003")) { // total weibo count
                        Path path = new Path(uri.getPath());
                        // FileSystem fs = FileSystem.get(context.getConfiguration());
                        // fs.open(path);
                        // the cached file is available in the task working directory by name
                        BufferedReader br = new BufferedReader(new FileReader(path.getName()));
                        String line = br.readLine();
                        if (line.startsWith("count")) {
                            String[] ls = line.split("\t");
                            cmap = new HashMap<String, Integer>();
                            cmap.put(ls[0], Integer.parseInt(ls[1].trim()));
                        }
                        br.close();
                    } else if (uri.getPath().endsWith("part-r-00000")) { // DF of each word
                        df = new HashMap<String, Integer>();
                        Path path = new Path(uri.getPath());
                        BufferedReader br = new BufferedReader(new FileReader(path.getName()));
                        String line;
                        while ((line = br.readLine()) != null) {
                            String[] ls = line.split("\t");
                            df.put(ls[0], Integer.parseInt(ls[1].trim()));
                        }
                        br.close();
                    }
                }
            }
        }
    }

    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        FileSplit fs = (FileSplit) context.getInputSplit();
        // System.out.println("--------------------");
        // skip part-r-00003, which holds only the total count record
        if (!fs.getPath().getName().contains("part-r-00003")) {
            // sample input line: 豆浆_3823930429533207	2
            String[] v = value.toString().trim().split("\t");
            if (v.length >= 2) {
                int tf = Integer.parseInt(v[1].trim()); // term frequency of the word in this weibo
                String[] ss = v[0].split("_");
                if (ss.length >= 2) {
                    String w = ss[0];
                    String id = ss[1];
                    // tf-idf = tf * log(N / df); cast to double to avoid integer division
                    double s = tf * Math.log((double) cmap.get("count") / df.get(w));
                    NumberFormat nf = NumberFormat.getInstance();
                    nf.setMaximumFractionDigits(5);
                    context.write(new Text(id), new Text(w + ":" + nf.format(s)));
                }
            } else {
                System.out.println(value.toString() + "-------------");
            }
        }
    }
}
public class LastReduce extends Reducer<Text, Text, Text, Text> {
    protected void reduce(Text key, Iterable<Text> iterable, Context context)
            throws IOException, InterruptedException {
        // concatenate all word:tfidf pairs of one weibo into a single output line
        StringBuffer sb = new StringBuffer();
        for (Text i : iterable) {
            sb.append(i.toString() + "\t");
        }
        context.write(key, new Text(sb.toString()));
    }
}
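To answer the original requirement about the keyword 豆浆, the final weibo3 output can simply be scanned for that word: each output line is a weibo id followed by one or more word:tfidf pairs, all separated by tabs. A minimal sketch of such a scan, assuming the output has been copied to a local file named part-r-00000 (that file name is only an assumption for illustration):

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

// Sketch only: scans a local copy of the weibo3 output for the keyword 豆浆
// and prints the id of each weibo containing it together with its TF-IDF score.
public class DoujiangFilter {
    public static void main(String[] args) throws IOException {
        try (BufferedReader br = new BufferedReader(new FileReader("part-r-00000"))) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] cols = line.split("\t");
                for (int i = 1; i < cols.length; i++) {
                    if (cols[i].startsWith("豆浆:")) {
                        System.out.println(cols[0] + "\t" + cols[i]);
                    }
                }
            }
        }
    }
}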