HDFS source files ------> read the files ------> InputFormat: TextInputFormat ------>
------> custom Map logic: convert <K1, V1> to <K2, V2> by extending Mapper and overriding the map method ------>
------> Shuffle stage ------> new <K2, V2> passed on to the Reduce stage
package cn.itclass.mapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/*
Four generic type parameters:
Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
<K1, V1>  input key/value types
<K2, V2>  output key/value types
*/
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    // The map method converts each <K1, V1> into <K2, V2>.
    /*
    Parameters:
    key:     K1, the byte offset of the line (rarely useful)
    value:   V1, the text of one line
    context: the context object that connects the stages
    */
    /*
    How <K1, V1> becomes <K2, V2>:
    K1    V1
    0     hello,world,hadoop
    15    hdfs,hive,hello
    -----------------------
    K2       V2
    hello    1
    world    1
    hadoop   1
    hdfs     1
    hive     1
    hello    1
    */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1: split the line into words
        String[] split = value.toString().split(",");
        // 2: iterate over the words and build <K2, V2>
        Text text = new Text();
        LongWritable longWritable = new LongWritable();
        for (String word : split) {
            // 3: write <K2, V2> to the context via context.write()
            text.set(word);
            longWritable.set(1);
            context.write(text, longWritable);
        }
    }
}
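The mapper can be checked in isolation with a driver-style unit test. The sketch below is only an illustration and assumes the Apache MRUnit test library and JUnit are on the classpath (neither is part of the code above); it replays the first sample line from the comment and asserts the expected <K2, V2> pairs.

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.junit.Test;

public class WordCountMapperTest {
    @Test
    public void mapsOneLineToWordCounts() throws Exception {
        // MRUnit drives the mapper without a real cluster (test-only dependency, an assumption here)
        MapDriver<LongWritable, Text, Text, LongWritable> driver =
                MapDriver.newMapDriver(new WordCountMapper());
        driver.withInput(new LongWritable(0), new Text("hello,world,hadoop"));
        // Expected <K2, V2> pairs, in the order the mapper emits them
        driver.withOutput(new Text("hello"), new LongWritable(1));
        driver.withOutput(new Text("world"), new LongWritable(1));
        driver.withOutput(new Text("hadoop"), new LongWritable(1));
        driver.runTest();
    }
}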
package cn.itclass.mapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/*
Four generic type parameters:
Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
KEYIN    : K2 type
VALUEIN  : V2 type
KEYOUT   : K3 type
VALUEOUT : V3 type
*/
public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    // The reduce method converts the new <K2, V2> into <K3, V3> and writes <K3, V3> to the context.
    /*
    Parameters:
    key:     the new K2
    values:  the collection of new V2 values
    context: the context object
    ----------------------
    How the new <K2, V2> becomes <K3, V3>:
    new K2   V2
    hello    <1,1,1>
    world    <1,1>
    hadoop   <1>
    ----------------------
    K3       V3
    hello    3
    world    2
    hadoop   1
    */
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        long count = 0;
        // 1: iterate over the collection and sum the values to get V3
        for (LongWritable value : values) {
            count += value.get();
        }
        // 2: write <K3, V3> to the context via context.write()
        context.write(key, new LongWritable(count));
    }
}
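Side note (not part of the original code): because this reducer only sums LongWritable values, and summing is associative and commutative, the same class could also be reused as a combiner for step 5 (combine) by adding job.setCombinerClass(WordCountReducer.class) to the job configuration, which pre-aggregates counts on the map side before the shuffle.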
package cn.itclass.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.net.URI;

public class JobMain extends Configured implements Tool {
    // The run() method defines one job.
    @Override
    public int run(String[] strings) throws Exception {
        // 1: create the job object
        // super.getConf() returns the Configuration object created in main()
        Job job = Job.getInstance(super.getConf(), "wordcount");
        job.setJarByClass(JobMain.class); // needed if running the packaged jar fails to locate the job classes
        // 2: configure the job (eight steps)
        // Step 1: set the input format and input path
        job.setInputFormatClass(TextInputFormat.class); // input class
        // Point to the input directory; all files inside it are read automatically
        TextInputFormat.addInputPath(job, new Path("hdfs://192.168.75.9:8020/wordcount"));
        // Step 2: set the Map-stage class and its output types
        job.setMapperClass(WordCountMapper.class); // custom Map logic
        // K2 type
        job.setMapOutputKeyClass(Text.class);
        // V2 type
        job.setMapOutputValueClass(LongWritable.class);
        // Steps 3, 4, 5, 6 (partition, sort, combine, group) use the defaults
        // Step 7: set the Reduce-stage class and its output types
        job.setReducerClass(WordCountReducer.class);
        // K3 type
        job.setOutputKeyClass(Text.class);
        // V3 type
        job.setOutputValueClass(LongWritable.class);
        // Step 8: set the output format
        job.setOutputFormatClass(TextOutputFormat.class);
        // Set the output path
        Path path = new Path("hdfs://192.168.75.9:8020/wordcount_out");
        TextOutputFormat.setOutputPath(job, path);
        // Get the FileSystem
        FileSystem fileSystem = FileSystem.get(new URI("hdfs://192.168.75.9:8020"), new Configuration());
        // Check whether the output directory already exists
        boolean bl2 = fileSystem.exists(path);
        if (bl2) {
            // Delete it so the job does not fail on an existing output directory
            fileSystem.delete(path, true);
        }
        // Wait for the job to finish
        boolean bl = job.waitForCompletion(true);
        // 0 means success, anything else means failure
        return bl ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        // Launch the job
        int run = ToolRunner.run(configuration, new JobMain(), args);
        // run holds the exit status: 0 means success, anything else means failure
        System.exit(run);
    }
}
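Usage note: once the project is packaged (the jar name below is only an example), the job can be submitted from a node that can reach the cluster with the standard command hadoop jar wordcount.jar cn.itclass.mapreduce.JobMain; the setJarByClass(JobMain.class) call above is what tells Hadoop which jar to ship to the cluster.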
public class JobMain extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        // 1: create the job object
        Job job = Job.getInstance(super.getConf(), "jobName");
        // 2: configure the job (eight steps)
        // Step 1: set the input format and input path
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("hdfs://"));
        // Step 2: set the Mapper class and its output types
        job.setMapperClass(xxx.class); // the class that extends Mapper
        job.setMapOutputKeyClass(xxx.class);
        job.setMapOutputValueClass(xxx.class);
        // Step 3: partition
        // Step 4: sort
        // Step 5: combine
        // Step 6: group
        // Step 7: set the Reducer class and its output types
        job.setReducerClass(xxx.class);
        job.setOutputKeyClass(xxx.class);
        job.setOutputValueClass(xxx.class);
        // Step 8: set the output format and output path
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("hdfs://"));
        // 3: wait for the job to finish
        boolean bl = job.waitForCompletion(true);
        return bl ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        // Launch the job
        int run = ToolRunner.run(configuration, new JobMain(), args);
        System.exit(run);
    }
}
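For reference, steps 3, 4, and 6 fall back to Hadoop's defaults when nothing is set. The calls below are not in the original notes; they are a sketch of what configuring those steps explicitly could look like for the word-count job (placed inside run()), using the stock classes that match the default behaviour.

// Step 3: partition - HashPartitioner is the default partitioner
job.setPartitionerClass(org.apache.hadoop.mapreduce.lib.partition.HashPartitioner.class);
// Step 4: sort - Text keys already sort with Text.Comparator by default
job.setSortComparatorClass(org.apache.hadoop.io.Text.Comparator.class);
// Step 6: group - values are grouped by equal keys with the same comparator by default
job.setGroupingComparatorClass(org.apache.hadoop.io.Text.Comparator.class);
// Step 5 (combine) is optional; see the combiner note after WordCountReducer above.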
String Similarity Join
Input: string sets S and R, and a threshold θ on the edit distance ed
Output: all pairs (s, r) with s ∈ S, r ∈ R and ed(s, r) ≤ θ
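To make the definition concrete, the simplest (and slowest) baseline verifies every pair with the standard dynamic-programming edit distance; practical similarity-join algorithms instead prune candidate pairs before this verification step. The class and method names below are illustrative only.

import java.util.ArrayList;
import java.util.List;

public class NaiveStringSimilarityJoin {
    // Standard dynamic-programming Levenshtein edit distance, O(|a|*|b|) time.
    static int editDistance(String a, String b) {
        int[][] dp = new int[a.length() + 1][b.length() + 1];
        for (int i = 0; i <= a.length(); i++) dp[i][0] = i; // delete the whole prefix of a
        for (int j = 0; j <= b.length(); j++) dp[0][j] = j; // insert the whole prefix of b
        for (int i = 1; i <= a.length(); i++) {
            for (int j = 1; j <= b.length(); j++) {
                int cost = a.charAt(i - 1) == b.charAt(j - 1) ? 0 : 1; // substitution cost
                dp[i][j] = Math.min(Math.min(dp[i - 1][j] + 1,    // deletion
                                             dp[i][j - 1] + 1),   // insertion
                                    dp[i - 1][j - 1] + cost);     // match / substitution
            }
        }
        return dp[a.length()][b.length()];
    }

    // Nested-loop verification: return every (s, r) with ed(s, r) <= theta.
    static List<String[]> join(List<String> S, List<String> R, int theta) {
        List<String[]> result = new ArrayList<>();
        for (String s : S) {
            for (String r : R) {
                if (editDistance(s, r) <= theta) {
                    result.add(new String[]{s, r});
                }
            }
        }
        return result;
    }
}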