An MR program can run in two environments: a local test environment and the server (cluster) environment.
Method 1: run on the server
The job is submitted from the command line on the server itself, and the whole execution also happens on the server.
a. Package the MR program as a jar and upload it to the server.
b. Run it with: hadoop jar <jar path> <fully qualified main class>, e.g. hadoop jar wc.jar com.mr.RunJob (shown below).
WordCountMapper.java
package com.mr;
import java.io.IOException;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Uses the default input format (TextInputFormat, a FileInputFormat subclass): it reads the
 * input split line by line, with the byte offset of the line as the key and the line's
 * content as the value. For the line "hello world hello" this mapper emits
 * (hello,1), (world,1), (hello,1).
 * @author benxi
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    // the word serves as the output key
    private Text k = new Text();
    // the constant 1 serves as the output value
    private IntWritable v = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] words = StringUtils.split(value.toString(), " ");
        for (String w : words) {
            // emit (word, 1) for every word in the line
            k.set(w);
            context.write(k, v);
        }
    }
}
WorldCountReduce.java
package com.mr;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class WorldCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    /**
     * Receives (word, [1, 1, ...]) and sums the counts into (word, n).
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> ite, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable i : ite) {
            sum += i.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
RunJob.java
package com.mr;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class RunJob {
    public static void main(String[] args) {
        Configuration config = new Configuration();
        try {
            FileSystem fs = FileSystem.get(config);
            Job job = Job.getInstance(config);
            job.setJarByClass(RunJob.class);
            job.setMapperClass(WordCountMapper.class);
            job.setReducerClass(WorldCountReduce.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
            // input data for the job: /usr/input/wc.txt
            FileInputFormat.setInputPaths(job, new Path("/usr/input/wc.txt"));
            // the output directory must not exist, or the job fails; delete it first if present
            Path output = new Path("/usr/output/wc");
            if (fs.exists(output)) {
                fs.delete(output, true);
            }
            FileOutputFormat.setOutputPath(job, output);
            boolean f = job.waitForCompletion(true);
            if (f) {
                System.out.println("job finished successfully");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
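Since the reduce logic here (integer addition) is associative and commutative, the reducer class can optionally be registered as a combiner to pre-aggregate the map output before the shuffle. This is not part of the original code, just a common word-count optimization:

            // optional: pre-aggregate (word, 1) pairs on the map side before the shuffle
            job.setCombinerClass(WorldCountReduce.class);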
Then export the project: in Eclipse choose Java > JAR file, select the src directory, and specify a file name and path.
Copy the generated jar to the Linux machine and run: hadoop jar wc.jar com.mr.RunJob
Method 2: run locally
Local execution on Windows relies on winutils.exe in the Hadoop bin directory.
1. Configure the Hadoop environment variables (HADOOP_HOME) on Windows.
2. Copy the debug tool (winutils.exe) into HADOOP_HOME/bin.
3. Patch the Hadoop source code (see the sketch after this list); also make sure the project's build path uses the lib of an actually installed JDK.
4. The MR driver code has to change:
a. src must not contain the server's Hadoop configuration files (core-site.xml etc.).
b. Set the file system explicitly when submitting:
Configuration config = new Configuration();
config.set("fs.defaultFS", "hdfs://node7:8020");
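Step 3 above usually means copying org.apache.hadoop.io.nativeio.NativeIO from the Hadoop source into the project and relaxing the Windows permission check. A minimal sketch, assuming a Hadoop 2.x source tree (the exact method location varies by version):

    // inside the copied NativeIO class, in the inner Windows class:
    public static boolean access(String path, AccessRight desiredAccess)
            throws IOException {
        // skip the native permission check so local MR jobs can run on Windows
        return true;
    }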
Only RunJob changes; the other Java files stay the same.
public class RunJob {
    public static void main(String[] args) {
        Configuration config = new Configuration();
        // access HDFS as the user root
        System.setProperty("HADOOP_USER_NAME", "root");
        config.set("fs.defaultFS", "hdfs://CentOS8:8020");
        try {
            FileSystem fs = FileSystem.get(config);
            Job job = Job.getInstance(config);
            job.setJarByClass(RunJob.class);
            job.setJobName("wc");
            job.setMapperClass(WordCountMapper.class);
            job.setReducerClass(WorldCountReduce.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
            // input data for the job: /usr/input/wc.txt
            FileInputFormat.setInputPaths(job, new Path("/usr/input/wc.txt"));
            // the output directory must not exist, or the job fails; delete it first if present
            Path output = new Path("/usr/output/wc");
            if (fs.exists(output)) {
                fs.delete(output, true);
            }
            FileOutputFormat.setOutputPath(job, output);
            boolean f = job.waitForCompletion(true);
            if (f) {
                System.out.println("job finished successfully");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
Method 3: submit from the local machine and run on the server
The cluster host names must be mapped to their IP addresses in the local hosts file.
The job is submitted locally, but the actual execution happens on the server (this is the real production setup).
a. Package the MR program as a jar and keep it on the local machine.
b. Patch the Hadoop source code as in method 2; again make sure the project's build path uses the lib of an actually installed JDK.
c. Add one property that points to the packaged jar:
config.set("mapred.jar", "C:\\Users\\Administrator\\Desktop\\wc.jar");
d. Run the main method locally; a servlet can invoke the MR job the same way.
public class RunJob {
    public static void main(String[] args) {
        Configuration config = new Configuration();
        System.setProperty("HADOOP_USER_NAME", "root");
        // config.set("fs.defaultFS", "hdfs://CentOS8:8020");
        // point mapred.jar at the packaged job jar, e.g. C:\Users\benxi\Desktop\wc.jar
        config.set("mapred.jar", "C:\\Users\\benxi\\Desktop\\wc.jar");
        try {
            FileSystem fs = FileSystem.get(config);
            Job job = Job.getInstance(config);
            job.setJarByClass(RunJob.class);
            job.setJobName("wc");
            job.setMapperClass(WordCountMapper.class);
            job.setReducerClass(WorldCountReduce.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
            // input data for the job: /usr/input/wc.txt
            FileInputFormat.setInputPaths(job, new Path("/usr/input/wc.txt"));
            // the output directory must not exist, or the job fails; delete it first if present
            Path output = new Path("/usr/output/wc");
            if (fs.exists(output)) {
                fs.delete(output, true);
            }
            FileOutputFormat.setOutputPath(job, output);
            boolean f = job.waitForCompletion(true);
            if (f) {
                System.out.println("job finished successfully");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
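Note that mapred.jar is the legacy property name; on Hadoop 2.x the same setting is exposed as mapreduce.job.jar, so the following should be equivalent (an assumption based on the Hadoop 2 property rename, not from the original notes):

        config.set("mapreduce.job.jar", "C:\\Users\\benxi\\Desktop\\wc.jar");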
To split each input line into key and value at the first tab character, set the input format:
job.setInputFormatClass(KeyValueTextInputFormat.class);
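With KeyValueTextInputFormat both the map input key and value are Text (the part before the first tab and the rest of the line), so the mapper's type parameters must change accordingly. A minimal sketch of a matching mapper (the class name KvMapper is made up for illustration):

package com.mr;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class KvMapper extends Mapper<Text, Text, Text, IntWritable> {
    @Override
    protected void map(Text key, Text value, Context context)
            throws IOException, InterruptedException {
        // key = text before the first tab, value = text after it
        context.write(key, new IntWritable(1));
    }
}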
To read several input files in one job, pass multiple paths:
FileInputFormat.setInputPaths(job, new Path[] {new Path("/usr/input/network.txt"), new Path("/usr/input/user.txt")});
To find out which file the current split comes from inside map:
FileSplit split = (FileSplit) context.getInputSplit();
String name = split.getPath().getName();
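When several files feed one job, this file name is typically used to tag each record, e.g. to tell network.txt and user.txt apart in a reduce-side join. A minimal sketch under that assumption (TagMapper and the N#/U# tags are made up for illustration):

package com.mr;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class TagMapper extends Mapper<LongWritable, Text, Text, Text> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // find out which input file this split came from
        FileSplit split = (FileSplit) context.getInputSplit();
        String name = split.getPath().getName();
        // prefix each record with a source tag so the reducer can tell the inputs apart;
        // a real join would parse a join key out of the record to use as the output key
        String tag = name.startsWith("network") ? "N#" : "U#";
        context.write(new Text(name), new Text(tag + value.toString()));
    }
}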