0 The map function is called once for every input line; the reduce function is called once for every key group.
1 Code: the main thing to look at is how the main function is written; compare it with the hadoop1 style where the two differ. The job is built and run with Ant.
For setting up the cluster this runs on, see: hadoop2 — building an HA cluster with automatic failover, plus a YARN cluster.
Note the package imports for the Mapper and Reducer classes and for the Context types used by the map() and reduce() methods.
When the job is packaged and run on the cluster, job.setJarByClass is required.
package mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountApp {

    public static class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        /**
         * The map function is called once per input line.
         * @param key   byte offset of the line within the source file
         * @param value text content of the line
         */
        @Override
        protected void map(LongWritable key, Text value,
                org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, Text, LongWritable>.Context context)
                throws java.io.IOException, InterruptedException {
            final String[] splited = value.toString().split("\t");
            for (String word : splited) {
                context.write(new Text(word), new LongWritable(1));
            }
        }
    }

    // Map output:        <hello,1> <you,1> <hello,1> <me,1>
    // Sorted by key:     <hello,1> <hello,1> <me,1> <you,1>
    // Grouped:           <hello,{1,1}> <me,{1}> <you,{1}>   (values sharing a key are collected together)
    // reduce() is called once per group; the result above has 3 groups, so reduce() runs 3 times.
    public static class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        /**
         * The reduce function is called once per key group.
         * @param word  the word
         * @param times iterator over all values that share the same key
         */
        @Override
        protected void reduce(Text word, java.lang.Iterable<LongWritable> times,
                org.apache.hadoop.mapreduce.Reducer<Text, LongWritable, Text, LongWritable>.Context context)
                throws java.io.IOException, InterruptedException {
            long sum = 0L;
            for (LongWritable longWritable : times) {
                sum += longWritable.get();
            }
            context.write(word, new LongWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        // Create the Job object
        final Configuration conf = new Configuration();
        final Job job = new Job(conf);
        job.setJobName(WordCountApp.class.getSimpleName());
        job.setJarByClass(WordCountApp.class);

        // Register the custom mapper and reducer
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        // Declare the map and reduce output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Data source and output destination for the job
        FileInputFormat.addInputPaths(job, "/test");   // note: addInputPaths, the plural form of the method
        FileOutputFormat.setOutputPath(job, new Path("/out1"));

        // Run the job
        job.waitForCompletion(true);
    }
}
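The driver above hard-codes /test and /out1, so a rerun fails once /out1 exists. Below is a minimal sketch of an alternative driver that takes both paths from the command line and clears a stale output directory first. The class name WordCountDriver, the args[0]/args[1] convention, and the delete-before-run step are assumptions for illustration, not part of the original code; to use it through the build.xml below, the paths would also have to be appended to the hadoop jar command.

package mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical alternative driver; reuses the mapper and reducer defined in WordCountApp above.
public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        final Configuration conf = new Configuration();
        final Job job = new Job(conf);
        job.setJobName(WordCountDriver.class.getSimpleName());
        job.setJarByClass(WordCountDriver.class);

        job.setMapperClass(WordCountApp.WordCountMapper.class);
        job.setReducerClass(WordCountApp.WordCountReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Assumed convention: args[0] = input path, args[1] = output path
        final Path outputPath = new Path(args[1]);
        final FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);   // remove stale output so the job can be rerun
        }
        FileInputFormat.addInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, outputPath);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}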
2 build.xml:
<?xml version="1.0" encoding="UTF-8"?> <!-- 该文件与src文件夹、lib文件夹同一级 --> <project name="hadoop2测试项目" basedir="." default="sshexec"> <!--属性设置--> <property environment="env" /> <property file="build.properties" /> <property name="src.dir" value="${basedir}/src" /> <property name="java.lib.dir" value="${env.JAVA_HOME}/lib" /> <property name="classes.dir" value="${basedir}/classes" /> <property name="dist.dir" value="${basedir}/dist" /> <property name="project.lib.dir" value="${basedir}/lib" /> <property name="localpath.dir" value="${basedir}" /> <property name="remote.home" value="~"/> <!--可以修改:hadoop集群的hostname或者ip--> <property name="remote.hostname" value="h2master"/> <!--可以修改:登录hadoop集群所在linux的用户名--> <property name="remote.username" value="root"/> <!--可以修改:登录hadoop集群所在liniux的密码--> <property name="remote.password" value="123456"/> <!--可以修改:每次需要运行的main类,写到这里。运行时拼接为hadoop jar xxx.jar MainClass --> <property name="main.class" value="mapreduce.WordCountApp"/> <!--可以修改:hadoop集群在linux的部署路径--> <property name="hadoop.path" value="/usr/local/hadoop2.5"/> <!-- 基本编译路径设置 --> <path id="compile.classpath"> <fileset dir="${java.lib.dir}"> <include name="tools.jar" /> </fileset> <fileset dir="${project.lib.dir}"> <include name="*.jar" /> </fileset> </path> <!-- 运行路径设置 --> <path id="run.classpath"> <path refid="compile.classpath" /> <pathelement location="${classes.dir}" /> </path> <!-- 清理,删除临时目录 --> <target name="clean" description="清理,删除临时目录"> <!--delete dir="${build.dir}" /--> <delete dir="${dist.dir}" /> <delete dir="${classes.dir}" /> <echo level="info">清理完毕</echo> </target> <!-- 初始化,建立目录,复制文件 --> <target name="init" depends="clean" description="初始化,建立目录,复制文件"> <mkdir dir="${classes.dir}" /> <mkdir dir="${dist.dir}" /> </target> <!-- 编译源文件--> <target name="compile" depends="init" description="编译源文件"> <javac srcdir="${src.dir}" destdir="${classes.dir}" source="1.6" target="1.6" includeAntRuntime="false"> <classpath refid="compile.classpath" /> <compilerarg line="-encoding UTF-8 "/> </javac> </target> <!-- 打包类文件 --> <target name="jar" depends="compile" description="打包类文件"> <jar jarfile="${dist.dir}/jar.jar"> <fileset dir="${classes.dir}" includes="**/*.*" /> </jar> </target> <!--上传到服务器 **需要把lib目录下的jsch-0.1.51拷贝到$ANT_HOME/lib下,如果是Eclipse下的Ant环境必须在Window->Preferences->Ant->Runtime->Classpath中加入jsch-0.1.51。 --> <target name="ssh" depends="jar"> <scp file="${dist.dir}/jar.jar" todir="${remote.username}@${remote.hostname}:${remote.home}" password="${remote.password}" trust="true"/> </target> <target name="sshexec" depends="ssh"> <sshexec host="${remote.hostname}" username="${remote.username}" password="${remote.password}" trust="true" command="${hadoop.path}/bin/hadoop jar ${remote.home}/jar.jar ${main.class}"/> </target> </project>
3 Explanation of the execution output:
15/01/12 17:15:48 INFO mapreduce.Job: Counters: 49          49 counters in total
 [sshexec]  File System Counters                            counter group name; the lines below list the counters in this group
 [sshexec]      FILE: Number of bytes read=65               FILE counters ---> local (Linux) disk I/O performed while the job runs
 [sshexec]      FILE: Number of bytes written=197929
 [sshexec]      FILE: Number of read operations=0
 [sshexec]      FILE: Number of large read operations=0
 [sshexec]      FILE: Number of write operations=0
 [sshexec]      HDFS: Number of bytes read=104              HDFS counters ---> HDFS usage while the job runs
 [sshexec]      HDFS: Number of bytes written=19
 [sshexec]      HDFS: Number of read operations=6
 [sshexec]      HDFS: Number of large read operations=0
 [sshexec]      HDFS: Number of write operations=2
 [sshexec]  Job Counters                                    counter group name
 [sshexec]      Launched map tasks=1                        number of map tasks launched; roughly, map task count = number of HDFS blocks occupied by the input file
 [sshexec]      Launched reduce tasks=1
 [sshexec]      Data-local map tasks=1                      if the data is stored on the same machine that runs the task, Data-local map tasks = Launched map tasks; data-local tasks save network traffic because nothing needs to be read from other nodes
 [sshexec]      Total time spent by all maps in occupied slots (ms)=104236
 [sshexec]      Total time spent by all reduces in occupied slots (ms)=18430
 [sshexec]      Total time spent by all map tasks (ms)=104236
 [sshexec]      Total time spent by all reduce tasks (ms)=18430
 [sshexec]      Total vcore-seconds taken by all map tasks=104236
 [sshexec]      Total vcore-seconds taken by all reduce tasks=18430
 [sshexec]      Total megabyte-seconds taken by all map tasks=106737664
 [sshexec]      Total megabyte-seconds taken by all reduce tasks=18872320
 [sshexec]  Map-Reduce Framework                            counter group name
 [sshexec]      Map input records=2                         (the input file has two lines, so the map function is called twice)
 [sshexec]      Map output records=4                        map output: <hello,1> <you,1> <hello,1> <me,1>
 [sshexec]      Map output bytes=51
 [sshexec]      Map output materialized bytes=65
 [sshexec]      Input split bytes=85
 [sshexec]      Combine input records=0
 [sshexec]      Combine output records=0
 [sshexec]      Reduce input groups=3                       (number of reduce input groups: <hello,{1,1}> <me,{1}> <you,{1}>)
 [sshexec]      Reduce shuffle bytes=65
 [sshexec]      Reduce input records=4                      (total number of values passed to reduce across all groups: 2 for hello, 1 for me, 1 for you)
 [sshexec]      Reduce output records=3                     result written after reduce finishes: hello 2, me 1, you 1
 [sshexec]      Spilled Records=8
 [sshexec]      Shuffled Maps =1
 [sshexec]      Failed Shuffles=0
 [sshexec]      Merged Map outputs=1
 [sshexec]      GC time elapsed (ms)=1768
 [sshexec]      CPU time spent (ms)=2990
 [sshexec]      Physical memory (bytes) snapshot=212107264
 [sshexec]      Virtual memory (bytes) snapshot=721317888
 [sshexec]      Total committed heap usage (bytes)=125792256
 [sshexec]  Shuffle Errors                                  counter group name
 [sshexec]      BAD_ID=0
 [sshexec]      CONNECTION=0
 [sshexec]      IO_ERROR=0
 [sshexec]      WRONG_LENGTH=0
 [sshexec]      WRONG_MAP=0
 [sshexec]      WRONG_REDUCE=0
 [sshexec]  File Input Format Counters                      counter group name
 [sshexec]      Bytes Read=19
 [sshexec]  File Output Format Counters                     counter group name
 [sshexec]      Bytes Written=19
BUILD SUCCESSFUL
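The framework counters printed above can also be read from the driver once the job finishes. A minimal sketch, assuming it replaces the bare job.waitForCompletion(true) call at the end of main() in WordCountApp and that two extra imports are added; the printed labels are just examples:

// Drop-in replacement for the last line of main(); requires two extra imports:
//   import org.apache.hadoop.mapreduce.Counters;
//   import org.apache.hadoop.mapreduce.TaskCounter;
boolean ok = job.waitForCompletion(true);
if (ok) {
    Counters counters = job.getCounters();
    // Same values shown in the "Map-Reduce Framework" counter group above
    long mapInputRecords   = counters.findCounter(TaskCounter.MAP_INPUT_RECORDS).getValue();   // 2 in this run
    long mapOutputRecords  = counters.findCounter(TaskCounter.MAP_OUTPUT_RECORDS).getValue();  // 4
    long reduceInputGroups = counters.findCounter(TaskCounter.REDUCE_INPUT_GROUPS).getValue(); // 3
    System.out.println("map in=" + mapInputRecords
            + ", map out=" + mapOutputRecords
            + ", reduce groups=" + reduceInputGroups);
}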
Custom counters, using a combiner, and a custom partitioner work the same way as in hadoop1; see the hadoop1 write-up for details (a sketch follows below).
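As a reminder of what those additions look like against the Hadoop 2 API, here is a minimal sketch. The partitioner class name, the counter group/name strings, and the reducer count are made up for illustration; the combiner simply reuses WordCountReducer, which is safe here because summing counts is associative.

// Custom partitioner: same idea as in hadoop1, only the base class changes to
// org.apache.hadoop.mapreduce.Partitioner (hypothetical class for illustration).
public static class WordPartitioner extends org.apache.hadoop.mapreduce.Partitioner<Text, LongWritable> {
    @Override
    public int getPartition(Text key, LongWritable value, int numPartitions) {
        // Route keys to reducers by hash; mirrors the default HashPartitioner behaviour
        return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

// Inside map(), a custom counter can be incremented; the group/name strings are arbitrary:
//     context.getCounter("WordCountApp", "emptyWords").increment(1L);

// In main(), before job.waitForCompletion(true):
//     job.setCombinerClass(WordCountReducer.class);   // local pre-aggregation of <word,1> pairs on the map side
//     job.setPartitionerClass(WordPartitioner.class);
//     job.setNumReduceTasks(2);                       // more than one reducer, so partitioning actually matters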