First, prepare everything covered in the previous post.
In addition, create a new file named idea.bat in the IDEA installation directory (the same directory as idea.exe); you can create a plain txt file and rename the extension. Put the following line in it:
runas /user:root /savecred idea.exe
This runs idea.exe as the Windows account named root that was created earlier. The first time you run the batch file you will be prompted for the root account's password (the /savecred switch caches it for later runs). If IDEA has previously been used under the root account, it will pick up the configuration stored for that account.
A Test class was already created in the previous post; now create a new WordCount2 class:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.StringUtils;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.*;

public class WordCount2 {

    public static class TokenizerMapper
            extends Mapper<Object, Text, Text, IntWritable> {

        static enum CountersEnum { INPUT_WORDS }

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        private boolean caseSensitive;
        private Set<String> patternsToSkip = new HashSet<String>();

        private Configuration conf;
        private BufferedReader fis;

        @Override
        public void setup(Context context) throws IOException, InterruptedException {
            conf = context.getConfiguration();
            caseSensitive = conf.getBoolean("wordcount.case.sensitive", false);
            if (conf.getBoolean("wordcount.skip.patterns", false)) {
                URI[] patternsURIs = Job.getInstance(conf).getCacheFiles();
                for (URI patternsURI : patternsURIs) {
                    Path patternsPath = new Path(patternsURI.getPath());
                    String patternsFileName = patternsPath.getName().toString();
                    parseSkipFile(patternsFileName);
                }
            }
        }

        private void parseSkipFile(String fileName) {
            try {
                fis = new BufferedReader(new FileReader(fileName));
                String pattern = null;
                while ((pattern = fis.readLine()) != null) {
                    patternsToSkip.add(pattern);
                }
            } catch (IOException ioe) {
                System.err.println("Caught exception while parsing the cached file '"
                        + StringUtils.stringifyException(ioe));
            }
        }

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = (caseSensitive) ? value.toString() : value.toString().toLowerCase();
            for (String pattern : patternsToSkip) {
                line = line.replaceAll(pattern, "");
            }
            StringTokenizer itr = new StringTokenizer(line);
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
                Counter counter = context.getCounter(CountersEnum.class.getName(),
                        CountersEnum.INPUT_WORDS.toString());
                counter.increment(1);
            }
        }
    }

    public static class IntSumReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args0) throws Exception {
        //String[] args1 = args0;
        // The input/output paths would normally come from the program arguments;
        // they are hard-coded here to make debugging easier.
        String[] args1 = {"hdfs://192.168.145.128:9000/tmp/input",
                "hdfs://192.168.145.128:9000/tmp/output"};

        Configuration conf = new Configuration();
        // Everyone's Maven repository path is different, so use your own path here.
        // For example, mine is F:\Maven\repo and the groupId is me.j360.hadoop.
        // Without this line the job fails with ClassNotFoundException.
        conf.set("mapred.jar", "F:\\Maven\\repo\\me\\j360\\hadoop\\1.0-SNAPSHOT\\hadoop-1.0-SNAPSHOT.jar");

        // The settings below must match the Hadoop cluster configuration, including the port numbers.
        conf.setBoolean("mapreduce.app-submission.cross-platform", true); // enable cross-platform submission
        String flag = conf.get("mapreduce.app-submission.cross-platform");
        System.out.println(flag);
        conf.set("fs.defaultFS", "hdfs://192.168.145.128:9000");                    // namenode address
        conf.set("mapreduce.framework.name", "yarn");                               // use the YARN framework
        conf.set("yarn.resourcemanager.address", "192.168.145.128:8032");           // resourcemanager address
        conf.set("yarn.resourcemanager.scheduler.address", "192.168.145.128:8030"); // resource scheduler address
        conf.set("mapreduce.jobhistory.address", "192.168.145.128:10020");

        GenericOptionsParser optionParser = new GenericOptionsParser(conf, args1);
        String[] remainingArgs = optionParser.getRemainingArgs();
        if (remainingArgs.length != 2 && remainingArgs.length != 4) {
            System.err.println("Usage: wordcount <in> <out> [-skip skipPatternFile]");
            System.exit(2);
        }

        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount2.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        List<String> otherArgs = new ArrayList<String>();
        for (int i = 0; i < remainingArgs.length; ++i) {
            if ("-skip".equals(remainingArgs[i])) {
                job.addCacheFile(new Path(remainingArgs[++i]).toUri());
                job.getConfiguration().setBoolean("wordcount.skip.patterns", true);
            } else {
                otherArgs.add(remainingArgs[i]);
            }
        }
        FileInputFormat.addInputPath(job, new Path(otherArgs.get(0)));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));

        if (job.waitForCompletion(true)) {
            System.out.println("Job finished...");

            // Print the result using the HDFS API.
            FileSystem fs = FileSystem.get(URI.create("hdfs://192.168.145.128:9000"), conf);

            // List all files and directories under /tmp/output on HDFS.
            FileStatus[] statuses = fs.listStatus(new Path("/tmp/output"));
            for (FileStatus status : statuses) {
                System.out.println(status);
                System.out.println(status.getPath());
            }

            // Print the content of the result file under /tmp/output on HDFS.
            InputStream is = fs.open(new Path("/tmp/output/part-r-00000"));
            IOUtils.copyBytes(is, System.out, 1024, true);
        }
    }
}
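The hard-coded args1 above only passes the input and output paths. To exercise the case-sensitivity flag and the skip-pattern branch as well, args1 can be extended roughly as follows. This is just a sketch: the pattern file path /tmp/patterns.txt is an assumption and must already exist on HDFS, with one regular expression per line.

// Hypothetical argument list exercising the optional features.
// The -D option is consumed by GenericOptionsParser; -skip is handled
// by the loop in main() that calls job.addCacheFile(...).
String[] args1 = {
        "-Dwordcount.case.sensitive=true",        // keep the original case while tokenizing
        "hdfs://192.168.145.128:9000/tmp/input",
        "hdfs://192.168.145.128:9000/tmp/output",
        "-skip", "/tmp/patterns.txt"              // assumed pattern file on HDFS
};

With four remaining arguments the usage check still passes, and every line matching a pattern from the cached file is stripped before tokenizing.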
Run the main method and wait a few seconds. If the job complains that the output directory already exists, change the output path (or delete the old directory first, as in the sketch below); if a file cannot be found, debug it with the HDFS API.
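A minimal sketch for the "output directory already exists" case, using the same HDFS API and assuming the cluster settings configured above; it removes the previous output directory before the job is submitted so FileOutputFormat does not reject it:

// Sketch: delete the old output directory before resubmitting the job.
// The namenode address and /tmp/output path match the configuration above.
FileSystem fs = FileSystem.get(URI.create("hdfs://192.168.145.128:9000"), conf);
Path output = new Path("/tmp/output");
if (fs.exists(output)) {
    fs.delete(output, true);   // true = recursive delete
}

On a successful run, the console prints output similar to the following: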
FileStatus{path=hdfs://192.168.145.128:9000/tmp/output/_SUCCESS; isDirectory=false; length=0; replication=3; blocksize=134217728; modification_time=1429001943525; access_time=1429001943504; owner=root; group=supergroup; permission=rw-r--r--; isSymlink=false}
FileStatus{path=hdfs://192.168.145.128:9000/tmp/output/part-r-00000; isDirectory=false; length=45; replication=3; blocksize=134217728; modification_time=1429001943215; access_time=1429001942602; owner=root; group=supergroup; permission=rw-r--r--; isSymlink=false}
bye	2
hadoop	2
hello	3
jj	1
world	1
world!	1
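The mapper also increments a custom counter (CountersEnum.INPUT_WORDS). As a quick sanity check against the word counts above, the counter can be read back after job.waitForCompletion(true) returns; a sketch of the relevant lines inside main():

// Sketch: read back the custom counter incremented in TokenizerMapper.
// The group name matches what the mapper used: CountersEnum.class.getName().
long inputWords = job.getCounters()
        .findCounter(WordCount2.TokenizerMapper.CountersEnum.class.getName(),
                WordCount2.TokenizerMapper.CountersEnum.INPUT_WORDS.toString())
        .getValue();
System.out.println("INPUT_WORDS = " + inputWords);

For the sample data above this should equal the total number of tokens processed by the mappers, i.e. the sum of the counts in part-r-00000.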