Hadoop study notes: WordCount

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

// My environment is Hadoop 1.1.2
public class WordCount {

	public static void main(String[] args) {
		try {
			// Configuration
			Configuration conf = new Configuration();
			// Default arguments for local testing; real command-line arguments override them
			String[] otherArgs = new String[] { "file1.txt", "hehe001" };
			if (args.length == 2) {
				otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
			}
			if (otherArgs.length != 2) {
				System.err.println("Usage: wordcount <in> <out>");
				System.exit(2);
			}
			// Create one job
			Job job = new Job(conf, "my word count");
			// The jar this job's classes come from
			job.setJarByClass(WordCount.class);
			// Map phase
			job.setMapperClass(TokenizerMapper.class);
			// Combiner (map-side pre-aggregation)
			job.setCombinerClass(IntSumReducer.class);
			// Reduce phase
			job.setReducerClass(IntSumReducer.class);
			// Output key type
			job.setOutputKeyClass(Text.class);
			// Output value type
			job.setOutputValueClass(IntWritable.class);
			Path inputPath = new Path(otherArgs[0]);
			System.out.println("inputPath:" + inputPath.toString());
			// Input file in HDFS
			FileInputFormat.setInputPaths(job, inputPath);
			// Output directory in HDFS
			Path outputPath = new Path(otherArgs[1]);
			System.out.println("outputPath:" + outputPath.toString());
			FileOutputFormat.setOutputPath(job, outputPath);
			// Run the job and exit with its status
			boolean success = job.waitForCompletion(true);
			if (success) {
				System.out.println("ok");
			}
			System.exit(success ? 0 : 1);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {

		// A reusable IntWritable fixed to 1: each token counts as one occurrence
		private final static IntWritable one = new IntWritable(1);
		private Text word = new Text();

		@Override
		protected void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {
			// map is called once per input line
			String line = value.toString();
			// Split the line into tokens on whitespace
			StringTokenizer itr = new StringTokenizer(line);
			while (itr.hasMoreTokens()) {
				// Emit key = token string, value = 1
				word.set(itr.nextToken());
				context.write(word, one);
			}
		}
	}

	public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

		private IntWritable result = new IntWritable();

		@Override
		protected void reduce(Text key, Iterable<IntWritable> values, Context context)
				throws IOException, InterruptedException {
			// key = a token, values = its counts (1s from the map, or partial sums from the combiner)
			int sum = 0;
			for (IntWritable val : values) {
				sum += val.get();
			}
			result.set(sum);
			context.write(key, result);
		}
	}

}
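
The new Job(conf, name) constructor used above works on Hadoop 1.1.2 but is deprecated in Hadoop 2.x, where Job.getInstance(conf, name) is the preferred factory method. Below is a minimal sketch of the same driver against that newer API; the class name WordCountDriver2 and the reuse of the TokenizerMapper/IntSumReducer classes above are my own assumptions, not part of the original notes.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Sketch only: assumes Hadoop 2.x or later and reuses the classes defined above
public class WordCountDriver2 {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		// Job.getInstance replaces the deprecated new Job(conf, name)
		Job job = Job.getInstance(conf, "my word count");
		job.setJarByClass(WordCountDriver2.class);
		job.setMapperClass(WordCount.TokenizerMapper.class);
		job.setCombinerClass(WordCount.IntSumReducer.class);
		job.setReducerClass(WordCount.IntSumReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}

Either version is packaged into a jar and submitted with the hadoop jar command, passing the input file and a not-yet-existing output directory as the two arguments.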


A second example, Dedup.java in package old, removes duplicate lines from the input:

package old;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class Dedup {
	// map copies the input value onto the output key and emits it directly
	public static class Map extends Mapper<Object, Text, Text, Text> {
		private static Text line = new Text(); // one line of input data

		// Implement the map function
		@Override
		public void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {
			line = value;
			context.write(line, new Text(""));
		}
	}

	// reduce copies the input key onto the output key and emits it directly
	public static class Reduce extends Reducer<Text, Text, Text, Text> {
		// Implement the reduce function
		@Override
		public void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {

			context.write(key, new Text(""));
		}
	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		// This line matters: it tells the client which JobTracker to submit the job to (Hadoop 1.x configuration)
		conf.set("mapred.job.tracker", "192.168.1.9:9001");
		String[] ioArgs = new String[] { "testdata.txt", "dedup_out" };
		if (args.length == 2) {
			ioArgs = args;
		}
		String[] otherArgs = new GenericOptionsParser(conf, ioArgs)
				.getRemainingArgs();
		if (otherArgs.length != 2) {
			System.err.println("Usage: Data Deduplication <in> <out>");
			System.exit(2);

		}
		Job job = new Job(conf, "Data Deduplication");
		job.setJarByClass(Dedup.class);
		// Set the Mapper, Combiner and Reducer classes
		job.setMapperClass(Map.class);
		job.setCombinerClass(Reduce.class);
		job.setReducerClass(Reduce.class);
		// Set the output key/value types (the map output key here is Text, not the default LongWritable)
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		// Set the input and output paths
		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
		FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}
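
The deduplication works because the map turns every input line into a key, the shuffle groups identical keys, and reduce is then invoked exactly once per distinct line and writes it out once. Below is a minimal local (non-MapReduce) sketch of the same idea; the input file name testdata.txt on the local filesystem is an assumption for illustration.

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.LinkedHashSet;
import java.util.Set;

// Sketch only: mimics outside of Hadoop what shuffle + reduce achieve in Dedup
public class DedupLocalSketch {
	public static void main(String[] args) throws IOException {
		// Every line becomes a "key"; the set keeps each distinct line exactly once,
		// just as reduce sees each distinct key a single time
		Set<String> distinct = new LinkedHashSet<>(Files.readAllLines(Paths.get("testdata.txt")));
		for (String line : distinct) {
			System.out.println(line);
		}
	}
}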

