Implementing WordCount with Hive and with the Java API, plus some personal takeaways

The entry-level exercise for Hadoop is WordCount, traditionally written with the Java API that Hadoop provides. Unlike a programming language's introductory "hello world", it cannot be knocked out in just a few lines of code. Below I demonstrate two ways I implement WordCount: with the Java API and with Hive.

Prerequisites

On a Linux machine, create word.txt with the following content (word.txt then needs to be uploaded to HDFS, as shown below):

hello tom
hello jerry
hello kitty
hello word
hello tom
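
A minimal sketch of uploading the file to HDFS, assuming word.txt was saved under /root and the Hadoop cluster is already running (hadoop fs -put and -cat are the standard HDFS shell commands):

# copy the local file into the HDFS root directory
hadoop fs -put /root/word.txt /word.txt
# verify the upload by printing the file back
hadoop fs -cat /word.txt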

Implementing WordCount with the Java API

package cn.itcast.hadoop.mr;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/*
 * 1. Analyze the business logic and decide on the input/output data formats.
 * 2. Define a class that extends org.apache.hadoop.mapreduce.Mapper,
 *    override map() with the business logic, and emit the new key/value pairs.
 * 3. Define a class that extends org.apache.hadoop.mapreduce.Reducer
 *    and override reduce() with the business logic.
 * 4. Wire the custom mapper and reducer together through a Job object.
 */

public class WordCount {
	public static class WCMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			// receive one line of input text
			String line = value.toString();
			// split the line into words
			String[] wordsStrings = line.split(" ");
			// for each word, emit (word, 1)
			for (String w : wordsStrings) {
				context.write(new Text(w), new LongWritable(1));
			}
		}
	}
	public static class WCReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

		@Override
		protected void reduce(Text key, Iterable<LongWritable> v2s, Context context)
				throws IOException, InterruptedException {
			// define a counter for this key
			long counter = 0;
			// sum all the 1s emitted by the mapper for this word
			for (LongWritable i : v2s)
			{
				counter += i.get();
			}
			// emit (word, total count)
			context.write(key, new LongWritable(counter));
		}
	}
	public static void main(String[] args) throws Exception {
		// build the Job object
		Job job = Job.getInstance(new Configuration());
		
		// note: the class that contains the main method
		job.setJarByClass(WordCount.class);
		
		// configure the mapper
		job.setMapperClass(WCMapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(LongWritable.class);
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		
		// configure the reducer
		job.setReducerClass(WCReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(LongWritable.class);
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		// reuse the reducer as a combiner so map output is pre-aggregated locally
		job.setCombinerClass(WCReducer.class);
		// submit the job and wait for it to finish
		job.waitForCompletion(true);
	}	
}

Export the jar, specifying WordCount (the class that contains the main method) as the main class, then run it: hadoop jar wordcount.jar /word.txt /wordout
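
Assuming the job finishes successfully, the result can be read back from HDFS (part-r-00000 is the default name of the single reducer's output file):

hadoop fs -cat /wordout/part-r-00000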

The output is as follows:

hello	5
jerry	1
kitty	1
tom	2
word	1

Implementing WordCount with Hive

1. Create a table to hold the data from word.txt

create table docs (line string); 

2. Load the data from the local file system into the docs table

load data local inpath '/root/word.txt' overwrite into table docs;
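
As a quick sanity check (a sketch; hive -e runs a single statement from the command line), the loaded lines can be listed:

hive -e "select * from docs limit 5;"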

3. Run the computation: the Hive query below performs the map and reduce work and stores the result in a new table

create table word_counts as select word, count(1) as count from (select explode(split(line, ' ')) as word from docs) w group by word order by word;
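
The final counts can then be queried directly from the result table:

hive -e "select * from word_counts;"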

The resulting word_counts table looks like this:

hello	5
jerry	1
kitty	1
tom	2
word	1

Summary:

Both approaches produce the same result. The jar built with the Java API runs faster because it executes a single MapReduce job, whereas the Hive SQL version of WordCount runs two MapReduce jobs (one for the group by aggregation and one for the order by sort), so it is somewhat less efficient. For developers, however, Hive compiles the SQL into MapReduce for you, which greatly reduces the time spent hand-writing MapReduce code and shortens the development cycle.
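
To check the number of MapReduce stages yourself, Hive's explain statement prints the stage plan for a query; for example:

hive -e "explain select word, count(1) as count from (select explode(split(line, ' ')) as word from docs) w group by word order by word;"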

