Hive - 谷粒影音 Exercise and Answers

Video table

Field	Remark	Description
videoId	Unique video id	An 11-character string
uploader	Video uploader	Username of the user who uploaded the video, string
age	Video age	Integer number of days the video has been on the platform
category	Video category	Category assigned to the video when it was uploaded
length	Video length	Video length, as an integer
views	View count	Number of times the video has been viewed
rate	Video rating	Rating out of 5
ratings	Number of ratings	Integer count of ratings the video has received
comments	Number of comments	Integer number of comments on the video
relatedId	Related video ids	Ids of related videos, at most 20

    The raw data looks like this:

SDNkMu8ZT68	w00dy911	630	People & Blogs	186	10181	3.49	494	257	rjnbgpPJUks
PkGUU_ggO3k	theresident	704	Entertainment	262	11235	3.85	247	280	PkGUU_ggO3k	EYC5bWF0ss8	EUPHdnE83GY	JO1LTIFOkTw	gVSzbvFnVRY	l9NJ04JiZj4	ay3gcr84YeQ	AfBxANiGnnU	RyWz8hwGbY4	BeJ7tGRgiW4	fbq2-jd5Dto	j8fTx5E5rik	qGkCtXLN1W0	mh_MGyx9tgc
RX24KLBhwMI	lemonette	697	People & Blogs	512	24149	4.22	315	474	t60tW0WevkE	WZgoejVDZlo	Xa_op4MhSkg	MwynZ8qTwXA	sfG2rtAkAcg	j72VLPwzd_c	24Qfs69Al3U	EGWutOjVx4M	KVkseZR5coU	R6OaRcsfnY4	dGM3k_4cNhE	ai-cSq6APLQ	73M0y-iD9WE	3uKOSjE79YA	9BBu5N0iFBg	7f9zwx52xgA	ncEV0tSC7xM	H-J8Kbx9o68	s8xf4QX1UvA	2cKd9ERh5-8

User table

Field	Remark	Description
uploader	Uploader username	string
videos	Number of uploaded videos	int
friends	Number of friends	int

    The raw data looks like this:

barelypolitical	151	5106
bonk65	89	144
camelcars	26	674
cubskickass34	13	126

Data cleaning

    Looking at the raw form of the video data, it needs to be cleaned before it can be loaded. The requirements are as follows:

  1. The video table has 10 fields. The 10th field (related video ids) may be absent, or may contain one or more ids, so any record with fewer than 9 fields is treated as invalid and discarded.
  2. When a video has more than one category, the categories are separated by an & with spaces around it (e.g. "People & Blogs" in the raw data above), so the extra spaces must be removed.
  3. Fields are separated by "\t", and when there are multiple related video ids they are also separated by "\t", so the separators between the related video ids must be replaced with &.

Mapper

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ETLMapper extends Mapper<LongWritable, Text, NullWritable, Text>{
	
	Text v = new Text();
	
	@Override
	protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, NullWritable, Text>.Context context)
			throws IOException, InterruptedException {
		String string = ETLUtil.getString(value.toString());
		if (string != null) {   // only write out records that passed cleaning
			v.set(string);
			context.write(NullWritable.get(), v);
		}
	}
}

ETLUtil

public class ETLUtil {
	public static String getString(String value) {
		StringBuilder builder = new StringBuilder();
		// split the line into tab-separated fields
		String[] split = value.split("\t");
		// drop records with fewer than 9 fields
		if (split.length >= 9) {
			// remove the spaces inside the category field
			split[3] = split[3].replaceAll(" ", "");
			// change the separator between related video ids from \t to &
			for (int i = 0; i < split.length; i++) {
				builder.append(split[i]);
				if (i != split.length -1) {
					if (i < 9) {
						builder.append("\t");
					}else {
						builder.append("&");
					}
				}
			}
			return builder.toString();
		}
		return null;
	}
}

Driver

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ETLDriver {

	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);
		job.setJarByClass(ETLDriver.class);
		job.setMapperClass(ETLMapper.class);
		// map-only job, no Reducer needed
		job.setNumReduceTasks(0);
		job.setMapOutputKeyClass(NullWritable.class);
		job.setMapOutputValueClass(Text.class);
		job.setOutputKeyClass(NullWritable.class);
		job.setOutputValueClass(Text.class);
		FileInputFormat.setInputPaths(job, new Path("e:/input"));
		FileOutputFormat.setOutputPath(job, new Path("e:/output"));
		boolean result = job.waitForCompletion(true);
		System.exit(result ? 0 :1);
	}
}

    The cleaned output looks like this:

SDNkMu8ZT68	w00dy911	630	People&Blogs	186	10181	3.49	494	257	rjnbgpPJUks
PkGUU_ggO3k	theresident	704	Entertainment	262	11235	3.85	247	280	PkGUU_ggO3k&EYC5bWF0ss8&EUPHdnE83GY&JO1LTIFOkTw&gVSzbvFnVRY&l9NJ04JiZj4&ay3gcr84YeQ&AfBxANiGnnU&RyWz8hwGbY4&BeJ7tGRgiW4&fbq2-jd5Dto&j8fTx5E5rik&qGkCtXLN1W0&mh_MGyx9tgc&bgn6RYut2lE&HS6Nqxh4uf4&m9Gq44o5pcA&K7unV366Qr4&shU2hfHKmU0&p0lq5-8IDqY
RX24KLBhwMI	lemonette	697	People&Blogs	512	24149	4.22	315	474	t60tW0WevkE&WZgoejVDZlo&Xa_op4MhSkg&MwynZ8qTwXA&sfG2rtAkAcg&j72VLPwzd_c&24Qfs69Al3U&EGWutOjVx4M&KVkseZR5coU&R6OaRcsfnY4&dGM3k_4cNhE&ai-cSq6APLQ&73M0y-iD9WE&3uKOSjE79YA&9BBu5N0iFBg&7f9zwx52xgA&ncEV0tSC7xM&H-J8Kbx9o68&s8xf4QX1UvA&2cKd9ERh5-8

Creating the tables

    Staging video table, stored as textfile:

create table video_ori(
	videoId string,
	uploader string,
	age int,
	category array<string>,
	length int,
	views int,
	rate float,
	ratings int,
	comments int,
	relatedId array<string>)
row format delimited
fields terminated by "\t"
collection items terminated by "&"
stored as textfile;

    Staging user table, stored as textfile:

create table video_user_ori(
	uploader string,
	videos int,
	friends int)
row format delimited
fields terminated by "\t"
stored as textfile;

    Final video table, stored as ORC:

create table video(
	videoId string,
	uploader string,
	age int,
	category array<string>,
	length int,
	views int,
	rate float,
	ratings int,
	comments int,
	relatedId array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as orc;

    Final user table, stored as ORC:

create table video_user(
	uploader string,
	videos int,
	friends int)
row format delimited
fields terminated by "\t"
stored as orc;

    Load the data into the staging tables:

load data local inpath "/opt/data.txt" into table video_ori;
load data local inpath "/opt/user.txt" into table video_user_ori;

    Insert the data into the final tables:

insert into table video select * from video_ori;
insert into table video_user select * from video_user_ori;
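
    Optionally, compare row counts against the staging tables to confirm the inserts went through (a sketch):

select count(*) from video;
select count(*) from video_user;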

Answering the requirements

1. Top 10 videos by view count

SELECT
	videoId,uploader,
	age,category,
	length,views,
	rate,ratings,
	comments,relatedid 
FROM
	video 
ORDER BY
	views DESC 
LIMIT 10;

2. Top 10 video categories by popularity

    Note that a video may have more than one category, so the explode() function is needed to turn the category array into rows.
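
    As a quick illustration of what lateral view explode does to the category array (a minimal sketch; t1 and category_name are arbitrary aliases):

SELECT
	videoId,
	category_name 
FROM
	video
	lateral VIEW explode ( category ) t1 AS category_name 
LIMIT 5;

    The full query then groups and orders by the exploded category name: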

SELECT
	category_name,
	count(*) category_count 
FROM
	video
	lateral VIEW explode ( category ) t1 AS category_name 
GROUP BY
	category_name 
ORDER BY
	category_count DESC 
LIMIT 10;

    The results are as follows:

category_name	category_count
Music			179049
Entertainment	127674
Comedy			87818
Animation		73293
Film			73293
Sports			67329
Games			59817
Gadgets			59817
People			48890
Blogs			48890

3. Categories of the Top 20 most-viewed videos, and how many of the Top 20 videos each category contains

    First, get the 20 most-viewed videos:

SELECT
	videoId,category,views 
FROM
	video 
ORDER BY
	views DESC 
LIMIT 20;

    Then explode their categories and count how many of the Top 20 videos fall into each category:

SELECT
	videoId,views,
	category_name,
	count(*) over ( distribute BY category_name ) Top20_count 
FROM
	( SELECT
		videoId,category,views 
	  FROM
		video 
	  ORDER BY
		views DESC 
	  LIMIT 20 ) t1
	lateral VIEW explode ( category ) t2 AS category_name 
ORDER BY
	views DESC;

    The results are as follows:

videoid			views		category_name	top20_count
dMH0bHeiRNg		42513417	Comedy			6
0XxI-hvPRRA		20282464	Comedy			6
1dmVU08zVpA		16087899	Entertainment	6
RB-wUgnyGv0		15712924	Entertainment	6
QjA5faZF1A8		15256922	Music			5
-_CSo1gOd48		13199833	Blogs			2
-_CSo1gOd48		13199833	People			2
49IDp76kjPw		11970018	Comedy			6
tYnn51C3X_w		11823701	Music			5
pv5zWaTEVkI		11672017	Music			5
D2kJZOfq7zk		11184051	People			2
D2kJZOfq7zk		11184051	Blogs			2
vr3x_RRJdd4		10786529	Entertainment	6
lsO6D1rwrKc		10334975	Entertainment	6
5P6UU6m3cqk		10107491	Comedy			6
8bbTtPL1jRs		9579911		Music			5
_BuRwH59oAo		9566609		Comedy			6
aRNzWyD7C9o		8825788		UNA				1
UMf40daefsI		7533070		Music			5
ixsZy2425eY		7456875		Entertainment	6
MNxwAU_xAMk		7066676		Comedy			6
RUCZJVJ_M8o		6952767		Entertainment	6
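
    If only the per-category counts are needed, without listing each of the Top 20 videos, the window function can be replaced by a plain GROUP BY; a sketch that should produce the same counts:

SELECT
	category_name,
	count(*) video_count 
FROM
	( SELECT
		videoId,category,views 
	  FROM
		video 
	  ORDER BY
		views DESC 
	  LIMIT 20 ) t1
	lateral VIEW explode ( category ) t2 AS category_name 
GROUP BY
	category_name;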

4. For the related videos of the Top 5 most-viewed videos, count the videos in each category and rank the categories

    Get the Top 5 videos:

SELECT
	views,relatedid 
FROM
	video 
ORDER BY
	views DESC 
LIMIT 5;

    Get the related video ids of the Top 5 videos:

SELECT
	related_id 
FROM
	( SELECT
		views,relatedid 
	  FROM
		video 
	  ORDER BY
		views DESC 
	  LIMIT 5 ) t1
	lateral VIEW explode ( relatedid ) t2 AS related_id 
GROUP BY
	related_id;

    Get the categories of the related videos:

SELECT
	video.category 
FROM
	( SELECT
		related_id 
	  FROM
		( SELECT
			views,relatedid 
		  FROM
			video 
		  ORDER BY
			views DESC 
		  LIMIT 5 ) t1
		lateral VIEW explode ( relatedid ) t2 AS related_id 
	  GROUP BY
		related_id ) t3
INNER JOIN video
ON t3.related_id = video.videoId;

    Explode the related videos' categories and count the total for each category:

SELECT
	category_name,
	count( * ) category_count 
FROM
	( SELECT
		video.category typename 
	  FROM
		( SELECT
			related_id 
		  FROM
			( SELECT
				views,relatedid 
			  FROM
				video 
			  ORDER BY
				views DESC 
			  LIMIT 5 ) t1
			lateral VIEW explode ( relatedid ) t2 AS related_id 
		  GROUP BY
			related_id ) t3
	  INNER JOIN video
	  ON t3.related_id = video.videoId ) t4
lateral VIEW explode ( typename ) t5 AS category_name 
GROUP BY
	category_name;

    Rank the results by category count:

SELECT
	category_name,
	category_count,
	row_number () over ( ORDER BY category_count DESC ) category_rank 
FROM
	( SELECT
		category_name,
		count( * ) category_count 
	  FROM
		( SELECT
			video.category typename 
		  FROM
			( SELECT
				related_id 
			  FROM
				( SELECT
					views,relatedid 
				  FROM
					video 
				  ORDER BY
					views DESC 
				  LIMIT 5 ) t1
				lateral VIEW explode ( relatedid ) t2 AS related_id 
				GROUP BY
				related_id ) t3
		  INNER JOIN video ON
		  t3.related_id = video.videoId ) t4
		  lateral VIEW explode ( typename ) t5 AS category_name 
	GROUP BY
	category_name ) t6;
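
    The same logic can be written more readably with CTEs (a sketch, assuming Hive 0.13+ for the WITH clause; it mirrors the nested query above step by step):

WITH top5 AS (
	SELECT views, relatedid
	FROM video
	ORDER BY views DESC
	LIMIT 5
),
related AS (
	SELECT related_id
	FROM top5
	lateral VIEW explode ( relatedid ) t1 AS related_id
	GROUP BY related_id
),
related_category AS (
	SELECT video.category typename
	FROM related
	INNER JOIN video ON related.related_id = video.videoId
),
category_counts AS (
	SELECT category_name, count(*) category_count
	FROM related_category
	lateral VIEW explode ( typename ) t2 AS category_name
	GROUP BY category_name
)
SELECT
	category_name,
	category_count,
	row_number() over ( ORDER BY category_count DESC ) category_rank
FROM category_counts;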

    The results are as follows:

category_name	category_count	category_rank
Entertainment	36				1
Comedy			31				2
Music			22				3
Gadgets			3				4
Games			3				5
Animation		2				6
Howto			2				7
Film			2				8
DIY				2				9
People			1				10
Blogs			1				11
Autos			1				12
Vehicles		1				13

5. Top 10 users by number of uploaded videos, and their uploaded videos ranking in the top 20 by view count

    This requirement is ambiguous and can be read in two ways:

  1. Find the 10 users who uploaded the most videos, and for each of them list the videos whose view counts rank in the top 20 within that user's own uploads.
  2. Find the 10 users who uploaded the most videos, and count how many of their uploads appear in the overall Top 20 most-viewed list.

Approach for interpretation 1

    Get the 10 users with the most uploaded videos:

SELECT
	uploader,videos 
FROM
	video_user 
ORDER BY
	videos DESC 
LIMIT 10;

    Join with the video table to get each user's videos and rank them by views within each user:

SELECT
	videoId,t1.uploader,
	videos,views,
	row_number ( ) over ( distribute BY t1.uploader sort BY views DESC ) ranknum 
FROM
	( SELECT
		uploader,videos 
	  FROM
		video_user 
	  ORDER BY
		videos DESC 
	  LIMIT 10 ) t1
INNER JOIN video
ON t1.uploader = video.uploader;

    Keep only the top 20 videos for each user:

SELECT
	videoId,up,videos,
	views,ranknum 
FROM
	( SELECT
		videoId,t1.uploader up,
		videos,views,
		row_number () over ( distribute BY t1.uploader sort BY views DESC ) ranknum 
	  FROM
		( SELECT
			uploader,videos 
		  FROM
			video_user 
		  ORDER BY
			videos DESC 
		  LIMIT 10 ) t1
	  INNER JOIN video
	  ON t1.uploader = video.uploader ) t2 
WHERE
	ranknum <= 20;

Approach for interpretation 2

    Get the 10 users with the most uploaded videos:

SELECT
	uploader,
	videos 
FROM
	video_user 
ORDER BY
	videos DESC 
LIMIT 10;

    Get the 20 most-viewed videos:

SELECT
	videoId,uploader,views,
	row_number ( ) over ( ORDER BY views DESC ) ranknum 
FROM
	( SELECT
		videoId,uploader,views 
	  FROM
		video 
	  ORDER BY
		views DESC 
	  LIMIT 20 ) t1;

    Join the two result sets to see which of each user's videos appear on the Top 20 list:

SELECT
	videoId,views,
	t2.uploader,
	videos,ranknum 
FROM
	( SELECT
		videoId,uploader,views,
		row_number ( ) over ( ORDER BY views DESC ) ranknum 
	  FROM
		( SELECT
			videoId,uploader,views 
		  FROM
			video 
		  ORDER BY
			views DESC 
		  LIMIT 20 ) t1 ) t2
	  INNER JOIN
	  ( SELECT
			uploader,videos 
		FROM
	    	video_user 
		ORDER BY
			videos DESC 
		LIMIT 10 ) t3
	  ON t2.uploader = t3.uploader 
ORDER BY
	t2.uploader;
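
    If only the number of Top 20 videos per uploader is wanted, rather than the individual rows, the join can be aggregated; a sketch:

SELECT
	t3.uploader,
	t3.videos,
	count( t2.videoId ) top20_videos 
FROM
	( SELECT
		videoId,uploader,views 
	  FROM
		video 
	  ORDER BY
		views DESC 
	  LIMIT 20 ) t2
	INNER JOIN
	( SELECT
		uploader,videos 
	  FROM
		video_user 
	  ORDER BY
		videos DESC 
	  LIMIT 10 ) t3
	ON t2.uploader = t3.uploader 
GROUP BY
	t3.uploader,t3.videos;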

    Try not to implement requirements that were only given verbally, otherwise the work may turn out to be wasted and you are the one who loses out. Requirements must be in written form, and there must be evidence: keep a record via email, WeChat, or similar!

6. Top 10 most-viewed videos within each category

SELECT
	videoId,category_name,
	views,video_rank 
FROM
	( SELECT
		videoId,category_name,views,
		rank () over ( distribute BY category_name sort BY views DESC ) video_rank 
	  FROM
		video
		lateral VIEW explode ( category ) t1 AS category_name ) t2 
WHERE
	video_rank <= 10;

Fetched: 210 row(s)
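
    Note that rank() leaves gaps and can return more than 10 rows for a category when view counts tie at the cutoff; if exactly 10 rows per category are wanted, row_number() can be swapped in (a sketch):

SELECT
	videoId,category_name,
	views,video_rank 
FROM
	( SELECT
		videoId,category_name,views,
		row_number() over ( distribute BY category_name sort BY views DESC ) video_rank 
	  FROM
		video
		lateral VIEW explode ( category ) t1 AS category_name ) t2 
WHERE
	video_rank <= 10;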

Troubleshooting

    The SQL above may fail to run because the JVM runs out of heap memory; the heap size needs to be increased.
    Symptom: java.lang.OutOfMemoryError: Java heap space
    Fix: add the following properties to yarn-site.xml


<property>
	<name>yarn.scheduler.maximum-allocation-mb</name>
	<value>2048</value>
</property>
<property>
	<name>yarn.scheduler.minimum-allocation-mb</name>
	<value>2048</value>
</property>
<property>
	<name>yarn.nodemanager.vmem-pmem-ratio</name>
	<value>2.1</value>
</property>
<property>
	<name>mapred.child.java.opts</name>
	<value>-Xmx1024m</value>
</property>
