字段 | 备注 | 详细描述 |
---|---|---|
video id | 视频唯一 id | 11 位字符串 |
uploader | 视频上传者 | 上传视频的用户名 String |
age | 视频年龄 | 视频在平台上的整数天 |
category | 视频类别 | 上传视频指定的视频分类 |
length | 视频长度 | 整型数字标识的视频长度 |
views | 观看次数 | 视频被浏览的次数 |
rate | 视频评分 | 满分 5 分 |
ratings | 评分次数 | 视频被评分的次数,整型数字 |
comments | 评论数 | 一个视频的整数评论数 |
related ids | 相关视频 id | 相关视频的 id,最多 20 个 |
原始数据格式如下:
SDNkMu8ZT68 w00dy911 630 People & Blogs 186 10181 3.49 494 257 rjnbgpPJUks
PkGUU_ggO3k theresident 704 Entertainment 262 11235 3.85 247 280 PkGUU_ggO3k EYC5bWF0ss8 EUPHdnE83GY JO1LTIFOkTw gVSzbvFnVRY l9NJ04JiZj4 ay3gcr84YeQ AfBxANiGnnU RyWz8hwGbY4 BeJ7tGRgiW4 fbq2-jd5Dto j8fTx5E5rik qGkCtXLN1W0 mh_MGyx9tgc
RX24KLBhwMI lemonette 697 People & Blogs 512 24149 4.22 315 474 t60tW0WevkE WZgoejVDZlo Xa_op4MhSkg MwynZ8qTwXA sfG2rtAkAcg j72VLPwzd_c 24Qfs69Al3U EGWutOjVx4M KVkseZR5coU R6OaRcsfnY4 dGM3k_4cNhE ai-cSq6APLQ 73M0y-iD9WE 3uKOSjE79YA 9BBu5N0iFBg 7f9zwx52xgA ncEV0tSC7xM H-J8Kbx9o68 s8xf4QX1UvA 2cKd9ERh5-8
字段 | 备注 | 详细描述 |
---|---|---|
uploader | 上传者用户名 | string |
videos | 上传视频数 | int |
friends | 朋友数量 | int |
原始数据格式如下:
barelypolitical 151 5106
bonk65 89 144
camelcars 26 674
cubskickass34 13 126
通过观察视频表原始数据形式,需要对其进行数据清洗,要求如下:
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class ETLMapper extends Mapper<LongWritable, Text, NullWritable, Text>{
	// Reused output holder so we do not allocate a new Text per record.
	Text outValue = new Text();

	/**
	 * Cleans one raw input line through ETLUtil and emits the cleaned
	 * record with a null key. Lines rejected by ETLUtil (null result,
	 * i.e. fewer than 9 fields) are silently dropped.
	 */
	@Override
	protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, NullWritable, Text>.Context context)
			throws IOException, InterruptedException {
		String cleaned = ETLUtil.getString(value.toString());
		if (cleaned == null) {
			return; // record filtered out by the ETL rules
		}
		outValue.set(cleaned);
		context.write(NullWritable.get(), outValue);
	}
}
public class ETLUtil {
	/**
	 * Normalizes one raw video record.
	 *
	 * Input is a tab-separated line: 9 fixed fields followed by zero or
	 * more related-video ids. Cleaning rules:
	 *  - records with fewer than 9 fields are invalid: returns null;
	 *  - spaces inside the category field (index 3, e.g. "People & Blogs")
	 *    are removed;
	 *  - the first 9 fields stay tab-separated, while the related ids
	 *    (index 9 onward) are joined with '&'.
	 *
	 * @param value one raw input line
	 * @return the cleaned line, or null when the record is invalid
	 */
	public static String getString(String value) {
		String[] fields = value.split("\t");
		// Filter out malformed records with fewer than 9 fields.
		if (fields.length < 9) {
			return null;
		}
		// Strip blanks from the category field.
		fields[3] = fields[3].replaceAll(" ", "");
		StringBuilder out = new StringBuilder(fields[0]);
		for (int i = 1; i < fields.length; i++) {
			// Separator BEFORE field i: tab up to and including the
			// boundary between field 8 and field 9, '&' between the
			// related ids afterwards (same split point as i < 9 on the
			// preceding index).
			out.append(i <= 9 ? "\t" : "&").append(fields[i]);
		}
		return out.toString();
	}
}
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class ETLDriver {
	/**
	 * Driver for the map-only ETL job: reads raw video records from
	 * e:/input, cleans them with ETLMapper/ETLUtil, writes the cleaned
	 * lines to e:/output. Exits 0 on success, 1 on failure.
	 */
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);
		job.setJarByClass(ETLDriver.class);
		job.setMapperClass(ETLMapper.class);
		// Map-only job: explicitly disable the reduce phase. Without this
		// the framework still runs one identity reducer, adding a useless
		// shuffle/sort pass.
		job.setNumReduceTasks(0);
		job.setMapOutputKeyClass(NullWritable.class);
		job.setMapOutputValueClass(Text.class);
		job.setOutputKeyClass(NullWritable.class);
		job.setOutputValueClass(Text.class);
		FileInputFormat.setInputPaths(job, new Path("e:/input"));
		FileOutputFormat.setOutputPath(job, new Path("e:/output"));
		boolean result = job.waitForCompletion(true);
		System.exit(result ? 0 : 1);
	}
}
运行效果如下:
SDNkMu8ZT68 w00dy911 630 People&Blogs 186 10181 3.49 494 257 rjnbgpPJUks
PkGUU_ggO3k theresident 704 Entertainment 262 11235 3.85 247 280 PkGUU_ggO3k&EYC5bWF0ss8&EUPHdnE83GY&JO1LTIFOkTw&gVSzbvFnVRY&l9NJ04JiZj4&ay3gcr84YeQ&AfBxANiGnnU&RyWz8hwGbY4&BeJ7tGRgiW4&fbq2-jd5Dto&j8fTx5E5rik&qGkCtXLN1W0&mh_MGyx9tgc&bgn6RYut2lE&HS6Nqxh4uf4&m9Gq44o5pcA&K7unV366Qr4&shU2hfHKmU0&p0lq5-8IDqY
RX24KLBhwMI lemonette 697 People&Blogs 512 24149 4.22 315 474 t60tW0WevkE&WZgoejVDZlo&Xa_op4MhSkg&MwynZ8qTwXA&sfG2rtAkAcg&j72VLPwzd_c&24Qfs69Al3U&EGWutOjVx4M&KVkseZR5coU&R6OaRcsfnY4&dGM3k_4cNhE&ai-cSq6APLQ&73M0y-iD9WE&3uKOSjE79YA&9BBu5N0iFBg&7f9zwx52xgA&ncEV0tSC7xM&H-J8Kbx9o68&s8xf4QX1UvA&2cKd9ERh5-8
中间视频表,textfile格式:
-- Staging table for the cleaned video records (plain textfile):
-- tab-separated fields, multi-valued fields joined with '&' by the ETL job.
CREATE TABLE video_ori (
    videoId   string,
    uploader  string,
    age       int,
    category  array<string>,
    length    int,
    views     int,
    rate      float,
    ratings   int,
    comments  int,
    relatedId array<string>
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
COLLECTION ITEMS TERMINATED BY "&"
STORED AS textfile;
中间用户表,textfile格式:
-- Staging table for the raw user records (plain textfile, tab-separated).
CREATE TABLE video_user_ori (
    uploader string,
    videos   int,
    friends  int
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
STORED AS textfile;
最终视频表,orc格式:
-- Final video table in ORC format.
-- NOTE: the original DDL carried "row format delimited ... terminated by"
-- clauses; those are meaningless with STORED AS ORC (ORC uses its own
-- SerDe and ignores text delimiters), so they are dropped here to avoid
-- misleading readers.
create table video(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
stored as orc;
最终用户表,orc格式:
-- Final user table in ORC format.
-- NOTE: the original DDL carried "row format delimited fields terminated by"
-- which is a no-op with STORED AS ORC; dropped for clarity.
create table video_user(
uploader string,
videos int,
friends int)
stored as orc;
导入数据到中间表
-- Load the cleaned local files into the textfile staging tables.
-- (ORC tables cannot be loaded directly from plain text files, hence
-- the staging step.)
load data local inpath "/opt/data.txt" into table video_ori;
load data local inpath "/opt/user.txt" into table video_user_ori;
导入数据到最终表
-- Populate the final ORC tables from their staging tables.
-- BUG FIX: video must be filled from video_ori (the video staging table),
-- not from video_user — the user table's 3-column schema does not match
-- video's 10 columns.
insert into table video select * from video_ori;
insert into table video_user select * from video_user_ori;
-- Top 10 most-viewed videos (all columns).
SELECT videoId,
       uploader,
       age,
       category,
       length,
       views,
       rate,
       ratings,
       comments,
       relatedid
FROM video
ORDER BY views DESC
LIMIT 10;
需要注意的是视频类别可能有多个,需要用explode()函数。
-- Top 10 categories by video count; explode the category array so a
-- multi-category video is counted once per category.
SELECT category_name,
       count(*) AS category_count
FROM video
LATERAL VIEW explode(category) t1 AS category_name
GROUP BY category_name
ORDER BY category_count DESC
LIMIT 10;
运行结果如下:
category_name category_count
Music 179049
Entertainment 127674
Comedy 87818
Animation 73293
Film 73293
Sports 67329
Games 59817
Gadgets 59817
People 48890
Blogs 48890
首先获取观看次数前20的数据
-- Step 1: id, categories and view count of the 20 most-viewed videos.
SELECT videoId,
       category,
       views
FROM video
ORDER BY views DESC
LIMIT 20;
获取观看次数前20视频的类别并统计每个类别包含Top20视频的个数
-- Step 2: explode the categories of the Top-20 videos and, per category,
-- count how many Top-20 videos it contains.
-- NOTE(review): in a Hive window spec, "distribute by" / "sort by" act as
-- synonyms for "partition by" / "order by".
SELECT
videoId,views,
category_name,
count(*) over ( distribute BY category_name ) Top20_count
FROM
-- Inner query: the 20 most-viewed videos.
( SELECT
videoId,category,views
FROM
video
ORDER BY
views DESC
LIMIT 20 ) t1
lateral VIEW explode ( category ) t2 AS category_name
ORDER BY
views DESC;
运行结果如下:
videoid views category_name top20_count
dMH0bHeiRNg 42513417 Comedy 6
0XxI-hvPRRA 20282464 Comedy 6
1dmVU08zVpA 16087899 Entertainment 6
RB-wUgnyGv0 15712924 Entertainment 6
QjA5faZF1A8 15256922 Music 5
-_CSo1gOd48 13199833 Blogs 2
-_CSo1gOd48 13199833 People 2
49IDp76kjPw 11970018 Comedy 6
tYnn51C3X_w 11823701 Music 5
pv5zWaTEVkI 11672017 Music 5
D2kJZOfq7zk 11184051 People 2
D2kJZOfq7zk 11184051 Blogs 2
vr3x_RRJdd4 10786529 Entertainment 6
lsO6D1rwrKc 10334975 Entertainment 6
5P6UU6m3cqk 10107491 Comedy 6
8bbTtPL1jRs 9579911 Music 5
_BuRwH59oAo 9566609 Comedy 6
aRNzWyD7C9o 8825788 UNA 1
UMf40daefsI 7533070 Music 5
ixsZy2425eY 7456875 Entertainment 6
MNxwAU_xAMk 7066676 Comedy 6
RUCZJVJ_M8o 6952767 Entertainment 6
获取前5的视频信息
-- Step 1: view count and related-video ids of the 5 most-viewed videos.
SELECT views,
       relatedid
FROM video
ORDER BY views DESC
LIMIT 5;
获取前5视频相关视频id
-- Step 2: the distinct related-video ids of the Top-5 videos
-- (GROUP BY deduplicates ids shared by several videos).
SELECT related_id
FROM (
    SELECT views,
           relatedid
    FROM video
    ORDER BY views DESC
    LIMIT 5
) t1
LATERAL VIEW explode(relatedid) t2 AS related_id
GROUP BY related_id;
获取关联视频类别
-- Step 3: categories of the related videos, obtained by joining the
-- related ids back onto the video table.
SELECT video.category
FROM (
    SELECT related_id
    FROM (
        SELECT views,
               relatedid
        FROM video
        ORDER BY views DESC
        LIMIT 5
    ) t1
    LATERAL VIEW explode(relatedid) t2 AS related_id
    GROUP BY related_id
) t3
INNER JOIN video ON t3.related_id = video.videoId;
explode获取关联视频类别并统计每个类别的总数
-- Step 4: explode the related videos' category arrays and count how many
-- related videos fall into each category.
SELECT
category_name,
count( * ) category_count
FROM
-- t4: one category-array row per related video of the Top-5.
( SELECT
video.category typename
FROM
-- t3: deduplicated related-video ids of the Top-5 videos.
( SELECT
related_id
FROM
-- t1: the 5 most-viewed videos with their related ids.
( SELECT
views,relatedid
FROM
video
ORDER BY
views DESC
LIMIT 5 ) t1
lateral VIEW explode ( relatedid ) t2 AS related_id
GROUP BY
related_id ) t3
INNER JOIN video
ON t3.related_id = video.videoId ) t4
lateral VIEW explode ( typename ) t5 AS category_name
GROUP BY
category_name;
对结果按类别总数排名
-- Step 5: rank the categories of the Top-5 videos' related videos by how
-- many related videos each category contains (row_number gives a gapless,
-- tie-broken rank).
SELECT
category_name,
category_count,
row_number () over ( ORDER BY category_count DESC ) category_rank
FROM
-- t6: per-category count of related videos (see previous step).
( SELECT
category_name,
count( * ) category_count
FROM
( SELECT
video.category typename
FROM
( SELECT
related_id
FROM
( SELECT
views,relatedid
FROM
video
ORDER BY
views DESC
LIMIT 5 ) t1
lateral VIEW explode ( relatedid ) t2 AS related_id
GROUP BY
related_id ) t3
INNER JOIN video ON
t3.related_id = video.videoId ) t4
lateral VIEW explode ( typename ) t5 AS category_name
GROUP BY
category_name ) t6;
运行结果如下:
category_name category_count category_rank
Entertainment 36 1
Comedy 31 2
Music 22 3
Gadgets 3 4
Games 3 5
Animation 2 6
Howto 2 7
Film 2 8
DIY 2 9
People 1 10
Blogs 1 11
Autos 1 12
Vehicles 1 13
这道题有歧义,有两种理解:
获取上传视频最多的前10个用户
-- Interpretation 1, step 1: the 10 users with the most uploaded videos.
SELECT uploader,
       videos
FROM video_user
ORDER BY videos DESC
LIMIT 10;
与原表关联获取用户的视频并排名
-- Step 2: join the Top-10 uploaders back to their videos and rank each
-- uploader's videos by views (Hive treats distribute by / sort by in a
-- window spec as partition by / order by).
SELECT videoId,
       t1.uploader,
       videos,
       views,
       row_number() OVER (DISTRIBUTE BY t1.uploader SORT BY views DESC) AS ranknum
FROM (
    SELECT uploader,
           videos
    FROM video_user
    ORDER BY videos DESC
    LIMIT 10
) t1
INNER JOIN video ON t1.uploader = video.uploader;
过滤每个用户的视频
-- Step 3: keep only each Top-10 uploader's 20 most-viewed videos
-- (filter on the per-uploader rank computed in the inner query).
SELECT
videoId,up,videos,
views,ranknum
FROM
-- t2: every video of the Top-10 uploaders, ranked per uploader by views.
( SELECT
videoId,t1.uploader up,
videos,views,
row_number () over ( distribute BY t1.uploader sort BY views DESC ) ranknum
FROM
-- t1: the 10 users with the most uploaded videos.
( SELECT
uploader,videos
FROM
video_user
ORDER BY
videos DESC
LIMIT 10 ) t1
INNER JOIN video
ON t1.uploader = video.uploader ) t2
WHERE
ranknum <= 20;
获取上传视频最多的前10个用户
-- Interpretation 2, step 1: the 10 users with the most uploaded videos.
SELECT uploader,
       videos
FROM video_user
ORDER BY videos DESC
LIMIT 10;
获取观看次数前20的视频
-- Step 2: the 20 most-viewed videos with a global rank by views.
SELECT videoId,
       uploader,
       views,
       row_number() OVER (ORDER BY views DESC) AS ranknum
FROM (
    SELECT videoId,
           uploader,
           views
    FROM video
    ORDER BY views DESC
    LIMIT 20
) t1;
两表关联统计每个用户在前20排行榜有多少个自己的视频
-- Step 3: join the view-ranked Top-20 videos against the Top-10 uploaders
-- to see which of those uploaders' videos made the Top-20 board.
SELECT
videoId,views,
t2.uploader,
videos,ranknum
FROM
-- t2: the 20 most-viewed videos, globally ranked by views.
( SELECT
videoId,uploader,views,
row_number ( ) over ( ORDER BY views DESC ) ranknum
FROM
( SELECT
videoId,uploader,views
FROM
video
ORDER BY
views DESC
LIMIT 20 ) t1 ) t2
INNER JOIN
-- t3: the 10 users with the most uploaded videos.
( SELECT
uploader,videos
FROM
video_user
ORDER BY
videos DESC
LIMIT 10 ) t3
ON t2.uploader = t3.uploader
ORDER BY
t2.uploader;
口头给的需求尽量不要做,否则可能做了白做,吃亏的是自己。需求必须是书面形式的,必须有证据,可以邮件,微信等留下证据!!!!
-- Top 10 most-viewed videos within every category; rank() is used so
-- view-count ties share the same rank.
SELECT videoId,
       category_name,
       views,
       video_rank
FROM (
    SELECT videoId,
           category_name,
           views,
           rank() OVER (DISTRIBUTE BY category_name SORT BY views DESC) AS video_rank
    FROM video
    LATERAL VIEW explode(category) t1 AS category_name
) t2
WHERE video_rank <= 10;
Fetched: 210 row(s)
写出来的 SQL 可能因 JVM 堆内存溢出而运行失败,需要调大堆内存等相关参数
描述:java.lang.OutOfMemoryError: Java heap space
解决:在 yarn-site.xml 中加入如下代码
<property>
  <name>yarn.scheduler.maximum-allocation-mb</name>
  <value>2048</value>
</property>
<property>
  <name>yarn.scheduler.minimum-allocation-mb</name>
  <value>2048</value>
</property>
<property>
  <name>yarn.nodemanager.vmem-pmem-ratio</name>
  <value>2.1</value>
</property>
(注意:mapred.child.java.opts 属于 MapReduce 配置,应放在 mapred-site.xml 中,而非 yarn-site.xml)
<property>
  <name>mapred.child.java.opts</name>
  <value>-Xmx1024m</value>
</property>