Data Preparation
Data Structure
There are two tables: a video table and a user table.
Field | Note | Description |
---|---|---|
video id | unique video ID | 11-character string |
uploader | video uploader | username of the user who uploaded the video (string) |
age | video age | number of days the video has been on the platform (integer) |
category | video category | categories assigned to the video when it was uploaded |
length | video length | video length as an integer |
views | view count | number of times the video has been viewed |
rate | video rating | rating out of 5 |
ratings | traffic | video traffic, integer |
comments | comment count | number of comments on the video (integer) |
related ids | related video IDs | IDs of related videos, at most 20 |
ETL of the Raw Data
A sample of the raw data is shown below:
SapphireHearts 3 5
VanillaFresh76 37 16
Phyr0Musique 5 0
TwistedKitchen 1 1
TheDailyClog 19 4
4a4ron 4 7
StrictlyZ 1 0
VIRAJ818 6 42
v2uEfmWO6z8 mimzi8 601 Film & Animation 361 25341 4.63 78 44 AItaLnpbIAw 0X_of0woMBU yJ0s-kwG_Ro qThc1spQuiY WCEH9RZZKdU 8ZgFpRZeCds PQfyKAo0-ls 5peppnUHf9I Dx3ri8EVEvE ofmQOjZYLDI Ndtp2wP31K4 HEo6U-jkD4I 3DG1XyqgX3w b7M2mrnSZpM 5gJ21l8_4II MqySp7Nq5j0 MC--VwYTHAM 4SSZBAPleVc eVdiIbWT60M Da80HD18tp0
Looking at the raw data, we can see that a video can belong to multiple categories, which are separated by "&" with a space on each side, and that a video can also have multiple related videos, which are separated by "\t". To make these multi-valued fields easier to work with during analysis, we first restructure and clean the data: the categories stay separated by "&" but the surrounding spaces are removed, and the related video IDs are likewise joined with "&".
Data Cleaning
ETLUtils
public class ETLUtils {
    /**
     * Cleans one raw record: strips the spaces from the category field and joins
     * the related video IDs with "&", so the cleaned line has at most 10
     * tab-separated columns. Records with fewer than 9 fields are dropped (null).
     */
    public static String translateDate(String ori) {
        String[] strings = ori.split("\t");
        // A valid record has at least the 9 scalar fields; related IDs are optional.
        if (strings.length < 9) {
            return null;
        }
        // Field 3 is the category list, e.g. "Film & Animation" -> "Film&Animation".
        strings[3] = strings[3].replace(" ", "");
        StringBuilder result = new StringBuilder();
        for (int i = 0; i < strings.length; i++) {
            if (i <= 9) {
                // The 9 scalar fields and the first related ID stay tab-separated.
                if (i == 0) {
                    result.append(strings[i]);
                } else {
                    result.append("\t").append(strings[i]);
                }
            } else {
                // Any further related IDs are appended with "&" as the separator.
                result.append("&").append(strings[i]);
            }
        }
        return result.toString();
    }
}
ETLMapper
import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ETLMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    private Text text = new Text();
    private NullWritable nullWritable = NullWritable.get();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Clean the current line; malformed records come back as null and are dropped.
        String translateDate = ETLUtils.translateDate(value.toString());
        if (translateDate != null) {
            text.set(translateDate);
            context.write(text, nullWritable);
        }
    }
}
ETLRunner
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class ETLRunner implements Tool {
    private Configuration configuration;

    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(configuration);
        job.setJarByClass(ETLRunner.class);
        job.setMapperClass(ETLMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // Map-only job: with zero reducers the mapper output is written directly to HDFS.
        job.setNumReduceTasks(0);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean success = job.waitForCompletion(true);
        return success ? 0 : 1;
    }

    @Override
    public void setConf(Configuration conf) {
        configuration = conf;
    }

    @Override
    public Configuration getConf() {
        return configuration;
    }

    public static void main(String[] args) throws Exception {
        int run = ToolRunner.run(new ETLRunner(), args);
        System.out.println("run = " + run);
    }
}
Upload the files to HDFS
hadoop fs -put user.txt /
hadoop fs -put video/ /
Run the data cleaning job
hadoop jar etl.jar com.zj.etl.ETLRunner /video /video/output
Create the tables
create table video_ori(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited
fields terminated by "\t"
collection items terminated by "&"
stored as textfile;
create table video_user_ori(
uploader string,
videos int,
friends int)
clustered by (uploader) into 24 buckets
row format delimited
fields terminated by "\t"
stored as textfile;
create table video_orc(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
clustered by (uploader) into 8 buckets
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as orc;
create table video_user_orc(
uploader string,
videos int,
friends int)
clustered by (uploader) into 24 buckets
row format delimited
fields terminated by "\t"
stored as orc;
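video_user_ori, video_orc and video_user_orc are declared as bucketed tables (clustered by uploader). On older Hive releases (an assumption about the environment here; Hive 2.x and later always enforce bucketing), the insert ... select statements in the next step only produce properly bucketed files if enforcement is enabled first:
-- only needed on Hive 1.x; later versions enforce bucketing automatically
set hive.enforce.bucketing = true;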
Load the data
load data inpath "/video/output" into table video_ori;
load data inpath "/user.txt" into table video_user_ori;
insert into table video_orc select * from video_ori;
insert into table video_user_orc select * from video_user_ori;
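As an optional sanity check (a minimal sketch, assuming the tables were created in the current database), the array columns can be inspected after the insert; if the ETL output and the "&" collection delimiter line up, category and relatedId come back as Hive arrays rather than plain strings:
-- for the sample record shown earlier, category should parse to something like ["Film","Animation"],
-- and size(relatedId) should never exceed 20
select videoId, category, size(relatedId) from video_orc limit 5;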
Requirements
- Top 10 videos by view count
- Top 10 video categories by popularity
- Categories of the Top 20 most-viewed videos
- Category rank of the videos related to the Top 50 most-viewed videos
- Top 10 most popular videos in each category
- Top 10 videos by traffic in each category
- Top 10 users by number of uploaded videos, and the videos they uploaded
- Top 10 most-viewed videos in each category
1.
select * from video_orc order by views desc limit 10;
2.
select r.cate,sum(r.views) as sumviews
from
(select views,cate
from video_orc
lateral view explode(category) tableAlias as cate) r
group by r.cate
order by sumviews desc
limit 10;
3.
select cate,count(cate) as num
from (select * from video_orc order by views desc limit 20) r
lateral view explode(r.category) tableAlias as cate
group by cate;
4.
select ca, count(ca) as sumcate, rank() over(order by count(ca) desc) as rk
from(select o.category as cat
from (select id
from (select videoId, views, relatedId
from video_orc
order by views desc
limit 50) t
lateral view explode(t.relatedId) tableAlias as id) r1 join video_orc o on r1.id = o.videoId) r
lateral view explode(r.cat) tableAlias as ca
group by ca;
5.
select t.*
from
(select r.*, ROW_NUMBER() over(distribute by r.cate sort by r.views desc) as rk
from
(
select videoid,uploader,age,cate,length,views,rate,ratings,comments,relatedId
from video_orc
lateral view explode(category) tableAlias as cate) r) t
where t.rk <= 10;
6.
select t.*
from
(select r.*, ROW_NUMBER() over(distribute by r.cate sort by r.ratings desc) as rk
from
(
select videoid,uploader,age,cate,length,views,rate,ratings,comments,relatedId
from video_orc
lateral view explode(category) tableAlias as cate) r) t
where t.rk <= 10;
7.
select t1.*,vo.*
from(
select uploader, videos, friends
from video_user_orc
order by videos desc
limit 10) t1 join video_orc vo on t1.uploader = vo.uploader;
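Note that the inner join above drops any Top-10 uploader who has no rows in video_orc. If those users should still be listed, with NULL video columns, a left join variant can be used; this is an optional sketch, not part of the original solution:
select t1.*, vo.*
from(
select uploader, videos, friends
from video_user_orc
order by videos desc
limit 10) t1 left join video_orc vo on t1.uploader = vo.uploader;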
8.
select t.*
from
(select r.*, ROW_NUMBER() over(distribute by r.cate sort by r.views desc) as rk
from
(
select videoid,uploader,age,cate,length,views,rate,ratings,comments,relatedId
from video_orc
lateral view explode(category) tableAlias as cate) r) t
where t.rk <= 10;