字段 | 备注 | 详细描述 |
---|---|---|
video id | 视频唯一id | 11位字符串 |
uploader | 视频上传者 | 上传视频的用户名String |
age | 视频年龄 | 视频在平台上的整数天 |
category | 视频类别 | 上传视频指定的视频分类 |
length | 视频长度 | 整形数字标识的视频长度 |
views | 观看次数 | 视频被浏览的次数 |
rate | 视频评分 | 满分5分 |
ratings | 流量 | 视频的流量,整型数字 |
conments | 评论数 | 一个视频的整数评论数 |
related ids | 相关视频id | 相关视频的id,最多20个 |
字段 | 备注 | 详细描述 |
---|---|---|
uploader | 上传者用户名 | string |
videos | 上传视频数 | int |
friends | 朋友数量 | int |
视频可以有多个所属分类,每个所属分类用&符号分割,且分割的两边有空格字符,同时相关视频也是可以有多个元素,多个相关视频又用“\t”进行分割。为了分析数据时方便对存在多个子元素的数据进行操作,我们首先进行数据重组清洗操作。即:将所有的类别用“&”分割,同时去掉两边空格,多个相关视频id也使用“&”进行分割。
public class ETLUtil {
public static String oriString2ETLString(String ori){
StringBuilder etlString = new StringBuilder();
String[] splits = ori.split("\t");
if(splits.length < 9) return null;
splits[3] = splits[3].replace(" ", "");
for(int i = 0; i < splits.length; i++){
if(i < 9){
if(i == splits.length - 1){
etlString.append(splits[i]);
}else{
etlString.append(splits[i] + "\t");
}
}else{
if(i == splits.length - 1){
etlString.append(splits[i]);
}else{
etlString.append(splits[i] + "&");
}
}
}
return etlString.toString();
}
}
public class VideoETLMapper extends Mapper<Object, Text, NullWritable, Text>{
Text text = new Text();
@Override
protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
String etlString = ETLUtil.oriString2ETLString(value.toString());
if(StringUtils.isBlank(etlString)) return;
text.set(etlString);
context.write(NullWritable.get(), text);
}
}
public class VideoETLRunner implements Tool {
private Configuration conf = null;
@Override
public void setConf(Configuration conf) {
this.conf = conf;
}
@Override
public Configuration getConf() {
return this.conf;
}
@Override
public int run(String[] args) throws Exception {
conf = this.getConf();
conf.set("inpath", args[0]);
conf.set("outpath", args[1]);
Job job = Job.getInstance(conf);
job.setJarByClass(VideoETLRunner.class);
job.setMapperClass(VideoETLMapper.class);
job.setMapOutputKeyClass(NullWritable.class);
job.setMapOutputValueClass(Text.class);
job.setNumReduceTasks(0);
this.initJobInputPath(job);
this.initJobOutputPath(job);
return job.waitForCompletion(true) ? 0 : 1;
}
private void initJobOutputPath(Job job) throws IOException {
Configuration conf = job.getConfiguration();
String outPathString = conf.get("outpath");
FileSystem fs = FileSystem.get(conf);
Path outPath = new Path(outPathString);
if(fs.exists(outPath)){
fs.delete(outPath, true);
}
FileOutputFormat.setOutputPath(job, outPath);
}
private void initJobInputPath(Job job) throws IOException {
Configuration conf = job.getConfiguration();
String inPathString = conf.get("inpath");
FileSystem fs = FileSystem.get(conf);
Path inPath = new Path(inPathString);
if(fs.exists(inPath)){
FileInputFormat.addInputPath(job, inPath);
}else{
throw new RuntimeException("HDFS中该文件目录不存在:" + inPathString);
}
}
public static void main(String[] args) {
try {
int resultCode = ToolRunner.run(new VideoETLRunner(), args);
if(resultCode == 0){
System.out.println("Success!");
}else{
System.out.println("Fail!");
}
System.exit(resultCode);
} catch (Exception e) {
e.printStackTrace();
System.exit(1);
}
}
}
$ bin/yarn jar ~/softwares/jars/gulivideo-0.0.1-SNAPSHOT.jar \
com.atguigu.etl.ETLVideosRunner \
/gulivideo/video/2008/0222 \
/gulivideo/output/video/2008/0222
创建表:gulivideo_ori,gulivideo_user_ori,
创建表:gulivideo_orc,gulivideo_user_orc
gulivideo_ori:
create table gulivideo_ori(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited
fields terminated by "\t"
collection items terminated by "&"
stored as textfile;
gulivideo_user_ori:
create table gulivideo_user_ori(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as textfile;
然后把原始数据插入到orc表中
gulivideo_orc:
create table gulivideo_orc(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
clustered by (uploader) into 8 buckets
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as orc;
gulivideo_user_orc:
create table gulivideo_user_orc(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as orc;
gulivideo_ori:
load data inpath "/gulivideo/output/video/2008/0222" into table gulivideo_ori;
gulivideo_user_ori:
load data inpath "/gulivideo/user/2008/0903" into table gulivideo_user_ori;
gulivideo_orc:
insert into table gulivideo_orc select * from gulivideo_ori;
gulivideo_user_ori:
insert into table gulivideo_user_orc select * from gulivideo_user_ori;
1. 统计视频观看数Top10
思路:使用order by按照views字段做一个全局排序即可,同时我们设置只显示前10条。
最终代码:
select
videoId,
uploader,
age,
category,
length,
views,
rate,
ratings,
comments
from
gulivideo_orc
order by
views
desc limit
10;
2. 统计视频类别热度Top10
思路:
select
category_name as category,
count(t1.videoId) as hot
from (
select
videoId,
category_name
from
gulivideo_orc lateral view explode(category) t_catetory as category_name) t1
group by
t1.category_name
order by
hot
desc limit
10;
3. 统计出视频观看数最高的20个视频的所属类别以及类别包含Top20视频的个数
思路:
select
category_name as category,
count(t2.videoId) as hot_with_views
from (
select
videoId,
category_name
from (
select
*
from
gulivideo_orc
order by
views
desc limit
20) t1 lateral view explode(category) t_catetory as category_name) t2
group by
category_name
order by
hot_with_views
desc;
4. 统计视频观看数Top50所关联视频的所属类别Rank
思路:
select
*
from
gulivideo_orc
order by
views
desc limit
50;
select
explode(relatedId) as videoId
from
t1;
(select
distinct(t2.videoId),
t3.category
from
t2
inner join
gulivideo_orc t3 on t2.videoId = t3.videoId) t4 lateral view explode(category) t_catetory as category_name;
select
category_name as category,
count(t5.videoId) as hot
from (
select
videoId,
category_name
from (
select
distinct(t2.videoId),
t3.category
from (
select
explode(relatedId) as videoId
from (
select
*
from
gulivideo_orc
order by
views
desc limit
50) t1) t2
inner join
gulivideo_orc t3 on t2.videoId = t3.videoId) t4 lateral view explode(category) t_catetory as category_name) t5
group by
category_name
order by
hot
desc;
5. 统计每个类别中的视频热度Top10,以Music为例
思路:
创建表类别表:
create table gulivideo_category(
videoId string,
uploader string,
age int,
categoryId string,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited
fields terminated by "\t"
collection items terminated by "&"
stored as orc;
向类别表中插入数据:
insert into table gulivideo_category
select
videoId,
uploader,
age,
categoryId,
length,
views,
rate,
ratings,
comments,
relatedId
from
gulivideo_orc lateral view explode(category) catetory as categoryId;
统计Music类别的Top10(也可以统计其他)
select
videoId,
views
from
gulivideo_category
where
categoryId = "Music"
order by
views
desc limit
10;
6. 统计每个类别中视频流量Top10,以Music为例
思路:
select
videoId,
views,
ratings
from
gulivideo_category
where
categoryId = "Music"
order by
ratings
desc limit
10;
7. 统计上传视频最多的用户Top10以及他们上传的观看次数在前20的视频
思路:
select
*
from
gulivideo_user_orc
order by
videos
desc limit
10;
select
t2.videoId,
t2.views,
t2.ratings,
t1.videos,
t1.friends
from (
select
*
from
gulivideo_user_orc
order by
videos desc
limit
10) t1
join
gulivideo_orc t2
on
t1.uploader = t2.uploader
order by
views desc
limit
20;
8. 统计每个类别视频观看数Top10
思路:
select
t1.*
from (
select
videoId,
categoryId,
views,
row_number() over(partition by categoryId order by views desc) rank from gulivideo_category) t1
where
rank <= 10;