Dataset: https://download.csdn.net/download/kevin__durant/11798895 (the download requires CSDN points; if you need the data, I can send it to you directly).
Data format (sample line below): the first nine tab-separated fields correspond to the columns of the video table, and the remaining fields are the ids of related videos.
LKh7zAJ4nwo TheReceptionist 653 Entertainment 424 13021 4.34 1305 744 DjdA-5oKYFQ NxTDlnOuybo c-8VuICzXtU DH56yrIO5nI W1Uo5DQTtzc E-3zXq_r4w0 1TCeoRPg5dE yAr26YhuYNY 2ZgXx72XmoE -7ClGo-YgZ0 vmdPOOd6cxI KRHfMQqSHpk pIMpORZthYw 1tUDzOp10pk heqocRij5P0 _XIuvoH6rUg LGVU5DsezE0 uO2kj6_D8B4 xiDqywcDQRM uX81lMev6_o
Below are some classic introductory Hive exercises based on the data above.
Compute the usual metrics and various TopN statistics for the 硅谷影音 (Gulivideo) video site:
– Top 10 videos by view count
– Top 10 video categories by popularity
– Categories of the Top 20 videos by view count
– Category ranking of the videos related to the Top 50 videos by view count
– Top 10 hottest videos in each category
– Top 10 videos by traffic in each category
– Top 10 users by number of uploaded videos, and the videos they uploaded
– Top 10 videos by view count in each category
1. Video table
Table 6-13 Video table

Field | Meaning | Description
---|---|---
videoId | unique video id | an 11-character string
uploader | video uploader | username (string) of the user who uploaded the video
age | video age | integer number of days the video has been on the site
category | video categories | the categories assigned to the video when it was uploaded
length | video length | the video length as an integer
views | view count | the number of times the video has been viewed
rate | video rating | rating on a 5-point scale
ratings | traffic | the video's traffic, an integer
comments | comment count | the integer number of comments on the video
relatedId | related video ids | the ids of related videos, at most 20 of them
2. User table
Table 6-14 User table

Field | Meaning | Type
---|---|---
uploader | uploader's username | string
videos | number of uploaded videos | int
friends | number of friends | int
Looking at the raw data, a video can belong to several categories, which are separated by "&" with a space on either side, and it can also have several related videos, which are separated by "\t". To make it easier to work with these multi-valued fields during analysis, we first clean and restructure the data: the categories stay separated by "&" but the surrounding spaces are removed, and the multiple related video ids are also joined with "&".
public class ETLUtil {

    /**
     * Cleans one raw record: keeps the first 9 tab-separated fields,
     * strips the spaces around "&" in the category field, and joins
     * the related video ids with "&".
     */
    public String dataRinse(String str) {
        String[] split = str.split("\t");
        // Drop invalid records that have fewer than 9 fields
        if (split.length < 9) {
            return "";
        }
        // Remove the spaces around "&" in the category field
        split[3] = split[3].replaceAll(" ", "");
        // Rebuild the record: the first 9 fields stay tab-separated,
        // the related video ids (field 10 onwards) are joined by "&"
        StringBuilder stringBuilder = new StringBuilder();
        for (int i = 0; i < split.length; i++) {
            stringBuilder.append(split[i]);
            if (i == split.length - 1) {
                break; // last field: no trailing separator
            }
            stringBuilder.append(i < 9 ? "\t" : "&");
        }
        return stringBuilder.toString();
    }

    public static void main(String[] args) {
        // A record with fewer than 9 fields is filtered out, so this prints an empty line
        String s = new ETLUtil().dataRinse("uFoWXi25RBk");
        System.out.println(s);
    }
}
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;

public class ETLMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    private Text k = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Read one raw line
        String line = value.toString();
        // Clean it
        String s = new ETLUtil().dataRinse(line);
        // Drop records that were filtered out during cleaning instead of writing blank lines
        if (s.isEmpty()) {
            return;
        }
        // Emit the cleaned record as the key, with a null value
        k.set(s);
        context.write(k, NullWritable.get());
    }
}
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class ETLDriver implements Tool {
    private Configuration configuration;

    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(getConf());
        job.setJarByClass(ETLDriver.class);
        job.setMapperClass(ETLMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // Map-only job: the cleaning step needs no reducer
        job.setNumReduceTasks(0);
        // args[0] is the raw input path, args[1] the output path for the cleaned data
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean b = job.waitForCompletion(true);
        return b ? 0 : 1;
    }

    public void setConf(Configuration conf) {
        this.configuration = conf;
    }

    public Configuration getConf() {
        return configuration;
    }

    public static void main(String[] args) throws Exception {
        int run = ToolRunner.run(new ETLDriver(), args);
        System.out.println(run);
    }
}
create table gulivideo_ori(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited
fields terminated by "\t"
collection items terminated by "&"
stored as textfile;
create table gulivideo_user_ori(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as textfile;
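Both tables above are empty right after creation; the queries that follow assume they have already been populated. A minimal loading sketch, assuming the ETL job above has written its cleaned output to an HDFS directory /gulivideo/output/video and the raw user file sits at /gulivideo/user (both paths are placeholders, substitute wherever your files actually live):

load data inpath "/gulivideo/output/video" into table gulivideo_ori;
load data inpath "/gulivideo/user" into table gulivideo_user_ori;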
-- Top 10 videos by view count
select uploader,views
from gulivideo_ori
order by views desc
limit 10;
-- Top 10 video categories by popularity (number of videos per category)
select -- 3. take the top 10
t3.cate,t3.cou_cate
from
(
select -- 2. count the number of videos in each category
t2.cate cate , count(*) cou_cate
from
(
select t1.ca cate -- 1. explode the category array
from gulivideo_ori lateral view explode(category) t1 as ca
)t2
group by t2.cate
)t3
order by t3.cou_cate desc
limit 10;
-- Categories of the Top 20 videos by view count
select -- 3. deduplicate the categories
distinct(cate)
from
(
select -- 2. take the view count, category and video id of the top 20 videos by views
cate,views,videoid
from
(
select t1.ca cate,videoid,views -- 1. explode the category array
from gulivideo_ori lateral view explode(category) t1 as ca
)t2
order by views desc
limit 20
)t3;
-- Category ranking of the videos related to the Top 50 videos by view count
select -- 5. order by the count to produce the rank
*
from
(
select -- 4. explode the category field of the joined table, group by category and count
category_name , count(*) hot
from
(
select -- 3. join back to the original table to get each related video's categories
*
from
(
select -- 2. the related-video field is an array: explode it and deduplicate the ids
distinct(relatedids_name)
from
(
select * -- 1. take the top 50 videos by view count
from gulivideo_ori
order by views desc
limit 50
)t1
lateral view explode(t1.relatedid) relatedids_t as relatedids_name
)t2
join gulivideo_ori t3
on t2.relatedids_name=t3.videoid
)t4
lateral view explode(t4.category) category_t as category_name
group by category_name
)t5
order by t5.hot desc;
The next few tasks all need the category field exploded, so we first explode the categories and load the result into an intermediate table.
create table gulivideo_category(
videoId string,
uploader string,
age int,
categoryId string,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited
fields terminated by "\t"
collection items terminated by "&"
stored as textfile;
insert into table gulivideo_category
select
videoId,
uploader,
age,
categoryId,
length,
views,
rate,
ratings,
comments,
relatedId
from
gulivideo_ori lateral view explode(category) category_t as categoryId;
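As a quick sanity check (only a sketch; the rows returned depend on your data), each video should now appear once per categoryId in the intermediate table:

select videoId, categoryId, views
from gulivideo_category
limit 5;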
-- Top 10 hottest (most viewed) videos in each category
select -- 2. take the top 10 per category
t1.categoryId,
t1.views
from
(
select -- 1. partition by category and rank by view count
categoryId,
views,
row_number() over(partition by categoryId order by views desc) rank
from gulivideo_category
)t1
where rank <= 10;
-- Top 10 videos by traffic (ratings) in each category
select
t1.categoryId,
t1.ratings
from
(
select
categoryId,
ratings,
row_number() over(partition by categoryId order by ratings desc) rank
from gulivideo_category
)t1
where rank <= 10;
-- Top 10 users by number of uploaded videos, and their most-viewed uploads
select
t2.uploader,
t2.views
from
(
select
*
from gulivideo_user_ori
order by videos desc
limit 10
)t1
join
(
select
*
from gulivideo_ori
)t2
on t1.uploader=t2.uploader
order by views desc
limit 20;
-- Top 10 videos by view count in each category (the same query as the per-category heat Top 10 above, since heat is measured by view count)
select
t1.categoryId,
t1.views
from
(
select
categoryId,
views,
row_number() over(partition by categoryId order by views desc) rank
from gulivideo_category
)t1
where rank <= 10;