HIVE项目实战

Hive实战之影音系统

  • 数据(https://download.csdn.net/download/kevin__durant/11798895)这上面还要积分,如果有需要我私发

  • 数据格式↓前九行分别对应表名,最后对应相关视频

  • LKh7zAJ4nwo TheReceptionist 653 Entertainment 424 13021 4.34 1305 744 DjdA-5oKYFQ NxTDlnOuybo c-8VuICzXtU DH56yrIO5nI W1Uo5DQTtzc E-3zXq_r4w0 1TCeoRPg5dE yAr26YhuYNY 2ZgXx72XmoE -7ClGo-YgZ0 vmdPOOd6cxI KRHfMQqSHpk pIMpORZthYw 1tUDzOp10pk heqocRij5P0 _XIuvoH6rUg LGVU5DsezE0 uO2kj6_D8B4 xiDqywcDQRM uX81lMev6_o

  • hive的一些基础经典习题↑

需求描述

统计硅谷影音视频网站的常规指标,各种TopN指标:

–统计视频观看数Top10

–统计视频类别热度Top10

–统计视频观看数Top20所属类别

–统计视频观看数Top50所关联视频的所属类别Rank

–统计每个类别中的视频热度Top10

–统计每个类别中视频流量Top10

–统计上传视频最多的用户Top10以及他们上传的视频

–统计每个类别视频观看数Top10

项目

数据结构

1.视频表

表6-13 视频表

字段 备注 详细描述
video id 视频唯一id 11位字符串
uploader 视频上传者 上传视频的用户名String
age 视频年龄 视频在平台上的整数天
category 视频类别 上传视频指定的视频分类
length 视频长度 整形数字标识的视频长度
views 观看次数 视频被浏览的次数
rate 视频评分 满分5分
ratings 流量 视频的流量,整型数字
conments 评论数 一个视频的整数评论数
related ids 相关视频id 相关视频的id,最多20个

2.用户表

表6-14 用户表

字段 备注 字段类型
uploader 上传者用户名 string
videos 上传视频数 int
friends 朋友数量 int

2 ETL原始数据

​ 通过观察原始数据形式,可以发现,视频可以有多个所属分类,每个所属分类用&符号分割,且分割的两边有空格字符,同时相关视频也是可以有多个元素,多个相关视频又用“\t”进行分割。为了分析数据时方便对存在多个子元素的数据进行操作,我们首先进行数据重组清洗操作。即:将所有的类别用“&”分割,同时去掉两边空格,多个相关视频id也使用“&”进行分割。

  • 原始数据在目录下,此处使用MapReduce清洗数据

数据清洗

清洗工具类

public String dataRinse(String str){
        String[] split = str.split("\t");
        //过滤没有视频的用户
        if (split.length<9){
            return "";
        }
        //将用户中的空格替换掉
        split[3] = split[3].replaceAll(" ","");
        //将后面的视频的数据合并为&分割的数据
        StringBuilder stringBuilder = new StringBuilder();
        for (int i = 0; i < split.length; i++) {
            if (i<9){
                if (i==split.length-1){
                    stringBuilder.append(split[i]);
                }else {
                    stringBuilder.append(split[i]).append("\t");
                }
            }else {
                if (i==split.length-1){
                    stringBuilder.append(split[i]);
                }else {
                    stringBuilder.append(split[i]).append("&");
                }
            }

        }
        return stringBuilder.toString();
    }

    public static void main(String[] args) {
        String s = new ETLUtil().dataRinse("uFoWXi25RBk");
        System.out.println(s);
    }

清洗Mapper端

public class ETLMapper extends Mapper<LongWritable , Text , Text , NullWritable> {
    Text k = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //获得数据
        String line = value.toString();

        //清洗
        String s = new ETLUtil().dataRinse(line);
        k.set(s);

        //输出
        context.write(k , NullWritable.get());
    }
}

清洗驱动Driver

public class ETLDriver implements Tool {
    private Configuration configuration;
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(getConf());

        job.setJarByClass(ETLDriver.class);

        job.setMapperClass(ETLMapper.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        job.setNumReduceTasks(0);

        FileInputFormat.setInputPaths(job,new Path(args[0]));
        FileOutputFormat.setOutputPath(job,new Path(args[1]));

        boolean b = job.waitForCompletion(true);
        return b ? 0 : 1;
    }

    public void setConf(Configuration conf) {
        this.configuration = conf;
    }

    public Configuration getConf() {
        return configuration;
    }

    public static void main(String[] args) throws Exception {
        int run = ToolRunner.run(new ETLDriver(), args);
        System.out.println(run);
    }
}
  • 将vedio执行清洗

hive数据分析

建表

  • 创建表:gulivideo_ori gulivideo_user_ori
create table gulivideo_ori(
    videoId string, 
    uploader string, 
    age int, 
    category array, 
    length int, 
    views int, 
    rate float, 
    ratings int, 
    comments int,
    relatedId array)
row format delimited 
fields terminated by "\t"
collection items terminated by "&"
stored as textfile;
create table gulivideo_user_ori(
    uploader string,
    videos int,
    friends int)
row format delimited 
fields terminated by "\t" 
stored as textfile;

导入数据

  • 将video数据导入前表,将user导入后表直接load data 即可

业务分析

统计视频观看数Top10

  • 直接单表排序分页
select uploader,views
from gulivideo_ori
order by views desc
limit 10;

统计视频类别热度Top10

select	3.取出前十
t3.cate,t3.cou_cate
from
(
select	2.统计没类的热度
t2.cate cate , count(*) cou_cate
from
(
select t1.ca cate	1.将类别炸开
from gulivideo_ori lateral view explode(category) t1 as ca
)t2
group by t2.cate
)t3
order by t3.cou_cate
limit 10

统计视频观看数Top20所属类别

select	3.对相同类别去重
distinct(cate) 
from
(
select	2.取出前二十观看数,类别,和视频id
cate,views,videoid
from
(
select t1.ca cate,videoid,views	1.将类别炸开
from gulivideo_ori lateral view explode(category) t1 as ca
)t2
order by views desc
limit 20
)t3

统计视频观看数Top50所关联视频的所属类别Rank

select 	5.排序rank
*
from
(
select 	4.将合并的表的类别字段炸开,对组进行分组,统计count
t4.category , count(*) hot
from
(
select	3.然后与原表再连接join
*
from
(
select	2.因为关联视频字段是数组,将其炸开并对相关视频字段去重
distinct(relatedids_name)
from
(
select *	1.取出观看次数top50
from gulivideo_ori
order by views desc
limit 50
)t1
lateral view explode(t1.relatedid) relatedids_t as relatedids_name
)t2
join gulivideo_ori t3
where t2.relatedids_name=t3.videoid
)t4
lateral view explode(t4.category) category_t as category_name
group by t4.category
)t5
order by t5.hot desc
  • 下面几个业务都遇到了需要将类别炸开的形式,那就先将类别炸开的表先导入临时表

  • create table gulivideo_category(
        videoId string, 
        uploader string, 
        age int, 
        categoryId string, 
        length int, 
        views int, 
        rate float, 
        ratings int, 
        comments int, 
        relatedId array)
    row format delimited 
    fields terminated by "\t" 
    collection items terminated 
    
    
  • insert into table gulivideo_category  
        select 
            videoId,
            uploader,
            age,
            categoryId,
            length,
            views,
            rate,
            ratings,
            comments,
            relatedId 
        from 
            gulivideo_orc lateral view explode(category) catetory as categoryId;
    

统计每个类别中的视频热度Top10

select 	2.取出top10
t1.categoryId,
t1.views,
from
(
select 	1.按类别分组,观看次数排序
categoryId,
views,
row_number() over(partition by categoryId order by views desc) rank
from gulivideo_category
)t1
where rank <= 10

统计每个类别中视频流量Top10

select 
t1.categoryId,
t1.ratings
from
(
select 
categoryId,
ratings,
row_number() over(partition by categoryId order by ratings desc) rank
from gulivideo_category
)t1
where rank <= 10

统计上传视频最多的用户Top10以及他们上传的观看次数在前20的视频

select
t2.uploader,
t2.views
from
(
select
*
from gulivideo_user_ori
order by videos desc
limit 20
)t1
join
(
select
*
from gulivideo_ori
)t2
where t1.uploader=t2.uploader
order by views desc
limit 20

统计每个类别视频观看数Top10

select 
t1.categoryId,
t1.views,
from
(
select 
categoryId,
views,
row_number() over(partition by categoryId order by views desc) rank
from gulivideo_category
)t1
where rank <= 10

你可能感兴趣的:(hive,大数据,MapReduce)