10、实战

数据准备

数据结构

两张表,视频表和用户表

字段 备注 详细描述
video id 视频唯一id 11位字符串
uploader 视频上传者 上传视频的用户名String
age 视频年龄 视频在平台上的整数天
category 视频类别 上传视频指定的视频分类
length 视频长度 整型数字标识的视频长度
views 观看次数 视频被浏览的次数
rate 视频评分 满分5分
ratings 流量 视频的流量,整型数字
comments 评论数 一个视频的整数评论数
related ids 相关视频id 相关视频的id,最多20个

ETL原始数据

部分数据如下所示

SapphireHearts  3   5
VanillaFresh76  37  16
Phyr0Musique    5   0
TwistedKitchen  1   1
TheDailyClog    19  4
4a4ron  4   7
StrictlyZ   1   0
VIRAJ818    6   42
v2uEfmWO6z8 mimzi8  601 Film & Animation    361 25341   4.63    78  44  AItaLnpbIAw 0X_of0woMBU yJ0s-kwG_Ro qThc1spQuiY WCEH9RZZKdU 8ZgFpRZeCds PQfyKAo0-ls 5peppnUHf9I Dx3ri8EVEvE ofmQOjZYLDI Ndtp2wP31K4 HEo6U-jkD4I 3DG1XyqgX3w b7M2mrnSZpM 5gJ21l8_4II MqySp7Nq5j0 MC--VwYTHAM 4SSZBAPleVc eVdiIbWT60M Da80HD18tp0

通过观察原始数据形式,可以发现,视频可以有多个所属分类,每个所属分类用&符号分割,且分割的两边有空格字符,同时相关视频也是可以有多个元素,多个相关视频又用“\t”进行分割。为了分析数据时方便对存在多个子元素的数据进行操作,我们首先进行数据重组清洗操作。即:将所有的类别用“&”分割,同时去掉两边空格,多个相关视频id也使用“&”进行分割

数据清洗

ETLUtils

public class ETLUtils {

    // A valid record has at least these 9 tab-separated fields:
    // videoId, uploader, age, category, length, views, rate, ratings, comments.
    private static final int MIN_FIELDS = 9;
    // Index of the category field, whose values may contain spaces around "&".
    private static final int CATEGORY_INDEX = 3;
    // Index of the first related-video id; everything from here on is joined with "&".
    private static final int FIRST_RELATED_INDEX = 9;

    /**
     * Cleans one raw line: strips spaces from the category field and re-joins
     * the trailing related-video ids with "&" instead of "\t", so the output
     * matches the Hive table layout (9 "\t"-separated fields, the last two of
     * which are "&"-separated arrays).
     *
     * @param ori one raw "\t"-separated input line
     * @return the cleaned line, or {@code null} if the line has fewer than 9 fields
     */
    public static String translateDate(String ori) {
        String[] fields = ori.split("\t");
        if (fields.length < MIN_FIELDS) {
            return null;
        }
        // "People & Blogs" -> "People&Blogs": the "&" becomes the array separator.
        fields[CATEGORY_INDEX] = fields[CATEGORY_INDEX].replace(" ", "");
        // StringBuilder (not the synchronized StringBuffer): no sharing here.
        StringBuilder result = new StringBuilder(ori.length());
        for (int i = 0; i < fields.length; i++) {
            if (i == 0) {
                // first field: no separator before it
            } else if (i <= FIRST_RELATED_INDEX) {
                // fields 1..9 (incl. the FIRST related id) keep the tab separator
                result.append('\t');
            } else {
                // remaining related ids collapse into one "&"-separated array field
                result.append('&');
            }
            result.append(fields[i]);
        }
        return result.toString();
    }
}

ETLMapper

/**
 * Map-only ETL mapper: feeds each input line through
 * {@link ETLUtils#translateDate(String)} and emits the cleaned line as the key
 * (value is NullWritable). Malformed lines (translateDate returns null) are dropped.
 *
 * The generic parameters are required: with a raw {@code Mapper} the framework
 * method erases to {@code map(Object, Object, Context)}, so an
 * {@code @Override} of {@code map(LongWritable, Text, Context)} would not
 * actually override it and the mapper would never run.
 */
public class ETLMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    // Reused across calls to avoid per-record allocation.
    private Text text = new Text();
    private NullWritable nullWritable = NullWritable.get();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String s = value.toString();
        String translateDate = ETLUtils.translateDate(s);
        // Skip records that failed validation (fewer than 9 fields).
        if (translateDate != null) {
            text.set(translateDate);
            context.write(text, nullWritable);
        }
    }
}

ETLRunner

/**
 * Driver for the map-only ETL job.
 * Usage: hadoop jar etl.jar com.zj.etl.ETLRunner &lt;input path&gt; &lt;output path&gt;
 */
public class ETLRunner implements Tool {

    private Configuration configuration;

    /**
     * Configures and submits the job.
     *
     * @param args args[0] = HDFS input path, args[1] = HDFS output path (must not exist)
     * @return 0 on success, 1 on failure
     */
    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(configuration);

        job.setJarByClass(ETLRunner.class);
        job.setMapperClass(ETLMapper.class);

        // Map-only job, so the mapper output classes ARE the job output classes.
        // (Set once; the original set this pair twice.)
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // No reduce phase: mapper output is written straight to HDFS.
        job.setNumReduceTasks(0);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean success = job.waitForCompletion(true);

        return success ? 0 : 1;
    }

    @Override
    public void setConf(Configuration conf) {
        configuration = conf;
    }

    @Override
    public Configuration getConf() {
        return configuration;
    }

    public static void main(String[] args) throws Exception {
        // ToolRunner parses generic options and calls setConf() before run().
        int run = ToolRunner.run(new ETLRunner(), args);
        System.out.println("run = " + run);
    }
}

上传文件到hdfs

hadoop fs -put user.txt /
hadoop fs -put video/ /

执行数据清洗

hadoop jar etl.jar com.zj.etl.ETLRunner /video /video/output

创建相关表

-- Raw (textfile) video table, loaded from the cleaned ETL output.
-- category and relatedId are multi-valued ("&"-separated after ETL), so they
-- must be declared array<string>: a bare "array" is not valid Hive DDL
-- (the element type was lost, likely swallowed by HTML rendering).
create table video_ori(
    videoId string, 
    uploader string, 
    age int, 
    category array<string>, 
    length int, 
    views int, 
    rate float, 
    ratings int, 
    comments int,
    relatedId array<string>)
row format delimited 
fields terminated by "\t"
collection items terminated by "&"
stored as textfile;

-- Raw (textfile) user table, loaded via "load data".
-- No bucketing here: "load data" just moves files and does NOT redistribute
-- rows into buckets (Hive 3 outright rejects loading into a bucketed table),
-- so bucketing the raw table is at best a no-op and at worst an error.
-- The bucketed copy is video_user_orc, populated via insert...select.
create table video_user_ori(
    uploader string,
    videos int,
    friends int)
row format delimited 
fields terminated by "\t" 
stored as textfile;

-- ORC analytics table; filled from video_ori with insert...select, which is
-- what makes "clustered by" effective. Same array<string> fix as video_ori:
-- a bare "array" without an element type is invalid Hive DDL.
create table video_orc(
    videoId string, 
    uploader string, 
    age int, 
    category array<string>, 
    length int, 
    views int, 
    rate float, 
    ratings int, 
    comments int,
    relatedId array<string>)
clustered by (uploader) into 8 buckets 
row format delimited fields terminated by "\t" 
collection items terminated by "&" 
stored as orc;

-- ORC user table, bucketed by uploader for the joins in requirement 7.
-- Populated via insert...select from video_user_ori, so bucketing is honored.
create table video_user_orc(
    uploader string,
    videos int,
    friends int)
clustered by (uploader) into 24 buckets
row format delimited fields terminated by "\t"
stored as orc;

导入数据

-- Load the CLEANED ETL output, not the raw input: the ETLRunner command above
-- wrote its output to /video/output, and video_ori expects the cleaned
-- "&"-separated format. Loading "/video" would ingest the raw, uncleaned files.
-- note: "load data inpath" MOVES the HDFS files into the table directory.
load data inpath "/video/output" into table video_ori;
load data inpath "/user.txt" into table video_user_ori;
-- Copy into the ORC tables; insert...select rewrites rows in ORC format
-- and applies the bucketing declared on the target tables.
insert into table video_orc select * from video_ori;
insert into table video_user_orc select * from video_user_ori;

需求

  1. 统计视频观看数Top10
  2. 统计视频类别热度Top10
  3. 统计视频观看数Top20所属类别
  4. 统计视频观看数Top50所关联视频的所属类别Rank
  5. 统计每个类别中的视频热度Top10
  6. 统计每个类别中视频流量Top10
  7. 统计上传视频最多的用户Top10以及他们上传的视频
  8. 统计每个类别视频观看数Top10
1. 
-- 1. Top 10 most-viewed videos.
select *
from video_orc
order by views desc
limit 10;
2. 
-- 2. Top 10 hottest categories, measured by total views.
-- Explode the category array first so a multi-category video counts
-- toward every one of its categories.
select flat.cate,
       sum(flat.views) as sumviews
from (
    select v.views, cate
    from video_orc v
    lateral view explode(v.category) c as cate
) flat
group by flat.cate
order by sumviews desc
limit 10;
3. 
-- 3. Category distribution of the 20 most-viewed videos.
-- Added "order by num desc" so the hottest category comes first; the
-- original left the result in arbitrary (non-deterministic) order.
select cate, count(cate) as num
from (select * from video_orc order by views desc limit 20) top20
lateral view explode(top20.category) c as cate
group by cate
order by num desc;
4. 
-- 4. Category rank over the related videos of the 50 most-viewed videos.
-- fix: the original exploded relatedId BEFORE "limit 50", which keeps only
-- 50 (video, related-id) ROWS -- possibly all from a couple of videos --
-- instead of every related video of the top-50 VIDEOS. Take the top 50
-- videos first, then explode their relatedId lists.
-- fix: "order by" (not "sort by") in the over() clause.
select ca,
       count(ca) as sumcate,
       rank() over(order by count(ca) desc) as rk
from (
    -- categories of each related video, found by joining back on videoId
    select o.category as cat
    from (
        select rid
        from (
            -- top 50 videos by views, with their related-id arrays
            select relatedId, views
            from video_orc
            order by views desc
            limit 50
        ) top50
        lateral view explode(top50.relatedId) rel as rid
    ) r1
    join video_orc o on r1.rid = o.videoId
) r
lateral view explode(r.cat) cats as ca
group by ca;
5. 
-- 5. Top 10 videos (by views) inside every category.
-- One row per (video, category) pair after the explode; row_number()
-- partitioned per category picks each category's 10 most-viewed videos.
select ranked.*
from (
    select flat.*,
           row_number() over(distribute by flat.cate sort by flat.views desc) as rk
    from (
        select videoid, uploader, age, cate, length, views, rate, ratings, comments, relatedId
        from video_orc
        lateral view explode(category) c as cate
    ) flat
) ranked
where ranked.rk <= 10;
6. 
-- 6. Top 10 videos by traffic ("流量" = ratings column) inside every category.
-- Same shape as query 5, ranking on ratings instead of views.
select ranked.*
from (
    select flat.*,
           row_number() over(distribute by flat.cate sort by flat.ratings desc) as rk
    from (
        select videoid, uploader, age, cate, length, views, rate, ratings, comments, relatedId
        from video_orc
        lateral view explode(category) c as cate
    ) flat
) ranked
where ranked.rk <= 10;
7. 
-- 7. Top 10 uploaders (by upload count) together with each of their videos.
-- fix: left join instead of inner join -- an inner join silently drops any
-- top-10 uploader who has no surviving rows in video_orc, shrinking the
-- "Top 10" list; a left join keeps all 10 (video columns NULL when absent).
select t1.*, vo.*
from (
    select uploader, videos, friends 
    from video_user_orc
    order by videos desc
    limit 10
) t1
left join video_orc vo on t1.uploader = vo.uploader;
8. 
-- 8. Top 10 most-viewed videos per category.
-- note: with "热度" defined as views, this is the same query as requirement 5.
select ranked.*
from (
    select flat.*,
           row_number() over(distribute by flat.cate sort by flat.views desc) as rk
    from (
        select videoid, uploader, age, cate, length, views, rate, ratings, comments, relatedId
        from video_orc
        lateral view explode(category) c as cate
    ) flat
) ranked
where ranked.rk <= 10;

你可能感兴趣的:(10、实战)