user.txt
0.txt
字段以及字段名解析
user表
字段 备注 字段类型
uploader 上传者用户名 string
videos 上传视频数 int
friends 朋友数量 int
视频表:
字段 备注 详细描述
video id 视频唯一 id 11 位字符串
uploader 视频上传者 上传视频的用户名 String
age 视频年龄 视频上传日期与 2007 年 2 月 15 日之间相隔的整数天数(YouTube 的独特设定)
category 视频类别 上传视频指定的视频分类
length 视频长度 以整型数字表示的视频长度
views 观看次数 视频被浏览的次数
rate 视频评分 满分 5 分
ratings 流量 视频的流量,整型数字
comments 评论数 一个视频的整数评论数
related ids 相关视频 id 相关视频的 id,最多 20 个
ETL数据清洗
ETLUtils.java
package ETLUtils;
/**
 * Cleaning helpers for the raw, tab-separated video records.
 */
public class ETLUtils {
    /**
     * Number of leading single-valued columns (video id through comment count).
     * Every column from this index onwards is a related video id.
     */
    private static final int FIXED_FIELD_COUNT = 9;

    /**
     * Normalises one raw video record.
     *
     * <ol>
     *   <li>Records with fewer than nine tab-separated fields are rejected.</li>
     *   <li>All spaces are removed from the category field (index 3), so a value
     *       such as {@code "People & Blogs"} becomes {@code "People&Blogs"}.</li>
     *   <li>The first nine fields stay tab-separated; the remaining fields
     *       (related video ids) are joined with {@code "&"} into one column.</li>
     * </ol>
     *
     * A record with exactly nine fields keeps a trailing tab, i.e. the related-id
     * column is present but empty.
     *
     * @param str one raw input line; may be {@code null}
     * @return the cleaned line, or {@code null} when the input is {@code null}
     *         or has fewer than nine fields
     */
    public static String getETCString(String str) {
        if (str == null) {
            return null;
        }

        String[] lines = str.split("\t");

        // Validate the field count BEFORE touching lines[3]; otherwise a short
        // record fails with ArrayIndexOutOfBoundsException instead of being skipped.
        if (lines.length < FIXED_FIELD_COUNT) {
            return null;
        }

        // Strip the spaces inside the category field.
        lines[3] = lines[3].replaceAll(" ", "");

        StringBuilder newLines = new StringBuilder();
        for (int i = 0; i < lines.length; i++) {
            newLines.append(lines[i]);
            if (i < FIXED_FIELD_COUNT) {
                // Separator after each fixed column (also ends the comments column).
                newLines.append("\t");
            } else if (i != lines.length - 1) {
                // Related ids are "&"-joined; no separator after the last one.
                newLines.append("&");
            }
        }
        return newLines.toString();
    }
}
ETLMapper
package mapper;
import ETLUtils.ETLUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
// NOTE(review): this declaration is cut off in this listing. The generic type
// arguments of Mapper and the entire class body (including the map method) are
// missing, presumably lost when the text was extracted. Restore them from the
// original source before compiling; the imports above suggest the body uses
// ETLUtils, StringUtils, NullWritable and Text, but that is not visible here.
public class ETLMapper extends Mapper
ETLRunner
package runner;
import java.io.IOException;
import mapper.ETLMapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Driver for the map-only job that cleans the raw video records with
 * {@link ETLMapper}.
 *
 * <p>Usage: {@code ETLRunner <input path> <output path>}. An existing output
 * path is deleted recursively before the job starts.
 */
public class ETLRunner implements Tool {
    /** Configuration key under which the input path is handed to the job. */
    private static final String INPUT_PATH_KEY = "inpath";
    /** Configuration key under which the output path is handed to the job. */
    private static final String OUTPUT_PATH_KEY = "outpath";

    private Configuration conf = null;

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    @Override
    public Configuration getConf() {
        return this.conf;
    }

    /**
     * Configures and runs the ETL job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @return 0 when the job succeeds, 1 when it fails or the arguments are missing
     * @throws Exception if the job cannot be set up or submitted
     */
    @Override
    public int run(String[] args) throws Exception {
        // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
        if (args == null || args.length < 2) {
            System.err.println("Usage: ETLRunner <input path> <output path>");
            return 1;
        }

        conf = this.getConf();
        // Pass the paths through the configuration so the init helpers can read them.
        conf.set(INPUT_PATH_KEY, args[0]);
        conf.set(OUTPUT_PATH_KEY, args[1]);

        Job job = Job.getInstance(conf, "youtub_etl_video");
        job.setJarByClass(ETLRunner.class);
        job.setMapperClass(ETLMapper.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        // Map-only job: no reduce phase.
        job.setNumReduceTasks(0);

        this.initInputPath(job);
        this.initOutputPath(job);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Registers the configured input path on the job.
     *
     * @throws IOException      if the file system cannot be reached
     * @throws RuntimeException if the input path does not exist
     */
    private void initInputPath(Job job) throws IOException {
        Configuration jobConf = job.getConfiguration();
        String inpaths = jobConf.get(INPUT_PATH_KEY);
        Path inpath = new Path(inpaths);
        // Resolve the file system from the path itself so that a fully qualified
        // URI on a non-default file system is checked against the right one.
        FileSystem fs = inpath.getFileSystem(jobConf);
        if (!fs.exists(inpath)) {
            throw new RuntimeException("HDFS目录不存在" + inpaths);
        }
        FileInputFormat.addInputPath(job, inpath);
    }

    /**
     * Registers the configured output path on the job, recursively deleting it
     * first when it already exists.
     *
     * @throws IOException if the file system cannot be reached or the delete fails
     */
    private void initOutputPath(Job job) throws IOException {
        Configuration jobConf = job.getConfiguration();
        String outpath = jobConf.get(OUTPUT_PATH_KEY);
        Path opath = new Path(outpath);
        FileSystem fs = opath.getFileSystem(jobConf);
        if (fs.exists(opath)) {
            // Remove the previous output so the job can write to this path again.
            fs.delete(opath, true);
        }
        FileOutputFormat.setOutputPath(job, opath);
    }

    /**
     * Command-line entry point; exits with the job's result code, or 1 when an
     * exception escapes.
     */
    public static void main(String[] args) {
        try {
            int result = ToolRunner.run(new ETLRunner(), args);
            if (result == 0) {
                System.out.println("Success!");
            } else {
                System.out.println("Fail!");
            }
            System.exit(result);
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }
}
-- Raw (text) video table. Fields are tab-separated; the items of the two
-- array columns (category, relatedId) are separated by "&".
create table youtube_ori (
    videoId   string,
    uploader  string,
    age       int,
    category  array<string>,
    length    int,
    views     int,
    rate      float,
    ratings   int,
    comments  int,
    relatedId array<string>
)
row format delimited
    fields terminated by "\t"
    collection items terminated by "&"
stored as textfile;
-- Raw (text) user table: uploader name, number of uploaded videos, number of
-- friends. Tab-separated, bucketed by uploader into 24 buckets.
create table youtube_user_ori (
    uploader string,
    videos   int,
    friends  int
)
clustered by (uploader) into 24 buckets
row format delimited
    fields terminated by "\t"
stored as textfile;
-- ORC video table with the same columns as youtube_ori.
create table youtube_orc (
    videoId   string,
    uploader  string,
    age       int,
    category  array<string>,
    length    int,
    views     int,
    rate      float,
    ratings   int,
    comments  int,
    relatedId array<string>
)
row format delimited
    fields terminated by "\t"
    collection items terminated by "&"
stored as orc;
-- ORC user table with the same columns and bucketing as youtube_user_ori.
create table youtube_user_orc (
    uploader string,
    videos   int,
    friends  int
)
clustered by (uploader) into 24 buckets
row format delimited
    fields terminated by "\t"
stored as orc;
|
-- Load the video records into the raw video table.
-- NOTE(review): /output/part-m-00000 is presumably the ETL job's output file; confirm the path.
load data
    inpath '/output/part-m-00000'
    into table youtube_ori;

-- Load the user records into the raw user table.
load data
    inpath '/output/user.txt'
    into table youtube_user_ori;
用 insert into table A select * from B 将数据导入到 orc 表
字段:
视频id 上传者 视频年龄 视频类别 视频长度 观看次数 视频评分 流量 评论数 相关视频
videoId,uploader,age,category,length,views,rate,ratings,comments,relatedId
-- The 10 most-viewed videos, with all of their columns.
create table viewstop10 as
select
    videoId,
    uploader,
    age,
    category,
    length,
    views,
    rate,
    ratings,
    comments,
    relatedId
from youtube_orc
order by views desc
limit 10;
-- The 10 categories containing the most videos ("hot" = videos per category).
-- A video with several categories is counted once in each of them.
create table hotTop10 as
select
    exploded.category_name as category,
    count(exploded.videoId) as hot
from (
    -- One row per (video, category) pair.
    select
        videoId,
        category_name
    from youtube_orc
    lateral view explode(category) category_view as category_name
) exploded
group by exploded.category_name
order by hot desc
limit 10;
-- For the 20 most-viewed videos: how many of them fall into each category.
create table top20views_countCategory as
select
    category_name,
    count(videoId) as vcount
from (
    select
        videoId,
        category
    from (
        -- The 20 most-viewed videos.
        select
            videoId,
            category,
            views
        from youtube_orc
        order by views desc
        limit 20
    ) top_videos
) top_video_categories
lateral view explode(category) category_view as category_name
group by category_name
order by vcount desc;
-- For the 50 most-viewed videos: how many of them fall into each category.
select
    category_name,
    count(views) as mcount
from (
    -- One row per (video, category) pair of the top 50 videos.
    select
        videoId,
        category_name,
        views
    from (
        select
            videoId,
            category,
            views
        from youtube_orc
        order by views desc
        limit 50
    ) top_videos
    lateral view explode(category) category_view as category_name
) exploded
group by category_name
order by mcount desc;
-- Video table flattened by category: one row per (video, category) pair,
-- with the single category stored in categoryId.
create table youtube_category (
    videoId    string,
    uploader   string,
    age        int,
    categoryId string,
    length     int,
    views      int,
    rate       float,
    ratings    int,
    comments   int,
    relatedId  array<string>
)
row format delimited
    fields terminated by "\t"
    collection items terminated by "&"
stored as orc;
-- Populate youtube_category by exploding the category array of youtube_orc,
-- producing one row per (video, category) pair.
insert into table youtube_category
select
    videoId,
    uploader,
    age,
    categoryId,
    length,
    views,
    rate,
    ratings,
    comments,
    relatedId
from youtube_orc
lateral view explode(category) category_view as categoryId;
-- The 10 most-viewed videos in the "Music" category.
create table musicTop10 as
select
    videoId,
    categoryId,
    views
from youtube_category
where categoryId = "Music"
order by views desc
limit 10;
-- The 10 videos in the "Music" category with the highest ratings value.
create table ratingsTop10 as
select
    videoId,
    views,
    ratings
from youtube_category
where categoryId = "Music"
order by ratings desc
limit 10;
-- Videos of the 10 uploaders with the most uploads, keeping the 20 most-viewed.
-- The final sort must be descending: an ascending sort combined with limit 20
-- would return the 20 LEAST-viewed videos instead.
select
    v.videoId,
    v.uploader,
    v.views,
    u.videos
from (
    -- The 10 uploaders with the highest upload count.
    select
        uploader,
        videos
    from youtube_user_orc
    order by videos desc
    limit 10
) u
join youtube_orc v
    on u.uploader = v.uploader
order by v.views desc
limit 20;
-- The 10 most-viewed videos within each category.
-- "rant" is the per-category position (1 = most viewed). The name is kept
-- as-is because it becomes a column of the created table.
create table categoryId_views as
select
    ranked.videoId,
    ranked.categoryId,
    ranked.views,
    ranked.rant
from (
    select
        videoId,
        categoryId,
        views,
        row_number() over (partition by categoryId order by views desc) rant
    from youtube_category
) ranked
where ranked.rant <= 10;