package com.dt.spark.cores.java.HiveDataMode;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.SaveMode;
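//Hive in practice: video-site analytics. The data involved lives under the etc\\Video\\ directory.
//Data cleansing (data governance), implemented in Java.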
/**
 * Local Spark job for the video data set.
 *
 * <p>The job
 * <ol>
 *   <li>reads the raw video file and the uploader file as text,</li>
 *   <li>cleans the video rows (drops rows without related-video ids, strips
 *       spaces from the category column, joins the related ids with
 *       {@code '&'}),</li>
 *   <li>maps both inputs onto the {@code MoviesInformations} and
 *       {@code MoviesUserInformations} beans and registers them as the
 *       temporary views {@code movies} and {@code users},</li>
 *   <li>prints a series of ranking reports with {@code show()}, and</li>
 *   <li>appends the per-category ranking by views to the MySQL table
 *       {@code kinds}.</li>
 * </ol>
 *
 * <p>Input paths, the Spark master and the JDBC settings are hard-coded.
 */
public class HiveDataModeSpark {

    /** Raw video records, one tab-separated record per line. */
    private static final String VIDEO_PATH = "D:\\SparkApps\\etc\\Video\\video\\2008\\0222\\1.txt";

    /** Raw uploader records, one tab-separated record per line. */
    private static final String USER_PATH = "D:\\SparkApps\\etc\\Video\\user\\2008\\0903\\user.txt";

    /**
     * Number of leading scalar columns in a video record (indices 0-8); every
     * column from this index on is a related-video id.
     */
    private static final int FIXED_COLUMNS = 9;

    /**
     * Rows kept per category by the {@code row_number()} reports.
     * NOTE(review): the original comments describe these reports as "Top10",
     * but the queries have always filtered on {@code rank<=2}; the value is
     * kept as-is to preserve behaviour - confirm which one is intended.
     */
    private static final int ROWS_PER_CATEGORY = 2;

    /**
     * Entry point: runs the whole pipeline described on the class.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("HiveDataModeSpark");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            SQLContext sqlContext = new SQLContext(sc);
            JavaRDD<String> lines = sc.textFile(VIDEO_PATH, 2);
            JavaRDD<String> moviesUsersinfo = sc.textFile(USER_PATH, 2);

            JavaRDD<MoviesInformations> result = toMovies(cleanVideoLines(lines));
            JavaRDD<MoviesUserInformations> moviesUserInformationsJavaRDD = toUsers(moviesUsersinfo);

            Dataset<Row> df = sqlContext.createDataFrame(result, MoviesInformations.class);
            Dataset<Row> users = sqlContext.createDataFrame(moviesUserInformationsJavaRDD, MoviesUserInformations.class);
            df.createOrReplaceTempView("movies");
            users.createOrReplaceTempView("users");

            printReports(sqlContext);
            saveTopViewedPerCategory(sqlContext);
        } finally {
            // Always release the Spark context, even when a stage fails.
            sc.stop();
        }
    }

    /**
     * Cleans the raw video lines and returns them re-serialised as text.
     *
     * <p>Each output line has exactly ten tab-separated columns: the nine
     * scalar columns followed by all related-video ids joined with {@code '&'}.
     *
     * @param lines raw tab-separated video records
     * @return cleaned records; rows with nine or fewer columns are dropped
     */
    private static JavaRDD<String> cleanVideoLines(JavaRDD<String> lines) {
        // Split every line into its tab-separated columns.
        JavaRDD<String[]> dataLine = lines.map(new Function<String, String[]>() {
            @Override
            public String[] call(String s) throws Exception {
                return s.split("\t");
            }
        });
        // Keep only rows that carry at least one related-video id (column index 9).
        JavaRDD<String[]> dataM = dataLine.filter(new Function<String[], Boolean>() {
            @Override
            public Boolean call(String[] fields) throws Exception {
                return fields.length > FIXED_COLUMNS;
            }
        });
        // Remove the spaces inside the category column, e.g. "A & B" becomes "A&B",
        // so the SQL reports can later split categories on '&'.
        JavaRDD<String[]> dataTrans = dataM.map(new Function<String[], String[]>() {
            @Override
            public String[] call(String[] fields) throws Exception {
                fields[3] = fields[3].replaceAll(" ", "");
                return fields;
            }
        });
        // Re-join: a tab after each of the first nine columns, '&' between related ids.
        return dataTrans.map(new Function<String[], String>() {
            @Override
            public String call(String[] fields) throws Exception {
                StringBuilder joined = new StringBuilder();
                int last = fields.length - 1;
                for (int i = 0; i < fields.length; i++) {
                    joined.append(fields[i]);
                    if (i < last) {
                        joined.append(i < FIXED_COLUMNS ? "\t" : "&");
                    }
                }
                return joined.toString();
            }
        });
    }

    /**
     * Parses cleaned video lines into {@code MoviesInformations} beans.
     *
     * <p>Column order: video id, uploader, age, category, length, views, rate,
     * ratings, comments, related ids. A non-numeric value in a numeric column
     * makes {@code parseInt}/{@code parseFloat} throw and fails the task.
     *
     * @param cleaned lines produced by {@link #cleanVideoLines(JavaRDD)}
     * @return one bean per input line
     */
    private static JavaRDD<MoviesInformations> toMovies(JavaRDD<String> cleaned) {
        return cleaned.map(new Function<String, MoviesInformations>() {
            @Override
            public MoviesInformations call(String s) throws Exception {
                String[] parts = s.split("\t");
                MoviesInformations moviesInformations = new MoviesInformations();
                moviesInformations.setVideo_id(parts[0]);
                moviesInformations.setUploader(parts[1]);
                moviesInformations.setAge(Integer.parseInt(parts[2]));
                moviesInformations.setCategory(parts[3]);
                moviesInformations.setLength(Integer.parseInt(parts[4]));
                moviesInformations.setViews(Integer.parseInt(parts[5]));
                moviesInformations.setRate(Float.parseFloat(parts[6]));
                moviesInformations.setRatings(Integer.parseInt(parts[7]));
                moviesInformations.setConments(Integer.parseInt(parts[8]));
                moviesInformations.setRelated_ids(parts[9]);
                return moviesInformations;
            }
        });
    }

    /**
     * Parses raw uploader lines into {@code MoviesUserInformations} beans.
     *
     * <p>Column order: uploader, number of videos, number of friends. Lines
     * with fewer than three columns or non-numeric counts fail the task.
     *
     * @param rawUsers raw tab-separated uploader records
     * @return one bean per input line
     */
    private static JavaRDD<MoviesUserInformations> toUsers(JavaRDD<String> rawUsers) {
        return rawUsers.map(new Function<String, MoviesUserInformations>() {
            @Override
            public MoviesUserInformations call(String s) throws Exception {
                String[] users = s.split("\t");
                MoviesUserInformations moviesUserInformations = new MoviesUserInformations();
                moviesUserInformations.setUploader(users[0]);
                moviesUserInformations.setVideos(Integer.parseInt(users[1]));
                moviesUserInformations.setFriends(Integer.parseInt(users[2]));
                return moviesUserInformations;
            }
        });
    }

    /**
     * Runs the reporting queries against the {@code movies} and {@code users}
     * views and prints each result with {@code show()}.
     *
     * @param sqlContext context in which both temporary views are registered
     */
    private static void printReports(SQLContext sqlContext) {
        // Sample of the cleaned data.
        sqlContext.sql("select age,category,conments,length,rate,ratings,related_ids,uploader,video_id,views from movies limit 5").show();

        // Uploaders of the 10 most viewed videos.
        sqlContext.sql("select uploader,views from movies order by views desc limit 10").show();

        // 10 categories with the most videos.
        sqlContext.sql("select " +
                "t3.cate,t3.cou_cate " +
                "from " +
                "(" +
                "select " +
                "t2.cate cate , count(*) cou_cate " +
                "from " +
                "( " +
                "select t1.ca cate " +
                "from movies lateral view explode(split(category,'&')) t1 as ca " +
                ") t2 " +
                "group by t2.cate " +
                ") t3 " +
                "order by t3.cou_cate desc " +
                "limit 10").show();

        // Distinct categories among the 20 highest-view (video, category) rows.
        sqlContext.sql("select distinct t3.cate from (" +
                "select t2.cate cate ,t2.views,t2.video_id from (" +
                "select t1.ca cate, video_id,views from movies lateral view explode(split(category,'&')) t1 as ca" +
                ") t2 order by views desc limit 20) t3").show();

        // Category ranking of the videos related to the 10 most viewed videos.
        sqlContext.sql("SELECT category, counts FROM ( " +
                "SELECT t6.category AS category, count(*) AS counts FROM ( " +
                "SELECT t5.cater AS category FROM ( " +
                "SELECT t3.category AS cates FROM ( " +
                "SELECT DISTINCT (t1.reid) AS reid FROM ( " +
                "SELECT related_ids FROM movies ORDER BY views DESC LIMIT 10 ) " +
                "lateral VIEW explode (split(related_ids, '&')) t1 AS reid ) " +
                "t2 JOIN movies t3 WHERE t2.reid = t3.video_id ) " +
                "t4 lateral VIEW explode (split(t4.cates, '&')) t5 AS cater ) " +
                "t6 GROUP BY t6.category ) ORDER BY counts DESC").show();

        // Most viewed videos inside every category.
        sqlContext.sql(topPerCategorySql("views", ROWS_PER_CATEGORY)).show();

        // Videos with the most ratings inside every category.
        sqlContext.sql(topPerCategorySql("ratings", ROWS_PER_CATEGORY)).show();

        // Videos of the 10 uploaders with the most uploads, limited to the 20 most
        // viewed rows overall. The original author noted that this SQL "does not
        // look quite right"; it is kept unchanged.
        // NOTE(review): the limit applies to the joined result as a whole, not per
        // uploader - confirm the intended semantics.
        sqlContext.sql("select " +
                "t2.uploader, " +
                "t2.views " +
                "from " +
                "( " +
                "select " +
                "* " +
                "from users " +
                "order by videos desc " +
                "limit 10 " +
                ")t1 " +
                "join " +
                "( " +
                "select " +
                "* " +
                "from movies " +
                ")t2 " +
                "where t1.uploader=t2.uploader " +
                "order by views desc " +
                "limit 20").show();
    }

    /**
     * Computes the most viewed videos per category and appends them to the
     * MySQL table {@code kinds} so that applications can read or display them.
     *
     * @param sqlContext context in which the {@code movies} view is registered
     */
    private static void saveTopViewedPerCategory(SQLContext sqlContext) {
        Dataset<Row> kinds = sqlContext.sql(topPerCategorySql("views", ROWS_PER_CATEGORY));
        // NOTE(review): connection settings and credentials are hard-coded in
        // source; consider moving them to external configuration.
        kinds.write().format("jdbc")
                .mode(SaveMode.Append)
                .option("url", "jdbc:mysql://localhost:3306/world")
                .option("dbtable", "kinds")
                .option("user", "root")
                .option("password", "222818")
                .save();
    }

    /**
     * Builds the query that ranks videos inside each category by a numeric
     * column and keeps the first {@code topN} rows of every category.
     *
     * <p>The {@code category} column is exploded on {@code '&'} so a video
     * that belongs to several categories is ranked in each of them.
     *
     * @param metric column of the {@code movies} view to rank by (descending);
     *               must be a trusted identifier because it is concatenated
     *               into the SQL text
     * @param topN   maximum number of rows kept per category
     * @return SQL selecting {@code categoryId}, {@code video_id} and the metric
     */
    private static String topPerCategorySql(String metric, int topN) {
        return " select t1.categoryId,t1.video_id,t1." + metric + " from (select " +
                "categoryId,video_id," + metric + "," +
                "row_number() over(partition by categoryId order by " + metric + " desc) rank " +
                "from (select " +
                "video_id," +
                "uploader," +
                "age," +
                "categoryId," +
                "length," +
                "views," +
                "rate," +
                "ratings," +
                "conments," +
                "related_ids " +
                "from " +
                "movies lateral view explode(split(category,'&')) catetory as categoryId)) t1 where t1.rank<=" + topN + " ";
    }
}