Hive in Action: A Video System Exercise

package com.dt.spark.cores.java.HiveDataMode;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.SaveMode;

// Hive in action: a video system exercise. The input data lives under the etc\Video\ directory.
// Data cleansing ("data governance") implemented in Java.
public class HiveDataModeSpark {

    public static void main(String[] args){
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("HiveDataModeSpark");

        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);


        JavaRDD<String> lines = sc.textFile("D:\\SparkApps\\etc\\Video\\video\\2008\\0222\\1.txt", 2);
        JavaRDD<String> moviesUsersinfo = sc.textFile("D:\\SparkApps\\etc\\Video\\user\\2008\\0903\\user.txt", 2);
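        // Raw video record layout (tab-separated): video_id, uploader, age, category,
        // length, views, rate, ratings, comments, followed by one or more related video ids.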
        // Split each line on tabs to get the raw field array.
        JavaRDD<String[]> dataLine = lines.map(new Function<String, String[]>() {
            @Override
            public String[] call(String s) throws Exception {
                return s.split("\t");
            }
        });
        // Drop incomplete records (fewer than 10 fields, i.e. videos without related-video ids).
        JavaRDD<String[]> dataM = dataLine.filter(new Function<String[], Boolean>() {
            @Override
            public Boolean call(String[] strings) throws Exception {
                return strings.length > 9;
            }
        });
        // Normalize the category field: remove spaces so that '&' cleanly separates categories.
        JavaRDD<String[]> dataTrans = dataM.map(new Function<String[], String[]>() {
            @Override
            public String[] call(String[] s) throws Exception {
                s[3] = s[3].replaceAll(" ", "");
                return s;
            }
        });
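        // For example, a category value such as "People & Blogs" (hypothetical sample)
        // becomes "People&Blogs", so split(category, '&') works reliably later on.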

        // Re-assemble the cleaned record: keep the first 9 fields tab-separated and
        // join the remaining fields (the related video ids) with '&'.
        JavaRDD<String> dataMs = dataTrans.map(new Function<String[], String>() {
            @Override
            public String call(String[] strings) throws Exception {
                StringBuilder stringBuilder = new StringBuilder();
                for (int i = 0; i < strings.length; i++) {
                    stringBuilder.append(strings[i]);
                    if (i < strings.length - 1) {
                        stringBuilder.append(i < 9 ? "\t" : "&");
                    }
                }
                return stringBuilder.toString();
            }
        });
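        // Result: a line like "id<TAB>uploader<TAB>...<TAB>comments<TAB>rel1&rel2&rel3",
        // i.e. exactly 10 logical fields, with all related ids packed into the last one.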

        // Map each cleaned line onto a MoviesInformations bean.
        JavaRDD<MoviesInformations> result = dataMs.map(new Function<String, MoviesInformations>() {
            @Override
            public MoviesInformations call(String s) throws Exception {
                String[] parts = s.split("\t");
                MoviesInformations moviesInformations = new MoviesInformations();
                moviesInformations.setVideo_id(parts[0]);
                moviesInformations.setUploader(parts[1]);
                moviesInformations.setAge(Integer.parseInt(parts[2]));
                moviesInformations.setCategory(parts[3]);
                moviesInformations.setLength(Integer.parseInt(parts[4]));
                moviesInformations.setViews(Integer.parseInt(parts[5]));
                moviesInformations.setRate(Float.parseFloat(parts[6]));
                moviesInformations.setRatings(Integer.parseInt(parts[7]));
                moviesInformations.setConments(Integer.parseInt(parts[8]));
                moviesInformations.setRelated_ids(parts[9]);
                return moviesInformations;
            }
        });
        // Map each user record onto a MoviesUserInformations bean.
        JavaRDD<MoviesUserInformations> moviesUserInformationsJavaRDD = moviesUsersinfo.map(new Function<String, MoviesUserInformations>() {
            @Override
            public MoviesUserInformations call(String s) throws Exception {
                String[] users = s.split("\t");
                MoviesUserInformations moviesUserInformations = new MoviesUserInformations();
                moviesUserInformations.setUploader(users[0]);
                moviesUserInformations.setVideos(Integer.parseInt(users[1]));
                moviesUserInformations.setFriends(Integer.parseInt(users[2]));
                return moviesUserInformations;
            }
        });
        Dataset<Row> df = sqlContext.createDataFrame(result, MoviesInformations.class);
        Dataset<Row> users = sqlContext.createDataFrame(moviesUserInformationsJavaRDD, MoviesUserInformations.class);
        df.createOrReplaceTempView("movies");
        users.createOrReplaceTempView("users");

        sqlContext.sql("select age,category,conments,length,rate,ratings,related_ids,uploader,video_id,views from movies limit 5").show();

        // Top 10 most-viewed videos and their uploaders.
        sqlContext.sql("select uploader,views from movies order by views desc limit 10").show();

        // Top 10 categories by heat (number of videos tagged with each category).
        sqlContext.sql("select " +
                "t3.cate,t3.cou_cate " +
                "from " +
                "(" +
                "select " +
                "t2.cate cate , count(*) cou_cate " +
                "from " +
                "( " +
                "select t1.ca cate " +
                "from movies lateral view explode(split(category,'&')) t1 as ca " +
                ") t2 " +
                "group by t2.cate " +
                ") t3 " +
                "order by t3.cou_cate desc " +
                "limit 10").show();
        // Categories of the Top 20 most-viewed videos.
        sqlContext.sql("select distinct t3.cate  from (" +
                "select t2.cate cate ,t2.views,t2.video_id from (" +
                "select t1.ca cate, video_id,views from movies lateral view explode(split(category,'&')) t1 as ca" +
                ") t2 order by views desc limit 20) t3").show();

        // Category ranking of the videos related to the Top 10 most-viewed videos.
        sqlContext.sql("SELECT category, counts FROM ( " +
                "SELECT t6.category AS category, count(*) AS counts FROM ( " +
                "SELECT t5.cater AS category FROM ( " +
                "SELECT t3.category AS cates FROM ( " +
                "SELECT DISTINCT (t1.reid) AS reid FROM ( " +
                "SELECT related_ids FROM movies ORDER BY views DESC LIMIT 10 ) t0 " +
                "lateral VIEW explode (split(related_ids, '&')) t1 AS reid ) " +
                "t2 JOIN movies t3 ON t2.reid = t3.video_id ) " +
                "t4 lateral VIEW explode (split(t4.cates, '&')) t5 AS cater ) " +
                "t6 GROUP BY t6.category ) t7 ORDER BY counts DESC").show();
        // Top 10 hottest videos (by views) within each category.
        sqlContext.sql("select t1.categoryId, t1.video_id, t1.views from (select " +
                "categoryId, video_id, views, " +
                "row_number() over(partition by categoryId order by views desc) rank " +
                "from (select " +
                "video_id, uploader, age, categoryId, length, views, rate, ratings, conments, related_ids " +
                "from movies lateral view explode(split(category,'&')) cate as categoryId) t0) t1 " +
                "where t1.rank <= 10").show();
        // Top 10 videos by "traffic" (approximated here by ratings) within each category.
        sqlContext.sql("select t1.categoryId, t1.video_id, t1.ratings from (select " +
                "categoryId, video_id, ratings, " +
                "row_number() over(partition by categoryId order by ratings desc) rank " +
                "from (select " +
                "video_id, uploader, age, categoryId, length, views, rate, ratings, conments, related_ids " +
                "from movies lateral view explode(split(category,'&')) cate as categoryId) t0) t1 " +
                "where t1.rank <= 10").show();
        // Top 10 uploaders by number of uploaded videos, and the Top 20 most-viewed videos
        // among their uploads (note: this SQL may not be entirely correct).
        sqlContext.sql("select " +
                "t2.uploader, " +
                "t2.views " +
                "from " +
                "( " +
                "select * from users order by videos desc limit 10 " +
                ") t1 " +
                "join " +
                "movies t2 " +
                "on t1.uploader = t2.uploader " +
                "order by views desc " +
                "limit 20").show();
        // Top 10 most-viewed videos per category (kept as a Dataset for persisting below).
        Dataset<Row> kinds = sqlContext.sql("select t1.categoryId, t1.video_id, t1.views from (select " +
                "categoryId, video_id, views, " +
                "row_number() over(partition by categoryId order by views desc) rank " +
                "from (select " +
                "video_id, uploader, age, categoryId, length, views, rate, ratings, conments, related_ids " +
                "from movies lateral view explode(split(category,'&')) cate as categoryId) t0) t1 " +
                "where t1.rank <= 10");
        // Persist the result to MySQL so downstream applications can query or display it.
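        // Note: the MySQL JDBC driver (mysql-connector-java) must be on the classpath
        // for this JDBC write to succeed.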

        kinds.write().format("jdbc")
                .mode(SaveMode.Append)
                .option("url", "jdbc:mysql://localhost:3306/world")
                .option("dbtable", "kinds")
                .option("user", "root")
                .option("password", "222818")
                .save();
    }
}
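
The two JavaBeans referenced above (MoviesInformations and MoviesUserInformations) are not shown in the listing. Below is a minimal sketch of what they would need to look like for createDataFrame to work: the field names and types are inferred from the setters and parse calls above, so the actual originals may differ. Each class must be public, Serializable, and live in its own file in the same package:

package com.dt.spark.cores.java.HiveDataMode;

import java.io.Serializable;

// Minimal sketch of the bean backing the "movies" view (fields inferred from the main class).
public class MoviesInformations implements Serializable {
    private String video_id;
    private String uploader;
    private int age;
    private String category;
    private int length;
    private int views;
    private float rate;
    private int ratings;
    private int conments; // spelling kept to match the setters used in the main class
    private String related_ids;

    public String getVideo_id() { return video_id; }
    public void setVideo_id(String video_id) { this.video_id = video_id; }
    public String getUploader() { return uploader; }
    public void setUploader(String uploader) { this.uploader = uploader; }
    public int getAge() { return age; }
    public void setAge(int age) { this.age = age; }
    public String getCategory() { return category; }
    public void setCategory(String category) { this.category = category; }
    public int getLength() { return length; }
    public void setLength(int length) { this.length = length; }
    public int getViews() { return views; }
    public void setViews(int views) { this.views = views; }
    public float getRate() { return rate; }
    public void setRate(float rate) { this.rate = rate; }
    public int getRatings() { return ratings; }
    public void setRatings(int ratings) { this.ratings = ratings; }
    public int getConments() { return conments; }
    public void setConments(int conments) { this.conments = conments; }
    public String getRelated_ids() { return related_ids; }
    public void setRelated_ids(String related_ids) { this.related_ids = related_ids; }
}

// Minimal sketch of the bean backing the "users" view (its own file as well).
public class MoviesUserInformations implements Serializable {
    private String uploader;
    private int videos;
    private int friends;

    public String getUploader() { return uploader; }
    public void setUploader(String uploader) { this.uploader = uploader; }
    public int getVideos() { return videos; }
    public void setVideos(int videos) { this.videos = videos; }
    public int getFriends() { return friends; }
    public void setFriends(int friends) { this.friends = friends; }
}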

 
