《大数据商业实战三部曲》打印所有电影中评分最高的前20个电影名和平均分数

package cn.spark.study.project.movie;

import java.math.BigDecimal;
import java.math.RoundingMode;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

/**
 * 电影排行榜实例分析
 * @author dahai
 *08 20 18:37
 */
public class RDD_Movie_Users_Analyzer {
 //数据下载
 //https://grouplens.org/datasets/movielens/
 //movies.dat,ratings.dat,users.dat
 public static void main(String[] args){
  //第一步定义spark上下文,设置spark应用信息
  SparkConf conf = new SparkConf()
    .setAppName("RDD_Movie_Users_Analyzer")
    .setMaster("local");
  // 第二步:创建JavaSparkcontext对象
  JavaSparkContext sc =new JavaSparkContext(conf);
  //第三步:
  //针对输入源(HDFS):创建一个初始化RDD
  JavaRDD ratingsRDD=sc.textFile("C://Users//dahai//Desktop//sparkdata//movie/ratings.dat");
  //循环输出数据
  /*ratingsRDD.foreach(new VoidFunction() {
   private static final long serialVersionUID = 1L;

   @Override
   public void call(String data) throws Exception {
    System.out.println(data);
    
   }
  });*/
  
  JavaRDD movieRDD=sc.textFile("C://Users//dahai//Desktop//sparkdata//movie/movies.dat");
  
  //获取所有电影的平均分数
  JavaPairRDD  aveRDD=getAveMovie(ratingsRDD);
  
  //实现评分最高的前20名电影和平均分
  getTop20Ave(aveRDD,movieRDD);
  
  //最后一步关闭连接
  sc.close();
 
 }

 private static JavaPairRDD getAveMovie(JavaRDD ratingsRDD) {
  //第一步获取电影ID总评分和总次数
    //1获取电影id和评分数次数
    JavaPairRDD> movieAndRatPairRDD =ratingsRDD.mapToPair(new PairFunction>() {
     private static final long serialVersionUID = 1L;

     @Override
     public Tuple2> call(String row) throws Exception {
      String[] datas =row.split("::");
      String movleid=datas[1];
      Long rat=Long.valueOf(datas[2]);
      return new Tuple2>(movleid,new Tuple2(rat,1L));
     }
    });

    
    //测试数据正确性:
   /* movieAndRatPairRDD.foreach(new VoidFunction>>() {

     
     private static final long serialVersionUID = 1L;

     @Override
     public void call(Tuple2> tuple) throws Exception {
      System.out.println(tuple);
      
     }
    });
    */
    //2 获取电影ID总评分和总次数
    JavaPairRDD>  data= movieAndRatPairRDD.reduceByKey(new Function2, Tuple2, Tuple2>() {
     
     /**
      *
      */
     private static final long serialVersionUID = 1L;

     @Override
     public Tuple2 call(Tuple2 tuple1, Tuple2 tuple2) throws Exception {
      Long rat1 =tuple1._1;
      Long rat2 =tuple2._1;
      Long rat =rat1+rat2;
      Long count1 =tuple1._2;
      Long count2 = tuple2._2;
      Long  count3 =count1+count2;
      return new Tuple2(rat,count3);
     }
    });
    //(1486,(22,7)) (電影id:1486,(总评分数22,评分了7次)
    //(3492,(50,14))(電影id:3492,(总评分数22,评分了14次)
    /*data.foreach(new VoidFunction>>() {
     private static final long serialVersionUID = 1L;

     @Override
     public void call(Tuple2> tuple) throws Exception {
      System.out.println(tuple);
      
     }
    });*/
    //第三步获取该电影ID的平均分数
    JavaPairRDD  ave= data.mapToPair(new PairFunction>, String, Double>() {

     /**
      *
      */
     private static final long serialVersionUID = 1L;

     @Override
     public Tuple2 call(Tuple2> rat) throws Exception {
      String movieid =rat._1;
      Double result = new BigDecimal((float)rat._2._1 / rat._2._2).setScale(1, BigDecimal.ROUND_HALF_UP).doubleValue();
      return new Tuple2(movieid,result);
     }
    });
    return ave;
    
    //电影id.平均分数
    //(508,3.9)
    /*(2563,3.0)
    (1910,3.8)
    (1904,3.5)
    (3339,3.7)
    (1715,3.0)
    (605,3.3)
    (706,2.5)
    (2648,4.0)
    (710,2.3)
    (1208,4.2)
    (1,4.1)
    (1313,2.2)
    (2005,3.5)
    (1259,4.1)*/
    /*ave.foreach(new VoidFunction>() {
    private static final long serialVersionUID = 1L;

    @Override
    public void call(Tuple2 tuple) throws Exception {
     System.out.println(tuple);
     
    }
   });*/
 }

 /**
  * 获取评分最高前10名电影和平均分
  * @param aveRDD
  * @param movieRDD
  */
 private static void getTop20Ave(JavaPairRDD aveRDD, JavaRDD movieRDD) {
  
  
  //第四步:获取电影dat中电影ID,电影名称
  JavaPairRDD  movies=movieRDD.mapToPair(new PairFunction() {

   /**
    *
    */
   private static final long serialVersionUID = 1L;

   @Override
   public Tuple2 call(String line) throws Exception {
    String[] datas =line.split("::");
    String movieid =datas[0];
    String movieName=datas[1];
    return new Tuple2(movieid,movieName);
   }
  });
  
  //第五步根据电影ID join 排名信息,得到电影名称和平均分数
  
  JavaPairRDD moviesData=movies.join(aveRDD).mapToPair(new PairFunction>, Double, String>() {
   private static final long serialVersionUID = 1L;

   @Override
   public Tuple2 call(Tuple2> tuple) throws Exception {
    String movieName =tuple._2._1;
    Double ave =tuple._2._2;
    return new Tuple2(ave,movieName);
   }
  });
  
  //第六步取top10倒序排序输出
  List> moviestop10=moviesData.sortByKey(false).take(20);
  for(Tuple2 movieTop10 : moviestop10){
         String movieName = movieTop10._2;//获取电影名称·
         Double ave = movieTop10._1;//获取点评分数
         System.out.println(movieName+" 平均点评分数为:"+ave);
       }
  
  /*Gate of Heavenly Peace, The (1995) 平均点评分数为:5.0
  Song of Freedom (1936) 平均点评分数为:5.0
  One Little Indian (1973) 平均点评分数为:5.0
  Schlafes Bruder (Brother of Sleep) (1995) 平均点评分数为:5.0
  Lured (1947) 平均点评分数为:5.0
  Bittersweet Motel (2000) 平均点评分数为:5.0
  Follow the Bitch (1998) 平均点评分数为:5.0
  Baby, The (1973) 平均点评分数为:5.0
  Smashing Time (1967) 平均点评分数为:5.0
  Ulysses (Ulisse) (1954) 平均点评分数为:5.0
  I Am Cuba (Soy Cuba/Ya Kuba) (1964) 平均点评分数为:4.8
  Lamerica (1994) 平均点评分数为:4.8
  Apple, The (Sib) (1998) 平均点评分数为:4.7
  Shawshank Redemption, The (1994) 平均点评分数为:4.6
  Sanjuro (1962) 平均点评分数为:4.6
  Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954) 平均点评分数为:4.6
  Dry Cleaning (Nettoyage � sec) (1997) 平均点评分数为:4.5
  Hour of the Pig, The (1993) 平均点评分数为:4.5
  Callej�n de los milagros, El (1995) 平均点评分数为:4.5
  Skipped Parts (2000) 平均点评分数为:4.5*/
 }
}

你可能感兴趣的:(大数据)