package cn.spark.study.project.movie;
import java.math.BigDecimal;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
/**
* Movie ranking example analysis.
* @author dahai
* 08 20 18:37
*/
public class RDD_Movie_Users_Analyzer {
// NOTE(review): in this copy every line that contained a generic type argument
// ends right after the raw type name (for example "JavaRDD", "JavaPairRDD",
// "Tuple2", "List"), so the file does not compile as shown. The comments below
// describe only what the visible code establishes; restore the cut-off parts
// from the original source before building.
//
// Data download:
//https://grouplens.org/datasets/movielens/
//movies.dat,ratings.dat,users.dat
/**
* Entry point: configures a Spark context, runs the average-rating computation
* and prints the highest-rated movies, then closes the context.
*
* @param args command-line arguments; not read by the visible code
*/
public static void main(String[] args){
// Step 1: define the Spark configuration and set the application info
// (application name plus the "local" master).
SparkConf conf = new SparkConf()
.setAppName("RDD_Movie_Users_Analyzer")
.setMaster("local");
// Step 2: create the JavaSparkContext.
JavaSparkContext sc =new JavaSparkContext(conf);
// Step 3:
// Create the initial RDD from the input source (the original comment says HDFS).
// NOTE(review): the declaration below is cut off after "JavaRDD"; the input
// path is not visible. The disabled code underneath refers to the variable as
// ratingsRDD - TODO confirm.
JavaRDD
// Print every record (disabled debugging code).
/*ratingsRDD.foreach(new VoidFunction
private static final long serialVersionUID = 1L;
@Override
public void call(String data) throws Exception {
System.out.println(data);
}
});*/
// NOTE(review): cut off after "JavaRDD"; presumably the movies input, which is
// passed on as movieRDD below - TODO confirm.
JavaRDD
// Get the average rating of every movie.
// NOTE(review): cut off after "JavaPairRDD"; the result is used as aveRDD below.
JavaPairRDD
// Print the highest-rated movies together with their average rating.
getTop20Ave(aveRDD,movieRDD);
// Final step: close the Spark context.
sc.close();
}
/**
* Computes the average rating per movie id and returns the resulting pair RDD.
* Each input row is split on "::"; field 1 is used as the movie id and field 2
* as the rating. Ratings and counts are summed per movie and the total is
* divided by the count, rounded to one decimal place.
*
* NOTE(review): the method header below is cut off after "JavaPairRDD", so the
* method name and parameter list are not visible in this copy.
*/
private static JavaPairRDD
// Step 1: get the total rating and the number of ratings per movie id.
// 1) Map every row to the movie id with its rating and a count.
// NOTE(review): the tuple built on the "return new Tuple2" line is cut off, so
// its exact contents are not visible here.
JavaPairRDD
/** Version id used by Java serialization. */
private static final long serialVersionUID = 1L;
@Override
public Tuple2
// Fields are separated by "::"; index 1 is the movie id, index 2 the rating.
String[] datas =row.split("::");
String movleid=datas[1];
Long rat=Long.valueOf(datas[2]);
return new Tuple2
}
});
// Check that the data is correct (disabled debugging code):
/* movieAndRatPairRDD.foreach(new VoidFunction
private static final long serialVersionUID = 1L;
@Override
public void call(Tuple2
System.out.println(tuple);
}
});
*/
// 2) Sum the ratings and the counts per movie id.
JavaPairRDD
/** Version id used by Java serialization. */
private static final long serialVersionUID = 1L;
@Override
public Tuple2
// Component _1 carries the rating total, component _2 the rating count;
// both are added pairwise.
Long rat1 =tuple1._1;
Long rat2 =tuple2._1;
Long rat =rat1+rat2;
Long count1 =tuple1._2;
Long count2 = tuple2._2;
Long count3 =count1+count2;
return new Tuple2
}
});
// Sample records:
// (1486,(22,7))  -> movie id 1486, total rating 22, rated 7 times
// (3492,(50,14)) -> movie id 3492, total rating 50, rated 14 times
/*data.foreach(new VoidFunction
private static final long serialVersionUID = 1L;
@Override
public void call(Tuple2
System.out.println(tuple);
}
});*/
// Step 3: compute the average rating for each movie id.
JavaPairRDD
/** Version id used by Java serialization. */
private static final long serialVersionUID = 1L;
@Override
public Tuple2
String movieid =rat._1;
// Total divided by count in float arithmetic, then rounded half-up to one
// decimal place.
// NOTE(review): the float cast limits precision and BigDecimal.ROUND_HALF_UP
// is deprecated in newer JDKs in favour of RoundingMode.HALF_UP.
Double result = new BigDecimal((float)rat._2._1 / rat._2._2).setScale(1, BigDecimal.ROUND_HALF_UP).doubleValue();
return new Tuple2
}
});
return ave;
// Sample output: (movie id, average rating)
//(508,3.9)
/*(2563,3.0)
(1910,3.8)
(1904,3.5)
(3339,3.7)
(1715,3.0)
(605,3.3)
(706,2.5)
(2648,4.0)
(710,2.3)
(1208,4.2)
(1,4.1)
(1313,2.2)
(2005,3.5)
(1259,4.1)*/
/*ave.foreach(new VoidFunction
private static final long serialVersionUID = 1L;
@Override
public void call(Tuple2
System.out.println(tuple);
}
});*/
}
/**
* Prints the highest-rated movies together with their average rating.
* The original comment said "top 10" while the method name and the recorded
* sample output indicate 20; the actual limit is on a line that is cut off in
* this copy.
*
* @param aveRDD average ratings as returned by the averaging step above
* (sample records there look like (movie id, average))
* @param movieRDD movie records; each line is split on "::" with the movie id
* at index 0 and the movie name at index 1
*/
private static void getTop20Ave(JavaPairRDD
// Step 4: extract the movie id and the movie name from the movies data.
JavaPairRDD
/** Version id used by Java serialization. */
private static final long serialVersionUID = 1L;
@Override
public Tuple2
String[] datas =line.split("::");
String movieid =datas[0];
String movieName=datas[1];
return new Tuple2
}
});
// Step 5: join on movie id with the rating data to obtain the movie name and
// the average rating.
JavaPairRDD
private static final long serialVersionUID = 1L;
@Override
public Tuple2
// The joined value holds the movie name in _1 and the average in _2.
String movieName =tuple._2._1;
Double ave =tuple._2._2;
return new Tuple2
}
});
// Step 6: take the top entries in descending order and print them.
// NOTE(review): the sort/take expression on the "List" line is cut off, so the
// ordering and the number of entries taken are not visible here.
List
for(Tuple2
String movieName = movieTop10._2;// movie name
Double ave = movieTop10._1;// average rating
System.out.println(movieName+" 平均点评分数为:"+ave);
}
// Output recorded from a previous run (kept verbatim, including the program's
// own output label and the mis-decoded characters):
/*Gate of Heavenly Peace, The (1995) 平均点评分数为:5.0
Song of Freedom (1936) 平均点评分数为:5.0
One Little Indian (1973) 平均点评分数为:5.0
Schlafes Bruder (Brother of Sleep) (1995) 平均点评分数为:5.0
Lured (1947) 平均点评分数为:5.0
Bittersweet Motel (2000) 平均点评分数为:5.0
Follow the Bitch (1998) 平均点评分数为:5.0
Baby, The (1973) 平均点评分数为:5.0
Smashing Time (1967) 平均点评分数为:5.0
Ulysses (Ulisse) (1954) 平均点评分数为:5.0
I Am Cuba (Soy Cuba/Ya Kuba) (1964) 平均点评分数为:4.8
Lamerica (1994) 平均点评分数为:4.8
Apple, The (Sib) (1998) 平均点评分数为:4.7
Shawshank Redemption, The (1994) 平均点评分数为:4.6
Sanjuro (1962) 平均点评分数为:4.6
Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954) 平均点评分数为:4.6
Dry Cleaning (Nettoyage � sec) (1997) 平均点评分数为:4.5
Hour of the Pig, The (1993) 平均点评分数为:4.5
Callej�n de los milagros, El (1995) 平均点评分数为:4.5
Skipped Parts (2000) 平均点评分数为:4.5*/
}
}