目的:用 Java 代码(基于 Spark)实现团购网站评论的标签生成
最终结果(每行格式为:商家ID=============>标签:出现次数,…,按出现次数降序排列,最多列出10个标签):
83644298=============>体验好:1
82317795=============>味道差:1
77705462=============>服务热情:3,羊肉:2,味道赞:1
85766086=============>价格实惠:2,上菜慢:1
74145782=============>服务热情:18,味道赞:14,上菜快:13,菜品不错:12,回头客:11,性价比高:6,停车方便:5,体验好:4,不推荐:3,服务差:2
71039150=============>团建:1
70611801=============>干净卫生:4,回头客:3,味道赞:2,肉类好:1
73963176=============>味道赞:15,价格实惠:12,分量足:11,菜品不错:10,肉类好:7,环境优雅:6,回头客:4,性价比高:3,味道一般:1
84270191=============>价格实惠:2,干净卫生:1
89223651=============>环境优雅:8,技师专业:7,干净卫生:5,服务一般:4,无办卡:3,环境一般:2
82016443=============>分量足:3,味道赞:2,服务热情:1
77287793=============>干净卫生:29,环境优雅:26,交通便利:25,性价比高:19,服务热情:18,高大上:16,停车方便:13,音响效果差:1
79197522=============>服务热情:2,价格实惠:1
83084036=============>干净卫生:1
73879078=============>饮品赞:3,回头客:2,分量足:1
88284865=============>价格实惠:1
83073343=============>干净卫生:17,味道赞:16,环境优雅:15,菜品不错:11,肉类好:9,性价比高:8,体验好:7,回头客:6,价格实惠:4,上菜慢:1
76114040=============>性价比高:1
86913510=============>午餐:1
88496862=============>回头客:5,味道赞:4,分量足:3,性价比高:2,高大上:1
78477325=============>味道赞:8,回头客:7,干净卫生:5,味道一般:4,菜品不错:3,环境优雅:2,肉类好:1
83981222=============>性价比高:4,干净卫生:3,服务热情:2
82705919=============>回头客:3,饮品赞:2,性价比高:1
87994574=============>无推销:12,价格实惠:8,服务热情:7,效果赞:5,环境优雅:4,技师专业:3,没有异味:2,效果差:1
77373671=============>菜品差:1
75144086=============>服务热情:38,效果赞:30,无办卡:22,性价比高:21,无推销:19,价格实惠:18,干净卫生:13,体验好:12,韩系风格:10,美发师手艺好:3
85648235=============>味道赞:17,服务热情:15,干净卫生:13,上菜快:12,回头客:11,性价比高:10,体验好:9,价格实惠:8,分量足:7,情侣约会:1
73607905=============>菜品不错:16,回头客:15,服务热情:14,分量足:13,肉类好:11,环境优雅:7,体验好:5,体验差:2,价格实惠:1
76893145=============>服务热情:10,环境优雅:7,高大上:5,回头客:4,温馨浪漫:3,味道一般:2,饮品赞:1
78824187=============>价格实惠:13,回头客:11,分量足:10,环境优雅:8,干净卫生:7,上菜快:6,主食赞:5,味道赞:4,服务差:1
代码:
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
/**
* java版实现团购网站标签生成程序
*/
public class ReviewTagsJava {
public static void main(String [] args){
SparkConf conf = new SparkConf();
conf.setMaster("local[4]");
conf.setAppName("ReviewTagsJava");
JavaSparkContext jsc = new JavaSparkContext(conf);
JavaRDD rdd1 = jsc.textFile("file:///d:/scala/taggen/temptags.txt");
//以\t切割成String数组
JavaRDD rdd2 = rdd1.map(new Function() {
public String[] call(String s) throws Exception {
return s.split("\t");
}
});
// 过滤
JavaRDD rdd3 = rdd2.filter(new Function() {
public Boolean call(String[] v) throws Exception {
return v.length == 2;
}
});
//变换成数组,ID-->味道好,价格实惠,量足
JavaPairRDD rdd4 = rdd3.mapToPair(new PairFunction() {
public Tuple2 call(String[] t) throws Exception {
return new Tuple2(t[0],ReviewTags.extractTags(t[1]));
}
});
//过滤空评论
JavaPairRDD rdd5 = rdd4.filter(new Function, Boolean>() {
public Boolean call(Tuple2 t) throws Exception {
return t._2.length() > 0;
}
});
//对V进行切割,V形成数组
JavaPairRDD rdd6 = rdd5.mapToPair(new PairFunction, String,String[]>() {
public Tuple2 call(Tuple2 v) throws Exception {
return new Tuple2(v._1(),v._2().split(","));
}
});
//V数组压扁,形成集合。ID->味道好 ID->价格实惠 .......
JavaPairRDD rdd7 = rdd6.flatMapValues(new Function>() {
public Iterable call(String[] v) throws Exception {
List list = new ArrayList();
for(String v1 : v){
list.add(v1);
}
return list;
}
});
//K:ID 味道好 V:1......
JavaPairRDD, Integer> rdd8 = rdd7.mapToPair(new PairFunction, Tuple2, Integer>() {
public Tuple2, Integer> call(Tuple2 v) throws Exception {
return new Tuple2, Integer>(v,1);
}
});
JavaPairRDD,Integer> rdd9 = rdd8.reduceByKey(new Function2() {
public Integer call(Integer v1, Integer v2) throws Exception {
return v1 + v2;
}
});
//K:ID V:味道好,1 ......
JavaPairRDD> rdd10 = rdd9.mapToPair(new PairFunction,Integer>, String, Tuple2>() {
public Tuple2> call(Tuple2, Integer> v) throws Exception {
return new Tuple2>(v._1()._1(),new Tuple2(v._1()._2(),v._2()));
}
});
// V变成集合,以备聚合
JavaPairRDD>> rdd11 = rdd10.mapToPair(new PairFunction>, String, List>>() {
public Tuple2>> call(Tuple2> v) throws Exception {
List> list = new ArrayList>();
list.add(v._2());
return new Tuple2>>(v._1(),list);
}
});
//
JavaPairRDD>> rdd12 = rdd11.reduceByKey(new Function2>, List>, List>>() {
public List> call(List> v1, List> v2) throws Exception {
v1.addAll(v2);
return v1;
}
});
//排序降序,取前10评论
JavaPairRDD rdd13 = rdd12.mapToPair(new PairFunction>>, String, String>() {
public Tuple2 call(Tuple2>> v) throws Exception {
//降序,
TreeSet> ts = new TreeSet>(new Tuple2Comparator());
ts.addAll(v._2());
//前10迭代
Iterator> it = ts.iterator();
int index = 0;
String str = "";
while(it.hasNext()){
if(index > 9){
break;
}
//迭代出前10的元组给t0
Tuple2 t0 = it.next();
//V:味道好 :12,量大:13
str = str + t0._1() + ":" + t0._2() + ",";
index++;
}
//去掉最后一个","
str = str.substring(0,str.length()-1);
//K:ID, V:味道好+12评论数量
return new Tuple2(v._1(),str);
}
});
//collect执行---->List
List> list = rdd13.collect();
//循环打印结果
for(Tuple2 l : list){
System.out.println(l._1() + "=============>" + l._2());
}
}
}
按出现次数降序取出前10个标签时,需要用到下面的比较器(Comparator):
import scala.Tuple2;
import java.util.Comparator;
/**
* 对比器,比较大小,降序
*/
public class Tuple2Comparator implements Comparator> {
public int compare(Tuple2 o1, Tuple2 o2) {
return o2._2() - o1._2();
}
}