数据提取
package cn.spark.uitls;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
/**
 * Extracts comment tags from a JSON record using fastjson.
 */
public class ExtractTags {

    /** Key of the top-level array that holds the extended-info entries. */
    private static final String EXT_INFO_KEY = "extInfoList";

    /** Value of an entry's {@code title} that marks it as the comment-tag entry. */
    private static final String TAGS_TITLE = "contentTags";

    /**
     * Collects every tag found under {@code extInfoList[*].values} for entries whose
     * {@code title} is {@code "contentTags"} and joins them with commas.
     *
     * <p>No exception is caught here, so anything the JSON parser throws for malformed
     * input propagates to the caller.
     *
     * @param info JSON text of one record
     * @return the comma-separated tags, or an empty string when the record has no
     *         {@code extInfoList} or no comment tags
     */
    public static String extract(String info) {
        JSONObject jsonObj = JSONObject.parseObject(info);
        if (jsonObj == null || !jsonObj.containsKey(EXT_INFO_KEY)) {
            return "";
        }
        JSONArray extInfoList = jsonObj.getJSONArray(EXT_INFO_KEY);
        if (extInfoList == null) {
            return "";
        }

        StringBuilder sb = new StringBuilder();
        // Tracks whether a tag was already written, so a separator is emitted between
        // tags of different contentTags entries as well as within one entry.
        boolean needsSeparator = false;
        for (int i = 0; i < extInfoList.size(); i++) {
            JSONObject entry = extInfoList.getJSONObject(i);
            // Constant-first equals: a "title" key holding a JSON null must not throw.
            if (entry == null || !TAGS_TITLE.equals(entry.getString("title"))) {
                continue;
            }
            JSONArray tagsArray = entry.getJSONArray("values");
            if (tagsArray == null) {
                continue;
            }
            for (int j = 0; j < tagsArray.size(); j++) {
                Object tag = tagsArray.get(j);
                if (tag == null) {
                    // A JSON null would otherwise be appended as the literal text "null".
                    continue;
                }
                if (needsSeparator) {
                    sb.append(",");
                }
                sb.append(tag);
                needsSeparator = true;
            }
        }
        return sb.toString();
    }
}
排序比较器
package cn.spark.uitls;
import java.util.Comparator;
import scala.Tuple2;
public class TupleComparator implements Comparator>{
@Override
public int compare(Tuple2 o1, Tuple2 o2) {
return o2._2 - o1._2;
}
}
生成标签
package cn.spark.core.tags;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import cn.spark.uitls.ExtractTags;
import cn.spark.uitls.TupleComparator;
import scala.Tuple2;
/**
* 生成团购标签
*
*/
public class TagsGenerator {
public static void main(String[] args) {
SparkConf conf = new SparkConf()
.setAppName("TagsGenerator")
.setMaster("local[4]");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaRDD tempTags = sc
.textFile("file:///tags.txt");
// 提取出评论,并转换成PairRDD
JavaPairRDD pairTags = tempTags.mapToPair(
new PairFunction() {
private static final long serialVersionUID = 2704806555244127991L;
@Override
public Tuple2 call(String line) throws Exception {
String[] tagArray = line.split("\\t");
return new Tuple2(tagArray[0], ExtractTags.extract(tagArray[1]));
}
});
// System.out.println(pairTags.take(10));
// 过滤出无效数据
JavaPairRDD vaildPairTags = pairTags.filter(
new Function, Boolean>() {
private static final long serialVersionUID = 2341022888467424541L;
@Override
public Boolean call(Tuple2 tuple) throws Exception {
if(tuple._2 == ""){
return false;
} else {
return true;
}
}
});
// System.out.println(vaildPairTags.take(10));
// 把有效数据的vlaue转成数组
JavaPairRDD> vaildPairArrayTags = vaildPairTags.mapToPair(
new PairFunction, String, List>() {
private static final long serialVersionUID = 4427355754150194235L;
@Override
public Tuple2> call(Tuple2 tuple) throws Exception {
String[] tags = tuple._2.split(",");
return new Tuple2>(tuple._1, Arrays.asList(tags));
}
});
// System.out.println(vaildPairArrayTags.take(10));
// 压扁
JavaPairRDD flatPairTags = vaildPairArrayTags.flatMapValues(
new Function, Iterable>() {
private static final long serialVersionUID = -83392454411970840L;
@Override
public Iterable call(List list) throws Exception {
return list;
}
});
// System.out.println(flatPairTags.take(10));
// 标1
JavaPairRDD, Integer> flatPairTagsOne = flatPairTags.mapToPair(
new PairFunction, Tuple2, Integer>() {
private static final long serialVersionUID = -4286078108295302852L;
@Override
public Tuple2, Integer> call(Tuple2 tuple) throws Exception {
return new Tuple2, Integer>(tuple, 1);
}
});
// System.out.println(flatPairTagsOne.take(10));
// 聚合reduceBykey
JavaPairRDD, Integer> aggFlatPairTags = flatPairTagsOne.reduceByKey(
new Function2() {
private static final long serialVersionUID = 5247246198728620599L;
@Override
public Integer call(Integer v1, Integer v2) throws Exception {
return v1 + v2;
}
});
// System.out.println(aggFlatPairTags.take(10));
// 让评价与次数组合为元组
JavaPairRDD> aggFlatPairTags_1 = aggFlatPairTags.mapToPair(
new PairFunction,Integer>, String, Tuple2>() {
private static final long serialVersionUID = 3487125940408716180L;
@Override
public Tuple2> call(Tuple2, Integer> tuple)
throws Exception {
return new Tuple2>(tuple._1._1, new Tuple2(tuple._1._2, tuple._2));
}
});
// System.out.println(aggFlatPairTags_1.take(10));
// 把value放进List集合
JavaPairRDD>> aggFlatPairTags_2 =
aggFlatPairTags_1.mapToPair(
new PairFunction>,
String, List>>() {
private static final long serialVersionUID = 4033983426111053118L;
@Override
public Tuple2>> call(
Tuple2> tuple) throws Exception {
List> list = new ArrayList<>();
list.add(tuple._2());
return new Tuple2>>(tuple._1, list);
}
});
// System.out.println(aggFlatPairTags_2.take(10));
// 聚合,把相同id的value方到同一个List集合
JavaPairRDD>> aggFlatPairTags_3 =
aggFlatPairTags_2.reduceByKey(
new Function2>,
List>,
List>>() {
private static final long serialVersionUID = -4811643037731110953L;
@Override
public List> call(List> v1,
List> v2)
throws Exception {
v1.addAll(v2);
return v1;
}
});
// System.out.println(aggFlatPairTags_3.take(10));
JavaPairRDD result = aggFlatPairTags_3.mapToPair(
new PairFunction>>, String, String>() {
private static final long serialVersionUID = -2848536030175892531L;
@Override
public Tuple2 call(
Tuple2>> tuple) throws Exception {
TreeSet> set = new TreeSet<>(new TupleComparator());
set.addAll(tuple._2);
Iterator> iter = set.iterator();
StringBuffer sb = new StringBuffer();
int size = set.size();
int count = 0;
while(iter.hasNext()){
count ++;
Tuple2 tags = iter.next();
if(tags._1 != "" && tags._1 != null){
sb.append(tags._1 + ":" + tags._2.toString());
if(count != size){
sb.append("|");
}
}
}
return new Tuple2(tuple._1, sb.toString());
}
});
System.out.println(result.take(10));
sc.close();
}
}