package cn.spark.study.core;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import com.google.common.collect.Sets;
import scala.Tuple2;
public class FindCommonFriends {
public static void main(String[] args){
SparkConf conf = new SparkConf().setAppName("FindCommonFriends");
JavaSparkContext jsc = new JavaSparkContext(conf);
if(args.length < 1){
System.out.println("err");
System.exit(1);
}
JavaRDD records = jsc.textFile(args[0]);
JavaPairRDD,Iterable> pairs = records.flatMapToPair(
new PairFlatMapFunction,Iterable>(){
private static final long serialVersionUID = 1L;
@Override
public Iterable, Iterable>> call(String s) throws Exception {
String[] tokens = s.split(",");
long person = Long.parseLong(tokens[0]);
String friendsAsString = tokens[1];
String[] friendsTokenized = friendsAsString.split(" ");
if(friendsTokenized.length == 1){
Tuple2 key = buildSortedTuple(person,Long.parseLong(friendsTokenized[0]));
return Arrays.asList(new Tuple2,Iterable>(key,new ArrayList()));
}
List friends = new ArrayList();
for(String r : friendsTokenized){
friends.add(Long.parseLong(r));
}
List,Iterable>> result = new ArrayList,Iterable>>();
for(Long l : friends){
Tuple2 key = buildSortedTuple(person,l);
result.add(new Tuple2,Iterable>(key,friends));
}
return result;
}
});
//debug1
List,Iterable>> debug1 = pairs.collect();
for(Tuple2,Iterable> r : debug1){
System.out.println("debug1 key = " + r._1 + "\t value = " + r._2);
}
JavaPairRDD, Iterable>> grouped = pairs.groupByKey();
//debug2
List, Iterable>>> debug2 = grouped.collect();
for(Tuple2, Iterable>> r : debug2){
System.out.println("debug2 key = " + r._1 + "\t value = " + r._2);
}
JavaPairRDD,Iterable> commonFriends = grouped.mapValues(new Function>,Iterable>(){
private static final long serialVersionUID = 1L;
@Override
public Iterable call(Iterable> s) throws Exception {
Map countCommon = new HashMap();
int size = 0;
for(Iterable r : s){
size++;
List list = iterableToList(r);
if((list == null) || (list.isEmpty())){
continue;
}
for(Long f : list){
Integer count = countCommon.get(f);
if(count == null){
countCommon.put(f, 1);
}
else{
countCommon.put(f, ++count);
}
}
}
List finalCommonFriends = new ArrayList();
for(Map.Entry entry : countCommon.entrySet()){
if(entry.getValue() == size){
finalCommonFriends.add(entry.getKey());
}
}
return finalCommonFriends;
}
});
//debug3
List,Iterable>> debug3 = commonFriends.collect();
for(Tuple2, Iterable> r : debug3){
System.out.println("debug3 commonFriends key = " + r._1 + "\t value = " + r._2);
}
System.out.println("===================================Two reduceByKey================================================");
JavaPairRDD,Iterable> commonFriendsReduce = pairs.reduceByKey(
new Function2,Iterable,Iterable>(){
private static final long serialVersionUID = 1L;
@Override
public Iterable call(Iterable a, Iterable b) throws Exception {
Set x = Sets.newHashSet(a);
Set intersection = new HashSet();
for(Long item : b){
if(x.contains(item)){
intersection.add(item);
}
}
return intersection;
}
});
Map, Iterable> commonFriendsMap = commonFriendsReduce.collectAsMap();
for(Entry, Iterable> r : commonFriendsMap.entrySet()){
System.out.println("Two commonFriendsReduce key = " + r.getKey() + "\t value = " + r.getValue());
}
}
//排序避免重复的主键
static Tuple2 buildSortedTuple(Long p1,Long p2){
if(p1 < p2){
return new Tuple2(p1,p2);
}
else{
return new Tuple2(p2,p1);
}
}
static List iterableToList(Iterable r) {
List l = new ArrayList();
for(Long e : r){
l.add(e);
}
return l;
}
}
测试数据:
100,200 300 400 500
200,100 300 400
300,100 200 400 500
400,100 200 300
500,100 300
600,100
运行脚本:
/usr/local/spark1.5/bin/spark-submit \
--class cn.spark.study.core.FindCommonFriends \
--num-executors 3 \
--driver-memory 100m \
--executor-memory 100m \
--executor-cores 3 \
/usr/local/spark-text/java/findCommonFriends/FindCommonFriends.jar hdfs://spark01:9000/commonFriends.txt
运行结果:
debug1 key = (100,200) value = [200, 300, 400, 500]
debug1 key = (100,300) value = [200, 300, 400, 500]
debug1 key = (100,400) value = [200, 300, 400, 500]
debug1 key = (100,500) value = [200, 300, 400, 500]
debug1 key = (100,200) value = [100, 300, 400]
debug1 key = (200,300) value = [100, 300, 400]
debug1 key = (200,400) value = [100, 300, 400]
debug1 key = (100,300) value = [100, 200, 400, 500]
debug1 key = (200,300) value = [100, 200, 400, 500]
debug1 key = (300,400) value = [100, 200, 400, 500]
debug1 key = (300,500) value = [100, 200, 400, 500]
debug1 key = (100,400) value = [100, 200, 300]
debug1 key = (200,400) value = [100, 200, 300]
debug1 key = (300,400) value = [100, 200, 300]
debug1 key = (100,500) value = [100, 300]
debug1 key = (300,500) value = [100, 300]
debug1 key = (100,600) value = []
debug2 key = (300,400) value = [[100, 200, 400, 500], [100, 200, 300]]
debug2 key = (100,200) value = [[200, 300, 400, 500], [100, 300, 400]]
debug2 key = (300,500) value = [[100, 200, 400, 500], [100, 300]]
debug2 key = (100,500) value = [[200, 300, 400, 500], [100, 300]]
debug2 key = (200,300) value = [[100, 300, 400], [100, 200, 400, 500]]
debug2 key = (100,600) value = [[]]
debug2 key = (100,300) value = [[200, 300, 400, 500], [100, 200, 400, 500]]
debug2 key = (200,400) value = [[100, 300, 400], [100, 200, 300]]
debug2 key = (100,400) value = [[200, 300, 400, 500], [100, 200, 300]]
debug3 commonFriends key = (300,400) value = [100, 200]
debug3 commonFriends key = (100,200) value = [400, 300]
debug3 commonFriends key = (300,500) value = [100]
debug3 commonFriends key = (100,500) value = [300]
debug3 commonFriends key = (200,300) value = [100, 400]
debug3 commonFriends key = (100,600) value = []
debug3 commonFriends key = (100,300) value = [200, 500, 400]
debug3 commonFriends key = (200,400) value = [100, 300]
debug3 commonFriends key = (100,400) value = [200, 300]
===================================Two reduceByKey================================================
Two commonFriendsReduce key = (300,400) value = [100, 200]
Two commonFriendsReduce key = (200,400) value = [100, 300]
Two commonFriendsReduce key = (100,600) value = []
Two commonFriendsReduce key = (200,300) value = [100, 400]
Two commonFriendsReduce key = (100,500) value = [300]
Two commonFriendsReduce key = (300,500) value = [100]
Two commonFriendsReduce key = (100,300) value = [200, 500, 400]
Two commonFriendsReduce key = (100,200) value = [400, 300]
Two commonFriendsReduce key = (100,400) value = [200, 300]