In Spark's Java API, only a JavaPairRDD can take part in a join.
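For reference, join is a method of JavaPairRDD (not JavaRDD), pairing up values that share a key. Below is a minimal runnable sketch of just that point; the class name join_sketch and the sample tuples are only illustrative:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
import java.util.Arrays;
public class join_sketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setMaster("local").setAppName("join_sketch"));
        // Two pair RDDs keyed by an Integer id
        JavaPairRDD<Integer, String> left = sc.parallelizePairs(
                Arrays.asList(new Tuple2<>(1, "a"), new Tuple2<>(2, "b")));
        JavaPairRDD<Integer, String> right = sc.parallelizePairs(
                Arrays.asList(new Tuple2<>(1, "x")));
        // join exists only on the pair RDD; values sharing a key are combined,
        // so this prints [(1,(a,x))]
        System.out.println(left.join(right).collect());
        sc.stop();
    }
}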
-------------------------------------------------------------------- Scheme 1 ------------------------------------------------------------------------
The code is as follows:
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
import org.apache.log4j.Logger;
import java.util.Arrays;
import java.util.List;
public class java_join {
    // The POJO must be serializable so Spark can move Entity objects between tasks
    static class Entity implements java.io.Serializable {
        private String name;
        private Integer age;

        public Entity(String name, Integer age) { // constructor
            this.name = name;
            this.age = age;
        }

        public String getName() {
            return name;
        }

        public Integer getAge() {
            return age;
        }
    }

    //--------------------------------------------------------------------------------------------------
    public static void main(String[] args) {
        Logger.getLogger("org.apache.hadoop").setLevel(org.apache.log4j.Level.WARN);
        Logger.getLogger("org.apache.spark").setLevel(org.apache.log4j.Level.WARN);
        Logger.getLogger("org.project-spark").setLevel(org.apache.log4j.Level.WARN);

        String appName = "test";
        String master = "local[2]";
        String path = "hdfs://Desktop:9000/rdd3.csv";
        SparkConf conf = new SparkConf().setAppName(appName).setMaster(master)
                .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // keyBy puts the age first (as the key) and the whole Entity after (as the value)
        JavaPairRDD<Integer, Entity> pairRDD = sc.parallelize(Arrays.asList(
                new Entity("zhangsan", 11),
                new Entity("lisi", 11),
                new Entity("wangwu", 13)
        )).keyBy(Entity::getAge);

        JavaPairRDD<Integer, Entity> javaPairRDD = sc.textFile(path)
                .map(line -> {
                    String[] strings = line.split(",");
                    String name = strings[0];
                    Integer age = Integer.valueOf(strings[1]);
                    return new Entity(name, age);
                }).keyBy(Entity::getAge);

        System.out.println("--------------------------------------------------------");
        System.out.println(javaPairRDD.collect());

        JavaPairRDD<Integer, Tuple2<Entity, Entity>> joined = pairRDD.join(javaPairRDD);
        System.out.println("------------------------- join result -------------------------------");
        List<Tuple2<Integer, Tuple2<Entity, Entity>>> result = joined.collect();
        for (int i = 0; i < result.size(); i++) {
            System.out.print("List[");
            System.out.print(result.get(i)._1);
            System.out.print(",Tuple2(");
            System.out.print(result.get(i)._2._1.name);
            System.out.print(",");
            System.out.print(result.get(i)._2._2.name);
            System.out.println(")]");
        }
    }
}
The experimental result is:
List[11,Tuple2(zhangsan,zhangsan)]
List[11,Tuple2(zhangsan,lisi)]
List[11,Tuple2(lisi,zhangsan)]
List[11,Tuple2(lisi,lisi)]
The content of rdd3.csv is:
zhangsan,11
lisi,11
wangwu,14
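Both inputs have exactly two records with the key 11, so the inner join yields 2 × 2 = 4 pairs for that key; wangwu is dropped because its age differs between the two sides (13 in the parallelized list, 14 in rdd3.csv), so its key never matches. Note that keyBy(Entity::getAge) is just a shorthand for the mapToPair call used in the second scheme below: it emits (getAge(x), x) for every element x.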
---------------------- Scheme 2 --------------------------
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
public class sampling_salting {
    public static void main(String[] args) {
        Logger.getLogger("org.apache.hadoop").setLevel(org.apache.log4j.Level.WARN);
        Logger.getLogger("org.apache.spark").setLevel(org.apache.log4j.Level.WARN);
        Logger.getLogger("org.project-spark").setLevel(org.apache.log4j.Level.WARN);

        SparkConf conf = new SparkConf().setMaster("local").setAppName("join");
        JavaSparkContext sc = new JavaSparkContext(conf);
        String path1 = "hdfs://Desktop:9000/rdd1.csv";
        String path2 = "hdfs://Desktop:9000/rdd2.csv";

        // First pair RDD: built with an anonymous PairFunction
        JavaPairRDD<Integer, String> rdd1 = sc.textFile(path1)
                .mapToPair(new PairFunction<String, Integer, String>() {
                    @Override
                    public Tuple2<Integer, String> call(String s) throws Exception {
                        String[] strings = s.split(",");
                        Integer ids = Integer.valueOf(strings[0]);
                        String greet = strings[1];
                        return Tuple2.apply(ids, greet);
                    }
                });

        // Second pair RDD: built with a lambda
        JavaPairRDD<Integer, String> rdd2 = sc.textFile(path2)
                .mapToPair(line -> {
                    String[] strings = line.split(",");
                    Integer ids = Integer.valueOf(strings[0]);
                    String greet = strings[1];
                    return new Tuple2<>(ids, greet);
                });

        System.out.println(rdd1.collect());
        System.out.println(rdd2.collect());

        JavaPairRDD<Integer, Tuple2<String, String>> result = rdd1.join(rdd2);
        System.out.println(result.collect());
    }
}
In the code above, mapToPair builds the final JavaPairRDD, and there are two ways to construct the Tuple2 it returns:
return Tuple2.apply(ids,greet);
return new Tuple2<>(ids,greet);
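Both forms build the same scala.Tuple2: apply is the Scala companion-object factory, which Scala exposes to Java as a static method. A quick check with made-up values:
Tuple2<Integer, String> a = Tuple2.apply(1, "hello");
Tuple2<Integer, String> b = new Tuple2<>(1, "hello");
System.out.println(a.equals(b)); // prints true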
rdd1.csv
001,hello
001,hello
001,hello
001,hello
rdd2.csv
002,hello
002,hello
002,hello
002,hello
hdfs dfs -put rdd1.csv /
hdfs dfs -put rdd2.csv /
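Note that with these inputs the two RDDs never share a key: Integer.valueOf("001") parses to 1 on every rdd1 line and "002" to 2 on every rdd2 line, so rdd1.join(rdd2) is empty and result.collect() prints [].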
-------------------------------------------- A commonly used pom.xml for Java Spark projects --------------------------------------------------------------------------------
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>java_join</groupId>
    <artifactId>java_join</artifactId>
    <version>1.0-SNAPSHOT</version>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
        </plugins>
    </build>
    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.12</artifactId>
            <version>3.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.12</artifactId>
            <version>3.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.12</artifactId>
            <version>3.0.0</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_2.12</artifactId>
            <version>3.0.0</version>
            <scope>runtime</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-graphx_2.12</artifactId>
            <version>3.0.0</version>
        </dependency>
    </dependencies>
</project>
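A note on the scopes above: provided means the dependency is expected on the cluster's classpath at run time and is not bundled into your jar, while runtime means it is not needed at compile time but stays on the run-time classpath; drop the scopes if you want every module available both at compile time and in the packaged application.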