//表1
scala> val df1 = spark.createDataFrame(Seq(("aaa", 14, 1), ("bbb", 30, 2), ("ccc", 45, 3), ("bbb", 56, 4)) ).toDF("R1","R2","R3")
scala> df1.show
+---+---+---+
| R1| R2| R3|
+---+---+---+
|aaa| 14| 1|
|bbb| 30| 2|
|ccc| 45| 3|
|bbb| 56| 4|
+---+---+---+
//表2
scala> val df2 = spark.createDataFrame(Seq(("eee", 140, 1), ("fff", 300, 2), ("ccc", 450, 3), ("ggg", 560, 9)) ).toDF("R1","R2","R3")
scala> df2.show
+---+---+---+
| R1| R2| R3|
+---+---+---+
|eee|140| 1|
|fff|300| 2|
|ccc|450| 3|
|ggg|560| 9|
+---+---+---+
val data_left = df1.select(df1.columns.map(n => col(n).as("left_"+n)):_*) //修改表的列名,左表的列名为left_+原列名
scala> data_left.show
+-------+-------+-------+
|left_R1|left_R2|left_R3|
+-------+-------+-------+
| aaa| 14| 1|
| bbb| 30| 2|
| ccc| 45| 3|
| bbb| 56| 4|
+-------+-------+-------+
val data_right = df2.select(df2.columns.map(n => col(n).as("right_"+n)):_*) //修改表的列名,右表的列名为right_+原列名
scala> data_right.show
+--------+--------+--------+
|right_R1|right_R2|right_R3|
+--------+--------+--------+
| eee| 140| 1|
| fff| 300| 2|
| ccc| 450| 3|
| ggg| 560| 9|
+--------+--------+--------+
//内连接1:使用join,结果为左右两表的列拼接在一起
scala> val result = data_left.join(data_right, data_left("left_R3")===data_right("right_R3"),"inner")
scala> result.show
+-------+-------+-------+--------+--------+--------+
|left_R1|left_R2|left_R3|right_R1|right_R2|right_R3|
+-------+-------+-------+--------+--------+--------+
| aaa| 14| 1| eee| 140| 1|
| bbb| 30| 2| fff| 300| 2|
| ccc| 45| 3| ccc| 450| 3|
+-------+-------+-------+--------+--------+--------+
//去除重复列
scala> val result1=result.drop("right_R3")
//内连接2:使用joinWith,结果为由左表行和右表行组成的二元组(_1, _2)
scala> val result = data_left.joinWith(data_right, data_left("left_R3")===data_right("right_R3"),"inner")
result: org.apache.spark.sql.Dataset[(org.apache.spark.sql.Row, org.apache.spark.sql.Row)] = [_1: struct, _2: struct]
scala> result.show
+----------+-----------+
| _1| _2|
+----------+-----------+
|[aaa,14,1]|[eee,140,1]|
|[bbb,30,2]|[fff,300,2]|
|[ccc,45,3]|[ccc,450,3]|
+----------+-----------+
//全外连接:两表中未匹配的行都保留,缺失的一侧为null
scala> val result = data_left.join(data_right, data_left("left_R3")===data_right("right_R3"),"outer")
scala> result.show
+-------+-------+-------+--------+--------+--------+
|left_R1|left_R2|left_R3|right_R1|right_R2|right_R3|
+-------+-------+-------+--------+--------+--------+
| aaa| 14| 1| eee| 140| 1|
| ccc| 45| 3| ccc| 450| 3|
| null| null| null| ggg| 560| 9|
| bbb| 56| 4| null| null| null|
| bbb| 30| 2| fff| 300| 2|
+-------+-------+-------+--------+--------+--------+
//左外连接:保留左表的所有行,右表未匹配的列为null
scala> val result = data_left.join(data_right, data_left("left_R3")===data_right("right_R3"),"left_outer")
scala> result.show
+-------+-------+-------+--------+--------+--------+
|left_R1|left_R2|left_R3|right_R1|right_R2|right_R3|
+-------+-------+-------+--------+--------+--------+
| aaa| 14| 1| eee| 140| 1|
| bbb| 30| 2| fff| 300| 2|
| ccc| 45| 3| ccc| 450| 3|
| bbb| 56| 4| null| null| null|
+-------+-------+-------+--------+--------+--------+
//右外连接:保留右表的所有行,左表未匹配的列为null
scala> val result = data_left.join(data_right, data_left("left_R3")===data_right("right_R3"),"right_outer")
result: org.apache.spark.sql.DataFrame = [left_R1: string, left_R2: int ... 4 more fields]
scala> result.show
+-------+-------+-------+--------+--------+--------+
|left_R1|left_R2|left_R3|right_R1|right_R2|right_R3|
+-------+-------+-------+--------+--------+--------+
| aaa| 14| 1| eee| 140| 1|
| bbb| 30| 2| fff| 300| 2|
| ccc| 45| 3| ccc| 450| 3|
| null| null| null| ggg| 560| 9|
+-------+-------+-------+--------+--------+--------+
//左半连接:只返回左表中能在右表匹配上的行,且结果只包含左表的列
scala> val result = data_left.join(data_right, data_left("left_R3")===data_right("right_R3"),"leftsemi")
result: org.apache.spark.sql.DataFrame = [left_R1: string, left_R2: int ... 1 more field]
scala> result.show
+-------+-------+-------+
|left_R1|left_R2|left_R3|
+-------+-------+-------+
| aaa| 14| 1|
| bbb| 30| 2|
| ccc| 45| 3|
+-------+-------+-------+
//笛卡尔积(交叉连接):这里直接使用未改名的df1和df2,所以结果中列名重复
scala> df1.crossJoin(df2).show
+---+---+---+---+---+---+
| R1| R2| R3| R1| R2| R3|
+---+---+---+---+---+---+
|aaa| 14| 1|eee|140| 1|
|aaa| 14| 1|fff|300| 2|
|aaa| 14| 1|ccc|450| 3|
|aaa| 14| 1|ggg|560| 9|
|bbb| 30| 2|eee|140| 1|
|bbb| 30| 2|fff|300| 2|
|bbb| 30| 2|ccc|450| 3|
|bbb| 30| 2|ggg|560| 9|
|ccc| 45| 3|eee|140| 1|
|ccc| 45| 3|fff|300| 2|
|ccc| 45| 3|ccc|450| 3|
|ccc| 45| 3|ggg|560| 9|
|bbb| 56| 4|eee|140| 1|
|bbb| 56| 4|fff|300| 2|
|bbb| 56| 4|ccc|450| 3|
|bbb| 56| 4|ggg|560| 9|
+---+---+---+---+---+---+