First, prepare the data for the two tables in a plain-text editor (Notepad will do):
customers.txt:
1,smith,12
2,bob,13
3,alex,14
4,alice,15
6,mike,26
orders.txt:
1,t001,100.9,1
2,t002,100.3,1
3,t001,100.6,2
4,t003,100.7,2
5,t001,100.1,2
6,t003,100.1,3
7,t002,100.1,3
8,t001,100.1,3
9,t002,100.1,3
10,t002,129.1,5
Upload both files to the Hadoop cluster:
[hadoop]> hdfs dfs -put customers.txt /user/hadoop/data
[hadoop]> hdfs dfs -put orders.txt /user/hadoop/data
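If the target directory does not exist yet, create it first with hdfs dfs -mkdir -p. An optional sanity check that the upload worked:
[hadoop]> hdfs dfs -ls /user/hadoop/data
[hadoop]> hdfs dfs -cat /user/hadoop/data/customers.txt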
For the details of setting up the Spark SQL environment, see the previous post on my blog:
Spark SQL code examples in Scala and Java: https://blog.csdn.net/nengyu/article/details/95870479
Write the following code in IDEA:
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.*;

/**
 * Left outer join of two tables with Spark SQL.
 */
public class SparkSQLDemoJava {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("sparkSQL")
                .master("local")
                .enableHiveSupport()
                .getOrCreate();
        // Create the Java Spark context
        JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

        // customers: parse each line into a Row(id, name, age)
        JavaRDD<String> c_rdd1 = sc.textFile("/user/hadoop/data/customers.txt");
        JavaRDD<Row> c_rdd2 = c_rdd1.map(new Function<String, Row>() {
            public Row call(String line) throws Exception {
                String[] arr = line.split(",");
                Integer id = Integer.parseInt(arr[0]);
                String name = arr[1];
                Integer age = Integer.parseInt(arr[2]);
                return RowFactory.create(id, name, age);
            }
        });
        StructField[] c_fields = new StructField[3];
        c_fields[0] = new StructField("id", DataTypes.IntegerType, false, Metadata.empty());
        c_fields[1] = new StructField("name", DataTypes.StringType, true, Metadata.empty());
        c_fields[2] = new StructField("age", DataTypes.IntegerType, true, Metadata.empty());
        StructType c_type = new StructType(c_fields);
        Dataset<Row> c_df1 = spark.createDataFrame(c_rdd2, c_type);
        // Register a temporary view
        c_df1.createOrReplaceTempView("_cust");

        // orders: parse each line into a Row(id, orderno, price, cid)
        JavaRDD<String> o_rdd1 = sc.textFile("/user/hadoop/data/orders.txt");
        JavaRDD<Row> o_rdd2 = o_rdd1.map(new Function<String, Row>() {
            public Row call(String line) throws Exception {
                String[] arr = line.split(",");
                Integer id = Integer.parseInt(arr[0]);
                String orderno = arr[1];
                Float price = Float.parseFloat(arr[2]);
                Integer cid = Integer.parseInt(arr[3]);
                return RowFactory.create(id, orderno, price, cid);
            }
        });
        StructField[] o_fields = new StructField[4];
        o_fields[0] = new StructField("id", DataTypes.IntegerType, false, Metadata.empty());
        o_fields[1] = new StructField("orderno", DataTypes.StringType, true, Metadata.empty());
        o_fields[2] = new StructField("price", DataTypes.FloatType, true, Metadata.empty());
        o_fields[3] = new StructField("cid", DataTypes.IntegerType, false, Metadata.empty());
        StructType o_type = new StructType(o_fields);
        Dataset<Row> o_df1 = spark.createDataFrame(o_rdd2, o_type);
        // Register a temporary view
        o_df1.createOrReplaceTempView("_order");

        spark.sql("select * from _cust").show();
        spark.sql("select * from _order").show();

        // Left outer join: every customer, with the summed order price (0 if no orders)
        String sql = "select c.id,c.name,ifnull(o._sum,0) total_price "
                + "from _cust c left outer join "
                + "(select cid,sum(price) _sum from _order group by cid) o "
                + "on c.id=o.cid";
        spark.sql(sql).show(1000, false);
    }
}
The output looks like this:
+---+-----+---+
| id| name|age|
+---+-----+---+
| 1|smith| 12|
| 2| bob| 13|
| 3| alex| 14|
| 4|alice| 15|
| 6| mike| 26|
+---+-----+---+
+---+-------+-----+---+
| id|orderno|price|cid|
+---+-------+-----+---+
| 1| t001|100.9| 1|
| 2| t002|100.3| 1|
| 3| t001|100.6| 2|
| 4| t003|100.7| 2|
| 5| t001|100.1| 2|
| 6| t003|100.1| 3|
| 7| t002|100.1| 3|
| 8| t001|100.1| 3|
| 9| t002|100.1| 3|
| 10| t002|129.1| 5|
+---+-------+-----+---+
+---+-----+------------------+
|id |name |total_price |
+---+-----+------------------+
|1 |smith|201.20000457763672|
|6 |mike |0.0 |
|3 |alex |400.3999938964844 |
|4 |alice|0.0 |
|2 |bob |301.3999938964844 |
+---+-----+------------------+
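The trailing digits in total_price (201.20000457763672 and so on) are a side effect of summing FloatType values; declaring price as DoubleType or DecimalType, or rounding the sum, gives a cleaner column. For comparison, the same left outer join can also be expressed with the DataFrame API instead of a SQL string over temporary views. This is only a minimal sketch built on the c_df1 and o_df1 datasets from the code above, with round() added to tidy the float sums:

// Add to the imports at the top of the file:
import static org.apache.spark.sql.functions.*;

// Sum the order prices per customer id, rounded to 2 decimals.
Dataset<Row> sums = o_df1.groupBy("cid")
        .agg(round(sum("price"), 2).alias("total_price"));
// Left outer join onto the customers and replace the nulls produced for
// customers without orders (alice, mike) with 0.
Dataset<Row> joined = c_df1
        .join(sums, c_df1.col("id").equalTo(sums.col("cid")), "left_outer")
        .na().fill(0.0, new String[]{"total_price"})
        .select("id", "name", "total_price");
joined.show(1000, false);

Either form produces one row per customer, including the customers that have no matching orders, which is exactly what the left outer join is for.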