Spark SQL Example: Joining Multiple Data Sources, Including a Relational Database
1. Spark SQL can read from a variety of sources: Hive, Kafka, RDDs, relational databases, and so on.
2. To join a registered temporary table against a relational-database table, the core idea is: load the Hive (or RDD) data and the relational data into separate DataFrames, then register each one as a temporary table and join them in SQL. The pattern is sketched right below, and the complete listing follows.
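In outline, it looks like this (table names are illustrative; Spark 1.x API, matching the listing below):

val hiveDF = sqlContext.table("some_hive_table") // or build a DataFrame from an RDD, as below
hiveDF.registerTempTable("t1")
val jdbcDF = sqlContext.read.jdbc(jdbcUrl, "SOME_DB_TABLE", new java.util.Properties())
jdbcDF.registerTempTable("t2")
val joined = sqlContext.sql("select ... from t1 join t2 on t1.key = t2.key")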
import java.util.Properties
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
/**
* Created by wuke on 2016/5/26.
*/
object MultiDataSource {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("MultiDataSource")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    // Note: import the implicit conversions under SQLContext here; they enable
    // implicit RDD-to-DataFrame conversions and the $"column" syntax.
    import sqlContext.implicits._
    // Build some user-access-log data and create a DataFrame from it.
    // Each simulated log line is comma-separated: first column is the name, second is the id.
    val userAccessLog = Array(
      "zhangsan1,23435",
      "zhangsan2,11123422",
      "zhangsan3,2341234",
      "zhangsan4,1805208126370281",
      "zhangsan,1820150000003959",
      "lisi,1817430219307158",
      "lisi,1234",
      "lisi,124124234",
      "lisi,4351")
    val userAccessLogRDD = sc.parallelize(userAccessLog, 5)
    // First, convert the plain RDD into an RDD whose elements are Rows.
    val userAccessLogRowRDD = userAccessLogRDD.map { log =>
      val fields = log.split(",")
      Row(fields(0), fields(1).toLong)
    }
    // Define the DataFrame's schema.
    val structType = StructType(Array(
      StructField("name", StringType, true),
      StructField("mid", LongType, true)))
    // Create the DataFrame with the SQLContext.
    val userAccessLogRowDF = sqlContext.createDataFrame(userAccessLogRowRDD, structType)
    userAccessLogRowDF.registerTempTable("person")
    val url = "jdbc:oracle:thin:nwd/123456@//localhost:1521/orcl"
    val prop = new Properties()
    // Create a JDBC-backed DataFrame with the SQLContext.
    val df = sqlContext.read.jdbc(url, "NWD_TRANSFER_PROJECT", prop)
    df.registerTempTable("NWD_TRANSFER_PROJECT")
    // Join the temp table registered from the RDD with the relational-database table.
    val pres = sqlContext.sql("select p.name, n.MID, n.TITLE from person p left join NWD_TRANSFER_PROJECT n on p.mid = n.MID")
    // Collect and print the result.
    pres.collect().foreach(println)
  }
}
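One variation worth noting: the Oracle thin URL above embeds the credentials (nwd/123456). They can instead be passed through the Properties object, which also lets you name the driver class explicitly. A minimal sketch, assuming the Oracle JDBC driver is on the classpath (connection details are illustrative):

val url = "jdbc:oracle:thin:@//localhost:1521/orcl"
val prop = new java.util.Properties()
prop.setProperty("user", "nwd")        // illustrative credentials
prop.setProperty("password", "123456")
prop.setProperty("driver", "oracle.jdbc.OracleDriver") // driver class for Spark to load
val df = sqlContext.read.jdbc(url, "NWD_TRANSFER_PROJECT", prop)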
The data returned at the end:
[zhangsan,1820150000003959.0000000000,测试数据1]
[zhangsan,1820150000003959.0000000000,测试数据2]
[zhangsan,1820150000003959.0000000000,测试数据3]
[zhangsan,1820150000003959.0000000000,测试数据4]
[zhangsan,1820150000003959.0000000000,测试数据5]
[zhangsan,1820150000003959.0000000000,测试数据6]
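Note that MID prints as 1820150000003959.0000000000 rather than a plain long: Spark's JDBC reader maps Oracle NUMBER columns to a decimal type, so the database side of the join carries a decimal scale. If an integral value is wanted, cast it in the query; a minimal sketch against the same tables:

val pres = sqlContext.sql(
  "select p.name, cast(n.MID as bigint) as MID, n.TITLE " +
  "from person p left join NWD_TRANSFER_PROJECT n on p.mid = n.MID")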