Spark SQL Multi-Data-Source Join Example: Involving a Relational Database Source
1. Spark SQL can pull data from several kinds of sources: Hive, Kafka, RDDs, relational databases, and so on (a sketch of the different entry points follows this list).
2. To join a registered temp table against a relational-database table, the core idea is: load the Hive (or RDD) data and the relational data into separate DataFrames, register each as a temporary table, and then join them with SQL, as the full example below shows.
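Before the full example, here is a minimal sketch of how two of these entry points look in Spark 1.x. This sketch is not part of the article's program: the Hive table name, connection string, and JDBC table name are hypothetical, and it assumes a configured Hive metastore and an Oracle JDBC driver on the classpath.

    import java.util.Properties
    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.sql.hive.HiveContext

    object SourceSketch {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(
          new SparkConf().setAppName("SourceSketch").setMaster("local"))

        // A HiveContext reads tables registered in the Hive metastore
        val hiveContext = new HiveContext(sc)
        val hiveDF = hiveContext.sql("select * from some_hive_table") // hypothetical table

        // The generic DataFrameReader loads relational tables over JDBC
        val jdbcDF = hiveContext.read.jdbc(
          "jdbc:oracle:thin:@//localhost:1521/orcl", // hypothetical connection string
          "SOME_TABLE",                              // hypothetical table name
          new Properties())
      }
    }

Either DataFrame can then be registered with registerTempTable and queried alongside the other, which is exactly what the full example below does.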
import java.util.Properties

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

/**
 * Created by wuke on 2016/5/26.
 */
object MultiDataSource {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("MultiDataSource")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // Note: to use Spark SQL's built-in functions, the implicit
    // conversions under this SQLContext must be imported here.
    import sqlContext.implicits._

    // Build the user access log data and create a DataFrame from it.
    // Simulated user access log: comma-separated, first column is the
    // name, second column is the id.
    val userAccessLog = Array(
      "zhangsan1,23435",
      "zhangsan2,11123422",
      "zhangsan3,2341234",
      "zhangsan4,1805208126370281",
      "zhangsan,1820150000003959",
      "lisi,1817430219307158",
      "lisi,1234",
      "lisi,124124234",
      "lisi,4351")
    val userAccessLogRDD = sc.parallelize(userAccessLog, 5)

    // First, convert the plain RDD into an RDD whose elements are Rows
    val userAccessLogRowRDD = userAccessLogRDD
      .map { log => Row(log.split(",")(0), log.split(",")(1).toLong) }

    // Build the DataFrame's schema
    val structType = StructType(Array(
      StructField("name", StringType, true),
      StructField("mid", LongType, true)))

    // Create the DataFrame with the SQLContext and register it as a temp table
    val userAccessLogRowDF = sqlContext.createDataFrame(userAccessLogRowRDD, structType)
    userAccessLogRowDF.registerTempTable("person")

    val url = "jdbc:oracle:thin:nwd/123456@//localhost:1521/orcl"
    val prop = new Properties()

    // Create a JDBC DataFrame with the SQLContext and register it too
    val df = sqlContext.read.jdbc(url, "NWD_TRANSFER_PROJECT", prop)
    df.registerTempTable("NWD_TRANSFER_PROJECT")

    // Join the RDD-backed temp table with the relational table
    val pres = sqlContext.sql(
      "select p.name, n.MID, n.TITLE from person p " +
      "left join NWD_TRANSFER_PROJECT n on p.mid = n.MID")

    // Collect and print the result
    pres.collect().foreach(println)
  }
}
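One caveat on the JDBC call above: the empty Properties object works only because the Oracle user and password are embedded in the thin-driver URL. A common alternative is to pass the credentials (and, if the driver does not auto-register, the driver class) through the connection properties, and to push a projection down to Oracle as a subquery so the whole table is not shipped to Spark. A hedged sketch; the user, password, and driver class are assumptions read off the URL above:

    import java.util.Properties

    val prop = new Properties()
    prop.setProperty("user", "nwd")                         // assumed from the URL above
    prop.setProperty("password", "123456")                  // assumed from the URL above
    prop.setProperty("driver", "oracle.jdbc.OracleDriver")  // standard Oracle thin driver class

    // The table argument can be anything valid in a FROM clause, so a
    // parenthesized subquery with an alias lets Oracle trim columns
    // (or rows) before Spark ever sees them.
    val projected = sqlContext.read.jdbc(
      "jdbc:oracle:thin:@//localhost:1521/orcl",
      "(select MID, TITLE from NWD_TRANSFER_PROJECT) t",
      prop)
    projected.registerTempTable("NWD_TRANSFER_PROJECT")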
The rows returned at the end:
[zhangsan,1820150000003959.0000000000,测试数据1]
[zhangsan,1820150000003959.0000000000,测试数据2]
[zhangsan,1820150000003959.0000000000,测试数据3]
[zhangsan,1820150000003959.0000000000,测试数据4]
[zhangsan,1820150000003959.0000000000,测试数据5]
[zhangsan,1820150000003959.0000000000,测试数据6]
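Note how MID prints as 1820150000003959.0000000000 rather than a plain long: Spark maps the Oracle NUMBER column to a decimal type when reading over JDBC, so the right side of the join carries decimal precision. If integer output is preferred, the column can be cast in the join query itself. A minimal variation of the query above, assuming the values fit in a 64-bit bigint:

    val pres = sqlContext.sql(
      "select p.name, cast(n.MID as bigint) as mid, n.TITLE " +
      "from person p left join NWD_TRANSFER_PROJECT n on p.mid = n.MID")
    pres.collect().foreach(println)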