Spark SQL introduces a new resilient distributed dataset, the SchemaRDD. A SchemaRDD consists of Row objects plus a schema describing the data type of each column, so it closely resembles a table in a traditional database. A SchemaRDD can be built from an existing RDD, from a Parquet file, from a JSON file, or by querying Hive data with HiveQL. Besides supporting all the usual RDD operations, a SchemaRDD can be registered as a temporary table with registerTempTable and then queried with SQL statements. Spark SQL 1.1 provides two query entry points: SQLContext and HiveContext. HiveContext extends SQLContext, so on top of everything SQLContext offers it also adds support for Hive.
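As a minimal sketch of that dual nature (running locally; the Person class and the sample data are illustrative, not part of the program below), the same SchemaRDD can be processed with ordinary RDD operations and, once registered as a temp table, queried with SQL:

import org.apache.spark.SparkContext
import org.apache.spark.sql.{SQLContext, SchemaRDD}

case class Person(name: String, age: Int)

object SchemaRDDSketch {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "SchemaRDDSketch")
    val sqlContext = new SQLContext(sc)

    // Convert an RDD of case-class objects into a SchemaRDD
    val people: SchemaRDD =
      sqlContext.createSchemaRDD(sc.parallelize(Seq(Person("zhangsan", 18), Person("lisi", 19))))

    // RDD-style: a SchemaRDD is still an RDD of Row objects
    people.map(row => row(0) + ": " + row(1)).collect().foreach(println)

    // SQL-style: register it as a temp table and query it with SQL
    people.registerTempTable("people")
    sqlContext.sql("select name from people where age > 18").collect().foreach(println)

    sc.stop()
  }
}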
Note the following when working with Hive from Spark:
1. Check that datanucleus-api-jdo-3.2.1.jar, datanucleus-rdbms-3.2.1.jar, and datanucleus-core-3.2.2.jar are present under $SPARK_HOME/lib.
2. Check that hive-site.xml has been copied from $HIVE_HOME/conf into $SPARK_HOME/conf.
3. When submitting the application, add the metastore database's JDBC driver jar to the driver classpath, e.g. bin/spark-submit --driver-class-path *.jar, or set SPARK_CLASSPATH in spark-env.sh; see the sample command after this list.
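For example, assuming a MySQL-backed Hive metastore (the connector jar path and the application jar name below are illustrative), the submit command and the spark-env.sh alternative would look roughly like this:

bin/spark-submit --driver-class-path /usr/local/hive/lib/mysql-connector-java-5.1.31.jar \
  --class SparkSQL --master spark://centos.host1:7077 sparksql-example.jar

# or, in $SPARK_HOME/conf/spark-env.sh
export SPARK_CLASSPATH=$SPARK_CLASSPATH:/usr/local/hive/lib/mysql-connector-java-5.1.31.jar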
Basic Spark SQL operations:
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.catalyst.types.StructType
import org.apache.spark.sql.catalyst.types.StructField
import org.apache.spark.sql.catalyst.types.StringType
import org.apache.spark.sql.catalyst.expressions.Row
import org.apache.spark.sql.SchemaRDD
import org.apache.spark.sql.hive.HiveContext

case class User(id: String, name: String, age: String)

object SparkSQL {
  def main(args: Array[String]) {
    val sc = new SparkContext("spark://centos.host1:7077", "SparkSQL")
    val sqlContext = new SQLContext(sc)

    // Explicit Apply Schema: build the schema from a field-name string
    val schemaTxt = "id name age"
    val schema1 = StructType(schemaTxt.split(" ").map(fieldName => StructField(fieldName, StringType, true)))
    val usersRDD1 = sc.textFile("/user/hadoop/data/temp/user.txt")
      .map(_.split(" ")).map(u => Row(u(0).trim(), u(1).trim(), u(2).trim()))
    val users1 = sqlContext.applySchema(usersRDD1, schema1)
    users1.registerTempTable("user1")
    sqlContext.sql("select * from user1").map(u => "User1 Name: " + u(1)).collect().foreach(println)

    // Explicit Apply Schema: build the schema field by field
    val structFields = new Array[StructField](3)
    structFields.update(0, StructField("id", StringType, true))
    structFields.update(1, StructField("name", StringType, true))
    structFields.update(2, StructField("age", StringType, true))
    val schema2 = StructType(structFields)
    val usersRDD2 = sc.textFile("/user/hadoop/data/temp/user.txt")
      .map(_.split(" ")).map(u => Row(u(0).trim(), u(1).trim(), u(2).trim()))
    val users2 = sqlContext.applySchema(usersRDD2, schema2)
    users2.registerTempTable("user2")
    sqlContext.sql("select * from user2").map(u => "User2 Name: " + u(1)).collect().foreach(println)

    // Implicit Convert Schema: an RDD of case-class objects is converted to a SchemaRDD
    import sqlContext.createSchemaRDD
    val usersRDD3 = sc.textFile("/user/hadoop/data/temp/user.txt")
      .map(line => line.split(" ")).map(data => User(data(0).trim(), data(1).trim(), data(2).trim()))
    usersRDD3.registerTempTable("user3")
    sqlContext.sql("select * from user3").map(u => "User3 Name: " + u(1)).collect().foreach(println)

    // Parquet File
    users1.saveAsParquetFile("/user/hadoop/data/temp/user.parquet")
    val usersRDD4 = sqlContext.parquetFile("/user/hadoop/data/temp/user.parquet")
    usersRDD4.registerTempTable("user4")
    sqlContext.sql("select * from user4").map(u => "User4 Name: " + u(1)).collect().foreach(println)

    // JSON File, one object per line, e.g.:
    // {"id":"1","name":"zhangsan","age":"18"}
    // {"id":"2","name":"lisi","age":"19"}
    // {"id":"3","name":"wangwu","age":"17"}
    // note: jsonFile infers the schema from the data, so the column order may differ from the text/Parquet examples
    val usersRDD5 = sqlContext.jsonFile("/user/hadoop/data/temp/user.json")
    usersRDD5.registerTempTable("user5")
    sqlContext.sql("select * from user5").map(u => "User5 Name: " + u(1)).collect().foreach(println)

    // Query Operation
    sqlContext.sql("select name,age from user5").map {
      case Row(name: String, age: String) => s"name: $name, age: $age"
    }.collect.foreach(println)
    sqlContext.sql("select name,age from user5").map { data =>
      s"name: ${data(0)}, age: ${data(1)}"
    }.collect.foreach(println)

    import sqlContext._
    val count = sqlContext.sql("select count(*) from user4").collect().head.getLong(0)
    println(s"User4 Count: $count")

    // DSL
    usersRDD4.where('age > 18).orderBy('id.asc).collect().foreach(println)
    usersRDD4.where('age > 18).orderBy('id.asc).select('name as 'n).collect().foreach(println)
    usersRDD4.where('age > 18).orderBy('id.asc).select('name as 'n, 'age as 'a).collect().foreach(println)

    // Hive
    val hiveContext = new HiveContext(sc)
    hiveContext.sql("use hive")
    hiveContext.sql("show tables").collect().foreach(println)
    hiveContext.hql("select id,name from a").insertInto("b")

    sc.stop()
  }
}
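For reference, the text-file examples above expect /user/hadoop/data/temp/user.txt to contain one space-separated record per line; a sample matching the JSON data in the code (contents illustrative) would be:

1 zhangsan 18
2 lisi 19
3 wangwu 17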