In the Spark Shell, you can run Hive operations directly by creating a HiveContext.
1. Operating on an existing Hive table
[hadoop@hadoop bin]$ ./spark-shell
Spark assembly has been built with Hive, including Datanucleus jars on classpath
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 1.2.0
      /_/

Using Scala version 2.10.4 (Java HotSpot(TM) 64-Bit Server VM, Java 1.7.0_67)
Type in expressions to have them evaluated.
Type :help for more information.
Spark context available as sc.

//// Create a HiveContext
scala> val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc)
sqlContext: org.apache.spark.sql.hive.HiveContext = org.apache.spark.sql.hive.HiveContext@42503a9b

/// Switch databases; this is a native command executed locally by Hive and does not launch a distributed job
scala> sqlContext.sql("use default");
res1: org.apache.spark.sql.SchemaRDD = SchemaRDD[0] at RDD at SchemaRDD.scala:108
== Query Plan ==
<Native command: executed by Hive>

/// List the tables in the current database; this is a transformation-style call that produces a SchemaRDD but runs nothing yet
scala> sqlContext.sql("show tables");
res2: org.apache.spark.sql.SchemaRDD = SchemaRDD[2] at RDD at SchemaRDD.scala:108
== Query Plan ==
<Native command: executed by Hive>

//// Fetch the results
scala> sqlContext.sql("show tables").collect;
res3: Array[org.apache.spark.sql.Row] = Array([abc], [avro_table], [employees], [invites], [my_word], [mytable1], [parquet_table], [table1], [word], [word3], [word4], [word5], [word6])

/// Actually execute the database switch
scala> sqlContext.sql("use default").collect;
res4: Array[org.apache.spark.sql.Row] = Array()

//// Query data from word6, a Hive table; this is a transformation
scala> sqlContext.sql("select * from word6")
res5: org.apache.spark.sql.SchemaRDD = SchemaRDD[8] at RDD at SchemaRDD.scala:108
== Query Plan ==
== Physical Plan ==
 HiveTableScan [id#20,word#21], (MetastoreRelation default, word6, None), None

//// Query data from the Hive table word6, actually running the query
scala> sqlContext.sql("select * from word6").collect
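For reference, the same flow can be packaged as a standalone application instead of being typed into the shell. The following is a minimal sketch for a Spark 1.2-era build with Hive support; the object name SparkHiveDemo is illustrative, and it assumes a hive-site.xml is on the classpath so the HiveContext can reach the metastore.

// A minimal standalone sketch of the shell session above (assumptions noted in the lead-in)
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

object SparkHiveDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SparkHiveDemo")
    val sc   = new SparkContext(conf)
    // HiveContext reads hive-site.xml to locate the Hive metastore
    val sqlContext = new HiveContext(sc)

    sqlContext.sql("use default")                        // native Hive command, runs locally
    val tables = sqlContext.sql("show tables").collect() // collect() triggers execution
    tables.foreach(println)

    // select on a Hive table builds a SchemaRDD lazily;
    // collect() runs the distributed HiveTableScan
    val rows = sqlContext.sql("select * from word6").collect()
    rows.foreach(println)

    sc.stop()
  }
}

The key point is the same as in the shell session: sql() only builds a SchemaRDD with a query plan, and nothing is executed on the cluster until an action such as collect() is called.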