数据准备(用空格隔开)
Spark 100 Hadoop 65 Spark 99 Hadoop 61 Spark 195 Hadoop 60 Spark 98 Hadoop 69 Spark 91 Hadoop 98 Spark 88 Hadoop 99 Spark 68 Hadoop 60 Spark 79 Hadoop 97 Spark 69 Hadoop 96
代码编写
package com.imf.spark.sql import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.sql.hive.HiveContext object SparkSQLWindowFunctionOps { def main(args: Array[String]): Unit = { val conf = new SparkConf() conf.setAppName("SparkSQLWindowFunctionOps for scala") conf.setMaster("spark://master1:7077") val sc = new SparkContext(conf) val hiveContext = new HiveContext(sc); hiveContext.sql("use testdb")//使用hive中的testdb数据库 hiveContext.sql("drop table if exists scores") hiveContext.sql("create table if not exists scores(name String,score int) " +"ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' LINES TERMINATED BY '\\n'") hiveContext.sql("load data local inpath '/usr/local/sparkApps/SparkSQLWindowFunctionOps/TopNGroup.txt' INTO TABLE scores") /** * 使用子查询的方式完成目标数据的提取,在目标函数内幕使用窗口函数row_number来进行分组排序: * partition by :指定窗口函数分组的key * order by :分组后进行排序 */ val result = hiveContext.sql("select name,score " +" from ( " +" select name,score,row_number() over(partition by name order by score desc)rank from scores ) sub_scores" +" where rank <=4") result .show();//在Driver的控制台上打印出结果内容 //保存到hive数据仓库中 hiveContext.sql("drop table if exists sortedResultScores") result.saveAsTable("sortedResultScores") } }
调度脚本
/usr/local/spark/spark-1.6.0-bin-hadoop2.6/bin/spark-submit \
--class com.imf.spark.sql.SparkSQLWindowFunctionOps \
--files /usr/local/hive/apache-hive-1.2.1-bin/conf/hive-site.xml \
--master spark://master1:7077 \
/usr/local/sparkApps/SparkSQLWindowFunctionOps/SparkSQLWindowFunctionOps.jar
查看结果
hive> show tables; OK people peopleresult peoplescores scores sortedresultscores student student2 student3 student4 tbsogou tmp_pre_hour_seach_info Time taken: 0.395 seconds, Fetched: 11 row(s) hive> select * from scores; OK Spark 100 Hadoop 65 Spark 99 Hadoop 61 Spark 195 Hadoop 60 Spark 98 Hadoop 69 Spark 91 Hadoop 98 Spark 88 Hadoop 99 Spark 68 Hadoop 60 Spark 79 Hadoop 97 Spark 69 Hadoop 96 Time taken: 2.426 seconds, Fetched: 18 row(s) hive> select * from sortedresultscores; OK SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder". SLF4J: Defaulting to no-operation (NOP) logger implementation SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details. Spark 195 Spark 100 Spark 99 Spark 98 Hadoop 99 Hadoop 98 Hadoop 97 Hadoop 96 Time taken: 0.229 seconds, Fetched: 8 row(s)