Configure the pom.xml file
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>Spark3.0</artifactId>
        <groupId>org.example</groupId>
        <version>1.0-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>
    <artifactId>spark-core</artifactId>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.12</artifactId>
            <version>3.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.12</artifactId>
            <version>3.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.12</artifactId>
            <version>3.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>1.2.1</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.27</version>
        </dependency>
    </dependencies>
</project>
Connect to Hive on the Hadoop cluster
package com.bigdata.SparkSQL

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

/**
 * @author wangbo
 * @version 1.0
 */
/**
 * Environment test
 */
object Spark02_SparkSQL_Hive_demo {
  def main(args: Array[String]): Unit = {
    // TODO Create the SparkSQL runtime environment
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("sparkSQL")
    val sparkSession: SparkSession = SparkSession.builder().enableHiveSupport().config(sparkConf).getOrCreate()

    // Use SparkSQL to connect to an external Hive
    // The cluster must be running first
    // 1. Copy hive-site.xml to the classpath (or set the metastore URI in code; see the sketch after this block)
    // 2. Enable Hive support (enableHiveSupport)
    // 3. Add the required dependencies (including the MySQL driver)
    sparkSession.sql("show tables").show()

    // TODO Close the environment
    sparkSession.close()
  }
}
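If copying hive-site.xml to the classpath is inconvenient, the metastore address can also be passed in code. A minimal sketch, assuming the Hive metastore service is running at thrift://hadoop100:9083 (the host and port are assumptions; use the value from your own hive-site.xml):

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object Spark02_SparkSQL_Hive_config {
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("sparkSQL")
    // Assumed metastore address; replace with your cluster's hive.metastore.uris value
    val sparkSession: SparkSession = SparkSession.builder()
      .config(sparkConf)
      .config("hive.metastore.uris", "thrift://hadoop100:9083")
      .enableHiveSupport()
      .getOrCreate()

    sparkSession.sql("show databases").show()
    sparkSession.close()
  }
}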
If you hit an error like this:
Caused by: org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.security.AccessControlException): Permission denied: user=anonymous, access=EXECUTE, inode="/tmp":root:supergroup:drwx------
Solution:
This means your client user does not have enough permission on an HDFS directory used by Hive. Grant permissions on the HDFS directory that stores the database (or on the directory named in the error), for example: [root@hadoop100 ~]# hadoop dfs -chmod 777 /user/hive/warehouse/spark_demo.db
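The error also shows that the client connects as user=anonymous. Besides relaxing the HDFS permissions, another common workaround is to set the HDFS user before the SparkSession is built. A minimal sketch, assuming the warehouse directories are owned by root (use whichever user applies on your cluster):

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object Spark02_SparkSQL_Hive_asRoot {
  def main(args: Array[String]): Unit = {
    // Must run before the SparkSession (and its Hadoop client) is created.
    // "root" is an assumption; use the user that owns the warehouse directory.
    System.setProperty("HADOOP_USER_NAME", "root")

    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("sparkSQL")
    val sparkSession: SparkSession = SparkSession.builder().enableHiveSupport().config(sparkConf).getOrCreate()
    sparkSession.sql("show tables").show()
    sparkSession.close()
  }
}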
Data files:
Link: https://pan.baidu.com/s/1t9hxa3dXF9gNRZJtxosWtQ
Extraction code: x523
package com.bigdata.SparkSQL

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

/**
 * @author wangbo
 * @version 1.0
 */
/**
 * First create the database in Hive, then grant permissions on the database directory in HDFS,
 * e.g.: hadoop dfs -chmod 777 /user/hive/warehouse/spark_demo.db
 * Data preparation: switch to the database, create the tables, load the data
 */
object Spark02_SparkSQL_Hive_demo1 {
  def main(args: Array[String]): Unit = {
    // TODO Create the SparkSQL runtime environment
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("sparkSQL")
    val sparkSession: SparkSession = SparkSession.builder().enableHiveSupport().config(sparkConf).getOrCreate()

    // Switch to the database
    sparkSession.sql("use spark_demo")

    // TODO Prepare the data: create the tables
    // User behavior table
    sparkSession.sql(
      """
        |CREATE TABLE `user_visit_action`(
        |  `date` string,
        |  `user_id` bigint,
        |  `session_id` string,
        |  `page_id` bigint,
        |  `action_time` string,
        |  `search_keyword` string,
        |  `click_category_id` bigint,
        |  `click_product_id` bigint,
        |  `order_category_ids` string,
        |  `order_product_ids` string,
        |  `pay_category_ids` string,
        |  `pay_product_ids` string,
        |  `city_id` bigint)
        |row format delimited fields terminated by '\t'
      """.stripMargin)
    sparkSession.sql(
      """
        |load data local inpath 'datas/user_visit_action1.txt' into table spark_demo.user_visit_action
        |""".stripMargin)

    // Product table
    sparkSession.sql(
      """
        |CREATE TABLE `product_info`(
        |  `product_id` bigint,
        |  `product_name` string,
        |  `extend_info` string)
        |row format delimited fields terminated by '\t'
        |""".stripMargin)
    sparkSession.sql(
      """
        |load data local inpath 'datas/product_info.txt' into table spark_demo.product_info
        |""".stripMargin)

    // City table
    sparkSession.sql(
      """
        |CREATE TABLE `city_info`(
        |  `city_id` bigint,
        |  `city_name` string,
        |  `area` string)
        |row format delimited fields terminated by '\t'
        |""".stripMargin)
    sparkSession.sql(
      """
        |load data local inpath 'datas/city_info.txt' into table spark_demo.city_info
        |""".stripMargin)

    sparkSession.sql("show tables").show()

    // TODO Close the environment
    sparkSession.close()
  }
}
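To confirm the three tables were created and loaded, a quick row-count check can be run afterwards. A minimal sketch reusing the same setup (table and database names follow the DDL above):

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object Spark02_SparkSQL_Hive_check {
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("sparkSQL")
    val sparkSession: SparkSession = SparkSession.builder().enableHiveSupport().config(sparkConf).getOrCreate()

    // Row counts for the three tables created by the demo above
    Seq("user_visit_action", "product_info", "city_info").foreach { table =>
      sparkSession.sql(s"select count(*) as cnt from spark_demo.$table").show()
    }

    sparkSession.close()
  }
}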
package com.bigdata.SparkSQL

import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession, functions}
import org.apache.spark.sql.expressions.Aggregator

import scala.collection.mutable
import scala.collection.mutable.ListBuffer

/**
 * @author wangbo
 * @version 1.0
 */
/**
 * Query the tables
 */
object Spark02_SparkSQL_Hive_demo2 {
  def main(args: Array[String]): Unit = {
    // TODO Create the SparkSQL runtime environment
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("sparkSQL")
    val sparkSession: SparkSession = SparkSession.builder().enableHiveSupport().config(sparkConf).getOrCreate()

    // Switch to the database
    sparkSession.sql("use spark_demo")

    // Query the base data
    sparkSession.sql(
      """
        | select
        |   a.*,
        |   p.product_name,
        |   c.area,
        |   c.city_name
        | from user_visit_action a
        | join product_info p on a.click_product_id = p.product_id
        | join city_info c on a.city_id = c.city_id
        | where a.click_product_id > -1
        |""".stripMargin).createOrReplaceTempView("t1") // store the query result in a temporary view t1

    // Aggregate by area and product
    sparkSession.udf.register("cityRemark", functions.udaf(new cityRemarkUDAF()))
    sparkSession.sql(
      """
        | select
        |   area,
        |   product_name,
        |   count(*) as clickCnt,
        |   cityRemark(city_name) as city_remark
        | from t1 group by area, product_name
        |""".stripMargin).createOrReplaceTempView("t2")

    // Rank by click count within each area
    sparkSession.sql(
      """
        | select
        |   *,
        |   rank() over(partition by area order by clickCnt desc) as rank
        | from t2
        |""".stripMargin).createOrReplaceTempView("t3")

    // Take the top three
    sparkSession.sql(
      """
        | select
        |   *
        | from t3 where rank <= 3
        |""".stripMargin).show(false) // false shows full column values; without it, long values are truncated

    // TODO Close the environment
    sparkSession.close()
  }

  /*
    Custom aggregate function: build the city remark
    1. Define a class extending org.apache.spark.sql.expressions.Aggregator
       with type parameters
       IN:  input type  - the city name
       BUF: buffer type (a case class) - [total click count, Map[(city, cnt), (city, cnt)]]
       OUT: output type - the remark string
    2. Override the methods
  */
  case class Buffer(var total: Long, var cityMap: mutable.Map[String, Long])

  class cityRemarkUDAF extends Aggregator[String, Buffer, String] {
    // Initial value: initialize the buffer
    override def zero: Buffer = {
      Buffer(0, mutable.Map[String, Long]())
    }

    // Update the buffer with one input row
    override def reduce(buff: Buffer, city: String): Buffer = {
      buff.total += 1
      val newCount = buff.cityMap.getOrElse(city, 0L) + 1 // current count for the city, or 0 if absent, plus 1
      buff.cityMap.update(city, newCount) // write it back to the buffer
      buff
    }

    // Merge two buffers
    override def merge(buff1: Buffer, buff2: Buffer): Buffer = {
      buff1.total += buff2.total // merge the click counts
      val map1: mutable.Map[String, Long] = buff1.cityMap
      val map2: mutable.Map[String, Long] = buff2.cityMap

      // Option 1: merge the two maps with foldLeft
      // buff1.cityMap = map1.foldLeft(map2) {
      //   case (map, (city, count)) => { // key: city, value: count
      //     val newCount = map.getOrElse(city, 0L) + count
      //     map.update(city, newCount)
      //     map
      //   }
      // }
      // buff1

      // Option 2: merge the two maps with foreach
      map2.foreach {
        case (city, count) => {
          val newCount = map1.getOrElse(city, 0L) + count
          map1.update(city, newCount)
        }
      }
      buff1.cityMap = map1
      buff1
    }

    // Turn the aggregated buffer into the remark string
    override def finish(buff: Buffer): String = {
      val remarkList: ListBuffer[String] = ListBuffer[String]()
      val totalCount: Long = buff.total // total click count
      val cityMap: mutable.Map[String, Long] = buff.cityMap

      // Sort in descending order and take the top two
      val cityCountList: List[(String, Long)] = cityMap.toList.sortWith( // a List can be sorted
        (left, right) => { // compare the counts of two (city, count) entries
          left._2 > right._2
        }
      ).take(2)

      // Check whether there are more than two cities
      val bool: Boolean = cityMap.size > 2
      var rsum = 0L
      cityCountList.foreach {
        case (city, count) => { // city name and its click count
          val r = count * 100 / totalCount // the city's share; multiplying by 100 before dividing keeps whole percentages
          remarkList.append(s"${city} ${r}%")
          rsum += r
        }
      }
      if (bool) {
        remarkList.append(s"其他 ${100 - rsum}%") // "其他" = "Other"; percent sign added for consistency with the entries above
      }
      remarkList.mkString(",")
    }

    // Buffer encoder: use Encoders.product for a custom case class; for a built-in type such as Long use Encoders.scalaLong
    override def bufferEncoder: Encoder[Buffer] = Encoders.product

    // Output encoder: Encoders.STRING for a String result
    override def outputEncoder: Encoder[String] = Encoders.STRING
  }
}
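The aggregator can also be exercised without Hive, which makes it easy to check the remark format. Below is a minimal sketch with hypothetical in-memory data (the area, product, and city values are made up); it reuses cityRemarkUDAF from the object above:

package com.bigdata.SparkSQL

import org.apache.spark.SparkConf
import org.apache.spark.sql.{SparkSession, functions}

object Spark02_SparkSQL_Hive_demo2_localTest {
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("cityRemarkTest")
    val sparkSession: SparkSession = SparkSession.builder().config(sparkConf).getOrCreate()
    import sparkSession.implicits._

    // Hypothetical sample data: (area, product_name, city_name)
    Seq(
      ("East", "ProductA", "Shanghai"), ("East", "ProductA", "Shanghai"),
      ("East", "ProductA", "Hangzhou"), ("East", "ProductA", "Nanjing")
    ).toDF("area", "product_name", "city_name").createOrReplaceTempView("t1")

    // Register the same aggregator defined in Spark02_SparkSQL_Hive_demo2
    sparkSession.udf.register("cityRemark",
      functions.udaf(new Spark02_SparkSQL_Hive_demo2.cityRemarkUDAF()))

    sparkSession.sql(
      """
        | select area, product_name, count(*) as clickCnt, cityRemark(city_name) as city_remark
        | from t1 group by area, product_name
        |""".stripMargin).show(false)

    sparkSession.close()
  }
}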
Reference: 尚硅谷 Spark 3.0 course