This post is a set of code templates for reading and writing Hive/MySQL/HBase/Kafka with Spark SQL and Spark Streaming.
First, a few utility classes: a SparkSession helper and a configuration class for connection parameters.
package utils
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.StreamingContext
/**
 * @Author xsh
 * @Date : 2020-3-20 16:10
 * @Description: Spark utility class (a Scala singleton object, roughly the equivalent of a Java static class)
 */
object SparkUtils {
/**
 * Batch-processing configuration object.
 * setMaster: execution mode -- local: single-threaded; local[n]: run with n threads; local[*]: one thread per CPU core
 * setAppName: application name
 * set: any other Spark properties
 */
lazy val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("TestHive").set("spark.testing.memory", "471859200")
//create a plain session
// lazy val sparkSession: SparkSession = SparkSession.builder().config(sparkConf).getOrCreate()
//create a session with Hive support enabled
lazy val sparkSessionWithHive: SparkSession = SparkSession.builder().enableHiveSupport().config(sparkConf).getOrCreate()
}
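Note that setMaster("local[*]") and spark.testing.memory only make sense for local testing. A minimal sketch of a cluster-friendly variant, assuming the master is supplied by spark-submit rather than hard-coded (SparkUtilsCluster is a hypothetical name, not from the original code):
package utils
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
//Hypothetical cluster-oriented variant: no setMaster here, so the master and
//resources are decided by spark-submit instead of being fixed in code.
object SparkUtilsCluster {
lazy val sparkConf: SparkConf = new SparkConf().setAppName("TestHive")
lazy val sparkSessionWithHive: SparkSession =
SparkSession.builder().enableHiveSupport().config(sparkConf).getOrCreate()
}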
package utils
import java.lang
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.mapreduce.{TableInputFormat, TableOutputFormat}
import org.apache.kafka.common.serialization.{StringDeserializer, StringSerializer}
import org.apache.spark.sql.DataFrame
/**
 * @Author xsh
 * @Date : 2020-3-20 17:06
 * @Description: utility class for connection / configuration parameters
 */
object ConnectUtils {
/**
 * MySQL connection configuration
 */
lazy val mysqlConnect: (String, String, String, String, String, String) =>
DataFrame = (ip: String, port: String, database: String, table: String, userName: String, password: String) => {
SparkUtils.sparkSessionWithHive.read
.format("jdbc")
.option("url", s"jdbc:mysql://$ip:$port?useUnicode=true&characterEncoding=utf-8&useSSL=false")
.option("dbtable", s"$database.$table")
.option("user", userName)
.option("password", password)
.load()
}
/**
 * Kafka consumer configuration
 */
val kafkaConsumerConfig: (String, String) => Map[String, Object] = (bootstrapServers: String, group: String) => {
Map[String, Object](
"bootstrap.servers" -> bootstrapServers,
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> group,
"auto.offset.reset" -> "earliest",
"enable.auto.commit" -> (false: lang.Boolean)
)
}
/**
 * Kafka producer configuration
 */
val kafkaProducerConfig: String => Map[String, Object] = (bootstrapServers: String) => {
Map[String, Object](
"bootstrap.servers" -> bootstrapServers,
"key.serializer" -> classOf[StringSerializer],
"value.serializer" -> classOf[StringSerializer]
)
}
/**
 * HBase connection configuration
 * operate: which operation the config is for -- "read" or "write"
 */
val HBaseConfig: (String, String, String, String, String) => Configuration = (zkQuorum: String, clientPort: String, znodeParent: String, operate: String, tableName: String) => {
val conf: Configuration = HBaseConfiguration.create()
conf.set("hbase.zookeeper.quorum", zkQuorum)
conf.set("hbase.zookeeper.property.clientPort", clientPort)
conf.set("zookeeper.znode.parent", znodeParent)
if (operate.equals("read")) {
conf.set(TableInputFormat.INPUT_TABLE, tableName)
} else {
conf.set(TableOutputFormat.OUTPUT_TABLE, tableName)
}
conf
}
}
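The templates below also import a CommonConstant object from utils that is not listed in this post. A minimal sketch of what it presumably contains, reconstructed only from how it is used (READ/WRITE flags for HBaseConfig and an underscore separator for row keys):
package utils
//Presumed contents of the CommonConstant object referenced by the templates below;
//only the members actually used in this post are sketched here.
object CommonConstant {
//operation flags passed to ConnectUtils.HBaseConfig
val READ: String = "read"
val WRITE: String = "write"
//separator used when splitting row keys such as "xx_yy_20200326..."
val SEPARATOR_UNDERLINE: String = "_"
}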
This is often used to ingest base tables or dimension tables, importing them from MySQL into Hive.
package test
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import utils.{ConnectUtils, SparkUtils}
/**
 * @Author xsh
 * @Date : 2020-3-20 16:56
 * @Description: MySQL -> Hive
 */
object MysqlToHive {
def main(args: Array[String]): Unit = {
//create the session
val spark: SparkSession = SparkUtils.sparkSessionWithHive
//import this session's sql method
import spark.sql
//connect to MySQL
//TODO change the connection parameters (ip, port, database, table, user name, password)
val data: DataFrame = ConnectUtils.mysqlConnect("192.168.11.12", "3306", "micszhc", "entity", "mics", "mics")
//TODO business logic goes here
//save the result to Hive
// data.write.mode(SaveMode.Append).saveAsTable("databases_one.test_xsh02")
data.createTempView("tempView")
sql("insert into table databases_one.test_xsh02 select PKEY,NAME,ADDRESS,DESCRIPTION,SUBSYSTEMKEY,LOCATIONKEY,SEREGI_ID from tempView limit 10")
//release resources
spark.stop()
}
}
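The template above only covers the MySQL -> Hive direction. Writing a result back to MySQL uses the same JDBC options in reverse; a minimal sketch, reusing the connection parameters from mysqlConnect and a hypothetical target table result_table (both placeholders, not from the original post):
import org.apache.spark.sql.{DataFrame, SaveMode}
//Sketch only: writes a DataFrame to MySQL over JDBC.
def saveToMysql(data: DataFrame): Unit = {
data.write
.format("jdbc")
.option("url", "jdbc:mysql://192.168.11.12:3306?useUnicode=true&characterEncoding=utf-8&useSSL=false")
.option("dbtable", "micszhc.result_table")
.option("user", "mics")
.option("password", "mics")
.mode(SaveMode.Append)
.save()
}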
Typically, when data needs to be written quickly, it is first stored in HBase and later extracted from HBase into Hive for warehouse building and analysis.
package test
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import utils.{CommonConstant, ConnectUtils, SparkUtils}
/**
 * @Author xsh
 * @Date : 2020-3-26 16:47
 * @Description: HBase -> Hive
 */
object HbaseToHive {
def main(args: Array[String]): Unit = {
//create the session
val spark: SparkSession = SparkUtils.sparkSessionWithHive
//allow dynamic partitioning
spark.sqlContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
spark.sqlContext.setConf("hive.exec.dynamic.partition", "true")
//configure the HBase connection: zkQuorum, clientPort, znodeParent, operate (read or write), tableName
//TODO change the connection parameters
val hbaseConfig: Configuration = ConnectUtils.HBaseConfig("node11,node12,node13", "2181", "/hbase-unsecure", CommonConstant.READ, "test_xsh")
//read from HBase
val hbaseRDD: RDD[(ImmutableBytesWritable, Result)] = spark.sparkContext.newAPIHadoopRDD(hbaseConfig, classOf[TableInputFormat], classOf[ImmutableBytesWritable], classOf[Result])
import spark.implicits._
val dataDf: DataFrame = hbaseRDD.map(x => {
val result: Result = x._2
//read the individual columns
val rowKey: String = Bytes.toString(result.getRow)
//TODO change the column family and column names
val columnA: String = Bytes.toString(result.getValue("cf".getBytes, "a".getBytes()))
val columnB: String = Bytes.toString(result.getValue("cf".getBytes, "b".getBytes()))
val columnC: String = Bytes.toString(result.getValue("cf".getBytes, "c".getBytes()))
val columnD: String = Bytes.toString(result.getValue("cf".getBytes, "d".getBytes()))
//take the date from the row key to use as the Hive partition column
val date_partition: String = rowKey.split(CommonConstant.SEPARATOR_UNDERLINE)(2).substring(0, 8)
//return a tuple
(rowKey, columnA, columnB, columnC, columnD, date_partition)
//TODO change the toDF column names; they must match the order of the tuple
}).toDF("rowkey", "datastatus", "location_id", "quality", "types", "date_partition")
//write to an existing partitioned Hive table
//TODO change the database and table name
dataDf.write.mode(SaveMode.Append).insertInto("databases_one.test_xsh002")
//or write to a partitioned table created automatically by Spark (partitionBy must name the partition column)
// dataDf.write.mode(SaveMode.Append).partitionBy("date_partition").saveAsTable("databases_one.test_xsh002")
println("OK...")
//release resources
spark.stop()
}
}
In data-warehouse development, Hive to Hive is the most commonly used kind of transfer.
package test
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import utils.SparkUtils
/**
 * @Author xsh
 * @Date : 2020-3-20 16:43
 * @Description: Hive -> Hive
 */
object HiveToHive {
def main(args: Array[String]): Unit = {
//create the session
val spark: SparkSession = SparkUtils.sparkSessionWithHive
//import this session's sql method
import spark.sql
//switch to a database
//TODO change which database to use
sql("use databases_one")
//TODO business logic goes here
//bridge table (placeholder query -- replace "table" with a real table name)
val sqlStr: String = "select * from table limit 1"
//run the SQL
val data: DataFrame = sql(sqlStr)
//print the result
// data.show()
/*save the result to Hive*/
//option 1: use the API to insert into an existing table
data.write.mode(SaveMode.Append).insertInto("test_xsh")
//option 2: use the API to create the table automatically and insert into it
// data.write.mode(SaveMode.Append).saveAsTable("test_xsh")
//option 3: use SQL to insert into an existing table
// data.createTempView("qiaoJie")
// sql("insert into table test_xsh select * from qiaoJie")
//release resources
spark.stop()
}
}
This operation (Kafka to Hive) is used relatively rarely, because Hive's read/write performance is poor.
package test.kafka
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import utils.{CommonMethod, ConnectUtils, MicsPointStatusHive, SparkUtils}
import scala.collection.mutable.ListBuffer
/**
 * @Author xsh
 * @Date : 2020-3-23 14:34
 * @Description: Kafka -> Hive
 */
object KafkaToHive {
def main(args: Array[String]): Unit = {
//create the streaming context
//TODO change the micro-batch interval
val streamingContext = new StreamingContext(SparkUtils.sparkConf, Seconds(1))
//topics to subscribe to
//TODO change the topics to subscribe to
val topics: Array[String] = Array("MicsPointStatus")
//configure the Kafka bootstrap servers and the consumer group
//TODO change the addresses and the consumer group
val kafkaParams: Map[String, Object] = ConnectUtils.kafkaConsumerConfig("node11:6667,node12:6667,node13:6667", "KafkaTestXsh")
//create the Kafka stream
val kafkaDStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
streamingContext,
PreferConsistent,
Subscribe[String, String](topics, kafkaParams)
)
//process the messages of each batch
kafkaDStream.foreachRDD(
(rdd: RDD[ConsumerRecord[String, String]]) => {
//record the offsets
val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
//TODO business logic goes here
//take the value of each ConsumerRecord (the actual message payload)
val value: RDD[String] = rdd.mapPartitions(_.map(_.value()))
//print the messages
// value.foreach(println(_))
//process the messages
if (value.count() > 0) {
value.foreachPartition(records => {
//mutable list used to stage the messages temporarily
val listBuffer: ListBuffer[MicsPointStatusHive] = ListBuffer.empty[MicsPointStatusHive]
records.foreach(record => {
//method that does the actual business processing of a record
val data: MicsPointStatusHive = processHiveData(record)
listBuffer += data
})
//get the singleton Hive-enabled session
val spark: SparkSession = SparkUtils.sparkSessionWithHive
//allow dynamic partitioning
spark.sqlContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
spark.sqlContext.setConf("hive.exec.dynamic.partition", "true")
//optional limits on the number of dynamic partitions and created files
// sqlContext.setConf("hive.exec.max.dynamic.partitions.pernode","100")
// sqlContext.setConf("hive.exec.max.dynamic.partitions","1000")
// sqlContext.setConf("hive.exec.max.created.files","100000")
import spark.implicits._
val dataDf: DataFrame = listBuffer.toDF()
//write to an existing partitioned Hive table
//TODO change the table name
dataDf.write.mode(SaveMode.Append).insertInto("databases_one.test_xsh002")
//or write to a partitioned table created automatically by Spark (partitionBy must name the partition column)
// dataDf.write.mode(SaveMode.Append).partitionBy("date_partition").saveAsTable("databases_one.test_xsh002")
println("OK...")
})
}
//commit the offsets
kafkaDStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}
)
//start the streaming job
streamingContext.start()
//block and wait for incoming messages
streamingContext.awaitTermination()
}
}
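The KafkaToHive template above relies on a MicsPointStatusHive case class and a processHiveData method that are not included in this post. A purely hypothetical sketch of what they could look like, assuming the Kafka message is a comma-separated string whose fields match the Hive table used above; in the template the method would then be called as CommonMethod.processHiveData(record), or brought into scope with import utils.CommonMethod._:
package utils
//Hypothetical sketch only: the real MicsPointStatusHive / processHiveData are
//project-specific and not shown in the original post.
case class MicsPointStatusHive(rowkey: String, datastatus: String, location_id: String,
quality: String, types: String, date_partition: String)
object CommonMethod {
//Assumption: the message is "rowkey,datastatus,location_id,quality,types,timestamp",
//where the timestamp starts with yyyyMMdd and is used as the Hive partition value.
def processHiveData(record: String): MicsPointStatusHive = {
val f: Array[String] = record.split(",")
MicsPointStatusHive(f(0), f(1), f(2), f(3), f(4), f(5).substring(0, 8))
}
}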
A common operation: consume business data from Kafka into HBase for fast queries.
package test.kafka
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import utils.{CommonConstant, CommonMethod, ConnectUtils, MicsPointStatusRowKey, SparkUtils}
/**
 * @Author xsh
 * @Date : 2020-3-23 14:34
 * @Description: Kafka -> HBase
 */
object KafkaToHbase {
def main(args: Array[String]): Unit = {
//create the streaming context
//TODO change the micro-batch interval
val streamingContext = new StreamingContext(SparkUtils.sparkConf, Seconds(1))
//topics to subscribe to
//TODO change the topics to subscribe to
val topics: Array[String] = Array("MicsPointStatus")
//configure the Kafka bootstrap servers and the consumer group
//TODO change the addresses and the consumer group
val kafkaParams: Map[String, Object] = ConnectUtils.kafkaConsumerConfig("node11:9092,node12:9092,node13:9092", "KafkaTestXsh")
//create the Kafka stream
val kafkaDStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
streamingContext,
PreferConsistent,
Subscribe[String, String](topics, kafkaParams)
)
//process the messages of each batch
kafkaDStream.foreachRDD(
(rdd: RDD[ConsumerRecord[String, String]]) => {
//record the offsets
val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
//configure the HBase connection: zkQuorum, clientPort, znodeParent
//TODO change the connection parameters
val hbaseConfig: Configuration = ConnectUtils.HBaseConfig("node11,node12,node13", "2181", "/hbase-unsecure", CommonConstant.WRITE, "test_xsh")
val job: Job = Job.getInstance(hbaseConfig)
job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
//process the messages
if (rdd.count() > 0) {
rdd.mapPartitions(_.map(x => {
//TODO business logic goes here
//parse the message into a case class (entity class) to make it easier to work with
val data: MsgEntity = processData(x.value())
//insert into HBase with a Put (the table must already exist)
val put = new Put(Bytes.toBytes(data.rowKey))
//the first argument is the column family, the second is the column name
//TODO change the column family and column names
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("a"), Bytes.toBytes(data.dataStatus))
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("b"), Bytes.toBytes(data.location_id))
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("c"), Bytes.toBytes(data.quality))
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("d"), Bytes.toBytes(data.types))
(new ImmutableBytesWritable, put)
})).saveAsNewAPIHadoopDataset(job.getConfiguration)
println("OK...")
}
//commit the offsets
kafkaDStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}
)
//start the streaming job
streamingContext.start()
//block and wait for incoming messages
streamingContext.awaitTermination()
}
}
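Likewise, KafkaToHbase calls a processData method returning a MsgEntity, and neither appears in this post. A hypothetical sketch under the same kind of assumption (a comma-separated message); in the template it would then be invoked as MsgEntity.processData(x.value()), or with a matching import:
package test.kafka
//Hypothetical sketch only: the real MsgEntity / processData are project-specific
//and not shown in the original post.
case class MsgEntity(rowKey: String, dataStatus: String, location_id: String,
quality: String, types: String)
object MsgEntity {
//Assumption: the message is "rowKey,dataStatus,location_id,quality,types"
def processData(message: String): MsgEntity = {
val f: Array[String] = message.split(",")
MsgEntity(f(0), f(1), f(2), f(3), f(4))
}
}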
This pattern is also rare; if both the input and the output are on the same Kafka cluster, Kafka Streams is the recommended tool and will certainly be fast.
First, create a producer wrapper class:
package test.kafka
import java.util.concurrent.Future
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord, RecordMetadata}
/**
 * @Author xsh
 * @Date : 2020-3-24 10:11
 * @Description: Kafka producer sink
 */
class KafkaSink[K, V](createProducer: () => KafkaProducer[K, V]) extends Serializable {
lazy val producer: KafkaProducer[K, V] = createProducer()
//send a message with a key
def send(topic: String, key: K, value: V): Future[RecordMetadata] = {
producer.send(new ProducerRecord[K, V](topic, key, value))
}
//send a message without a key
def send(topic: String, value: V): Future[RecordMetadata] = {
producer.send(new ProducerRecord[K, V](topic, value))
}
}
object KafkaSink {
import scala.collection.JavaConversions._
def apply[K, V](config: Map[String, Object]): KafkaSink[K, V] = {
val createProducerFunc: () => KafkaProducer[K, V] = () => {
val producer = new KafkaProducer[K, V](config)
sys.addShutdownHook {
//close the producer before the JVM shuts down, so all buffered messages get sent
producer.close()
}
producer
}
new KafkaSink(createProducerFunc)
}
}
And the actual processing. Broadcasting the KafkaSink wrapper means each executor creates its own KafkaProducer lazily on first use, instead of trying to serialize a producer from the driver:
package test.kafka
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import utils.{ConnectUtils, SparkUtils}
/**
 * @Author xsh
 * @Date : 2020-3-23 14:34
 * @Description: Kafka -> Kafka
 */
object KafkaToKafka {
def main(args: Array[String]): Unit = {
//create the streaming context
//TODO change the micro-batch interval
val streamingContext = new StreamingContext(SparkUtils.sparkConf, Seconds(3))
//topics to subscribe to
//TODO change the topics to subscribe to
val topics: Array[String] = Array("hjmos-log")
//configure the Kafka bootstrap servers and the consumer group
//TODO change the addresses and the consumer group
val kafkaParams: Map[String, Object] = ConnectUtils.kafkaConsumerConfig("node11:9092,node12:9092,node13:9092", "KafkaTestXsh")
//create the Kafka stream
val kafkaDStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
streamingContext,
PreferConsistent,
Subscribe[String, String](topics, kafkaParams)
)
//simplest option: just print the consumed messages
// kafkaDStream.foreachRDD(
// consumerRecord => consumerRecord.foreach(
// msg => println(msg.value())
// )
// )
//write out to Kafka
//broadcast the Kafka producer wrapper
val kafkaSink: KafkaSink[String, Object] = KafkaSink[String, Object](ConnectUtils.kafkaProducerConfig("node11:9092,node12:9092,node13:9092"))
val kafkaProducerBroadcast: Broadcast[KafkaSink[String, Object]] = streamingContext.sparkContext.broadcast(kafkaSink)
//process the messages of each batch
kafkaDStream.foreachRDD(
(rdd: RDD[ConsumerRecord[String, String]]) => {
//record the offsets
val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
rdd.foreach(
(record: ConsumerRecord[String, String]) => {
//TODO business logic goes here
//print the message
println(record.value())
//send to Kafka
//TODO change which topic to send to
kafkaProducerBroadcast.value.send("sparkSinkTopic", record.value())
}
)
//commit the offsets
kafkaDStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}
)
//start the streaming job
streamingContext.start()
//block and wait for incoming messages
streamingContext.awaitTermination()
}
}
Finally, the Maven pom.xml for the project:
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.example</groupId>
    <artifactId>spark</artifactId>
    <version>1.0</version>
    <properties>
        <java.version>1.8</java.version>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <scala.version>2.11</scala.version>
        <spark.version>2.1.1</spark.version>
        <hadoop.version>2.7.3</hadoop.version>
        <hbase.version>1.1.2</hbase.version>
        <mysql.version>5.1.48</mysql.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>${mysql.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>${hbase.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>${hbase.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <!-- spark-streaming-kafka-0-10 is versioned together with Spark -->
            <artifactId>spark-streaming-kafka-0-10_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>