Spark Streaming Part 2: Updating Data in MySQL from Spark Streaming
There are many ways to write to MySQL. You can build a DataFrame inside the streaming job and write it out with Spark SQL, but that route only supports append or overwrite.
Here we write with foreachRDD instead.
(The two methods below are not limited to DStreams: anything that gives you an RDD or a DataFrame can be written out through foreachRDD.)
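For reference, the Spark SQL route mentioned above looks roughly like the sketch below (the table name "word_count" is an assumption; the URL and credentials mirror the ones used later in this post). Because SaveMode only offers Append or Overwrite, there is no way to update existing rows this way.

// A sketch only: writing a DataFrame to MySQL through Spark SQL's JDBC writer.
// SaveMode.Append / SaveMode.Overwrite are the only options, so no per-row upsert is possible.
import java.util.Properties
import org.apache.spark.sql.{SaveMode, SparkSession}

val spark = SparkSession.builder().getOrCreate()
val props = new Properties()
props.setProperty("user", "root")
props.setProperty("password", "123456")
props.setProperty("driver", "com.mysql.jdbc.Driver")

val df = spark.createDataFrame(Seq(("hello", 1L), ("world", 2L))).toDF("word", "count")
df.write
  .mode(SaveMode.Append) // or SaveMode.Overwrite
  .jdbc("jdbc:mysql://localhost:3306/xytest", "word_count", props)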
Without further ado, straight to the code.
package com.iflytek.kafka
import java.sql.{Connection, DriverManager, Statement}
import java.text.SimpleDateFormat
import java.util.Date
import com.alibaba.fastjson.JSON
import com.iflytek.kafkaManager.MysqlManager
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.slf4j.LoggerFactory
object WC2Mysql {
@transient lazy val logger=LoggerFactory.getLogger(this.getClass())
def send2MysqlMain(ssc: StreamingContext): Unit = {
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> "cdh01:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "xx001",
"auto.offset.reset" -> "latest", //earliest latest
"enable.auto.commit" -> (false: java.lang.Boolean)
)
val topics = Array("pd_ry_txjl")
val stream: InputDStream[ConsumerRecord[String, String]] =
KafkaUtils.createDirectStream[String, String](
ssc,
PreferConsistent,
Subscribe[String, String](topics, kafkaParams))
val kv: DStream[(String, String)] = stream.map(record => (record.key, record.value))
val value: DStream[String] = stream.map(_.value())
val mapDS = value.map(x => {
val dataFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
val nObject = JSON.parseObject(x)
val bodyObject1 = nObject.getJSONObject("body")
val bodyObject2 = bodyObject1.getJSONObject("body")
val xqbm = bodyObject2.get("xqbm").toString
(xqbm, 1L)
})
val sumed = mapDS.transform(rdd => {
rdd.reduceByKey(_ + _)
})
sumed.foreachRDD(rdd => {
send2MysqlPoll(rdd)
})
sumed.print()
}
//Method 1: somewhat more efficient - use foreachPartition so each partition reuses one connection
def send2Mysql(rdd:RDD[(String,Long)]):Unit={
if (!rdd.isEmpty) {
rdd.foreachPartition(f=>{
var conn: Connection=null
var statement:Statement=null
val jdbcUrl = "jdbc:mysql://localhost:3306/xytest?useUnicode=true&characterEncoding=utf8"
val user = "root"
val password = "123456"
val rq = new SimpleDateFormat("yyyyMMdd").format(new Date().getTime)
try {
conn = DriverManager.getConnection(jdbcUrl, user, password)
statement= conn.createStatement()
conn.setAutoCommit(false)
f.foreach(f => {
var sql =
s"""
|CREATE TABLE if not exists word_count_${rq}(
| `id` int(11) NOT NULL AUTO_INCREMENT,
| `word` varchar(255) DEFAULT NULL,
| `count` int(11) DEFAULT '0',
| `date` varchar(255) NOT NULL,
| PRIMARY KEY (`id`) ,
| UNIQUE KEY `word` (`word`,`date`)
|) ENGINE=InnoDB DEFAULT CHARSET=utf8;
""".stripMargin
statement.addBatch(sql)
sql=s"""
|insert into word_count_${rq}
|(word,count,date)
|values
|('${f._1}',${f._2},'${rq}')
|on duplicate key update count=count+values(count);
""".stripMargin
statement.addBatch(sql)
})
statement.executeBatch()
conn.commit()
} catch {
case e: Exception => e.printStackTrace()
} finally {
if (statement != null) statement.close()
if (conn != null) conn.close()
}
})
}
}
//Method 2: more efficient - same as Method 1, but connections come from a c3p0 pool
def send2MysqlPoll(rdd: RDD[(String, Long)]):Unit = {
if (!rdd.isEmpty) {
rdd.foreachPartition(fp => {
  // Get a connection from the c3p0 pool
  val conn = MysqlManager.getMysqlSink.getConnection
  val statement = conn.createStatement()
  val preTime = System.currentTimeMillis()
  val rq = new SimpleDateFormat("yyyyMMdd").format(new Date().getTime)
  try {
    conn.setAutoCommit(false)
    fp.foreach(f => {
      logger.info("record in rdd: " + f)
      var sql =
        s"""|CREATE TABLE if not exists word_count_${rq}(
            | `id` int(11) NOT NULL AUTO_INCREMENT,
            | `word` varchar(255) DEFAULT NULL,
            | `count` int(11) DEFAULT '0',
            | `date` varchar(255) NOT NULL,
            | PRIMARY KEY (`id`),
            | UNIQUE KEY `word` (`word`,`date`)
            |) ENGINE=InnoDB DEFAULT CHARSET=utf8;
          """.stripMargin
      statement.addBatch(sql)
      // insert the record; duplicate keys accumulate the count
      sql = s"""|insert into word_count_${rq}
                |(word,count,date)
                |values
                |('${f._1}',${f._2},'${rq}')
                |on duplicate key update count=count+values(count);
          """.stripMargin
      statement.addBatch(sql)
      logger.warn("batched for mysql  word:" + f._1 + " count:" + f._2)
    })
    statement.executeBatch()
    conn.commit()
    logger.warn(s"batch saved in ${System.currentTimeMillis() - preTime} ms")
  } catch {
    case e: Exception => logger.error("error while writing the rdd to mysql", e)
  } finally {
    if (statement != null) statement.close()
    if (conn != null) conn.close()
  }
})
}
}
}
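As a side note, Method 1 interpolates values directly into the SQL string. A sketch of the same upsert with a PreparedStatement is shown below (not from the original post; it assumes the word_count table for the day already exists, since Method 1 creates it, and it uses the same imports as the object above):

// A sketch only: Method 1's insert rewritten with a PreparedStatement.
// Placeholders keep the values out of the SQL text; the table name still has to be interpolated.
def send2MysqlPrepared(rdd: RDD[(String, Long)]): Unit = {
  if (!rdd.isEmpty) {
    rdd.foreachPartition(iter => {
      val jdbcUrl = "jdbc:mysql://localhost:3306/xytest?useUnicode=true&characterEncoding=utf8"
      val rq = new SimpleDateFormat("yyyyMMdd").format(new Date())
      val conn = DriverManager.getConnection(jdbcUrl, "root", "123456")
      val ps = conn.prepareStatement(
        s"insert into word_count_${rq} (word,count,date) values (?,?,?) " +
          "on duplicate key update count=count+values(count)")
      try {
        conn.setAutoCommit(false)
        iter.foreach { case (word, count) =>
          ps.setString(1, word)
          ps.setLong(2, count)
          ps.setString(3, rq)
          ps.addBatch()
        }
        ps.executeBatch()
        conn.commit()
      } finally {
        ps.close()
        conn.close()
      }
    })
  }
}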
The c3p0 connection pool
package com.iflytek.kafkaManager
import java.sql.Connection
import com.mchange.v2.c3p0.ComboPooledDataSource
import org.slf4j.LoggerFactory
class MysqlPoll extends Serializable{
@transient lazy val logger=LoggerFactory.getLogger(this.getClass())
private val cpds:ComboPooledDataSource=new ComboPooledDataSource(true)
try{
cpds.setJdbcUrl("jdbc:mysql://localhost:3306/xytest?useUnicode=true&characterEncoding=utf8")
cpds.setDriverClass("com.mysql.jdbc.Driver")
cpds.setUser("root")
cpds.setPassword("123456")
cpds.setMaxPoolSize(200) // maximum number of connections
cpds.setMinPoolSize(20) // minimum number of connections
cpds.setAcquireIncrement(5) // connections acquired at a time when the pool grows
cpds.setMaxStatements(180) // size of the global PreparedStatement cache
cpds.setMaxIdleTime(25000) // maximum idle time of a connection, in seconds
// cpds.setPreferredTestQuery(s"select id from word_count_20191126 where 1=2")
cpds.setIdleConnectionTestPeriod(18000) // test idle connections every 18000 seconds
}catch {
case e: Exception => logger.error("failed to configure the c3p0 data source", e)
}
// Connections are not closed here; they are closed in foreachRDD
def getConnection: Connection = {
  try {
    cpds.getConnection
  } catch {
    case ex: Exception =>
      logger.error("failed to obtain a c3p0 connection, returning null", ex)
      null
  }
}
}
object MysqlManager {
var mysqlPoll: MysqlPoll = _
def getMysqlSink: MysqlPoll = {
synchronized {
if (mysqlPoll == null) {
mysqlPoll = new MysqlPoll
}
return mysqlPoll
}
}
}
The program entry point
import com.iflytek.kafka.WC2Mysql
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
object SparkMain {
def main(args: Array[String]): Unit = {
Logger.getRootLogger.setLevel(Level.ERROR)
System.setProperty("HADOOP_USER_NAME","root")
System.setProperty("user.name","root")
val sparkSession = SparkSession.builder()
  .config("spark.streaming.kafka.maxRatePerPartition", "10000")
  .config("spark.sql.streaming.schemaInference", "true")
  .getOrCreate()
val ssc = new StreamingContext(sparkSession.sparkContext, Seconds(3))
WC2Mysql.send2MysqlMain(ssc)
ssc.start()
ssc.awaitTermination()
}
}
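If you run this entry point from an IDE instead of through spark-submit, the builder also needs a master and an app name; a minimal sketch (the local master and app name are assumptions, not part of the original):

// A sketch only: local-mode variant of the session construction above.
val sparkSession = SparkSession.builder()
  .master("local[*]")
  .appName("WC2Mysql")
  .config("spark.streaming.kafka.maxRatePerPartition", "10000")
  .config("spark.sql.streaming.schemaInference", "true")
  .getOrCreate()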
The pom is as follows:
<properties>
<spark.version>2.3.2</spark.version>
<scala.version>2.11.8</scala.version>
<hbase.version>1.2.1</hbase.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-yarn_2.11</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql-kafka-0-10_2.11</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.47</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>0.11.0.0</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.31</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>c3p0</groupId>
<artifactId>c3p0</artifactId>
<version>0.9.1.2</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>com.jolbox</groupId>
<artifactId>bonecp</artifactId>
<version>0.8.0.RELEASE</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
<version>3.4.13</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>${hbase.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-common</artifactId>
<version>${hbase.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>${hbase.version}</version>
<scope>compile</scope>
</dependency>
<!--<dependency>-->
<!--<groupId>org.apache.hadoop</groupId>-->
<!--<artifactId>hadoop-client</artifactId>-->
<!--<version>2.7.2</version>-->
<!--</dependency>-->
<!-- the guava and hadoop versions must match -->
<!--<dependency>-->
<!--<groupId>com.google.guava</groupId>-->
<!--<artifactId>guava</artifactId>-->
<!--<version>18.0</version>-->
<!--</dependency>-->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
<scope>compile</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.0</version>
<executions>
<execution>
<id>compile-scala</id>
<phase>compile</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>test-compile-scala</id>
<phase>test-compile</phase>
<goals>
<goal>add-source</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>2.11.8</scalaVersion>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.6.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
<compilerArgs>
<arg>-extdirs</arg>
<arg>${project.basedir}/lib</arg>
</compilerArgs>
</configuration>
</plugin>
</plugins>
</build>
Summary:
Using on duplicate key update count=count+values(count) does more than an update: it also accumulates the count across batches, so in effect it behaves like the updateStateByKey operator.
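For comparison, keeping the same running total in Spark state with updateStateByKey would look roughly like the sketch below (inside send2MysqlMain, where sumed is the reduced DStream; the checkpoint path is an assumption). The MySQL upsert used above avoids the checkpoint directory and keeps the totals queryable in the database.

// A sketch only: the running total kept in Spark state instead of in MySQL.
ssc.checkpoint("/tmp/wc2mysql-checkpoint") // required by updateStateByKey; path is an assumption
val totals: DStream[(String, Long)] = sumed.updateStateByKey[Long](
  (newCounts: Seq[Long], state: Option[Long]) => Some(newCounts.sum + state.getOrElse(0L))
)
totals.print()
// Note: if totals were written to MySQL, the insert would need to replace the count
// (count=values(count)) rather than add to it, since the state already holds the total.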