Structured Streaming Part 1: Writing JSON Data into CarbonData with Structured Streaming
Without further ado, straight to the code.
package xytest

import com.iflytek.utils.Utils
import org.apache.carbondata.core.util.path.CarbonTablePath
import org.apache.carbondata.streaming.parser.CarbonStreamParser
import org.apache.spark.sql.CarbonSession._
import org.apache.spark.sql.functions.{col, from_json, get_json_object}
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.types._
import org.apache.spark.sql.{CarbonEnv, SparkSession}

object xytestkafka {

  def main(args: Array[String]): Unit = {
    val sc = SparkSession.builder().getOrCreate().sparkContext
    val warehouse = "hdfs://cdh01:8020/user/hive/warehouse/carbon.store"
    val sparkSession = SparkSession.builder().config(sc.getConf).getOrCreateCarbonSession(warehouse)
    sparkSession.sql("use carbondata")
    beginGZDataBoy(warehouse, sparkSession)
    // Note: beginGZDataBoy blocks on awaitTermination, so this query
    // only runs after the streaming query stops.
    sparkSession.sql("select * from boykafka").show(30)
  }
  def beginGZDataBoy(warehouse: String, sparkSession: SparkSession): Unit = {
    // Streaming ingestion requires a streaming table ('streaming'='true').
    val create_boy =
      """
        |CREATE TABLE IF NOT EXISTS boykafka (
        |  id int,
        |  name string,
        |  age int,
        |  fv int
        |)
        |stored as carbondata
        |tblproperties('streaming'='true')""".stripMargin

    val schemaboy = StructType(Seq(
      StructField("id", IntegerType, true),
      StructField("name", StringType, true),
      StructField("age", IntegerType, true),
      StructField("fv", IntegerType, true)
    ))

    val boykafka = "boykafka"
    sparkSession.sql(create_boy)
    val carbonTableBoy = CarbonEnv.getCarbonTable(Some("carbondata"), boykafka)(sparkSession)
    // Delete any stale streaming lock first, otherwise the table stays locked.
    Utils.deleteStreamingLock(carbonTableBoy.getTablePath, sparkSession)
    // Read from Kafka and unwrap the doubly nested "body" JSON payload.
    val streamingInputZdrDF = sparkSession.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "cdh1:9092")
      .option("subscribe", "boykafka")
      .option("startingOffsets", "latest")
      .option("minPartitions", "10")
      .option("failOnDataLoss", "false")
      .load()
      // The record is nested two levels deep: outer body -> inner body -> actual fields.
      .select(get_json_object(col("value").cast("string"), "$.body") as "body")
      .select(get_json_object(col("body"), "$.body") as "body")
      .select(from_json(col("body"), schemaboy) as "parsed_value")
      .select(
        col("parsed_value").getItem("id") as "id",
        col("parsed_value").getItem("name") as "name",
        col("parsed_value").getItem("age") as "age",
        col("parsed_value").getItem("fv") as "fv")
      .filter("id is not null")
    // For debugging: also print each micro-batch to the console.
    streamingInputZdrDF.writeStream
      .format("console")
      .trigger(Trigger.ProcessingTime("5 seconds"))
      .start()

    // Write the stream into the CarbonData table.
    streamingInputZdrDF.writeStream
      .format("carbondata")
      .trigger(Trigger.ProcessingTime("5 seconds"))
      .option("checkpointLocation", CarbonTablePath.getStreamingCheckpointDir(carbonTableBoy.getTablePath))
      .option("dbName", "carbondata")
      .option("tableName", boykafka)
      .option(CarbonStreamParser.CARBON_STREAM_PARSER, CarbonStreamParser.CARBON_STREAM_PARSER_ROW_PARSER)
      .option("BAD_RECORDS_LOGGER_ENABLE", "true")
      .option("BAD_RECORD_PATH", "hdfs://cdh01:8020/user/hive/warehouse/carbondata.db")
      .option("BAD_RECORDS_ACTION", "FORCE")
      .start()
      .awaitTermination()
  }
}
// Send a test message with the console producer (the record is wrapped in two "body" envelopes):
// kafka-console-producer --broker-list cdh01:9092 --topic boykafka
// {"properties":{"skynet.msg.location":"LOCAL","skynet.priority":"4"},"body":"{\"properties\":{\"skynet.priority\":\"4\"},\"body\":\"{\\\"fv\\\":155,\\\"id\\\":1550,\\\"age\\\":2,\\\"name\\\":\\\"设备名称\\\"}\"}"}
package com.iflytek.utils

import java.io.File

import com.typesafe.config.{Config, ConfigFactory}
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.SparkSession

object Utils {

  // Remove a stale streaming lock file so the streaming writer can acquire the table.
  def deleteStreamingLock(tablePath: String, sparkSession: SparkSession): Unit = {
    val hdfs = org.apache.hadoop.fs.FileSystem.get(sparkSession.sparkContext.hadoopConfiguration)
    val path = new Path(tablePath + "/LockFiles/streaming.lock")
    if (hdfs.exists(path)) {
      hdfs.delete(path, false)
    }
  }

  // Load conf/config.properties from the current working directory.
  def getConfig(): Config = {
    val filePath = System.getProperty("user.dir")
    ConfigFactory.parseFile(new File(s"$filePath/conf/config.properties"))
  }
}
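
getConfig is not used in the example above, but for completeness: it expects a conf/config.properties file under the working directory. A minimal usage sketch; the kafka.brokers and kafka.topic keys are hypothetical, purely for illustration:

// Hypothetical conf/config.properties:
//   kafka.brokers=cdh1:9092
//   kafka.topic=boykafka
import com.iflytek.utils.Utils

val config  = Utils.getConfig()
val brokers = config.getString("kafka.brokers") // throws ConfigException.Missing if the key is absent
val topic   = config.getString("kafka.topic")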
The pom is as follows:
<dependencies>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>2.3.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.11</artifactId>
        <version>2.3.2</version>
        <scope>compile</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-hive_2.11</artifactId>
        <version>2.3.2</version>
        <scope>compile</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-yarn_2.11</artifactId>
        <version>2.3.2</version>
        <scope>compile</scope>
    </dependency>
    <!-- CarbonData is provided as a local system-scoped jar under lib/. -->
    <dependency>
        <groupId>org.apache.carbondata</groupId>
        <artifactId>carbondata</artifactId>
        <version>1.5.3</version>
        <scope>system</scope>
        <systemPath>${project.basedir}/lib/apache-carbondata-1.5.3-bin-spark2.3.2-hadoop2.6.0-cdh5.16.1.jar</systemPath>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_2.11</artifactId>
        <version>2.3.2</version>
        <scope>compile</scope>
    </dependency>
    <!--<dependency>-->
    <!--<groupId>org.apache.spark</groupId>-->
    <!--<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>-->
    <!--<version>2.3.2</version>-->
    <!--<scope>compile</scope>-->
    <!--</dependency>-->
    <!--<dependency>-->
    <!--<groupId>org.apache.kafka</groupId>-->
    <!--<artifactId>kafka-clients</artifactId>-->
    <!--<version>0.11.0.0</version>-->
    <!--<scope>compile</scope>-->
    <!--</dependency>-->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.46</version>
        <scope>compile</scope>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.47</version>
    </dependency>
</dependencies>
<build>
    <plugins>
        <plugin>
            <groupId>net.alchim31.maven</groupId>
            <artifactId>scala-maven-plugin</artifactId>
            <version>3.2.0</version>
            <executions>
                <execution>
                    <id>compile-scala</id>
                    <phase>compile</phase>
                    <goals>
                        <goal>add-source</goal>
                        <goal>compile</goal>
                    </goals>
                </execution>
                <execution>
                    <id>test-compile-scala</id>
                    <phase>test-compile</phase>
                    <goals>
                        <goal>add-source</goal>
                        <goal>testCompile</goal>
                    </goals>
                </execution>
            </executions>
            <configuration>
                <scalaVersion>2.11.8</scalaVersion>
            </configuration>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.6.1</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
                <compilerArgs>
                    <arg>-extdirs</arg>
                    <arg>${project.basedir}/lib</arg>
                </compilerArgs>
            </configuration>
        </plugin>
    </plugins>
</build>
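
One caveat on the dependencies: the Structured Streaming Kafka source (.format("kafka")) is normally provided by the spark-sql-kafka-0-10 connector, which is not listed above; the commented-out spark-streaming-kafka-0-10 is the DStream connector, not the one this code uses. If the bundled CarbonData jar under lib/ does not already ship the connector classes, you would likely also need:

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql-kafka-0-10_2.11</artifactId>
    <version>2.3.2</version>
</dependency>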