sparkstreaming kafka010

SparkStreaming消费kafka数据

sparkstreaming kafka010_第1张图片

兴趣e族关注0人评论40人阅读2018-10-31 17:05:04

概要:本例子为SparkStreaming消费kafka消息的例子,实现的功能是将数据实时的进行抽取、过滤、转换,然后存储到HDFS中。

实例代码

package com.fwmagic.test

import com.alibaba.fastjson.{JSON, JSONException}
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.slf4j.LoggerFactory

/**
  * created by fwmagic
  */
object RealtimeEtl {

  private val logger = LoggerFactory.getLogger(PVUV.getClass)

  def main(args: Array[String]): Unit = {
    System.setProperty("HADOOP_USER_NAME", "hadoop")

    val conf = new SparkConf().setAppName("RealtimeEtl").setMaster("local[*]")

    val spark = SparkSession.builder().config(conf).getOrCreate()

    val streamContext = new StreamingContext(spark.sparkContext, Seconds(5))

    //直连方式相当于跟kafka的Topic至直接连接
    //"auto.offset.reset:earliest(每次重启重新开始消费),latest(重启时会从最新的offset开始读取)
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "hd1:9092,hd2:9092,hd3:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "fwmagic",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    val topics = Array("access")

    val kafkaDStream = KafkaUtils.createDirectStream[String, String](
      streamContext,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
    )

    //如果使用SparkStream和Kafka直连方式整合,生成的kafkaDStream必须调用foreachRDD
    kafkaDStream.foreachRDD(kafkaRDD => {
      if (!kafkaRDD.isEmpty()) {
        //获取当前批次的RDD的偏移量
        val offsetRanges = kafkaRDD.asInstanceOf[HasOffsetRanges].offsetRanges

        //拿出kafka中的数据
        val lines = kafkaRDD.map(_.value())
        //将lines字符串转换成json对象
        val logBeanRDD = lines.map(line => {
          var logBean: LogBean = null
          try {
            logBean = JSON.parseObject(line, classOf[LogBean])
          } catch {
            case e: JSONException => {
              //logger记录
              logger.error("json解析错误!line:" + line, e)
            }
          }
          logBean
        })

        //过滤
        val filteredRDD = logBeanRDD.filter(_ != null)

        //将RDD转化成DataFrame,因为RDD中装的是case class
        import spark.implicits._

        val df = filteredRDD.toDF()

        df.show()
        //将数据写到hdfs中:hdfs://hd1:9000/360
        df.repartition(1).write.mode(SaveMode.Append).parquet(args(0))

        //提交当前批次的偏移量,偏移量最后写入kafka
        kafkaDStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
      }
    })

    //启动
    streamContext.start()
    streamContext.awaitTermination()
    streamContext.stop()

  }

}

case class LogBean(time:String,
                   longitude:Double,
                   latitude:Double,
                   openid:String,
                   page:String,
                   evnet_type:Int)

依赖环境(pom.xml)

xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0modelVersion>

    <groupId>com.fwmagic.360groupId>
    <artifactId>fwmagic-360artifactId>
    <version>1.0version>

    <properties>
        <maven.compiler.source>1.8maven.compiler.source>
        <maven.compiler.target>1.8maven.compiler.target>
        <scala.version>2.11.7scala.version>
        <spark.version>2.2.2spark.version>
        <hadoop.version>2.7.7hadoop.version>
        <encoding>UTF-8encoding>
    properties>

    <dependencies>
        
        <dependency>
            <groupId>org.scala-langgroupId>
            <artifactId>scala-libraryartifactId>
            <version>${scala.version}version>
        dependency>

        
        <dependency>
            <groupId>org.apache.sparkgroupId>
            <artifactId>spark-core_2.11artifactId>
            <version>${spark.version}version>
        dependency>

        
        <dependency>
            <groupId>org.apache.sparkgroupId>
            <artifactId>spark-sql_2.11artifactId>
            <version>${spark.version}version>
        dependency>

        
        <dependency>
            <groupId>org.apache.sparkgroupId>
            <artifactId>spark-streaming_2.11artifactId>
            <version>${spark.version}version>
        dependency>

        <dependency>
            <groupId>org.apache.sparkgroupId>
            <artifactId>spark-streaming-kafka-0-10_2.11artifactId>
            <version>${spark.version}version>
        dependency>

        
        <dependency>
            <groupId>org.apache.hadoopgroupId>
            <artifactId>hadoop-clientartifactId>
            <version>${hadoop.version}version>
        dependency>

        
        <dependency>
            <groupId>org.apache.hadoopgroupId>
            <artifactId>hadoop-clientartifactId>
            <version>${hadoop.version}version>
        dependency>

        <dependency>
            <groupId>com.alibabagroupId>
            <artifactId>fastjsonartifactId>
            <version>1.2.39version>
        dependency>

    dependencies>

    <build>
        <pluginManagement>
            <plugins>
                
                <plugin>
                    <groupId>net.alchim31.mavengroupId>
                    <artifactId>scala-maven-pluginartifactId>
                    <version>3.2.2version>
                plugin>
                
                <plugin>
                    <groupId>org.apache.maven.pluginsgroupId>
                    <artifactId>maven-compiler-pluginartifactId>
                    <version>3.5.1version>
                plugin>
            plugins>
        pluginManagement>
        <plugins>
            <plugin>
                <groupId>net.alchim31.mavengroupId>
                <artifactId>scala-maven-pluginartifactId>
                <executions>
                    <execution>
                        <id>scala-compile-firstid>
                        <phase>process-resourcesphase>
                        <goals>
                            <goal>add-sourcegoal>
                            <goal>compilegoal>
                        goals>
                    execution>
                    <execution>
                        <id>scala-test-compileid>
                        <phase>process-test-resourcesphase>
                        <goals>
                            <goal>testCompilegoal>
                        goals>
                    execution>
                executions>
            plugin>

            <plugin>
                <groupId>org.apache.maven.pluginsgroupId>
                <artifactId>maven-compiler-pluginartifactId>
                <executions>
                    <execution>
                        <phase>compilephase>
                        <goals>
                            <goal>compilegoal>
                        goals>
                    execution>
                executions>
            plugin>

            
            <plugin>
                <groupId>org.apache.maven.pluginsgroupId>
                <artifactId>maven-shade-pluginartifactId>
                <version>2.4.3version>
                <executions>
                    <execution>
                        <phase>packagephase>
                        <goals>
                            <goal>shadegoal>
                        goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SFexclude>
                                        <exclude>META-INF/*.DSAexclude>
                                        <exclude>META-INF/*.RSAexclude>
                                    excludes>
                                filter>
                            filters>
                        configuration>
                    execution>
                executions>
            plugin>
        plugins>
    build>

project>

你可能感兴趣的:(spark)