SparkStreaming + Kafka + Flume Hands-On Example

Project overview: Flume monitors the log output arriving on a given port and pushes everything it captures to Kafka; the data then becomes available for consumption from Kafka, where Spark Streaming receives it and displays the received records on the console in real time.

1. pom.xml

<!-- scala -->
<dependency>
    <groupId>org.scala-lang</groupId>
    <artifactId>scala-library</artifactId>
    <version>${scala.version}</version>
</dependency>

<!-- spark -->
<!-- spark-core -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>${spark.version}</version>
</dependency>
<!-- spark-sql -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>${spark.version}</version>
</dependency>
<!-- spark-stream -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>${spark.version}</version>
</dependency>
<!-- stream-kafka -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
    <version>${spark.version}</version>
</dependency>
<!-- hadoop -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>${hadoop.version}</version>
</dependency>

<!-- kafka -->
<dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka-clients</artifactId>
    <version>${kafka.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka-streams</artifactId>
    <version>${kafka.version}</version>
</dependency>

<!-- flume -->
<dependency>
    <groupId>org.apache.flume</groupId>
    <artifactId>flume-ng-core</artifactId>
    <version>${flume.version}</version>
</dependency>
<!-- flume-log -->
<dependency>
    <groupId>org.apache.flume.flume-ng-clients</groupId>
    <artifactId>flume-ng-log4jappender</artifactId>
    <version>${flume.version}</version>
</dependency>

<!-- commons-pool2 -->
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-pool2</artifactId>
    <version>2.4.2</version>
</dependency>
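
The version placeholders above (${scala.version}, ${spark.version}, ${hadoop.version}, ${kafka.version}, ${flume.version}) are not defined in this snippet; they belong in the POM's <properties> block. A minimal sketch, where the concrete version numbers are assumptions chosen to match the _2.11 / kafka-0-10 artifacts and the flume-1.8.0 install path used below:

<properties>
    <!-- Assumed versions; align them with your actual cluster. -->
    <scala.version>2.11.8</scala.version>
    <spark.version>2.2.0</spark.version>
    <hadoop.version>2.7.3</hadoop.version>
    <kafka.version>0.10.0.1</kafka.version>
    <flume.version>1.8.0</flume.version>
</properties>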

2. flume-kafka.conf

# Name the components on this agent
alog.sources=avro-source
alog.channels=logger-channel
alog.sinks=log-sink

# Describe/configure the source
alog.sources.avro-source.type=avro
alog.sources.avro-source.bind=hadoop01
alog.sources.avro-source.port=41414

# Use a channel which buffers events in memory
alog.channels.logger-channel.type=memory

# Describe the sink
alog.sinks.log-sink.type=org.apache.flume.sink.kafka.KafkaSink
alog.sinks.log-sink.kafka.topic = logTop
alog.sinks.log-sink.kafka.bootstrap.servers = hadoop01:9092
alog.sinks.log-sink.kafka.flumeBatchSize = 20
alog.sinks.log-sink.kafka.producer.acks = 1

# Bind the source and sink to the channel
alog.sources.avro-source.channels=logger-channel
alog.sinks.log-sink.channel=logger-channel
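
The memory channel above runs with Flume's default sizing. If the producer bursts faster than the Kafka sink can drain, the channel can fill up; its standard sizing properties can be raised, for example (the values below are assumptions to tune for your load, with transactionCapacity kept above the sink's flumeBatchSize):

alog.channels.logger-channel.capacity = 10000
alog.channels.logger-channel.transactionCapacity = 1000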

3. log4j.properties

log4j.rootLogger=info,console,flume
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d %p [%c] - %m%n

log4j.appender.flume = org.apache.flume.clients.log4jappender.Log4jAppender
log4j.appender.flume.Hostname = hadoop01
log4j.appender.flume.Port = 41414
log4j.appender.flume.UnsafeMode = true

4. Set up the runtime environment

# start ZooKeeper
zkServer.sh start

# start Kafka
kafka-server-start.sh /usr/local/kafka/config/server.properties &

# start Flume
flume-ng agent \
--conf /usr/local/flume-1.8.0/conf/ \
--conf-file /usr/local/flume-1.8.0/conf/flume-kafka.conf \
--name alog \
-Dflume.root.logger=INFO,console

# start a Kafka console consumer to verify the pipeline
kafka-console-consumer.sh --zookeeper hadoop01:2181 --topic logTop
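
Depending on the broker's auto-create setting, the logTop topic may need to be created before Flume can write to it. A minimal sketch, assuming the same ZooKeeper-based Kafka tooling as the console consumer above (partition and replication counts are placeholders):

kafka-topics.sh --create --zookeeper hadoop01:2181 \
--replication-factor 1 --partitions 1 --topic logTop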

5. LoggerProducer ( generates the logs )

package cn.practicea;

import org.apache.log4j.Logger;

/**
 * Emits one log record per second. With the log4j.properties from step 3 on the
 * classpath, each record is also forwarded to the Flume avro source on port 41414.
 */
public class LoggerProducer {
    private static Logger logger = Logger.getLogger(LoggerProducer.class.getName());

    public static void main(String[] args) throws Exception {
        int index = 0;
        while (true) {
            Thread.sleep(1000);                // one record per second
            logger.info("index : " + index++); // printed to the console and sent to Flume
        }
    }
}
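
One way to run the producer so that the log4j.properties from step 3 and the flume-ng-log4jappender dependency are both on the classpath is the exec-maven-plugin (an assumption about your setup; running the class from an IDE works just as well):

mvn compile exec:java -Dexec.mainClass=cn.practicea.LoggerProducer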

6. LoggerStreaming ( receives the data consumed from Kafka )

package cn.practicea

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object LoggerStreaming {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName(this.getClass.getSimpleName)
    val ssc = new StreamingContext(conf, Seconds(3))

    val topic = Array("logTop")
    val map = Map(
      "bootstrap.servers" -> "hadoop01:9092,hadoop02:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "test-consumer-group01",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](topic, map))

    // Count identical log lines within each 3-second batch and print the counts
    // (with local[*] the executors run in-process, so the output appears in the console).
    stream.foreachRDD(rdd => {
      rdd.map(t => (t.value(), 1)).reduceByKey(_ + _).foreach(println)
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
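
Because enable.auto.commit is false, the offsets consumed above are never committed back to Kafka, so a restart resumes from auto.offset.reset rather than from where the job left off. If the group's progress should be persisted, the kafka-0-10 integration allows committing the processed offset ranges manually. A sketch of how the foreachRDD block could be extended (same stream as above; adds the HasOffsetRanges and CanCommitOffsets imports):

import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges}

stream.foreachRDD(rdd => {
  // Capture the offset ranges of this batch before any shuffle changes the partitioning.
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

  rdd.map(t => (t.value(), 1)).reduceByKey(_ + _).foreach(println)

  // Commit the offsets asynchronously once the batch has been processed.
  stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
})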
