The application server produces logs through log4j; Flume collects the log messages on a hostname/port and forwards them to Kafka with a KafkaSink; Kafka hands the data to Spark Streaming (via one of its two integration modes, receiver-based or direct), which performs the statistics and analysis for each business dimension and writes the results to a database.
First, configure a Flume agent (streaming.conf) with an Avro source, a memory channel, and a logger sink, so the flow can be verified on the console:
agent1.sources=avro-source
agent1.channels=logger-channel
agent1.sinks=log-sink
#define source
agent1.sources.avro-source.type=avro
agent1.sources.avro-source.bind=192.168.6.130
agent1.sources.avro-source.port=41414
#define channel
agent1.channels.logger-channel.type=memory
#define sink
agent1.sinks.log-sink.type=logger
agent1.sources.avro-source.channels=logger-channel
agent1.sinks.log-sink.channel=logger-channel
Start the Flume agent:
flume-ng agent \
--conf $FLUME_HOME/conf \
--conf-file $FLUME_HOME/conf/streaming.conf \
--name agent1 \
-Dflume.root.logger=INFO,console
Add the log4j appender dependency to the project, then configure log4j.properties:
<dependency>
    <groupId>org.apache.flume.flume-ng-clients</groupId>
    <artifactId>flume-ng-log4jappender</artifactId>
    <version>1.6.0</version>
</dependency>
# Logs at INFO level and above go to both the console and Flume
log4j.rootLogger=INFO,stdout,flume
log4j.appender.stdout = org.apache.log4j.ConsoleAppender
log4j.appender.stdout.target = System.out
log4j.appender.stdout.layout = org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern = %d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c] [%p] - %m%n
log4j.appender.flume = org.apache.flume.clients.log4jappender.Log4jAppender
log4j.appender.flume.Hostname = 192.168.6.130
log4j.appender.flume.Port = 41414
# If sending a message fails, do not throw an exception
log4j.appender.flume.UnsafeMode = true
With this in place, starting the program makes the corresponding log messages appear on the Flume agent's console.
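The program that produces these logs, the LoggerGenerator referred to later, is not listed in this section; a minimal sketch of what it could look like, assuming log4j 1.x with the log4j.properties above on the classpath (the message format is illustrative):

import org.apache.log4j.Logger

// Emits one INFO log line per second; the flume appender configured above ships every line to the agent
object LoggerGenerator {

  private val logger = Logger.getLogger(LoggerGenerator.getClass.getName)

  def main(args: Array[String]): Unit = {
    var index = 0
    while (true) {
      Thread.sleep(1000)
      logger.info("current value is: " + index)
      index += 1
    }
  }
}

Running it should print a new event on the Flume console roughly once per second.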
Next, start Kafka and create the topic that the KafkaSink will write to.
# Start ZooKeeper
zkServer.sh start
# Start Kafka
kafka-server-start.sh -daemon /home/Kiku/app/kafka_2.11-0.9.0.0/config/server.properties
# Create the topic
kafka-topics.sh --create --zookeeper hadoop000:2181 --replication-factor 1 --partitions 1 --topic test-topic
Then switch the Flume sink from the logger sink to a KafkaSink (streaming2.conf):
agent1.sources=avro-source
agent1.channels=logger-channel
agent1.sinks=kafka-sink
#define source
agent1.sources.avro-source.type=avro
agent1.sources.avro-source.bind=192.168.6.130
agent1.sources.avro-source.port=41414
#define channel
agent1.channels.logger-channel.type=memory
#define sink
agent1.sinks.kafka-sink.type=org.apache.flume.sink.kafka.KafkaSink
agent1.sinks.kafka-sink.topic=test-topic
agent1.sinks.kafka-sink.brokerList=hadoop000:9092
agent1.sinks.kafka-sink.requiredAcks=1
agent1.sinks.kafka-sink.batchSize=20
agent1.sources.avro-source.channels=logger-channel
agent1.sinks.kafka-sink.channel=logger-channel
Start the Flume agent with the new configuration:
flume-ng agent \
--conf $FLUME_HOME/conf \
--conf-file $FLUME_HOME/conf/streaming2.conf \
--name agent1 \
-Dflume.root.logger=INFO,console
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils

/**
 * Spark Streaming consuming from Kafka (receiver-based approach)
 */
object KafkaStreamingApp {

  def main(args: Array[String]): Unit = {
    // The local machine has no Hadoop environment configured, so point hadoop.home.dir at winutils
    System.setProperty("hadoop.home.dir", "E:/winutils/")

    if (args.length != 4) {
      System.err.println("Usage: KafkaStreamingApp <zkQuorum> <group> <topics> <numThreads>")
      System.exit(1)
    }

    val Array(zkQuorum, group, topics, numThreads) = args

    val sparkConf = new SparkConf()
      .setMaster("local[2]").setAppName("KafkaStreamingApp")
    val ssc = new StreamingContext(sparkConf, Seconds(5))

    val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap

    // Receiver-based integration: connect to Kafka through the ZooKeeper quorum
    val messages = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap)

    // The second element of each (key, value) tuple is the message body; count it per batch and print
    messages.map(_._2).count().print()

    ssc.start()
    ssc.awaitTermination()
  }
}
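To run this locally, the four program arguments would be something like hadoop000:2181 test-group test-topic 1 (the consumer group name is arbitrary; the topic is the one created earlier).

KafkaStreamingApp uses the receiver-based mode. The second of the two modes mentioned at the start is the direct (receiver-less) approach; a rough sketch against the same Spark/Kafka versions is below (the object name DirectKafkaStreamingApp is made up for illustration, the Spark 1.x spark-streaming-kafka artifact is assumed, and it takes the Kafka broker list, e.g. hadoop000:9092, instead of the ZooKeeper quorum):

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils

object DirectKafkaStreamingApp {
  def main(args: Array[String]): Unit = {
    if (args.length != 2) {
      System.err.println("Usage: DirectKafkaStreamingApp <brokers> <topics>")
      System.exit(1)
    }
    val Array(brokers, topics) = args

    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("DirectKafkaStreamingApp")
    val ssc = new StreamingContext(sparkConf, Seconds(5))

    // Direct approach: Spark Streaming queries the brokers for offsets itself, no receiver and no ZooKeeper quorum
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
    val topicSet = topics.split(",").toSet
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicSet)

    messages.map(_._2).count().print()

    ssc.start()
    ssc.awaitTermination()
  }
}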
For local testing, first run LoggerGenerator to produce log messages; Flume receives the data on the configured hostname and port and passes it on to Kafka in batches; in the application, Spark Streaming consumes the data from Kafka.
In production, package LoggerGenerator as a jar; Flume and Kafka are set up in the same way; the Spark Streaming code is also packaged as a jar and submitted to YARN with spark-submit. (The run mode, local/yarn/standalone, can be chosen to fit the environment.)
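When the jar goes to YARN, the hard-coded setMaster("local[2]") in the code above should be removed (or overridden) and the master passed on the command line instead. A sketch of what the submit command could look like, assuming the application is packaged as sparktrain-1.0.jar (the jar name, path, and artifact version are illustrative) and the spark-streaming-kafka artifact matching the cluster's Spark version is pulled in with --packages:

spark-submit \
--master yarn \
--name KafkaStreamingApp \
--class KafkaStreamingApp \
--packages org.apache.spark:spark-streaming-kafka_2.11:1.6.1 \
/home/Kiku/lib/sparktrain-1.0.jar \
hadoop000:2181 test-group test-topic 1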