1. Log collection: receive data from a network port and sink it to a logger
File netcat-logger.conf:
# Name the components on this agent
# Give the three components names
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
# netcat source: receives data from a network port; the agent runs on this machine, hence bind to localhost
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444

# Describe the sink
a1.sinks.k1.type = logger

# Use a channel which buffers events in memory
# Events are handed to the sink batch by batch. Channel parameters:
# capacity: maximum number of events the channel can store
# transactionCapacity: maximum number of events taken from the source or given to the sink per transaction
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
Start command:
# Tell flume-ng to start an agent with the specified configuration; --name: the agent's name
flume-ng agent --conf conf --conf-file conf/netcat-logger.conf --name a1 -Dflume.root.logger=INFO,console
Send in some data:
[root@mini03 ~]# telnet localhost 44444
Trying ::1...
telnet: connect to address ::1: Connection refused
Trying 127.0.0.1...
Connected to localhost.
Escape character is '^]'.
hello world!^H^H^H^H^H^H^H^H^H^H^H^H^H^H
OK
tianjun2012!
OK
Data seen on the console:
2017-05-08 13:41:35,766 (SinkRunner-PollingRunner-DefaultSinkProcessor) [INFO - org.apache.flume.sink.LoggerSink.process(LoggerSink.java:94)] Event: { headers:{} body: 68 65 6C 6C 6F 20 77 6F 72 6C 64 21 08 08 08 08 hello world!.... }
2017-05-08 13:41:40,153 (SinkRunner-PollingRunner-DefaultSinkProcessor) [INFO - org.apache.flume.sink.LoggerSink.process(LoggerSink.java:94)] Event: { headers:{} body: 74 69 61 6E 6A 75 6E 32 30 31 32 21 0D tianjun2012!. }
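If telnet is not installed, nc works just as well for feeding the netcat source (a minimal sketch; assumes the nc package is available):

echo "hello from nc" | nc localhost 44444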
2. Monitoring a directory
Start command:
bin/flume-ng agent -c ./conf -f ./conf/spooldir-logger.conf -n a1 -Dflume.root.logger=INFO,console
Test: move files into /home/hadoop/flumespool (e.g. mv ./xxxFile /home/hadoop/flumespool), but do not create or write files directly inside that directory.
File spooldir-logger.conf:

# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
# Watch a directory: spoolDir sets the directory to monitor;
# fileHeader controls whether to add a header with the file's absolute path
a1.sources.r1.type = spooldir
a1.sources.r1.spoolDir = /home/hadoop/flumespool
a1.sources.r1.fileHeader = true

# Describe the sink
a1.sinks.k1.type = logger

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
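A quick way to verify ingestion: the spooldir source renames each fully ingested file by appending a suffix (.COMPLETED by default). A minimal sketch, with an illustrative file name:

# Create the file elsewhere, then move it in atomically;
# spooldir requires files to be immutable once they appear.
echo "hello spooldir" > /tmp/a.log
mv /tmp/a.log /home/hadoop/flumespool/
# After ingestion Flume renames it:
ls /home/hadoop/flumespool/    # expect: a.log.COMPLETED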
3. Use the tail command to capture data and sink it to HDFS
File tail-hdfs.conf:

# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /home/hadoop/log/test.log

# Describe the sink
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = hdfs://mini01:9000/flume/events/%y-%m-%d/%H%M/
a1.sinks.k1.hdfs.filePrefix = events-
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 10
a1.sinks.k1.hdfs.roundUnit = minute
a1.sinks.k1.hdfs.rollInterval = 3
a1.sinks.k1.hdfs.rollSize = 20
a1.sinks.k1.hdfs.rollCount = 5
a1.sinks.k1.hdfs.batchSize = 1
a1.sinks.k1.hdfs.useLocalTimeStamp = true
# File type of the generated files; the default is SequenceFile, use DataStream for plain text
a1.sinks.k1.hdfs.fileType = DataStream

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
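With round = true, roundValue = 10 and roundUnit = minute, the %H%M escapes in hdfs.path are rounded down to the nearest 10 minutes, so an event arriving at, say, 15:37 is written under hdfs://mini01:9000/flume/events/17-05-08/1530/, which is exactly the directory queried in the test below.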
Start command:
flume-ng agent -c conf -f conf/tail-hdfs.conf -n a1
Simulate writing to the log:
[root@mini03 log]# i=1;
while (( $i <= 500000 ));
do echo $i >> /home/hadoop/log/test.log;
sleep 0.5;
let 'i++';
done
View the file contents on HDFS:
[root@mini01 ~]# hdfs dfs -cat /flume/events/17-05-08/1530/*
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
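Listing the directory shows the generated files as well; names carry the events- prefix configured above, and a file Flume is still writing carries a temporary suffix (.tmp by default):

hdfs dfs -ls /flume/events/17-05-08/1530/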
Note: in this example the following values were set very small so the effect shows up quickly; in a real deployment they need to be adjusted:
a1.sinks.k1.hdfs.roundValue = 10
a1.sinks.k1.hdfs.rollInterval = 3
a1.sinks.k1.hdfs.rollSize = 20
a1.sinks.k1.hdfs.rollCount = 5
a1.sinks.k1.hdfs.batchSize = 1
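The three roll triggers race against each other: the current file is closed and a new one opened as soon as rollInterval seconds pass, rollSize bytes are written, or rollCount events are received, whichever comes first; setting a trigger to 0 disables it, as the production config below does with rollCount = 0.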
Below is a configuration from a real environment:
agent1.sources = spooldirSource
agent1.channels = fileChannel
agent1.sinks = hdfsSink
agent1.sources.spooldirSource.type=spooldir
agent1.sources.spooldirSource.spoolDir=/home/hadoop/log
agent1.sources.spooldirSource.channels=fileChannel
agent1.sinks.hdfsSink.type=hdfs
agent1.sinks.hdfsSink.hdfs.path=hdfs://mini01:9000/weblog/flume-input/%y-%m-%d
agent1.sinks.hdfsSink.hdfs.filePrefix=flume-
agent1.sinks.hdfsSink.hdfs.round = true
# Number of seconds to wait before rolling current file (0 = never roll based on time interval)
agent1.sinks.hdfsSink.hdfs.rollInterval = 3600
# File size to trigger roll, in bytes (0: never roll based on file size)
agent1.sinks.hdfsSink.hdfs.rollSize = 128000000
agent1.sinks.hdfsSink.hdfs.rollCount = 0
agent1.sinks.hdfsSink.hdfs.batchSize = 1000
#Rounded down to the highest multiple of this (in the unit configured using hdfs.roundUnit), less than current time.
agent1.sinks.hdfsSink.hdfs.roundValue = 1
agent1.sinks.hdfsSink.hdfs.roundUnit = minute
agent1.sinks.hdfsSink.hdfs.useLocalTimeStamp = true
agent1.sinks.hdfsSink.channel=fileChannel
agent1.sinks.hdfsSink.hdfs.fileType = DataStream
agent1.channels.fileChannel.type = file
agent1.channels.fileChannel.checkpointDir=/tmp/flume/flume-checkpoint
agent1.channels.fileChannel.dataDirs=/tmp/flume/dataDir
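Unlike the memory channel used in the examples above, the file channel persists events in checkpointDir and dataDirs, so buffered events survive an agent restart. The /tmp paths here are for demonstration only; in production choose directories that survive reboots.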
bin/flume-ng agent --conf ./conf/ -f conf/spooldir-hdfs.conf -Dflume.root.logger=DEBUG,console -n agent1 > log.log 2>&1 &
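Since stdout and stderr are redirected to log.log and the process is backgrounded, the agent's startup output can be watched with:

tail -f log.log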