telnet node03 44444
发送数据
$ pwd
$ tar -zxvf apache-flume-1.9.0-bin.tar.gz -C /bigdata/install/
# 修改配置文件
$ cd /bigdata/install/apache-flume-1.9.0-bin/conf/
$ cp flume-env.sh.template flume-env.sh
$ vim flume-env.sh
export JAVA_HOME=/usr/apps/jdk1.8.0_241
$ pwd
$ rm -rf lib/guava-11.0.2.jar
$ cp /bigdata/install/hadoop-3.1.4/share/hadoop/common/lib/guava-27.0-jre.jar lib/
$ pwd
$ vim conf/netcat-logger.conf
# 1. 定义这个agent中各组件的名字
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# 2. 描述和配置source组件:r1
a1.sources.r1.type = netcat
# 当前节点的ip地址
a1.sources.r1.bind = node03
a1.sources.r1.port = 44444
# 3. 描述和配置sink组件:k1
a1.sinks.k1.type = logger
# 4. 描述和配置channel组件,此处使用是内存缓存的方式
a1.channels.c1.type = memory
# channel中存储的event的最大个数
a1.channels.c1.capacity = 1000
# channel每次从source获得的event最多个数或一次发往sink的event最多个数
a1.channels.c1.transactionCapacity = 100
# 5. 描述和配置source channel sink之间的连接关系
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
$ pwd
$ bin/flume-ng agent -c conf -f conf/netcat-logger.conf -n a1 -Dflume.root.logger=INFO,console
-c conf 指定flume自身的conf目录中的配置文件
-f conf/netcat-logger.conf 指定我们所描述的采集方案
-n a1 指定我们这个agent的名字
-Dflume.root.logger=INFO,console 将info级别的日志打印到控制台
sudo yum -y install telnet
telnet node03 44444 # 使用telnet模拟数据发送
vim conf/spooldir.conf,内容如下:
# 1. Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# 2. Describe/configure the source
# 注意:不能往监控目录中重复丢同名文件
a1.sources.r1.type = spooldir
# 监控的路径
a1.sources.r1.spoolDir = /bigdata/install/flumedatas
# Whether to add a header storing the absolute path filename
# 文件绝对路径放到header
a1.sources.r1.fileHeader = true
# 3. Describe the sink
a1.sinks.k1.type = hdfs
a1.sinks.k1.channel = c1
# 采集到的数据写入到此路径
a1.sinks.k1.hdfs.path = hdfs://node01:8020/spooldir/files/%y-%m-%d/%H%M/
# 指定在hdfs上生成的文件名前缀
a1.sinks.k1.hdfs.filePrefix = events-
# timestamp向下取整round down
a1.sinks.k1.hdfs.round = true
# 以10分钟为单位向下取整;如55分取整为50分,38分取整为30分
a1.sinks.k1.hdfs.roundValue = 10
# round的单位
a1.sinks.k1.hdfs.roundUnit = minute
# 每3秒滚动生成一个文件;默认30;(0 = never roll based on time interval)
a1.sinks.k1.hdfs.rollInterval = 3
# 每x字节,滚动生成一个文件;默认1024;(0: never roll based on file size)
a1.sinks.k1.hdfs.rollSize = 20
# 每x个event,滚动生成一个文件;默认10; (0 = never roll based on number of events)
a1.sinks.k1.hdfs.rollCount = 5
# 每x个event,flush到hdfs
a1.sinks.k1.hdfs.batchSize = 1
# 使用本地时间
a1.sinks.k1.hdfs.useLocalTimeStamp = true
# 生成的文件类型,默认是Sequencefile;可选DataStream,则为普通文本;可选CompressedStream压缩数据
a1.sinks.k1.hdfs.fileType = DataStream
# 4. Use a channel which buffers events in memory
a1.channels.c1.type = memory
# channel中存储的event的最大数目
a1.channels.c1.capacity = 1000
# 每次传输数据,从source最多获得event的数目或向sink发送的event的最大的数目
a1.channels.c1.transactionCapacity = 100
# 5. Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
bin/flume-ng agent -c conf -f conf/spooldir.conf -n a1 -Dflume.root.logger=INFO,console
cd /bigdata/install/flumedatas
# vim a.txt 加入如下内容
ab cd ef
english math
hadoop alibaba
向监控目录重复放入同名文件,就会报错:
cp a.txt /bigdata/install/flumedatas
exec 'tail -F file'
监控文件内容更新
vim conf/tail-file.conf,内容如下:
# 1. Name the components on this agent
agent1.sources = source1
agent1.sinks = sink1
agent1.channels = channel1
# 2. Describe/configure tail -F source1
agent1.sources.source1.type = exec
agent1.sources.source1.command = tail -F /bigdata/install/flumedatas/taillogs/access_log
agent1.sources.source1.channels = channel1
# 3. Describe sink
agent1.sinks.sink1.type = hdfs
agent1.sinks.sink1.hdfs.path = hdfs://node01:8020/weblog/flume-collection/%y-%m-%d/%H-%M
agent1.sinks.sink1.hdfs.filePrefix = access_log
# 允许打开的文件数;如果超出5000,老文件会被关闭
agent1.sinks.sink1.hdfs.maxOpenFiles = 5000
agent1.sinks.sink1.hdfs.batchSize= 100
agent1.sinks.sink1.hdfs.fileType = DataStream
agent1.sinks.sink1.hdfs.writeFormat =Text
agent1.sinks.sink1.hdfs.rollSize = 102400
agent1.sinks.sink1.hdfs.rollCount = 1000000
agent1.sinks.sink1.hdfs.rollInterval = 60
agent1.sinks.sink1.hdfs.round = true
agent1.sinks.sink1.hdfs.roundValue = 10
agent1.sinks.sink1.hdfs.roundUnit = minute
agent1.sinks.sink1.hdfs.useLocalTimeStamp = true
# 4. Use a channel which buffers events in memory
agent1.channels.channel1.type = memory
# 向channel添加一个event或从channel移除一个event的超时时间
agent1.channels.channel1.keep-alive = 120
agent1.channels.channel1.capacity = 500000
agent1.channels.channel1.transactionCapacity = 600
# 5. Bind the source and sink to the channel
agent1.sources.source1.channels = channel1
agent1.sinks.sink1.channel = channel1
bin/flume-ng agent -c conf -f conf/tail-file.conf -n agent1 -Dflume.root.logger=INFO,console
vim tail-file.sh
#!/bin/bash
while true
do
    date >> /bigdata/install/flumedatas/taillogs/access_log;
    sleep 0.5;
done
chmod u+x tail-file.sh
vim conf/tail-dir.conf,内容如下:
# 1. Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# 2. Describe/configure the source
a1.sources.r1.type = TAILDIR
# 以json格式,记录读取的每个文件及读取的position
a1.sources.r1.positionFile = /bigdata/install/apache-flume-1.9.0-bin/taildir_position.json
# 每个filegroup代表一系列待tail的文件
a1.sources.r1.filegroups = f1
# 指定filegroup的绝对路径
a1.sources.r1.filegroups.f1 = /bigdata/install/flumedatas/dirfile/.*log.*
# 此项用于控制从同一个文件连续读取数据的批次数;比如有A、B、C多个文件,如果向A文件写入的频率非常高,会导致一直循环地从A中采集数据,而B、C的数据得不到处理;此时可将此值调低;每个批次读取的行数由属性batchSize控制,默认100行
a1.sources.r1.maxBatchCount = 1000
# 3. Describe the sink
a1.sinks.k1.type = hdfs
a1.sinks.k1.channel = c1
a1.sinks.k1.hdfs.path = hdfs://node01:8020/taildir/files/%y-%m-%d/%H%M/
a1.sinks.k1.hdfs.filePrefix = events-
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 10
a1.sinks.k1.hdfs.roundUnit = minute
a1.sinks.k1.hdfs.rollInterval = 3
a1.sinks.k1.hdfs.rollSize = 5000
a1.sinks.k1.hdfs.rollCount = 50000
# 每x个event flush到hdfs;该值不能大于channel的transactionCapacity(此处为100),否则Flume 1.9启动sink时会报错
a1.sinks.k1.hdfs.batchSize = 100
a1.sinks.k1.hdfs.useLocalTimeStamp = true
a1.sinks.k1.hdfs.fileType = DataStream
# 4. Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# 5.Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
bin/flume-ng agent -c conf -f conf/tail-dir.conf -n a1 -Dflume.root.logger=INFO,console
echo "Hello World" >> /bigdata/install/flumedatas/dirfile/file.log
echo "How are you" >> /bigdata/install/flumedatas/dirfile/file1.log
echo "How old are you" >> /bigdata/install/flumedatas/dirfile/file2.log
文件中记录的内容
$ cat /bigdata/install/apache-flume-1.9.0-bin/taildir_position.json
scp -r apache-flume-1.9.0-bin/ node02:$PWD
vim conf/tail-avro-logger.conf,内容如下
# 1. Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# 2. Describe/configure the source
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /bigdata/install/flumedatas/taillogs/access_log
a1.sources.r1.channels = c1
# 3. Describe the sink
# sink端的avro是一个数据发送者
a1.sinks.k1.type = avro
a1.sinks.k1.channel = c1
a1.sinks.k1.hostname = node03
a1.sinks.k1.port = 4141
# 每一批次发送的event的数目
a1.sinks.k1.batch-size = 10
# 4. Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# 5. Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
vim conf/avro-hdfs.conf,内容如下
# 1. Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# 2. Describe/configure the source
# source中的avro组件是一个接收者服务
a1.sources.r1.type = avro
a1.sources.r1.channels = c1
a1.sources.r1.bind = node03
a1.sources.r1.port = 4141
# 3. Describe the sink
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = hdfs://node01:8020/avro/hdfs/%y-%m-%d/%H%M/
a1.sinks.k1.hdfs.filePrefix = events-
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 10
a1.sinks.k1.hdfs.roundUnit = minute
a1.sinks.k1.hdfs.rollInterval = 3
a1.sinks.k1.hdfs.rollSize = 200
a1.sinks.k1.hdfs.rollCount = 5
a1.sinks.k1.hdfs.batchSize = 1
a1.sinks.k1.hdfs.useLocalTimeStamp = true
# 生成的文件类型,默认是Sequencefile,可用DataStream,则为普通文本
a1.sinks.k1.hdfs.fileType = DataStream
# 4. Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# 5. Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
# node03
bin/flume-ng agent -c conf -f conf/avro-hdfs.conf -n a1 -Dflume.root.logger=INFO,console
# node02
bin/flume-ng agent -c conf -f conf/tail-avro-logger.conf -n a1 -Dflume.root.logger=INFO,console
# node03
$ pwd
$ scp -r tail-file.sh node02:$PWD
# node02
| 名称 | HOST | 角色 |
| --- | --- | --- |
| Agent1 | node01 | Web Server |
| Collector1 | node02 | AgentMstr1 |
| Collector2 | node03 | AgentMstr2 |
# node01
mkdir -p /bigdata/install/flumedatas/taillogs
# node03
cd /bigdata/install
scp -r apache-flume-1.9.0-bin/ node01:$PWD
$ pwd
$ scp -r tail-file.sh node01:$PWD
vim conf/agent.conf,内容如下
# agent1 name
agent1.channels = c1
agent1.sources = r1
agent1.sinks = k1 k2
## set group
agent1.sinkgroups = g1
## set channel
agent1.channels.c1.type = memory
agent1.channels.c1.capacity = 1000
agent1.channels.c1.transactionCapacity = 100
# 配置source
agent1.sources.r1.channels = c1
agent1.sources.r1.type = exec
agent1.sources.r1.command = tail -F /bigdata/install/flumedatas/taillogs/access_log
# interceptor 拦截器;与source结合,对event进行修改或丢弃
agent1.sources.r1.interceptors = i1 i2
# 静态拦截器在所有的event的header中,增加一个kv对,key是下边属性key对应的值,value是属性value对应的值
agent1.sources.r1.interceptors.i1.type = static
# 被创建的header的名字
agent1.sources.r1.interceptors.i1.key = Type
# 静态的值;key与value对应
agent1.sources.r1.interceptors.i1.value = LOGIN
# timestamp拦截器对event的header中增加kv对,key是timestamp,value是对应的时间戳的值
agent1.sources.r1.interceptors.i2.type = timestamp
## set sink1
agent1.sinks.k1.channel = c1
agent1.sinks.k1.type = avro
agent1.sinks.k1.hostname = node02
agent1.sinks.k1.port = 52020
## set sink2
agent1.sinks.k2.channel = c1
agent1.sinks.k2.type = avro
agent1.sinks.k2.hostname = node03
agent1.sinks.k2.port = 52020
## set sink group
agent1.sinkgroups.g1.sinks = k1 k2
## sink processor处理器;可用于sink的负载均衡或故障转移
agent1.sinkgroups.g1.processor.type = failover
# priority值越大的sink优先级越高,会被优先使用;各sink的priority值必须唯一、不能重复
agent1.sinkgroups.g1.processor.priority.k1 = 10
agent1.sinkgroups.g1.processor.priority.k2 = 1
# maxpenalty 对于故障的节点最大的黑名单时间 (in millis 毫秒)
agent1.sinkgroups.g1.processor.maxpenalty = 10000
vim conf/collector.conf,内容相同
# set Agent name
a1.sources = r1
a1.channels = c1
a1.sinks = k1
## set channel
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
## set source
a1.sources.r1.type = avro
# bind为必填项;0.0.0.0表示监听本机所有网卡,也可改为当前节点的主机名(node02或node03)
a1.sources.r1.bind = 0.0.0.0
a1.sources.r1.port = 52020
a1.sources.r1.channels = c1
# 拦截器
a1.sources.r1.interceptors = i1
#a1.sources.r1.interceptors.i1.type = static
#a1.sources.r1.interceptors.i1.key = Collector
#a1.sources.r1.interceptors.i1.value = node02
# 在header中添加的kv对的key默认是host
a1.sources.r1.interceptors.i1.type = host
## set sink to hdfs
a1.sinks.k1.type = hdfs
a1.sinks.k1.channel = c1
# host拦截器写入header的key默认是host,因此路径中用%{host}引用
a1.sinks.k1.hdfs.path= hdfs://node01:8020/flume/failover/%{host}
# node03
bin/flume-ng agent -n a1 -c conf -f conf/collector.conf -Dflume.root.logger=DEBUG,console
# node02
bin/flume-ng agent -n a1 -c conf -f conf/collector.conf -Dflume.root.logger=DEBUG,console
# node01
bin/flume-ng agent -n agent1 -c conf -f conf/agent.conf -Dflume.root.logger=DEBUG,console
$ pwd
$ ./tail-file.sh