##1 Avro
An Avro client can send a given file to Flume; the Avro source receives the data over the Avro RPC mechanism.
Create the agent configuration file
In the flume_home/conf directory, create a file named avro.conf with the following content:
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.r1.type = avro
a1.sources.r1.bind = 0.0.0.0
a1.sources.r1.port = 4141
# Describe the sink
a1.sinks.k1.type = logger
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
Explanation of the above:
Naming: a1 is the name of the agent we are going to start.
This works like an alias in SQL: SELECT u.id, u.name FROM user AS u WHERE u.id = 1;
a1.sources = r1 names the agent's source r1
a1.sinks = k1 names the agent's sink k1
a1.channels = c1 names the agent's channel c1
# Describe/configure the source
a1.sources.r1.type = avro sets the type of r1 to avro
a1.sources.r1.bind = 0.0.0.0 binds the source to an IP address (0.0.0.0 listens on all local interfaces)
a1.sources.r1.port = 4141 sets the listening port to 4141
# Describe the sink
a1.sinks.k1.type = logger sets the type of k1 to logger (no file is written; events are only printed to the console)
# Use a channel which buffers events in memory
a1.channels.c1.type = memory sets the channel type to memory
a1.channels.c1.capacity = 1000 sets the maximum number of events the channel can hold to 1000
a1.channels.c1.transactionCapacity = 100 sets the maximum number of events the channel takes from the source, or gives to the sink, per transaction to 100
Other channel properties can also be set here:
a1.channels.c1.keep-alive = 1000 timeout in seconds for adding an event to, or removing one from, the channel
a1.channels.c1.byteCapacity = 800000 limit on the total bytes of all events in the channel, counting only event bodies
a1.channels.c1.byteCapacityBufferPercentage = 20
reserves 20% of byteCapacity as headroom for event headers, so event bodies may occupy at most 80% of byteCapacity (800000 * 80% = 640000 bytes here)
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
binds the source and the sink to channel c1
Start Flume agent a1
flume-ng agent -c flume_home/conf -f flume_home/conf/avro.conf -n a1 -Dflume.root.logger=INFO,console
# -c: the directory containing Flume's configuration files (here the default path, i.e. $FLUME_HOME/conf)
# -f: the configuration file that defines the agent's components
# -n: the name of the agent to start, as defined in the component configuration file
# -Dflume.root.logger: Flume's own runtime logging; set as needed (here INFO-level detail, printed to the console)
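If the agent should keep running after the shell is closed, one minimal sketch is to start it with nohup and redirect the console output to a file (agent.log is a hypothetical name):
nohup flume-ng agent -c flume_home/conf -f flume_home/conf/avro.conf -n a1 \
  -Dflume.root.logger=INFO,console > agent.log 2>&1 &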
Create the file to send
echo "hello world" > /home/data/log.00
Send the file with avro-client
flume-ng avro-client -c flume_home/conf -H min1 -p 4141 -F /home/data/log.00
# -H: the target host
# -p: the target port
# -F: the file to send
On the min1 console you can see the received event logged; note the last line of the output.
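To send several files in one go, the same client can be run in a loop; a sketch assuming the files to send match /home/data/*.log (a hypothetical pattern):
for f in /home/data/*.log; do
  flume-ng avro-client -c flume_home/conf -H min1 -p 4141 -F "$f"
done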
##2 Exec
The Exec source runs a given Unix command (here tail -F) and ingests each line it prints as an event.
Create the agent configuration file
vi /home/bigdata/flume/conf/exec_tail.conf
# Add the following content:
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /home/data/log_exec_tail
# Describe the sink
a1.sinks.k1.type = logger
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
Start Flume agent a1
flume-ng agent -c flume_home/conf -f /home/bigdata/flume/conf/exec_tail.conf -n a1 -Dflume.root.logger=INFO,console
Create the log_exec_tail file
echo "exec tail 1" >> /home/data/log_exec_tail
# On the Flume console you can see the event appear:
Append data to the log_exec_tail file
echo "exec tail 2" >> /home/data/log_exec_tail
# On the Flume console you can see the new event appear:
Write a shell loop that appends 100 lines:
for i in {1..100}
do echo "flume + $i" >> /home/data/log_exec_tail
done
# or, as a one-liner:
for i in $(seq 1 100); do echo "flume + $i" >> /home/data/log_exec_tail; done
##3 Spool
The Spooling Directory source watches the configured directory for new files and reads the data out of them. Two caveats:
1. A file must not be written to after it has been placed in the spool directory (see the sketch below).
2. File names must not be reused: once a file is ingested it is renamed with the .COMPLETED suffix, and dropping another file with the same name causes an error.
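Because of caveat 1, don't build a file in place inside the spool directory; write it elsewhere and move it in, since mv within one filesystem is atomic. A minimal sketch, assuming the spool directory /home/data/logs configured below:
# build the file outside the spool directory
echo "spool test1" > /home/data/spool_text.log.tmp
# atomic rename: Flume never sees a half-written file
mv /home/data/spool_text.log.tmp /home/data/logs/spool_text.log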
Create the agent configuration file
vi /home/bigdata/flume/conf/spool.conf
# Add the following content:
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.r1.type = spooldir
a1.sources.r1.spoolDir = /home/data/logs
a1.sources.r1.fileHeader = true
# Describe the sink
a1.sinks.k1.type = logger
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
Start Flume agent a1
# first create the /home/data/logs directory
mkdir -p /home/data/logs
flume-ng agent -c flume_home/conf -f flume_home/conf/spool.conf -n a1 -Dflume.root.logger=INFO,console
Add a file to the /home/data/logs directory
echo "spool test1" > /home/data/logs/spool_text.log
##4 Syslogtcp
The Syslogtcp source listens on a TCP port for syslog data.
Create the agent configuration file
vi /home/bigdata/flume/conf/syslog_tcp.conf
# Add the following content:
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.r1.type = syslogtcp
a1.sources.r1.port = 5140
a1.sources.r1.host = localhost
# Describe the sink
a1.sinks.k1.type = logger
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
Start Flume agent a1
flume-ng agent -c flume_home/conf -f /home/bigdata/flume/conf/syslog_tcp.conf -n a1 -Dflume.root.logger=INFO,console
Test by generating a syslog message
# nc (netcat) must be installed, e.g.:
rpm -ivh nc-1.84-22.el6.x86_64.rpm
echo "hello idoall.org syslog" | nc localhost 5140
##5 JSONHandler
The HTTP source accepts POST requests whose body is a JSON array of events, parsed by the default JSONHandler.
Create the agent configuration file
vi /home/bigdata/flume/conf/post_json.conf
# Add the following content:
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.r1.type = org.apache.flume.source.http.HTTPSource
a1.sources.r1.port = 8888
# Describe the sink
a1.sinks.k1.type = logger
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
Start Flume agent a1
flume-ng agent -c flume_home/conf -f /home/bigdata/flume/conf/post_json.conf -n a1 -Dflume.root.logger=INFO,console
Generate a POST request in JSON format
curl -X POST -d '[{ "headers" :{"a" : "a1","b" : "b1"},"body" : "idoall.org_body"}]' http://localhost:8888
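Since the body is a JSON array, one request can carry several events; a sketch posting two events from a file (events.json is a hypothetical name):
cat > events.json <<'EOF'
[{"headers": {"a": "a1"}, "body": "first event"},
 {"headers": {"a": "a2"}, "body": "second event"}]
EOF
curl -X POST -d @events.json http://localhost:8888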
##6 HDFS sink
The HDFS sink writes events into HDFS, rolling over to a new file based on size, count, or time.
Create the agent configuration file
vi /home/bigdata/flume/conf/hdfs_sink.conf
# Add the following content:
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.r1.type = syslogtcp
a1.sources.r1.port = 5140
a1.sources.r1.host = localhost
# Describe the sink
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = hdfs://zookeepertest01:8020/user/flume/syslogtcp
a1.sinks.k1.hdfs.filePrefix = Syslog
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 1
a1.sinks.k1.hdfs.roundUnit = minute
a1.sinks.k1.hdfs.fileType=DataStream
a1.sinks.k1.hdfs.writeFormat=Text
a1.sinks.k1.hdfs.rollInterval=0
a1.sinks.k1.hdfs.rollSize=10240
a1.sinks.k1.hdfs.rollCount=0
a1.sinks.k1.hdfs.idleTimeout=60
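# Note: rollInterval=0 and rollCount=0 disable time- and count-based rolling,
# so a file rolls only when it reaches rollSize bytes (10 KB here) or is
# closed after idleTimeout seconds (60) without new events.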
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
Start Flume agent a1
flume-ng agent -c flume_home/conf -f /home/bigdata/flume/conf/hdfs_sink.conf -n a1 -Dflume.root.logger=INFO,console
Test by generating a syslog message
echo "hello idoall flume -> hadoop testing one" | nc localhost 5140
Open another window on master and check in Hadoop whether the file has been created
hadoop fs -ls /user/flume/syslogtcp
hadoop fs -cat /user/flume/syslogtcp/Syslog.1407644509504
Generate a batch of messages to exercise file rolling:
for i in {1..30}; do echo "Flume + $i" | nc localhost 5140; done
##7 HDFS sink with a date-based file prefix
Create the agent configuration file
vi conf/hdfsDate.conf
# Define the agent name and the names of the source, channel, and sink
a5.sources = source1
a5.channels = channel1
a5.sinks = sink1
# Configure the source
a5.sources.source1.type = spooldir
a5.sources.source1.spoolDir = /home/data/beicai
a5.sources.source1.channels = channel1
a5.sources.source1.fileHeader = false
a5.sources.source1.interceptors = i1
a5.sources.source1.interceptors.i1.type = timestamp
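# The timestamp interceptor stamps each event with a 'timestamp' header;
# the HDFS sink needs it to resolve escape sequences such as the %Y-%m-%d
# used in hdfs.filePrefix below.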
# Configure the sink
a5.sinks.sink1.type = hdfs
a5.sinks.sink1.hdfs.path = hdfs://192.168.10.11:9000/usr/beicai
a5.sinks.sink1.hdfs.fileType = DataStream
a5.sinks.sink1.hdfs.writeFormat = Text
a5.sinks.sink1.hdfs.rollInterval = 1
a5.sinks.sink1.channel = channel1
a5.sinks.sink1.hdfs.filePrefix = %Y-%m-%d
# Configure the channel
a5.channels.channel1.type = memory
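# capacity and transactionCapacity are not set here, so the memory
# channel falls back to its defaults (100 events each).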
Start Flume agent a5
flume-ng agent -n a5 -c flume_home/conf -f conf/hdfsDate.conf -Dflume.root.logger=DEBUG,console
##8 File Roll sink
Create the agent configuration file
vi /home/bigdata/flume/conf/file_roll.conf
# Add the following content:
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.r1.type = syslogtcp
a1.sources.r1.port = 5555
a1.sources.r1.host = localhost
# Describe the sink
a1.sinks.k1.type = file_roll
a1.sinks.k1.sink.directory = /home/data/logs2
a1.sinks.k1.sink.serializer = TEXT
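# file_roll starts a new file every 30 seconds by default; set
# a1.sinks.k1.sink.rollInterval to change this (0 disables rolling).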
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
Start Flume agent a1
flume-ng agent -c flume_home/conf -f /home/bigdata/flume/conf/file_roll.conf -n a1 -Dflume.root.logger=INFO,console
Test by generating log messages
echo "hello idoall.org syslog" | nc localhost 5555
echo "hello idoall.org syslog 2" | nc localhost 5555
Check whether files have been created under /home/data/logs2; by default a new file is rolled every 30 seconds.
ll /home/data/logs2
##9 File channel
Create the agent configuration file
vi conf/channelsFile.conf
a1.sources = s1
a1.channels = c1
a1.sinks = k1
# For each one of the sources, the type is defined
a1.sources.s1.type = syslogtcp
a1.sources.s1.host = localhost
a1.sources.s1.port = 5180
# Each sink's type must be defined
a1.sinks.k1.type = logger
# Each channel's type is defined.
a1.channels.c1.type = file
a1.channels.c1.checkpointDir = /home/bigdata/flume/logs/checkpoint
a1.channels.c1.dataDir = /home/bigdata/flume/logs/data
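# Unlike the memory channel, the file channel persists events in dataDir
# and tracks its state in checkpointDir, so queued events survive an
# agent restart.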
#Bind the source and sinks to channels
a1.sources.s1.channels = c1
a1.sinks.k1.channel = c1
Start Flume agent a1
flume-ng agent -n a1 -c conf -f conf/channelsFile.conf -Dflume.root.logger=DEBUG,console
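Test by sending a message to the syslogtcp source, as in the earlier examples:
echo "hello file channel" | nc localhost 5180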