A brief introduction to Flume sources, channels, and sinks, with usage examples

Flume startup command

[root@node132 /opt/sofware/flume/conf/Last]# ../../bin/flume-ng agent -n a1 -c ./ -f ./source_kafka.conf -Dflume.root.logger=INFO,console

Source

# exec source: run a command (here tail -F) and ingest its output as it is appended
agent1.sources.avro-source1.channels = ch1
agent1.sources.avro-source1.type = exec
agent1.sources.avro-source1.command = tail -F /Users/walle/Documents/D2/testflume.log
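The exec lines above only show the source side (the agent1/avro-source1 names come from this snippet). A minimal runnable sketch built around it, with an assumed memory channel (ch1) and logger sink, could look like:

agent1.sources = avro-source1
agent1.channels = ch1
agent1.sinks = k1

agent1.sources.avro-source1.type = exec
agent1.sources.avro-source1.command = tail -F /Users/walle/Documents/D2/testflume.log
agent1.sources.avro-source1.channels = ch1

agent1.channels.ch1.type = memory
agent1.channels.ch1.capacity = 100
agent1.channels.ch1.transactionCapacity = 100

agent1.sinks.k1.type = logger
agent1.sinks.k1.channel = ch1
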
# spooldir source: once a directory is being watched, the files placed in it must not be modified or renamed
a1.sources=r1
a1.channels=c1
a1.sinks=s1

a1.sources.r1.type=spooldir
a1.sources.r1.spoolDir=/root/data/test


a1.channels.c1.type=memory
a1.channels.c1.capacity=100
a1.channels.c1.transactionCapacity=100

a1.sinks.s1.type=logger


a1.sources.r1.channels=c1
a1.sinks.s1.channel=c1
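To try the spooldir source, copy a file into the watched directory; Flume ingests it and, by default, renames it with a .COMPLETED suffix, which is why the files must not be modified afterwards. For example (the file name is arbitrary):

echo "hello spooldir" > /tmp/a.log
cp /tmp/a.log /root/data/test/
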
#source-HTTP

a1.sources=r1
a1.channels=c1
a1.sinks=s1

a1.sources.r1.type=HTTP
a1.sources.r1.port=4444

a1.channels.c1.type=memory
a1.channels.c1.capacity=100
a1.channels.c1.transactionCapacity=100

a1.sinks.s1.type=logger


a1.sources.r1.channels=c1
a1.sinks.s1.channel=c1



# JSON payload for testing the HTTP source (4444 is the port configured above; adjust as needed)
curl -X POST -d '[{"headers":{"a":"a1","b":"b1"},"body":"hello http-flume"}]'  http://0.0.0.0:4444
#source-netcat

a1.sources=r1
a1.channels=c1
a1.sinks=s1

a1.sources.r1.type=netcat
a1.sources.r1.port=4444
a1.sources.r1.bind=0.0.0.0

a1.channels.c1.type=memory
a1.channels.c1.capacity=100
a1.channels.c1.transactionCapacity=100

a1.sinks.s1.type=logger


a1.sources.r1.channels=c1
a1.sinks.s1.channel=c1
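To test the netcat source, connect from another shell with nc and type a line; it should show up in the agent's logger output:

nc 0.0.0.0 4444
hello netcat-flume
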
# Collect data from Kafka

#-------- KafkaSource configuration -----------------
# Source type
a1.sources.r1.type = org.apache.flume.source.kafka.KafkaSource
# Address of the ZooKeeper used by Kafka
# Note: this is Kafka's ZooKeeper address; use the hostname of the node where Kafka is running
a1.sources.r1.zookeeperConnect = node133:2181
# Kafka topic to consume from
a1.sources.r1.topic = 1705a
# Consumer group id
a1.sources.r1.groupId = xss
# Consumer timeout; all other Kafka consumer options can be set the same way, using the kafka.consumer. prefix
a1.sources.r1.kafka.consumer.timeout.ms = 100
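The zookeeperConnect/topic/groupId properties above belong to the older Kafka source (Flume 1.6). On Flume 1.7+ the source talks to the brokers directly; a minimal sketch, reusing the same host and topic as assumptions:

a1.sources.r1.type = org.apache.flume.source.kafka.KafkaSource
a1.sources.r1.kafka.bootstrap.servers = node133:9092
a1.sources.r1.kafka.topics = 1705a
a1.sources.r1.kafka.consumer.group.id = xss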


# Kafka producer (for publishing test messages to the topic)

./kafka-console-producer --topic 1705a --broker-list node133:9092

Channel

#channel-kafka
a1.sources=r1
a1.channels=c1
a1.sinks=s1

a1.sources.r1.type=netcat
a1.sources.r1.port=4444
a1.sources.r1.bind=0.0.0.0

a1.channels.c1.type=org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c1.capacity=1000
a1.channels.c1.transactionCapacity=100
a1.channels.c1.brokerList=node132:9092
a1.channels.c1.topic=1705a
a1.channels.c1.zookeeperConnect=node132:2181
a1.channels.c1.parseAsFlumeEvent=false

a1.sinks.s1.type=logger


a1.sources.r1.channels=c1
a1.sinks.s1.channel=c1
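To verify that events actually reach the Kafka topic, send a line to the netcat source and read the topic with a console consumer (flags depend on the Kafka version; host names follow the producer example above):

./kafka-console-consumer --topic 1705a --zookeeper node132:2181 --from-beginning
# on newer Kafka versions:
# ./kafka-console-consumer --topic 1705a --bootstrap-server node132:9092 --from-beginning
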
# file channel: persists events to directories on disk
a1.sources=r1
a1.channels=c1
a1.sinks=s1

a1.sources.r1.type=netcat
a1.sources.r1.bind=0.0.0.0
a1.sources.r1.port=4444

a1.channels.c1.type=file
# Point dataDirs at multiple paths, each on a different disk, to increase Flume's throughput
a1.channels.c1.dataDirs=/root/data/test

# checkpointDir (checkpoint directory) and backupCheckpointDir (backup checkpoint directory)
# should also be placed on different disks, so that if the checkpoint is corrupted,
# data can be recovered quickly from backupCheckpointDir
a1.channels.c1.checkpointDir = /opt/data/flume/check
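# (Optional sketch) A backup checkpoint directory can be enabled as well; the path below is
# only an example and should sit on a different disk than checkpointDir:
# a1.channels.c1.useDualCheckpoints = true
# a1.channels.c1.backupCheckpointDir = /opt/data/flume/check_backup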

a1.sinks.s1.type=logger


a1.sources.r1.channels=c1
a1.sinks.s1.channel=c1
# memory channel: buffers events in memory
a1.sources=r1
a1.channels=c1
a1.sinks=s1

a1.sources.r1.type=netcat
a1.sources.r1.port=4444
a1.sources.r1.bind=0.0.0.0

a1.channels.c1.type=memory
a1.channels.c1.capacity=100
a1.channels.c1.transactionCapacity=100

a1.sinks.s1.type=logger


a1.sources.r1.channels=c1
a1.sinks.s1.channel=c1

Sink

 

# sink -> HBase (assumes Flume version 1.7.0)

1. Create the table in HBase

hbase(main):056:0> create 'movie','analyse'
2. Flume configuration

a1.sources = r1   
a1.sinks = k1     
a1.channels = c1  

# Describe/configure the source
a1.sources.r1.type = spooldir  
# Read files from a local directory in (near) real time
a1.sources.r1.spoolDir = /root/flume_test
a1.sources.r1.deletePolicy=never 
a1.sources.r1.fileHeader = true  
a1.sources.r1.channels = c1   

# Use a channel which buffers events in memory
a1.channels.c1.type = memory   
a1.channels.c1.capacity = 1000   
a1.channels.c1.transactionCapacity = 100  

a1.sinks.k1.type = org.apache.flume.sink.hbase.HBaseSink 
# table name
a1.sinks.k1.table = movie
# column family
a1.sinks.k1.columnFamily = analyse
# regex for parsing the event body (data format: 1001::Tom and Jerry::9)
a1.sinks.k1.serializer.regex =(.*)::(.*)::(.*)
a1.sinks.k1.serializer = org.apache.flume.sink.hbase.RegexHbaseEventSerializer
a1.sinks.k1.channel = c1  
# column names
a1.sinks.k1.serializer.colNames = ROW_KEY,movie_name,rating
# Index 0 maps to ROW_KEY (ROW_KEY is a special column name used as the row key)
a1.sinks.k1.serializer.rowKeyIndex = 0
3. Start Flume

[root@quickstart]# flume-ng agent --conf /etc/flume-ng/conf --conf-file /etc/flume-ng/conf/flume.conf --name a1 -Dflume.root.logger=INFO,console
4. Put a data file into /root/flume_test/; the file content is:

1001::tom::3
1002::jerry::5
1003::jack::4
5. Check the HBase table

hbase(main):066:0> scan 'movie'
ROW                             COLUMN+CELL                                                                             
 1001                           column=analyse:movieid, timestamp=1545212471611, value=tom                              
 1001                           column=analyse:ratings, timestamp=1545212471611, value=3                                
 1002                           column=analyse:movieid, timestamp=1545212471611, value=jerry                            
 1002                           column=analyse:ratings, timestamp=1545212471611, value=5                                
 1003                           column=analyse:movieid, timestamp=1545212471611, value=jack                             
 1003                           column=analyse:ratings, timestamp=1545212471611, value=4                                
3 row(s) in 0.0440 seconds
# sink-kafka
a1.sources=r1
a1.channels=c1
a1.sinks=s1


a1.sources.r1.type=exec
a1.sources.r1.command=tail -F /root/data/log

a1.channels.c1.type=memory
a1.channels.c1.capacity=1000
a1.channels.c1.transactionCapacity=200

# a1.sinks.s1.type = org.apache.flume.sink.kafka.KafkaSink
# a1.sinks.s1.topic = 1705a
# a1.sinks.s1.brokerList = node132:9092
# a1.sinks.s1.requiredAcks = 1
# a1.sinks.s1.batchSize = 20
a1.sinks.s1.type = logger


a1.sources.r1.channels=c1
a1.sinks.s1.channel = c1
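To drive the exec source above, append lines to the tailed file from another shell, e.g.:

echo "hello kafka-sink" >> /root/data/log
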
# file_roll sink: saves the received events into files under a local directory
a1.sources=r1
a1.channels=c1
a1.sinks=s1

a1.sources.r1.type=netcat
a1.sources.r1.bind=0.0.0.0
a1.sources.r1.port=4444


a1.channels.c1.type=memory
a1.channels.c1.capacity=100
a1.channels.c1.transactionCapacity=100

a1.sinks.s1.type=file_roll 
a1.sinks.s1.sink.directory=/root/data/test
# Roll to a new log file every 60 seconds; set to 0 to write everything into a single file
a1.sinks.s1.sink.rollInterval=60


a1.sources.r1.channels=c1
a1.sinks.s1.channel=c1
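After sending a few lines through the netcat source (for example with nc 0.0.0.0 4444), the rolled files can be listed in the target directory:

ls -l /root/data/test
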

 

sink-avro: multi-hop (multi-agent) flow

[Figure 1: multi-hop flow diagram]
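In a multi-hop setup, one agent's avro sink forwards events to the next agent's avro source. A minimal two-agent sketch of that idea (hostname node133 and port 5555 are assumptions for illustration):

# Agent 1 (upstream): netcat source -> memory channel -> avro sink
a1.sources=r1
a1.channels=c1
a1.sinks=s1
a1.sources.r1.type=netcat
a1.sources.r1.bind=0.0.0.0
a1.sources.r1.port=4444
a1.channels.c1.type=memory
a1.channels.c1.capacity=100
a1.channels.c1.transactionCapacity=100
a1.sinks.s1.type=avro
# host/port of the downstream agent's avro source (assumed values)
a1.sinks.s1.hostname=node133
a1.sinks.s1.port=5555
a1.sources.r1.channels=c1
a1.sinks.s1.channel=c1

# Agent 2 (downstream): avro source -> memory channel -> logger sink
a2.sources=r1
a2.channels=c1
a2.sinks=s1
a2.sources.r1.type=avro
a2.sources.r1.bind=0.0.0.0
a2.sources.r1.port=5555
a2.channels.c1.type=memory
a2.channels.c1.capacity=100
a2.channels.c1.transactionCapacity=100
a2.sinks.s1.type=logger
a2.sources.r1.channels=c1
a2.sinks.s1.channel=c1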

#sink-hdfs
a1.sources=r1
a1.channels=c1
a1.sinks=s1

a1.sources.r1.type=netcat
a1.sources.r1.bind=0.0.0.0
a1.sources.r1.port=4444


a1.channels.c1.type=memory
a1.channels.c1.capacity=100
a1.channels.c1.transactionCapacity=100


# The directory on HDFS is created automatically; there is no need to create it in advance
a1.sinks.s1.type = hdfs
a1.sinks.s1.hdfs.path = hdfs://h136:9000/flume/tailDir/%Y%m%d/%H
# Prefix for the uploaded files
a1.sinks.s1.hdfs.filePrefix = upload-
# Whether to roll directories based on time
a1.sinks.s1.hdfs.round = true
# How many time units before a new directory is created
a1.sinks.s1.hdfs.roundValue = 1
# The time unit used for rounding
a1.sinks.s1.hdfs.roundUnit = hour
# Whether to use the local timestamp
a1.sinks.s1.hdfs.useLocalTimeStamp = true
# How many events to accumulate before flushing to HDFS
a1.sinks.s1.hdfs.batchSize = 100
# File type; compressed formats are also supported
a1.sinks.s1.hdfs.fileType = DataStream
# How often to roll to a new file (seconds)
a1.sinks.s1.hdfs.rollInterval = 60
# Roll the file when it reaches roughly 128 MB
a1.sinks.s1.hdfs.rollSize = 134217700
# Do not roll files based on event count
a1.sinks.s1.hdfs.rollCount = 0

a1.sources.r1.channels=c1
a1.sinks.s1.channel=c1
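After sending a few events through the netcat source, the files written by the HDFS sink can be listed; the date/hour sub-directories are generated from the event timestamp:

hdfs dfs -ls -R /flume/tailDir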

 
