Flume startup command
[root@node132 /opt/sofware/flume/conf/Last]# ../../bin/flume-ng agent -n a1 -c ./ -f ./source_kafka.conf -Dflume.root.logger=INFO,console
# exec source: dynamically tail a file
agent1.sources.avro-source1.channels = ch1
agent1.sources.avro-source1.type = exec
agent1.sources.avro-source1.command = tail -F /Users/walle/Documents/D2/testflume.log
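# A minimal sketch of how this exec source could be wired into a complete agent
# (the memory channel and logger sink below are illustrative additions, not from the original notes):
agent1.sources = avro-source1
agent1.channels = ch1
agent1.sinks = log-sink1
agent1.channels.ch1.type = memory
agent1.sinks.log-sink1.type = logger
agent1.sinks.log-sink1.channel = ch1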
# spooldir source: once the directory is being watched, files in it must not be modified or renamed
a1.sources=r1
a1.channels=c1
a1.sinks=s1
a1.sources.r1.type=spooldir
a1.sources.r1.spoolDir=/root/data/test
a1.channels.c1.type=memory
a1.channels.c1.capacity=100
a1.channels.c1.transactionCapacity=100
a1.sinks.s1.type=logger
a1.sources.r1.channels=c1
a1.sinks.s1.channel=c1
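# Quick test (paths follow the config above; the file name is illustrative): drop a finished
# file into the watched directory; Flume renames it to *.COMPLETED once it has been ingested.
mkdir -p /root/data/test
echo "hello spooldir" > /tmp/demo.log
mv /tmp/demo.log /root/data/test/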
#source-HTTP
a1.sources=r1
a1.channels=c1
a1.sinks=s1
a1.sources.r1.type=HTTP
a1.sources.r1.port=4444
a1.channels.c1.type=memory
a1.channels.c1.capacity=100
a1.channels.c1.transactionCapacity=100
a1.sinks.s1.type=logger
a1.sources.r1.channels=c1
a1.sinks.s1.channel=c1
# JSON payload for testing (4444 is the port configured above; change it as needed)
curl -X POST -d '[{"headers":{"a":"a1","b":"b1"},"body":"hello http-flume"}]' http://0.0.0.0:4444
#source-netcat
a1.sources=r1
a1.channels=c1
a1.sinks=s1
a1.sources.r1.type=netcat
a1.sources.r1.port=4444
a1.sources.r1.bind=0.0.0.0
a1.channels.c1.type=memory
a1.channels.c1.capacity=100
a1.channels.c1.transactionCapacity=100
a1.sinks.s1.type=logger
a1.sources.r1.channels=c1
a1.sinks.s1.channel=c1
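# Quick test from another terminal (host and port follow the config above; nc must be installed):
echo "hello netcat-flume" | nc 127.0.0.1 4444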
# Collect data from Kafka
#-------- KafkaSource configuration -----------------
# Message source type
a1.sources.r1.type = org.apache.flume.source.kafka.KafkaSource
# ZooKeeper address used by Kafka
# Note: this is the address of Kafka's ZooKeeper; use the hostname of the node where Kafka was started
a1.sources.r1.zookeeperConnect = node133:2181
# Kafka topic to consume
a1.sources.r1.topic = 1705a
# Consumer group id
a1.sources.r1.groupId = xss
# Consumer timeout; all other Kafka consumer options can be configured the same way. Note the keys start with kafka.
a1.sources.r1.kafka.consumer.timeout.ms = 100
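# The lines above configure only the source; a runnable agent also needs component
# declarations plus a channel and a sink bound to it. A minimal sketch (illustrative, not from the original notes):
a1.sources = r1
a1.channels = c1
a1.sinks = s1
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sinks.s1.type = logger
a1.sources.r1.channels = c1
a1.sinks.s1.channel = c1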
# Kafka console producer for testing
./kafka-console-producer --topic 1705a --broker-list node133:9092
#channel-kafka
a1.sources=r1
a1.channels=c1
a1.sinks=s1
a1.sources.r1.type=netcat
a1.sources.r1.port=4444
a1.sources.r1.bind=0.0.0.0
a1.channels.c1.type=org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c1.capacity=1000
a1.channels.c1.transactionCapacity=100
a1.channels.c1.brokerList=node132:9092
a1.channels.c1.topic=1705a
a1.channels.c1.zookeeperConnect=node132:2181
a1.channels.c1.parseAsFlumeEvent=false
a1.sinks.s1.type=logger
a1.sources.r1.channels=c1
a1.sinks.s1.channel=c1
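# Rough check that events reach the Kafka channel: send a line through the netcat source,
# then watch the topic (on older Kafka versions the console consumer takes --zookeeper as
# below; newer releases use --bootstrap-server node132:9092 instead):
echo "hello kafka-channel" | nc 127.0.0.1 4444
./kafka-console-consumer --topic 1705a --zookeeper node132:2181 --from-beginning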
# file channel: persist events under a specified directory
a1.sources=r1
a1.channels=c1
a1.sinks=s1
a1.sources.r1.type=netcat
a1.sources.r1.bind=0.0.0.0
a1.sources.r1.port=4444
a1.channels.c1.type=file
# dataDirs can point to multiple paths, each backed by a different disk, to increase Flume throughput
a1.channels.c1.dataDirs=/root/data/test
# checkpointDir and backupCheckpointDir should likewise be placed on directories backed by
# different disks, so that if the checkpoint is corrupted, data can be recovered quickly from backupCheckpointDir
a1.channels.c1.checkpointDir = /opt/data/flume/check
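# An illustrative backupCheckpointDir on a separate disk could look like this (path is hypothetical):
# a1.channels.c1.backupCheckpointDir = /opt/data/flume/backupcheck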
a1.sinks.s1.type=logger
a1.sources.r1.channels=c1
a1.sinks.s1.channel=c1
# memory channel: buffer events in memory
a1.sources=r1
a1.channels=c1
a1.sinks=s1
a1.sources.r1.type=netcat
a1.sources.r1.port=4444
a1.sources.r1.bind=0.0.0.0
a1.channels.c1.type=memory
a1.channels.c1.capacity=100
a1.channels.c1.transactionCapacity=100
a1.sinks.s1.type=logger
a1.sources.r1.channels=c1
a1.sinks.s1.channel=c1
Sink
# sink->hbase (prerequisite: Flume version 1.7.0)
1. Create the table in HBase
hbase(main):056:0>create 'movie','analyse'
2. Flume configuration
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.r1.type = spooldir
# read new files from a local directory in real time
a1.sources.r1.spoolDir = /root/flume_test
a1.sources.r1.deletePolicy=never
a1.sources.r1.fileHeader = true
a1.sources.r1.channels = c1
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sinks.k1.type = org.apache.flume.sink.hbase.HBaseSink
# table name
a1.sinks.k1.table = movie
# column family
a1.sinks.k1.columnFamily = analyse
# regex used to split the event body (data format: 1001::Tom and Jerry::9)
a1.sinks.k1.serializer.regex =(.*)::(.*)::(.*)
a1.sinks.k1.serializer = org.apache.flume.sink.hbase.RegexHbaseEventSerializer
a1.sinks.k1.channel = c1
# columns (one per regex capture group)
a1.sinks.k1.serializer.colNames = ROW_KEY,movieid,ratings
# index 0 maps to ROW_KEY (ROW_KEY is a special name, used as the row key rather than a column)
a1.sinks.k1.serializer.rowKeyIndex = 0
3. Start Flume
[root@quickstart]# flume-ng agent --conf /etc/flume-ng/conf --conf-file /etc/flume-ng/conf/flume.conf --name a1 -Dflume.root.logger=INFO,console
4. Put a data file into /root/flume_test/ with the following contents (one way to do this is shown after the listing):
1001::tom::3
1002::jerry::5
1003::jack::4
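For example (the file name is illustrative), save the three lines above as a file and move it into the spool directory:
mv /tmp/ratings.dat /root/flume_test/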
5. Scan the HBase table
hbase(main):066:0> scan 'movie'
ROW COLUMN+CELL
1001 column=analyse:movieid, timestamp=1545212471611, value=tom
1001 column=analyse:ratings, timestamp=1545212471611, value=3
1002 column=analyse:movieid, timestamp=1545212471611, value=jerry
1002 column=analyse:ratings, timestamp=1545212471611, value=5
1003 column=analyse:movieid, timestamp=1545212471611, value=jack
1003 column=analyse:ratings, timestamp=1545212471611, value=4
3 row(s) in 0.0440 seconds
# sink-kafka
a1.sources=r1
a1.channels=c1
a1.sinks=s1
a1.sources.r1.type=exec
a1.sources.r1.command=tail -F /root/data/log
a1.channels.c1.type=memory
a1.channels.c1.capacity=1000
a1.channels.c1.transactionCapacity=200
# a1.sinks.s1.type = org.apache.flume.sink.kafka.KafkaSink
# a1.sinks.s1.topic = 1705a
# a1.sinks.s1.brokerList = node132:9092
# a1.sinks.s1.requiredAcks = 1
# a1.sinks.s1.batchSize = 20
a1.sinks.s1.type = logger
a1.sources.r1.channels=c1
a1.sinks.s1.channel = c1
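# To actually deliver to Kafka, uncomment the KafkaSink lines above in place of the logger
# sink, then append to the tailed file and watch the topic with the same console-consumer
# command used for the Kafka channel test earlier:
echo "hello kafka-sink" >> /root/data/log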
# file_roll sink: save received events to files under a specified directory
a1.sources=r1
a1.channels=c1
a1.sinks=s1
a1.sources.r1.type=netcat
a1.sources.r1.bind=0.0.0.0
a1.sources.r1.port=4444
a1.channels.c1.type=memory
a1.channels.c1.capacity=100
a1.channels.c1.transactionCapacity=100
a1.sinks.s1.type=file_roll
a1.sinks.s1.sink.directory=/root/data/test
# roll to a new log file every 60 seconds; set to 0 to write everything into a single file
a1.sinks.s1.sink.rollInterval=60
a1.sources.r1.channels=c1
a1.sinks.s1.channel=c1
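# Quick check (paths follow the config above): send a line through the netcat source,
# then list the output directory; file_roll writes numbered files there.
echo "hello file_roll" | nc 127.0.0.1 4444
ls -l /root/data/test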
# sink-avro: used for multi-hop (agent-to-agent) flows
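# A minimal sketch of the avro leg of a multi-hop flow (hostnames and ports here are
# illustrative, not from the original notes): the upstream agent's avro sink connects to
# the downstream agent's avro source on the same port.
a1.sinks.s1.type = avro
a1.sinks.s1.hostname = node133
a1.sinks.s1.port = 5555
a1.sinks.s1.channel = c1
a2.sources.r1.type = avro
a2.sources.r1.bind = 0.0.0.0
a2.sources.r1.port = 5555
a2.sources.r1.channels = c1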
#sink-hdfs
a1.sources=r1
a1.channels=c1
a1.sinks=s1
a1.sources.r1.type=netcat
a1.sources.r1.bind=0.0.0.0
a1.sources.r1.port=4444
a1.channels.c1.type=memory
a1.channels.c1.capacity=100
a1.channels.c1.transactionCapacity=100
# The directory on HDFS is created automatically; there is no need to create it in advance
a1.sinks.s1.type = hdfs
a1.sinks.s1.hdfs.path = hdfs://h136:9000/flume/tailDir/%Y%m%d/%H
# Prefix for files uploaded to HDFS
a1.sinks.s1.hdfs.filePrefix = upload-
# Whether to round down the timestamp (i.e. roll directories by time)
a1.sinks.s1.hdfs.round = true
# How many time units per new directory
a1.sinks.s1.hdfs.roundValue = 1
# Unit used for rounding
a1.sinks.s1.hdfs.roundUnit = hour
# Use the local timestamp instead of a timestamp header in the event
a1.sinks.s1.hdfs.useLocalTimeStamp = true
# Number of events to accumulate before flushing to HDFS
a1.sinks.s1.hdfs.batchSize = 100
# File type; compression is supported (DataStream writes plain text)
a1.sinks.s1.hdfs.fileType = DataStream
# Roll to a new file every N seconds
a1.sinks.s1.hdfs.rollInterval = 60
# Roll to a new file when it reaches roughly 128 MB
a1.sinks.s1.hdfs.rollSize = 134217700
# File rolling is independent of the number of events
a1.sinks.s1.hdfs.rollCount = 0
a1.sources.r1.channels=c1
a1.sinks.s1.channel=c1
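# Rough check (paths follow the config above): send a line through the netcat source,
# then list the HDFS output directory for the current day/hour.
echo "hello hdfs-sink" | nc 127.0.0.1 4444
hdfs dfs -ls -R /flume/tailDir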