Flume Installation and Usage

Introduction to the Flume Framework

  1. Flume provides a distributed, reliable service for efficiently collecting, aggregating, and moving large volumes of log data.
  2. Flume is built on a streaming architecture; it is fault-tolerant, flexible, and simple, and is mainly used for online real-time analytics.
  3. Roles
    1. Source: collects data. A Source is where the data flow originates, and it forwards the events it produces to a Channel (loosely similar to the Channel concept in Java NIO).
    2. Channel: bridges Sources and Sinks, behaving much like a queue.
    3. Sink: pulls data from the Channel and writes it to the destination (which can be the next Source, or a store such as HDFS or HBase).
  4. The Event is Flume's basic unit of data transfer; data travels from the source to the destination in the form of events.
  5. Transfer process: the source monitors a file; when new data is produced, the source wraps it in an Event, puts it into the channel, and commits the transaction; the channel acts as a FIFO queue; the sink then pulls events from the channel and writes them to HDFS or HBase (a minimal wiring sketch follows this list).
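
  The cases below all follow the same wiring pattern; here is a minimal sketch of an agent's properties file (the names agent/src/ch/snk are placeholders):
    # declare the components of this agent
    agent.sources = src
    agent.channels = ch
    agent.sinks = snk
    # a source can write to one or more channels (note the plural key)
    agent.sources.src.channels = ch
    # a sink drains exactly one channel (singular key)
    agent.sinks.snk.channel = ch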

Installation

  1. Extract the archive
    tar -zxvf apache-flume-1.8.0-bin.tar.gz -C /usr/local
    
  2. Rename the directory
    cd /usr/local
    mv apache-flume-1.8.0-bin flume-1.8.0
    
  3. Edit the configuration file
    cd flume-1.8.0/conf
    # rename the template
    mv flume-env.sh.template flume-env.sh
    
    # set JAVA_HOME in flume-env.sh
    export JAVA_HOME=/usr/local/jdk1.8.0_211
    
  4. Configure environment variables
    vim /etc/profile
    
    export FLUME_HOME=/usr/local/flume-1.8.0
    export PATH=$PATH:$FLUME_HOME/bin
    
    source /etc/profile
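
  5. Verify the installation
    A quick sanity check; flume-ng has a version subcommand, and the output should mention Flume 1.8.0 if the PATH was set correctly:
    flume-ng version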
    

Case 1: Flume listens on a port and logs the data it receives

  1. Install the telnet tools
    rpm -ivh telnet-server-0.17-59.el7.x86_64.rpm 
    rpm -ivh telnet-0.17-59.el7.x86_64.rpm
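
    If those rpm packages are not at hand, the same tools can usually be installed straight from the distribution repositories instead (a sketch assuming yum on CentOS 7):
    yum install -y telnet telnet-server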
    
  2. Create the Flume agent configuration file flume-telnet.conf (under /usr/local/flume-1.8.0/conf)
    # Name the components on this agent
    a1.sources = r1
    a1.sinks = k1
    a1.channels = c1
    
    # Describe/configure the source
    a1.sources.r1.type = netcat
    a1.sources.r1.bind = localhost
    a1.sources.r1.port = 44444
    # Describe the sink
    a1.sinks.k1.type = logger
    
    # Use a channel which buffers events in memory
    a1.channels.c1.type = memory
    a1.channels.c1.capacity = 1000
    a1.channels.c1.transactionCapacity = 100
    
    # Bind the source and sink to the channel
    a1.sources.r1.channels = c1
    a1.sinks.k1.channel = c1
    
  3. Start the Flume agent to listen on the port
    flume-ng agent \
    --conf /usr/local/flume-1.8.0/conf/ \
    --conf-file /usr/local/flume-1.8.0/conf/flume-telnet.conf \
    --name a1 \
    -Dflume.root.logger=INFO,console
    
  4. Use telnet to send data to port 44444 on the local machine
    telnet localhost 44444
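
    Type a line and press Enter; the agent console should print the event via the logger sink (which truncates event bodies to 16 bytes by default). If telnet is not available, netcat behaves the same way here (assuming nc is installed):
    nc localhost 44444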
    

Case 2: Tail the Hive log file and upload it to HDFS

  1. Copy the required Hadoop jars into Flume's lib directory
    cp ./hadoop-2.7.3/share/hadoop/common/lib/hadoop-auth-2.7.3.jar ./flume-1.8.0/lib
    cp ./hadoop-2.7.3/share/hadoop/common/lib/commons-configuration-1.6.jar ./flume-1.8.0/lib
    cp ./hadoop-2.7.3/share/hadoop/hdfs/lib/hadoop-hdfs-2.7.3.jar ./flume-1.8.0/lib
    cp ./hadoop-2.7.3/share/hadoop/common/hadoop-common-2.7.3.jar ./flume-1.8.0/lib
    
  2. Create the flume-hdfs.conf file (under /usr/local/flume-1.8.0/conf)
    # Name the components on this agent
    a2.sources = r2
    a2.sinks = k2
    a2.channels = c2
    
    # Describe/configure the source
    a2.sources.r2.type = exec
    a2.sources.r2.command = tail -F /usr/local/hive/logs/hive.log
    a2.sources.r2.shell = /bin/bash -c
    
    # Describe the sink
    a2.sinks.k2.type = hdfs
    a2.sinks.k2.hdfs.path = hdfs://hadoop01:9000/flume/%Y%m%d/%H
    # prefix for the files created on HDFS
    a2.sinks.k2.hdfs.filePrefix = events-hive-
    # whether to roll directories by time
    a2.sinks.k2.hdfs.round = true
    # create a new directory every this many time units
    a2.sinks.k2.hdfs.roundValue = 1
    # the time unit used for rounding
    a2.sinks.k2.hdfs.roundUnit = hour
    # use the local timestamp instead of one from the event headers
    a2.sinks.k2.hdfs.useLocalTimeStamp = true
    # number of events to accumulate before flushing to HDFS
    a2.sinks.k2.hdfs.batchSize = 1000
    # file type; DataStream is plain text (compressed types are also supported)
    a2.sinks.k2.hdfs.fileType = DataStream
    # roll to a new file every this many seconds
    a2.sinks.k2.hdfs.rollInterval = 600
    # roll to a new file once it reaches this many bytes (just under the 128 MB block size)
    a2.sinks.k2.hdfs.rollSize = 134210000
    # 0 disables rolling based on event count
    a2.sinks.k2.hdfs.rollCount = 0
    # minimum number of HDFS block replicas
    a2.sinks.k2.hdfs.minBlockReplicas = 1
    
    # Use a channel which buffers events in memory
    a2.channels.c2.type = memory
    a2.channels.c2.capacity = 1000
    a2.channels.c2.transactionCapacity = 100
    
    # Bind the source and sink to the channel
    a2.sources.r2.channels = c2
    a2.sinks.k2.channel = c2
    
  3. Run the agent with the monitoring configuration
    flume-ng agent \
    --conf /usr/local/flume-1.8.0/conf/ \
    --conf-file /usr/local/flume-1.8.0/conf/flume-hdfs.conf \
    --name a2
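
    To drive some data through, run a few statements in the Hive CLI so hive.log grows, then list the sink's output (the path below assumes the %Y%m%d/%H escapes used in hdfs.path above):
    hdfs dfs -ls -R /flume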
    

Case 3: Flume monitors an entire directory

  1. Create the configuration file flume-dir.conf
    # Name the components on this agent
    a3.sources = r3
    a3.sinks = k3
    a3.channels = c3
    
    # Describe/configure the source
    a3.sources.r3.type = spooldir
    a3.sources.r3.spoolDir = /home/test/flume
    a3.sources.r3.fileHeader = true
    # ignore (do not upload) any file whose name ends in .tmp
    a3.sources.r3.ignorePattern = ([^ ]*\.tmp)
    # Describe the sink
    a3.sinks.k3.type = hdfs
    a3.sinks.k3.hdfs.path = hdfs://hadoop01:9000/flume
    # prefix for the files created on HDFS
    a3.sinks.k3.hdfs.filePrefix = upload-
    # whether to roll directories by time (only takes effect if hdfs.path contains time escapes such as %Y%m%d/%H)
    a3.sinks.k3.hdfs.round = true
    # create a new directory every this many time units
    a3.sinks.k3.hdfs.roundValue = 1
    # the time unit used for rounding
    a3.sinks.k3.hdfs.roundUnit = hour
    # use the local timestamp instead of one from the event headers
    a3.sinks.k3.hdfs.useLocalTimeStamp = true
    # number of events to accumulate before flushing to HDFS
    a3.sinks.k3.hdfs.batchSize = 1000
    # file type; DataStream is plain text (compressed types are also supported)
    a3.sinks.k3.hdfs.fileType = DataStream
    # roll to a new file every this many seconds
    a3.sinks.k3.hdfs.rollInterval = 600
    # roll to a new file once it reaches this many bytes (just under the 128 MB block size)
    a3.sinks.k3.hdfs.rollSize = 134210000
    # 0 disables rolling based on event count
    a3.sinks.k3.hdfs.rollCount = 0
    # minimum number of HDFS block replicas
    a3.sinks.k3.hdfs.minBlockReplicas = 1
    
    # Use a channel which buffers events in memory
    a3.channels.c3.type = memory
    a3.channels.c3.capacity = 1000
    a3.channels.c3.transactionCapacity = 100
    
    # Bind the source and sink to the channel
    a3.sources.r3.channels = c3
    a3.sinks.k3.channel = c3
    
  2. Run the test
    flume-ng agent \
    --conf /usr/local/flume-1.8.0/conf/ \
    --conf-file /usr/local/flume-1.8.0/conf/flume-dir.conf \
    --name a3 &
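
    To test, drop a file into the spooled directory; the source ingests it and renames it with a .COMPLETED suffix (the file name below is just an example):
    echo "spooldir test" > /home/test/flume/test.log
    ls /home/test/flume
    hdfs dfs -ls /flume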
    
    Flume official site: https://flume.apache.org/
