关于flume的采集数据源类型、channel的类型、拦截器、选择器使用

sources

exec类型执行一条命令

# exec source: runs the given shell command and emits each output line as an event
a1.sources.s1.type = exec
# tail -f streams lines appended to the log file as they arrive
a1.sources.s1.command = tail -f /export/datas/shells/access.log

channel

memory类型存到内存中

# memory channel: buffers events in an in-memory queue (fast, but events are lost if the agent dies)
agent.channels.c1.type = memory
# maximum number of events the channel can hold
agent.channels.c1.capacity = 1000
# maximum number of events read/written per transaction
agent.channels.c1.transactionCapacity = 100

sink

# avro sink: forwards events to the avro source listening on the configured host/port
agent.sinks.k1.type = avro
agent.sinks.k1.hostname = kak01
agent.sinks.k1.port = 45454

拦截器

#################################define the agent###########################
# collection-tier agent: tails two files, tags every event with a "filename"
# header via static interceptors, and forwards everything downstream over avro
agent.sources = s1 s2
agent.channels = c1
agent.sinks = k1

#define source s1
agent.sources.s1.type = exec
agent.sources.s1.command = tail -f /export/servers/hive-1.1.0-cdh5.14.0/logs/hive.log

agent.sources.s2.type = exec
agent.sources.s2.command = tail -f /export/datas/flume.txt

#add interceptor
# static interceptor: puts a fixed key/value pair into each event's header so
# the collector can tell which source file an event came from
agent.sources.s1.interceptors = i1
agent.sources.s1.interceptors.i1.type = static
agent.sources.s1.interceptors.i1.key = filename
agent.sources.s1.interceptors.i1.value = hive

agent.sources.s2.interceptors = i1
agent.sources.s2.interceptors.i1.type = static
agent.sources.s2.interceptors.i1.key = filename
agent.sources.s2.interceptors.i1.value = flume

#define channel
agent.channels.c1.type = memory
agent.channels.c1.capacity = 1000
agent.channels.c1.transactionCapacity = 100

#define sink
agent.sinks.k1.type = avro
agent.sinks.k1.hostname = hpsk.bigdata01.com
agent.sinks.k1.port = 45454

#bond
# both sources feed the same channel; the single avro sink drains it
agent.sources.s1.channels = c1
agent.sources.s2.channels = c1
agent.sinks.k1.channel = c1

#################################define the collect###########################
# collector agent: receives avro events from the upstream agent and writes
# them to HDFS, partitioned by the "filename" header set upstream
collect.sources = s1
collect.channels = c1
collect.sinks = k1

#define source s1
# avro source: listens for events forwarded by the upstream agent's avro sink
collect.sources.s1.type = avro
collect.sources.s1.bind = hpsk.bigdata01.com
collect.sources.s1.port = 45454

#define channel
collect.channels.c1.type = memory
collect.channels.c1.capacity = 1000
collect.channels.c1.transactionCapacity = 100

#define sink
collect.sinks.k1.type = hdfs
# reference the "filename" header added by the static interceptors upstream;
# Flume's header-escape syntax is %{headerName} (the original "%(unknown)" was garbled)
collect.sinks.k1.hdfs.path = /flume/interceptors/%{filename}

#bond
collect.sources.s1.channels = c1
collect.sinks.k1.channel = c1

-》source拦截器:对采集的数据实现过滤、在event的头部封装对应的key-value

		Flume Interceptors:
			Timestamp Interceptor:时间戳拦截器
				在event头部添加一个keyvalue
					key:timestamp
					value:该event的生成的时间
					
				a1.sources.s1.interceptors = i1   #拦截器的名字叫i1
				a1.sources.s1.interceptors.i1.type = timestamp   #拦截器的类型
			Host Interceptor:主机名拦截器
				在event头部添加一个keyvalue
					key:host
					value:该event生成的机器的主机名
			Static Interceptor:自定义拦截器
				在event头部添加一个keyvalue
					key和value都自定义
					
			Regex Filtering Interceptor:对数据实现过滤
				a1.sources.s1.interceptors = i1
				a1.sources.s1.interceptors.i1.type = regex_filter
				a1.sources.s1.interceptors.i1.regex = (\\d):(\\d):(\\d)
				如果该行数据符合正则,就会被封装成event

-》channel选择器

		Replicating Channel Selector (default)
			将source的数据给每个Channel发一份
		Multiplexing Channel Selector:按照规则将数据给不同的channel
			a1.sources = r1
			a1.channels = c1 c2 c3 c4
			a1.sources.r1.selector.type = multiplexing
			a1.sources.r1.selector.header = key
			a1.sources.r1.selector.mapping.value1 = c1
			a1.sources.r1.selector.mapping.value2 = c2 c3
			a1.sources.r1.selector.default = c4

-》sink处理器(必用):构建sink group,将多个sink放入同一个group

		-》故障转移:Failover 
			sink1:正常工作的  
			sink2:备份(standby)
			
			a1.sinkgroups = g1    #定义一个sinkgroups
			a1.sinkgroups.g1.sinks = k1 k2  #把定义好的两个sink 放到sinkgroups中
			a1.sinkgroups.g1.processor.type = failover  # 故障转移
			a1.sinkgroups.g1.processor.priority.k1 = 5  # 设置权重
			a1.sinkgroups.g1.processor.priority.k2 = 10 # 设置权重
			a1.sinkgroups.g1.processor.maxpenalty = 10000
			
			权重最高的优先执行
			
		-》负载均衡:load_balance
			sink1		sink2:两个一起工作
			a1.sinkgroups = g1
			a1.sinkgroups.g1.sinks = k1 k2
			a1.sinkgroups.g1.processor.type = load_balance
		   
		   # 负载均衡包含了故障转移,常用这种配置

案例1

#################################define the agent###########################
# failover case, collection tier: one exec source, one channel, two avro
# sinks grouped into a failover sink group
agent.sources = s1
agent.channels = c1
agent.sinks = k1 k2

#define source s1
agent.sources.s1.type = exec
agent.sources.s1.command = tail -f /export/servers/hive-1.1.0-cdh5.14.0/logs/hive.log

#define channel
agent.channels.c1.type = memory
agent.channels.c1.capacity = 1000
agent.channels.c1.transactionCapacity = 100

#define sink
# two avro sinks pointing at different collector hosts (failover pair)
agent.sinks.k1.type = avro
agent.sinks.k1.hostname = hpsk.bigdata01.com
agent.sinks.k1.port = 45454

agent.sinks.k2.type = avro
agent.sinks.k2.hostname = hpsk.bigdata02.com
agent.sinks.k2.port = 45454

# sink group properties must be keyed by THIS agent's name ("agent"), not
# "a1" — with the original "a1.sinkgroups" prefix the group is never applied
agent.sinkgroups = g1
agent.sinkgroups.g1.sinks = k1 k2
agent.sinkgroups.g1.processor.type = failover
# the highest-priority sink is active; on failure traffic fails over to the other
agent.sinkgroups.g1.processor.priority.k1 = 10
agent.sinkgroups.g1.processor.priority.k2 = 5
# maximum backoff period (ms) for a failed sink before it is retried
agent.sinkgroups.g1.processor.maxpenalty = 100

#bond
agent.sources.s1.channels = c1
agent.sinks.k1.channel = c1
agent.sinks.k2.channel = c1

#################################define the collect###########################
# collector agent for the failover case: receives avro events and writes them to HDFS
collect.sources = s1
collect.channels = c1
collect.sinks = k1

#define source s1
collect.sources.s1.type = avro
# when starting this config on the second collector host, change the bind address accordingly
collect.sources.s1.bind = hpsk.bigdata01.com
collect.sources.s1.port = 45454

#define channel
collect.channels.c1.type = memory
collect.channels.c1.capacity = 1000
collect.channels.c1.transactionCapacity = 100

#define sink
collect.sinks.k1.type = hdfs
collect.sinks.k1.hdfs.path = /flume/failover

#bond
collect.sources.s1.channels = c1
collect.sinks.k1.channel = c1

你可能感兴趣的:(关于flume的采集数据源类型、channel的类型、拦截器、选择器使用)