spoolingdir-kafka-hive典型日志收集flume配置

1,准备工作

Hive 端准备:创建 ORC 格式的目标表(带分区、分桶,字段名全部小写),并提前添加好分区信息,例如:

-- Pre-create the target partition the Flume Hive sink will stream into.
-- IF NOT EXISTS makes the statement idempotent so the setup step is safe
-- to re-run without raising AlreadyExistsException.
ALTER TABLE http ADD IF NOT EXISTS PARTITION (day=20181228, hour=18);

hive metastore服务开启

hive --service metastore

2,flume kafka sink

# Name the components on this agent
# xdr/txt ->kafka
# Agent "agent2kafkahttp": picks up completed XDR text files from a spooling
# directory and publishes them to the Kafka topic "httptopic".


# agent2kafkahttp
agent2kafkahttp.sources = xdrss
agent2kafkahttp.sinks = kafkask
agent2kafkahttp.channels = memcn


# Describe/configure the source
# Spooldir source: ingests files dropped into spoolDir once they are complete.
# deletePolicy=immediate removes each file as soon as it has been fully
# consumed, keeping the directory from filling up -- but files cannot be
# replayed afterwards.
agent2kafkahttp.sources.xdrss.type = spooldir
agent2kafkahttp.sources.xdrss.spoolDir = /datas/loading/http
agent2kafkahttp.sources.xdrss.deletePolicy=immediate


# Describe the sink
# Kafka sink: batches up to 5000 events per producer request; snappy
# compression and linger.ms=1 favor throughput.
# NOTE(review): producer.acks = 0 is fire-and-forget -- events are lost
# silently if a broker is unavailable. Use acks = 1 (or "all") if loss is
# unacceptable; confirm this trade-off was intentional.
agent2kafkahttp.sinks.kafkask.type = org.apache.flume.sink.kafka.KafkaSink
agent2kafkahttp.sinks.kafkask.kafka.topic = httptopic
agent2kafkahttp.sinks.kafkask.kafka.bootstrap.servers = hdmaster:9092,hdslave2:9092,hdslave3:9092
agent2kafkahttp.sinks.kafkask.kafka.flumeBatchSize = 5000
agent2kafkahttp.sinks.kafkask.kafka.producer.acks = 0
agent2kafkahttp.sinks.kafkask.kafka.producer.linger.ms = 1
agent2kafkahttp.sinks.kafkask.kafka.producer.compression.type = snappy


# Use a channel which buffers events in memory
# Memory channel: fast but volatile -- buffered events are lost if the agent
# process dies. capacity/transactionCapacity are event counts.
agent2kafkahttp.channels.memcn.type = memory
agent2kafkahttp.channels.memcn.capacity = 1000000
agent2kafkahttp.channels.memcn.transactionCapacity = 100000
# byteCapacity: ~2 GB cap on total buffered event-body bytes
agent2kafkahttp.channels.memcn.byteCapacity  = 2000000000  



# Bind the source and sink to the channel
agent2kafkahttp.sources.xdrss.channels = memcn
agent2kafkahttp.sinks.kafkask.channel = memcn

3, kafka source-hive sink

# Name the components on this agent

# kafka ->hive
# Agent "agent2hivehttp": consumes the "httptopic" Kafka topic and streams
# events into the partitioned/bucketed ORC Hive table jsltdpi.http via the
# Hive streaming sink.


# agent2hivehttp

agent2hivehttp.sources = kafkass
agent2hivehttp.sinks = hivesk
agent2hivehttp.channels = memcn

# Describe/configure the source
# Kafka source: pulls up to 5000 events or 2s worth per batch.
# FIX(review): broker list read "hslave2/hslave3" (missing "d") and did not
# match the producer side's hdslave2/hdslave3 -- corrected for consistency.
agent2hivehttp.sources.kafkass.type = org.apache.flume.source.kafka.KafkaSource
agent2hivehttp.sources.kafkass.batchSize = 5000
agent2hivehttp.sources.kafkass.batchDurationMillis = 2000
agent2hivehttp.sources.kafkass.kafka.bootstrap.servers = hdmaster:9092,hdslave2:9092,hdslave3:9092
agent2hivehttp.sources.kafkass.kafka.topics = httptopic


# Describe the sink
# Hive streaming sink into jsltdpi.http (day, hour partitions).
# FIX(review): partition format was "%Y%m,%H", which yields a 6-digit YYYYMM
# day value; the table partition is day=20181228 (YYYYMMDD), so the correct
# format is "%Y%m%d,%H". Partition values come from the local timestamp,
# rounded down to 10-minute boundaries.
agent2hivehttp.sinks.hivesk.type = hive
agent2hivehttp.sinks.hivesk.hive.metastore = thrift://hdmaster:9083
agent2hivehttp.sinks.hivesk.hive.database = jsltdpi
agent2hivehttp.sinks.hivesk.hive.table = http
agent2hivehttp.sinks.hivesk.hive.partition = %Y%m%d,%H
agent2hivehttp.sinks.hivesk.useLocalTimeStamp = true
agent2hivehttp.sinks.hivesk.round = true
agent2hivehttp.sinks.hivesk.roundValue = 10
agent2hivehttp.sinks.hivesk.roundUnit = minute
# DELIMITED serializer: incoming event bodies are "|"-separated records;
# fieldnames maps each position onto the lowercase Hive column names in order.
# FIX(review): removed a stray TAB that was embedded after "referxdrid".
agent2hivehttp.sinks.hivesk.serializer = DELIMITED
agent2hivehttp.sinks.hivesk.serializer.delimiter = "|"
agent2hivehttp.sinks.hivesk.serializer.serdeSeparator = '|'
agent2hivehttp.sinks.hivesk.serializer.fieldnames =length,localprovince,localcity,interface,xdrid,apptypecode,procedurestarttime,procedureendtime,longitude,latitude,protocoltype,apptype,appsub_type,appcontent,appstatus,ipaddresstype,user_ipv4,user_ipv6,userport,l4protocal,app_server_ip_ipv4,app_server_ip_ipv6,appserverport,uldata,dldata,ulippacket,dlippacket,updura,downdura,uldisorderippacket,dldisorderippacket,ulretransippacket,dlretransippacket,tcpresponsetime,tcpacktime,ul_ip_frag_packets,dl_ip_frag_packets,firstreqtime,firstresponsetime,windows,mss,tcpsynnum,tcpstatus,sessionend,tcpsynackmum,tcpacknum,tcp1_2handshakestatus,tcp2_3handshakestatus,ulprobeid,ullinkindex,dlprobeid,dllinkindex,transactionid,flowcontrol,ul_avg_rtt,dw_avg_rtt,useraccount,referxdrid,httpversion,messagetype,messagestatus,firsthttpresponsetime,lastcontentpackettime,lastacktime,hostlength,host,urilength,uri,x_online_hostlength,x_online_host,user_agentlength,user_agent,http_content_type,refer_urilength,refer_uri,cookielength,cookie,content_length,keyword,servicebehaviorflag,servicecompflag,servicetime,ie,portal,locationlength,location,firstrequest,useraccount1,uritype,urisub_type

# Use a channel which buffers events in memory
# Memory channel: fast but volatile -- buffered events are lost on agent crash.
agent2hivehttp.channels.memcn.type = memory
agent2hivehttp.channels.memcn.capacity = 1000000
agent2hivehttp.channels.memcn.transactionCapacity = 100000
# byteCapacity: ~2 GB cap on total buffered event-body bytes
agent2hivehttp.channels.memcn.byteCapacity  = 2000000000

# Bind the source and sink to the channel
agent2hivehttp.sources.kafkass.channels = memcn
agent2hivehttp.sinks.hivesk.channel = memcn

4,开启flume

# 注意:agent 名称(-n)和配置文件(-f)必须与本文第 2、3 节定义的 http agent 一致
# (原示例误用了 *radius 的名称,照抄将启动不了任何组件)。
#flume-ng agent -n agent2hivehttp -c conf -f /root/conf2hivehttp.properties --classpath /home/hadoop/hive-2.3.4/lib/*:/home/hadoop/hive-2.3.4/hcatalog/share/hcatalog/* -Dflume.root.logger=INFO,console
#flume-ng agent -n agent2kafkahttp -c conf -f /root/conf2kafkahttp.properties --classpath /home/hadoop/hive-2.3.4/lib/*:/home/hadoop/hive-2.3.4/hcatalog/share/hcatalog/* -Dflume.root.logger=INFO,console

 

你可能感兴趣的:(kafka,flume,hive)