1、配置服务器格式
博主这里用的是tomcat的combined默认格式,格式如下
127.0.0.1 - - [28/Mar/2017:09:23:10 +0800] "GET /manager/html HTTP/1.1" 401 2536 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
2、进入hive shell来创建表
使用正则表达式来匹配日志格式
-- Hive table that parses Tomcat "combined"-format access logs via regex.
-- Uses the BUILT-IN RegexSerDe (org.apache.hadoop.hive.serde2.RegexSerDe)
-- instead of the deprecated contrib SerDe, so no "ADD JAR hive-contrib" is
-- needed. The built-in SerDe requires every column to be STRING (they are)
-- and does not use "output.format.string", so that property is dropped.
CREATE TABLE IF NOT EXISTS td_log_analyze(
host STRING,      -- client IP address
identity STRING,  -- RFC 1413 identity, usually "-"
usr STRING,       -- authenticated user, usually "-"
time STRING,      -- request timestamp, e.g. [28/Mar/2017:09:23:10 +0800]
request STRING,   -- request line, e.g. "GET /manager/html HTTP/1.1"
status STRING,    -- HTTP status code
size STRING,      -- response body size in bytes, "-" when none
referer STRING,   -- Referer header (may be absent)
agent STRING)     -- User-Agent header (may be absent)
partitioned by (dt string)  -- daily partition; matches the Flume HDFS path %Y-%m-%d
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
-- One capture group per column; the trailing (?: ... )? makes referer/agent optional.
"input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?"
)
STORED AS TEXTFILE;
3、配置flume-agent
# Agent name and its source / channel / sink component names
logAnalyzeAG.sources = s1
logAnalyzeAG.channels = c1
logAnalyzeAG.sinks = k1

# Source: watch a spooling directory for completed Tomcat access-log files
logAnalyzeAG.sources.s1.type = spooldir
logAnalyzeAG.sources.s1.spoolDir = /home/data/tomcat/tomcat-8081/logs/access
# NOTE(review): maxLineLength belongs to the default LINE deserializer; with
# BlobDeserializer (below) the relevant property is deserializer.maxBlobLength
# — confirm which deserializer is actually intended.
logAnalyzeAG.sources.s1.deserializer.maxLineLength = 1048576
# BUGFIX: the next two properties were set on nonexistent source "s5" and
# therefore silently never applied; they must target s1.
logAnalyzeAG.sources.s1.fileSuffix = .FINISH
# Skip the file Tomcat is still writing today (rotated files carry a date)
logAnalyzeAG.sources.s1.ignorePattern = ^localhost_access_log\.txt$
logAnalyzeAG.sources.s1.consumeOrder = oldest
logAnalyzeAG.sources.s1.deserializer = org.apache.flume.sink.solr.morphline.BlobDeserializer$Builder
# BUGFIX: Flume property names are case-sensitive ("batchSize", not "batchsize")
logAnalyzeAG.sources.s1.batchSize = 5
# Optional interceptor that stamps each event with a timestamp header
# (fixed to reference s1; the original commented lines pointed at "r1")
#logAnalyzeAG.sources.s1.interceptors = i1
#logAnalyzeAG.sources.s1.interceptors.i1.type = org.apache.flume.interceptor.TimestampInterceptor$Builder

# Channel: in-memory buffer between source and sink
logAnalyzeAG.channels.c1.type = memory
logAnalyzeAG.channels.c1.capacity = 10000
logAnalyzeAG.channels.c1.transactionCapacity = 100

# Sink: write events into the Hive table's daily partition directory on HDFS.
# For an EXTERNAL table point this at its LOCATION; for a managed table point
# it at the table's warehouse directory, as done here.
logAnalyzeAG.sinks.k1.type = hdfs
logAnalyzeAG.sinks.k1.hdfs.path = hdfs://172.16.38.159:8020/apps/hive/warehouse/log_data.db/td_log_analyze/%Y-%m-%d
logAnalyzeAG.sinks.k1.hdfs.filePrefix = log-%Y-%m-%d
logAnalyzeAG.sinks.k1.hdfs.fileSuffix = .log
logAnalyzeAG.sinks.k1.hdfs.fileType = DataStream
# Do not roll files based on event count
logAnalyzeAG.sinks.k1.hdfs.rollCount = 0
# Roll when a file reaches 2914560 bytes (~2.8 MB; use 134217728 for 128 MB —
# the original comment claimed 128 MB, which did not match this value)
logAnalyzeAG.sinks.k1.hdfs.rollSize = 2914560
# Optionally also roll every 60 seconds
#logAnalyzeAG.sinks.k1.hdfs.rollInterval = 60
# Required so the %Y-%m-%d escapes resolve without a timestamp interceptor
logAnalyzeAG.sinks.k1.hdfs.useLocalTimeStamp = true

# Wire the source and the sink to the channel
logAnalyzeAG.sources.s1.channels = c1
logAnalyzeAG.sinks.k1.channel = c1
PS:采用spoolDir来采集是文件级别的,即扫描新增文件。实时增量可以采用tailDir。上面的配置文件会过滤掉当天打印的文件localhost_access_log.txt。
4、因为对日志进行了分区存放,所以要想让hive表识别到对应的指定分区,需要先创建hive分区
ALTER TABLE td_log_analyze ADD IF NOT EXISTS PARTITION (dt='2017-03-29') LOCATION '/apps/hive/warehouse/log_data.db/td_log_analyze/2017-03-29/';
注意,需要先创建分区,再采集数据到hdfs,hive才能识别到数据
5、编写shell脚本,每天创建第二天的hive分区
#!/bin/bash
# Pre-create tomorrow's Hive partition for td_log_analyze so that the data
# Flume lands in the dated HDFS directory is immediately visible to Hive.
# Abort on the first failing command so a broken SQL file is never executed.
set -e

# Tomorrow's date, in the same %Y-%m-%d format the Flume HDFS sink path uses
day=$(date --date='1 days' +%Y-%m-%d)
sql_file=/home/log.sql

# Write the ALTER TABLE statement; the redirection creates/overwrites the file,
# so the original "cd /home/; touch log.sql" steps were redundant.
echo "ALTER TABLE td_log_analyze ADD IF NOT EXISTS PARTITION (dt='${day}') LOCATION '/apps/hive/warehouse/log_data.db/td_log_analyze/${day}/';" > "${sql_file}"

# Execute via HiveServer2, discovered through ZooKeeper
beeline -u 'jdbc:hive2://node0.hdp:2181,node2.hdp:2181,node1.hdp:2181/log_data;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2' -f "${sql_file}"
6、设置定时任务(centos)
crontab -e
添加任务信息,每天凌晨两点执行
0 2 * * * /home/shell-test/hive-partitions.sh
重新载入配置
service crond reload
更多定时任务功能参考:http://julyme.com/20170329/75.html
PS:附上tomcat日志格式的配置方法
转载请注明出处: http://www.julyme.com/20170327/74.html