流程简介:
Nginx定义日志格式:
$remote_addr 客户端IP
$time_local 通用日志格式下的本地时间
$status 状态码
$body_bytes_sent 发送给客户端的字节数,不包括响应头的大小
$http_user_agent 客户端浏览器信息
$http_referer 请求的referer地址。
$request 完整的原始请求
$request_method #HTTP请求方法,通常为"GET"或"POST"
$request_time 请求处理时长
$request_uri 完整的请求地址
$server_protocol #服务器的HTTP版本,通常为 "HTTP/1.0" 或 "HTTP/1.1"
$request_body POST请求参数,参数需放form中
token $http_token (自定义header字段前加http_,即可将指定的自定义header字段打印到log中。)
version $arg_version (自定义请求参数前加arg_,即可将指定的请求参数字段打印到log中。)
Nginx配置文件中配置输出日志格式:
log_format user_log_format "$remote_addr,$time_local,$status,$body_bytes_sent,$http_user_agent,$http_referer,$request_method,$request_time,$request_uri,$server_protocol,$request_body,$http_token";
示例:
1.119.140.194,29/Dec/2018:02:08:50 +0000,200,556,okhttp/3.8.1,-,GET,0.028,/phone/resource?type=3,HTTP/1.1,-,-
flume依赖的包:
flume-conf.properties:
# agent
a1.sources = s1
a1.channels = c1
a1.sinks = k1
# sources
a1.sources.s1.type = spooldir
a1.sources.s1.channels = c1
# FTP路径
a1.sources.s1.spoolDir = /var/ftp/
# 自定义拦截器
a1.sources.s1.interceptors = f1
a1.sources.s1.interceptors.f1.type = com.hx.common.flume.FlumeBuilder
# sink
a1.sinks.k1.type=hive
a1.sinks.k1.channel = c1
# hive地址
a1.sinks.k1.hive.metastore=thrift://11.11.11.11:9083
a1.sinks.k1.hive.database=hive_test
a1.sinks.k1.hive.table=nginx_acc_log
a1.sinks.k1.serializer=delimited
# 输入分隔符
a1.sinks.k1.serializer.delimiter=","
# 输出分隔符
a1.sinks.k1.serializer.serdeSeparator=','
a1.sinks.k1.serializer.fieldnames=remote_addr,time_local,status,body_bytes_sent,http_user_agent,http_referer,request_method,request_time,request_uri,server_protocol,request_body,http_token,id,appkey,sing,version
# channel
a1.channels.c1.type = memory
a1.channels.c1.capacity = 100
启动flume:
nohup ./flume-ng agent -c /opt/flume/apache-flume/conf -f /opt/flume/apache-flume/conf/flume-conf.properties -n a1 -Dflume.root.logger=INFO,console &
经过Flume拦截器处理后字段:
remote_addr,time_local,status,body_bytes_sent,http_user_agent,http_referer,request_method,request_time,request_uri,server_protocol,request_body,http_token,id,appkey,sing,version
自定义拦截器Common-1.1.0.jar包
compile ('org.apache.flume:flume-ng-sdk:1.8.0')
{
exclude group: 'org.slf4j'
}
compile ('org.apache.flume:flume-ng-core:1.8.0')
{
exclude group: 'org.slf4j'
}
package com.hx;
import org.apache.flume.Context;
import org.apache.flume.interceptor.Interceptor;
/**
 * Interceptor.Builder registered in flume-conf.properties
 * (a1.sources.s1.interceptors.f1.type); the agent calls {@link #build()}
 * to obtain the cleaning interceptor.
 */
public class FlumeBuilder implements Interceptor.Builder {

    /** No properties are read from the agent configuration. */
    @Override
    public void configure(Context context) {
        // intentionally empty
    }

    /** Hands a fresh interceptor instance to the source. */
    @Override
    public Interceptor build() {
        FlumeInterceptor interceptor = new FlumeInterceptor();
        return interceptor;
    }
}
package com.hx;
import com.google.common.base.Charsets;
import org.apache.commons.lang.StringUtils;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;

import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.List;
import java.util.Locale;
import java.util.Map;
/**
 * Flume interceptor that cleans one comma-separated Nginx access-log line
 * (see the log_format user_log_format definition) into the column layout
 * expected by the Hive sink, enriching it with device ids resolved from
 * Redis and with derived date/hour columns.
 */
public class FlumeInterceptor implements Interceptor {

    // Resolves a request token back to a phone uid or box id.
    final RedisClientService redisClient = new RedisClientService();

    @Override
    public void initialize() {
        // no state to set up
    }

    @Override
    public void close() {
        // nothing to release
    }

    /**
     * Cleans one event body in place.
     *
     * Input body layout (comma separated, indexes into body_sp):
     *   0 remote_addr, 1 time_local, 2 status, 3 body_bytes_sent,
     *   4 http_user_agent, 5 http_referer, 6 request_method, 7 request_time,
     *   8 request_uri, 9 server_protocol, 10 request_body, 11 token
     *
     * @param event raw Nginx log event; may be null
     * @return the same event with a rewritten body, or null when event is
     *         null. On a parse failure the original body is left untouched.
     */
    @Override
    public Event intercept(Event event) {
        if (event == null) {
            return null;
        }
        Map<String, String> headerMap = event.getHeaders();
        headerMap.put("timestamp", String.valueOf(System.currentTimeMillis()));
        System.out.println("采集的header信息:" + headerMap.toString());

        String body = new String(event.getBody(), StandardCharsets.UTF_8);
        System.out.println("采集的body信息:" + body);
        if (!StringUtils.isNotEmpty(body)) {
            return event;
        }
        StringBuilder sb = new StringBuilder();
        try {
            String[] body_sp = body.split(",");
            sb.append(body_sp[0]).append(',');  // remote_addr
            sb.append(body_sp[2]).append(',');  // status (http_referer at index 5 is dropped)
            sb.append(body_sp[3]).append(',');  // body_bytes_sent
            sb.append(body_sp[4]).append(',');  // http_user_agent
            sb.append(body_sp[6]).append(',');  // request_method
            sb.append(body_sp[7]).append(',');  // request_time

            String request_method = body_sp[6];
            Map<String, String> params = null;
            if ("GET".equals(request_method)) {
                // GET: parameters come from the query string of the uri.
                String request_uri = body_sp[8];
                params = AnalysisUri.UrlRequest(request_uri);
                sb.append(AnalysisUri.UrlPage(request_uri)).append(',');         // path only
                sb.append(body_sp[9]).append(',');                               // server_protocol
                sb.append(AnalysisUri.TruncateUrlPage(request_uri)).append(','); // query part
            } else if ("POST".equals(request_method)) {
                // POST: parameters come from the form-encoded request body.
                String request_body = body_sp[10];
                params = AnalysisUri.NoUrlRequest(request_body);
                sb.append(body_sp[8]).append(',');    // request_uri
                sb.append(body_sp[9]).append(',');    // server_protocol
                sb.append(request_body).append(',');  // request_body
            }
            // Any other method is an invalid request: params stays null and
            // the event body is forwarded unchanged.
            if (params != null) {
                String token = body_sp[11];
                sb.append(token).append(',');
                sb.append(params.get("appkey")).append(',');
                sb.append(params.get("version")).append(',');
                sb.append(params.get("channel")).append(',');
                sb.append(params.get("checktype")).append(',');
                // Type 0 = phone uid, type 1 = box id (Redis reverse lookup).
                String uid = getIdByToken(token, 0);
                String box_id = getIdByToken(token, 1);
                String device_type;
                if (uid != null) {
                    device_type = "phone";
                    sb.append(uid).append(',');
                } else if (box_id != null) {
                    device_type = "box";
                    sb.append(box_id).append(',');
                } else {
                    device_type = "other";
                    sb.append("null,");  // same output as the historical `null + ","`
                }
                sb.append(device_type).append(',');
                // c_time is "yyyy-MM-dd HH:mm:ss"; derive the date partition
                // (first 10 chars) and hour partition (chars 11-13) from it.
                String c_time = timeFormat(body_sp[1]);
                sb.append(c_time).append(',');
                sb.append(c_time.substring(0, 10)).append(',');
                sb.append(c_time.substring(11, 13));
                event.setBody(sb.toString().getBytes(StandardCharsets.UTF_8));
                System.out.println("清洗后的body信息:" + sb.toString());
            }
        } catch (Exception e) {
            // A malformed line is logged and forwarded unmodified rather
            // than aborting the whole batch.
            e.printStackTrace();
        }
        return event;
    }

    /** Applies {@link #intercept(Event)} to every event in the batch. */
    @Override
    public List<Event> intercept(List<Event> events) {
        for (Event event : events) {
            intercept(event);
        }
        return events;
    }

    /**
     * Resolves the device id bound to a token in Redis.
     *
     * Looks up "token:&lt;token&gt;" for the candidate id, then verifies the
     * reverse key "token:&lt;type&gt;:&lt;id&gt;" still maps to the same token.
     *
     * @param token client token taken from the log line
     * @param type  0 for phone uid, 1 for box id
     * @return the id, or null when the token is unknown, stale, or Redis fails
     */
    public String getIdByToken(String token, int type) {
        try {
            String id = redisClient.getStr("token:" + token);
            String k = "token:" + type + ":" + id;
            if (redisClient.exist(k) && token.equals(redisClient.getStr(k))) {
                return id;
            }
            return null;
        } catch (Exception e) {
            // Best effort: a Redis outage must not stop log collection.
            return null;
        }
    }

    /**
     * Converts the Nginx time_local value ("29/Dec/2018:02:08:50 +0000")
     * into "yyyy-MM-dd HH:mm:ss" in the JVM's default time zone.
     *
     * Fix: the source pattern must use HH (0-23); the previous hh (1-12)
     * pattern mis-parsed afternoon timestamps.
     *
     * @param time raw time_local field
     * @return formatted timestamp, or "" when the input cannot be parsed
     */
    private String timeFormat(String time) {
        SimpleDateFormat sourceFormat = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss Z", Locale.ENGLISH);
        SimpleDateFormat transferFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        try {
            return transferFormat.format(sourceFormat.parse(time));
        } catch (ParseException e) {
            e.printStackTrace();
            return "";
        }
    }
}
package com.hx;
import java.util.HashMap;
import java.util.Map;
/**
* 解析出url请求的路径和参数键值
*
*/
/**
 * Helpers that split a request URI into its path and query-parameter parts
 * and parse form/query strings into key-value maps.
 */
public class AnalysisUri {

    /**
     * Extracts the path part (everything before the first '?') of a URI.
     *
     * @param strURL full request URI, e.g. "/phone/resource?type=3"
     * @return the path, or {@code null} when the URI carries no query string
     */
    public static String UrlPage(String strURL) {
        String strPage = null;
        String[] arrSplit = strURL.split("[?]");
        if (strURL.length() > 0 && arrSplit.length > 1 && arrSplit[0] != null) {
            strPage = arrSplit[0];
        }
        return strPage;
    }

    /**
     * Strips the path and returns the query-parameter part of a URI.
     *
     * @param strURL full request URI, e.g. "/phone/resource?type=3"
     * @return the query string ("type=3"), or {@code null} when absent
     */
    public static String TruncateUrlPage(String strURL) {
        String strAllParam = null;
        String[] arrSplit = strURL.split("[?]");
        if (strURL.length() > 1 && arrSplit.length > 1 && arrSplit[1] != null) {
            strAllParam = arrSplit[1];
        }
        return strAllParam;
    }

    /**
     * Parses the query part of a URL into key-value pairs, e.g.
     * "index.jsp?Action=del&amp;id=123" yields {Action=del, id=123}.
     *
     * @param URL full request URI
     * @return parameter map; empty when the URI has no query string
     */
    public static Map<String, String> UrlRequest(String URL) {
        Map<String, String> mapRequest = new HashMap<>();
        String strUrlParam = TruncateUrlPage(URL);
        if (strUrlParam == null) {
            return mapRequest;
        }
        for (String pair : strUrlParam.split("[&]")) {
            String[] kv = pair.split("[=]");
            if (kv.length > 1) {
                mapRequest.put(kv[0], kv[1]);
            } else if (!kv[0].isEmpty()) {
                // Fix: the original compared strings with != (reference
                // comparison), which let empty keys slip into the map.
                // A key without '=' is kept with an empty value.
                mapRequest.put(kv[0], "");
            }
        }
        return mapRequest;
    }

    /**
     * Parses a form-encoded body (no leading path), e.g. "a=1&amp;b=2",
     * into key-value pairs.
     *
     * @param URL form-encoded parameter string
     * @return parameter map; empty when the input holds no parameters
     */
    public static Map<String, String> NoUrlRequest(String URL) {
        Map<String, String> mapRequest = new HashMap<>();
        for (String pair : URL.split("[&]")) {
            String[] kv = pair.split("[=]");
            if (kv.length > 1) {
                mapRequest.put(kv[0], kv[1]);
            } else if (!kv[0].isEmpty()) {
                // Same reference-comparison fix as UrlRequest: skip empty
                // keys, keep value-less keys with "".
                mapRequest.put(kv[0], "");
            }
        }
        return mapRequest;
    }
}
先切换账户:
su hadoop
启动hive:
nohup hive --service metastore >> ~/metastore.log 2>&1 &
nohup hive --service hiveserver2 >> ~/hiveserver2.log 2>&1 &
修改权限:
hdfs dfs -ls /user/hive/warehouse
hadoop dfs -chmod 777 /user/hive/warehouse/hive_test.db
建表语句:(与flume输出一致)
DROP TABLE IF EXISTS nginx_acc_log;
create table nginx_acc_log(remote_addr string,time_local string,status string,body_bytes_sent string,http_user_agent string,http_referer string,request_method string,request_time string,request_uri string,server_protocol string,request_body string,http_token string,id string,appkey string,sing string,version string) clustered by (id) into 5 buckets stored as orc TBLPROPERTIES ('transactional'='true');
设置hive事务:
hive.support.concurrency
true
hive.exec.dynamic.partition.mode
nonstrict
hive.txn.manager
org.apache.hadoop.hive.ql.lockmgr.DbTxnManager
hive.compactor.initiator.on
true
hive.compactor.worker.threads
5
hive.enforce.bucketing
true
显示表:
hive> show databases;
hive> use hive_test;
hive> show tables;
hive> select * from nginx_acc_log;