Big Data Walkthrough: Finding the Top 10 Most-Accessed Endpoints and Their Hit Counts from Log Files

Installing the Software


Hadoop

Install Hadoop 2.6.1
Reference: http://www.powerxing.com/install-hadoop/

Hive

Install Hive 2.1.1
Reference: https://my.oschina.net/jackieyeah/blog/735424

Sqoop

Install Sqoop 1.4.6
Reference: http://blog.csdn.net/dwld_3090271/article/details/50747639
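
As a quick sanity check after installation (optional; this sketch assumes all three tools are already on the PATH), each tool can print its version:

# Verify the installations
hadoop version
hive --version
sqoop version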

Steps


Prepare the Data


From log files

Use the Tomcat access log as the data source.
Log format:
date time cs(x-forwarded-for) c-ip s-ip sc-status bytes time-taken cs-method cs-uri-stem cs-uri-query cs(x-c_sessionid) cs(Cookie) cs(Referer) cs(User-Agent)
Log samples:
2017-04-18 02:20:07 - 127.0.0.1 127.0.0.1 200 165 0.065 GET /user/service/get-group-list - - - - "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"
2017-04-18 02:25:08 "127.0.0.1, 127.0.0.1" 127.0.0.1 127.0.0.1 200 165 0.065 GET /user/service/get-info - - - - "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"
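
Note the second sample: when X-Forwarded-For carries a quoted, comma-separated list, the space inside the quotes makes the line split into one extra space-delimited field, shifting every later column by one. This is why the statistics step later separates rows by whether the url column holds 'GET' or 'POST'. A quick way to see the shift (a sketch, assuming the sample log file used below):

# Print the 9th and 10th space-delimited fields:
# normal lines show "GET /user/...", shifted lines show "0.065 GET"
awk '{print $9, $10}' /home/user/input/user-acc.2017-02-27.log | head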

Import from a relational database

sqoop --options-file import_conf
The import_conf file is shown below.
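
For reference, the options file is equivalent to passing the same flags inline; a minimal sketch built from the values in the import_conf shown later in this post:

sqoop import -m 1 \
  --connect jdbc:mysql://127.0.0.1/test \
  --username test --password test \
  --table operation_log --columns ID,ACTIONTYPE,ACTIONVALUE \
  --where 'ID<=10000' \
  --fields-terminated-by '\t' --lines-terminated-by '\n' \
  --hive-import --create-hive-table --hive-table operation_log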

Create the Hive tables

CREATE TABLE
IF NOT EXISTS useracc (
	dt string,
	time string,
	x_forwarded_for string,
	client_ip string,
	server_ip string,
	http_status string,
	content_length string,
	time_taken string,
	http_method string,
	url string,
	query_string string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE;

CREATE TABLE
IF NOT EXISTS userlog (
	client_ip string,
	server_ip string,
	http_status INT,
	content_length INT,
	time_taken DOUBLE,
	http_method string,
	url string
) PARTITIONED BY (dt STRING, ht STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE;

CREATE TABLE
IF NOT EXISTS urlcount (
	url string,
	total bigint,
	app string,
	day string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE;


Load data into the useracc table

LOAD DATA LOCAL INPATH '/home/user/input/user-acc.2017-02-27.log' INTO TABLE useracc;
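
To confirm the load (an optional check), the row count in useracc should match the line count of the loaded file:

hive -e "SELECT count(*) FROM useracc"
wc -l /home/user/input/user-acc.2017-02-27.log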

Load data into the userlog table with dynamic partitions (by day and hour)

set hive.exec.dynamic.partition=true;  
set hive.exec.dynamic.partition.mode=nonstrict;  
SET hive.exec.max.dynamic.partitions=3000;
SET hive.exec.max.dynamic.partitions.pernode=1000;

INSERT OVERWRITE TABLE userlog PARTITION (dt, ht)  
SELECT client_ip, server_ip, http_status, content_length, time_taken, http_method, url, dt, substr(time,0,2) FROM useracc where url != 'GET' AND url != 'POST';

INSERT INTO TABLE userlog PARTITION (dt, ht)  
SELECT server_ip, http_status, content_length, time_taken, http_method, url,query_string, dt, substr(time,0,2) FROM useracc where url = 'GET' OR url = 'POST';
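
The two statements split the rows by where the HTTP method ended up after the space-delimited parse: for lines with a quoted X-Forwarded-For value the columns shift by one, so the real URL sits in query_string and the url column holds 'GET' or 'POST'; the second INSERT re-aligns those columns. Afterwards the dynamic partitions can be listed (optional check):

# One partition per day/hour combination found in the data
hive -e "SHOW PARTITIONS userlog"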

Aggregate the data and save the result to the urlcount table

hive -e "INSERT INTO TABLE urlcount SELECT url,count(1) as total,'user' as app,'2017-02-27' as day from userlog where dt = '2017-02-27' group by url order by total desc limit 10"
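
The result can be inspected in Hive before exporting (optional):

hive -e "SELECT * FROM urlcount WHERE day = '2017-02-27'"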

Export to MySQL

The export_conf file is shown below.
sqoop --options-file export_conf

MySQL table definition:

DROP TABLE IF EXISTS `ACC_RESULT`;
CREATE TABLE `ACC_RESULT` (
  `url` varchar(255) DEFAULT NULL,
  `total` bigint(20) DEFAULT NULL,
  `app` varchar(32) DEFAULT NULL,
  `day` date DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
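
After the export finishes, the rows can be checked from the MySQL side; a minimal sketch assuming the mysql command-line client and the connection values from export_conf (database result, user test):

mysql -h 127.0.0.1 -u test -ptest result \
  -e "SELECT url, total, app, day FROM ACC_RESULT ORDER BY total DESC"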

Automation Scripts


Start/stop the environment: startEnv.sh

#!/bin/bash 

op=$1
if [ x"$op" == x ]
then
  echo "Please input operation: start|stop"
else
  command1="/usr/local/hadoop/sbin/""$op""-dfs.sh"
  command2="/usr/local/hadoop/sbin/""$op""-yarn.sh"
  command3="/usr/local/hadoop/sbin/mr-jobhistory-daemon.sh ""$op"" historyserver"
  if [ "$op" == "start" ] || [ "$op" == "stop" ]
  then
    eval $command1
    eval $command2
    eval $command3
  else
    echo "Only (start|stop) operation is allowed"
  fi
fi
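
Usage: pass start or stop as the only argument.

# Bring HDFS, YARN and the MapReduce job history server up or down
sh startEnv.sh start
sh startEnv.sh stop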

Create the tables: create_table.hql

CREATE TABLE
IF NOT EXISTS useracc (
	dt string,
	time string,
	x_forwarded_for string,
	client_ip string,
	server_ip string,
	http_status string,
	content_length string,
	time_taken string,
	http_method string,
	url string,
	query_string string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE;

CREATE TABLE
IF NOT EXISTS userlog (
	client_ip string,
	server_ip string,
	http_status INT,
	content_length INT,
	time_taken DOUBLE,
	http_method string,
	url string
) PARTITIONED BY (dt STRING, ht STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE;

CREATE TABLE
IF NOT EXISTS urlcount (
	url string,
	total bigint,
	app string,
	day string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE;


Aggregation: user.hql

set hive.exec.dynamic.partition=true;  
set hive.exec.dynamic.partition.mode=nonstrict;  
SET hive.exec.max.dynamic.partitions=3000;
SET hive.exec.max.dynamic.partitions.pernode=1000;

INSERT OVERWRITE TABLE userlog PARTITION (dt, ht)  
SELECT client_ip, server_ip, http_status, content_length, time_taken, http_method, url, dt, substr(time,0,2) FROM useracc where url != 'GET' AND url != 'POST';

INSERT INTO TABLE userlog PARTITION (dt, ht)  
SELECT server_ip, http_status, content_length, time_taken, http_method, url,query_string, dt, substr(time,0,2) FROM useracc where url = 'GET' OR url = 'POST';

Launch the job: start.sh

#!/bin/bash 
app=$2
hive -f create_table.hql


hive -e "truncate table useracc"
hive -e "truncate table userlog"
hive -e "truncate table urlcount"


echo BaseDir: $1
for i in $1/*; do   
 sql="load data local inpath '"$(cd $(dirname $i);pwd;)/$(basename $i)"' into table useracc"
 echo "loading data from" $(basename $i)
 hive -e "$sql"
done


hive -f user.hql


for i in $1/*; do   
 echo "Processing" $(basename $i)
 logname_length=$(expr length $(basename $i))
 start_index=$(expr $logname_length - 13)
 day=$(expr substr $(basename $i) $start_index 10)
 echo "dt=""$day"
 hive -e "INSERT INTO TABLE urlcount SELECT url,count(1) as total,'""$app""' as app,'""$day""' as day from userlog where dt = '""$day""' group by url order by total desc limit 10"
done


# show result
#hive -e "select * from urlcount"


# export data into MySQL
sqoop --options-file export_conf

Sqoop export configuration: export_conf

export    
    
-m    
1    
    
--connect     
jdbc:mysql://127.0.0.1/result     
    
--username     
test     
    
--password     
test  
  
--export-dir   
/user/hive/warehouse/urlcount  
  
--input-fields-terminated-by   
'\t'  
  
--input-lines-terminated-by   
'\n'  
  
--columns  
url,total,app,day  
  
--table     
ACC_RESULT   

Sqoop import configuration: import_conf

import  
  
-m  
1  
  
--connect   
jdbc:mysql://127.0.0.1/test
  
--username   
test
  
--password   
test

--fields-terminated-by
'\t' 

--lines-terminated-by
'\n'
  
--table   
operation_log

--hive-table
operation_log

--create-hive-table
  
--columns  
ID,ACTIONTYPE,ACTIONVALUE
  
--where  
'ID<=10000'  

--hive-import

One-command launch (the first argument is the log directory, the second is the app name written to urlcount)

sh start.sh /home/user/input user

Common Issues

2017-04-20 16:17:43,077 main ERROR Could not register mbeans java.security.AccessControlException: access denied ("javax.management.MBeanTrustPermission" "register")
	at java.security.AccessControlContext.checkPermission(AccessControlContext.java:372)
	at java.lang.SecurityManager.checkPermission(SecurityManager.java:585)
	at com.sun.jmx.interceptor.DefaultMBeanServerInterceptor.checkMBeanTrustPermission(DefaultMBeanServerInterceptor.java:1848)
	at com.sun.jmx.interceptor.DefaultMBeanServerInterceptor.registerMBean(DefaultMBeanServerInterceptor.java:322)
	at com.sun.jmx.mbeanserver.JmxMBeanServer.registerMBean(JmxMBeanServer.java:522)
	at org.apache.logging.log4j.core.jmx.Server.register(Server.java:379)
	at org.apache.logging.log4j.core.jmx.Server.reregisterMBeansAfterReconfigure(Server.java:171)
	at org.apache.logging.log4j.core.jmx.Server.reregisterMBeansAfterReconfigure(Server.java:147)
	at org.apache.logging.log4j.core.LoggerContext.setConfiguration(LoggerContext.java:457)
	at org.apache.logging.log4j.core.LoggerContext.start(LoggerContext.java:246)
	at org.apache.logging.log4j.core.impl.Log4jContextFactory.getContext(Log4jContextFactory.java:230)
	at org.apache.logging.log4j.core.config.Configurator.initialize(Configurator.java:140)
	at org.apache.logging.log4j.core.config.Configurator.initialize(Configurator.java:113)
	at org.apache.logging.log4j.core.config.Configurator.initialize(Configurator.java:98)
	at org.apache.logging.log4j.core.config.Configurator.initialize(Configurator.java:156)
	at org.apache.hadoop.hive.common.LogUtils.initHiveLog4jDefault(LogUtils.java:155)
	at org.apache.hadoop.hive.common.LogUtils.initHiveLog4jCommon(LogUtils.java:91)
	at org.apache.hadoop.hive.common.LogUtils.initHiveLog4jCommon(LogUtils.java:83)
	at org.apache.hadoop.hive.common.LogUtils.initHiveLog4j(LogUtils.java:66)
	at org.apache.hadoop.hive.cli.CliDriver.run(CliDriver.java:657)
	at org.apache.hadoop.hive.cli.CliDriver.main(CliDriver.java:641)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:606)
	at org.apache.sqoop.hive.HiveImport.executeScript(HiveImport.java:331)
	at org.apache.sqoop.hive.HiveImport.importTable(HiveImport.java:241)
	at org.apache.sqoop.tool.ImportTool.importTable(ImportTool.java:514)
	at org.apache.sqoop.tool.ImportTool.run(ImportTool.java:605)
	at org.apache.sqoop.Sqoop.run(Sqoop.java:143)
	at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
	at org.apache.sqoop.Sqoop.runSqoop(Sqoop.java:179)
	at org.apache.sqoop.Sqoop.runTool(Sqoop.java:218)
	at org.apache.sqoop.Sqoop.runTool(Sqoop.java:227)
	at org.apache.sqoop.Sqoop.main(Sqoop.java:236)

Fix:
Add the following line to $JAVA_HOME/jre/lib/security/java.policy:
permission javax.management.MBeanTrustPermission "register";

Encountered IOException running import job: java.io.IOException: java.lang.ClassNotFoundException: org.apache.hadoop.hive.conf.HiveConf

Fix:
Add the environment variable
export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$HIVE_HOME/lib/*
and set HIVE_CONF_DIR in sqoop-env.sh.
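
A minimal sketch of the relevant lines in conf/sqoop-env.sh (the /usr/local/hive path is an assumption; use the actual Hive install directory):

# conf/sqoop-env.sh -- make Hive's jars and configuration visible to Sqoop
export HIVE_HOME=/usr/local/hive        # assumed install path
export HIVE_CONF_DIR=$HIVE_HOME/conf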



