学习笔记:从0开始学习大数据-39.综合实训二:hive+hbase对nginx日志分析

一、本节实现  nginx日志 ->flume-> hbase ->hive ->file

即nginx日志数据导入到hbase保存,使用hive创建外部表并用sql查询,结果存入本地文件。

1. flume 导入nginx日志到hbase

hbase 先建立表

create 'nginx_log','log_info'

[root@centos7 apache-flume-1.6.0-cdh5.16.1-bin]# cat conf/hbase_nginx_1.conf
a3.sources = r3
a3.sinks = k3
a3.channels = c3

# Describe/configure the source
a3.sources.r3.type = exec
a3.sources.r3.command = cat /var/log/nginx/access.log-20200221
a3.sources.r3.checkperiodic = 1000

# Describe the sink
a3.sinks.k3.type = org.apache.flume.sink.hbase.HBaseSink
a3.sinks.k3.table = nginx_log
a3.sinks.k3.columnFamily = log_info
a3.sinks.k3.serializer =  org.apache.flume.sink.hbase.RegexHbaseEventSerializer
a3.sinks.k3.serializer.regex = (\\d+\\.\\d+\\.\\d+\\.\\d+) - ([^ ]*) \\[(.*)\\] \"(.*)\" ([^ ]*) ([^ ]*)
a3.sinks.k3.serializer.colNames = ip,user,date_now,url,status,bytes

# Use a file channel, which buffers events on disk (checkpoint and data dirs below)
a3.channels.c3.type = file
a3.channels.c3.checkpointDir =  /home/linbin/software/apache-flume-1.6.0-cdh5.16.1-bin/flume_file/checkpoint
a3.channels.c3.dataDirs = /home/linbin/software/apache-flume-1.6.0-cdh5.16.1-bin/flume_file/data

# Bind the source and sink to the channel
a3.sinks.k3.channel = c3
a3.sources.r3.channels = c3

[root@centos7 apache-flume-1.6.0-cdh5.16.1-bin]# bin/flume-ng agent --name a3 --conf conf --conf-file conf/hbase_nginx_1.conf -Dflume.root.logger=info,console
2.  hbase 查看导入的数据

[root@centos7 ~]# hbase shell
HBase Shell; enter 'help' for list of supported commands.
Type "exit" to leave the HBase Shell
Version 1.2.0-cdh5.15.1, rUnknown, Thu Aug  9 09:07:24 PDT 2018

hbase(main):001:0> count 'nginx_log'
Current count: 1000, row: 1582901957233-l2PYhzNr9E-999                                                                                                
Current count: 2000, row: 1582901957569-l2PYhzNr9E-1999                                                                                               
Current count: 3000, row: 1582901957759-l2PYhzNr9E-2999                                                                                               
Current count: 4000, row: 1582901957940-l2PYhzNr9E-3999                                                                                               
Current count: 5000, row: 1582901958111-l2PYhzNr9E-4999                                                                                               
Current count: 6000, row: 1582901958285-l2PYhzNr9E-5999                                                                                               
Current count: 7000, row: 1582901958484-l2PYhzNr9E-6999                                                                                               
Current count: 8000, row: 1582901958657-l2PYhzNr9E-7999                                                                                               
Current count: 9000, row: 1582901958810-l2PYhzNr9E-8999                                                                                               
Current count: 10000, row: 1582901958976-l2PYhzNr9E-9999                                                                                              
10450 row(s) in 3.1460 seconds

=> 10450
hbase(main):002:0> 


hbase(main):006:0> scan 'nginx_log',{LIMIT=>1}
ROW                                    COLUMN+CELL                                                                                                    
 1582901956453-l2PYhzNr9E-0            column=log_info:bytes, timestamp=1582901956786, value=24365                                                    
 1582901956453-l2PYhzNr9E-0            column=log_info:date_now, timestamp=1582901956786, value=21/Feb/2020:00:11:05 +0800                            
 1582901956453-l2PYhzNr9E-0            column=log_info:ip, timestamp=1582901956786, value=163.204.2.104                                               
 1582901956453-l2PYhzNr9E-0            column=log_info:status, timestamp=1582901956786, value=200                                                     
 1582901956453-l2PYhzNr9E-0            column=log_info:url, timestamp=1582901956786, value=GET / HTTP/1.1                                             
 1582901956453-l2PYhzNr9E-0            column=log_info:user, timestamp=1582901956786, value=-                                                         
1 row(s) in 0.0140 seconds

3. hive创建hbase外表,测试查询表

#cd /home/linbin/software/hive-1.1.0-cdh5.15.1/bin
#./hive

hive> CREATE EXTERNAL TABLE nginx_log(key string, bytes string,date_now string,ip string,status string,url string)   
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'  
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,log_info:bytes,log_info:date_now,log_info:ip,log_info:status,log_info:url")   
TBLPROPERTIES("hbase.table.name" = "nginx_log");
hive> select * from nginx_log limit 1;
hive> select count(*) from nginx_log where ip='163.204.2.104';
hive> select ip,count(*) from nginx_log group by ip;
hive> select split(date_now,':')[1],count(*) from nginx_log group by split(date_now,':')[1];
hive> select split(date_now,':')[1] as house,count(*) from nginx_log group by split(date_now,':')[1];

4.  查询存入本地文件

hive> insert overwrite local directory '/usr/local/apache-tomcat-9.0.19/webapps/examples/nginxlog3'
    > row format delimited fields terminated by ','
    > select split(date_now,':')[1] as house,count(*) from nginx_log group by split(date_now,':')[1];
[root@centos7 bin]# cat /usr/local/apache-tomcat-9.0.19/webapps/examples/nginxlog3/000000_0
00,236
01,104
02,116
03,47
04,484
05,855
...
可以查看到结果,即每小时的访问量

5. 也可以直接bash执行hive查询

./hive -e "select split(date_now,':')[1] as house,count(*) from nginx_log group by split(date_now,':')[1]"  |  grep -v "WARN"  | tr "\t" "," >> /usr/local/apache-tomcat-9.0.19/webapps/examples/nginxlog2

cat /usr/local/apache-tomcat-9.0.19/webapps/examples/nginxlog2

00,236
01,104
02,116
03,47
04,484
05,855
...
二、进一步实现hive查询结果导入到mysql

已实现:nginx日志 ->flume-> hbase ->hive ->file

进一步:nginx日志 ->flume-> hbase ->hive ->hdfs->sqoop2->mysql

就是hive查询结果写入hdfs,再通过sqoop2导入到mysql

前面几步与上面相同:数据已导入到hbase,并在hive中创建了外部表,接下来:

1. hive查询结果写入hdfs

hive> insert overwrite  directory '/user/sqoop/nginx_log'
> row format delimited fields terminated by ','
> select split(date_now,':')[1] as house,count(*) from nginx_log group by split(date_now,':')[1];

[root@centos7 bin]# hadoop fs -cat /user/sqoop/nginx_log/000000_0
00,236
01,104
02,116
03,47
04,484
05,855
2. mysql 创建表格

create table sqoop.nginx_log1(house varchar(20),vicount int(11));

3. 在sqoop2中创建job并执行,结果如下

sqoop:000> show connector
+----+------------------------+------------------+------------------------------------------------------+----------------------+
| Id |          Name          |     Version      |                        Class                         | Supported Directions |
+----+------------------------+------------------+------------------------------------------------------+----------------------+
| 1  | generic-jdbc-connector | 1.99.5-cdh5.15.1 | org.apache.sqoop.connector.jdbc.GenericJdbcConnector | FROM/TO              |
| 2  | kite-connector         | 1.99.5-cdh5.15.1 | org.apache.sqoop.connector.kite.KiteConnector        | FROM/TO              |
| 3  | hdfs-connector         | 1.99.5-cdh5.15.1 | org.apache.sqoop.connector.hdfs.HdfsConnector        | FROM/TO              |
| 4  | kafka-connector        | 1.99.5-cdh5.15.1 | org.apache.sqoop.connector.kafka.KafkaConnector      | TO                   |
+----+------------------------+------------------+------------------------------------------------------+----------------------+
sqoop:000> show link
+----+---------------+--------------+------------------------+---------+
| Id |     Name      | Connector Id |     Connector Name     | Enabled |
+----+---------------+--------------+------------------------+---------+
| 4  | mysql-link-ok | 1            | generic-jdbc-connector | true    |
| 5  | hdfs-link     | 3            | hdfs-connector         | true    |
+----+---------------+--------------+------------------------+---------+
sqoop:000> show job
+----+---------------------------+----------------+--------------+---------+
| Id |           Name            | From Connector | To Connector | Enabled |
+----+---------------------------+----------------+--------------+---------+
| 1  | from-mysql-to-hdfs-import | 1              | 3            | true    |
| 5  | fromhdfstomysql           | 3              | 1            | true    |
+----+---------------------------+----------------+--------------+---------+
sqoop:000> show job --jid 5
1 job(s) to show: 
Job with id 5 and name fromhdfstomysql (Enabled: true, Created by root at 20-3-2 下午1:01, Updated by root at 20-3-2 下午2:47)
Using link id 5 and Connector id 3
  From Job configuration
    Input directory: /user/sqoop/nginx_log/000000_0
    Override null value: 
    Null value: 
  Throttling resources
    Extractors: 1
    Loaders: 1
  To database configuration
    Schema name: sqoop
    Table name: nginx_log1
    Table SQL statement: 
    Table column names: 
    Stage table name: 
    Should clear stage table: 
sqoop:000> 
sqoop:000> start job --jid 5 -s 
Submission details
Job ID: 5
Server URL: http://centos7:12000/sqoop/
Created by: root
Creation date: 2020-03-02 14:47:58 CST
Lastly updated by: root
External ID: job_1583121373156_0012
    http://centos7:8088/proxy/application_1583121373156_0012/
Target Connector schema: Schema{name=sqoop.nginx_log1,columns=[
    Text{name=house,nullable=true,type=TEXT,charSize=null},
    FixedPoint{name=vicount,nullable=true,type=FIXED_POINT,byteSize=4,signed=true}]}
2020-03-02 14:47:58 CST: BOOTING  - Progress is not available
2020-03-02 14:48:15 CST: RUNNING  - 0.00 %
2020-03-02 14:48:25 CST: SUCCEEDED 

4.到mysql查询导入结果

MariaDB [sqoop]> select * from nginx_log1;
+-------+---------+
| house | vicount |
+-------+---------+
| 00    |     236 |
| 01    |     104 |
| 02    |     116 |
| 03    |      47 |
| 04    |     484 |
| 05    |     855 |
本次实验结束

然后,通过网页读取文件或查询mysql生成web图表,可参考上节。

你可能感兴趣的:(系统集成,hadoop,Hadoop)