hive演示

//日志处理演示
//http://download.labs.sogou.com/dl/q.html 完整版(2GB):gz格式
//访问时间\t用户ID\t[查询词]\t该URL在返回结果中的排名\t用户点击的顺序号\t用户点击的URL
//SogouQ1.txt、SogouQ2.txt、SogouQ3.txt分别是用head -n 或者tail -n 从SogouQ数据日志文件中截取

# Create the directory that the external table's LOCATION ('/user/SogouQ1') and the later -mv target both use.
hadoop fs -mkdir /user/SogouQ1
外部表需要放到一个目录下
bin/hdfs dfs -mv /user/SogouQ1.txt /user/SogouQ1/SougouQ1.txt
查看一下文件
hadoop@moon:/usr/local/hadoop$ bin/hdfs dfs -tail /user/SogouQ1/SougouQ1.txt
15/08/27 14:34:48 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
/index.html
20111230114209  baaaf1a5de7d43ead0e8304620b62352    ��ɳ��Ѿ��Ƹ   1   5   http://www.ganji.com/gongsi/5484315
20111230114209  08802c6e199d82c166bbbabd30c7ee66    ־��ͬ�����ϵ�����ô�ദ  3   1   http://iask.sina.com.cn/b/14749937.html
20111230114209  f77b6e57e97fd04de7c85fbbc4dd363f    ��ƽ���� 1   1   http://www.zpgd.net/zpxwsp/
20111230114209  436e9b67dcdfa318c10ca57e24d71846    �ȵ�����ѧ���ʽ��� 3   1   http://wenku.baidu.com/view/da709fa1b0717fd5360cdc8a.html
20111230114209  a68813d136dcbbc5cb06beed43c56cc7    ���ŵ���ԭ����Ƶ   8   1   http://baike.baidu.com/view/1948820.htm
20111230114209  5709ccd05f90140c0dca1f89b3acdc7d    400ai   1   1   http://d4ee.com/
20111230114209  324260e211ba87a7a9549dc4fe03845e    ��  2   1   http://www.it.com.cn/f/hotweb/057/12/142870.htm
20111230114210  a993c76070ed5d145528f72197c2bcc7    22mp4��Ӱ����    1   1   http://www.22mp4.com/
20111230114210  c410bd957bc99cdebd36d7a08a756624    �����ֻ���ʬ��Ӱ����   3   1   http://www.a67.com/movie/5822
20111230114210  51c8ed79ab04cb5618be75728d9858e2    ���   2   1   http://baike.baidu.com/view/6582.htm

创建一个外部表

-- External table over the Sogou query log. An external table reads the files
-- found in its LOCATION directory, so the data file has to be placed inside
-- /user/SogouQ1 rather than directly under /user.
-- Columns, in file order (tab-separated): access time, user/session id,
-- query word, rank of the URL in the result list, click sequence number,
-- clicked URL.
CREATE EXTERNAL TABLE SOGOUQ1 (
    DT         STRING,
    WEBSESSION STRING,
    WORD       STRING,
    S_SEQ      INT,
    C_SEQ      INT,
    WEBSITE    STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/user/SogouQ1';

hive> select count(*) from SOGOUQ1;
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks determined at compile time: 1
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapreduce.job.reduces=<number>
15/08/27 14:37:53 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform… using builtin-java classes where applicable
15/08/27 14:37:53 WARN conf.Configuration: file:/tmp/hadoop/hive_2015-08-27_14-37-51_106_5822727492659886521-1/-local-10003/jobconf.xml:an attempt to override final parameter: mapreduce.job.end-notification.max.retry.interval; Ignoring.
15/08/27 14:37:53 WARN conf.Configuration: file:/tmp/hadoop/hive_2015-08-27_14-37-51_106_5822727492659886521-1/-local-10003/jobconf.xml:an attempt to override final parameter: mapreduce.job.end-notification.max.attempts; Ignoring.
Execution log at: /tmp/hadoop/hadoop_20150827143737_211991eb-19f1-44ba-8c36-23af6c33d79b.log
Job running in-process (local Hadoop)
Hadoop job information for null: number of mappers: 0; number of reducers: 0
2015-08-27 14:37:56,807 null map = 0%, reduce = 0%
2015-08-27 14:37:58,899 null map = 100%, reduce = 100%
Ended Job = job_local1978371024_0001
Execution completed successfully
MapredLocal task succeeded
OK
1000000
Time taken: 8.18 seconds, Fetched: 1 row(s)



 mysql>

//查询有多少行数据

hive> Select count(*) from SOGOUQ1;

//显示前10行数据,这里不需要走MapReduce

-- Peek at the first 10 rows of the query-log table.
SELECT *
FROM SOGOUQ1
LIMIT 10;

//搜索结果排名第1,但是点击次序排在第2的数据有多少?

-- Count rows whose URL was ranked 1st in the results but was the 2nd click.
SELECT COUNT(*)
FROM SOGOUQ1
WHERE S_SEQ = 1
  AND C_SEQ = 2;
hive> select count(*) from SOGOUQ1 where S_SEQ=1 and C_SEQ=2;
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks determined at compile time: 1
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapreduce.job.reduces=<number>
15/08/27 14:40:19 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
15/08/27 14:40:19 WARN conf.Configuration: file:/tmp/hadoop/hive_2015-08-27_14-40-16_644_8353277626646375782-1/-local-10003/jobconf.xml:an attempt to override final parameter: mapreduce.job.end-notification.max.retry.interval;  Ignoring.
15/08/27 14:40:19 WARN conf.Configuration: file:/tmp/hadoop/hive_2015-08-27_14-40-16_644_8353277626646375782-1/-local-10003/jobconf.xml:an attempt to override final parameter: mapreduce.job.end-notification.max.attempts;  Ignoring.
Execution log at: /tmp/hadoop/hadoop_20150827144040_46c4f0da-c4ee-4010-bd3f-cafd8c639a95.log
Job running in-process (local Hadoop)
Hadoop job information for null: number of mappers: 0; number of reducers: 0
2015-08-27 14:40:22,679 null map = 0%,  reduce = 0%
2015-08-27 14:40:24,752 null map = 100%,  reduce = 100%
Ended Job = job_local1522296580_0001
Execution completed successfully
MapredLocal task succeeded
OK
19771
Time taken: 8.51 seconds, Fetched: 1 row(s)

//搜索用户点击的URL含baidu的数据有多少?

-- Count rows whose clicked URL contains the substring "baidu".
SELECT COUNT(*)
FROM SOGOUQ1
WHERE WEBSITE LIKE '%baidu%';
Hadoop job information for null: number of mappers: 0; number of reducers: 0
2015-08-27 14:43:11,930 null map = 0%,  reduce = 0%
2015-08-27 14:43:14,005 null map = 100%,  reduce = 100%
Ended Job = job_local1315812280_0001
Execution completed successfully
MapredLocal task succeeded
OK
129898
Time taken: 8.372 seconds, Fetched: 1 row(s)

//搜索结果排名第1,但是点击次序排在第2,URL含baidu的数据有多少?

-- Count rows ranked 1st in the results, clicked 2nd, whose URL contains "baidu".
SELECT COUNT(*)
FROM SOGOUQ1
WHERE S_SEQ = 1
  AND C_SEQ = 2
  AND WEBSITE LIKE '%baidu%';
Hadoop job information for null: number of mappers: 0; number of reducers: 0
2015-08-27 14:41:02,373 null map = 0%,  reduce = 0%
2015-08-27 14:41:04,442 null map = 100%,  reduce = 100%
Ended Job = job_local684556565_0001
Execution completed successfully
MapredLocal task succeeded
OK
2548
Time taken: 8.299 seconds, Fetched: 1 row(s)

//session查询次数排行榜

-- Top 10 sessions by number of logged queries (cw = rows per session).
SELECT
    WEBSESSION,
    COUNT(WEBSESSION) AS cw
FROM SOGOUQ1
GROUP BY WEBSESSION
ORDER BY cw DESC
LIMIT 10;
OK
b3c94c37fb154d46c30a360c7941ff7e    676
cc7063efc64510c20bcdd604e12a3b26    613
955c6390c02797b3558ba223b8201915    391
b1e371de5729cdda9270b7ad09484c4f    337
6056710d9eafa569ddc800fe24643051    277
637b29b47fed3853e117aa7009a4b621    266
c9f4ff7790d0615f6f66b410673e3124    231
dca9034de17f6c34cfd56db13ce39f1c    226
82e53ddb484e632437039048c5901608    221
c72ce1164bcd263ba1f69292abdfdf7c    214
Time taken: 18.425 seconds, Fetched: 10 row(s)

//创建内部表并加载数据,原数据会被移动到默认路径/user/hive/warehouse/

-- Managed (internal) table with the same six tab-separated columns as SOGOUQ1.
-- No LOCATION is given, so Hive keeps the data under its warehouse directory.
CREATE TABLE SOGOUQ2 (
    DT         STRING,
    WEBSESSION STRING,
    WORD       STRING,
    S_SEQ      INT,
    C_SEQ      INT,
    WEBSITE    STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    LINES TERMINATED BY '\n';

加载HDFS上的数据(如果要加载客户端本地文件,使用 LOAD DATA LOCAL INPATH);加载后,HDFS上的源文件会被移动到表所在的目录里

-- Load the file that is already on HDFS into the managed table. Without LOCAL,
-- LOAD DATA moves (does not copy) the source file, so afterwards it lives under
-- the table's warehouse directory: /user/hive/warehouse/sogouq2/SougouQ1.txt
-- NOTE(review): the move empties /user/SogouQ1, so the external table SOGOUQ1
-- defined over that directory will presumably return no rows afterwards -- confirm.
LOAD DATA INPATH '/user/SogouQ1/SougouQ1.txt' INTO TABLE SOGOUQ2;
-- Top 10 sessions by number of logged queries, read from the managed table.
SELECT
    WEBSESSION,
    COUNT(WEBSESSION) AS cw
FROM SOGOUQ2
GROUP BY WEBSESSION
ORDER BY cw DESC
LIMIT 10;

//查询有多少行数据

-- Total number of rows loaded into the managed table.
SELECT COUNT(*)
FROM SOGOUQ2;

查询订单交易的数据

-- Dedicated database for the sales-order demo; switch to it so the unqualified
-- table names used afterwards resolve inside SALEDATA.
CREATE DATABASE SALEDATA;

USE SALEDATA;

//qryTheDate.txt文件定义了日期的分类,将每天分别赋予所属的月份、星期、季度等属性
//日期,年月,年,月,日,周几,第几周,季度,旬、半月

-- Calendar table: one row per date, tagged with the periods it belongs to.
-- Columns, in file order (comma-separated): date, year-month, year, month,
-- day, day of week, week number, quarter, ten-day period, half-month.
CREATE TABLE tblDate (
    dateID       STRING,
    theyearmonth STRING,
    theyear      STRING,
    themonth     STRING,
    thedate      STRING,
    theweek      STRING,
    theweeks     STRING,
    thequot      STRING,
    thetenday    STRING,
    thehalfmonth STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LINES TERMINATED BY '\n';

//qrytblStock.txt文件定义了订单表头
//订单号,交易位置,交易日期

-- Order header table.
-- Columns, in file order (comma-separated): order number, trading location,
-- trading date (matched against tblDate.dateID by the queries in this file).
CREATE TABLE tblStock (
    ordernumber STRING,
    locationid  STRING,
    dateID      STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LINES TERMINATED BY '\n';

//qryStockDetail.txt文件定义了订单明细
//订单号,行号,货品,数量,单价,金额

-- Order detail (line item) table.
-- Columns, in file order (comma-separated): order number, line number, item,
-- quantity, price, amount.
-- NOTE(review): "price" is presumably the unit price -- confirm against the data file.
CREATE TABLE tblStockDetail (
    ordernumber STRING,
    rownum      INT,
    itemid      STRING,
    qty         INT,
    price       INT,
    amount      INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LINES TERMINATED BY '\n';

-- Populate the three sales tables from files on the client's local file system
-- (LOCAL). INTO TABLE without OVERWRITE appends, so re-running these statements
-- loads the same rows a second time.
LOAD DATA LOCAL INPATH '/home/mmicky/data/spark/saledata/qryTheDate.txt'
INTO TABLE tblDate;

LOAD DATA LOCAL INPATH '/home/mmicky/data/spark/saledata/qrytblStock.txt'
INTO TABLE tblStock;

LOAD DATA LOCAL INPATH '/home/mmicky/data/spark/saledata/qryStockDetail.txt'
INTO TABLE tblStockDetail;

//异常数据:关联日期表tblDate前后的销售总额不一致,说明有订单的日期不在tblDate中

-- Total amount over all detail lines that have a matching order header.
-- The implicit comma-join syntax used here is only accepted from Hive 0.13 on.
SELECT SUM(b.amount)
FROM tblStock a, tblStockDetail b
WHERE a.ordernumber = b.ordernumber;
等价于select sum(b.amount) from tblStock a join tblStockDetail b on a.ordernumber=b.orderNumber;
68100782

-- Total amount again, this time keeping only orders whose date exists in the
-- calendar table. A smaller total than the header/detail-only sum means some
-- orders carry a date that tblDate does not contain.
SELECT SUM(b.amount)
FROM tblStock a
JOIN tblStockDetail b
    ON a.ordernumber = b.ordernumber
JOIN tbldate c
    ON a.dateid = c.dateid;
68099079


-- Order headers whose date has no row in the calendar table.
-- NOT EXISTS is used instead of NOT IN (subquery): NOT IN yields no rows at all
-- as soon as the subquery returns a single NULL dateid, and it silently drops
-- orders whose own dateid is NULL, which are anomalies too.
SELECT
    a.ordernumber,
    a.locationid,
    a.dateid
FROM tblStock a
WHERE NOT EXISTS (
    SELECT c.dateid
    FROM tblDate c
    WHERE c.dateid = a.dateid
);
//select * from tblstock where tblstock.dateid not in (select dateid from tblDate);

//所有订单中每年的销售单数、销售总额

-- Per year: number of distinct orders and total sales amount.
-- COUNT(DISTINCT ...) is required because the join to the detail table repeats
-- each order number once per detail line.
SELECT
    c.theyear,
    COUNT(DISTINCT a.ordernumber),
    SUM(b.amount)
FROM tblStock a
JOIN tblStockDetail b
    ON a.ordernumber = b.ordernumber
JOIN tbldate c
    ON a.dateid = c.dateid
GROUP BY c.theyear
ORDER BY c.theyear;

2004    1094    3265696
2005    3828    13247234
2006    3772    13670416
2007    4885    16711974
2008    4861    14670698
2009    2619    6322137
2010    94  210924

//所有订单中季度销售额前10位

-- Ten best (year, quarter) pairs by total sales amount.
-- theyear/thequot are added as tie-breakers so that equal totals at the LIMIT
-- boundary always produce the same ten rows in the same order.
SELECT
    c.theyear,
    c.thequot,
    SUM(b.amount) AS sumofamount
FROM tblStock a
JOIN tblStockDetail b
    ON a.ordernumber = b.ordernumber
JOIN tbldate c
    ON a.dateid = c.dateid
GROUP BY c.theyear, c.thequot
ORDER BY sumofamount DESC, c.theyear, c.thequot
LIMIT 10;

2008    1   5252819
2007    4   4613093
2007    1   4446088
2006    1   3916638
2008    2   3886470
2007    3   3870558
2007    2   3782235
2006    4   3691314
2005    1   3592007
2005    3   3304243

//列出销售金额在100000以上的单据

-- Orders whose total amount exceeds 100000.
-- HAVING repeats the aggregate rather than the select alias, which not every
-- SQL engine resolves inside HAVING.
SELECT
    a.ordernumber,
    SUM(b.amount) AS sumofamount
FROM tblStock a
JOIN tblStockDetail b
    ON a.ordernumber = b.ordernumber
GROUP BY a.ordernumber
HAVING SUM(b.amount) > 100000;

HMJSL00009024   119058
HMJSL00009958   159126

//所有订单每年最大金额订单的销售额
第一步:

-- Step 1: total amount of every order, together with the order's date.
SELECT
    a.dateid,
    a.ordernumber,
    SUM(b.amount) AS sumofamount
FROM tblStock a
JOIN tblStockDetail b
    ON a.ordernumber = b.ordernumber
GROUP BY a.dateid, a.ordernumber;

第二步:

-- Step 2: for each year, the amount of the largest single order.
-- The derived table d holds the per-order totals (one row per date and order).
-- ORDER BY is used for the final ordering: SORT BY only orders rows within
-- each reducer, so with more than one reducer the years would not come back
-- globally sorted.
SELECT
    c.theyear,
    MAX(d.sumofamount)
FROM tbldate c
JOIN (
    SELECT
        a.dateid,
        a.ordernumber,
        SUM(b.amount) AS sumofamount
    FROM tblStock a
    JOIN tblStockDetail b
        ON a.ordernumber = b.ordernumber
    GROUP BY a.dateid, a.ordernumber
) d
    ON c.dateid = d.dateid
GROUP BY c.theyear
ORDER BY c.theyear;

2004    23612
2005    38180
2006    36124
2007    159126
2008    55828
2009    25810
2010    13063

//所有订单中每年最畅销货品
第一步:

-- Step 1: total sales amount of every item in every year.
SELECT
    c.theyear,
    b.itemid,
    SUM(b.amount) AS sumofamount
FROM tblStock a
JOIN tblStockDetail b
    ON a.ordernumber = b.ordernumber
JOIN tbldate c
    ON a.dateid = c.dateid
GROUP BY c.theyear, b.itemid;

第二步:

-- Step 2: for each year, the highest per-item sales total.
-- The derived table d is the per-year, per-item total.
SELECT
    d.theyear,
    MAX(d.sumofamount) AS maxofamount
FROM (
    SELECT
        c.theyear,
        b.itemid,
        SUM(b.amount) AS sumofamount
    FROM tblStock a
    JOIN tblStockDetail b
        ON a.ordernumber = b.ordernumber
    JOIN tbldate c
        ON a.dateid = c.dateid
    GROUP BY c.theyear, b.itemid
) d
GROUP BY d.theyear;

第三步:

-- Step 3: the best-selling item of each year and its sales total.
-- The per-year, per-item totals are computed once and ranked inside each year,
-- instead of building the same three-table aggregate twice and self-joining it
-- on the maximum. RANK() keeps every item tied for first place, as the
-- equality join did. Each (theyear, itemid) pair appears exactly once in the
-- totals, so DISTINCT is not needed.
-- Totals that are NULL are excluded before ranking; the equality join on the
-- maximum could never match a NULL total either.
WITH yearly_item_sales AS (
    SELECT
        c.theyear,
        b.itemid,
        SUM(b.amount) AS sumofamount
    FROM tblStock a
    JOIN tblStockDetail b
        ON a.ordernumber = b.ordernumber
    JOIN tbldate c
        ON a.dateid = c.dateid
    GROUP BY c.theyear, b.itemid
),
ranked_item_sales AS (
    SELECT
        theyear,
        itemid,
        sumofamount,
        RANK() OVER (PARTITION BY theyear ORDER BY sumofamount DESC) AS sales_rank
    FROM yearly_item_sales
    WHERE sumofamount IS NOT NULL
)
SELECT
    theyear,
    itemid,
    sumofamount AS maxofamount
FROM ranked_item_sales
WHERE sales_rank = 1
ORDER BY theyear;

2004    JY424420810101  53374
2005    24124118880102  56569
2006    JY425468460101  113684
2007    JY425468460101  70226
2008    E2628204040101  97981
2009    YL327439080102  30029
2010    SQ429425090101  4494

你可能感兴趣的:(hive)