hive查询语法-数据类型-函数-综合(twenty-four day)

1、查询语法

-- Two small comma-delimited text tables used by the join examples.
-- t_a holds (name, numb) rows.
create table t_a (
    name string,
    numb int
)
row format delimited
fields terminated by ',';

-- t_b holds (name, nick) rows. Both tables have a name column.
create table t_b (
    name string,
    nick string
)
row format delimited
fields terminated by ',';

-- Load the local sample files into the two tables.
load data local inpath '/root/hivetest/a.txt'
into table t_a;
load data local inpath '/root/hivetest/b.txt'
into table t_b;


-- Join variants
-- 1/ Inner join
-- Cartesian product: there is no join condition, so every t_a row is paired with every t_b row
select
    a.*,
    b.*
from t_a a
cross join t_b b;


-- Inner join with an explicit join condition on name
select
    a.*,
    b.*
from t_a a
inner join t_b b
    on a.name = b.name;

-- 2/ Left outer join (left join): keeps every t_a row, t_b columns are null when unmatched
select
    a.*,
    b.*
from t_a a
left join t_b b
    on a.name = b.name;


-- 3/ Right outer join (right join): keeps every t_b row, t_a columns are null when unmatched
select
    a.*,
    b.*
from t_a a
right join t_b b
    on a.name = b.name;

-- 4/ Full outer join: keeps unmatched rows from both sides
select
    a.*,
    b.*
from t_a a
full join t_b b
    on a.name = b.name;


-- 5/ Left semi join: returns only t_a columns, for t_a rows that have a match in t_b
select
    a.*
from t_a a
left semi join t_b b
    on a.name = b.name;


-- Grouped aggregation queries
-- NOTE(review): t_pv_log is not created anywhere in this file. The queries assume it has
-- ip, url and access_time columns.

-- Row-wise computation: upper(url) is evaluated once for each row of the table
select
    ip,
    upper(url),
    access_time
from t_pv_log;

-- Total number of visits per URL
-- count(*) is an aggregate, evaluated once for each url group
select
    url,
    count(*) as cnts
from t_pv_log
group by url;

-- Largest visitor ip per URL
-- NOTE(review): if ip is a string column, max() compares it as text, not numerically
select
    url,
    max(ip)
from t_pv_log
group by url;

-- Latest access time among all records of the same user (ip) visiting the same page (url)
select
    ip,
    url,
    max(access_time)
from t_pv_log
group by
    ip,
    url;


-- 分组聚合综合示例
-- 有如下数据
/*
192.168.33.3,http://www.edu360.cn/stu,2017-08-04 15:30:20
192.168.33.3,http://www.edu360.cn/teach,2017-08-04 15:35:20
192.168.33.4,http://www.edu360.cn/stu,2017-08-04 15:30:20
192.168.33.4,http://www.edu360.cn/job,2017-08-04 16:30:20
192.168.33.5,http://www.edu360.cn/job,2017-08-04 15:40:20


192.168.33.3,http://www.edu360.cn/stu,2017-08-05 15:30:20
192.168.44.3,http://www.edu360.cn/teach,2017-08-05 15:35:20
192.168.33.44,http://www.edu360.cn/stu,2017-08-05 15:30:20
192.168.33.46,http://www.edu360.cn/job,2017-08-05 16:30:20
192.168.33.55,http://www.edu360.cn/job,2017-08-05 15:40:20


192.168.133.3,http://www.edu360.cn/register,2017-08-06 15:30:20
192.168.111.3,http://www.edu360.cn/register,2017-08-06 15:35:20
192.168.34.44,http://www.edu360.cn/pay,2017-08-06 15:30:20
192.168.33.46,http://www.edu360.cn/excersize,2017-08-06 16:30:20
192.168.33.55,http://www.edu360.cn/job,2017-08-06 15:40:20
192.168.33.46,http://www.edu360.cn/excersize,2017-08-06 16:30:20
192.168.33.25,http://www.edu360.cn/job,2017-08-06 15:40:20
192.168.33.36,http://www.edu360.cn/excersize,2017-08-06 16:30:20
192.168.33.55,http://www.edu360.cn/job,2017-08-06 15:40:20

*/
-- Table that maps the sample access-log data, partitioned by day (dt)
create table t_access (
    ip string,
    url string,
    access_time string
)
partitioned by (dt string)
row format delimited
fields terminated by ',';


-- Load one local log file into each daily partition
load data local inpath '/root/hivetest/access.log.0804'
into table t_access partition (dt = '2017-08-04');
load data local inpath '/root/hivetest/access.log.0805'
into table t_access partition (dt = '2017-08-05');
load data local inpath '/root/hivetest/access.log.0806'
into table t_access partition (dt = '2017-08-06');

-- List the partitions of the table
show partitions t_access;

-- For each day after 2017-08-04: total visits to http://www.edu360.cn/job and the largest visitor ip.
-- The four statements below are alternative ways of writing the same query.

-- Variant 1: url is emitted as a constant literal, so only dt needs to be grouped
select
    dt,
    'http://www.edu360.cn/job',
    count(*),
    max(ip)
from t_access
where url = 'http://www.edu360.cn/job'
group by dt
having dt > '2017-08-04';


-- Variant 2: url is wrapped in max(), which is valid because the where clause leaves a single url value
select
    dt,
    max(url),
    count(*),
    max(ip)
from t_access
where url = 'http://www.edu360.cn/job'
group by dt
having dt > '2017-08-04';


-- Variant 3: url is added to the grouping key so it can be selected directly
select
    dt,
    url,
    count(*),
    max(ip)
from t_access
where url = 'http://www.edu360.cn/job'
group by
    dt,
    url
having dt > '2017-08-04';

-- Variant 4: the dt condition is a row filter, so it is applied in where before grouping
select
    dt,
    url,
    count(*),
    max(ip)
from t_access
where dt > '2017-08-04'
  and url = 'http://www.edu360.cn/job'
group by
    dt,
    url;


-- For each day after 2017-08-04 and each page: total visits and the largest visitor ip

select
    dt,
    url,
    count(*),
    max(ip)
from t_access
where dt > '2017-08-04'
group by
    dt,
    url;

-- Same per-day per-page statistics after 2017-08-04, keeping only groups with more than 2 visits
-- Method 1: filter the aggregated groups with having
select
    dt,
    url,
    count(*) as cnts,
    max(ip)
from t_access
where dt > '2017-08-04'
group by
    dt,
    url
having cnts > 2;


-- Method 2: aggregate in a subquery, then filter its result with an ordinary where
select
    dt,
    url,
    cnts,
    max_ip
from (
    select
        dt,
        url,
        count(*) as cnts,
        max(ip) as max_ip
    from t_access
    where dt > '2017-08-04'
    group by
        dt,
        url
) tmp
where cnts > 2;
 

2、数据类型

2.1、数字类型

TINYINT (1-byte signed integer, from -128 to 127)

SMALLINT (2-byte signed integer, from -32,768 to 32,767)

 

INT/INTEGER (4-byte signed integer, from -2,147,483,648 to 2,147,483,647)

 

BIGINT (8-byte signed integer, from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807)

FLOAT (4-byte single precision floating point number)

DOUBLE (8-byte double precision floating point number)

 

示例:

-- Example table with a string column plus one column for each numeric type
create table t_test (
    a string,
    b int,
    c bigint,
    d float,
    e double,
    f tinyint,
    g smallint
);

 

2.2、日期时间类型

TIMESTAMP (Note: Only available starting with Hive 0.8.0)

DATE (Note: Only available starting with Hive 0.12.0)

 

示例,假如有以下数据文件:

1,zhangsan,1985-06-30

2,lisi,1986-07-10

3,wangwu,1985-08-09

那么,就可以建一个表来对数据进行映射

-- Customer table whose birthday column uses the date type
create table t_customer (
    id int,
    name string,
    birthday date
)
row format delimited
fields terminated by ',';

然后导入数据

-- Load the local customer file into t_customer
load data local inpath '/root/customer.dat'
into table t_customer;

然后,就可以正确查询

 

2.3、字符串类型

STRING

VARCHAR (Note: Only available starting with Hive 0.12.0)

CHAR (Note: Only available starting with Hive 0.13.0)

 

2.4、混杂类型

BOOLEAN

BINARY (Note: Only available starting with Hive 0.8.0)

 

2.5、复合类型

array数组类型

arrays: ARRAY<data_type> (Note: negative values and non-constant expressions are allowed as of Hive 0.14.)

 

示例:array类型的应用

假如有如下数据需要用hive的表去映射:

战狼2,吴京:吴刚:龙母,2017-08-16

三生三世十里桃花,刘亦菲:痒痒,2017-08-20

设想:如果主演信息用一个数组来映射比较方便

 

建表:

-- Movie table: the actors field is a ':'-separated list mapped to an array of strings.
-- An array column must declare its element type, so a bare "array" is not valid DDL.
-- The column name moive_name is kept as-is because the queries on this table use it.
create table t_movie (
    moive_name string,
    actors array<string>,
    first_show date
)
row format delimited
fields terminated by ','
collection items terminated by ':';

 

导入数据:

-- Load the local movie file into t_movie
load data local inpath '/root/movie.dat'
into table t_movie;

 

查询:

-- All rows and columns
select *
from t_movie;

-- First element of the actors array for each movie
select
    moive_name,
    actors[0]
from t_movie;

-- Movies whose actors array contains the given name
select
    moive_name,
    actors
from t_movie
where array_contains(actors, '吴刚');

-- Number of elements in the actors array for each movie
select
    moive_name,
    size(actors)
from t_movie;

 

2.6、map类型

maps: MAP<primitive_type, data_type> (Note: negative values and non-constant expressions are allowed as of Hive 0.14.)

 

  1. 假如有以下数据:

1,zhangsan,father:xiaoming#mother:xiaohuang#brother:xiaoxu,28

2,lisi,father:mayun#mother:huangyi#brother:guanyu,22

3,wangwu,father:wangjianlin#mother:ruhua#sister:jingtian,29

4,mayun,father:mayongzhen#mother:angelababy,26

 

可以用一个map类型来对上述数据中的家庭成员进行描述

 

  2. 建表语句:

-- Person table: family_members is a '#'-separated list of key:value pairs mapped to a map.
-- A map column must declare its key and value types, so a bare "map" is not valid DDL.
-- Both are strings here (relation name -> person name).
create table t_person (
    id int,
    name string,
    family_members map<string,string>,
    age int
)
row format delimited
fields terminated by ','
collection items terminated by '#'
map keys terminated by ':';

 

  3. 查询

-- All rows and columns
select *
from t_person;

-- Value stored under one specific key of the map column
select
    id,
    name,
    family_members['father'] as father
from t_person;

-- All keys of the map column, returned as an array
select
    id,
    name,
    map_keys(family_members) as relation
from t_person;

-- All values of the map column, returned as an array
select
    id,
    name,
    map_values(family_members)
from t_person;

-- First element of the values array
-- NOTE(review): which entry comes first depends on the map's internal ordering - confirm before relying on it
select
    id,
    name,
    map_values(family_members)[0]
from t_person;

 

-- Combined example: users that have a 'brother' entry in family_members.
-- The subquery extracts the value stored under 'brother', and the outer query keeps
-- only the rows where that value is not null.
-- The extracted column is aliased brother, since it holds the brother's name, not the father's.
select
    id,
    name,
    brother
from (
    select
        id,
        name,
        family_members['brother'] as brother
    from t_person
) tmp
where brother is not null;

 

 

 

2.7、struct类型

structs: STRUCT<col_name : data_type [COMMENT col_comment], ...>

 

  1. 假如有如下数据:

1,zhangsan,18:male:beijing

2,lisi,28:female:shanghai

 

其中的用户信息包含:年龄:整数,性别:字符串,地址:字符串

设想用一个字段来描述整个用户信息,可以采用struct

 

  2. 建表:

-- Person table: the third field is a ':'-separated triple mapped to a struct.
-- A struct column must declare its fields, so a bare "struct" is not valid DDL.
-- The fields follow the order in the data: age (int), sex (string), addr (string).
create table t_person_struct (
    id int,
    name string,
    info struct<age:int,sex:string,addr:string>
)
row format delimited
fields terminated by ','
collection items terminated by ':';

 

  3. 查询

-- All rows and columns
select *
from t_person_struct;

-- A single struct field is read with dot notation
select
    id,
    name,
    info.age
from t_person_struct;

 

3、函数

 

hive的所有函数手册:

https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF#LanguageManualUDF-Built-inTable-GeneratingFunctions(UDTF)

你可能感兴趣的:(向大数据进军~每天记)