按数据流入流出的过程,数据仓库可以分为三层——源数据层(ODS)、数据仓库层(DW)、数据应用层(APP)
# Run a SQL command directly from the shell
hive -e 'SQL命令'
# Run a SQL script file
hive -f '脚本位置'
# Enter the interactive Hive CLI
hive
# Start HiveServer2 in the background (note: "&" backgrounds the process; "$" was a typo)
nohup hive --service hiveserver2 &
# Connect with beeline in one step
beeline -u jdbc:hive2://localhost:10000
# Or start beeline first, then connect to HiveServer2
beeline
!connect jdbc:hive2://hadoop100:10000
-- External table demonstrating Hive complex types (array / struct / map).
-- Note: HiveQL comments use "--"; "//" is a syntax error.
create external table employee(
name string,
address array<string>,
genderAndAge struct<gender:string,age:int>,
jobAndSalary map<string,int>,
depAndLvl map<string,string>)
row format delimited
fields terminated by '|'
collection items terminated by ','  -- separator between array/struct elements
map keys terminated by ':'          -- separator between each map key and its value
lines terminated by '\n'
stored as textfile
location '/test/data/emp';
-- Database- and table-level basics (each statement terminated with ";")
show databases;
describe database 库名;
create database 库名;
use 库名;
select current_database();
drop database 库名;
create table 表名(column 属性,...);
drop table 表名;      -- "drop tables" is not valid HiveQL
truncate table 表名;  -- TRUNCATE requires the TABLE keyword
show tables;
-- Managed (internal) table: DROP deletes both metadata and data
create table test(id int,name string)
row format delimited
fields terminated by '\t';
-- External table: DROP removes only the metadata, not the files
create external table test(id int,name string)
row format delimited
fields terminated by '\t';
-- Partitioned external table: one HDFS subdirectory per `month` value
create external table test(id int,name string)
partitioned by (month string)
row format delimited
fields terminated by '\t';
-- Bucketed external table: rows hashed on `id` into 3 bucket files
create external table test(id int,name string)
clustered by(id) into 3 buckets
row format delimited
fields terminated by '\t';
-- Temporary table: visible only in the current session, dropped on exit.
-- Original had an empty column list and a ";" before the row-format clauses,
-- both of which Hive rejects.
create temporary table test(id int,name string)
row format delimited
fields terminated by '\t';
-- 1. LIKE: copy only the schema of oldTable into test (no data)
create table test like oldTable;
-- 2. CTAS: create test and copy test1's rows into it
create table test as select * from test1;
-- 3. CTE (CTAS with a Common Table Expression)
CREATE TABLE cte_employee AS
WITH
r1 AS (SELECT name FROM r2 WHERE name = 'Michael'),
r2 AS (SELECT name FROM employee WHERE sex_age.sex= 'Male'),
r3 AS (SELECT name FROM employee WHERE sex_age.sex= 'Female')
SELECT * FROM r1 UNION ALL SELECT * FROM r3;
-- External table with complex types: '|' delimits fields, ',' delimits
-- collection items, ':' delimits map keys from values.
-- Fixes vs. original: "tabkle" -> "table", "treminated" -> "terminated",
-- duplicated ROW FORMAT DELIMITED line removed, stray comma removed.
create external table if not exists employee_external(
name string,
work_place ARRAY<string>,
sex_age STRUCT<sex:string,age:int>,
skills_score MAP<string,int>,
depart_title MAP<STRING,ARRAY<STRING>>)
comment 'This is an external table'
row format delimited
fields terminated by '|'
collection items terminated by ','
map keys terminated by ':'
stored as textfile;
-- Alternative ROW FORMAT clause: parse CSV files with OpenCSVSerde.
-- NOTE(review): this is a fragment — it belongs inside a CREATE TABLE,
-- replacing the ROW FORMAT DELIMITED block (not combined with it).
row format serde
"org.apache.hadoop.hive.serde2.OpenCSVSerde"
with serdeproperties(
"separatorChar"=",",
"quoteChar"="\"",
"escapeChar"="\\")
-- Multi-level partitioned table: one directory level per partition column.
-- NOTE(review): `date` is a Hive type keyword — backtick it as a column name
-- in practice; the "..." marks this as a template, not runnable DDL.
create external table test(id int,name string)
partitioned by (month string,date string,...)
row format delimited
fields terminated by '\t'
-- Add several partitions in one ALTER statement
alter table tab add partition(month='202103') partition(month='202104') ...;
-- Drop a partition
alter table tab drop partition(month='202103');
show partitions tab;
-- Enable dynamic partitioning. The mode value is "nonstrict" (allows every
-- partition column to be dynamic); the original "nostrict" is a typo that
-- Hive rejects.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
-- Dynamic-partition insert: the LAST selected column feeds partition(month)
insert into table tab partition(month)
select id,name,
id as month
from tab1;
-- Enforce bucketing on insert and size the reducer count to the bucket count
set hive.enforce.bucketing = true;
set mapreduce.job.reduces=num;
-- Table clause: hash `column` into num bucket files
clustered by (column) into num buckets;
-- Sampling bucket 3 of 32: on rand() samples random rows; on id samples by
-- the hash of id (stable across runs)
select * from tab tablesample(bucket 3 out of 32 on rand()) s;
select * from tab tablesample(bucket 3 out of 32 on id) s;
-- Create, inspect, query and drop a view
create view view_test as select * from tab;
show create table view_test;
select * from view_test;
drop view view_test;  -- fixed typo: original dropped nonexistent "ciew_test"
-- explode() turns an array/map into rows, e.g. string goods_id='1,2,3,4,5':
-- select explode(split(goods_id,',')) as goods_id from explode_lateral_view;
select name,wps,skill,score from employee
lateral view explode(work_place) work_place_single as wps
lateral view explode(skills_score) sks as skill,score;
-- explode(array) with one alias yields one row per element (rows-to-columns
-- unpivot); exploding a map with two aliases yields key and value columns
-- Load from the local filesystem: the source file is COPIED (original kept)
load data local inpath '本地文件地址' into table tab1;
-- Load from HDFS: the source file is MOVED into the table directory
load data inpath 'HDFS文件地址' into table tab2;
-- INSERT supports OVERWRITE (replace) and INTO (append)
insert into tab1 select * from tab2;
-- Multi-insert: scan ctas_employee once, write to several target tables
from ctas_employee
insert into table employee select *
insert overwrite table employee_internal select *;
-- Write query results out to local and HDFS directories
from tab
insert overwrite local directory '本地文件夹' select *
insert overwrite directory 'hdfs文件夹' select *;
-- Export/import a whole table (data plus metadata) via an HDFS path
export table employee to '/tmp/output3';
import table employee from '/tmp/output3';
function(arg1, ..., argn) over([PARTITION BY <...>] [ORDER BY <...>] [<window_clause>])
row_number():对所有的数值输出不同的序号,序号唯一连续
rank():对相同的值,输出相同的序号,下一个序号跳过(1,1,3)
dense_rank():对相同的值,输出相同的序号,下一个序号连续(1,1,2)
ntile(n):将有序的数据集合平均分配到n个桶中,将桶号分给每一行
percent_rank():(当前排名-1)/(总行数-1),值相当于一组值的百分比排名
SELECT
name, dept_num, salary,
-- salary 2 rows ahead within the department (NULL when no such row)
LEAD(salary, 2) OVER(PARTITION BY dept_num ORDER BY salary) AS lead,
-- salary 2 rows behind; the third argument 0 is the default when absent
LAG(salary, 2, 0) OVER(PARTITION BY dept_num ORDER BY salary) AS lag,
FIRST_VALUE(salary) OVER (PARTITION BY dept_num ORDER BY salary) AS first_value,
-- with ORDER BY, the default frame ends at the CURRENT ROW, so this returns
-- the current row's salary, not the partition maximum
LAST_VALUE(salary) OVER (PARTITION BY dept_num ORDER BY salary) AS last_value_default,
-- the explicit unbounded frame makes LAST_VALUE the true partition maximum
LAST_VALUE(salary) OVER (PARTITION BY dept_num ORDER BY salary RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS last_value
FROM employee_contract
ORDER BY dept_num, salary;