国家城市
出租车出车记录
# 创建测试库
-- Create the database used by the Trino tests; a no-op when it already exists.
CREATE DATABASE IF NOT EXISTS test_trino
COMMENT 'Trino测试的库'
WITH DBPROPERTIES (
    'createUser' = '顾栋',
    'date' = '20230529'
);
## 创建json临时表
-- Staging table with a single string column that receives the raw JSON text.
-- It is created inside test_trino because the LOAD DATA and INSERT statements
-- in this file address it as test_trino.tmpjson; an unqualified name would put
-- it in whatever database the session happens to be using.
CREATE TABLE test_trino.tmpjson (line string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\n";
## 利用hive客户端加载本地json数据文件
-- Load the local JSON file into the staging table, replacing its current contents.
LOAD DATA LOCAL INPATH '/opt/documents.json'
OVERWRITE INTO TABLE test_trino.tmpjson;
## 创建国家主要城市表
-- City table stored as ORC. Column comments are left empty here.
-- STORED AS ORC is Hive's shorthand for the ORC serde plus ORC input/output formats.
CREATE TABLE `test_trino.all_countries` (
    `geonameid`    bigint COMMENT '',
    `name`         string COMMENT '',
    `latitude`     double COMMENT '',
    `longitude`    double COMMENT '',
    `country_code` string COMMENT '',
    `population`   bigint COMMENT ''
)
COMMENT '城市表'
STORED AS ORC;
## 数据复写到内部表中
-- Parse every staged JSON line with json_tuple and overwrite the ORC table
-- with the six extracted fields, in the target table's column order.
INSERT OVERWRITE TABLE `test_trino.all_countries`
SELECT
    parsed.geonameid,
    parsed.name,
    parsed.latitude,
    parsed.longitude,
    parsed.country_code,
    parsed.population
FROM test_trino.tmpjson src
LATERAL VIEW json_tuple(
    src.line,
    'geonameid', 'name', 'latitude', 'longitude', 'country_code', 'population'
) parsed AS geonameid, name, latitude, longitude, country_code, population;
# Trino COMMENT命令
-- Set the table comment through Trino (catalog.schema.table addressing).
COMMENT ON TABLE hive.test_trino.all_countries IS '国家城市表';
-- Set one comment per column: geoname id, name, latitude, longitude,
-- country code and population.
COMMENT ON COLUMN hive.test_trino.all_countries.geonameid IS '地名ID';
COMMENT ON COLUMN hive.test_trino.all_countries.name IS '地名';
COMMENT ON COLUMN hive.test_trino.all_countries.latitude IS '纬度';
COMMENT ON COLUMN hive.test_trino.all_countries.longitude IS '经度';
COMMENT ON COLUMN hive.test_trino.all_countries.country_code IS '国家编码';
COMMENT ON COLUMN hive.test_trino.all_countries.population IS '城市人口数';
-- Show the columns of the table to check the result.
DESCRIBE hive.test_trino.all_countries;
# 检查文件的元数据情况和抽样数据
# Shell commands (run outside the SQL client): `head -n 2` samples records from
# the March file, `meta` prints the metadata of the February file.
java -jar parquet-tools-1.10.0.jar head -n 2 yellow_tripdata_2023-03.parquet
java -jar parquet-tools-1.10.0.jar meta yellow_tripdata_2023-02.parquet
# 创建以parquet存储的表
-- Parquet-backed staging table for the yellow taxi trip files, partitioned by
-- year-month. Two column comments were corrected: DOLocationID is the drop-off
-- zone and airport_fee is an airport charge.
-- NOTE(review): VendorID is int here but bigint in test_trino.yellow_taxi_trip_records,
-- presumably to match the physical type in the Parquet files; confirm.
CREATE TABLE `test_trino.yellow_taxi_trip_records_tmp` (
    `VendorID`              int       COMMENT '仪表供应商ID',
    `tpep_pickup_datetime`  TIMESTAMP COMMENT '仪表启动时间',
    `tpep_dropoff_datetime` TIMESTAMP COMMENT '仪表关闭时间',
    `passenger_count`       bigint    COMMENT '乘客数量',
    `trip_distance`         double    COMMENT '行程距离',
    `RateCodeID`            bigint    COMMENT '费率编码',
    `store_and_fwd_flag`    string    COMMENT '是否存储',
    `PULocationID`          bigint    COMMENT '上车区域坐标',
    `DOLocationID`          bigint    COMMENT '下车区域坐标',
    `payment_type`          bigint    COMMENT '付款方式',
    `fare_amount`           double    COMMENT '票价',
    `extra`                 double    COMMENT '杂费附加费',
    `mta_tax`               double    COMMENT '税费',
    `tip_amount`            double    COMMENT '小费',
    `tolls_amount`          double    COMMENT '过路费',
    `improvement_surcharge` double    COMMENT '改善附加费',
    `total_amount`          double    COMMENT '费用总计,不包含现金小费',
    `congestion_surcharge`  double    COMMENT '拥堵费',
    `airport_fee`           double    COMMENT '机场上下车费用'
)
COMMENT '黄色的出租车记录'
PARTITIONED BY (
    `ym` string COMMENT '分区字段,年月(yyyyMM)'
)
STORED AS PARQUET;
-- Partition maintenance on the Parquet staging table: register the three
-- monthly partitions (skipped when present) ...
ALTER TABLE `test_trino.yellow_taxi_trip_records_tmp`
    ADD IF NOT EXISTS PARTITION (ym = '202301');
ALTER TABLE `test_trino.yellow_taxi_trip_records_tmp`
    ADD IF NOT EXISTS PARTITION (ym = '202302');
ALTER TABLE `test_trino.yellow_taxi_trip_records_tmp`
    ADD IF NOT EXISTS PARTITION (ym = '202303');
-- ... and remove them again (skipped when absent).
ALTER TABLE `test_trino.yellow_taxi_trip_records_tmp`
    DROP IF EXISTS PARTITION (ym = '202301');
ALTER TABLE `test_trino.yellow_taxi_trip_records_tmp`
    DROP IF EXISTS PARTITION (ym = '202302');
ALTER TABLE `test_trino.yellow_taxi_trip_records_tmp`
    DROP IF EXISTS PARTITION (ym = '202303');
# 利用hive客户端load parquet数据
-- Load the local Parquet files into their monthly partitions, replacing any
-- existing partition contents. The partition column ym is a string, so its
-- value is written as a quoted literal, as in the ALTER TABLE statements.
LOAD DATA LOCAL INPATH '/opt/yellow_tripdata_2023-02.parquet'
OVERWRITE INTO TABLE `test_trino.yellow_taxi_trip_records_tmp`
PARTITION (ym = '202302');
LOAD DATA LOCAL INPATH '/opt/yellow_tripdata_2023-03.parquet'
OVERWRITE INTO TABLE `test_trino.yellow_taxi_trip_records_tmp`
PARTITION (ym = '202303');
#创建ORC内部表
-- ORC managed table for the yellow taxi trips, partitioned by year-month.
-- It is created before the partition statements below, which would otherwise
-- fail against a table that does not exist yet. Two column comments were
-- corrected: DOLocationID is the drop-off zone and airport_fee is an airport charge.
CREATE TABLE `test_trino.yellow_taxi_trip_records` (
    `VendorID`              bigint    COMMENT '仪表供应商ID',
    `tpep_pickup_datetime`  TIMESTAMP COMMENT '仪表启动时间',
    `tpep_dropoff_datetime` TIMESTAMP COMMENT '仪表关闭时间',
    `passenger_count`       bigint    COMMENT '乘客数量',
    `trip_distance`         double    COMMENT '行程距离',
    `RateCodeID`            bigint    COMMENT '费率编码',
    `store_and_fwd_flag`    string    COMMENT '是否存储',
    `PULocationID`          bigint    COMMENT '上车区域坐标',
    `DOLocationID`          bigint    COMMENT '下车区域坐标',
    `payment_type`          bigint    COMMENT '付款方式',
    `fare_amount`           double    COMMENT '票价',
    `extra`                 double    COMMENT '杂费附加费',
    `mta_tax`               double    COMMENT '税费',
    `tip_amount`            double    COMMENT '小费',
    `tolls_amount`          double    COMMENT '过路费',
    `improvement_surcharge` double    COMMENT '改善附加费',
    `total_amount`          double    COMMENT '费用总计,不包含现金小费',
    `congestion_surcharge`  double    COMMENT '拥堵费',
    `airport_fee`           double    COMMENT '机场上下车费用'
)
COMMENT '黄色的出租车记录'
PARTITIONED BY (
    `ym` string COMMENT '分区字段,年月(yyyyMM)'
)
-- STORED AS ORC is Hive's shorthand for the ORC serde plus ORC input/output formats.
STORED AS ORC;
-- Partition maintenance on the ORC table: register the monthly partitions
-- (skipped when present) ...
ALTER TABLE `test_trino.yellow_taxi_trip_records` ADD IF NOT EXISTS PARTITION (ym='202301');
ALTER TABLE `test_trino.yellow_taxi_trip_records` ADD IF NOT EXISTS PARTITION (ym='202302');
ALTER TABLE `test_trino.yellow_taxi_trip_records` ADD IF NOT EXISTS PARTITION (ym='202303');
-- ... and remove them again (skipped when absent).
ALTER TABLE `test_trino.yellow_taxi_trip_records` DROP IF EXISTS PARTITION (ym='202301');
ALTER TABLE `test_trino.yellow_taxi_trip_records` DROP IF EXISTS PARTITION (ym='202302');
ALTER TABLE `test_trino.yellow_taxi_trip_records` DROP IF EXISTS PARTITION (ym='202303');
# 覆写数据(hive写法)
-- Copy one month from the Parquet staging table into the ORC table.
-- With a static partition spec the SELECT must produce only the 19 data
-- columns; `SELECT *` would also return the partition column ym and make the
-- column count differ from the target. The ym literals are quoted because the
-- partition column is a string.
INSERT OVERWRITE TABLE `test_trino.yellow_taxi_trip_records` PARTITION (ym = '202302')
SELECT
    `VendorID`,
    `tpep_pickup_datetime`,
    `tpep_dropoff_datetime`,
    `passenger_count`,
    `trip_distance`,
    `RateCodeID`,
    `store_and_fwd_flag`,
    `PULocationID`,
    `DOLocationID`,
    `payment_type`,
    `fare_amount`,
    `extra`,
    `mta_tax`,
    `tip_amount`,
    `tolls_amount`,
    `improvement_surcharge`,
    `total_amount`,
    `congestion_surcharge`,
    `airport_fee`
FROM test_trino.yellow_taxi_trip_records_tmp
WHERE ym = '202302';
#
-- Show extended information for one partition of the staging table.
-- The database is named explicitly because no USE statement is issued in this
-- file, so an unqualified lookup would search the session's current database.
SHOW TABLE EXTENDED IN test_trino LIKE 'yellow_taxi_trip_records_tmp' PARTITION (ym = '202302');
-- 报错信息:can not read class org.apache.parquet.format.FileMetaData: Required field 'codec' was not present!
-- 压缩编解码器(codec)列表:org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.SnappyCodec,com.hadoop.compression.lzo.LzoCodec,com.hadoop.compression.lzo.LzopCodec
-- Grouped query: largest population per country code, read through a derived table.
SELECT
    per_country.country_code,
    per_country.maxpopulation
FROM (
    SELECT
        country_code,
        MAX(population) AS maxpopulation
    FROM hive.test_trino.all_countries
    WHERE country_code IS NOT NULL
    GROUP BY country_code
) AS per_country;
-- The same grouped query written with a WITH clause.
WITH per_country AS (
    SELECT
        country_code,
        MAX(population) AS maxpopulation
    FROM hive.test_trino.all_countries
    WHERE country_code IS NOT NULL
    GROUP BY country_code
)
SELECT
    country_code,
    maxpopulation
FROM per_country;
-- WITH clause defining two named subqueries (max and min population per
-- country code) that are then joined on the country code.
WITH max_pop AS (
    SELECT
        country_code,
        MAX(population) AS maxpopulation
    FROM hive.test_trino.all_countries
    WHERE country_code IS NOT NULL
    GROUP BY country_code
),
min_pop AS (
    SELECT
        country_code,
        MIN(population) AS minpopulation
    FROM hive.test_trino.all_countries
    WHERE country_code IS NOT NULL
    GROUP BY country_code
)
SELECT
    max_pop.*,
    min_pop.*
FROM max_pop
INNER JOIN min_pop
    ON max_pop.country_code = min_pop.country_code;
-- Chained WITH: each named subquery reads from the one defined before it.
WITH per_country AS (
    SELECT
        country_code,
        MAX(population) AS maxpopulation
    FROM hive.test_trino.all_countries
    WHERE country_code IS NOT NULL
    GROUP BY country_code
),
renamed_b AS (
    SELECT country_code AS b
    FROM per_country
),
renamed_c AS (
    SELECT b AS c
    FROM renamed_b
)
SELECT c
FROM renamed_c;
-- Recursive WITH (experimental feature): starts at 1, adds n + 1 while n < 4,
-- then sums the generated values.
WITH RECURSIVE counter (n) AS (
    VALUES (1)
    UNION ALL
    SELECT n + 1
    FROM counter
    WHERE n < 4
)
SELECT SUM(n)
FROM counter;
-- Plain de-duplication with DISTINCT.
-- The statement is now terminated; without the semicolon it ran into the
-- GROUPING SETS query below and neither could be parsed.
SELECT DISTINCT country_code
FROM hive.test_trino.all_countries
WHERE country_code IS NOT NULL
ORDER BY country_code;
-- Complex grouping with GROUPING SETS: counts per country code and per
-- (country code, population) pair in a single query.
SELECT
    country_code,
    population,
    COUNT(*) AS geocount
FROM hive.test_trino.all_countries
GROUP BY
    GROUPING SETS ((country_code), (country_code, population));
-- Equivalent formulation: one UNION ALL branch per grouping set.
-- The placeholder column is named population and typed bigint so the result
-- has the same column names and types as the GROUPING SETS query.
SELECT *
FROM (
    SELECT
        country_code,
        CAST(NULL AS bigint) AS population,
        COUNT(*) AS geocount
    FROM hive.test_trino.all_countries
    GROUP BY country_code
    UNION ALL
    SELECT
        country_code,
        population,
        COUNT(*) AS geocount
    FROM hive.test_trino.all_countries
    GROUP BY
        country_code,
        population
)
ORDER BY country_code;
-- Complex grouping with CUBE over country code and population.
SELECT
    country_code,
    population,
    COUNT(*) AS geocount
FROM hive.test_trino.all_countries
GROUP BY CUBE (country_code, population)
ORDER BY
    country_code,
    population;
-- Equivalent formulation with the grouping sets spelled out: both columns,
-- each column alone, and the grand total.
SELECT
    country_code,
    population,
    COUNT(*) AS geocount
FROM hive.test_trino.all_countries
GROUP BY GROUPING SETS (
    (country_code, population),
    (country_code),
    (population),
    ()
)
ORDER BY
    country_code,
    population;
-- Complex grouping with ROLLUP over country code and population.
SELECT
    country_code,
    population,
    COUNT(*) AS geocount
FROM hive.test_trino.all_countries
GROUP BY ROLLUP (country_code, population)
ORDER BY
    country_code,
    population;
-- Complex grouping with the DISTINCT set quantifier applied to the grouping sets.
SELECT
    country_code,
    population,
    COUNT(*) AS geocount
FROM hive.test_trino.all_countries
GROUP BY DISTINCT ROLLUP (country_code, population)
ORDER BY
    country_code,
    population;
-- Grouping identifier: grouping() is evaluated for the listed columns on
-- every row produced by the three grouping sets.
SELECT
    country_code,
    population,
    COUNT(*) AS geocount,
    grouping(country_code, population)
FROM hive.test_trino.all_countries
GROUP BY GROUPING SETS (
    (country_code),
    (country_code, population),
    (population)
);
-- HAVING filters groups after grouping and aggregation have taken place.
-- The statement is now terminated; without the semicolon it ran into the
-- next SELECT.
SELECT
    country_code,
    population,
    COUNT(*) AS geocount,
    grouping(country_code, population)
FROM hive.test_trino.all_countries
GROUP BY
    country_code,
    population
HAVING COUNT(*) > 100000
ORDER BY
    country_code,
    geocount;
-- Set union; DISTINCT is the default quantifier, written out explicitly.
SELECT 13
UNION DISTINCT
SELECT 42;
-- Set intersection; DISTINCT is the default quantifier, written out explicitly.
SELECT *
FROM (VALUES 13, 42)
INTERSECT DISTINCT
SELECT 13;
-- Set difference; DISTINCT is the default quantifier, written out explicitly.
SELECT *
FROM (VALUES 13, 42)
EXCEPT DISTINCT
SELECT 13;
-- NULL ordering: sort by the maximum population descending, NULLs first.
-- The statement is now terminated; without the semicolon it ran into the
-- next SELECT.
SELECT
    country_code,
    MAX(population) AS maxpopulation
FROM hive.test_trino.all_countries
GROUP BY country_code
ORDER BY maxpopulation DESC NULLS FIRST;
-- OFFSET: skip the first 22 rows of the result ordered by population descending.
SELECT *
FROM hive.test_trino.all_countries
ORDER BY population DESC
OFFSET 22 ROWS;
-- Limiting the result set: return at most five rows.
-- NOTE(review): `orders` is unqualified and is not created by any statement
-- visible here; presumably it lives in the session's current catalog/schema. Confirm.
SELECT orderdate
FROM orders
LIMIT 5;
-- FETCH FIRST ... WITH TIES: return the row with the largest population plus
-- any rows that tie with it on the ORDER BY key. The original `WITH Earth` is
-- not valid syntax; the clause accepts only ONLY or WITH TIES.
SELECT
    name,
    population
FROM hive.test_trino.all_countries
ORDER BY population DESC
FETCH FIRST ROW WITH TIES;
-- Sampling with the BERNOULLI method at 50 percent.
SELECT *
FROM hive.test_trino.all_countries
    TABLESAMPLE BERNOULLI (50);
-- Sampling with the SYSTEM method at 75 percent.
SELECT *
FROM hive.test_trino.all_countries
    TABLESAMPLE SYSTEM (75);
-- UNNEST over a map built from (language, year) entries; the result is
-- exposed through the two named columns language and first_appeared_year.
SELECT *
FROM UNNEST(
    map_from_entries(ARRAY[('SQL', 1974), ('Java', 1995)])
) AS langs (language, first_appeared_year);
-- UNNEST over two arrays of different length, with WITH ORDINALITY adding a
-- third column that is named rownumber here.
SELECT
    a,
    b,
    rownumber
FROM UNNEST(ARRAY[2, 5], ARRAY[7, 8, 9]) WITH ORDINALITY AS pairs (a, b, rownumber);
-- CROSS JOIN (Cartesian product) of five ids with three names, ordered by
-- the two selected columns (named instead of referenced by position).
SELECT
    n.id,
    r.name
FROM (VALUES 1, 2, 3, 4, 5) AS n (id)
CROSS JOIN (VALUES 'a', 'b', 'c') AS r (name)
ORDER BY
    n.id,
    r.name;
-- LATERAL subqueries may reference columns from relations to their left:
-- x is built from country_code, and y is built from x.
SELECT
    country_code,
    x,
    y
FROM hive.test_trino.all_countries
CROSS JOIN LATERAL (SELECT country_code || ' :-' AS x)
CROSS JOIN LATERAL (SELECT x || ')' AS y);
-- EXISTS: keep the ids from the first list that also occur in the second list.
SELECT id
FROM (VALUES '1', '2', '3', '4', '5', 'a', 'b', 'c') AS n (id)
WHERE EXISTS (
    SELECT 1
    FROM (VALUES 'a', 'b', 'c', 'd') AS t (id)
    WHERE t.id = n.id
);
-- IN with a subquery: keep the ids that appear in the subquery result, which
-- is restricted to 'a' and 'd'.
SELECT n.id
FROM (VALUES '1', '2', '3', '4', '5', 'a', 'b', 'c') AS n (id)
WHERE id IN (
    SELECT t.id
    FROM (VALUES 'a', 'b', 'c', 'd') AS t (id)
    WHERE t.id IN ('a', 'd')
);
-- Scalar subquery: rows whose population equals the largest population found
-- among rows that have a country code.
SELECT *
FROM hive.test_trino.all_countries
WHERE population = (
    SELECT MAX(population)
    FROM hive.test_trino.all_countries
    WHERE country_code IS NOT NULL
);
-- Pattern matching (MATCH_RECOGNIZE)
-- List the rows for country code 'AE', numbered in geonameid order, to
-- inspect how the data is arranged.
SELECT *
FROM (
    SELECT
        row_number() OVER (ORDER BY geonameid) AS "row",
        *
    FROM hive.test_trino.all_countries
    WHERE country_code = 'AE'
) AS ae_rows;
-- Find V-shaped population sequences: within each country code, ordered by
-- geonameid, a start row (A), a falling run (B+), then a rising run split into
-- a part not above the start (C+) and a further rising part (D+).
-- Only the matches for country code 'AE' are kept by the trailing WHERE.
SELECT *
FROM hive.test_trino.all_countries
MATCH_RECOGNIZE (
    PARTITION BY country_code
    ORDER BY geonameid
    MEASURES
        A.population AS starting_population,
        LAST(B.population) AS bottom_population,
        LAST(U.population) AS top_population
    ONE ROW PER MATCH
    AFTER MATCH SKIP PAST LAST ROW
    PATTERN (A B+ C+ D+)
    SUBSET U = (C, D)
    DEFINE
        B AS population < PREV(population),
        C AS population > PREV(population) AND population <= A.population,
        D AS population > PREV(population)
)
WHERE country_code = 'AE'