【Trino练习】Trino 基于hive schema 进行的查询验证

Trino 基于hive schema 进行的查询验证

数据准备

  • 国家城市

  • 出租车出车记录

-- Create the test database (Hive).
CREATE DATABASE IF NOT EXISTS test_trino
COMMENT 'Trino测试的库'
WITH DBPROPERTIES ('createUser'='顾栋','date'='20230529');

-- Staging table with a single string column that receives the raw JSON text.
-- It is created inside test_trino so the LOAD DATA statement below (which
-- targets test_trino.tmpjson) finds it regardless of the session's current database.
-- NOTE(review): the field delimiter is the newline character, presumably so that
-- each input line lands whole in `line` -- confirm.
CREATE TABLE IF NOT EXISTS test_trino.tmpjson (line string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\n";

-- Load the local JSON data file through the Hive client.
LOAD DATA LOCAL INPATH '/opt/documents.json' OVERWRITE INTO TABLE test_trino.tmpjson;

-- City table (Hive), stored as ORC. STORED AS ORC is Hive's shorthand for the
-- OrcSerde / OrcInputFormat / OrcOutputFormat triple.
CREATE TABLE `test_trino.all_countries` (
    `geonameid`    bigint COMMENT '',
    `name`         string COMMENT '',
    `latitude`     double COMMENT '',
    `longitude`    double COMMENT '',
    `country_code` string COMMENT '',
    `population`   bigint COMMENT ''
)
COMMENT '城市表'
STORED AS ORC;

-- Rewrite the table from the JSON staging table: json_tuple extracts the six
-- keys from each `line` value.
INSERT OVERWRITE TABLE `test_trino.all_countries`
SELECT
    j.geonameid,
    j.name,
    j.latitude,
    j.longitude,
    j.country_code,
    j.population
FROM test_trino.tmpjson src
LATERAL VIEW json_tuple(src.line, 'geonameid', 'name', 'latitude', 'longitude', 'country_code', 'population') j
    AS geonameid, name, latitude, longitude, country_code, population;
 
 -- Trino COMMENT statements: set the table comment and one comment per column
 -- through the hive catalog.
 COMMENT ON TABLE hive.test_trino.all_countries
     IS '国家城市表';

 COMMENT ON COLUMN hive.test_trino.all_countries.geonameid
     IS '地名ID';
 COMMENT ON COLUMN hive.test_trino.all_countries.name
     IS '地名';
 COMMENT ON COLUMN hive.test_trino.all_countries.latitude
     IS '纬度';
 COMMENT ON COLUMN hive.test_trino.all_countries.longitude
     IS '经度';
 COMMENT ON COLUMN hive.test_trino.all_countries.country_code
     IS '国家编码';
 COMMENT ON COLUMN hive.test_trino.all_countries.population
     IS '城市人口数';

 -- Describe the table after the comments have been set.
 DESCRIBE hive.test_trino.all_countries;
 # Inspect the Parquet files with parquet-tools: sample rows (head) and file metadata (meta)
  java -jar parquet-tools-1.10.0.jar head -n 2  yellow_tripdata_2023-03.parquet
  java -jar parquet-tools-1.10.0.jar meta yellow_tripdata_2023-02.parquet
 -- Staging table stored as Parquet (Hive), partitioned by year-month.
 -- Column comments corrected: DOLocationID is the drop-off zone and airport_fee
 -- is the airport pick-up/drop-off fee.
 -- NOTE(review): `VendorID` is int here but bigint in
 -- test_trino.yellow_taxi_trip_records -- confirm which one matches the Parquet files.
 CREATE TABLE `test_trino.yellow_taxi_trip_records_tmp`
 (
   `VendorID` int COMMENT '仪表供应商ID',
   `tpep_pickup_datetime` TIMESTAMP COMMENT '仪表启动时间',
   `tpep_dropoff_datetime` TIMESTAMP COMMENT '仪表关闭时间',
   `passenger_count` bigint COMMENT '乘客数量',
   `trip_distance` double COMMENT '行程距离',
   `RateCodeID` bigint COMMENT '费率编码',
   `store_and_fwd_flag` string COMMENT '是否存储',
   `PULocationID` bigint COMMENT '上车区域坐标',
   `DOLocationID` bigint COMMENT '下车区域坐标',
   `payment_type` bigint COMMENT '付款方式',
   `fare_amount` double COMMENT '票价',
   `extra` double COMMENT '杂费附加费',
   `mta_tax` double COMMENT '税费',
   `tip_amount` double COMMENT '小费',
   `tolls_amount` double COMMENT '过路费',
   `improvement_surcharge` double COMMENT '改善附加费',
   `total_amount` double COMMENT '费用总计,不包含现金小费',
   `congestion_surcharge` double COMMENT '拥堵费',
   `airport_fee` double COMMENT '机场上下车费用'
 )
 COMMENT '黄色的出租车记录'
 PARTITIONED BY (
   `ym` string COMMENT '分区字段,年月(yyyyMM)')
 STORED AS PARQUET;

  -- Register the monthly partitions of the Parquet staging table.
  ALTER TABLE `test_trino.yellow_taxi_trip_records_tmp` ADD IF NOT EXISTS PARTITION (ym='202301');
  ALTER TABLE `test_trino.yellow_taxi_trip_records_tmp` ADD IF NOT EXISTS PARTITION (ym='202302');
  ALTER TABLE `test_trino.yellow_taxi_trip_records_tmp` ADD IF NOT EXISTS PARTITION (ym='202303');

  -- Drop the same partitions again.
  -- NOTE(review): presumably kept as a cleanup step for re-runs -- confirm.
  ALTER TABLE `test_trino.yellow_taxi_trip_records_tmp` DROP IF EXISTS PARTITION (ym='202301');
  ALTER TABLE `test_trino.yellow_taxi_trip_records_tmp` DROP IF EXISTS PARTITION (ym='202302');
  ALTER TABLE `test_trino.yellow_taxi_trip_records_tmp` DROP IF EXISTS PARTITION (ym='202303');

  -- Load the Parquet files through the Hive client. `ym` is a string partition
  -- column, so its value is written as a quoted literal, matching the
  -- ADD/DROP PARTITION statements above.
  LOAD DATA LOCAL INPATH '/opt/yellow_tripdata_2023-02.parquet' OVERWRITE INTO TABLE `test_trino.yellow_taxi_trip_records_tmp` PARTITION (ym='202302');

  LOAD DATA LOCAL INPATH '/opt/yellow_tripdata_2023-03.parquet' OVERWRITE INTO TABLE `test_trino.yellow_taxi_trip_records_tmp` PARTITION (ym='202303');
  
  -- Create the ORC managed table first: the ALTER TABLE ... PARTITION statements
  -- below need the table to exist.
  -- Column comments corrected: DOLocationID is the drop-off zone and airport_fee
  -- is the airport pick-up/drop-off fee.
  CREATE TABLE `test_trino.yellow_taxi_trip_records`(
    `VendorID` bigint COMMENT '仪表供应商ID',
    `tpep_pickup_datetime` TIMESTAMP COMMENT '仪表启动时间',
    `tpep_dropoff_datetime` TIMESTAMP COMMENT '仪表关闭时间',
    `passenger_count` bigint COMMENT '乘客数量',
    `trip_distance` double COMMENT '行程距离',
    `RateCodeID` bigint COMMENT '费率编码',
    `store_and_fwd_flag` string COMMENT '是否存储',
    `PULocationID` bigint COMMENT '上车区域坐标',
    `DOLocationID` bigint COMMENT '下车区域坐标',
    `payment_type` bigint COMMENT '付款方式',
    `fare_amount` double COMMENT '票价',
    `extra` double COMMENT '杂费附加费',
    `mta_tax` double COMMENT '税费',
    `tip_amount` double COMMENT '小费',
    `tolls_amount` double COMMENT '过路费',
    `improvement_surcharge` double COMMENT '改善附加费',
    `total_amount` double COMMENT '费用总计,不包含现金小费',
    `congestion_surcharge` double COMMENT '拥堵费',
    `airport_fee` double COMMENT '机场上下车费用'
  )
  COMMENT '黄色的出租车记录'
  PARTITIONED BY (
    `ym` string COMMENT '分区字段,年月(yyyyMM)')
  ROW FORMAT SERDE
    'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
  STORED AS INPUTFORMAT
    'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
  OUTPUTFORMAT
    'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat';

  -- Register the monthly partitions of the ORC table.
  ALTER TABLE `test_trino.yellow_taxi_trip_records` ADD IF NOT EXISTS PARTITION (ym='202301');
  ALTER TABLE `test_trino.yellow_taxi_trip_records` ADD IF NOT EXISTS PARTITION (ym='202302');
  ALTER TABLE `test_trino.yellow_taxi_trip_records` ADD IF NOT EXISTS PARTITION (ym='202303');

  -- Drop the same partitions again.
  -- NOTE(review): presumably kept as a cleanup step for re-runs -- confirm.
  ALTER TABLE `test_trino.yellow_taxi_trip_records` DROP IF EXISTS PARTITION (ym='202301');
  ALTER TABLE `test_trino.yellow_taxi_trip_records` DROP IF EXISTS PARTITION (ym='202302');
  ALTER TABLE `test_trino.yellow_taxi_trip_records` DROP IF EXISTS PARTITION (ym='202303');
  
  -- Overwrite partition ym='202302' of the ORC table from the Parquet staging
  -- table (Hive syntax). The target partition is static, so the SELECT must
  -- return exactly the 19 data columns; `select *` would also return the staging
  -- table's partition column `ym` and the column counts would not match.
  INSERT OVERWRITE TABLE `test_trino.yellow_taxi_trip_records` PARTITION (ym='202302')
  SELECT
      `VendorID`,
      `tpep_pickup_datetime`,
      `tpep_dropoff_datetime`,
      `passenger_count`,
      `trip_distance`,
      `RateCodeID`,
      `store_and_fwd_flag`,
      `PULocationID`,
      `DOLocationID`,
      `payment_type`,
      `fare_amount`,
      `extra`,
      `mta_tax`,
      `tip_amount`,
      `tolls_amount`,
      `improvement_surcharge`,
      `total_amount`,
      `congestion_surcharge`,
      `airport_fee`
  FROM test_trino.yellow_taxi_trip_records_tmp
  WHERE ym = '202302';

  -- Show the extended table information for the staging partition; the database
  -- is named explicitly so the statement does not depend on the current database.
  SHOW TABLE EXTENDED IN test_trino LIKE yellow_taxi_trip_records_tmp PARTITION (ym='202302');
can not read class org.apache.parquet.format.FileMetaData: Required field 'codec' was not present! 
org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.SnappyCodec,com.hadoop.compression.lzo.LzoCodec,com.hadoop.compression.lzo.LzopCodec

查询语句

-- Grouped query wrapped in a derived table: largest population per country code.
SELECT
    country_code,
    maxpopulation
FROM (
    SELECT
        country_code,
        MAX(population) AS maxpopulation
    FROM hive.test_trino.all_countries
    WHERE country_code IS NOT NULL
    GROUP BY country_code
) AS per_country;

-- The same query written with a WITH clause.
WITH per_country AS (
    SELECT
        country_code,
        MAX(population) AS maxpopulation
    FROM hive.test_trino.all_countries
    WHERE country_code IS NOT NULL
    GROUP BY country_code
)
SELECT
    country_code,
    maxpopulation
FROM per_country;
-- WITH clause holding several subqueries: max and min population per country
-- code, joined on the country code.
WITH max_pop AS (
    SELECT
        country_code,
        MAX(population) AS maxpopulation
    FROM hive.test_trino.all_countries
    WHERE country_code IS NOT NULL
    GROUP BY country_code
),
min_pop AS (
    SELECT
        country_code,
        MIN(population) AS minpopulation
    FROM hive.test_trino.all_countries
    WHERE country_code IS NOT NULL
    GROUP BY country_code
)
SELECT
    max_pop.*,
    min_pop.*
FROM max_pop
INNER JOIN min_pop
    ON max_pop.country_code = min_pop.country_code;

-- WITH entries may reference earlier entries (chaining).
WITH per_country AS (
    SELECT
        country_code,
        MAX(population) AS maxpopulation
    FROM hive.test_trino.all_countries
    WHERE country_code IS NOT NULL
    GROUP BY country_code
),
codes_as_b AS (
    SELECT country_code AS b
    FROM per_country
),
codes_as_c AS (
    SELECT b AS c
    FROM codes_as_b
)
SELECT c
FROM codes_as_c;
-- WITH RECURSIVE (experimental feature): start from 1, add 1 while n < 4,
-- then sum the generated values.
WITH RECURSIVE t(n) AS (
    VALUES (1)
    UNION ALL
    SELECT n + 1
    FROM t
    WHERE n < 4
)
SELECT SUM(n)
FROM t;
-- Plain de-duplication of country codes.
SELECT DISTINCT country_code
FROM hive.test_trino.all_countries
WHERE country_code IS NOT NULL
ORDER BY country_code;

-- Complex grouping: GROUPING SETS.
-- Counts rows per (country_code) and per (country_code, population).
SELECT
    country_code,
    population,
    COUNT(*) AS geocount
FROM hive.test_trino.all_countries
GROUP BY GROUPING SETS ((country_code), (country_code, population));
-- Equivalent formulation with UNION ALL. The NULL placeholder is typed and named
-- `population` so the result has the same column names as the GROUPING SETS query.
SELECT *
FROM (
    SELECT
        country_code,
        CAST(NULL AS bigint) AS population,
        COUNT(*) AS geocount
    FROM hive.test_trino.all_countries
    GROUP BY country_code
    UNION ALL
    SELECT
        country_code,
        population,
        COUNT(*) AS geocount
    FROM hive.test_trino.all_countries
    GROUP BY
        country_code,
        population
)
ORDER BY country_code;
-- Complex grouping: CUBE.
SELECT
    country_code,
    population,
    COUNT(*) AS geocount
FROM hive.test_trino.all_countries
GROUP BY CUBE (country_code, population)
ORDER BY
    country_code,
    population;

-- Equivalent to the CUBE query, with the grouping sets spelled out.
SELECT
    country_code,
    population,
    COUNT(*) AS geocount
FROM hive.test_trino.all_countries
GROUP BY GROUPING SETS (
    (country_code, population),
    (country_code),
    (population),
    ()
)
ORDER BY
    country_code,
    population;

-- Complex grouping: ROLLUP.
SELECT
    country_code,
    population,
    COUNT(*) AS geocount
FROM hive.test_trino.all_countries
GROUP BY ROLLUP (country_code, population)
ORDER BY
    country_code,
    population;

-- Complex grouping with the DISTINCT quantifier on the grouping sets.
SELECT
    country_code,
    population,
    COUNT(*) AS geocount
FROM hive.test_trino.all_countries
GROUP BY DISTINCT ROLLUP (country_code, population)
ORDER BY
    country_code,
    population;
-- Grouping number: grouping(...) is selected alongside the count for each of
-- the three grouping sets.
SELECT
    country_code,
    population,
    COUNT(*) AS geocount,
    grouping(country_code, population)
FROM hive.test_trino.all_countries
GROUP BY GROUPING SETS (
    (country_code),
    (country_code, population),
    (population)
);
-- HAVING filter: applied after grouping and aggregation.
-- The statement is terminated with ';' so it does not run into the next query.
SELECT
    country_code,
    population,
    COUNT(*) AS geocount,
    grouping(country_code, population)
FROM hive.test_trino.all_countries
GROUP BY
    country_code,
    population
HAVING COUNT(*) > 100000
ORDER BY country_code, geocount;
-- Union (UNION without ALL removes duplicates, i.e. UNION DISTINCT).
SELECT 13
UNION DISTINCT
SELECT 42;

-- Intersection.
SELECT *
FROM (VALUES (13), (42))
INTERSECT
SELECT 13;

-- Difference.
SELECT *
FROM (VALUES (13), (42))
EXCEPT
SELECT 13;
-- Ordering of NULL values: descending by max population, NULLs first.
-- The statement is terminated with ';' so it does not run into the next query.
SELECT
    country_code,
    MAX(population) AS maxpopulation
FROM hive.test_trino.all_countries
GROUP BY country_code
ORDER BY maxpopulation DESC NULLS FIRST;
-- Offset: skip the first 22 rows of the population-descending result.
SELECT *
FROM hive.test_trino.all_countries
ORDER BY population DESC
OFFSET 22 ROWS;
-- Limit the size of the result set. The query runs against the table used
-- throughout this walkthrough; the previous version selected from an `orders`
-- table that none of the statements here create.
SELECT name
FROM hive.test_trino.all_countries
LIMIT 5;
-- FETCH FIRST ... WITH TIES: return the first row by population plus any rows
-- that tie with it on the ORDER BY key.
SELECT
    name,
    population
FROM hive.test_trino.all_countries
ORDER BY population DESC
FETCH FIRST ROW WITH TIES;
-- Sampling: BERNOULLI method with a sample percentage of 50.
SELECT *
FROM hive.test_trino.all_countries
    TABLESAMPLE BERNOULLI (50);

-- Sampling: SYSTEM method with a sample percentage of 75.
SELECT *
FROM hive.test_trino.all_countries
    TABLESAMPLE SYSTEM (75);
-- UNNEST a map built from an array of (key, value) entries into two columns.
SELECT *
FROM UNNEST(
    map_from_entries(ARRAY[('SQL', 1974), ('Java', 1995)])
) AS t(language, first_appeared_year);

-- UNNEST two arrays side by side, adding an ordinality (row number) column.
SELECT
    a,
    b,
    rownumber
FROM UNNEST(ARRAY[2, 5], ARRAY[7, 8, 9]) WITH ORDINALITY AS t(a, b, rownumber);

-- CROSS JOIN (Cartesian product) of five ids and three names.
SELECT
    n.id,
    r.name
FROM (VALUES (1), (2), (3), (4), (5)) AS n(id)
CROSS JOIN (VALUES ('a'), ('b'), ('c')) AS r(name)
ORDER BY id, name;

-- LATERAL subqueries may reference columns produced earlier in the FROM clause:
-- x is built from country_code, y is built from x.
SELECT
    country_code,
    x,
    y
FROM hive.test_trino.all_countries
CROSS JOIN LATERAL (SELECT country_code || ' :-' AS x)
CROSS JOIN LATERAL (SELECT x || ')' AS y);
-- EXISTS: keep the ids that also occur in the second value list.
SELECT id
FROM (VALUES ('1'), ('2'), ('3'), ('4'), ('5'), ('a'), ('b'), ('c')) AS n(id)
WHERE EXISTS (
    SELECT 1
    FROM (VALUES ('a'), ('b'), ('c'), ('d')) AS t(id)
    WHERE t.id = n.id
);

-- IN: keep the ids returned by the subquery.
SELECT n.id
FROM (VALUES ('1'), ('2'), ('3'), ('4'), ('5'), ('a'), ('b'), ('c')) AS n(id)
WHERE id IN (
    SELECT t.id
    FROM (VALUES ('a'), ('b'), ('c'), ('d')) AS t(id)
    WHERE t.id IN ('a', 'd')
);

-- Scalar subquery: rows whose population equals the maximum population among
-- rows with a non-null country code.
SELECT *
FROM hive.test_trino.all_countries
WHERE population = (
    SELECT MAX(population)
    FROM hive.test_trino.all_countries
    WHERE country_code IS NOT NULL
);
-- Pattern recognition (MATCH_RECOGNIZE).
-- First list the rows for country_code = 'AE', numbered in geonameid order,
-- to see how the data is laid out.
SELECT *
FROM (
    SELECT
        row_number() OVER (ORDER BY geonameid) AS row,
        *
    FROM hive.test_trino.all_countries
    WHERE country_code = 'AE'
) ae_rows;
-- Find V-shaped population sequences per country code, ordered by geonameid,
-- and keep only the matches for country_code = 'AE'.
--   A  : starting row (no condition)
--   B+ : population falls below the previous row
--   C+ : population rises above the previous row but stays <= A's population
--   D+ : population rises above the previous row
--   U  : union of C and D, used to read the last rising value
-- The statement is terminated with ';' so it can be executed as written.
SELECT *
FROM hive.test_trino.all_countries MATCH_RECOGNIZE (
    PARTITION BY country_code
    ORDER BY geonameid
    MEASURES
        A.population AS starting_population,
        last(B.population) AS bottom_population,
        last(U.population) AS top_population
    ONE ROW PER MATCH
    AFTER MATCH SKIP PAST LAST ROW
    PATTERN (A B+ C+ D+)
    SUBSET U = (C, D)
    DEFINE
        B AS population < PREV(population),
        C AS population > PREV(population) AND population <= A.population,
        D AS population > PREV(population)
)
WHERE country_code = 'AE';

你可能感兴趣的:(Trino,hive,hadoop,数据仓库)