SELECT … FROM Clauses
hive> SELECT name, salary FROM employees;
表别名
hive> SELECT name, salary FROM employees; hive> SELECT e.name, e.salary FROM employees e;
Specify Columns with Regular Expressions
hive> SELECT symbol, `price.*` FROM stocks; AAPL 195.69 197.88 194.0 194.12 194.12 AAPL 192.63 196.0 190.85 195.46 195.46 AAPL 196.73 198.37 191.57 192.05 192.05 AAPL 195.17 200.2 194.42 199.23 199.23 AAPL 195.91 196.32 193.38 195.86 195.86
Computing with Column Values
hive> SELECT upper(name), salary, deductions["Federal Taxes"], > round(salary * (1 - deductions["Federal Taxes"])) FROM employees; JOHN DOE 100000.0 0.2 80000 MARY SMITH 80000.0 0.2 64000 TODD JONES 70000.0 0.15 59500 BILL KING 60000.0 0.15 51000
Arithmetic Operators --算数运算
Mathematical functions
Aggregate functions -- 集合函数
hive> SET hive.map.aggr=true; hive> SELECT count(*), avg(salary) FROM employees;
Table generating functions
hive> SELECT explode(subordinates) AS sub FROM employees; Mary Smith Todd Jones Bill King
Other built-in functions
LIMIT Clause --限制行数
hive> SELECT upper(name), salary, deductions["Federal Taxes"], > round(salary * (1 - deductions["Federal Taxes"])) FROM employees > LIMIT 2; JOHN DOE 100000.0 0.2 80000 MARY SMITH 80000.0 0.2 64000
Column Aliases --列别名
hive> SELECT upper(name), salary, deductions["Federal Taxes"] as fed_taxes, > round(salary * (1 - deductions["Federal Taxes"])) as salary_minus_fed_taxes > FROM employees LIMIT 2; JOHN DOE 100000.0 0.2 80000 MARY SMITH 80000.0 0.2 64000
Nested SELECT Statements --子查询
hive> FROM ( > SELECT upper(name), salary, deductions["Federal Taxes"] as fed_taxes, > round(salary * (1 - deductions["Federal Taxes"])) as salary_minus_fed_taxes > FROM employees > ) e > SELECT e.name, e.salary_minus_fed_taxes > WHERE e.salary_minus_fed_taxes > 70000; JOHN DOE 100000.0 0.2 80000
CASE … WHEN … THEN Statements --case 关键字
hive> SELECT name, salary, > CASE > WHEN salary < 50000.0 THEN 'low' > WHEN salary >= 50000.0 AND salary < 70000.0 THEN 'middle' > WHEN salary >= 70000.0 AND salary < 100000.0 THEN 'high' > ELSE 'very high' > END AS bracket FROM employees; John Doe 100000.0 very high Mary Smith 80000.0 high Todd Jones 70000.0 high Bill King 60000.0 middle Boss Man 200000.0 very high Fred Finance 150000.0 very high Stacy Accountant 60000.0 middle
WHERE Clauses -- 筛选
SELECT * FROM employees WHERE country = 'US' AND state = 'CA';
hive> SELECT name, salary, deductions["Federal Taxes"], > salary * (1 - deductions["Federal Taxes"]) > FROM employees > WHERE round(salary * (1 - deductions["Federal Taxes"])) > 70000; John Doe 100000.0 0.2 80000.0
hive> SELECT name, salary, deductions["Federal Taxes"], > salary * (1 - deductions["Federal Taxes"]) as salary_minus_fed_taxes > FROM employees > WHERE round(salary_minus_fed_taxes) > 70000; FAILED: Error in semantic analysis: Line 4:13 Invalid table alias or column reference 'salary_minus_fed_taxes': (possible column names are: name, salary, subordinates, deductions, address)
hive> SELECT e.* FROM > (SELECT name, salary, deductions["Federal Taxes"] as ded, > salary * (1 - deductions["Federal Taxes"]) as salary_minus_fed_taxes > FROM employees) e > WHERE round(e.salary_minus_fed_taxes) > 70000; John Doe 100000.0 0.2 80000.0 Boss Man 200000.0 0.3 140000.0 Fred Finance 150000.0 0.3 105000.0
Predicate Operators
LIKE and RLIKE
hive> SELECT name, address.street FROM employees WHERE address.street LIKE '%Ave.'; John Doe 1 Michigan Ave. Todd Jones 200 Chicago Ave. hive> SELECT name, address.city FROM employees WHERE address.city LIKE 'O%'; Todd Jones Oak Park Bill King Obscuria hive> SELECT name, address.street FROM employees WHERE address.street LIKE '%Chi%'; Todd Jones 200 Chicago Ave.
hive> SELECT name, address.street > FROM employees WHERE address.street RLIKE '.*(Chicago|Ontario).*'; Mary Smith 100 Ontario St. Todd Jones 200 Chicago Ave.
SELECT name, address FROM employees WHERE address.street LIKE '%Chicago%' OR address.street LIKE '%Ontario%';
GROUP BY Clauses
hive> SELECT year(ymd), avg(price_close) FROM stocks > WHERE exchange = 'NASDAQ' AND symbol = 'AAPL' > GROUP BY year(ymd); 1984 25.578625440597534 1985 20.193676221040867 1986 32.46102808021274 1987 53.88968399108163 1988 41.540079275138766 1989 41.65976212516664 1990 37.56268799823263 1991 52.49553383386182 1992 54.80338610251119 1993 41.02671956450572 1994 34.0813495847914
HAVING Clauses
hive> SELECT year(ymd), avg(price_close) FROM stocks > WHERE exchange = 'NASDAQ' AND symbol = 'AAPL' > GROUP BY year(ymd) > HAVING avg(price_close) > 50.0; 1987 53.88968399108163 1991 52.49553383386182 1992 54.80338610251119 1999 57.77071460844979 2000 71.74892876261757 2005 52.401745992993554
Inner JOIN
hive> SELECT a.ymd, a.price_close, b.price_close > FROM stocks a JOIN stocks b ON a.ymd = b.ymd > WHERE a.symbol = 'AAPL' AND b.symbol = 'IBM'; 2010-01-04 214.01 132.45 2010-01-05 214.38 130.85 2010-01-06 210.97 130.0 2010-01-07 210.58 129.55 2010-01-08 211.98 130.85 2010-01-11 210.11 129.48
Example 6-1. Query that will not work in Hive
SELECT a.ymd, a.price_close, b.price_close FROM stocks a JOIN stocks b ON a.ymd <= b.ymd WHERE a.symbol = 'AAPL' AND b.symbol = 'IBM';
LEFT OUTER JOIN
hive> SELECT s.ymd, s.symbol, s.price_close, d.dividend > FROM stocks s LEFT OUTER JOIN dividends d ON s.ymd = d.ymd AND s.symbol = d.symbol > WHERE s.symbol = 'AAPL'; ... 1987-05-01 AAPL 80.0 NULL 1987-05-04 AAPL 79.75 NULL 1987-05-05 AAPL 80.25 NULL 1987-05-06 AAPL 80.0 NULL 1987-05-07 AAPL 80.25 NULL 1987-05-08 AAPL 79.0 NULL 1987-05-11 AAPL 77.0 0.015 1987-05-12 AAPL 75.5 NULL 1987-05-13 AAPL 78.5 NULL 1987-05-14 AAPL 79.25 NULL 1987-05-15 AAPL 78.25 NULL 1987-05-18 AAPL 75.75 NULL 1987-05-19 AAPL 73.25 NULL 1987-05-20 AAPL 74.5 NULL ...
OUTER JOIN Gotcha
hive> SELECT s.ymd, s.symbol, s.price_close, d.dividend > FROM stocks s LEFT OUTER JOIN dividends d ON s.ymd = d.ymd AND s.symbol = d.symbol > WHERE s.symbol = 'AAPL' > AND s.exchange = 'NASDAQ' AND d.exchange = 'NASDAQ'; 1987-05-11 AAPL 77.0 0.015 1987-08-10 AAPL 48.25 0.015 1987-11-17 AAPL 35.0 0.02 1988-02-12 AAPL 41.0 0.02 1988-05-16 AAPL 41.25 0.02 ...
hive> SELECT s.ymd, s.symbol, s.price_close, d.dividend FROM > (SELECT * FROM stocks WHERE symbol = 'AAPL' AND exchange = 'NASDAQ') s > LEFT OUTER JOIN > (SELECT * FROM dividends WHERE symbol = 'AAPL' AND exchange = 'NASDAQ') d > ON s.ymd = d.ymd; ... 1988-02-10 AAPL 41.0 NULL 1988-02-11 AAPL 40.63 NULL 1988-02-12 AAPL 41.0 0.02 1988-02-16 AAPL 41.25 NULL 1988-02-17 AAPL 41.88 NULL
RIGHT OUTER JOIN
hive> SELECT s.ymd, s.symbol, s.price_close, d.dividend > FROM dividends d RIGHT OUTER JOIN stocks s ON d.ymd = s.ymd AND d.symbol = s.symbol > WHERE s.symbol = 'AAPL'; ... 1987-05-07 AAPL 80.25 NULL 1987-05-08 AAPL 79.0 NULL 1987-05-11 AAPL 77.0 0.015 1987-05-12 AAPL 75.5 NULL 1987-05-13 AAPL 78.5 NULL
FULL OUTER JOIN
hive> SELECT s.ymd, s.symbol, s.price_close, d.dividend > FROM dividends d FULL OUTER JOIN stocks s ON d.ymd = s.ymd AND d.symbol = s.symbol > WHERE s.symbol = 'AAPL'; ... 1987-05-07 AAPL 80.25 NULL 1987-05-08 AAPL 79.0 NULL 1987-05-11 AAPL 77.0 0.015 1987-05-12 AAPL 75.5 NULL 1987-05-13 AAPL 78.5 NULL ...
LEFT SEMI-JOIN
Example 6-2. Query that will not work in Hive
SELECT s.ymd, s.symbol, s.price_close FROM stocks s WHERE s.ymd, s.symbol IN (SELECT d.ymd, d.symbol FROM dividends d);
Instead, you use the following LEFT SEMI JOIN syntax:
hive> SELECT s.ymd, s.symbol, s.price_close > FROM stocks s LEFT SEMI JOIN dividends d ON s.ymd = d.ymd AND s.symbol = d.symbol; ... 1962-11-05 IBM 361.5 1962-08-07 IBM 373.25 1962-05-08 IBM 459.5 1962-02-06 IBM 551.5
ORDER BY and SORT BY
Here is an example using ORDER BY:
SELECT s.ymd, s.symbol, s.price_close FROM stocks s ORDER BY s.ymd ASC, s.symbol DESC;
Here is the same example using SORT BY instead:
SELECT s.ymd, s.symbol, s.price_cl FROM stocks s SORT BY s.ymd ASC, s.symbol DESC;
Casting
SELECT name, salary FROM employees WHERE cast(salary AS FLOAT) < 100000.0;
Queries that Sample Data -- 抽样
hive> SELECT * from numbers TABLESAMPLE(BUCKET 3 OUT OF 10 ON rand()) s; 2 4 hive> SELECT * from numbers TABLESAMPLE(BUCKET 3 OUT OF 10 ON rand()) s; 7 10 hive> SELECT * from numbers TABLESAMPLE(BUCKET 3 OUT OF 10 ON rand()) s;
hive> SELECT * from numbers TABLESAMPLE(BUCKET 3 OUT OF 10 ON number) s; 2 hive> SELECT * from numbers TABLESAMPLE(BUCKET 5 OUT OF 10 ON number) s; 4 hive> SELECT * from numbers TABLESAMPLE(BUCKET 3 OUT OF 10 ON number) s; 2
hive> SELECT * from numbers TABLESAMPLE(BUCKET 1 OUT OF 2 ON number) s; 2 4 6 8 10 hive> SELECT * from numbers TABLESAMPLE(BUCKET 2 OUT OF 2 ON number) s; 1 3 5 7 9
UNION ALL
SELECT log.ymd, log.level, log.message FROM ( SELECT l1.ymd, l1.level, l1.message, 'Log1' AS source FROM log1 l1 UNION ALL SELECT l2.ymd, l2.level, l2.message, 'Log2' AS source FROM log1 l2 ) log SORT BY log.ymd ASC;
FROM ( FROM src SELECT src.key, src.value WHERE src.key < 100 UNION ALL FROM src SELECT src.* WHERE src.key > 110 ) unioninput INSERT OVERWRITE DIRECTORY '/tmp/union.out' SELECT unioninput.*