数据&建表

[whg@cdh01 hive]$ cat salary 
1001    100.0   ABC
1001    150.0   BCD
1001    200.0   CDE
1001    150.0   DEF
1002    200.0   ABC
1002    200.0   ABC
1002    100.0   BCD
1002    300.0   CDE
1002    50.0    DEF
1002    400.0   EFG
1003    100.0   ABC
1003    50.0    BCD
1004    60.0    ABC

-- Create the test table. Each input line is tab-separated: id, salary, type.
-- salary is stored as DECIMAL so comparisons and aggregates work on numbers
-- rather than relying on implicit string-to-number conversion.
CREATE TABLE IF NOT EXISTS test02.salary (
    id     STRING        COMMENT '员工编号',
    salary DECIMAL(10,2) COMMENT '工资',
    type   STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;

-- Load the data file from HDFS, replacing whatever the table already holds.
-- NOTE: LOAD DATA INPATH (without LOCAL) moves the source file into the
-- table's storage location instead of copying it.
LOAD DATA INPATH '/user/whg/data/salary' OVERWRITE INTO TABLE test02.salary;

or & and 优先级问题

在where条件语句中 and的优先级要高于or的，所以必要时要加( ) 来规定计算顺序

-- Example: the parentheses force the OR to be evaluated before the AND,
-- so only rows of employee 1001 or 1002 whose salary equals 100 are returned.
SELECT
    id,
    salary
FROM test02.salary
WHERE (id = '1001' OR id = '1002')
  AND salary = 100;

cast( )

cast是用来进行类型转换的
格式为cast( id as String)

-- Convert the literal 1.5 to INT; one result row is produced per row of salary.
SELECT
    CAST(1.5 AS INT) AS num
FROM salary;

if(con,' ',' ')

if(BOOLEAN testCondition, T valueTrue, T valueFalseOrNull)
Returns valueTrue when testCondition is true, returns valueFalseOrNull otherwise.

-- 2 > 1 is always true, so every returned row contains the string 'true'.
SELECT if(2 > 1, 'true', 'false')
FROM salary
LIMIT 2;

case when ... then ... when ... then ... else ... end

用于select后边，作用是根据不同的条件来筛选相应结果
注意:when后边是条件，then后边是满足该条件返回的结果，且不同then返回的结果类型需保持一致

-- Map each id to a label: 1001 -> num1, 1002 -> num2, any other id -> num_other.
SELECT
    CASE
        WHEN id = '1001' THEN 'num1'
        WHEN id = '1002' THEN 'num2'
        ELSE 'num_other'
    END
FROM salary;

get_json_object(String json_string,String json_path)

第一个参数是存放 JSON 的字段名或 JSON 字符串；第二个参数是要提取内容的路径，格式为 $.json_key。如果要取数组中的元素，就在路径后加下标，例如 $.arr[1]（下标从 0 开始）。如果输入的 json key 无效（或 JSON 字符串本身无效），那么就会返回 null。
每次只能返回一个数据项
在技术对app进行埋点时，会将多个字段存放在一个数组中，因此调用数据时，要对埋点数据进行解析，以作进一步的清洗

-- Extract the nested value stored under score -> English from a JSON literal.
SELECT get_json_object(
    '{"name":"Morgan","age":"18","score":{"math":"90","English":"80"}}',
    '$.score.English'
);

parse_url

parse_url(STRING urlString, STRING partToExtract [, STRING keyToExtract])
可解析url字段，清洗出想要的信息
Returns the specified part from the URL. Valid values for partToExtract include HOST, PATH, QUERY, REF, PROTOCOL, AUTHORITY, FILE, and USERINFO

-- Extract the protocol part of the URL: returns 'http'.
SELECT parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PROTOCOL');

-- Extract the value of query-string key k1: returns 'v1'.
SELECT parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k1');

concat()

字符串连接或者是数字连接，对类型没有规定

-- Left join log to users on user_id. When a.user_id is NULL the join key is
-- replaced by 'hive' followed by a random number, so those rows never match a
-- user (presumably also to spread NULL keys across reducers -- confirm intent).
SELECT *
FROM log a
LEFT OUTER JOIN users b
    ON CASE
           WHEN a.user_id IS NULL THEN concat('hive', rand())
           ELSE a.user_id
       END = b.user_id;

concat_ws()
concat升级版，可以将数组元素和字符串之间用规定的分隔符连接并输出
注意：数组必须是字符串数组

-- Join the elements of a string array with '-': returns '1-2'.
SELECT concat_ws("-", array("1", "2"));

-- Join a column value and a seeded random number (cast to string) with '-'.
SELECT concat_ws("-", type, cast(rand(10) AS STRING))
FROM salary;

collect_set() collect_list()
collect_set()可对括号内的字段下元素去重输出
collect_list()可全部输出
输出的都是数组

-- Gather the distinct id values of the whole table into a single array.
SELECT
    collect_set(id) AS id
FROM salary;

-- Gather every id value, duplicates included, into a single array.
SELECT
    collect_list(id) AS id
FROM salary;

聚合运算

count(1) count(*) count(column_name)

count(*)：所有行进行统计，包括NULL行
count(1)：所有行进行统计，包括NULL行
count(column)：对column中非Null进行统计

sum(column_name)

时间类函数

date_add(DATE startdate, INT days)

Adds a number of days to startdate: date_add('2008-12-31', 1) = '2009-01-01'. Return type: STRING before Hive 2.1.0, DATE from 2.1.0 on.

date_format(DATE|TIMESTAMP|STRING ts, STRING fmt)

Converts a date/timestamp/string to a value of string in the format specified by the date format fmt (as of Hive 1.2.0). Supported formats are Java SimpleDateFormat formats – https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html. The second argument fmt should be constant. Example: date_format('2015-04-08', 'y') = '2015'.

date_sub(DATE startdate, INT days)

Subtracts a number of days from startdate: date_sub('2008-12-31', 1) = '2008-12-30'. Return type: STRING before Hive 2.1.0, DATE from 2.1.0 on.

datediff(STRING enddate, STRING startdate)

Returns the number of days from startdate to enddate: datediff('2009-03-01', '2009-02-27') = 2.

函数使用基础