-- sanity check: returns a single row with value 1
select 1;-- -> 1
-- explode(split(...)): split 'aa,bb' on ',' and emit one row per element ('aa', 'bb')
select explode(split('aa,bb',','));
-- regex column spec: selects every column whose name STARTS with "o"
-- (requires: set hive.support.quoted.identifiers=none)
-- fix: the original trailing note was bare prose (invalid SQL) and claimed the opposite match
select `^o.*` from offers;
# 查询订单表中有哪些不同顾客下过订单(以下为多种等价写法,列出顾客编号与全名)
-- variant 1: dedupe ordering customers in a derived table, then join to customers
select customer_id,
       concat(customer_fname, ' ', customer_lname) as customer_name
from (
    select order_customer_id
    from orders
    group by order_customer_id
) buyers
inner join customers c
    on buyers.order_customer_id = c.customer_id
limit 10;
-- variant 2: same dedupe, expressed as a CTE (group by used for dedup)
with distinct_buyers as (
    select order_customer_id
    from orders
    group by order_customer_id
)
select customer_id,
       concat(customer_fname, ' ', customer_lname) as customer_name
from customers c
inner join distinct_buyers b
    on c.customer_id = b.order_customer_id
limit 10;
-- variant 3: membership test with IN on a deduplicated subquery
select customer_id,
       concat(customer_fname, ' ', customer_lname) as customer_name
from customers c
where c.customer_id in (
    select order_customer_id
    from orders
    group by order_customer_id
)
limit 10;
-- variant 4: correlated EXISTS (the select list inside EXISTS is ignored)
select customer_id,
       concat(customer_fname, ' ', customer_lname) as customer_name
from customers c
where exists (
    select 1
    from orders o
    where o.order_customer_id = c.customer_id
)
limit 10;
-- variant 5: CTE using select distinct instead of group by for the dedup
with buyers as (
    select distinct order_customer_id
    from orders
)
select customer_id,
       concat(customer_fname, ' ', customer_lname) as customer_name
from customers c
inner join buyers
    on c.customer_id = buyers.order_customer_id
limit 10;
# first five products (limit without order by: row order is not guaranteed)
select * from products limit 5;
# full name of each customer (heading mentioned CTE/subquery variants, but only the direct form is shown)
select concat(customer_fname,' ',customer_lname) as customer_name
from customers;
# all customers with NO orders: anti-join via correlated NOT EXISTS
# (preferred over NOT IN, which returns nothing if the subquery yields any NULL)
select customer_id,concat(customer_fname,' ',customer_lname) as customer_name
from customers c where not exists
(select order_customer_id from orders s where s.order_customer_id=c.customer_id);
-- (exists / not exists 的用法见上面的相关子查询示例)
mysql内置函数50个 https://blog.csdn.net/qq646040754/article/details/82721801
load用于在hive中移动数据
原始数据被移动到目标表/分区,不再存在于原始位置
-- LOAD moves the file into the table's warehouse location (it no longer exists at the source)
LOAD DATA LOCAL INPATH '/home/dayongd/Downloads/employee.txt'
OVERWRITE INTO TABLE employee;
-- LOCAL = file is on the local filesystem; OVERWRITE = replace the table's existing data
LOAD DATA LOCAL INPATH '/home/dayongd/Downloads/employee.txt'
OVERWRITE INTO TABLE employee_partitioned PARTITION (year=2014, month=12);
-- without LOCAL the path is resolved in HDFS
LOAD DATA INPATH '/tmp/employee.txt'
OVERWRITE INTO TABLE employee_partitioned PARTITION (year=2017, month=12);
使用insert语句将数据插入表/分区
-- INSERT supports OVERWRITE (replace) and INTO (append)
-- fix: `//` is not a SQL comment marker; `fileds` typo corrected to `fields`
-- pseudo-syntax template, not runnable as-is:
INSERT OVERWRITE/INTO TABLE tablename1
[PARTITION (partcol1=val1, partcol2=val2 ...)]
select fields,... from tb_other;
insert表插入数据
INSERT OVERWRITE TABLE test select 'hello'; -- noted in the original as an unsupported form; NOTE(review): this is normally valid in Hive — confirm the intended caveat
insert into employee select * from ctas_employee; -- insert rows produced by a query
-- multi-insert: scan the source once, write several targets
from ctas_employee
insert overwrite table employee select *
insert overwrite table employee_internal select *;
-- insert into partitions; PARTITION (year, month) without values = dynamic partitioning,
-- partition values come from the trailing select columns ('2018','09')
-- NOTE(review): source name 'ctas_patitioned' looks like a typo for 'ctas_partitioned' — confirm
from ctas_patitioned
insert overwrite table employee PARTITION (year, month)
select *,'2018','09';
-- insert into named columns (with INSERT INTO the TABLE keyword may be omitted)
insert into employee(name) select 'John' from test limit 1;
-- insert literal rows
-- fix: Hive requires the VALUES keyword; bare VALUE is a MySQL-only alias
insert into employee(name) values('Judy'),('John');
使用insert语句将数据插入、导出到文件
-- one source feeding a local directory, an HDFS directory, and a table in a single scan
from ctas_employee
insert overwrite local directory '/tmp/out1' select *
insert overwrite directory '/tmp/out1' select *
insert overwrite table employee_internal select *;
-- write out with an explicit field delimiter
insert overwrite directory '/tmp/out3'
row format delimited fields terminated by ','
select * from ctas_employee;
-- other ways to get files from a table
-- NOTE(review): getmerge also needs a local destination path argument — confirm
hdfs dfs -getmerge <table_file_path>
EXPORT TABLE employee TO '/tmp/output3';
EXPORT TABLE employee_partitioned partition (year=2014, month=11) TO '/tmp/output5';
IMPORT TABLE employee FROM '/tmp/output3';
IMPORT TABLE employee_partitioned partition (year=2014, month=11) FROM '/tmp/output5';
mysqldump -uroot -p1234 mydemo custom>abc 导出
mysql -uroot -p1234 mydemo1<abc 导入
-- allow positional references in group by / order by
set hive.groupby.orderby.position.alias=true;
-- push offerid = 1 rows to the end: they get sort key 1, all other rows 0
select * from offers order by case when offerid = 1 then 1 else 0 end;
-- positional ordering: sort by the first column of the select list
select * from offers order by 1;
-- two reducers: sort by yields per-reducer (partial) order only
-- fix: the set statement was missing its terminating semicolon
set mapred.reduce.tasks=2;
select * from mynitltab sort by mntname;
hive> set mapred.reduce.tasks=2; 分两个区
hive> select * from mynitltab distribute by mntid sort by mntname;
select * from mynitltab distribute by mntname sort by mntname;
sort by 局部排序,前提设置task任务=reduce
cluster by = distribute by + sort by,默认只能升序
order by 一个reduce,一个分区
distribute by 指定哪一个键进行分区,指定哪一列作为分区的依据,一般和sort by联合使用=partition
-- MySQL (pre-8.0, no window functions) top-3 scores per class:
-- keep a row when fewer than 3 rows in the same class have a strictly higher score
-- fix: the original statement was truncated mid-predicate; reconstructed below
select a.class, a.score
from student a
where (select count(*)
       from student b
       where b.class = a.class
         and b.score > a.score) < 3;
hive基本内置聚合函数与group by一起使用
如果没有指定group by子句,则默认聚合整个表
除聚合函数外,所选的其他列也必须包含在group by中
group by支持使用case when或表达式
支持按位置编号分组
group by后面跟的列名越多,分组越细,带“每”,“组”等关键字的就要用group by
select…from中间放group by后面的列名和聚合函数的列,其他的不要乱放
set hive.groupby.orderby.position.alias=true;
对group by聚合结果的条件过滤
可以避免在group by之后使用子查询
having之后可以使用表达式,但不建议这样做,一般都用子查询代替having
grouping sets
实现对同一数据集进行多重group by操作
本质是多个group by进行union all操作
group by with cube | rollup
cube:对分组列进行所有可能组合的聚合
rollup:计算维度层次级别上的聚合
窗口函数:https://blog.csdn.net/weixin_38750084/article/details/82779910
文章写的太好,就不过多赘述了,简单做下总结
jack,2015-01-01,10
tony,2015-01-02,15
jack,2015-02-03,23
tony,2015-01-04,29
jack,2015-01-05,46
jack,2015-04-06,42
tony,2015-01-07,50
jack,2015-01-08,55
mart,2015-04-08,62
mart,2015-04-09,68
neil,2015-05-10,12
mart,2015-04-11,75
neil,2015-06-12,80
mart,2015-04-13,94
-- group by: one output row per name for April 2015
-- fix: `//` is not a SQL comment marker; replaced with `--`
select name,count(cost) from t_window where substring(orderdate,1,7)='2015-04' group by name;
-- count(*) over (): every input row is kept and carries the total row count
select name,count(*) over () from t_window where substring(orderdate,1,7)='2015-04';
-- cross join: ratio of each person's April 2015 order count to the monthly total
-- fix: `//` is not a SQL comment marker; replaced with `--`
-- t1: April 2015 rows; t2: per-person count; t3: total count (single row)
-- t2,t3 is an implicit cross join, pairing each person's count with the total
with
t1 as (select * from t_window where substring(orderdate,1,7)='2015-04'),
t2 as (select name,count(cost) as consume from t1 group by name),
t3 as (select count(cost) as cnt from t1)
select * from t2,t3;
// 窗口函数
分组出一条数据,窗口自定义出几条数据
over()括号里面不写东西,整表作为窗口
name和count(*) over()作为两个表,然后再交叉连接,不聚合
-- over() with empty parens: the whole result set is a single window
select name,count(*) over() from t_window where substring(orderdate,1,7)='2015-04';
-- distinct collapses the repeated (name, total) pairs
select distinct name,count(*) over() from t_window where substring(orderdate,1,7)='2015-04';
-- window functions run after group by, so count(*) over() here counts groups — verify on target engine
select name,count(*) over() from t_window where substring(orderdate,1,7)='2015-04' group by name;
select name,max(ee) from (select name,count(*) over() as ee from t_window where substring(orderdate,1,7)='2015-04') s group by name;
select name,max(ee) from (select name,max(cost) over() as ee from t_window where substring(orderdate,1,7)='2015-04') s group by name;
-- fix: `//` is not a SQL comment marker; replaced with `--` throughout (queries unchanged)
-- partition: sum(cost) over all rows sharing the same order month
select name,orderdate,cost,sum(cost) over(partition by month(orderdate)) from t_window;
-- adding order by turns it into a cumulative sum within the month (default frame; ties accumulate together)
select name,orderdate,cost,sum(cost) over(partition by month(orderdate) order by cost) from t_window;
-- row_number: unique 1..n sequence within each month, ordered by cost
select name,orderdate,cost,row_number() over(partition by month(orderdate) order by cost) from t_window;
insert into t_window values('smith','2015-01-31',50);
-- rank: ties share a rank and the following rank is skipped
select name,orderdate,cost,rank() over(partition by month(orderdate) order by cost) from t_window;
-- dense_rank: ties share a rank with no gap afterwards
select name,orderdate,cost,dense_rank() over(partition by month(orderdate) order by cost) from t_window;
-- per (name, month) ranking — building block for "top months per person" below
select name,orderdate,cost,row_number() over(partition by name,month(orderdate) order by cost) as rn from t_window;
-- monthly spend per person, ranked highest-first
-- fix: the stated goal is the HIGHEST-spend months, so rank descending (original was ascending)
with
t1 as (select name,substring(orderdate,1,7) as y_m,sum(cost) as sale from t_window group by name,substring(orderdate,1,7))
select name,y_m,sale,row_number() over(partition by name order by sale desc) from t1;
-- top 3 spending months per person
-- fix: original used ascending order + mc<3, which returns the two LOWEST months;
-- descending order + mc<=3 matches the stated intent
with
t1 as (select name,substring(orderdate,1,7) as y_m,sum(cost) as sale from t_window group by name,substring(orderdate,1,7)),
t2 as (select name,y_m,sale,row_number() over(partition by name order by sale desc) as mc from t1)
select name,y_m,sale,mc from t2 where mc<=3;
-- lag/lead: previous / next order date per person (NULL when there is none)
-- fix: the column is orderdate, not overdate (see the t_window sample data above)
select name,orderdate,cost,lag(orderdate,1) over (partition by name order by orderdate) as time1 from t_window;
select name,orderdate,cost,lead(orderdate,1) over (partition by name order by orderdate) as time1 from t_window;
count | avg | sum | max | min() over() // 全局一个窗口
over(partition by) // 按用户选择分窗口
over(partition by...order by) // 窗口强制排序
row_number() over(...order by) // 无重复排序
rank() over() // 并列名次跳
dense_rank() // 并列名次不跳
lag(列,x) over(order by) // 找上x笔数据,没有就用null补
lead(列,x) over(order by) // 找下x笔数据,没有就用null补
first_value(列) over(order by) // 根据窗口的当前指针获取当前窗口的第一个值
last_value(列) over(order by) // 根据窗口的当前指针获取当前窗口的最后一个值
sum | max | min...() over(partition by...order by...between x and y) // 游标
x | y => UNBOUNDED PRECEDING(窗口首行) UNBOUNDED FOLLOWING(窗口最后一行)
n PRECEDING(向前n行) n FOLLOWING(向后n行)
CURRENT ROW(当前行)
oltp和olap:https://www.cnblogs.com/lingchen-liang/p/10690619.html
######### 安装zeppelin ###########
# unpack and install zeppelin 0.8.1
tar -zxf zeppelin-0.8.1-bin-all.tgz
# fix: target dir must match the path used below (was 'zepplin081', breaking the cd)
mv zeppelin-0.8.1-bin-all soft/zeppelin081
cd soft/zeppelin081/conf
cp zeppelin-site.xml.template zeppelin-site.xml
vi zeppelin-site.xml
cp zeppelin-env.sh.template zeppelin-env.sh
vi zeppelin-env.sh
# expected values: JAVA_HOME=/opt/soft/jdk180, HADOOP_HOME=/opt/soft/hadoop260
echo $JAVA_HOME
echo $HADOOP_HOME
# in zeppelin-env.sh set JAVA_HOME=/opt/soft/jdk180
# and HADOOP_CONF_DIR=/opt/soft/hadoop260/etc/hadoop
vi zeppelin-env.sh
./zeppelin-daemon.sh start  # start zeppelin
[root@xuxu bin]# cp /opt/soft/hive110/conf/hive-site.xml /opt/soft/zeppelin081/conf/
[root@xuxu bin]# cp /opt/soft/hive110/lib/hive-jdbc-1.1.0-cdh5.14.2-standalone.jar /opt/soft/zeppelin081/interpreter/
[root@xuxu bin]# cp /opt/soft/hadoop260/share/hadoop/common/hadoop-common-2.6.0-cdh5.14.2.jar /opt/soft/zeppelin081/interpreter/