1、知识点回顾
hive数据库是hdfs上的文件夹,表也是文件夹,表里的数据是文件
hive建表
create table 表名(字段1 类型1,字段2 类型2……)
row format delimited fields terminated by '字段分隔符';
为一键启动集群中的zookeeper,以下提供参考脚本:
#vim my-zkServer.sh 添加如下内容:
#!/bin/bash
/opt/zookeeper-3.4.10/bin/zkServer.sh $1
ssh hdp2 "source /etc/profile;/opt/zookeeper-3.4.10/bin/zkServer.sh $1"
ssh hdp3 "source /etc/profile;/opt/zookeeper-3.4.10/bin/zkServer.sh $1"
#source /etc/profile 的目的是加载profile中的JAVA_HOME等环境变量,这样才能在相应的服务器上启动zookeeper。具体操作:
#赋予权限 chmod +x my-zkServer.sh
#一键启动zookeeper集群:./my-zkServer.sh start
#一键停止zookeeper集群:./my-zkServer.sh stop
2、测试阶段可以把集群运行环境切换为本地运行 yarn -> local
<property>
    <name>mapreduce.framework.name</name>
    <value>local</value>
</property>
mapreduce.framework.name设置为local,则不会使用YARN集群来分配资源,在本地节点执行。在本地模式运行的任务,无法发挥集群的优势。注:在web UI是查看不到本地模式运行的任务。
3、hive以服务方式运行:
启动服务:./hive --service hiveserver2 &
查看端口:netstat -tunl
hiveserver端口:10000
4、beeline客户端连接hiveserver
bin/beeline
!connect jdbc:hive2://hdp1:10000
root
回车(hive的默认不认证用户名密码)
5、建表,内部表和外部表
hive建表语句
建内部表
create table t_student(id string,name string,age int,classNo string)
row format delimited
fields terminated by ',';
建外部表
create external table t_a(id string,name string)
row format delimited fields terminated by ','
location '/ainput';
删除表
drop table xx;
删除内部表是连同表结构和数据一起删除
删除外部表只删除表结构,不删除存储在hdfs上的数据
hive导入数据的语句
从hiveserver本地导入,注意不是beeline客户端的本地
load data local inpath '/root/b.dat' into table t_b;
从hdfs上导入数据(移动)
load data inpath '/datafromhdfs/a.dat' into table t_a;
6、笛卡尔积 join
笛卡尔积定义:
设A和B是两个集合,存在一个集合,它的元素是用A中元素为第一个元素,B中元素为第二个元素构成的有序二元组。称它为集合A和B的笛卡尔积集,记为A×B。即
A×B = {(a,b)|a∈A,b∈B}
例 A={1,2}
B={a,b,c}
A×B = {(1,a),(1,b),(1,c),(2,a),(2,b),(2,c)}
join例子:
a.dat
a,1
b,2
c,3
d,4
b.dat
a,xx
b,yy
d,zz
e,pp
创建表t_a ,t_b与以上数据对应并导入数据。
create table t_a(id string,name string)
row format delimited fields terminated by ',';
create table t_b(id string,name string)
row format delimited fields terminated by ',';
load data local inpath '/root/a.dat' into table t_a;
load data local inpath '/root/b.dat' into table t_b;
6.1 笛卡尔积(内连接):
select t_a.*,t_b.* from t_a inner join t_b;
+---------+-----------+---------+-----------+--+
| t_a.id | t_a.name | t_b.id | t_b.name |
+---------+-----------+---------+-----------+--+
| a | 1 | a | xx |
| b | 2 | a | xx |
| c | 3 | a | xx |
| d | 4 | a | xx |
| a | 1 | b | yy |
| b | 2 | b | yy |
| c | 3 | b | yy |
| d | 4 | b | yy |
| a | 1 | d | zz |
| b | 2 | d | zz |
| c | 3 | d | zz |
| d | 4 | d | zz |
| a | 1 | e | pp |
| b | 2 | e | pp |
| c | 3 | e | pp |
| d | 4 | e | pp |
+---------+-----------+---------+-----------+--+
6.2 带条件的笛卡尔积
select t_a.*,t_b.* from t_a inner join t_b on t_a.id = t_b.id;
等同于:
select t_a.*,t_b.* from t_a inner join t_b where t_a.id = t_b.id;
+---------+-----------+---------+-----------+--+
| t_a.id | t_a.name | t_b.id | t_b.name |
+---------+-----------+---------+-----------+--+
| a | 1 | a | xx |
| b | 2 | b | yy |
| d | 4 | d | zz |
+---------+-----------+---------+-----------+--+
6.3 左外连接:左表的数据一定显示,右表的数据有就显示,没有就不显示
select t_a.*,t_b.* from t_a left outer join t_b; -- 注:部分Hive版本要求外连接必须带on条件,此写法可能报错
6.3.1 带条件的左外连接
select t_a.*,t_b.* from t_a left outer join t_b on t_a.id=t_b.id;
+---------+-----------+---------+-----------+--+
| t_a.id | t_a.name | t_b.id | t_b.name |
+---------+-----------+---------+-----------+--+
| a | 1 | a | xx |
| b | 2 | b | yy |
| c | 3 | NULL | NULL |
| d | 4 | d | zz |
+---------+-----------+---------+-----------+--+
6.4 右外连接:右表的数据一定显示,左表的数据有就显示,没有就不显示
select t_a.*,t_b.* from t_a right outer join t_b on t_a.id = t_b.id;
+---------+-----------+---------+-----------+--+
| t_a.id | t_a.name | t_b.id | t_b.name |
+---------+-----------+---------+-----------+--+
| a | 1 | a | xx |
| b | 2 | b | yy |
| d | 4 | d | zz |
| NULL | NULL | e | pp |
+---------+-----------+---------+-----------+--+
6.5 全外连接:左右表的数据都显示,对应不上的就为空
select t_a.*,t_b.* from t_a full outer join t_b on t_a.id=t_b.id;
+---------+-----------+---------+-----------+--+
| t_a.id | t_a.name | t_b.id | t_b.name |
+---------+-----------+---------+-----------+--+
| a | 1 | a | xx |
| b | 2 | b | yy |
| c | 3 | NULL | NULL |
| d | 4 | d | zz |
| NULL | NULL | e | pp |
+---------+-----------+---------+-----------+--+
6.6 左半连接 相当于指定条件的内连接,但是只显示左边的表数据
select a.* from t_a a left semi join t_b b on a.id = b.id;
+-------+---------+--+
| a.id | a.name |
+-------+---------+--+
| a | 1 |
| b | 2 |
| d | 4 |
+-------+---------+--+
7、分区表、PV、UV
例如有这些数据
vim access.log.0804
192.168.33.3,http://www.sina.com/stu,2017-08-04 15:30:20
192.168.33.3,http://www.sina.com/teach,2017-08-04 15:35:20
192.168.33.4,http://www.sina.com/stu,2017-08-04 15:30:20
192.168.33.4,http://www.sina.com/job,2017-08-04 16:30:20
192.168.33.5,http://www.sina.com/job,2017-08-04 15:40:20
vim access.log.0805
192.168.33.3,http://www.sina.com/stu,2017-08-05 15:30:20
192.168.44.3,http://www.sina.com/teach,2017-08-05 15:35:20
192.168.33.44,http://www.sina.com/stu,2017-08-05 15:30:20
192.168.33.46,http://www.sina.com/job,2017-08-05 16:30:20
192.168.33.55,http://www.sina.com/job,2017-08-05 15:40:20
vim access.log.0806
192.168.133.3,http://www.sina.com/register,2017-08-06 15:30:20
192.168.111.3,http://www.sina.com/register,2017-08-06 15:35:20
192.168.34.44,http://www.sina.com/pay,2017-08-06 15:30:20
192.168.33.46,http://www.sina.com/excersize,2017-08-06 16:30:20
192.168.33.55,http://www.sina.com/job,2017-08-06 15:40:20
192.168.33.46,http://www.sina.com/excersize,2017-08-06 16:30:20
192.168.33.25,http://www.sina.com/job,2017-08-06 15:40:20
192.168.33.36,http://www.sina.com/excersize,2017-08-06 16:30:20
192.168.33.55,http://www.sina.com/job,2017-08-06 15:40:20
分区意义:
创建分区表:
create table t_access0(ip string,url string,access_time string)
row format delimited fields terminated by ',';
create table t_access(ip string,url string,access_time string)
partitioned by (day string)
row format delimited fields terminated by ',';
把数据导入分区表中:
load data local inpath '/root/access.log.0804' into table t_access partition(day='0804');
查看表分区信息:
show partitions t_access;
7.1 求pv,每天pv
select url,count(1) cnts from t_access group by url;
select day,url,count(1) cnts from t_access group by day,url;
7.2 求uv, 每天uv
select url,count(distinct ip) cnts from t_access group by url;
select url,count(distinct(ip)) cnts from t_access group by url;
select day,url,count(distinct(ip)) cnts from t_access group by day,url;
7.3 求每个ip访问同一个页面的记录中,最晚的一条
+--------------------------------+----------------+----------------------+--+
| url | ip | _c2 |
+--------------------------------+----------------+----------------------+--+
| http://www.sina.com/excersize | 192.168.33.36 | 2017-08-06 16:30:20 |
| http://www.sina.com/excersize | 192.168.33.46 | 2017-08-06 16:30:20 |
| http://www.sina.com/job | 192.168.33.25 | 2017-08-06 15:40:20 |
| http://www.sina.com/job | 192.168.33.4 | 2017-08-04 16:30:20 |
| http://www.sina.com/job | 192.168.33.46 | 2017-08-05 16:30:20 |
| http://www.sina.com/job | 192.168.33.5 | 2017-08-04 15:40:20 |
| http://www.sina.com/job | 192.168.33.55 | 2017-08-06 15:40:20 |
| http://www.sina.com/pay | 192.168.34.44 | 2017-08-06 15:30:20 |
| http://www.sina.com/register | 192.168.111.3 | 2017-08-06 15:35:20 |
| http://www.sina.com/register | 192.168.133.3 | 2017-08-06 15:30:20 |
| http://www.sina.com/stu | 192.168.33.3 | 2017-08-05 15:30:20 |
| http://www.sina.com/stu | 192.168.33.4 | 2017-08-04 15:30:20 |
| http://www.sina.com/stu | 192.168.33.44 | 2017-08-05 15:30:20 |
| http://www.sina.com/teach | 192.168.33.3 | 2017-08-04 15:35:20 |
| http://www.sina.com/teach | 192.168.44.3 | 2017-08-05 15:35:20 |
+--------------------------------+----------------+----------------------+--+
7.4 求8月6号的pv,uv
7.5 每天,pv排序
+-------+--------------------------------+-------+--+
| day | url | cnts |
+-------+--------------------------------+-------+--+
| 0806 | http://www.sina.com/job | 3 |
| 0806 | http://www.sina.com/excersize | 3 |
| 0806 | http://www.sina.com/register | 2 |
| 0805 | http://www.sina.com/stu | 2 |
| 0805 | http://www.sina.com/job | 2 |
| 0804 | http://www.sina.com/stu | 2 |
| 0804 | http://www.sina.com/job | 2 |
| 0806 | http://www.sina.com/pay | 1 |
| 0805 | http://www.sina.com/teach | 1 |
| 0804 | http://www.sina.com/teach | 1 |
+-------+--------------------------------+-------+--+
8、order by、distribute by、sort by
hive里的order by是全局排序,所有的数据会发送给同一个reduceTask进行处理,在数据量大的时候,该reduce就会超负荷
distribute by先把数据分发到各个reduce中,然后sort by在各个reduce中进行局部排序
cluster by mid 等于 distribute by mid sort by mid
cluster by后面不能跟desc,asc,默认的只能升序
9、Hive MR参数设置
--设置最大的reduce启动个数
set hive.exec.reducers.max=10;
--设置reduce的启动个数
set mapreduce.job.reduces=3;