# Shell commands (run in a terminal, not inside Hive).
# Run the HDFS and YARN start scripts (assumes both scripts are on PATH — TODO confirm).
start-dfs.sh
start-yarn.sh
# Start the metastore service first, then the hiveserver2 service.
# Each is launched in the background via nohup; stdout and stderr are appended to a file under logs/.
nohup bin/hive --service metastore >> logs/metastore.log 2>&1 &
nohup bin/hive --service hiveserver2 >> logs/hiveserver2.log 2>&1 &
-- SELECT [ALL | DISTINCT] select_expr, select_expr, ...
-- FROM table_reference
-- [WHERE where_condition]
-- [GROUP BY col_list]
-- [HAVING having_condition]
-- [ORDER BY col_list]
-- [CLUSTER BY col_list
--   | [DISTRIBUTE BY col_list] [SORT BY col_list]
-- ]
-- [LIMIT number]
-- Create the tutorial database and switch to it.
-- IF NOT EXISTS keeps this setup re-runnable instead of failing when the
-- database or table is already present.
CREATE DATABASE IF NOT EXISTS itheima;
USE itheima;

-- Order table stored as tab-delimited text.
-- NOTE(review): the money columns are DOUBLE; DECIMAL would avoid
-- floating-point rounding, but the types are left unchanged here so existing
-- query results stay the same — confirm before changing the schema.
CREATE TABLE IF NOT EXISTS itheima.orders (
    orderId        bigint    COMMENT '订单id',
    orderNo        string    COMMENT '订单编号',
    shopId         bigint    COMMENT '门店id',
    userId         bigint    COMMENT '用户id',
    orderStatus    tinyint   COMMENT '订单状态 -3:用户拒收 -2:未付款的订单 -1:用户取消 0:待发货 1:配送中 2:用户确认收货',
    goodsMoney     double    COMMENT '商品金额',
    deliverMoney   double    COMMENT '运费',
    totalMoney     double    COMMENT '订单金额(包括运费)',
    realTotalMoney double    COMMENT '实际订单金额(折扣后金额)',
    payType        tinyint   COMMENT '支付方式,0:未知;1:支付宝,2:微信;3、现金;4、其他',
    isPay          tinyint   COMMENT '是否支付 0:未支付 1:已支付',
    userName       string    COMMENT '收件人姓名',
    userAddress    string    COMMENT '收件人地址',
    userPhone      string    COMMENT '收件人电话',
    createTime     timestamp COMMENT '下单时间',
    payTime        timestamp COMMENT '支付时间',
    totalPayFee    int       COMMENT '总支付金额'
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

-- Upload the data file to the Linux host first, then load it from the local
-- filesystem. OVERWRITE replaces the table contents, so running the load again
-- does not append a second copy of every row.
LOAD DATA LOCAL INPATH '/home/hadoop/itheima_orders.txt'
OVERWRITE INTO TABLE itheima.orders;
-- User table stored as tab-delimited text.
-- IF NOT EXISTS keeps the statement re-runnable when the table already exists.
-- NOTE(review): 'brithday' looks like a typo of 'birthday'; it is kept as-is
-- because renaming the column would break anything that already references it.
CREATE TABLE IF NOT EXISTS itheima.users (
    userId         int,
    loginName      string,
    loginSecret    int,
    loginPwd       string,
    userSex        tinyint,
    userName       string,
    trueName       string,
    brithday       date,
    userPhoto      string,
    userQQ         string,
    userPhone      string,
    userScore      int,
    userTotalScore int,
    userFrom       tinyint,
    userMoney      double,
    lockMoney      double,
    createTime     timestamp,
    payPwd         string,
    rechargeMoney  double
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

-- Load the user data from the local filesystem.
-- OVERWRITE replaces the table contents, so running the load again does not
-- append duplicate rows.
LOAD DATA LOCAL INPATH '/home/hadoop/itheima_users.txt'
OVERWRITE INTO TABLE itheima.users;
-- Every column of every order.
SELECT *
FROM itheima.orders;

-- Project only a few columns.
SELECT
    orderid,
    userid,
    totalmoney
FROM itheima.orders;

-- Total number of rows in the table.
SELECT COUNT(*)
FROM itheima.orders;

-- Orders whose delivery address contains 广东 (Guangdong).
SELECT *
FROM itheima.orders
WHERE useraddress LIKE '%广东%';

-- The single Guangdong order with the largest totalmoney.
SELECT *
FROM itheima.orders
WHERE useraddress LIKE '%广东%'
ORDER BY totalmoney DESC
LIMIT 1;

-- Row count per payment flag (ispay: 0 = unpaid, 1 = paid).
-- This counts orders, not distinct users.
SELECT
    ispay,
    COUNT(*)
FROM itheima.orders
GROUP BY ispay;
-- Among paid orders (ispay = 1), the largest single order amount per user id.
SELECT
    userid,
    MAX(totalmoney)
FROM itheima.orders
WHERE ispay = 1
GROUP BY userid;

-- Same figure reported by user name.
-- The join condition lives in ON (explicit join instead of a comma join) and
-- the row filter stays in WHERE, so a dropped condition cannot silently turn
-- the query into a cross join.
-- NOTE(review): grouping by username alone merges distinct users who share a
-- name; add usr.userId to the GROUP BY if that is not intended.
SELECT
    usr.username,
    MAX(ord.totalmoney)
FROM itheima.orders AS ord
INNER JOIN itheima.users AS usr
    ON ord.userId = usr.userId
WHERE ord.ispay = 1
GROUP BY usr.username;
-- Average order amount for each user id.
SELECT
    userid,
    AVG(totalmoney)
FROM itheima.orders
GROUP BY userid;

-- Average order amount for each user id, keeping only the groups whose
-- average is above 10000 (HAVING filters after aggregation).
SELECT
    userid,
    AVG(totalmoney) AS avg_money
FROM itheima.orders
GROUP BY userid
HAVING avg_money > 10000;
-- Inner join of orders to users to attach each order's username;
-- orders without a matching user row are dropped.
SELECT
    o.orderid,
    o.userid,
    u.username
FROM itheima.orders o
INNER JOIN itheima.users u
    ON o.userid = u.userid;

-- Left outer join: every order is kept, and username is NULL when no user row matches.
SELECT
    o.orderid,
    o.userid,
    u.username
FROM itheima.orders o
LEFT OUTER JOIN itheima.users u
    ON o.userid = u.userid;
-- RLIKE evaluates a Java regular expression and is TRUE when ANY substring of
-- the value matches, so a pattern must be anchored with ^ / $ whenever it is
-- meant to describe the start of the value or the whole value.

-- Orders whose address contains 广东 (Guangdong).
SELECT *
FROM itheima.orders
WHERE useraddress RLIKE '.*广东.*';

-- Orders whose address contains the shape "xx省 xx市 xx区"
-- (two characters before each of 省 / 市 / 区, separated by single spaces).
SELECT *
FROM itheima.orders
WHERE useraddress RLIKE '..省 ..市 ..区';

-- Recipients whose surname is 张, 王 or 邓.
-- Anchored with ^ so the character must be the FIRST character of the name;
-- without the anchor any name merely containing one of them would match.
SELECT *
FROM itheima.orders
WHERE username RLIKE '^[张王邓]\\S*+';

-- Phone numbers of the form 188****0***.
-- Anchored with ^ and $ so the whole value must have this shape rather than
-- just containing it somewhere inside a longer string.
SELECT *
FROM itheima.orders
WHERE userphone RLIKE '^188\\S{4}0[0-9]{3}$';
-- SELECT ...
-- UNION [ALL]
-- SELECT ...
-- Course table stored as tab-delimited text.
-- IF NOT EXISTS keeps the statement re-runnable when the table already exists.
CREATE TABLE IF NOT EXISTS itheima.course (
    c_id   string,
    c_name string,
    t_id   string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

-- Load the course data from the local filesystem.
-- OVERWRITE replaces the table contents, so running the load again does not
-- append duplicate rows (which would distort the UNION / UNION ALL demos).
LOAD DATA LOCAL INPATH '/home/hadoop/course.txt'
OVERWRITE INTO TABLE itheima.course;
-- Basic UNION: combine the rows of two filtered result sets.
SELECT *
FROM itheima.course
WHERE t_id = '周杰轮'
UNION
SELECT *
FROM itheima.course
WHERE t_id = '王力鸿';

-- UNION removes duplicates: the table unioned with itself yields each distinct row once.
SELECT *
FROM itheima.course
UNION
SELECT *
FROM itheima.course;

-- UNION ALL keeps duplicates: every row appears twice.
SELECT *
FROM itheima.course
UNION ALL
SELECT *
FROM itheima.course;

-- UNION inside a FROM-clause subquery: aggregate over the combined rows.
SELECT
    t_id,
    COUNT(*)
FROM (
    SELECT *
    FROM itheima.course
    WHERE t_id = '周杰轮'
    UNION ALL
    SELECT *
    FROM itheima.course
    WHERE t_id = '王力鸿'
) AS teacher_courses
GROUP BY t_id;
-- 语法1,基于随机分桶抽样
-- SELECT ... FROM tbl TABLESAMPLE(BUCKET <x> OUT OF <y> [ON <col_name> | rand()]);
-- 语法2,基于数据块抽样
-- SELECT ... FROM tbl TABLESAMPLE(num ROWS | num PERCENT | num(K|M|G));
-- 桶抽样:TABLESAMPLE(BUCKET x OUT OF y ON(colname | rand())),推荐,完全随机,速度略慢于块抽样,使用分桶表可以加速
-- 块抽样:TABLESAMPLE(num ROWS | num PERCENT | num(K|M|G)),速度快于桶抽样方式,但不随机,只是按照数据顺序从前向后取。
-- 随机桶抽取, 分配桶是有规则的
-- Bucket sampling assigns rows to buckets either by the hash of a column
-- (modulo the bucket count) or completely at random.

-- Bucketing on a column: with all other conditions unchanged, every run
-- returns the same rows.
SELECT
    username,
    orderId,
    totalmoney
FROM itheima.orders TABLESAMPLE (BUCKET 3 OUT OF 10 ON orders.username);

-- Bucketing on rand(): fully random, so every run returns different rows.
SELECT *
FROM itheima.orders TABLESAMPLE (BUCKET 3 OUT OF 10 ON rand());

-- Block sampling reads the data in order, so with unchanged conditions the
-- sampled rows do not change between runs.

-- Take 100 rows.
SELECT *
FROM itheima.orders TABLESAMPLE (100 ROWS);

-- Take 20 percent of the data.
SELECT *
FROM itheima.orders TABLESAMPLE (20 PERCENT);

-- Take 1 KB of data.
SELECT *
FROM itheima.orders TABLESAMPLE (1K);
-- Hive目前可用3个虚拟列:
-- INPUT__FILE__NAME,显示数据行所在的具体文件
-- BLOCK__OFFSET__INSIDE__FILE,显示数据行所在文件的偏移量
-- ROW__OFFSET__INSIDE__BLOCK,显示数据所在HDFS块的偏移量
-- ROW__OFFSET__INSIDE__BLOCK 需要先设置 SET hive.exec.rowoffset=true 才可使用
-- Virtual columns.
-- hive.exec.rowoffset is switched on before the queries below that read
-- ROW__OFFSET__INSIDE__BLOCK.
SET hive.exec.rowoffset=true;

-- Show the source file and offsets next to each order row.
SELECT
    orderid,
    username,
    INPUT__FILE__NAME,
    BLOCK__OFFSET__INSIDE__FILE,
    ROW__OFFSET__INSIDE__BLOCK
FROM itheima.orders;

-- Filter on a virtual column: rows whose offset within the file is below 1000.
SELECT
    *,
    BLOCK__OFFSET__INSIDE__FILE
FROM itheima.orders
WHERE BLOCK__OFFSET__INSIDE__FILE < 1000;

-- Same inspection against itheima.orders_bucket (table not created in this section).
SELECT
    orderid,
    username,
    INPUT__FILE__NAME,
    BLOCK__OFFSET__INSIDE__FILE,
    ROW__OFFSET__INSIDE__BLOCK
FROM itheima.orders_bucket;

-- Row count per underlying data file of itheima.orders_bucket.
SELECT
    INPUT__FILE__NAME,
    COUNT(*)
FROM itheima.orders_bucket
GROUP BY INPUT__FILE__NAME;