hive_study

I've been too busy lately, so I'm just dumping these notes as-is; DM me if you need the source data (TvT)

id: row number in the dataset, from 1 to 11376681.
target: whether the user clicked the video; 1 = clicked, 0 = not clicked.
timestamp: timestamp of the user's click on the video; NULL if not clicked.
deviceid: the user's device id.
newsid: the video's id.
guid: the user's registration id.
pos: recommendation position of the video.
app_version: app version.
device_vendor: device manufacturer.
netmodel: network type.
osversion: operating system version.
lng: longitude.
lat: latitude.
device_version: device model/version.
ts: timestamp at which the video was exposed (shown) to the user.

Assignment 1: build an exposure table and a click table from train.csv.
table1:badou_bigdata_exp_table
table2:badou_bigdata_click_table

orders.csv
order_id: order id
user_id: user id
eval_set: which evaluation set the order belongs to (the sample rows below show prior)
order_number: sequence number of the user's orders (e.g. suppose a user has 10 orders: a phone order, a slipper order, and so on)
order_dow: day of week the order was placed
order_hour_of_day: hour of day the order was placed
days_since_prior_order: days since the user's previous order

2539329,1,prior,1,2,08,
2398795,1,prior,2,3,07,15.0
473747,1,prior,3,3,12,21.0
2254736,1,prior,4,4,07,29.0
431534,1,prior,5,4,15,28.0
3367565,1,prior,6,2,07,19.0
550135,1,prior,7,1,09,20.0
3108588,1,prior,8,1,14,14.0
2295261,1,prior,9,1,16,0.0

-- Create the orders table
CREATE TABLE badou.orders(
order_id string,
user_id string,
eval_set string,
order_number string,
order_dow string,
order_hour_of_day string,
days_since_prior_order string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
tblproperties("skip.header.line.count"="1");

load data local inpath '/mnt/hgfs/badou_project_data/project1/orders.csv' into table badou.orders;

-- Truncate the table
truncate table badou.orders;

create table acticle(
sent string)
row format delimited fields terminated by ','
tblproperties("skip.header.line.count"="1");

load data local inpath '/mnt/hgfs/hadoop_test/word_count/acticle.txt' into table badou.acticle;
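This table feeds the word-count exercise; a minimal sketch of the count itself (assuming words in sent are space-separated):

-- Split each sentence into words, then count occurrences of each word
select word, count(1) as cnt
from (select explode(split(sent, ' ')) as word from acticle) t
group by word;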

explain select count(1) from acticle;

-- This didn't seem to take effect; not sure why
hive --service hiveserver2 &

Now let's look at the schema of the exposure table (exp_table):
user_id -- device info
item_id -- item id
exp_ts -- exposure timestamp
page_event -- source: whether the user came from search or from another page

Next is the browsing info, i.e. the click table (brow_table):
Each exposure produces one record.
If a click's timestamp is earlier than an exposure's timestamp, the click does not belong to that exposure.
You can group by user and by exposure-time breakpoints, then slot each click time into its group (see the sketch after this list).
user_id -- device info
item_id -- item id
click_ts -- click timestamp
page_event -- source: whether the user came from search or from another page
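A minimal sketch of that matching step, assuming the exp_table/brow_table schemas above; it attributes each click to the most recent exposure of the same user and item that happened no later than the click:

select c.user_id, c.item_id, c.click_ts, max(e.exp_ts) as exp_ts
from brow_table c
join exp_table e
  on c.user_id = e.user_id and c.item_id = e.item_id
where e.exp_ts <= c.click_ts
group by c.user_id, c.item_id, c.click_ts;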

Create the movies table
create table movies(
uid int,
iid int,
score int,
ts string)
row format delimited fields terminated by '\t';

-- Load the movies data
load data local inpath '/mnt/hgfs/Hivedata/movies/ua.base' into table movies;

-- Find each user's 10 favorite movies

create table badou.movie_tmp as select uid,iid,score,ts,row_number() over(partition by uid order by score desc) as rk from movies;
select uid,iid,score,ts,row_number() over(partition by uid order by score desc) as rk from movies where rk<=10 limit 30;

select uid,iid,score,ts,row_number() over(partition by uid order by score desc) as rk from movies where rk <= 10;

Running the Hive statement at first fails with:
Error during job, obtaining debugging information…
FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exec.mr.MapRedTask
MapReduce Jobs Launched:
Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 7.06 sec HDFS Read: 0 HDFS Write: 0 FAIL
Total MapReduce CPU Time Spent: 7 seconds 60 msec

Some searching suggested setting the following (the last one enables automatic local mode, which sidesteps the failing cluster job for small inputs):
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.mode.local.auto=true;

schematool -initSchema -dbType mysql -verbose

A sample row from train.csv:
87,1,1573364503073,fc2537a764aeebad1d9738bd835830c1,2492493898394443843,7a7e251a3a8a3e51f304558189d920f8,5,2.1.5,OPPO,o,8.1.0,4.9E-324,4.9E-324,PBCM10,1573364373414

create table user_log(
id string,
target string,
`timestamp` string,  -- timestamp is a reserved keyword in Hive, so it must be backquoted
deviceid string,
newsid string,
guid string,
pos string,
app_version string,
device_vendor string,
netmodel string,
osversion string,
lng string,
lat string,
device_version string,
ts string)
row format delimited fields terminated by ','
tblproperties("skip.header.line.count"="1");

load data local inpath '/mnt/hgfs/badou_project_data/project2/train.csv' into table user_log;

create table exp_table as select guid, newsid, ts from user_log;  -- exposure table: user, item, exposure time

8,0,NULL,04813dbae7d339a61f38d648e77b2c28,6167225445325229993,0,2.1.5,OPPO,w,8.1.0,0.0,0.0,PBAM00,1573280222575
8,0,NULL,04813dbae7d339a61f38d648e77b2c28,6167225445325229993,0,2.1.5,OPPO,w,8.1.0,0.0,0.0,PBAM00,1573280222575

When loading, Hive doesn't complain about fields that don't line up with the table definition; it just skips them and leaves the columns unset (NULL).

create table job as select user_id, count(1) as user_cnt from orders group by user_id;

create table badou_bigdata_exp_table as select * from user_log where target = 0;
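The assignment also calls for the click table; a sketch following the same pattern (clicked rows carry target = 1):

create table badou_bigdata_click_table as select * from user_log where target = 1;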

Local mode only kicks in automatically when the job's input is below this threshold (default 134217728 bytes, i.e. 128 MB), so raise it for larger inputs:

SET hive.exec.mode.local.auto.inputbytes.max=<new value>;

SET hive.exec.mode.local.auto.inputbytes.max=2000000000;

=========================================================================================================

Assignment 2:
Table 1:
orders.csv
order_id: order id
user_id: user id
eval_set: which evaluation set the order belongs to (the sample rows show prior)
order_number: sequence number of the user's orders (e.g. suppose a user has 10 orders: a phone order, a slipper order, and so on)
order_dow: day of week the order was placed
order_hour_of_day: hour of day the order was placed
days_since_prior_order: days since the user's previous order

Table 2: order_products__prior.csv
order_id: order id
product_id: product id
add_to_cart_order: position in which the product was added to the cart
reordered: whether the product has been ordered by the user before
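A join of the two tables on order_id is the natural first step for per-user product statistics; a minimal sketch (it assumes the priors copy of order_products__prior created below):

-- Attach user_id to every product row
select o.user_id, p.product_id, p.reordered
from orders o
join priors p on o.order_id = p.order_id
limit 10;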

create table order_products__prior (
order_id string,
product_id string,
add_to_cart_order string,
reordered string)
row format delimited fields terminated by ','
tblproperties("skip.header.line.count"="1");

load data local inpath '/mnt/hgfs/badou_project_data/project1/order_products__prior.csv' into table order_products__prior;

Create the priors table
CREATE TABLE priors(
order_id string,
product_id string,
add_to_cart_order string,
reordered string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
tblproperties("skip.header.line.count"="1");

load data local inpath '/mnt/hgfs/badou_project_data/project1/order_products__prior.csv' into table priors;

set hive.cli.print.header=false;
SET mapreduce.job.reduces=4;
set hive.exec.mode.local.auto=true;

hive --service metastore
hive --service hiveserver2

beeline -u jdbc:hive2://Linux:10000 -n lt

set mapreduce.jobtracker.split.metainfo.maxsize=-1;

select uid,count(*) from movies where score > 2 group by uid;

=============================================================================================================

20230621
Window functions

Create the movies table
create table movies(
uid int,
iid int,
score int,
ts string)
row format delimited fields terminated by '\t';

-- Load the movies data
load data local inpath '/mnt/hgfs/Hivedata/movies/ua.base' into table movies;

-- Find each user's 10 favorite movies

create table badou.movie_tmp as select uid,iid,score,ts,row_number() over(partition by uid order by score desc) as rk from movies;

In Hive, a window function can't be filtered directly with WHERE (the alias rk isn't visible there, so the two queries below fail); wrap it in a subquery instead:

create table badou.movie_tmp as select uid,iid,score,ts,row_number() over(partition by uid order by score desc) as rk from movies;
select uid,iid,score,ts,row_number() over(partition by uid order by score desc) as rk from movies where rk<=10 limit 30;

select uid,iid,score,ts,row_number() over(partition by uid order by score desc) as rk from movies where rk <= 10;

select count(*) as user_num from (select t.* from (select uid,iid,score,ts,row_number() over(partition by uid order by score desc) as rk from movies) t where t.rk <= 10) t1;
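Since badou.movie_tmp above already materializes rk, an equivalent filter can simply read from it (a sketch using that table):

select uid, iid, score, rk from badou.movie_tmp where rk <= 10;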

select distinct(uid) as num from movies;
Why is distinct generally avoided?
By default, distinct pushes all the map output onto a single reducer, which causes data skew.

select distinct(uid) as uid_dis from movies order by uid_dis asc;

Under the hood, distinct is essentially a group by; rewriting it with an explicit group by works (see below).
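A sketch of that rewrite, which lets the deduplication spread across reducers:

-- Equivalent to select distinct(uid) from movies
select uid from movies group by uid;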

-- A worked example for understanding group by
-- Create the department table
10 ACCOUNTING 1700
20 RESEARCH 1800
30 SALES 1900
40 OPERATIONS 1700

create table if not exists dept(
deptno int,
dname string,
loc int
)
row format delimited fields terminated by ' ';

load data local inpath '/mnt/hgfs/Hivedata/atguigu/dept.txt' into table dept;

+--------------+-------------+-----------+
| dept.deptno  | dept.dname  | dept.loc  |
+--------------+-------------+-----------+
| 10           | ACCOUNTING  | 1700      |
| 20           | RESEARCH    | 1800      |
| 30           | SALES       | 1900      |
| 40           | OPERATIONS  | 1700      |
+--------------+-------------+-----------+

Create the employee table

7369 SMITH CLERK 7902 1980-12-17 800.00 20
7499 ALLEN SALESMAN 7698 1981-2-20 1600.00 300.00 30
7521 WARD SALESMAN 7698 1981-2-22 1250.00 500.00 30
7566 JONES MANAGER 7839 1981-4-2 2975.00 20
7654 MARTIN SALESMAN 7698 1981-9-28 1250.00 1400.00 30
7698 BLAKE MANAGER 7839 1981-5-1 2850.00 30
7782 CLARK MANAGER 7839 1981-6-9 2450.00 10
7788 SCOTT ANALYST 7566 1987-4-19 3000.00 20
7839 KING PRESIDENT 1981-11-17 5000.00 10
7844 TURNER SALESMAN 7698 1981-9-8 1500.00 0.00 30
7876 ADAMS CLERK 7788 1987-5-23 1100.00 20
7900 JAMES CLERK 7698 1981-12-3 950.00 30
7902 FORD ANALYST 7566 1981-12-3 3000.00 20
7934 MILLER CLERK 7782 1982-1-23 1300.00 10

create table if not exists emp(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double,
deptno int)
row format delimited fields terminated by ' ';

load data local inpath '/mnt/hgfs/Hivedata/atguigu/emp.txt' into table emp;

+------------+------------+------------+----------+---------------+----------+-----------+-------------+
| emp.empno  | emp.ename  | emp.job    | emp.mgr  | emp.hiredate  | emp.sal  | emp.comm  | emp.deptno  |
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
| 7369       | SMITH      | CLERK      | 7902     | 1980-12-17    | 800.0    | 20.0      | NULL        |
| 7499       | ALLEN      | SALESMAN   | 7698     | 1981-2-20     | 1600.0   | 300.0     | 30          |
| 7521       | WARD       | SALESMAN   | 7698     | 1981-2-22     | 1250.0   | 500.0     | 30          |
| 7566       | JONES      | MANAGER    | 7839     | 1981-4-2      | 2975.0   | 20.0      | NULL        |
| 7654       | MARTIN     | SALESMAN   | 7698     | 1981-9-28     | 1250.0   | 1400.0    | 30          |
| 7698       | BLAKE      | MANAGER    | 7839     | 1981-5-1      | 2850.0   | 30.0      | NULL        |
| 7782       | CLARK      | MANAGER    | 7839     | 1981-6-9      | 2450.0   | 10.0      | NULL        |
| 7788       | SCOTT      | ANALYST    | 7566     | 1987-4-19     | 3000.0   | 20.0      | NULL        |
| 7839       | KING       | PRESIDENT  | NULL     | 5000.00       | 10.0     | NULL      | NULL        |
| 7844       | TURNER     | SALESMAN   | 7698     | 1981-9-8      | 1500.0   | 0.0       | 30          |
| 7876       | ADAMS      | CLERK      | 7788     | 1987-5-23     | 1100.0   | 20.0      | NULL        |
| 7900       | JAMES      | CLERK      | 7698     | 1981-12-3     | 950.0    | 30.0      | NULL        |
| 7902       | FORD       | ANALYST    | 7566     | 1981-12-3     | 3000.0   | 20.0      | NULL        |
| 7934       | MILLER     | CLERK      | 7782     | 1982-1-23     | 1300.0   | 10.0      | NULL        |
+------------+------------+------------+----------+---------------+----------+-----------+-------------+

Note the shifted fields: emp.txt rows that omit comm get their deptno value parsed into the comm column (and deptno becomes NULL), since the file is space-delimited and Hive maps fields purely by position.

-- Compute the average salary of each department in emp
select t.deptno, avg(t.sal) avg_sal from emp t group by t.deptno;

-- Compute the highest salary for each job within each department in emp
select deptno,job,max(sal) from emp group by deptno,job;
select t.deptno, t.job, max(t.sal) max_sal from emp t group by t.deptno, t.job;

-- JOIN statements
-- Join emp and dept where the department numbers match; return employee number, employee name, and department name
select emp.empno,emp.ename,dept.dname,dept.deptno from emp join dept on emp.deptno = dept.deptno;

select e.empno, e.ename, d.deptno, d.dname from emp e join dept d on e.deptno = d.deptno;
select e.empno, e.ename, d.deptno, d.dname from dept d join emp e on e.deptno = d.deptno;

-- Inner join: only rows with a match on the join condition in both tables are kept
select e.empno, e.ename, d.deptno from emp e join dept d on e.deptno = d.deptno;
-- Left outer join: all records from the table on the left of the JOIN that satisfy the WHERE clause are returned
select e.empno, e.ename, d.deptno from emp e left join dept d on e.deptno = d.deptno;
-- Right outer join: all records from the table on the right of the JOIN that satisfy the WHERE clause are returned
select e.empno, e.ename, d.deptno from emp e right join dept d on e.deptno = d.deptno;
-- Full outer join: returns all records from both tables that satisfy the WHERE clause;
-- where either table has no matching value for the join key, NULL is substituted
select e.empno, e.ename, d.deptno from emp e full join dept d on e.deptno = d.deptno;

-- Multi-table joins: joining n tables requires at least n-1 join conditions; for example, joining three tables requires at least two
-- Data to prepare:
1700 Beijing
1800 London
1900 Tokyo

+--------------+-------------+-----------+
| dept.deptno  | dept.dname  | dept.loc  |
+--------------+-------------+-----------+
| 10           | ACCOUNTING  | 1700      |
| 20           | RESEARCH    | 1800      |
| 30           | SALES       | 1900      |
| 40           | OPERATIONS  | 1700      |
+--------------+-------------+-----------+

create table if not exists location(
loc int,
loc_name string
)
row format delimited fields terminated by ' ';

load data local inpath '/mnt/hgfs/Hivedata/atguigu/location.txt' into table location;

-- Multi-table join query
SELECT e.ename, d.dname, l.loc_name
FROM emp e
JOIN dept d
ON d.deptno = e.deptno
JOIN location l
ON d.loc = l.loc;

In most cases, Hive launches one MapReduce job for each pair of JOIN operands. In this example it first starts a MapReduce job to join table e with table d, then starts a second MapReduce job to join the output of the first job with table l. (Had all tables been joined on the same key, Hive could have merged the joins into a single MapReduce job; here the keys differ, deptno vs. loc, so two jobs are needed.)
