#Remove the column header (i.e. the first line)
sed '1d' orders.csv
# Inspect the fields (dimensions)
$ desc orders
Output:
order_id string
user_id string
eval_set string
order_number string
order_dow string
order_hour_of_day string
days_since_prior_order string
create table orders
(
order_id string,
user_id string,
eval_set string,
order_number string,
order_dow string,
order_hour_of_day string,
days_since_prior_order string
) row format delimited fields terminated by ','
lines terminated by '\n';
2. How many orders does each user have? [orders] (pv page views > uv unique users; see the count vs count(distinct) sketch after the output)
select user_id, count(order_id) as order_cnt
from orders
group by user_id
order by order_cnt desc
limit 100;
Output:
202272 100
133983 100
172806 100
69995 100
162455 100
67798 100
82420 100
35190 100
67736 100
117890 100
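The pv/uv note above is just count vs count(distinct): counting every order gives a pv-style total, counting distinct users gives a uv-style total. A minimal sketch:
select count(order_id) as pv, --every order counts
count(distinct user_id) as uv --each user counts once
from orders;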
3. For each user: [2. the average of [1. how many products are in each order]] (avg)
Analysis:
$ desc orders
order_id string
user_id string
eval_set string
order_number string
order_dow string
order_hour_of_day string
days_since_prior_order string
$ desc order_products_prior;
order_id string
product_id string (count this per order to get the number of products in an order)
add_to_cart_order string
reordered string
$ select * from order_products_prior limit 10;
order_id product_id add_to_cart_order reordered
5 18569 19 1
5 41176 20 1
5 48366 21 1
5 47209 22 0
5 46522 23 0
5 38693 24 0
5 48825 25 0
5 8479 26 0
Hive SQL:
select user_id, avg(prod_cnt) as avg_prod_cnt
from (
select orders.user_id, prod.prod_cnt from orders
join (
select order_id, count(product_id) as prod_cnt
from order_products_prior
group by order_id) prod
on orders.order_id=prod.order_id
) t
group by user_id
order by avg_prod_cnt desc
limit 100;
Killing subtasks:
① kill job: hadoop job -kill job_1553939296911_0006
② kill application:
yarn application -kill application_1537547243219_0056
- Distribution of each user's orders across the days of the week --pivot
if statement
case when statement
- For each user, the average number of products bought per purchase day
treat the days_since_prior_order field as the purchase-day marker (see task 5 below)
--normalize empty values first:
select if(days_since_prior_order='','0',days_since_prior_order)
as days_since_prior_order
from orders;
select user_id, count(distinct days_since_prior_order) as day_cnt
from orders
group by user_id
How many products are in each order:
select order_id, count(product_id) as prod_cnt
from order_products_prior
group by order_id
- What are each user's 3 favorite products?
① join the tables
select user_id, product_id from orders join order_products_prior pri on orders.order_id=pri.order_id limit 100;
select user_id, concat_ws(',', collect_list(order_id)) as order_ids from orders group by user_id limit 100;
1 2539329,2398795,473747,2254736,431534,3367565,550135,3108588,2295261,2550362,1187899
10 1224907,68288,2115522,83395,1353310,1822501
100 680467,3159209,2443738,2337051,2875733,3302990
1000 3045856,1456368,2700013,2003258,568709,18216,2885089,903956
10000 1492646,1444722,3272690,2152976,859342,103998,3146859,2306339,1765454,377814,1090401,3204347,2077534,1731354,1191621,428409,936532,2300676,236049,715590,2196356,192826,2652468,2600064,12156,1550736,793788,1824260,1044054,2505014,1104652,2646743,2166278,247668,545893,1902338,2327642,2610522,3038094,2856130,675149,1762088,2662079,775362,956424,3007165,858276,957745,664258,2492781,686954,3128885,1946074,611183,3079937,193643,1788987,2864915,842149,1404909,2477429,2999447,2511241,3060036,3268994,584896,343340,3306425,1125006,951827,1990815,2989046,1925217
100000 507791,284691,1016379,1666305,2380440,2159051,1844720,2433517,2104152,576286
select user_id, collect_list(order_id) as order_ids from orders group by user_id limit 100;
1 ["2539329","2398795","473747","2254736","431534","3367565","550135","3108588","2295261","2550362","1187899"]
10 ["1224907","68288","2115522","83395","1353310","1822501"]
100 ["680467","3159209","2443738","2337051","2875733","3302990"]
1000 ["3045856","1456368","2700013","2003258","568709","18216","2885089","903956"]
10000 ["1492646","1444722","3272690","2152976","859342","103998","3146859","2306339","1765454","377814","1090401","3204347","2077534","1731354","1191621","428409","936532","2300676","236049","715590","2196356","192826","2652468","2600064","12156","1550736","793788","1824260","1044054","2505014","1104652","2646743","2166278","247668","545893","1902338","2327642","2610522","3038094","2856130","675149","1762088","2662079","775362","956424","3007165","858276","957745","664258","2492781","686954","3128885","1946074","611183","3079937","193643","1788987","2864915","842149","1404909","2477429","2999447","2511241","3060036","3268994","584896","343340","3306425","1125006","951827","1990815","2989046","1925217"]
100000 ["507791","284691","1016379","1666305","2380440","2159051","1844720","2433517","2104152","576286"]
My thoughts:
- How do you import data from Hive into Redis?
Answer:
- How do you import data from Hive into Redis?
To be filled in.
https://www.fuwuqizhijia.com/redis/201704/67005.html
https://blog.csdn.net/DSLZTX/article/details/50775500
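Until this is written up properly, one common pattern is to export the result to a local file with Hive, then bulk-load that file into Redis outside of Hive; a sketch, with an illustrative path and key layout:
--export user_id -> order_ids to a local directory (path is illustrative)
insert overwrite local directory '/tmp/user_orders'
row format delimited fields terminated by '\t'
select user_id, concat_ws(',', collect_list(order_id)) as order_ids
from orders
group by user_id;
--then, outside Hive, turn each line into a Redis SET command
--and bulk-load it, e.g. with redis-cli --pipe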
use test;
drop table if exists student2;
create table student2(
id int,
name string,
age int,
course array<string>,
body map<string,float>,
address struct<street:string,city:string> --complex-type parameters are illustrative; the original omitted them
)
partitioned by (state string,city string)
row format delimited
fields terminated by '\t'
collection items terminated by ','
map keys terminated by ':'
lines terminated by '\n'
stored as textfile;
--1. Create the orders and order_products_prior tables in Hive
--sed '1d' orders.csv removes the column header
create table orders
(
order_id string,
user_id string,
eval_set string,
order_number string,
order_dow string,
order_hour_of_day string,
days_since_prior_order string
)
row format delimited fields terminated by ',' --'\t'
lines terminated by '\n';
--location '/data/orders' --(orders is a directory; see the external-table sketch below)
--load the data
load data local inpath '/home/badou/Documents/data/order_data/orders.csv'
overwrite into table orders;
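As the location comment above hints, an external table can point at a directory that already holds the data instead of loading a file; a sketch, with an illustrative table name and path:
create external table orders_ext
(
order_id string,
user_id string,
eval_set string,
order_number string,
order_dow string,
order_hour_of_day string,
days_since_prior_order string
)
row format delimited fields terminated by ','
lines terminated by '\n'
location '/data/orders'; --orders is a directory in HDFS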
--order_products_prior schema:
order_id string
product_id string
add_to_cart_order string
reordered string
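The matching DDL and load for order_products_prior, mirroring the orders table (the csv filename is assumed; strip its header with sed '1d' first):
create table order_products_prior
(
order_id string,
product_id string,
add_to_cart_order string,
reordered string
)
row format delimited fields terminated by ','
lines terminated by '\n';
load data local inpath '/home/badou/Documents/data/order_data/order_products_prior.csv'
overwrite into table order_products_prior;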
--2. How many orders does each user have? [orders] (pv page views > uv unique users)
--order_id,user_id
select user_id,count(order_id) as order_cnt
from orders
group by user_id
order by order_cnt desc
limit 10;
--3. For each user: [2. the average of [1. how many products per order]] (avg)
--orders [user, order]; order_products_prior [order, product]: order_id, product_id
--3.1. How many products are in each order
select order_id,count(product_id) as prod_cnt
from order_products_prior
group by order_id;
--3.2
set hive.cli.print.header=true;
select orders.user_id,prod.prod_cnt
from orders
join(
select order_id,count(product_id) as prod_cnt
from order_products_prior
group by order_id)prod
on orders.order_id=prod.order_id
limit 10;
--Three receipts:
-- - 10 prods
-- - 20 prods
-- - 30 prods
--(10+20+30)/3 = 20 (avg)
--user_id: avg([10,20,30]) = 20
--user 1: 100 orders, 1 prod each -> sum 100, avg 1
--user 2: 2 orders, 10+20 prods -> sum 30, avg 15
select user_id, avg(prod_cnt) as avg_prod_cnt
from
(
select orders.user_id,prod.prod_cnt
from orders
join(
select order_id,count(product_id) as prod_cnt
from order_products_prior
group by order_id)prod
on orders.order_id=prod.order_id
)t
group by user_id
order by avg_prod_cnt desc
limit 100;
--Both of these kill subtasks:
-- kill job:
hadoop job -kill job_1537547243219_0058
-- kill application:
yarn application -kill application_1537547243219_0056
select orders.user_id,avg(prod.prod_cnt) as avg_prod_cnt
from orders
join(
select order_id,count(product_id) as prod_cnt
from order_products_prior
group by order_id)prod
on orders.order_id=prod.order_id
group by orders.user_id
limit 10;
--4. Distribution of each user's orders across the days of the week --pivot
set hive.cli.print.header=true;
select
user_id,
sum(case order_dow when '0' then 1 else 0 end) as dow_0,
sum(case order_dow when '1' then 1 else 0 end) as dow_1,
sum(case order_dow when '2' then 1 else 0 end) as dow_2,
sum(case order_dow when '3' then 1 else 0 end) as dow_3,
sum(case order_dow when '4' then 1 else 0 end) as dow_4,
sum(case order_dow when '5' then 1 else 0 end) as dow_5,
sum(case order_dow when '6' then 1 else 0 end) as dow_6
from orders
group by user_id
limit 20;
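The same pivot can be written with if() instead of case when, as listed earlier; an equivalent sketch:
select
user_id,
sum(if(order_dow='0',1,0)) as dow_0,
sum(if(order_dow='1',1,0)) as dow_1,
sum(if(order_dow='2',1,0)) as dow_2,
sum(if(order_dow='3',1,0)) as dow_3,
sum(if(order_dow='4',1,0)) as dow_4,
sum(if(order_dow='5',1,0)) as dow_5,
sum(if(order_dow='6',1,0)) as dow_6
from orders
group by user_id
limit 20;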
--5. For each user, the average number of products bought per purchase day
--[treat days_since_prior_order like a date, e.g. 20181014]
--1. orders: user_id, days_since_prior_order = the day
--normalize empty values: if(days_since_prior_order='','0',days_since_prior_order)
select user_id,count(distinct days_since_prior_order) as day_cnt
from orders
group by user_id;
--2. How many products are in each order
select order_id,count(product_id) as prod_cnt
from order_products_prior
group by order_id;
--plan 1: user_id, prod_cnt, day_cnt (join orders with order_products_prior)
--plan 2: user_id, prod_cnt, dt (days_since_prior_order), then join
--1.
select orders.user_id,sum(prod.prod_cnt)/count(distinct days_since_prior_order) as day_cnt
from orders
join
(select order_id,count(product_id) as prod_cnt
from order_products_prior
group by order_id)prod
on orders.order_id=prod.order_id
group by orders.user_id
limit 100;
--2.
select user_id,avg(prod_cnt) as avg_day_prod_cnt
from(
select orders.user_id,orders.days_since_prior_order,sum(prod.prod_cnt) as prod_cnt
from orders
join
(select order_id,count(product_id) as prod_cnt
from order_products_prior
group by order_id)prod
on orders.order_id=prod.order_id
group by orders.user_id,orders.days_since_prior_order
)t
group by user_id
limit 100;
--Both queries return the same result (the total product count divided by the number of distinct purchase days equals the average of the per-day sums) and run with the same execution logic.
--6. Each user's three favorite products; the final table can have 3 columns, or one string column
--user_id, product_id, prod_buy_cnt: how many times a user bought the same product
select user_id,collect_list(concat_ws('_',product_id,cast(row_num as string))) as top_3_prods
from(
select user_id,product_id,
row_number() over(partition by user_id order by prod_buy_cnt desc) as row_num,
prod_buy_cnt
from(
select orders.user_id,pri.product_id,count(1) as prod_buy_cnt
from orders
join order_products_prior pri
on orders.order_id=pri.order_id
group by orders.user_id,pri.product_id
)t
)tt
where row_num<=3
group by user_id
limit 10;
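For the 3-column layout the task allows, the ranked rows can be pivoted with max(if(...)); a sketch built on the same subqueries:
select user_id,
max(if(row_num=1,product_id,null)) as top_1,
max(if(row_num=2,product_id,null)) as top_2,
max(if(row_num=3,product_id,null)) as top_3
from(
select user_id,product_id,
row_number() over(partition by user_id order by prod_buy_cnt desc) as row_num
from(
select orders.user_id,pri.product_id,count(1) as prod_buy_cnt
from orders
join order_products_prior pri
on orders.order_id=pri.order_id
group by orders.user_id,pri.product_id
)t
)tt
where row_num<=3
group by user_id
limit 10;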
--concat_ws
select user_id,collect_list(order_id) as order_ids
from orders
group by user_id
limit 10;
--data skew: the orders table split by channel (app, m, pc):
--orders_app
--orders_m
--orders_pc
--group by == word count; two mitigations:
--1. combiner
--2. set hive.groupby.skewindata=true;
--   splits one MapReduce job into two MR jobs
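Both mitigations are session settings; a sketch (the sample query is illustrative):
set hive.map.aggr=true; --map-side partial aggregation (the combiner)
set hive.groupby.skewindata=true; --rewrites the group by into two MR jobs:
--MR1 spreads keys randomly and partially aggregates, MR2 aggregates by the real key
select order_dow, count(1) as cnt from orders group by order_dow;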
--multiple MapReduce jobs (the join keys differ):
--a.order_id = b.ord_id
--c.product_id = b.prod_id
--user_id product_desc
--one MapReduce job (the same join key throughout):
--a.order_id = b.ord_id
--a.order_id = c.ord_id
--partition
--a: orders table: user_id, order_id
--b: order-products table: order_id, product
--c: order-type table: order_id, type (which ad channel the order came from)
--loading page
--one MR job
select * from orders
join order_products_prior pri on orders.order_id=pri.order_id
join tmp_order tord on orders.order_id=tord.order_id
--
select * from orders
join order_products_prior pri on orders.order_id=pri.order_id
join tmp_order tord on pri.order_id=tord.order_id
dict ={"order_id":"product_id"}
order_id ,user_id
a: user_id, product_id
for line in sys.stdin:
s = line.split()
user_id=s[0]
order_id = s[1]
product_id = dict.get(order_id)
print('%s\t%s\t%s'%(user_id,order_id,product_id))
Hue, CDH
- nested list: [4,5,[7,8,9,[11,21,23]],6] (cf. flatten below)
val s = "The figure of Irene, never, as the reader may possibly have observed, present, except through the senses of other characters, is a concretion of disturbing Beauty impinging on a possessive world."
s.split(" ")
s.split(" ").length
import scala.io.Source
val lines = Source.fromFile("./The_man_of_property.txt").getLines().toList
lines: List[String]
lines.map(x=>x.split(" "))
res0: List[Array[String]]
lines.map(x=>x.split(" ")).flatten
res2: List[String]
// equivalent to:
lines.flatMap(x=>x.split(" "))
res2: List[String]
The MR map stage:
lines.flatMap(.split(" ")).map(x=>(x,1))
lines.flatMap(.split(" ")).map((_,1))
lines.flatMap(.split(" ")).map(x=>(x,1)).groupBy(x=>x.1)
lines.flatMap(.split(" ")).map(x=>(x,1)).groupBy(._1)
scala.collection.immutable.Map[String,List[(String, Int)]]
key->value
forgotten -> List((forgotten,1), (forgotten,1), (forgotten,1), (forgotten,1), (forgotten,1), (forgotten,1))
MR word count:
lines.flatMap(.split(" ")).map(x=>(x,1)).groupBy(._1).map(x=>(x._1,x._2.length))
lines.flatMap(.split(" "))
.map(x=>(x,1))
.groupBy(._1)
.map(x=>(x._1,x.2.map(._2).sum))
--x.2 List((String,Int))
--List[(Int, Int, Int)] = List((1,2,3), (4,5,6), (7,8,9))
--b.map(._2).sum
//reduce
lines.flatMap(_.split(" ")).map(x=>(x,1)).groupBy(_._1).map(x=>(x._1,x._2.map(_._2).reduce(_+_)))
//sum is equivalent to reduce(_+_):
//reduce over List(1,1,1) computes ((1+1)+1),
//i.e. an accumulator: sum += x
topN
//sort by word frequency and take the top 10
lines.flatMap(.split(" ")).map(x=>(x,1)).groupBy(._1)
.map(x=>(x._1,x.2.size))
.toList
.sortBy(._2)
.reverse
.slice(0,10)
lines.flatMap(.split(" "))
.map(x=>(x,1)).groupBy(._1)
.map(x=>(x._1,x.2.size))
.toList.sortWith(.2>._2)
lines.flatMap(.split(" ")).map((,1))
.groupBy(.1)
.mapValues(.size).toArray
.sortWith(.2>._2)
.slice(0,10)
Next: RDD and DataFrame.