用户行为日志user_log.csv的字段依次为:user_id,item_id,cat_id,merchant_id,brand_id,month,day,action,age_range,gender,province。基于该日志完成以下任务:
(1)查看user_log表数据结构
(2)查看user_log表简单数据结构
(3)查看前10条交易日志的商品品牌
(4)查询前20个交易日志中购买商品时的时间和商品的种类
(5)用聚合函数count()计算出表内共有多少行数据
(6)在函数内部加上distinct,查出user_id不重复的数据有多少条
(7)排除顾客刷单(查询不重复的数据)
(8)查询双11当天有多少人购买了商品
(9)品牌2661,当天购买此品牌商品的数量
(10)查询多少用户当天点击了2661品牌的该店
(11)查询双十一当天男女购买商品比例
(12)查询某一天在该网站购买商品超过5次的用户id
(13)创建姓名缩写表,其中字段数大于4个,并通过查询插入数据,最后显示姓名缩写表的数据
# Install unzip, then stage and unpack the raw archive under
# /usr/local/dbtaobao/dataset.
yum install unzip
# -p also creates the missing parent directory and does not fail when the
# directory already exists, so this step can be re-run.
mkdir -p /usr/local/dbtaobao/dataset
cp -r /mnt/hgfs/data_format.zip /usr/local/dbtaobao/dataset/
cd /usr/local/dbtaobao/dataset/
unzip data_format.zip
# Show the first five lines of the log.
head -5 user_log.csv
# Delete line 1 of the file in place (presumably the CSV header row - confirm).
# NOTE(review): running this twice removes a data row.
sed -i '1d' user_log.csv
提取user_log.csv中日期为11月11日的前10000条数据,并存放于small_user_log.csv中(以下为predeal.sh脚本的内容):
# predeal.sh: copy at most 10000 rows whose 6th and 7th comma-separated
# fields both equal 11 from the input file to the output file.
#   $1 - path of the input CSV
#   $2 - path of the output CSV (overwritten)
infile=$1
outfile=$2
awk -F "," 'BEGIN{
    id=0;
}
{
    # Use == to compare. A single = assigns 11 to both fields, which is
    # always true and overwrites the values of every row.
    if($6==11 && $7==11){
        id=id+1;
        # NOTE(review): 12 fields are printed; if the input only has 11
        # columns this emits a trailing empty field - confirm.
        print $1","$2","$3","$4","$5","$6","$7","$8","$9","$10","$11","$12
        # Stop reading as soon as 10000 matching rows have been written.
        if(id==10000){
            exit
        }
    }
}' "$infile" > "$outfile"
# Make the script executable, then run it on user_log.csv to produce
# small_user_log.csv. The script must be invoked by its real file name,
# predeal.sh.
chmod +x ./predeal.sh
./predeal.sh ./user_log.csv ./small_user_log.csv
# Run start-all.sh (presumably brings up the Hadoop daemons - confirm), then
# upload the extracted sample to HDFS and print its first ten lines.
start-all.sh
hdfs_dir=/dbtaobao/dataset/user_log
hdfs dfs -mkdir -p "$hdfs_dir"
hdfs dfs -put /usr/local/dbtaobao/dataset/small_user_log.csv "$hdfs_dir"
hdfs dfs -cat "$hdfs_dir/small_user_log.csv" | head -n 10
-- Create the dbtaobao database and an external table over the comma-delimited
-- text data stored in the HDFS directory /dbtaobao/dataset/user_log.
-- "if not exists" keeps both statements re-runnable.
create database if not exists dbtaobao;

create external table if not exists dbtaobao.user_log (
    user_id     int,
    item_id     int,
    cat_id      int,
    merchant_id int,
    brand_id    int,
    month       string,
    day         string,
    action      int,     -- queries in these notes treat 2 as purchase and 0 as click
    age_range   int,
    gender      int,     -- results in these notes label 0 as female and 1 as male
    province    string
)
comment 'Welcome to Alex dblab, now create dbtaobao.user_log!'
row format delimited fields terminated by ','
stored as textfile
location '/dbtaobao/dataset/user_log';

-- Make dbtaobao the current database for the unqualified queries that follow.
use dbtaobao;
-- Quick preview of ten rows (no order by, so which rows come back is arbitrary).
select * from user_log limit 10;
-- (1) Full table definition.
show create table user_log;
-- (2) Short form: column names and types only.
desc user_log;
-- (3) Brand of ten log rows.
select brand_id from user_log limit 10;
-- (4) Date and goods category of twenty log rows. The task asks for the
-- category, so this selects cat_id rather than brand_id.
select month, day, cat_id from user_log limit 20;
-- (5) Total number of rows in the table.
select count(*) from user_log;
Result : 10000
-- (6) Number of different user_id values in the table.
select
    count(distinct user_id)
from user_log;
Result : 358
-- (7) Number of different rows, comparing every column of the table.
select
    count(distinct
        user_id,
        item_id,
        cat_id,
        merchant_id,
        brand_id,
        month,
        day,
        action,
        age_range,
        gender,
        province
    )
from user_log;
Result : 9944
-- (8) Number of different users with at least one purchase row (action = 2).
-- action is an int column, so it is compared with an int literal instead of
-- relying on implicit string-to-number conversion.
select count(distinct user_id)
from user_log
where action = 2;
Result : 358
-- (9) Number of purchase rows (action = 2) for brand 2661.
-- brand_id and action are int columns, so int literals are used.
select count(*)
from user_log
where brand_id = 2661
  and action = 2;
Result : 3
-- (10) Number of different users with a click row (action = 0) for brand 2661.
-- brand_id and action are int columns, so int literals are used.
select count(distinct user_id)
from user_log
where brand_id = 2661
  and action = 0;
Result : 1
-- (11) Different purchasing users (action = 2) per gender value; the two
-- counts are the inputs of the male/female ratio.
-- gender and action are int columns, so int literals are used.
-- First query: gender = 0.
select count(distinct user_id)
from user_log
where gender = 0
  and action = 2;
-- Second query: gender = 1.
select count(distinct user_id)
from user_log
where gender = 1
  and action = 2;
Result : 238 (女)
Result : 214 (男)
男女比例 = 214 / 238 ≈ 89.916%
-- (12) Users with more than five purchase rows (action = 2).
-- The where clause already keeps only purchase rows, so count(*) is the
-- purchase count per user. count(action = 2) would count every non-null row
-- whether or not the comparison is true, which only gives the right answer
-- by accident of the filter.
select user_id
from user_log
where action = 2
group by user_id
having count(*) > 5;
Result :
user_id
1321
6058
16464
18378
23786
26516
32569
35260
41494
47958
55440
61703
69247
70816
71744
84400
106446
106629
153790
161778
171909
173427
179194
186568
188977
196638
203651
211273
212058
212504
217844
219316
234456
242845
249869
251260
256190
261596
270040
272775
274559
278823
278884
283204
284990
289429
310348
310632
320313
328230
330576
332670
333389
345251
356220
356408
366342
370679
378206
379005
389295
396129
407719
409280
422917
-- (13) Create table gr with five columns, fill it from user_log, and show
-- ten of its rows.
-- NOTE(review): the table is external but no location clause is given -
-- confirm where Hive stores its data and whether a managed table was intended.
create external table dbtaobao.gr (
    user_id   int,
    item_id   int,
    age_range int,
    gender    int,
    province  string
)
row format delimited fields terminated by ','
stored as textfile;

insert into table gr
select
    user_id,
    item_id,
    age_range,
    gender,
    province
from user_log;

select * from gr limit 10;