sudo mkdir -R /usr/local/bigdatacase/dataset
//这里会提示你输入当前用户(本教程是hadoop用户名)的密码
//下面给hadoop用户赋予针对bigdatacase目录的各种操作权限
cd /usr/local/
sudo chown -R hadoop:hadoop ./bigdatacase
cd bigdatacase
//下面就可以解压缩user.zip文件
cd ~ //表示进入hadoop用户的目录
cd 下载
ls
unzip user.zip -d /usr/local/bigdatacase/dataset
cd /usr/local/bigdatacase/dataset
ls
head -5 raw_user.csv
cd /usr/local/bigdatacase/dataset
//下面删除raw_user中的第1行
//1d表示删除第1行,同理,3d表示删除第3行,nd表示删除第n行
sed -i '1d' raw_user.csv
//下面删除small_user中的第1行
sed -i '1d' small_user.csv
//下面再用head命令去查看文件的前5行记录,就看不到字段名称这一行了
head -5 raw_user.csv
head -5 small_user.csv
添加随机省份字段
略
//这个为小数据集,下面的为大数据集
hive> create database dblab;
hive> create database dblab_big;
//创建表并映射入HDFS中的文件
hive> CREATE EXTERNAL TABLE dblab.bigdata_user(id INT,uid STRING,item_id STRING,behavior_type INT,item_category STRING,visit_date DATE,province STRING) COMMENT 'Welcome to xmu dblab!' ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE LOCATION '/bigdatacase/dataset';
到目前为止就完成了数据的预处理,添加省份,创建Hive数据库,上传到HDFS,并且映射到Hive里面
Hive数据分析就是写SQL语句对刚刚映射的表进行查询、筛选、统计等等一系列操作。
uid | item_id | behavior_type | item_category | visit_date | province | |
---|---|---|---|---|---|---|
1 | 用户ID | 商品ID | 用户行为 | 商品分类 | 产生时间 | 用户地理位置 |
uid | item_id | behavior_type | item_category | visit_date | province | |
---|---|---|---|---|---|---|
1 | 10001082 | 285259775 | 1 | 4076 | 2014-12-08 | 广东 |
2 | 10001082 | 4368907 | 1 | 5503 | 2014-12-08 | 湖南 |
3 | 10001082 | 285259788 | 4 | 4076 | 2014-12-08 | 北京 |
behaviour_type属性1234代表的意思
1 | 2 | 3 | 4 | |
---|---|---|---|---|
1 | 浏览 | 收藏 | 加购物车 | 购买 |
//首先查询表中的数据
select * from bigdata_user limit 10;
//然后查询10号之后
select * from bigdata_user where visit_date >'2014-12-10' limit 10;
//再添加限制 13号之前
select * from bigdata_user where visit_date >'2014-12-10' and visit_date <'2014-12-13' limit 10;
//购买行为
select * from bigdata_user where visit_date >'2014-12-10' and visit_date <'2014-12-13' and behavior_type='4' limit 10;
按购买日期统计用户的购买商品记录数、用户数
//首先搜索出所有信息
select * from bigdata_user limit 10;
//行为日期,搜索出日期
select * from bigdata_user limit 10;
select day(visit_date) from bigdata_user limit 10;
//购买行为的记录数、不同用户数
select count(*) from bigdata_user where behavior_type='4';
select count(distinct uid) from bigdata_user where behavior_type='4';
//按日期统计记录数、用户数
select count(*),day(visit_date) from bigdata_user where behavior_type='4' group by day(visit_date) limit 10;
select count(distinct uid),day(visit_date) from bigdata_user where behavior_type='4' group by day(visit_date) limit 10;
//保存为表格
create table day_count as select count(*),day(visit_date) from bigdata_user where behavior_type='4' group by day(visit_date);
create table day_uid as select count(distinct uid),day(visit_date) from bigdata_user where behavior_type='4' group by day(visit_date);
查询某一天在该网站的购买 4项以上商品的用户id
//12号+购买行为
select * from bigdata_user where behavior_type='4'and visit_date='2014-12-12' limit 10;
//按用户编号分组
select uid from bigdata_user where behavior_type='4'and visit_date='2014-12-12' group by uid limit 10;
//按用户分组统计
select uid,count(*) from bigdata_user where behavior_type='4'and visit_date='2014-12-12' group by uid limit 10;
//12号,购买,4项以上
select uid,count(*) from bigdata_user where behavior_type='4'and visit_date='2014-12-12' group by uid having count(*)>4 limit 10;
select uid,count(*) from bigdata_user where behavior_type='4'and visit_date='2014-12-12' group by uid having count(behavior_type='4')>4 limit 10;
查询出每个商品的购买次数 和购买人数
//查询所有购买商品集合
select * from bigdata_user where behavior_type='4' limit 10;
//查询每个商品的购买次数
select item_id,count(1) buy_number from bigdata_user where behavior_type='4' group by item_id order by buy_number desc limit 10;
//查询每个商品购买的用户数量
select item_id,count(distinct uid) user_number from bigdata_user where behavior_type='4' group by item_id order by user_number desc limit 10;
//查询每个商品购买的总数量和购买的用户数量
select item_id,count(1) buy_number,count(distinct uid) user_number from bigdata_user where behavior_type='4' group by item_id order by buy_number desc limit 10;
//广东每日浏览商品数
select visit_date, count(*) browse_number from bigdata_user where province='广东' and behavior_type='1' group by visit_date order by browse_number desc ;
//广东每日购买览商品数
select visit_date, count(*) buy_number from bigdata_user where province='广东' and behavior_type='4' group by visit_date order by buy_number desc ;
//为以上两个创建单独的表出来
//创建表,每日用户浏览数量
create table browse_user as select visit_date, count(*) browse_number from bigdata_user where province='广东' and behavior_type='1' group by visit_date order by browse_number desc ;
//创建表,每日用户购买数量
create table buy_user as select visit_date, count(*) buy_number from bigdata_user where province='广东' and behavior_type='4' group by visit_date order by buy_number desc ;
//每日浏览购买成功率
//通过左联查询出每日的浏览量和购买量,并且算出每日购买率
select browse_user.visit_date, browse_number,buy_number,buy_number/browse_number buy_rate from browse_user left join buy_user on browse_user.visit_date=buy_user.visit_date;
//查询2019-12-12每个省的浏览数量,购买数量和比例
//a查询2014-12-12每个省的浏览数量
select province,count(1) browse_number from bigdata_user where visit_date='2014-12-12' and behavior_type='1' group by province order by browse_number desc;
//b查询2014-12-12每个省的购买数量
select province,count(1) buy_number from bigdata_user where visit_date='2014-12-12' and behavior_type='4' group by province order by buy_number desc;
//以下语句太长格式为
select a.province,a.browse_number,b.buy_number,两个相除 from (表格a) a
left join (表格b) b on a.province=b.province;
//一条语句
//直接显示每个省浏览数量、购买数量、购买率
select a.province,a.browse_number,b.buy_number,b.buy_number/a.browse_number rate from ( select province,count(1) browse_number from bigdata_user where visit_date='2014-12-12' and behavior_type='1' group by province order by browse_number desc) a left join ( select province,count(1) buy_number from bigdata_user where visit_date='2014-12-12' and behavior_type='4' group by province order by buy_number desc ) b on a.province=b.province;
//该天所有点击行为数量
select count(*) users from bigdata_user where visit_date='2014-12-12' and behavior_type='1';
//该用户的点击量
select count(*) user_number from bigdata_user where visit_date='2014-12-12' and behavior_type='1' and uid=10001082;
//结合上面两个语句,显示用户点击量、当天点击量和比例
select '10001082' uid,a.users,b.user_number,b.user_number/a.users rate from ( select count(*) users from bigdata_user where visit_date='2014-12-12' and behavior_type='1' ) a ,( select count(*) user_number from bigdata_user where visit_date='2014-12-12' and behavior_type='1' and uid=10001082 ) b;
//查询出用户最喜欢的商品集
select item_id,count(1) collect from bigdata_user where behavior_type='2' group by item_id order by collect desc limit 20 ;
//查询出每个省浏览网站的次数
select province,count(behavior_type) count from bigdata_user where behavior_type='1' group by province;
//把结果存入表格
create table browse_count as select province,count(behavior_type) count from bigdata_user where behavior_type='1' group by province;
//查看结果
select * from browse_count;
//查询加入购物车最多的商品
select item_id,count(behavior_type) cart_number from bigdata_user where behavior_type='3' group by item_id order by cart_number desc limit 20;