1、组织数据 (需要处理每条数据开头和结尾的中括号)
(1)创建Hive表weibo_json(json string),表只有一个字段,导入所有数据,并验证查询前5条数据
create table weibo_json(json string);
load data local inpath '/root/weibo.txt' into table weibo_json;
select * from weibo_json limit 5;
(2)解析完weibo_json当中的json格式数据到拥有19个字段的weibo表中,写出必要的SQL语句
create table weibo as
select json_tuple(json,
'beCommentWeiboId'
,'beForwardWeiboId'
,'catchTime'
,'commentCount'
,'content'
,'createTime'
,'info1'
,'info2'
,'info3'
,'mlevel'
,'musicurl'
,'pic_list'
,'praiseCount'
,'reportCount'
,'source'
,'userId'
,'videourl'
,'weiboId'
,'weiboUrl'
) as
(beCommentWeiboId
,beForwardWeiboId
,catchTime
,commentCount
,content
,createTime
,info1
,info2
,info3
,mlevel
,musicurl
,pic_list
,praiseCount
,reportCount
,source
,userId
,videourl
,weiboId
,weiboUrl
)
from weibo_json;
2、统计微博总量 和 独立用户数
select count(*) from weibo;
+----------+--+
| _c0 |
+----------+--+
| 1451868 |
+----------+--+
select count(*) as Independent from
(select count(*) from weibo group by userId) tmp;
+--------------+--+
| independent |
+--------------+--+
| 78540 |
+--------------+--+
3、统计用户所有微博被转发的次数之和,输出top5用户,并给出次数
#select source from weibo limit 10;
reportCount
select userId,sum(reportCount) as total from weibo group by userId order by total desc limit 5;
+-------------+--------------+--+
| userid | total |
+-------------+--------------+--+
| 1793285524 | 7.6454805E7 |
| 1629810574 | 7.3656898E7 |
| 2803301701 | 6.8176008E7 |
| 1266286555 | 5.5111054E7 |
| 1191258123 | 5.4808042E7 |
+-------------+--------------+--+
4、统计带图片的微博数
select count(*) as total from weibo where pic_list='[]';
+---------+--+
| total |
+---------+--+
| 701356 |
+---------+--+
5、统计使用iphone发微博的独立用户数
select count(*) as total from
(select count(*) as total from weibo where source='iPhone客户端' group by userId) tmp;
+------+--+
| total |
+------+--+
| 921 |
+------+--+
6、将用户所有微博的点赞人数和转发人数相加求和,并将相加之和降序排列,取前10条记录,输出userid和总次数
select userId,(sum(praiseCount)+sum(reportCount)) as total from weibo group by userId order by total desc limit 10;
+-------------+---------------+--+
| userid | total |
+-------------+---------------+--+
| 1793285524 | 1.14941096E8 |
| 1629810574 | 9.761207E7 |
| 1266286555 | 8.3789422E7 |
| 2803301701 | 7.4208822E7 |
| 1195242865 | 6.9292231E7 |
| 1191258123 | 6.1985742E7 |
| 1197161814 | 5.9093308E7 |
| 2656274875 | 5.2380775E7 |
| 2202387347 | 5.1623117E7 |
| 1195230310 | 4.8321083E7 |
+-------------+---------------+--+
7、统计微博中评论次数小于1000的用户ID与数据来源信息,将其放入视图,然后统计视图中数据来源是”ipad客户端”的用户数目
#创建表接收微博中评论次数小于1000的用户ID与数据来源信息
create table small1000 as
select userId,source from weibo where commentCount<1000 group by userId,source;
#查询数据来源是”ipad客户端”的用户数目
select count(*) as total from small1000 where source='iPad客户端';
+--------+--+
| total |
+--------+--+
| 537 |
+--------+--+
8、统计微博内容中出现”iphone”次数最多的用户,
最终结果输出用户id和次数(注意:该次数是”iphone”的出现次数,
不是出现”iphone”的微博数目)
select userId,count(userid) from weibo where content like '%iphone%' group by userId;
9、求每天发微博次数最多的那个家伙的ID和发微博的条数
#时间函数,原表存储的数据的时间是字符串的时间戳,所以需要将其转成长整型bigint
from_unixtime(cast(createTime as bigint)
#首先获取数据的userId,以及时间并转换为"yyyy-MM-dd"格式
select userId,from_unixtime(cast(createTime as bigint),"yyyy-MM-dd") as Release_time from weibo;
#然后按照时间,用户进行分组查询,获取每日用户发布微博数量,这里定义该查询结果为a
select tmp.Release_time as Release_time,
tmp.userId as userId,
count(tmp.userId) as total
from
(select userId,from_unixtime(cast(createTime as bigint),"yyyy-MM-dd") as Release_time from weibo) tmp
group by tmp.userId,tmp.Release_time;
#从a中按照获取的时间再进行分组查询,获取当日发布微博最大值,这里将查询结果定义为b
select tmp1.Release_time as Release_time,max(tmp1.total) as big from
(select tmp.Release_time as Release_time,
tmp.userId as userId,
count(tmp.userId) as total
from
(select userId,from_unixtime(cast(createTime as bigint),"yyyy-MM-dd") as Release_time from weibo) tmp
group by tmp.userId,tmp.Release_time) tmp1
group by tmp1.Release_time;
#最后b与a进行内连接,以获取导userId
select a.Release_time as Release_time,b.userId as userId,a.big as total from
(select tmp1.Release_time as Release_time,max(tmp1.total) as big from
(select tmp.Release_time as Release_time,
tmp.userId as userId,
count(tmp.userId) as total
from
(select userId,from_unixtime(cast(createTime as bigint),"yyyy-MM-dd") as Release_time from weibo) tmp
group by tmp.userId,tmp.Release_time) tmp1
group by tmp1.Release_time) a
join
(select tmp.Release_time as Release_time,
tmp.userId as userId,
count(tmp.userId) as total
from
(select userId,from_unixtime(cast(createTime as bigint),"yyyy-MM-dd") as Release_time from weibo) tmp
group by tmp.userId,tmp.Release_time) b
on a.Release_time=b.Release_time and a.big=b.total;