-- Hive study notes: Weibo (microblog) log analysis

-- Weibo log analysis
-- List the databases currently known to the metastore (informational only).
SHOW DATABASES;

-- Create the warehouse database BEFORE switching to it; issuing USE first
-- fails on a clean metastore where weibo_db does not exist yet.
CREATE DATABASE IF NOT EXISTS weibo_db;
USE weibo_db;

-- External table over the raw log files stored under /data/wb.
-- Every row is exposed as a single STRING column named json.
CREATE EXTERNAL TABLE IF NOT EXISTS wb_table (
    json STRING
)
LOCATION '/data/wb';

-- Peek at ten raw rows to confirm the table can be read.
SELECT t.*
FROM wb_table t
LIMIT 10;

-- Data analysis
-- Total number of weibo posts (1451868 when these notes were written).
SELECT COUNT(*)
FROM wb_table;

-- Number of distinct users (78540 when these notes were written).
--
-- SUBSTRING(json, 2, LENGTH(json) - 2) drops the first and last character of
-- each raw row before it is parsed as JSON (presumably every record is wrapped
-- in a pair of brackets -- TODO confirm against the data files).
--
-- Alternative single-level formulation, kept for reference:
--   SELECT COUNT(DISTINCT GET_JSON_OBJECT(t1.js, '$.userId'))
--   FROM (
--       SELECT SUBSTRING(t.json, 2, LENGTH(t.json) - 2) js
--       FROM wb_table t
--   ) t1;
SELECT
    COUNT(t2.dis_uid)
FROM (
    SELECT DISTINCT
        GET_JSON_OBJECT(t1.js, '$.userId') AS dis_uid
    FROM (
        SELECT SUBSTRING(t.json, 2, LENGTH(t.json) - 2) AS js
        FROM wb_table t
    ) t1
) t2;

-- Repost dimension
-- Total number of reposts over all of a user's posts; output the top 10 users.
-- GET_JSON_OBJECT returns a string, so reportCount is cast to BIGINT before it
-- is summed and the ranking is done on the numeric total.
SELECT
    t1.id,
    SUM(t1.reportCount) AS total_report_count
FROM (
    SELECT
        GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.userId') AS id,
        CAST(GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.reportCount') AS BIGINT) AS reportCount
    FROM wb_table t
) t1
GROUP BY t1.id
ORDER BY total_report_count DESC
LIMIT 10;

-- The 10 most-reposted posts; output the author's user id and the repost count.
-- GET_JSON_OBJECT returns a string; without the cast ORDER BY would compare
-- text, ranking '99' above '1000'.
SELECT
    t1.id,
    t1.reportCount
FROM (
    SELECT
        GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.userId') AS id,
        CAST(GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.reportCount') AS BIGINT) AS reportCount
    FROM wb_table t
) t1
ORDER BY t1.reportCount DESC
LIMIT 10;

-- The 10 most-liked posts; output the author's user id and the like count.
-- GET_JSON_OBJECT returns a string; without the cast ORDER BY would compare
-- text, ranking '99' above '1000'.
SELECT
    t1.id,
    t1.praiseCount
FROM (
    SELECT
        GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.userId') AS id,
        CAST(GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.praiseCount') AS BIGINT) AS praiseCount
    FROM wb_table t
) t1
ORDER BY t1.praiseCount DESC
LIMIT 10;

-- Number of posts published by each user; output the 10 most active users.
-- The user id is returned next to its count so that each number can be
-- attributed to a user. COUNT(*) is used so the figures agree with the
-- per-user counts loaded into wb_user_nums later in this script.
SELECT
    t1.id,
    COUNT(*) AS weibo_count
FROM (
    SELECT
        GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.userId') AS id
    FROM wb_table t
) t1
GROUP BY t1.id
ORDER BY weibo_count DESC
LIMIT 10;

-- Number of posts that carry pictures: a post qualifies when its pic_list
-- value contains the text 'http'.
SELECT COUNT(*)
FROM wb_table t
WHERE GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.pic_list') LIKE '%http%';

-- Number of DISTINCT users who posted from an iPhone.
-- The user ids are de-duplicated first; a plain COUNT(*) over the filtered
-- rows would count posts, not users. LIKE is case-sensitive, so the source
-- value is lower-cased to match both 'iphone' and 'iPhone'.
SELECT
    COUNT(u.uid)
FROM (
    SELECT DISTINCT
        GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.userId') AS uid
    FROM wb_table t
    WHERE LOWER(GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.source')) LIKE '%iphone%'
) u;

-- User id and data source of every post with fewer than 1000 comments.
-- commentCount is still returned, as before, and source is added because the
-- heading asks for it. GET_JSON_OBJECT returns a string, so commentCount is
-- cast to BIGINT to make the numeric comparison explicit.
SELECT
    t1.id,
    t1.source,
    t1.commentCount
FROM (
    SELECT
        GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.userId') AS id,
        GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.source') AS source,
        CAST(GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.commentCount') AS BIGINT) AS commentCount
    FROM wb_table t
) t1
WHERE t1.commentCount < 1000;

-- Data ETL
-- Goal: export the query result to MySQL.
-- Turn on map-side aggregation for GROUP BY queries.
set hive.map.aggr = true;
-- Disabled setting, left for reference (property name spelling corrected):
-- set hive.groupby.mapaggr.checkinterval=100000;
-- Tell Hive the GROUP BY keys are skewed so it can optimise the aggregation.
set hive.groupby.skewindata = true;
-- Stage the query result in a table first, then export from that table's
-- storage path (in the author's environment the default location is
-- /usr/local/hive_dw/weibo_db.db/wb_user_nums).
-- IF NOT EXISTS keeps the script re-runnable, matching the other DDL here.
-- Rows are stored as comma-separated text: uid,nums.
CREATE TABLE IF NOT EXISTS wb_user_nums (
    uid  STRING,
    nums INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

-- Replace the contents of wb_user_nums with one row per user id and the
-- number of posts found for that id. COUNT(*) yields BIGINT, so it is cast to
-- INT explicitly to match the nums column.
INSERT OVERWRITE TABLE wb_user_nums
SELECT
    t1.id,
    CAST(COUNT(*) AS INT)
FROM (
    SELECT
        GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.userId') AS id
    FROM wb_table t
) t1
GROUP BY t1.id;

-- You may also be interested in: (hive, hive, learning, hadoop)