大数据Hive学习案例(3)——基于新浪微博的日志数据分析

下方有数据可免费下载

目录

  • 原始数据
  • 数据仓库构建
  • 统计需求
    • 1.微博总量和独立用户数
    • 2.用户所有微博被转发的总数,输出前3个用户
    • 3.被转发次数最多的前3条微博,输出用户id
    • 4.每个用户发布的微博总数,存储到临时表
    • 5.统计带图片的微博数
    • 6.统计使用iphone发微博的独立用户数
    • 7.微博中评论次数小于1000的用户id和数据来源,放入视图
    • 8.统计上条视图中数据来源“ipad客户端”的用户数目
  • 特殊需求
    • 1.Hive的UDF应用一
    • 2.Hive的UDF应用二
  • 数据ETL

数据下载请点击我,提取码:v3rn,觉得有用希望您能点一个赞哦。

原始数据

大数据Hive学习案例(3)——基于新浪微博的日志数据分析_第1张图片

文件夹中含有多个json文件,每个文件中的信息如上图所示。

总共19个字段
beCommentWeiboId 是否评论
beForwardWeiboId 是否是转发微博
catchTime 抓取时间
commentCount 评论次数
content 内容
createTime 创建时间
info1 信息字段1
info2信息字段2
info3信息字段3
mlevel no sure
musicurl 音乐链接
pic_list 照片列表(可以有多个)
praiseCount 点赞人数
reportCount 转发人数
source 数据来源
userId 用户id
videourl 视频链接
weiboId 微博id
weiboUrl 微博网址

数据仓库构建

create database if not exists weibo;

use weibo;

create external table weibo(json string)
location '/weibo';
create external table weibo_aaa(json string)
location '/aaa';

*加载数据*
hadoop fs -put ./weibo/* /weibo
hadoop fs -ls /weibo/

select * from weibo limit 3;

统计需求

1.微博总量和独立用户数

select count(*) from weibo;
Total MapReduce CPU Time Spent: 16 seconds 990 msec
OK
1451868
Time taken: 41.665 seconds, Fetched: 1 row(s)
select count(distinct(get_json_object(a.j,'$.userId')))
from
(select substring(json,2,length(json)-2) as j from weibo) a;
Total MapReduce CPU Time Spent: 47 seconds 400 msec
OK
78540
Time taken: 47.451 seconds, Fetched: 1 row(s)

2.用户所有微博被转发的总数,输出前3个用户

select b.id,sum(b.cnt) as bsum
from
(select get_json_object(a.j,'$.userId') as id,get_json_object(a.j,'$.reportCount') as cnt
from (select substring(json,2,length(json)-2) as j from weibo) a) b 
group by b.id 
order by bsum desc 
limit 3;
Total MapReduce CPU Time Spent: 56 seconds 720 msec
OK
1793285524	7.6454805E7
1629810574	7.3656898E7
2803301701	6.8176008E7
Time taken: 72.604 seconds, Fetched: 3 row(s)

3.被转发次数最多的前3条微博,输出用户id

select get_json_object(a.j,'$.userId') as id,cast(get_json_object(a.j,'$.reportCount') as INT) as cnt 
from 
(select substring(json,2,length(json)-2) as j from weibo) a 
order by cnt desc limit 3;
Total MapReduce CPU Time Spent: 52 seconds 560 msec
OK
2202387347	2692012
2202387347	2692012
2202387347	2692012
Time taken: 51.86 seconds, Fetched: 3 row(s)

4.每个用户发布的微博总数,存储到临时表

create table weibo_uid_wbcnt(
userid string,
wbcnt int
)
row format delimited
fields terminated by '\t';
insert overwrite table weibo_uid_wbcnt select get_json_object(a.j,'$.userId'),count(1)
from
(select substring(json,2,length(json)-2) as j from weibo) a
group by get_json_object(a.j,'$.userId');

select * from weibo_uid_wbcnt limit 10;
1000432103	10
1001869117	1
1002133091	2
1002282395	2
1002737912	2
1002861732	1
1002906354	2
1002909672	3
1003106904	1
1003669351	1
Time taken: 0.062 seconds, Fetched: 10 row(s)

5.统计带图片的微博数

select count(1)
from 
(select substring(json,2,length(json)-2) as j from weibo) a 
where get_json_object(a.j,'$.pic_list') like '%http%';
Total MapReduce CPU Time Spent: 41 seconds 720 msec
OK
750512
Time taken: 45.552 seconds, Fetched: 1 row(s)

6.统计使用iphone发微博的独立用户数

select count(distinct get_json_object(a.j,'$.userId'))
from 
(select substring(json,2,length(json)-2) as j from weibo) a 
where lower(get_json_object(a.j,'$.source')) like '%iphone%';
Total MapReduce CPU Time Spent: 42 seconds 690 msec
OK
936
Time taken: 45.606 seconds, Fetched: 1 row(s)

7.微博中评论次数小于1000的用户id和数据来源,放入视图

create view weibo_view as
select get_json_object(a.j,'$.userId') as id,get_json_object(a.j,'$.source') as source
from 
(select substring(json,2,length(json)-2) as j from weibo) a 
where get_json_object(a.j,'$.commentCount')<1000;

select * from weibo_view limit 10;
2989711735	
1087770692	iPad客户端
1390470392	
1390470392	
1498502972	
1087770692	iPad客户端
1589706710	
1087770692	iPad客户端
1087770692	iPad客户端
1589706710	
Time taken: 0.414 seconds, Fetched: 10 row(s)

8.统计上条视图中数据来源“ipad客户端”的用户数目

select count(distinct id) as cnt from weibo_view where source='iPad客户端';
Total MapReduce CPU Time Spent: 56 seconds 750 msec
OK
537
Time taken: 57.111 seconds, Fetched: 1 row(s)

特殊需求

1.Hive的UDF应用一

将微博的点赞数和转发数求和,降序,取前10条。

public class DemoTest1 extends UDF {
    public Integer evaluate(Integer num1,Integer num2){
        try{
            return num1+num2;
        }catch (Exception e){
            return null;
        }
    }
}

create temporary function wb as 'DemoTest1';
select wb(cast(get_json_object(a.j,'$.praiseCount') as int),cast(get_json_object(a.j,'$.reportCount') as int)) as cnt
from
(select substring(json,2,length(json)-2) as j from weibo) a 
order by cnt desc limit 10;
Total MapReduce CPU Time Spent: 59 seconds 10 msec
OK
2692013
2692013
2692013
2692013
2692013
2692013
2692013
2692010
2692010
2692009
Time taken: 62.921 seconds, Fetched: 10 row(s)

2.Hive的UDF应用二

用户微博内容中出现iphone关键词的最大次数

public class DemoTest2 extends UDF {
    public int evaluate(String content,String word){
        int count = 0;
        if(content != null&&content.length()>0){
            String[] array = content.split(word);
            count = array.length-1;
        }
        return count;
    }
}

create temporary function wcount as 'DemoTest2';
select b.id,max(b.cnt) as cn
from
(select get_json_object(a.j,'$.userId') as id,wcount(get_json_object(a.j,'$.content'),'iphone') as cnt
from
(select substring(json,2,length(json)-2) as j from weibo) a) b 
group by b.id 
order by cn desc limit 10;
Total MapReduce CPU Time Spent: 1 minutes 4 seconds 570 msec
OK
2003347594	11
1735618041	11
1220291284	7
2096991555	5
1144393351	5
2438243845	5
1136793703	4
1687914673	4
1346470754	4
2668575357	3
Time taken: 85.444 seconds, Fetched: 10 row(s)

数据ETL

在mysql中操作,将hive中的数据导入到mysql
每个用户发布的微博总数,存储到临时表

use sqoop_test;
create table weibo(
userid varchar(255),
wbcnt int
);

bin/sqoop export \
--connect jdbc:mysql://hadoop000:3306/sqoop_test \
--username root \
--password 123456 \
--table weibo \
--export-dir 'hdfs://hadoop000:8020/user/hive/warehouse/weibo.db/weibo_uid_wbcnt' \
--fields-terminated-by '\t';

select * from weibo limit 10;
+------------+-------+
| userid     | wbcnt |
+------------+-------+
| 1837902673 |     1 |
| 1837921322 |    12 |
| 1837934232 |     1 |
| 1837937837 |     1 |
| 1837956335 |     1 |
| 1837963841 |     2 |
| 1837975031 |     1 |
| 1837999323 |     2 |
| 1838068594 |     1 |
| 1838101235 |     1 |
+------------+-------+
10 rows in set (0.00 sec)

你可能感兴趣的:(Hive,大数据,hive,数据仓库,java,udf)