-- Raw access-log staging table. Each input line is one record and fields
-- are separated by a tab. All columns are plain strings.
create table access_log (
    ip      string,
    date    string,
    address string
)
row format delimited
    fields terminated by '\t'
    lines terminated by '\n';

-- LOCAL: the path is read from the filesystem of the machine running the
-- Hive client, not from HDFS. INTO (without OVERWRITE) appends to the table.
load data local inpath '/opt/datas/hive-access.log' into table access_log;
/**
 * Hive UDF that rewrites a timestamp string from the pattern
 * {@code dd/MMM/yyyy:HH:mm:ss} to {@code yyyy-MM-dd HH:mm:ss}.
 */
public class transforDate extends UDF {

    /**
     * Reformats one timestamp value.
     *
     * @param text the raw timestamp; may be {@code null}, and may contain
     *             double-quote characters, which are stripped before parsing
     * @return a new {@code Text} holding the reformatted timestamp, or
     *         {@code null} when the input is {@code null} or blank once the
     *         quotes have been removed
     * @throws ParseException if the cleaned value does not start with a
     *                        timestamp matching {@code dd/MMM/yyyy:HH:mm:ss}
     */
    public Text evaluate(Text text) throws ParseException {
        // Guard first: calling toString() on a NULL column value would throw.
        if (text == null) {
            return null;
        }

        // Remove the double quotes. They were previously replaced by "-",
        // which the input pattern can never match and which a lenient parser
        // may read as a negative day-of-month.
        String cleaned = text.toString().replace("\"", "");
        if (StringUtils.isBlank(cleaned)) {
            return null;
        }

        // "MMM" is a month *name*; pin the locale so English abbreviations
        // such as "Sep" parse regardless of the JVM's default locale.
        SimpleDateFormat inputFormat =
                new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", java.util.Locale.ENGLISH);
        SimpleDateFormat outputFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

        Date parsed = inputFormat.parse(cleaned);

        // Return a fresh object instead of overwriting the caller's argument.
        return new Text(outputFormat.format(parsed));
    }
}
添加jar包到hive中 (在bin/hive中操作)
-- Add the jar to the current Hive session so classes inside it can be
-- referenced (presumably the build artifact that packages the transforDate
-- UDF; confirm the path matches your build output).
add jar /opt/cdh5.7.6/hive-1.1.0-cdh5.7.6/hadoop-1.0-SNAPSHOT.jar;
创建函数:create function 函数名 as '类的路径(全限定类名)'
-- Bind the UDF class to a function name for this session only (TEMPORARY).
-- The class name must be a single unbroken string literal in ASCII quotes.
create temporary function transforDate0720 as 'com.huadian.bigdata.test0720.transforDate';
#将原表格中的日期类型转化成需要的格式
(这一步可不用,可在SQL语句中直接使用function名称对时间字段进行转换:
select transforDate0720(date) from …
)
-- Materialise a copy of access_log in which the date column has been passed
-- through the transforDate0720 function; ip and address are copied unchanged.
create table hive_access_log1 as
select
    src.ip,
    transforDate0720(src.date) as date,
    src.address
from access_log src;
1.统计pv
-- Page views per day. The day key is the first 10 characters of the date
-- string. count(address) counts only rows whose address is not NULL.
-- A column alias is not allowed inside GROUP BY, so the grouping expression
-- is repeated without "as date".
select
    substr(date, 1, 10) as date,
    count(address)      as pv
from hive_access_log1
group by substr(date, 1, 10);
结果:
2.统计注册人数:
-- Registrations per day: rows whose address contains the registration URL
-- fragment. instr() returns the 1-based position of the match, or 0 if absent.
-- The search string must be in ASCII quotes; typographic quotes do not parse.
select
    substr(date, 1, 10) as date,
    count(1)            as register
from hive_access_log1
where instr(address, 'member.php?mod=register') > 0
group by substr(date, 1, 10);
结果:
3. 统计独立IP数
-- Number of distinct client IPs seen on each day.
select
    substr(log.date, 1, 10) as date,
    count(distinct log.ip)  as ip
from hive_access_log1 log
group by substr(log.date, 1, 10);
#4.统计跳出数
-- "Jump" count per day: the number of IPs that produced exactly one row
-- on that day.
select
    per_ip.date      as date,
    count(per_ip.ip) as jump
from (
    -- One row per (day, ip) with that IP's hit count for the day.
    select
        substr(date, 1, 10) as date,
        ip,
        count(ip)           as ip_count
    from hive_access_log1
    group by substr(date, 1, 10), ip
) per_ip
where per_ip.ip_count = 1
-- per_ip.date is already at most 10 characters, so it is used directly as the key.
group by per_ip.date;
合并:
(4张表进行left join)
-- Daily summary: one row per day with page views, registrations, distinct
-- IPs and "jump" count (IPs with exactly one hit that day).
--
-- Each subquery aggregates hive_access_log1 by day, and the results are
-- left-joined to the page-view set on the day key. Every join needs its ON
-- clause: without it each day is paired with every day of the other result.
-- register and jump come from filtered subqueries, so a day may have no
-- matching row; coalesce() reports those days as 0 instead of NULL.
select
    t_a.date                  as date,
    t_a.pv                    as pv,
    coalesce(t_b.register, 0) as register,
    t_c.ip                    as ip,
    coalesce(t_d.jump, 0)     as jump
from (
    -- Page views per day.
    select
        substr(date, 1, 10) as date,
        count(address)      as pv
    from hive_access_log1
    group by substr(date, 1, 10)
) t_a
left join (
    -- Registration hits per day.
    select
        substr(date, 1, 10) as date,
        count(1)            as register
    from hive_access_log1
    where instr(address, 'member.php?mod=register') > 0
    group by substr(date, 1, 10)
) t_b
    on t_a.date = t_b.date
left join (
    -- Distinct IPs per day.
    select
        substr(date, 1, 10) as date,
        count(distinct ip)  as ip
    from hive_access_log1
    group by substr(date, 1, 10)
) t_c
    on t_a.date = t_c.date
left join (
    -- IPs with exactly one hit per day.
    select
        t1.date      as date,
        count(t1.ip) as jump
    from (
        select
            substr(date, 1, 10) as date,
            ip,
            count(ip)           as ip_count
        from hive_access_log1
        group by substr(date, 1, 10), ip
    ) t1
    where t1.ip_count = 1
    group by t1.date
) t_d
    on t_a.date = t_d.date;
结果:
# Export tab-delimited files from the HDFS directory into a MySQL table
# using 2 parallel mappers.
# Options must start with two ASCII hyphens, and each continued line needs a
# trailing backslash so the shell treats the command as one invocation.
# NOTE(review): --table/--columns (hive_access_log: ip,date,address) do not
# obviously match the export directory name (tb_pageview_result); confirm
# that the files in that directory have exactly these three columns.
# NOTE(review): a password on the command line is visible in shell history
# and process listings; consider -P or --password-file instead.
bin/sqoop export \
    --connect jdbc:mysql://bigdata-hpsk01.huadian.com/default \
    --username root \
    --password 123456 \
    --table hive_access_log \
    --columns ip,date,address \
    --export-dir /user/hive/warehouse/db_0708.db/tb_pageview_result \
    --num-mappers 2 \
    --input-fields-terminated-by '\t'