实战背景
新闻网站
- 版块
- 新闻页面
- 新用户注册
- 用户跳出
案例需求分析
- 每天每个页面的PV
PV是Page View,是指一个页面被所有用户访问次数的总和,页面每被访问一次就记录1次PV
- 每天每个页面的UV
UV是Unique Visitor(独立访客),是指一个页面被多少个不同的用户访问过,同一个用户访问一次记1次UV,访问多次仍然只记1次UV
- 新用户注册比率
当天注册用户数 / 当天未注册用户的访问数
- 用户跳出率
只浏览了一个页面就离开网站的访问次数 / 网站总访问数(PV)
- 版块热度排行榜
根据每个版块每天被访问的次数,做出一个排行榜
网站日志格式
date timestamp userid pageid section action
日志字段说明
date: 日期,yyyy-MM-dd格式
timestamp: 时间戳
userid: 用户id
pageid: 页面id
section: 新闻版块
action: 用户行为,分两类:浏览页面(view)和注册(register)
模拟数据生成程序
/**
 * Generates a simulated one-day access log for the news-site offline statistics job.
 *
 * <p>Every line holds six fields in the order
 * {@code date, timestamp, userid, pageid, section, action}, separated by
 * {@link #FIELD_SEPARATOR} and terminated by a line feed. Absent values are written
 * as the literal text {@code null}.
 *
 * <p>Records are streamed straight to the output file instead of being accumulated
 * in memory, so the default run of about ten million lines does not need a
 * multi-hundred-megabyte heap.
 */
public class OfflineDataGenerator {

    /** Output file used when no path is given on the command line. */
    private static final String DEFAULT_OUTPUT_PATH = "C:\\Users\\ZJ\\Desktop\\access.log";

    /** Number of page-view records written by default. */
    private static final int DEFAULT_VIEW_COUNT = 10000000;

    /** Number of registration records written by default. */
    private static final int DEFAULT_REGISTER_COUNT = 100000;

    /**
     * Field separator: Ctrl-A (0x01), Hive's default field delimiter. The news table
     * is created without a ROW FORMAT clause, so this is the delimiter Hive splits on.
     */
    private static final char FIELD_SEPARATOR = '\u0001';

    /** News sections a simulated page view can belong to. */
    private static final String[] SECTIONS = new String[] {"country", "international", "sport", "entertainment", "movie", "carton", "tv-show", "technology", "internet", "car"};

    /**
     * Writes the simulated log file.
     *
     * @param args optional overrides: {@code args[0]} output file path,
     *             {@code args[1]} number of view records, {@code args[2]} number of
     *             register records; any missing argument falls back to its default
     * @throws Exception if the output file cannot be opened or written, or if a
     *                   count argument is not a valid integer
     */
    public static void main(String[] args) throws Exception {
        String outputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_OUTPUT_PATH;
        int viewCount = (args != null && args.length > 1) ? Integer.parseInt(args[1]) : DEFAULT_VIEW_COUNT;
        int registerCount = (args != null && args.length > 2) ? Integer.parseInt(args[2]) : DEFAULT_REGISTER_COUNT;

        // The log date is always yesterday, matching the day the statistics job queries.
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
        Calendar cal = Calendar.getInstance();
        cal.add(Calendar.DAY_OF_YEAR, -1);
        String date = sdf.format(cal.getTime());

        Random random = new Random();

        // try-with-resources closes the writer on every path and lets I/O failures
        // propagate instead of being swallowed.
        try (java.io.BufferedWriter writer = new java.io.BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(outputPath), "UTF-8"))) {
            // Page-view records.
            for (int i = 0; i < viewCount; i++) {
                // Wall-clock time at generation, not a time inside the log date.
                long timestamp = System.currentTimeMillis();
                // One visit in ten is by an unregistered visitor (null userid);
                // the rest pick one of 1000 registered user ids (0..999).
                Long userid = random.nextInt(10) == 0 ? null : Long.valueOf(random.nextInt(1000));
                // One of 1000 page ids (0..999).
                Long pageid = Long.valueOf(random.nextInt(1000));
                String section = SECTIONS[random.nextInt(SECTIONS.length)];
                writer.write(formatRecord(date, timestamp, userid, pageid, section, "view"));
            }
            // Registration records: no user id, page or section is recorded.
            for (int i = 0; i < registerCount; i++) {
                long timestamp = System.currentTimeMillis();
                writer.write(formatRecord(date, timestamp, null, null, null, "register"));
            }
        }
    }

    /** Builds one log line; null values are rendered as the text "null". */
    private static String formatRecord(String date, long timestamp, Long userid, Long pageid,
            String section, String action) {
        return new StringBuilder(64)
                .append(date).append(FIELD_SEPARATOR)
                .append(timestamp).append(FIELD_SEPARATOR)
                .append(userid).append(FIELD_SEPARATOR)
                .append(pageid).append(FIELD_SEPARATOR)
                .append(section).append(FIELD_SEPARATOR)
                .append(action).append("\n")
                .toString();
    }
}
创建相关表
在hive中创建访问日志表
-- Access-log table. Column order matches the log line layout:
-- date, timestamp, userid, pageid, section, action.
-- `date` and `timestamp` are reserved words in Hive 1.2.0 and later, so they are
-- back-quoted; without the quotes the statement fails to parse.
-- No ROW FORMAT clause is given, so Hive's default field delimiter (Ctrl-A, \001) applies.
create table news (
`date` string,
`timestamp` bigint,
userid bigint,
pageid bigint,
section string,
action string);
将模拟数据导入hive表中
-- Loads the simulated log from the local filesystem into the news table.
-- NOTE(review): the generator shown earlier writes a file named access.log to a
-- Windows desktop path; it presumably has to be copied to this location as
-- news.log before this statement is run -- confirm.
load data local inpath '/opt/spark-study/news.log' into table news;
编码
main方法
/**
 * Entry point of the offline statistics job: opens a Hive-enabled Spark session,
 * computes the five daily indicators for yesterday's date, and stops the session.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder().appName("NewsOfflineStatSpark").enableHiveSupport().getOrCreate();
    try {
        String yesterday = getYesterday();
        // Indicator 1: page views (PV) per page, sorted.
        calculateDailyPagePv(sparkSession, yesterday);
        // Indicator 2: unique visitors (UV) per page, sorted.
        calculateDailyPageUv(sparkSession, yesterday);
        // Indicator 3: new-user registration rate.
        calculateDailyNewUserRegisterRate(sparkSession, yesterday);
        // Indicator 4: user bounce rate.
        calculateDailyUserJumpRate(sparkSession, yesterday);
        // Indicator 5: section popularity ranking.
        calculateDailySectionPvSort(sparkSession, yesterday);
    } finally {
        // Release the session even when one of the indicators fails.
        sparkSession.stop();
    }
}
getYesterday方法
/**
 * Returns the calendar day before today, formatted as {@code yyyy-MM-dd}.
 *
 * @return yesterday's date in the default time zone and locale
 */
private static String getYesterday() {
    // A fresh Calendar already points at the current instant; step back one day.
    Calendar calendar = Calendar.getInstance();
    calendar.add(Calendar.DATE, -1);
    return new SimpleDateFormat("yyyy-MM-dd").format(calendar.getTime());
}
}
每天每个页面的PV
/**
 * Prints the page-view count (PV) of every page for the given day, highest first.
 *
 * @param sparkSession session used to run the query against the news table
 * @param yesterday    day to report on, in the same text form as the date column
 */
private static void calculateDailyPagePv(SparkSession sparkSession, String yesterday) {
    // Inner query: one row per page with the number of 'view' records on that day.
    String perPageCounts =
            "select date, pageid, count(pageid) as pv from news "
            + "where date = '" + yesterday + "' "
            + " and action = 'view' "
            + "group by date, pageid ";
    // Outer query: order the per-page counts from most to least viewed.
    String rankedQuery = "select date, pageid, pv from ( " + perPageCounts + ") t order by pv desc";
    sparkSession.sql(rankedQuery).show();
}
每天每个页面的UV
/**
 * Prints the unique-visitor count (UV) of every page for the given day, highest first.
 *
 * @param sparkSession session used to run the query against the news table
 * @param yesterday    day to report on, in the same text form as the date column
 */
private static void calculateDailyPageUv(SparkSession sparkSession, String yesterday) {
    // Step 1: collapse repeated views so each (date, page, user) appears once.
    String distinctVisits =
            "select date,pageid,userid from news "
            + "where date = '" + yesterday + "' "
            + "and action = 'view' "
            + "group by date,pageid,userid ";
    // Step 2: count users per page; count(userid) skips rows whose userid is null.
    String perPageUsers =
            "select date, pageid, count(userid) as uv from ( " + distinctVisits + ") t1 "
            + "group by date,pageid ";
    // Step 3: order pages by their visitor count.
    String rankedQuery = "select date,pageid, uv from ( " + perPageUsers + ") t2 order by uv desc ";
    sparkSession.sql(rankedQuery).show();
}
新用户注册比率
/**
 * Prints the new-user registration rate for the given day: the number of
 * 'register' records divided by the number of 'view' records whose userid is null.
 *
 * <p>Three lines are printed: the anonymous view count, the registration count and
 * the rate. When there are no anonymous views the rate is reported as 0.0 rather
 * than NaN or Infinity.
 *
 * @param sparkSession session used to run the queries against the news table
 * @param yesterday    day to report on, in the same text form as the date column
 */
private static void calculateDailyNewUserRegisterRate(SparkSession sparkSession, String yesterday) {
    String anonymousViewSql = "SELECT count(*) FROM news WHERE action='view' AND date='" + yesterday + "' AND userid IS NULL";
    String registerSql = "SELECT count(*) FROM news WHERE action='register' AND date='" + yesterday + "' ";
    // count(*) always yields exactly one row holding a non-null long, so the value
    // is read directly; chaining the calls keeps the row type without a raw Dataset.
    long anonymousViews = sparkSession.sql(anonymousViewSql).collectAsList().get(0).getLong(0);
    long registrations = sparkSession.sql(registerSql).collectAsList().get(0).getLong(0);
    // Report the two counts and the resulting rate.
    System.out.println("======================" + anonymousViews + "======================");
    System.out.println("======================" + registrations + "======================");
    double rate = anonymousViews == 0L ? 0.0 : (double) registrations / (double) anonymousViews;
    System.out.println("======================" + rate + "======================");
}
用户跳出率
/**
 * Prints the user bounce rate for the given day: the number of identified users
 * with exactly one 'view' record divided by the total number of 'view' records
 * that carry a userid.
 *
 * <p>Three lines are printed: the total view count, the bounced-user count and the
 * rate. When there are no views the rate is reported as 0.0 rather than NaN.
 *
 * @param sparkSession session used to run the queries against the news table
 * @param yesterday    day to report on, in the same text form as the date column
 */
private static void calculateDailyUserJumpRate(SparkSession sparkSession, String yesterday) {
    // Total views by identified users (rows with a null userid are excluded).
    String totalViewSql = "select count(*) from news where action='view' and date='" + yesterday + "' and userid is not null";
    // Identified users who viewed exactly one page on that day.
    String bouncedUserSql =
            "select count(userid) from ( "
            + "select date,userid,count(userid) as time from news where action='view' and date='" + yesterday + "' and userid is not null group by date,userid "
            + ") t "
            + "where time = 1";
    // Both aggregates return exactly one row holding a non-null long, so the value
    // is read directly; chaining the calls keeps the row type without a raw Dataset.
    long totalViews = sparkSession.sql(totalViewSql).collectAsList().get(0).getLong(0);
    long bouncedUsers = sparkSession.sql(bouncedUserSql).collectAsList().get(0).getLong(0);
    // Report the two counts and the resulting rate.
    System.out.println("======================" + totalViews + "======================");
    System.out.println("======================" + bouncedUsers + "======================");
    double rate = totalViews == 0L ? 0.0 : (double) bouncedUsers / (double) totalViews;
    System.out.println("======================" + rate + "======================");
}
版块热度排行榜
/**
 * Prints the section popularity ranking for the given day: every section with its
 * number of 'view' records, most viewed first.
 *
 * @param sparkSession session used to run the query against the news table
 * @param yesterday    day to report on, in the same text form as the date column
 */
private static void calculateDailySectionPvSort(SparkSession sparkSession, String yesterday) {
    // Inner query: number of views per section on that day.
    String perSectionCounts =
            "select date,section,count(section) as num from news where action='view' and date='"
            + yesterday
            + "' group by date,section ";
    // Outer query: rank the sections by view count.
    String rankedQuery = "select date,section,num from ( " + perSectionCounts + ") t order by num desc";
    sparkSession.sql(rankedQuery).show();
}