转载自龙果学院的视频http://www.roncoo.com/course/view/1af3e9867cb84263a2a1873880205ae1。
有这样一份log日志记录了某时间戳下某个设备访问网站时产生的上行流量、下行流量。
时间戳/设备号/上行流量/下行流量
现在想统计出每个设备号的最早访问时间及总的上行流量、下行流量,最后按上行流量、下行流量降序排序,打印出前10条记录。
创建个普通的maven项目。
pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.fei</groupId>
  <artifactId>spark-appTrafficCount</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>
  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.10</artifactId>
      <version>1.3.0</version>
    </dependency>
  </dependencies>
</project>
写个DataFileGenerator.java模拟日志数据,并写入文件,便于后面spark读取文件
package com.fei;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import java.util.UUID;
/**
* 模拟生成数据,并把数据写到文件中
* 文件内容:
* 时间戳 \t设备号\t上行流量\t下行流量
* @author Jfei
*
*/
public class DataFileGenerator {
public static void main(String[] args) {
//1.生成100个设备号
List deviceIds = new ArrayList();
for(int i=0;i<100;i++){
deviceIds.add(getDeviceId());
}
//2.生成1000个时间戳,该时间戳对应的设备号及上行流量、下行流量
Random random = new Random();
StringBuffer sb = new StringBuffer();
for(int i=0;i<1000;i++){
long timestamp = System.currentTimeMillis() - random.nextInt(10000);
//随机获取一个设备号
String deviceId = deviceIds.get(random.nextInt(100));
//上行流量
long upTraffic = random.nextInt(10000);
//下行流量
long downTraffic = random.nextInt(10000);
sb.append(timestamp).append("\t").append(deviceId).append("\t").append(upTraffic)
.append("\t").append(downTraffic).append("\n");
}
//将数据写到文件
PrintWriter pw = null;
try {
pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream("e:\\app-log.txt")));
pw.write(sb.toString());
} catch (Exception e) {
e.printStackTrace();
}finally{
if(pw != null){
pw.close();
}
}
}
private static String getDeviceId(){
return UUID.randomUUID().toString().replace("-", "");
}
}
写个AccessLogInfo.java对应文件中的时间戳/上行流量/下行流量
package com.fei;
import java.io.Serializable;
/**
* 设备(deviceId)访问日志信息对象,对应app-log.txt里面的内容
*
* 因为后面是要将这对象放到spark中处理的,所以必须序列化
* @author Jfei
*
*/
public class AccessLogInfo implements Serializable{
private static final long serialVersionUID = 1L;
//没有设备号,是因为这对象是站在设备角度看的,k-v形式<设备号,AccessLogInfo>
private long timestamp;//时间戳
private long upTraffic;//上行流量
private long downTraffic;//下行流量
public AccessLogInfo(){
}
public AccessLogInfo(long timestamp, long upTraffic, long downTraffic) {
this.timestamp = timestamp;
this.upTraffic = upTraffic;
this.downTraffic = downTraffic;
}
public long getTimestamp() {
return timestamp;
}
public void setTimestamp(long timestamp) {
this.timestamp = timestamp;
}
public long getUpTraffic() {
return upTraffic;
}
public void setUpTraffic(long upTraffic) {
this.upTraffic = upTraffic;
}
public long getDownTraffic() {
return downTraffic;
}
public void setDownTraffic(long downTraffic) {
this.downTraffic = downTraffic;
}
public static long getSerialversionuid() {
return serialVersionUID;
}
}
package com.fei;
import java.io.Serializable;
import scala.math.Ordered;
/**
* 可排序的对象,上行流量、下行流量、时间戳来排序,降序
* @author Jfei
*
*/
/**
 * Sortable key ordering records by up traffic, then down traffic, then
 * timestamp (natural order ascending; the caller passes sortByKey(false)
 * to get the descending ranking).
 *
 * Implements scala.math.Ordered so Spark's sortByKey can compare keys, and
 * Serializable so the keys can be shipped between tasks.
 *
 * @author Jfei
 */
public class AccessLogSortKey implements Ordered<AccessLogSortKey>, Serializable {

    private static final long serialVersionUID = 1L;

    private long timestamp;   // epoch millis of the earliest access
    private long upTraffic;   // total upstream traffic
    private long downTraffic; // total downstream traffic

    /** No-arg constructor kept for serialization frameworks. */
    public AccessLogSortKey() {
    }

    public AccessLogSortKey(long timestamp, long upTraffic, long downTraffic) {
        this.timestamp = timestamp;
        this.upTraffic = upTraffic;
        this.downTraffic = downTraffic;
    }

    /**
     * Natural ascending ordering: upTraffic, then downTraffic, then timestamp.
     * Uses Long.compare instead of casting a long difference to int, which
     * overflowed and could report the wrong order for large deltas
     * (e.g. millisecond timestamps differ by far more than Integer.MAX_VALUE).
     */
    public int compare(AccessLogSortKey other) {
        int result = Long.compare(upTraffic, other.upTraffic);
        if (result == 0) {
            result = Long.compare(downTraffic, other.downTraffic);
        }
        if (result == 0) {
            result = Long.compare(timestamp, other.timestamp);
        }
        return result;
    }

    /** Same contract as {@link #compare(AccessLogSortKey)}. */
    public int compareTo(AccessLogSortKey other) {
        return compare(other);
    }

    // The four Scala operator methods all delegate to compare() so the five
    // comparison entry points can never disagree with each other.

    public boolean $greater(AccessLogSortKey other) {
        return compare(other) > 0;
    }

    public boolean $greater$eq(AccessLogSortKey other) {
        return compare(other) >= 0;
    }

    public boolean $less(AccessLogSortKey other) {
        return compare(other) < 0;
    }

    public boolean $less$eq(AccessLogSortKey other) {
        return compare(other) <= 0;
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + (int) (downTraffic ^ (downTraffic >>> 32));
        result = prime * result + (int) (timestamp ^ (timestamp >>> 32));
        result = prime * result + (int) (upTraffic ^ (upTraffic >>> 32));
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (getClass() != obj.getClass())
            return false;
        AccessLogSortKey other = (AccessLogSortKey) obj;
        if (downTraffic != other.downTraffic)
            return false;
        if (timestamp != other.timestamp)
            return false;
        if (upTraffic != other.upTraffic)
            return false;
        return true;
    }

    public long getTimestamp() {
        return timestamp;
    }

    public void setTimestamp(long timestamp) {
        this.timestamp = timestamp;
    }

    public long getUpTraffic() {
        return upTraffic;
    }

    public void setUpTraffic(long upTraffic) {
        this.upTraffic = upTraffic;
    }

    public long getDownTraffic() {
        return downTraffic;
    }

    public void setDownTraffic(long downTraffic) {
        this.downTraffic = downTraffic;
    }

    public static long getSerialversionuid() {
        return serialVersionUID;
    }
}
AppLogSpark.java
package com.fei;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
/**
* 读取app-log.txt中的内容,然后按设备号统计出每个设备号总的上行流量/总的下行流量,
* 然后挑出前10个上行流量最大的设备号,打印出来
* @author Jfei
*
*/
public class AppLogSpark {
public static void main(String[] args) {
//1.创建spark配置文件及上下文对象
SparkConf conf = new SparkConf().setAppName("appLogSpark").setMaster("local");
JavaSparkContext sc = new JavaSparkContext(conf);
//2.读取日志文件,并创建一个RDD,使用sparkContext方法textFile()可以读取本地文件或
//HDFS文件,创建的RDD会包含文件中的所有内容数据
JavaRDD appLogRDD = sc.textFile("e:\\app-log.txt");
//3.将appLogRDD映射成k-v形式<设备号,AccessLogInfo>的RDD
//可以这样理解为List,deviceId是可以对应多个AcessLogInfo的
JavaPairRDD appLogPairRDD = mapAppLogRDD2Pair(appLogRDD);
//4.将appLogPairRDD根据key进行聚合,并统计
//可以这样理解List聚合后,deviceId只有对应一个AccessLogInfo了,
//AccessLogInfo里面的数据就是某个deviceId的总上行流量/总下行流量了
JavaPairRDD aggregateLogPairRDD = aggregateByDevice(appLogPairRDD);
//5.排序,按上行流量、上行流量、时间戳排序
//因为JavaPairRDD中排序只有sortByKey(XX),而我们这是根据value排序的,所以需要
//将JavaPairRDD转为JavaPairRDD的RDD,
//而AccessLogInfo只是个普通的POJO,为了便于比较排序,定义个AccessLogSortKey对象
JavaPairRDD logSortRDD = mapRDDKey2SortKey(aggregateLogPairRDD);
//6.实现排序
JavaPairRDD sortRDD = logSortRDD.sortByKey(false);//降序
//7.取前10个
List> list = sortRDD.take(10);
//打印
for(Tuple2 t : list){
System.out.println(t._2 + "\t" + t._1.getTimestamp() + "\t" + t._1.getUpTraffic()+ "\t" + t._1.getDownTraffic());
}
//关闭
sc.close();
}
/**
* 将appLogRDD映射成key-value形式的JavaPairRDD的RDD
* @param appLogRDD
* @return
*/
private static JavaPairRDD mapAppLogRDD2Pair(JavaRDD appLogRDD){
return appLogRDD.mapToPair(new PairFunction() {
private static final long serialVersionUID = 1L;
//accessLog是对应文件中的一行数据
public Tuple2 call(String accessLog) throws Exception {
//根据\t对一行的数据进行分割
String[] data = accessLog.split("\t");
long timestamp = Long.parseLong(data[0]);
String deviceId = data[1];
long upTraffic = Long.parseLong(data[2]);
long downTraffic = Long.parseLong(data[3]);
AccessLogInfo info = new AccessLogInfo(timestamp,upTraffic,downTraffic);
return new Tuple2(deviceId,info);
}
});
}
/**
* 将JavaPairRDD进行聚合
* @param appLogPairRDD
* @return
*/
private static JavaPairRDD aggregateByDevice(JavaPairRDD appLogPairRDD){
return appLogPairRDD.reduceByKey(new Function2() {
private static final long serialVersionUID = 1L;
public AccessLogInfo call(AccessLogInfo v1, AccessLogInfo v2) throws Exception {
//最早的访问时间
long timestamp = v1.getTimestamp() < v2.getTimestamp() ? v1.getTimestamp() : v2.getTimestamp();
long upTraffic = v1.getUpTraffic() + v2.getUpTraffic();//总上行流量
long downTraffic = v1.getDownTraffic() + v2.getDownTraffic();//总下行流量
return new AccessLogInfo(timestamp,upTraffic,downTraffic);
}
});
}
/**
* 将JavaPairRDD转为JavaPairRDD便于后面的排序
*/
private static JavaPairRDD mapRDDKey2SortKey(JavaPairRDD aggregateRDD){
return aggregateRDD.mapToPair(new PairFunction,AccessLogSortKey, String >() {
private static final long serialVersionUID = 1L;
public Tuple2 call(Tuple2 t) throws Exception {
String deviceId = t._1;
AccessLogInfo info = t._2;
AccessLogSortKey sortKey = new AccessLogSortKey(info.getTimestamp(),info.getUpTraffic(),info.getDownTraffic() );
return new Tuple2(sortKey,deviceId);
}
});
}
}