背景
目的
难点
数据
1454307391161 77e3c9e1811d4fb291d0d9bbd456bb4b 79976 11496
1454315971161 f92ecf8e076d44b89f2d070fb1df7197 95291 89092
1454304331161 3de7d6514f1d4ac790c630fa63d8d0be 57029 50228
1454303131161 dd382d2a20464a74bbb7414e429ae452 20428 93467
1454319991161 bb2956150d6741df875fbcca76ae9e7c 51994 57706
1454302711161 225424dd7dd44d12b4190d1549540bf3 3448 56119
1454316091161 d368d95c643f4943ba1f5ea97b5a9a91 98230 96925
...
示例代码(Java)
DataGenerator.java : 生成随机数据
package cn.spark.study.core.upgrade.applog;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.Random;
import java.util.UUID;
/**
 * Generates a mock access log file.
 *
 * <p>Each record is one line of four tab-separated fields: a timestamp in epoch
 * milliseconds, a device ID (a random UUID with the dashes removed), the upstream
 * traffic and the downstream traffic.
 */
public class DataGenerator {

    /** Output file used when no path is passed on the command line. */
    private static final String DEFAULT_OUTPUT_PATH =
            "C:\\Users\\Administrator\\Desktop\\access.log";

    /** Number of distinct device IDs the records are drawn from. */
    private static final int DEVICE_COUNT = 100;

    /** Number of log records written to the file. */
    private static final int RECORD_COUNT = 1000;

    /**
     * Builds {@value #RECORD_COUNT} random records and writes them to the output file.
     *
     * @param args optional; {@code args[0]} overrides the default output path
     * @throws Exception if the output file cannot be opened or written
     */
    public static void main(String[] args) throws Exception {
        String outputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_OUTPUT_PATH;
        Random random = new Random();

        // Pool of device IDs that every record picks from.
        List<String> deviceIDs = new ArrayList<String>(DEVICE_COUNT);
        for (int i = 0; i < DEVICE_COUNT; i++) {
            deviceIDs.add(getRandomUUID());
        }

        // Single-threaded use, so the unsynchronized builder is sufficient.
        StringBuilder buffer = new StringBuilder();
        for (int i = 0; i < RECORD_COUNT; i++) {
            // Random timestamp: now minus 0..599 minutes.
            Calendar cal = Calendar.getInstance();
            cal.setTime(new Date());
            cal.add(Calendar.MINUTE, -random.nextInt(600));
            long timestamp = cal.getTime().getTime();

            // Random device from the pool.
            String deviceID = deviceIDs.get(random.nextInt(DEVICE_COUNT));

            // Random upstream and downstream traffic, each in [0, 100000).
            long upTraffic = random.nextInt(100000);
            long downTraffic = random.nextInt(100000);

            buffer.append(timestamp).append("\t")
                    .append(deviceID).append("\t")
                    .append(upTraffic).append("\t")
                    .append(downTraffic).append("\n");
        }

        // try-with-resources closes the writer only if it was actually opened; a failure
        // to open the file now surfaces as the real exception instead of a
        // NullPointerException raised from a finally block.
        try (PrintWriter pw = new PrintWriter(new OutputStreamWriter(
                new FileOutputStream(outputPath), java.nio.charset.StandardCharsets.UTF_8))) {
            pw.write(buffer.toString());
            // PrintWriter never throws on write; checkError() flushes and reports failures.
            if (pw.checkError()) {
                throw new java.io.IOException("Failed to write access log to " + outputPath);
            }
        }
    }

    /**
     * Returns a random UUID rendered as 32 hexadecimal characters (dashes removed).
     *
     * @return the dash-free UUID string
     */
    private static String getRandomUUID() {
        return UUID.randomUUID().toString().replace("-", "");
    }
}
AccessLogInfo.java: 访问日志信息类(可序列化)
package cn.spark.study.core.upgrade.applog;
import java.io.Serializable;
/**
 * Serializable holder for one access-log entry: a timestamp plus the upstream and
 * downstream traffic.
 */
public class AccessLogInfo implements Serializable {

    private static final long serialVersionUID = 5749943279909593929L;

    /** Timestamp of the entry. */
    private long timestamp;

    /** Upstream traffic. */
    private long upTraffic;

    /** Downstream traffic. */
    private long downTraffic;

    /** Creates an entry with all three fields set to zero. */
    public AccessLogInfo() {
        this(0L, 0L, 0L);
    }

    /**
     * Creates a fully populated entry.
     *
     * @param timestamp   timestamp of the entry
     * @param upTraffic   upstream traffic
     * @param downTraffic downstream traffic
     */
    public AccessLogInfo(long timestamp, long upTraffic, long downTraffic) {
        this.timestamp = timestamp;
        this.upTraffic = upTraffic;
        this.downTraffic = downTraffic;
    }

    /** @return the timestamp */
    public long getTimestamp() {
        return this.timestamp;
    }

    /** @param timestamp the new timestamp */
    public void setTimestamp(long timestamp) {
        this.timestamp = timestamp;
    }

    /** @return the upstream traffic */
    public long getUpTraffic() {
        return this.upTraffic;
    }

    /** @param upTraffic the new upstream traffic */
    public void setUpTraffic(long upTraffic) {
        this.upTraffic = upTraffic;
    }

    /** @return the downstream traffic */
    public long getDownTraffic() {
        return this.downTraffic;
    }

    /** @param downTraffic the new downstream traffic */
    public void setDownTraffic(long downTraffic) {
        this.downTraffic = downTraffic;
    }
}
AccessLogSortKey.java: 自定义二次排序key类(作为排序RDD的key)
package cn.spark.study.core.upgrade.applog;
import java.io.Serializable;
import scala.math.Ordered;
/**
 * Composite sort key ordered by upstream traffic, then downstream traffic, then
 * timestamp (all ascending in the natural order).
 *
 * <p>Implements Scala's {@code Ordered} and {@code Serializable}. Equality and hash code
 * are based on all three fields, consistent with the ordering.
 */
public class AccessLogSortKey implements Ordered<AccessLogSortKey>, Serializable {

    private static final long serialVersionUID = 3702442700882342403L;

    private long upTraffic;
    private long downTraffic;
    private long timestamp;

    /** Creates a key with all three fields set to zero. */
    public AccessLogSortKey() {}

    /**
     * Creates a fully populated key.
     *
     * @param upTraffic   upstream traffic (primary sort field)
     * @param downTraffic downstream traffic (secondary sort field)
     * @param timestamp   timestamp (tertiary sort field)
     */
    public AccessLogSortKey(long upTraffic, long downTraffic, long timestamp) {
        this.upTraffic = upTraffic;
        this.downTraffic = downTraffic;
        this.timestamp = timestamp;
    }

    /** @return {@code true} if this key sorts strictly after {@code other} */
    @Override
    public boolean $greater(AccessLogSortKey other) {
        return compare(other) > 0;
    }

    /** @return {@code true} if this key sorts after or equal to {@code other} */
    @Override
    public boolean $greater$eq(AccessLogSortKey other) {
        return compare(other) >= 0;
    }

    /** @return {@code true} if this key sorts strictly before {@code other} */
    @Override
    public boolean $less(AccessLogSortKey other) {
        return compare(other) < 0;
    }

    /** @return {@code true} if this key sorts before or equal to {@code other} */
    @Override
    public boolean $less$eq(AccessLogSortKey other) {
        return compare(other) <= 0;
    }

    /**
     * Compares field by field: upstream traffic, then downstream traffic, then timestamp.
     *
     * <p>Uses {@link Long#compare(long, long)} rather than casting the difference of two
     * {@code long} values to {@code int}: such a cast truncates, so a large difference
     * could report the wrong sign or even zero.
     *
     * @param other the key to compare against
     * @return a negative value, zero, or a positive value as this key is less than,
     *         equal to, or greater than {@code other}
     */
    @Override
    public int compare(AccessLogSortKey other) {
        int byUpTraffic = Long.compare(upTraffic, other.upTraffic);
        if (byUpTraffic != 0) {
            return byUpTraffic;
        }
        int byDownTraffic = Long.compare(downTraffic, other.downTraffic);
        if (byDownTraffic != 0) {
            return byDownTraffic;
        }
        return Long.compare(timestamp, other.timestamp);
    }

    /**
     * Same ordering as {@link #compare(AccessLogSortKey)}.
     *
     * @param other the key to compare against
     * @return a negative value, zero, or a positive value
     */
    @Override
    public int compareTo(AccessLogSortKey other) {
        return compare(other);
    }

    long getUpTraffic() {
        return upTraffic;
    }

    long getDownTraffic() {
        return downTraffic;
    }

    long getTimestamp() {
        return timestamp;
    }

    void setUpTraffic(long upTraffic) {
        this.upTraffic = upTraffic;
    }

    void setDownTraffic(long downTraffic) {
        this.downTraffic = downTraffic;
    }

    void setTimestamp(long timestamp) {
        this.timestamp = timestamp;
    }

    /** Hash over downstream traffic, timestamp and upstream traffic, in that order. */
    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + (int) (downTraffic ^ (downTraffic >>> 32));
        result = prime * result + (int) (timestamp ^ (timestamp >>> 32));
        result = prime * result + (int) (upTraffic ^ (upTraffic >>> 32));
        return result;
    }

    /** Two keys are equal when they have the same class and all three fields match. */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        AccessLogSortKey other = (AccessLogSortKey) obj;
        return downTraffic == other.downTraffic
                && timestamp == other.timestamp
                && upTraffic == other.upTraffic;
    }
}
AppLogSpark.java : 获取top10数据
package cn.spark.study.core.upgrade.applog;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
/**
 * Spark job that prints the top 10 devices of an access log, ranked by total upstream
 * traffic, then total downstream traffic, then earliest timestamp (all descending).
 */
public class AppLogSpark {

    /** Input file used when no path is passed on the command line. */
    private static final String DEFAULT_INPUT_PATH =
            "C:\\Users\\htfeng\\Desktop\\BigData\\Spark学习\\Spark学习\\Spark核心编程进阶\\data\\access.log";

    /**
     * Runs the job in local mode and prints one line per top-10 device.
     *
     * @param args optional; {@code args[0]} overrides the default input path
     * @throws Exception if any stage of the job fails
     */
    public static void main(String[] args) throws Exception {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;

        // Create the Spark configuration and context.
        SparkConf conf = new SparkConf()
                .setAppName("AppLogSpark")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            // Initial RDD: one element per line of the log file.
            JavaRDD<String> accessLogRDD = sc.textFile(inputPath);

            // Map to (deviceID, info) pairs in preparation for reduceByKey.
            JavaPairRDD<String, AccessLogInfo> accessLogPairRDD =
                    mapAccessLogRDD2Pair(accessLogRDD);

            // Per device: total upstream traffic, total downstream traffic, earliest timestamp.
            JavaPairRDD<String, AccessLogInfo> aggrAccessLogPairRDD =
                    aggregateByDeviceID(accessLogPairRDD);

            // Re-key by the composite sort key; the device ID becomes the value.
            JavaPairRDD<AccessLogSortKey, String> accessLogSortRDD =
                    mapRDDKey2SortKey(aggrAccessLogPairRDD);

            // Sort in descending order of the composite key.
            JavaPairRDD<AccessLogSortKey, String> sortedAccessLogRDD =
                    accessLogSortRDD.sortByKey(false);

            // Take and print the first 10 entries.
            List<Tuple2<AccessLogSortKey, String>> top10DataList = sortedAccessLogRDD.take(10);
            for (Tuple2<AccessLogSortKey, String> data : top10DataList) {
                System.out.println(data._2 + ": " + data._1.getUpTraffic() + ","
                        + data._1.getDownTraffic() + "," + data._1.getTimestamp());
            }
        } finally {
            // Close the context even when a stage above throws.
            sc.close();
        }
    }

    /**
     * Maps each raw log line to a (deviceID, info) pair.
     *
     * @param accessLogRDD RDD of raw, tab-separated log lines
     * @return pair RDD keyed by device ID
     */
    private static JavaPairRDD<String, AccessLogInfo> mapAccessLogRDD2Pair(
            JavaRDD<String> accessLogRDD) {
        return accessLogRDD.mapToPair(new PairFunction<String, String, AccessLogInfo>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<String, AccessLogInfo> call(String accessLog) throws Exception {
                // Fields: timestamp, deviceID, upstream traffic, downstream traffic.
                String[] accessLogSplited = accessLog.split("\t");

                // parseLong yields the primitive directly, avoiding a boxed Long.
                long timestamp = Long.parseLong(accessLogSplited[0]);
                String deviceID = accessLogSplited[1];
                long upTraffic = Long.parseLong(accessLogSplited[2]);
                long downTraffic = Long.parseLong(accessLogSplited[3]);

                AccessLogInfo accessLogInfo =
                        new AccessLogInfo(timestamp, upTraffic, downTraffic);
                return new Tuple2<String, AccessLogInfo>(deviceID, accessLogInfo);
            }
        });
    }

    /**
     * Aggregates per device ID: sums upstream and downstream traffic and keeps the
     * smallest timestamp.
     *
     * @param accessLogPairRDD pair RDD keyed by device ID
     * @return pair RDD with one aggregated entry per device ID
     */
    private static JavaPairRDD<String, AccessLogInfo> aggregateByDeviceID(
            JavaPairRDD<String, AccessLogInfo> accessLogPairRDD) {
        return accessLogPairRDD.reduceByKey(
                new Function2<AccessLogInfo, AccessLogInfo, AccessLogInfo>() {
            private static final long serialVersionUID = 1L;

            @Override
            public AccessLogInfo call(AccessLogInfo accessLogInfo1, AccessLogInfo accessLogInfo2)
                    throws Exception {
                long timestamp = Math.min(
                        accessLogInfo1.getTimestamp(), accessLogInfo2.getTimestamp());
                long upTraffic = accessLogInfo1.getUpTraffic() + accessLogInfo2.getUpTraffic();
                long downTraffic =
                        accessLogInfo1.getDownTraffic() + accessLogInfo2.getDownTraffic();
                return new AccessLogInfo(timestamp, upTraffic, downTraffic);
            }
        });
    }

    /**
     * Re-keys the aggregated RDD by the composite sort key, with the device ID as value.
     *
     * @param aggrAccessLogPairRDD RDD aggregated per device ID
     * @return pair RDD keyed by the composite sort key
     */
    private static JavaPairRDD<AccessLogSortKey, String> mapRDDKey2SortKey(
            JavaPairRDD<String, AccessLogInfo> aggrAccessLogPairRDD) {
        return aggrAccessLogPairRDD.mapToPair(
                new PairFunction<Tuple2<String, AccessLogInfo>, AccessLogSortKey, String>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<AccessLogSortKey, String> call(Tuple2<String, AccessLogInfo> tuple)
                    throws Exception {
                String deviceID = tuple._1;
                AccessLogInfo accessLogInfo = tuple._2;

                // Wrap the aggregated values in the composite sort key.
                AccessLogSortKey accessLogSortKey = new AccessLogSortKey(
                        accessLogInfo.getUpTraffic(),
                        accessLogInfo.getDownTraffic(),
                        accessLogInfo.getTimestamp());
                return new Tuple2<AccessLogSortKey, String>(accessLogSortKey, deviceID);
            }
        });
    }
}