Spark in Action: Mobile App Access Traffic Log Analysis

Background

  • Suppose you work at an internet company that has built a mobile app. Every time a user taps something or runs a search, the app interacts with your remote backend: the app sends a request, and the backend returns a response whose content may be an image, text, or JSON. That round trip is one interaction between the mobile app and the backend server.
  • Typically, each time the app hits the backend, the server records a log entry; these entries make up the app's access traffic log. Alternatively, depending on your needs, the mobile client can send a log record itself and a web service on the server side persists it.

Goal

  • What we record here is the most basic thing: the upstream and downstream packets, i.e. the upstream and downstream traffic, exchanged between the mobile app and the server.
  • Each mobile device is uniquely identified by its deviceID.
  • Every log line records the upstream and downstream traffic of one request/response pair: upstream traffic is the data the app sends to the server, and downstream traffic is the data the server returns to the app (images, text, JSON, and so on).
  • For each device (deviceID), compute the total upstream traffic and total downstream traffic, then sort in descending order and take the 10 devices with the highest traffic (a Spark-free sketch of the parsing and aggregation follows the Data section below).

Challenges

  • Sorting by upstream and downstream traffic is not a simple single-key sort: sort primarily by upstream traffic, and when two devices have equal upstream traffic, break the tie with downstream traffic (see the standalone sketch right after this list).
  • This is a secondary sort.
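
The tie-breaking rule is easy to see in isolation with a plain Comparator, before any Spark is involved. The sketch below is a self-contained illustration; the class name SecondarySortRuleDemo and the {upstream, downstream} value pairs are made up:

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

public class SecondarySortRuleDemo {
	public static void main(String[] args) {
		// Made-up {upstream, downstream} traffic pairs
		List<long[]> traffic = new ArrayList<long[]>();
		traffic.add(new long[]{100, 50});
		traffic.add(new long[]{100, 80});
		traffic.add(new long[]{90, 999});

		// Descending order: compare b against a.
		// Upstream traffic first; downstream traffic breaks ties.
		Collections.sort(traffic, new Comparator<long[]>() {
			@Override
			public int compare(long[] a, long[] b) {
				if (a[0] != b[0]) {
					return Long.compare(b[0], a[0]);
				}
				return Long.compare(b[1], a[1]);
			}
		});

		// Prints 100/80, then 100/50, then 90/999
		for (long[] t : traffic) {
			System.out.println(t[0] + "\t" + t[1]);
		}
	}
}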

Data

1454307391161	77e3c9e1811d4fb291d0d9bbd456bb4b	79976	11496
1454315971161	f92ecf8e076d44b89f2d070fb1df7197	95291	89092
1454304331161	3de7d6514f1d4ac790c630fa63d8d0be	57029	50228
1454303131161	dd382d2a20464a74bbb7414e429ae452	20428	93467
1454319991161	bb2956150d6741df875fbcca76ae9e7c	51994	57706
1454302711161	225424dd7dd44d12b4190d1549540bf3	3448	56119
1454316091161	d368d95c643f4943ba1f5ea97b5a9a91	98230	96925
...
  • First field: timestamp
  • Second field: deviceID
  • Third field: upstream traffic
  • Fourth field: downstream traffic
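
To make the format and the aggregation concrete before the Spark code, here is a minimal, Spark-free sketch. The first line is taken from the sample data above; the second is a made-up record for the same deviceID so there is something to aggregate. The Spark job below does the same work with reduceByKey:

import java.util.HashMap;
import java.util.Map;

public class LocalAggregationSketch {
	public static void main(String[] args) {
		// One real sample line from above, plus a made-up second record
		String[] lines = {
				"1454307391161\t77e3c9e1811d4fb291d0d9bbd456bb4b\t79976\t11496",
				"1454307392161\t77e3c9e1811d4fb291d0d9bbd456bb4b\t100\t200"
		};
		// deviceID -> {total upstream traffic, total downstream traffic}
		Map<String, long[]> totals = new HashMap<String, long[]>();
		for (String line : lines) {
			String[] fields = line.split("\t");  // timestamp, deviceID, up, down
			long[] t = totals.get(fields[1]);
			if (t == null) {
				t = new long[2];
				totals.put(fields[1], t);
			}
			t[0] += Long.parseLong(fields[2]);
			t[1] += Long.parseLong(fields[3]);
		}
		// Prints: 77e3c9e1811d4fb291d0d9bbd456bb4b up=80076 down=11696
		for (Map.Entry<String, long[]> e : totals.entrySet()) {
			System.out.println(e.getKey()
					+ " up=" + e.getValue()[0] + " down=" + e.getValue()[1]);
		}
	}
}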

Example Code (Java)

DataGenerator.java: generates random test data

package cn.spark.study.core.upgrade.applog;

import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.Random;
import java.util.UUID;
/*
 * Generates simulated access-log data: random timestamps, deviceIDs,
 * and upstream/downstream traffic values, written as tab-separated lines.
 */
public class DataGenerator {
	
	public static void main(String[] args) throws Exception {
		Random random = new Random();
		
		// Generate 100 deviceIDs
		List<String> deviceIDs = new ArrayList<String>();
		for(int i = 0; i < 100; i++) {
			deviceIDs.add(getRandomUUID());
		}
		
		StringBuilder buffer = new StringBuilder();
		
		for(int i = 0; i < 1000; i++) {
			// Generate a random timestamp within the past 600 minutes
			Calendar cal = Calendar.getInstance();
			cal.setTime(new Date());    
			cal.add(Calendar.MINUTE, -random.nextInt(600)); 
			long timestamp = cal.getTime().getTime();
		
			// Pick a random deviceID
			String deviceID = deviceIDs.get(random.nextInt(100));  
			
			// Generate random upstream traffic
			long upTraffic = random.nextInt(100000);
			// Generate random downstream traffic
			long downTraffic = random.nextInt(100000);
			
			buffer.append(timestamp).append("\t")  
					.append(deviceID).append("\t")  
					.append(upTraffic).append("\t")
					.append(downTraffic).append("\n");  
		}
		
		PrintWriter pw = null;
		try {
			pw = new PrintWriter(new OutputStreamWriter(
					new FileOutputStream("C:\\Users\\Administrator\\Desktop\\access.log")));
			pw.write(buffer.toString());
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			// Guard against an NPE if the writer was never created
			if (pw != null) {
				pw.close();
			}
		}
	}
	
	private static String getRandomUUID() {
		return UUID.randomUUID().toString().replace("-", "");
	}
	
}
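
Note the shape of the generated data: 1,000 lines spread over 100 deviceIDs, so each device appears roughly 10 times on average, which gives the later reduceByKey real aggregation work to do. The output path is hard-coded; adjust it to a writable location on your machine.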

AccessLogInfo.java: serializable value class for one log record

package cn.spark.study.core.upgrade.applog;

import java.io.Serializable;

/*
 * Access-log record (serializable): timestamp, upstream traffic,
 * and downstream traffic for one request/response.
 */
public class AccessLogInfo implements Serializable{
	private static final long serialVersionUID = 5749943279909593929L;
	
	private long timestamp;    // timestamp of the access
	private long upTraffic;    // upstream traffic
	private long downTraffic;  // downstream traffic
	
	public AccessLogInfo() {}
	
	public AccessLogInfo(long timestamp, long upTraffic, long downTraffic) {
		this.timestamp = timestamp;
		this.upTraffic = upTraffic;
		this.downTraffic = downTraffic;
	}
	
	public long getTimestamp() {
		return timestamp;
	}
	public long getUpTraffic() {
		return upTraffic;
	}
	public long getDownTraffic() {
		return downTraffic;
	}
	public void setTimestamp(long timestamp) {
		this.timestamp = timestamp;
	}
	public void setUpTraffic(long upTraffic) {
		this.upTraffic = upTraffic;
	}
	public void setDownTraffic(long downTraffic) {
		this.downTraffic = downTraffic;
	}
}
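
AccessLogInfo must implement Serializable because its instances travel as the values of a pair RDD: reduceByKey shuffles them between executors, and Spark's default Java serialization requires the class to be serializable.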

AccessLogSortKey.java: custom secondary-sort key, used as the RDD key for sortByKey

package cn.spark.study.core.upgrade.applog;

import java.io.Serializable;

import scala.math.Ordered;

/*
 * Secondary-sort key: orders by upstream traffic, then downstream traffic,
 * then timestamp. Implements scala.math.Ordered so sortByKey can compare it.
 */
public class AccessLogSortKey implements Ordered<AccessLogSortKey>, Serializable {

	private static final long serialVersionUID = 3702442700882342403L;
	
	private long upTraffic;
	private long downTraffic;
	private long timestamp;
	
	public AccessLogSortKey() {}
	
	public AccessLogSortKey(long upTraffic, long downTraffic, long timestamp) {
		this.upTraffic = upTraffic;
		this.downTraffic = downTraffic;
		this.timestamp = timestamp;
	}

	@Override
	public boolean $greater(AccessLogSortKey other) {
		if(upTraffic > other.upTraffic) {
			return true;
		} else if(upTraffic == other.upTraffic && 
				downTraffic > other.downTraffic) {
			return true;
		} else if(upTraffic == other.upTraffic && 
				downTraffic == other.downTraffic &&
				timestamp > other.timestamp) {
			return true;
		}
		return false;
	}

	@Override
	public boolean $greater$eq(AccessLogSortKey other) {
		if($greater(other)) {
			return true;
		} else if(upTraffic == other.upTraffic && 
				downTraffic == other.downTraffic &&
				timestamp == other.timestamp) {
			return true;
		}
		return false;
	}

	@Override
	public boolean $less(AccessLogSortKey other) {
		if(upTraffic < other.upTraffic) {
			return true;
		} else if(upTraffic == other.upTraffic && 
				downTraffic < other.downTraffic) {
			return true;
		} else if(upTraffic == other.upTraffic && 
				downTraffic == other.downTraffic &&
				timestamp < other.timestamp) {
			return true;
		}
		return false;
	}

	@Override
	public boolean $less$eq(AccessLogSortKey other) {
		if($less(other)) {
			return true;
		} else if(upTraffic == other.upTraffic && 
				downTraffic == other.downTraffic &&
				timestamp == other.timestamp) {
			return true;
		}
		return false;
	}

	@Override
	public int compare(AccessLogSortKey other) {
		return compareTo(other);
	}

	@Override
	public int compareTo(AccessLogSortKey other) {
		// Long.compare avoids the int overflow that subtracting two
		// longs and casting the difference to int can cause
		if(upTraffic != other.upTraffic) {
			return Long.compare(upTraffic, other.upTraffic);
		} else if(downTraffic != other.downTraffic) {
			return Long.compare(downTraffic, other.downTraffic);
		}
		return Long.compare(timestamp, other.timestamp);
	}

	long getUpTraffic() {
		return upTraffic;
	}

	long getDownTraffic() {
		return downTraffic;
	}

	long getTimestamp() {
		return timestamp;
	}

	void setUpTraffic(long upTraffic) {
		this.upTraffic = upTraffic;
	}

	void setDownTraffic(long downTraffic) {
		this.downTraffic = downTraffic;
	}

	void setTimestamp(long timestamp) {
		this.timestamp = timestamp;
	}

	@Override
	public int hashCode() {
		final int prime = 31;
		int result = 1;
		result = prime * result + (int) (downTraffic ^ (downTraffic >>> 32));
		result = prime * result + (int) (timestamp ^ (timestamp >>> 32));
		result = prime * result + (int) (upTraffic ^ (upTraffic >>> 32));
		return result;
	}

	@Override
	public boolean equals(Object obj) {
		if (this == obj)
			return true;
		if (obj == null)
			return false;
		if (getClass() != obj.getClass())
			return false;
		AccessLogSortKey other = (AccessLogSortKey) obj;
		if (downTraffic != other.downTraffic)
			return false;
		if (timestamp != other.timestamp)
			return false;
		if (upTraffic != other.upTraffic)
			return false;
		return true;
	}
}
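
As a quick sanity check of the ordering, the snippet below compares two keys directly. It is a hypothetical helper, not part of the job; the class name SortKeyCheck and the sample values are made up:

package cn.spark.study.core.upgrade.applog;

public class SortKeyCheck {
	public static void main(String[] args) {
		// Equal upstream traffic, so downstream traffic breaks the tie
		AccessLogSortKey a = new AccessLogSortKey(100, 50, 1454307391161L);
		AccessLogSortKey b = new AccessLogSortKey(100, 80, 1454307391161L);
		System.out.println(a.$less(b));      // true
		System.out.println(a.compareTo(b));  // negative
	}
}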

AppLogSpark.java: computes the top 10 devices by traffic

package cn.spark.study.core.upgrade.applog;

import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

public class AppLogSpark {
	public static void main(String[] args) throws Exception{
		// Create the Spark configuration and context
		SparkConf conf = new SparkConf()
				.setAppName("AppLogSpark")  
				.setMaster("local"); 
		JavaSparkContext sc = new JavaSparkContext(conf);
		
		// Read the log file and create the initial RDD
		// SparkContext's textFile() can read a local file or a file on HDFS;
		// the resulting RDD contains every line of the log file
		JavaRDD<String> accessLogRDD = sc.textFile(
				"C:\\Users\\htfeng\\Desktop\\BigData\\Spark学习\\Spark学习\\Spark核心编程进阶\\data\\access.log");   
		// Map the RDD to key-value format, keyed by deviceID, to prepare for reduceByKey
		JavaPairRDD<String, AccessLogInfo> accessLogPairRDD = mapAccessLogRDD2Pair(accessLogRDD);
		
		// Aggregate by deviceID to get each device's total upstream traffic,
		// total downstream traffic, and earliest access timestamp
		JavaPairRDD<String, AccessLogInfo> aggrAccessLogPairRDD = aggregateByDeviceID(accessLogPairRDD);
		
		// Re-key the aggregated RDD: the secondary-sort key becomes the key, deviceID the value
		JavaPairRDD<AccessLogSortKey, String> accessLogSortRDD = mapRDDKey2SortKey(aggrAccessLogPairRDD);
		
		// Secondary sort, descending: by upstream traffic, then downstream traffic, then timestamp
		JavaPairRDD<AccessLogSortKey, String> sortedAccessLogRDD = accessLogSortRDD.sortByKey(false);
		
		// Take the top 10 records
		List<Tuple2<AccessLogSortKey, String>> top10DataList = sortedAccessLogRDD.take(10);
		
		for(Tuple2<AccessLogSortKey, String> data : top10DataList) {
			System.out.println(data._2 + ": " + data._1.getUpTraffic() + "," + data._1.getDownTraffic() + "," + data._1.getTimestamp());
		}
		
		// Stop the Spark context
		sc.close();
	}
	
	/**
	 * Maps the raw log RDD to key-value format.
	 * @param accessLogRDD raw log RDD
	 * @return RDD keyed by deviceID
	 */
	private static JavaPairRDD<String, AccessLogInfo> mapAccessLogRDD2Pair(
			JavaRDD<String> accessLogRDD){
				return accessLogRDD.mapToPair(new PairFunction<String, String, AccessLogInfo>() {

					private static final long serialVersionUID = 1L;

					@Override
					public Tuple2<String, AccessLogInfo> call(String accessLog) throws Exception {
						// Split the log line on tabs
						String[] accessLogSplited = accessLog.split("\t");
						
						// Extract the four fields
						long timestamp = Long.valueOf(accessLogSplited[0]);
						String deviceID = accessLogSplited[1];
						long upTraffic = Long.valueOf(accessLogSplited[2]);
						long downTraffic = Long.valueOf(accessLogSplited[3]);
						
						// Wrap the timestamp and traffic fields in the serializable value class
						AccessLogInfo accessLogInfo = new AccessLogInfo(timestamp, upTraffic, downTraffic);
						
						return new Tuple2<String, AccessLogInfo>(deviceID, accessLogInfo);
					}
					
				});	
	}
	
	/**
	 * Aggregates by deviceID: computes each device's total upstream traffic,
	 * total downstream traffic, and earliest access timestamp.
	 * @param accessLogPairRDD log RDD in key-value format
	 * @return RDD aggregated by deviceID
	 */
	private static JavaPairRDD<String, AccessLogInfo> aggregateByDeviceID(
			JavaPairRDD<String, AccessLogInfo> accessLogPairRDD){
		return accessLogPairRDD.reduceByKey(new Function2<AccessLogInfo, AccessLogInfo, AccessLogInfo>() {
			
			private static final long serialVersionUID = 1L;

			@Override
			public AccessLogInfo call(AccessLogInfo accessLogInfo1, AccessLogInfo accessLogInfo2) throws Exception {
				// Keep the earlier of the two timestamps (the earliest access time)
				long timestamp = accessLogInfo1.getTimestamp() < accessLogInfo2.getTimestamp()?
						accessLogInfo1.getTimestamp():accessLogInfo2.getTimestamp();
				// Sum the upstream and downstream traffic of the two records
				long upTraffic = accessLogInfo1.getUpTraffic() + accessLogInfo2.getUpTraffic();
				long downTraffic = accessLogInfo1.getDownTraffic() + accessLogInfo2.getDownTraffic();
				
				AccessLogInfo accessLogInfo = new AccessLogInfo();
				accessLogInfo.setTimestamp(timestamp);
				accessLogInfo.setUpTraffic(upTraffic);
				accessLogInfo.setDownTraffic(downTraffic);
				
				return accessLogInfo;
			}
		});
	}

	/**
	 * Re-keys the aggregated RDD with the secondary-sort key.
	 * @param aggrAccessLogPairRDD RDD aggregated by deviceID
	 * @return RDD keyed by the secondary-sort key, with deviceID as the value
	 */
	private static JavaPairRDD<AccessLogSortKey, String> mapRDDKey2SortKey(
			JavaPairRDD<String, AccessLogInfo> aggrAccessLogPairRDD){
				return aggrAccessLogPairRDD.mapToPair(
						new PairFunction<Tuple2<String,AccessLogInfo>, AccessLogSortKey, String>() {
							private static final long serialVersionUID = 1L;

							@Override
							public Tuple2<AccessLogSortKey, String> call(Tuple2<String, AccessLogInfo> tuple)
									throws Exception {
								String deviceID = tuple._1;
								AccessLogInfo accessLogInfo = tuple._2;
								
								// Wrap the aggregated info in the secondary-sort key
								AccessLogSortKey accessLogSortKey = new AccessLogSortKey(
										accessLogInfo.getUpTraffic(),
										accessLogInfo.getDownTraffic(),
										accessLogInfo.getTimestamp());
								return new Tuple2<AccessLogSortKey, String>(accessLogSortKey, deviceID);
							}
							
						});
	}
}
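
Run locally, AppLogSpark prints the ten highest-traffic devices, one per line, in the form deviceID: totalUpTraffic,totalDownTraffic,earliestTimestamp. Because the input is randomly generated, the concrete values differ from run to run; remember to point textFile() at wherever DataGenerator wrote access.log.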
