1、业务需求
在已有"手机号在各个基站的连接日志"和"基站信息"这两份数据的前提下,算出每个手机号的(所在基站,停留时间)以及该基站的(经度,纬度)
其中手机连接基站产生的日志信息类似如下:
18688888888,20160327082400,16030401EAFB68F1E3CDF819735E1C66,1
18611132889,20160327082500,16030401EAFB68F1E3CDF819735E1C66,1
18688888888,20160327170000,16030401EAFB68F1E3CDF819735E1C66,0
18611132889,20160327180000,16030401EAFB68F1E3CDF819735E1C66,0
上面各字段的含义依次是:手机号,时间(格式为yyyyMMddHHmmss),基站ID,事件类型(1:建立连接,0:断开连接)
基站信息:
9F36407EAD0629FC166F14DDE7970F68,116.304864,40.050645,6
CC0710CC94ECC657A8561DE549D940E0,116.303955,40.041935,6
16030401EAFB68F1E3CDF819735E1C66,116.296302,40.032296,6
上面各字段的含义依次是:基站ID,经度,纬度,接入网络的类型(0:unknown,1:3G,2:2G,6:4G)
创建过程参考:http://blog.csdn.net/tototuzuoquan/article/details/74571374,目录结构和Pom文件和这个里面的代码一样。
package cn.toto.spark
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Spark job that, for every mobile number, finds the two base stations where
 * the phone stayed the longest and attaches each station's coordinates.
 *
 * Inputs (comma-separated text files):
 *  - `args(0)`: connection log, one event per line:
 *    `mobile,timestamp,stationId,eventType`, where `timestamp` is formatted as
 *    `yyyyMMddHHmmss` and an `eventType` of `"1"` is treated as "connection
 *    established"; any other value is treated as "connection closed".
 *  - `args(1)`: base-station info, one station per line:
 *    `stationId,longitude,latitude,...`.
 *
 * Output: the collected result is printed to stdout as
 * `(mobile, List((stationId, ((mobile, dwellSeconds), (longitude, latitude)))))`.
 *
 * Created by toto on 2017/7/7.
 */
object MobileLocation {
  import java.text.SimpleDateFormat
  import java.util.TimeZone

  /** Layout of the timestamp column in the connection log. */
  private val TimestampPattern = "yyyyMMddHHmmss"

  /**
   * Builds a strict parser for the log timestamps.
   *
   * The log carries no zone information, so UTC is used: every wall-clock
   * value is then valid and differences are plain elapsed seconds.
   * `SimpleDateFormat` is not thread-safe, hence a fresh instance per caller.
   */
  private def newTimestampFormat(): SimpleDateFormat = {
    val format = new SimpleDateFormat(TimestampPattern)
    format.setTimeZone(TimeZone.getTimeZone("UTC"))
    format.setLenient(false)
    format
  }

  /**
   * Entry point.
   *
   * @param args `args(0)` is the path of the connection log(s), `args(1)` the
   *             path of the base-station info file.
   * @throws IllegalArgumentException if fewer than two paths are supplied
   */
  def main(args: Array[String]): Unit = {
    require(args.length >= 2, "Usage: MobileLocation <connection-log-path> <station-info-path>")

    // Fall back to local mode only when no master was supplied (e.g. by spark-submit).
    val conf = new SparkConf().setAppName("MobileLocation").setIfMissing("spark.master", "local")
    val sc = new SparkContext(conf)
    try {
      val lines: RDD[String] = sc.textFile(args(0))

      // ((mobile, stationId), signedEpochSeconds)
      // A connect event contributes -t and a disconnect event +t, so summing all
      // events of one (mobile, station) pair yields the total dwell time in seconds.
      // The raw yyyyMMddHHmmss digits must NOT be subtracted directly: they are not
      // a linear time scale (08:24:00 -> 17:00:00 would give 87600 instead of 30960).
      val signedEvents: RDD[((String, String), Long)] = lines.mapPartitions { iter =>
        val timestampFormat = newTimestampFormat() // one parser per partition
        iter.map(_.split(",")).collect {
          // Lines with fewer than four fields (e.g. blank lines) are skipped.
          case Array(mobile, timestamp, lac, eventType, _*) =>
            val epochSeconds = timestampFormat.parse(timestamp).getTime / 1000L
            val signed = if (eventType == "1") -epochSeconds else epochSeconds
            ((mobile, lac), signed)
        }
      }

      // Total dwell time of each mobile number at each station.
      val dwellByMobileAndLac: RDD[((String, String), Long)] = signedEvents.reduceByKey(_ + _)

      // Re-key by station so the station coordinates can be joined in:
      // (stationId, (mobile, dwellSeconds))
      val dwellByLac: RDD[(String, (String, Long))] = dwellByMobileAndLac.map {
        case ((mobile, lac), dwellSeconds) => (lac, (mobile, dwellSeconds))
      }

      // (stationId, (longitude, latitude)); lines with fewer than three fields are skipped.
      val lacInfo: RDD[String] = sc.textFile(args(1))
      val lacCoordinates: RDD[(String, (String, String))] = lacInfo.map(_.split(",")).collect {
        case Array(id, longitude, latitude, _*) => (id, (longitude, latitude))
      }

      // (stationId, ((mobile, dwellSeconds), (longitude, latitude)))
      val joined: RDD[(String, ((String, Long), (String, String)))] = dwellByLac.join(lacCoordinates)

      // Group by mobile number and keep the two stations with the longest dwell time.
      val groupedByMobile = joined.groupBy { case (_, ((mobile, _), _)) => mobile }
      val result = groupedByMobile.mapValues { stations =>
        stations.toList
          .sortBy { case (_, ((_, dwellSeconds), _)) => dwellSeconds }
          .reverse
          .take(2)
      }

      println(result.collect().toBuffer)
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
  }
}
其中E:\mobileLocation\loc_info\loc_info.txt:
9F36407EAD0629FC166F14DDE7970F68,116.304864,40.050645,6
CC0710CC94ECC657A8561DE549D940E0,116.303955,40.041935,6
16030401EAFB68F1E3CDF819735E1C66,116.296302,40.032296,6
19735E1C66.log:
18688888888,20160327082400,16030401EAFB68F1E3CDF819735E1C66,1
18611132889,20160327082500,16030401EAFB68F1E3CDF819735E1C66,1
18688888888,20160327170000,16030401EAFB68F1E3CDF819735E1C66,0
18611132889,20160327180000,16030401EAFB68F1E3CDF819735E1C66,0
DDE7970F68.log
18611132889,20160327075000,9F36407EAD0629FC166F14DDE7970F68,1
18688888888,20160327075100,9F36407EAD0629FC166F14DDE7970F68,1
18611132889,20160327081000,9F36407EAD0629FC166F14DDE7970F68,0
18688888888,20160327081300,9F36407EAD0629FC166F14DDE7970F68,0
18688888888,20160327175000,9F36407EAD0629FC166F14DDE7970F68,1
18611132889,20160327182000,9F36407EAD0629FC166F14DDE7970F68,1
18688888888,20160327220000,9F36407EAD0629FC166F14DDE7970F68,0
18611132889,20160327230000,9F36407EAD0629FC166F14DDE7970F68,0
E549D940E0.log
18611132889,20160327081100,CC0710CC94ECC657A8561DE549D940E0,1
18688888888,20160327081200,CC0710CC94ECC657A8561DE549D940E0,1
18688888888,20160327081900,CC0710CC94ECC657A8561DE549D940E0,0
18611132889,20160327082000,CC0710CC94ECC657A8561DE549D940E0,0
18688888888,20160327171000,CC0710CC94ECC657A8561DE549D940E0,1
18688888888,20160327171600,CC0710CC94ECC657A8561DE549D940E0,0
18611132889,20160327180500,CC0710CC94ECC657A8561DE549D940E0,1
18611132889,20160327181500,CC0710CC94ECC657A8561DE549D940E0,0