Spark-core综合练习-IP匹配

ip.txt 部分数据:

220.177.248.0|220.177.255.255|3702650880|3702652927|亚洲|中国|江西|南昌||电信|360100|China|CN|115.892151|28.676493
220.178.0.0|220.178.56.113|3702652928|3702667377|亚洲|中国|安徽|合肥||电信|340100|China|CN|117.283042|31.86119
220.178.56.114|220.178.57.33|3702667378|3702667553|亚洲|中国|安徽|合肥|巢湖|电信|340181|China|CN|117.874155|31.600518
220.178.57.34|220.178.57.34|3702667554|3702667554|亚洲|中国|安徽|合肥|肥东|电信|340122|China|CN|117.47128|31.88525
220.178.57.35|220.178.57.53|3702667555|3702667573|亚洲|中国|安徽|合肥|巢湖|电信|340181|China|CN|117.874155|31.600518
220.178.57.54|220.178.57.54|3702667574|3702667574|亚洲|中国|安徽|合肥|肥东|电信|340122|China|CN|117.47128|31.88525
220.178.57.55|220.178.57.145|3702667575|3702667665|亚洲|中国|安徽|合肥|巢湖|电信|340181|China|CN|117.874155|31.600518
220.178.57.146|220.178.57.146|3702667666|3702667666|亚洲|中国|安徽|合肥|肥东|电信|340122|China|CN|117.47128|31.88525
220.178.57.147|220.178.57.181|3702667667|3702667701|亚洲|中国|安徽|合肥|巢湖|电信|340181|China|CN|117.874155|31.600518
220.178.57.182|220.178.57.184|3702667702|3702667704|亚洲|中国|安徽|合肥|肥东|电信|340122|China|CN|117.47128|31.88525
220.178.57.185|220.178.57.189|3702667705|3702667709|亚洲|中国|安徽|合肥|巢湖|电信|340181|China|CN|117.874155|31.600518
220.178.57.190|220.178.57.190|3702667710|3702667710|亚洲|中国|安徽|合肥|肥东|电信|340122|China|CN|117.47128|31.88525
220.178.57.191|220.178.62.41|3702667711|3702668841|亚洲|中国|安徽|合肥|巢湖|电信|340181|China|CN|117.874155|31.600518
220.178.62.42|220.178.62.42|3702668842|3702668842|亚洲|中国|安徽|合肥|肥西|电信|340123|China|CN|117.16845|31.72143
220.178.62.43|220.178.62.65|3702668843|3702668865|亚洲|中国|安徽|合肥||电信|340100|China|CN|117.283042|31.86119
220.178.62.66|220.178.62.66|3702668866|3702668866|亚洲|中国|安徽|合肥|肥西|电信|340123|China|CN|117.16845|31.72143
220.178.62.67|220.178.62.193|3702668867|3702668993|亚洲|中国|安徽|合肥||电信|340100|China|CN|117.283042|31.86119
220.178.62.194|220.178.62.194|3702668994|3702668994|亚洲|中国|安徽|合肥|肥西|电信|340123|China|CN|117.16845|31.72143
220.178.62.195|220.178.76.41|3702668995|3702672425|亚洲|中国|安徽|合肥||电信|340100|China|CN|117.283042|31.86119
220.178.76.42|220.178.76.42|3702672426|3702672426|亚洲|中国|安徽|合肥|长丰|电信|340121|China|CN|117.16549|32.47959
220.178.76.43|220.178.76.133|3702672427|3702672517|亚洲|中国|安徽|合肥||电信|340100|China|CN|117.283042|31.86119
220.178.76.134|220.178.76.134|3702672518|3702672518|亚洲|中国|安徽|合肥|长丰|电信|340121|China|CN|117.16549|32.47959
220.178.76.135|220.178.77.49|3702672519|3702672689|亚洲|中国|安徽|合肥||电信|340100|China|CN|117.283042|31.86119

目标文件部分数据:20090121000132.394251.http.format

20090121000132124542000|117.101.215.133|www.jiayuan.com|/19245971|Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; TencentTraveler 4.0)|http://photo.jiayuan.com/index.php?uidhash=d1c3b69e9b8355a5204474c749fb76ef|__tkist=0; myloc=50%7C5008; myage=2009; PROFILE=14469674%3A%E8%8B%A6%E6%B6%A9%E5%92%96%E5%95%A1%3Am%3Aphotos2.love21cn.com%2F45%2F1b%2F388111afac8195cc5d91ea286cdd%3A1%3A%3Ahttp%3A%2F%2Fimages.love21cn.com%2Fw4%2Fglobal%2Fi%2Fhykj_m.jpg; last_login_time=1232454068; SESSION_HASH=8176b100a84c9a095315f916d7fcbcf10021e3af; RAW_HASH=008a1bc48ff9ebafa3d5b4815edd04e9e7978050; COMMON_HASH=45388111afac8195cc5d91ea286cdd1b; pop_1232093956=1232468896968; pop_time=1232466715734; pop_1232245908=1232469069390; pop_1219903726=1232477601937; LOVESESSID=98b54794575bf547ea4b55e07efa2e9e; main_search:14469674=%7C%7C%7C00; registeruid=14469674; REG_URL_COOKIE=http%3A%2F%2Fphoto.jiayuan.com%2Fshowphoto.php%3Fuid_hash%3D0319bc5e33ba35755c30a9d88aaf46dc%26total%3D6%26p%3D5; click_count=0%2C3363619
20090121000132406516000|117.101.222.68|gg.xiaonei.com|/view.jsp?p=389|Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; CIBA)|http://home.xiaonei.com/Home.do?id=229670724|_r01_=1; __utma=204579609.31669176.1231940225.1232462740.1232467011.145; __utmz=204579609.1231940225.1.1.utmccn=(direct)
20090121000132581311000|115.120.36.118|tj.tt98.com|/tj.htm|Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; TheWorld)|http://www.tt98.com/|
20090121000132864647000|123.197.64.247|cul.sohu.com|/20071227/n254338813_22.shtml|Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; TheWorld)|http://cul.sohu.com/20071227/n254338813_22.shtml|ArticleTab=visit:1; IPLOC=unknown; SUV=0901080709152121; vjuids=832dd37a1.11ebbc5d590.0.b20f858f14e918; club_chat_ircnick=JaabvxC4aaacQ; spanel=%7B%22u%22%3A%22%22%7D; vjlast=1232467312,1232467312,30
20090121000133296729000|222.55.57.176|down.chinaz.com|/|Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; iCafeMedia; TencentTraveler 4.0)||cnzz_a33219=0; vw33219=%3A18167791%3A; sin33219=http%3A//www.itxls.com/wz/wyfx/it.html; rtime=0; ltime=1232464387281; cnzz_eid=6264952-1232464379-http%3A//www.itxls.com/wz/wyfx/it.html
20090121000133331104000|123.197.66.93|www.pkwutai.cn|/down/downLoad-id-45383.html|Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 1.7)|http://www.baidu.com/s?tn=b1ank_pg&ie=gb2312&bs=%C3%C0%C6%BC%B7%FE%D7%B0%B9%DC%C0%ED%C8%ED%BC%FE&sr=&z=&cl=3&f=8&wd=%C6%C6%BD%E2%C3%C0%C6%BC%B7%FE%D7%B0%B9%DC%C0%ED%C8%ED%BC%FE&ct=0|
20090121000133446262000|115.120.12.157|v.ifeng.com|/live/|Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; .NET CLR 2.0.50727; CIBA)|http://www.ifeng.com/|userid=1232466610953_4339; location=186; sclocationid=10002; vjuids=22644b162.11ef4bc1624.0.63ad06717b426; vjlast=1232466614,1232467297,13
20090121000133456256000|115.120.7.240|cqbbs.soufun.com|/3110502342~-1~2118/23004348_23004348.htm|Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; .NET CLR 2.0.50727; CIBA)||new_historysignlist=%u534E%u6DA6%u4E8C%u5341%u56DB%u57CE%7Chttp%3A//cqbbs.soufun.com/board/3110502342/%7C%7C%u9A8F%u9038%u7B2C%u4E00%u6C5F%u5CB8%7Chttp%3A//cqbbs.soufun.com/board/3110169184/%7C%7C%u793E%u533A%u4E4B%u661F%7Chttp%3A//cqbbs.soufun.com/board/sqzx/%7C%7C; SoufunSessionID=2y5xyr45kslc0zbdooqnoo55; viewUser=1; vjuids=-870e9088.11ee89aba57.0.be9c3d988def8; vjlast=1232263101,1232380806,11; new_viewtype=1; articlecolor=#000000; usersms_pop_type=1; articlecount=186; __utma=101868291.755195653.1232450942.1232450942.1232450942.1; __utmz=101868291.1232450942.1.1.utmccn=(referral)
20090121000133586141000|117.101.219.241|12.zgwow.com|/launcher/index.htm|Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)||

需求:

  • 将访问日志中的 IP 地址与 ip.txt 中的 IP 段进行匹配,统计各地理位置的访问次数
  • 展示结果样式: ((经度,纬度,所在城市),次数)

参考答案:

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable

/**
 * Method one: brute-force IP-to-location matching.
 *
 * Reads the access log and the IP-range table, counts how often each distinct
 * client IP occurs, then compares every range against every distinct IP on the
 * driver and prints ((longitude, latitude, city), hits) in descending hit order.
 */
object IPDemoWithScala {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("IPDemoWithScala").setMaster("local[4]")
    val sc = new SparkContext(conf)
    sc.setLogLevel("WARN")

    val accessLog: RDD[String] = sc.textFile("D:\\workplace\\testFile\\ip\\20090121000132.394251.http.format")
    val rangeTable: RDD[String] = sc.textFile("D:\\workplace\\testFile\\ip\\ip.txt")

    // Field 1 of a log line is the client IP in dotted form, e.g. 125.213.100.123
    val clientIps: RDD[String] = accessLog.map(_.split("\\|")(1))

    // Pack the dotted octets into one number: each step shifts the running
    // value one byte to the left and ORs in the next octet.
    val numericIps: RDD[Long] = clientIps.map { dotted =>
      dotted.split("[.]").foldLeft(0L)((packed, octet) => octet.toLong | packed << 8L)
    }

    // (numeric IP, number of log lines carrying it), brought back to the driver.
    val ipCounts: Array[(Long, Int)] = numericIps.map(ip => (ip, 1)).reduceByKey(_ + _).collect

    // A range line looks like
    //   1.0.1.0|1.0.3.255|16777472|16778239|亚洲|中国|福建|福州||电信|350100|China|CN|119.306239|26.075302
    // and becomes ((16777472, 16778239), (119.306239, 26.075302, 福州)).
    val rangeRdd: RDD[((Long, Long), (String, String, String))] =
      rangeTable.map(_.split("\\|")).map { cols =>
        val first = cols(2).toLong
        val last = cols(3).toLong
        val city = cols(7)
        val longitude = cols(13)
        val latitude = cols(14)
        ((first, last), (longitude, latitude, city))
      }

    val ranges: List[((Long, Long), (String, String, String))] = rangeRdd.toLocalIterator.toList

    // Hit counter per location; several ranges may share one location.
    val hits = new mutable.HashMap[(String, String, String), Int]()

    ranges.foreach { case ((first, last), location) =>
      // Counts of all distinct IPs that fall inside this (inclusive) range.
      val matched = ipCounts.collect { case (ip, n) if ip >= first && ip <= last => n }
      if (matched.nonEmpty) {
        hits(location) = hits.getOrElse(location, 0) + matched.sum
      }
    }

    // Ascending sort followed by reverse gives the descending listing.
    val ranking: List[((String, String, String), Int)] = hits.toList.sortBy(_._2).reverse
    ranking.foreach(println)

    sc.stop()
  }
}

运行结果:
Spark-core综合练习-IP匹配_第1张图片

方法二:(通过Spark SQL)

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

/**
 * Method two: IP-to-location matching with Spark SQL.
 *
 * Registers the IP-range table and the numeric client IPs from the access log
 * as temporary views, joins them on range containment and shows the number of
 * hits per (longitude, latitude, city), most frequent first.
 */
object IPLocationWithSparkSQL {

  /** Packs a dotted IPv4 string such as "125.213.100.123" into a single number. */
  private def ipToLong(ip: String): Long =
    ip.split("[.]").foldLeft(0L)((packed, octet) => (packed << 8) | octet.toLong)

  def main(args: Array[String]): Unit = {
    // Only temporary views are queried, so a plain session is enough. Hive
    // support is not requested: it would only add a requirement that the
    // Hive classes are on the classpath.
    val spark = SparkSession.builder()
      .appName("HiveSupportWithScala")
      .master("local")
      .getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")

    try {
      // Load the IP-range table and the access log from the local file system.
      val lines: RDD[String] = sc.textFile("D:/workplace/testFile/ip/ip.txt")
      val lines2: RDD[String] = sc.textFile("D:/workplace/testFile/ip/20090121000132.394251.http.format")

      // Split every range line and turn it into an IpLocation:
      // columns 2/3 are the numeric range bounds, 13/14 the coordinates, 7 the city.
      val rulesRDD: RDD[IpLocation] = lines
        .map(line => line.split("[|]"))
        .map(attr => IpLocation(attr(2).toLong, attr(3).toLong, attr(13), attr(14), attr(7)))

      // Column 1 of a log line is the client IP in dotted form.
      val ipTransfer: RDD[Long] = lines2.map(line => ipToLong(line.split("[|]")(1)))

      import spark.implicits._
      val rulesDF: DataFrame = rulesRDD.toDF()
      val ipTransferDF: DataFrame = ipTransfer.map(x => TempIp(x)).toDF()

      rulesDF.createOrReplaceTempView("iplocation")
      ipTransferDF.createOrReplaceTempView("tempIp")

      // Inner join: an IP that falls into no known range is left out, as in the
      // other two methods. A left join would add a (null, null, null) group.
      val sql: String =
        """select longitude, latitude, city, count(1) as number
          |from tempIp join iplocation
          |  on ip_transfer >= start_ip and ip_transfer <= end_ip
          |group by longitude, latitude, city
          |order by number desc""".stripMargin

      spark.sql(sql).show()
    } finally {
      // Stops the session together with its SparkContext, even when the job fails.
      spark.stop()
    }
  }
}

/**
 * One row of the IP-range table.
 *
 * The field names double as column names of the `iplocation` view and are
 * referenced by the SQL query, so they must not be renamed.
 *
 * @param start_ip  first address of the range, as a number (inclusive)
 * @param end_ip    last address of the range, as a number (inclusive)
 * @param longitude longitude of the location, kept as text
 * @param latitude  latitude of the location, kept as text
 * @param city      city name
 */
case class IpLocation(
  start_ip: Long,
  end_ip: Long,
  longitude: String,
  latitude: String,
  city: String
)

/**
 * A client IP from the access log in numeric form; `ip_transfer` is the
 * column name of the `tempIp` view.
 */
case class TempIp(
  ip_transfer: Long
)

运行结果:
Spark-core综合练习-IP匹配_第2张图片
方法三(利用二分法):

/**
 * Method three: IP-to-location matching with a binary search.
 *
 * Counts how often each distinct client IP occurs in the access log, looks up
 * every distinct IP in the sorted IP-range table by binary search on the
 * driver, and prints ((longitude, latitude, city), hits) in descending hit
 * order followed by the total number of log entries.
 */
object IPDemoWithBinary {

  /** A location as (longitude, latitude, city). */
  private type Location = (String, String, String)

  /** Packs a dotted IPv4 string such as "125.213.100.123" into a single number. */
  private def ipToLong(ip: String): Long =
    ip.split("[.]").foldLeft(0L)((packed, octet) => (packed << 8) | octet.toLong)

  /**
   * Binary search for the range containing `ip`.
   *
   * @param rules (start, end, location) triples with inclusive bounds, sorted
   *              by `start` and assumed not to overlap
   * @param ip    numeric IP to look up
   * @return the location of the containing range, or `None` if no range matches
   */
  private def findLocation(rules: Array[(Long, Long, Location)], ip: Long): Option[Location] = {
    var low = 0
    var high = rules.length - 1
    var found: Option[Location] = None
    // Stop as soon as a range matches; the ranges are disjoint, so there is
    // nothing further to find.
    while (found.isEmpty && low <= high) {
      val middle = (low + high) >>> 1 // overflow-safe midpoint
      val (start, end, location) = rules(middle)
      if (ip < start) high = middle - 1
      else if (ip > end) low = middle + 1
      else found = Some(location)
    }
    found
  }

  def main(args: Array[String]): Unit = {

    val sparkConf = new SparkConf().setAppName("IPDemoWithBinary").setMaster("local[4]")
    val sc = new SparkContext(sparkConf)
    sc.setLogLevel("WARN")

    try {
      val lines1 = sc.textFile("D:\\workplace\\testFile\\ip\\20090121000132.394251.http.format")
      val lines2 = sc.textFile("D:\\workplace\\testFile\\ip\\ip.txt")

      // Column 1 of a log line is the client IP in dotted form. The RDD is
      // used by two actions below, so it is cached.
      val ipTransfer = lines1.map(line => ipToLong(line.split("\\|")(1))).cache()

      // (numeric IP, number of log lines carrying it), brought back to the driver.
      val ipTransfer2: Array[(Long, Int)] = ipTransfer.map(x => (x, 1)).reduceByKey(_ + _).collect()

      // A range line looks like
      //   1.0.1.0|1.0.3.255|16777472|16778239|亚洲|中国|福建|福州||电信|350100|China|CN|119.306239|26.075302
      // and becomes (16777472, 16778239, (119.306239, 26.075302, 福州)).
      // An Array gives constant-time indexing; the explicit sort makes the
      // binary search independent of the order of lines in the file.
      val rules: Array[(Long, Long, Location)] = lines2
        .map(_.split("\\|"))
        .map(cols => (cols(2).toLong, cols(3).toLong, (cols(13), cols(14), cols(7))))
        .collect()
        .sortBy(_._1)

      // Attribute each distinct IP's count to its location; IPs that match no
      // range are dropped. Several ranges may share a location, hence the sum.
      val hitsPerLocation: Map[Location, Int] = ipTransfer2
        .flatMap { case (ip, count) => findLocation(rules, ip).map(location => (location, count)) }
        .groupBy(_._1)
        .map { case (location, hits) => (location, hits.map(_._2).sum) }

      val result: List[(Location, Int)] = hitsPerLocation.toList.sortBy(_._2)(Ordering[Int].reverse)
      result.foreach(println)

      // count() runs on the executors; the log is not collected to the driver.
      println("size --> " + ipTransfer.count())

      // sc.parallelize(result).saveAsTextFile("C:\\Users\\ST\\Desktop\\aaa")
    } finally {
      sc.stop()
    }
  }
}

你可能感兴趣的:(Spark)