Spark:找出相互重叠(相交或包含)的 IP 范围

目标:在所有记录中找出与其他记录存在重叠(相交或包含关系)的 IP 范围

数据:

id,ip_start,ip_end,longitude,latitude,province,city,unit,operator,type,name,network_area,number,raw_data
***--7845-46f2-***--cce40f54d449,1******3,1******0,,,兵团,第六师,*公安局,局域网计算机,1,,,,10.*.*.1-10.*.*.254
***--e35b-4758-***--5c204a7cc2f1,17******2,1******5,,,湖南,**,*,综合用途,1,,,,10.*.*.0-10.*.*.255
***-1a1a-4159-***--6e76a8e504c8,1******5,1******0,,,浙江,*,监所管理支队,电脑接入,1,,,,10.*.*.1-10.*.*.126

代码:

package com.doctorai.dpnice.testdemo

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import scala.collection.mutable.ArrayBuffer

/**
  * @author DPn!ce   date 2018 08 21 下午 1:46
  */
/**
  * Finds intranet IP ranges (from the `raw_data` CSV column, e.g.
  * "10.1.2.1-10.1.2.254") that overlap — intersect or contain one another —
  * among rows sharing the same three-octet prefix pair.
  *
  * Fixes over the original:
  *  - `reduceByKey` was replaced by `groupByKey` + pairwise comparison: the
  *    old merge function was not associative (its output format "min-max-x-y"
  *    differed from its input format "min-max"), so three or more rows with
  *    the same prefix produced wrong results.
  *  - Overlap is now checked symmetrically (the old code only tested
  *    `yMin > xMax`, but reduce argument order is arbitrary).
  *  - Malformed / null `raw_data` rows are skipped instead of throwing.
  *  - The SparkSession is stopped, and results are collected to the driver
  *    before printing (a distributed `foreach(println)` prints on executors).
  */
object IPIntersectionFilter {

  /**
    * Parses a raw range such as "10.1.2.1-10.1.2.254" into
    * (prefix key "10.1.2-10.1.2", (startLastOctet, endLastOctet)).
    *
    * @param raw value of the raw_data column; may be null for empty CSV cells
    * @return None for null or malformed input instead of throwing
    */
  private[testdemo] def parseRawRange(raw: String): Option[(String, (Int, Int))] = {
    // Helper: non-empty, digits-only string — safe to call .toInt on.
    def isNum(s: String): Boolean = s.nonEmpty && s.forall(_.isDigit)
    for {
      s <- Option(raw)
      parts = s.split("-")
      if parts.length == 2
      f = parts(0).split("\\.")
      l = parts(1).split("\\.")
      if f.length == 4 && l.length == 4 && isNum(f(3)) && isNum(l(3))
    } yield {
      // Key on both three-octet prefixes so "a.b.c.x-a.b.d.y" style ranges
      // only ever compare against ranges with the same prefix pair.
      val key = s"${f(0)}.${f(1)}.${f(2)}-${l(0)}.${l(1)}.${l(2)}"
      (key, (f(3).toInt, l(3).toInt))
    }
  }

  /** Two inclusive integer ranges overlap iff each one starts no later than the other ends. */
  private[testdemo] def overlaps(a: (Int, Int), b: (Int, Int)): Boolean =
    a._1 <= b._2 && b._1 <= a._2

  def main(args: Array[String]): Unit = {
    // Input path may be supplied on the command line; defaults to the original hard-coded file.
    val inputPath = args.headOption.getOrElse("C:\\Users\\Administrator\\Desktop\\intranet_ip.csv")

    val spark = SparkSession.builder
      .master("local[1]")
      .appName("IPIntersectionFilter")
      .getOrCreate()
    try {
      val data: DataFrame = spark.read.option("header", "true").csv(inputPath)

      // (prefixKey, (lastOctetStart, lastOctetEnd)) for every well-formed row.
      val parsed: RDD[(String, (Int, Int))] =
        data.select("raw_data").rdd.flatMap(r => parseRawRange(r.getAs[String](0)))

      // For every prefix, compare all range pairs and emit both full ranges of
      // each overlapping pair, e.g. "10.135.147.128-10.135.147.255".
      val overlapping: RDD[String] = parsed.groupByKey().flatMap { case (key, ranges) =>
        val prefix = key.split("-")(0)
        val rs = ranges.toIndexedSeq
        for {
          i <- rs.indices
          j <- (i + 1) until rs.size
          if overlaps(rs(i), rs(j))
          (lo, hi) <- Seq(rs(i), rs(j))
        } yield s"$prefix.$lo-$prefix.$hi"
      }

      // Collect before printing: this is a small local demo job, and printing
      // inside a distributed foreach would happen on the executors.
      overlapping.collect().foreach(println)
    } finally {
      spark.stop()
    }
  }
}

 

你可能感兴趣的:(spark,scala,ip)