目标:找出IP范围存在重叠(交集)的记录
数据:
id,ip_start,ip_end,longitude,latitude,province,city,unit,operator,type,name,network_area,number,raw_data
***--7845-46f2-***--cce40f54d449,1******3,1******0,,,兵团,第六师,*公安局,局域网计算机,1,,,,10.*.*.1-10.*.*.254
***--e35b-4758-***--5c204a7cc2f1,17******2,1******5,,,湖南,**,*,综合用途,1,,,,10.*.*.0-10.*.*.255
***-1a1a-4159-***--6e76a8e504c8,1******5,1******0,,,浙江,*,监所管理支队,电脑接入,1,,,,10.*.*.1-10.*.*.126
代码:
package com.doctorai.dpnice.testdemo
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import scala.collection.mutable.ArrayBuffer
/**
 * Prints every IP range from the `raw_data` column of a CSV file that overlaps
 * at least one other range in the same file.
 *
 * Each `raw_data` value is expected to look like `a.b.c.d-e.f.g.h`. Ranges are
 * grouped by the pair (first three octets of the start address, first three
 * octets of the end address) and overlap is only searched for inside a group.
 *
 * NOTE(review): because of this grouping, two ranges that overlap but whose
 * start or end addresses fall in different /24 blocks are not reported.
 *
 * Rows whose `raw_data` is null, is not exactly two dash-separated IPv4
 * addresses, contains a non-numeric or out-of-range octet, or has a start
 * address greater than its end address are skipped.
 *
 * Usage: `IPIntersectionFilter [csvPath]`. When no path is given,
 * [[IPIntersectionFilter.DefaultInputPath]] is read.
 *
 * @author DPn!ce, 2018-08-21 1:46 PM
 */
object IPIntersectionFilter {

  import scala.util.Try

  /** CSV file read when no path is passed on the command line. */
  val DefaultInputPath: String = "C:\\Users\\Administrator\\Desktop\\intranet_ip.csv"

  /**
   * Program entry point. A plain `main` method is used instead of `scala.App`
   * because the Spark documentation warns that subclasses of `scala.App` may
   * not work correctly.
   *
   * @param args optional; `args(0)` is the path of the CSV file to analyse
   */
  def main(args: Array[String]): Unit = {
    val inputPath = args.headOption.getOrElse(DefaultInputPath)
    val sparkSession = SparkSession.builder
      .master("local[1]")
      .appName("IPIntersectionFilter")
      .getOrCreate()
    try {
      val data: DataFrame = sparkSession.read
        .format("com.databricks.spark.csv")
        .option("header", "true")
        .load(inputPath)
      val rawDataRdd: RDD[Row] = data.select("raw_data").rdd

      rawDataRdd
        // Malformed rows yield no element instead of failing the whole job.
        .flatMap(row => parseRange(row.getAs[String](0)).toList)
        .map(range => (blockKey(range), range))
        // All ranges of a group are needed together: pairwise reduction cannot
        // remember which individual ranges took part in an overlap.
        .groupByKey()
        .flatMap { case (_, ranges) => overlapping(ranges.toSeq) }
        .map(formatRange)
        .foreach(println(_))
    } finally {
      // Release the session even when reading or processing fails.
      sparkSession.stop()
    }
  }

  /**
   * Converts a dotted IPv4 address to its numeric value.
   *
   * @param ip text such as `10.135.147.128`; surrounding whitespace is ignored
   * @return the address as a number, or `None` unless `ip` consists of exactly
   *         four octets that each parse as an integer in `0..255`
   */
  private def ipToLong(ip: String): Option[Long] = {
    val parts = ip.trim.split("\\.")
    val octets = parts.flatMap { part =>
      Try(part.toInt).toOption.filter(value => value >= 0 && value <= 255).toList
    }
    if (parts.length == 4 && octets.length == 4) {
      Some(octets.foldLeft(0L)((acc, octet) => (acc << 8) | octet))
    } else {
      None
    }
  }

  /** Formats a numeric IPv4 address back to dotted notation. */
  private def longToIp(value: Long): String =
    Seq(24, 16, 8, 0).map(shift => (value >> shift) & 0xFF).mkString(".")

  /**
   * Parses one `raw_data` cell.
   *
   * @param raw text such as `10.135.147.128-10.135.147.255`; may be null
   * @return `(start, end)` as numeric addresses with `start <= end`, or `None`
   *         when the cell is null or malformed
   */
  private def parseRange(raw: String): Option[(Long, Long)] =
    Option(raw).map(_.split("-")).flatMap {
      case Array(first, last) =>
        for {
          start <- ipToLong(first)
          end <- ipToLong(last)
          if start <= end
        } yield (start, end)
      case _ => None
    }

  /** Grouping key: the first three octets of the start and of the end address. */
  private def blockKey(range: (Long, Long)): (Long, Long) =
    (range._1 >>> 8, range._2 >>> 8)

  /**
   * Selects the ranges that share at least one address with another range of
   * the same collection. Both endpoints are inclusive, so two ranges that only
   * touch at one address count as overlapping.
   *
   * @param ranges `(start, end)` pairs with `start <= end`
   * @return the overlapping ranges ordered by start address; a range that
   *         occurs several times in `ranges` is returned once per occurrence
   */
  private def overlapping(ranges: Seq[(Long, Long)]): Seq[(Long, Long)] = {
    val sorted = ranges.sorted.toVector
    // maxEndBefore(i) is the largest end address among sorted(0 until i).
    val maxEndBefore = sorted.scanLeft(Long.MinValue)((acc, range) => math.max(acc, range._2))
    sorted.indices.collect {
      case i
          if sorted(i)._1 <= maxEndBefore(i) || // reaches into an earlier range
            // After sorting, a range overlaps some later range exactly when it
            // overlaps its immediate successor.
            (i + 1 < sorted.length && sorted(i + 1)._1 <= sorted(i)._2) =>
        sorted(i)
    }
  }

  /** Renders a range as `a.b.c.d-e.f.g.h`. */
  private def formatRange(range: (Long, Long)): String =
    s"${longToIp(range._1)}-${longToIp(range._2)}"
}