Spark: read a CSV file, wrap each row in a case class, convert the records to JSON, and write the result to the local file system

package scala

import com.google.gson.Gson
import org.apache.spark.sql.{Row, SaveMode, SparkSession}

object CsvParse {

  case class Data(callerip: String,callere164: String, calleee164: String, starttime: String,
                  stoptime: String, holdtime: String, feetime: String, fee: String, 
                  enddirection: String, endreason: String)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("SparkCsv").getOrCreate()
    import spark.implicits._
    val df = spark.read.option("header", "true").csv("D:\\tmp\\home2").select("callerip", "callere164", "calleee164", "starttime", "stoptime",
      "holdtime", "feetime", "fee", "enddirection", "endreason")

    df.mapPartitions(transform _).mapPartitions { iter =>
      // serialize each Data record to a JSON string, creating one Gson instance per partition
      val gson = new Gson()
      iter.map(d => gson.toJson(d))
    }.repartition(10).write.mode(SaveMode.Overwrite).text("C:\\Users\\Administrator\\Desktop\\spark\\")
  }


  def transform(iterator: Iterator[Row]): Iterator[Data] = {
    // calleee164 is a phone number that needs two cleaning steps:
    // 1. keep only 12-character values whose last 11 digits form a number starting with 1
    // 2. drop the leading character with substring(1)
    val pattern = ".[1][0-9]{10}".r
    iterator.flatMap { row =>
      val calleee164 = row.getAs[String]("calleee164")
      if (calleee164 != null && calleee164.length == 12) {
        pattern.findFirstIn(calleee164) match {
          case None =>
            println("no match for: " + calleee164)
            None
          case Some(value) =>
            val telnumber = value.substring(1)
            Some(Data(row.getAs("callerip"), row.getAs("callere164"), telnumber,
              row.getAs("starttime"), row.getAs("stoptime"), row.getAs("holdtime"),
              row.getAs("feetime"), row.getAs("fee"), row.getAs("enddirection"),
              row.getAs("endreason")))
        }
      } else None
    }
  }

}
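
For comparison, here is a minimal sketch of the same output step using Spark's built-in JSON writer instead of Gson. It assumes the df, transform, and output path from the listing above; since Data is a case class, the resulting Dataset can be written as JSON lines directly.

    // Minimal sketch, assuming df, transform and the output path from the listing above:
    // a Dataset of case classes can be written as JSON lines without Gson.
    val cleaned = df.mapPartitions(transform _)          // Dataset[Data], as above
    cleaned.repartition(10)
      .write.mode(SaveMode.Overwrite)
      .json("C:\\Users\\Administrator\\Desktop\\spark\\") // one JSON object per line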
