A Spark data cleansing example

  • A case based on real production work
    • What the exercise covers
    • Program structure
    • Code

A case based on real production work

What the exercise covers

  1. Parsing JSON strings (the sketch right after this list shows the message layout the code assumes)
  2. mapPartitions
  3. Accumulators
  4. Working with Scala collections
  5. Writing compressed output to multiple directories
  6. Custom partitioning
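
A sample input record is not shown in the article, but the parsing code further down implies a doubly nested layout: each Kafka message is a JSON object whose "body" field is itself a JSON string; body carries t1/t2/t3 plus "reqData", yet another JSON string that maps every table name to either a single record (a JSON object) or several records (a JSON array). The minimal sketch below builds such a message with Gson so the shape is concrete; the field name "f1" and all values are made up for illustration, while the table names "表1" and "表2" match the ones used in the partitioner.

package pers.machi.sparkRddDataCleansing

import com.google.gson.{Gson, JsonArray, JsonObject}

object MessageShapeSketch {
  def main(args: Array[String]): Unit = {
    val gson = new Gson()

    // One record for "表1" (single-object case), two records for "表2" (array case)
    val singleRecord = new JsonObject()
    singleRecord.addProperty("f1", "a")

    val recordA = new JsonObject()
    recordA.addProperty("f1", "b")
    val recordB = new JsonObject()
    recordB.addProperty("f1", "c")
    val recordArray = new JsonArray()
    recordArray.add(recordA)
    recordArray.add(recordB)

    val reqData = new JsonObject()
    reqData.add("表1", singleRecord)   // table name -> one record
    reqData.add("表2", recordArray)    // table name -> several records

    val body = new JsonObject()
    body.addProperty("t1", "v1")
    body.addProperty("t2", "v2")
    body.addProperty("t3", "v3")
    body.addProperty("reqData", gson.toJson(reqData))   // nested JSON kept as a string

    val kafkaMessage = new JsonObject()
    kafkaMessage.addProperty("body", gson.toJson(body)) // nested JSON kept as a string

    println(gson.toJson(kafkaMessage))
  }
}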

Program structure

(Figure 1: program structure diagram)

Code

package pers.machi.sparkRddDataCleansing

import java.util

import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat
import org.apache.spark.util.AccumulatorV2

import scala.collection.JavaConversions._
import org.apache.spark.{SparkConf, SparkContext}
import com.google.gson._
import org.apache.hadoop.io.compress.GzipCodec

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

object SparkRddDataCleansing {
  def main(args: Array[String]): Unit = {


    val path = "file:///d:/0000000000/1111111111/jsonData*"
    val conf = new SparkConf().setAppName("SparkRddDataCleansing").setMaster("local[1]")
    val sc = new SparkContext(conf)

    val sourceData = sc.textFile(path)

    val metadata = new MetadataAccumulatorV2()
    sc.register(metadata, "metadata")

    sourceData
      .mapPartitions(partitionIterator => {

        /* Field metadata collected for this partition: table name -> set of field names */
        val partitionMetadata = new mutable.HashMap[String, mutable.HashSet[String]]()
        /* All cleaned (tableName, recordJson) pairs produced from this partition */
        val partitionRecordArrayBuffer = new mutable.ArrayBuffer[(String, String)]()

        var message: String = ""
        while (partitionIterator.hasNext) {
          message = partitionIterator.next()



          // Parses one Kafka message and returns its (tableName, recordJson) pairs;
          // the preKeys values (t1/t2/t3) from the body are copied onto every record.
          val dataProcessing: String => Array[(String, String)] = (kafkaMessage: String) => {
            val gson = new GsonBuilder()
              .serializeNulls()
              .create()
            val parser: JsonParser = new JsonParser()

            var MessageRecordArrayBuffer = new mutable.ArrayBuffer[(String, String)]()

            val kafkaMessageObject = parser.parse(kafkaMessage).getAsJsonObject
            val bodyObject = parser.parse(kafkaMessageObject.get("body").getAsString).getAsJsonObject

            val preKeys = Seq("t1", "t2", "t3")

            var preKeysValuesMap = new mutable.HashMap[String, String]()
            var records: JsonElement = null
            var recordObject: JsonObject = null
            var recordsArray: JsonArray = null

            for (key <- preKeys) {
              preKeysValuesMap += ((key, bodyObject.get(key).getAsString))
            }

            val reqDataObject = parser.parse(bodyObject.get("reqData").getAsString).getAsJsonObject
            for (tableRecordsPair <- reqDataObject.entrySet()) {
              records = tableRecordsPair.getValue
              if (records.isJsonObject) {
                recordObject = records.getAsJsonObject
                for (key <- preKeysValuesMap.keySet) {
                  recordObject.addProperty(key, preKeysValuesMap.get(key).get)
                }
                MessageRecordArrayBuffer += ((tableRecordsPair.getKey, gson.toJson(recordObject)))

                if (partitionMetadata.contains(tableRecordsPair.getKey)) {
                  for (fieldValuePair <- recordObject.entrySet()) {
                    partitionMetadata(tableRecordsPair.getKey) += fieldValuePair.getKey
                  }
                } else {
                  val tableFieldsSet = new mutable.HashSet[String]()
                  for (fieldValuePair <- recordObject.entrySet()) {
                    tableFieldsSet += fieldValuePair.getKey
                  }
                  partitionMetadata += ((tableRecordsPair.getKey, tableFieldsSet))
                }
              }
              else if (records.isJsonArray) {
                recordsArray = records.getAsJsonArray
                for (i <- 0 until recordsArray.size()) {
                  recordObject = recordsArray.get(i).getAsJsonObject
                  for (key <- preKeysValuesMap.keySet) {
                    recordObject.addProperty(key, preKeysValuesMap.get(key).get)
                  }
                  MessageRecordArrayBuffer += ((tableRecordsPair.getKey, gson.toJson(recordObject)))

                  if (partitionMetadata.contains(tableRecordsPair.getKey)) {
                    for (fieldValuePair <- recordObject.entrySet()) {
                      // += on a mutable set ignores duplicates, so no separate contains check is needed
                      partitionMetadata(tableRecordsPair.getKey) += fieldValuePair.getKey
                    }
                  } else {
                    val tableFieldsSet = new mutable.HashSet[String]()
                    for (fieldValuePair <- recordObject.entrySet()) {
                      tableFieldsSet += fieldValuePair.getKey
                    }
                    partitionMetadata += ((tableRecordsPair.getKey, tableFieldsSet))
                  }
                }
              }
            }

            MessageRecordArrayBuffer.toArray
          }

          partitionRecordArrayBuffer ++= dataProcessing(message)
        }



        /* Add this partition's metadata to the accumulator */
        metadata.add(partitionMetadata)
        partitionRecordArrayBuffer.iterator
      })
      .partitionBy(new TablePartitioner(3))
      .coalesce(1)
      .saveAsHadoopFile("file:///d:/0000000000/2222222222",
        classOf[String],
        classOf[String],
        classOf[TableMultipleTextOutputFormat],
        classOf[GzipCodec])

    /* Print the accumulated metadata on the driver: table name -> tab-separated field names */
    for (table <- metadata.value.keySet) {
      print(table + ": ")
      println(metadata.value(table).mkString("\t"))
    }

    sc.stop()
  }
}
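
MetadataAccumulatorV2, registered in the driver above, accumulates one HashMap[String, HashSet[String]] (table name -> the field names seen so far) per partition:
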
package pers.machi.sparkRddDataCleansing

import org.apache.spark.util.AccumulatorV2

import scala.collection.mutable

class MetadataAccumulatorV2 extends AccumulatorV2[mutable.HashMap[String, mutable.HashSet[String]], mutable.HashMap[String, mutable.HashSet[String]]] {
    var metadataCollection = new mutable.HashMap[String, mutable.HashSet[String]]()

    override def isZero: Boolean = metadataCollection.isEmpty

    override def reset(): Unit = {
        // clear() empties the map in place; the original call to .empty built a new map and discarded it
        metadataCollection.clear()
    }

    override def add(v: mutable.HashMap[String, mutable.HashSet[String]]): Unit = {
        for (table <- v.keySet) {
            if (metadataCollection.contains(table)) {
                // ++= unions in place; a bare ++ would build a new set and discard it
                metadataCollection(table) ++= v(table)
            } else {
                metadataCollection += ((table, v(table)))
            }
        }
        // Debug output: tables and field sets currently held by this accumulator
        println(metadataCollection.keySet.mkString("$"))
        println(metadataCollection.values.mkString("#"))
    }

    override def merge(other: AccumulatorV2[mutable.HashMap[String, mutable.HashSet[String]], mutable.HashMap[String, mutable.HashSet[String]]]): Unit = other match {
        case o: MetadataAccumulatorV2 =>
            for (table <- o.metadataCollection.keySet) {
                print(metadataCollection.keySet.mkString("$") + "XX")  // debug output
                metadataCollection.get(table) match {
                    // Known table: union the other accumulator's field names into our existing set
                    case Some(fields) => fields ++= o.metadataCollection(table)
                    // New table: adopt the other accumulator's field set
                    case None => metadataCollection += ((table, o.metadataCollection(table)))
                }
            }
        case _ => throw new UnsupportedOperationException(
            s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
    }

    override def value: mutable.HashMap[String, mutable.HashSet[String]] = metadataCollection

    override def copy(): AccumulatorV2[mutable.HashMap[String, mutable.HashSet[String]], mutable.HashMap[String, mutable.HashSet[String]]] = {
        // Return an independent copy; returning `this` breaks Spark's copyAndReset contract
        val newAcc = new MetadataAccumulatorV2()
        for ((table, fields) <- metadataCollection) newAcc.metadataCollection += ((table, fields.clone()))
        newAcc
    }
}
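
A quick way to sanity-check the accumulator outside of Spark is to drive add and merge by hand. This is only a sketch; the table and field names below are made up:

package pers.machi.sparkRddDataCleansing

import scala.collection.mutable

object MetadataAccumulatorCheck {
  def main(args: Array[String]): Unit = {
    val acc = new MetadataAccumulatorV2()
    acc.add(mutable.HashMap("表1" -> mutable.HashSet("f1", "f2")))

    val otherPartition = new MetadataAccumulatorV2()
    otherPartition.add(mutable.HashMap("表1" -> mutable.HashSet("f3"), "表2" -> mutable.HashSet("g1")))

    acc.merge(otherPartition)
    println(acc.value)  // expected: 表1 -> {f1, f2, f3}, 表2 -> {g1}
  }
}

Each table's records are written into their own directory by the following MultipleTextOutputFormat subclass:
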
package pers.machi.sparkRddDataCleansing

import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat

class TableMultipleTextOutputFormat extends MultipleTextOutputFormat[Any, Any] {

  // Drop the key from the output line: only the record JSON (the value) gets written
  override def generateActualKey(key: Any, value: Any): Any =
    NullWritable.get()

  // Route each key (table name) into its own subdirectory under the output path
  override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String =
    key.toString + "/" + name

}
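
With the GzipCodec and the coalesce(1) used above, the job should end up with roughly one compressed part file per table directory under the output path, along these lines (an illustrative layout; the actual part-file names are chosen by Hadoop):

2222222222/
    表1/part-00000.gz
    表2/part-00000.gz
    ...

Finally, the custom partitioner that groups records by table name before the write: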

package pers.machi.sparkRddDataCleansing

import org.apache.spark.Partitioner

class TablePartitioner(partitions: Int) extends Partitioner {
    override def numPartitions: Int = partitions

    // "表1" and "表2" are table names that appear as keys in the cleaned data;
    // every other table falls into the last partition
    override def getPartition(key: Any): Int = key match {
        case "表1" => 0
        case "表2" => 1
        case _ => 2
    }
}
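
A quick check of how keys map to partitions; the table name "表999" is made up here to stand for "any other table":

package pers.machi.sparkRddDataCleansing

object TablePartitionerCheck {
  def main(args: Array[String]): Unit = {
    val partitioner = new TablePartitioner(3)
    println(partitioner.getPartition("表1"))   // 0
    println(partitioner.getPartition("表2"))   // 1
    println(partitioner.getPartition("表999")) // 2 (every other table)
  }
}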
