Reading data on HDFS in Spark with a custom Hadoop FileInputFormat

For example, take a 300 MB file containing 6 records, each 300/6 = 50 MB. On HDFS the file is split into 3 blocks with a 128 MB block size, so records 3 and 6 straddle block boundaries, as shown in the figure below.

[Figure 1: a 300 MB file laid out across three 128 MB HDFS blocks, with records 3 and 6 crossing block boundaries]

To process this file, Spark launches an executor on each of the data nodes holding a block, and each executor reads its own portion of the data (see the short sketch after the list below):

  1. Executor 1 on Block 1 reads the first 3 records: Line 1 and Line 2 locally, and Line 3 partly locally and partly remotely.
  2. Executor 2 on Block 2 reads the last 3 records: Line 4 and Line 5 locally, and Line 6 partly locally and partly remotely.
  3. Executor 3 on Block 3 reads no records.
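
To make this concrete, here is a minimal sketch of the default behaviour (the HDFS path and the 128 MB block size are only assumptions): reading the file with the stock textFile, which uses Hadoop's TextInputFormat, yields one partition per block, and each partition's record reader finishes the record that spills over into the next block.

  // Minimal sketch of the default behaviour described above.
  // The HDFS path is hypothetical; assumes 128 MB blocks and default settings.
  val lines = sparkContext.textFile("hdfs:///data/file-300mb.txt")
  println(lines.getNumPartitions)  // expect 3 -- one partition per block
  println(lines.count())           // expect 6 -- records are not cut at block boundaries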

For the details behind this explanation, see:

https://hadoopi.wordpress.com/2013/05/27/understand-recordreader-inputsplit/

https://www.ae.be/blog-en/ingesting-data-spark-using-custom-hadoop-fileinputformat/

https://blog.csdn.net/tanggao1314/article/details/51307642

 

Now consider the following situation.

Spark relies on the InputFormat to decide how to read records. By default each line is one record, so Spark scans the data for newline characters to delimit records; you can also configure a custom delimiter via textinputformat.record.delimiter.
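
As a quick illustration, here is a minimal sketch of setting a custom delimiter on the standard TextInputFormat; the "||" delimiter and the file name are placeholder values.

  // Sketch: custom record delimiter with the stock TextInputFormat.
  // The delimiter "||" and "data.txt" are placeholders.
  import org.apache.hadoop.conf.Configuration
  import org.apache.hadoop.io.{LongWritable, Text}
  import org.apache.hadoop.mapreduce.lib.input.TextInputFormat

  val delimConf = new Configuration(sparkContext.hadoopConfiguration)
  delimConf.set("textinputformat.record.delimiter", "||")
  val records = sparkContext
    .newAPIHadoopFile("data.txt", classOf[TextInputFormat],
      classOf[LongWritable], classOf[Text], delimConf)
    .map { case (_, text) => text.toString }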

However, if a newline inside a record is escaped, Spark cannot tell. The chat log below contains two records, but Spark will mistakenly read three, because it does not recognize the escaped newline:

{from:"Gert", to:"Melissa", message:"Want to have dinner?"}
 {from:"Melissa", to:"Gert", message:"Ok\
 How about Italian?"}

Solution: write a custom InputFormat (with its own RecordReader).

The main Scala code fragments are listed below:

 

  import org.apache.hadoop.conf.Configuration
  import org.apache.hadoop.io.{LongWritable, Text}
  import org.apache.hadoop.mapreduce.{InputSplit, RecordReader, TaskAttemptContext}
  import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
  import org.apache.hadoop.util.LineReader

  // instead of the default reader: val rdd = sparkContext.textFile("data.txt")
  val conf = new Configuration(sparkContext.hadoopConfiguration)
  val rdd = sparkContext.newAPIHadoopFile("data.txt", classOf[MyFileInputFormat],
    classOf[LongWritable], classOf[Text], conf)

  class MyFileInputFormat extends FileInputFormat[LongWritable, Text] {
    override def createRecordReader(split: InputSplit, context: TaskAttemptContext):
    RecordReader[LongWritable, Text] = new MyRecordReader()
  }

  class MyRecordReader() extends RecordReader[LongWritable, Text] {
    var start, end, pos = 0L
    var reader: LineReader = null
    var key = new LongWritable
    var value = new Text

    override def initialize(inputSplit: InputSplit, context: TaskAttemptContext): Unit = {
      // split position in data (start one byte earlier to detect if
      // the split starts in the middle of a previous record)
      val split = inputSplit.asInstanceOf[FileSplit]
      start = 0L.max(split.getStart - 1)
      end = start + split.getLength
      // open a stream to the data, pointing to the start of the split
      val stream = split.getPath.getFileSystem(context.getConfiguration)
        .open(split.getPath)
      stream.seek(start)
      reader = new LineReader(stream, context.getConfiguration)
      // if the split starts at a newline, we want to start yet another byte
      // earlier to check if the newline was escaped or not
      val firstByte = stream.readByte().toInt
      if(firstByte == '\n')
        start = 0L.max(start - 1)
      stream.seek(start)

      if(start != 0)
        skipRemainderFromPreviousSplit(reader)
    }

    def skipRemainderFromPreviousSplit(reader: LineReader): Unit = {
      var readAnotherLine = true
      while(readAnotherLine) {
        // read next line
        val buffer = new Text()
        start += reader.readLine(buffer, Integer.MAX_VALUE, Integer.MAX_VALUE)
        pos = start
        // detect if delimiter was escaped
        readAnotherLine = buffer.getLength >= 1 && // something was read
        buffer.charAt(buffer.getLength - 1) == '\\' && // newline was escaped
        pos <= end // seek head hasn't passed the split
      }
    }
    override def nextKeyValue(): Boolean = {
      key.set(pos)
      // read newlines until an unescaped newline is read
      var lastNewlineWasEscaped = false
      while (pos < end || lastNewlineWasEscaped) {
        // read next line
        val buffer = new Text
        pos += reader.readLine(buffer, Integer.MAX_VALUE, Integer.MAX_VALUE)
        // append newly read data to previous data if necessary
        value = if(lastNewlineWasEscaped) new Text(value.toString + "\n" + buffer.toString) else buffer
        // detect if delimiter was escaped
        lastNewlineWasEscaped = buffer.charAt(buffer.getLength - 1) == '\\'
        // let Spark know that a key-value pair is ready!
        if(!lastNewlineWasEscaped)
          return true
      }
      // end of split reached?
      return false
    }
    // standard RecordReader accessors
    override def getCurrentKey: LongWritable = key
    override def getCurrentValue: Text = value
    override def getProgress: Float =
      if (end == start) 0.0f else ((pos - start).toFloat / (end - start)).min(1.0f)
    override def close(): Unit = if (reader != null) reader.close()
  }

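With MyFileInputFormat plugged in via newAPIHadoopFile as shown at the top of the snippet, the two-record chat log above is read back as two records. A quick sanity check could look like this (sketch only):

  // Sanity check (sketch): the escaped newline stays inside a single record.
  rdd.map { case (_, text) => text.toString }.collect().foreach(println)
  // {from:"Gert", to:"Melissa", message:"Want to have dinner?"}
  // {from:"Melissa", to:"Gert", message:"Ok\
  //  How about Italian?"}

The record reader can also be exercised directly with a unit test that feeds it a hand-crafted FileSplit: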

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.FileSplit
import org.scalatest.FunSuite
import scala.collection.mutable

class MyRecordReaderTests extends FunSuite {
  // other tests
  ...
  test("I should be able to read an input split") {
    val split = new FileSplit(path, 2750, 4085, null)
    // setup
    val context = setupTaskAttemptContextImpl()
    val reader = new MyRecordReader()
    // initialize
    reader.initialize(split, context)
    // read all records in split
    val actual = new mutable.ListBuffer[(LongWritable, Text)]()
    while(reader.nextKeyValue())
      // copy key/value, since the reader may reuse its key/value objects
      actual += ((new LongWritable(reader.getCurrentKey.get), new Text(reader.getCurrentValue)))
    // assert that the data is what you expect
    ...
  }
}

 
