Looking for the source of option in IDEA turned up nothing; clicking into csv and reading its source shows that it looks up whatever settings were passed through option. The source below documents each supported key.
e.g.:
sep sets the field separator for the text...
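Before walking through that source, here is a minimal sketch of how such a key is set from the caller's side (the SparkSession name spark and the file path are assumptions, not taken from the source below):

// Minimal sketch: every .option(key, value) pair lands in the reader's extraOptions map,
// which the csv() method below turns into a CSVOptions instance.
val df = spark.read
  .option("sep", ";")        // use ';' instead of the default ',' as the field separator
  .option("header", "true")  // treat the first line as column names
  .csv("people.csv")         // hypothetical path

df.printSchema()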
def csv(csvDataset: Dataset[String]): DataFrame = {
  val parsedOptions: CSVOptions = new CSVOptions(
    extraOptions.toMap,
    sparkSession.sessionState.conf.sessionLocalTimeZone)
  val filteredLines: Dataset[String] =
    CSVUtils.filterCommentAndEmpty(csvDataset, parsedOptions)
  val maybeFirstLine: Option[String] = filteredLines.take(1).headOption

  val schema = userSpecifiedSchema.getOrElse {
    TextInputCSVDataSource.inferFromDataset(
      sparkSession,
      csvDataset,
      maybeFirstLine,
      parsedOptions)
  }

  verifyColumnNameOfCorruptRecord(schema, parsedOptions.columnNameOfCorruptRecord)
  val actualSchema =
    StructType(schema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord))

  val linesWithoutHeader: RDD[String] = maybeFirstLine.map { firstLine =>
    filteredLines.rdd.mapPartitions(CSVUtils.filterHeaderLine(_, firstLine, parsedOptions))
  }.getOrElse(filteredLines.rdd)

  val parsed = linesWithoutHeader.mapPartitions { iter =>
    val rawParser = new UnivocityParser(actualSchema, parsedOptions)
    val parser = new FailureSafeParser[String](
      input => Seq(rawParser.parse(input)),
      parsedOptions.parseMode,
      schema,
      parsedOptions.columnNameOfCorruptRecord)
    iter.flatMap(parser.parse)
  }
  sparkSession.internalCreateDataFrame(parsed, schema, isStreaming = csvDataset.isStreaming)
}
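For reference, a small usage sketch of the csv(csvDataset: Dataset[String]) overload shown above (assumes an existing SparkSession named spark; the sample rows are made up):

import org.apache.spark.sql.Dataset
import spark.implicits._

// Each element of the Dataset is parsed as one CSV record by the overload above.
val lines: Dataset[String] = Seq("name,age", "Alice,29", "Bob,31").toDS()

val df = spark.read
  .option("header", "true")      // the first element becomes the column names
  .option("inferSchema", "true") // infer column types instead of defaulting to strings
  .csv(lines)

df.show()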
/**
* Loads CSV files and returns the result as a `DataFrame`.
*
* This function will go through the input once to determine the input schema if `inferSchema`
* is enabled. To avoid going through the entire data once, disable the `inferSchema` option or
* specify the schema explicitly using `schema`.
*
* You can set the following CSV-specific options to deal with CSV files:
*
* - `sep` (default `,`): sets a single character as a separator for each
* field and value.
* - `encoding` (default `UTF-8`): decodes the CSV files by the given encoding
* type.
* - `quote` (default `"`): sets a single character used for escaping quoted values where
* the separator can be part of the value. If you would like to turn off quotations, set
* an empty string rather than `null`. This behaviour is different from
* `com.databricks.spark.csv`.
* - `escape` (default `\`): sets a single character used for escaping quotes inside
* an already quoted value.
* - `charToEscapeQuoteEscaping` (default `escape` or `\0`): sets a single character used for
* escaping the escape for the quote character. The default value is the escape character when
* the escape and quote characters are different, and `\0` otherwise.
* - `comment` (default empty string): sets a single character used for skipping lines
* beginning with this character. By default, it is disabled.
* - `header` (default `false`): uses the first line as names of columns.
* - `inferSchema` (default `false`): infers the input schema automatically from data. It
* requires one extra pass over the data.
* - `ignoreLeadingWhiteSpace` (default `false`): a flag indicating whether or not leading
* whitespaces from values being read should be skipped.
* - `ignoreTrailingWhiteSpace` (default `false`): a flag indicating whether or not trailing
* whitespaces from values being read should be skipped.
* - `nullValue` (default empty string): sets the string representation of a null value. Since
* 2.0.1, this applies to all supported types including the string type.
* - `nanValue` (default `NaN`): sets the string representation of a non-number value.
* - `positiveInf` (default `Inf`): sets the string representation of a positive infinity
* value.
* - `negativeInf` (default `-Inf`): sets the string representation of a negative infinity
* value.
* - `dateFormat` (default `yyyy-MM-dd`): sets the string that indicates a date format.
* Custom date formats follow the formats at `java.text.SimpleDateFormat`. This applies to
* date type.
* - `timestampFormat` (default `yyyy-MM-dd'T'HH:mm:ss.SSSXXX`): sets the string that
* indicates a timestamp format. Custom date formats follow the formats at
* `java.text.SimpleDateFormat`. This applies to timestamp type.
* - `maxColumns` (default `20480`): defines a hard limit of how many columns
* a record can have.
* - `maxCharsPerColumn` (default `-1`): defines the maximum number of characters allowed
* for any given value being read. By default, it is -1, meaning unlimited length.
* - `mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records
* during parsing. It supports the following case-insensitive modes.
*
*   - `PERMISSIVE` : when it meets a corrupted record, puts the malformed string into a
*     field configured by `columnNameOfCorruptRecord`, and sets other fields to `null`. To keep
*     corrupt records, a user can set a string type field named `columnNameOfCorruptRecord`
*     in a user-defined schema. If the schema does not have that field, corrupt records are
*     dropped during parsing. A record with fewer or more tokens than the schema is not
*     treated as a corrupt record by CSV. When a record has fewer tokens than the length of
*     the schema, the missing fields are set to `null`; when it has more tokens than the
*     schema, the extra tokens are dropped.
*   - `DROPMALFORMED` : ignores whole corrupted records.
*   - `FAILFAST` : throws an exception when it meets corrupted records.
*
* - `columnNameOfCorruptRecord` (default is the value specified in
* `spark.sql.columnNameOfCorruptRecord`): allows renaming the new field having malformed string
* created by `PERMISSIVE` mode. This overrides `spark.sql.columnNameOfCorruptRecord`.
* - `multiLine` (default `false`): parses one record, which may span multiple lines.
*
* @since 2.0.0
*/
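To illustrate the `mode` / `columnNameOfCorruptRecord` options listed above, a hedged sketch (the file path, column names, and the SparkSession name spark are assumptions, not from the Spark docs):

import org.apache.spark.sql.types._

// Supplying the schema up front skips the extra inferSchema pass; the string field below
// keeps the raw text of any malformed line under the default PERMISSIVE mode.
val schema = StructType(Seq(
  StructField("id", IntegerType),
  StructField("ts", TimestampType),
  StructField("_corrupt_record", StringType)
))

val df = spark.read
  .schema(schema)
  .option("mode", "PERMISSIVE")                              // default: keep malformed rows
  .option("columnNameOfCorruptRecord", "_corrupt_record")
  .option("timestampFormat", "yyyy-MM-dd'T'HH:mm:ss.SSSXXX")
  .csv("events.csv")                                         // hypothetical path

// Well-formed rows have _corrupt_record = null; malformed rows keep the original line there.
df.show(false)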