Using Spark 2.2.1 + Kudu 1.5.0 to Work with the Kudu Big Data System
1. Apache Kudu Maven artifacts (groupId org.apache.kudu, version 1.5.0-cdh5.13.0):
interface-annotations
kudu-client
kudu-client-tools
kudu-flume-sink
kudu-hive
kudu-mapreduce
kudu-parent
kudu-spark-tools
kudu-spark2-tools_2.11
kudu-spark2_2.11
kudu-spark_2.10
2. Kudu Maven dependencies:
<dependency>
    <groupId>org.apache.kudu</groupId>
    <artifactId>kudu-client-tools</artifactId>
    <version>1.5.0</version>
</dependency>
<dependency>
    <groupId>org.apache.kudu</groupId>
    <artifactId>kudu-client</artifactId>
    <version>1.5.0</version>
    <scope>test</scope>
</dependency>
<dependency>
    <groupId>org.apache.kudu</groupId>
    <artifactId>kudu-spark2_2.11</artifactId>
    <version>1.5.0</version>
</dependency>
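For sbt-based builds, an equivalent declaration would look roughly like the following; this is a sketch assuming a Scala 2.11 build (scalaVersion := "2.11.x"), not part of the original post:
// build.sbt (sketch): same artifacts as the Maven snippet above.
libraryDependencies ++= Seq(
  "org.apache.kudu" %  "kudu-client-tools" % "1.5.0",
  "org.apache.kudu" %  "kudu-client"       % "1.5.0" % Test,
  "org.apache.kudu" %% "kudu-spark2"       % "1.5.0"  // resolves to kudu-spark2_2.11
)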
3. Source code analysis of how Spark 2.2.1 connects to Kudu 1.5.0:
The SparkSession connects to the Kudu system using the options ("kudu.master", kuduMaster) and ("kudu.table", kuduTable):
Dataset<Row> ds = sparkSession.read().format("org.apache.kudu.spark.kudu")
    .schema(schema)
    .option("kudu.master", kuduMaster)
    .option("kudu.table", kuduTable)
    .load();
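In Scala the same read can be written through the implicit helper that kudu-spark adds to DataFrameReader. A minimal sketch, assuming a SparkSession named spark; the master address and table name are placeholders:
import org.apache.kudu.spark.kudu._

// Sketch: read a Kudu table as a DataFrame via the .kudu implicit, equivalent
// to format("org.apache.kudu.spark.kudu") plus load() as shown above.
val df = spark.read
  .options(Map("kudu.master" -> "master1:7051", "kudu.table" -> "my_table"))
  .kudu

Either way, load() ends up in the Spark source below: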
/**
* Loads input in as a `DataFrame`, for data sources that don't require a path (e.g. external
* key-value stores).
*
* @since 1.4.0
*/
def load(): DataFrame = {
load(Seq.empty: _*) // force invocation of `load(...varargs...)`
}
/**
* Loads input in as a `DataFrame`, for data sources that support multiple paths.
* Only works if the source is a HadoopFsRelationProvider.
*
* @since 1.6.0
*/
@scala.annotation.varargs
def load(paths: String*): DataFrame = {
if (source.toLowerCase(Locale.ROOT) == DDLUtils.HIVE_PROVIDER) {
throw new AnalysisException("Hive data source can only be used with tables, you can not " +
"read files of Hive data source directly.")
}
sparkSession.baseRelationToDataFrame(
DataSource.apply(
sparkSession,
paths = paths,
userSpecifiedSchema = userSpecifiedSchema,
className = source,
options = extraOptions.toMap).resolveRelation())
}
/**
* Create a resolved [[BaseRelation]] that can be used to read data from or write data into this
* [[DataSource]]
*
* @param checkFilesExist Whether to confirm that the files exist when generating the
* non-streaming file based datasource. StructuredStreaming jobs already
* list file existence, and when generating incremental jobs, the batch
* is considered as a non-streaming file based data source. Since we know
* that files already exist, we don't need to check them again.
*/
def resolveRelation(checkFilesExist: Boolean = true): BaseRelation = {
val relation = (providingClass.newInstance(), userSpecifiedSchema) match {
// TODO: Throw when too much is given.
case (dataSource: SchemaRelationProvider, Some(schema)) =>
dataSource.createRelation(sparkSession.sqlContext, caseInsensitiveOptions, schema)
case (dataSource: RelationProvider, None) =>
dataSource.createRelation(sparkSession.sqlContext, caseInsensitiveOptions)
case (_: SchemaRelationProvider, None) =>
throw new AnalysisException(s"A schema needs to be specified when using $className.")
case (dataSource: RelationProvider, Some(schema)) =>
val baseRelation =
dataSource.createRelation(sparkSession.sqlContext, caseInsensitiveOptions)
if (baseRelation.schema != schema) {
throw new AnalysisException(s"$className does not allow user-specified schemas.")
}
baseRelation
// We are reading from the results of a streaming query. Load files from the metadata log
// instead of listing them using HDFS APIs.
case (format: FileFormat, _)
if FileStreamSink.hasMetadata(
caseInsensitiveOptions.get("path").toSeq ++ paths,
sparkSession.sessionState.newHadoopConf()) =>
val basePath = new Path((caseInsensitiveOptions.get("path").toSeq ++ paths).head)
val fileCatalog = new MetadataLogFileIndex(sparkSession, basePath)
val dataSchema = userSpecifiedSchema.orElse {
format.inferSchema(
sparkSession,
caseInsensitiveOptions,
fileCatalog.allFiles())
}.getOrElse {
throw new AnalysisException(
s"Unable to infer schema for $format at ${fileCatalog.allFiles().mkString(",")}. " +
"It must be specified manually")
}
HadoopFsRelation(
fileCatalog,
partitionSchema = fileCatalog.partitionSchema,
dataSchema = dataSchema,
bucketSpec = None,
format,
caseInsensitiveOptions)(sparkSession)
// This is a non-streaming file based datasource.
case (format: FileFormat, _) =>
val allPaths = caseInsensitiveOptions.get("path") ++ paths
val hadoopConf = sparkSession.sessionState.newHadoopConf()
val globbedPaths = allPaths.flatMap(
DataSource.checkAndGlobPathIfNecessary(hadoopConf, _, checkFilesExist)).toArray
val fileStatusCache = FileStatusCache.getOrCreate(sparkSession)
val (dataSchema, partitionSchema) = getOrInferFileFormatSchema(format, fileStatusCache)
val fileCatalog = if (sparkSession.sqlContext.conf.manageFilesourcePartitions &&
catalogTable.isDefined && catalogTable.get.tracksPartitionsInCatalog) {
val defaultTableSize = sparkSession.sessionState.conf.defaultSizeInBytes
new CatalogFileIndex(
sparkSession,
catalogTable.get,
catalogTable.get.stats.map(_.sizeInBytes.toLong).getOrElse(defaultTableSize))
} else {
new InMemoryFileIndex(
sparkSession, globbedPaths, options, Some(partitionSchema), fileStatusCache)
}
HadoopFsRelation(
fileCatalog,
partitionSchema = partitionSchema,
dataSchema = dataSchema.asNullable,
bucketSpec = bucketSpec,
format,
caseInsensitiveOptions)(sparkSession)
case _ =>
throw new AnalysisException(
s"$className is not a valid Spark SQL Data Source.")
}
relation
}
Because a user schema was supplied and Kudu's DefaultSource is a SchemaRelationProvider, resolveRelation takes the first branch above and calls the org.apache.kudu.spark.kudu.DefaultSource.createRelation method:
override def createRelation(sqlContext: SQLContext, parameters: Map[String, String],
schema: StructType): BaseRelation = {
val tableName = parameters.getOrElse(TABLE_KEY,
throw new IllegalArgumentException(s"Kudu table name must be specified in create options " +
s"using key '$TABLE_KEY'"))
val kuduMaster = parameters.getOrElse(KUDU_MASTER, "localhost")
val operationType = getOperationType(parameters.getOrElse(OPERATION, "upsert"))
val faultTolerantScanner = Try(parameters.getOrElse(FAULT_TOLERANT_SCANNER, "false").toBoolean)
.getOrElse(false)
new KuduRelation(tableName, kuduMaster, faultTolerantScanner, operationType,
Some(schema))(sqlContext)
}
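Besides kudu.master and kudu.table, createRelation also reads an operation type and a fault-tolerant-scanner flag from the options map. A hedged sketch of a read that enables the fault-tolerant scanner, assuming the constants above resolve to the keys "kudu.operation" and "kudu.faultTolerantScanner" in this connector version:
// Sketch: the option key names are assumed from the connector's constants
// and may differ across versions; master and table names are placeholders.
val df = spark.read
  .format("org.apache.kudu.spark.kudu")
  .option("kudu.master", "master1:7051")
  .option("kudu.table", "my_table")
  .option("kudu.faultTolerantScanner", "true")
  .load()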
/**
* Implementation of Spark BaseRelation.
*
* @param tableName Kudu table that we plan to read from
* @param masterAddrs Kudu master addresses
* @param faultTolerantScanner scanner type to be used. Fault tolerant if true,
* otherwise, use non fault tolerant one
* @param operationType The default operation type to perform when writing to the relation
* @param userSchema A schema used to select columns for the relation
* @param sqlContext SparkSQL context
*/
@InterfaceStability.Unstable
class KuduRelation(private val tableName: String,
private val masterAddrs: String,
private val faultTolerantScanner: Boolean,
private val operationType: OperationType,
private val userSchema: Option[StructType])(
val sqlContext: SQLContext)
extends BaseRelation
with PrunedFilteredScan
with InsertableRelation {
import KuduRelation._
private val context: KuduContext = new KuduContext(masterAddrs, sqlContext.sparkContext)
private val table: KuduTable = context.syncClient.openTable(tableName)
/**
* KuduContext is a serializable container for Kudu client connections.
*
* If a Kudu client connection is needed as part of a Spark application, a
* [[KuduContext]] should be created in the driver, and shared with executors
* as a serializable field.
*/
@InterfaceStability.Unstable
class KuduContext(val kuduMaster: String,
sc: SparkContext) extends Serializable {
import kudu.KuduContext._
@Deprecated()
def this(kuduMaster: String) {
this(kuduMaster, new SparkContext())
}
@transient lazy val syncClient = {
val c = KuduConnection.getSyncClient(kuduMaster)
if (authnCredentials != null) {
c.importAuthenticationCredentials(authnCredentials)
}
c
}
@transient lazy val asyncClient = {
val c = KuduConnection.getAsyncClient(kuduMaster)
if (authnCredentials != null) {
c.importAuthenticationCredentials(authnCredentials)
}
c
}
// Visible for testing.
private[kudu] val authnCredentials : Array[Byte] = {
Subject.doAs(getSubject(sc), new PrivilegedAction[Array[Byte]] {
override def run(): Array[Byte] = syncClient.exportAuthenticationCredentials()
})
}
/**
* Create an RDD from a Kudu table.
*
* @param tableName table to read from
* @param columnProjection list of columns to read. Not specifying this at all
* (i.e. setting to null) or setting to the special
* string '*' means to project all columns
* @return a new RDD that maps over the given table for the selected columns
*/
def kuduRDD(sc: SparkContext,
tableName: String,
columnProjection: Seq[String] = Nil): RDD[Row] = {
// TODO: provide an elegant way to pass various options (faultTolerantScan, etc) to KuduRDD
new KuduRDD(this, 1024*1024*20, columnProjection.toArray, Array(),
syncClient.openTable(tableName), false, sc)
}
/**
* Check if kudu table already exists
*
* @param tableName name of table to check
* @return true if table exists, false if table does not exist
*/
def tableExists(tableName: String): Boolean = syncClient.tableExists(tableName)
/**
* Delete kudu table
*
* @param tableName name of table to delete
* @return DeleteTableResponse
*/
def deleteTable(tableName: String): DeleteTableResponse = syncClient.deleteTable(tableName)
/**
* Creates a kudu table for the given schema. Partitioning can be specified through options.
*
* @param tableName table to create
* @param schema struct schema of table
* @param keys primary keys of the table
* @param options replication and partitioning options for the table
*/
def createTable(tableName: String,
schema: StructType,
keys: Seq[String],
options: CreateTableOptions): KuduTable = {
val kuduCols = new util.ArrayList[ColumnSchema]()
// add the key columns first, in the order specified
for (key <- keys) {
val f = schema.fields(schema.fieldIndex(key))
kuduCols.add(new ColumnSchema.ColumnSchemaBuilder(f.name, kuduType(f.dataType)).key(true).build())
}
// now add the non-key columns
for (f <- schema.fields.filter(field=> !keys.contains(field.name))) {
kuduCols.add(new ColumnSchema.ColumnSchemaBuilder(f.name, kuduType(f.dataType)).nullable(f.nullable).key(false).build())
}
syncClient.createTable(tableName, new Schema(kuduCols), options)
}
/** Map Spark SQL type to Kudu type */
def kuduType(dt: DataType) : Type = dt match {
case DataTypes.BinaryType => Type.BINARY
case DataTypes.BooleanType => Type.BOOL
case DataTypes.StringType => Type.STRING
case DataTypes.TimestampType => Type.UNIXTIME_MICROS
case DataTypes.ByteType => Type.INT8
case DataTypes.ShortType => Type.INT16
case DataTypes.IntegerType => Type.INT32
case DataTypes.LongType => Type.INT64
case DataTypes.FloatType => Type.FLOAT
case DataTypes.DoubleType => Type.DOUBLE
case _ => throw new IllegalArgumentException(s"No support for Spark SQL type $dt")
}
/**
* Inserts the rows of a [[DataFrame]] into a Kudu table.
*
* @param data the data to insert
* @param tableName the Kudu table to insert into
*/
def insertRows(data: DataFrame, tableName: String): Unit = {
writeRows(data, tableName, Insert)
}
/**
* Inserts the rows of a [[DataFrame]] into a Kudu table, ignoring any new
* rows that have a primary key conflict with existing rows.
*
* @param data the data to insert into Kudu
* @param tableName the Kudu table to insert into
*/
def insertIgnoreRows(data: DataFrame, tableName: String): Unit = {
writeRows(data, tableName, InsertIgnore)
}
/**
* Upserts the rows of a [[DataFrame]] into a Kudu table.
*
* @param data the data to upsert into Kudu
* @param tableName the Kudu table to upsert into
*/
def upsertRows(data: DataFrame, tableName: String): Unit = {
writeRows(data, tableName, Upsert)
}
/**
* Updates a Kudu table with the rows of a [[DataFrame]].
*
* @param data the data to update into Kudu
* @param tableName the Kudu table to update
*/
def updateRows(data: DataFrame, tableName: String): Unit = {
writeRows(data, tableName, Update)
}
/**
* Deletes the rows of a [[DataFrame]] from a Kudu table.
*
* @param data the data to delete from Kudu
* note that only the key columns should be specified for deletes
* @param tableName The Kudu table to delete from
*/
def deleteRows(data: DataFrame, tableName: String): Unit = {
writeRows(data, tableName, Delete)
}
private[kudu] def writeRows(data: DataFrame, tableName: String, operation: OperationType) {
val schema = data.schema
data.foreachPartition(iterator => {
val pendingErrors = writePartitionRows(iterator, schema, tableName, operation)
val errorCount = pendingErrors.getRowErrors.length
if (errorCount > 0) {
val errors = pendingErrors.getRowErrors.take(5).map(_.getErrorStatus).mkString
throw new RuntimeException(
s"failed to write $errorCount rows from DataFrame to Kudu; sample errors: $errors")
}
})
}
private def writePartitionRows(rows: Iterator[Row],
schema: StructType,
tableName: String,
operationType: OperationType): RowErrorsAndOverflowStatus = {
val table: KuduTable = syncClient.openTable(tableName)
val indices: Array[(Int, Int)] = schema.fields.zipWithIndex.map({ case (field, sparkIdx) =>
sparkIdx -> table.getSchema.getColumnIndex(field.name)
})
val session: KuduSession = syncClient.newSession
session.setFlushMode(FlushMode.AUTO_FLUSH_BACKGROUND)
session.setIgnoreAllDuplicateRows(operationType.ignoreDuplicateRowErrors)
try {
for (row <- rows) {
val operation = operationType.operation(table)
for ((sparkIdx, kuduIdx) <- indices) {
if (row.isNullAt(sparkIdx)) {
operation.getRow.setNull(kuduIdx)
} else schema.fields(sparkIdx).dataType match {
case DataTypes.StringType => operation.getRow.addString(kuduIdx, row.getString(sparkIdx))
case DataTypes.BinaryType => operation.getRow.addBinary(kuduIdx, row.getAs[Array[Byte]](sparkIdx))
case DataTypes.BooleanType => operation.getRow.addBoolean(kuduIdx, row.getBoolean(sparkIdx))
case DataTypes.ByteType => operation.getRow.addByte(kuduIdx, row.getByte(sparkIdx))
case DataTypes.ShortType => operation.getRow.addShort(kuduIdx, row.getShort(sparkIdx))
case DataTypes.IntegerType => operation.getRow.addInt(kuduIdx, row.getInt(sparkIdx))
case DataTypes.LongType => operation.getRow.addLong(kuduIdx, row.getLong(sparkIdx))
case DataTypes.FloatType => operation.getRow.addFloat(kuduIdx, row.getFloat(sparkIdx))
case DataTypes.DoubleType => operation.getRow.addDouble(kuduIdx, row.getDouble(sparkIdx))
case DataTypes.TimestampType => operation.getRow.addLong(kuduIdx, KuduRelation.timestampToMicros(row.getTimestamp(sparkIdx)))
case t => throw new IllegalArgumentException(s"No support for Spark SQL type $t")
}
}
session.apply(operation)
}
} finally {
session.close()
}
session.getPendingErrors
}
}
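Taken together, the KuduContext methods above cover the full driver-side workflow. A minimal sketch, assuming a running SparkSession named spark and a reachable Kudu master; the address, table, and column names are placeholders:
import org.apache.kudu.client.CreateTableOptions
import org.apache.kudu.spark.kudu.KuduContext
import org.apache.spark.sql.types._
import scala.collection.JavaConverters._

// Sketch only: all addresses, table and column names below are placeholders.
val kuduContext = new KuduContext("master1:7051", spark.sparkContext)

val demoSchema = StructType(Seq(
  StructField("key", IntegerType, nullable = false),
  StructField("value", StringType, nullable = true)))

if (!kuduContext.tableExists("demo_table")) {
  // Primary key "key", 4 hash buckets, 3 replicas.
  kuduContext.createTable("demo_table", demoSchema, Seq("key"),
    new CreateTableOptions()
      .addHashPartitions(List("key").asJava, 4)
      .setNumReplicas(3))
}

// Build a small DataFrame matching the schema and insert it.
val demoDf = spark.range(10).selectExpr("cast(id as int) as key", "cast(id as string) as value")
kuduContext.insertRows(demoDf, "demo_table")

// Scan the projected columns back as an RDD[Row].
val rows = kuduContext.kuduRDD(spark.sparkContext, "demo_table", Seq("key", "value"))

The companion object below handles Kerberos credentials for secure clusters: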
private object KuduContext {
val Log: Logger = LoggerFactory.getLogger(classOf[KuduContext])
/**
* Returns a new Kerberos-authenticated [[Subject]] if the Spark context contains
* principal and keytab options, otherwise returns the currently active subject.
*
* The keytab and principal options should be set when deploying a Spark
* application in cluster mode with Yarn against a secure Kudu cluster. Spark
* internally will grab HDFS and HBase delegation tokens (see
* [[org.apache.spark.deploy.SparkSubmit]]), so we do something similar.
*
* This method can only be called on the driver, where the SparkContext is
* available.
*
* @return A Kerberos-authenticated subject if the Spark context contains
* principal and keytab options, otherwise returns the currently
* active subject
*/
private def getSubject(sc: SparkContext): Subject = {
val subject = Subject.getSubject(AccessController.getContext)
val principal = sc.getConf.getOption("spark.yarn.principal").getOrElse(return subject)
val keytab = sc.getConf.getOption("spark.yarn.keytab").getOrElse(return subject)
Log.info(s"Logging in as principal $principal with keytab $keytab")
val conf = new Configuration {
override def getAppConfigurationEntry(name: String): Array[AppConfigurationEntry] = {
val options = Map(
"principal" -> principal,
"keyTab" -> keytab,
"useKeyTab" -> "true",
"useTicketCache" -> "false",
"doNotPrompt" -> "true",
"refreshKrb5Config" -> "true"
)
Array(new AppConfigurationEntry("com.sun.security.auth.module.Krb5LoginModule",
AppConfigurationEntry.LoginModuleControlFlag.REQUIRED,
options.asJava))
}
}
val loginContext = new LoginContext("kudu-spark", new Subject(), null, conf)
loginContext.login()
loginContext.getSubject
}
}
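getSubject only performs a Kerberos login when spark.yarn.principal and spark.yarn.keytab are set, which is what spark-submit does when it is given a principal and keytab. A hedged launch example against a secure cluster, where the principal, keytab path, jar paths, and application jar are all placeholders:
spark-submit --master yarn --deploy-mode cluster --principal user@EXAMPLE.COM --keytab /path/to/user.keytab --jars /Path/kudu-client-1.5.0.jar,/Path/kudu-spark2_2.11-1.5.0.jar your-kudu-app.jar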
4. Connecting to Kudu from the spark-shell.
spark-shell --master yarn --num-executors 2 --executor-cores 2 --executor-memory 4G --jars /Path/kudu-client-1.5.0.jar,/Path/kudu-client-tools-1.5.0.jar,/Path/kudu-spark2_2.11-1.5.0.jar
Spark context available as 'sc' (master = yarn, app id = application_1520565037255_1081004).
Spark session available as 'spark'.
Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/___/ .__/\_,_/_/ /_/\_\ version 2.2.1
/_/
Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_92)
Type in expressions to have them evaluated.
Type :help for more information.
scala> import org.apache.kudu.spark.kudu._
import org.apache.kudu.spark.kudu._
scala> import org.apache.kudu.client._
import org.apache.kudu.client._
scala> import collection.JavaConverters._
import collection.JavaConverters._
scala> val df = spark.sqlContext.read.options(Map("kudu.master" -> "master1:port,master2:port,master3:port","kudu.table" -> "table")).kudu
scala> df.registerTempTable("table")
scala> spark.sql("select count(1) from table").show()
(registerTempTable is deprecated in Spark 2.x; createOrReplaceTempView is the current equivalent.)
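To write results back from the same session, the KuduContext covered in the source analysis can be used directly. A sketch that reuses the placeholder master addresses and table name above (the target table must already exist, and the DataFrame columns must match its schema):
scala> import org.apache.kudu.spark.kudu.KuduContext
scala> val kuduContext = new KuduContext("master1:port,master2:port,master3:port", spark.sparkContext)
scala> val updates = df.limit(100)   // placeholder transformation on the DataFrame read above
scala> kuduContext.upsertRows(updates, "table")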