Basic Kudu operations (checking cluster status, API operations, Impala integration, Spark integration)

1. Checking Kudu status from the command line

First switch from the root user to the kudu user (for example, su - kudu).
1.1 View overall cluster information (the hostname of my machine is hadoop002, which is also the Kudu master)

-bash-4.2$ kudu cluster ksck hadoop002
Connected to the Master
Fetched info from all 1 Tablet Servers
Table wc is HEALTHY (3 tablet(s) checked)

Table Summary
 Name | Status  | Total Tablets | Healthy | Recovering | Under-replicated | Unavailable
------+---------+---------------+---------+------------+------------------+-------------
 wc   | HEALTHY | 3             | 3       | 0          | 0                | 0
The metadata for 1 table(s) is HEALTHY
OK

1.2 View the master status

-bash-4.2$ kudu master status localhost
node_instance {
  permanent_uuid: "2431dd2c03a54ff0be6f14fec9cb4ab7"
  instance_seqno: 1586275391374513
}
bound_rpc_addresses {
  host: "hadoop002"
  port: 7051
}
bound_http_addresses {
  host: "hadoop002"
  port: 8051
}
version_info {
  git_hash: "70babd7c7391a980df09d8b7bc5b42ed35a26f62"
  build_hostname: "impala-ec2-pkg-centos-7-1c83.vpc.cloudera.com"
  build_timestamp: "09 Aug 2018 09:46:59 PST"
  build_username: "jenkins"
  build_clean_repo: true
  build_id: "2018-08-09_08-50-19"
  build_type: "RELEASE"
  version_string: "1.7.0-cdh5.15.1"
}

1.3 View the tablet server (tserver) status

-bash-4.2$ kudu tserver status localhost
node_instance {
  permanent_uuid: "8cd8c704667747d98afb3ef342f6f1b5"
  instance_seqno: 1586275398491656
}
bound_rpc_addresses {
  host: "hadoop002"
  port: 7050
}
bound_http_addresses {
  host: "hadoop002"
  port: 8050
}
version_info {
  git_hash: "70babd7c7391a980df09d8b7bc5b42ed35a26f62"
  build_hostname: "impala-ec2-pkg-centos-7-1c83.vpc.cloudera.com"
  build_timestamp: "09 Aug 2018 09:46:59 PST"
  build_username: "jenkins"
  build_clean_repo: true
  build_id: "2018-08-09_08-50-19"
  build_type: "RELEASE"
  version_string: "1.7.0-cdh5.15.1"
}

2. Kudu integration with Impala

Impala installation tutorial:


3. Kudu API operations

3.1 Initialize the Kudu client

    val KUDU_MASTERS = "hadoop002"
    val client: KuduClient = new KuduClient.KuduClientBuilder(KUDU_MASTERS).build()
    val tableName = "test"
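
The snippets in this section are method bodies from a single Scala object. For them to compile, the Kudu client classes they reference need to be imported; a minimal set, assuming the kudu-client dependency is on the classpath:

import java.util

import org.apache.kudu.client._
import org.apache.kudu.{ColumnSchema, Schema, Type}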

3.2 Create a table

  /**
    * Create a table
    */
  def createTable(client: KuduClient, tableName: String): Unit = {
    import scala.collection.JavaConverters._
    val columns = List(
      new ColumnSchema.ColumnSchemaBuilder("id", Type.STRING).key(true).build(),
      new ColumnSchema.ColumnSchemaBuilder("name", Type.INT32).build()
    ).asJava

    val schema = new Schema(columns)

    val options: CreateTableOptions = new CreateTableOptions()
    options.setNumReplicas(1)   // single-node cluster, so only one replica

    // Hash-partition on the key column; hash partition columns must be part of the primary key
    val parcols: util.LinkedList[String] = new util.LinkedList[String]()
    parcols.add("id")
    options.addHashPartitions(parcols, 3)

    client.createTable(tableName, schema, options)
  }

3.3 Insert data

  def insertRows(client: KuduClient, tableName: String) = {
    val table: KuduTable = client.openTable(tableName)  // look up the Kudu table by name
    val session: KuduSession = client.newSession()      // a session batches and applies operations

    for (i <- 1 to 10) {
      val insert: Insert = table.newInsert()
      val row: PartialRow = insert.getRow
      row.addString("id", (100 + i).toString)   // "id" is a STRING key column in the schema above
      row.addInt("name", i)                     // "name" is an INT32 column

      session.apply(insert)
    }
    session.close()
  }
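
With the default flush mode, session.apply() sends each operation to the server synchronously. For larger batches the session can flush in the background instead; a minimal sketch, using the flush-mode constants from org.apache.kudu.client.SessionConfiguration:

    val batchSession: KuduSession = client.newSession()
    batchSession.setFlushMode(SessionConfiguration.FlushMode.AUTO_FLUSH_BACKGROUND)
    // ... apply many Insert operations here ...
    batchSession.flush()   // push any buffered operations to the tablet servers
    batchSession.close()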

3.4 Alter a table (rename it)

  def renameTable(client: KuduClient, tableName: String, newTableName: String) = {

    val options: AlterTableOptions = new AlterTableOptions()
    options.renameTable(newTableName)
    client.alterTable(tableName, options)
  }

3.5 Query data

  def query(client: KuduClient, tableName: String) = {
    val table: KuduTable = client.openTable(tableName)

    val scanner: KuduScanner = client.newScannerBuilder(table).build()

    while(scanner.hasMoreRows) {
      val iterator: RowResultIterator = scanner.nextRows()

      while(iterator.hasNext) {
        val result: RowResult = iterator.next()
        println(result.getString("id") + " => " + result.getInt("name"))
      }
    }

  }
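
A plain scanner returns every column of every row. Projections and predicates can be pushed down to the tablet servers through the scanner builder; a hedged sketch against the id/name schema created above:

    // Only fetch the "id" column, and only for rows where name > 5
    val predicate = KuduPredicate.newComparisonPredicate(
      table.getSchema.getColumn("name"), KuduPredicate.ComparisonOp.GREATER, 5L)
    val filteredScanner: KuduScanner = client.newScannerBuilder(table)
      .setProjectedColumnNames(util.Arrays.asList("id"))
      .addPredicate(predicate)
      .build()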

3.6 Update data

  def updateRow(client: KuduClient, tableName: String) = {
    val table: KuduTable = client.openTable(tableName)
    val session: KuduSession = client.newSession()

    val update: Update = table.newUpdate()
    val row: PartialRow = update.getRow
    row.addString("id", "105")    // key of the row to update (the row must already exist)
    row.addInt("name", 8888)      // new value for the "name" column
    session.apply(update)
    session.close()
  }
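
An Update fails for keys that do not exist yet. The client also provides a true upsert (insert-or-update); a minimal sketch on the same table:

    val upsert: Upsert = table.newUpsert()
    val upsertRow: PartialRow = upsert.getRow
    upsertRow.addString("id", "200")   // inserted if missing, overwritten if present
    upsertRow.addInt("name", 9999)
    session.apply(upsert)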

3.7 Delete a table

  def deleteTable(client: KuduClient, tableName: String) = {
    client.deleteTable(tableName)
  }

Example of calling these methods

  def main(args: Array[String]): Unit = {
    val KUDU_MASTERS = "hadoop002"
    val client: KuduClient = new KuduClient.KuduClientBuilder(KUDU_MASTERS).build()
    val tableName = "test"
    createTable(client, tableName)
    client.close()
  }

4. Kudu integration with Spark
Official documentation for the Kudu/Spark integration: https://kudu.apache.org/docs/developing.html#_kudu_integration_with_spark
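
Both examples below need the kudu-spark connector (plus, for 4.1, the MySQL JDBC driver and Typesafe Config) on the classpath. A hedged build.sbt sketch; the artifact name and version are assumptions matching the Kudu 1.7.0 / Spark 2 / Scala 2.11 setup shown above:

// build.sbt fragment -- coordinates are assumptions, check the Kudu docs for your versions
libraryDependencies ++= Seq(
  "org.apache.kudu" % "kudu-client"      % "1.7.0",
  "org.apache.kudu" % "kudu-spark2_2.11" % "1.7.0"
)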
4.1 Use Spark to read from a MySQL source, write the data to Kudu, and load it back from the Kudu data source.

package com.wxx.bigdata.kudu

import java.util.Properties

import com.typesafe.config.ConfigFactory
import org.apache.spark.sql.{SaveMode, SparkSession}

object SparkKuduApp {
  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder().master("local").appName("SparkKuduApp").getOrCreate()

    val config = ConfigFactory.load()
    val url = config.getString("db.default.url")
    val user = config.getString("db.default.user")
    val password = config.getString("db.default.password")
    val driver = config.getString("db.default.driver")
    val database = config.getString("db.default.database")
    val table = config.getString("db.default.table")

    // Read the source data from MySQL
    val connectionProperties = new Properties()
    connectionProperties.put("user", user)
    connectionProperties.put("password", password)
    connectionProperties.put("driver", driver)   // JDBC driver class, e.g. com.mysql.jdbc.Driver
    // TODO: business logic goes here
    val jdbcDF = spark.read.jdbc(url, s"$database.$table", connectionProperties)
    jdbcDF.show()

    // Write the DataFrame to Kudu
    val kuduMaster = "hadoop002"
    jdbcDF.write.format("org.apache.kudu.spark.kudu")
        .mode(SaveMode.Append)
        .option("kudu.master",kuduMaster)
        .option("kudu.table", "test")
        .save()

    // Read the data back from Kudu
    val df = spark.read.format("org.apache.kudu.spark.kudu")
      .option("kudu.master", kuduMaster)
      .option("kudu.table", "test")
      .load()
    df.show(false)
    spark.stop()
  }
}

Configuration file for the MySQL data source (application.conf on the classpath, picked up by ConfigFactory.load()):

db.default.driver="com.mysql.jdbc.Driver"
db.default.url="jdbc:mysql://hostname:13306"
db.default.user="user"
db.default.password="password"
db.default.database=test
db.default.table=wc

4.2 Operating on Kudu through KuduContext

package com.imooc.bigdata.chapter07

import org.apache.kudu.client.CreateTableOptions
import org.apache.kudu.spark.kudu.KuduContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

import collection.JavaConverters._

object SparkKuduApp2 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("SparkKuduApp2").getOrCreate()
//    val df = spark.read.format("org.apache.kudu.spark.kudu")
//      .option("kudu.master", "hadoop002")
//      .option("kudu.table", "test")
//      .load()
//    df.select("word","cnt").filter("cnt > 100").show(false)

    // Use KuduContext to create, delete, or write to Kudu tables
    val kuduContext = new KuduContext("hadoop002:7051", spark.sparkContext)

    val schema = StructType(
      List(
        // The key column must be non-nullable, otherwise createTable fails with
        // "Bad schema: Nullable key columns are not supported: id"
        StructField("id", IntegerType, false),
        StructField("name", StringType, true),
        StructField("age", IntegerType, true)
      ))
    kuduContext.createTable(
      "user", schema, Seq("id"),
      new CreateTableOptions()
        .setNumReplicas(1)
        .addHashPartitions(List("id").asJava, 3))

    // Check for the existence of a Kudu table
    val isExisted = kuduContext.tableExists("user")
    println(isExisted)

    // Insert data
    import spark.implicits._
    val userDf = Seq((1, "zhangsan", 22), (2, "lisi", 33), (3, "wangwu", 18)).toDF("id", "name", "age")
    kuduContext.insertRows(userDf, "user")
    val userReturnDf = spark.read.format("org.apache.kudu.spark.kudu")
      .option("kudu.master", "hadoop002")
      .option("kudu.table", "user")
      .load()
    userReturnDf.show()

    // Delete data
    kuduContext.deleteRows(userDf, "user")

    // Upsert data
    val userUpsertDf = Seq((1, "zhangsan2", 22), (2, "lisi", 44), (4, "zhaoliu", 33)).toDF("id", "name", "age")
    kuduContext.upsertRows(userUpsertDf, "user")
    val userUpsertReturnDf = spark.read.format("org.apache.kudu.spark.kudu")
      .option("kudu.master", "hadoop002")
      .option("kudu.table", "user")
      .load()
    userUpsertReturnDf.show()

    // Update data
    val userUpdateDF = Seq((1, "tianqi", 20)).toDF("id", "name", "age")
    kuduContext.updateRows(userUpdateDF, "user")
    val userUpdateReturnDf = spark.read.format("org.apache.kudu.spark.kudu")
      .option("kudu.master", "hadoop002")
      .option("kudu.table", "user")
      .load()
    userUpdateReturnDf.show()

    // Delete a Kudu table
    kuduContext.deleteTable("user")

    spark.stop()


  }
}