First, switch from the root user to the kudu user.
1.1 Check the overall cluster status (my machine's hostname is hadoop002, i.e. the Kudu master is hadoop002)
-bash-4.2$ kudu cluster ksck hadoop002
Connected to the Master
Fetched info from all 1 Tablet Servers
Table wc is HEALTHY (3 tablet(s) checked)
Table Summary
 Name | Status  | Total Tablets | Healthy | Recovering | Under-replicated | Unavailable
------+---------+---------------+---------+------------+------------------+-------------
 wc   | HEALTHY | 3             | 3       | 0          | 0                | 0
The metadata for 1 table(s) is HEALTHY
OK
1.2 Check the master status
-bash-4.2$ kudu master status localhost
node_instance {
permanent_uuid: "2431dd2c03a54ff0be6f14fec9cb4ab7"
instance_seqno: 1586275391374513
}
bound_rpc_addresses {
host: "hadoop002"
port: 7051
}
bound_http_addresses {
host: "hadoop002"
port: 8051
}
version_info {
git_hash: "70babd7c7391a980df09d8b7bc5b42ed35a26f62"
build_hostname: "impala-ec2-pkg-centos-7-1c83.vpc.cloudera.com"
build_timestamp: "09 Aug 2018 09:46:59 PST"
build_username: "jenkins"
build_clean_repo: true
build_id: "2018-08-09_08-50-19"
build_type: "RELEASE"
version_string: "1.7.0-cdh5.15.1"
}
1.3 Check the tablet server status
-bash-4.2$ kudu tserver status localhost
node_instance {
permanent_uuid: "8cd8c704667747d98afb3ef342f6f1b5"
instance_seqno: 1586275398491656
}
bound_rpc_addresses {
host: "hadoop002"
port: 7050
}
bound_http_addresses {
host: "hadoop002"
port: 8050
}
version_info {
git_hash: "70babd7c7391a980df09d8b7bc5b42ed35a26f62"
build_hostname: "impala-ec2-pkg-centos-7-1c83.vpc.cloudera.com"
build_timestamp: "09 Aug 2018 09:46:59 PST"
build_username: "jenkins"
build_clean_repo: true
build_id: "2018-08-09_08-50-19"
build_type: "RELEASE"
version_string: "1.7.0-cdh5.15.1"
}
Impala installation tutorial:
3.1 Initialize the Kudu client
// Imports used by the snippets in this section
import org.apache.kudu.{ColumnSchema, Schema, Type}
import org.apache.kudu.client._

val KUDU_MASTERS = "hadoop002"
val client: KuduClient = new KuduClient.KuduClientBuilder(KUDU_MASTERS).build()
val tableName = "test"
3.2 Create a table
/**
 * Create a table: "id" is the STRING primary key, "name" is an INT32 column.
 */
def createTable(client: KuduClient, tableName: String): Unit = {
  import scala.collection.JavaConverters._

  val columns = List(
    new ColumnSchema.ColumnSchemaBuilder("id", Type.STRING).key(true).build(),
    new ColumnSchema.ColumnSchemaBuilder("name", Type.INT32).build()
  ).asJava
  val schema = new Schema(columns)

  val options: CreateTableOptions = new CreateTableOptions()
  options.setNumReplicas(1)
  // Hash-partition columns must be part of the primary key, so partition on "id".
  options.addHashPartitions(List("id").asJava, 3)

  client.createTable(tableName, schema, options)
}
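As an alternative to hash partitioning, the key column can be range-partitioned instead. A minimal sketch, assuming the client and the schema built in createTable are in scope (the bounds "a" and "n" are made-up split points, and "test_range" is a hypothetical table name):

import scala.collection.JavaConverters._

// Range-partition on "id" instead of hashing it (bounds are illustrative only)
val rangeOptions = new CreateTableOptions().setNumReplicas(1)
rangeOptions.setRangePartitionColumns(List("id").asJava)

val lower = schema.newPartialRow()
lower.addString("id", "a")
val upper = schema.newPartialRow()
upper.addString("id", "n")
rangeOptions.addRangePartition(lower, upper)   // covers ids in ["a", "n")

client.createTable("test_range", schema, rangeOptions)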
3.3 Insert data
def insertRows(client: KuduClient, tableName: String): Unit = {
  val table: KuduTable = client.openTable(tableName)   // open the Kudu table by name
  val session: KuduSession = client.newSession()       // a session, similar in spirit to a JPA/Hibernate session

  for (i <- 1 to 10) {
    val insert: Insert = table.newInsert()
    val row: PartialRow = insert.getRow
    row.addString("id", s"test-$i")   // "id" is the STRING key column
    row.addInt("name", 100 + i)       // "name" is the INT32 column
    session.apply(insert)
  }
  session.close()
}
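session.apply with the default flush mode sends each operation to the server synchronously. For larger loads it is usually faster to batch writes; a minimal sketch using manual flush, assuming the client and table opened above (the buffer size and row values are arbitrary):

import org.apache.kudu.client.SessionConfiguration.FlushMode

val batchSession: KuduSession = client.newSession()
batchSession.setFlushMode(FlushMode.MANUAL_FLUSH)   // buffer operations instead of sending them one by one
batchSession.setMutationBufferSpace(1000)           // max operations held in the buffer (arbitrary value)

for (i <- 1 to 10) {
  val insert = table.newInsert()
  insert.getRow.addString("id", s"bulk-$i")
  insert.getRow.addInt("name", i)
  batchSession.apply(insert)                        // queued locally, not yet sent
}
batchSession.flush()                                // send everything buffered so far
batchSession.close()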
3.4 Modify the table structure
def renameTable(client: KuduClient, tableName: String, newTableName: String): Unit = {
  val options: AlterTableOptions = new AlterTableOptions()
  options.renameTable(newTableName)
  client.alterTable(tableName, options)
}
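Renaming is only one of the alterations AlterTableOptions supports; adding and dropping columns works the same way. A small sketch (the "age" column is purely illustrative):

// Add a nullable INT32 column, then drop it again
val addCol = new AlterTableOptions().addNullableColumn("age", Type.INT32)
client.alterTable(tableName, addCol)

val dropCol = new AlterTableOptions().dropColumn("age")
client.alterTable(tableName, dropCol)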
3.5 Query data
def query(client: KuduClient, tableName: String): Unit = {
  val table: KuduTable = client.openTable(tableName)
  val scanner: KuduScanner = client.newScannerBuilder(table).build()

  while (scanner.hasMoreRows) {
    val iterator: RowResultIterator = scanner.nextRows()
    while (iterator.hasNext) {
      val result: RowResult = iterator.next()
      println(result.getString("id") + " => " + result.getInt("name"))
    }
  }
}
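The scan above returns every column of every row. The scanner builder can also push a column projection and predicates down to the tablet servers; a sketch against the same table (the threshold 105L is arbitrary):

import scala.collection.JavaConverters._
import org.apache.kudu.client.KuduPredicate
import org.apache.kudu.client.KuduPredicate.ComparisonOp

val projected: KuduScanner = client.newScannerBuilder(table)
  .setProjectedColumnNames(List("id", "name").asJava)                 // fetch only these columns
  .addPredicate(KuduPredicate.newComparisonPredicate(
    table.getSchema.getColumn("name"), ComparisonOp.GREATER, 105L))   // name > 105
  .build()

while (projected.hasMoreRows) {
  val it = projected.nextRows()
  while (it.hasNext) {
    val r = it.next()
    println(r.getString("id") + " => " + r.getInt("name"))
  }
}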
3.6 Modify data
def upsertRow(client: KuduClient, tableName: String): Unit = {
  val table: KuduTable = client.openTable(tableName)
  val session: KuduSession = client.newSession()

  // An upsert inserts the row if the key is absent, otherwise it updates the existing row.
  val upsert: Upsert = table.newUpsert()
  val row: PartialRow = upsert.getRow
  row.addString("id", "test-10")   // key column of the "test" table
  row.addInt("name", 8888)         // new value for the "name" column
  session.apply(upsert)
  session.close()
}
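Deleting individual rows follows the same pattern; only the key column has to be set. A minimal sketch (the key "test-10" refers to a row inserted in 3.3):

def deleteRow(client: KuduClient, tableName: String): Unit = {
  val table = client.openTable(tableName)
  val session = client.newSession()

  val delete = table.newDelete()
  delete.getRow.addString("id", "test-10")   // only the primary key is needed for a delete
  session.apply(delete)
  session.close()
}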
3.7 Delete the table
def deleteTable(client: KuduClient, tableName: String): Unit = {
  client.deleteTable(tableName)
}
Example invocation
def main(args: Array[String]): Unit = {
  val KUDU_MASTERS = "hadoop002"
  val client: KuduClient = new KuduClient.KuduClientBuilder(KUDU_MASTERS).build()
  val tableName = "test"

  createTable(client, tableName)

  client.close()
}
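A fuller run that exercises all of the helpers above might look like this (the order matters, since the table must exist before it can be written to or read from):

def main(args: Array[String]): Unit = {
  val client: KuduClient = new KuduClient.KuduClientBuilder("hadoop002").build()
  val tableName = "test"

  createTable(client, tableName)   // 3.2
  insertRows(client, tableName)    // 3.3
  query(client, tableName)         // 3.5
  upsertRow(client, tableName)     // 3.6
  deleteTable(client, tableName)   // 3.7

  client.close()
}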
4. Kudu integration with Spark
Official documentation for the Kudu/Spark integration: https://kudu.apache.org/docs/developing.html#_kudu_integration_with_spark
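The connector has to be on the Spark application's classpath. A hedged build.sbt sketch, assuming Spark 2.x built against Scala 2.11 (CDH builds of the artifact additionally require the Cloudera repository):

// pick the kudu-spark artifact that matches your Spark and Scala versions
libraryDependencies += "org.apache.kudu" % "kudu-spark2_2.11" % "1.7.0"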
4.1 Read from a MySQL source with Spark, write the data into Kudu, then load it back from Kudu
package com.wxx.bigdata.kudu

import java.util.Properties

import com.typesafe.config.ConfigFactory
import org.apache.spark.sql.{SaveMode, SparkSession}

object SparkKuduApp {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("SparkKuduApp").getOrCreate()

    val config = ConfigFactory.load()
    val url = config.getString("db.default.url")
    val user = config.getString("db.default.user")
    val password = config.getString("db.default.password")
    val driver = config.getString("db.default.driver")
    val database = config.getString("db.default.database")
    val table = config.getString("db.default.table")

    // Read the source table from MySQL
    val connectionProperties = new Properties()
    connectionProperties.put("user", user)
    connectionProperties.put("password", password)
    connectionProperties.put("driver", driver)

    // TODO: add business logic here
    val jdbcDF = spark.read.jdbc(url, s"$database.$table", connectionProperties)
    jdbcDF.show()

    // Write the data into Kudu
    val kuduMaster = "hadoop002"
    jdbcDF.write.format("org.apache.kudu.spark.kudu")
      .mode(SaveMode.Append)
      .option("kudu.master", kuduMaster)
      .option("kudu.table", "test")
      .save()

    // Read the data back from Kudu
    val df = spark.read.format("org.apache.kudu.spark.kudu")
      .option("kudu.master", kuduMaster)
      .option("kudu.table", "test")
      .load()
    df.show(false)

    spark.stop()
  }
}
Configuration file for the MySQL data source (application.conf on the classpath, read by ConfigFactory.load()):
db.default.driver="com.mysql.jdbc.Driver"
db.default.url="jdbc:mysql://hostname:13306"
db.default.user="user"
db.default.password="password"
db.default.database=test
db.default.table=wc
4.2 Operate Kudu through KuduContext
package com.imooc.bigdata.chapter07

import org.apache.kudu.client.CreateTableOptions
import org.apache.kudu.spark.kudu.KuduContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

import collection.JavaConverters._

object SparkKuduApp2 {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("SparkKuduApp2").getOrCreate()

    // val df = spark.read.format("org.apache.kudu.spark.kudu")
    //   .option("kudu.master", "hadoop002")
    //   .option("kudu.table", "test")
    //   .load()
    // df.select("word", "cnt").filter("cnt > 100").show(false)

    // Use KuduContext to create, delete, or write to Kudu tables
    val kuduContext = new KuduContext("hadoop002:7051", spark.sparkContext)

    val schema = StructType(
      List(
        // Bad schema: Nullable key columns are not supported: id
        StructField("id", IntegerType, false),
        StructField("name", StringType, true),
        StructField("age", IntegerType, true)
      ))

    kuduContext.createTable(
      "user", schema, Seq("id"),
      new CreateTableOptions()
        .setNumReplicas(1)
        .addHashPartitions(List("id").asJava, 3))

    // Check for the existence of a Kudu table
    val isExisted = kuduContext.tableExists("user")
    println(isExisted)

    // Insert data
    import spark.implicits._
    val userDf = Seq((1, "zhangsan", 22), (2, "lisi", 33), (3, "wangwu", 18)).toDF("id", "name", "age")
    kuduContext.insertRows(userDf, "user")

    val userReturnDf = spark.read.format("org.apache.kudu.spark.kudu")
      .option("kudu.master", "hadoop002")
      .option("kudu.table", "user")
      .load()
    userReturnDf.show()

    // Delete data
    kuduContext.deleteRows(userDf, "user")

    // Upsert data
    val userUpsertDf = Seq((1, "zhangsan2", 22), (2, "lisi", 44), (4, "zhaoliu", 33)).toDF("id", "name", "age")
    kuduContext.upsertRows(userUpsertDf, "user")

    val userUpsertReturnDf = spark.read.format("org.apache.kudu.spark.kudu")
      .option("kudu.master", "hadoop002")
      .option("kudu.table", "user")
      .load()
    userUpsertReturnDf.show()

    // Update data
    val userUpdateDF = Seq((1, "tianqi", 20)).toDF("id", "name", "age")
    kuduContext.updateRows(userUpdateDF, "user")

    val userUpdateReturnDf = spark.read.format("org.apache.kudu.spark.kudu")
      .option("kudu.master", "hadoop002")
      .option("kudu.table", "user")
      .load()
    userUpdateReturnDf.show()

    // Delete a Kudu table
    kuduContext.deleteTable("user")

    spark.stop()
  }
}