Spark操作hbase

  1. Spark操作HBase - 读操作
 /**
   * Builds an HBase client configuration that points at the given ZooKeeper ensemble.
   *
   * Only the two ZooKeeper settings are applied here; no table name is configured,
   * so callers that need one set it on the returned configuration themselves.
   *
   * @param quorum value stored under `hbase.zookeeper.quorum`
   *               (callers in this file pass a comma-separated host list)
   * @param port   value stored under `hbase.zookeeper.property.clientPort`
   * @return the HBase configuration with both ZooKeeper settings applied
   */
 def getHbaseConf(quorum: String, port: String): Configuration = {
   // Collect the ZooKeeper settings first, then apply them in one pass.
   val zkSettings = Seq(
     "hbase.zookeeper.quorum" -> quorum,
     "hbase.zookeeper.property.clientPort" -> port
   )
   val hbaseConf = HBaseConfiguration.create()
   zkSettings.foreach { case (key, value) => hbaseConf.set(key, value) }
   hbaseConf
 }

  

 /**
   * Reads every row of the HBase table `movie_wordcount` through Spark and writes the
   * `(rowKey, word, count)` triples as text files under `/wordcount/output3`.
   *
   * @param args command-line arguments (not used)
   */
 def main(args: Array[String]): Unit = {
   // The master is hard-coded to local[2], so the job runs in-process with two threads
   // and does not need a Spark cluster. A master set on SparkConf takes precedence over
   // spark-submit's --master flag; change it here to run on yarn.
   val sparkConf = new SparkConf().setAppName("Spark-Hbase-Read").setMaster("local[2]")
   val sc = new SparkContext(sparkConf)
   try {
     // HBase connection settings plus the table that TableInputFormat should scan.
     val hbaseConf = HbaseUtils.getHbaseConf("centos01,centos02,centos03", "2181")
     hbaseConf.set(TableInputFormat.INPUT_TABLE, "movie_wordcount")

     // Encode column names with Bytes.toBytes (always UTF-8), matching how they are
     // written, instead of String.getBytes(), which depends on the platform charset.
     val family = Bytes.toBytes("word")
     val wordQualifier = Bytes.toBytes("word")
     val countQualifier = Bytes.toBytes("count")

     // Expose the table scan as an RDD of (rowKey, Result). The RDD is consumed exactly
     // once, so it is deliberately not cached; caching Hadoop input records directly is
     // also discouraged because the record reader may reuse its Writable objects.
     val hbaseRdd = sc.newAPIHadoopRDD(
       hbaseConf,
       classOf[TableInputFormat],
       classOf[ImmutableBytesWritable],
       classOf[Result])

     hbaseRdd.map { case (_, result) =>
       val row = Bytes.toString(result.getRow)
       val word = Bytes.toString(result.getValue(family, wordQualifier))
       // The count is stored as a 4-byte int, so it must be decoded with Bytes.toInt;
       // decoding it as a string yields garbage. Assumes every row has a word:count cell.
       val count = Bytes.toInt(result.getValue(family, countQualifier))

       // Runs on the executors: visible on the console only in local mode.
       println((row, word, count))
       (row, word, count)
     }.saveAsTextFile("/wordcount/output3")
   } finally {
     // Always release the SparkContext, even when the job fails.
     sc.stop()
   }
 }

打包提交到集群,运行 命令:./bin/spark-submit --class com.xxx.xx.scala.hbase.SparkHbaseR ./localjar/sc-1.0-SNAPSHOT-jar-with-dependencies.jar 

  saveAsTextFile 默认把结果保存到 HDFS 上,所以需要到 HDFS 上查看;输出结果是一个文件夹(目录),而不是单个文件。

  查看命令: hadoop fs -ls /wordcount/output3

  

  2.Spark操作HBase - 写操作

 

/**
  * Spark job that tokenises a text file of movie comments, counts how often each word
  * occurs, saves the counts to HDFS and inserts them into the HBase table `movie_wordcount`.
  */
object SparkHbaseW {

  /** Maximum number of puts sent to HBase in a single `table.put` call. */
  private val PutBatchSize = 1000

  /**
    * Entry point.
    *
    * @param args `args(0)` is the output directory for the text copy of the word counts
    */
  def main(args: Array[String]): Unit = {
    require(args.nonEmpty, "usage: SparkHbaseW <output-dir>")

    // setAppName supplies the application name; the master stays hard-coded to yarn.
    val conf = new SparkConf().setAppName("Spark-Hbase-Write").setMaster("yarn")
    val sc = new SparkContext(conf)
    try {
      val file = sc.textFile("/spark/movie/wordcount/pinglun.txt")

      // Chinese word segmentation followed by a word count. The stop filter is built
      // once per partition rather than once per input line.
      val rdd = file.mapPartitions { lines =>
        val stopFilter = filter(Array.empty[String])
        lines.flatMap(line => getWords(line, stopFilter))
      }.map(word => (word, 1)).reduceByKey(_ + _).cache() // reused by the two actions below

      // Keep a copy of the result on HDFS.
      rdd.saveAsTextFile(args(0))

      // Insert into HBase: one table handle per partition, rows sent in batches.
      rdd.foreachPartition(partition => writeToHbase(partition))
    } finally {
      // Always release the SparkContext, even when the job fails.
      sc.stop()
    }
  }

  /**
    * Inserts one partition of `(word, count)` pairs into `movie_wordcount`.
    *
    * Each pair becomes a row keyed by a random UUID with the columns `word:word` and
    * `word:count`. The count is written as a 4-byte int, so readers must decode it
    * with `Bytes.toInt`.
    *
    * @param wordCounts the pairs of a single partition
    */
  private def writeToHbase(wordCounts: Iterator[(String, Int)]): Unit = {
    // Skip empty partitions so no table handle is opened for nothing.
    if (wordCounts.hasNext) {
      val table = HbaseUtils.getTable(
        HbaseUtils.getHbaseConf("centos01,centos02,centos03", "2181"), "movie_wordcount")
      try {
        val family = Bytes.toBytes("word")
        val wordColumn = Bytes.toBytes("word")
        val countColumn = Bytes.toBytes("count")

        // Bounded batches keep memory use flat regardless of the partition size.
        wordCounts.grouped(PutBatchSize).foreach { batch =>
          val puts = new java.util.ArrayList[Put](batch.size)
          batch.foreach { case (word, count) =>
            val put = new Put(Bytes.toBytes(UUID.randomUUID().toString))
            put.addColumn(family, wordColumn, Bytes.toBytes(word))
            put.addColumn(family, countColumn, Bytes.toBytes(count))
            puts.add(put)
          }
          table.put(puts)
        }
      } finally {
        // NOTE(review): assumes HbaseUtils.getTable returns an HBase Table/HTable, whose
        // close() releases the handle. If the helper also opens a Connection per call,
        // that connection has to be closed inside HbaseUtils as well - confirm.
        table.close()
      }
    }
  }

  /**
    * Builds the stop-word filter used during segmentation.
    *
    * @param stopWords additional words to drop from the segmentation result
    * @return a `StopRecognition` holding the built-in rules plus `stopWords`
    */
  def filter(stopWords: Array[String]): StopRecognition = {
    val recognition = new StopRecognition
    recognition.insertStopNatures("w") // filter punctuation
    recognition.insertStopNatures("m") // filter m pattern
    recognition.insertStopNatures("null") // filter null
    recognition.insertStopNatures("<br />") // filter <br />

    recognition.insertStopRegexes("^[a-zA-Z]{1,}") // filter English alphabet
    recognition.insertStopRegexes("^[0-9]+") // filter number
    recognition.insertStopRegexes("[^a-zA-Z0-9\\u4e00-\\u9fa5]+")
    recognition.insertStopRegexes("\t")
    for (stopWord <- stopWords) {
      recognition.insertStopWords(stopWord)
    }
    recognition
  }

  /**
    * Segments `text` into words and returns those that pass the stop filter.
    *
    * @param text   the text to segment
    * @param filter stop-word filter applied to the segmentation result
    * @return the remaining non-empty words, in segmentation order
    */
  def getWords(text: String, filter: StopRecognition): ArrayBuffer[String] = {
    val words = new mutable.ArrayBuffer[String]()
    val terms = ToAnalysis.parse(text).recognition(filter).getTerms.iterator()
    while (terms.hasNext) {
      val word = terms.next().getName
      // Drop missing or empty terms.
      if (word != null && word.nonEmpty) {
        words += word
      }
    }
    words
  }

}

 

你可能感兴趣的:(Spark操作hbase)