HBase数据库
由于 org.apache.hadoop.hbase.mapreduce.TableInputFormat 类的实现,Spark 可以通过Hadoop输入格式访问HBase。
这个输入格式会返回键值对数据,
其中键的类型为org. apache.hadoop.hbase.io.ImmutableBytesWritable,
而值的类型为org.apache.hadoop.hbase.client.Result。
1、HBase读取
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.log4j.Logger
object HbaseRedTest extends java.io.Serializable{
val logger = Logger.getLogger(HbaseRedTest.getClass)
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf().setAppName("HBaseApp")//.setMaster("local[*]")
val sc = new SparkContext(sparkConf)
val conf = HBaseConfiguration.create()
//HBase中的表名
conf.set(TableInputFormat.INPUT_TABLE, "fruit")
val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
classOf[org.apache.hadoop.hbase.client.Result])
val count = hBaseRDD.count()
println("hBaseRDD RDD Count:"+ count)
hBaseRDD.cache()
hBaseRDD.foreach {
case (_, result) =>
val key = Bytes.toString(result.getRow)
val name = Bytes.toString(result.getValue("info".getBytes, "name".getBytes))
val color = Bytes.toString(result.getValue("info".getBytes, "color".getBytes))
println("Row key:" + key + " Name:" + name + " Color:" + color)
}
sc.stop()
}
}
2、HBase写入
import org.apache.hadoop.hbase.client.{HBaseAdmin, Put}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapred.JobConf
import org.apache.spark.{SparkConf, SparkContext}
object HbaseWriTest extends java.io.Serializable{
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf().setAppName("HBaseApp")//.setMaster("local[*]")
val sc = new SparkContext(sparkConf)
val conf = HBaseConfiguration.create()
val jobConf = new JobConf(conf)
jobConf.setOutputFormat(classOf[TableOutputFormat])
jobConf.set(TableOutputFormat.OUTPUT_TABLE, "fruit_spark")
val fruitTable = TableName.valueOf("fruit_spark")
val tableDescr = new HTableDescriptor(fruitTable)
tableDescr.addFamily(new HColumnDescriptor("info".getBytes))
val admin = new HBaseAdmin(conf)
if (admin.tableExists(fruitTable)) {
admin.disableTable(fruitTable)
admin.deleteTable(fruitTable)
}
admin.createTable(tableDescr)
def convert(triple: (Int, String, Int)) = {
val put = new Put(Bytes.toBytes(triple._1))
put.addImmutable(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes(triple._2))
put.addImmutable(Bytes.toBytes("info"), Bytes.toBytes("price"), Bytes.toBytes(triple._3))
(new ImmutableBytesWritable, put)
}
val initialRDD = sc.parallelize(List((1,"apple",11), (2,"banana",12), (3,"pear",13)))
val localData = initialRDD.map(convert)
localData.saveAsHadoopDataset(jobConf)
}
}
3、Hbase -> pom.xml
4.0.0
com.heihouzi.sparkhbase
sparkhbase
1.0-SNAPSHOT
UTF-8
1.2.17
1.7.22
2.1.1
2.11.8
org.slf4j
jcl-over-slf4j
${slf4j.version}
org.slf4j
slf4j-api
${slf4j.version}
org.slf4j
slf4j-log4j12
${slf4j.version}
log4j
log4j
${log4j.version}
junit
junit
3.8.1
test
org.apache.hbase
hbase-server
1.3.1
org.apache.hbase
hbase-client
1.3.1
org.scala-lang
scala-library
${scala.version}
org.apache.spark
spark-core_2.11
${spark.version}
SparkHbase
org.apache.maven.plugins
maven-compiler-plugin
3.6.1
1.8
net.alchim31.maven
scala-maven-plugin
3.2.2
compile
testCompile
org.apache.maven.plugins
maven-assembly-plugin
3.0.0
com.yinggu.spark.WordCount
jar-with-dependencies
make-assembly
package
single
4、log4j.properties
log4j.rootLogger=info,stdout,R
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} %5p --- [%t] %-c(line:%L) : %m%n
log4j.appender.R=org.apache.log4j.RollingFileAppender
log4j.appender.R.File=spark.log
log4j.appender.R.MaxFileSize=1024KB
log4j.appender.R.MaxBackupIndex=1
log4j.appender.R.layout=org.apache.log4j.PatternLayout
log4j.appender.R.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} %5p --- [%t] %-c(line:%L) : %m%n
5、spark-submit
[heihouzi@hadoop102 spark]$ bin/spark-submit \
--master spark://hadoop102:7077 \
--class unit.HbaseRedTest
SparkHbase-jar-with-dependencies.jar