This article shows how to use Spark to operate on data stored in HBase and perform calculations across columns, taking feature-value computation as the example. A feature value is a representative or discriminative number extracted from raw data that can be used for data analysis, machine learning, and similar tasks. We read from and write to HBase tables as Spark RDDs through HBase's MapReduce integration (TableInputFormat and TableOutputFormat) and compute vegetation-index features from Sentinel-2 satellite imagery bands.
The main contents are as follows:
Contents
1. Environment setup
2. Create the SparkSession and HBaseConfiguration objects
3. Read the HBase table and convert it to an RDD
4. Compute the feature values and convert them to an RDD
5. Write the results into the HBase table
6. Stop the SparkSession
7. Verify the data in the HBase table
1. Environment setup
For setting up the distributed storage and parallel processing environment (Hadoop, HBase, Spark, etc.), see: https://blog.csdn.net/weixin_40694662/article/details/130030611
In the HBase shell, create the source table t3, an index table t3index, and the output table t3feature, then insert one test row (rowkey 1000) holding six Sentinel-2 band values for the date 2019-01-01:
create 't3', 'f1'
create 't3index', 'f1'
create 't3feature', 'f1'
put 't3', '1000', 'f1:2019-01-01B2', '100'
put 't3', '1000', 'f1:2019-01-01B3', '200'
put 't3', '1000', 'f1:2019-01-01B4', '300'
put 't3', '1000', 'f1:2019-01-01B8', '400'
put 't3', '1000', 'f1:2019-01-01B11', '500'
put 't3', '1000', 'f1:2019-01-01B12', '600'
put 't3index', '0001', 'f1:td', ':2019-01-01B:'
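Before running the Spark job, you can sanity-check the sample data from the HBase shell (an optional step, not part of the original workflow):
scan 't3'
The scan should list the six f1:2019-01-01Bxx columns under rowkey 1000.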
2. Create the SparkSession and HBaseConfiguration objects
Start spark-shell with the HBase client and HBase MapReduce integration jars on the classpath, then import the required classes (org.apache.hadoop.mapreduce.Job is needed later for the write step):
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{Put, Result}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{TableInputFormat, TableOutputFormat}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().appName("SparkHBaseRDDfeature")
.master("local")
.getOrCreate()
val hbaseConf = HBaseConfiguration.create()
// ZooKeeper ensemble of the HBase cluster; adjust the host names to your environment
hbaseConf.set("hbase.zookeeper.quorum", "hadoop100,hadoop200,hadoop201")
hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
3. Read the HBase table and convert it to an RDD
val tablename = "t3"
hbaseConf.set(TableInputFormat.INPUT_TABLE, tablename)
// Scan the whole table as an RDD of (row key, Result) pairs
val hBaseRDD = spark.sparkContext.newAPIHadoopRDD(hbaseConf, classOf[TableInputFormat],
  classOf[ImmutableBytesWritable],
  classOf[Result])
// Parse each Result into (rowkey, B2, B3, B4, B8, B11, B12). This assumes every row
// contains all six band columns; a missing column makes getValue return null and toDouble fail.
val hBaseRDD2 = hBaseRDD.map{case (k, v) =>
  val rowkey = Bytes.toString(k.get())
  val b2 = Bytes.toString(v.getValue("f1".getBytes, "2019-01-01B2".getBytes)).toDouble
  val b3 = Bytes.toString(v.getValue("f1".getBytes, "2019-01-01B3".getBytes)).toDouble
  val b4 = Bytes.toString(v.getValue("f1".getBytes, "2019-01-01B4".getBytes)).toDouble
  val b8 = Bytes.toString(v.getValue("f1".getBytes, "2019-01-01B8".getBytes)).toDouble
  val b11 = Bytes.toString(v.getValue("f1".getBytes, "2019-01-01B11".getBytes)).toDouble
  val b12 = Bytes.toString(v.getValue("f1".getBytes, "2019-01-01B12".getBytes)).toDouble
  (rowkey, b2, b3, b4, b8, b11, b12)
}
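To confirm the parsing before going further, you can inspect one record (an optional check):
hBaseRDD2.take(1).foreach(println)
With the sample row this prints (1000,100.0,200.0,300.0,400.0,500.0,600.0).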
4. Compute the feature values and convert them to an RDD
// Vegetation indices from the Sentinel-2 bands. Only Red (B4) and NIR (B8) are used by
// these three indices; the other bands are accepted but unused here.
def calculateFeatures(Blue: Double, Green: Double, Red: Double, NIR: Double, SWIR_1: Double, SWIR_2: Double): (Double, Double, Double) = {
  val DVI = NIR - Red                   // Difference Vegetation Index
  val RVI = NIR / Red                   // Ratio Vegetation Index
  val NDVI = (NIR - Red) / (NIR + Red)  // Normalized Difference Vegetation Index
  (DVI, RVI, NDVI)
}
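As a quick sanity check with the sample row (Red = B4 = 300, NIR = B8 = 400):
DVI  = 400 - 300 = 100.0
RVI  = 400 / 300 ≈ 1.3333
NDVI = (400 - 300) / (400 + 300) ≈ 0.1429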
val hBaseRDDre = hBaseRDD2.map{case (rowkey, b2, b3, b4, b8, b11, b12) =>
  // Create a Put keyed by the same row key as the source row
  val put: Put = new Put(rowkey.getBytes)
  // Call calculateFeatures to compute the feature values
  val (DVI, RVI, NDVI) = calculateFeatures(b2, b3, b4, b8, b11, b12)
  // Add the feature values as columns of the Put
  put.addColumn(Bytes.toBytes("f1"), Bytes.toBytes("DVI"), Bytes.toBytes(DVI.toString))
  put.addColumn(Bytes.toBytes("f1"), Bytes.toBytes("RVI"), Bytes.toBytes(RVI.toString))
  put.addColumn(Bytes.toBytes("f1"), Bytes.toBytes("NDVI"), Bytes.toBytes(NDVI.toString))
  // Return a (NullWritable, Put) pair as expected by TableOutputFormat
  (NullWritable.get(), put)
}
5. Write the results into the HBase table
// Write into t3feature; the target table must already exist (see the create statement above)
hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, "t3feature")
val job: Job = Job.getInstance(hbaseConf)
job.setOutputFormatClass(classOf[TableOutputFormat[NullWritable]])
job.setOutputKeyClass(classOf[NullWritable])
job.setOutputValueClass(classOf[Put])
hBaseRDDre.saveAsNewAPIHadoopDataset(job.getConfiguration)
6. Stop the SparkSession
spark.stop()
7. Verify the data in the HBase table
Exit spark-shell, then check the result table from the HBase shell:
exit
hbase shell
scan 't3feature'
exit
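With the single sample row, t3feature should end up containing rowkey 1000 with three columns, roughly f1:DVI = 100.0, f1:NDVI ≈ 0.142857 and f1:RVI ≈ 1.333333, stored as the string form of the Double values.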