I have recently been working on a news-feed recommendation service that stores an inverted tag index in HBase. As is well known, HBase suffers from hot-Region problems: a hot Region puts heavy pressure on a single server and significantly degrades HBase's response performance. To relieve that pressure, we split the heavily accessed tags that share one Region so that they end up distributed across different Regions. The splitting scheme is described below, with the code attached.
The rowkeys of the HBase table are the tags attached to news items, so once we find the split points, which are simply tag names, we can rebuild the Regions. First we fetch the users who hold these tags, each user's activity level, and each user's score for each tag; a tag's heat is then obtained by weighting each user's score for that tag by that user's activity and summing over all users who hold it. Next, suppose we want to split into 5 Regions: dividing the total heat of all tags by 5 gives the heat quota per Region. Finally, sort all tags lexicographically and accumulate their heat from the beginning; whenever the running total reaches or exceeds the Region quota, the next tag becomes a split point and the running total is reset to zero. Repeat until the tags are exhausted.
After the split, a merge pass consolidates the Regions so that each final Region spans exactly the range between adjacent split points.
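Before the full job, here is a minimal, self-contained sketch of the quota and split-point logic just described; the tag names and heat values are made up purely for illustration:

object SplitPointSketch {
  def main(args: Array[String]): Unit = {
    // Made-up (tag, heat) pairs, already sorted lexicographically by tag
    val tagHeat = Seq(("art", 5.0), ("auto", 2.0), ("finance", 4.0),
      ("movie", 3.0), ("sports", 1.0), ("tech", 5.0))
    val numRegions = 3
    // Heat quota per Region: total heat divided by the Region count
    val quota = tagHeat.map(_._2).sum / numRegions // 20.0 / 3 ≈ 6.67
    var running = 0.0
    val points = scala.collection.mutable.ArrayBuffer.empty[String]
    for (i <- tagHeat.indices) {
      running += tagHeat(i)._2
      // Once the running total reaches the quota, the next tag becomes a split point
      if (running >= quota && i < tagHeat.length - 1) {
        points += tagHeat(i + 1)._1
        running = 0.0
      }
    }
    // Prints: finance, sports -> Regions [*, finance), [finance, sports), [sports, *)
    println(points.mkString(", "))
  }
}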
pom.xml:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>hbase_region_split</groupId>
  <artifactId>hbase_region_split</artifactId>
  <version>1.0-SNAPSHOT</version>

  <properties>
    <scala.version>2.10.4</scala.version>
    <spark.version>1.5.1</spark.version>
  </properties>

  <dependencies>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.10</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-mllib_2.10</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-hive_2.10</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_2.10</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.4</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.specs</groupId>
      <artifactId>specs</artifactId>
      <version>1.2.5</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase</artifactId>
      <version>1.0.2</version>
      <type>pom</type>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-client</artifactId>
      <version>1.0.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-server</artifactId>
      <version>1.0.2</version>
    </dependency>
  </dependencies>

  <build>
    <sourceDirectory>src/main/scala</sourceDirectory>
    <plugins>
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <executions>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
          </execution>
        </executions>
        <configuration>
          <scalaVersion>${scala.version}</scalaVersion>
          <args>
            <arg>-target:jvm-1.5</arg>
          </args>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-eclipse-plugin</artifactId>
        <configuration>
          <downloadSources>true</downloadSources>
          <buildcommands>
            <buildcommand>ch.epfl.lamp.sdt.core.scalabuilder</buildcommand>
          </buildcommands>
          <additionalProjectnatures>
            <projectnature>ch.epfl.lamp.sdt.core.scalanature</projectnature>
          </additionalProjectnatures>
          <classpathContainers>
            <classpathContainer>org.eclipse.jdt.launching.JRE_CONTAINER</classpathContainer>
            <classpathContainer>ch.epfl.lamp.sdt.launching.SCALA_CONTAINER</classpathContainer>
          </classpathContainers>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-surefire-plugin</artifactId>
        <version>2.13</version>
        <configuration>
          <useFile>false</useFile>
          <disableXmlReport>true</disableXmlReport>
          <includes>
            <include>**/*Test.*</include>
            <include>**/*Suite.*</include>
          </includes>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
HbaseRegionSplit.scala, the Spark driver that computes the split points and triggers the rebuild:

package com.hbase
import org.apache.spark.SparkContext
import org.apache.spark.sql.Row
import org.apache.spark.sql.hive.HiveContext
import scala.collection.mutable
object HbaseRegionSplit {
def main(args: Array[String]): Unit = {
val sc = SparkContext.getOrCreate()
val sQLContext = new HiveContext(sc)
// Table of user activity scores (user life-cycle table; "table1" is a placeholder)
val lifeCycleSql = "select * from table1"
// Table of user interest tags and tag scores (user profile table; "table2" is a placeholder)
val hobbySql = "select * from table2"
// Load each user's activity score from the life-cycle table
val lifeCycle = sQLContext.sql(lifeCycleSql)
// Load the interest tags from the user profiles
val hobby = sQLContext.sql(hobbySql)
// Join the users' interest tags with their activity scores
val userList = hobby.join(lifeCycle,lifeCycle("imei") === hobby("user_id"), "inner")
.select("imei","second_class","second_class_score", "third_class", "third_class_score", "tags", "tags_score", "active_type")
// Combine the tags from each category and pair them with their corresponding scores
val userListClean = userList.map{case Row(imei: String, second_class: String, second_class_score: String, third_class: String,
third_class_score: String, tags: String, tags_score: String, active_type: String) =>
val second_s = second_class.split(",").zip(second_class_score.split(",")).toList
val third_s = third_class.split(",").zip(third_class_score.split(",")).toList
val tags_s = tags.split(",").zip(tags_score.split(",")).toList
val allTags = (second_s ++ third_s ++ tags_s).map(x =>{
val testScore = try x._2.toDouble catch {case e: Exception => 0.0}
(x._1.trim, testScore)
}).filter(x => x._1!="" && x._1!=null && x._2>0)
val active_score = try active_type.toDouble catch {case e: Exception=> 0.0}
(imei, allTags, active_score)
}.filter(_._3 > 0)
// Compute each tag's heat: the same tag across different users is weighted by activity and summed
val tagList = userListClean.flatMap{case (imei, allTags, active_score)=>
allTags.map(x => (x._1, x._2 * active_score))
}.reduceByKey((x, y) => x + y).collect().sortBy(_._1)
// Compute the heat quota per Region; 5 is the number of Regions
val rhvalue = tagList.map(_._2).sum / 5
// Derive the split points from the cumulative tag heat
var cumcal = 0.0
val splitPoint = mutable.ArrayBuffer.empty[String]
for(i <- tagList.indices){
val score = tagList(i)._2
val tag_next = if(i < tagList.length - 1) tagList(i + 1)._1 else ""
cumcal += score
if(cumcal >= rhvalue && tag_next.nonEmpty){ // guard against adding an empty point after the last tag
splitPoint += tag_next
cumcal = 0.0
}
}
splitPoint.foreach(println)
// Rebuild the table's Regions according to the split points
val hbaseTableName = "hbase_test"
val regionRebuilder = new RegionRebuilder(hbaseTableName, splitPoint.toArray)
regionRebuilder.execute()
sc.stop()
}
}
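If the target table can be rebuilt from scratch rather than split and merged in place, the same points can instead be used as pre-split keys at table-creation time. A minimal sketch, assuming a throwaway table name and column family (both hypothetical):

package com.hbase

import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HColumnDescriptor, HTableDescriptor, TableName}

object PreSplitSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical split points; in practice use the ones computed by the job above
    val splitPoints = Array("finance", "sports")
    val connection = ConnectionFactory.createConnection()
    val admin = connection.getAdmin
    try {
      val desc = new HTableDescriptor(TableName.valueOf("hbase_test_presplit")) // hypothetical table
      desc.addFamily(new HColumnDescriptor("cf")) // hypothetical column family
      // Create the table pre-split at the computed points, avoiding the split/merge dance
      admin.createTable(desc, splitPoints.map(p => Bytes.toBytes(p)))
    } finally {
      admin.close()
      connection.close()
    }
  }
}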
RegionRebuilder.scala, which performs the actual split and merge against HBase:

package com.hbase
import java.io.IOException
import org.apache.hadoop.hbase.client.{Admin, Connection, ConnectionFactory}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HRegionInfo, TableName}
import scala.collection.JavaConversions._
import scala.collection.mutable
class RegionRebuilder(tableName: String, pointArr: Array[String]) {
var connection: Connection = null
var admin: Admin = null
var table: TableName = null
// Polling interval (ms) while waiting for a split or merge to complete
var SLEEP_TIME = 100
// Retry count when a single merge fails
var SINGLE_MERGE_RETRY_TIMES = 5
// Maximum wait (ms) for a single merge to finish
var SINGLE_MERGE_MAX_WAIT_TIME = 1000*60
// Overall timeout (ms) across all merge passes
var MERGE_TIME_OUT = 1000*60*10
// Retry count when a single split fails
var SINGLE_SPLIT_RETRY_TIMES = 5
// Maximum wait (ms) for a single split to finish
var SINGLE_SPLIT_MAX_WAIT_TIME = 1000*60
def execute(): Unit={
try {
// Initialize the HBase connection and table handle
init()
// Split regions at the computed points
split()
// Merge the regions between adjacent split points
merge()
} catch {
case e: Exception =>{
e.printStackTrace()
}
} finally {
if(connection != null)
connection.close()
}
}
// Run merge passes until the region layout is stable or the overall timeout is reached
private def merge(): Unit={
var continue: Boolean = false
val begin = System.currentTimeMillis()
var end = System.currentTimeMillis()
var timeOut = end - begin
do{
continue = mergeOnce()
end = System.currentTimeMillis()
timeOut = end - begin
}while(continue && timeOut < MERGE_TIME_OUT)
if(timeOut >= MERGE_TIME_OUT){
throw new IOException("Merge TimeOut")
}
}
// One merge pass over the current region list; returns true if another pass is needed
private def mergeOnce(): Boolean={
val regions = admin.getTableRegions(table)
var index = 0
var tobeMergedRegion: HRegionInfo = null
val pointMap = mutable.Map.empty[Int, Int]
var maxSonRegionNum = 0
regions.foreach(region =>{
// Track the largest number of sub-regions within one split interval; if it exceeds 2, another merge pass is needed
maxSonRegionNum = getMaxSonRegions(maxSonRegionNum, pointMap, index)
val endKey = region.getEndKey
// All split points consumed: merge everything that remains after the last split point
// (the last partition may consist of several sub-regions)
if(index >= pointArr.size){
tobeMergedRegion = mergeAndGetNextTobeMerge(tobeMergedRegion, region)
}else{
// Region endKey is below the current split point: merge it with the next region
if(Bytes.toString(endKey).compareTo(pointArr(index)) < 0){
tobeMergedRegion = mergeAndGetNextTobeMerge(tobeMergedRegion, region)
// Region endKey >= split point
}else{
// If a region is pending, merge it with the boundary region now
if(tobeMergedRegion != null){
mergeRegion(tobeMergedRegion, region, 0)
tobeMergedRegion = null
}
// Advance to the next split point
index += 1
}
}
})
// Another pass is needed if some split points were not processed, or an interval still holds more than two regions
if(index < pointArr.size || maxSonRegionNum > 2){
return true
}
return false
}
// If a region is already pending, merge it with the current one and clear; otherwise make the current region the pending one
private def mergeAndGetNextTobeMerge(tobeMergedRegion: HRegionInfo, region: HRegionInfo): HRegionInfo ={
var toBeMergedTemp: HRegionInfo = null
if(tobeMergedRegion == null){
toBeMergedTemp = region
}else{
mergeRegion(tobeMergedRegion, region, 0)
toBeMergedTemp = null
}
toBeMergedTemp
}
// Merge two adjacent regions, then poll until the merge is visible; retry up to SINGLE_MERGE_RETRY_TIMES on failure
private def mergeRegion(hRegionInfo1: HRegionInfo, hRegionInfo2: HRegionInfo, retriedTimes: Int): Unit={
try {
admin.mergeRegions(hRegionInfo1.getEncodedNameAsBytes, hRegionInfo2.getEncodedNameAsBytes, true)
val waitBegin = System.currentTimeMillis()
var waitEnd = System.currentTimeMillis()
var oriEnd1: String = null
if (!hRegionInfo1.getEndKey.isEmpty) {
oriEnd1 = Bytes.toString(hRegionInfo1.getEndKey)
}
var oriEnd2: String = null
if (!hRegionInfo2.getEndKey.isEmpty) {
oriEnd2 = Bytes.toString(hRegionInfo2.getEndKey)
}
var pointSet = mutable.Set.empty[String]
do {
if (waitEnd - waitBegin <= SINGLE_MERGE_MAX_WAIT_TIME) {
Thread.sleep(SLEEP_TIME)
pointSet = getAllEndKey()
waitEnd = System.currentTimeMillis()
} else {
throw new IOException("wait for merge timeout")
}
} while (!isMergeFinished(oriEnd1, oriEnd2, pointSet))
} catch {
case e: Exception =>{
val retriedTimesTemp = retriedTimes + 1
if(retriedTimesTemp <= SINGLE_MERGE_RETRY_TIMES){
mergeRegion(hRegionInfo1, hRegionInfo2, retriedTimesTemp)
}else{
throw new IOException("mergeRegion retried max times")
}
}
}
}
// The merge is complete once the shared boundary between the two regions (oriEnd1) no longer appears as an endKey
private def isMergeFinished(oriEnd1: String, oriEnd2: String, pointSet: mutable.Set[String]): Boolean={
oriEnd1 == null || !pointSet.contains(oriEnd1)
}
// Count how many regions fall into the current split interval and return the running maximum
private def getMaxSonRegions(maxSonRegionNum: Int, pointMap: mutable.Map[Int, Int], index: Int)={
var max = maxSonRegionNum
var regionSize = 1
if(pointMap.contains(index)){
regionSize = pointMap(index) + 1
}
pointMap += index-> regionSize
if(regionSize > maxSonRegionNum){
max = regionSize
}
max
}
private def split(): Unit={
for(point <- pointArr){
splitByPoint(point, 0)
}
}
// Split the table at one point, then poll until the point shows up as a region boundary; retry on failure
private def splitByPoint(point: String, retryTimes: Int): Unit={
try {
var pointSet = getAllEndKey()
// Only issue the split if the point is not already a region boundary
if (!pointSet.contains(point)) {
admin.split(table, point.getBytes())
}
val waitBegin = System.currentTimeMillis()
var waitEnd = System.currentTimeMillis()
do {
if (waitEnd - waitBegin <= SINGLE_SPLIT_MAX_WAIT_TIME) {
Thread.sleep(SLEEP_TIME)
pointSet = getAllEndKey()
waitEnd = System.currentTimeMillis()
} else {
throw new IOException("Wait for split timeout")
}
} while (!isSplitFinished(pointSet, point))
} catch {
case e: Exception =>{
val retryTemp = retryTimes + 1
if(retryTemp <= SINGLE_SPLIT_RETRY_TIMES){
Thread.sleep(SLEEP_TIME)
splitByPoint(point, retryTemp)
}else throw e
}
}
}
private def isSplitFinished(pointSet: mutable.Set[String], point: String): Boolean={
pointSet.contains(point)
}
// Collect the endKeys of all current regions (the last region's empty endKey is skipped)
private def getAllEndKey(): mutable.Set[String]={
val pointSet = mutable.Set.empty[String]
val regions = admin.getTableRegions(table)
regions.foreach(region =>{
if(!region.getEndKey.isEmpty){
pointSet += Bytes.toString(region.getEndKey)
}
})
pointSet
}
// Create the HBase connection, Admin handle, and TableName
private def init(): Unit={
connection = ConnectionFactory.createConnection()
admin = connection.getAdmin
table = TableName.valueOf(tableName)
}
}
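To confirm the rebuild took effect, the resulting region boundaries can be listed and compared against the computed split points. A small sketch (the table name is taken from the driver above):

package com.hbase

import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.util.Bytes
import scala.collection.JavaConversions._

object RegionLayoutCheck {
  def main(args: Array[String]): Unit = {
    val connection = ConnectionFactory.createConnection()
    val admin = connection.getAdmin
    try {
      // Print each region's [startKey, endKey) range; the endKeys should match the split points
      admin.getTableRegions(TableName.valueOf("hbase_test")).foreach { region =>
        val start = Bytes.toString(region.getStartKey)
        val end = Bytes.toString(region.getEndKey)
        println(s"[$start, $end)")
      }
    } finally {
      admin.close()
      connection.close()
    }
  }
}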