Spark处理HBase热点region重构附scala代码

        最近在做新闻信息流推荐业务,采取标签倒排索引存储在HBase的方案。大家都知道HBase会有热点Region的问题:访问集中在少数Region上,会给单台服务器造成很大的压力,大大降低HBase的响应性能。为此,我们需要把落在同一个Region中、访问量都比较大的标签切分开,让它们分布在不同的Region中,以缓解压力。下面介绍一下切分方案,并附上代码。

       在HBase中以新闻被打的标签为rowkey存储,所以只要找到切分点(也就是标签名)就可以重构Region。首先,我们获取拥有这些标签的用户、用户的活跃度,以及用户在各标签上的分值;这样一来,每个标签的热度就等于拥有该标签的所有用户的"活跃度 × 标签分值"之和。例如我们要切分为5个Region,把所有标签的热度之和除以5,就可以得到每个Region的热度配额。最后,把所有标签按照字典序排序,从头开始累加热度,当累加值大于等于Region的热度配额时,下一个标签即为切分点,然后把累加值置为零,重复以上操作直到标签结束。

切分之后再做merge:把相邻两个切分点之间多余的小Region合并起来,使每两个切分点之间最终只保留一个Region。



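<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>hbase_region_split</groupId>
    <artifactId>hbase_region_split</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <scala.version>2.10.4</scala.version>
        <spark.version>1.5.1</spark.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.10</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_2.10</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.10</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.10</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.4</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.specs</groupId>
            <artifactId>specs</artifactId>
            <version>1.2.5</version>
            <scope>test</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase</artifactId>
            <version>1.0.2</version>
            <type>pom</type>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.0.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.0.2</version>
        </dependency>

    </dependencies>
    <build>
        <sourceDirectory>src/main/scala</sourceDirectory>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
                <configuration>
                    <scalaVersion>${scala.version}</scalaVersion>
                    <args>
                        <arg>-target:jvm-1.5</arg>
                    </args>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-eclipse-plugin</artifactId>
                <configuration>
                    <downloadSources>true</downloadSources>
                    <buildcommands>
                        <buildcommand>ch.epfl.lamp.sdt.core.scalabuilder</buildcommand>
                    </buildcommands>
                    <additionalProjectnatures>
                        <projectnature>ch.epfl.lamp.sdt.core.scalanature</projectnature>
                    </additionalProjectnatures>
                    <classpathContainers>
                        <classpathContainer>org.eclipse.jdt.launching.JRE_CONTAINER</classpathContainer>
                        <classpathContainer>ch.epfl.lamp.sdt.launching.SCALA_CONTAINER</classpathContainer>
                    </classpathContainers>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.13</version>
                <configuration>
                    <useFile>false</useFile>
                    <disableXmlReport>true</disableXmlReport>
                    <includes>
                        <include>**/*Test.*</include>
                        <include>**/*Suite.*</include>
                    </includes>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>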


package com.hbase

import org.apache.spark.SparkContext
import org.apache.spark.sql.Row
import org.apache.spark.sql.hive.HiveContext

import scala.collection.mutable

/**
 * Spark driver that derives HBase split keys from tag "heat" and hands them to the region rebuilder.
 *
 * The heat of a tag is the sum, over all users carrying that tag, of
 * `tag score * user activity`. Tags are sorted lexicographically and cut into
 * consecutive runs whose heat is at least `total heat / RegionCount`; the first
 * tag of every following run becomes a split key.
 */
object HbaseRegionSplit {

    /** Number of heat quotas the total heat is divided into (the intended number of regions). */
    private val RegionCount = 5

    /**
     * Reads the two Hive tables, computes per-tag heat, derives the split keys,
     * prints them and triggers the region rebuild on the HBase table.
     *
     * @param args command-line arguments (unused)
     */
    def main(args: Array[String]): Unit = {

        val sc = SparkContext.getOrCreate()
        // Stop the context even when the job fails half-way.
        try {
            val sQLContext = new HiveContext(sc)
            // User activity (life-cycle) table
            val lifeCycleSql = "select * from table1"
            // User tag-score (profile) table
            val hobbySql = "select * from table2"
            // User activity taken from the life-cycle table
            val lifeCycle = sQLContext.sql(lifeCycleSql)
            // Interest tags taken from the user profile
            val hobby = sQLContext.sql(hobbySql)
            // Attach the activity of each user to his/her interest tags
            val userList = hobby.join(lifeCycle, lifeCycle("imei") === hobby("user_id"), "inner")
                .select("imei", "second_class", "second_class_score", "third_class", "third_class_score", "tags", "tags_score", "active_type")

            // Merge the three tag families into one (tag, score) list per user.
            // Names and scores are comma-separated lists that are paired up positionally.
            val userListClean = userList.map { case Row(imei: String, second_class: String, second_class_score: String, third_class: String,
                third_class_score: String, tags: String, tags_score: String, active_type: String) =>
                val secondPairs = second_class.split(",").zip(second_class_score.split(",")).toList
                val thirdPairs = third_class.split(",").zip(third_class_score.split(",")).toList
                val tagPairs = tags.split(",").zip(tags_score.split(",")).toList
                val allTags = (secondPairs ++ thirdPairs ++ tagPairs)
                    // An unparsable score counts as 0.0 and is dropped by the filter below.
                    .map { case (tag, score) => (tag.trim, scala.util.Try(score.toDouble).getOrElse(0.0)) }
                    .filter { case (tag, score) => tag.nonEmpty && score > 0 }
                // An unparsable activity counts as 0.0 and removes the user below.
                val active_score = scala.util.Try(active_type.toDouble).getOrElse(0.0)
                (imei, allTags, active_score)
            }.filter(_._3 > 0)

            // Heat of a tag: activity-weighted sum of that tag's score over all users.
            // The result is collected to the driver and sorted by tag name.
            val tagList = userListClean.flatMap { case (_, allTags, active_score) =>
                allTags.map { case (tag, score) => (tag, score * active_score) }
            }.reduceByKey(_ + _).collect().sortBy(_._1)

            // Derive the split keys from the per-tag heat
            val splitPoint = computeSplitPoints(tagList, RegionCount)
            splitPoint.foreach(println)

            if (splitPoint.isEmpty) {
                // Without split keys the merge phase of the rebuilder would fold every
                // region of the table into one; leave the table untouched instead.
                println("No split point computed, region rebuild skipped")
            } else {
                // Rebuild the regions according to the split keys.
                // NOTE(review): the rebuilder class published next to this object is declared as
                // `RegionRebuilderTest`; confirm that a `RegionRebuilder` class with the same
                // constructor exists in this package.
                val hbaseTableName = "hbase_test"
                val regionRebuilder = new RegionRebuilder(hbaseTableName, splitPoint)
                regionRebuilder.execute()
            }
        } finally {
            sc.stop()
        }
    }

    /**
     * Walks the tags in order, accumulating heat; whenever the running sum reaches the
     * per-region quota (`total heat / regionCount`) the NEXT tag becomes a split key and
     * the running sum restarts from zero.
     *
     * When the quota is reached on the very last tag there is no next tag, so no key is
     * emitted (an empty split key cannot be used to split a region).
     *
     * @param tagHeat     (tag, heat) pairs, already sorted by tag
     * @param regionCount number of quotas the total heat is divided into; must be positive
     * @return the split keys, in the order of `tagHeat`
     */
    private[hbase] def computeSplitPoints(tagHeat: Array[(String, Double)], regionCount: Int): Array[String] = {
        require(regionCount > 0, "regionCount must be positive")
        val quota = tagHeat.map(_._2).sum / regionCount
        val points = mutable.ArrayBuffer.empty[String]
        var accumulated = 0.0
        for (i <- tagHeat.indices) {
            accumulated += tagHeat(i)._2
            if (accumulated >= quota) {
                if (i < tagHeat.length - 1) {
                    points += tagHeat(i + 1)._1
                }
                accumulated = 0.0
            }
        }
        points.toArray
    }

}
package com.hbase

import java.io.IOException

import org.apache.hadoop.hbase.client.{Admin, Connection, ConnectionFactory}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HRegionInfo, TableName}

import scala.collection.JavaConversions._
import scala.collection.mutable

/**
 * Re-shapes the regions of one HBase table so that its region boundaries follow a list of split keys.
 *
 * `execute()` opens a connection, splits the table at every key, then repeatedly merges
 * neighbouring regions that lie between two consecutive keys until no interval holds more
 * than two regions in a pass, and finally closes the admin handle and the connection.
 *
 * With no usable split key the merge phase folds all regions of the table into one.
 *
 * @param tableName name of the HBase table to rebuild
 * @param pointArr  split keys (row keys); null/empty entries and duplicates are ignored and
 *                  the remaining keys are processed in ascending `String` order
 */
class RegionRebuilderTest(tableName: String, pointArr:Array[String]) {

    var connection: Connection = null
    var admin: Admin = null
    var table: TableName = null
    // Pause between two polls of the region list, in milliseconds
    var SLEEP_TIME = 100
    // How many times a single failed merge is retried
    var SINGLE_MERGE_RETRY_TIMES = 5
    // Longest wait for a single merge to become visible, in milliseconds
    var SINGLE_MERGE_MAX_WAIT_TIME = 1000*60
    // Overall time budget for all merge passes, in milliseconds
    var MERGE_TIME_OUT = 1000*60*10
    // How many times a single failed split is retried
    var SINGLE_SPLIT_RETRY_TIMES = 5
    // Longest wait for a single split to become visible, in milliseconds
    var SINGLE_SPLITE_MAX_WAIT_TIME = 1000*60

    // Normalised split keys. An empty key can never be a region boundary (and cannot be
    // split on), so it is dropped here; the merge pass relies on ascending order.
    private val splitKeys: Array[String] =
        Option(pointArr).getOrElse(Array.empty[String])
            .filter(key => key != null && key.nonEmpty)
            .distinct
            .sorted

    /**
     * Runs init, split and merge in that order. Any exception is printed and swallowed,
     * so callers never see a failure; the admin handle and the connection are always closed.
     */
    def execute(): Unit={

        try {
            // Open the connection and resolve the table
            init()
            // Split the regions at every key
            split()
            // Merge the surplus regions between the keys
            merge()
        } catch {
            case e: Exception =>{
                e.printStackTrace()
            }
        } finally {
            if(admin != null){
                // A failure to close the admin must not prevent closing the connection.
                try admin.close() catch {
                    case e: IOException => e.printStackTrace()
                }
            }
            if(connection != null)
                connection.close()
        }
    }

    /**
     * Repeats merge passes until a pass reports that nothing is left to merge.
     *
     * @throws IOException when passes are still needed after MERGE_TIME_OUT milliseconds
     */
    private def merge(): Unit={
        val begin = System.currentTimeMillis()
        var continue: Boolean = false
        var elapsed = 0L
        do{
            continue = mergeOnce()
            elapsed = System.currentTimeMillis() - begin
        }while(continue && elapsed < MERGE_TIME_OUT)
        // The loop only exits with `continue` still set when the time budget ran out.
        if(continue){
            throw new IOException("Merge TimeOut")
        }
    }

    /**
     * One pass over the current regions of the table: inside every interval delimited by
     * the split keys, regions are merged pairwise.
     *
     * @return true when another pass is required
     */
    private def mergeOnce(): Boolean={
        val regions = admin.getTableRegions(table)
        // Position of the split key that closes the interval currently being walked
        var index = 0
        var tobeMergedRegion: HRegionInfo = null
        val pointMap = mutable.Map.empty[Int, Int]
        var maxSonRegionNum = 0
        regions.foreach(region =>{
            // Largest number of regions seen in one interval; more than 2 needs another pass
            maxSonRegionNum = getMaxSonRegions(maxSonRegionNum, pointMap, index)
            if(index >= splitKeys.length){
                // Past the last split key: everything left belongs to the final interval
                tobeMergedRegion = mergeAndGetNextTobeMerge(tobeMergedRegion, region)
            }else if(Bytes.toString(region.getEndKey).compareTo(splitKeys(index)) < 0){
                // The region ends before the split key: pair it with a neighbour
                tobeMergedRegion = mergeAndGetNextTobeMerge(tobeMergedRegion, region)
            }else{
                // The region ends at (or after) the split key and therefore closes the interval
                if(tobeMergedRegion != null){
                    mergeRegion(tobeMergedRegion, region, 0)
                    tobeMergedRegion = null
                }
                // Move on to the next split key
                index += 1
            }
        })
        // Another pass is needed when some split key was not reached or when an
        // interval held more than two regions (pairwise merging only halves them).
        index < splitKeys.length || maxSonRegionNum > 2
    }

    /**
     * Pairs regions up: the first region of a pair is remembered, the second one is merged
     * with the remembered region.
     *
     * @param tobeMergedRegion region waiting for a partner, or null
     * @param region           region currently visited
     * @return the region that now waits for a partner, or null after a merge
     */
    private def mergeAndGetNextTobeMerge(tobeMergedRegion: HRegionInfo, region: HRegionInfo): HRegionInfo ={
        if(tobeMergedRegion == null){
            region
        }else{
            mergeRegion(tobeMergedRegion, region, 0)
            null
        }
    }

    /**
     * Requests a forcible merge of two regions and polls the region list until the end key
     * of the first region is no longer a region boundary.
     *
     * @param hRegionInfo1 first region of the pair
     * @param hRegionInfo2 second region of the pair
     * @param retriedTimes number of attempts already made
     * @throws IOException after SINGLE_MERGE_RETRY_TIMES failed retries; the last failure is the cause
     */
    private def mergeRegion(hRegionInfo1: HRegionInfo, hRegionInfo2: HRegionInfo, retriedTimes: Int): Unit={
        // End key of the first region: the boundary that disappears once the merge is done.
        val boundary: Option[String] =
            if (hRegionInfo1.getEndKey.isEmpty) None else Some(Bytes.toString(hRegionInfo1.getEndKey))
        try {
            // On a retry the previous request may have completed in the meantime.
            val alreadyMerged = retriedTimes > 0 && isMergeFinished(boundary, getAllEndKey())
            if (!alreadyMerged) {
                admin.mergeRegions(hRegionInfo1.getEncodedNameAsBytes, hRegionInfo2.getEncodedNameAsBytes, true)
                val waitBegin = System.currentTimeMillis()
                do {
                    if (System.currentTimeMillis() - waitBegin > SINGLE_MERGE_MAX_WAIT_TIME) {
                        throw new IOException("wait for merge timeout")
                    }
                    Thread.sleep(SLEEP_TIME)
                } while (!isMergeFinished(boundary, getAllEndKey()))
            }
        } catch {
            case e: Exception =>{
                val retiredTimesTemp = retriedTimes+1
                if(retiredTimesTemp <= SINGLE_MERGE_RETRY_TIMES){
                    mergeRegion(hRegionInfo1, hRegionInfo2, retiredTimesTemp)
                }else{
                    throw new IOException("mergeRegion retired max times", e)
                }
            }
        }
    }

    /**
     * A merge is considered finished when the boundary between the two regions is no longer
     * among the end keys of the table. With no boundary to observe it is reported as finished.
     */
    private def isMergeFinished(boundary: Option[String], pointSet: mutable.Set[String]): Boolean={
        boundary.forall(key => !pointSet.contains(key))
    }

    /**
     * Counts one more region for the interval `index` in `pointMap` and returns the larger
     * of `maxSonRegionNum` and that interval's new count.
     */
    private def getMaxSonRegions(maxSonRegionNum: Int, pointMap: mutable.Map[Int, Int], index: Int)={
        val regionSize = pointMap.getOrElse(index, 0) + 1
        pointMap += index-> regionSize
        math.max(maxSonRegionNum, regionSize)
    }

    /** Splits the table at every split key, one key after the other. */
    private def split(): Unit={
        for(point <- splitKeys){
            splitByPoint(point, 0)
        }
    }

    /**
     * Requests a split at `point` unless it already is a region boundary, then polls the
     * region list until `point` shows up as an end key.
     *
     * @param point      row key to split at
     * @param retryTimes number of attempts already made
     * @throws Exception the last failure, after SINGLE_SPLIT_RETRY_TIMES failed retries
     */
    private def splitByPoint(point: String, retryTimes: Int): Unit={
        try {
            if (!isSplitFinished(getAllEndKey(), point)) {
                // Encode as UTF-8, the encoding Bytes.toString uses when the keys are read back.
                admin.split(table, Bytes.toBytes(point))
                val waitBegin = System.currentTimeMillis()
                do {
                    if (System.currentTimeMillis() - waitBegin > SINGLE_SPLITE_MAX_WAIT_TIME) {
                        throw new IOException("Wait for split timeout")
                    }
                    Thread.sleep(SLEEP_TIME)
                } while (!isSplitFinished(getAllEndKey(), point))
            }
        } catch {
            case e: Exception =>{
                val retryTemp = retryTimes + 1
                if(retryTemp <= SINGLE_SPLIT_RETRY_TIMES){
                    Thread.sleep(SLEEP_TIME)
                    splitByPoint(point, retryTemp)
                }else throw e
            }
        }
    }

    /** A split is finished once `point` is the end key of some region. */
    private def isSplitFinished(pointSet: mutable.Set[String], point: String): Boolean={
        pointSet.contains(point)
    }

    /** Returns the non-empty end keys of all current regions of the table, decoded as strings. */
    private def getAllEndKey(): mutable.Set[String]={
        val pointSet = mutable.Set.empty[String]
        admin.getTableRegions(table).foreach(region =>{
            val endKey = region.getEndKey
            if(!endKey.isEmpty){
                pointSet += Bytes.toString(endKey)
            }
        })
        pointSet
    }

    /** Opens the HBase connection, obtains the admin handle and resolves the table name. */
    private def init(): Unit={
        connection = ConnectionFactory.createConnection()
        admin = connection.getAdmin
        table = TableName.valueOf(tableName)
    }
}

 

你可能感兴趣的:(HBase,spark,scala,HBase)