After stepping into quite a few pitfalls, I finally managed to turn the data into HFile files with Spark and bulk-load them into HBase.
package com.hun.scala
import java.text.SimpleDateFormat
import java.util.Date
import org.apache.commons.codec.digest.DigestUtils
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.client.{HTable, Table, _}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{HFileOutputFormat2, LoadIncrementalHFiles}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, KeyValue, TableName}
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Created by hun on 2016/10/17.
 *
 * It finally works.
 */
object Test6 {
  def main(args: Array[String]) = {
    // Create the SparkContext; the default configuration would also work
    //val sc = new SparkContext(new SparkConf())
    val sc = new SparkContext("local", "app name")
    // HBase column family
    val columnFamily1 = "f1"
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    conf.set("hbase.zookeeper.quorum", "120.27.111.55")
    // Read the raw records, strip the angle brackets and drop duplicates
    val res1 = sc.textFile("file:///E:/BaiduYunDownload/data1")
      .map(x => x.replaceAll("<|>", ""))
      .distinct()
    // Keep only the "REC" records
    val res2 = res1.filter(x => x.contains("REC"))
    val sourceRDD = res2.flatMap { x =>
      val arg0 = x.split(",")
      // Turn every "name=value" field into "name,value" and drop fields without a value
      val arg1 = arg0.map(y => y.replaceFirst("=", ","))
        .filter(s => s.split(",").length > 1)
      // Row key: MD5 of the reversed timestamp (Long.MaxValue - time) taken from field 11,
      // concatenated with the id taken from field 17
      val sdf = new SimpleDateFormat("yyyyMMdd")
      val date = (Long.MaxValue - sdf.parse(arg0(11).replaceFirst("=", ",").split(",")(1)).getTime).toString
      val key = DigestUtils.md5Hex(date).concat(arg0(17).replaceFirst("=", ",").split(",")(1))
      // Emit (rowKey, (family, qualifier, value)) tuples, sorted within the record
      arg1.map(z => (key, (columnFamily1, z.split(",")(0), z.split(",")(1)))).sorted
    }
    // Sort by row key, which is required when writing HFiles
    val source = sourceRDD.sortBy(_._1)
    source.foreach(println)
    val date = new Date().getTime
    val rdd = source.map { x =>
      // Convert the RDD into the shape HFileOutputFormat2 expects:
      // the key must be an ImmutableBytesWritable and the value a KeyValue
      val rowKey = x._1
      val family = x._2._1
      val column = x._2._2
      val value = x._2._3
      (new ImmutableBytesWritable(Bytes.toBytes(rowKey)),
        new KeyValue(Bytes.toBytes(rowKey), Bytes.toBytes(family), Bytes.toBytes(column), date, Bytes.toBytes(value)))
    }
    rdd.foreach(print)
    // Temporary directory where the generated HFiles are written
    val stagingFolder = "file:///E:/BaiduYunDownload/data12"
    // Write the HFiles to that directory
    rdd.saveAsNewAPIHadoopFile(stagingFolder,
      classOf[ImmutableBytesWritable],
      classOf[KeyValue],
      classOf[HFileOutputFormat2],
      conf)
    // At this point stagingFolder contains the generated HFiles.
    // Now bulk-load them into HBase; from here on it is all HBase API calls.
    val load = new LoadIncrementalHFiles(conf)
    // HBase table name
    val tableName = "output_table"
    // Create the HBase connection from the configuration built above
    val conn = ConnectionFactory.createConnection(conf)
    // Look the table up by name
    val table: Table = conn.getTable(TableName.valueOf(tableName))
    try {
      // The table's region distribution could be fetched like this:
      // val regionLocator = conn.getRegionLocator(TableName.valueOf(tableName))
      // Create a Hadoop MapReduce job
      val job = Job.getInstance(conf)
      job.setJobName("DumpFile")
      // Crucial: because we generate HFiles, the map output key must be ImmutableBytesWritable
      job.setMapOutputKeyClass(classOf[ImmutableBytesWritable])
      // and the map output value must be KeyValue
      job.setMapOutputValueClass(classOf[KeyValue])
      // Configure HFileOutputFormat2
      //HFileOutputFormat2.configureIncrementalLoad(job, table, regionLocator)
      HFileOutputFormat2.configureIncrementalLoadMap(job, table)
      // Run the bulk load
      load.doBulkLoad(new Path(stagingFolder), table.asInstanceOf[HTable])
    } finally {
      table.close()
      conn.close()
    }
  }
}
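For reference, the commented-out getRegionLocator / configureIncrementalLoad lines above are the more common setup: passing the RegionLocator lets HFileOutputFormat2 partition the output along the table's region boundaries instead of using configureIncrementalLoadMap. A rough sketch of that variant, assuming the HBase 1.x client API and reusing the conf, conn, tableName, table, rdd, stagingFolder and load values from the code above (not the exact code I ran):

// Sketch only: configure the job from the table's region layout,
// then write the HFiles and bulk-load them.
val regionLocator = conn.getRegionLocator(TableName.valueOf(tableName))
val job = Job.getInstance(conf)
job.setJobName("DumpFile")
job.setMapOutputKeyClass(classOf[ImmutableBytesWritable])
job.setMapOutputValueClass(classOf[KeyValue])
// Sets up compression, bloom filters and a partitioner based on the region boundaries
HFileOutputFormat2.configureIncrementalLoad(job, table, regionLocator)
rdd.saveAsNewAPIHadoopFile(stagingFolder,
  classOf[ImmutableBytesWritable],
  classOf[KeyValue],
  classOf[HFileOutputFormat2],
  job.getConfiguration)
load.doBulkLoad(new Path(stagingFolder), conn.getAdmin, table, regionLocator)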
Depending on the source data, the load step may fail. The typical error is:

java.io.IOException: Added a key not lexically larger than previous key=\x00\x02Mi\x0BsearchIndexuserId\x00\x00\x01>\xD5\xD6\xF3\xA3\x04, lastkey=\x00\x01w\x0BsearchIndexuserId\x00\x00\x01>\xD5\xD6\xF3\xA3\x04

The root cause: when you build HFiles yourself, the cells must already be in sorted order. Data written through Put is sorted by HBase automatically, but HFiles you generate yourself are written exactly in the order you produce them. So before generating the HFiles, sort everything manually by row key, then column family, then column qualifier; otherwise the load will only ever throw this error.
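As a minimal sketch of what that ordering looks like (illustrative names only: cells stands for an RDD[(String, (String, String, String))] of (rowKey, (family, qualifier, value)) tuples like source in the code above):

// Sketch only: sort by the full cell coordinate (rowKey, family, qualifier)
// so HFileOutputFormat2 never sees an out-of-order KeyValue.
val sortedCells = cells.sortBy { case (rowKey, (family, qualifier, _)) =>
  (rowKey, family, qualifier)
}
val hfileRdd = sortedCells.map { case (rowKey, (family, qualifier, value)) =>
  (new ImmutableBytesWritable(Bytes.toBytes(rowKey)),
    new KeyValue(Bytes.toBytes(rowKey), Bytes.toBytes(family),
      Bytes.toBytes(qualifier), Bytes.toBytes(value)))
}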