1. Loading data into HBase via MapReduce, using a map-only job (the reduce phase would work the same way; a minimal reducer sketch follows the code).
The code is as follows:
import java.io.IOException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class HBaseImport extends Configured implements Tool {
    static final Log LOG = LogFactory.getLog(HBaseImport.class);
    public static final String JOBNAME = "MRImport";

    public static class Map extends Mapper<LongWritable, Text, NullWritable, NullWritable> {
        Configuration configuration = null;
        HTable xTable = null;
        private boolean wal = true;
        static long count = 0;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
            configuration = context.getConfiguration();
            xTable = new HTable(configuration, "testKang");
            xTable.setAutoFlush(false);                  // buffer puts on the client side
            xTable.setWriteBufferSize(12 * 1024 * 1024); // 12 MB write buffer
            wal = true;
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // bug fixes vs. the original: split on "\t" (not "/t"), lower-case "if",
            // declare the Put, and only write it when the line really has two fields
            String[] all = value.toString().split("\t");
            if (all.length == 2) {
                Put put = new Put(Bytes.toBytes(all[0]));
                put.add(Bytes.toBytes("xxx"), Bytes.toBytes("20110313"), Bytes.toBytes(all[1]));
                if (!wal) {
                    put.setWriteToWAL(false);
                }
                xTable.put(put);
            }
            if ((++count % 100) == 0) {
                context.setStatus(count + " DOCUMENTS done!");
                context.progress();
                System.out.println(count + " DOCUMENTS done!");
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            super.cleanup(context);
            xTable.flushCommits();   // flush anything still sitting in the write buffer
            xTable.close();
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        String input = args[0];
        Configuration conf = HBaseConfiguration.create(getConf());
        conf.set("hbase.master", "m0:60000");
        Job job = new Job(conf, JOBNAME);
        job.setJarByClass(HBaseImport.class);
        job.setMapperClass(Map.class);
        job.setNumReduceTasks(0);                        // map-only job
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.setInputPaths(job, input);
        job.setOutputFormatClass(NullOutputFormat.class);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        int res = 1;
        try {
            res = ToolRunner.run(conf, new HBaseImport(), otherArgs);
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.exit(res);
    }
}
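To back up the remark that the reduce phase works the same way, here is a minimal sketch (not from the original post) of a reducer doing the identical puts. It assumes the mapper is changed to emit Text rowkeys and Text values, and it would need an extra import of org.apache.hadoop.mapreduce.Reducer; the table name "testKang", column family "xxx", and qualifier "20110313" are taken from the code above, everything else is an assumption.

    public static class Reduce extends Reducer<Text, Text, NullWritable, NullWritable> {
        private HTable xTable = null;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // same client-side buffering as the map-only version
            xTable = new HTable(context.getConfiguration(), "testKang");
            xTable.setAutoFlush(false);
            xTable.setWriteBufferSize(12 * 1024 * 1024);
        }

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            for (Text value : values) {
                // identical Put as in the mapper: rowkey -> family "xxx", qualifier "20110313"
                Put put = new Put(Bytes.toBytes(key.toString()));
                put.add(Bytes.toBytes("xxx"), Bytes.toBytes("20110313"), Bytes.toBytes(value.toString()));
                xTable.put(put);
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            xTable.flushCommits();
            xTable.close();
        }
    }

The driver would then call job.setReducerClass(Reduce.class) instead of job.setNumReduceTasks(0) and declare Text as the map output key/value classes.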
2. Loading data into HBase with a standalone Java client program. The code is as follows:
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;

public class InsertContactJava {
    public static long startTime;
    public static long rowkey = 0;                         // starting rowkey
    public static final int lineCount = 100000;            // rows per commit (not used in this excerpt)
    public static String tableName = "usercontact_kang";   // destination table
    public static int countLie = 8;                        // number of columns in the table

    public static void main(String[] args) throws IOException {
        startTime = System.currentTimeMillis() / 1000;
        System.out.println("start time = " + startTime);
        Thread t1 = new Thread() {
            @Override
            public void run() {
                try {
                    insert_one("/run/jar/123");
                    //loadByLieWithVector("/run/jar/123");
                    //loadByLieWithArrayList("/run/jar/123");
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        };
        t1.start();
    }

    public static void insert_one(String path) throws IOException {
        Configuration conf = HBaseConfiguration.create();
        HTable table = new HTable(conf, tableName);
        File f = new File(path);
        ArrayList<Put> list = new ArrayList<Put>();
        BufferedReader br = new BufferedReader(new FileReader(f));
        String tmp = br.readLine();
        int count = 0;
        while (tmp != null) {
            // bug fixes vs. the original: split on "\t" (not "/t") and parse every
            // line, so flushing a batch no longer skips the current row
            String[] arr_value = tmp.split("\t", 10);
            String[] first = arr_value[0].split("~", 5);
            String[] second = arr_value[1].split("~", 5);
            String rowname = getIncreasRowKey();            // row-key generator, defined elsewhere
            String firstaccount = first[0];
            String firstprotocolid = first[1];
            String firstdomain = first[2];
            String inserttime = Utils.getToday("yyyyMMdd"); // helper, defined elsewhere
            String secondaccount = second[0];
            String secondprotocolid = second[1];
            String seconddomain = second[2];
            String timescount = Integer.valueOf(arr_value[2]).toString();

            Put p = new Put(rowname.getBytes());
            p.add("ucvalue".getBytes(), "FIRSTACCOUNT".getBytes(), firstaccount.getBytes());
            p.add("ucvalue".getBytes(), "FIRSTDOMAIN".getBytes(), firstdomain.getBytes());
            p.add("ucvalue".getBytes(), "FIRSTPROTOCOLID".getBytes(), firstprotocolid.getBytes());
            p.add("ucvalue".getBytes(), "INSERTTIME".getBytes(), inserttime.getBytes());
            p.add("ucvalue".getBytes(), "SECONDACCOUNT".getBytes(), secondaccount.getBytes());
            p.add("ucvalue".getBytes(), "SECONDDOMAIN".getBytes(), seconddomain.getBytes());
            p.add("ucvalue".getBytes(), "SECONDPROTOCOLID".getBytes(), secondprotocolid.getBytes());
            p.add("ucvalue".getBytes(), "TIMESCOUNT".getBytes(), timescount.getBytes());
            list.add(p);

            if (list.size() >= 10000) {                     // commit in batches of 10,000 puts
                table.put(list);
                table.flushCommits();
                list.clear();
            }
            tmp = br.readLine();
            count++;
        }
        if (list.size() > 0) {                              // flush the final partial batch
            table.put(list);
            table.flushCommits();
        }
        br.close();
        table.close();
        System.out.println("total = " + count);
        long endTime = System.currentTimeMillis() / 1000;
        long costTime = endTime - startTime;
        System.out.println("end time = " + endTime);
        System.out.println(path + ": cost time = " + costTime);
    }

    // getIncreasRowKey(), Utils.getToday(), and the alternative load methods
    // referenced in main() belong to the original project and are omitted here.
}
Comparing the two approaches
MapReduce approach:
It starts out fast, but because the MR job and HBase compete for cluster resources, throughput drops sharply once a certain point is reached.
Standalone Java client approach:
Loading with multiple clients and multiple threads in parallel currently looks like the best option: the clients and the region servers sit on separate machines, disk reads and writes are separated, and the only bottlenecks are network and memory. The experienced people I consulted mostly recommend this approach, and they stress that it must be multi-client and multi-threaded; a minimal multi-threaded launcher sketch follows.
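For concreteness, here is a minimal sketch of the multi-thread part, reusing insert_one(path) from the class above. The ParallelLoader class name, the split file paths, and the thread count are made up for illustration, and it assumes the row-key generator is made thread-safe; running several such JVMs on different machines provides the multi-client part.

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class ParallelLoader {
    public static void main(String[] args) throws Exception {
        // hypothetical pre-split input files, one per thread
        String[] splits = { "/run/jar/split-00", "/run/jar/split-01",
                            "/run/jar/split-02", "/run/jar/split-03" };
        InsertContactJava.startTime = System.currentTimeMillis() / 1000;
        ExecutorService pool = Executors.newFixedThreadPool(splits.length);
        for (final String path : splits) {
            pool.submit(new Runnable() {
                @Override
                public void run() {
                    try {
                        // each call opens its own HTable, which is required
                        // because HTable instances are not thread-safe
                        InsertContactJava.insert_one(path);
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
            });
        }
        pool.shutdown();
        pool.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS);
    }
}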