Importing data from Hadoop 2.6 into Elasticsearch 2.2 (parsing HBase export data)

Reference:

https://www.elastic.co/guide/en/elasticsearch/hadoop/current/mapreduce.html


1. Download the dependency jar

elasticsearch-hadoop-2.2.0.jar; pull it from your private Maven repository (the Maven Central coordinates are org.elasticsearch:elasticsearch-hadoop:2.2.0). The run command at the end uses a jar-with-dependencies build, so the es-hadoop classes ship inside the job jar.

2. Data flow

HBase Export output -> HDFS -> Elasticsearch 2.2
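The HDFS files come from HBase's bundled Export job (hbase org.apache.hadoop.hbase.mapreduce.Export <table> <hdfs-dir>), which writes SequenceFiles mapping row key (ImmutableBytesWritable) to Result. That shape is what drives the InputFormat and serialization choices in the driver below. A quick way to confirm your input matches (a minimal sketch; pass it one of the exported part files):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;

// Prints the key/value classes of a SequenceFile; for HBase Export output you
// should see ImmutableBytesWritable and Result.
public class PeekExport {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		SequenceFile.Reader reader = new SequenceFile.Reader(conf,
				SequenceFile.Reader.file(new Path(args[0])));
		try {
			System.out.println("key   = " + reader.getKeyClassName());
			System.out.println("value = " + reader.getValueClassName());
		} finally {
			reader.close();
		}
	}
}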

3. The code, pasted directly below (the MyJob driver, then the MyMaper mapper)


import java.util.Map.Entry;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.mapreduce.MutationSerialization;
import org.apache.hadoop.hbase.mapreduce.ResultSerialization;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.elasticsearch.hadoop.mr.EsOutputFormat;
import org.elasticsearch.hadoop.mr.LinkedMapWritable;

public class MyJob extends Configured implements Tool {
	@Override
	public int run(String[] args) throws Exception {
		// ToolRunner has already stripped the -D generic options, so args[0]
		// is the HDFS input directory produced by the HBase Export job.
		Path input = new Path(args[0]);
		Configuration conf = getConf();
		// Disable speculative execution so a speculatively re-run task cannot
		// index the same documents into ES twice.
		conf.setBoolean("mapred.map.tasks.speculative.execution", false);
		conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
		// Register the HBase serializations so the Result values inside the
		// Export SequenceFiles can be deserialized. Not needed for data exported
		// by HBase 0.9x; required for HBase 1.x and later.
		conf.setStrings("io.serializations", conf.get("io.serializations"),
				MutationSerialization.class.getName(), ResultSerialization.class.getName());
		//conf.set("es.nodes", "host228"); // ES node(s) to connect to; passed via -D on the command line instead
		conf.set("es.port", "9200"); // HTTP port of the ES REST endpoint
		//conf.set("es.resource", "ehlindex/tr_plate"); // target index/type; passed via -D instead
		// Redundant here (ToolRunner already parsed the generic options) but
		// harmless; the loop dumps the effective configuration for debugging.
		GenericOptionsParser parser = new GenericOptionsParser(conf, args);
		for (Entry<String, String> entry : conf) {
			System.out.printf("%s=%s\n", entry.getKey(), entry.getValue());
		}
		Job job = Job.getInstance(conf, "hfile 2 es");
		
		job.setJarByClass(MyJob.class);
		FileInputFormat.addInputPath(job, input);
		
		job.setInputFormatClass(SequenceFileInputFormat.class); // HBase Export output is a SequenceFile
		job.setOutputFormatClass(EsOutputFormat.class);         // each map output value becomes one ES document
		job.setMapOutputValueClass(LinkedMapWritable.class);
		job.setNumReduceTasks(0);                               // map-only job
		job.setMapperClass(MyMaper.class);
		
		return job.waitForCompletion(true)?0:1;
	}
	public static void main(String[] args) throws Exception {
		int exitCode = ToolRunner.run(new MyJob(), args);
		System.exit(exitCode);
	}
}
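A small aside: the two speculative-execution keys in run() are the old mapred.* names. They still work on Hadoop 2.6 as deprecated aliases; if you prefer the current property names, this is a drop-in replacement for those two lines:

conf.setBoolean("mapreduce.map.speculative", false);    // replaces mapred.map.tasks.speculative.execution
conf.setBoolean("mapreduce.reduce.speculative", false); // replaces mapred.reduce.tasks.speculative.execution

Next, the mapper: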

import java.io.IOException;

import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.elasticsearch.hadoop.mr.LinkedMapWritable;

import com.ehl.im.transfer.TRFieldEnum;
import com.ehl.im.transfer.TravelRecord;

public class MyMaper extends Mapper<ImmutableBytesWritable, Result, NullWritable, LinkedMapWritable> {

	@Override
	protected void map(ImmutableBytesWritable key, Result value, Context context)
			throws IOException, InterruptedException {
		// "-D notinsert=true" turns the job into a parse-only dry run: records
		// are read but nothing is written to ES.
		if ("true".equals(context.getConfiguration().get("notinsert"))) {
			return;
		}
		try {
			LinkedMapWritable linkObj = result2Map(value);
			context.write(NullWritable.get(), linkObj);
		} catch (Exception e) {
			// Log and skip malformed records rather than failing the task.
			e.printStackTrace();
		}
	}

	private LinkedMapWritable result2Map(Result r) {
		LinkedMapWritable linkObj = new LinkedMapWritable();

		// Each exported row stores the whole record as a single value in family "cf".
		byte[] passCarRowValue = r.getValue("cf".getBytes(), null);
		TravelRecord record = new TravelRecord(passCarRowValue);
		linkObj.put(new Text("timestamp"), new LongWritable(Long.valueOf(record.getStringValue(TRFieldEnum.TIMESTAMP))));
		linkObj.put(new Text("car_plate_number"), new Text(record.getStringValue(TRFieldEnum.CAR_PLATE_NUMBER)));
		try {
			// CarPlateCommonUtil is an internal helper class; its import was omitted in the original.
			linkObj.put(new Text("carplateindex"), new Text(CarPlateCommonUtil.produceCarPlateIndexStr(record.getStringValue(TRFieldEnum.CAR_PLATE_NUMBER))));
		} catch (Exception e) {
			e.printStackTrace();
		}
		linkObj.put(new Text("speed"), new LongWritable(Long.valueOf(record.getStringValue(TRFieldEnum.SPEED))));

		linkObj.put(new Text("lane_id"), new Text(record.getStringValue(TRFieldEnum.LANE_ID)));
		linkObj.put(new Text("camera_location"), new Text(record.getStringValue(TRFieldEnum.CAMERA_LOCATION)));
		linkObj.put(new Text("bay_id"), new Text(record.getStringValue(TRFieldEnum.BAY_ID)));
		linkObj.put(new Text("camera_orientation"), new Text(record.getStringValue(TRFieldEnum.CAMERA_ORIENTATION)));

		linkObj.put(new Text("car_brand"), new Text(record.getStringValue(TRFieldEnum.CAR_BRAND)));
		linkObj.put(new Text("car_color"), new Text(record.getStringValue(TRFieldEnum.CAR_COLOR)));

		linkObj.put(new Text("car_plate_color"), new Text(record.getStringValue(TRFieldEnum.CAR_PLATE_COLOR)));
		linkObj.put(new Text("car_plate_type"), new Text(record.getStringValue(TRFieldEnum.CAR_PLATE_TYPE)));

		linkObj.put(new Text("car_status"), new Text(record.getStringValue(TRFieldEnum.CAR_STATUS)));
		linkObj.put(new Text("travel_orientation"), new Text(record.getStringValue(TRFieldEnum.TRAVEL_ORIENTATION)));

		linkObj.put(new Text("plate_coordinates"), new Text(record.getStringValue(TRFieldEnum.PLATE_COORDINATES)));
		linkObj.put(new Text("driver_coordinates"), new Text(record.getStringValue(TRFieldEnum.DRIVER_COORDINATES)));

		// Up to three image URLs, indexed as tp1..tp3 when present.
		String[] imgUrls = record.getStringArrayValue(TRFieldEnum.IMAGE_URLS);
		if (imgUrls != null) {
			if (imgUrls.length >= 1 && imgUrls[0] != null && !"".equals(imgUrls[0])) {
				linkObj.put(new Text("tp1"), new Text(imgUrls[0]));
			}
			if (imgUrls.length >= 2 && imgUrls[1] != null && !"".equals(imgUrls[1])) {
				linkObj.put(new Text("tp2"), new Text(imgUrls[1]));
			}
			if (imgUrls.length >= 3 && imgUrls[2] != null && !"".equals(imgUrls[2])) {
				linkObj.put(new Text("tp3"), new Text(imgUrls[2]));
			}
		}
		return linkObj;
	}
}
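One fragility worth flagging: new Text((String) null) throws a NullPointerException, so if TravelRecord.getStringValue can return null for a missing field (an assumption -- TravelRecord is an internal class not shown here), a single sparse record blows up inside result2Map and gets swallowed by the catch block in map(). A null-safe put helper along these lines could replace the direct puts:

// Sketch of a null-safe put for MyMaper, assuming getStringValue may return
// null or "" for absent fields.
private void putText(LinkedMapWritable map, String field, String value) {
	if (value != null && !value.isEmpty()) {
		map.put(new Text(field), new Text(value));
	}
}

// Usage inside result2Map, e.g. for the car_brand line:
// putText(linkObj, "car_brand", record.getStringValue(TRFieldEnum.CAR_BRAND));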
The following creates the ES index and mapping:
curl -XPOST host213:9200/ehlindex -d '{
  "settings" : { "number_of_shards" : 20, "number_of_replicas" : 0 },
  "mappings" : {
    "tr_plate" : {
      "properties" : {
        "timestamp" :           { "type" : "long",   "index" : "not_analyzed" },
        "car_plate_number" :    { "type" : "string", "index" : "not_analyzed" },
        "speed" :               { "type" : "long",   "index" : "not_analyzed" },
        "lane_id" :             { "type" : "string", "index" : "not_analyzed" },
        "camera_location" :     { "type" : "string", "index" : "not_analyzed" },
        "bay_id" :              { "type" : "string", "index" : "not_analyzed" },
        "camera_orientation" :  { "type" : "string", "index" : "not_analyzed" },
        "car_brand" :           { "type" : "string", "index" : "not_analyzed" },
        "car_color" :           { "type" : "string", "index" : "not_analyzed" },
        "car_plate_color" :     { "type" : "string", "index" : "not_analyzed" },
        "car_plate_type" :      { "type" : "string", "index" : "not_analyzed" },
        "tp1" :                 { "type" : "string", "index" : "not_analyzed" },
        "tp2" :                 { "type" : "string", "index" : "not_analyzed" },
        "tp3" :                 { "type" : "string", "index" : "not_analyzed" },
        "car_status" :          { "type" : "string", "index" : "not_analyzed" },
        "travel_orientation" :  { "type" : "string", "index" : "not_analyzed" },
        "plate_coordinates" :   { "type" : "string", "index" : "not_analyzed" },
        "driver_coordinates" :  { "type" : "string", "index" : "not_analyzed" },
        "carplateindex" :       { "type" : "string", "index" : "analyzed" }
      }
    }
  }
}'
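If you would rather create the index from Java than from curl, the same request can go through the ES 2.x TransportClient. A minimal sketch, assuming the elasticsearch 2.2 core jar on the classpath and the default transport port 9300 (the mapping string is trimmed here; reuse the full field list from the curl body above):

import java.net.InetAddress;

import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;

public class CreateIndex {
	public static void main(String[] args) throws Exception {
		TransportClient client = TransportClient.builder().build()
				.addTransportAddress(new InetSocketTransportAddress(
						InetAddress.getByName("host213"), 9300));
		try {
			String mapping = "{\"properties\":{"
					+ "\"timestamp\":{\"type\":\"long\"},"
					+ "\"car_plate_number\":{\"type\":\"string\",\"index\":\"not_analyzed\"}"
					+ "}}"; // trimmed; add the remaining fields as in the curl body
			client.admin().indices().prepareCreate("ehlindex")
					.setSettings(Settings.settingsBuilder()
							.put("number_of_shards", 20)
							.put("number_of_replicas", 0)
							.build())
					.addMapping("tr_plate", mapping)
					.get();
		} finally {
			client.close();
		}
	}
}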


Running the MR job requires the es.nodes and es.resource parameters, passed as -D generic options:

hadoop jar downloads/Hfile2Es-0.0.1-SNAPSHOT-jar-with-dependencies.jar -D es.resource=ehlindex/tr_plate -D es.nodes=host228 /yangxTest/qhd_data1/qhd_data1

All of these parameters are readable inside MyJob (and from the mapper via the Configuration), which keeps the job flexible. Pure hands-on material; digest it at your own pace.
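To see how those flags travel: ToolRunner runs GenericOptionsParser for you, moving every -D key=value into the Configuration and stripping it from args, which is why args[0] in MyJob.run() is the input path rather than "-D". A standalone sketch of the same mechanics (notinsert is the mapper's dry-run switch from above):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.GenericOptionsParser;

public class ArgsDemo {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		// Consumes the -D options, leaving only positional arguments behind.
		String[] remaining = new GenericOptionsParser(conf, args).getRemainingArgs();
		System.out.println("es.resource = " + conf.get("es.resource"));
		System.out.println("notinsert   = " + conf.get("notinsert", "false"));
		System.out.println("input path  = " + (remaining.length > 0 ? remaining[0] : "<none>"));
	}
}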




