背景:公司有需求将hive数据导入es,之前是通过datax进行导数的;但是datax多线程很耗内存,并且经常性会出现一些程序上的问题。故想要使用hive外部表映射es的方式将数据导入es
添加 jar 包,创建环境
- 将相关 jar 包添加至 hive 环境:进入 es 官网,下载对应版本的 jar 包,网址:https://artifacts.elastic.co
- 提取压缩包中的 elasticsearch-hadoop-hive-6.3.2.jar。hive 添加 jar 包的方法可参考:http://note.youdao.com/noteshare?id=061ad30a8eee86362bb154cf6f923c25&sub=F68ADFE343044A8193EA06026FEAC0B3。
- 个人推荐,最有效且长期的方式,将elasticsearch-hadoop-hive-6.3.2.jar包放到/opt/cloudera/parcels/CDH/lib/hive/auxlib路径下,在CDH管理界面重启hive
构建 es 索引
- 注意:es 索引的 mapping 中不要加上 "dynamic": "strict",否则极有可能会报错
- 创建 hive 外部表映射 es:
-- Hive external table `demo`, backed by Elasticsearch through the ES-Hadoop
-- storage handler. Rows inserted into this table are written to the ES
-- resource configured below.
--
-- Explanatory comments are kept out of the statement body on purpose: some
-- Hive CLI versions mishandle `--` comments placed inside a statement.
--
-- Column notes:
--   enterprises / theme_keywords are complex columns. Their STRUCT field lists
--   mirror the sub-fields of the nested ES mapping for the same index
--   (enterprises: eid, name; theme_keywords: name, count).
--   NOTE(review): if one row must carry several entries per document, declare
--   these columns as ARRAY<STRUCT<...>> instead - confirm against the source data.
--
-- Table properties:
--   es.nodes              ES cluster node address (replace `ip` with a real host).
--   es.index.auto.create  When 'true', the index is created automatically on
--                         write if it does not exist in ES yet.
--   es.resource           Target index/type the data is written to.
--   es.mapping.id         Column used as the ES document _id (the unique key).
--   es.mapping.names      Comma-separated `hive_column:es_field` pairs.
CREATE EXTERNAL TABLE demo(
    `es_id` STRING COMMENT 'ES唯一键',
    `oid` STRING COMMENT '新闻id',
    `enterprises` STRUCT<eid:STRING,name:STRING> COMMENT '相关企业列表',
    `title` STRING COMMENT '新闻标题',
    `names` STRING COMMENT '相关人员名列表',
    `companies` STRING COMMENT '相关公司名列表',
    `url` STRING COMMENT '新闻链接',
    `neg_index` STRING COMMENT '情感正负面打分',
    `sentiment` STRING COMMENT '情感正负面',
    `brief` STRING COMMENT '摘要',
    `keywords` STRING COMMENT '关键词',
    `keywords_desc` STRING COMMENT '关键字前后文字',
    `class_type` STRING COMMENT '大类',
    `pub_time` STRING COMMENT '发布时间',
    `pub_time_date_long` STRING COMMENT '发布时间',
    `pub_time_date` STRING COMMENT '发布时间',
    `pub_time_year` STRING COMMENT '发布年份',
    `pub_time_month` STRING COMMENT '发布年月',
    `tag_list` STRING COMMENT '标签列表',
    `source` STRING COMMENT '来源',
    `theme_keywords` STRUCT<name:STRING,count:SMALLINT> COMMENT '主题词',
    `deprecated` STRING COMMENT 'I 新增 U更新 D 删除'
)
STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler'
TBLPROPERTIES(
    'es.nodes' = 'ip:9200',
    'es.index.auto.create' = 'true',
    'es.resource' = 'demo/content',
    'es.mapping.id'='es_id',
    'es.mapping.names' = 'oid:oid,enterprises:enterprises,title:title,names:names,companies:companies,url:url,neg_index:neg_index,sentiment:sentiment,brief:brief,keywords:keywords,keywords_desc:keywords_desc,class_type:class_type,pub_time:pub_time,pub_time_date_long:pub_time_date_long,pub_time_date:pub_time_date,pub_time_year:pub_time_year,pub_time_month:pub_time_month,tag_list:tag_list,source:source,theme_keywords:theme_keywords,deprecated:deprecated'
);
- 插入数据:向该外部表写入数据即会同步写入 es,例如:INSERT OVERWRITE TABLE demo SELECT ... FROM <hive 源表>;
- es 索引:
put demo
{
  "mappings": {
    "content": {
      "properties": {
        "brief":              { "type": "text", "index": false },
        "class_type":         { "type": "keyword" },
        "companies":          { "type": "keyword", "ignore_above": 256 },
        "keywords":           { "type": "keyword", "ignore_above": 256 },
        "keywords_desc":      { "type": "text", "index": false },
        "names":              { "type": "keyword", "ignore_above": 256 },
        "neg_index":          { "type": "half_float" },
        "oid":                { "type": "keyword" },
        "pub_time":           { "type": "long" },
        "sentiment":          { "type": "keyword" },
        "pub_time_date_long": { "type": "keyword" },
        "pub_time_date":      { "type": "keyword" },
        "pub_time_year":      { "type": "keyword" },
        "pub_time_month":     { "type": "keyword" },
        "source":             { "type": "keyword", "ignore_above": 50 },
        "tag_list":           { "type": "keyword" },
        "enterprises": {
          "type": "nested",
          "properties": {
            "eid":  { "type": "keyword" },
            "name": { "type": "keyword", "ignore_above": 256 }
          }
        },
        "theme_keywords": {
          "type": "nested",
          "properties": {
            "count": { "type": "short" },
            "name":  { "type": "keyword", "ignore_above": 50 }
          }
        },
        "title":      { "type": "text", "analyzer": "ik_smart" },
        "url":        { "type": "keyword", "ignore_above": 256 },
        "deprecated": { "type": "keyword", "doc_values": true }
      }
    }
  },
  "settings": {
    "index": {
      "max_result_window": 30000,
      "indexing.slowlog.level": "info",
      "indexing.slowlog.source": "1000",
      "indexing.slowlog.threshold.index.info": "5s",
      "indexing.slowlog.threshold.index.warn": "10s",
      "search.slowlog.level": "info",
      "search.slowlog.threshold.query.info": "1s",
      "search.slowlog.threshold.query.warn": "4s",
      "routing.rebalance.enable": "replicas",
      "refresh_interval": "120s",
      "store.type": "niofs",
      "number_of_shards": "3",
      "number_of_replicas": "0"
    }
  }
}