ES整合HBase实现二级索引

前言:
ES整合HBase实现二级索引的目的,主要是因为HBase不具备全文检索能力,只有rowkey是全局唯一的索引.在数据量很大的情况下,如果想根据rowkey以外的字段进行检索,就没办法利用rowkey,只能全表扫描,效率非常低.而ES恰好擅长全文检索,所以可以用ES给HBase建立二级索引:先在ES中检索出rowkey,再到HBase中按rowkey读取数据.两个优秀的框架注定是要擦出点火花来的.

设计思想: 在es中存储标题,在hbase 中存储正文

实现思路:
1.使用代码解析excel,读取excel的内容
2.将读取到的内容保存到es和hbase中去
3.在es中按标题关键字搜索,得到命中文章的id(即hbase中的rowkey)
4.根据该rowkey到hbase中读取并显示文章的完整内容

ES整合HBase实现二级索引_第1张图片

一:在es中构建索引库

PUT /articles
{  
    "settings":{  
         "number_of_shards":3,   // number of primary shards
         "number_of_replicas":1, // number of replica copies per primary shard
         "analysis" : {
            "analyzer" : {
                "ik" : {      // custom analyzer named "ik"; note the title field below references "ik_max_word" directly, not this name
                    "tokenizer" : "ik_max_word"  // tokenizer used by the "ik" analyzer
                }
            }
        }
    }, 
    "mappings":{  
         "article":{   
             "dynamic":"strict",   // strict mapping: documents containing a field not declared under "properties" are rejected
             "_source": {  // controls which fields are kept in the stored _source document
               "includes": [
                  "id","title","from","readCounts","times"   // fields kept in _source
                ],
               "excludes": [    // fields dropped from _source
                  "content"    // the article body is not kept in ES; the Java code writes it to HBase
               ]
             },
             "properties":{  // field definitions
                 "id":{"type": "keyword", "store": true},  // id: keyword type, stored
                    // title: text type, stored, indexed, analyzed with ik_max_word
                 "title":{"type": "text","store": true,"index" : true,"analyzer": "ik_max_word"}, 
                    // from (article source): keyword type, stored
                 "from":{"type": "keyword","store": true}, 
                    // readCounts: integer type, stored
                 "readCounts":{"type": "integer","store": true},  
                    // content (article body): text type, neither stored nor indexed
                 "content":{"type": "text","store": false,"index": false},
                    // times: keyword type, not indexed (cannot be searched on)
                 "times": {"type": "keyword", "index": false}
             }  
         }  
    }  
}

二:创建maven工程,导入依赖



  
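<dependencies>
  <!-- Apache POI: 解析excel(xlsx) -->
  <dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml-schemas</artifactId>
    <version>3.8</version>
  </dependency>
  <dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.8</version>
  </dependency>
  <dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.8</version>
  </dependency>

  <!-- Elasticsearch transport客户端 -->
  <dependency>
    <groupId>org.elasticsearch.client</groupId>
    <artifactId>transport</artifactId>
    <version>6.7.0</version>
  </dependency>
  <dependency>
    <groupId>org.apache.logging.log4j</groupId>
    <artifactId>log4j-core</artifactId>
    <version>2.9.1</version>
  </dependency>
  <dependency>
    <groupId>com.google.code.gson</groupId>
    <artifactId>gson</artifactId>
    <version>2.8.2</version>
  </dependency>

  <!-- Hadoop / HBase -->
  <dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.7.5</version>
  </dependency>
  <dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-client</artifactId>
    <version>2.0.0</version>
  </dependency>
  <dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-server</artifactId>
    <version>2.0.0</version>
  </dependency>
</dependencies>

<build>
  <plugins>
    <plugin>
      <groupId>org.apache.maven.plugins</groupId>
      <artifactId>maven-compiler-plugin</artifactId>
      <version>3.0</version>
      <configuration>
        <source>1.8</source>
        <target>1.8</target>
        <encoding>UTF-8</encoding>
      </configuration>
    </plugin>
  </plugins>
</build>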
  
  
  
  


三:创建javabean,映射文件所有字段,并提供get/set方法(省略)

public class Article {
    private String id;
    private String title;
    private String from;
    private String times;
    private String readCounts;
    private String content;
....

}


四:创建解析excel的工具类

public class ExcelUtil {
    public static void main(String[] args) throws IOException {
        //第一步:解析数据
        List
list = getArticleList(); } public static List
getArticleList() throws IOException { //创建文件输入流 FileInputStream inputStream = new FileInputStream("C:\\Users\\93095\\Desktop\\baijia.xlsx"); //使用XSSFWorkbook获取文件解析对象来解析xlsx文件 XSSFWorkbook xssfSheets = new XSSFWorkbook(inputStream); //获取第一个sheet页.因为一个文件可能有多个sheet页面 XSSFSheet sheetAt = xssfSheets.getSheetAt(0); //获取页面最后一行数据的number号,已经自行减去头行,所以得到的就是数的具体条数 int lastRowNum = sheetAt.getLastRowNum(); //System.out.println(lastRowNum); //创建集合存放Article List
list = new ArrayList<>(); //获取每一条数据 for (int i = 1; i<= lastRowNum;i++){ //创建Article对象 Article article = new Article(); //循环遍历每行数据,行数据封装成了XSSFRow对象 XSSFRow row = sheetAt.getRow(i); //获取每一个单元格的信息,从0 开始,从左往右数 //第一行是标题 String title = row.getCell(0).toString(); //获取第二个单元格--数据来源 String from = row.getCell(1).toString(); //获取时间 String times = row.getCell(2).toString(); //获取阅读次数 String readCounts = row.getCell(3).toString(); //获取文字内容 String content = row.getCell(4).toString(); article.setId(i+""); article.setTitle(title); article.setFrom(from); article.setTimes(times); article.setReadCounts(readCounts); article.setContent(content); list.add(article); } inputStream.close(); return list; } }

五:具体实现思路和步骤

public class ES2HBase {
    public static void main(String[] args) throws IOException {
        //第一步: 解析excel
        List
list = ExcelUtil.getArticleList(); //第二步: 将excel数据保存到hbase和es中 //获取es集群客户端连接 TransportClient client = getClient(); //把数据存储到es中 //save2ES(list, client); //创建表--接收数据 Table table = getTable(); //把数据保存到hbase中 save2HBase(list, table); //第三步: 搜索es数据 //查询es中title有机器人关键字的内容 TermQueryBuilder termQueryBuilder = QueryBuilders.termQuery("title", "机器人"); SearchRequestBuilder builder = client.prepareSearch("articles").setTypes("article").setQuery(termQueryBuilder); SearchResponse searchResponse = builder.get(); //获取外围hit SearchHits hits = searchResponse.getHits(); //获取每一条数据 SearchHit[] hits1 = hits.getHits(); //遍历hit内容 for (SearchHit hit : hits1){ //获取系统id String id = hit.getId(); String source = hit.getSourceAsString(); } //第四步: 到hbase中获取详情内容 //获取系统id 就是 _id的值为123的内容--模拟用户点击词条显示内容 //es中的系统id 就是hbase中的rowkey Get get = new Get("123".getBytes()); //获取一行数据 Result result = table.get(get); //获取每个cell Cell[] cells = result.rawCells(); for (Cell cell : cells){ //获取cell的值,转换成字符串 String string = Bytes.toString(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength()); System.out.println(string); } table.close(); client.close(); } //把数据保存到hbase表中 private static void save2HBase(List
list, Table table) throws IOException { //把数据保存到hbase中 ArrayList arrayList = new ArrayList<>(); //遍历文章内容集合 for (Article article : list){ //获取文章内容的id Put put = new Put(article.getId().getBytes()); //添加列族和各个列和每个列的值 put.addColumn("f1".getBytes(),"titlt".getBytes(),article.getTitle().getBytes()); put.addColumn("f1".getBytes(),"from".getBytes(),article.getFrom().getBytes()); put.addColumn("f1".getBytes(),"times".getBytes(),article.getTimes().getBytes()); put.addColumn("f1".getBytes(),"readCounts".getBytes(),article.getReadCounts().getBytes()); put.addColumn("f1".getBytes(),"content".getBytes(),article.getContent().getBytes()); //把数据存进put中 arrayList.add(put); } //把数据保存到hbase表中 table.put(arrayList); } //创建hbase中的表 private static Table getTable() throws IOException { //获取hbase客户端连接 Configuration configuration = HBaseConfiguration.create(); configuration.set("hbase.zookeeper.quorum","node01:2181,node02:2181,node03:2181"); Connection connection = ConnectionFactory.createConnection(configuration); //将数据保存到eshbase表的f1列族中--系统原先没有此表,需要创建 Admin admin = connection.getAdmin(); //判断,如果表不存在就创建 if (!admin.tableExists(TableName.valueOf("eshbase"))){ HTableDescriptor tableDescriptor = new HTableDescriptor(TableName.valueOf("eshbase")); //指定列族 HColumnDescriptor columnDescriptor = new HColumnDescriptor("f1"); tableDescriptor.addFamily(columnDescriptor); //创建表 admin.createTable(tableDescriptor); } //获取表 Table table = connection.getTable(TableName.valueOf("eshbase")); return table; } //把数据保存到ES private static void save2ES(List
list, TransportClient client) { //把数据保存到es,批量方式 Gson gson = new Gson(); BulkRequestBuilder bulkRequestBuilder = client.prepareBulk(); for (Article article : list){ //转成json格式 String json = gson.toJson(article); IndexRequestBuilder indexRequestBuilder = client.prepareIndex("articles", "article", article.getId()).setSource(json, XContentType.JSON); bulkRequestBuilder.add(indexRequestBuilder); } //触发请求 bulkRequestBuilder.get(); } //获取ES客户端 private static TransportClient getClient() throws UnknownHostException { Settings settings = Settings.builder().put("cluster.name", "es").build(); //获取集群 TransportAddress node01 = new TransportAddress(InetAddress.getByName("node01"), 9300); TransportAddress node02 = new TransportAddress(InetAddress.getByName("node02"), 9300); TransportAddress node03 = new TransportAddress(InetAddress.getByName("node03"), 9300); //获取es连接客户端 TransportClient client = new PreBuiltTransportClient(settings).addTransportAddress(node01).addTransportAddress(node02).addTransportAddress(node03); return client; } }

你可能感兴趣的:(HBase)