IK分词热更新

在使用ES做查询的时候,为了获取更加准确的匹配查询结果,需要自定义与业务相关的分词词典。比如汽车行业的一些专业用词:奔驰AMG、宝马X5……

假如不自定义分词的话,默认“奔驰AMG”会被切分成“奔驰”和“AMG”,这意味着所有和“奔驰”或“AMG”相关的数据都会被查询出来,而显然并非所有包含“奔驰”的数据都是我们的目标数据。

根据以上问题描述,我们需要添加自定义分词。通过查看IK文档,发现可以做成热更新:

IK分词热更新_第1张图片
image-20191030150618555.png

1.数据表设计

-- Extension-word dictionary for the IK analyzer: one row per custom word.
CREATE TABLE `brahma_tab_ik_ext_word` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'id',
  -- utf8mb4_bin is a binary collation, so comparisons on `word` (including the unique key) are case-sensitive.
  `word` varchar(255) COLLATE utf8mb4_bin NOT NULL COMMENT '扩展词',
  -- Defaults to the insert time and is refreshed by MySQL whenever the row is updated.
  `update_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '修改时间',
  PRIMARY KEY (`id`),
  -- Each word may be stored at most once.
  -- NOTE(review): a 255-char utf8mb4 index can exceed the index-prefix limit on older InnoDB row formats; confirm the target MySQL version.
  UNIQUE KEY `UNIQUE_WORD` (`word`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin COMMENT='ik分词扩展词典';

2.编码

jpa

import com.yx.brahma.persistence.entity.BrahmaTabIkExtWordEntity;
import org.springframework.data.domain.Page;
import org.springframework.data.domain.Pageable;
import org.springframework.data.jpa.domain.Specification;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.data.repository.PagingAndSortingRepository;
import org.springframework.stereotype.Repository;

/**
 * Spring Data repository for the IK extension-word dictionary table.
 *
 * <p>The type arguments bind the repository to {@link BrahmaTabIkExtWordEntity} with an
 * {@code Integer} primary key; without them every inherited method degrades to raw types and
 * callers cannot read entities from the returned {@code List}/{@code Page}/{@code Optional}
 * without casts.
 *
 * @author zouwei
 */
@Repository
public interface IkExtWordRepository
        extends JpaRepository<BrahmaTabIkExtWordEntity, Integer>,
                PagingAndSortingRepository<BrahmaTabIkExtWordEntity, Integer> {

    /**
     * Looks up a dictionary record by its exact word.
     *
     * @param word the word to search for
     * @return the first matching record; callers treat {@code null} as "no such word"
     */
    BrahmaTabIkExtWordEntity findFirstByWord(String word);

    /**
     * Runs a paged query restricted by the given specification.
     *
     * @param spec the criteria restriction to apply
     * @param pageable paging and sorting information
     * @return the requested page of matching records
     */
    Page<BrahmaTabIkExtWordEntity> findAll(
            Specification<BrahmaTabIkExtWordEntity> spec, Pageable pageable);
}

service

import com.yx.brahma.persistence.entity.BrahmaTabIkExtWordEntity;
import com.yx.brahma.persistence.repository.IkExtWordRepository;
import com.yx.brahma.service.ServiceException;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Page;
import org.springframework.data.domain.PageRequest;
import org.springframework.data.domain.Pageable;
import org.springframework.data.domain.Sort;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;

import javax.persistence.criteria.CriteriaBuilder;
import javax.persistence.criteria.CriteriaQuery;
import javax.persistence.criteria.Predicate;
import javax.persistence.criteria.Root;
import java.sql.Timestamp;
import java.util.List;
import java.util.Objects;
import java.util.Optional;

/** @author zouwei */
@Slf4j
@Service
public class IKAnalyzerService {
    @Autowired private IkExtWordRepository ikExtWordRepository;

    /**
     * 获取全部分词
     *
     * @return
     */
    public List allWorld() {
        List list =
                ikExtWordRepository.findAll(Sort.by(Sort.Direction.DESC, "updateTime"));
        return list;
    }

    /**
     * 获取分词更新状态(需要考虑更新,添加和删除情况)
     *
     * @return
     */
    public String getETag() {
        Page ikExtWordPage =
                ikExtWordRepository.findAll(
                        PageRequest.of(0, 1, Sort.by(Sort.Direction.DESC, "updateTime")));
        long total = ikExtWordPage.getTotalElements();
        long updateTime = System.currentTimeMillis();
        if (total > 0) {
            BrahmaTabIkExtWordEntity ikExtWord = ikExtWordPage.getContent().get(0);
            updateTime = ikExtWord.getUpdateTime().getTime();
        }
        return updateTime + ":" + total;
    }

    /**
     * 添加分词
     *
     * @param word
     * @return
     * @throws ServiceException
     */
    @Transactional(rollbackFor = ServiceException.class)
    public BrahmaTabIkExtWordEntity addWord(String word) throws ServiceException {
        BrahmaTabIkExtWordEntity ikExtWordEntity = ikExtWordRepository.findFirstByWord(word);
        if (Objects.nonNull(ikExtWordEntity)) {
            throw new ServiceException("已经存在" + word);
        }
        return ikExtWordRepository.save(new BrahmaTabIkExtWordEntity(word));
    }

    /**
     * 删除指定分词
     *
     * @param id
     */
    @Transactional(rollbackFor = ServiceException.class)
    public void deleteWord(Integer id) {
        ikExtWordRepository.deleteById(id);
    }

    /**
     * 查询指定分词
     *
     * @param word
     * @return
     */
    public BrahmaTabIkExtWordEntity findByWord(String word) {
        return ikExtWordRepository.findFirstByWord(word);
    }

    /**
     * 条件查询
     *
     * @param search
     * @param page
     * @param size
     * @return
     */
    public Page findBySearchKey(String search, int page, int size) {
        page = page - 1;
        if (page < 0) {
            page = 0;
        }
        if (size <= 0) {
            size = 10;
        }
        Pageable pageable = PageRequest.of(page, size, Sort.by(Sort.Direction.DESC, "updateTime"));
        return ikExtWordRepository.findAll(
                (Root root,
                        CriteriaQuery query,
                        CriteriaBuilder criteriaBuilder) -> {
                    if (StringUtils.isNoneBlank(search)) {
                        String searchStr = "%" + search + "%";
                        Predicate result = criteriaBuilder.like(root.get("word"), searchStr);
                        query.where(result);
                    }
                    return null;
                },
                pageable);
    }

    /**
     * 更新指定分词
     *
     * @param wordEntity
     * @return
     * @throws ServiceException
     */
    @Transactional(rollbackFor = ServiceException.class)
    public BrahmaTabIkExtWordEntity updateExtWord(BrahmaTabIkExtWordEntity wordEntity)
            throws ServiceException {
        Integer id = wordEntity.getId();
        String word = wordEntity.getWord();
        if (Objects.isNull(id)) {
            throw new ServiceException("id不能为空");
        }
        if (StringUtils.isBlank(word)) {
            throw new ServiceException("分词不能修改为空");
        }
        Optional result = ikExtWordRepository.findById(id);
        if (result.isPresent()) {
            BrahmaTabIkExtWordEntity data = result.get();
            data.setUpdateTime(new Timestamp(System.currentTimeMillis()));
            data.setWord(word);
            return ikExtWordRepository.save(data);
        }
        throw new ServiceException("不存在这个id:" + id);
    }
}

controller

import com.yx.brahma.persistence.entity.BrahmaTabIkExtWordEntity;
import com.yx.brahma.service.ServiceException;
import com.yx.brahma.service.elasticsearch.ik.IKAnalyzerService;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.util.CollectionUtils;
import org.springframework.web.bind.annotation.*;
import pers.roamer.boracay.helper.HttpResponseHelper;

import javax.servlet.http.HttpServletResponse;
import java.sql.Timestamp;
import java.text.DateFormat;
import java.util.Date;
import java.util.List;
import java.util.StringJoiner;

/**
 * HTTP endpoints for the IK extension-word dictionary: the remote-dictionary endpoints under
 * {@code /ik/search} and the management endpoints under {@code /extDict}.
 *
 * @author zouwei
 */
@Slf4j
@RestController
public class IKAnalyzerController {

    @Autowired private IKAnalyzerService ikAnalyzerService;

    /**
     * Returns the whole dictionary as UTF-8 plain text, one word per line.
     *
     * @return the words joined by line feeds, or an empty string when there are none
     */
    @GetMapping(
            value = "/ik/search",
            produces = {"text/plain;charset=utf-8"})
    public String searchByIKPlugin() {
        log.info("获取最新分词词典");
        List<BrahmaTabIkExtWordEntity> words = ikAnalyzerService.allWorld();
        if (CollectionUtils.isEmpty(words)) {
            return StringUtils.EMPTY;
        }
        StringJoiner joiner = new StringJoiner(StringUtils.LF);
        words.forEach(entity -> joiner.add(entity.getWord()));
        return joiner.toString();
    }

    /**
     * Answers the change check for the dictionary by publishing the current ETag as a
     * response header; no body is written.
     *
     * @param response the servlet response that receives the {@code ETag} header
     */
    @RequestMapping(value = "/ik/search", method = RequestMethod.HEAD)
    public void headAllHotWord(HttpServletResponse response) {
        log.info("检测是否需要更新分词词典");
        String eTag = ikAnalyzerService.getETag();
        response.setHeader("ETag", eTag);
    }

    /**
     * Adds a word to the dictionary.
     *
     * @param word the word to add
     * @return the success payload wrapping the created record
     * @throws ControllerException if the service rejects the word
     */
    @PostMapping("/extDict/add")
    public String addWord(@RequestParam("word") String word) throws ControllerException {
        try {
            return HttpResponseHelper.successInfoInbox(ikAnalyzerService.addWord(word));
        } catch (ServiceException e) {
            // Only the message is forwarded below, so log the original exception here to
            // keep its stack trace available.
            log.warn("添加分词失败: {}", word, e);
            throw new ControllerException(e.getMessage());
        }
    }

    /**
     * Deletes the record with the given id.
     *
     * @param id primary key of the record to delete
     */
    @DeleteMapping("/extDict/delete/{id}")
    public void deleteWord(@PathVariable("id") Integer id) {
        ikAnalyzerService.deleteWord(id);
    }

    /**
     * Pages through the dictionary with an optional substring filter.
     *
     * <p>All three parameters are optional. {@code page} and {@code size} are primitives, so
     * without defaults a request that omits them would fail during argument binding before
     * reaching the service.
     *
     * @param search optional substring filter
     * @param page 1-based page number, defaults to 1
     * @param size page size, defaults to 10
     * @return the success payload wrapping the requested page
     */
    @GetMapping("/extDict/search")
    public String findBySearch(
            @RequestParam(value = "search", required = false) String search,
            @RequestParam(value = "page", defaultValue = "1") int page,
            @RequestParam(value = "size", defaultValue = "10") int size) {
        return HttpResponseHelper.successInfoInbox(
                ikAnalyzerService.findBySearchKey(search, page, size));
    }

    /**
     * Updates the word of an existing record.
     *
     * @param wordEntity request body carrying the record id and the new word
     * @return the success payload wrapping the updated record
     * @throws ControllerException if the service rejects the update
     */
    @PutMapping("/extDict/update")
    public String updateWord(@RequestBody BrahmaTabIkExtWordEntity wordEntity)
            throws ControllerException {
        try {
            return HttpResponseHelper.successInfoInbox(ikAnalyzerService.updateExtWord(wordEntity));
        } catch (ServiceException e) {
            // Only the message is forwarded below, so log the original exception here to
            // keep its stack trace available.
            log.warn("更新分词失败", e);
            throw new ControllerException(e.getMessage());
        }
    }
}

使用ngrok做内网穿透进行测试,修改一下IK的IKAnalyzer.cfg.xml配置文件:

<entry key="remote_ext_dict">http://d75c5cc4.ngrok.io/ik/search</entry>

注意:远程扩展词典的配置项在默认的IKAnalyzer.cfg.xml中是被注释掉的,需要把外层的注释标记删掉,配置才会生效。

你可能感兴趣的:(IK分词热更新)