在使用ES做查询的时候,为了获得更加准确的匹配查询结果,需要自定义与业务相关的分词词典。比如汽车行业的一些专业用词:奔驰AMG、宝马X5……
假如不自定义分词的话,默认“奔驰AMG”是会被切分成“奔驰”和“AMG”的,那么意味着所有和“奔驰”或“AMG”相关的数据都会被查询出来,显然所有的“奔驰”并不是我们的目标数据。
根据以上问题描述,我们需要添加自定义分词。通过查看IK文档,发现可以做成热更新:
1.数据表设计
-- Extension dictionary for the IK analyzer: one row per custom word.
CREATE TABLE `brahma_tab_ik_ext_word` (
-- Auto-increment surrogate key.
`id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'id',
-- The custom word; utf8mb4_bin makes comparison (and the unique key) byte-exact, i.e. case-sensitive.
`word` varchar(255) COLLATE utf8mb4_bin NOT NULL COMMENT '扩展词',
-- Set on insert and refreshed by MySQL on every row update. Second precision only.
`update_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '修改时间',
PRIMARY KEY (`id`),
-- A word may appear only once in the dictionary.
UNIQUE KEY `UNIQUE_WORD` (`word`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin COMMENT='ik分词扩展词典';
2.编码
jpa
import com.yx.brahma.persistence.entity.BrahmaTabIkExtWordEntity;
import org.springframework.data.domain.Page;
import org.springframework.data.domain.Pageable;
import org.springframework.data.jpa.domain.Specification;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.data.repository.PagingAndSortingRepository;
import org.springframework.stereotype.Repository;
/**
 * Spring Data JPA repository for {@link BrahmaTabIkExtWordEntity} (IK extension-dictionary words).
 *
 * <p>The type arguments are spelled out so that inherited methods such as {@code save},
 * {@code findById} and {@code findAll} are typed to the entity instead of {@code Object}.
 *
 * @author zouwei
 */
@Repository
public interface IkExtWordRepository
    extends JpaRepository<BrahmaTabIkExtWordEntity, Integer>,
        PagingAndSortingRepository<BrahmaTabIkExtWordEntity, Integer> {

  /**
   * Looks up a dictionary entry by its exact word.
   *
   * @param word the word to look for
   * @return the first matching entry, or {@code null} when there is none
   */
  BrahmaTabIkExtWordEntity findFirstByWord(String word);

  /**
   * Runs a paged query restricted by the given specification.
   *
   * @param spec the restriction to apply to the query
   * @param pageable the requested page, page size and sort order
   * @return the requested page of matching entries
   */
  Page<BrahmaTabIkExtWordEntity> findAll(
      Specification<BrahmaTabIkExtWordEntity> spec, Pageable pageable);
}
service
import com.yx.brahma.persistence.entity.BrahmaTabIkExtWordEntity;
import com.yx.brahma.persistence.repository.IkExtWordRepository;
import com.yx.brahma.service.ServiceException;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Page;
import org.springframework.data.domain.PageRequest;
import org.springframework.data.domain.Pageable;
import org.springframework.data.domain.Sort;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import javax.persistence.criteria.CriteriaBuilder;
import javax.persistence.criteria.CriteriaQuery;
import javax.persistence.criteria.Predicate;
import javax.persistence.criteria.Root;
import java.sql.Timestamp;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
/**
 * Maintains the custom IK dictionary words stored through {@link IkExtWordRepository} and
 * produces the change marker (ETag) used to decide whether the dictionary must be re-fetched.
 *
 * @author zouwei
 */
@Slf4j
@Service
public class IKAnalyzerService {

  @Autowired private IkExtWordRepository ikExtWordRepository;

  /**
   * Returns every dictionary word, most recently updated first.
   *
   * @return all stored entries ordered by {@code updateTime} descending
   */
  public List<BrahmaTabIkExtWordEntity> allWorld() {
    return ikExtWordRepository.findAll(Sort.by(Sort.Direction.DESC, "updateTime"));
  }

  /**
   * Builds a marker that changes whenever a word is added, updated or deleted.
   *
   * <p>The marker is {@code <latest updateTime in millis>:<row count>}. Adds and updates move the
   * latest update time; deletes change the row count. For an empty table the marker is the
   * constant {@code 0:0}: using the current time there would produce a new value on every call
   * and make each poll look like a change.
   *
   * @return the current dictionary marker
   */
  public String getETag() {
    // One row is enough: the page also carries the total row count.
    Page<BrahmaTabIkExtWordEntity> latestPage =
        ikExtWordRepository.findAll(
            PageRequest.of(0, 1, Sort.by(Sort.Direction.DESC, "updateTime")));
    long total = latestPage.getTotalElements();
    long updateTime = 0L;
    if (latestPage.hasContent()) {
      updateTime = latestPage.getContent().get(0).getUpdateTime().getTime();
    }
    return updateTime + ":" + total;
  }

  /**
   * Adds a new word to the dictionary.
   *
   * @param word the word to add; must not be blank
   * @return the persisted entry
   * @throws ServiceException if the word is blank or already exists
   */
  @Transactional(rollbackFor = ServiceException.class)
  public BrahmaTabIkExtWordEntity addWord(String word) throws ServiceException {
    // Same rule updateExtWord enforces: a blank line is not a dictionary entry.
    if (StringUtils.isBlank(word)) {
      throw new ServiceException("分词不能为空");
    }
    BrahmaTabIkExtWordEntity existing = ikExtWordRepository.findFirstByWord(word);
    if (Objects.nonNull(existing)) {
      throw new ServiceException("已经存在" + word);
    }
    return ikExtWordRepository.save(new BrahmaTabIkExtWordEntity(word));
  }

  /**
   * Deletes the entry with the given id.
   *
   * @param id primary key of the entry to delete
   */
  @Transactional(rollbackFor = ServiceException.class)
  public void deleteWord(Integer id) {
    ikExtWordRepository.deleteById(id);
  }

  /**
   * Finds the entry for an exact word.
   *
   * @param word the word to look for
   * @return the matching entry, or {@code null} when there is none
   */
  public BrahmaTabIkExtWordEntity findByWord(String word) {
    return ikExtWordRepository.findFirstByWord(word);
  }

  /**
   * Pages through the dictionary, optionally filtered by a substring of the word.
   *
   * @param search substring to match against the word; blank means no filter
   * @param page 1-based page number; values below 1 are treated as the first page
   * @param size page size; values below 1 fall back to 10
   * @return the requested page, most recently updated first
   */
  public Page<BrahmaTabIkExtWordEntity> findBySearchKey(String search, int page, int size) {
    int pageIndex = Math.max(page - 1, 0);
    int pageSize = size <= 0 ? 10 : size;
    Pageable pageable =
        PageRequest.of(pageIndex, pageSize, Sort.by(Sort.Direction.DESC, "updateTime"));
    return ikExtWordRepository.findAll(
        (root, query, criteriaBuilder) -> {
          if (StringUtils.isNotBlank(search)) {
            // Return the predicate and let Spring Data apply it to the content and count queries.
            return criteriaBuilder.like(root.get("word"), "%" + search + "%");
          }
          // A null predicate means "no restriction".
          return null;
        },
        pageable);
  }

  /**
   * Changes the word of an existing entry and refreshes its update time.
   *
   * @param wordEntity carries the id of the entry to change and the new word
   * @return the updated entry
   * @throws ServiceException if the id is missing, the new word is blank, no entry has that id, or
   *     another entry already uses the new word
   */
  @Transactional(rollbackFor = ServiceException.class)
  public BrahmaTabIkExtWordEntity updateExtWord(BrahmaTabIkExtWordEntity wordEntity)
      throws ServiceException {
    Integer id = wordEntity.getId();
    String word = wordEntity.getWord();
    if (Objects.isNull(id)) {
      throw new ServiceException("id不能为空");
    }
    if (StringUtils.isBlank(word)) {
      throw new ServiceException("分词不能修改为空");
    }
    Optional<BrahmaTabIkExtWordEntity> result = ikExtWordRepository.findById(id);
    if (!result.isPresent()) {
      throw new ServiceException("不存在这个id:" + id);
    }
    // Reject a rename that would collide with another row, instead of letting the unique key
    // surface as a persistence exception.
    BrahmaTabIkExtWordEntity sameWord = ikExtWordRepository.findFirstByWord(word);
    if (Objects.nonNull(sameWord) && !id.equals(sameWord.getId())) {
      throw new ServiceException("已经存在" + word);
    }
    BrahmaTabIkExtWordEntity data = result.get();
    // Set explicitly so the change marker moves even when the word text is unchanged.
    data.setUpdateTime(new Timestamp(System.currentTimeMillis()));
    data.setWord(word);
    return ikExtWordRepository.save(data);
  }
}
controller
import com.yx.brahma.persistence.entity.BrahmaTabIkExtWordEntity;
import com.yx.brahma.service.ServiceException;
import com.yx.brahma.service.elasticsearch.ik.IKAnalyzerService;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.util.CollectionUtils;
import org.springframework.web.bind.annotation.*;
import pers.roamer.boracay.helper.HttpResponseHelper;
import javax.servlet.http.HttpServletResponse;
import java.sql.Timestamp;
import java.text.DateFormat;
import java.util.Date;
import java.util.List;
import java.util.StringJoiner;
/**
 * HTTP endpoints for the IK remote dictionary ({@code /ik/search}) and for maintaining the
 * dictionary words ({@code /extDict/*}).
 *
 * @author zouwei
 */
@Slf4j
@RestController
public class IKAnalyzerController {

  @Autowired private IKAnalyzerService ikAnalyzerService;

  /**
   * Returns the whole dictionary as plain text, one word per line.
   *
   * @return all words joined with line feeds, or an empty string when there are none
   */
  @GetMapping(
      value = "/ik/search",
      produces = {"text/plain;charset=utf-8"})
  public String searchByIKPlugin() {
    log.info("获取最新分词词典");
    List<BrahmaTabIkExtWordEntity> words = ikAnalyzerService.allWorld();
    if (CollectionUtils.isEmpty(words)) {
      return StringUtils.EMPTY;
    }
    StringJoiner dictionary = new StringJoiner(StringUtils.LF);
    for (BrahmaTabIkExtWordEntity entry : words) {
      dictionary.add(entry.getWord());
    }
    return dictionary.toString();
  }

  /**
   * Answers the HEAD probe with the current dictionary marker in the {@code ETag} header, so the
   * caller can tell whether the dictionary changed without downloading it.
   *
   * @param response the response that receives the {@code ETag} header
   */
  @RequestMapping(value = "/ik/search", method = RequestMethod.HEAD)
  public void headAllHotWord(HttpServletResponse response) {
    log.info("检测是否需要更新分词词典");
    String eTag = ikAnalyzerService.getETag();
    response.setHeader("ETag", eTag);
  }

  /**
   * Adds a word to the dictionary.
   *
   * @param word the word to add
   * @return the success envelope containing the stored entry
   * @throws ControllerException if the service rejects the word
   */
  @PostMapping("/extDict/add")
  public String addWord(@RequestParam("word") String word) throws ControllerException {
    try {
      return HttpResponseHelper.successInfoInbox(ikAnalyzerService.addWord(word));
    } catch (ServiceException e) {
      // Only the message is forwarded, so log the exception to keep its stack trace.
      log.warn("添加分词失败: {}", word, e);
      throw new ControllerException(e.getMessage());
    }
  }

  /**
   * Deletes the entry with the given id.
   *
   * @param id primary key of the entry to delete
   */
  @DeleteMapping("/extDict/delete/{id}")
  public void deleteWord(@PathVariable("id") Integer id) {
    ikAnalyzerService.deleteWord(id);
  }

  /**
   * Pages through the dictionary, optionally filtered by a substring.
   *
   * <p>{@code page} and {@code size} are optional: a request without them gets the first page of
   * 10 entries instead of failing to bind the primitive parameters.
   *
   * @param search optional substring to match against the word
   * @param page 1-based page number, default 1
   * @param size page size, default 10
   * @return the success envelope containing the requested page
   */
  @GetMapping("/extDict/search")
  public String findBySearch(
      @RequestParam(value = "search", required = false) String search,
      @RequestParam(value = "page", defaultValue = "1") int page,
      @RequestParam(value = "size", defaultValue = "10") int size) {
    return HttpResponseHelper.successInfoInbox(
        ikAnalyzerService.findBySearchKey(search, page, size));
  }

  /**
   * Changes the word of an existing entry.
   *
   * @param wordEntity request body carrying the entry id and the new word
   * @return the success envelope containing the updated entry
   * @throws ControllerException if the service rejects the update
   */
  @PutMapping("/extDict/update")
  public String updateWord(@RequestBody BrahmaTabIkExtWordEntity wordEntity)
      throws ControllerException {
    try {
      return HttpResponseHelper.successInfoInbox(ikAnalyzerService.updateExtWord(wordEntity));
    } catch (ServiceException e) {
      // Only the message is forwarded, so log the exception to keep its stack trace.
      log.warn("更新分词失败", e);
      throw new ControllerException(e.getMessage());
    }
  }
}
使用ngrok内网穿透进行测试,修改一下IK的IKAnalyzer.cfg.xml配置文件:
<entry key="remote_ext_dict">http://d75c5cc4.ngrok.io/ik/search</entry>
注意要把该配置项外层的注释标记(<!-- -->)删掉,否则配置不会生效。