elasticsearch7.2.0
同义词插件:elasticsearch-analysis-dynamic-synonym
elasticsearch-analysis-dynamic-synonym插件官网
https://github.com/bells/elasticsearch-analysis-dynamic-synonym
后面内容为同义词更新词库方法,elasticsearch安装同义词插件不再赘述;
由于服务器jdk版本与es使用版本不一致,以下为es单独指定jdk版本的;
在es内部添加同义词文件,实现同义词查询,es已内置该功能;
官方说明文档地址如下:
https://www.elastic.co/guide/en/elasticsearch/reference/7.2/analysis-synonym-tokenfilter.html
简单使用方法:
在elasticsearch安装目录下的config目录下(elasticsearch-7.2.0/config)新建synonyms.txt文本;
并在文本内添加同义词如(英文逗号分隔):
美元,美金,美币
苹果,iphone
PUT syno_v1
{
"settings": {
"index":{
"number_of_shards": "3",
"number_of_replicas": "1",
"max_result_window": "200000",
"analysis": {
"filter":{
"my_syno_filter":{
"type":"synonym",
"synonyms_path":"synonyms.txt"
}
},
"analyzer": {
"ik_max_syno": {
"type":"custom",
"tokenizer": "ik_max_word",
"filter": [
"lowercase",
"my_syno_filter"
]
}
}
}
}
},
"mappings": {
"properties": {
"keyword": {
"type": "text",
"analyzer": "ik_max_syno"
}
}
}
}
dsl:
GET syno_v1/_analyze
{
"analyzer": "ik_max_syno",
"text": "苹果"
}
结果:
{
"tokens" : [
{
"token" : "苹果",
"start_offset" : 0,
"end_offset" : 2,
"type" : "CN_WORD",
"position" : 0
},
{
"token" : "iphone",
"start_offset" : 0,
"end_offset" : 2,
"type" : "SYNONYM",
"position" : 0
}
]
}
Example:
{
"index" : {
"analysis" : {
"analyzer" : {
"synonym" : {
"tokenizer" : "whitespace",
"filter" : ["remote_synonym"]
}
},
"filter" : {
"remote_synonym" : {
"type" : "dynamic_synonym",
"synonyms_path" : "http://host:port/synonym.txt",
"interval": 30
},
"local_synonym" : {
"type" : "dynamic_synonym",
"synonyms_path" : "synonym.txt"
}
}
}
}
}
Configuration:
synonyms_path
: A file path relative to the Elastic config file or a URL, mandatory
相对于Elastic配置文件或URL的文件路径(必填)
interval
: Refresh interval in seconds for the synonym file, default: 60, optional
同义词文件的刷新间隔(以秒为单位),默认值:60,可选
ignore_case
: Ignore case in synonyms file, default: false, optional
忽略同义词文件中的大小写,默认值:false,可选
expand
: Expand, default: true, optional
lenient
: Lenient on exception thrown when importing a synonym, default: false, optional
format
: Synonym file format, default: '', optional. For WordNet structure this can be set to 'wordnet'
@RestController
@RequestMapping("/synonym")
@Slf4j
public class SynonymController {
private String lastModified = new Date().toString();
private String etag = String.valueOf(System.currentTimeMillis());
@RequestMapping(value = "/word", method = {
RequestMethod.GET,RequestMethod.HEAD}, produces="text/html;charset=UTF-8")
public String getSynonymWord(HttpServletResponse response){
response.setHeader("Last-Modified",lastModified);
response.setHeader("ETag",etag);
//response.setHeader("If-Modified-Since",lastModified);
Connection conn = null;
Statement stmt = null;
ResultSet rs = null;
StringBuilder words = new StringBuilder();
try {
Class.forName("oracle.jdbc.driver.OracleDriver");
conn = DriverManager.getConnection(
"jdbc:oracle:thin:@192.168.114.13:1521:xe",
"test",
"test"
);
stmt = conn.createStatement();
rs = stmt.executeQuery("select word from SYNONYM_WORD where status=0");
while(rs.next()) {
String theWord = rs.getString("word");
System.out.println("hot word from mysql: " + theWord);
words.append(theWord);
words.append("\n");
}
return words.toString();
} catch (Exception e) {
e.printStackTrace();
} finally {
if(rs != null) {
try {
rs.close();
} catch (SQLException e) {
log.error("资源关闭异常:",e);
}
}
if(stmt != null) {
try {
stmt.close();
} catch (SQLException e) {
log.error("资源关闭异常:",e);
}
}
if(conn != null) {
try {
conn.close();
} catch (SQLException e) {
log.error("资源关闭异常:",e);
}
}
}
return null;
}
@RequestMapping(value = "/update", method = RequestMethod.GET)
public void updateModified(){
lastModified = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss").format(new Date());
etag = String.valueOf(System.currentTimeMillis());
}
}
注:
PUT syno_v2
{
"settings": {
"index":{
"number_of_shards": "3",
"number_of_replicas": "1",
"max_result_window": "200000",
"analysis": {
"filter":{
"remote_syno_filter":{
"type":"dynamic_synonym",
"synonyms_path":"http://192.168.xx.xx:8080/synonym/word"
}
},
"analyzer": {
"ik_max_syno": {
"type":"custom",
"tokenizer": "ik_max_word",
"filter": [
"lowercase",
"remote_syno_filter"
]
}
}
}
}
},
"mappings": {
"properties": {
"keyword": {
"type": "text",
"analyzer": "ik_max_syno"
}
}
}
}
https://github.com/bells/elasticsearch-analysis-dynamic-synonym
在项目根目录下创建config目录并创建config\jdbc-reload.properties配置文件:
jdbc.url=jdbc:oracle:thin:@192.168.xx.xx:1521:xe
jdbc.user=test
jdbc.password=test
jdbc.reload.synonym.sql=SELECT word FROM TEST.SYNONYM_WORD WHERE STATUS = 0
jdbc.lastModified.synonym.sql=SELECT MAX(UPDATE_TIME) AS last_modify_dt FROM TEST.SYNONYM_WORD
jdbc.driver=oracle.jdbc.driver.OracleDriver
在目录analysis下创建DBRemoteSynonymFile类,
具体位置:
src\main\java\com\bellszhu\elasticsearch\plugin\synonym\analysis\DBRemoteSynonymFile.java
内容:
package com.bellszhu.elasticsearch.plugin.synonym.analysis;
import com.bellszhu.elasticsearch.plugin.DynamicSynonymPlugin;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.elasticsearch.common.io.PathUtils;
import org.elasticsearch.env.Environment;
import java.io.*;
import java.nio.file.Path;
import java.sql.*;
import java.util.ArrayList;
import java.util.Properties;
/**
 * {@link SynonymFile} implementation that reads synonym rules from a database over JDBC.
 *
 * <p>Connection settings and the two SQL statements come from {@code jdbc-reload.properties},
 * looked up in a {@code config} directory next to the plugin jar. One query returns the rules
 * (column {@code word}); the other returns the latest modification time (column
 * {@code last_modify_dt}), which decides whether a reload is needed.
 *
 * author: Yic.z
 * date: 2020-08-04
 */
public class DBRemoteSynonymFile implements SynonymFile {
    /** Name of the properties file holding the JDBC settings. */
    private static final String DB_PROPERTIES = "jdbc-reload.properties";
    private static final Logger logger = LogManager.getLogger("dynamic-synonym");

    private final String format;
    private final boolean expand;
    private final boolean lenient;
    private final Analyzer analyzer;
    private final Environment env;
    /** The configured synonyms_path value; only used in log messages here. */
    private final String location;
    /** JDBC settings loaded from {@link #DB_PROPERTIES}; empty if the file could not be read. */
    private final Properties props;
    /** Absolute path of the properties file. */
    private final Path configPath;
    /** Modification time (epoch millis) seen by the last successful check. */
    private long lastModified;

    /**
     * Loads the JDBC settings and records the current modification time of the synonym table,
     * so that later checks only report changes made after construction.
     *
     * @param env      Elasticsearch environment (stored, not otherwise used in this class)
     * @param analyzer analyzer handed to the synonym parser
     * @param expand   parser option, passed through
     * @param lenient  parser option, passed through
     * @param format   synonym format, passed through to the parser
     * @param location configured synonyms_path value
     */
    DBRemoteSynonymFile(Environment env, Analyzer analyzer,
                        boolean expand, boolean lenient, String format, String location) {
        this.analyzer = analyzer;
        this.expand = expand;
        this.lenient = lenient;
        this.format = format;
        this.env = env;
        this.location = location;
        this.props = new Properties();
        // Resolve <directory containing the plugin jar>/config/jdbc-reload.properties.
        Path configDir = PathUtils.get(new File(DynamicSynonymPlugin.class.getProtectionDomain().getCodeSource()
                .getLocation().getPath())
                .getParent(), "config")
                .toAbsolutePath();
        this.configPath = configDir.resolve(DB_PROPERTIES);
        // try-with-resources so the stream is closed; the original leaked it.
        try (InputStream input = new FileInputStream(configPath.toFile())) {
            props.load(input);
        } catch (FileNotFoundException e) {
            logger.info("jdbc-reload.properties not find. " + e);
        } catch (IOException e) {
            logger.error("fail to load the jdbc-reload.properties,", e);
        }
        isNeedReloadSynonymMap();
    }

    /**
     * Reads all rules from the database and builds a {@link SynonymMap} from them.
     *
     * @return the freshly built synonym map
     * @throws IllegalArgumentException if reading or parsing the rules fails
     */
    @Override
    public SynonymMap reloadSynonymMap() {
        try (Reader rulesReader = getReader()) {
            logger.info("start reload local synonym from {}.", location);
            SynonymMap.Builder parser = RemoteSynonymFile.getSynonymParser(rulesReader, format, expand, lenient, analyzer);
            return parser.build();
        } catch (Exception e) {
            // The throwable must be the last argument; otherwise it is consumed by the placeholder.
            logger.error("reload local synonym {} error!", location, e);
            throw new IllegalArgumentException(
                    "could not reload local synonyms file to build synonyms", e);
        }
    }

    /**
     * Checks whether the synonym table changed since the last check and, if so, remembers
     * the new modification time.
     *
     * @return {@code true} if a newer modification time was found; {@code false} otherwise,
     *         including when the time could not be determined
     */
    @Override
    public boolean isNeedReloadSynonymMap() {
        try {
            Long lastModify = getLastModify();
            // null means "unknown" (query failed or returned no time); do not unbox it.
            if (lastModify != null && lastModified < lastModify) {
                lastModified = lastModify;
                return true;
            }
        } catch (Exception e) {
            logger.error(e);
        }
        return false;
    }

    /**
     * Queries the last modification time of the synonym table using the statement configured
     * under {@code jdbc.lastModified.synonym.sql}.
     *
     * @return the time in epoch milliseconds, or {@code null} if the query failed or the
     *         result contained no non-null {@code last_modify_dt} value
     */
    public Long getLastModify() {
        Long lastModifyMillis = null;
        // A fresh connection per call: a cached one is never re-opened once the database drops it.
        try (Connection conn = openConnection();
             Statement stmt = conn.createStatement();
             ResultSet rs = stmt.executeQuery(props.getProperty("jdbc.lastModified.synonym.sql"))) {
            while (rs.next()) {
                Timestamp lastModifyDt = rs.getTimestamp("last_modify_dt");
                if (lastModifyDt != null) {
                    lastModifyMillis = lastModifyDt.getTime();
                }
            }
        } catch (ClassNotFoundException | SQLException e) {
            logger.error("获取同义词库最后一次修改的时间", e);
        }
        return lastModifyMillis;
    }

    /**
     * Reads the synonym rules using the statement configured under
     * {@code jdbc.reload.synonym.sql}.
     *
     * @return the non-null values of column {@code word} in result order; on failure, whatever
     *         was read before the error (possibly empty)
     */
    public ArrayList<String> getDBData() {
        ArrayList<String> rules = new ArrayList<>();
        try (Connection conn = openConnection();
             Statement stmt = conn.createStatement();
             ResultSet rs = stmt.executeQuery(props.getProperty("jdbc.reload.synonym.sql"))) {
            while (rs.next()) {
                String theWord = rs.getString("word");
                if (theWord != null) {
                    rules.add(theWord);
                }
            }
        } catch (ClassNotFoundException | SQLException e) {
            logger.error("查询数据库中的同义词异常", e);
        }
        return rules;
    }

    /**
     * Provides the synonym rules as text, one rule per line.
     *
     * @return a reader over the rules; empty if none could be loaded
     */
    @Override
    public Reader getReader() {
        StringBuilder sb = new StringBuilder();
        try {
            for (String rule : getDBData()) {
                logger.info("load the synonym from db," + rule);
                sb.append(rule)
                        .append(System.getProperty("line.separator"));
            }
        } catch (Exception e) {
            logger.error("reload synonym from db failed", e);
        }
        return new StringReader(sb.toString());
    }

    /** Loads the configured JDBC driver and opens a new connection; the caller must close it. */
    private Connection openConnection() throws ClassNotFoundException, SQLException {
        String driver = props.getProperty("jdbc.driver");
        if (driver == null) {
            // Happens when the properties file is missing or incomplete.
            throw new SQLException("jdbc.driver is not configured in " + DB_PROPERTIES);
        }
        Class.forName(driver);
        return DriverManager.getConnection(
                props.getProperty("jdbc.url"),
                props.getProperty("jdbc.user"),
                props.getProperty("jdbc.password")
        );
    }
}
修改DynamicSynonymTokenFilterFactory类中的getSynonymFile方法:
添加选择远程直连数据库方法
/**
 * Creates the synonym source that matches the configured location and makes sure the
 * periodic reload monitor is scheduled.
 *
 * <p>The location {@code "fromDB"} selects the database-backed source, an http(s) URL the
 * remote source, and anything else the local file source.
 *
 * @param analyzer analyzer handed to the created synonym source
 * @return the synonym source for the configured location
 * @throws IllegalArgumentException if the source cannot be created or the monitor cannot be scheduled
 */
SynonymFile getSynonymFile(Analyzer analyzer) {
    try {
        final SynonymFile source;
        if (location.equals("fromDB")) {
            source = new DBRemoteSynonymFile(environment, analyzer, expand, lenient, format, location);
        } else {
            boolean overHttp = location.startsWith("http://") || location.startsWith("https://");
            source = overHttp
                    ? new RemoteSynonymFile(environment, analyzer, expand, lenient, format, location)
                    : new LocalSynonymFile(environment, analyzer, expand, lenient, format, location);
        }

        // Schedule the monitor only once; interval is both the initial delay and the period.
        if (scheduledFuture == null) {
            Monitor monitor = new Monitor(source);
            scheduledFuture = pool.scheduleAtFixedRate(monitor, interval, interval, TimeUnit.SECONDS);
        }
        return source;
    } catch (Exception e) {
        throw new IllegalArgumentException("failed to get synonyms : " + location, e);
    }
}
在pom.xml文件中添加数据库依赖
<dependency>
<groupId>com.oracle.ojdbc</groupId>
<artifactId>ojdbc8</artifactId>
<version>19.3.0.0</version>
</dependency>
根据es版本修改ik对应版本
7.2.0
在src\main\assemblies\plugin.xml中添加配置使得数据库相关依赖一并打包
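在<dependencySets>中添加:
<dependencySet>
<outputDirectory/>
<useProjectArtifact>true</useProjectArtifact>
<useTransitiveFiltering>true</useTransitiveFiltering>
<includes>
<include>com.oracle.ojdbc:ojdbc8</include>
</includes>
</dependencySet>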
空白处添加打包配置文件配置
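<fileSets>
<fileSet>
<directory>${project.basedir}/config</directory>
<outputDirectory>config</outputDirectory>
</fileSet>
</fileSets>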
修改后的同义词插件使用例子:
PUT syno_v2
{
"settings": {
"index":{
"number_of_shards": "3",
"number_of_replicas": "1",
"max_result_window": "200000",
"analysis": {
"filter":{
"remote_syno_filter":{
"type":"dynamic_synonym",
"synonyms_path":"fromDB",
"interval": 120
}
},
"analyzer": {
"ik_max_syno": {
"type":"custom",
"tokenizer": "ik_max_word",
"filter": [
"lowercase",
"remote_syno_filter"
]
}
}
}
}
},
"mappings": {
"properties": {
"keyword": {
"type": "text",
"analyzer": "ik_max_syno"
}
}
}
}
如更新ik插件以后,出现报错如下:
java.security.AccessControlException: access denied ("java.net.SocketPermission" "172.16.xxx.xxx:3306" "connect,resolve")
这是jar的安全策略的错误(具体没有深究),解决方案如下:
1、在ik源码的config中创建文件socketPolicy.policy
grant {
permission java.net.SocketPermission "business.mysql.youboy.com:3306","connect,resolve";
};
2、在服务器上的es中的config目录文件jvm.options添加如下代码配置上面的文件路径
-Djava.security.policy=/data/elasticsearch-6.5.3/plugins/ik/config/socketPolicy.policy
https://blog.csdn.net/weixin_43315211/article/details/100144968
ps:ik分词器实现词库热更新文章链接