修改IK分词器源码来基于mysql热更新词库

热更新
每次都是在es的扩展词典中,手动添加新词语,很坑
(1)每次添加完,都要重启es才能生效,非常麻烦
(2)es是分布式的,可能有数百个节点,你不能每次都一个一个节点上面去修改

热更新的目标:es不停机,我们直接在外部某个地方添加新的词语,es就能立即热加载到这些新词语

热更新方案:
(1)修改IK分词器源码,然后手动支持从mysql中每隔一定时间,自动加载新的词库
(2)基于IK分词器原生支持的热更新方案,部署一个web服务器,提供一个http接口,通过Last-Modified和ETag两个http响应头,来提供词语的热更新

这里采用第一种方案;第二种方案,IK的GitHub社区官方都不建议采用,认为不太稳定

1、下载源码
https://github.com/medcl/elasticsearch-analysis-ik/tree/6.x
ik分词器,是个标准的java maven工程

2、修改源码
我们需要创建一个我们自定义的线程,并且启动它

/**
     * 词典初始化 由于IK Analyzer的词典采用Dictionary类的静态方法进行词典初始化
     * 只有当Dictionary类被实际调用时,才会开始载入词典, 这将延长首次分词操作的时间 该方法提供了一个在应用加载阶段就初始化字典的手段
     * 
     * @return Dictionary
     */
    public static synchronized void initial(Configuration cfg) {
        if (singleton == null) {
            synchronized (Dictionary.class) {
                if (singleton == null) {

                    singleton = new Dictionary(cfg);
                    singleton.loadMainDict();
                    singleton.loadSurnameDict();
                    singleton.loadQuantifierDict();
                    singleton.loadSuffixDict();
                    singleton.loadPrepDict();
                    singleton.loadStopWordDict();

                    // 自己加入的代码
                    new Thread(new HotDictReloadThread()).start();

                    if(cfg.isEnableRemoteDict()){
                        // 建立监控线程
                        for (String location : singleton.getRemoteExtDictionarys()) {
                            // 10 秒是初始延迟可以修改的 60是间隔时间 单位秒
                            pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
                        }
                        for (String location : singleton.getRemoteExtStopWordDictionarys()) {
                            pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
                        }
                    }

                }
            }
        }
    }
package org.wltea.analyzer.dic;

import org.wltea.analyzer.help.ESPluginLoggerFactory;

import java.util.logging.Logger;

public class HotDictReloadThread implements Runnable {

    private static final Logger logger = (Logger) ESPluginLoggerFactory.getLogger(HotDictReloadThread.class.getName());
    
    @Override
    public void run() {
        while (true) {
            logger.info("[==============]reload hot dict from mysql......");
            Dictionary.getSingleton().reLoadMainDict();
        }
    }
}

jdbc-reload.properties

jdbc.url=jdbc:mysql://localhost:3306/test?serverTimezone=GMT
jdbc.user=root
jdbc.password=root
jdbc.reload.sql=select word from hot_words
jdbc.reload.stopword.sql=select stopword as word from hot_stopwords
jdbc.reload.interval=1000

从mysql加载词典

     /**
     * 加载主词典及扩展词典
     */
    private void loadMainDict() {
        // 建立一个主词典实例
        _MainDict = new DictSegment((char) 0);

        // 读取主词典文件
        Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_MAIN);
        loadDictFile(_MainDict, file, false, "Main Dict");
        // 加载扩展词典
        this.loadExtDict();
        // 加载远程自定义词库
        this.loadRemoteExtDict();
        // 从mysql加载词典
        this.loadMySQLExtDict();
    }

    private static Properties prop = new Properties();

    static {
        try {
            Class.forName("com.mysql.jdbc.Driver");
        } catch (ClassNotFoundException e) {
            logger.error("error", e);
        }
    }

    /**
     * 从mysql加载热更新词典
     */
    private void loadMySQLExtDict() {
        Connection conn = null;
        Statement stmt = null;
        ResultSet rs = null;
        try {
            Path file = PathUtils.get(getDictRoot(), "jdbc-reload.properties");
            prop.load(new FileInputStream(file.toFile()));

            logger.info("[===========]jdbc-reload.properties");
            for (Object key : prop.keySet()) {
                logger.info("[==========]" + key + "=" + prop.getProperty(String.valueOf(key)));
            }

            logger.info("[=============]query hot dict from mysql, " + prop.getProperty("jdbc.reload.sql"));

            conn = DriverManager.getConnection(
                    props.getProperty("jdbc.url"),
                    props.getProperty("jdbc.user"),
                    props.getProperty("jdbc.password"));
            stmt = conn.createStatement();
            rs = stmt.executeQuery(prop.getProperty("jdbc.reload.sql"));

            while (rs.next()) {
                String theWord = rs.getString("word");
                logger.info("[================]hot word from mysql: " + theWord);
                _MainDict.fillSegment(theWord.trim().toCharArray());
            }

            Thread.sleep(Integer.valueOf(String.valueOf(prop.get("jdbc.reload.interval"))));
        } catch (Exception e) {
            logger.error("error", e);
        } finally {
            if (rs != null) {
                try {
                    rs.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
            if (stmt != null) {
                try {
                    stmt.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
            if (conn != null) {
                try {
                    conn.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
    }

从mysql加载停用词

/**
     * 从mysql加载停用词
     */
    private void loadMySQLStopwordDict() {
        Connection conn = null;
        Statement stmt = null;
        ResultSet rs = null;
        
        try {
            Path file = PathUtils.get(getDictRoot(), "jdbc-reload.properties");   
            prop.load(new FileInputStream(file.toFile()));
            
            logger.info("[==========]jdbc-reload.properties");
            for(Object key : prop.keySet()) {
                logger.info("[==========]" + key + "=" + prop.getProperty(String.valueOf(key)));      
            }
            
            logger.info("[==========]query hot stopword dict from mysql, " + prop.getProperty("jdbc.reload.stopword.sql") + "......");  
            
            conn = DriverManager.getConnection(
                    prop.getProperty("jdbc.url"),   
                    prop.getProperty("jdbc.user"),  
                    prop.getProperty("jdbc.password"));  
            stmt = conn.createStatement();
            rs = stmt.executeQuery(prop.getProperty("jdbc.reload.stopword.sql"));  
            
            while(rs.next()) {
                String theWord = rs.getString("word"); 
                logger.info("[==========]hot stopword from mysql: " + theWord); 
                _StopWords.fillSegment(theWord.trim().toCharArray());
            }
             
            Thread.sleep(Integer.valueOf(String.valueOf(prop.get("jdbc.reload.interval"))));   
        } catch (Exception e) {
            logger.error("erorr", e); 
        } finally {
            if(rs != null) {
                try {
                    rs.close();
                } catch (SQLException e) {
                    logger.error("error", e); 
                }
            }
            if(stmt != null) {
                try {
                    stmt.close();
                } catch (SQLException e) {
                    logger.error("error", e); 
                }
            }
            if(conn != null) {
                try {
                    conn.close();
                } catch (SQLException e) {
                    logger.error("error", e); 
                }
            }
        }
    }
/**
     * 加载用户扩展的停止词词典
     */
    private void loadStopWordDict() {
        // 建立主词典实例
        _StopWords = new DictSegment((char) 0);

        // 读取主词典文件
        Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_STOP);

        InputStream is = null;
        try {
            is = new FileInputStream(file.toFile());
        } catch (FileNotFoundException e) {
            logger.error(e.getMessage(), e);
        }

        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
            String theWord = null;
            do {
                theWord = br.readLine();
                if (theWord != null && !"".equals(theWord.trim())) {
                    _StopWords.fillSegment(theWord.trim().toCharArray());
                }
            } while (theWord != null);

        } catch (IOException e) {
            logger.error("ik-analyzer", e);

        } finally {
            try {
                if (is != null) {
                    is.close();
                    is = null;
                }
            } catch (IOException e) {
                logger.error("ik-analyzer", e);
            }
        }

        // 加载扩展停止词典
        List extStopWordDictFiles = getExtStopWordDictionarys();
        if (extStopWordDictFiles != null) {
            is = null;
            for (String extStopWordDictName : extStopWordDictFiles) {
                logger.info("[Dict Loading] " + extStopWordDictName);

                // 读取扩展词典文件
                file = PathUtils.get(getDictRoot(), extStopWordDictName);
                try {
                    is = new FileInputStream(file.toFile());
                } catch (FileNotFoundException e) {
                    logger.error("ik-analyzer", e);
                }
                // 如果找不到扩展的字典,则忽略
                if (is == null) {
                    continue;
                }
                try {
                    BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
                    String theWord = null;
                    do {
                        theWord = br.readLine();
                        if (theWord != null && !"".equals(theWord.trim())) {
                            // 加载扩展停止词典数据到内存中
                            _StopWords.fillSegment(theWord.trim().toCharArray());
                        }
                    } while (theWord != null);

                } catch (IOException e) {
                    logger.error("ik-analyzer", e);

                } finally {
                    try {
                        if (is != null) {
                            is.close();
                            is = null;
                        }
                    } catch (IOException e) {
                        logger.error("ik-analyzer", e);
                    }
                }
            }
        }

        // 加载远程停用词典
        List remoteExtStopWordDictFiles = getRemoteExtStopWordDictionarys();
        for (String location : remoteExtStopWordDictFiles) {
            logger.info("[Dict Loading] " + location);
            List lists = getRemoteWords(location);
            // 如果找不到扩展的字典,则忽略
            if (lists == null) {
                logger.error("[Dict Loading] " + location + "加载失败");
                continue;
            }
            for (String theWord : lists) {
                if (theWord != null && !"".equals(theWord.trim())) {
                    // 加载远程词典数据到主内存中
                    logger.info(theWord);
                    _StopWords.fillSegment(theWord.trim().toLowerCase().toCharArray());
                }
            }
        }
        
        // 自定义方法
        this.loadMySQLStopwordDict();
    }

3、mvn package打包代码

4、解压缩ik压缩包

将mysql驱动jar,放入ik的目录下

5、修改jdbc相关配置

6、重启es

观察日志,日志中就会显示我们打印的那些东西,比如加载了什么配置,加载了什么词语,什么停用词

7、在mysql中添加词库与停用词

8、分词实验,验证热更新生效

你可能感兴趣的:(修改IK分词器源码来基于mysql热更新词库)