solr4的solr-ik插件 http://www.chepoo.com/solr4-plugin-solr-ik.html
solr4中添加中文分词ik。
1.首先从 https://code.google.com/p/ik-analyzer/ 下载 IK Analyzer 2012FF_hf1_source.rar,将其中的源码打包成jar包,然后放入solr程序的WEB-INF/lib/目录下。
2.这样做有一个很不方便的地方就是,每次我们要修改词库,我们都要重新打包,很不方便。所以我就修改了其中的部分源码。在词库更新时,直接更新WEB-INF/classes/ik目录下的文件就成。主要修改的源码为Dictionary.java.
public class Dictionary { /* * 词典单子实例 */ private static Dictionary singleton; /* * 主词典对象 */ private DictSegment _MainDict; /* * 停止词词典 */ private DictSegment _StopWordDict; /* * 量词词典 */ private DictSegment _QuantifierDict; /** * 配置对象 */ private Configuration cfg; private Dictionary(Configuration cfg){ this.cfg = cfg; this.loadMainDict(); this.loadStopWordDict(); this.loadQuantifierDict(); } /** * 词典初始化 * 由于IK Analyzer的词典采用Dictionary类的静态方法进行词典初始化 * 只有当Dictionary类被实际调用时,才会开始载入词典, * 这将延长首次分词操作的时间 * 该方法提供了一个在应用加载阶段就初始化字典的手段 * @return Dictionary */ public static Dictionary initial(Configuration cfg){ if(singleton == null){ synchronized(Dictionary.class){ if(singleton == null){ singleton = new Dictionary(cfg); return singleton; } } } return singleton; } /** * 获取词典单子实例 * @return Dictionary 单例对象 */ public static Dictionary getSingleton(){ if(singleton == null){ throw new IllegalStateException("词典尚未初始化,请先调用initial方法"); } return singleton; } /** * 批量加载新词条 * @param words Collection<String>词条列表 */ public void addWords(Collection<String> words){ if(words != null){ for(String word : words){ if (word != null) { //批量加载词条到主内存词典中 singleton._MainDict.fillSegment(word.trim().toLowerCase().toCharArray()); } } } } /** * 批量移除(屏蔽)词条 * @param words */ public void disableWords(Collection<String> words){ if(words != null){ for(String word : words){ if (word != null) { //批量屏蔽词条 singleton._MainDict.disableSegment(word.trim().toLowerCase().toCharArray()); } } } } /** * 检索匹配主词典 * @param charArray * @return Hit 匹配结果描述 */ public Hit matchInMainDict(char[] charArray){ return singleton._MainDict.match(charArray); } /** * 检索匹配主词典 * @param charArray * @param begin * @param length * @return Hit 匹配结果描述 */ public Hit matchInMainDict(char[] charArray , int begin, int length){ return singleton._MainDict.match(charArray, begin, length); } /** * 检索匹配量词词典 * @param charArray * @param begin * @param length * @return Hit 匹配结果描述 */ public Hit matchInQuantifierDict(char[] charArray , int begin, int length){ return 
singleton._QuantifierDict.match(charArray, begin, length); } /** * 从已匹配的Hit中直接取出DictSegment,继续向下匹配 * @param charArray * @param currentIndex * @param matchedHit * @return Hit */ public Hit matchWithHit(char[] charArray , int currentIndex , Hit matchedHit){ DictSegment ds = matchedHit.getMatchedDictSegment(); return ds.match(charArray, currentIndex, 1 , matchedHit); } /** * 判断是否是停止词 * @param charArray * @param begin * @param length * @return boolean */ public boolean isStopWord(char[] charArray , int begin, int length){ return singleton._StopWordDict.match(charArray, begin, length).isMatch(); } /** * 加载主词典及扩展词典 */ private void loadMainDict(){ //建立一个主词典实例 _MainDict = new DictSegment((char)0); //读取主词典文件 InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getMainDictionary()); if(is == null){ throw new RuntimeException("Main Dictionary not found!!!"); } try { BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray()); } } while (theWord != null); } catch (IOException ioe) { System.err.println("Main Dictionary loading exception."); ioe.printStackTrace(); }finally{ try { if(is != null){ is.close(); is = null; } } catch (IOException e) { e.printStackTrace(); } } //加载扩展词典 this.loadExtDict(); } /** * 加载用户配置的扩展词典到主词库表 */ private void loadExtDict(){ //加载扩展词典配置 List<String> extDictFiles = cfg.getExtDictionarys(); if(extDictFiles != null){ InputStream is = null; for(String extDictName : extDictFiles){ //读取扩展词典文件 System.out.println("加载扩展词典:" + extDictName); is = this.getClass().getClassLoader().getResourceAsStream(extDictName); //如果找不到扩展的字典,则忽略 if(is == null){ continue; } try { BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { 
//加载扩展词典数据到主内存词典中 //System.out.println(theWord); _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray()); } } while (theWord != null); } catch (IOException ioe) { System.err.println("Extension Dictionary loading exception."); ioe.printStackTrace(); }finally{ try { if(is != null){ is.close(); is = null; } } catch (IOException e) { e.printStackTrace(); } } } } } /** * 加载用户扩展的停止词词典 */ private void loadStopWordDict(){ //建立一个主词典实例 _StopWordDict = new DictSegment((char)0); //加载扩展停止词典 List<String> extStopWordDictFiles = cfg.getExtStopWordDictionarys(); if(extStopWordDictFiles != null){ InputStream is = null; for(String extStopWordDictName : extStopWordDictFiles){ System.out.println("加载扩展停止词典:" + extStopWordDictName); //读取扩展词典文件 is = this.getClass().getClassLoader().getResourceAsStream(extStopWordDictName); //如果找不到扩展的字典,则忽略 if(is == null){ continue; } try { BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { //System.out.println(theWord); //加载扩展停止词典数据到内存中 _StopWordDict.fillSegment(theWord.trim().toLowerCase().toCharArray()); } } while (theWord != null); } catch (IOException ioe) { System.err.println("Extension Stop word Dictionary loading exception."); ioe.printStackTrace(); }finally{ try { if(is != null){ is.close(); is = null; } } catch (IOException e) { e.printStackTrace(); } } } } } /** * 加载量词词典 */ private void loadQuantifierDict(){ //建立一个量词典实例 _QuantifierDict = new DictSegment((char)0); //读取量词词典文件 InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getQuantifierDicionary()); if(is == null){ throw new RuntimeException("Quantifier Dictionary not found!!!"); } try { BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { 
_QuantifierDict.fillSegment(theWord.trim().toLowerCase().toCharArray()); } } while (theWord != null); } catch (IOException ioe) { System.err.println("Quantifier Dictionary loading exception."); ioe.printStackTrace(); }finally{ try { if(is != null){ is.close(); is = null; } } catch (IOException e) { e.printStackTrace(); } } } } |
3.相关程序已经放入到了https://github.com/awnuxkjy/solr-ik
4.使用方法。配置solrhome下的collection1\conf\schema.xml
<fieldType name="ik_text" class="solr.TextField"> <analyzer class="org.wltea.analyzer.lucene.IKAnalyzer" useSmart="true"/> </fieldType>
在你的solr程序的WEB-INF/classes目录下新建ik目录,并将main2012.dic、ext.dic、IKAnalyzer.cfg.xml、quantifier.dic、stopword.dic等文件拷贝到该目录中。
将ik源码打成jar包。放入到WEB-INF/lib/下
本文固定链接: http://www.chepoo.com/solr4-plugin-solr-ik.html | IT技术精华网