Java开源分词系统IKAnalyzer学习(四) 词库加载源代码——Dictionary类

词库加载的源代码:

/** * */ package org.wltea.analyzer.dic; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.Collection; import java.util.List; import org.wltea.analyzer.cfg.Configuration; /** * IK Analyzer v3.2 * 词典管理类,单子模式 * @author 林良益 * */ public class Dictionary { /* * 分词器默认字典路径 */ public static final String PATH_DIC_MAIN = "/org/wltea/analyzer/dic/main.dic"; public static final String PATH_DIC_SURNAME = "/org/wltea/analyzer/dic/surname.dic"; public static final String PATH_DIC_QUANTIFIER = "/org/wltea/analyzer/dic/quantifier.dic"; public static final String PATH_DIC_SUFFIX = "/org/wltea/analyzer/dic/suffix.dic"; public static final String PATH_DIC_PREP = "/org/wltea/analyzer/dic/preposition.dic"; public static final String PATH_DIC_STOP = "/org/wltea/analyzer/dic/stopword.dic"; /* * 词典单子实例 */ private static final Dictionary singleton; /* * 词典初始化 */ static{ singleton = new Dictionary(); } /* * 主词典对象 */ private DictSegment _MainDict; /* * 姓氏词典 */ private DictSegment _SurnameDict; /* * 量词词典 */ private DictSegment _QuantifierDict; /* * 后缀词典 */ private DictSegment _SuffixDict; /* * 副词,介词词典 */ private DictSegment _PrepDict; /* * 停止词集合 */ private DictSegment _StopWords; private Dictionary(){ //初始化系统词典 loadMainDict(); loadSurnameDict(); loadQuantifierDict(); loadSuffixDict(); loadPrepDict(); loadStopWordDict(); } /** * 加载主词典及扩展词典 */ private void loadMainDict(){ //建立一个主词典实例 _MainDict = new DictSegment((char)0); //读取主词典文件 InputStream is = Dictionary.class.getResourceAsStream(Dictionary.PATH_DIC_MAIN); if(is == null){ throw new RuntimeException("Main Dictionary not found!!!"); } try { BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { _MainDict.fillSegment(theWord.trim().toCharArray()); } } while (theWord != null); } catch (IOException ioe) { System.err.println("Main Dictionary loading exception."); ioe.printStackTrace(); }finally{ try { if(is != null){ is.close(); is = null; } } catch (IOException e) { e.printStackTrace(); } } //加载扩展词典配置 List<String> extDictFiles = Configuration.getExtDictionarys(); if(extDictFiles != null){ for(String extDictName : extDictFiles){ //读取扩展词典文件 is = Dictionary.class.getResourceAsStream(extDictName); //如果找不到扩展的字典,则忽略 if(is == null){ continue; } try { BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { //加载扩展词典数据到主内存词典中 //System.out.println(theWord); _MainDict.fillSegment(theWord.trim().toCharArray()); } } while (theWord != null); } catch (IOException ioe) { System.err.println("Extension Dictionary loading exception."); ioe.printStackTrace(); }finally{ try { if(is != null){ is.close(); is = null; } } catch (IOException e) { e.printStackTrace(); } } } } } /** * 加载姓氏词典 */ private void loadSurnameDict(){ //建立一个姓氏词典实例 _SurnameDict = new DictSegment((char)0); //读取姓氏词典文件 InputStream is = Dictionary.class.getResourceAsStream(Dictionary.PATH_DIC_SURNAME); if(is == null){ throw new RuntimeException("Surname Dictionary not found!!!"); } try { BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { _SurnameDict.fillSegment(theWord.trim().toCharArray()); } } while (theWord != null); } catch (IOException ioe) { System.err.println("Surname Dictionary loading exception."); ioe.printStackTrace(); }finally{ try { if(is != null){ is.close(); is = null; } } catch (IOException e) { e.printStackTrace(); } } } /** * 加载量词词典 */ private void loadQuantifierDict(){ //建立一个量词典实例 _QuantifierDict = new DictSegment((char)0); //读取量词词典文件 InputStream is = Dictionary.class.getResourceAsStream(Dictionary.PATH_DIC_QUANTIFIER); if(is == null){ throw new RuntimeException("Quantifier Dictionary not found!!!"); } try { BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { _QuantifierDict.fillSegment(theWord.trim().toCharArray()); } } while (theWord != null); } catch (IOException ioe) { System.err.println("Quantifier Dictionary loading exception."); ioe.printStackTrace(); }finally{ try { if(is != null){ is.close(); is = null; } } catch (IOException e) { e.printStackTrace(); } } } /** * 加载后缀词典 */ private void loadSuffixDict(){ //建立一个后缀词典实例 _SuffixDict = new DictSegment((char)0); //读取量词词典文件 InputStream is = Dictionary.class.getResourceAsStream(Dictionary.PATH_DIC_SUFFIX); if(is == null){ throw new RuntimeException("Suffix Dictionary not found!!!"); } try { BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { _SuffixDict.fillSegment(theWord.trim().toCharArray()); } } while (theWord != null); } catch (IOException ioe) { System.err.println("Suffix Dictionary loading exception."); ioe.printStackTrace(); }finally{ try { if(is != null){ is.close(); is = null; } } catch (IOException e) { e.printStackTrace(); } } } /** * 加载介词/副词词典 */ private void loadPrepDict(){ //建立一个介词/副词词典实例 _PrepDict = new DictSegment((char)0); //读取量词词典文件 InputStream is = Dictionary.class.getResourceAsStream(Dictionary.PATH_DIC_PREP); if(is == null){ throw new RuntimeException("Preposition Dictionary not found!!!"); } try { BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { //System.out.println(theWord); _PrepDict.fillSegment(theWord.trim().toCharArray()); } } while (theWord != null); } catch (IOException ioe) { System.err.println("Preposition Dictionary loading exception."); ioe.printStackTrace(); }finally{ try { if(is != null){ is.close(); is = null; } } catch (IOException e) { e.printStackTrace(); } } } /** * 加载停止词词典 */ private void loadStopWordDict(){ //建立一个停止词典实例 _StopWords = new DictSegment((char)0); //读取量词词典文件 InputStream is = Dictionary.class.getResourceAsStream(Dictionary.PATH_DIC_STOP); if(is == null){ throw new RuntimeException("Stopword Dictionary not found!!!"); } try { BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { _StopWords.fillSegment(theWord.trim().toCharArray()); } } while (theWord != null); } catch (IOException ioe) { System.err.println("Stopword Dictionary loading exception."); ioe.printStackTrace(); }finally{ try { if(is != null){ is.close(); is = null; } } catch (IOException e) { e.printStackTrace(); } } //加载扩展停止词典 List<String> extStopWordDictFiles = Configuration.getExtStopWordDictionarys(); if(extStopWordDictFiles != null){ for(String extStopWordDictName : extStopWordDictFiles){ //读取扩展词典文件 is = Dictionary.class.getResourceAsStream(extStopWordDictName); //如果找不到扩展的字典,则忽略 if(is == null){ continue; } try { BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { //System.out.println(theWord); //加载扩展停止词典数据到内存中 _StopWords.fillSegment(theWord.trim().toCharArray()); } } while (theWord != null); } catch (IOException ioe) { System.err.println("Extension Stop word Dictionary loading exception."); ioe.printStackTrace(); }finally{ try { if(is != null){ is.close(); is = null; } } catch (IOException e) { e.printStackTrace(); } } } } } /** * 词典初始化 * 由于IK Analyzer的词典采用Dictionary类的静态方法进行词典初始化 * 只有当Dictionary类被实际调用时,才会开始载入词典, * 这将延长首次分词操作的时间 * 该方法提供了一个在应用加载阶段就初始化字典的手段 * 用来缩短首次分词时的时延 * @return Dictionary */ public static Dictionary getInstance(){ return Dictionary.singleton; } /** * 加载扩展的词条 * @param extWords Collection<String>词条列表 */ public static void loadExtendWords(Collection<String> extWords){ if(extWords != null){ for(String extWord : extWords){ if (extWord != null) { //加载扩展词条到主内存词典中 singleton._MainDict.fillSegment(extWord.trim().toCharArray()); } } } } /** * 加载扩展的停止词条 * @param extStopWords Collection<String>词条列表 */ public static void loadExtendStopWords(Collection<String> extStopWords){ if(extStopWords != null){ for(String extStopWord : extStopWords){ if (extStopWord != null) { //加载扩展的停止词条 singleton._StopWords.fillSegment(extStopWord.trim().toCharArray()); } } } } /** * 检索匹配主词典 * @param charArray * @return Hit 匹配结果描述 */ public static Hit matchInMainDict(char[] charArray){ return singleton._MainDict.match(charArray); } /** * 检索匹配主词典 * @param charArray * @param begin * @param length * @return Hit 匹配结果描述 */ public static Hit matchInMainDict(char[] charArray , int begin, int length){ return singleton._MainDict.match(charArray, begin, length); } /** * 检索匹配主词典, * 从已匹配的Hit中直接取出DictSegment,继续向下匹配 * @param charArray * @param currentIndex * @param matchedHit * @return Hit */ public static Hit matchInMainDictWithHit(char[] charArray , int currentIndex , Hit matchedHit){ DictSegment ds = matchedHit.getMatchedDictSegment(); return ds.match(charArray, currentIndex, 1 , matchedHit); } /** * 检索匹配姓氏词典 * @param charArray * @param begin * @param length * @return Hit 匹配结果描述 */ public static Hit matchInSurnameDict(char[] charArray , int begin, int length){ return singleton._SurnameDict.match(charArray, begin, length); } // /** // * // * 在姓氏词典中匹配指定位置的char数组 // * (对传入的字串进行后缀匹配) // * @param charArray // * @param begin // * @param end // * @return // */ // public static boolean endsWithSurnameDict(char[] charArray , int begin, int length){ // Hit hit = null; // for(int i = 1 ; i <= length ; i++){ // hit = singleton._SurnameDict.match(charArray, begin + (length - i) , i); // if(hit.isMatch()){ // return true; // } // } // return false; // } /** * 检索匹配量词词典 * @param charArray * @param begin * @param length * @return Hit 匹配结果描述 */ public static Hit matchInQuantifierDict(char[] charArray , int begin, int length){ return singleton._QuantifierDict.match(charArray, begin, length); } /** * 检索匹配在后缀词典 * @param charArray * @param begin * @param length * @return Hit 匹配结果描述 */ public static Hit matchInSuffixDict(char[] charArray , int begin, int length){ return singleton._SuffixDict.match(charArray, begin, length); } // /** // * 在后缀词典中匹配指定位置的char数组 // * (对传入的字串进行前缀匹配) // * @param charArray // * @param begin // * @param end // * @return // */ // public static boolean startsWithSuffixDict(char[] charArray , int begin, int length){ // Hit hit = null; // for(int i = 1 ; i <= length ; i++){ // hit = singleton._SuffixDict.match(charArray, begin , i); // if(hit.isMatch()){ // return true; // }else if(hit.isUnmatch()){ // return false; // } // } // return false; // } /** * 检索匹配介词、副词词典 * @param charArray * @param begin * @param length * @return Hit 匹配结果描述 */ public static Hit matchInPrepDict(char[] charArray , int begin, int length){ return singleton._PrepDict.match(charArray, begin, length); } /** * 判断是否是停止词 * @param charArray * @param begin * @param length * @return boolean */ public static boolean isStopWord(char[] charArray , int begin, int length){ return singleton._StopWords.match(charArray, begin, length).isMatch(); } }  

你可能感兴趣的:(java,String,null,扩展,extension,Dictionary)