在线演示:http://ansj.sdapp.cn/demo/seg.jsp
官网地址:http://www.ansj.org/
github:https://github.com/NLPchina/ansj_seg
通过maven引入源码,这里不再赘述。得到结构图如下:
我们可以发现library.properties就是用来配置词典的,最开始配置如下:
#redress dic file path
ambiguityLibrary=library/ambiguity.dic
#path of userLibrary this is default library
userLibrary=library/default.dic
#set real name
isRealName=true
添加一个词典文件,得到如下所示:
#redress dic file path
ambiguityLibrary=library/ambiguity.dic
#path of defaultLibrary this is default library
defaultLibrary=library/default.dic
#path of userLibrary this is user library
userLibrary=library/userLibrary.dic
#set real name
isRealName=true
个人偏好,把原有的userLibrary改成defaultLibrary,因为我觉得用户自定义词库,可以暂时定义,加入分词,后期维护可以加入默认词库,这样就有了一个升级过程。
把新加的词库读入内存,只修改如下代码:
/** * 加载用户自定义词典和补充词典 */ private static void initUserLibrary() { // TODO Auto-generated method stub try { FOREST = new Forest(); // 加载用户自定义词典 String userLibrary = MyStaticValue.userLibrary; loadLibrary(FOREST, userLibrary); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } }为:
/** * 加载用户自定义词典和补充词典 */ private static void initUserLibrary() { // TODO Auto-generated method stub try { FOREST = new Forest(); // 加载默认自定义词典 String defaultLibrary = MyStaticValue.defaultLibrary; loadLibrary(FOREST, defaultLibrary); //加载用户新增词典 String userLibrary = MyStaticValue.userLibrary; loadLibrary(FOREST, userLibrary); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } }
这里我没有加上类名,是我希望读者自己能够根据debug找到相应的类,还请谅解。
另外,我再把停用词也指出一下:
通过FilterModifWord类调用。
需要修改一下源码:
package org.ansj.util;

import static org.ansj.util.MyStaticValue.LIBRARYLOG;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.logging.Level;

import org.ansj.domain.Nature;
import org.ansj.domain.Term;
import org.ansj.library.UserDefineLibrary;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.util.IOUtil;
import org.nlpcn.commons.lang.util.StringUtil;

/**
 * Stop-word filtering, plus replacing a term's nature with the one found in
 * the user dictionaries.
 *
 * The stop-word set is populated once, when the class is initialised, from
 * {@code MyStaticValue.stopWordsLibrary}, and can be extended afterwards
 * through the {@code insertStop*} methods.
 */
public class FilterModifWord {

    /** Stop words, plus nature filters stored as {@code TAG + natureStr}. */
    private static final Set<String> FILTER = new HashSet<String>();

    /** Prefix that distinguishes a nature filter from a plain stop word inside FILTER. */
    private static final String TAG = "#";

    /** Set to true once at least one nature filter has been registered. */
    private static boolean isTag = false;

    static {
        initStopWordsDic(MyStaticValue.stopWordsLibrary);
    }

    /**
     * Initialises the stop-word set from a file, or from every {@code .dic}
     * file directly inside a directory. A blank path is ignored.
     *
     * @param stopWordsPath path of a stop-word file or of a directory of .dic files
     */
    private static void initStopWordsDic(String stopWordsPath) {
        if (StringUtil.isBlank(stopWordsPath)) {
            return;
        }

        File file = new File(stopWordsPath);
        if (!file.canRead() || file.isHidden()) {
            LIBRARYLOG.warning("init stopWordsLibrary warning :" + file.getAbsolutePath()
                    + " because : file not found or failed to read !");
            return;
        }

        if (file.isFile()) {
            loadStopWordsFile(file);
        } else if (file.isDirectory()) {
            // listFiles() returns null on an I/O error; without this guard the
            // static initialiser would die with a NullPointerException.
            File[] files = file.listFiles();
            if (files == null) {
                LIBRARYLOG.warning("init stopWordsLibrary warning :" + file.getAbsolutePath()
                        + " because : failed to list directory !");
                return;
            }
            for (File child : files) {
                if (child.isFile() && child.getName().trim().endsWith(".dic")) {
                    loadStopWordsFile(child);
                }
            }
        } else {
            LIBRARYLOG.warning("init stopWordsLibrary error :" + file.getAbsolutePath()
                    + " because : not find that file !");
        }
    }

    /**
     * Loads one stop-word file, read as UTF-8: every non-blank line becomes a
     * stop word. I/O failures are logged and otherwise ignored.
     *
     * @param file the stop-word file to read
     */
    private static void loadStopWordsFile(File file) {
        if (!file.canRead()) {
            LIBRARYLOG.warning("file in path " + file.getAbsolutePath() + " can not to read!");
            return;
        }

        BufferedReader br = null;
        try {
            br = IOUtil.getReader(new FileInputStream(file), "UTF-8");
            String line;
            while ((line = br.readLine()) != null) {
                if (StringUtil.isNotBlank(line)) {
                    insertStopWord(line);
                }
            }
            LIBRARYLOG.info("init stopWordsLibrary ok path is : " + file.getAbsolutePath());
        } catch (IOException e) {
            // Also covers UnsupportedEncodingException, which is an IOException.
            LIBRARYLOG.log(Level.WARNING,
                    "init stopWordsLibrary error :" + file.getAbsolutePath(), e);
        } finally {
            IOUtil.close(br);
        }
    }

    /**
     * Adds every word of the list to the stop-word set.
     *
     * @param filterWords words to filter out
     */
    public static void insertStopWords(List<String> filterWords) {
        FILTER.addAll(filterWords);
    }

    /**
     * Adds the given words to the stop-word set.
     *
     * @param filterWord words to filter out
     */
    public static void insertStopWord(String... filterWord) {
        for (String word : filterWord) {
            FILTER.add(word);
        }
    }

    /**
     * Registers natures whose terms must be filtered out, and enables nature
     * filtering.
     *
     * @param filterNatures nature strings to filter out
     */
    public static void insertStopNatures(String... filterNatures) {
        isTag = true;
        for (String natureStr : filterNatures) {
            FILTER.add(TAG + natureStr);
        }
    }

    /**
     * Tells whether a term is rejected by the stop-word set, either by its
     * name or, once nature filters have been registered, by its nature.
     */
    private static boolean isFiltered(Term term) {
        if (FILTER.isEmpty()) {
            return false;
        }
        if (FILTER.contains(term.getName())) {
            return true;
        }
        // Only consult nature entries when some were registered, so that a
        // plain stop word that happens to start with TAG is never read as one.
        return isTag && FILTER.contains(TAG + term.natrue().natureStr);
    }

    /**
     * Filters stop words and replaces the nature of each remaining term with
     * the one from the default user dictionary, when the term is found there.
     *
     * If processing fails part-way, the failure is logged and the terms
     * accepted so far are returned.
     *
     * @param all terms to process
     * @return a new list holding the terms that were not filtered out
     */
    public static List<Term> modifResult(List<Term> all) {
        List<Term> result = new ArrayList<Term>();
        try {
            for (Term term : all) {
                if (isFiltered(term)) {
                    continue;
                }
                String[] params = UserDefineLibrary.getParams(term.getName());
                if (params != null) {
                    term.setNature(new Nature(params[0]));
                }
                result.add(term);
            }
        } catch (Exception e) {
            LIBRARYLOG.log(Level.WARNING,
                    "FilterModifWord.modifResult failed , returning the terms processed so far", e);
        }
        return result;
    }

    /**
     * Filters stop words and replaces the nature of each remaining term with
     * the one from the given dictionaries; when several dictionaries contain
     * the term, the last one wins.
     *
     * If processing fails part-way, the failure is logged and the terms
     * accepted so far are returned.
     *
     * @param all     terms to process
     * @param forests dictionaries to look each term up in
     * @return a new list holding the terms that were not filtered out
     */
    public static List<Term> modifResult(List<Term> all, Forest... forests) {
        List<Term> result = new ArrayList<Term>();
        try {
            for (Term term : all) {
                if (isFiltered(term)) {
                    continue;
                }
                for (Forest forest : forests) {
                    String[] params = UserDefineLibrary.getParams(forest, term.getName());
                    if (params != null) {
                        term.setNature(new Nature(params[0]));
                    }
                }
                result.add(term);
            }
        } catch (Exception e) {
            LIBRARYLOG.log(Level.WARNING,
                    "FilterModifWord.modifResult failed , returning the terms processed so far", e);
        }
        return result;
    }
}
package org.ansj.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Map;
import java.util.ResourceBundle;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.ansj.app.crf.Model;
import org.ansj.app.crf.SplitWord;
import org.ansj.dic.DicReader;
import org.ansj.domain.AnsjItem;
import org.ansj.library.DATDictionary;
import org.nlpcn.commons.lang.util.IOUtil;
import org.nlpcn.commons.lang.util.StringUtil;

/**
 * Holds shared configuration values and accessors for the bundled
 * dictionary resources.
 *
 * The library paths and flags below are defaults; the static initialiser
 * overrides them with the values found in the {@code library} resource
 * bundle (library.properties) when that bundle is on the classpath.
 *
 * @author ansj
 */
public class MyStaticValue {

    public static final Logger LIBRARYLOG = Logger.getLogger("DICLOG");

    // Whether person-name recognition is enabled.
    public static boolean isNameRecognition = true;

    private static final Lock LOCK = new ReentrantLock();

    // Whether number recognition is enabled.
    public static boolean isNumRecognition = true;

    // Whether numbers are merged with a following quantifier.
    public static boolean isQuantifierRecognition = true;

    // Lazily loaded CRF model; volatile so that the unlocked fast-path read
    // in getCRFSplitWord() observes a fully published instance.
    private static volatile SplitWord crfSplitWord = null;

    public static boolean isRealName = false;

    /**
     * Dictionary locations. Per the original author's note, a path that is a
     * directory is scanned for the .dic files it contains.
     */
    public static String defaultLibrary = "library/default.dic";

    public static String ambiguityLibrary = "library/ambiguity.dic";

    public static String userLibrary = "library/userLibrary.dic";

    public static String stopWordsLibrary = "src/main/resources/newWord/newWordFilter.dic";

    /**
     * Whether the user dictionary skips words that are already defined.
     */
    public static boolean isSkipUserDefine = false;

    static {
        // Override the defaults above with the values from library.properties.
        try {
            ResourceBundle rb = ResourceBundle.getBundle("library");
            if (rb.containsKey("defaultLibrary")) {
                defaultLibrary = rb.getString("defaultLibrary");
            }
            if (rb.containsKey("ambiguityLibrary")) {
                ambiguityLibrary = rb.getString("ambiguityLibrary");
            }
            // The key checked here must be the key read: the misspelt
            // "userLiberary" meant a configured userLibrary was never applied.
            if (rb.containsKey("userLibrary")) {
                userLibrary = rb.getString("userLibrary");
            }
            if (rb.containsKey("stopWordsLibrary")) {
                stopWordsLibrary = rb.getString("stopWordsLibrary");
            }
            if (rb.containsKey("isSkipUserDefine")) {
                isSkipUserDefine = Boolean.parseBoolean(rb.getString("isSkipUserDefine"));
            }
            if (rb.containsKey("isRealName")) {
                isRealName = Boolean.parseBoolean(rb.getString("isRealName"));
            }
        } catch (Exception e) {
            LIBRARYLOG.warning("not find library.properties in classpath use it by default !");
        }
    }

    /**
     * Person-name dictionary.
     *
     * @return a reader over the resource {@code person/person.dic}
     */
    public static BufferedReader getPersonReader() {
        return DicReader.getReader("person/person.dic");
    }

    /**
     * Organisation-name dictionary.
     *
     * @return a reader over the resource {@code company/company.data}
     */
    public static BufferedReader getCompanReader() {
        return DicReader.getReader("company/company.data");
    }

    /**
     * New-word frequency dictionary.
     *
     * @return a reader over the resource {@code newWord/new_word_freq.dic}
     */
    public static BufferedReader getNewWordReader() {
        return DicReader.getReader("newWord/new_word_freq.dic");
    }

    /**
     * Core dictionary.
     *
     * @return a reader over the resource {@code arrays.dic}
     */
    public static BufferedReader getArraysReader() {
        return DicReader.getReader("arrays.dic");
    }

    /**
     * Number dictionary.
     *
     * @return a reader over the resource {@code numberLibrary.dic}
     */
    public static BufferedReader getNumberReader() {
        return DicReader.getReader("numberLibrary.dic");
    }

    /**
     * English dictionary.
     *
     * @return a reader over the resource {@code englishLibrary.dic}
     */
    public static BufferedReader getEnglishReader() {
        return DicReader.getReader("englishLibrary.dic");
    }

    /**
     * Nature (part-of-speech) map.
     *
     * @return a reader over the resource {@code nature/nature.map}
     */
    public static BufferedReader getNatureMapReader() {
        return DicReader.getReader("nature/nature.map");
    }

    /**
     * Nature (part-of-speech) association table.
     *
     * @return a reader over the resource {@code nature/nature.table}
     */
    public static BufferedReader getNatureTableReader() {
        return DicReader.getReader("nature/nature.table");
    }

    /**
     * Frequency dictionary of the single characters used in person names.
     *
     * @return a reader over the resource {@code person/name_freq.dic}
     */
    public static BufferedReader getPersonFreqReader() {
        return DicReader.getReader("person/name_freq.dic");
    }

    /**
     * Deserialises the name-frequency map stored in the resource
     * {@code person/asian_name_freq.data}.
     *
     * @return the deserialised map, or an empty map when reading fails
     */
    @SuppressWarnings("unchecked")
    public static Map<String, int[][]> getPersonFreqMap() {
        InputStream inputStream = null;
        ObjectInputStream objectInputStream = null;
        Map<String, int[][]> map = new HashMap<String, int[][]>(0);
        try {
            inputStream = DicReader.getInputStream("person/asian_name_freq.data");
            objectInputStream = new ObjectInputStream(inputStream);
            map = (Map<String, int[][]>) objectInputStream.readObject();
        } catch (IOException e) {
            LIBRARYLOG.log(Level.WARNING, "failed to read person/asian_name_freq.data", e);
        } catch (ClassNotFoundException e) {
            LIBRARYLOG.log(Level.WARNING, "failed to read person/asian_name_freq.data", e);
        } finally {
            try {
                if (objectInputStream != null) {
                    objectInputStream.close();
                }
                if (inputStream != null) {
                    inputStream.close();
                }
            } catch (IOException e) {
                LIBRARYLOG.log(Level.WARNING, "failed to close person/asian_name_freq.data", e);
            }
        }
        return map;
    }

    /**
     * Loads the word-to-word association (bigram) data from the resource
     * {@code bigramdict.dic}, read as UTF-8.
     *
     * Each non-blank line is split on a tab into a {@code from@to} pair and a
     * frequency. Both words are looked up in {@code DATDictionary}; the
     * frequency is then stored in the "from" item's {@code bigramEntryMap}
     * under the "to" item's index. Pairs with an unresolved word are skipped.
     */
    public static void initBigramTables() {
        BufferedReader reader = null;
        try {
            reader = IOUtil.getReader(DicReader.getInputStream("bigramdict.dic"), "UTF-8");
            String temp = null;
            String[] strs = null;
            int freq = 0;
            while ((temp = reader.readLine()) != null) {
                if (StringUtil.isBlank(temp)) {
                    continue;
                }
                strs = temp.split("\t");
                freq = Integer.parseInt(strs[1]);
                strs = strs[0].split("@");
                AnsjItem fromItem = DATDictionary.getItem(strs[0]);
                AnsjItem toItem = DATDictionary.getItem(strs[1]);

                // A word that is not in the dictionary but contains '#' stands
                // for the sentence boundary on its side of the pair.
                if (fromItem == AnsjItem.NULL && strs[0].contains("#")) {
                    fromItem = AnsjItem.BEGIN;
                }
                if (toItem == AnsjItem.NULL && strs[1].contains("#")) {
                    toItem = AnsjItem.END;
                }

                if (fromItem == AnsjItem.NULL || toItem == AnsjItem.NULL) {
                    continue;
                }

                if (fromItem.bigramEntryMap == null) {
                    fromItem.bigramEntryMap = new HashMap<Integer, Integer>();
                }
                fromItem.bigramEntryMap.put(toItem.index, freq);
            }
        } catch (NumberFormatException e) {
            LIBRARYLOG.log(Level.WARNING, "failed to load bigramdict.dic", e);
        } catch (IOException e) {
            // Also covers UnsupportedEncodingException, which is an IOException.
            LIBRARYLOG.log(Level.WARNING, "failed to load bigramdict.dic", e);
        } finally {
            IOUtil.close(reader);
        }
    }

    /**
     * Returns the default CRF model, loading it from the resource
     * {@code crf/crf.model} on first use.
     *
     * @return the shared model, or null when loading failed
     */
    public static SplitWord getCRFSplitWord() {
        if (crfSplitWord != null) {
            return crfSplitWord;
        }
        LOCK.lock();
        try {
            // Re-check under the lock. This must sit inside the try block so
            // that the early return still releases the lock in finally.
            if (crfSplitWord != null) {
                return crfSplitWord;
            }
            long start = System.currentTimeMillis();
            LIBRARYLOG.info("begin init crf model!");
            crfSplitWord = new SplitWord(Model.loadModel(DicReader.getInputStream("crf/crf.model")));
            LIBRARYLOG.info("load crf crf use time:" + (System.currentTimeMillis() - start));
        } catch (Exception e) {
            LIBRARYLOG.log(Level.WARNING, "failed to load crf/crf.model", e);
        } finally {
            LOCK.unlock();
        }
        return crfSplitWord;
    }
}
测试用例:
package org.ansj.demo;

import java.util.List;

import org.ansj.domain.Term;
import org.ansj.splitWord.analysis.NlpAnalysis;
import org.ansj.util.FilterModifWord;

/**
 * Demo: segments a sentence with {@code NlpAnalysis}, then prints the result
 * before and after it is passed through {@code FilterModifWord.modifResult}.
 */
public class StopWordDemo {

    public static void main(String[] args) {
        // Uncomment to register a stop word programmatically:
        // FilterModifWord.insertStopWord("五一");
        List<Term> parseResultList = NlpAnalysis.parse("your五一,劳动节快乐");
        System.out.println(parseResultList);

        parseResultList = FilterModifWord.modifResult(parseResultList);
        System.out.println(parseResultList);
    }
}
程序猿行业技术生活交流群:181287753(指尖天下),欢迎大伙加入交流学习。