ansj_seg源码分析之用户自定义词库

阅读更多
    最近工作中用到了中文分词工具ansj。之前我是把它整合到ES里使用的,但这样不利于源码分析,所以这里先把源码单独部署起来:
    在线演示:http://ansj.sdapp.cn/demo/seg.jsp
    官网地址:http://www.ansj.org/
    github:https://github.com/NLPchina/ansj_seg
    通过maven引入源码,这里不再赘述。得到结构图如下:
ansj_seg源码分析之用户自定义词库_第1张图片
    我们可以发现library.properties就是用来配置词典的,最开始配置如下:
#redress dic file path
ambiguityLibrary=library/ambiguity.dic
#path of userLibrary this is default library
userLibrary=library/default.dic
#set real name
isRealName=true

    添加一个词典文件,得到如下所示:
#redress dic file path
ambiguityLibrary=library/ambiguity.dic
#path of defaultLibrary this is default library
defaultLibrary=library/default.dic
#path of userLibrary this is user library
userLibrary=library/userLibrary.dic
#set real name
isRealName=true

    出于个人偏好,我把原有的userLibrary改名为defaultLibrary,并新增了一个userLibrary:用户自定义的词可以先临时放进userLibrary参与分词,后期维护时再并入默认词库,这样就有了一个升级过程。
    把新加的词库读入内存,只修改如下代码:
/**
	 * Replaces FOREST with a new, empty Forest and loads the dictionary configured in
	 * MyStaticValue.userLibrary into it. Any exception raised on the way is printed
	 * to the standard error stream and not rethrown.
	 */
	private static void initUserLibrary() {
		try {
			FOREST = new Forest();
			// Fill the fresh forest from the user-defined dictionary path.
			loadLibrary(FOREST, MyStaticValue.userLibrary);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
为:
/**
	 * Replaces FOREST with a new, empty Forest and loads two dictionaries into it, in
	 * order: MyStaticValue.defaultLibrary first, then MyStaticValue.userLibrary.
	 * Any exception is printed to the standard error stream and not rethrown; because
	 * both loads share one try block, a failure in the first load skips the second.
	 */
	private static void initUserLibrary() {
		try {
			FOREST = new Forest();
			// Default dictionary first ...
			loadLibrary(FOREST, MyStaticValue.defaultLibrary);
			// ... then the words the user added on top of it.
			loadLibrary(FOREST, MyStaticValue.userLibrary);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

    这里我没有加上类名,是我希望读者自己能够根据debug找到相应的类,还请谅解。

    另外,我再把停用词也指出一下:
ansj_seg源码分析之用户自定义词库_第2张图片
    通过FilterModifWord类调用。
    需要修改一下源码:
package org.ansj.util;

import static org.ansj.util.MyStaticValue.LIBRARYLOG;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.ansj.domain.Nature;
import org.ansj.domain.Term;
import org.ansj.library.UserDefineLibrary;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.util.IOUtil;
import org.nlpcn.commons.lang.util.StringUtil;

/*
 * 停用词过滤,修正词性到用户词性.
 */
public class FilterModifWord {

	private static Set FILTER = new HashSet();

	private static String TAG = "#";

	private static boolean isTag = false;
	
	static{
		String filePath = MyStaticValue.stopWordsLibrary;
		initStopWordsDic(filePath);
	}
	
	/**
	 * 初始化停用词词库
	 * @param stopWordsPath
	 */
	private static void initStopWordsDic(String stopWordsPath){
		File file = null;
		if (StringUtil.isNotBlank(stopWordsPath)) {
			file = new File(stopWordsPath);
			if (!file.canRead() || file.isHidden()) {
				LIBRARYLOG.warning("init stopWordsLibrary  warning :" + new File(stopWordsPath).getAbsolutePath() + " because : file not found or failed to read !");
				return;
			}
			if (file.isFile()) {
				loadStopWordsFile(file);
			} else if (file.isDirectory()) {
				File[] files = file.listFiles();
				for (int i = 0; i < files.length; i++) {
					if (files[i].getName().trim().endsWith(".dic")) {
						loadStopWordsFile(files[i]);
					}
				}
			} else {
				LIBRARYLOG.warning("init stopWordsLibrary  error :" + new File(stopWordsPath).getAbsolutePath() + " because : not find that file !");
			}
		}
	}
	
	/**
	 * 加载停用词文件
	 * @param filePath
	 */
	private static void loadStopWordsFile(File file){
		if (!file.canRead()) {
			LIBRARYLOG.warning("file in path " + file.getAbsolutePath() + " can not to read!");
			return;
		}
		String temp = null;
		BufferedReader br = null;
		String[] strs = null;
		try {
			br = IOUtil.getReader(new FileInputStream(file), "UTF-8");
			while ((temp = br.readLine()) != null) {
				if (StringUtil.isBlank(temp)) {
					continue;
				} else {
					insertStopWord(temp);
				}
			}
			LIBRARYLOG.info("init stopWordsLibrary ok path is : " + file.getAbsolutePath());
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			IOUtil.close(br);
			br = null;
		}
	}

	public static void insertStopWords(List filterWords) {
		FILTER.addAll(filterWords);
	}

	public static void insertStopWord(String... filterWord) {
		for (String word : filterWord) {
			FILTER.add(word);
		}
	}

	public static void insertStopNatures(String... filterNatures) {
		isTag = true;
		for (String natureStr : filterNatures) {
			FILTER.add(TAG + natureStr);
		}
	}

	/*
	 * 停用词过滤并且修正词性
	 */
	public static List modifResult(List all) {
		List result = new ArrayList();
		try {
			for (Term term : all) {
				if (FILTER.size() > 0 && (FILTER.contains(term.getName()) || (isTag && FILTER.contains(TAG + term.natrue().natureStr)))) {
					continue;
				}
				String[] params = UserDefineLibrary.getParams(term.getName());
				if (params != null) {
					term.setNature(new Nature(params[0]));
				}
				result.add(term);
			}
		} catch (Exception e) {
			// TODO Auto-generated catch block
			System.err.println("FilterStopWord.updateDic can not be null , " + "you must use set FilterStopWord.setUpdateDic(map) or use method set map");
		}
		return result;
	}

	/*
	 * 停用词过滤并且修正词性
	 */
	public static List modifResult(List all, Forest... forests) {
		List result = new ArrayList();
		try {
			for (Term term : all) {
				if (FILTER.size() > 0 && (FILTER.contains(term.getName()) || FILTER.contains(TAG + term.natrue().natureStr))) {
					continue;
				}
				for (Forest forest : forests) {
					String[] params = UserDefineLibrary.getParams(forest, term.getName());
					if (params != null) {
						term.setNature(new Nature(params[0]));
					}
				}
				result.add(term);
			}
		} catch (Exception e) {
			// TODO Auto-generated catch block
			System.err.println("FilterStopWord.updateDic can not be null , " + "you must use set FilterStopWord.setUpdateDic(map) or use method set map");
		}
		return result;
	}
}



package org.ansj.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Map;
import java.util.ResourceBundle;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.logging.Logger;

import org.ansj.app.crf.Model;
import org.ansj.app.crf.SplitWord;
import org.ansj.dic.DicReader;
import org.ansj.domain.AnsjItem;
import org.ansj.library.DATDictionary;
import org.nlpcn.commons.lang.util.IOUtil;
import org.nlpcn.commons.lang.util.StringUtil;

/**
 * Holds shared settings and gives access to the bundled dictionary resources.
 *
 * <p>The dictionary paths and flags below have built-in defaults; the static initializer
 * overrides them with the values of the {@code library} resource bundle
 * (library.properties) when it can be found.
 *
 * @author ansj
 */
public class MyStaticValue {

	public static final Logger LIBRARYLOG = Logger.getLogger("DICLOG");

	// Whether person-name recognition is enabled.
	public static boolean isNameRecognition = true;

	private static final Lock LOCK = new ReentrantLock();

	// Whether number recognition is enabled.
	public static boolean isNumRecognition = true;

	// Whether a number is merged with the quantifier that follows it.
	public static boolean isQuantifierRecognition = true;

	// CRF model, created lazily by getCRFSplitWord(). Volatile because it is read
	// outside LOCK on the fast path.
	private static volatile SplitWord crfSplitWord = null;

	public static boolean isRealName = false;

	/**
	 * Path of the default user-defined dictionary; if it is a directory, the dic files
	 * below it are scanned.
	 */
	public static String defaultLibrary = "library/default.dic";

	public static String ambiguityLibrary = "library/ambiguity.dic";

	public static String userLibrary = "library/userLibrary.dic";

	public static String stopWordsLibrary = "src/main/resources/newWord/newWordFilter.dic";

	/**
	 * Whether the user dictionary skips words that are already present.
	 */
	public static boolean isSkipUserDefine = false;

	static {
		// Override the defaults above with the values from library.properties, if present.
		try {
			ResourceBundle rb = ResourceBundle.getBundle("library");
			if (rb.containsKey("defaultLibrary")) {
				defaultLibrary = rb.getString("defaultLibrary");
			}
			if (rb.containsKey("ambiguityLibrary")) {
				ambiguityLibrary = rb.getString("ambiguityLibrary");
			}
			// The key used to be misspelled "userLiberary", so this setting was never applied.
			if (rb.containsKey("userLibrary")) {
				userLibrary = rb.getString("userLibrary");
			}
			if (rb.containsKey("stopWordsLibrary")) {
				stopWordsLibrary = rb.getString("stopWordsLibrary");
			}
			if (rb.containsKey("isSkipUserDefine")) {
				isSkipUserDefine = Boolean.parseBoolean(rb.getString("isSkipUserDefine"));
			}
			if (rb.containsKey("isRealName")) {
				isRealName = Boolean.parseBoolean(rb.getString("isRealName"));
			}
		} catch (Exception e) {
			LIBRARYLOG.warning("not find library.properties in classpath use it by default !");
		}
	}

	/**
	 * Person-name dictionary.
	 *
	 * @return reader over "person/person.dic"
	 */
	public static BufferedReader getPersonReader() {
		return DicReader.getReader("person/person.dic");
	}

	/**
	 * Organization-name dictionary.
	 *
	 * @return reader over "company/company.data"
	 */
	public static BufferedReader getCompanReader() {
		return DicReader.getReader("company/company.data");
	}

	/**
	 * New-word frequency dictionary.
	 *
	 * @return reader over "newWord/new_word_freq.dic"
	 */
	public static BufferedReader getNewWordReader() {
		return DicReader.getReader("newWord/new_word_freq.dic");
	}

	/**
	 * Core dictionary.
	 *
	 * @return reader over "arrays.dic"
	 */
	public static BufferedReader getArraysReader() {
		return DicReader.getReader("arrays.dic");
	}

	/**
	 * Number dictionary.
	 *
	 * @return reader over "numberLibrary.dic"
	 */
	public static BufferedReader getNumberReader() {
		return DicReader.getReader("numberLibrary.dic");
	}

	/**
	 * English dictionary.
	 *
	 * @return reader over "englishLibrary.dic"
	 */
	public static BufferedReader getEnglishReader() {
		return DicReader.getReader("englishLibrary.dic");
	}

	/**
	 * Nature (part-of-speech) map.
	 *
	 * @return reader over "nature/nature.map"
	 */
	public static BufferedReader getNatureMapReader() {
		return DicReader.getReader("nature/nature.map");
	}

	/**
	 * Nature (part-of-speech) relation table.
	 *
	 * @return reader over "nature/nature.table"
	 */
	public static BufferedReader getNatureTableReader() {
		return DicReader.getReader("nature/nature.table");
	}

	/**
	 * Frequency dictionary of the single characters used in person names.
	 *
	 * @return reader over "person/name_freq.dic"
	 */
	public static BufferedReader getPersonFreqReader() {
		return DicReader.getReader("person/name_freq.dic");
	}

	/**
	 * Deserializes the name-frequency map stored in "person/asian_name_freq.data".
	 *
	 * @return the deserialized map, or an empty map when reading fails with an
	 *         IOException or ClassNotFoundException
	 */
	@SuppressWarnings("unchecked")
	public static Map getPersonFreqMap() {
		InputStream inputStream = null;
		ObjectInputStream objectInputStream = null;
		Map map = new HashMap(0);
		try {
			inputStream = DicReader.getInputStream("person/asian_name_freq.data");
			objectInputStream = new ObjectInputStream(inputStream);
			map = (Map) objectInputStream.readObject();
		} catch (IOException e) {
			warn("failed to read person/asian_name_freq.data", e);
		} catch (ClassNotFoundException e) {
			warn("failed to deserialize person/asian_name_freq.data", e);
		} finally {
			try {
				if (objectInputStream != null) {
					objectInputStream.close();
				}
				if (inputStream != null) {
					inputStream.close();
				}
			} catch (IOException e) {
				warn("failed to close person/asian_name_freq.data", e);
			}
		}
		return map;
	}

	/**
	 * Loads the word-to-word association data from "bigramdict.dic" into the
	 * {@code bigramEntryMap} of the matching dictionary items.
	 *
	 * <p>Each non-blank line is split on a tab into a "from@to" pair and a frequency.
	 * A side that has no dictionary item but contains "#" is mapped to the BEGIN (from)
	 * or END (to) item; pairs with an unresolved side are skipped. Loading stops at the
	 * first line whose frequency is not an integer.
	 */
	public static void initBigramTables() {
		BufferedReader reader = null;
		try {
			reader = IOUtil.getReader(DicReader.getInputStream("bigramdict.dic"), "UTF-8");
			String line;
			while ((line = reader.readLine()) != null) {
				if (StringUtil.isBlank(line)) {
					continue;
				}
				String[] columns = line.split("\t");
				int freq = Integer.parseInt(columns[1]);
				String[] pair = columns[0].split("@");

				AnsjItem fromItem = DATDictionary.getItem(pair[0]);
				AnsjItem toItem = DATDictionary.getItem(pair[1]);

				if (fromItem == AnsjItem.NULL && pair[0].contains("#")) {
					fromItem = AnsjItem.BEGIN;
				}
				if (toItem == AnsjItem.NULL && pair[1].contains("#")) {
					toItem = AnsjItem.END;
				}
				if (fromItem == AnsjItem.NULL || toItem == AnsjItem.NULL) {
					continue;
				}

				if (fromItem.bigramEntryMap == null) {
					fromItem.bigramEntryMap = new HashMap();
				}
				fromItem.bigramEntryMap.put(toItem.index, freq);
			}
		} catch (NumberFormatException e) {
			warn("bad frequency in bigramdict.dic", e);
		} catch (IOException e) {
			// Also covers UnsupportedEncodingException.
			warn("failed to read bigramdict.dic", e);
		} finally {
			IOUtil.close(reader);
		}
	}

	/**
	 * Returns the default CRF model, loading it from "crf/crf.model" on first use.
	 *
	 * <p>Loading happens under LOCK, and the lock is released on every path out of the
	 * locked section.
	 *
	 * @return the shared model, or null when loading failed
	 */
	public static SplitWord getCRFSplitWord() {
		SplitWord cached = crfSplitWord;
		if (cached != null) {
			return cached;
		}
		LOCK.lock();
		try {
			// Re-check inside the lock: another thread may have finished loading meanwhile.
			if (crfSplitWord == null) {
				long start = System.currentTimeMillis();
				LIBRARYLOG.info("begin init crf model!");
				crfSplitWord = new SplitWord(Model.loadModel(DicReader.getInputStream("crf/crf.model")));
				LIBRARYLOG.info("load crf crf use time:" + (System.currentTimeMillis() - start));
			}
		} catch (Exception e) {
			warn("failed to load crf/crf.model", e);
		} finally {
			LOCK.unlock();
		}
		return crfSplitWord;
	}

	/** Logs a warning together with its cause on LIBRARYLOG. */
	private static void warn(String message, Throwable cause) {
		LIBRARYLOG.log(java.util.logging.Level.WARNING, message, cause);
	}

}

    测试用例:
package org.ansj.demo;

import java.util.List;

import org.ansj.domain.Term;
import org.ansj.splitWord.analysis.NlpAnalysis;
import org.ansj.util.FilterModifWord;

/**
 * Demo: segments a sample sentence with NlpAnalysis, prints the raw result, then prints
 * the result again after it has gone through FilterModifWord.modifResult.
 */
public class StopWordDemo {
	public static void main(String[] args) {
		// Extra stop words can be registered first, e.g. FilterModifWord.insertStopWord("五一").
		List segmented = NlpAnalysis.parse("your五一,劳动节快乐");
		System.out.println(segmented);

		List filtered = FilterModifWord.modifResult(segmented);
		System.out.println(filtered);
	}
}


程序猿行业技术生活交流群:181287753(指尖天下),欢迎大伙加入交流学习。
  • ansj_seg源码分析之用户自定义词库_第3张图片
  • 大小: 11.3 KB
  • ansj_seg源码分析之用户自定义词库_第4张图片
  • 大小: 8.8 KB
  • 查看图片附件

你可能感兴趣的:(中文分词,ansj源码,自定义词库,开源,java)