Simple Word Segmentation Algorithms
1. Full Segmentation
Whenever a substring of the input is found in the dictionary, emit it as a word, so every dictionary word contained in the text is listed (a sketch follows).
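A minimal sketch of the idea, using a plain Set<String> as a stand-in dictionary (the version in section 7 below uses HanLP's Map<String, CoreDictionary.Attribute> instead; the class name and dictionary contents here are made up for illustration):

import java.util.*;

public class FullySegmentDemo {
    // Enumerate every substring and keep the ones found in the dictionary.
    static List<String> segmentFully(String text, Set<String> dictionary) {
        List<String> wordList = new LinkedList<>();
        for (int i = 0; i < text.length(); i++) {
            for (int j = i + 1; j <= text.length(); j++) {
                String word = text.substring(i, j);
                if (dictionary.contains(word)) {
                    wordList.add(word);
                }
            }
        }
        return wordList;
    }

    public static void main(String[] args) {
        // Toy dictionary for illustration only.
        Set<String> dict = new HashSet<>(Arrays.asList("北京", "真好", "好玩", "真好玩"));
        System.out.println(segmentFully("北京真好玩", dict));  // [北京, 真好, 真好玩, 好玩]
    }
}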
2. Forward Maximum Matching
Scan from left to right, at each position matching the longest dictionary word that starts there. For example, with 北京真好玩:
i = 0: candidates 北, 北京 (longest so far), 北京真, 北京真好, 北京真好玩 → emit 北京, then i = i + length(longestWord) = 0 + 2 = 2
i = 2: candidates 真, 真好, 真好玩 (longest) → emit 真好玩, then i = 2 + 3 = 5 and the scan ends.
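A minimal sketch of this loop, again with a toy Set<String> dictionary (the HanLP-backed version is maxPosMatch in section 7):

import java.util.*;

public class ForwardLongestDemo {
    static List<String> maxPosMatch(String text, Set<String> dictionary) {
        List<String> wordList = new LinkedList<>();
        for (int i = 0; i < text.length(); ) {
            // Start with the single character at i, then try to extend the match.
            String longestWord = text.substring(i, i + 1);
            for (int j = i + 1; j <= text.length(); j++) {
                String word = text.substring(i, j);
                if (dictionary.contains(word) && word.length() > longestWord.length()) {
                    longestWord = word;
                }
            }
            wordList.add(longestWord);
            i += longestWord.length();  // jump past the word just emitted
        }
        return wordList;
    }

    public static void main(String[] args) {
        // Toy dictionary for illustration only.
        Set<String> dict = new HashSet<>(Arrays.asList("北京", "真好", "真好玩"));
        System.out.println(maxPosMatch("北京真好玩", dict));  // [北京, 真好玩]
    }
}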
3. Backward Maximum Matching
Scan from right to left, at each position matching the longest dictionary word that ends there. For the same text 北京真好玩:
i = 4: candidates 北京真好玩, 京真好玩, 真好玩 (longest), 好玩, 玩 → emit 真好玩, then i = i - length(longestWord) = 4 - 3 = 1
and so on, symmetrically to the forward case.
4. Bidirectional Maximum Matching
Run both the forward and the backward pass and keep the result with fewer words; if the word counts are equal, keep the result with fewer single-character words (see the sketch below).
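A minimal sketch of the selection rule on its own, assuming posMatch and negMatch are the outputs of the forward and backward passes (the complete biSegement method is in section 7):

import java.util.*;

public class BiSegmentRule {
    // Count single-character words; fewer single characters usually means a better segmentation.
    static int countSingleChar(List<String> wordList) {
        int size = 0;
        for (String word : wordList) {
            if (word.length() == 1) size++;
        }
        return size;
    }

    // Prefer the result with fewer words; on a tie, prefer the one with fewer single-character words.
    static List<String> choose(List<String> posMatch, List<String> negMatch) {
        if (posMatch.size() != negMatch.size()) {
            return posMatch.size() < negMatch.size() ? posMatch : negMatch;
        }
        return countSingleChar(posMatch) <= countSingleChar(negMatch) ? posMatch : negMatch;
    }
}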
5. Improving Segmenter Performance
Still working through this part (it's hard T_T). The gist: if the dictionary is stored in a TreeMap, word lookups are slow; storing it in a trie (prefix tree) makes lookups, and therefore segmentation, noticeably faster. https://blog.csdn.net/johnny901114/article/details/80711441 is a tutorial on tries, but note that its trie is itself built on top of TreeMap, so just take away the general idea.
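A minimal trie sketch, assuming all the segmenter needs from the dictionary is containsKey (which is the only operation the matching methods in section 7 actually call); the class is illustrative, not HanLP's implementation:

import java.util.*;

public class SimpleTrie {
    // One node per character; a flag marks the end of a dictionary word.
    private static class Node {
        Map<Character, Node> children = new HashMap<>();
        boolean isWord;
    }

    private final Node root = new Node();

    public void add(String word) {
        Node node = root;
        for (char c : word.toCharArray()) {
            node = node.children.computeIfAbsent(c, k -> new Node());
        }
        node.isWord = true;
    }

    // Lookup cost grows with the word length, not with the dictionary size.
    public boolean containsKey(String word) {
        Node node = root;
        for (char c : word.toCharArray()) {
            node = node.children.get(c);
            if (node == null) return false;
        }
        return node.isWord;
    }

    public static void main(String[] args) {
        SimpleTrie trie = new SimpleTrie();
        trie.add("北京");
        trie.add("真好玩");
        System.out.println(trie.containsKey("北京"));    // true
        System.out.println(trie.containsKey("北京真"));  // false
    }
}

HanLP's BinTrie, used in the speed test of section 7, is a more elaborate take on the same idea, which is why the wrapper there only has to delegate containsKey.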
6. Evaluating Segmentation Accuracy
The main metrics are: P (precision), R (recall), F1 (the harmonic mean of P and R), OOV_R (recall on out-of-vocabulary words, i.e. words not in the dictionary), and IV_R (recall on in-vocabulary words).
P = TP/(TP+FP) measures, from the prediction's side, how much of what was predicted is correct, while R = TP/(TP+FN) measures, from the ground-truth side, how much of what is actually correct was predicted. In word segmentation, however, the gold standard and the segmenter output need not contain the same number of words, and the confusion matrix is defined for classification, whereas Chinese word segmentation is a chunking problem. So treat the gold standard as set A (the ground-truth side) and the segmenter output as set B (the prediction side); the chunks on which the two coincide are the true positives. This gives:
TP∪FN = A, TP∪FP = B, TP = A∩B
P = |A∩B|/|B|
R = |A∩B|/|A|
Example from the book:
Gold standard (A): 结婚 的 和 尚未 结婚 的
Segmenter output (B): 结婚 的 和尚 未结婚 的
Overlap (A∩B): 结婚、的、的
So P = 3/5 = 0.6, R = 3/6 = 0.5, F1 = 2*0.6*0.5/(0.6+0.5) ≈ 0.55.
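To make the chunk intersection concrete, here is a small sketch that converts each segmentation into (start, end) character spans and counts the spans shared by the two sides; it reproduces the numbers above (the CWSEvaluator below computes the same quantities incrementally instead of building explicit sets):

import java.util.*;

public class PrfDemo {
    // Turn a space-separated segmentation into a set of "start,end" character spans.
    static Set<String> toRegions(String segmented) {
        Set<String> regions = new LinkedHashSet<>();
        int offset = 0;
        for (String word : segmented.split("\\s+")) {
            regions.add(offset + "," + (offset + word.length()));
            offset += word.length();
        }
        return regions;
    }

    public static void main(String[] args) {
        String gold = "结婚 的 和 尚未 结婚 的";  // A, 6 words
        String pred = "结婚 的 和尚 未结婚 的";   // B, 5 words
        Set<String> a = toRegions(gold);
        Set<String> b = toRegions(pred);
        Set<String> tp = new LinkedHashSet<>(a);
        tp.retainAll(b);                          // A ∩ B
        double p = tp.size() / (double) b.size();
        double r = tp.size() / (double) a.size();
        System.out.printf("P=%.2f R=%.2f F1=%.2f%n", p, r, 2 * p * r / (p + r));
        // P=0.60 R=0.50 F1=0.55
    }
}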
7. Code Implementation
[Tokenizer]
package HanLpTest;
import com.hankcs.hanlp.collection.trie.bintrie.BinTrie;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.dictionary.CoreDictionary;
import java.io.IOException;
import java.util.*;
public class HanLpCut {
    // Full segmentation: list every dictionary word that occurs anywhere in the text.
    public static List<String> segementFully(String text, Map<String, CoreDictionary.Attribute> dictionary){
        List<String> wordList = new LinkedList<String>();
        for(int i = 0; i < text.length(); i++){
            for(int j = i + 1; j <= text.length(); j++){
                String word = text.substring(i, j);
                if(dictionary.containsKey(word)){
                    wordList.add(word);
                }
            }
        }
        return wordList;
    }
    // Forward maximum matching: at each position take the longest dictionary word starting there.
    public static List<String> maxPosMatch(String text, Map<String, CoreDictionary.Attribute> dictionary){
        List<String> wordList = new LinkedList<String>();
        for(int i = 0; i < text.length(); ){
            String longestWord = text.substring(i, i + 1);
            for(int j = i + 1; j <= text.length(); j++){
                String word = text.substring(i, j);
                if(dictionary.containsKey(word)){
                    if(word.length() > longestWord.length()){
                        longestWord = word;
                    }
                }
            }
            wordList.add(longestWord);
            i += longestWord.length();
        }
        return wordList;
    }
    // Backward maximum matching: at each position take the longest dictionary word ending there.
    public static List<String> maxNegMatch(String text, Map<String, CoreDictionary.Attribute> dictionary){
        List<String> wordList = new LinkedList<String>();
for(int i=text.length()-1;i>=0;){
String longestWord = text.substring(i,i+1);
for(int j=0;j<=i;j++){
String word = text.substring(j,i+1);
if(dictionary.containsKey(word)){
if(word.length()>longestWord.length()){
longestWord = word;
}
}
}
wordList.add(0,longestWord);
i -= longestWord.length();
}
return wordList;
}
    // Count single-character words in a segmentation result.
    public static int countSingleChar(List<String> wordList){
int size = 0;
for (String word : wordList){
if (word.length() == 1){
size += 1;
}
}
return size;
}
    // Bidirectional matching: prefer the result with fewer words, then the one with fewer single characters.
    public static List<String> biSegement(String text, Map<String, CoreDictionary.Attribute> dictionary){
        List<String> posMatch = maxPosMatch(text, dictionary);
        List<String> negMatch = maxNegMatch(text, dictionary);
        if(posMatch.size() < negMatch.size()){
            return posMatch;
        }
        else if(posMatch.size() > negMatch.size()){
            return negMatch;
        }
        else {
            if(countSingleChar(posMatch) > countSingleChar(negMatch)){
                return negMatch;
            }
            else {
                return posMatch;
            }
        }
    }
    // Rough speed test: segment the same sentence many times and report throughput.
    public static void evaluateSpeed(Map<String, CoreDictionary.Attribute> dictionary){
        String text = "江西鄱阳湖干枯,中国最大淡水湖变成大草原";
        long start;
        double costTime;
        final int pressure = 10000;
        start = System.currentTimeMillis();
        for (int i = 0; i < pressure; i++){
            maxPosMatch(text, dictionary);
        }
        costTime = (System.currentTimeMillis() - start) / 1000.0;
        System.out.printf("%.2f characters/second%n", text.length() * pressure / costTime);
    }
    public static void main(String[] args) throws IOException {
        TreeMap<String, CoreDictionary.Attribute> dictionary =
                IOUtil.loadDictionary("E:\\NLP\\data\\dictionary\\CoreNatureDictionary.txt");
        final BinTrie<CoreDictionary.Attribute> binTrie = new BinTrie<CoreDictionary.Attribute>(dictionary);
        // Expose the trie through the Map interface; only containsKey is needed by the matching methods.
        Map<String, CoreDictionary.Attribute> binTrieMap = new Map<String, CoreDictionary.Attribute>() {
@Override
public int size() {
return 0;
}
@Override
public boolean isEmpty() {
return false;
}
@Override
public boolean containsKey(Object key) {
return binTrie.containsKey((String) key);
}
@Override
public boolean containsValue(Object value) {
return false;
}
@Override
public CoreDictionary.Attribute get(Object key) {
return null;
}
@Override
public CoreDictionary.Attribute put(String key, CoreDictionary.Attribute value) {
return null;
}
@Override
public CoreDictionary.Attribute remove(Object key) {
return null;
}
@Override
            public void putAll(Map<? extends String, ? extends CoreDictionary.Attribute> m) {
}
@Override
public void clear() {
}
@Override
            public Set<String> keySet() {
return null;
}
@Override
            public Collection<CoreDictionary.Attribute> values() {
return null;
}
@Override
            public Set<Map.Entry<String, CoreDictionary.Attribute>> entrySet() {
return null;
}
};
// System.out.printf("词典大小:%d个词条\n",dictionary.size());
// System.out.printf(dictionary.keySet().iterator().next());
// String text = "研究生命起源";
// List fully = segementFully(text, dictionary);
// List posMatch = maxPosMatch(text, dictionary);
// List negMatch = maxNegMatch(text, dictionary);
// List biSegement = biSegement(text, dictionary);
//
// System.out.println(fully);
// System.out.println(posMatch);
// System.out.println(negMatch);
// System.out.println(biSegement);
evaluateSpeed(binTrieMap);
}
}
[Evaluation]
package HanLpTest;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.seg.Other.DoubleArrayTrieSegment;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;
import java.io.BufferedWriter;
import java.io.IOException;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
public class CWSEvaluator {
    // A = gold standard, B = segmenter output; A_cap_B_size = |A ∩ B|
    private int A_size, B_size, A_cap_B_size, OOV, OOV_R, IV, IV_R;
    private Set<String> dic;
    public CWSEvaluator(){}
    public CWSEvaluator(Set<String> dic){
        this.dic = dic;
    }
    public CWSEvaluator(String dictPath) throws IOException {
        this(new TreeSet<String>());
if (dictPath == null) return;
try{
IOUtil.LineIterator lineIterator = new IOUtil.LineIterator(dictPath);
for(String word : lineIterator){
word = word.trim();
if(word.isEmpty()) continue;
dic.add(word);
}
}
catch (Exception e){
throw new IOException(e);
}
}
    // Compute P, R, F1 and the OOV/IV recall
    // TP ∪ FN = A
    // TP ∪ FP = B
    // P = |A∩B|/|B|   R = |A∩B|/|A|
public Result getResult(boolean percentage){
        float p = A_cap_B_size / (float) B_size;
        float r = A_cap_B_size / (float) A_size;
if(percentage){
p *= 100;
r *= 100;
}
float oov_r = Float.NaN;
if(OOV > 0){
oov_r = OOV_R / (float) OOV;
if(percentage){
oov_r *= 100;
}
}
float iv_r = Float.NaN;
if(IV>0){
iv_r = IV_R / (float) IV;
if(percentage){
iv_r *= 100;
}
}
return new Result(p,r,2*p*r/(p+r),oov_r,iv_r);
}
public static class Result{
float P,R,F1,OOV_R,IV_R;
public Result(float p, float r, float f1, float OOV_R, float IV_R) {
P = p;
R = r;
F1 = f1;
this.OOV_R = OOV_R;
this.IV_R = IV_R;
}
@Override
public String toString() {
return String.format("P:%.2f R:%.2f F1:%.2f OOV-R:%.2f IV-R:%.2f", P, R, F1, OOV_R, IV_R);
}
}
public Result getResult(){
return getResult(true);
}
    // Compare a gold-standard line with the corresponding predicted line.
    // Each time a predicted word aligns exactly with a gold word, |A ∩ B| is incremented,
    // and the word counts towards IV or OOV recall depending on whether it is in the dictionary.
public void compare(String gold,String pred){
String[] wordArray = gold.split("\\s+");
A_size += wordArray.length;
String[] predArray = pred.split("\\s+");
B_size += predArray.length;
int goldIndex = 0, predIndex = 0;
int goldLen = 0,predLen = 0;
while (goldIndex < wordArray.length && predIndex < predArray.length){
if(goldLen == predLen){
if(wordArray[goldIndex].equals(predArray[predIndex])){
if(dic != null){
if(dic.contains(wordArray[goldIndex])){
IV_R += 1;
}
else {
OOV_R += 1;
}
}
                        A_cap_B_size++;
                        goldLen += wordArray[goldIndex].length();
                        predLen += predArray[predIndex].length();
goldIndex++;
predIndex++;
}
else {
goldLen += wordArray[goldIndex].length();
predLen += predArray[predIndex].length();
goldIndex++;
predIndex++;
}
}
else if(goldLen < predLen){
goldLen += wordArray[goldIndex].length();
goldIndex++;
}
else {
predLen += predArray[predIndex].length();
predIndex++;
}
}
if(dic != null){
for (String word : wordArray){
if(dic.contains(word)){
IV += 1;
}
else {
OOV += 1;
}
}
}
}
public static Result evaluate(Segment segment,String outputPath,String goldFile,String dictPath) throws IOException {
IOUtil.LineIterator lineIterator = new IOUtil.LineIterator(goldFile);
BufferedWriter bw = IOUtil.newBufferedWriter(outputPath);
for(String line:lineIterator){
            List<Term> termList = segment.seg(line.replaceAll("\\s+", ""));
int i=0;
for (Term term : termList){
bw.write(term.word);
if(++i != termList.size()){
bw.write(" ");
}
}
bw.newLine();
}
bw.close();
Result result = CWSEvaluator.evaluate(goldFile, outputPath, dictPath);
return result;
}
public static Result evaluate(String goldFile, String predFile, String dictPath) throws IOException {
IOUtil.LineIterator goldIter = new IOUtil.LineIterator(goldFile);
IOUtil.LineIterator predIter = new IOUtil.LineIterator(predFile);
CWSEvaluator evaluator = new CWSEvaluator(dictPath);
while (goldIter.hasNext() && predIter.hasNext()){
evaluator.compare(goldIter.next(),predIter.next());
}
return evaluator.getResult();
}
public static void main(String[] args) throws IOException {
String dictPath = "C:\\Users\\dell\\Desktop\\icwb2-data\\gold\\msr_training_words.utf8";
DoubleArrayTrieSegment segment = (DoubleArrayTrieSegment) new DoubleArrayTrieSegment(dictPath).enablePartOfSpeechTagging(true);
IOUtil.LineIterator lineIterator = new IOUtil.LineIterator("C:\\Users\\dell\\Desktop\\icwb2-data\\testing\\msr_test.utf8");
String pred = "C:\\Users\\dell\\Desktop\\msr_output.txt";
BufferedWriter bw = IOUtil.newBufferedWriter(pred);
for (String line : lineIterator){
for (Term term:segment.seg(line)){
bw.write(term.word);
bw.write(" ");
}
bw.newLine();
}
bw.close();
Result evaluate = CWSEvaluator.evaluate("C:\\Users\\dell\\Desktop\\icwb2-data\\gold\\msr_test_gold.utf8", pred, dictPath);
System.out.println(evaluate);
}
}