何晗自然语言处理入门学习(一)

分词的简单算法

1.完全切分

即在字典中查找到某个词 ,就切分

2.正向切分

从左到右开始匹配最大长度词,例如北京真好玩,

i = 0 时:北、北京 (当前最大词)、北京真、北京真好、北京真好玩 i = i + length(longestWord)  = 0 + 2 = 2

i = 2时:真、真好、真好玩(最大词)i = 5

3.逆向切分

从右到左开始匹配最大长度词,例如北京真好玩,

i = 4时,北京真好玩、京真好玩、真好玩(最大词)、好玩、玩 i = i - length(longestWord) = 4 - 3 = 1

同理

4.双向切分

左右各切一遍,谁的结果分词数量少取谁,若分词数量一样,则取分词结果中单字少的结果。

5.提升分词器性能

还在研究中,有点难 T T。大意是:若用 TreeMap 存储字典,查询词条的效率较低;改用字典树(Trie)存储字典,词的查询效率会明显提升,从而使分词速度明显提升。字典树入门可参考 https://blog.csdn.net/johnny901114/article/details/80711441 ,不过该博客的字典树内部仍是用 TreeMap 实现的,了解思路即可。

6.分词器性能评估

主要指标有:P精确率、R召回率、F1调和平均值、OOV_R:未登录词的召回率、IV_R:登录词的召回率

因为 P = TP/(TP+FP),是从预测的角度衡量预测结果的准确性;而 R = TP/(TP+FN),是从事实的角度衡量真实正确的结果有多少被预测出来。在分词问题中,标准答案和分词结果的单词数不一定相等,且混淆矩阵针对的是分类问题,而中文分词是分块(chunking)问题。因此将标准答案的词集合记为 A(事实角度),分词结果的词集合记为 B(预测角度),两者重合的部分即为 TP(预测正确的部分),故有以下公式:

                                                          TP∪FN = A、TP∪FP = B、TP = A∩B

                                                                           P = |A∩B|/|B| 

                                                                           R = |A∩B|/|A|

 

书上例子:
标准答案(A):结婚 的 和 尚未 结婚 的

分词结果(B):结婚 的 和尚 未结婚 的

重合部分(A∩B):结婚、的、的

则P = 3/5=0.6、R=3/6=0.5、F1=2*0.6*0.5/(0.6+0.5) = 0.55

7.代码实现

[分词器]

package HanLpTest;

import com.hankcs.hanlp.collection.trie.bintrie.BinTrie;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.dictionary.CoreDictionary;
import org.antlr.v4.runtime.ListTokenSource;

import java.io.IOException;
import java.util.*;

public class HanLpCut {
    public static List segementFully(String text,Map dictionary){
        List wordList = new LinkedList();

        for(int i=0;i maxPosMatch(String text,Map dictionary){
        List wordList = new LinkedList();

        for(int i=0;i longestWord.length()){
                        longestWord = word;
                    }
                }
            }
            wordList.add(longestWord);
            i += longestWord.length();
        }
        return wordList;
    }

    public static List maxNegMatch(String text,Map dictionary){
        List wordList = new LinkedList();
        for(int i=text.length()-1;i>=0;){
            String longestWord = text.substring(i,i+1);
            for(int j=0;j<=i;j++){
                String word = text.substring(j,i+1);
                if(dictionary.containsKey(word)){
                    if(word.length()>longestWord.length()){
                        longestWord = word;
                    }
                }
            }

            wordList.add(0,longestWord);
            i -= longestWord.length();
        }
        return wordList;
    }

    public static int countSingleChar(List wordList){
        int size = 0;
        for (String word : wordList){
            if (word.length() == 1){
                size += 1;
            }
        }
        return size;
    }

    public static List biSegement(String text,Map dictionary){
        List posMatch = maxPosMatch(text,dictionary);
        List negMatch = maxNegMatch(text,dictionary);

        if(posMatch.size()negMatch.size()){
            return negMatch;
        }
        else {
            if(countSingleChar(posMatch) > countSingleChar(negMatch)){
                return negMatch;
            }
            else {
                return posMatch;
            }
        }

    }

    public static void evaluateSpeed(Map dictionary){
        String text = "江西鄱阳湖干枯,中国最大淡水湖变成大草原";
        long start;
        double costTime;
        final int pressure = 10000;

        start = System.currentTimeMillis();
        for (int i=0;i dictionary = IOUtil.loadDictionary("E:\\NLP\\data\\dictionary\\CoreNatureDictionary.txt");
        final BinTrie binTrie = new BinTrie(dictionary);
        Map binTrieMap = new Map() {
            @Override
            public int size() {
                return 0;
            }

            @Override
            public boolean isEmpty() {
                return false;
            }

            @Override
            public boolean containsKey(Object key) {
                return binTrie.containsKey((String) key);
            }

            @Override
            public boolean containsValue(Object value) {
                return false;
            }

            @Override
            public CoreDictionary.Attribute get(Object key) {
                return null;
            }

            @Override
            public CoreDictionary.Attribute put(String key, CoreDictionary.Attribute value) {
                return null;
            }

            @Override
            public CoreDictionary.Attribute remove(Object key) {
                return null;
            }

            @Override
            public void putAll(Map m) {

            }

            @Override
            public void clear() {

            }

            @Override
            public Set keySet() {
                return null;
            }

            @Override
            public Collection values() {
                return null;
            }

            @Override
            public Set> entrySet() {
                return null;
            }
        };
//        System.out.printf("词典大小:%d个词条\n",dictionary.size());
//        System.out.printf(dictionary.keySet().iterator().next());
//        String text = "研究生命起源";
//        List fully = segementFully(text, dictionary);
//        List posMatch = maxPosMatch(text, dictionary);
//        List negMatch = maxNegMatch(text, dictionary);
//        List biSegement = biSegement(text, dictionary);
//
//        System.out.println(fully);
//        System.out.println(posMatch);
//        System.out.println(negMatch);
//        System.out.println(biSegement);

        evaluateSpeed(binTrieMap);
    }
}

[分词器评估(P/R/F1、OOV/IV 召回率)]

package HanLpTest;

import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.seg.Other.DoubleArrayTrieSegment;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;

import java.io.BufferedWriter;
import java.io.IOException;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;

public class CWSEvaluator {
    // A = gold-standard words, B = predicted words, A_cap_B = |A∩B| (correct words);
    // OOV/IV are totals of out-of/in-vocabulary gold words, *_R their recalled counts.
    private int A_size, B_size, A_cap_B_size, OOV, OOV_R, IV, IV_R;
    private Set<String> dic;

    public CWSEvaluator() {
    }

    public CWSEvaluator(Set<String> dic) {
        this.dic = dic;
    }

    /**
     * Build an evaluator whose IV dictionary is loaded from a file,
     * one word per line; blank lines are skipped.
     *
     * @param dictPath dictionary path, or null for no dictionary
     * @throws IOException if the file cannot be read
     */
    public CWSEvaluator(String dictPath) throws IOException {
        this(new TreeSet<String>());
        if (dictPath == null) return;

        try {
            IOUtil.LineIterator lineIterator = new IOUtil.LineIterator(dictPath);
            for (String word : lineIterator) {
                word = word.trim();
                if (word.isEmpty()) continue;
                dic.add(word);
            }
        } catch (Exception e) {
            throw new IOException(e);
        }
    }

    /**
     * Compute precision/recall/F1 plus OOV and IV recall.
     * TP∪FN = A, TP∪FP = B, so P = |A∩B|/|B| and R = |A∩B|/|A|.
     *
     * @param percentage scale ratios to 0–100 when true
     */
    public Result getResult(boolean percentage) {
        float p = A_cap_B_size / (float) B_size;
        float r = A_cap_B_size / (float) A_size;
        if (percentage) {
            p *= 100;
            r *= 100;
        }
        float oov_r = Float.NaN;
        if (OOV > 0) {
            oov_r = OOV_R / (float) OOV;
            if (percentage) {
                oov_r *= 100;
            }
        }

        float iv_r = Float.NaN;
        if (IV > 0) {
            iv_r = IV_R / (float) IV;
            if (percentage) {
                iv_r *= 100;
            }
        }
        return new Result(p, r, 2 * p * r / (p + r), oov_r, iv_r);
    }

    /** Immutable bundle of the evaluation metrics. */
    public static class Result {
        float P, R, F1, OOV_R, IV_R;

        public Result(float p, float r, float f1, float OOV_R, float IV_R) {
            P = p;
            R = r;
            F1 = f1;
            this.OOV_R = OOV_R;
            this.IV_R = IV_R;
        }

        @Override
        public String toString() {
            return String.format("P:%.2f R:%.2f F1:%.2f OOV-R:%.2f IV-R:%.2f", P, R, F1, OOV_R, IV_R);
        }
    }

    public Result getResult() {
        return getResult(true);
    }

    /**
     * Compare one gold line against one predicted line (both space-separated)
     * and accumulate the counters. A word counts toward |A∩B| only when its
     * start offset AND surface form match; offsets are tracked via goldLen/predLen
     * so the two streams stay aligned even when their word counts differ.
     */
    public void compare(String gold, String pred) {
        String[] wordArray = gold.split("\\s+");
        A_size += wordArray.length;

        String[] predArray = pred.split("\\s+");
        B_size += predArray.length;

        int goldIndex = 0, predIndex = 0;
        int goldLen = 0, predLen = 0;

        while (goldIndex < wordArray.length && predIndex < predArray.length) {
            if (goldLen == predLen) {
                if (wordArray[goldIndex].equals(predArray[predIndex])) {
                    if (dic != null) {
                        if (dic.contains(wordArray[goldIndex])) {
                            IV_R += 1;
                        } else {
                            OOV_R += 1;
                        }
                    }
                    A_cap_B_size++;
                    goldLen += wordArray[goldIndex].length();
                    // advance by the predicted word (same length as the gold word here)
                    predLen += predArray[predIndex].length();
                    goldIndex++;
                    predIndex++;
                } else {
                    goldLen += wordArray[goldIndex].length();
                    predLen += predArray[predIndex].length();
                    goldIndex++;
                    predIndex++;
                }
            } else if (goldLen < predLen) {
                // gold stream is behind: catch up
                goldLen += wordArray[goldIndex].length();
                goldIndex++;
            } else {
                // predicted stream is behind: catch up
                predLen += predArray[predIndex].length();
                predIndex++;
            }
        }

        // OOV/IV totals are counted over the gold words only
        if (dic != null) {
            for (String word : wordArray) {
                if (dic.contains(word)) {
                    IV += 1;
                } else {
                    OOV += 1;
                }
            }
        }
    }

    /**
     * Segment every line of goldFile (whitespace stripped) with the given
     * segmenter, write the space-joined result to outputPath, then score it
     * against the gold file.
     */
    public static Result evaluate(Segment segment, String outputPath, String goldFile, String dictPath) throws IOException {
        IOUtil.LineIterator lineIterator = new IOUtil.LineIterator(goldFile);
        // try-with-resources: the writer is closed even if segmentation throws
        try (BufferedWriter bw = IOUtil.newBufferedWriter(outputPath)) {
            for (String line : lineIterator) {
                List<Term> termList = segment.seg(line.replaceAll("\\s+", ""));
                int i = 0;
                for (Term term : termList) {
                    bw.write(term.word);
                    if (++i != termList.size()) {
                        bw.write(" ");
                    }
                }
                bw.newLine();
            }
        }
        return CWSEvaluator.evaluate(goldFile, outputPath, dictPath);
    }

    /**
     * Score predFile against goldFile line by line, using dictPath as the
     * IV dictionary for OOV/IV recall.
     */
    public static Result evaluate(String goldFile, String predFile, String dictPath) throws IOException {
        IOUtil.LineIterator goldIter = new IOUtil.LineIterator(goldFile);
        IOUtil.LineIterator predIter = new IOUtil.LineIterator(predFile);
        CWSEvaluator evaluator = new CWSEvaluator(dictPath);
        while (goldIter.hasNext() && predIter.hasNext()) {
            evaluator.compare(goldIter.next(), predIter.next());
        }
        return evaluator.getResult();
    }

    public static void main(String[] args) throws IOException {
        String dictPath = "C:\\Users\\dell\\Desktop\\icwb2-data\\gold\\msr_training_words.utf8";
        DoubleArrayTrieSegment segment =
                (DoubleArrayTrieSegment) new DoubleArrayTrieSegment(dictPath).enablePartOfSpeechTagging(true);
        IOUtil.LineIterator lineIterator =
                new IOUtil.LineIterator("C:\\Users\\dell\\Desktop\\icwb2-data\\testing\\msr_test.utf8");
        String pred = "C:\\Users\\dell\\Desktop\\msr_output.txt";
        try (BufferedWriter bw = IOUtil.newBufferedWriter(pred)) {
            for (String line : lineIterator) {
                for (Term term : segment.seg(line)) {
                    bw.write(term.word);
                    bw.write(" ");
                }
                bw.newLine();
            }
        }
        Result evaluate = CWSEvaluator.evaluate(
                "C:\\Users\\dell\\Desktop\\icwb2-data\\gold\\msr_test_gold.utf8", pred, dictPath);
        System.out.println(evaluate);
    }
}

 

你可能感兴趣的:(自然语言处理入门,自然语言处理入门)