Stanford CoreNLP 源码学习(1)

代码


/**
 * Minimal walkthrough of a Stanford CoreNLP pipeline that ends with OpenIE.
 *
 * <p>The {@code openie} annotator needs {@code tokenize, ssplit, pos, lemma, depparse, natlog}
 * to have run before it; {@code ner} is added here only so that named-entity tags can be printed.
 *
 * <p>For every sentence of a fixed sample text the program prints the tokens with their POS tag,
 * lemma and named-entity tag, the basic and enhanced dependency graphs, the OpenIE relation
 * triples, and the clauses produced by the OpenIE clause splitter.
 */
public class Try1 {
    /**
     * Builds the pipeline, annotates the sample text and dumps the annotations to standard output.
     *
     * @param args ignored
     */
    public static void main(String[] args){
        Properties props = new Properties();         // Properties is a map-like structure of String settings
        props.put("annotators", "tokenize, ssplit, pos, lemma, ner, depparse, natlog, openie");
        /*
         * Overview of commonly used annotators:
         *   tokenize     Tokenizes the text into a sequence of tokens (word segmentation).
         *   ssplit       Splits a sequence of tokens into sentences.
         *   cleanxml     Removes most or all XML tags from the document.
         *   truecase     Determines the likely true case of tokens in text.
         *   pos          Part of speech: labels tokens with their POS tag (CC, DT, JJR, TO, VB, ...).
         *   lemma        Lemmatization: the base form of a word, e.g. sings -> sing, is -> be.
         *   gender       Adds likely gender information to names.
         *   ner          Named entity recognizer: tags tokens as ORGANIZATION, LOCATION, etc.
         *                (Time, Location, Organization, Person, Money, Percent, Date).
         *   parse        Finds the syntactic structure of a sentence: which words group together and
         *                which words are the subject or object of a verb.
         *   depparse     Neural network dependency parser (yields dependency graphs, no constituency tree).
         *   sentiment    Sentiment analysis with a compositional model over trees using deep learning.
         *   natlog       Natural logic, e.g. "some cute rabbits are small" entails "some rabbits are small".
         *   dcoref       Coreference resolution: mention detection plus pronominal and nominal coreference.
         *   openie       Open information extraction: extracts relation triples.
         */

        // Choose ONE of the following two ways to build the pipeline.
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);                  // build the pipeline from props
//      StanfordCoreNLP pipeline = new StanfordCoreNLP(
//              PropertiesUtils.asProperties(
//                  "annotators", "tokenize,ssplit,pos,lemma,ner,depparse,natlog,openie",
//                  "ssplit.isOneSentence", "true",
//                  "parse.model", "edu/stanford/nlp/models/srparser/englishSR.ser.gz",
//                  "tokenize.language", "en"));

        String text = "Stanford University is located in Stanford which is one of best good universities in 2015.\n";
        Annotation doc = new Annotation(text);                                 // wrap the raw string in an Annotation
        // Run every configured annotator over the text; afterwards doc carries all the results.
        pipeline.annotate(doc);

        // Build the OpenIE instance once, outside the sentence loop: constructing it loads the
        // clause-splitter model, which would otherwise be reloaded for every sentence.
        OpenIE openIE = new OpenIE(props);

        int sentNo = 0;
        // A CoreMap is essentially a Map that uses class objects as keys and has values with custom types.
        for(CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)){            // each sentence of doc
            System.out.println("Sentence #" + ++sentNo + ": " + sentence.get(TextAnnotation.class)); // original sentence text
            System.out.println("word\tpos\tlema\tne");
            // A CoreLabel is a CoreMap with additional token-specific methods.
            for(CoreLabel token : sentence.get(TokensAnnotation.class)){                       // each token of the sentence
                String word = token.get(TextAnnotation.class);                                 // surface form
                String lema = token.get(LemmaAnnotation.class);                                // lemma
                String pos = token.get(PartOfSpeechAnnotation.class);                          // part-of-speech tag
                String ne = token.get(NamedEntityTagAnnotation.class);                         // named-entity tag

                System.out.println(word + "\t" + pos + "\t" + lema + "\t" + ne);
            }

            // The constituency parse tree is only available with the "parse" annotator;
            // depparse does not produce one.
//          System.out.println("--------tree--------");
//          Tree tree = sentence.get(TreeAnnotation.class);
//          System.out.println(tree.toString());


            // SemanticGraph, basic dependencies
            System.out.println("--------SemanticGraph BasicDependencies--------");
            SemanticGraph dependencies = sentence.get(BasicDependenciesAnnotation.class);
            System.out.println(dependencies.toString());

            // SemanticGraph, enhanced dependencies of the current sentence,
            // printed in two of the SemanticGraph.OutputFormat styles.
            SemanticGraph enhanced = sentence.get(EnhancedDependenciesAnnotation.class);
            System.out.println("--------SemanticGraph EnhancedDependencies LIST-format--------");
            System.out.println(enhanced.toString(SemanticGraph.OutputFormat.LIST));
            System.out.println("--------SemanticGraph EnhancedDependencies READABLE-format--------");
            System.out.println(enhanced.toString(SemanticGraph.OutputFormat.READABLE));


            System.out.println("--------RelationTriple--------");
            // Get the OpenIE triples for the sentence.
            Collection<RelationTriple> triples = sentence.get(NaturalLogicAnnotations.RelationTriplesAnnotation.class);
            for(RelationTriple triple : triples) {
                // Use String separators: confidence is numeric, so "confidence + '\t'" would ADD the
                // char code 9 to it (printing e.g. 10.0 with no tab) instead of concatenating.
                System.out.println(triple.confidence + "\t" + triple.subjectLemmaGloss() + "\t" + triple.relationLemmaGloss()
                                                     + "\t" + triple.objectLemmaGloss());
            }

            // Alternately, to only run e.g. the clause splitter:
            System.out.println("--------clauses--------");
            List<SentenceFragment> clauses = openIE.clausesInSentence(sentence);
            for(SentenceFragment clause : clauses){
                System.out.println("LIST");
                System.out.println(clause.parseTree.toString(SemanticGraph.OutputFormat.LIST));
                System.out.println("READABLE");
                System.out.println(clause.parseTree.toString(SemanticGraph.OutputFormat.READABLE));

            }

            System.out.println();
        }
        System.out.println("end!");
    }
}

输出结果

Adding annotator tokenize
No tokenizer type provided. Defaulting to PTBTokenizer.
Adding annotator ssplit
Adding annotator pos
Reading POS tagger model from edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger ... done [1.1 sec].
Adding annotator lemma
Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [2.0 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [3.5 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz ... done [0.9 sec].
Adding annotator depparse
Loading depparse model file: edu/stanford/nlp/models/parser/nndep/english_UD.gz ... 
PreComputed 99996, Elapsed Time: 22.545 (s)
Initializing dependency parser ... done [28.8 sec].
Adding annotator natlog
Adding annotator openie
Loading clause splitter from edu/stanford/nlp/models/naturalli/clauseSearcherModel.ser.gz ... done [0.0187 seconds]
Sentence #1: Stanford University is located in Stanford which is one of best good universities in 2015.
word    pos lema    ne
Stanford    NNP Stanford    ORGANIZATION
University  NNP University  ORGANIZATION
is  VBZ be  O
located JJ  located O
in  IN  in  O
Stanford    NNP Stanford    LOCATION
which   WDT which   O
is  VBZ be  O
one CD  one NUMBER
of  IN  of  O
best    RB  best    O
good    JJ  good    O
universities    NNS university  O
in  IN  in  O
2015    CD  2015    DATE
.   .   .   O
--------SemanticGraph BasicDependencies--------
-> located/JJ (root)
  -> University/NNP (nsubjpass)
    -> Stanford/NNP (compound)
  -> is/VBZ (auxpass)
  -> Stanford/NNP (nmod)
    -> in/IN (case)
    -> one/CD (acl:relcl)
      -> which/WDT (nsubj)
      -> is/VBZ (cop)
      -> universities/NNS (nmod)
        -> of/IN (case)
        -> best/RB (advmod)
        -> good/JJ (amod)
        -> 2015/CD (nmod)
          -> in/IN (case)
  -> ./. (punct)

--------SemanticGraph EnhancedDependencies LIST-format--------
root(ROOT-0, located-4)
compound(University-2, Stanford-1)
nsubjpass(located-4, University-2)
auxpass(located-4, is-3)
case(Stanford-6, in-5)
nmod:in(located-4, Stanford-6)
nsubj(one-9, Stanford-6)
ref(Stanford-6, which-7)
cop(one-9, is-8)
acl:relcl(Stanford-6, one-9)
case(universities-13, of-10)
advmod(universities-13, best-11)
amod(universities-13, good-12)
nmod:of(one-9, universities-13)
case(2015-15, in-14)
nmod:in(universities-13, 2015-15)
punct(located-4, .-16)

--------SemanticGraph EnhancedDependencies READABLE-format--------
dep                 reln                gov                 
---                 ----                ---                 
located/JJ-4        root                root                
Stanford/NNP-1      compound            University/NNP-2    
University/NNP-2    nsubjpass           located/JJ-4        
is/VBZ-3            auxpass             located/JJ-4        
in/IN-5             case                Stanford/NNP-6      
Stanford/NNP-6      nmod:in             located/JJ-4        
Stanford/NNP-6      nsubj               one/CD-9            
which/WDT-7         ref                 Stanford/NNP-6      
is/VBZ-8            cop                 one/CD-9            
one/CD-9            acl:relcl           Stanford/NNP-6      
of/IN-10            case                universities/NNS-13 
best/RB-11          advmod              universities/NNS-13 
good/JJ-12          amod                universities/NNS-13 
universities/NNS-13 nmod:of             one/CD-9            
in/IN-14            case                2015/CD-15          
2015/CD-15          nmod:in             universities/NNS-13 
./.-16              punct               located/JJ-4        

--------RelationTriple--------
10.0Stanford University be  located
10.0good university be in   2015
--------clauses--------
Loading clause splitter from edu/stanford/nlp/models/naturalli/clauseSearcherModel.ser.gz ... done [0.0638 seconds]
LIST
root(ROOT-0, located-4)
compound(University-2, Stanford-1)
nsubjpass(located-4, University-2)
auxpass(located-4, is-3)
case(Stanford-6, in-5)
nmod:in(located-4, Stanford-6)
ref(Stanford-6, which-7)
cop(one-9, is-8)
acl:relcl(Stanford-6, one-9)
case(universities-13, of-10)
advmod(universities-13, best-11)
amod(universities-13, good-12)
nmod:of(one-9, universities-13)
case(2015-15, in-14)
nmod:in(universities-13, 2015-15)

READABLE
dep                 reln                gov                 
---                 ----                ---                 
located/JJ-4        root                root                
Stanford/NNP-1      compound            University/NNP-2    
University/NNP-2    nsubjpass           located/JJ-4        
is/VBZ-3            auxpass             located/JJ-4        
in/IN-5             case                Stanford/NNP-6      
Stanford/NNP-6      nmod:in             located/JJ-4        
which/WDT-7         ref                 Stanford/NNP-6      
is/VBZ-8            cop                 one/CD-9            
one/CD-9            acl:relcl           Stanford/NNP-6      
of/IN-10            case                universities/NNS-13 
best/RB-11          advmod              universities/NNS-13 
good/JJ-12          amod                universities/NNS-13 
universities/NNS-13 nmod:of             one/CD-9            
in/IN-14            case                2015/CD-15          
2015/CD-15          nmod:in             universities/NNS-13 


end!

你可能感兴趣的:(nlp)