自定义语法与CFG

#encoding=utf8

def exec_cmd(cmd):
    p = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        shell=True,
        env=ENVIRON)
    out, err = p.communicate()
    return out, err

import nltk,os,jieba
from nltk.tree import Tree
from nltk.draw import TreeWidget
from nltk.draw.tree import TreeView
from nltk.draw.util import CanvasFrame
from nltk.parse import RecursiveDescentParser


class Cfg():
    '''
    '''

    def setUp(self):
        pass

    def tearDown(self):
        pass

    def test_sample(self):
        print("test_sample")
        # This is a CFG grammar, where:
        # Start Symbol : S
        # Nonterminal : NP,VP,DT,NN,VB
        # Terminal : "I", "a" ,"saw" ,"dog"
        grammar = nltk.grammar.CFG.fromstring("""
            S -> NP VP
            NP -> DT NN | NN
            VP -> VB NP
            DT -> "a"
            NN -> "I" | "dog"
            VB -> "saw"
        """)
        sentence = "I saw a dog".split()
        parser = RecursiveDescentParser(grammar)
        final_tree = parser.parse(sentence)

        for i in final_tree:
            print(i)

    def test_nltk_cfg_qtype(self):
        print("test_nltk_cfg_qtype")
        gfile = os.path.join(
            curdir,
            os.path.pardir,
            "config",
            "grammar.question-type.cfg")
        question_grammar = nltk.data.load('file:%s' % gfile)

        def get_missing_words(grammar, tokens):
            """
            Find list of missing tokens not covered by grammar
            """
            missing = [tok for tok in tokens
                       if not grammar._lexical_index.get(tok)]
            return missing

        sentence = "what is your name"

        sent = sentence.split()
        missing = get_missing_words(question_grammar, sent)
        target = []
        for x in sent:
            if x in missing:
                continue
            target.append(x)

        rd_parser = RecursiveDescentParser(question_grammar)
        result = []
        print("target: ", target)
        for tree in rd_parser.parse(target):
            result.append(x)
            print("Question Type\n", tree)

        if len(result) == 0:
            print("Not Question Type")

    def cfg_en(self):
        print("test_nltk_cfg_en")              # 定义英文语法规则
        grammar = nltk.CFG.fromstring("""
         S -> NP VP
         VP -> V NP | V NP PP
         V -> "saw" | "ate"
         NP -> "John" | "Mary" | "Bob" | Det N | Det N PP
         Det -> "a" | "an" | "the" | "my"
         N -> "dog" | "cat" | "cookie" | "park"
         PP -> P NP
         P -> "in" | "on" | "by" | "with"
         """)

        sent = "Mary saw Bob".split()

        rd_parser = RecursiveDescentParser(grammar)

        result = []

        for i, tree in enumerate(rd_parser.parse(sent)):
            result.append(tree)

        assert len(result) > 0, " CFG tree parse fail."

        print(result)

    def cfg_zh(self):
       
        grammar = nltk.CFG.fromstring("""
             S -> N VP
             VP -> V NP | V NP | V N
             V -> "尊敬"
             N -> "我们" | "老师" 
             """)

        sent = "我们 尊敬 老师".split()
        rd_parser = RecursiveDescentParser(grammar)

        result = []

        for i, tree in enumerate(rd_parser.parse(sent)):
            result.append(tree)
            print("Tree [%s]: %s" % (i + 1, tree))

        assert len(result) > 0, "Can not recognize CFG tree."
        if len(result) == 1 :
            print("Draw tree with Display ...")
            result[0].draw()
        else:
            print("WARN: Get more then one trees.")

        print(result)

if __name__ == '__main__':
    cfg=Cfg()
    cfg.cfg_en()
    cfg.cfg_zh()

 

你可能感兴趣的:(机器学习)