Deepti Chopra(印度)
王威 译
语义分析(意义生成)被定义为确定字符或单词序列意义的过程,可用于执行语义消歧任务。
名词解释:
语义解释:将意义分配给句子 |
---|
上下文解释:将逻辑形式分配给知识表示 |
语义分析的原语或基本单位:意义或语义(meaning或sense) |
语义分析用到的Python库:
Python库 | 说明 |
---|---|
TextBlob | 用于执行NLP任务的API,例如词性标注、名词短语提取、文本分类、机器翻译、情感分析等 |
Gensim | 用于执行文档索引、主题建模和相似性检索 |
Polyglot | 一个支持多语言应用的NLP工具,它提供了40种语言的命名实体识别、165种语言的分词、196种语言的语言检测、136种语言的情感分析、16种语言的词性标注、135种语言的形态分析、137种语言的嵌入以及69种语言的音译 |
# List the Boolean operators recognized by NLTK's logic package
# (negation, conjunction, disjunction, implication, equivalence).
import nltk  # fixed: original had 'mport nltk' (dropped leading 'i')

nltk.boolean_ops()
import nltk

# Parse propositional-logic formulas with NLTK's Expression reader and
# echo each one's canonical string form.
input_expr = nltk.sem.Expression.fromstring
for formula in ('X | (Y -> Z)', '-(X & Y)', 'X & Y', 'X <-> -- X'):
    print(input_expr(formula))
import nltk

# Assign truth values to the propositional symbols X, Y, Z and evaluate
# compound formulas under an empty individual domain.
value = nltk.Valuation([('X', True), ('Y', False), ('Z', True)])
print(value['Z'])

domain = set()
v = nltk.Assignment(domain)
u = nltk.Model(domain, value)
for formula in ('(X & Y)', '-(X & Y)', '(X & Z)', '(X | Y)'):
    print(u.evaluate(formula, v))
import nltk

# Inspect the logical types of a parsed predicate application run(marcus):
# the argument is an entity (e), the function maps entities to truth values.
input_expr = nltk.sem.Expression.fromstring
expression = input_expr('run(marcus)', type_check=True)
print(expression.argument)
print(expression.argument.type)
print(expression.function)
print(expression.function.type)
# fixed: the signature '<e, t>' was stripped by HTML rendering in the
# original ("sign = {'run': ''}"), which made the final print meaningless.
sign = {'run': '<e, t>'}
expression = input_expr('run(marcus)', signature=sign)
print(expression.function.type)
import nltk
from nltk import load_parser
from nltk.sem import chat80

# Translate an English question into SQL via a feature-based grammar, then
# run the query against the chat80 city database.
# NOTE(review): the original garbled the grammar name ('sqqll'/'sqll');
# the file shipped with nltk_data is 'sql1.fcfg' (digit one) — confirm.
nltk.data.show_cfg('grammars/book_grammars/sql1.fcfg')
test = load_parser('grammars/book_grammars/sql1.fcfg')
q = " What cities are in Greece"
t = list(test.parse(q.split()))   # fixed: original was missing ')'
ans = t[0].label()['SEM']         # SQL fragments from the SEM feature
ans = [s for s in ans if s]       # drop empty fragments
q = ' '.join(ans)                 # fixed: ''.join would fuse SQL tokens
print(q)
r = chat80.sql_query('corpora/city_database/city.db', q)
for p in r:
    print(p[0], end=" ")
命名实体识别(Named entity recognition,NER)是定位文档中的专有名词或命名实体的过程。
import nltk

# (city, relation, state) triples; keep the cities whose state is Rajasthan.
locations = [('Jaipur', 'IN', 'Rajasthan'),
             ('Ajmer', 'IN', 'Rajasthan'),
             ('Udaipur', 'IN', 'Rajasthan'),
             ('Mumbai', 'IN', 'Maharashtra'),
             ('Ahmedabad', 'IN', 'Gujrat')]
q = [city for (city, relation, state) in locations if state == 'Rajasthan']
print(q)
from nltk.tag import StanfordNERTagger

# Tag a sentence with the Stanford 3-class NER model (PERSON/LOCATION/ORG).
# NOTE(review): requires the Stanford NER jar and model file on disk.
# Fixed from the original: curly quotes (a syntax error), the misspelled
# model name 'distism' -> 'distsim', and the sentences/sentence mismatch.
sentence = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
print(sentence.tag('John goes to NY'.split()))
import nltk

# Chunk named entities in two tagged treebank sentences.
# binary=True labels chunks simply as NE; omitting it yields fine-grained
# labels such as PERSON, ORGANIZATION, GPE.
tagged = nltk.corpus.treebank.tagged_sents()
sentences1 = tagged[17]
print(nltk.ne_chunk(sentences1, binary=True))
sentences2 = tagged[7]
print(nltk.ne_chunk(sentences2, binary=True))
print(nltk.ne_chunk(sentences2))
import nltk
from nltk.corpus import conll2002

# Walk the 26th chunked sentence of the Dutch CoNLL-2002 training split,
# printing each chunk/token node.
for node in conll2002.chunked_sents('ned.train')[25]:
    print(node)
import nltk

# Tokenize, POS-tag, then chunk named entities in one pipeline.
sentence = "I went to Greece to meet John"
tokens = nltk.word_tokenize(sentence)
tagged_tokens = nltk.pos_tag(tokens)
print(nltk.ne_chunk(tagged_tokens))
import nltk
# Run NLTK's built-in HMM POS-tagging demo on its bundled sample data.
nltk.tag.hmm.demo_pos()
分类 | 方法 |
---|---|
基于规则的或手工 | 列表查找、语言学 |
基于机器学习的方法或自动化 | 隐马尔可夫模型(HMM)、最大熵马尔可夫模型(MEMM)、条件随机场(CRF)、支持向量机(SVM)、决策树(DT) |
import nltk
from nltk import pos_tag, word_tokenize

# POS-tag a tokenized sentence; proper nouns come back tagged NNP.
text = "John and Smith are going to NY and Germany"
print(pos_tag(word_tokenize(text)))
import nltk
from nltk.corpus import brown
from nltk.tag import UnigramTagger

# Train a unigram tagger on the first 700 tagged news sentences of the
# Brown corpus, then tag an unseen sentence word by word.
training_data = brown.tagged_sents(categories='news')[:700]
tagger = UnigramTagger(training_data)
sentence = ['John', 'and', 'Smith', 'went', 'to', 'NY', 'and', 'Germany']
for word, tag in tagger.tag(sentence):
    print(word, '->', tag)
Wordnet可以定义为一个英语词汇数据集。通过使用同义词集可以找到单词之间的概念依存。
import nltk
from nltk.corpus import wordnet
from nltk.corpus import wordnet as wn

# Explore WordNet entries for 'cat': sense look-ups, definition,
# usage examples, and lemmas.
wn.synsets('cat')                  # every sense of 'cat'
wn.synsets('cat', pos=wn.VERB)     # verb senses only
cat_sense = wn.synset('cat.n.01')  # the first noun sense
print(cat_sense.definition())
print(len(cat_sense.examples()))
print(cat_sense.lemmas())
print([str(entry.name()) for entry in cat_sense.lemmas()])
print(wn.lemma('cat.n.01.cat').synset())
import nltk
from nltk.corpus import wordnet
from nltk.corpus import wordnet as wn

# Multilingual WordNet: lemma names for 'cat' across languages, then
# taxonomy relations for the synset.
print(sorted(wn.langs()))
cat = wn.synset('cat.n.01')
print(cat.lemma_names('ita'))     # Italian lemma names
print(sorted(cat.lemmas('dan')))  # Danish lemmas
print(sorted(cat.lemmas('por')))  # Portuguese lemmas
print(len(wordnet.all_lemma_names(pos='n', lang='jpn')))
print(cat.hypernyms())            # more general senses
print(cat.hyponyms())             # more specific senses
print(cat.member_holonyms())      # groups this sense is a member of
print(cat.root_hypernyms())       # top of the hypernym hierarchy
print(cat.lowest_common_hypernyms(wn.synset('dog.n.01')))
import nltk
from nltk.corpus import wordnet
from nltk.corpus import wordnet as wn

# Path similarity: score in (0, 1] from the shortest hypernym path
# connecting the two senses.
lion = wn.synset('lion.n.01')
cat = wn.synset('cat.n.01')
score = lion.path_similarity(cat)
print(score)
import nltk
from nltk.corpus import wordnet
from nltk.corpus import wordnet as wn

# Leacock-Chodorow similarity: based on shortest path length and the
# maximum depth of the taxonomy.
lion = wn.synset('lion.n.01')
cat = wn.synset('cat.n.01')
print(lion.lch_similarity(cat))
import nltk
from nltk.corpus import wordnet
from nltk.corpus import wordnet as wn

# Wu-Palmer similarity: based on the depths of the two senses and of
# their most specific common ancestor.
lion, cat = wn.synset('lion.n.01'), wn.synset('cat.n.01')
print(lion.wup_similarity(cat))
import nltk
from nltk.corpus import wordnet
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
from nltk.corpus import genesis

# Information-content (IC) based similarities: Resnik (res), Jiang-Conrath
# (jcn) and Lin, computed against three IC corpora: Brown and SemCor
# (precomputed) plus Genesis (computed here).
brown_ic = wordnet_ic.ic('ic-brown.dat')
semcor_ic = wordnet_ic.ic('ic-semcor.dat')
genesis_ic = wn.ic(genesis, False, 0.0)

lion = wn.synset('lion.n.01')
cat = wn.synset('cat.n.01')
print(lion.res_similarity(cat, brown_ic))
print(lion.res_similarity(cat, genesis_ic))
print(lion.jcn_similarity(cat, brown_ic))
print(lion.jcn_similarity(cat, genesis_ic))
print(lion.lin_similarity(cat, semcor_ic))
from nltk.corpus import wordnet as wn


def getSenseSimilarity(worda, wordb):
    """Print path and Wu-Palmer similarity for every pair of word senses.

    For each synset of ``worda`` crossed with each synset of ``wordb``,
    print the path-similarity and WUP-similarity scores together with both
    sense definitions. Pairs with no path similarity are skipped.

    :param str worda: first word to look up in WordNet.
    :param str wordb: second word to look up in WordNet.
    """
    # fixed from the garbled original: the redundant (and broken) re-lookup
    # of each synset by name is dropped; Python 2 prints converted;
    # definition() is now actually called.
    wordasynsets = wn.synsets(worda)
    wordbsynsets = wn.synsets(wordb)
    for sseta in wordasynsets:
        for ssetb in wordbsynsets:
            pathsim = sseta.path_similarity(ssetb)
            wupsim = sseta.wup_similarity(ssetb)
            if pathsim is not None:
                print("Path Sim Score: ", pathsim,
                      " WUP Sim Score: ", wupsim,
                      "\t", sseta.definition(), "\t", ssetb.definition())


if __name__ == "__main__":
    # getSenseSimilarity('walk', 'dog')
    getSenseSimilarity('cricket', 'ball')
from nltk.corpus import wordnet
def lesk(context_sentence, ambiguous_word, pos=None, synsets=None):
    """Return a Synset for an ambiguous word in a context (Lesk, 1986).

    Picks the candidate sense whose dictionary definition shares the most
    words with the context sentence.

    :param iter context_sentence: the context sentence where the ambiguous
        word occurs, passed as an iterable of words.
    :param str ambiguous_word: the ambiguous word that requires WSD.
    :param str pos: a specific part-of-speech (POS) to filter senses by.
    :param iter synsets: possible synsets of the ambiguous word; defaults
        to all WordNet synsets of ``ambiguous_word``.
    :return: the Synset with the highest signature overlap, or None when
        no candidate synsets remain after filtering.

    Usage example::

        >>> lesk(['I', 'went', 'to', 'the', 'bank', 'to', 'deposit',
        ...       'money', '.'], 'bank', 'n')
        Synset('savings_bank.n.02')
    """
    context = set(context_sentence)
    if synsets is None:
        # 'wordnet' is imported at module level in the source file.
        synsets = wordnet.synsets(ambiguous_word)
    if pos:
        synsets = [ss for ss in synsets if str(ss.pos()) == pos]
    if not synsets:
        return None
    # Rank candidates by how many definition words appear in the context;
    # max compares the synsets themselves only on overlap ties.
    _, sense = max(
        (len(context.intersection(ss.definition().split())), ss)
        for ss in synsets)
    return sense
"""***笔者的话:整理了《精通Python自然语言处理》的第六章内容:语义分析。语义分析对分词来说也是至关重要的。后续会整理这本书的后面章节。本博客记录了书中的每段代码。希望对阅读这本书的人有所帮助。FIGHTING...(热烈欢迎大家批评指正,互相讨论)
(Life isn't tied with a bow, but it's still a gift.
) ***"""