pip install -U nltk
pip install NetworkX
from nltk.book import *
text2.common_contexts(["monstrous", "very"])
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])
len(text3) / len(set(text3))
>> text3.count("smote")
>> 100 * text4.count('a') / len(text4)
> sent1 = ['Call', 'me', 'Ishmael', '.']
>> sent[:3]
>>>name ='Monty'
>>name *2
>>''.join(['Monty', 'Python'])
>>'Monty Python'.split()
>>> fdist1 = FreqDist(text1)
>> vocabulary1 = fdist1.keys()
>>> V = set(text1)
>> long_words = [w for w in V if len(w) > 15]
>>> sorted(long_words)
>> fdist5 = FreqDist(text5)
>>> sorted([w for w in set(text5) if len(w) > 7 and fdist5[w] > 7])
>>bigrams(['more','is','said','than','done']) #单词搭配#
>>text4.collocations() #最频繁出现的双连词#
>>[len(w)for w in text1]
>>>fdist =FreqDist([len(w)for w in text1])
fdist = FreqDist(samples) 创建包含给定样本的频率分布
fdist.inc(sample) 增加样本
fdist['monstrous'] 计数给定样本出现的次数
fdist.freq('monstrous') 给定样本的频率
fdist.N() 样本总数
fdist.keys() 以频率递减顺序排序的样本链表
for sample in fdist: 以频率递减的顺序遍历样本
fdist.max() 数值最大的样本
fdist.tabulate() 绘制频率分布表
fdist.plot() 绘制频率分布图
fdist.plot(cumulative=True) 绘制累积频率分布图
fdist1 < fdist2 测试样本在 fdist1 中出现的频率是否小于 fdist2
>[wforw in sent7if len(w)!=4]
函数 含义
s.startswith(t) 测试 s是否以t开头
s.endswith(t) 测试 s是否以t结尾
t in s 测试 s是否包含t
s.islower() 测试 s中所有字符是否都是小写字母
s.isupper() 测试 s中所有字符是否都是大写字母
s.isalpha() 测试 s中所有字符是否都是字母
s.isalnum() 测试 s中所有字符是否都是字母或数字
s.isdigit() 测试 s中所有字符是否都是数字
s.istitle() 测试 s是否首字母大写( s中所有的词都首字母大写)
>>>sorted([w for w in set(text1)if w.endswith('ableness')])
>>>sorted([term for term in set(text4)if 'gnt'in term])
>>>sorted([item for item in set(text6)if item.istitle()])
>>>sorted([item for item in set(sent7)if item.isdigit()])
>>sorted([wforwin set(sent7)if notw.islower()])
>>[w.upper()forw in text1]
>> len(set([word.lower() for word in text1])
>> len(set([word.lower() for word in text1 if word.isalpha()])
> sent1 = ['Call', 'me', 'Ishmael', '.']
>>> for xyzzy in sent1:
... if xyzzy.endswith('l'):
... print xyzzy
>>> tricky = sorted([w for w in set(text2) if 'cie' in w or 'cei' in w])
>>> for word in tricky:
... print word
>> babelize_shell()
emma = nltk.corpus.gutenberg.words('austen-emma.txt')
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
from nltk.corpus import gutenberg
mma = gutenberg.words('austen-emma.txt')
or fileid in gutenberg.fileids():
... num_chars = len(gutenberg.raw(fileid)) ?
... num_words = len(gutenberg.words(fileid))
... num_sents = len(gutenberg.sents(fileid))
... num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
... print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid
> macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt') #把文本划分为句子#
from nltk.corpus import webtext
>>> for fileid in webtext.fileids():
... print fileid, webtext.raw(fileid)[:65], '...'
>> from nltk.corpus import nps_chat
>>> chatroom = nps_chat.posts('10-19-20s_706posts.xml')
>>> chatroom[123]
from nltk.corpus import brown
>>> brown.categories()
>> brown.words(categories='news')
>>> brown.words(fileids=['cg22'])
>>> brown.sents(categories=['news', 'editorial', 'reviews'])
> from nltk.corpus import brown
>>> news_text = brown.words(categories='news')
>>> fdist = nltk.FreqDist([w.lower() for w in news_text])
>>> modals = ['can', 'could', 'may', 'might', 'must', 'will']
>>> for m in modals:
... print m + ':', fdist[m],
fd = nltk.ConditionalFreqDist(
... (genre, word)
... for genre in brown.categories()
... for word in brown.words(categories=genre))
>>> genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
>>> modals = ['can', 'could', 'may', 'might', 'must', 'will']
>>> cfd.tabulate(conditions=genres, samples=modals)
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals]
>>> from nltk.corpus import reuters
>>> reuters.fileids()
> reuters.words('training/9865')[:14]
> from nltk.corpus import inaugura
fd = nltk.ConditionalFreqDist(
... (target, file[:4])
... for fileid in inaugural.fileids()
... for w in inaugural.words(fileid)
... for target in ['america', 'citizen']
... if w.lower().startswith(target))
>>> cfd.plot()
fileids() 语料库中的文件
fileids([categories]) 这些分类对应的语料库中的文件
categories() 语料库中的分类
categories([fileids]) 这些文件对应的语料库中的分类
raw() 语料库的原始内容
raw(fileids=[f1,f2,f3]) 指定文件的原始内容
raw(categories=[c1,c2]) 指定分类的原始内容
words() 整个语料库中的词汇
words(fileids=[f1,f2,f3]) 指定文件中的词汇
words(categories=[c1,c2]) 指定分类中的词汇
sents() 整个语料库中的句子
sents(fileids=[f1,f2,f3]) 指定文件中的句子
sents(categories=[c1,c2]) 指定分类中的句子
abspath(fileid) 指定文件在磁盘上的位置
encoding(fileid) 文件的编码(如果知道的话)
open(fileid) 打开指定语料库文件的文件流
root() 到本地安装的语料库根目录的路径
>> from nltk.corpus import PlaintextCorpusReader
>>> corpus_root = '/usr/share/dict'
>>> wordlists = PlaintextCorpusReader(corpus_root, '.*')
>>> wordlists.fileids()
['README', 'connectives', 'propernames', 'web2', 'web2a', 'words']
>>> wordlists.words('connectives')
>> from nltk.corpus import BracketParseCorpusReader
>>> corpus_root = r"C:\corpora\penntreebank\parsed\mrg\wsj"
>>> file_pattern = r".*/wsj_.*\.mrg" ?
>>> ptb = BracketParseCorpusReader(corpus_root, file_pattern)
>>> ptb.fileids()
>>>text = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]
>>> pairs = [('news', 'The'), ('news', 'Fulton'), ('news', 'County'), ...]
> cfd.tabulate(conditions=['English', 'German_Deutsch'],
... samples=range(10), cumulative=True)
> sent = ['In', 'the', 'beginning', 'God', 'created', 'the', 'heaven',
... 'and', 'the', 'earth', '.']
> nltk.bigrams(sent)
def generate_model(cfdist, word, num=15):
    """Print a chain of ``num`` words generated from a bigram model.

    Starting from ``word``, each step records the current word and then
    moves on to ``cfdist[word].max()``.

    :param cfdist: mapping from a word to an object whose ``max()`` returns
        the next word to emit.
    :param word: seed word.
    :param num: number of words to emit.
    """
    generated = []
    for _ in range(num):
        generated.append(word)
        word = cfdist[word].max()
    # A single print call runs on both Python 2 and 3, unlike ``print word,``.
    print(' '.join(generated))
text = nltk.corpus.genesis.words('english-kjv.txt')
bigrams = nltk.bigrams(text)
cfd = nltk.ConditionalFreqDist(bigrams)
cfdist= ConditionalFreqDist(pairs) 从配对链表中创建条件频率分布
cfdist.conditions() 将条件按字母排序
cfdist[condition] 此条件下的频率分布
cfdist[condition][sample] 此条件下给定样本的频率
cfdist.tabulate() 为条件频率分布制表
cfdist.tabulate(samples, conditions) 指定样本和条件限制下制表
cfdist.plot() 为条件频率分布绘图
cfdist.plot(samples, conditions) 指定样本和条件限制下绘图
cfdist1 < cfdist2 测试样本在 cfdist1 中出现次数是否小于在 cfdist2 中出现次数
>>>fromnltk.corpusimport stopwords
>> entries = nltk.corpus.cmudict.entries()
>>> len(entries)
>>> for word, pron in entries: ?
... if len(pron) == 3: ?
... ph1, ph2, ph3 = pron ?
... if ph1 == 'P' and ph3 == 'T':
... print word, ph2
> [w for w, pron in entries if pron[-1] == 'M' and w[-1] == 'n']
def stress(pron):
    """Return the digit characters found in a pronunciation, in order.

    :param pron: iterable of phone strings such as ``'AY1'``.
    :return: list of single-character digit strings.
    """
    return [char for phone in pron for char in phone if char.isdigit()]
>>> [w for w, pron in entries if stress(pron) == ['0', '1', '0', '2', '0']]
>> p3 = [(pron[0]+'-'+pron[2], word) ?
... for (word, pron) in entries
... if pron[0] == 'P' and len(pron) == 3] ?
>>> cfd = nltk.ConditionalFreqDist(p3)
>>> for template in cfd.conditions():
... if len(cfd[template]) > 10:
... words = cfd[template].keys()
... wordlist = ' '.join(words)
... print template, wordlist[:70] + "..."
prondict = nltk.corpus.cmudict.dict()
>>> prondict['fire']
>> text = ['natural', 'language', 'processing']
>>> [ph for w in text for ph in prondict[w][0]]
from nltk.corpus import swadesh
>>> swadesh.fileids()
fr2en = swadesh.entries(['fr', 'en'])
es2en = swadesh.entries(['de', 'en']) # German-English
>>> es2en = swadesh.entries(['es', 'en']) # Spanish-English
>>> translate.update(dict(de2en))
>>> translate.update(dict(es2en))
>>> translate['Hund']
> from nltk.corpus import toolbox
>>> toolbox.entries('rotokas.dic')
>>>fromnltk.corpusimport wordnetas wn
otorcar = wn.synset('car.n.01')
>>> types_of_motorcar = motorcar.hyponyms()
>>> paths = motorcar.hypernym_paths()
>>> len(paths)
> wn.synset('tree.n.01').part_meronyms()
>> wn.synset('tree.n.01').substance_meronyms()
>>> wn.synset('tree.n.01').member_holonyms()
> wn.lemma('supply.n.02.supply').antonyms()
> right = wn.synset('right_whale.n.01')
>>> orca = wn.synset('orca.n.01')
> wn.synset('whale.n.02').min_depth()
> from urllib import urlopen
>>> url = "http://www.gutenberg.org/files/2554/2554.txt"
>>> raw = urlopen(url).read()
>>> type(raw)
>>> len(raw)
> proxies = {'http': 'http://www.someproxy.com:3128'}
>>>raw =urlopen(url, proxies=proxies).read()
>> tokens = nltk.word_tokenize(raw)
>>> type(tokens)
> text = nltk.Text(tokens)
raw.find("PART I")
raw.rfind("End of Project Gutenberg's Crime")
> raw = nltk.clean_html(html)
import feedparser
>>> llog = feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")
>>> llog['feed']['title']
>>> len(llog.entries)
>>> post = llog.entries[2]
> nltk.word_tokenize(nltk.clean_html(llog.entries[2].content[0].value)
> import os
>>> os.listdir('.')
or line in f:
... print line.strip()
ath = nltk.data.find('corpora/gutenberg/melville-moby_dick.txt')
>>> raw = open(path, 'rU').read()
raw = open('document.txt').read()
tokens = nltk.word_tokenize(raw)
vocab = sorted(set(words)
方法 功能
s.find(t) 字符串 s中包含t的第一个索引(没找到返回-1)
s.rfind(t) 字符串 s中包含t的最后一个索引(没找到返回-1)
s.index(t) 与 s.find(t)功能类似,但没找到时引起ValueError
s.rindex(t) 与 s.rfind(t)功能类似,但没找到时引起 ValueError
s.join(text) 连接字符串 s与text中的词汇
s.split(t) 在所有找到 t的位置将 s分割成链表(默认为空白符)
s.splitlines() 将 s按行分割成字符串链表
s.lower() 将字符串 s小写
s.upper() 将字符串 s大写
s.title() 将字符串 s首字母大写
s.strip() 返回一个没有首尾空白字符的 s的拷贝
s.replace(t,u) 用 u替换 s中的 t
>>> import codecs
>>> f = codecs.open(path, encoding='latin2')
>> for line in f:
... line = line.strip()
... print line.encode('unicode_escape')
>>> a = u'\u0061'
> nacute = u'\u0144'
>>> nacute
>>> nacute_utf = nacute.encode('utf8')
>>> print repr(nacute_utf)
import unicodedata
>>> lines = codecs.open(path, encoding='latin2').readlines()
>>> line = lines[2]
>>> print line.encode('unicode_escape')
>>> line.find(u'zosta\u0142y')
>>> line = line.lower()
>>> print line.encode('unicode_escape')
>>> import re
>>> m = re.search(u'\u015b\w*', line)
>>> m.group()
> nltk.word_tokenize(line)
> import re
>>> wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
> [w for w in wordlist if re.search('ed$', w)]
> [w for w in wordlist if re.search('^..j..t..$', w)]
[w for w in wordlist if re.search('^[ghi][mno][jlk][def]$', w)]
> [w for w in wsj if re.search('^[0-9]+\.[0-9]+$', w)]
操作符 行为
. 通配符,匹配所有字符
^abc 匹配以 abc开始的字符串
abc$ 匹配以 abc结尾的字符串
[abc] 匹配字符集合中的一个
[A-Z0-9] 匹配字符范围中的一个
d|ing|s 匹配指定的一个字符串(析取)
* 前面的项目零个或多个,如 a*, [a-z]* (也叫Kleene闭包)
+ 前面的项目 1个或多个,如 a+, [a-z]+
? 前面的项目零个或 1个(即:可选)如: a?, [a-z]?
{n} 重复 n次, n为非负整数
{n,} 至少重复 n次
{,n} 重复不多于 n次
{m,n} 至少重复 m次不多于 n次
a(b|c)+ 括号表示操作符的范围
word = 'supercalifragilisticexpialidocious'
>>> re.findall(r'[aeiou]', word]
>> wsj = sorted(set(nltk.corpus.treebank.words()))
>>> fd = nltk.FreqDist(vs for word in wsj
... for vs in re.findall(r'[aeiou]{2,}', word))
> fd.items()
# Matches a vowel run at the start, a vowel run at the end, or any single
# non-vowel character.
regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'


def compress(word):
    """Return ``word`` with its word-internal vowels removed.

    The pieces matched by ``regexp`` are concatenated, so leading and
    trailing vowel runs survive while vowels elsewhere are dropped.
    """
    pieces = re.findall(regexp, word)
    return ''.join(pieces)
>> cv_word_pairs = [(cv, w) for w in rotokas_words
... for cv in re.findall(r'[ptksvr][aeiou]', w)]
>>> cv_index = nltk.Index(cv_word_pairs)
def stem(word):
    """Strip the first matching suffix from ``word``.

    Suffixes are tried in the listed order, so an earlier entry wins over a
    later one that would also match. The word is returned unchanged when no
    suffix matches.
    """
    for suffix in ('ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment'):
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word
>> re.findall(r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')
>> re.findall(r'^.*(?:ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')
> re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')
e.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')
def stem(word):
    """Return ``word`` without its trailing suffix, using one regular expression.

    The non-greedy first group takes as little as possible, leaving the
    longest-reaching optional suffix to the second group.
    """
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    # Local name differs from the function name to avoid shadowing it.
    base, _suffix = re.findall(regexp, word)[0]
    return base
> hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
>>> hobbies_learned.findall(r"<\w*>
> porter = nltk.PorterStemmer()
>>> lancaster = nltk.LancasterStemmer()
>>> [porter.stem(t) for t in tokens]
> [lancaster.stem(t) for t in tokens]
class IndexedText(object):
    """Token sequence with a stem-based index for concordance lookups."""

    def __init__(self, stemmer, text):
        """Index every token position of ``text`` under its lower-cased stem.

        :param stemmer: object providing ``stem(word)``.
        :param text: sequence of word tokens.
        """
        self._text = text
        self._stemmer = stemmer
        self._index = nltk.Index((self._stem(word), i)
                                 for (i, word) in enumerate(text))

    def concordance(self, word, width=40):
        """Print each indexed occurrence of ``word``'s stem with its context.

        :param word: query word; it is stemmed before the index lookup.
        :param width: character width of the left and right context columns.
        """
        key = self._stem(word)
        wc = int(width // 4)  # words of context; must be an int for slicing
        for i in self._index[key]:
            # Clamp at 0: a negative start would index from the end of the
            # sequence and drop the left context of early tokens.
            lcontext = ' '.join(self._text[max(0, i - wc):i])
            rcontext = ' '.join(self._text[i:i + wc])
            ldisplay = '%*s' % (width, lcontext[-width:])
            rdisplay = '%-*s' % (width, rcontext[:width])
            print('%s %s' % (ldisplay, rdisplay))

    def _stem(self, word):
        """Return the lower-cased stem of ``word``."""
        return self._stemmer.stem(word).lower()
>>> porter = nltk.PorterStemmer()
>>> grail = nltk.corpus.webtext.words('grail.txt')
>>> text = IndexedText(porter, grail)
>>> text.concordance('lie')
>> wnl = nltk.WordNetLemmatizer()
>>> [wnl.lemmatize(t) for t in tokens]
> re.split(r'[ \t\n]+', raw)
符号 功能
\b 词边界(零宽度)
\d 任一十进制数字(相当于[0-9])
\D 任何非数字字符(等价于[^0-9])
\s 任何空白字符(相当于[ \t\n\r\f\v])
\S 任何非空白字符(相当于[^ \t\n\r\f\v])
\w 任何字母数字字符(相当于[a-zA-Z0-9_])
\W 任何非字母数字字符(相当于[^a-zA-Z0-9_])
\t 制表符
\n 换行
>> nltk.regexp_tokenize(text, pattern)
> sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
>>> text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
>>> sents = sent_tokenizer.tokenize(text)
>>> pprint.pprint(sents[171:181])
def segment(text, segs):
    """Split ``text`` into words at the positions flagged in ``segs``.

    :param text: unsegmented string.
    :param segs: string of '0'/'1' flags; a '1' at index ``i`` ends a word
        after character ``i`` of ``text``.
    :return: list of words; the remainder after the last flag is always
        appended as the final word.
    """
    words = []
    last = 0
    for i, flag in enumerate(segs):
        if flag == '1':
            words.append(text[last:i + 1])
            last = i + 1
    words.append(text[last:])
    return words
>>> text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
>>> seg1 = "0000000000000001000000000010000000000000000100000000000"
>>> seg2 = "0100100100100001001001000010100100010010000100010010000"
>>> segment(text, seg1)
def evaluate(text, segs):
    """Score a segmentation; lower is better.

    The score is the number of words produced by ``segment(text, segs)``
    plus the length of the distinct words joined by single spaces.
    """
    words = segment(text, segs)
    text_size = len(words)
    lexicon_size = len(' '.join(set(words)))
    return text_size + lexicon_size
>>> text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"119
>>> seg1 = "0000000000000001000000000010000000000000000100000000000"
>>> seg2 = "0100100100100001001001000010100100010010000100010010000"
>>> seg3 = "0000100100000011001000000110000100010000001100010000001"
>>> segment(text, seg3)
> evaluate(text, seg3)
from random import randint


def flip(segs, pos):
    """Return ``segs`` with the '0'/'1' character at index ``pos`` inverted."""
    return segs[:pos] + str(1 - int(segs[pos])) + segs[pos + 1:]


def flip_n(segs, n):
    """Return ``segs`` after ``n`` random single-position flips.

    Positions are drawn independently, so the same position can be flipped
    more than once.
    """
    for _ in range(n):
        segs = flip(segs, randint(0, len(segs) - 1))
    return segs
def anneal(text, segs, iterations, cooling_rate):
    """Search for a low-scoring segmentation of ``text``.

    At each temperature, ``iterations`` random perturbations of the current
    segmentation are scored with ``evaluate`` and the best one is kept. The
    temperature starts at ``len(segs)`` and is divided by ``cooling_rate``
    after every round until it is no longer above 0.5.

    :param text: unsegmented string.
    :param segs: initial '0'/'1' segmentation string.
    :param iterations: number of candidates tried per temperature.
    :param cooling_rate: divisor applied to the temperature each round.
    :return: the final segmentation string.
    :raises ValueError: if the loop would run but ``cooling_rate`` can never
        lower the temperature.
    """
    temperature = float(len(segs))
    if temperature > 0.5 and 0 < cooling_rate <= 1:
        # Dividing by such a rate never brings the temperature down to 0.5.
        raise ValueError("cooling_rate must be greater than 1")
    while temperature > 0.5:
        best_segs, best = segs, evaluate(text, segs)
        for _ in range(iterations):
            # The number of flipped positions shrinks as the temperature drops.
            guess = flip_n(segs, int(round(temperature)))
            score = evaluate(text, guess)
            if score < best:
                best, best_segs = score, guess
        segs = best_segs
        temperature = temperature / cooling_rate
        print('%s %s' % (evaluate(text, segs), segment(text, segs)))
    return segs
>>> text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
>>> seg1 = "0000000000000001000000000010000000000000000100000000000"
fdist = nltk.FreqDist(['dog', 'cat', 'dog', 'cat', 'dog', 'snake', 'dog', 'cat'])
>>> for word in fdist:
... print word, '->', fdist[word], ';'
> for word in fdist:
... print '%s->%d;' % (word, fdist[word]),
> output_file = open('output.txt', 'w')
>>> words = set(nltk.corpus.genesis.words('english-kjv.txt'))
>>> for word in sorted(words):
... output_file.write(word + "\n")
> output_file.write(str(len(words)) + "\n")
>>> output_file.close()
> from textwrap import fill
>>> format = '%s (%d),'
>>> pieces = [format % (word, len(word)) for word in saying]
>>> output = ' '.join(pieces)
>>> wrapped = fill(output)
>>> print wrapped
python表达式 评论
for item in s 遍历 s中的元素
for item in sorted(s) 按顺序遍历 s中的元素
for item in set(s) 遍历 s中的无重复的元素
for item in reversed(s) 按逆序遍历 s中的元素
for item in set(s).difference(t) 遍历在集合s中不在集合t的元素
for item in random.shuffle(s) 按随机顺序遍历 s中的元
>>> text = nltk.corpus.nps_chat.words()
>>> cut = int(0.9 * len(text))
>>> training_data, test_data = text[:cut], text[cut:]
>>> text == training_data + test_data
>> words = 'I turned off the spectroroute'.split() ?
>>> wordlens = [(len(word), word) for word in words] ?137
>>> wordlens.sort() ?
>>> ' '.join(w for (_, w) in wordlens)
> max([w.lower() for w in nltk.word_tokenize(text)])
>>> max(w.lower() for w in nltk.word_tokenize(text)) #生成器表达式,注意和上面列表推导式的不同
> sent = ['The', 'dog', 'gave', 'John', 'the', 'newspaper']
>>> n = 3
>>> [sent[i:i+n] for i in range(len(sent)-n+1)
m, n = 3, 7
>>> array = [[set() for i in range(n)] for j in range(m)] #注意i,j和时间的对象没有关系,只是次数和for语法要正确
>>> array[2][5].add('Alice')
>>> pprint.pprint(array
def tag(word):
    """Return ``'det'`` for 'a', 'the' or 'all', and ``'noun'`` otherwise.

    :raises AssertionError: if ``word`` is not a string.
    """
    # (str, type(u'')) covers what ``basestring`` did on Python 2 and is
    # simply ``str`` on Python 3.
    assert isinstance(word, (str, type(u''))), "argument to tag() must be a string"
    if word in ['a', 'the', 'all']:
        return 'det'
    else:
        return 'noun'
> extract_property(lambda w: w[-1])
> sorted(sent, lambda x, y: cmp(len(y), len(x))
def is_content_word(word):
    """Return True unless the lower-cased ``word`` is one of the listed stop tokens."""
    return word.lower() not in ['a', 'of', 'the', 'and', 'will', ',', '.']
>> filter(is_content_word, sent)
engths = map(len, nltk.corpus.brown.sents(categories='news')
def generic(*args, **kwargs):
    """Print the positional-argument tuple, then the keyword-argument dict."""
    print(args)
    print(kwargs)
>>> generic(1, "African swallow", monty="python")
>> nltk.metrics.distance.__file__
import pdb 调试器
>>> import mymodule
>>> pdb.run('mymodule.myfunction()'
def virahanka1(n):
    """Return every string of 'S' (length 1) and 'L' (length 2) units totalling ``n``.

    Plain recursion: results for ``n-1`` are prefixed with 'S' and results
    for ``n-2`` with 'L'. Sub-results are recomputed on every call.
    """
    if n == 0:
        return [""]
    elif n == 1:
        return ["S"]
    s = ["S" + prosody for prosody in virahanka1(n - 1)]
    l = ["L" + prosody for prosody in virahanka1(n - 2)]
    return s + l
def virahanka2(n):
    """Bottom-up variant: build a table of results for 0..n and return entry ``n``."""
    lookup = [[""], ["S"]]
    for i in range(n - 1):
        # Entry i+2 combines 'S' + entry i+1 with 'L' + entry i.
        s = ["S" + prosody for prosody in lookup[i + 1]]
        l = ["L" + prosody for prosody in lookup[i]]
        lookup.append(s + l)
    return lookup[n]
def virahanka3(n, lookup={0:[""], 1:["S"]}):
    """Top-down variant that caches results in ``lookup``.

    The mutable default is intentional: it is the cache shared by all calls
    that do not pass their own ``lookup``. The recursive calls always use
    that shared default.
    """
    if n not in lookup:
        s = ["S" + prosody for prosody in virahanka3(n - 1)]
        l = ["L" + prosody for prosody in virahanka3(n - 2)]
        lookup[n] = s + l
    return lookup[n]
from nltk import memoize
@memoize
def virahanka4(n):
    """Recursive variant wrapped in the ``memoize`` decorator imported above it.

    Without the decorator this body is identical to the plain recursive
    version, so the decorator line is what makes this variant distinct.
    """
    if n == 0:
        return [""]
    elif n == 1:
        return ["S"]
    s = ["S" + prosody for prosody in virahanka4(n - 1)]
    l = ["L" + prosody for prosody in virahanka4(n - 2)]
    return s + l
mport networkx as nx
import matplotlib
from nltk.corpus import wordnet as wn
def traverse(graph, start, node):
    """Recursively add ``node`` and all of its hyponyms to ``graph``.

    Records ``node.shortest_path_distance(start)`` in ``graph.depth`` under
    ``node.name`` and adds an edge from ``node.name`` to each child's name.
    """
    graph.depth[node.name] = node.shortest_path_distance(start)
    for child in node.hyponyms():
        graph.add_edge(node.name, child.name)
        traverse(graph, start, child)
def hyponym_graph(start):
    """Build and return a graph of the hyponym hierarchy below ``start``.

    A ``depth`` dict is attached to the graph object and filled in by
    ``traverse``.
    """
    G = nx.Graph()
    G.depth = {}
    traverse(G, start, start)
    return G
def graph_draw(graph):
    """Draw ``graph`` with node size from degree and node colour from depth.

    :param graph: graph carrying a ``depth`` dict keyed by node.
    """
    # ``import matplotlib`` alone does not load the pyplot submodule.
    import matplotlib.pyplot as plt

    # NOTE(review): draw_graphviz may not exist in every networkx release —
    # confirm against the installed version.
    nx.draw_graphviz(graph,
                     node_size=[16 * graph.degree(n) for n in graph],
                     node_color=[graph.depth[n] for n in graph],
                     with_labels=False)
    plt.show()
>>> dog = wn.synset('dog.n.01')
>>> graph = hyponym_graph(dog)
>>> graph_draw(graph)
rom numpy import linalg
>>> a=array([[4,0], [3,-5]])
>>> u,s,vt = linalg.svd(a)
text = nltk.word_tokenize("And now for something completely different")
> nltk.pos_tag(text)
> text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
>>> text.similar('woman')
> tagged_token = nltk.tag.str2tuple('fly/NN')
>> sent = '''
... The/AT grand/JJ jury/NN commented/VBD on/IN a/AT number/NN of/IN
... other/AP topics/NNS ,/, AMONG/IN them/PPO the/AT Atlanta/NP and/CC
... Fulton/NP-tl County/NN-tl purchasing/VBG departments/NNS which/WDT it/PP
... said/VBD ``/`` ARE/BER well/QL operated/VBN and/CC follow/VB generally/R
... accepted/VBN practices/NNS which/WDT inure/VB to/IN the/AT best/JJT
... interest/NN of/IN both/ABX governments/NNS ''/'' ./.
... '''
>>> [nltk.tag.str2tuple(t) for t in sent.split()]
[('The', 'AT'), ('grand', 'JJ'), ('jury', 'NN'), ('commented', 'VBD'),
('on', 'IN'), ('a', 'AT'), ('number', 'NN'), ... ('.', '.')]
> nltk.corpus.brown.tagged_words()
标记 含义 例子
ADJ 形容词 new, good, high, special, big, local
ADV 副词 really, already, still, early, now
CNJ 连词 and, or, but, if, while, although
DET 限定词 the, a, some, most, every, no
EX 存在量词 there, there's
FW 外来词 dolce, ersatz, esprit, quo, maitre
MOD 情态动词 will, can, would, may, must, should
N 名词 year, home, costs, time, education
NP 专有名词 Alison, Africa, April, Washington
NUM 数词 twenty-four, fourth, 1991, 14:24
PRO 代词 he, their, her, its, my, I, us
P 介词 on, of, at, with, by, into, under
TO 词 to to
UH 感叹词 ah, bang, ha, whee, hmpf, oops
V 动词 is, has, get, do, make, see, run
VD 过去式 said, took, told, made, asked
VG 现在分词 making, going, playing, working
VN 过去分词 given, taken, begun, sung
WH Wh限定词 who, which, when, what, where, how
> from nltk.corpus import brown
>>> brown_news_tagged = brown.tagged_words(categories='news', simplify_tags=True)
>>> tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
>>> tag_fd.keys()
>> word_tag_pairs = nltk.bigrams(brown_news_tagged)
>>> list(nltk.FreqDist(a[1] for (a, b) in word_tag_pairs if b[1] == 'N')
>>> wsj = nltk.corpus.treebank.tagged_words(simplify_tags=True)
>>> word_tag_fd = nltk.FreqDist(wsj)
>>> [word + "/" + tag for (word, tag) in word_tag_fd if tag.startswith('V')]
cfd2 = nltk.ConditionalFreqDist((tag, word) for (word, tag) in wsj)
>>> cfd2['VN'].keys()
>>> [w for w in cfd1.conditions() if 'VD' in cfd1[w] and 'VN' in cfd1[w]]
>>> idx1 = wsj.index(('kicked', 'VD')
def findtags(tag_prefix, tagged_text):
    """Map each tag starting with ``tag_prefix`` to the first five words recorded for it.

    :param tag_prefix: prefix used to select tags.
    :param tagged_text: iterable of ``(word, tag)`` pairs.
    :return: dict from tag to the first five entries of ``cfd[tag].keys()``.
    """
    cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in tagged_text
                                   if tag.startswith(tag_prefix))
    # NOTE(review): relies on keys() returning a sliceable list — confirm
    # against the installed NLTK version.
    return dict((tag, cfd[tag].keys()[:5]) for tag in cfd.conditions())
>>> tagdict = findtags('NN', nltk.corpus.brown.tagged_words(categories='news'))
>>> for tag in sorted(tagdict):
... print tag, tagdict[tag]
>>> brown_learned_text = brown.words(categories='learned')
>>> sorted(set(b for (a, b) in nltk.ibigrams(brown_learned_text) if a == 'often'))
>> brown_learned_text = brown.words(categories='learned')
>>> sorted(set(b for (a, b) in nltk.ibigrams(brown_learned_text) if a == 'often'))
>>> brown_lrnd_tagged = brown.tagged_words(categories='learned', simplify_tags=True)
>>> tags = [b[1] for (a, b) in nltk.ibigrams(brown_lrnd_tagged) if a[0] == 'often']
>>> fd = nltk.FreqDist(tags)
>>> fd.tabulate()
from nltk.corpus import brown
def process(sentence):
    """Print every three-word sequence tagged V*, TO, V* in a tagged sentence.

    :param sentence: sequence of ``(word, tag)`` pairs.
    """
    for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):
        if t1.startswith('V') and t2 == 'TO' and t3.startswith('V'):
            print('%s %s %s' % (w1, w2, w3))
>>> for tagged_sent in brown.tagged_sents():
... process(tagged_sent)
>> brown_news_tagged = brown.tagged_words(categories='news', simplify_tags=True)
>>> data = nltk.ConditionalFreqDist((word.lower(), tag)
... for (word, tag) in brown_news_tagged)
>>> for word in data.conditions():
... if len(data[word]) > 3:
... tags = data[word].keys()
... print word, ' '.join(tags)
> pos.keys()
>>> pos.values()
>>> pos.items()
>>> for key, val in sorted(pos.items()):
print key + ":", val
>>> pos = {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADV'}
>>> pos = dict(colorless='ADJ', ideas='N', sleep='V', furiously='ADV')
>> frequency = nltk.defaultdict(int)
>>> frequency['colorless'] = 4
>>> frequency['ideas']
>>> pos = nltk.defaultdict(list)
> pos = nltk.defaultdict(lambda: 'N')
>>> alice = nltk.corpus.gutenberg.words('carroll-alice.txt')
>>> vocab = nltk.FreqDist(alice)
>>> v1000 = list(vocab)[:1000]
>>> mapping = nltk.defaultdict(lambda: 'UNK')
>>> for v in v1000:
... mapping[v] = v
>>> alice2 = [mapping[v] for v in alice]
>>> counts = nltk.defaultdict(int)
>>> from nltk.corpus import brown
>>> for (word, tag) in brown.tagged_words(categories='news'):
... counts[tag] += 1
>>> from operator import itemgetter
>>> sorted(counts.items(), key=itemgetter(1), reverse=True
> last_letters = nltk.defaultdict(list)
>>> words = nltk.corpus.words.words('en')
>>> for word in words:
... key = word[-2:]
... last_letters[key].append(word)
> anagrams = nltk.Index((''.join(sorted(w)), w) for w in words)
>>> anagrams['aeilnrt']
>>> pos = nltk.defaultdict(lambda: nltk.defaultdict(int))
>>> brown_news_tagged = brown.tagged_words(categories='news', simplify_tags=True)
>>> for ((w1, t1), (w2, t2)) in nltk.ibigrams(brown_news_tagged): ?
... pos[(t1, w2)][t2] += 1
counts = nltk.defaultdict(int)
>>> for word in nltk.corpus.gutenberg.words('milton-paradise.txt'):
... counts[word] += 1
>>> [key for (key, value) in counts.items() if value == 32]
> pos.update({'cats': 'N', 'scratch': 'V', 'peacefully': 'ADV', 'old': 'ADJ'})
>>> pos2 = nltk.defaultdict(list)
>>> for key, value in pos.items():
... pos2[value].append(key)
示例 说明
d = {} 创建一个空的字典,并将分配给d
d[key] = value 分配一个值给一个给定的字典键
d.keys() 字典的键的链表
list(d) 字典的键的链表
sorted(d) 字典的键,排序
key in d 测试一个特定的键是否在字典中
for key in d 遍历字典的键
d.values() 字典中的值的链表
dict([(k1,v1), (k2,v2), ...]) 从一个键-值对链表创建一个字典
d1.update(d2) 添加d2中所有项目到d1
defaultdict(int) 一个默认值为0的字典
>> raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
>>> tokens = nltk.word_tokenize(raw)
>>> default_tagger = nltk.DefaultTagger('NN')
>>> default_tagger.tag(tokens)
>> patterns = [
... (r'.*ing$', 'VBG'), # gerunds
... (r'.*ed$', 'VBD'), # simple past
... (r'.*es$', 'VBZ'), # 3rd singular present
... (r'.*ould$', 'MD'), # modals
... (r'.*\'s$', 'NN$'), # possessive nouns
... (r'.*s$', 'NNS'), # plural nouns190
... (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
... (r'.*', 'NN') # nouns (default)
... ]
> regexp_tagger = nltk.RegexpTagger(patterns)
>>> regexp_tagger.tag(brown_sents[3])
>>> regexp_tagger.evaluate(brown_tagged_sents]
> fd = nltk.FreqDist(brown.words(categories='news'))
>>> cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
>>> most_freq_words = fd.keys()[:100]
>>> likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
>>> baseline_tagger = nltk.UnigramTagger(model=likely_tags)
>>> baseline_tagger.evaluate(brown_tagged_sents)
>> baseline_tagger = nltk.UnigramTagger(model=likely_tags,
... backoff=nltk.DefaultTagger('NN'))
def performance(cfd, wordlist):
    """Evaluate a lookup tagger built from ``wordlist`` on the Brown news sentences.

    :param cfd: conditional frequency distribution indexed by word.
    :param wordlist: words to include in the lookup model; each is mapped to
        ``cfd[word].max()``.
    :return: the value of the tagger's ``evaluate`` call.
    """
    lt = dict((word, cfd[word].max()) for word in wordlist)
    # Words outside the model fall back to the default 'NN' tagger.
    baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger('NN'))
    return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))
def display():
    """Plot lookup-tagger performance against model size for the Brown news text.

    Model sizes are the powers of two 2**0 .. 2**14.
    """
    import pylab
    words_by_freq = list(nltk.FreqDist(brown.words(categories='news')))
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    sizes = 2 ** pylab.arange(15)
    perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
    pylab.plot(sizes, perfs, '-bo')
    pylab.title('Lookup Tagger Performance with Varying Model Size')
    pylab.xlabel('Model Size')
    pylab.ylabel('Performance')
    # Without show() the figure is built but never displayed by a script.
    pylab.show()
>>> display()
>>> from nltk.corpus import brown
>>> brown_tagged_sents = brown.tagged_sents(categories='news')
>>> brown_sents = brown.sents(categories='news')
>>> unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
>>> unigram_tagger.tag(brown_sents[2007])
> bigram_tagger = nltk.BigramTagger(train_sents)
>>> bigram_tagger.tag(brown_sents[2007]
> t0 = nltk.DefaultTagger('NN')
>>> t1 = nltk.UnigramTagger(train_sents, backoff=t0)
>>> t2 = nltk.BigramTagger(train_sents, backoff=t1)
>>> t2.evaluate(test_sents)
>> from cPickle import dump
>>> output = open('t2.pkl', 'wb')
>>> dump(t2, output, -1)
>>> output.close()
> from cPickle import load
>>> input = open('t2.pkl', 'rb')
>>> tagger = load(input)196
>>> input.close()
test_tags = [tag for sent in brown.sents(categories='editorial')
... for (word, tag) in t2.tag(sent)]
>>> gold_tags = [tag for (word, tag) in brown.tagged_words(categories='editorial')]
>>> print nltk.ConfusionMatrix(gold, test)
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
>>> t2.evaluate(test_sents)
> nltk.tag.brill.demo()
def gender_features(word):
    """Return a feature dict holding the last character of ``word``."""
    return {'last_letter': word[-1]}
>>> gender_features('Shrek')
>> from nltk.corpus import names
>> import random
> names = ([(name, 'male') for name in names.words('male.txt')] +
... [(name, 'female') for name in names.words('female.txt')])
>>> random.shuffle(names)
>> featuresets = [(gender_features(n), g) for (n,g) in names]
>>> train_set, test_set = featuresets[500:], featuresets[:500]
>>> classifier = nltk.NaiveBayesClassifier.train(train_set)
> print nltk.classify.accuracy(classifier, test_set)
from nltk.classify import apply_features
>>> train_set = apply_features(gender_features, names[500:])
>>> test_set = apply_features(gender_features, names[:500])
def gender_features2(name):
    """Return first/last letter plus per-letter count and presence features.

    :param name: non-empty name string.
    :return: dict with ``firstletter``, ``lastletter`` and, for each of the
        26 lowercase letters, ``count(x)`` and ``has(x)`` entries.
    """
    lowered = name.lower()  # computed once instead of once per letter
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = lowered.count(letter)
        features["has(%s)" % letter] = (letter in lowered)
    return features
>>> gender_features2('John')
>> train_set = [(gender_features(n), g) for (n,g) in train_names]
>>> devtest_set = [(gender_features(n), g) for (n,g) in devtest_names]
>>> test_set = [(gender_features(n), g) for (n,g) in test_names]
>>> classifier = nltk.NaiveBayesClassifier.train(train_set) ?
>>> print nltk.classify.accuracy(classifier, devtest_set)
>> errors = []
>>> for (name, tag) in devtest_names:
... guess = classifier.classify(gender_features(name))
... if guess != tag:
... errors.append( (tag, guess, name) )
def gender_features(word):
    """Return the last one and last two characters of ``word`` as features."""
    return {'suffix1': word[-1:],
            'suffix2': word[-2:]}
> from nltk.corpus import movie_reviews
>>> documents = [(list(movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]
>>> random.shuffle(documents)
ll_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = all_words.keys()[:2000]
def document_features(document):
    """Return ``contains(word)`` flags for every word in ``word_features``.

    :param document: iterable of word tokens.
    :return: dict mapping ``'contains(<word>)'`` to a bool.
    """
    # Set membership keeps each of the per-feature lookups constant-time.
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features
> print document_features(movie_reviews.words('pos/cv957_8737.txt'))
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
>>> print nltk.classify.accuracy(classifier, test_set)
>> from nltk.corpus import brown
>>> suffix_fdist = nltk.FreqDist()
>>> for word in brown.words():
... word = word.lower()
... suffix_fdist.inc(word[-1:])
... suffix_fdist.inc(word[-2:])
... suffix_fdist.inc(word[-3:])
>>> common_suffixes = suffix_fdist.keys()[:100]
def pos_features(word):
    """Return ``endswith(suffix)`` flags for every entry of ``common_suffixes``."""
    features = {}
    for suffix in common_suffixes:
        features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
    return features
> tagged_words = brown.tagged_words(categories='news')
>>> featuresets = [(pos_features(n), g) for (n,g) in tagged_words]
>>> size = int(len(featuresets) * 0.1)
>>> train_set, test_set = featuresets[size:], featuresets[:size]
>>> classifier = nltk.DecisionTreeClassifier.train(train_set)
>>> nltk.classify.accuracy(classifier, test_set)
> print classifier.pseudocode(depth=4)
def pos_features(sentence, i):
    """Return suffix and previous-word features for token ``i`` of ``sentence``.

    :param sentence: sequence of word strings.
    :param i: index of the token to describe.
    :return: dict with the last 1-3 characters of the token and the previous
        word, which is the empty string for the first token.
    """
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = ""
    else:
        # Only look back when a previous token exists; sentence[-1] would
        # silently pick the last word of the sentence instead.
        features["prev-word"] = sentence[i - 1]
    return features
>>> pos_features(brown.sents()[0], 8)
{'suffix(3)': 'ion', 'prev-word': 'an', 'suffix(2)': 'on', 'suffix(1)': 'n'}
>>> tagged_sents = brown.tagged_sents(categories='news')
>>> featuresets = []
>>> for tagged_sent in tagged_sents:
... untagged_sent = nltk.tag.untag(tagged_sent)
... for i, (word, tag) in enumerate(tagged_sent):
... featuresets.append(
(pos_features(untagged_sent, i), tag) )
>>> size = int(len(featuresets) * 0.1)
>>> train_set, test_set = featuresets[size:], featuresets[:size]
>>> classifier = nltk.NaiveBayesClassifier.train(train_set)
>>> nltk.classify.accuracy(classifier, test_set)
def pos_features(sentence, i, history):
    """Return suffix, previous-word and previous-tag features for token ``i``.

    :param sentence: sequence of word strings.
    :param i: index of the token to describe.
    :param history: tags already assigned to tokens ``0 .. i-1``.
    :return: feature dict; previous word and tag are empty strings for the
        first token.
    """
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = ""
        features["prev-tag"] = ""
    else:
        # Guarded by else: history is empty at i == 0, so indexing it there
        # would raise IndexError.
        features["prev-word"] = sentence[i - 1]
        features["prev-tag"] = history[i - 1]
    return features
class ConsecutivePosTagger(nltk.TaggerI):
    """Tagger that feeds previously assigned tags into each classification."""

    def __init__(self, train_sents):
        """Train a naive Bayes classifier on per-token feature sets.

        :param train_sents: iterable of sentences, each a sequence of
            ``(word, tag)`` pairs.
        """
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                # The gold tag becomes the history seen by the next token.
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        """Return ``(word, tag)`` pairs for ``sentence``, tagging left to right."""
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            # Without this append the result would pair words with nothing.
            history.append(tag)
        return list(zip(sentence, history))
>>> tagged_sents = brown.tagged_sents(categories='news')
>>> size = int(len(tagged_sents) * 0.1)
>>> train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
>>> tagger = ConsecutivePosTagger(train_sents)
>>> print tagger.evaluate(test_sents)
>> sents = nltk.corpus.treebank_raw.sents()
>>> tokens = []
>>> boundaries = set()
>>> offset = 0
>>> for sent in nltk.corpus.treebank_raw.sents():
... tokens.extend(sent)
... offset += len(sent)
... boundaries.add(offset-1)
def punct_features(tokens, i):
    """Describe the punctuation token at ``tokens[i]`` by its neighbours.

    Args:
        tokens: Sequence of token strings.
        i: Index of the punctuation token. The tokens at ``i - 1`` and
            ``i + 1`` are read, so ``i`` must not be the last index
            (that raises IndexError).

    Returns:
        A dict with four features: whether the next token starts with an
        upper-case letter, the lower-cased previous token, the punctuation
        token itself, and whether the previous token is one character long.
    """
    previous, following = tokens[i - 1], tokens[i + 1]
    return {'next-word-capitalized': following[0].isupper(),
            'prevword': previous.lower(),
            'punct': tokens[i],
            'prev-word-is-one-char': len(previous) == 1}
> featuresets = [(punct_features(tokens, i), (i in boundaries))
... for i in range(1, len(tokens)-1)
... if tokens[i] in '.?!']
> size = int(len(featuresets) * 0.1)
>>> train_set, test_set = featuresets[size:], featuresets[:size]
>>> classifier = nltk.NaiveBayesClassifier.train(train_set)
>>> nltk.classify.accuracy(classifier, test_set)
def segment_sentences(words):
    """Split a flat token list into sentences using the trained classifier.

    Relies on the module-level ``classifier`` and on ``punct_features``.

    Args:
        words: Sequence of token strings.

    Returns:
        A list of sentences, each a slice of ``words`` that ends with the
        punctuation token the classifier accepted as a boundary. Tokens
        after the last accepted boundary form a final sentence.
    """
    start = 0
    sents = []
    last = len(words) - 1
    for i, word in enumerate(words):
        # punct_features reads words[i + 1], so the final token is never
        # classified; it is picked up by the remainder handling below.
        if (i < last and word in '.?!'
                and classifier.classify(punct_features(words, i))):
            sents.append(words[start:i + 1])
            start = i + 1
    if start < len(words):
        sents.append(words[start:])
    return sents
> posts = nltk.corpus.nps_chat.xml_posts()[:10000]
def dialogue_act_features(post):
    """Return bag-of-words features for one chat post.

    Args:
        post: The post's text as a string.

    Returns:
        A dict with one ``'contains(<token>)': True`` entry per distinct
        lower-cased token produced by ``nltk.word_tokenize``.
    """
    features = {}
    for word in nltk.word_tokenize(post):
        features['contains(%s)' % word.lower()] = True
    return features
featuresets = [(dialogue_act_features(post.text), post.get('class'))
... for post in posts]
>>> size = int(len(featuresets) * 0.1)
>>> train_set, test_set = featuresets[size:], featuresets[:size]
>>> classifier = nltk.NaiveBayesClassifier.train(train_set)
>>> print nltk.classify.accuracy(classifier, test_set)
def rte_features(rtepair):
    """Extract four count features from an RTE text/hypothesis pair.

    Args:
        rtepair: A pair object accepted by ``nltk.RTEFeatureExtractor``.

    Returns:
        A dict with the sizes of ``extractor.overlap(...)`` and
        ``extractor.hyp_extra(...)`` for the ``'word'`` and ``'ne'``
        item kinds.
    """
    extractor = nltk.RTEFeatureExtractor(rtepair)
    features = {}
    features['word_overlap'] = len(extractor.overlap('word'))
    features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
    features['ne_overlap'] = len(extractor.overlap('ne'))
    features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
    return features
>> rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]
>>> extractor = nltk.RTEFeatureExtractor(rtepair)
>>> print extractor.text_word
>>> import random
>>> from nltk.corpus import brown
>>> tagged_sents = list(brown.tagged_sents(categories='news'))
>>> random.shuffle(tagged_sents)
>>> size = int(len(tagged_sents) * 0.1)
>>> train_set, test_set = tagged_sents[size:], tagged_sents[:size]
> file_ids = brown.fileids(categories='news')
>>> size = int(len(file_ids) * 0.1)
>>> train_set = brown.tagged_sents(file_ids[size:])
>>> test_set = brown.tagged_sents(file_ids[:size])
def tag_list(tagged_sents):
    """Flatten tagged sentences into one list of tags, in corpus order.

    Args:
        tagged_sents: Iterable of sentences, each an iterable of
            ``(word, tag)`` pairs.
    """
    return [tag for sent in tagged_sents for (word, tag) in sent]
def apply_tagger(tagger, corpus):
    """Re-tag every sentence of ``corpus`` with ``tagger``.

    Each sentence has its existing tags stripped with ``nltk.tag.untag``
    before being passed to ``tagger.tag``.

    Returns:
        A list with one ``tagger.tag(...)`` result per sentence.
    """
    return [tagger.tag(nltk.tag.untag(sent)) for sent in corpus]
>>> gold = tag_list(brown.tagged_sents(categories='editorial'))
>>> test = tag_list(apply_tagger(t2, brown.tagged_sents(categories='editorial')))
>>> cm = nltk.ConfusionMatrix(gold, test)
import math
from collections import Counter


def entropy(labels):
    """Return the Shannon entropy, in bits, of a collection of labels.

    Args:
        labels: Iterable of hashable labels.

    Returns:
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct label; 0 for an empty input.
    """
    counts = Counter(labels)
    # float() keeps the division exact under Python 2 as well.
    total = float(sum(counts.values()))
    probs = [count / total for count in counts.values()]
    return -sum(p * math.log(p, 2) for p in probs)
def ie_preprocess(document):
    """Sentence-split, tokenize and POS-tag a raw document.

    Args:
        document: The document text as a single string.

    Returns:
        A list of sentences, each the ``nltk.pos_tag`` result for that
        sentence's tokens.
    """
    sentences = nltk.sent_tokenize(document)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    # The result was previously computed and then discarded.
    return sentences
>>> sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"),
... ("dog", "NN"), ("barked", "VBD"), ("at", "IN"), ("the", "DT"), ("cat", "NN")]
>>> grammar = "NP: {<DT>?<JJ>*<NN>}"
>>> cp = nltk.RegexpParser(grammar)
>>> result = cp.parse(sentence)
>>> print result
grammar = r"""
NP: {<DT|PP\$>?<JJ>*<NN>} # chunk determiner/possessive, adjectives and nouns
{<NNP>+} # chunk sequences of proper nouns
"""
cp = nltk.RegexpParser(grammar)
sentence = [("Rapunzel", "NNP"), ("let", "VBD"), ("down", "RP"),
("her", "PP$"), ("long", "JJ"), ("golden", "JJ"), ("hair", "NN")]
>>> print cp.parse(sentence)
> nouns = [("money", "NN"), ("market", "NN"), ("fund", "NN")]
>>> grammar = "NP: {<NN><NN>} # Chunk two consecutive nouns"
>>> cp = nltk.RegexpParser(grammar)
>>> print cp.parse(nouns)
>>> cp = nltk.RegexpParser('CHUNK: {<V.*> <TO> <V.*>}')
>>> brown = nltk.corpus.brown
>>> for sent in brown.tagged_sents():
... tree = cp.parse(sent)
... for subtree in tree.subtrees():
... if subtree.node == 'CHUNK': print subtree
grammar = r"""
NP:
{<.*>+} # Chunk everything
}<VBD|IN>+{ # Chink sequences of VBD and IN
"""
sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"),
("dog", "NN"), ("barked", "VBD"), ("at", "IN"), ("the", "DT"), ("cat", "NN")]
cp = nltk.RegexpParser(grammar)
>>> print cp.parse(sentence)
nltk.chunk.conllstr2tree(text, chunk_types=['NP']).draw()
> from nltk.corpus import conll2000
>>> print conll2000.chunked_sents('train.txt')[99]
> from nltk.corpus import conll2000
>>> cp = nltk.RegexpParser("")
>>> test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
>>> print cp.evaluate(test_sents)
grammar = r"NP: {<[CDJNP].*>+}"
>>> cp = nltk.RegexpParser(grammar)
>>> print cp.evaluate(test_sents)
class UnigramChunker(nltk.ChunkParserI):
    """Chunker that assigns a chunk tag to each word from its POS tag
    alone, using a unigram tagger trained on (POS tag, chunk tag) pairs."""

    def __init__(self, train_sents):
        """Train on chunk trees.

        Args:
            train_sents: Iterable of chunk trees; each is converted with
                ``nltk.chunk.tree2conlltags`` and only the POS tag and
                chunk tag of every word are kept as training data.
        """
        train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence):
        """Chunk a POS-tagged sentence.

        Args:
            sentence: Sequence of ``(word, pos)`` pairs.

        Returns:
            The tree built by ``nltk.chunk.conlltags2tree`` from the
            predicted ``(word, pos, chunktag)`` triples.
        """
        pos_tags = [pos for (word, pos) in sentence]
        # The tagger's "words" are POS tags; its "tags" are chunk tags.
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag) for ((word, pos), chunktag)
                     in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)
> test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
>>> train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
>>> unigram_chunker = UnigramChunker(train_sents)
>>> print unigram_chunker.evaluate(test_sents)
class ConsecutiveNPChunkTagger(nltk.TaggerI):
    """Tagger that predicts a chunk tag for each token from
    ``npchunk_features`` of the sentence, the position and the chunk tags
    already assigned to earlier tokens."""

    def __init__(self, train_sents):
        """Train a maximum-entropy classifier on ``train_sents``.

        Args:
            train_sents: Iterable of sentences, each a sequence of
                ``(token, chunktag)`` pairs.
                NOTE(review): ``npchunk_features`` unpacks each token as
                a ``(word, pos)`` pair -- confirm against the caller.
        """
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                # Record the gold tag so later tokens can condition on it.
                history.append(tag)
        # NOTE(review): algorithm='megam' presumably needs the external
        # megam binary to be installed -- confirm before running.
        self.classifier = nltk.MaxentClassifier.train(
            train_set, algorithm='megam', trace=0)

    def tag(self, sentence):
        """Tag ``sentence`` left to right.

        Returns:
            A list of ``(token, chunktag)`` pairs, one per input token.
        """
        history = []
        for i, word in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            # Without this append the result below would always be empty.
            history.append(tag)
        return list(zip(sentence, history))
class ConsecutiveNPChunker(nltk.ChunkParserI):
    """Chunk parser that wraps ``ConsecutiveNPChunkTagger``: chunk trees
    are converted to tagged sequences for training, and the tagger's
    output is converted back into a tree when parsing."""

    def __init__(self, train_sents):
        """Train the underlying tagger.

        Args:
            train_sents: Iterable of chunk trees. Each is flattened with
                ``nltk.chunk.tree2conlltags`` and regrouped into
                ``((word, pos), chunktag)`` pairs for the tagger.
        """
        tagged_sents = [[((w, t), c) for (w, t, c) in
                         nltk.chunk.tree2conlltags(sent)]
                        for sent in train_sents]
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

    def parse(self, sentence):
        """Chunk a POS-tagged sentence.

        Args:
            sentence: Sequence of ``(word, pos)`` pairs.

        Returns:
            The tree built by ``nltk.chunk.conlltags2tree`` from the
            predicted ``(word, pos, chunktag)`` triples.
        """
        tagged_sents = self.tagger.tag(sentence)
        conlltags = [(w, t, c) for ((w, t), c) in tagged_sents]
        return nltk.chunk.conlltags2tree(conlltags)
def npchunk_features(sentence, i, history):
    """Build the chunking feature dict for the token at position ``i``.

    Args:
        sentence: Sequence of ``(word, pos)`` pairs.
        i: Index of the token to describe.
        history: Chunk tags assigned so far; accepted for interface
            compatibility and not read here.

    Returns:
        A dict with the token's word and POS tag, the POS tags of its
        neighbours (empty string past either end of the sentence), the
        two POS bigrams around it, and ``tags_since_dt(sentence, i)``.
    """
    word, pos = sentence[i]
    prevpos = "" if i == 0 else sentence[i - 1][1]
    nextpos = "" if i == len(sentence) - 1 else sentence[i + 1][1]
    return {"pos": pos,
            "word": word,
            "prevpos": prevpos,
            "nextpos": nextpos,
            "prevpos+pos": "%s+%s" % (prevpos, pos),
            "pos+nextpos": "%s+%s" % (pos, nextpos),
            "tags-since-dt": tags_since_dt(sentence, i)}
def tags_since_dt(sentence, i):
    """Summarise the POS tags seen since the most recent determiner.

    Args:
        sentence: Sequence of ``(word, pos)`` pairs.
        i: Only the tokens before index ``i`` are examined.

    Returns:
        The distinct POS tags that occur after the last ``'DT'`` token
        (or from the start when there is none), sorted and joined with
        ``'+'``; the empty string when there are none.
    """
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            # A determiner starts a fresh window.
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))
>>> chunker = ConsecutiveNPChunker(train_sents)
>>> print chunker.evaluate(test_sents)
grammar = r"""
NP: {<DT|JJ|NN.*>+} # Chunk sequences of DT, JJ, NN
PP: {<IN><NP>} # Chunk prepositions followed by NP
VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
CLAUSE: {<NP><VP>} # Chunk NP, VP
"""
cp = nltk.RegexpParser(grammar)
sentence = [("Mary", "NN"), ("saw", "VBD"), ("the", "DT"), ("cat", "NN"),
("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]
>>> print cp.parse(sentence)
> cp = nltk.RegexpParser(grammar, loop=2)
>>> print cp.parse(sentence)
>> tree1 = nltk.Tree('NP', ['Alice'])
def traverse(t):
except AttributeError:
print t,
# Now we know that t.node is defined
print '(', t.node,
for child in t:
print ')',
>>> t = nltk.Tree('(S (NP Alice) (VP chased (NP the rabbit)))')
>>> traverse(t)
> sent = nltk.corpus.treebank.tagged_sents()[22]
>>> print nltk.ne_chunk(sent, binary=True)
>> print nltk.ne_chunk(sent)
> IN = re.compile(r'.*\bin\b(?!\b.+ing)')
>>> for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
... for rel in nltk.sem.extract_rels('ORG', 'LOC', doc,
... corpus='ieer', pattern = IN):
... print nltk.sem.show_raw_rtuple(rel)
>>> groucho_grammar = nltk.parse_cfg("""
... S -> NP VP
... PP -> P NP
... NP -> Det N | Det N PP | 'I'
... VP -> V NP | VP PP
... Det -> 'an' | 'my'
... N -> 'elephant' | 'pajamas'
... V -> 'shot'
... P -> 'in'
... """)
>>> sent = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']
>>> parser = nltk.ChartParser(groucho_grammar)
>>> trees = parser.nbest_parse(sent)
>>> for tree in trees:
... print tree
grammar1 = nltk.parse_cfg("""
S -> NP VP
VP -> V NP | V NP PP
PP -> P NP
V -> "saw" | "ate" | "walked"
NP -> "John" | "Mary" | "Bob" | Det N | Det N PP
Det -> "a" | "an" | "the" | "my"
N -> "man" | "dog" | "cat" | "telescope" | "park"
P -> "in" | "on" | "by" | "with"
""")
>>> sent = "Mary saw Bob".split()
>>> rd_parser = nltk.RecursiveDescentParser(grammar1)
>>> for tree in rd_parser.nbest_parse(sent):
... print tree
>> grammar1 = nltk.data.load('file:mygrammar.cfg')
>>> sent = "Mary saw Bob".split()
>>> rd_parser = nltk.RecursiveDescentParser(grammar1)
>>> for tree in rd_parser.nbest_parse(sent):
... print tree
>> rd_parser = nltk.RecursiveDescentParser(grammar1)
>>> sent = 'Mary saw a dog'.split()
>>> for t in rd_parser.nbest_parse(sent):
... print t
>>> sr_parse = nltk.ShiftReduceParser(grammar1)
>>> sent = 'Mary saw a dog'.split()
>>> print sr_parse.parse(sent)
def init_wfst(tokens, grammar):
    """Create a well-formed substring table seeded with lexical categories.

    Args:
        tokens: Sequence of input words.
        grammar: Object whose ``productions(rhs=word)`` returns the
            productions that expand to ``word``.

    Returns:
        A ``(len(tokens) + 1)`` square list of lists, all ``None`` except
        cell ``[i][i + 1]``, which holds the left-hand side of the first
        production found for ``tokens[i]``.

    Raises:
        IndexError: If the grammar has no production for some token.
    """
    size = len(tokens) + 1
    wfst = [[None] * size for _ in range(size)]
    for i, token in enumerate(tokens):
        productions = grammar.productions(rhs=token)
        wfst[i][i + 1] = productions[0].lhs()
    return wfst
def complete_wfst(wfst, tokens, grammar, trace=False):
    """Fill in the table for spans of two or more tokens, in place.

    Args:
        wfst: Table produced by ``init_wfst``; it is modified and returned.
        tokens: The input words (only their count is used).
        grammar: Object whose ``productions()`` yields items with
            ``rhs()`` and ``lhs()``.
        trace: When true, print one line per cell that gets filled.

    Returns:
        The same ``wfst`` object. Cell ``[start][end]`` is set to the
        left-hand side of a production whose right-hand side is the pair
        of categories found in ``[start][mid]`` and ``[mid][end]``.
    """
    # Maps a right-hand side tuple to its left-hand side; if several
    # productions share a right-hand side, the last one listed wins.
    index = dict((p.rhs(), p.lhs()) for p in grammar.productions())
    numtokens = len(tokens)
    for span in range(2, numtokens + 1):
        for start in range(numtokens + 1 - span):
            end = start + span
            for mid in range(start + 1, end):
                nt1, nt2 = wfst[start][mid], wfst[mid][end]
                if nt1 and nt2 and (nt1, nt2) in index:
                    parent = index[(nt1, nt2)]
                    wfst[start][end] = parent
                    if trace:
                        print("[%s] %3s [%s] %3s [%s] ==> [%s] %3s [%s]" %
                              (start, nt1, mid, nt2, end, start, parent, end))
    return wfst
def display(wfst, tokens):
    """Print the table as a grid, with ``.`` marking empty cells.

    Args:
        wfst: Square table as built by ``init_wfst``. Rows
            ``0 .. len(wfst) - 2`` and columns ``1 .. len(wfst) - 1`` are
            shown.
        tokens: Unused; kept for interface compatibility.
    """
    columns = range(1, len(wfst))
    print('\nWFST ' + ' '.join("%-4d" % j for j in columns))
    for i in range(len(wfst) - 1):
        # The row label is padded to four characters so that, with the
        # joining space, cells line up under the five-character 'WFST '.
        cells = ["%-4s" % (wfst[i][j] or '.') for j in columns]
        # One print per row terminates the line; previously every row
        # ran on into the next.
        print(' '.join(["%d   " % i] + cells))
>>> tokens = "I shot an elephant in my pajamas".split()
>>> wfst0 = init_wfst(tokens, groucho_grammar)
>>> display(wfst0, tokens)
WFST 1 2 3 4 5 6 7
0 NP . . . . . .
1 . V . . . . .
2 . . Det . . . .
3 . . . N . . .
4 . . . . P . .
5 . . . . . Det .
6 . . . . . . N
>>> wfst1 = complete_wfst(wfst0, tokens, groucho_grammar)
>>> display(wfst1, tokens)
>>> groucho_dep_grammar = nltk.parse_dependency_grammar("""
... 'shot' -> 'I' | 'elephant' | 'in'
... 'elephant' -> 'an' | 'in'
... 'in' -> 'pajamas'
... 'pajamas' -> 'my'
... """)
>>> print groucho_dep_grammar
> pdp = nltk.ProjectiveDependencyParser(groucho_dep_grammar)
>>> sent = 'I shot an elephant in my pajamas'.split()
>>> trees = pdp.parse(sent)
>>> for tree in trees:
... print tree
> from nltk.corpus import treebank
>>> t = treebank.parsed_sents('wsj_0001.mrg')[0]
def filter(tree):
    """Return True for a ``VP`` node that has an ``S`` node as a child.

    Only children that are ``nltk.Tree`` instances are considered.

    NOTE(review): the name shadows the ``filter`` builtin for the rest of
    the module; it is kept because callers refer to it by this name.
    """
    child_nodes = [child.node for child in tree
                   if isinstance(child, nltk.Tree)]
    return (tree.node == 'VP') and ('S' in child_nodes)
entries = nltk.corpus.ppattach.attachments('training')
>>> table = nltk.defaultdict(lambda: nltk.defaultdict(set))
>>> for entry in entries:
... key = entry.noun1 + '-' + entry.prep + '-' + entry.noun2
... table[key][entry.attachment].add(entry.verb)
>>> for key in sorted(table):
... if len(table[key]) > 1:
... print key, 'N:', sorted(table[key]['N']), 'V:', sorted(table[key]['V'])
>> tokens = ["fish"] * 5
>>> cp = nltk.ChartParser(grammar)
>>> for tree in cp.nbest_parse(tokens):
... print tree
def give(t):
    """Return True if ``t`` is a ``VP`` headed by give/gave with two objects.

    The node must be labelled ``VP``, have more than two children, have an
    ``NP`` as second child, have a ``PP-DTV`` or ``NP`` as third child,
    and contain ``'give'`` or ``'gave'`` among the leaves of its first
    child.
    """
    return (t.node == 'VP' and len(t) > 2 and t[1].node == 'NP'
            and (t[2].node == 'PP-DTV' or t[2].node == 'NP')
            and ('give' in t[0].leaves() or 'gave' in t[0].leaves()))
def sent(t):
    """Join the leaves of ``t`` into a string, skipping marker tokens.

    Leaves whose first character is ``*``, ``-`` or ``0`` are left out.
    """
    return ' '.join(token for token in t.leaves() if token[0] not in '*-0')
def print_node(t, width):
    """Print a one-line summary of the first three children of ``t``.

    The line shows the text of child 0, then the label and text of
    children 1 and 2. Output longer than ``width`` characters is cut to
    ``width`` and followed by ``...``.
    """
    output = "%s %s: %s / %s: %s" % (
        sent(t[0]), t[1].node, sent(t[1]), t[2].node, sent(t[2]))
    if len(output) > width:
        output = output[:width] + "..."
    print(output)
>>> for tree in nltk.corpus.treebank.parsed_sents():
... for t in tree.subtrees(give):
... print_node(t, 72)
grammar = nltk.parse_pcfg("""
S -> NP VP [1.0]
VP -> TV NP [0.4]
VP -> IV [0.3]
VP -> DatV NP NP [0.3]
TV -> 'saw' [1.0]
IV -> 'ate' [1.0]
DatV -> 'gave' [1.0]
NP -> 'telescopes' [0.8]
NP -> 'Jack' [0.2]
""")
>>> print grammar
viterbi_parser = nltk.ViterbiParser(grammar)
>>> print viterbi_parser.parse(['Jack', 'saw', 'telescopes'])
> sent = "Kim chased Lee"
>>> tokens = sent.split()
>>> lee = {'CAT': 'NP', 'ORTH': 'Lee', 'REF': 'l'}
def lex2fs(word):
    """Look up the feature structure whose ``'ORTH'`` value is ``word``.

    Searches the module-level dicts ``kim``, ``lee`` and ``chase`` in
    that order.
    NOTE(review): only ``lee`` is defined in the nearby lines; ``kim``
    and ``chase`` must be defined before this is called.

    Returns:
        The matching dict, or None when no entry matches.
    """
    for fs in [kim, lee, chase]:
        if fs['ORTH'] == word:
            return fs
    return None
>>> subj, verb, obj = lex2fs(tokens[0]), lex2fs(tokens[1]), lex2fs(tokens[2])
>>> verb['AGT'] = subj['REF'] # agent of 'chase' is Kim
>>> verb['PAT'] = obj['REF'] # patient of 'chase' is Lee
>>> for k in ['ORTH', 'REL', 'AGT', 'PAT']: # check featstruct of 'chase'
... print "%-5s => %s" % (k, verb[k])
>> tokens = 'Kim likes children'.split()
>>> from nltk import load_parser
>>> cp = load_parser('grammars/book_grammars/feat0.fcfg', trace=2)
>>> trees = cp.nbest_parse(tokens)
> fs1 = nltk.FeatStruct(TENSE='past', NUM='sg')
fs1 = nltk.FeatStruct(PER=3, NUM='pl', GND='fem')
> print nltk.FeatStruct("""[NAME='Lee', ADDRESS=(1)[NUMBER=74, STREET='rue Pascal'],
... SPOUSE=[NAME='Kim', ADDRESS->(1)]]""")
>> fs1 = nltk.FeatStruct(NUMBER=74, STREET='rue Pascal')
>>> fs2 = nltk.FeatStruct(CITY='Paris')
>>> print fs1.unify(fs2)
>>> fs0 = nltk.FeatStruct("""[NAME=Lee,
... ADDRESS=[NUMBER=74,
... STREET='rue Pascal'],
... SPOUSE= [NAME=Kim,
... ADDRESS=[NUMBER=74,
... STREET='rue Pascal']]]""")
>>> print fs0
> tokens = 'ich folge den Katzen'.split()
>>> cp = load_parser('grammars/book_grammars/german.fcfg')
>>> for tree in cp.nbest_parse(tokens):
... print tree
>> nltk.data.show_cfg('grammars/book_grammars/sql0.fcfg')
>>> from nltk import load_parser
>>> cp = load_parser('grammars/book_grammars/sql0.fcfg')
>>> query = 'What cities are located in China'
>>> trees = cp.nbest_parse(query.split())
>>> answer = trees[0].node['sem']
>>> q = ' '.join(answer)
>>> print q
lp = nltk.LogicParser()
>>> SnF = lp.parse('SnF')
>>> NotFnS = lp.parse('-FnS')
>>> R = lp.parse('SnF -> -FnS')
>>> prover = nltk.Prover9()
>>> prover.prove(NotFnS, [SnF, R])
> val = nltk.Valuation([('P', True), ('Q', True), ('R', False)])
>> dom = set([])
>>> g = nltk.Assignment(dom)
>> m = nltk.Model(dom, val)
> tlp = nltk.LogicParser(type_check=True)
>>> parsed = tlp.parse('walk(angus)')
>>> parsed.argument
> g = nltk.Assignment(dom, [('x', 'o'), ('y', 'c')])
>> m = nltk.Model(dom, val)
>>> m.evaluate('see(olive, y)', g)
m.evaluate('see(y, x)', g)
> g.purge()
>> m.evaluate('see(olive, y)', g)
>>> m.evaluate('see(bertie, olive) & boy(bertie) & -walk(bertie)', g)
>> m.evaluate('exists x.(girl(x) & walk(x))', g)
> fmla1 = lp.parse('girl(x) | boy(x)')
>>> m.satisfiers(fmla1, 'x', g)
>>> fmla2 = lp.parse('girl(x) -> walk(x)')
>>> m.satisfiers(fmla2, 'x', g)
>>> fmla3 = lp.parse('walk(x) -> girl(x)')
>>> m.satisfiers(fmla3, 'x', g)
> m.evaluate('all x.(girl(x) -> walk(x))', g)
a4 = lp.parse('exists y. (woman(y) & all x. (man(x) -> love(x,y)))')
>>> a5 = lp.parse('man(adam)')
>>> a6 = lp.parse('woman(eve)')
>>> g = lp.parse('love(adam,eve)')
>>> mc = nltk.MaceCommand(g, assumptions=[a4, a5, a6])
>>> mc.build_model()
>> a7 = lp.parse('all x. (man(x) -> -woman(x))')
>>> g = lp.parse('love(adam,eve)')
>>> mc = nltk.MaceCommand(g, assumptions=[a4, a5, a6, a7])
>>> mc.build_model()
> lp = nltk.LogicParser()
>>> e = lp.parse(r'\x.(walk(x) & chew_gum(x))')
>> lp = nltk.LogicParser()
>>> tvp = lp.parse(r'\X x.X(\y.chase(x,y))')
>>> np = lp.parse(r'(\P.exists x.(dog(x) & P(x)))')
>>> vp = nltk.ApplicationExpression(tvp, np)
>>> print vp
>> from nltk import load_parser
>>> parser = load_parser('grammars/book_grammars/simple-sem.fcfg', trace=0)
>>> sentence = 'Angus gives a bone to every dog'
>>> tokens = sentence.split()
>>> trees = parser.nbest_parse(tokens)
>>> for tree in trees:
... print tree.node['SEM']
val = nltk.parse_valuation(v)
>>> g = nltk.Assignment(val.domain)
>>> m = nltk.Model(val.domain, val)
>>> sent = 'Cyril sees every boy'
>>> grammar_file = 'grammars/book_grammars/simple-sem.fcfg'
>>> results = nltk.batch_evaluate([sent], grammar_file, m, g)[0]
>>> for (syntree, semrep, value) in results:
... print semrep
... print value
>> drs2 = dp.parse('([x], [walk(x)]) + ([y], [run(y)])')
dt = nltk.DiscourseTester(['A student dances', 'Every student is a person'])
>>> dt.readings()
>> dt.retract_sentence('No person dances', verbose=True)
>>> from nltk.tag import RegexpTagger
>>> tagger = RegexpTagger(
... [('^(chases|runs)$', 'VB'),
... ('^(a)$', 'ex_quant'),
... ('^(every)$', 'univ_quant'),
... ('^(dog|boy)$', 'NN'),
... ('^(He)$', 'PRP')
... ])
>>> rc = nltk.DrtGlueReadingCommand(depparser=nltk.MaltParser(tagger=tagger))
>>> dt = nltk.DiscourseTester(['Every dog chases a boy', 'He runs'], rc)
>>> dt.readings()
> import csv
>>> lexicon = csv.reader(open('dict.csv'))
>>> pairs = [(lexeme, defn) for (lexeme, _, _, defn) in lexicon]
>>> lexemes, defns = zip(*pairs)
>>> defn_words = set(w for defn in defns for w in defn.split())
>>> sorted(defn_words.difference(lexemes))
>>> idx = nltk.Index((defn_word, lexeme)
... for (lexeme, defn) in pairs
... for defn_word in nltk.word_tokenize(defn)
... if len(defn_word) > 3)
>>> idx_file = open("dict.idx", "w")
>>> for word in sorted(idx):
... idx_words = ', '.join(idx[word])
... idx_line = "%s: %s\n" % (word, idx_words)
... idx_file.write(idx_line)
>>> idx_file.close()
> merchant_file = nltk.data.find('corpora/shakespeare/merchant.xml')
>>> raw = open(merchant_file).read()
>>> print raw[0:168]
>> from nltk.etree.ElementTree import ElementTree
>>> merchant = ElementTree().parse(merchant_file)
>>> for i, act in enumerate(merchant.findall('ACT')):
... for j, scene in enumerate(act.findall('SCENE')):
... for k, speech in enumerate(scene.findall('SPEECH')):
... for line in speech.findall('LINE'):
... if 'music' in str(line.text):
... print "Act %d Scene %d Speech %d: %s" % (i+1, j+1, k+1, line.text)
>>> speaker_seq = [s.text for s in merchant.findall('ACT/SCENE/SPEECH/SPEAKER')]
>>> speaker_freq = nltk.FreqDist(speaker_seq)
>>> top5 = speaker_freq.keys()[:5]
import sys
>>> from nltk.etree.ElementTree import ElementTree
>>> tree = ElementTree(lexicon[3])
>>> tree.write(sys.stdout)
from nltk.corpus import toolbox
>>> lexicon = toolbox.xml('rotokas.dic')
>>> sum(len(entry) for entry in lexicon) / len(lexicon)
from nltk.etree.ElementTree import SubElement
def cv(s):
    """Reduce a string to its consonant/vowel skeleton.

    The input is lower-cased; every character outside ``a-z`` becomes
    ``_``, every ``a e i o u`` becomes ``V`` and every remaining letter
    becomes ``C``.
    """
    s = s.lower()
    s = re.sub(r'[^a-z]', r'_', s)
    s = re.sub(r'[aeiou]', r'V', s)
    # Anything that is neither V nor _ at this point is a consonant.
    s = re.sub(r'[^V_]', r'C', s)
    return s
def add_cv_field(entry):
    """Append a ``cv`` child to ``entry`` for each of its ``lx`` fields.

    Args:
        entry: An element whose children are the entry's fields. It is
            modified in place: for every child tagged ``lx`` a new ``cv``
            child is appended whose text is ``cv()`` of the ``lx`` text.
    """
    # Iterate over a snapshot: the loop body appends to ``entry``.
    for field in list(entry):
        if field.tag == 'lx':
            cv_field = SubElement(entry, 'cv')
            cv_field.text = cv(field.text)
>>> lexicon = toolbox.xml('rotokas.dic')
>>> add_cv_field(lexicon[53])
>>> print nltk.to_sfm_string(lexicon[53])
> fd = nltk.FreqDist(':'.join(field.tag for field in entry) for entry in lexicon)
grammar = nltk.parse_cfg('''
S -> Head PS Glosses Comment Date Sem_Field Examples
Head -> Lexeme Root
Lexeme -> "lx"
Root -> "rt" |
PS -> "ps"
Glosses -> Gloss Glosses |
Gloss -> "ge" | "tkp" | "eng"
Date -> "dt"
Sem_Field -> "sf"
Examples -> Example Ex_Pidgin Ex_English Examples |
Example -> "ex"
Ex_Pidgin -> "xp"
Ex_English -> "xe"
Comment -> "cmt" | "nt" |
''')
def validate_lexicon(grammar, lexicon, ignored_tags):
    """Check each entry's field order against ``grammar`` and report it.

    For every entry, the tags of its fields (minus ``ignored_tags``) are
    parsed with a recursive descent parser. The tag sequence is printed
    joined by ``:``, prefixed with ``+`` when at least one parse is found
    and with ``-`` otherwise.

    Args:
        grammar: Grammar over field tags, accepted by
            ``nltk.RecursiveDescentParser``.
        lexicon: Iterable of entries, each an iterable of fields with a
            ``tag`` attribute.
        ignored_tags: Container of tags to leave out before parsing.
    """
    rd_parser = nltk.RecursiveDescentParser(grammar)
    for entry in lexicon:
        marker_list = [field.tag for field in entry
                       if field.tag not in ignored_tags]
        # Exactly one verdict per entry; without the else branch every
        # accepted entry was also reported as rejected.
        if rd_parser.nbest_parse(marker_list):
            print("+ " + ':'.join(marker_list))
        else:
            print("- " + ':'.join(marker_list))
>>> lexicon = toolbox.xml('rotokas.dic')[10:20]
>>> ignored_tags = ['arg', 'dcsv', 'pt', 'vx'] ?
>>> validate_lexicon(grammar, lexicon, ignored_tags)