foo = 'Monty'
bar = foo
foo = 'Python' # rebinding foo to the new string 'Python' leaves bar bound to 'Monty'
foo = ['Monty', 'Python']
bar = foo
foo[1] = 'Bodkin'
bar # `bar = foo` copies only the object reference, not the list contents, so bar shows the change
['Monty', 'Bodkin']
实际上,foo 和bar 这两个名字引用的是计算机内存中同一个链表对象;通过foo 所做的更新也会体现在bar 上,反之亦然。
empty = []
nested = [empty, empty, empty]
[[], [], []]
[['Python'], ['Python'], ['Python']]
nested = [[]] * 3
[[], [], []]
nested[1].append('Python') #修改链表中的一个元素,所有的元素都改变了
[['Python'], ['Python'], ['Python']]
(80141320, 80141320, 80141320)
nested[1] = ['Monty'] #当我们分配一个新值给链表中的一个元素时,它并不会传送给其他元素
[['Python'], ['Monty'], ['Python']]
size = 5
python = ['Python']
snake_nest = [python] * size
snake_nest[0] == snake_nest[1] == snake_nest[2] == snake_nest[3] == snake_nest[4]
snake_nest[0] is snake_nest[1] is snake_nest[2] is snake_nest[3] is snake_nest[4]
import random
position = random.choice(range(size))
snake_nest[position] = ['Python']
[['Python'], ['Python'], ['Python'], ['Python'], ['Python']]
snake_nest[0] == snake_nest[1] == snake_nest[2] == snake_nest[3] == snake_nest[4] # == only checks that the values are equal
snake_nest[0] is snake_nest[1] is snake_nest[2] is snake_nest[3] is snake_nest[4] # is checks object identity: every operand must be the very same object
[id(snake) for snake in snake_nest] # id() exposes each object's identity, revealing which element was replaced
[79659272, 79659272, 79659272, 79659208, 79659272]
mixed = ['cat', '', ['dog'], []]
for element in mixed:
    # A non-empty string or list is truthy and an empty one is falsy,
    # so there is no need to write `len(element) > 0` in the condition.
    if element:
        print(element)
animals = ['cat', 'dog']
if 'cat' in animals:
    print(1)
elif 'dog' in animals:
    # The `if` condition above is already satisfied, so Python never
    # evaluates this `elif` and the program never prints 2.
    print(2)
# Two independent `if` statements are each evaluated on their own,
# so here both 1 and 2 are printed.
if 'cat' in animals:
    print(1)
if 'dog' in animals:
    print(2)
elif 子句比单独的if 子句潜在地给我们更多信息;当它被判定为真时,它告诉我们不仅该条件满足,而且前面的if 子句的条件不满足。
sent = ['No', 'good', 'fish', 'goes', 'anywhere', 'without', 'a', 'porpoise', '.']
all(len(w) > 4 for w in sent)
any(len(w) > 4 for w in sent)
t = 'walk', 'fem', 3
('walk', 'fem', 3)
t= 'snark',
t= ()
raw = 'I turned off the spectroroute' #字符串
text = ['I', 'turned', 'off', 'the', 'spectroroute'] #链表
pair = (6, 'turned') #元组
raw[2], text[3], pair[1]
('t', 'the', 'turned')
raw[-3:], text[-3:], pair[-3:]
('ute', ['off', 'the', 'spectroroute'], (6, 'turned'))
len(raw), len(text), len(pair)
(29, 5, 2)
set(text) #定义一个集合
{'I', 'off', 'spectroroute', 'the', 'turned'}
表4.1. 遍历序列的各种方式
Python 表达式 | 评论 |
for item in s | 遍历s 中的元素 |
for item in sorted(s) | 按顺序遍历s 中的元素 |
for item in set(s) | 遍历s 中的无重复的元素 |
for item in reversed(s) | 按逆序遍历s 中的元素 |
for item in set(s).difference(t) | 遍历在集合s 中不在集合t 的元素 |
for item in random.sample(s, len(s)) | 按随机顺序遍历s 中的元素(random.shuffle(s) 就地打乱并返回None,不能直接用于遍历) |
raw = 'Red lorry, yellow lorry, red lorry, yellow lorry.'
import nltk
from nltk import word_tokenize
text = nltk.word_tokenize(raw)
fdist = nltk.FreqDist(text)
['yellow', 'Red', '.', 'lorry', ',', 'red']
for key in fdist:
words = ['I', 'turned', 'off', 'the', 'spectroroute']
words[2], words[3], words[4] = words[3], words[4], words[2]
['I', 'turned', 'the', 'spectroroute', 'off']
tmp = words[2]
words[2] = words[3]
words[3] = words[4]
words[4] = tmp
['I', 'turned', 'spectroroute', 'off', 'the']
words = ['I', 'turned', 'off', 'the', 'spectroroute']
tags = ['noun', 'verb', 'prep', 'det', 'noun']
list(zip(words, tags)) #zip()取两个或两个以上的序列中的项目,将它们“压缩”打包成单个的配对链表。
[('I', 'noun'),
('turned', 'verb'),
('off', 'prep'),
('the', 'det'),
('spectroroute', 'noun')]
list(enumerate(words)) #给定一个序列words,enumerate(words)返回一个包含索引和索引处项目的配对。
[(0, 'I'), (1, 'turned'), (2, 'off'), (3, 'the'), (4, 'spectroroute')]
text = nltk.corpus.nps_chat.words()
cut = int(0.9 * len(text)) #分割数据,90%的数据来“训练”一个系统,剩余10%进行测试
training_data, test_data = text[:cut], text[cut:]
text == training_data + test_data
len(training_data) / len(test_data)
words = 'I turned off the spectroroute'.split()
wordlens = [(len(word), word) for word in words]
# Sort the (length, word) pairs: shortest first, ties broken alphabetically.
wordlens.sort()
# The underscore is an ordinary Python variable; by convention it names a
# value that is deliberately ignored.
' '.join(w for (_, w) in wordlens)
'I off the turned spectroroute'
# Each entry is a (word, part-of-speech, pronunciations) tuple.
lexicon = [
    ('the', 'det', ['Di:', 'D@']),
    ('off', 'prep', ['Qf', 'O:f']),
]
# A list is mutable: it can be sorted in place and have items reassigned.
lexicon.sort()
lexicon[1] = ('turned', 'VBD', ['t3:nd', 't3`nd'])
[('off', 'prep', ['Qf', 'O:f']), ('turned', 'VBD', ['t3:nd', 't3`nd'])]
del lexicon[0]
[('turned', 'VBD', ['t3:nd', 't3`nd'])]
lexicon = tuple(lexicon)
(('turned', 'VBD', ['t3:nd', 't3`nd']),)
lexicon[1] = ('turned', 'VBD', ['t3:nd', 't3`nd'])
TypeError: 'tuple' object does not support item assignment
in ()
----> 1 lexicon[1] = ('turned', 'VBD', ['t3:nd', 't3`nd'])
TypeError: 'tuple' object does not support item assignment
# The REPL continuation prompt must not appear inside the literal,
# otherwise it becomes part of the string and is tokenized as text.
text = '''"When I use a word," Humpty Dumpty said in rather a scornful tone,
"it means just what I choose it to mean - neither more nor less."'''
[w.lower() for w in nltk.word_tokenize(text)][:5] #产生器表达式
['``', 'when', 'i', 'use', 'a']
max([w.lower() for w in nltk.word_tokenize(text)])
max(w.lower() for w in nltk.word_tokenize(text))
Python 代码风格指南:
cv_word_pairs = [(cv, w) for w in rotokas_words
… for cv in re.findall(’[ptksvr][aeiou]’, w)]
cfd = nltk.ConditionalFreqDist(
… (genre, word)
… for genre in brown.categories()
… for word in brown.words(categories=genre))
# Continuation lines inside the brackets need no prompt or backslash.
ha_words = ['aaahhhh', 'ah', 'ahah', 'ahahah', 'ahh', 'ahhahahaha',
            'ahhh', 'ahhhh', 'ahhhhhh', 'ahhhhhhhhhhhhhh', 'ha',
            'haaa', 'hah', 'haha', 'hahaaa', 'hahah', 'hahaha']
if (len(syllables) > 4 and len(syllables[2]) == 3 and
… syllables[2][2] in [aeiou] and syllables[2][3] == syllables[1][3]):
… process(syllables)
if len(syllables) > 4 and len(syllables[2]) == 3 and
… syllables[2][2] in [aeiou] and syllables[2][3] == syllables[1][3]:
… process(syllables)
tokens = nltk.corpus.brown.words(categories='news')
count = 0
total = 0
for token in tokens:
count += 1
total += len(token)
print(total / count)
total = sum(len(t) for t in tokens) #生成器表达式
print(total / len(tokens))
# Procedural construction of a sorted list of the distinct tokens.
word_list = []
len_word_list = len(word_list)
i = 0
while i < len(tokens):
    j = 0
    # Advance j to the first entry that is not smaller than the token.
    while j < len_word_list and word_list[j] < tokens[i]:
        j += 1
    # Insert when the token sorts after every entry (j is past the end, so
    # word_list[j] must not be read) or when the entry at j is a different
    # word; an equal entry means the token is already present.
    if j == len_word_list or tokens[i] != word_list[j]:
        word_list.insert(j, tokens[i])
        len_word_list += 1
    i += 1
word_list = sorted(set(tokens)) #等效的声明版本使用熟悉的内置函数
# Print the most frequent words, with their running share of all tokens,
# until together they account for more than 25% of the corpus.
fd = nltk.FreqDist(nltk.corpus.brown.words())
cumulative = 0.0
most_common_words = [word for (word, count) in fd.most_common()]
for rank, word in enumerate(most_common_words):
    cumulative += fd.freq(word)
    print("%3d %6.2f%% %s" % (rank + 1, cumulative * 100, word))
    if cumulative > 0.25:
        break
1 5.40% the
2 10.42% ,
3 14.67% .
4 17.78% of
5 20.19% and
6 22.40% to
7 24.29% a
8 25.97% in
text = nltk.corpus.gutenberg.words('milton-paradise.txt')
longest = ''
for word in text:
if len(word) > len(longest):
longest = word
maxlen = max(len(word) for word in text)
[word for word in text if len(word) == maxlen]
sent = ['The', 'dog', 'gave', 'John', 'the', 'newspaper']
n = 3
[sent[i:i+n] for i in range(len(sent)-n+1)]
[['The', 'dog', 'gave'],
['dog', 'gave', 'John'],
['gave', 'John', 'the'],
['John', 'the', 'newspaper']]
确保循环变量范围的正确是相当棘手的。因为这是NLP 中的常见操作,NLTK 提供了支持函数bigrams(text)、trigrams(text)和一个更通用的ngrams(text, n)。
建立一个m 行n 列的数组,其中每个元素是一个集合
import pprint
m, n = 3, 7
# Build each cell with its own set() call: multiplying a list (e.g. [set()] * n)
# would make every cell refer to one shared set.
array = [[set() for i in range(n)] for j in range(m)]
array[2][5].add('Alice')
pprint.pprint(array)
[[set(), set(), set(), set(), set(), set(), set()],
[set(), set(), set(), set(), set(), set(), set()],
[set(), set(), set(), set(), set(), {'Alice'}, set()]]
#例4-1. 从文件读取文本
import re
def get_text(file):
    """Read text from a file, normalizing whitespace and stripping HTML markup.

    :param file: path of the file to read
    :return: the file contents with every whitespace run collapsed to one
        space and every ``<...>`` tag replaced by one space
    :rtype: str
    """
    # The context manager closes the file even if read() raises.
    with open(file) as handle:
        text = handle.read()
    # Raw strings keep the backslash in the pattern from being treated
    # as a (deprecated) string escape.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'<.*?>', ' ', text)
    return text
任何时候我们想从一个HTML 文件得到干净的文字,都可以用文件的名字作为唯一的参数调用get_text()。它会返回一个字符串,我们可以将它指定给一个变量,例如:contents = get_text("test.html")。
Help on function get_text in module __main__:
Read text from a file, normalizing whitespace and stripping HTML markup.
def repeat(msg, num):
    """Return ``msg`` repeated ``num`` times, separated by single spaces."""
    copies = [msg for _ in range(num)]
    return ' '.join(copies)
monty = 'Monty Python'
repeat(monty, 3)
'Monty Python Monty Python Monty Python'
def monty():
    """Return the fixed phrase ``"Monty Python"``."""
    phrase = "Monty Python"
    return phrase
'Monty Python'
repeat(monty(), 3)
'Monty Python Monty Python Monty Python'
repeat('Monty Python', 3)
'Monty Python Monty Python Monty Python'
def my_sort1(mylist):      # good: modifies its argument, no return value
    """Sort ``mylist`` in place and return ``None``."""
    mylist.sort()
def my_sort2(mylist):      # good: doesn't touch its argument, returns value
    """Return a new sorted list built from ``mylist``; the argument is left unchanged."""
    ordered = list(mylist)
    ordered.sort()
    return ordered
def my_sort3(mylist):      # bad: modifies its argument and also returns it
    """Sort ``mylist`` in place and also return that same list object.

    Kept as the counter-example: mixing mutation with a return value
    hides from callers that their argument was changed.
    """
    mylist.sort()
    return mylist
将一个空字符串分配给w,将一个空链表分配给p。调用该函数后,w 没有变,而p 改变了
def set_up(word, properties):
    """Demonstrate call-by-object-reference: rebinding vs. mutating a parameter.

    :param word: a string; only the local name is rebound, so the caller's
        value is unaffected
    :param properties: a list; it is mutated in place before the local name
        is rebound, so the caller sees the appended item
    """
    word = 'lolcat'            # rebinds the local name only
    properties.append('noun')  # mutates the caller's list object
    properties = 5             # rebinds the local name only
w = ''
p = []
set_up(w, p)
w = ''
word = w
word = 'lolcat'
p = []
properties = p
properties = 5
TypeError Traceback (most recent call last)
in ()
1 p = []
2 properties = p
----> 3 properties.append['noun']
4 properties = 5
5 p
TypeError: 'builtin_function_or_method' object is not subscriptable
注意!一个函数可以使用global 声明创建一个新的全局变量。然而,这种做法应尽可能避免:在函数内部定义全局变量会引入对上下文的依赖,限制函数的可移植性(或可重用性)。
def tag(word):
    """Return ``'det'`` for the determiners 'a', 'the' and 'all', otherwise ``'noun'``."""
    return 'det' if word in ['a', 'the', 'all'] else 'noun'
tag(["'Tis", 'but', 'a', 'scratch'])
def tag(word):
    """Return ``'det'`` for the determiners 'a', 'the' and 'all', otherwise ``'noun'``.

    :param word: the word to tag
    :type word: str
    :raises AssertionError: if ``word`` is not a string
    """
    # Python 3 has no `basestring`; `str` is the single text type.
    assert isinstance(word, str), "argument to tag() must be a string"
    if word in ['a', 'the', 'all']:
        return 'det'
    return 'noun'
tag(["'Tis", 'but', 'a', 'scratch']) #防御性编程
NameError: name 'basestring' is not defined
in ()
----> 1 tag(["'Tis", 'but', 'a', 'scratch']) #防御性编程
in tag(word)
1 def tag(word):
----> 2 assert isinstance(word, basestring), "argument to tag() must be a string"
3 if word in ['a', 'the', 'all']:
4 return 'det'
5 else:
NameError: name 'basestring' is not defined
#例4-2. 设计不佳的函数用来计算高频词。
from urllib import request
from bs4 import BeautifulSoup
def freq_words(url, freqdist, n):
    """Print the ``n`` most frequent lowercased tokens of the page at ``url``.

    This is the deliberately poorly designed variant: it updates the
    caller-supplied ``freqdist`` as a side effect and prints its result
    instead of returning it.

    :param url: address of the HTML page to download
    :param freqdist: frequency distribution that is updated in place
    :param n: number of most frequent words to print
    """
    html = request.urlopen(url).read().decode('utf8')
    # Naming the parser explicitly avoids bs4's "No parser was explicitly
    # specified" warning and makes the result independent of what is installed.
    raw = BeautifulSoup(html, 'html.parser').get_text()
    for word in word_tokenize(raw):
        freqdist[word.lower()] += 1
    result = []
    for word, count in freqdist.most_common(n):
        result = result + [word]
    print(result)
constitution = ""
fd = nltk.FreqDist()
freq_words(constitution, fd, 30)
["''", '//', ':', ')', '(', 'https', '``', 'http', 'location.replace', ',', ';', 'location.href.replace']
C:\Program Files\Anaconda3\lib\site-packages\bs4\ UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system ("lxml").
The code that caused this warning is on line 193 of the file C:\Program Files\Anaconda3\lib\ To get rid of this warning, change code that looks like this:
BeautifulSoup([your markup])
to this:
BeautifulSoup([your markup], "lxml")
from urllib import request
from bs4 import BeautifulSoup
def freq_words(url, n):
    """Return the ``n`` most frequent lowercased tokens of the page at ``url``.

    :param url: address of the HTML page to download
    :param n: number of most frequent words to return
    :return: the words, most frequent first
    :rtype: list
    """
    html = request.urlopen(url).read().decode('utf8')
    # Naming the parser explicitly avoids bs4's "No parser was explicitly
    # specified" warning and makes the result independent of what is installed.
    text = BeautifulSoup(html, 'html.parser').get_text()
    freqdist = nltk.FreqDist(word.lower() for word in word_tokenize(text))
    # Read from the local distribution built above, not from a global `fd`.
    return [word for (word, _) in freqdist.most_common(n)]
freq_words(constitution, 30)
C:\Program Files\Anaconda3\lib\site-packages\bs4\ UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system ("lxml").
The code that caused this warning is on line 193 of the file C:\Program Files\Anaconda3\lib\ To get rid of this warning, change code that looks like this:
BeautifulSoup([your markup])
to this:
BeautifulSoup([your markup], "lxml")
docstring 中可以包括一个doctest 块,说明函数的用法和预期的输出。这些都可以使用
Python 的doctest 模块自动测试。docstring 中应当记录函数的每个参数的类型和返回类型。
例4-4. 一个完整的docstring 的演示,包括一行总结,一个更详细的解释,一个doctest 例
子以及说明参数、类型、返回值和异常的reStructuredText(Sphinx)字段标记。
def accuracy(reference, test):
    """
    Calculate the fraction of test items that equal the corresponding reference items.

    Given a list of reference values and a corresponding list of test values,
    return the fraction of corresponding values that are equal.
    In particular, return the fraction of indexes ``0 <= i < len(test)``
    such that ``test[i] == reference[i]``.

        >>> accuracy(['ADJ', 'N', 'V', 'N'], ['N', 'N', 'V', 'ADJ'])
        0.5

    :param reference: An ordered list of reference values
    :type reference: list
    :param test: A list of values to compare against the corresponding
        reference values
    :type test: list
    :return: the accuracy score
    :rtype: float
    :raises ValueError: If reference and test do not have the same length
    :raises ZeroDivisionError: If both lists are empty
    """
    if len(reference) != len(test):
        raise ValueError("Lists must have the same length.")
    num_correct = sum(1 for x, y in zip(reference, test) if x == y)
    return float(num_correct) / len(reference)
# Continuation lines inside the brackets need no prompt or backslash.
sent = ['Take', 'care', 'of', 'the', 'sense', ',', 'and', 'the',
        'sounds', 'will', 'take', 'care', 'of', 'themselves', '.']
def extract_property(prop):
    """Apply the callable ``prop`` to each word of the module-level ``sent`` and return the results as a list."""
    return list(map(prop, sent))
extract_property(len) #传递内置函数len()
[4, 4, 2, 3, 5, 1, 3, 3, 6, 4, 4, 4, 2, 10, 1]
def last_letter(word):
    """Return the final item of ``word`` (its last character for a string)."""
    final_index = len(word) - 1
    return word[final_index]
extract_property(last_letter) #用户定义的函数last_letter()
['e', 'e', 'f', 'e', 'e', ',', 'd', 'e', 's', 'l', 'e', 'e', 'f', 's', '.']
extract_property(lambda w: w[-1]) # lambda 表达式
['e', 'e', 'f', 'e', 'e', ',', 'd', 'e', 's', 'l', 'e', 'e', 'f', 's', '.']
例4-5. 累计输出到一个链表
def search1(substring, words):
    """Return a list of the items of ``words`` that contain ``substring``.

    The whole result list is accumulated before anything is returned.

    :param substring: text to look for inside each word
    :param words: iterable of strings to search
    :return: the matching words, in their original order
    :rtype: list
    """
    result = []
    for word in words:
        if substring in word:
            result.append(word)
    return result
def search2(substring, words):
    """Lazily yield, one at a time, the items of ``words`` that contain ``substring``."""
    yield from (candidate for candidate in words if substring in candidate)
for item in search1('zz', nltk.corpus.brown.words()):
print(item, end=" ")
Grizzlies' fizzled Rizzuto huzzahs dazzler jazz Pezza Pezza Pezza embezzling embezzlement pizza jazz Ozzie nozzle drizzly puzzle puzzle dazzling Sizzling guzzle puzzles dazzling jazz jazz Jazz jazz Jazz jazz jazz Jazz jazz jazz jazz Jazz jazz dizzy jazz Jazz puzzler jazz jazzmen jazz jazz Jazz Jazz Jazz jazz Jazz jazz jazz jazz Jazz jazz jazz jazz jazz jazz jazz jazz jazz jazz Jazz Jazz jazz jazz nozzles nozzle puzzle buzz puzzle blizzard blizzard sizzling puzzled puzzle puzzle muzzle muzzle muezzin blizzard Neo-Jazz jazz muzzle piazzas puzzles puzzles embezzle buzzed snazzy buzzes puzzled puzzled muzzle whizzing jazz Belshazzar Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie's Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie blizzard blizzards blizzard blizzard fuzzy Lazzeri Piazza piazza palazzi Piazza Piazza Palazzo Palazzo Palazzo Piazza Piazza Palazzo palazzo palazzo Palazzo Palazzo Piazza piazza piazza piazza Piazza Piazza Palazzo palazzo Piazza piazza pizza Piazza Palazzo palazzo dazzling puzzling Wozzek dazzling dazzling buzzing Jazz jazz Jazz Jazz jazz jazz jazz jazz Jazz jazz jazz jazz Fuzzy Lizzy Lizzy jazz fuzzy puzzles puzzling puzzling dazzle puzzle dazzling puzzled jazz jazz jazz jazzy whizzed frazzled quizzical puzzling poetry-and-jazz poetry-and-jazz jazz jazz jazz jazz jazz jazz jazz Jazz jazz jazz jazz poetry-and-jazz jazz jazz jazz Dizzy jazz jazz jazz jazz jazz poetry-and-jazz jazz jazz jazz jazz jazz jazz jazz jazz jazz jazz jazz jazz dazzled bedazzlement bedazzled Piazzo nozzles nozzles buzzing dazzles dizzy puzzling puzzling puzzling puzzle muzzle puzzled nozzle Pozzatti Pozzatti Pozzatti puzzled Pozzatti Pozzatti dazzling pizzicato Jazz jazz jazz jazz jazz nozzle grizzled fuzzy muzzle puzzled puzzle muzzle blizzard buzz dizzily drizzle drizzle drizzle sizzled puzzled puzzled puzzled fuzzed buzz buzz buzz buzz-buzz-buzz buzzes fuzzy frizzled drizzle drizzle drizzling drizzling fuzz jazz jazz fuzz puzzle puzzling 
Nozze mezzo puzzled puzzled dazzling muzzle muzzle muzzle buzzed whizzed sizzled palazzos puzzlement frizzling puzzled puzzled puzzled dazzling muzzles fuzzy jazz ex-jazz sizzle grizzly guzzled buzzing fuzz nuzzled Kizzie Kizzie Kizzie Kezziah Kizzie Kizzie Buzz's Buzz Buzz Buzz Buzz Buzz Buzz Buzz Buzz dizzy piazza buzzing Puzzled dizziness dazzled Piazza Carrozza fuzzy dizzy buzzing buzzing puzzled puzzling puzzled puzzled Quizzical pizza
for item in search2('zz', nltk.corpus.brown.words()):
print(item, end=" ")
Grizzlies' fizzled Rizzuto huzzahs dazzler jazz Pezza Pezza Pezza embezzling embezzlement pizza jazz Ozzie nozzle drizzly puzzle puzzle dazzling Sizzling guzzle puzzles dazzling jazz jazz Jazz jazz Jazz jazz jazz Jazz jazz jazz jazz Jazz jazz dizzy jazz Jazz puzzler jazz jazzmen jazz jazz Jazz Jazz Jazz jazz Jazz jazz jazz jazz Jazz jazz jazz jazz jazz jazz jazz jazz jazz jazz Jazz Jazz jazz jazz nozzles nozzle puzzle buzz puzzle blizzard blizzard sizzling puzzled puzzle puzzle muzzle muzzle muezzin blizzard Neo-Jazz jazz muzzle piazzas puzzles puzzles embezzle buzzed snazzy buzzes puzzled puzzled muzzle whizzing jazz Belshazzar Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie's Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie blizzard blizzards blizzard blizzard fuzzy Lazzeri Piazza piazza palazzi Piazza Piazza Palazzo Palazzo Palazzo Piazza Piazza Palazzo palazzo palazzo Palazzo Palazzo Piazza piazza piazza piazza Piazza Piazza Palazzo palazzo Piazza piazza pizza Piazza Palazzo palazzo dazzling puzzling Wozzek dazzling dazzling buzzing Jazz jazz Jazz Jazz jazz jazz jazz jazz Jazz jazz jazz jazz Fuzzy Lizzy Lizzy jazz fuzzy puzzles puzzling puzzling dazzle puzzle dazzling puzzled jazz jazz jazz jazzy whizzed frazzled quizzical puzzling poetry-and-jazz poetry-and-jazz jazz jazz jazz jazz jazz jazz jazz Jazz jazz jazz jazz poetry-and-jazz jazz jazz jazz Dizzy jazz jazz jazz jazz jazz poetry-and-jazz jazz jazz jazz jazz jazz jazz jazz jazz jazz jazz jazz jazz dazzled bedazzlement bedazzled Piazzo nozzles nozzles buzzing dazzles dizzy puzzling puzzling puzzling puzzle muzzle puzzled nozzle Pozzatti Pozzatti Pozzatti puzzled Pozzatti Pozzatti dazzling pizzicato Jazz jazz jazz jazz jazz nozzle grizzled fuzzy muzzle puzzled puzzle muzzle blizzard buzz dizzily drizzle drizzle drizzle sizzled puzzled puzzled puzzled fuzzed buzz buzz buzz buzz-buzz-buzz buzzes fuzzy frizzled drizzle drizzle drizzling drizzling fuzz jazz jazz fuzz puzzle puzzling 
Nozze mezzo puzzled puzzled dazzling muzzle muzzle muzzle buzzed whizzed sizzled palazzos puzzlement frizzling puzzled puzzled puzzled dazzling muzzles fuzzy jazz ex-jazz sizzle grizzly guzzled buzzing fuzz nuzzled Kizzie Kizzie Kizzie Kezziah Kizzie Kizzie Buzz's Buzz Buzz Buzz Buzz Buzz Buzz Buzz Buzz dizzy piazza buzzing Puzzled dizziness dazzled Piazza Carrozza fuzzy dizzy buzzing buzzing puzzled puzzling puzzled puzzled Quizzical pizza
def permutations(seq):
    """Yield every permutation of the sliceable sequence ``seq``.

    Each permutation has the same type as ``seq`` (it is built from slices
    joined with ``+``). A sequence of length 0 or 1 yields itself once.
    """
    if len(seq) <= 1:
        yield seq
        # Stop here: falling through into the loop below would recurse
        # on an empty sequence without end.
        return
    head = seq[0:1]
    for perm in permutations(seq[1:]):
        # Insert the first item at every position of each shorter permutation.
        for i in range(len(perm) + 1):
            yield perm[:i] + head + perm[i:]
list(permutations(['police', 'fish', 'buffalo']))
[['police', 'fish', 'buffalo'],
['fish', 'police', 'buffalo'],
['fish', 'buffalo', 'police'],
['police', 'buffalo', 'fish'],
['buffalo', 'police', 'fish'],
['buffalo', 'fish', 'police']]
def is_content_word(word):
    """Return True unless the lowercased ``word`` is one of a few function words or punctuation marks."""
    non_content = ['a', 'of', 'the', 'and', 'will', ',', '.']
    lowered = word.lower()
    return not (lowered in non_content)
# Continuation lines inside the brackets need no prompt or backslash.
sent = ['Take', 'care', 'of', 'the', 'sense', ',', 'and', 'the',
        'sounds', 'will', 'take', 'care', 'of', 'themselves', '.']
list(filter(is_content_word, sent))
['Take', 'care', 'sense', 'sounds', 'take', 'care', 'themselves']
[w for w in sent if is_content_word(w)]
['Take', 'care', 'sense', 'sounds', 'take', 'care', 'themselves']
lengths = list(map(len, nltk.corpus.brown.sents(categories='news')))
sum(lengths) / len(lengths)
lengths = [len(sent) for sent in nltk.corpus.brown.sents(categories='news')]
sum(lengths) / len(lengths)
def repeat(msg='', num=1):
    """Return ``msg`` concatenated with itself ``num`` times (both arguments are optional)."""
    repeated = msg * num
    return repeated
repeat(num=3) #关键字参数
repeat(msg='Alice') #关键字参数
repeat(num=5, msg='Alice') #关键字参数
def generic(*args, **kwargs):
    """Print the positional arguments, then the keyword arguments.

    Accepts any number of unnamed and named arguments: ``args`` is the
    tuple of all unnamed arguments and ``kwargs`` is the dictionary of
    all named ones.
    """
    print(args)
    print(kwargs)
generic(1, "African swallow", monty="python") #当*args 作为函数参数时,它实际上对应函数所有的未命名参数。
(1, 'African swallow')
{'monty': 'python'}
# Continuation lines inside the brackets need no prompt or backslash.
song = [['four', 'calling', 'birds'],
        ['three', 'French', 'hens'],
        ['two', 'turtle', 'doves']]
list(zip(song[0], song[1], song[2]))
[('four', 'three', 'two'),
('calling', 'French', 'turtle'),
('birds', 'hens', 'doves')]
list(zip(*song)) #*song 仅仅是一个方便的记号,相当于输入了song[0],song[1],song[2]
[('four', 'three', 'two'),
('calling', 'French', 'turtle'),
('birds', 'hens', 'doves')]
def freq_words(file, min=1, num=10):
    """Return the ``num`` most frequent tokens of ``file`` that have at least ``min`` characters.

    :param file: path of the text file to read
    :param min: minimum token length to count (the name shadows the builtin
        but is kept because callers pass it by keyword)
    :param num: number of (token, count) pairs to return
    :return: list of (token, count) pairs, most frequent first
    """
    # The context manager closes the file even if read() raises.
    with open(file) as handle:
        text = handle.read()
    tokens = word_tokenize(text)
    freqdist = nltk.FreqDist(t for t in tokens if len(t) >= min)
    return freqdist.most_common(num)
# The three calls are equivalent: positional arguments, keyword arguments,
# and keyword arguments given in a different order.
fw = freq_words('ch01.rst', 4, 10)
fw = freq_words('ch01.rst', min=4, num=10)
fw = freq_words('ch01.rst', num=10, min=4)
设置了verbose 标志将会报告其进展情况:
def freq_words(file, min=1, num=10, verbose=False):
    """Return the ``num`` most frequent tokens of ``file`` with at least ``min`` characters.

    :param file: path of the text file to read
    :param min: minimum token length to count (the name shadows the builtin
        but is kept because callers pass it by keyword)
    :param num: number of (token, count) pairs to return
    :param verbose: when true, report progress on standard output
    :return: list of (token, count) pairs, most frequent first
    """
    freqdist = nltk.FreqDist()
    if verbose:
        print("Opening", file)
    # The context manager closes the file even if read() raises.
    with open(file) as handle:
        text = handle.read()
    if verbose:
        # Report the size of the contents, not the length of the file name.
        print("Read in %d characters" % len(text))
    for word in word_tokenize(text):
        if len(word) >= min:
            freqdist[word] += 1
            if verbose and freqdist.N() % 100 == 0:
                # end="" keeps the progress dots on a single line.
                print(".", end="")
    if verbose:
        print()  # finish the line of progress dots
    return freqdist.most_common(num)
Python 模块只是一些单独的.py 文件,from module import * 导入这个模块
Python 提供了一个调试器,它允许你监视程序的执行,指定程序暂停运行的行号(即断点),逐步调试代码段和检查变量的值。
import pdb
# import mymodule
解决一个大小为n 的问题,可以将其分成两半,然后处理一个或多个大小为n/2 的问题。一种一般的方式来实现这种方法是使用递归。
def factorial1(n):
    """Compute n! iteratively; returns 1 when ``n`` is zero or negative."""
    product = 1
    for factor in range(1, n + 1):
        product *= factor
    return product
def factorial2(n):
    """Compute n! recursively; returns 1 when ``n`` is 1 or smaller."""
    # `<= 1` rather than `== 1`: an argument of 0 (or below) would otherwise
    # never reach the base case and recurse until the stack overflows.
    if n <= 1:
        return 1
    return n * factorial2(n - 1)
def size1(s):
    """Recursively count ``s`` together with everything reachable through ``hyponyms()``."""
    total = 1
    for child in s.hyponyms():
        total += size1(child)
    return total
def size2(s):
    """Count ``s`` and everything reachable through ``hyponyms()``, one layer at a time."""
    total = 0
    frontier = [s]
    while frontier:
        total += len(frontier)
        # Gather the hyponyms of every node in the current layer.
        next_layer = []
        for node in frontier:
            next_layer.extend(node.hyponyms())
        frontier = next_layer
    return total
from nltk.corpus import wordnet as wn
dog = wn.synset('dog.n.01')
例4-6. 构建一个字母查找树:一个递归函数建立一个嵌套的字典结构,每一级嵌套包含给定前缀的所有单词,而子查找树含有所有可能的后续词。
def insert(trie, key, value):
    """Insert ``value`` under ``key`` into the nested-dictionary ``trie``, in place.

    Each character of ``key`` selects (creating it if needed) one level of
    nested dictionary; when the key is used up, the value is stored under
    the ``'value'`` entry of the dictionary reached.
    """
    if key:
        first, rest = key[0], key[1:]
        if first not in trie:
            trie[first] = {}
        insert(trie[first], rest, value)
    else:
        # Only the node where the key ends receives the value.
        trie['value'] = value
trie = {}
insert(trie, 'chat', 'cat')
insert(trie, 'chien', 'dog')
insert(trie, 'chair', 'flesh')
insert(trie, 'chic', 'stylish')
trie = dict(trie) # for nicer printing
pprint.pprint(trie, width=40)
{'c': {'h': {'a': {'i': {'r': {'value': 'flesh'}},
't': {'value': 'cat'}},
'i': {'c': {'value': 'stylish'},
'e': {'n': {'value': 'dog'}}}}}}
例4-7. 一个简单的全文检索系统
def raw(file):
    """Return the contents of ``file`` with HTML tags removed and whitespace normalized.

    :param file: path of the file to read
    :return: the text with each ``<...>`` tag replaced by a space and every
        whitespace run then collapsed to one space
    :rtype: str
    """
    # The context manager closes the file even if read() raises.
    with open(file) as handle:
        contents = handle.read()
    contents = re.sub(r'<.*?>', ' ', contents)
    # Raw string keeps the backslash from being treated as a string escape.
    contents = re.sub(r'\s+', ' ', contents)
    return contents
def snippet(doc, term):
    """Return the 60-character window around the first occurrence of ``term`` in document ``doc``."""
    # Pad both ends so the window never runs off the text.
    padding = ' ' * 30
    padded = padding + raw(doc) + padding
    start = padded.index(term)
    return padded[start - 30:start + 30]
print("Building Index...")
files = nltk.corpus.movie_reviews.abspaths()
# Map every whitespace-separated token to the files in which it occurs.
idx = nltk.Index((w, f) for f in files for w in raw(f).split())

query = ''
while query != "quit":
    query = input("query> ")     # use raw_input() in Python 2
    if query in idx:
        for doc in idx[query]:
            print(snippet(doc, query))
    else:
        # Only report a miss when the term is absent from the index.
        print("Not found")
Building Index...
query> efsdfds
Not found
query> we
the problems with the movie ? well , its main problem is tha.........................................
" and you get something that well describes him and his art
例4-8. 预处理已标注的语料库数据,将所有的词和标注转换成整数
def preprocess(tagged_corpus):
    """Convert every word and tag of a tagged corpus to an integer id.

    :param tagged_corpus: a re-iterable sequence of sentences, each a
        sequence of (word, tag) pairs
    :return: the corpus in the same shape with each pair replaced by
        (word_id, tag_id); ids come from enumerating the sets of distinct
        words and tags, so the exact numbering is not fixed
    :rtype: list
    """
    words = set()
    tags = set()
    for sent in tagged_corpus:
        for word, tag in sent:
            words.add(word)
            tags.add(tag)
    wm = {w: i for (i, w) in enumerate(words)}
    tm = {t: i for (i, t) in enumerate(tags)}
    return [[(wm[w], tm[t]) for (w, t) in sent] for sent in tagged_corpus]
from timeit import Timer
vocab_size = 10000
# In Python 3 a bare range() answers integer membership tests in constant
# time, so it must be materialized as a list to time a real linear scan.
setup_list = "import random; vocab = list(range(%d))" % vocab_size
setup_set = "import random; vocab = set(range(%d))" % vocab_size
statement = "random.randint(0, %d) in vocab" % (vocab_size * 2)
print(Timer(statement, setup_list).timeit(100))
print(Timer(statement, setup_set).timeit(100))
动态规划(Dynamic programming)是一种自然语言处理中被广泛使用的算法设计的一般方法。动态规划用于解决包含多个重叠的子问题的问题。
例4-9. 四种方法计算梵文韵律:(一)递归;(二)自底向上的动态规划;(三)自上而下的动态规划;(四)内置默记法。
def virahanka1(n):
    """Return all meters of total length ``n`` built from 'S' (length 1) and 'L' (length 2), by plain recursion."""
    if n == 0:
        return [""]
    if n == 1:
        return ["S"]
    meters = []
    # Meters starting with a short syllable come first, then those starting with a long one.
    for prosody in virahanka1(n - 1):
        meters.append("S" + prosody)
    for prosody in virahanka1(n - 2):
        meters.append("L" + prosody)
    return meters
def virahanka2(n):
    """Return all meters of total length ``n``, filling a table bottom-up from the two base cases."""
    table = [[""], ["S"]]
    for length in range(2, n + 1):
        short_first = ["S" + prosody for prosody in table[length - 1]]
        long_first = ["L" + prosody for prosody in table[length - 2]]
        table.append(short_first + long_first)
    return table[n]
def virahanka3(n, lookup={0:[""], 1:["S"]}):
    """Return all meters of total length ``n``, top-down with a lookup table.

    The default ``lookup`` dictionary is created once and shared by every
    call, so it acts as the memo table; the recursive calls do not pass
    ``lookup`` on and therefore always use that shared default.
    """
    if n in lookup:
        return lookup[n]
    short_first = ["S" + prosody for prosody in virahanka3(n-1)]
    long_first = ["L" + prosody for prosody in virahanka3(n-2)]
    lookup[n] = short_first + long_first
    return lookup[n]
from nltk import memoize
from functools import lru_cache


@lru_cache(maxsize=None)
def _virahanka4_cached(n):
    """Return the meters of total length ``n`` as a tuple, memoized per ``n``."""
    if n == 0:
        return ("",)
    if n == 1:
        return ("S",)
    s = tuple("S" + prosody for prosody in _virahanka4_cached(n - 1))
    l = tuple("L" + prosody for prosody in _virahanka4_cached(n - 2))
    return s + l


def virahanka4(n):
    """Return all meters of total length ``n``, using built-in memoization.

    Without a memoizing decorator this function would repeat the same
    exponential work as the plain recursive version. The cached helper
    stores immutable tuples, and a fresh list is returned on every call
    so callers cannot corrupt the cache by mutating the result.
    """
    return list(_virahanka4_cached(n))
from numpy import arange
from matplotlib import pyplot
colors = 'rgbcmyk' # red, green, blue, cyan, magenta, yellow, black
def bar_chart(categories, words, counts):
    """Plot a bar chart showing counts for each word by category.

    :param categories: sequence of category names; each gets its own bar
        colour and legend entry
    :param words: sequence of words, one group of bars per word
    :param counts: mapping from a category name to the sequence of counts
        for that category, aligned with ``words``
    """
    ind = arange(len(words))
    # Leave one bar-width of empty space between neighbouring word groups.
    width = 1 / (len(categories) + 1)
    bar_groups = []
    for c, category in enumerate(categories):
        # Shift each category's bars right by c bar-widths within the group.
        bars = pyplot.bar(ind + c * width, counts[category], width,
                          color=colors[c % len(colors)])
        bar_groups.append(bars)
    pyplot.xticks(ind + width, words)
    pyplot.legend([b[0] for b in bar_groups], categories, loc='upper left')
    pyplot.title('Frequency of Six Modal Verbs by Genre')
    pyplot.show()
genres = ['news', 'religion', 'hobbies', 'government', 'adventure']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
import nltk
# Count, per genre, how often each modal verb occurs in the Brown corpus.
cfdist = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in genres
    for word in nltk.corpus.brown.words(categories=genre)
    if word in modals)
# For every genre, collect the modal counts in the order given by `modals`.
counts = {}
for genre in genres:
    genre_dist = cfdist[genre]
    counts[genre] = [genre_dist[word] for word in modals]
bar_chart(genres, modals, counts)
from matplotlib import use, pyplot
print('Content-Type: text/html')
Content-Type: text/html
C:\Program Files\Anaconda3\lib\site-packages\ipykernel\ UserWarning:
This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.
The backend was *originally* set to 'module://ipykernel.pylab.backend_inline' by the following code:
File "C:\Program Files\Anaconda3\lib\", line 193, in _run_module_as_main
"__main__", mod_spec)
File "C:\Program Files\Anaconda3\lib\", line 85, in _run_code
exec(code, run_globals)
File "C:\Program Files\Anaconda3\lib\site-packages\ipykernel\", line 3, in
File "C:\Program Files\Anaconda3\lib\site-packages\traitlets\config\", line 653, in launch_instance
File "C:\Program Files\Anaconda3\lib\site-packages\ipykernel\", line 474, in start
File "C:\Program Files\Anaconda3\lib\site-packages\zmq\eventloop\", line 162, in start
super(ZMQIOLoop, self).start()
File "C:\Program Files\Anaconda3\lib\site-packages\tornado\", line 887, in start
handler_func(fd_obj, events)
File "C:\Program Files\Anaconda3\lib\site-packages\tornado\", line 275, in null_wrapper
return fn(*args, **kwargs)
File "C:\Program Files\Anaconda3\lib\site-packages\zmq\eventloop\", line 440, in _handle_events
File "C:\Program Files\Anaconda3\lib\site-packages\zmq\eventloop\", line 472, in _handle_recv
self._run_callback(callback, msg)
File "C:\Program Files\Anaconda3\lib\site-packages\zmq\eventloop\", line 414, in _run_callback
callback(*args, **kwargs)
File "C:\Program Files\Anaconda3\lib\site-packages\tornado\", line 275, in null_wrapper
return fn(*args, **kwargs)
File "C:\Program Files\Anaconda3\lib\site-packages\ipykernel\", line 276, in dispatcher
return self.dispatch_shell(stream, msg)
File "C:\Program Files\Anaconda3\lib\site-packages\ipykernel\", line 228, in dispatch_shell
handler(stream, idents, msg)
File "C:\Program Files\Anaconda3\lib\site-packages\ipykernel\", line 390, in execute_request
user_expressions, allow_stdin)
File "C:\Program Files\Anaconda3\lib\site-packages\ipykernel\", line 196, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "C:\Program Files\Anaconda3\lib\site-packages\ipykernel\", line 501, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "C:\Program Files\Anaconda3\lib\site-packages\IPython\core\", line 2717, in run_cell
interactivity=interactivity, compiler=compiler, result=result)
File "C:\Program Files\Anaconda3\lib\site-packages\IPython\core\", line 2821, in run_ast_nodes
if self.run_code(code, result):
File "C:\Program Files\Anaconda3\lib\site-packages\IPython\core\", line 2881, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "", line 2, in
from matplotlib import pyplot
File "C:\Program Files\Anaconda3\lib\site-packages\matplotlib\", line 72, in
from matplotlib.backends import pylab_setup
File "C:\Program Files\Anaconda3\lib\site-packages\matplotlib\backends\", line 14, in
line for line in traceback.format_stack()
from ipykernel import kernelapp as app
NetworkX 包定义和操作被称为图的由节点和边组成的结构。它可以从 https://networkx.org/ 得到。NetworkX 可以和Matplotlib 结合使用,可视化如WordNet 的网络结构(语义网络)。
Anaconda Jupyter不同版本Python下共存使用
import networkx as nx
import matplotlib
from nltk.corpus import wordnet as wn
from networkx.drawing.nx_agraph import graphviz_layout
def traverse(graph, start, node):
    """Recursively add ``node`` and all of its hyponyms to ``graph``.

    Records each node's shortest-path distance from ``start`` in
    ``graph.depth`` and adds an edge from every node to each of its
    hyponyms. Nodes are identified by ``name()``.
    NOTE(review): assumes the NLTK 3 synset API, where ``name`` is a
    method - confirm against the installed NLTK version.
    """
    graph.depth[node.name()] = node.shortest_path_distance(start)
    for child in node.hyponyms():
        graph.add_edge(node.name(), child.name())
        traverse(graph, start, child)
def hyponym_graph(start):
    """Build and return the graph of ``start`` and everything below it in the hyponym hierarchy."""
    graph = nx.Graph()
    # traverse() stores each node's distance from `start` in this dictionary.
    graph.depth = {}
    traverse(graph, start, start)
    return graph
def graph_draw(graph):
    """Draw ``graph`` with node size scaled by degree and node colour given by depth.

    NOTE(review): the head of the drawing call was missing from the source.
    It is reconstructed here with ``nx.draw`` and the ``graphviz_layout``
    helper the file imports - confirm against the installed NetworkX version.
    """
    nx.draw(graph,
            pos=graphviz_layout(graph),
            node_size=[16 * graph.degree(n) for n in graph],
            node_color=[graph.depth[n] for n in graph],
            with_labels=False)
    matplotlib.pyplot.show()
dog = wn.synset('dog.n.01')
graph = hyponym_graph(dog)
import csv
# In Python 3 the csv module expects a text-mode file opened with
# newline=""; a binary-mode ("rb") file makes csv.reader fail.
with open("lexicon.csv", newline="") as input_file:
    for row in csv.reader(input_file):
        print(row)
from numpy import array
cube = array([ [[0,0,0], [1,1,1], [2,2,2]],
[[3,3,3], [4,4,4], [5,5,5]],
[[6,6,6], [7,7,7], [8,8,8]] ])
array([[6, 7, 8],
[6, 7, 8],
[6, 7, 8]])
array([[7, 7, 7],
[8, 8, 8]])
from numpy import linalg
a=array([[4,0], [3,-5]])
u,s,vt = linalg.svd(a)
array([[-0.4472136 , -0.89442719],
[-0.89442719, 0.4472136 ]])
array([6.32455532, 3.16227766])
array([[-0.70710678, 0.70710678],
[-0.70710678, -0.70710678]])
《Python自然语言处理》,作者:Steven Bird, Ewan Klein & Edward Loper,是实践性很强的一部入门读物,2009年第一版,2015年第二版。本学习笔记结合上述版本,对部分内容进行了延伸学习、练习,在此分享,期待对大家有所帮助。欢迎加我微信(验证:NLP),一起学习讨论,不足之处,欢迎指正。
