第一章 语言处理与python
1 语言计算 文本与词汇
NLTK入门
下载安装nltk
http://www.nltk.org
下载数据
>>> import nltk
>>> nltk.download()
下载完成,加载文本
>>> from nltk.book import *
*** Introductory Examples for the NLTK Book***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908
输入名字即可查询对应文本
>>> text1
>>> text2
搜索文本
>>> text1.concordance('monstrous')
Displaying 11 of 11 matches:
ong the former , one was of a most monstrous size . ... This came towards us ,
ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r
ll over with a heathenish array of monstrous clubs and spears . Some were thick
d as you gazed , and wondered what monstrous cannibal and savage could ever hav
that has survived the flood ; most monstrous and most mountainous ! That Himmal
they might scout at Moby Dick as a monstrous fable , or still worse and more de
th of Radney .'" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l
ing Scenes . In connexion with the monstrous pictures of whales , I am strongly
ere to enter upon those still more monstrous stories of them which are to be fo
ght have been rummaged out of this monstrous cabinet there is no telling . But
of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u
#还有哪些词出现在相同的上文中
>>> text1.similar('monstrous')
impalpable puzzled part mystifying gamesome horrible maddens
domineering curious exasperate untoward contemptible careful
trustworthy delightfully christian mean uncommon abundant wise
>>> text2.similar('monstrous')
very exceedingly so heartily as vast amazingly extremely great sweet a
remarkably good
>>> text2.common_contexts(['monstrous','very'])
be_glad a_lucky am_glad is_pretty a_pretty
#美国总统就职演说词汇分布图
>>> text4.dispersion_plot(['citizens','democracy','freedom','duties','America'])
#产生随机文本
>>> text3.generate()
Traceback (most recent call last):
File "
AttributeError: 'Text' object has no attribute 'generate'
#计数词汇
>>> len(text3) #创世纪有44764个单词和标点符号
44764
#取得排序后的词汇条目
>>> sorted(set(text3))
['!', "'", '(', ')', ',', ',)','.', '.)', ':', ';', ';)', '?', '?)', 'A', 'Abel', 'Abelmizraim', 'Abidah','Abide', 'Abimael', 'Abimelech', 'Abr', 'A
brah', 'Abraham', 'Abram', 'Accad', 'Achbor','Adah', 'Adam', 'Adbeel', 'Admah', 'Adullamite', 'After', 'Aholibamah','Ahuzzath', 'Ajah', 'Akan', 'All
', 'Allonbachuth', 'Almighty', 'Almodad','Also', 'Alvah', 'Alvan', 'Am', 'Amal', 'Amalek', 'Amalekites', 'Ammon','Amorite', 'Amorites', 'Amraphel',
#取得词汇总数
>>> len(set(text3))
2789
#每个词平均使用了16次
>>> from __future__ import division
>>> len(text3)/len(set(text3))
16.050197203298673
#计算单词在文中出现次数
>>> text3.count('smote')
5
#计算特定词在文中占据的百分比
>>> 100*text4.count('a')/len(text4)
1.4643016433938312
#定义函数实现以上功能
>>> def lexical_diversity(text):
... return len(text)/len(set(text))
...
>>> def percentage(count,total):
... return 100*count/total
...
#函数调用
>>> lexical_diversity(text3)
16.050197203298673
>>> percentage(text4.count('a'),len(text4))
1.4643016433938312
2 将文本当作词链表
链表定义
>>>sent1=['call','me','retacn','.']
>>> sent1
['call', 'me', 'retacn', '.']
>>> len(sent1)
4
>>> lexical_diversity(sent1)
1.0
>>> ['monty','python']+['and','the','holy','grail']
['monty', 'python', 'and', 'the', 'holy','grail']
>>> sent1.append('some')
>>> sent1
['call', 'me', 'retacn', '.', 'some']
列表索引
>>> text4[173]
'awaken'
>>> text4.index('awaken')
173
#切片
>>> text5[16715:16735]
['U86', 'thats', 'why', 'something','like', 'gamefly', 'is', 'so', 'good', 'because', 'you', 'can', 'actually','play', 'a', 'full', 'game', 'without
', 'buying', 'it']
>>> text6[1600:1625]
['We', "'", 're', 'an','anarcho', '-', 'syndicalist', 'commune', '.', 'We', 'take', 'it', 'in','turns', 'to', 'act', 'as', 'a', 'sort', 'of', 'execu
tive', 'officer', 'for', 'the', 'week']
#索引的错误使用
>>> len(text5)
45010
>>> text5[45010]
Traceback (most recent call last):
File "
File "D:\Python35\lib\site-packages\nltk\text.py", line 314,in __getitem__
return self.tokens[i]
IndexError: list index out of range
#使用索引示例
>>> text8[:3]
['25', 'SEXY', 'MALE']
>>> text8[4860:]
['FREE', 'to', 'advertise', 'in','Perfect', 'Match', '!']
变量
>>>sent1=['call','me','retacn','.']
字符串
>>> name='retacn'
>>> name[0]
'r'
>>> name[:5]
'retac'
>>> name[:]
'retacn'
#乘法/加法
>>> name*2
'retacnretacn'
>>> name+'!'
'retacn!'
#把字符串分割成链表
>>> ' '.join(['retacn','coder'])
'retacn coder'
>>> 'retacn coder'.split()
['retacn', 'coder']
3 简单的统计
>>>saying=['retacn','is','a','coder','!']
>>> saying
['retacn', 'is', 'a', 'coder', '!']
>>> tokens=set(saying)
>>> tokens
{'is', 'retacn', 'a', 'coder', '!'}
>>> tokens=sorted(tokens)
>>> tokens
['!', 'a', 'coder', 'is', 'retacn']
>>> tokens[-2:]
['is', 'retacn']
频率分布
#找到书中使用最频繁的50个字
>>> fdist1=FreqDist(text1)
>>> fdist1
FreqDist({',': 18713, 'the': 13721, '.':6862, 'of': 6536, 'and': 6024, 'a': 4569, 'to': 4542, ';': 4072, 'in': 3916,'that': 2982, ...})
>>> vocabulary1=fdist1.keys()
#单词总数
>>> len(fdist1)
19317
>>> vocabulary1=list(vocabulary1)
>>> vocabulary1[:50]
['sweetest', 'demeanor', 'cutlets','bathing', 'eddy', 'summits', 'eager', 'carcass', 'splintered', 'coppers','ruinous', 'ease', 'Gather', 'immovable
', 'Verdes', 'breathing', 'colony','dreamed', 'deepeningly', 'artisan', 'placid', 'seven', 'manipulator','Cistern', 'favourites', 'GOLDSMITH', 'Nick
', 'swooping', 'coupled', 'vocation','greatness', 'Tower', 'reelingly', 'ceti', 'Lavater', 'Zealand', 'unread','Spin', 'crape', 'screamed', '.\'"',
'charged', 'sublimer', 'phantoms', 'cheer','saved', 'timber', 'skins', 'yore', 'spot']
#出现906次
>>> fdist1['whale']
906
#查看词汇的累计频率图
>>>fdist1.plot(50,cumulative=True)
#查询只出现一次的词
>>> fdist1.hapaxes()
细粒度的选择词
#查找长度大于15的词
>>> v=set(text1)
>>> long_word=[w for w in v if len(w)>15]
>>> long_word
['uninterpenetratingly','uncomfortableness', 'supernaturalness', 'physiognomically','characteristically', 'cannibalistically', 'CIRCUMNAVIGATION', '
circumnavigating', 'indiscriminately','apprehensiveness', 'superstitiousness', 'circumnavigation','simultaneousness', 'circumnavigations', 'preterna
turalness', 'Physiognomically','indispensableness', 'comprehensiveness', 'hermaphroditical', 'irresistibleness','undiscriminating', 'subterraneousne
ss', 'responsibilities','uncompromisedness']
>>> sorted(long_word)
['CIRCUMNAVIGATION', 'Physiognomically','apprehensiveness', 'cannibalistically', 'characteristically','circumnavigating', 'circumnavigation', 'circu
mnavigations', 'comprehensiveness','hermaphroditical', 'indiscriminately', 'indispensableness','irresistibleness', 'physiognomically', 'preternatura
lness', 'responsibilities','simultaneousness', 'subterraneousness', 'supernaturalness', 'superstitiousness','uncomfortableness', 'uncompromisedness'
, 'undiscriminating','uninterpenetratingly']
#长度超过7个字符且出现次数大于7
>>> fdist5=FreqDist(text5)
>>> sorted([w for w in set(fdist5) if len(w)>7 and fdist5[w]>7])
['#14-19teens', '#talkcity_adults', '((((((((((','........', 'Question', 'actually', 'anything', 'computer', 'cute.-ass','everyone', 'football', 'in
nocent', 'listening', 'remember','seriously', 'something', 'together', 'tomorrow', 'watching']
词语搭配和双连词
>>>bigrams(['more','is','said','than','done'])
Traceback (most recent call last):
File "
NameError: name 'bigrams' is not defined
#导入模块
>>> from nltk import *
>>>bigrams(['more','is','said','than','done'])
>>> v=bigrams(['more','is','said','than','done'])
>>> v
>>> list(v)
[('more', 'is'), ('is', 'said'), ('said','than'), ('than', 'done')]
#查找双连词
>>> text4.collocations()
United States; fellow citizens; four years;years ago; Federal
Government; General Government; Americanpeople; Vice President; Old
World; Almighty God; Fellow citizens; ChiefMagistrate; Chief Justice;
God bless; every citizen; Indian tribes;public debt; one another;
foreign nations; political parties
>>> text8.collocations()
would like; medium build; social drinker;quiet nights; non smoker;
long term; age open; Would like; easygoing; financially secure; fun
times; similar interests; Age open;weekends away; poss rship; well
presented; never married; single mum;permanent relationship; slim
Build
计算其他东西
#查看文本中词长分布
>>> [len(w) for w in text1][:10]
[1, 4, 4, 2, 6, 8, 4, 1, 9, 1]
#计数链表中每个数字出现的次数
>>> fdist=FreqDist([len(w) for w in text1])
>>> fdist
FreqDist({3: 50223, 1: 47933, 4: 42345, 2:38513, 5: 26597, 6: 17111, 7: 14399, 8: 9966, 9: 6428, 10: 3528, ...})
#样本总数
>>> fdist.N()
#同上
>>> len(fdist)
19
>>> fdist.keys()
dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11, 12, 13, 14, 15, 16, 17, 18, 20])
#可以看出最长字符个数为20
>>> list(fdist.keys())
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,14, 15, 16, 17, 18, 20]
>>> fdist.items()
dict_items([(1, 47933), (2, 38513), (3,50223), (4, 42345), (5, 26597), (6, 17111), (7, 14399), (8, 9966), (9, 6428),(10, 3528), (11, 1873), (12, 105
3), (13, 567),(14, 177), (15, 70), (16, 22), (17, 12), (18, 1), (20, 1)])
#最频繁词的长度是3
>>> fdist.max()
3
#长度为3的词
>>> fdist[3]
50223
>>> fdist.freq(3)
0.19255882431878046
Nltk频率分布类中常用函数
fdist=FreqDist(text) #创建频率分布
fdist.inc('') #增加样本(NLTK 3 中已改用 fdist[sample] += 1)
fdist[''] #给定样本出现的次数
fdist.freq('') #给定样本的频率
fdist.N() #样本总数
fdist.keys() #样本链表(频率递减)
for w in fdist: #遍历样本链表
fdist.max() #数值最大的样本
fdist.tabulate() #绘制频率分布表
fdist.plot() #绘制频率分布图
fdist.plot(cumulative=True) #绘制累积频率分布图
4 决策与控制
条件
数值比较运算符
>>> from nltk.book import *
*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908
>>> sent1
['Call', 'me', 'Ishmael', '.']
>>> sent7
['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']
>>> [w for w in sent7 if len(w)<4]
[',', '61', 'old', ',', 'the', 'as', 'a', '29', '.']
>>> [w for w in sent7 if len(w)<=4]
[',', '61', 'old', ',', 'will', 'join', 'the', 'as', 'a', 'Nov.', '29', '.']
>>> [w for w in sent7 if len(w)==4]
['will', 'join', 'Nov.']
>>> [w for w in sent7 if len(w)!=4]
['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', '29', '.']
词汇比较运算符
#以指定字符开头
>>> sorted(w for w in set(text1) if w.startswith('ab'))
['aback', 'abaft', 'abandon', 'abandoned', 'abandonedly', 'abandonment', 'abased', 'abasement', 'abashed', 'abate', 'abated', 'abatement', 'abating',
#以指定字符结尾
>>> sorted(w for w in set(text1) if w.endswith('ableness'))
['comfortableness', 'honourableness', 'immutableness', 'indispensableness', 'indomitableness', 'intolerableness', 'palpableness', 'reasonableness', 'uncomfortableness']
#包含指定字符
>>> sorted([term for term in set(text4) if 'gnt' in term])
['Sovereignty', 'sovereignties', 'sovereignty']
#首字母大写
>>> sorted([term for term in set(text6) if term.istitle()])
['A', 'Aaaaaaaaah', 'Aaaaaaaah', 'Aaaaaah', 'Aaaah', 'Aaaaugh', 'Aaagh', 'Aaah', 'Aaauggh', 'Aaaugh', 'Aaauugh', 'Aagh', 'Aah', 'Aauuggghhh', 'Aauuugh', 'Aauuuuugh', 'Aauuuves', 'Action', 'Actually', 'African', 'Ages', 'Aggh', 'Agh', 'Ah', 'Ahh', 'Alice', 'All', 'Allo', 'Almighty', 'Alright', 'Am',
#数字
>>> sorted([term for term in set(text6) if term.isdigit()])
['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '3', '4', '5', '6', '7', '8', '9']
#小写
islower()
#大写
isupper()
#字母
isalpha()
#字母或数字
isalnum()
对每个元素进行操作
>>> [len(w) for w in text1]
>>> [w.upper() for w in text1]
#词汇计数
>>> len(text1)
260819
>>> len(set(text1))
19317
>>> len(set([word.lower() for word in text1]))
17231
>>> len(set([word.lower() for word in text1 if word.isalpha()]))
16948
嵌套代码块
>>> word='cat'
>>> if len(word)<5:
...     print('word length is less than 5')
...
word length is less than 5
#循环体内要空4个空格
>>> for word in sent1:
...     print(word)
...
Call
me
Ishmael
.
#条件循环
>>> for word in sent1:
...     if(word.endswith('l')):
...         print(word)
...
Call
Ishmael
>>> for token in sent1:
...     if token.islower():
...         print(token,'is a lowercase word')
...     elif token.istitle():
...         print(token,'is a titlecase word')
...     else:
...         print(token,'is punctuation')
...
Call is a titlecase word
me is a lowercase word
Ishmael is a titlecase word
. is punctuation
5 自动理解自然语言
语言理解技术
词意消歧
指代消解
自动生成语言
机器翻译
人机对话系统