第一章 语言处理与Python
1 语言计算 文本与词汇
>>> import nltk
>>> nltk.download()
>>> from nltk.book import *
*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908
>>> text1
<Text: Moby Dick by Herman Melville 1851>
>>> text2
<Text: Sense and Sensibility by Jane Austen 1811>
>>> text1.concordance('monstrous')
Displaying 11 of 11 matches:
ong the former , one was of a most monstrous size . ... This came towards us ,
ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r
ll over with a heathenish array of monstrous clubs and spears . Some were thick
d as you gazed , and wondered what monstrous cannibal and savage could ever hav
that has survived the flood ; most monstrous and most mountainous ! That Himmal
they might scout at Moby Dick as a monstrous fable , or still worse and more de
th of Radney .'" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l
ing Scenes . In connexion with the monstrous pictures of whales , I am strongly
ere to enter upon those still more monstrous stories of them which are to be fo
ght have been rummaged out of this monstrous cabinet there is no telling . But
of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u
>>> text1.similar('monstrous')
impalpable puzzled part mystifying gamesome horrible maddens
domineering curious exasperate untoward contemptible careful
trustworthy delightfully christian mean uncommon abundant wise
>>> text2.similar('monstrous')
very exceedingly so heartily as vast amazingly extremely great sweet a
remarkably good
>>> text2.common_contexts(['monstrous','very'])
be_glad a_lucky am_glad is_pretty a_pretty
>>> text4.dispersion_plot(['citizens','democracy','freedom','duties','America'])
>>> text3.generate()
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
AttributeError: 'Text' object has no attribute 'generate'
>>> len(text3) #创世纪有44764个单词和标点符号
>>> sorted(set(text3))
['!', "'", '(', ')', ',', ',)','.', '.)', ':', ';', ';)', '?', '?)', 'A', 'Abel', 'Abelmizraim', 'Abidah','Abide', 'Abimael', 'Abimelech', 'Abr', 'A
brah', 'Abraham', 'Abram', 'Accad', 'Achbor','Adah', 'Adam', 'Adbeel', 'Admah', 'Adullamite', 'After', 'Aholibamah','Ahuzzath', 'Ajah', 'Akan', 'All
', 'Allonbachuth', 'Almighty', 'Almodad','Also', 'Alvah', 'Alvan', 'Am', 'Amal', 'Amalek', 'Amalekites', 'Ammon','Amorite', 'Amorites', 'Amraphel',
>>> len(set(text3))
>>> from __future__ import division
>>> len(text3)/len(set(text3))
>>> text3.count('smote')
>>> def lexical_diversity(text):
...     return len(text)/len(set(text))
...
>>> def percentage(count,total):
...     return 100*count/total
...
>>> lexical_diversity(text3)
>>> percentage(text4.count('a'),len(text4))
2 将文本当作词链表
>>> sent1
['call', 'me', 'retacn', '.']
>>> len(sent1)
>>> lexical_diversity(sent1)
>>> ['monty','python']+['and','the','holy','grail']
['monty', 'python', 'and', 'the', 'holy','grail']
>>> sent1.append('some')
>>> sent1
['call', 'me', 'retacn', '.', 'some']
>>> text4[173]
>>> text4.index('awaken')
>>> text5[16715:16735]
['U86', 'thats', 'why', 'something','like', 'gamefly', 'is', 'so', 'good', 'because', 'you', 'can', 'actually','play', 'a', 'full', 'game', 'without
', 'buying', 'it']
>>> text6[1600:1625]
['We', "'", 're', 'an','anarcho', '-', 'syndicalist', 'commune', '.', 'We', 'take', 'it', 'in','turns', 'to', 'act', 'as', 'a', 'sort', 'of', 'execu
tive', 'officer', 'for', 'the', 'week']
>>> len(text5)
>>> text5[45010]
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "D:\Python35\lib\site-packages\nltk\text.py", line 314, in __getitem__
    return self.tokens[i]
IndexError: list index out of range
>>> text8[:3]
['25', 'SEXY', 'MALE']
>>> text8[4860:]
['FREE', 'to', 'advertise', 'in','Perfect', 'Match', '!']
>>> name='retacn'
>>> name[0]
>>> name[:5]
>>> name[:]
>>> name*2
>>> name+'!'
>>> ' '.join(['retacn','coder'])
'retacn coder'
>>> 'retacn coder'.split()
['retacn', 'coder']
3 简单的统计
>>> saying
['retacn', 'is', 'a', 'coder', '!']
>>> tokens=set(saying)
>>> tokens
{'is', 'retacn', 'a', 'coder', '!'}
>>> tokens=sorted(tokens)
>>> tokens
['!', 'a', 'coder', 'is', 'retacn']
>>> tokens[-2:]
['is', 'retacn']
>>> fdist1=FreqDist(text1)
>>> fdist1
FreqDist({',': 18713, 'the': 13721, '.':6862, 'of': 6536, 'and': 6024, 'a': 4569, 'to': 4542, ';': 4072, 'in': 3916,'that': 2982, ...})
>>> vocabulary1=fdist1.keys()
>>> len(fdist1)
>>> vocabulary1=list(vocabulary1)
>>> vocabulary1[:50]
['sweetest', 'demeanor', 'cutlets','bathing', 'eddy', 'summits', 'eager', 'carcass', 'splintered', 'coppers','ruinous', 'ease', 'Gather', 'immovable
', 'Verdes', 'breathing', 'colony','dreamed', 'deepeningly', 'artisan', 'placid', 'seven', 'manipulator','Cistern', 'favourites', 'GOLDSMITH', 'Nick
', 'swooping', 'coupled', 'vocation','greatness', 'Tower', 'reelingly', 'ceti', 'Lavater', 'Zealand', 'unread','Spin', 'crape', 'screamed', '.\'"',
'charged', 'sublimer', 'phantoms', 'cheer','saved', 'timber', 'skins', 'yore', 'spot']
>>> fdist1['whale']
>>> fdist1.hapaxes()
>>> v=set(text1)
>>> long_word=[w for w in v if len(w)>15]
>>> long_word
['uninterpenetratingly','uncomfortableness', 'supernaturalness', 'physiognomically','characteristically', 'cannibalistically', 'CIRCUMNAVIGATION', '
circumnavigating', 'indiscriminately','apprehensiveness', 'superstitiousness', 'circumnavigation','simultaneousness', 'circumnavigations', 'preterna
turalness', 'Physiognomically','indispensableness', 'comprehensiveness', 'hermaphroditical', 'irresistibleness','undiscriminating', 'subterraneousne
ss', 'responsibilities','uncompromisedness']
>>> sorted(long_word)
['CIRCUMNAVIGATION', 'Physiognomically','apprehensiveness', 'cannibalistically', 'characteristically','circumnavigating', 'circumnavigation', 'circu
mnavigations', 'comprehensiveness','hermaphroditical', 'indiscriminately', 'indispensableness','irresistibleness', 'physiognomically', 'preternatura
lness', 'responsibilities','simultaneousness', 'subterraneousness', 'supernaturalness', 'superstitiousness','uncomfortableness', 'uncompromisedness'
, 'undiscriminating','uninterpenetratingly']
>>> fdist5=FreqDist(text5)
>>> sorted([w for w in set(fdist5)if len(w)>7 and fdist5[w]>7])
['#14-19teens', '#talkcity_adults', '((((((((((','........', 'Question', 'actually', 'anything', 'computer', 'cute.-ass','everyone', 'football', 'in
nocent', 'listening', 'remember','seriously', 'something', 'together', 'tomorrow', 'watching']
>>> bigrams(['more','is','said','than','done'])
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
NameError: name 'bigrams' is not defined
>>> from nltk import *
>>> v=bigrams(['more','is','said','than','done'])
>>> v
>>> list(v)
[('more', 'is'), ('is', 'said'), ('said','than'), ('than', 'done')]
>>> text4.collocations()
United States; fellow citizens; four years;years ago; Federal
Government; General Government; Americanpeople; Vice President; Old
World; Almighty God; Fellow citizens; ChiefMagistrate; Chief Justice;
God bless; every citizen; Indian tribes;public debt; one another;
foreign nations; political parties
>>> text8.collocations()
would like; medium build; social drinker;quiet nights; non smoker;
long term; age open; Would like; easygoing; financially secure; fun
times; similar interests; Age open;weekends away; poss rship; well
presented; never married; single mum;permanent relationship; slim
>>> [len(w) for w in text1][:10]
[1, 4, 4, 2, 6, 8, 4, 1, 9, 1]
>>> fdist=FreqDist([len(w) for w in text1])
>>> fdist
FreqDist({3: 50223, 1: 47933, 4: 42345, 2:38513, 5: 26597, 6: 17111, 7: 14399, 8: 9966, 9: 6428, 10: 3528, ...})
>>> fdist.N()
>>> len(fdist)
>>> fdist.keys()
dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11, 12, 13, 14, 15, 16, 17, 18, 20])
>>> list(fdist.keys())
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,14, 15, 16, 17, 18, 20]
>>> fdist.items()
dict_items([(1, 47933), (2, 38513), (3,50223), (4, 42345), (5, 26597), (6, 17111), (7, 14399), (8, 9966), (9, 6428),(10, 3528), (11, 1873), (12, 105
3), (13, 567),(14, 177), (15, 70), (16, 22), (17, 12), (18, 1), (20, 1)])
>>> fdist.max()
>>> fdist[3]
>>> fdist.freq(3)
fdist=FreqDist(text) #创建频率分布
fdist.inc('sample') #增加样本(NLTK 3 已移除 inc,改用 fdist['sample'] += 1)
fdist['sample'] #给定样本出现的次数
fdist.freq('sample') #给定样本的频率
fdist.N() #样本总数
fdist.keys() #样本链表(频率递减)
for w in fdist: #遍历样本链表
fdist.max() #数值最大的样本
fdist.tabulate() #绘制频率分布表
fdist.plot() #绘制频率分布图
fdist.plot(cumulative=True) #绘制累积频率分布图
fdist1 < fdist2 #测试样本在fdist1中出现的频率是否小于fdist2
4 决策与控制
条件
数值比较运算符
>>> from nltk.book import *
*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908
>>> sent1
['Call', 'me', 'Ishmael', '.']
>>> sent7
['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']
>>> [w for w in sent7 if len(w)<4]
[',', '61', 'old', ',', 'the', 'as', 'a', '29', '.']
>>> [w for w in sent7 if len(w)<=4]
[',', '61', 'old', ',', 'will', 'join', 'the', 'as', 'a', 'Nov.', '29', '.']
>>> [w for w in sent7 if len(w)==4]
['will', 'join', 'Nov.']
>>> [w for w in sent7 if len(w)!=4]
['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', '29', '.']
词汇比较运算符
#以指定字符开头
>>> sorted(w for w in set(text1) if w.startswith('ab'))
['aback', 'abaft', 'abandon', 'abandoned', 'abandonedly', 'abandonment', 'abased', 'abasement', 'abashed', 'abate', 'abated', 'abatement', 'abating', ...]
#以指定字符结尾
>>> sorted(w for w in set(text1) if w.endswith('ableness'))
['comfortableness', 'honourableness', 'immutableness', 'indispensableness', 'indomitableness', 'intolerableness', 'palpableness', 'reasonableness', 'uncomfortableness']
#包含指定字符
>>> sorted([term for term in set(text4) if 'gnt' in term])
['Sovereignty', 'sovereignties', 'sovereignty']
#首字母大写
>>> sorted([term for term in set(text6) if term.istitle()])
['A', 'Aaaaaaaaah', 'Aaaaaaaah', 'Aaaaaah', 'Aaaah', 'Aaaaugh', 'Aaagh', 'Aaah', 'Aaauggh', 'Aaaugh', 'Aaauugh', 'Aagh', 'Aah', 'Aauuggghhh', 'Aauuugh', 'Aauuuuugh', 'Aauuuves', 'Action', 'Actually', 'African', 'Ages', 'Aggh', 'Agh', 'Ah', 'Ahh', 'Alice', 'All', 'Allo', 'Almighty', 'Alright', 'Am', ...]
#数字
>>> sorted([term for term in set(text6) if term.isdigit()])
['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '3', '4', '5', '6', '7', '8', '9']
#小写
s.islower()
#大写
s.isupper()
#字母
s.isalpha()
#字母或数字
s.isalnum()
对每个元素进行操作
>>> [len(w) for w in text1]
>>> [w.upper() for w in text1]
#词汇计数
>>> len(text1)
260819
>>> len(set(text1))
19317
>>> len(set([word.lower() for word in text1]))
17231
>>> len(set([word.lower() for word in text1 if word.isalpha()]))
16948
嵌套代码块
>>> word='cat'
>>> if len(word)<5:
...     print('word length is less than 5')
...
word length is less than 5
#循环体内要缩进4个空格
>>> for word in sent1:
...     print(word)
...
Call
me
Ishmael
.
#条件循环
>>> for word in sent1:
...     if(word.endswith('l')):
...         print(word)
...
Call
Ishmael
>>> for token in sent1:
...     if token.islower():
...         print(token,'is a lowercase word')
...     elif token.istitle():
...         print(token,'is a titlecase word')
...     else:
...         print(token,'is punctuation')
...
Call is a titlecase word
me is a lowercase word
Ishmael is a titlecase word
. is punctuation
5 自动理解自然语言
语言理解技术
词义消歧
指代消解
自动生成语言
机器翻译
人机对话系统