在前面的学习中,我们介绍了如何把文本内容分解成n-gram模型,或者说是n个单词长度的词组。从最基本的功能上说,这个集合可以用来确定这段文字中最常见的单词和短语。另外,还可以提取原文中那些最常用的短语周围的句子,对原文进行看似合理的概括。
下面我们使用美国第九任总统威廉·亨利·哈里森的就职演说作为数据归纳的文字样本,演说全文见 http://pythonscraping.com/files/inaugurationSpeech.txt 。我们简单修改一下前面使用的n-gram模型,就可以获取2-gram序列的频率数据,本篇采用另外一种写法:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
import operator
def cleanInput(input):
    """Normalize raw text and split it into a list of cleaned word tokens.

    Collapses whitespace, removes footnote markers like ``[12]``, strips
    non-ASCII characters, trims surrounding punctuation from each token,
    and keeps only tokens longer than one character (plus 'a' and 'i').
    """
    # Collapse newlines, then footnote markers such as "[3]", then runs
    # of spaces.  Raw strings avoid invalid-escape warnings in the regexes.
    text = re.sub(r'\n+', " ", input)
    text = re.sub(r'\[[0-9]*\]', "", text)
    text = re.sub(r' +', " ", text)
    # Round-trip through bytes to silently discard non-ASCII characters.
    text = bytes(text, "UTF-8").decode("ascii", "ignore")
    cleaned = []
    for token in text.split(' '):
        token = token.strip(string.punctuation)
        # Keep multi-character words plus the one-letter words "a"/"I";
        # this also drops empty strings left over after stripping.
        if len(token) > 1 or token.lower() in ('a', 'i'):
            cleaned.append(token)
    return cleaned
def ngrams(input, n):
    """Return a dict mapping each n-gram (n words joined by single spaces)
    in the cleaned text to its occurrence count."""
    words = cleanInput(input)
    counts = {}
    # Slide a window of n words across the cleaned token list;
    # dict.get replaces the original test-then-initialize idiom.
    for i in range(len(words) - n + 1):
        gram = " ".join(words[i:i + n])
        counts[gram] = counts.get(gram, 0) + 1
    return counts
# Download the speech and print 2-gram frequencies, most frequent first.
content = str(urlopen("http://pythonscraping.com/files/inaugurationSpeech.txt").read(), 'utf-8')
ngrams = ngrams(content, 2)
# BUG FIX: sorting the dict directly iterates its KEYS, so itemgetter(1)
# picked the second character of each key; sort the (ngram, count) items
# instead, and print the sorted result rather than the raw dict.
sortedNgrams = sorted(ngrams.items(), key=operator.itemgetter(1), reverse=True)
print(sortedNgrams)
还可以使用如下方式:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
import operator
def cleanInput(input):
    """Normalize raw text and split it into a list of cleaned word tokens.

    Collapses whitespace, removes footnote markers like ``[12]``, strips
    non-ASCII characters, trims surrounding punctuation from each token,
    and keeps only tokens longer than one character (plus 'a' and 'i').
    """
    # Collapse newlines, then footnote markers such as "[3]", then runs
    # of spaces.  Raw strings avoid invalid-escape warnings in the regexes.
    text = re.sub(r'\n+', " ", input)
    text = re.sub(r'\[[0-9]*\]', "", text)
    text = re.sub(r' +', " ", text)
    # Round-trip through bytes to silently discard non-ASCII characters.
    text = bytes(text, "UTF-8").decode("ascii", "ignore")
    cleaned = []
    for token in text.split(' '):
        token = token.strip(string.punctuation)
        # Keep multi-character words plus the one-letter words "a"/"I";
        # this also drops empty strings left over after stripping.
        if len(token) > 1 or token.lower() in ('a', 'i'):
            cleaned.append(token)
    return cleaned
def ngrams(input, n):
    """Return a list of (ngram, count) tuples, where each ngram is the
    str() of an n-word list slice, e.g. "['of', 'the']"."""
    words = cleanInput(input)
    grams = [str(words[i:i + n]) for i in range(len(words) - n + 1)]
    # PERF FIX: count in a single pass with a dict instead of calling
    # list.count() for every distinct n-gram, which was O(n^2).
    counts = {}
    for gram in grams:
        counts[gram] = counts.get(gram, 0) + 1
    return list(counts.items())
# Fetch the inauguration speech and report 2-gram frequencies,
# most frequent first.
speechUrl = "http://pythonscraping.com/files/inaugurationSpeech.txt"
content = urlopen(speechUrl).read().decode('utf-8')
ngrams = ngrams(content, 2)
sortedNgrams = sorted(ngrams, key=lambda pair: pair[1], reverse=True)
print(sortedNgrams)
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
import operator
def cleanInput(input):
    """Normalize raw text and split it into a list of cleaned word tokens.

    Collapses whitespace, removes footnote markers like ``[12]``, strips
    non-ASCII characters, trims surrounding punctuation from each token,
    and keeps only tokens longer than one character (plus 'a' and 'i').
    """
    # Collapse newlines, then footnote markers such as "[3]", then runs
    # of spaces.  Raw strings avoid invalid-escape warnings in the regexes.
    text = re.sub(r'\n+', " ", input)
    text = re.sub(r'\[[0-9]*\]', "", text)
    text = re.sub(r' +', " ", text)
    # Round-trip through bytes to silently discard non-ASCII characters.
    text = bytes(text, "UTF-8").decode("ascii", "ignore")
    cleaned = []
    for token in text.split(' '):
        token = token.strip(string.punctuation)
        # Keep multi-character words plus the one-letter words "a"/"I";
        # this also drops empty strings left over after stripping.
        if len(token) > 1 or token.lower() in ('a', 'i'):
            cleaned.append(token)
    return cleaned
def ngrams(input, n):
    """Return a dict mapping each n-gram (n words joined by single spaces)
    in the cleaned text to its occurrence count."""
    words = cleanInput(input)
    counts = {}
    # Slide a window of n words across the cleaned token list;
    # dict.get replaces the original test-then-initialize idiom.
    for i in range(len(words) - n + 1):
        gram = " ".join(words[i:i + n])
        counts[gram] = counts.get(gram, 0) + 1
    return counts
def isCommon(ngram):
    """Return True if *ngram* (a single word) is one of the ~100 most
    common English words, compared case-insensitively."""
    # A set gives O(1) membership tests instead of scanning a list; it
    # also removes the duplicates the original list carried ("that",
    # "as", "more").
    commonWords = {"the", "be", "and", "of", "a", "in", "to", "have", "it",
                   "i", "that", "for", "you", "he", "with", "on", "do", "say", "this",
                   "they", "is", "an", "at", "but", "we", "his", "from", "not",
                   "by", "she", "or", "as", "what", "go", "their", "can", "who", "get",
                   "if", "would", "her", "all", "my", "make", "about", "know", "will",
                   "up", "one", "time", "has", "been", "there", "year", "so",
                   "think", "when", "which", "them", "some", "me", "people", "take",
                   "out", "into", "just", "see", "him", "your", "come", "could", "now",
                   "than", "like", "other", "how", "then", "its", "our", "two", "more",
                   "these", "want", "way", "look", "first", "also", "new", "because",
                   "day", "use", "no", "man", "find", "here", "thing", "give", "many",
                   "well"}
    return ngram.lower() in commonWords
# Download the speech, count 2-grams, then drop every 2-gram containing a
# common English word, leaving only the "interesting" phrases.
content = str(urlopen("http://pythonscraping.com/files/inaugurationSpeech.txt").read(), 'utf-8')
ngrams = ngrams(content, 2)
# Keep an n-gram only when none of its words is a common word.  The dict
# comprehension replaces the original copy-then-pop-while-iterating loop
# and the unidiomatic "if flag==True" test.
ngramsPicked = {key: value for key, value in ngrams.items()
                if not any(isCommon(word) for word in key.split(" "))}
sortedNgrams = sorted(ngramsPicked.items(), key=operator.itemgetter(1), reverse=True)
print(sortedNgrams)
通过结果分析,发现去除了那些无意义的词汇。('United States', 10), ('General Government', 4), ('executive department', 4), ('whole country', 3), ('Mr Jefferson', 3), ('same causes', 3), ('Chief Magistrate', 3), ('Government should', 3), ('legislative body', 3), ('called upon', 3).
现在有些核心的词汇已经被抽取出来了,它们怎么帮助我们归纳这些文字呢?一种方法是:取出现次数超过3次、排名前五的n-gram序列,分别搜索原文中包含该序列的第一句话,把这几句话连在一起,就构成了这篇演讲的概括。