这里有一个大文本,文件请从 http://10.125.9.144:8000/document.zip 获取,在解压后大约有20m(实际比赛时文件是1.1G)。 文本中都是英文单词,空格以及英文的标点符号: [.,;-~"?'!] (句号,逗号,分号,破折号,波浪号,双引号,问号,单引号,感叹号)
请统计出该文本中最常出现的前10个单词(不区分大小写)。 请注意,在统计中这20个单词请忽略(the, and, i, to, of, a, in, was, that, had, he, you, his, my, it, as, with, her, for, on)
# http://stackoverflow.com/questions/4215472/python-take-max-n-elements-from-some-list
"""Count the 10 most common words (case-insensitive) in a large text file.

The text contains English words, spaces, and the punctuation [.,;-~"?'!].
Per the task statement, these 20 stop words are excluded from the ranking.
"""
import collections
import re
import time

# Exactly the 20 stop words the task says to ignore.  (The previous list
# had 30 entries, was missing 'my', and contained extras such as 'at',
# 'by', 'not', 'be', 'from', 'but' — so it did not match the spec.)
IGNORE_WORDS = frozenset([
    'the', 'and', 'i', 'to', 'of', 'a', 'in', 'was', 'that', 'had',
    'he', 'you', 'his', 'my', 'it', 'as', 'with', 'her', 'for', 'on',
])

# Compiled once — this pattern runs over the whole (potentially 1.1 GB) text.
_WORD_RE = re.compile(r"[a-z]+")


def words(text):
    """Return every lowercase alphabetic run in *text*, in order."""
    return _WORD_RE.findall(text.lower())


def train(features):
    """Return a Counter mapping each token to its true occurrence count.

    The original used ``defaultdict(lambda: 1)``, which over-counted every
    word by one; ``Counter`` gives exact frequencies and most_common().
    """
    return collections.Counter(features)


def top_words(counts, n=10, ignore=IGNORE_WORDS):
    """Return the *n* most common (word, count) pairs, skipping *ignore*.

    Filtering happens BEFORE truncation, so the result always has exactly
    min(n, available) entries.  (The original took the top 40 and filtered
    afterwards, which could yield more or fewer than 10 words.)
    """
    result = []
    for word, count in counts.most_common():
        if word in ignore:
            continue
        result.append((word, count))
        if len(result) == n:
            break
    return result


def _timed(label, func, *args):
    """Run ``func(*args)``, print the elapsed milliseconds, return the result."""
    start = time.time()
    result = func(*args)
    print(label, (time.time() - start) * 1000)
    return result


def _read_file(path):
    # Context manager closes the handle (the original leaked it via file().read()).
    with open(path) as fh:
        return fh.read()


if __name__ == "__main__":
    text = _timed('read', _read_file, '/duitang/data/nltk_data/big.txt')
    tokens = _timed('re', words, text)
    counts = _timed('dict', train, tokens)
    print(_timed('sort', top_words, counts))