def opendb():
    """Load and return the document-id mapping pickled in 'dbase_doc_id'.

    Returns:
        The object stored in the pickle file. Elsewhere in this script it is
        indexed by mail file path to obtain that mail's document id.

    Raises:
        IOError / OSError: if 'dbase_doc_id' cannot be opened.
        Any exception raised by ``pickle.load`` for a corrupt file.
    """
    # SECURITY NOTE: unpickling can execute arbitrary code, so only load a
    # 'dbase_doc_id' file that this project generated itself.
    #
    # Binary mode is what pickle expects, and the ``with`` block guarantees
    # the handle is closed (the previous version never closed it).
    # NOTE(review): if 'dbase_doc_id' was written in *text* mode on Windows
    # under Python 2, confirm it still loads when read in binary mode.
    with open('dbase_doc_id', 'rb') as opdb:
        return pickle.load(opdb)


# 1. Building the inverted list for the "To" field (the "From" field is
#    handled in the same way, so only the "To" algorithm is given here).
def invertedlist_to():
    """Build the inverted list of the 'To' header and pickle it to 'dbase_to'.

    Reads one mail file path per line from 'out.txt', parses each mail,
    splits its 'To' header on ',' and records, for every address, how many
    times it occurs in each document.

    The resulting dictionary maps an address string to a list of
    ``[doc_id, count]`` postings, where ``doc_id`` comes from the mapping
    returned by ``opendb()``. The dictionary is written to 'dbase_to' with
    pickle and a short progress report is printed.

    Returns:
        None.

    Raises:
        IOError / OSError: if 'out.txt', a listed mail file or 'dbase_to'
            cannot be opened.
        KeyError: if a mail that has a 'To' header is missing from the
            document-id mapping.
    """
    # Single-argument print(...) calls produce the same output under both
    # Python 2 and Python 3.
    print('------------------\'To\'-------------------')
    doc_id = opendb()

    # Strips leading whitespace and any CR LF pair left over from folded
    # header lines. Compiled once instead of on every token.
    strip_pattern = re.compile(r'^\s*|\r\n')

    # to_mapping holds the hash map: mail address -> [[doc_id, count], ...]
    to_mapping = {}

    with open('out.txt', 'r') as doclist:
        for eachline in doclist:
            # NOTE(review): this drops the last TWO characters of the line,
            # which presumably assumes a two-character line terminator. It is
            # kept as-is because the keys of the document-id mapping must
            # match the path produced here; verify against the script that
            # generates 'out.txt' and 'dbase_doc_id'.
            eachline_split = eachline[0:-2]

            # Close every mail file as soon as it is parsed; the previous
            # version leaked one open handle per mail.
            with open(eachline_split, 'r') as fp:
                msg = email.message_from_file(fp)

            to = msg.get("To")
            if to is None:
                continue

            current_id = doc_id[eachline_split]

            # Count every address of this message first, so the postings of
            # an address are scanned once per message, not once per token.
            counts = {}
            for raw_token in to.split(','):  # addresses are separated by ','
                address = strip_pattern.sub('', raw_token)
                counts[address] = counts.get(address, 0) + 1

            for address, times in counts.items():
                postings = to_mapping.setdefault(address, [])
                for posting in postings:
                    # The document may already have a posting (for example
                    # when the same path is listed twice): add to it.
                    if posting[0] == current_id:
                        posting[1] += times
                        break
                else:
                    postings.append([current_id, times])

    # The ``with`` block guarantees the pickle is flushed and closed before
    # "Done" is reported. The handle used to be left open.
    with open('dbase_to', 'wb') as mydb:
        pickle.dump(to_mapping, mydb)

    print('- Done!')
    print('Total %d words !' % len(to_mapping))
    print('(Pickle file \'dbase_to\' is generated,')
    print('it\'s the inverted list of \'To\' fields in the mail list.')
    print('\'dbase_to\' has already been stored in hard the disk.)')
    print('--------------End for \'To\'---------------')
    print('')
2. 构建“Subject”域和邮件主题的倒排表
思路与上面完全相同,只是分词方法不同。"Subject"和邮件主题都是传达内容的语句,因此直接将每个单词进行分词:
subject_token=re.split('[^\w]*',subject)
这里采取的方案较简单,未考虑同一个词的多种表达形式(复数,ing等),也未去除停用词(这一点非常不好,致使后来的Top 50失去意义)。
# The 'Date' field contains times such as '23:34:00', and that form should be
# kept, so ':' is treated as part of a token. Everything else that is not a
# word character acts as a separator; a run of separators (", " for example)
# counts as a single split point, so no empty tokens appear between words.
# The character class must not contain a nested '[': written as '[^[\w:]]*'
# the pattern is parsed as '[^[\w:]' followed by ']*'.
date_token = re.split(r'[^\w:]+', date)
最后导出序列化的字典