# -*- coding: utf-8 -*-
"""
Created on Sun Oct 7 09:00:32 2018
@author: asus
"""
# 9 The authorship attribution problem
# 9.1.3 Getting the data
import os
import sys
data_folder = os.path.join(
    r"E:\books\Python数据挖掘入门与实践\作者归属问题,支持向量机",
    "Data", "books")
# The data could be fetched with a crawler, but that takes too long.
# Skip the Project Gutenberg boilerplate when loading each file.
def clean_book(document):
    lines = document.split("\n")
    # Walk through every line looking for the start and end markers of the
    # work; everything in between is the actual text.
    start = 0
    end = len(lines)
    for i in range(len(lines)):
        line = lines[i]
        if line.startswith("*** START OF THIS PROJECT GUTENBERG"):
            start = i + 1
        elif line.startswith("*** END OF THIS PROJECT GUTENBERG"):
            # end = i - 1 also drops the (usually blank) line just before
            # the END marker
            end = i - 1
    # Finally, join the remaining lines back together with newlines.
    return "\n".join(lines[start:end])
# Create a function that loads every book and applies the cleaning above.
import numpy as np
# Declare the book-loading function. Its parameter is the books directory,
# which holds one subfolder per author; the book files sit inside those
# subfolders.
def load_books_data(folder=data_folder):
    documents = []
    authors = []
    # List every subfolder of the books directory
    subfolders = [subfolder for subfolder in os.listdir(folder)
                  if os.path.isdir(os.path.join(folder, subfolder))]
    # Iterate over the subfolders, using enumerate to assign each an index
    for author_number, subfolder in enumerate(subfolders):
        # Build the subfolder's full path and find all book files inside it
        full_subfolder_path = os.path.join(folder, subfolder)
        for document_name in os.listdir(full_subfolder_path):
            with open(os.path.join(full_subfolder_path,
                                   document_name)) as inf:
                documents.append(clean_book(inf.read()))
                authors.append(author_number)
    return documents, np.array(authors, dtype='int')
documents, classes = load_books_data(data_folder)
# 9.2 Function words
# Counting function words
function_words = ["a", "able", "aboard", "about", "above", "absent",
"according" , "accordingly", "across", "after", "against",
"ahead", "albeit", "all", "along", "alongside", "although",
"am", "amid", "amidst", "among", "amongst", "amount", "an",
"and", "another", "anti", "any", "anybody", "anyone",
"anything", "are", "around", "as", "aside", "astraddle",
"astride", "at", "away", "bar", "barring", "be", "because",
"been", "before", "behind", "being", "below", "beneath",
"beside", "besides", "better", "between", "beyond", "bit",
"both", "but", "by", "can", "certain", "circa", "close",
"concerning", "consequently", "considering", "could",
"couple", "dare", "deal", "despite", "down", "due", "during",
"each", "eight", "eighth", "either", "enough", "every",
"everybody", "everyone", "everything", "except", "excepting",
"excluding", "failing", "few", "fewer", "fifth", "first",
"five", "following", "for", "four", "fourth", "from", "front",
"given", "good", "great", "had", "half", "have", "he",
"heaps", "hence", "her", "hers", "herself", "him", "himself",
"his", "however", "i", "if", "in", "including", "inside",
"instead", "into", "is", "it", "its", "itself", "keeping",
"lack", "less", "like", "little", "loads", "lots", "majority",
"many", "masses", "may", "me", "might", "mine", "minority",
"minus", "more", "most", "much", "must", "my", "myself",
"near", "need", "neither", "nevertheless", "next", "nine",
"ninth", "no", "nobody", "none", "nor", "nothing",
"notwithstanding", "number", "numbers", "of", "off", "on",
"once", "one", "onto", "opposite", "or", "other", "ought",
"our", "ours", "ourselves", "out", "outside", "over", "part",
"past", "pending", "per", "pertaining", "place", "plenty",
"plethora", "plus", "quantities", "quantity", "quarter",
"regarding", "remainder", "respecting", "rest", "round",
"save", "saving", "second", "seven", "seventh", "several",
"shall", "she", "should", "similar", "since", "six", "sixth",
"so", "some", "somebody", "someone", "something", "spite",
"such", "ten", "tenth", "than", "thanks", "that", "the",
"their", "theirs", "them", "themselves", "then", "thence",
"therefore", "these", "they", "third", "this", "those",
"though", "three", "through", "throughout", "thru", "thus",
"till", "time", "to", "tons", "top", "toward", "towards",
"two", "under", "underneath", "unless", "unlike", "until",
"unto", "up", "upon", "us", "used", "various", "versus",
"via", "view", "wanting", "was", "we", "were", "what",
"whatever", "when", "whenever", "where", "whereas",
"wherever", "whether", "which", "whichever", "while",
"whilst", "who", "whoever", "whole", "whom", "whomever",
"whose", "will", "with", "within", "without", "would", "yet",
"you", "your", "yours", "yourself", "yourselves"]
# With the function-word list in place, build the counting tool.
from sklearn.feature_extraction.text import CountVectorizer
extractor = CountVectorizer(vocabulary=function_words)
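# A quick look at what the extractor produces (toy sentences of my own).
# Each column counts one word from the fixed vocabulary, so the matrix has
# len(function_words) columns: row 0 counts "the" twice and "on" once,
# row 1 counts "a" twice and "and" once; all other columns are zero.
_demo = extractor.fit_transform(["the cat sat on the mat",
                                 "a dog and a cat"])
print(_demo.shape)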
# 9.2.2 Classifying with function words
# Support vector machine (SVC)
from sklearn.svm import SVC
# from sklearn.cross_validation import cross_val_score  # moved in newer scikit-learn
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
# from sklearn import grid_search  # also moved:
# that module was deprecated in 0.18; all of its refactored classes and
# functions now live in the model_selection module
from sklearn.model_selection import GridSearchCV
#from sklearn.model_selection import train_test_split
# The SVM takes a number of parameters, which we organise in a dict:
# kernel is either 'linear' or 'rbf', and C is 1 or 10. We then use grid
# search to find the best parameter combination.
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
# A Gaussian kernel such as rbf only suits relatively small datasets, e.g.
# those with fewer than about 10,000 features.
svr = SVC()
grid = GridSearchCV(svr, parameters)
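# A minimal sketch (toy data of my own) of what the grid object does on
# its own: GridSearchCV tries every kernel/C combination with internal
# cross-validation. The real fit happens inside the pipeline below.
from sklearn.datasets import make_classification
X_toy, y_toy = make_classification(n_samples=100, random_state=0)
toy_grid = GridSearchCV(SVC(), parameters)
toy_grid.fit(X_toy, y_toy)
print(toy_grid.best_params_)  # e.g. {'C': 1, 'kernel': 'linear'}
print(toy_grid.best_score_)   # mean cross-validated score of that choice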
# Next, build a pipeline with two steps: feature extraction (the function
# words only) with CountVectorizer, and the parameter search over the SVM.
pipeline1 = Pipeline([('feature_extraction', extractor),
                      ('clf', grid)])
# Then cross-validate the whole pipeline with cross_val_score. Note that
# with more than two authors, recent scikit-learn needs an averaged score
# such as scoring='f1_macro' rather than plain 'f1'.
scores = cross_val_score(pipeline1, documents, classes, scoring='f1')
print(np.mean(scores))
# 9.3 Support vector machines
# A binary classifier: suppose the data falls into two classes that happen
# to be separable by a line, with one class on each side. The SVM's job is
# to find that line, and specifically the best separating line: the one
# that maximises the distance from the data points to it.
# 9.3.1 Classifying with an SVM
# The C parameter controls how strongly correct classification is
# enforced, at the risk of overfitting. A high C means a narrower margin:
# classify as much of the training data correctly as possible. A low C
# means a wider margin, with some points left misclassified; the lower C
# is, the lower the chance of overfitting the training data, but the
# classification may be somewhat worse. A toy sketch of this trade-off
# follows.
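# A minimal sketch (toy blobs of my own) of the C trade-off: for a linear
# kernel the margin width is 2 / ||w||, so a larger C typically yields a
# larger weight norm and hence a narrower margin.
from sklearn.datasets import make_blobs
X_c, y_c = make_blobs(n_samples=50, centers=2, cluster_std=3.0,
                      random_state=0)
for C_value in (0.01, 100):
    margin_clf = SVC(kernel='linear', C=C_value).fit(X_c, y_c)
    print(C_value, 2 / np.linalg.norm(margin_clf.coef_))  # margin width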
# 9.3.2 Kernels
# If the data is not linearly separable, we lift it into a higher-
# dimensional space, adding pseudo-features until it becomes linearly
# separable. Finding the best separating line typically requires computing
# inner products (dot products) between samples. Several kernel functions
# are in common use: the linear kernel, the Gaussian kernel (rbf) and the
# sigmoid kernel. Each gives an efficient way to measure the distance
# between the two classes of data. A toy sketch follows.
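# A minimal sketch (toy data of my own): two concentric circles are not
# linearly separable, but the rbf kernel implicitly maps them into a space
# where they are.
from sklearn.datasets import make_circles
X_k, y_k = make_circles(n_samples=100, factor=0.3, noise=0.05,
                        random_state=0)
print(SVC(kernel='linear').fit(X_k, y_k).score(X_k, y_k))  # poor, near 0.5
print(SVC(kernel='rbf').fit(X_k, y_k).score(X_k, y_k))     # near 1.0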
# 9.4 Character n-grams
# An n-gram is a sequence of N objects taken together, where N is the
# number of objects per group (for text, N usually lies between 2 and 6).
# Word-based n-grams are widely used in research, typically on document
# topics; character-based n-grams, though, have proven to work well for
# authorship attribution.
# Extracting character n-grams
# We again use the CountVectorizer class to extract the n-grams, setting
# its analyzer parameter and the value of N. scikit-learn's n-gram
# extractor has an ngram_range parameter that lets it pull out n-grams of
# several lengths at once; we don't need that here, so to extract a single
# length we pass the same value twice. A toy illustration follows.
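# A quick look at what the extractor produces (toy string of my own). The
# features are overlapping character triples, spaces included; in order of
# position these are 'hel', 'ell', 'llo', 'lo ', 'o w', ' wo', 'wor',
# 'orl', 'rld'. (Older scikit-learn spells the accessor get_feature_names.)
_ngram_demo = CountVectorizer(analyzer='char', ngram_range=(3, 3))
_ngram_demo.fit(["hello world"])
print(sorted(_ngram_demo.get_feature_names_out()))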
pipeline = Pipeline([
    ('feature_extraction', CountVectorizer(analyzer='char',
                                           ngram_range=(3, 3))),
    ('classifier', grid)])
scores = cross_val_score(pipeline, documents, classes, scoring='f1')
print("Score: {:.3f}".format(np.mean(scores)))
# 9.5 Using the Enron dataset
# https://www.cs.cmu.edu/~./enron/
# On non-Linux systems, use a free tool such as 7-zip (http://www.7-zip.org/)
# to extract the archive.
import os
# Point at the dataset's location
enron_data_folder = os.path.join(
    r"E:\books\Python数据挖掘入门与实践\作者归属问题,支持向量机",
    "enron_mail_20150507.tar", "maildir", "blair")
# 9.5.2 Building a dataset loader
# Now create a function that takes a number of senders as a parameter and
# returns the emails they sent. What we actually need is the message body
# rather than the raw email, so we also need an email parser.
from email.parser import Parser
p = Parser()
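# A minimal example of what the parser gives back (toy message of my own):
_msg = p.parsestr("From: someone@example.com\n"
                  "Subject: hello\n"
                  "\n"
                  "Just the body text.")
print(_msg.get_payload())  # -> Just the body text.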
# Set the random state
from sklearn.utils import check_random_state
# min_docs_author requires each sender to have sent at least 10 emails;
# max_docs_author caps the number of emails taken from any one user at
# 100. num_authors limits the number of authors, defaulting to 10.
def get_enron_corpus(num_authors=10, data_folder=enron_data_folder,
                     min_docs_author=10, max_docs_author=100,
                     random_state=None):
    random_state = check_random_state(random_state)
    # Collect the Enron employees' mailbox folders and shuffle them.
    email_addresses = sorted(os.listdir(data_folder))
    random_state.shuffle(email_addresses)
    # Create the document and class lists; author_num is the class number
    # assigned to each new sender.
    documents = []
    classes = []
    author_num = 0
    # Also record which senders we used, along with their class numbers.
    authors = {}
    # Walk through the mailboxes, looking for subfolders with "sent" in
    # their name, which hold outgoing mail.
    for user in email_addresses:
        users_email_folder = os.path.join(data_folder, user)
        mail_folders = [os.path.join(users_email_folder, subfolder)
                        for subfolder in os.listdir(users_email_folder)
                        if "sent" in subfolder]
        # Read every email inside those subfolders
        try:
            authored_emails = [
                open(os.path.join(mail_folder, email_filename),
                     encoding='cp1252').read()
                for mail_folder in mail_folders
                for email_filename in os.listdir(mail_folder)]
        except IsADirectoryError:
            continue
        if len(authored_emails) < min_docs_author:
            continue
        if len(authored_emails) > max_docs_author:
            authored_emails = authored_emails[:max_docs_author]
        # Parse out the message bodies and add them to the dataset.
        # (_payload is a private attribute of email's Message class;
        # get_payload() is the public accessor.)
        contents = [p.parsestr(email)._payload for email in authored_emails]
        documents.extend(contents)
        # Add this sender to the class list, once per email.
        classes.extend([author_num] * len(authored_emails))
        # Record the sender's number, then increment it for the next one.
        authors[user] = author_num
        author_num += 1
        # Once we have as many authors as requested, stop and return.
        if author_num >= num_authors or author_num >= len(email_addresses):
            break
    return documents, np.array(classes), authors
documents, classes, authors = get_enron_corpus(
    data_folder=enron_data_folder, random_state=14)
import quotequail  # used to unwrap quoted reply text
def remove_replies(email_contents):
    r = quotequail.unwrap(email_contents)
    if r is None:
        return email_contents
    if 'text_top' in r:
        return r['text_top']
    elif 'text' in r:
        return r['text']
    return email_contents
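# A small illustration of what quotequail.unwrap returns (toy reply of my
# own; the exact keys can vary with the quotequail version). Expect a dict
# with the new text under 'text_top' and the quoted part further down, or
# None when no quote is detected.
_reply = ("Thanks, sounds good.\n"
          "\n"
          "On Mon, Oct 1, 2018, someone@example.com wrote:\n"
          "> the earlier message\n")
print(quotequail.unwrap(_reply))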
documents = [remove_replies(document) for document in documents]
# 9.5.3 Putting it all together
scores = cross_val_score(pipeline, documents, classes, scoring='f1')
print("Score: {:.3f}".format(np.mean(scores)))
# 9.5.4 Evaluation
# sklearn.cross_validation was removed; train_test_split now lives in
# model_selection
from sklearn.model_selection import train_test_split
training_documents, testing_documents, y_train, y_test = train_test_split(
    documents, classes, random_state=14)
pipeline.fit(training_documents, y_train)
y_pred = pipeline.predict(testing_documents)
# Inspect the parameters the grid search settled on
print(pipeline.named_steps['classifier'].best_params_)
# Build the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)  # rows = actual, columns = predicted
cm = cm / cm.astype(float).sum(axis=1)[:, np.newaxis]  # normalise each row
# Order the author names by their class number for the axis labels
sorted_authors = sorted(authors.keys(), key=lambda x: authors[x])
# %matplotlib inline  # IPython magic; uncomment when running in Jupyter
from matplotlib import pyplot as plt
plt.figure(figsize=(10,10))
plt.imshow(cm, cmap='Blues')
tick_marks = np.arange(len(sorted_authors))
plt.xticks(tick_marks, sorted_authors)
plt.yticks(tick_marks, sorted_authors)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()