categories = ['alt.atheism', 'soc.religion.christian',
'comp.graphics', 'sci.med']
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import load_files
twenty_train = fetch_20newsgroups(subset='train',
categories=categories, shuffle=True, random_state=42)
type(twenty_train)
sklearn.datasets.base.Bunch
twenty_train.target_names
['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
len(twenty_train.data)
2257
len(twenty_train.filenames)
2257
type(twenty_train.data[0])
str
print("\n".join(twenty_train.data[0].split("\n")[:3]))
From: [email protected] (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
print(twenty_train.target_names[twenty_train.target[0]])
comp.graphics
print(twenty_train.data[0])
From: [email protected] (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
Organization: The City University
Lines: 14
Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format. We would also like to
do the same, converting to HPGL (HP plotter) files.
Please email any response.
Is this the correct group?
Thanks in advance. Michael.
--
Michael Collier (Programmer) The Computer Unit,
Email: [email protected] The City University,
Tel: 071 477-8000 x3769 London,
Fax: 071 477-8565 EC1V 0HB.
twenty_train.target[0]
1
for t in twenty_train.target[:10]:
    print(twenty_train.target_names[t])
comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med
twenty_train.target
array([1, 1, 3, ..., 2, 2, 2])
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape
(2257, 35788)
count_vect.vocabulary_.get(u'algorithm')
4690
The dictionary: a bag of words
1. Assign every word occurring in the training set a fixed integer ID, as in a dictionary.
2. For each document, count how many times each word occurs and store the count in X[i, j], where i is the document index and j is the word's index in the dictionary; that is, X[document index, dictionary index] holds that word's count.
In this bag-of-words representation, n_features is the number of distinct words, i.e. each column of the dataset corresponds to a distinct word.
- The number of such feature columns is typically well above 100,000.
If n_samples were 10,000, the dataset would occupy about 10000 * 100000 * 4 bytes = 4 GB of memory.
A bag of words, however, consists mostly of empty (zero) entries: it is a high-dimensional, sparse dataset, and keeping only the non-zero values saves a great deal of memory.
scipy.sparse matrices do exactly that, and scikit-learn supports them; the sketch below illustrates the savings.
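A small illustration (my own sketch, not part of the tutorial; the sizes are made up and far smaller than the real 20-newsgroups matrix) of the memory difference between a dense array and a scipy.sparse matrix for a mostly-zero count matrix:
import numpy as np
from scipy.sparse import csr_matrix
# toy count matrix: 1000 documents x 10000 words, almost all entries zero
dense = np.zeros((1000, 10000), dtype=np.int64)
dense[0, 42] = 3
dense[1, 7] = 1
sparse = csr_matrix(dense)
print(dense.nbytes)   # 80000000 bytes (~80 MB) for the dense array
print(sparse.data.nbytes + sparse.indices.nbytes + sparse.indptr.nbytes)   # only a few KB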
Converting the raw text into this bag-of-words form is done with CountVectorizer:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
count_vect.fit_transform(twenty_train.data)
<2257x35788 sparse matrix of type '<class 'numpy.int64'>'
	with 365886 stored elements in Compressed Sparse Row format>
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape
(2257, 35788)
CountVectorizer builds a vocabulary (a dictionary of all words), so any word can be looked up by its index:
count_vect.vocabulary_.get('algorithm')
4690
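As a quick check (my own addition, not in the tutorial), the vocabulary index links directly to the count matrix: X_train_counts[i, j] is how many times word j occurs in document i.
# look up the column index of 'algorithm' and read its count in the first document
j = count_vect.vocabulary_.get('algorithm')
print(X_train_counts[0, j])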
Counting occurrences is a good start, but there is one issue to keep in mind:
for documents on the same topic, a longer document will simply produce higher counts.
1. Solution 1
- Use occurrence count / total number of words in the document as the frequency.
- This quantity is called tf (term frequency).
2. Solution 2
- In addition, downweight words that appear in many documents, since common words carry little information.
- This weighting is called tf-idf (term frequency times inverse document frequency).
from sklearn.feature_extraction.text import TfidfTransformer
A few concepts to clarify first:
1. Term frequency
- Wikipedia definition
- f(t, d): the number of times term t occurs in document d
- in its simplest (raw count) form, tf(t, d) = f(t, d)
- t: a term, d: a single document, D: the whole collection of documents
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape
(2257, 35788)
The two calls above (fit followed by transform) can be combined into a single fit_transform:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape
(2257, 35788)
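For reference, a small sketch of my own (assuming the transformer's recent defaults, smooth_idf=True and norm='l2') of the weighting TfidfTransformer applies: idf(t) = ln((1 + n_docs) / (1 + df(t))) + 1, after which each row is L2-normalized.
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
counts = np.array([[3, 0, 1],    # toy count matrix: 2 documents, 3 terms
                   [2, 1, 0]])
n_docs = counts.shape[0]
df = (counts > 0).sum(axis=0)                    # document frequency of each term
idf = np.log((1 + n_docs) / (1 + df)) + 1        # smoothed inverse document frequency
tfidf = counts * idf
tfidf = tfidf / np.linalg.norm(tfidf, axis=1, keepdims=True)   # L2-normalize each row
print(np.allclose(tfidf, TfidfTransformer().fit_transform(counts).toarray()))   # True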
The steps so far converted the raw text into a bag-of-words representation and then re-weighted it to make it more informative.
The next step is to train a classifier so that new documents can be categorized.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)
X_new_counts
<2x35788 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>
X_new_counts.toarray()
array([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]])
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))
'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics
predicted
array([3, 1])
[i for i in zip(docs_new, predicted)]
[('God is love', 3), ('OpenGL on the GPU is fast', 1)]
To make the vectorizer => transformer => classifier sequence easier to work with, we use a Pipeline.
A pipeline bundles the steps above into a single, composite classifier.
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', MultinomialNB())])
Now the whole classifier can be trained with a single line of code:
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)
Measuring accuracy on the test set is straightforward:
import numpy as np
twenty_test = fetch_20newsgroups(subset='test',
categories=categories,
shuffle=True,
random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)
0.83488681757656458
predicted == twenty_test.target
array([ True, True, False, ..., True, True, True], dtype=bool)
Support vector machines are widely regarded as among the best algorithms for text classification,
although they are somewhat slower to train.
Switching to one only requires changing a single component of the pipeline:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', SGDClassifier(loss='hinge',
penalty='l2',
alpha=1e-3,
n_iter=5,
random_state=42)
),])
_=text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)
0.9127829560585885
Prediction accuracy improved from about 84% to 91%.
For a more detailed per-class report:
from sklearn import metrics
print(metrics.classification_report(twenty_test.target,
predicted,
target_names=twenty_test.target_names))
                        precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.87       319
         comp.graphics       0.88      0.97      0.92       389
               sci.med       0.94      0.90      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

           avg / total       0.92      0.91      0.91      1502
metrics.confusion_matrix(twenty_test.target, predicted)
array([[258, 11, 15, 35],
[ 4, 379, 3, 3],
[ 5, 33, 355, 3],
[ 5, 10, 4, 379]])
The confusion matrix
Because only four categories were used in this experiment, the full report is small enough to inspect directly; with the complete dataset, the number of samples and classes would be far larger.
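To make the matrix easier to read (a small sketch of my own), rows (true class) and columns (predicted class) can be labeled with the category names:
cm = metrics.confusion_matrix(twenty_test.target, predicted)
for name, row in zip(twenty_test.target_names, cm):
    # each row: how documents of the true class `name` were distributed over predictions
    print(name, dict(zip(twenty_test.target_names, row)))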
Different algorithms usually have many parameters to tune; GridSearchCV can search for the best combination:
from sklearn.grid_search import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
'tfidf__use_idf': (True, False),
'clf__alpha': (1e-2, 1e-3),
}
This dictionary lists the parameter values to try; because we are using a Pipeline, the keys take the form stepname__parameter.
Passing n_jobs=-1 tells GridSearchCV to detect the number of CPU cores and run the search in parallel.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)
twenty_train.target_names[gs_clf.predict(['God is love'])]
/Users/Houbowei/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:1: DeprecationWarning: converting an array with ndim > 0 to an index will result in an error in the future
if __name__ == '__main__':
'soc.religion.christian'
gs_clf.grid_scores_
[mean: 0.90430, std: 0.00570, params: {'vect__ngram_range': (1, 1), 'tfidf__use_idf': True, 'clf__alpha': 0.01},
mean: 0.92113, std: 0.01206, params: {'vect__ngram_range': (1, 2), 'tfidf__use_idf': True, 'clf__alpha': 0.01},
mean: 0.81303, std: 0.01682, params: {'vect__ngram_range': (1, 1), 'tfidf__use_idf': False, 'clf__alpha': 0.01},
mean: 0.83562, std: 0.02234, params: {'vect__ngram_range': (1, 2), 'tfidf__use_idf': False, 'clf__alpha': 0.01},
mean: 0.96544, std: 0.00329, params: {'vect__ngram_range': (1, 1), 'tfidf__use_idf': True, 'clf__alpha': 0.001},
mean: 0.95968, std: 0.00641, params: {'vect__ngram_range': (1, 2), 'tfidf__use_idf': True, 'clf__alpha': 0.001},
mean: 0.92158, std: 0.00284, params: {'vect__ngram_range': (1, 1), 'tfidf__use_idf': False, 'clf__alpha': 0.001},
mean: 0.93088, std: 0.00290, params: {'vect__ngram_range': (1, 2), 'tfidf__use_idf': False, 'clf__alpha': 0.001}]
Each entry in the list is a scikit-learn object (a _CVScoreTuple):
type(gs_clf.grid_scores_[0])
sklearn.grid_search._CVScoreTuple
gs_clf.grid_scores_[0]
mean: 0.90430, std: 0.00570, params: {'vect__ngram_range': (1, 1), 'tfidf__use_idf': True, 'clf__alpha': 0.01}
type(gs_clf.grid_scores_)
list
best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))
clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)
score
0.96544085068675234
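GridSearchCV also exposes the best score and parameter combination directly, which is simpler than scanning grid_scores_ by hand:
print(gs_clf.best_score_)    # should match the best mean score above
print(gs_clf.best_params_)   # should match the parameters printed above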
This concludes the text-mining part of the tutorial; next we will work through concrete examples to practice these techniques.