The sklearn.feature_extraction.text.CountVectorizer and sklearn.feature_extraction.text.TfidfVectorizer classes suffer from a number of scalability issues that all stem from the internal use of the vocabulary_ attribute (a Python dictionary), which maps the unicode string feature names to integer feature indices.

The main scalability issues are:

- vocabulary_ is a shared state: it implies complex synchronization and overhead
- vocabulary_ needs to be learned from the data: its size cannot be known before making one full pass over the dataset

To better understand the issue, let's have a look at how the vocabulary_ attribute works. At fit time, the tokens of the corpus are uniquely identified by integer indices, and this mapping is stored in the vocabulary:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=1)
vectorizer.fit(["The cat sat on the mat.",])
vectorizer.vocabulary_
X = vectorizer.transform([
"The cat sat on the mat.",
"This cat is a nice cat.",
]).toarray()
print(len(vectorizer.vocabulary_))
print(vectorizer.get_feature_names())
print(X)
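As an illustrative aside (not in the original example), note that tokens absent from the fitted vocabulary are silently dropped at transform time; that is why "this", "is", and "nice" do not show up in X above. A document made only of unseen tokens maps to an all-zero row:

# every token here is out of vocabulary, so the transformed row is all zeros
print(vectorizer.transform(["An entirely unseen sentence."]).toarray())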
Let's refit with a slightly larger corpus:
vectorizer = CountVectorizer(min_df=1)
vectorizer.fit([
"The cat sat on the mat.",
"The quick brown fox jumps over the lazy dog.",
])
vectorizer.vocabulary_
X = vectorizer.transform([
"The cat sat on the mat.",
"This cat is a nice cat.",
]).toarray()
print(len(vectorizer.vocabulary_))
print(vectorizer.get_feature_names())
print(X)
The vocabulary_ grows (logarithmically) with the size of the training corpus.
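To make this concrete, here is a small illustrative snippet (not from the original notebook) that refits a CountVectorizer on growing slices of a toy corpus and prints the resulting vocabulary size:

from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    "The cat sat on the mat.",
    "The quick brown fox jumps over the lazy dog.",
    "This cat is a nice cat.",
]
# the number of entries in vocabulary_ keeps growing as documents are added
for n_docs in range(1, len(corpus) + 1):
    vec = CountVectorizer(min_df=1).fit(corpus[:n_docs])
    print(n_docs, "documents ->", len(vec.vocabulary_), "features")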
Downloading the data
import os
import zipfile
try:
from urllib.request import urlopen
except ImportError:
from urllib import urlopen
import tarfile
IMDB_URL = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
IMDB_ARCHIVE_NAME = "aclImdb_v1.tar.gz"
def get_datasets_folder():
here = os.path.dirname(__file__)
notebooks = os.path.join(here, 'notebooks')
datasets_folder = os.path.abspath(os.path.join(notebooks, 'datasets'))
datasets_archive = os.path.abspath(os.path.join(notebooks, 'datasets.zip'))
if not os.path.exists(datasets_folder):
if os.path.exists(datasets_archive):
print("Extracting " + datasets_archive)
zf = zipfile.ZipFile(datasets_archive)
zf.extractall('.')
assert os.path.exists(datasets_folder)
else:
print("Creating datasets folder: " + datasets_folder)
os.makedirs(datasets_folder)
else:
print("Using existing dataset folder:" + datasets_folder)
return datasets_folder
def check_imdb(datasets_folder):
print("\nChecking availability of the IMDb dataset")
archive_path = os.path.join(datasets_folder, IMDB_ARCHIVE_NAME)
imdb_path = os.path.join(datasets_folder, 'IMDb')
train_path = os.path.join(imdb_path, 'aclImdb', 'train')
test_path = os.path.join(imdb_path, 'aclImdb', 'test')
if not os.path.exists(imdb_path):
if not os.path.exists(archive_path):
print("Downloading dataset from %s (84.1MB)" % IMDB_URL)
opener = urlopen(IMDB_URL)
open(archive_path, 'wb').write(opener.read())
else:
print("Found archive: " + archive_path)
print("Extracting %s to %s" % (archive_path, imdb_path))
tar = tarfile.open(archive_path, "r:gz")
tar.extractall(path=imdb_path)
tar.close()
os.remove(archive_path)
print("Checking that the IMDb train & test directories exist...")
assert os.path.exists(train_path)
assert os.path.exists(test_path)
print("=> Success!")
if __name__ == "__main__":
datasets_folder = get_datasets_folder()
check_imdb(datasets_folder)
print("\nLoading Labeled Faces Data (~200MB)")
from sklearn.datasets import fetch_lfw_people
fetch_lfw_people(min_faces_per_person=70, resize=0.4,
data_home=datasets_folder)
print("=> Success!")
import os
train_path = os.path.join('datasets', 'IMDb', 'aclImdb', 'train')
test_path = os.path.join('datasets', 'IMDb', 'aclImdb', 'test')
# Load them into our active session using scikit-learn's load_files function:
from sklearn.datasets import load_files
train = load_files(container_path=train_path, categories=['pos', 'neg'])
test = load_files(container_path=test_path, categories=['pos', 'neg'])
train.keys()
# dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])
import numpy as np
for label, data in zip(('TRAINING', 'TEST'), (train, test)):
print('\n\n%s' % label)
print('Number of documents:', len(data['data']))
print('\n1st document:\n', data['data'][0])
print('\n1st label:', data['target'][0])
print('\nClass names:', data['target_names'])
print('Class count:', np.unique(data['target']), ' -> ',np.bincount(data['target']))
To work around the limitations of vocabulary-based vectorizers, one can use the hashing trick. Instead of building and storing an explicit mapping from feature names to feature indices in a Python dictionary, we can just use a hash function and a modulus operation:
from sklearn.utils.murmurhash import murmurhash3_bytes_u32
# encode for python 3 compatibility
for word in "the cat sat on the mat".encode("utf-8").split():
print("{0} => {1}".format(word, murmurhash3_bytes_u32(word, 0) % 2 ** 20))
This mapping is completely stateless, and the dimensionality of the output space is explicitly fixed in advance (here we use a modulo 2 ** 20, which means roughly 1M dimensions). This makes it possible to work around the limitations of vocabulary-based vectorizers, both for parallelization and for online / out-of-core learning.
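As a rough, illustrative sketch of the idea (not the actual scikit-learn implementation), a tiny "hashing vectorizer" can be built by hand from murmurhash and a modulo; the helper name hash_vectorize below is made up for this example, and note that it needs no fitted state at all:

from sklearn.utils.murmurhash import murmurhash3_bytes_u32

def hash_vectorize(doc, n_features=2 ** 20):
    # map a document to a sparse {feature index: count} dict, statelessly
    counts = {}
    for token in doc.encode("utf-8").split():
        idx = murmurhash3_bytes_u32(token, 0) % n_features
        counts[idx] = counts.get(idx, 0) + 1
    return counts

print(hash_vectorize("the cat sat on the mat"))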
The HashingVectorizer class is an alternative to CountVectorizer (or to TfidfVectorizer with use_idf=False) that internally uses the murmurhash hash function:
from sklearn.feature_extraction.text import HashingVectorizer
h_vectorizer = HashingVectorizer(encoding='latin-1')
h_vectorizer
analyzer = h_vectorizer.build_analyzer()
analyzer('This is a test sentence.')
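Because there is no fitted state, two independently constructed HashingVectorizer instances map the same text to the same columns. A quick illustrative check (not in the original notebook):

from sklearn.feature_extraction.text import HashingVectorizer

v1 = HashingVectorizer(encoding='latin-1')
v2 = HashingVectorizer(encoding='latin-1')
a = v1.transform(["The cat sat on the mat."])
b = v2.transform(["The cat sat on the mat."])
print((a != b).nnz == 0)  # True: the two sparse outputs are identical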
We can vectorize our datasets into a scipy sparse matrix exactly as we would have done with CountVectorizer or TfidfVectorizer, except that we can call the transform method directly: there is no need to fit, since HashingVectorizer is a stateless transformer:
docs_train, y_train = train['data'], train['target']
docs_valid, y_valid = test['data'][:12500], test['target'][:12500]
docs_test, y_test = test['data'][12500:], test['target'][12500:]
h_vectorizer.transform(docs_train)
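The result of transform is a scipy sparse matrix whose number of columns is fixed by the n_features parameter of HashingVectorizer (2 ** 20 by default), independently of the corpus. A quick illustrative check (the X_train_hashed name is ours):

X_train_hashed = h_vectorizer.transform(docs_train)
print(type(X_train_hashed))   # scipy sparse matrix
print(X_train_hashed.shape)   # (number of documents, 2 ** 20)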
Finally, let's train a LogisticRegression classifier on the IMDb training subset:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
h_pipeline = Pipeline([
('vec', HashingVectorizer(encoding='latin-1')),
('clf', LogisticRegression(random_state=1)),
])
h_pipeline.fit(docs_train, y_train)
print('Train accuracy', h_pipeline.score(docs_train, y_train))
print('Validation accuracy', h_pipeline.score(docs_valid, y_valid))
import gc
gc.collect()
Out-of-core learning is the task of training a machine learning model on a dataset that does not fit into memory (RAM). This requires the following conditions (a minimal partial_fit sketch follows this list):

- a feature extraction layer with a fixed output dimensionality
- knowing the list of all classes in advance (in this case, we only have positive and negative reviews)
- a machine learning algorithm that supports incremental learning (the partial_fit method in scikit-learn)
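Here is a minimal sketch of the partial_fit API mentioned in the last point, on toy data rather than the IMDb set: the full list of classes has to be provided on the first call, because later mini-batches are not guaranteed to contain every class:

import numpy as np
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(random_state=1)

# first mini-batch: classes= is mandatory here
clf.partial_fit(np.array([[0.0, 1.0], [1.0, 0.0]]), np.array([0, 1]), classes=[0, 1])

# subsequent mini-batches: no classes= needed, even if a class is missing
clf.partial_fit(np.array([[0.2, 0.9]]), np.array([0]))

print(clf.predict(np.array([[0.9, 0.1]])))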
In the following sections, we will set up a simple batch-training function to iteratively train an SGDClassifier.

But first, let's load the file names into a Python list:
train_path = os.path.join('datasets', 'IMDb', 'aclImdb', 'train')
train_pos = os.path.join(train_path, 'pos')
train_neg = os.path.join(train_path, 'neg')
fnames = [os.path.join(train_pos, f) for f in os.listdir(train_pos)] +\
[os.path.join(train_neg, f) for f in os.listdir(train_neg)]
fnames[:3]
# target labels: the first 12500 files (the 'pos' folder) are the positive class
y_train = np.zeros((len(fnames), ), dtype=int)
y_train[:12500] = 1
np.bincount(y_train)
from sklearn.base import clone
def batch_train(clf, fnames, labels, iterations=25, batchsize=1000, random_seed=1):
    vec = HashingVectorizer(encoding='latin-1')
    idx = np.arange(labels.shape[0])
    c_clf = clone(clf)
    rng = np.random.RandomState(seed=random_seed)

    for _ in range(iterations):
        # draw a random batch of document indices (with replacement)
        rnd_idx = rng.choice(idx, size=batchsize)
        documents = []
        for doc_idx in rnd_idx:
            with open(fnames[doc_idx], 'r', encoding='latin-1') as f:
                documents.append(f.read())
        # stateless vectorization of the current batch
        X_batch = vec.transform(documents)
        batch_labels = labels[rnd_idx]
        c_clf.partial_fit(X=X_batch,
                          y=batch_labels,
                          classes=[0, 1])

    return c_clf
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(loss='log', random_state=1, max_iter=1000)  # note: this loss is named 'log_loss' in scikit-learn >= 1.1
sgd = batch_train(clf=sgd,
fnames=fnames,
labels=y_train)
vec = HashingVectorizer(encoding='latin-1')
sgd.score(vec.transform(docs_test), y_test)
In our implementation of the batch_train function above, we randomly draw k training samples as a batch in each iteration, which can be seen as random subsampling with replacement. Below, we instead iterate over the documents without replacement, i.e., each document is used exactly once per iteration. The short sketch right after this paragraph contrasts the two sampling schemes; the full implementation follows it.
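An illustrative comparison of the two sampling schemes on a toy index array (not part of the original code):

import numpy as np

rng = np.random.RandomState(1)
idx = np.arange(10)

# with replacement: the same index can show up several times in one batch
print(rng.choice(idx, size=5))

# without replacement: shuffle once, then walk through disjoint batches
for batch in np.split(rng.permutation(idx), 2):
    print(batch)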
import os
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.base import clone
from sklearn.datasets import load_files
def batch_train(clf, fnames, labels, iterations=1, batchsize=1000, random_seed=1):
    vec = HashingVectorizer(encoding='latin-1')
    idx = np.arange(labels.shape[0])
    c_clf = clone(clf)
    rng = np.random.RandomState(seed=random_seed)
    # shuffle once, then walk through the documents in disjoint batches
    shuffled_idx = rng.permutation(range(len(fnames)))
    fnames_ary = np.asarray(fnames)

    for _ in range(iterations):
        # np.split requires batchsize to divide the number of documents evenly
        for batch in np.split(shuffled_idx, len(fnames) // batchsize):
            documents = []
            for fn in fnames_ary[batch]:
                with open(fn, 'r', encoding="UTF-8") as f:
                    documents.append(f.read())
            X_batch = vec.transform(documents)
            batch_labels = labels[batch]
            c_clf.partial_fit(X=X_batch,
                              y=batch_labels,
                              classes=[0, 1])

    return c_clf
# Out-of-core Training
train_path = os.path.join('datasets', 'IMDb', 'aclImdb', 'train')
train_pos = os.path.join(train_path, 'pos')
train_neg = os.path.join(train_path, 'neg')
fnames = [os.path.join(train_pos, f) for f in os.listdir(train_pos)] +\
[os.path.join(train_neg, f) for f in os.listdir(train_neg)]
y_train = np.zeros((len(fnames), ), dtype=int)
y_train[:12500] = 1
np.bincount(y_train)
sgd = SGDClassifier(loss='log', random_state=1)  # note: this loss is named 'log_loss' in scikit-learn >= 1.1
sgd = batch_train(clf=sgd,
fnames=fnames,
labels=y_train)
# Testing
test_path = os.path.join('datasets', 'IMDb', 'aclImdb', 'test')
test = load_files(container_path=test_path, categories=['pos', 'neg'])
docs_test, y_test = test['data'][12500:], test['target'][12500:]
vec = HashingVectorizer(encoding='latin-1')
print('accuracy:', sgd.score(vec.transform(docs_test), y_test))