#!/usr/bin/env python # -*- coding: utf-8 -*- # # Copyright (C) 2010 Radim Rehurek# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """ 这个模块包含各种通用实用函数。 """ from __future__ import with_statement import logging import warnings logger = logging.getLogger(__name__) try: from html.entities import name2codepoint as n2cp except ImportError: from htmlentitydefs import name2codepoint as n2cp try: import cPickle as _pickle except ImportError: import pickle as _pickle import re import unicodedata import os import random import itertools import tempfile from functools import wraps # for `synchronous` function lock import multiprocessing import shutil import sys from contextlib import contextmanager import subprocess import numpy as np import numbers import scipy.sparse if sys.version_info[0] >= 3: unicode = str from six import iterkeys, iteritems, u, string_types, unichr from six.moves import xrange try: from smart_open import smart_open except ImportError: logger.info("smart_open library not found; falling back to local-filesystem-only") def make_closing(base, **attrs): """ Add support for `with Base(attrs) as fout:` to the base class if it's missing. The base class' `close()` method will be called on context exit, to always close the file properly. This is needed for gzip.GzipFile, bz2.BZ2File etc in older Pythons (<=2.6), which otherwise raise "AttributeError: GzipFile instance has no attribute '__exit__'". """ if not hasattr(base, '__enter__'): attrs['__enter__'] = lambda self: self if not hasattr(base, '__exit__'): attrs['__exit__'] = lambda self, type, value, traceback: self.close() return type('Closing' + base.__name__, (base, object), attrs) def smart_open(fname, mode='rb'): _, ext = os.path.splitext(fname) if ext == '.bz2': from bz2 import BZ2File return make_closing(BZ2File)(fname, mode) if ext == '.gz': from gzip import GzipFile return make_closing(GzipFile)(fname, mode) return open(fname, mode) PAT_ALPHABETIC = re.compile('(((?![\d])\w)+)', re.UNICODE) RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE) def get_random_state(seed): """ Turn seed into a np.random.RandomState instance. Method originally from maciejkula/glove-python, and written by @joshloyal. """ if seed is None or seed is np.random: return np.random.mtrand._rand if isinstance(seed, (numbers.Integral, np.integer)): return np.random.RandomState(seed) if isinstance(seed, np.random.RandomState): return seed raise ValueError('%r cannot be used to seed a np.random.RandomState instance' % seed) def synchronous(tlockname): """ A decorator to place an instance-based lock around a method. Adapted from http://code.activestate.com/recipes/577105-synchronization-decorator-for-class-methods/ """ def _synched(func): @wraps(func) def _synchronizer(self, *args, **kwargs): tlock = getattr(self, tlockname) logger.debug("acquiring lock %r for %s" % (tlockname, func.__name__)) with tlock: # use lock as a context manager to perform safe acquire/release pairs logger.debug("acquired lock %r for %s" % (tlockname, func.__name__)) result = func(self, *args, **kwargs) logger.debug("releasing lock %r for %s" % (tlockname, func.__name__)) return result return _synchronizer return _synched class NoCM(object): def acquire(self): pass def release(self): pass def __enter__(self): pass def __exit__(self, type, value, traceback): pass nocm = NoCM() @contextmanager def file_or_filename(input): """ 返回一个类似文件的对象,准备从一开始就读取。“输入” 文件名(gz/bz2也支持)或类似于文件的对象支持查找。 """ if isinstance(input, string_types): # input was a filename: open as file yield smart_open(input) else: # input already a file-like object; just reset to the beginning input.seek(0) yield input #字符转换 def deaccent(text): """ 从给定的字符串中删除强调。输入文本要么是unicode字符串,要么是utf8编码的bytestring。 返回带有口音的输入字符串,如unicode。 >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek") u'Sef chomutovskych komunistu dostal postou bily prasek' """ if not isinstance(text, unicode): # assume utf8 for byte strings, use default (strict) error handling text = text.decode('utf8') norm = unicodedata.normalize("NFD", text) result = u('').join(ch for ch in norm if unicodedata.category(ch) != 'Mn') return unicodedata.normalize("NFC", result) def copytree_hardlink(source, dest): """ 递归地复制一个目录,这是一个目录。copytree,但hardlink文件 而不是复制。仅在UNIX系统上可用。 """ copy2 = shutil.copy2 try: shutil.copy2 = os.link shutil.copytree(source, dest) finally: shutil.copy2 = copy2 def tokenize(text, lowercase=False, deacc=False, encoding='utf8', errors="strict", to_lower=False, lower=False): """ 迭代地将标记作为unicode字符串,去掉重音符号 通过赋值,可以选择将unidoce字符串设置为小写 对于其中一个参数,小写的,低的,或低的。 输入文本可以是unicode的,也可以是utf-8编码的字节字符串。 输出的标记是字母的最大连续序列。 字符(没有数字!)。 >>> list(tokenize('Nic nemůže letět rychlostí vyšší, než 300 tisíc kilometrů za sekundu!', deacc = True)) [u'Nic', u'nemuze', u'letet', u'rychlosti', u'vyssi', u'nez', u'tisic', u'kilometru', u'za', u'sekundu'] """ lowercase = lowercase or to_lower or lower text = to_unicode(text, encoding, errors=errors) if lowercase: text = text.lower() if deacc: text = deaccent(text) return simple_tokenize(text) def simple_tokenize(text): for match in PAT_ALPHABETIC.finditer(text): yield match.group() def simple_preprocess(doc, deacc=False, min_len=2, max_len=15): """ 将文档转换为标记列表。 这个小写的例子,标记化,去重音(可选的)。输出是最后的结果 令牌=unicode字符串,它不会再被处理。 """ tokens = [ token for token in tokenize(doc, lower=True, deacc=deacc, errors='ignore') if min_len <= len(token) <= max_len and not token.startswith('_') ] return tokens def any2utf8(text, errors='strict', encoding='utf8'): """将字符串(编码中的unicode或bytestring)转换为utf8中的bytestring。""" if isinstance(text, unicode): return text.encode('utf8') # do bytestring -> unicode -> utf8 full circle, to ensure valid utf8 return unicode(text, encoding, errors=errors).encode('utf8') to_utf8 = any2utf8 def any2unicode(text, encoding='utf8', errors='strict'): """将字符串(由编码或unicode编码的bytestring)转换为unicode。""" if isinstance(text, unicode): return text return unicode(text, encoding, errors=errors) to_unicode = any2unicode def call_on_class_only(*args, **kwargs): """Raise exception when load methods are called on instance""" raise AttributeError('This method should be called on a class object.') class SaveLoad(object): """ 从这个类继承的对象具有保存/加载函数,这是一个解/pickle的函数 他们到磁盘。 这将使用pickle来进行/序列化,因此对象不能包含 不可picklable属性,例如lambda函数等。 """ @classmethod def load(cls, fname, mmap=None): """ 从文件中加载一个先前保存的对象(也可以看到保存)。 如果使用大型数组存储了对象,则可以加载该对象 通过mmap(共享内存)使用mmap='r'来实现这些数组。默认值:不使用 mmap,作为普通对象加载大数组。 如果正在加载的文件被压缩了(或者“。”广州’或‘bz2”) mmap=没有设置。如果这种情况下,负载将会增加一个IOError。 遇到。 """ logger.info("loading %s object from %s" % (cls.__name__, fname)) compress, subname = SaveLoad._adapt_by_suffix(fname) obj = unpickle(fname) obj._load_specials(fname, mmap, compress, subname) logger.info("loaded %s", fname) return obj def _load_specials(self, fname, mmap, compress, subname): """ 加载特殊存储的属性,并提供相同的属性 机会递归地包含了SaveLoad实例。 """ mmap_error = lambda x, y: IOError( 'Cannot mmap compressed object %s in file %s. ' % (x, y) + 'Use `load(fname, mmap=None)` or uncompress files manually.') for attrib in getattr(self, '__recursive_saveloads', []): cfname = '.'.join((fname, attrib)) logger.info("loading %s recursively from %s.* with mmap=%s" % ( attrib, cfname, mmap)) getattr(self, attrib)._load_specials(cfname, mmap, compress, subname) for attrib in getattr(self, '__numpys', []): logger.info("loading %s from %s with mmap=%s" % ( attrib, subname(fname, attrib), mmap)) if compress: if mmap: raise mmap_error(attrib, subname(fname, attrib)) val = np.load(subname(fname, attrib))['val'] else: val = np.load(subname(fname, attrib), mmap_mode=mmap) setattr(self, attrib, val) for attrib in getattr(self, '__scipys', []): logger.info("loading %s from %s with mmap=%s" % ( attrib, subname(fname, attrib), mmap)) sparse = unpickle(subname(fname, attrib)) if compress: if mmap: raise mmap_error(attrib, subname(fname, attrib)) with np.load(subname(fname, attrib, 'sparse')) as f: sparse.data = f['data'] sparse.indptr = f['indptr'] sparse.indices = f['indices'] else: sparse.data = np.load(subname(fname, attrib, 'data'), mmap_mode=mmap) sparse.indptr = np.load(subname(fname, attrib, 'indptr'), mmap_mode=mmap) sparse.indices = np.load(subname(fname, attrib, 'indices'), mmap_mode=mmap) setattr(self, attrib, sparse) for attrib in getattr(self, '__ignoreds', []): logger.info("setting ignored attribute %s to None" % (attrib)) setattr(self, attrib, None) @staticmethod def _adapt_by_suffix(fname): """Give appropriate compress setting and filename formula""" if fname.endswith('.gz') or fname.endswith('.bz2'): compress = True subname = lambda *args: '.'.join(list(args) + ['npz']) else: compress = False subname = lambda *args: '.'.join(list(args) + ['npy']) return (compress, subname) def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2): """ 将对象保存到文件中(也可以看到load)。 如果分别是None,则自动检测到大 numpy / scipy。存储和存储对象中的稀疏数组 成单独的文件。这避免了pickle内存错误 允许mmap在负载上有效地返回大型阵列。 你也可以手动设置,在这种情况下,它必须是 将属性名存储在单独的文件中。那 在本例中没有执行自动检查。 忽略是一组不序列化的属性名(文件 处理、缓存等)。在随后的load()这些属性中 被设置为None。 pickleprotocol默认为2,因此可以导入pickle对象 在python2和3中。 """ logger.info( "saving %s object under %s, separately %s" % ( self.__class__.__name__, fname, separately)) compress, subname = SaveLoad._adapt_by_suffix(fname) restores = self._save_specials(fname, separately, sep_limit, ignore, pickle_protocol, compress, subname) try: pickle(self, fname, protocol=pickle_protocol) finally: # restore attribs handled specially for obj, asides in restores: for attrib, val in iteritems(asides): setattr(obj, attrib, val) logger.info("saved %s", fname) def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, compress, subname): """ Save aside any attributes that need to be handled separately, including by recursion any attributes that are themselves SaveLoad instances. Returns a list of (obj, {attrib: value, ...}) settings that the caller should use to restore each object's attributes that were set aside during the default pickle(). """ asides = {} sparse_matrices = (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix) if separately is None: separately = [] for attrib, val in iteritems(self.__dict__): if isinstance(val, np.ndarray) and val.size >= sep_limit: separately.append(attrib) elif isinstance(val, sparse_matrices) and val.nnz >= sep_limit: separately.append(attrib) # whatever's in `separately` or `ignore` at this point won't get pickled for attrib in separately + list(ignore): if hasattr(self, attrib): asides[attrib] = getattr(self, attrib) delattr(self, attrib) recursive_saveloads = [] restores = [] for attrib, val in iteritems(self.__dict__): if hasattr(val, '_save_specials'): # better than 'isinstance(val, SaveLoad)' if IPython reloading recursive_saveloads.append(attrib) cfname = '.'.join((fname, attrib)) restores.extend(val._save_specials( cfname, None, sep_limit, ignore, pickle_protocol, compress, subname)) try: numpys, scipys, ignoreds = [], [], [] for attrib, val in iteritems(asides): if isinstance(val, np.ndarray) and attrib not in ignore: numpys.append(attrib) logger.info("storing np array '%s' to %s" % ( attrib, subname(fname, attrib))) if compress: np.savez_compressed(subname(fname, attrib), val=np.ascontiguousarray(val)) else: np.save(subname(fname, attrib), np.ascontiguousarray(val)) elif isinstance(val, (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)) and attrib not in ignore: scipys.append(attrib) logger.info("storing scipy.sparse array '%s' under %s" % ( attrib, subname(fname, attrib))) if compress: np.savez_compressed( subname(fname, attrib, 'sparse'), data=val.data, indptr=val.indptr, indices=val.indices) else: np.save(subname(fname, attrib, 'data'), val.data) np.save(subname(fname, attrib, 'indptr'), val.indptr) np.save(subname(fname, attrib, 'indices'), val.indices) data, indptr, indices = val.data, val.indptr, val.indices val.data, val.indptr, val.indices = None, None, None try: # store array-less object pickle(val, subname(fname, attrib), protocol=pickle_protocol) finally: val.data, val.indptr, val.indices = data, indptr, indices else: logger.info("not storing attribute %s" % (attrib)) ignoreds.append(attrib) self.__dict__['__numpys'] = numpys self.__dict__['__scipys'] = scipys self.__dict__['__ignoreds'] = ignoreds self.__dict__['__recursive_saveloads'] = recursive_saveloads except: # restore the attributes if exception-interrupted for attrib, val in iteritems(asides): setattr(self, attrib, val) raise return restores + [(self, asides)] def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2): """ Save the object to file (also see `load`). `fname_or_handle` is either a string specifying the file name to save to, or an open file-like object which can be written to. If the object is a file handle, no special array handling will be performed; all attributes will be saved to the same file. If `separately` is None, automatically detect large numpy/scipy.sparse arrays in the object being stored, and store them into separate files. This avoids pickle memory errors and allows mmap'ing large arrays back on load efficiently. You can also set `separately` manually, in which case it must be a list of attribute names to be stored in separate files. The automatic check is not performed in this case. `ignore` is a set of attribute names to *not* serialize (file handles, caches etc). On subsequent load() these attributes will be set to None. `pickle_protocol` defaults to 2 so the pickled object can be imported in both Python 2 and 3. """ try: _pickle.dump(self, fname_or_handle, protocol=pickle_protocol) logger.info("saved %s object" % self.__class__.__name__) except TypeError: # `fname_or_handle` does not have write attribute self._smart_save(fname_or_handle, separately, sep_limit, ignore, pickle_protocol=pickle_protocol) #endclass SaveLoad def identity(p): """Identity fnc, for flows that don't accept lambda (pickling etc).""" return p def get_max_id(corpus): """ Return the highest feature id that appears in the corpus. For empty corpora (no features at all), return -1. """ maxid = -1 for document in corpus: maxid = max(maxid, max([-1] + [fieldid for fieldid, _ in document])) # [-1] to avoid exceptions from max(empty) return maxid #产生ID的关键类 class FakeDict(object): """ 这个类的对象充当映射整数-str(integer)的字典 一个指定范围的整数小于0,numterms)。 这是为了避免在num术语庞大时分配真正的字典。 是对记忆的浪费。 """ def __init__(self, num_terms): self.num_terms = num_terms def __str__(self): return "FakeDict(num_terms=%s)" % self.num_terms def __getitem__(self, val): if 0 <= val < self.num_terms: return str(val) raise ValueError("internal id out of bounds (%s, expected <0..%s))" % (val, self.num_terms)) def iteritems(self): for i in xrange(self.num_terms): yield i, str(i) def keys(self): """ 重写.keys()函数,该函数用于确定最大值 语料库的内部id=词汇的维数。 为了避免实现整个范围(0,self。num术语)”,这将返回 最高的id=self。num_terms - 1]。 """ return [self.num_terms - 1] def __len__(self): return self.num_terms def get(self, val, default=None): if 0 <= val < self.num_terms: return str(val) return default def dict_from_corpus(corpus): """ 扫描语料库中出现的所有单词id,然后构造并返回一个映射 它映射每个wordId-str(wordId)。 每当需要显示单词时,这个函数就会被使用(与之相反的是 他们的id)但是没有文字映射。生成的映射 只涵盖语料库中实际使用的词,最高的词是被发现的。 """ num_terms = 1 + get_max_id(corpus) id2word = FakeDict(num_terms) return id2word def is_corpus(obj): """ 检查obj是否是一个文集。返回(is文集,新)2元组,在那里 obj是obj,如果obj是可迭代的,或者新生成的序列是相同的 obj是一个迭代器。 obj是一个语料库,如果它支持对文档进行迭代,那就是文档 反过来,任何操作都是2元组(int,float)的序列。 注意:一个“空的”语料库(空的输入序列)是不确定的,所以在本例中 结果被强制定义为is文集=False。 """ try: if 'Corpus' in obj.__class__.__name__: # the most common case, quick hack return True, obj except: pass try: if hasattr(obj, 'next') or hasattr(obj, '__next__'): # the input is an iterator object, meaning once we call next() # that element could be gone forever. we must be careful to put # whatever we retrieve back again doc1 = next(obj) obj = itertools.chain([doc1], obj) else: doc1 = next(iter(obj)) # empty corpus is resolved to False here if len(doc1) == 0: # sparse documents must have a __len__ function (list, tuple...) return True, obj # the first document is empty=>assume this is a corpus id1, val1 = next(iter(doc1)) # if obj is a 1D numpy array(scalars) instead of 2-tuples, it resolves to False here id1, val1 = int(id1), float(val1) # must be a 2-tuple (integer, float) except Exception: return False, obj return True, obj def get_my_ip(): """ 尝试获取我们的外部ip(从pyro名称服务器的角度) 它试图回避伪造/etc/hosts条目和其他的问题 本地的错误配置,常常会破坏主机名的解析。 如果所有其他方法都失败了,则返回到简单的socket.gethostbyname()的查找。 """ import socket try: import Pyro4 # we know the nameserver must exist, so use it as our anchor point ns = Pyro4.naming.locateNS() s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) s.connect((ns._pyroUri.host, ns._pyroUri.port)) result, port = s.getsockname() except: try: # see what ifconfig says about our default interface import commands result = commands.getoutput("ifconfig").split("\n")[1].split()[1][5:] if len(result.split('.')) != 4: raise Exception() except: # give up, leave the resolution to gethostbyname result = socket.gethostbyname(socket.gethostname()) return result class RepeatCorpus(SaveLoad): """ Used in the tutorial on distributed computing and likely not useful anywhere else. """ def __init__(self, corpus, reps): """ 将文集作为另一个长度代表的语料库。这是通过 不断重复来自文集的文档,直到被请求 len(结果)= =代表的长度。重复做 出现动态=有效,通过“itertools”。 >>> corpus = [[(1, 0.5)], []] # 2 documents >>> list(RepeatCorpus(corpus, 5)) # repeat 2.5 times to get 5 documents [[(1, 0.5)], [], [(1, 0.5)], [], [(1, 0.5)]] """ self.corpus = corpus self.reps = reps def __iter__(self): return itertools.islice(itertools.cycle(self.corpus), self.reps) class RepeatCorpusNTimes(SaveLoad): def __init__(self, corpus, n): """ Repeat a `corpus` `n` times. >>> corpus = [[(1, 0.5)], []] >>> list(RepeatCorpusNTimes(corpus, 3)) # repeat 3 times [[(1, 0.5)], [], [(1, 0.5)], [], [(1, 0.5)], []] """ self.corpus = corpus self.n = n def __iter__(self): for _ in xrange(self.n): for document in self.corpus: yield document class ClippedCorpus(SaveLoad): def __init__(self, corpus, max_docs=None): """ 返回一个语料库,它是输入可迭代语料库的“头部”。 maxdocs之后的任何文档都将被忽略。这有效地限制了 返回的语料库的长度到小于=maxdocs。设置“max_docs =没有” “没有限制”,有效地包装了整个输入语料库。 """ self.corpus = corpus self.max_docs = max_docs def __iter__(self): return itertools.islice(self.corpus, self.max_docs) def __len__(self): return min(self.max_docs, len(self.corpus)) class SlicedCorpus(SaveLoad): def __init__(self, corpus, slice_): """ 返回一个语料库,它是输入可迭代语料库的一部分。 只有当语料库可索引时,才可以使用负面切片。 否则,将对文集进行迭代。 切片也可以是一个np。ndarray来支持复杂的索引。 注意:计算切片库的大小是很昂贵的 当使用切片时,必须对其进行遍历。 使用列表或np。ndarray没有这个缺点,但是 消耗更多的内存。 """ self.corpus = corpus self.slice_ = slice_ self.length = None def __iter__(self): if hasattr(self.corpus, 'index') and len(self.corpus.index) > 0: return (self.corpus.docbyoffset(i) for i in self.corpus.index[self.slice_]) else: return itertools.islice(self.corpus, self.slice_.start, self.slice_.stop, self.slice_.step) def __len__(self): # check cached length, calculate if needed if self.length is None: if isinstance(self.slice_, (list, np.ndarray)): self.length = len(self.slice_) else: self.length = sum(1 for x in self) return self.length def safe_unichr(intval): try: return unichr(intval) except ValueError: # ValueError: unichr() arg not in range(0x10000) (narrow Python build) s = "\\U%08x" % intval # return UTF16 surrogate pair return s.decode('unicode-escape') def decode_htmlentities(text): """ Decode HTML entities in text, coded as hex, decimal or named. Adapted from http://github.com/sku/python-twitter-ircbot/blob/321d94e0e40d0acc92f5bf57d126b57369da70de/html_decode.py >>> u = u'E tu vivrai nel terrore - L'aldilà (1981)' >>> print(decode_htmlentities(u).encode('UTF-8')) E tu vivrai nel terrore - L'aldilà (1981) >>> print(decode_htmlentities("l'eau")) l'eau >>> print(decode_htmlentities("foo < bar")) foo < bar """ def substitute_entity(match): try: ent = match.group(3) if match.group(1) == "#": # decoding by number if match.group(2) == '': # number is in decimal return safe_unichr(int(ent)) elif match.group(2) in ['x', 'X']: # number is in hex return safe_unichr(int(ent, 16)) else: # they were using a name cp = n2cp.get(ent) if cp: return safe_unichr(cp) else: return match.group() except: # in case of errors, return original input return match.group() return RE_HTML_ENTITY.sub(substitute_entity, text) def chunkize_serial(iterable, chunksize, as_numpy=False): """ 返回元素的返回元素,在纯大小的列表中。最后返回 元素可能更小(如果集合的长度不能被块大小整除)。 >>> print(list(grouper(range(10), 3))) [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]] """ it = iter(iterable) while True: if as_numpy: # convert each document to a 2d numpy array (~6x faster when transmitting # chunk data over the wire, in Pyro) wrapped_chunk = [[np.array(doc) for doc in itertools.islice(it, int(chunksize))]] else: wrapped_chunk = [list(itertools.islice(it, int(chunksize)))] if not wrapped_chunk[0]: break # memory opt: wrap the chunk and then pop(), to avoid leaving behind a dangling reference yield wrapped_chunk.pop() grouper = chunkize_serial class InputQueue(multiprocessing.Process): def __init__(self, q, corpus, chunksize, maxsize, as_numpy): super(InputQueue, self).__init__() self.q = q self.maxsize = maxsize self.corpus = corpus self.chunksize = chunksize self.as_numpy = as_numpy def run(self): it = iter(self.corpus) while True: chunk = itertools.islice(it, self.chunksize) if self.as_numpy: # HACK XXX convert documents to numpy arrays, to save memory. # This also gives a scipy warning at runtime: # "UserWarning: indices array has non-integer dtype (float64)" wrapped_chunk = [[np.asarray(doc) for doc in chunk]] else: wrapped_chunk = [list(chunk)] if not wrapped_chunk[0]: self.q.put(None, block=True) break try: qsize = self.q.qsize() except NotImplementedError: qsize = '?' logger.debug("prepared another chunk of %i documents (qsize=%s)" % (len(wrapped_chunk[0]), qsize)) self.q.put(wrapped_chunk.pop(), block=True) #endclass InputQueue if os.name == 'nt': warnings.warn("detected Windows; aliasing chunkize to chunkize_serial") def chunkize(corpus, chunksize, maxsize=0, as_numpy=False): for chunk in chunkize_serial(corpus, chunksize, as_numpy=as_numpy): yield chunk else: def chunkize(corpus, chunksize, maxsize=0, as_numpy=False): """ 将一串值分成更小的块。 每个块的长度都是块大小,除了最后一个可能比较小的块。 一个曾经的输入流(来自生成器的语料库)是可以的,分块完成 有效地出现通过itertools。 如果maxsize-1,不要在连续的块收益之间等待,但是 相反,要不断地填充一个短队列(大小为maxsize) 提前的块。这是通过启动一个单独的进程来实现的,并且 这意味着减少输入/输出延迟,这在语料库出现时是非常重要的 从慢速介质(如硬盘)。 如果maxsize==0,不要在并行操作中浪费时间,只需要简单地生成块大小 通过“chunkize_serial()(没有I / O优化)。 >>> for chunk in chunkize(range(10), 4): print(chunk) [0, 1, 2, 3] [4, 5, 6, 7] [8, 9] """ assert chunksize > 0 if maxsize > 0: q = multiprocessing.Queue(maxsize=maxsize) worker = InputQueue(q, corpus, chunksize, maxsize=maxsize, as_numpy=as_numpy) worker.daemon = True worker.start() while True: chunk = [q.get(block=True)] if chunk[0] is None: break yield chunk.pop() else: for chunk in chunkize_serial(corpus, chunksize, as_numpy=as_numpy): yield chunk def smart_extension(fname, ext): fname, oext = os.path.splitext(fname) if oext.endswith('.bz2'): fname = fname + oext[:-4] + ext + '.bz2' elif oext.endswith('.gz'): fname = fname + oext[:-3] + ext + '.gz' else: fname = fname + oext + ext return fname def pickle(obj, fname, protocol=2): """Pickle object `obj` to file `fname`. `protocol` defaults to 2 so pickled objects are compatible across Python 2.x and 3.x. """ with smart_open(fname, 'wb') as fout: # 'b' for binary, needed on Windows _pickle.dump(obj, fout, protocol=protocol) def unpickle(fname): """Load pickled object from `fname`""" with smart_open(fname, 'rb') as f: # Because of loading from S3 load can't be used (missing readline in smart_open) if sys.version_info > (3, 0): return _pickle.load(f, encoding='latin1') else: return _pickle.loads(f.read()) def revdict(d): """ 反向映射字典。 当两个键映射到相同的值时,只有一个键会被保存在这个值中。 结果(保留的是任意的)。 """ return dict((v, k) for (k, v) in iteritems(dict(d))) def toptexts(query, texts, index, n=10): """ 调试fnc以帮助检查最高n最相似的文档(根据a 相似索引索引),以查看它们是否与查询相关。 文本是任何可以为每个文档返回深刻见解的对象 通过文本检验,如全文或代码片段。 返回一个3元组的列表(检验,doc与查询的相似性,文本检验)。 """ sims = index[query] # perform a similarity query against the corpus sims = sorted(enumerate(sims), key=lambda item: -item[1]) result = [] for topid, topcosine in sims[:n]: # only consider top-n most similar docs result.append((topid, topcosine, texts[topid])) return result def randfname(prefix='gensim'): randpart = hex(random.randint(0, 0xffffff))[2:] return os.path.join(tempfile.gettempdir(), prefix + randpart) def upload_chunked(server, docs, chunksize=1000, preprocess=None): """ 内存友好的将文档上载到SimServer(或Pyro SimServer代理)。 使用该函数来训练或索引大型集合—避免发送 整个语料库作为一个单独的Pyro内存对象。这些文件 将会被发送到较小的块中,每一个都是纯文档大小的文档。 """ start = 0 for chunk in grouper(docs, chunksize): end = start + len(chunk) logger.info("uploading documents %i-%i" % (start, end - 1)) if preprocess is not None: pchunk = [] for doc in chunk: doc['tokens'] = preprocess(doc['text']) del doc['text'] pchunk.append(doc) chunk = pchunk server.buffer(chunk) start = end def getNS(host=None, port=None, broadcast=True, hmac_key=None): """ Return a Pyro name server proxy. """ import Pyro4 try: return Pyro4.locateNS(host, port, broadcast, hmac_key) except Pyro4.errors.NamingError: raise RuntimeError("Pyro name server not found") def pyro_daemon(name, obj, random_suffix=False, ip=None, port=None, ns_conf={}): """ 使用名称服务器注册对象(如果不运行,启动名称服务器 但是)并且阻塞直到该守护进程终止。该对象在下面注册 如果设置了随机后缀,那么名字或名字加上一些随机后缀。 """ if random_suffix: name += '.' + hex(random.randint(0, 0xffffff))[2:] import Pyro4 with getNS(**ns_conf) as ns: with Pyro4.Daemon(ip or get_my_ip(), port or 0) as daemon: # register server for remote access uri = daemon.register(obj, name) ns.remove(name) ns.register(name, uri) logger.info("%s registered with nameserver (URI '%s')" % (name, uri)) daemon.requestLoop() def has_pattern(): """ 函数返回一个标志,指示是否安装了模式 """ try: from pattern.en import parse return True except ImportError: return False def lemmatize( content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False, stopwords=frozenset(), min_length=2, max_length=15): """ 这个函数只有在安装了可选的“模式”包时才可用。 使用来自模式的英语lemmatizer来提取utf-8编码的令牌 其基本形式=引理,如。“是”,“是”,“是”等等。 这是一个更聪明的版本,将单词上下文考虑在内。 在缺省情况下只考虑名词、动词、形容词和副词(=所有其他引理被丢弃)。 >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21') ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN'] >>> lemmatize('The study ranks high.') ['study/NN', 'rank/VB', 'high/JJ'] >>> lemmatize('The ranks study hard.') ['rank/NN', 'study/VB', 'hard/RB'] """ if not has_pattern(): raise ImportError("Pattern library is not installed. Pattern library is needed in order to use lemmatize function") from pattern.en import parse if light: import warnings warnings.warn("The light flag is no longer supported by pattern.") # tokenization in `pattern` is weird; it gets thrown off by non-letters, # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little # FIXME this throws away all fancy parsing cues, including sentence structure, # abbreviations etc. content = u(' ').join(tokenize(content, lower=True, errors='ignore')) parsed = parse(content, lemmata=True, collapse=False) result = [] for sentence in parsed: for token, tag, _, _, lemma in sentence: if min_length <= len(lemma) <= max_length and not lemma.startswith('_') and lemma not in stopwords: if allowed_tags.match(tag): lemma += "/" + tag[:2] result.append(lemma.encode('utf8')) return result def mock_data_row(dim=1000, prob_nnz=0.5, lam=1.0): """ 创建一个随机的gensim稀疏矢量。每个坐标都是非0的 概率检验,每个非零的坐标值都是从 Poisson的一个参数,等于拉姆。 """ nnz = np.random.uniform(size=(dim,)) data = [(i, float(np.random.poisson(lam=lam) + 1.0)) for i in xrange(dim) if nnz[i] < prob_nnz] return data def mock_data(n_items=1000, dim=1000, prob_nnz=0.5, lam=1.0): """ 创建一个随机的gensimstyle语料库,作为一个列表(int,float)元组, 用作模拟语料库。 """ data = [mock_data_row(dim=dim, prob_nnz=prob_nnz, lam=lam) for _ in xrange(n_items)] return data def prune_vocab(vocab, min_reduce, trim_rule=None): """ 从vocab字典中删除所有条目,计数小于minreduce。 修改vocab,返回所有被修剪的计数的和。 """ result = 0 old_len = len(vocab) for w in list(vocab): # make a copy of dict's keys if not keep_vocab_item(w, vocab[w], min_reduce, trim_rule): # vocab[w] <= min_reduce: result += vocab[w] del vocab[w] logger.info("pruned out %i tokens with count <=%i (before %i, after %i)", old_len - len(vocab), min_reduce, old_len, len(vocab)) return result def qsize(queue): """返回可用的(近似)队列大小;不(OS X)-1。""" try: return queue.qsize() except NotImplementedError: # OS X doesn't support qsize return -1 RULE_DEFAULT = 0 RULE_DISCARD = 1 RULE_KEEP = 2 def keep_vocab_item(word, count, min_count, trim_rule=None): default_res = count >= min_count if trim_rule is None: return default_res else: rule_res = trim_rule(word, count, min_count) if rule_res == RULE_KEEP: return True elif rule_res == RULE_DISCARD: return False else: return default_res def check_output(stdout=subprocess.PIPE, *popenargs, **kwargs): """ 使用参数运行命令,并将输出作为字节字符串返回。 从Python 2.7中返回,因为它在stdlib中实现为纯Python。 >>> check_output(args=['/usr/bin/python', '--version']) Python 2.6.2 Added extra KeyboardInterrupt handling """ try: logger.debug("COMMAND: %s %s", popenargs, kwargs) process = subprocess.Popen(stdout=stdout, *popenargs, **kwargs) output, unused_err = process.communicate() retcode = process.poll() if retcode: cmd = kwargs.get("args") if cmd is None: cmd = popenargs[0] error = subprocess.CalledProcessError(retcode, cmd) error.output = output raise error return output except KeyboardInterrupt: process.terminate() raise def sample_dict(d, n=10, use_random=True): """ 从字典d中选择n项,并将它们作为一个列表返回。 如果使用随机选择的话,这些项目是随机选择的。 根据自然的“迭代”的说法。 """ selected_keys = random.sample(list(d), min(len(d), n)) if use_random else itertools.islice(iterkeys(d), n) return [(key, d[key]) for key in selected_keys] def strided_windows(ndarray, window_size): """ 产生一个numpy。窗户的ndarray,从一个滑动窗口 >>> strided_windows(np.arange(5), 2) array([[0, 1], [1, 2], [2, 3], [3, 4]]) >>> strided_windows(np.arange(10), 5) array([[0, 1, 2, 3, 4], [1, 2, 3, 4, 5], [2, 3, 4, 5, 6], [3, 4, 5, 6, 7], [4, 5, 6, 7, 8], [5, 6, 7, 8, 9]]) Args: ndarray: either a numpy.ndarray or something that can be converted into one. window_size: sliding window size. Returns: numpy.ndarray of the subsequences produced by sliding a window of the given size over the `ndarray`. Since this uses striding, the individual arrays are views rather than copies of `ndarray`. Changes to one view modifies the others and the original. """ ndarray = np.asarray(ndarray) if window_size == ndarray.shape[0]: return np.array([ndarray]) elif window_size > ndarray.shape[0]: return np.ndarray((0, 0)) stride = ndarray.strides[0] return np.lib.stride_tricks.as_strided( ndarray, shape=(ndarray.shape[0] - window_size + 1, window_size), strides=(stride, stride)) def iter_windows(texts, window_size, copy=False, ignore_below_size=True, include_doc_num=False): """使用窗口大小的滑动窗口在给定的文本上生成一个生成器。 所生成的窗口是一些文本的子序列的视图。使用深拷贝 相反,通过“复制= True”。 参数: 文本:字符串句子的列表。 窗口大小:滑动窗口的大小。 复制:错误使用文本的视图(默认)或True来生成深度副本。 忽略大小:忽略至少窗口大小(默认行为)的文档。 如果是假的,窗口大小的文档将作为完整的文档被产生。 """ for doc_num, document in enumerate(texts): for window in _iter_windows(document, window_size, copy, ignore_below_size): if include_doc_num: yield (doc_num, window) else: yield window def _iter_windows(document, window_size, copy=False, ignore_below_size=True): doc_windows = strided_windows(document, window_size) if doc_windows.shape[0] == 0: if not ignore_below_size: yield document.copy() if copy else document else: for doc_window in doc_windows: yield doc_window.copy() if copy else doc_window