import numbers
import warnings
from operator import itemgetter

import six
from twisted.python import failure
from twisted.python.failure import Failure

from scrapy.exceptions import IgnoreRequest, ScrapyDeprecationWarning
from scrapy.middleware import MiddlewareManager
from scrapy.settings import BaseSettings
from scrapy.utils.defer import defer_fail, defer_result
# Imported so the default argument of build_component_list resolves before the
# inline definition of update_classpath further down in this listing.
from scrapy.utils.deprecate import update_classpath
def mustbe_deferred(f, *args, **kw):
    """Same as twisted.internet.defer.maybeDeferred, but delay calling
    callback/errback to next reactor loop
    """
    try:
        result = f(*args, **kw)
    # IgnoreRequest gets its own clause so the common "drop this request"
    # case is handled without inspecting the full traceback.
    except IgnoreRequest as e:
        return defer_fail(failure.Failure(e))
    except Exception:
        return defer_fail(failure.Failure())
    else:
        return defer_result(result)
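# A minimal usage sketch (hedged; `double` is a made-up function). Because
# mustbe_deferred delivers results on the *next* reactor iteration, callbacks
# attached immediately after the call are registered before anything fires,
# even when `f` returns synchronously:
#
#     def double(x):
#         return x * 2
#
#     d = mustbe_deferred(double, 21)
#     d.addCallback(print)   # prints 42 on the next reactor loop, not here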
def build_component_list(compdict, custom=None, convert=update_classpath):
    """Compose a component list from a { class: order } dictionary."""

    def _check_components(complist):
        if len({convert(c) for c in complist}) != len(complist):
            raise ValueError('Some paths in {!r} convert to the same object, '
                             'please update your settings'.format(complist))

    def _map_keys(compdict):
        if isinstance(compdict, BaseSettings):
            compbs = BaseSettings()
            for k, v in six.iteritems(compdict):
                prio = compdict.getpriority(k)
                if compbs.getpriority(convert(k)) == prio:
                    raise ValueError('Some paths in {!r} convert to the same '
                                     'object, please update your settings'
                                     ''.format(list(compdict.keys())))
                else:
                    compbs.set(convert(k), v, priority=prio)
            return compbs
        else:
            _check_components(compdict)
            return {convert(k): v for k, v in six.iteritems(compdict)}

    def _validate_values(compdict):
        """Fail if a value in the components dict is not a real number or None."""
        for name, value in six.iteritems(compdict):
            if value is not None and not isinstance(value, numbers.Real):
                raise ValueError('Invalid value {} for component {}, please '
                                 'provide a real number or None instead'
                                 ''.format(value, name))

    # Backward compatibility: `custom` used to be a plain list/tuple of
    # already-ordered component paths.
    if isinstance(custom, (list, tuple)):
        _check_components(custom)
        return type(custom)(convert(c) for c in custom)

    if custom is not None:
        compdict.update(custom)

    _validate_values(compdict)
    compdict = without_none_values(_map_keys(compdict))
    return [k for k, v in sorted(six.iteritems(compdict), key=itemgetter(1))]
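# Hedged example (the component paths are made up): entries map path -> order,
# None disables a component, and the result is sorted by ascending order value.
# >>> build_component_list({'mw.B': 200, 'mw.A': 100, 'mw.C': None})
# ['mw.A', 'mw.B']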
def update_classpath(path):
    """Update a deprecated path from an object with its new location"""
    for prefix, replacement in DEPRECATION_RULES:
        if path.startswith(prefix):
            new_path = path.replace(prefix, replacement, 1)
            warnings.warn("`{}` class is deprecated, use `{}` instead".format(path, new_path),
                          ScrapyDeprecationWarning)
            return new_path
    return path
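# Example, using one of the rules listed below (emits a
# ScrapyDeprecationWarning as a side effect):
# >>> update_classpath('scrapy.contrib.pipeline.images.ImagesPipeline')
# 'scrapy.pipelines.images.ImagesPipeline'
# >>> update_classpath('myproject.middlewares.Custom')  # no matching prefix
# 'myproject.middlewares.Custom'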
# Order matters: specific prefixes (e.g. 'scrapy.contrib.loader.processor.')
# must come before their more generic fallbacks (e.g. 'scrapy.contrib.').
DEPRECATION_RULES = [
    ('scrapy.contrib_exp.downloadermiddleware.decompression.',
     'scrapy.downloadermiddlewares.decompression.'),
    ('scrapy.contrib_exp.iterators.', 'scrapy.utils.iterators.'),
    ('scrapy.contrib.downloadermiddleware.', 'scrapy.downloadermiddlewares.'),
    ('scrapy.contrib.exporter.', 'scrapy.exporters.'),
    ('scrapy.contrib.linkextractors.', 'scrapy.linkextractors.'),
    ('scrapy.contrib.loader.processor.', 'scrapy.loader.processors.'),
    ('scrapy.contrib.loader.', 'scrapy.loader.'),
    ('scrapy.contrib.pipeline.', 'scrapy.pipelines.'),
    ('scrapy.contrib.spidermiddleware.', 'scrapy.spidermiddlewares.'),
    ('scrapy.contrib.spiders.', 'scrapy.spiders.'),
    ('scrapy.contrib.', 'scrapy.extensions.'),
    ('scrapy.command.', 'scrapy.commands.'),
    ('scrapy.dupefilter.', 'scrapy.dupefilters.'),
    ('scrapy.linkextractor.', 'scrapy.linkextractors.'),
    ('scrapy.telnet.', 'scrapy.extensions.telnet.'),
    ('scrapy.spider.', 'scrapy.spiders.'),
    ('scrapy.squeue.', 'scrapy.squeues.'),
    ('scrapy.statscol.', 'scrapy.statscollectors.'),
    ('scrapy.utils.decorator.', 'scrapy.utils.decorators.'),
    ('scrapy.spidermanager.SpiderManager', 'scrapy.spiderloader.SpiderLoader'),
]
def without_none_values(iterable):
    """Return a copy of `iterable` with all `None` entries removed.

    If `iterable` is a mapping, return a dictionary where all pairs that have
    value `None` have been removed.
    """
    try:
        return {k: v for k, v in six.iteritems(iterable) if v is not None}
    except AttributeError:
        return type(iterable)((v for v in iterable if v is not None))
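# Example: the mapping path and the generic-iterable fallback.
# >>> without_none_values({'a': 1, 'b': None})
# {'a': 1}
# >>> without_none_values([1, None, 2])
# [1, 2]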
"""
Return a callable object that fetches the given item(s) from its operand.
After f = itemgetter(2), the call f(r) returns r[2].
After g = itemgetter(2, 5, 3), the call g(r) returns (r[2], r[5], r[3])
"""
def _isiterable(possible_iterator):
    return hasattr(possible_iterator, '__iter__')


class SpiderMiddlewareManager(MiddlewareManager):

    component_name = 'spider middleware'

    @classmethod
    def _get_mwlist_from_settings(cls, settings):
        return build_component_list(settings.getwithbase('SPIDER_MIDDLEWARES'))

    def _add_middleware(self, mw):
        super(SpiderMiddlewareManager, self)._add_middleware(mw)
        if hasattr(mw, 'process_spider_input'):
            self.methods['process_spider_input'].append(mw.process_spider_input)
        if hasattr(mw, 'process_spider_output'):
            self.methods['process_spider_output'].insert(0, mw.process_spider_output)
        if hasattr(mw, 'process_spider_exception'):
            self.methods['process_spider_exception'].insert(0, mw.process_spider_exception)
        if hasattr(mw, 'process_start_requests'):
            self.methods['process_start_requests'].insert(0, mw.process_start_requests)
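    # Note the append vs. insert(0) asymmetry above: with two middlewares A
    # (order 100) and B (order 200), hypothetical names, the input hooks run
    # A then B (engine towards spider), while the output and exception hooks
    # run B then A (spider back towards engine).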
    # The four hook points a spider middleware may implement:
    #   process_spider_input, process_spider_output,
    #   process_spider_exception, process_start_requests
    def scrape_response(self, scrape_func, response, request, spider):
        fname = lambda f: '%s.%s' % (
            six.get_method_self(f).__class__.__name__,
            six.get_method_function(f).__name__)

        def process_spider_input(response):
            for method in self.methods['process_spider_input']:
                try:
                    result = method(response=response, spider=spider)
                    assert result is None, \
                        'Middleware %s must return None or ' \
                        'raise an exception, got %s ' \
                        % (fname(method), type(result))
                except Exception:
                    return scrape_func(Failure(), request, spider)
            return scrape_func(response, request, spider)

        def process_spider_exception(_failure):
            exception = _failure.value
            for method in self.methods['process_spider_exception']:
                result = method(response=response, exception=exception, spider=spider)
                assert result is None or _isiterable(result), \
                    'Middleware %s must return None or an iterable object, got %s ' % \
                    (fname(method), type(result))
                if result is not None:
                    return result
            return _failure

        def process_spider_output(result):
            for method in self.methods['process_spider_output']:
                result = method(response=response, result=result, spider=spider)
                assert _isiterable(result), \
                    'Middleware %s must return an iterable object, got %s ' % \
                    (fname(method), type(result))
            return result

        dfd = mustbe_deferred(process_spider_input, response)
        dfd.addErrback(process_spider_exception)
        dfd.addCallback(process_spider_output)
        return dfd
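    # Chain shape (sketch): process_spider_input runs first; any failure there
    # (or from the downstream scrape_func) flows to the errback side and
    # through the exception hooks; a successful (or recovered) iterable then
    # flows through the output hooks on its way back to the engine.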
    def process_start_requests(self, start_requests, spider):
        return self._process_chain('process_start_requests', start_requests, spider)
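
# A minimal spider middleware shape that the manager above would pick up
# (hedged illustration; the class name and filtering logic are made up):
#
#     class DropEmptyResultsMiddleware(object):
#         def process_spider_input(self, response, spider):
#             return None                        # must return None or raise
#
#         def process_spider_output(self, response, result, spider):
#             return (r for r in result if r)    # must return an iterable
#
# Enabled via settings, its order value slots it into the chains built by
# _add_middleware above:
#
#     SPIDER_MIDDLEWARES = {
#         'myproject.middlewares.DropEmptyResultsMiddleware': 543,
#     }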