该文章仅供学习,如有错误,欢迎指出
1.开始创建一个项目
mkdir lagou
2.进入到文件夹下创建python3的虚拟环境
pipenv install scrapy
3.进入pipenv 下使用scrapy命令创建爬虫项目
pipenv shell
scrapy startproject lagou
cd lagou
scrapy genspider -t crawl test www.lagou.com
Scrapy 为我们提供了四种模板,根据不同的网页类型,我们可以选用不同的爬虫模板,这里我们使用的是crawl模板
class TestSpider(CrawlSpider):
    """Spider produced by the ``crawl`` template, targeting www.lagou.com."""

    name = 'test'
    # allowed_domains expects bare domain names, not URLs: an entry that
    # carries a scheme ("https://...") is not a valid domain name.
    allowed_domains = ['www.lagou.com']
    start_urls = ['https://www.lagou.com/']

    rules = (
        # Links matching the ``allow`` pattern are passed to parse_item and,
        # because follow=True, their responses are searched for links again.
        # NOTE(review): r'Items/' is the template placeholder pattern;
        # replace it with a pattern that matches the target site's URLs.
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Build one item from a response matched by the rule.

        Returns a dict; it is empty until the extraction lines below are
        enabled and adapted.
        """
        i = {}
        # Template examples showing how fields could be extracted:
        #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        #i['name'] = response.xpath('//div[@id="name"]').extract()
        #i['description'] = response.xpath('//div[@id="description"]').extract()
        return i
#crawl模板代码
解释spider-crawl模板
class CrawlSpider(Spider):
    """Spider that extracts and follows links according to ``rules``."""

    # Rule objects for this spider; copied into self._rules by
    # _compile_rules() when the spider is instantiated.
    rules = ()

    def __init__(self, *a, **kw):
        super(CrawlSpider, self).__init__(*a, **kw)
        # Resolve callbacks given as method-name strings into callables.
        self._compile_rules()

    def parse(self, response):
        """Handle ``response`` with parse_start_url as the callback.

        Delegates to _parse_response with empty callback kwargs and link
        following requested.
        """
        return self._parse_response(response, self.parse_start_url, cb_kwargs={}, follow=True)

    def parse_start_url(self, response):
        """Hook called by parse(); returns an empty list in this base class."""
        return []

    def process_results(self, response, results):
        """Hook applied to every callback result; returns it unchanged here."""
        return results

    def _build_request(self, rule, link):
        """Create the Request for ``link``.

        ``rule`` is the index of the rule in self._rules (the caller passes
        the enumerate() counter); it is stored in the request meta together
        with the link text so _response_downloaded can find the rule again.
        """
        r = Request(url=link.url, callback=self._response_downloaded)
        r.meta.update(rule=rule, link_text=link.text)
        return r

    def _requests_to_follow(self, response):
        """Yield one processed request per new link extracted by the rules.

        Yields nothing when ``response`` is not an HtmlResponse.
        """
        if not isinstance(response, HtmlResponse):
            return
        # Shared across all rules, so a link already taken by an earlier rule
        # is filtered out for the later ones.
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response)
                     if lnk not in seen]
            # Optional per-rule hook that may filter or rewrite the links.
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = self._build_request(n, link)
                yield rule.process_request(r)

    def _response_downloaded(self, response):
        """Callback of rule-generated requests; dispatches to the rule."""
        # meta['rule'] holds the rule index stored by _build_request.
        rule = self._rules[response.meta['rule']]
        return self._parse_response(response, rule.callback, rule.cb_kwargs, rule.follow)

    def _parse_response(self, response, callback, cb_kwargs, follow=True):
        """Yield the callback's output, then the requests to follow.

        Generator. The callback part is skipped when ``callback`` is falsy;
        the link-following part runs only when both ``follow`` and
        self._follow_links are truthy.
        """
        if callback:
            # A falsy callback result (e.g. None) is replaced by an empty tuple.
            cb_res = callback(response, **cb_kwargs) or ()
            cb_res = self.process_results(response, cb_res)
            for requests_or_item in iterate_spider_output(cb_res):
                yield requests_or_item
        if follow and self._follow_links:
            for request_or_item in self._requests_to_follow(response):
                yield request_or_item

    def _compile_rules(self):
        """Copy ``rules`` into self._rules and resolve their hooks."""
        def get_method(method):
            # Callables pass through; strings are looked up as attributes of
            # this spider (None when missing). Anything else yields None.
            if callable(method):
                return method
            elif isinstance(method, six.string_types):
                return getattr(self, method, None)
        # Work on shallow copies so the Rule objects in ``rules`` are not
        # modified by the assignments below.
        self._rules = [copy.copy(r) for r in self.rules]
        for rule in self._rules:
            rule.callback = get_method(rule.callback)
            rule.process_links = get_method(rule.process_links)
            rule.process_request = get_method(rule.process_request)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider and read CRAWLSPIDER_FOLLOW_LINKS (default True)."""
        spider = super(CrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
        spider._follow_links = crawler.settings.getbool(
            'CRAWLSPIDER_FOLLOW_LINKS', True)
        return spider

    def set_crawler(self, crawler):
        """Attach the crawler and read CRAWLSPIDER_FOLLOW_LINKS (default True)."""
        super(CrawlSpider, self).set_crawler(crawler)
        self._follow_links = crawler.settings.getbool('CRAWLSPIDER_FOLLOW_LINKS', True)
start_urls定义了初始爬取的网页地址
(一)使用parse_start_url去对start_urls作处理
def parse_start_url(self, response):
    """Hook for handling ``response``; returns an empty list here."""
    return []
#源码里面返回一个空的列表,如果需要解析start_urls返回的response,可以在自己的爬虫里重写这个方法
(二)parse接收response,并把它连同parse_start_url(作为回调函数)一起交给_parse_response方法处理
这里的self.parse_start_url是回调函数;cb_kwargs={}不是回调函数,而是调用回调函数时传入的关键字参数(字典)。rule下的callback函数同样是通过_parse_response来调用的
rules = (
    # Links matching the ``allow`` pattern are handled by parse_item;
    # follow=True asks for links to be extracted from their responses too.
    Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
)
Rule是CrawlSpider模板爬取内容的规则。如果follow是False,那么爬虫只会爬取当前页面中符合allow的链接,不会再从这些链接返回的response里继续提取链接。
而如果follow是True,那么爬虫还会从这些response里继续提取符合rules的链接并跟进
rule无论有无callback,都由同一个_parse_response函数处理,只不过他会判断是否有follow和callback
对Rule的解释在文章下面
def parse(self, response):
    """Pass ``response`` to _parse_response with parse_start_url as callback.

    Callback kwargs are empty and link following is requested.
    """
    return self._parse_response(response, self.parse_start_url, cb_kwargs={}, follow=True)
(三)判断是否传入了回调函数(该回调函数可能是rule中的解析函数,也可能是parse_start_url函数)
只要有回调函数,就先调用它,再把返回结果交给process_results处理,得到cb_res
使用iterate_spider_output对cb_res进行迭代,逐个返回item或request;如果follow为True并且CRAWLSPIDER_FOLLOW_LINKS设置为True,那么会把response下的url继续跟进
def _parse_response(self, response, callback, cb_kwargs, follow=True):
    """Yield the callback's output, then the requests to follow.

    Generator. The callback part is skipped when ``callback`` is falsy; the
    link-following part runs only when both ``follow`` and
    self._follow_links are truthy.
    """
    if callback:
        # A falsy callback result (e.g. None) is replaced by an empty tuple.
        cb_res = callback(response, **cb_kwargs) or ()
        cb_res = self.process_results(response, cb_res)
        for requests_or_item in iterate_spider_output(cb_res):
            yield requests_or_item
    if follow and self._follow_links:
        for request_or_item in self._requests_to_follow(response):
            yield request_or_item
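(四)_requests_to_follow按每条rule从response中提取链接(用seen去重),经过process_links处理后为每个链接构造Request,再交给process_request处理并返回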
def _requests_to_follow(self, response):
    """Yield one processed request per new link extracted by the rules.

    Yields nothing when ``response`` is not an HtmlResponse.
    """
    if not isinstance(response, HtmlResponse):
        return
    # Shared across all rules, so a link already taken by an earlier rule is
    # filtered out for the later ones.
    seen = set()
    for n, rule in enumerate(self._rules):
        links = [lnk for lnk in rule.link_extractor.extract_links(response)
                 if lnk not in seen]
        # Optional per-rule hook that may filter or rewrite the links.
        if links and rule.process_links:
            links = rule.process_links(links)
        for link in links:
            seen.add(link)
            # n (the rule's index) is what gets attached to the request.
            r = self._build_request(n, link)
            yield rule.process_request(r)
对Rule的解释
class Rule(object):
    """One crawling rule: a link extractor plus how to handle its links."""

    # NOTE(review): ``identity`` is defined outside this excerpt; presumably
    # it returns its argument unchanged -- confirm against the Scrapy source.
    def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None, process_request=identity):
        self.link_extractor = link_extractor
        self.callback = callback
        # Use a fresh dict when cb_kwargs is missing or empty.
        self.cb_kwargs = cb_kwargs or {}
        self.process_links = process_links
        self.process_request = process_request
        # When follow is not given, it defaults to the opposite of having a
        # callback: True without a callback, False with one.
        if follow is None:
            self.follow = False if callback else True
        else:
            self.follow = follow
对模板的理解有些不足,之后会慢慢补