2019-01-25百度图片spider

import scrapy
import re
from ..items import BaiduspiderItem

class BaiduSpider(scrapy.Spider):
    """Spider that fetches the thumbnails referenced by a Baidu image search page.

    ``parse`` extracts every ``"thumbURL"`` value from the page text and
    schedules a request for each; ``parse_img`` stores the downloaded bytes
    in a ``BaiduspiderItem``.
    """

    name = 'baidu'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['http://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gb18030&word=%CD%BC%C6%AC&fr=ala&ala=1&alatpl=others&pos=0']

    def parse(self, response):
        """Yield one request per thumbnail URL found in the response text.

        The zero-based position of each match is forwarded in
        ``meta['index']`` so that ``parse_img`` can use it as the image name.
        """
        page_source = response.text
        matches = re.finditer('"thumbURL":"(.*?)"', page_source)
        for position, match in enumerate(matches):
            thumb_url = match.group(1)
            # The callback is given as a bound method: self.<method name>.
            yield scrapy.Request(
                url=thumb_url,
                meta={'index': position},
                callback=self.parse_img,
            )

    def parse_img(self, response):
        """Build an item from a downloaded image response and yield it.

        ``img_name`` is the index forwarded by ``parse``; ``img_content`` is
        the raw response body.
        """
        image_item = BaiduspiderItem()
        image_item['img_name'] = response.meta['index']
        # response.body is the raw bytes; response.text would be the decoded string.
        image_item['img_content'] = response.body
        yield image_item  # handed on to the item pipelines

你可能感兴趣的:(2019-01-25百度图片spider)