scrapy snippet

1. Spider file

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

# parse callback of the spider (the name parse_item is illustrative):
# collect every <img> src on the page and hand the URLs to the
# ImagesPipeline via item['image_urls']
def parse_item(self, response):
    hxs = HtmlXPathSelector(response)
    item = DomzItem()  # item class from the project's items.py, see section 3
    image_urls = hxs.select('//img/@src').extract()
    # the snippet assumes protocol-relative //... src values, so prepend the scheme
    item['image_urls'] = ["http:" + x for x in image_urls]
    return item
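
The imports above also bring in Rule and SgmlLinkExtractor, which the snippet never uses. A minimal sketch of how a CrawlSpider typically wires them to the parse routine above (class name, domain and the empty allow pattern are assumptions, not from the original):

class ImageCrawlSpider(CrawlSpider):
    name = 'imagecrawl'                    # assumed spider name
    allowed_domains = ['example.com']      # assumed domain
    start_urls = ['http://example.com/']

    rules = (
        # follow every extracted link and pass each response to parse_item above
        Rule(SgmlLinkExtractor(allow=()), callback='parse_item', follow=True),
    )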

 

class MySpider(CrawlSpider):
    # throttle the crawl: wait 2 seconds between requests
    name = 'myspider'
    download_delay = 2
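
The same throttling can also be set project-wide instead of per spider; a small settings.py sketch (the 2-second value just mirrors the attribute above):

# settings.py -- project-wide equivalent of the per-spider download_delay attribute
DOWNLOAD_DELAY = 2
# keep the default randomization (0.5x-1.5x of the delay) so requests look less regular
RANDOMIZE_DOWNLOAD_DELAY = True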

 

$ scrapy crawl somespider -s JOBDIR=crawls/somespider-1
# once the crawl is running, it can be stopped with Ctrl + C;
# to resume the download, run the same command again
$ scrapy crawl somespider -s JOBDIR=crawls/somespider-1
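
If passing -s on every run is inconvenient, the job directory can also live in settings.py; a sketch (the directory name mirrors the command above, and each spider/run needs its own directory):

# settings.py -- persistent job state, equivalent to -s JOBDIR=...
JOBDIR = 'crawls/somespider-1'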

 

 

name = "wikipedia"

allowed_domains = ["wikipedia.org"]

start_urls = [

  "http://en.wikipedia.org/wiki/Pune"

]
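
With name, allowed_domains and start_urls defined, the spider is launched by its name (assuming it sits in the project's spiders module):

$ scrapy crawl wikipedia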

 

 

2. Settings file

# enable the built-in image pipeline (pre-1.0 contrib path)
ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline']
# filesystem directory where the downloaded images are stored
IMAGES_STORE= '...'
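
These snippets target the pre-1.0 scrapy.contrib layout; in Scrapy 1.0+ the pipeline moved and ITEM_PIPELINES became a dict mapping class paths to priorities. A sketch of the newer form (the storage path is a placeholder, not from the original):

# settings.py for Scrapy >= 1.0
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = '/path/to/images'   # placeholder directory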

 

3. Item file

from scrapy.item import Item, Field

class DomzItem(Item):
    # image_urls is filled by the spider; images is filled by the pipeline
    image_urls = Field()
    images = Field()
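
After the ImagesPipeline runs, the images field holds one dict per downloaded file (its url, its path under IMAGES_STORE, and a checksum). A hypothetical follow-up pipeline that just reports what was stored (process_item is the standard pipeline interface; the logging itself is an assumption):

class LogDownloadedImages(object):
    # a later entry in ITEM_PIPELINES sees the item after the images were stored
    def process_item(self, item, spider):
        for img in item.get('images', []):
            spider.log("saved %s -> %s" % (img['url'], img['path']))
        return item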

 
