爬取豆瓣书籍分类标签

# -*- coding: cp936 -*-
from scrapy.spider import Spider  
from scrapy.selector import Selector
from zhihu.items import question
from zhihu.settings import printhxs
from scrapy.selector import HtmlXPathSelector
class QuestionSpider(Spider):
    """Spider that prints the tag links found on the Douban book tag index."""

    name = "q"
    start_urls = [
        "http://book.douban.com/tag/"
        ]

    def parse(self, response):
        """Print the href of every tag link inside the ``tagCol`` tables.

        Args:
            response: the downloaded tag index page.

        Nothing is returned, so this callback produces no items and no
        follow-up requests; it only writes the extracted links to stdout.
        """
        print("+++++++++++++++++++++++++")
        sel = Selector(response)
        # The href attribute lives on the <a> inside each cell, not on the
        # <td> itself. Using the descendant axis below the table also matches
        # whether or not the markup has an explicit <tbody> around the rows.
        url = sel.xpath(
            '//div/div/div/div/table[@class="tagCol"]//td/a/@href'
        ).extract()
        print(url)
        print("+++++++++++++++++++++++++")
# -*- coding: utf-8 -*-
from scrapy.spider import Spider  
from scrapy.selector import Selector
from qunawang.items import question
from qunawang.settings import printhxs
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
class QuestionSpider(Spider):
    """Spider that crawls one Douban book tag listing and its book pages.

    ``parse`` handles the paginated listing pages in ``start_urls`` and
    schedules one request per book link; ``parse2`` turns each book page
    into a ``question`` item.
    """

    name = "douban"
    start_urls = []
    u1 = 'http://book.douban.com/tag/小说?start='
    u2 = '&type=T'
    # Build 50 listing URLs; the start offset grows in steps of 20.
    for i in range(50):
        url = u1 + str(i*20) + u2
        start_urls.append(url)

    @staticmethod
    def _first(values):
        """Return the first element of *values*, or None when it is empty."""
        return values[0] if values else None

    def parse2(self, response):
        """Build a ``question`` item from a single book detail page.

        Args:
            response: the downloaded book page.

        Returns:
            The populated item. ``title``, ``grade`` and ``views`` hold the
            first matching text node, or None when the page has no match;
            ``index`` holds the full list of matching text nodes.
        """
        sel = Selector(response)
        item = question()

        # A page may lack any of these nodes; fall back to None instead of
        # raising IndexError and losing the whole item.
        item['title'] = self._first(
            sel.xpath('//div/h1/span/text()').extract())
        item['grade'] = self._first(
            sel.xpath('//div/div/p/strong/text()').extract())
        item['views'] = self._first(
            sel.xpath('//div/div/p/span/a/span/text()').extract())

        item['index'] = sel.xpath(
            '//div/div[@class="indent"]/span/a/text()').extract()

        return item

    def parse(self, response):
        """Yield a request for every book link on a listing page.

        Args:
            response: the downloaded listing page.

        Yields:
            One ``Request`` per extracted link, handled by ``parse2``.
        """
        sel = Selector(response)
        urls = sel.xpath('//ul[@class="subject-list"]/li[@class="subject-item"]/div[@class="info"]/h2/a/@href').extract()
        for url in urls:
            print(url)
            yield Request(url, callback=self.parse2)

Scrapy研究探索(七)——如何防止被ban之策略大集合

你可能感兴趣的:(爬取豆瓣书籍分类标签)