scrapy ------ 爬取豆瓣电影TOP250

转载自 —> 原文

#items.py
# -*- coding: utf-8 -*-
import scrapy

class DoubanMovieItem(scrapy.Item):
    ranking = scrapy.Field()        #排名
    movie_name = scrapy.Field()     #电影名称
    score = scrapy.Field()          #评分
    score_num = scrapy.Field()      #评论人数
#douban_spider.py
#-*- coding:utf-8 -*-

from scrapy.spider import Spider
from scrapyspider.items import DoubanMovieItem
import scrapy


class DoubanMovieTop250spider(Spider):
    name = 'douban_movie_top250'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    }

    def start_requests(self):
        url = 'https://movie.douban.com/top250'
        yield scrapy.Request(url, headers=self.headers)

    def parse(self,response):
        item = DoubanMovieItem()
        movies = response.xpath('//ol[@class="grid_view"]/li')
        for movie in movies:
            item['ranking'] = movie.xpath('.//div[@class="pic"]/em/text()').extract()[0]
            item['movie_name'] = movie.xpath('.//div[@class="hd"]/a/span[1]/text()').extract()[0]
            item['score'] = movie.xpath('.//div[@class="star"]/span[@class="rating_num"]/text()').extract()[0]
            item['score_num'] = movie.xpath('.//div[@class="star"]/span[4]/text()').extract()[0]
            yield item

        next_url = response.xpath('//span[@class="next"]/a/@href').extract()  #获取下一页链接
        if next_url:
            next_url = 'https://movie.douban.com/top250' + next_url[0]
            yield scrapy.Request(next_url, headers=self.headers)

你可能感兴趣的:(python,笔记)