scrapy爬取豆瓣电影

目标网站:https://movie.douban.com/top250
目标内容:
  • 电影名称
  • 电影信息
  • 电影评分
  • 电影引言(quote,可能为空)
输出结果:生成csv文件

首先进行 settings 配置:

# Parent of the directory that contains this settings file.
BASE_DIR = os.path.split(os.path.split(__file__)[0])[0]

# Browser-like User-Agent string used for the crawl.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; WOW64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/61.0.3128.0 Safari/537.36'
)

# Feed export: write the scraped items to douban.csv under BASE_DIR.
FEED_FORMAT = 'CSV'
FEED_URI = 'file:///{}'.format(os.path.join(BASE_DIR, 'douban.csv'))

Item设置

class DoubanMovieItem(scrapy.Item):
    """Container for the data scraped for a single movie."""

    # Movie title.
    title = scrapy.Field()
    # Descriptive text about the movie.
    movie_info = scrapy.Field()
    # Movie rating.
    star = scrapy.Field()
    # Quote attached to the movie; may be empty.
    quote = scrapy.Field()

爬虫编写

# Spider definition
class DoubantestSpider(Spider):
    """Crawl the Douban Top 250 list and yield one item per movie.

    Starts at the first list page and keeps following the "next page"
    link until a page no longer contains one.
    """

    name = 'doubantest'
    start_urls = ['https://movie.douban.com/top250']

    # parse() now resolves pagination links against the response URL, so this
    # attribute is no longer read here; it is kept so that any existing
    # reference to it keeps working.
    base_url = 'https://movie.douban.com/top250'

    def parse(self, response):
        """Extract every movie on a list page, then follow the next page.

        Args:
            response: The downloaded list page.

        Yields:
            A ``DoubanMovieItem`` for each movie entry on the page, followed
            by a ``Request`` for the next page when the page links to one.
        """
        for movie in response.xpath('//div[@class="info"]'):
            title = movie.xpath('div[@class="hd"]/a/span/text()').extract()
            movie_info = movie.xpath('div[@class="bd"]/p/text()').extract()
            # extract_first() with a default avoids an IndexError when an
            # entry has no rating or no quote.
            star = movie.xpath(
                'div[@class="bd"]/div[@class="star"]'
                '/span[@class="rating_num"]/text()'
            ).extract_first(default='')
            quote = movie.xpath(
                'div[@class="bd"]/p[@class="quote"]/span/text()'
            ).extract_first(default='')

            # Build a fresh item for every movie and yield it inside the
            # loop; reusing a single item and yielding after the loop would
            # emit only the last movie of each page.
            douban_movie_item = DoubanMovieItem()
            douban_movie_item['title'] = ''.join(title)
            douban_movie_item['movie_info'] = ';'.join(movie_info)
            douban_movie_item['star'] = star
            douban_movie_item['quote'] = quote
            yield douban_movie_item

        # Follow the next page. This must live inside parse(): it needs
        # `response` and `self`, and `yield` is only valid inside a function.
        next_link = response.xpath(
            '//span[@class="next"]/link/@href'
        ).extract_first()
        if next_link:
            # urljoin handles both relative ("?start=25&filter=") and
            # absolute hrefs, unlike plain string concatenation.
            yield Request(response.urljoin(next_link), callback=self.parse)

运行 main.py(或在项目目录下执行 `scrapy crawl doubantest`),结果保存在:

douban.csv

你可能感兴趣的:(scrapy爬取豆瓣电影)