使用Scrapy框架来抓取排行前100的猫眼电影信息

1、首先创建好爬虫项目和爬虫类(下面为 maoyanmovie.py 爬虫类);编写 XPath 表达式时,可以先用 scrapy shell 进行调试。

# -*- coding: utf-8 -*-
import scrapy
import json


class MaoyanmovieSpider(scrapy.Spider):
    """Scrape the Maoyan movie board and append each movie to ``movie.json``.

    The spider requests ten board pages (``offset`` 0, 10, ..., 90) and, for
    every movie entry found, writes one JSON object per line to
    ``movie.json`` in the current working directory.
    """

    name = 'maoyanmovie'
    # ``allowed_domains`` takes bare domain names, not full URLs.
    allowed_domains = ['maoyan.com']
    # Built with a comprehension so that no helper names (and no shadowed
    # ``list`` builtin) leak into the class namespace as attributes.
    start_urls = ['https://maoyan.com/board/4?offset={}'.format(page * 10)
                  for page in range(0, 10)]
    print(start_urls)

    @staticmethod
    def _clean(fragments, label):
        """Join text fragments, trim whitespace and drop the ``label`` prefix.

        Both the ASCII and the full-width colon are accepted after the label.
        NOTE(review): the live page presumably uses the full-width colon --
        confirm against a real response.
        """
        text = ''.join(fragments).strip()
        for colon in (':', '：'):
            text = text.replace(label + colon, '')
        return text

    def parse(self, response):
        """Extract every movie on one board page and append it to movie.json.

        :param response: the Scrapy response for one board page.
        :return: ``None``; results are printed and written to ``movie.json``.
        """
        boards = response.xpath('//*[@id="app"]/div/div/div[1]/dl')
        if not boards:
            # Layout changed or an anti-bot page was served: skip instead of
            # crashing with IndexError.
            self.logger.warning('No movie list found on %s', response.url)
            return

        lines = []
        # Extract per <dd> so that a movie with a missing field cannot shift
        # the fields of the movies after it (which zipping five independent
        # lists would do).
        # Assumes each <dd> wraps exactly one movie's markup -- TODO confirm.
        for entry in boards[0].xpath('./dd'):
            info = entry.xpath('.//div[@class="movie-item-info"]')
            score_box = entry.xpath(
                './/div[@class="movie-item-number score-num"]/p[@class="score"]')

            title = info.xpath('./p[@class="name"]/a/@title').extract_first(default='')
            actor = self._clean(
                info.xpath('./p[@class="star"]/text()').extract(), '主演')
            releasetime = self._clean(
                info.xpath('./p[@class="releasetime"]/text()').extract(), '上映时间')
            # The score is split into an integer part and a fraction part.
            score = (
                score_box.xpath('./i[@class="integer"]/text()').extract_first(default='')
                + score_box.xpath('./i[@class="fraction"]/text()').extract_first(default='')
            )
            rank = entry.xpath('./i[1]/text()').extract_first(default='')

            print(title + '----' + actor + '----' + releasetime + '---' + score + '---' + rank)

            # A fresh dict per movie; nothing is shared between iterations.
            item = {
                '电影名称': title,
                '演员': actor,
                '上映时间': releasetime,
                '评分': score,
                '排行榜': rank,
            }
            # NOTE(review): the trailing comma is kept for compatibility with
            # the existing output; the file as a whole is therefore not a
            # strictly valid JSON document.
            lines.append(json.dumps(item, ensure_ascii=False) + ',' + '\n')

        with open('movie.json', 'a', encoding='utf-8') as f:
            f.writelines(lines)
  

2、对settings.py进行一系列变量设置

# Project name; generated when the Scrapy project was created.
BOT_NAME = 'maoyan'

# Spider module locations; both were generated with the project.
NEWSPIDER_MODULE = 'maoyan.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]

# User-Agent string for outgoing requests.
USER_AGENT = 'maoyan (+http://www.yourdomain.com)'

# Respect robots.txt rules.
ROBOTSTXT_OBEY = True

# Project middlewares and their order values.
SPIDER_MIDDLEWARES = {
    'maoyan.middlewares.MaoyanSpiderMiddleware': 543,
}
DOWNLOADER_MIDDLEWARES = {
    'maoyan.middlewares.MaoyanDownloaderMiddleware': 543,
}

注意(编辑器快捷键,例如 PyCharm 的默认键位):复制当前行按 Ctrl+D,删除当前行按 Ctrl+Y。

你可能感兴趣的:(使用Scrapy框架来抓取排行前100的猫眼电影信息)