电影排行榜(requests + bs4 & scrapy)

一、requests、bs4


 注意:在 PyCharm 的终端中输入 pip install requests bs4 安装依赖包,然后在代码中导入模块

import requests, time, csv
from bs4 import BeautifulSoup

# Browser-like User-Agent sent with every request.
# The value is assembled from adjacent string literals: a backslash
# continuation placed *inside* the quotes would embed the indentation of the
# following source line into the header value as a run of spaces.
header = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 '
        'Safari/537.36 Core/1.77.119.400 QQBrowser/10.9.4817.400'
    ),
}

# Listing pages to fetch: ten URLs whose `start` offset advances in steps of 25.
urls = ['https://movie.douban.com/top250?start={}&filter='.format(i * 25) for i in range(10)]

# Collected [name, link] pairs, in the order they are encountered.
movie_directory = []

for url in urls:
    # A timeout keeps a stalled connection from hanging the script forever.
    res = requests.get(url, headers=header, timeout=10)
    # Fail loudly on an HTTP error instead of parsing an error page and
    # silently ending up with an empty result.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')
    for block in soup.find_all('div', class_='hd'):
        tag = block.find('a')
        title = tag.find(class_="title") if tag is not None else None
        if title is None:
            # Entry without the expected <a>/title markup: skip it rather
            # than crash with AttributeError on `.text`.
            continue
        name = title.text
        link = tag['href']
        print(name, link)
        movie_directory.append([name, link])

    # Pause between pages to avoid hammering the server.
    time.sleep(1.5)

print('数据爬取完成')

# Dump the collected [name, link] rows to a CSV file, header row first.
# newline='' leaves line-ending handling to the csv module.
with open('豆瓣电影TOP250.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['影名', '网址'])
    writer.writerows(movie_directory)
print('数据写入完成')

二、scrapy


注意:pycharm-终端输入-pip install scrapy下载安装包并导入模块

           import scrapy  

           scrapy startproject + 爬虫项目名称 创建项目。

           scrapy startproject scrapypython

           cd 爬虫项目名称 进入项目目录,然后用 scrapy genspider + 爬虫文件名 + 爬取数据域名 创建爬虫文件

           cd  scrapypython

           scrapy genspider douban movie.douban.com

           scrapy crawl + 爬虫文件名 执行爬取操作

           scrapy crawl douban


1.douban.py(爬虫文件)

import scrapy
from scrapypython.items import Movie

# Offset of the first listing page.
START = 0
# URLs of the ten listing pages; the `start` offset runs 0, 25, ..., 225.
LIST_URL = [
    'https://movie.douban.com/top250?start={}&filter='.format(offset)
    for offset in range(0, 250, 25)
]


class DoubanSpider(scrapy.Spider):
    """Spider that collects movie entries from the Douban Top 250 listing.

    The response for ``start_urls`` only acts as a trigger: ``parse`` does
    not read it and instead schedules every URL in ``LIST_URL``, each of
    which is handled by ``parse_movie``.
    """

    name = 'douban'
    allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/top250']

    def parse_movie(self, response):
        """Yield one ``Movie`` item for each ``div.item`` element in *response*."""
        for entry in response.css('div.item'):
            yield Movie(
                rank=entry.css('div.pic em::text').get(),
                name=entry.css('div.info>div.hd>a span.title::text').get(),
                link=entry.css('div.hd>a::attr(href)').get(),
                score=entry.css('div.star>span.rating_num::text').get(),
                quote=entry.css('div.bd>p.quote span.inq::text').get(),
            )

    def parse(self, response):
        """Schedule a request, handled by ``parse_movie``, per URL in ``LIST_URL``."""
        yield from (
            scrapy.Request(page_url, self.parse_movie) for page_url in LIST_URL
        )

2.items(爬取目标)

import scrapy

class Movie(scrapy.Item):
    """Item holding the fields recorded for a single movie."""

    rank = scrapy.Field()   # ranking position
    name = scrapy.Field()   # movie title
    link = scrapy.Field()   # link to the movie page
    score = scrapy.Field()  # rating score
    quote = scrapy.Field()  # short introduction / blurb

3.settings(爬取设置)

# Project identity and where Scrapy looks for / generates spiders.
BOT_NAME = 'scrapypython'

NEWSPIDER_MODULE = 'scrapypython.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]

# Honour the target site's robots.txt rules.
ROBOTSTXT_OBEY = True

# Feed export: write scraped items to a UTF-8 encoded CSV file.
FEED_FORMAT = 'csv'
FEED_URI = 'douban.csv'
FEED_EXPORT_ENCODING = 'utf-8'

# Headers attached to every request by default. The User-Agent is built from
# adjacent string literals; the resulting value is a single line with one
# space between tokens.
DEFAULT_REQUEST_HEADERS = {
    'authority': 'movie.douban.com',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 '
        'Safari/537.36 Core/1.94.169.400 QQBrowser/11.0.5130.400'
    ),
}

# AutoThrottle adjusts the crawl rate automatically, aiming for the given
# average number of parallel requests.
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_TARGET_CONCURRENCY = 10

电影排行榜(requests + bs4 & scrapy)_第1张图片

 

你可能感兴趣的:(练习,scrapy,python,数据分析)