这里直接看代码吧!自己还在学习过程中,可能会有错误,请大家见谅。
scrapy startproject doubanmovie
下面是 settings.py 文件的内容,它是 Scrapy 项目的全局配置文件。
# settings.py — project-wide Scrapy configuration.

# Export feeds as UTF-8; without this, Chinese text comes out garbled.
FEED_EXPORT_ENCODING = 'UTF-8'
# Send a real browser User-Agent; some sites reject Scrapy's default with HTTP 403.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
# Enable the item pipeline that stores scraped items (lower number = earlier in the chain).
ITEM_PIPELINES = {
    'doubanmovie.pipelines.DoubanmoviePipeline': 300,
}
下面是 items.py 文件,在这里定义要爬取的字段;Item 对象以类似字典的形式存储数据。
import scrapy


class DoubanmovieItem(scrapy.Item):
    """Container for one scraped Douban Top-250 movie entry."""

    # Fields extracted by the spider for each movie.
    movie_name = scrapy.Field()  # movie title
    info = scrapy.Field()        # short description / tagline
    ranking = scrapy.Field()     # position in the Top-250 list
在spiders目录下创建自己的spider文件,自定义文件名。
from scrapy.spiders import CrawlSpider,Spider
from doubanmovie.items import DoubanmovieItem
import logging
class DoubanmovieSpider(Spider):
    """Crawl the Douban Movie Top 250 list and yield one item per movie.

    Fixes over the original:
    - renamed from ``Spider`` so the class no longer shadows the imported
      base class of the same name;
    - ``allowed_domains`` (the correct Scrapy attribute name, as a list)
      replaces the misspelled ``allow_domains`` string, which Scrapy
      silently ignored.
    """

    name = "doubanmovie"  # spider name; must be unique within the project
    allowed_domains = ["movie.douban.com"]

    # The list is paginated 25 movies per page: start=0, 25, ..., 225.
    start_urls = [
        "https://movie.douban.com/top250?start={}&filter=".format(n)
        for n in range(0, 250, 25)
    ]

    def parse(self, response):
        """Extract name, description and ranking from each <li> on a list page."""
        for line in response.xpath('//ol[@class="grid_view"]/li'):
            item = DoubanmovieItem()
            # .extract() returns a (possibly empty) list of strings; the
            # pipeline handles the empty case.
            item["movie_name"] = line.xpath('div/div[2]/div[1]/a/span[1]/text()').extract()
            item["info"] = line.xpath('div/div[2]/div[2]/p[2]/span/text()').extract()
            item["ranking"] = line.xpath('div/div[1]/em/text()').extract()
            yield item

    def close(spider, reason):
        # Called by Scrapy when the spider closes.  Mirrors the base
        # Spider.close(spider, reason) signature, which takes no explicit self.
        # (Original message had a typo: 但 -> 当.)
        print("当spider关闭时,执行这个函数")

    def log(self, message, level=logging.DEBUG, **kw):
        # Demo override: print instead of routing through the logging system.
        print("log函数", message)
# -*- coding: utf-8 -*-
# pipelines.py — stores scraped items into MySQL.
import pymysql
import ssl
from functools import wraps    # NOTE(review): appears unused below — confirm before removing
from datetime import datetime  # NOTE(review): appears unused below — confirm before removing
# Globally disable HTTPS certificate verification for urllib-based requests.
# Presumably a workaround for local SSL errors — this is insecure; confirm it
# is still needed.
ssl._create_default_https_context = ssl._create_unverified_context
class DoubanmoviePipeline:
    """Persist scraped Douban movie items into the MySQL table ``douban_movie``.

    Fixes over the original:
    - parameterized SQL instead of ``str.format`` (quotes in the data no
      longer break the statement, so the manual ``'`` -> ``''`` escaping
      hack is gone, and injection is prevented);
    - ``process_item`` returns the item, as the Scrapy pipeline contract
      requires so later pipelines receive it;
    - the cursor is closed before the connection, not after;
    - keyword arguments to ``pymysql.Connect`` (recent PyMySQL versions do
      not accept positional connection arguments).
    """

    def __init__(self):
        # NOTE(review): credentials are hard-coded; better to read them from
        # settings.py / environment variables.
        self.conn = pymysql.Connect(
            host='127.0.0.1',
            user='root',
            password='12345678',
            database='test',
            port=3306,
            charset='utf8',
        )
        self.cursor = self.conn.cursor()

    def open_spider(self, spider):
        # Called once when the spider starts.
        print("爬虫启动")

    def process_item(self, item, spider):
        """Insert one item; roll back and log on any database error.

        Each field arrives as a list of extracted strings; fall back to a
        neutral default ("" or 0) when the page yielded nothing for it.
        """
        movie_name = item["movie_name"][0] if item["movie_name"] else ""
        movie_info = item["info"][0] if item["info"] else ""
        ranking = int(item["ranking"][0]) if item["ranking"] else 0
        sql = "insert into douban_movie (movie_name,movie_info,ranking) values (%s,%s,%s)"
        try:
            self.cursor.execute(sql, (movie_name, movie_info, ranking))
            self.conn.commit()
        except Exception as e:
            self.conn.rollback()
            print(e)
        return item

    def close_spider(self, spider):
        # Called once when the spider finishes; release DB resources
        # (cursor first, then the connection that owns it).
        self.cursor.close()
        self.conn.close()
        print("爬虫关闭")
scrapy crawl doubanmovie # 这里的doubanmovie是自定义spider的name