mm131 crawler (Scrapy)

Basic Scrapy usage

1. Install
pip install scrapy
2. Create a new project
scrapy startproject <project_name>
3. Create a spider under the spiders directory (the two commonly used variants)
  1. Initialize a regular spider
scrapy genspider <name> <domain>
  2. Initialize a spider with stronger link-following ability (CrawlSpider)
scrapy genspider -t crawl <name> <domain>
4. Main project files and their roles (the generated layout is shown after this list)
file         function
settings     basic configuration and feature switches
items        data storage structure (fields declared with scrapy.Field())
middlewares  middleware, for processing requests/responses in flight
pipelines    pipelines, where the scraped data is actually consumed (download, JSON, MySQL, and so on)
5. Notes
  1. scrapy.Request(url=url, callback=self.parse_list) supports callbacks, but query parameters have to be appended to the URL by hand; there is no equivalent of requests.get(params={}) (see the sketch after this list).
  2. LinkExtractor(allow=r'', tags=(), attrs=()): the most commonly used argument is allow, a regex for links to accept; deny is its opposite. tags selects which tags to pull links from (default: a, area) and attrs which attributes to read them from (default: href).
  3. LinkExtractor() needs .extract_links(response) to return the matched links; each returned object is a Link.
  4. Rule takes start_urls as its starting requests, fetches each page in turn, runs the LinkExtractor over the response, passes every extracted link to the callback if one is given, and keeps following matched links recursively until no page yields any new matches.
  5. Data in this project is stored mainly in MySQL.
  6. MySQL string-quoting pitfalls (see the pymysql sketch after this list):
1 - In CREATE TABLE, identifiers cannot be wrapped in ' or "; only the backtick ` (the key above Tab) works.
2 - Later statements may use ' or ": for example in INSERT INTO ... (keys) VALUES (values), the keys need no quoting at all unless a column name collides with a keyword, in which case wrap it in backticks, while the values may be quoted or not (numbers work either way); quoting all values consistently is recommended.
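
For reference, this is the standard layout that scrapy startproject mm131 generates; the files listed under point 4 all live in the inner mm131 package:

mm131/
    scrapy.cfg
    mm131/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py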
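
A minimal sketch of notes 1-4. The spider name, the URL regexes, and the callback names are illustrative placeholders only, not part of the project code below: it shows query parameters appended to a Request URL by hand (note 1), a Rule that keeps following matched links (note 4), and LinkExtractor used directly, with extract_links() returning Link objects (notes 2-3).

from urllib.parse import urlencode

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ExampleSpider(CrawlSpider):
    name = 'example'
    start_urls = ['https://m.mm131.net/']

    # Note 4: the Rule runs its LinkExtractor over every fetched page, sends
    # each matched link to the callback, and keeps following matches recursively.
    rules = (
        Rule(LinkExtractor(allow=r'/xinggan/\d+\.html'), callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        # Note 1: scrapy.Request has no params= argument, so the query string
        # is built and appended to the URL manually.
        query = urlencode({'page': 2})
        yield scrapy.Request(
            url='https://m.mm131.net/more.php?' + query,
            callback=self.parse_list,
        )

        # Notes 2-3: LinkExtractor can also be used directly; extract_links()
        # returns Link objects (link.url, link.text, ...).
        extractor = LinkExtractor(allow=r'\d+\.html', tags=('a',), attrs=('href',))
        for link in extractor.extract_links(response):
            yield scrapy.Request(url=link.url, callback=self.parse_list)

    def parse_list(self, response):
        pass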
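
And a small pymysql sketch of note 6. The table demo_info and its columns are made up for illustration; the connection settings simply mirror the ones in settings.py further down. The last INSERT shows the parameterized form, which lets the driver handle all of the quoting.

import pymysql

# Assumed connection settings, mirroring settings.py below.
conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='root', charset='utf8')
cur = conn.cursor()
cur.execute('create database if not exists mm131')
cur.execute('use mm131')

# Note 6.1: identifiers in CREATE TABLE take backticks, never ' or ".
cur.execute("""
create table if not exists `demo_info` (
    `id` int primary key auto_increment,
    `title` varchar(40),
    `time` timestamp
)
""")

# Note 6.2: plain column names need no quoting; columns that clash with a
# keyword (the pipeline below treats `order` and `time` this way) get
# backticks, and string values may use ' or ".
cur.execute("insert into demo_info (title, `time`) values ('some title', '2021-01-01 00:00:00')")

# Parameterized queries sidestep the quoting question entirely.
cur.execute("insert into demo_info (title, `time`) values (%s, %s)",
            ('another title', '2021-01-02 00:00:00'))

conn.commit()
cur.close()
conn.close()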

Implementation source code

Web page crawling

image_info.py

from scrapy import Spider, Request
from lxml import etree
import requests, re
from mm131.items import Mm131DetailItem
from scrapy.utils.project import get_project_settings
from pymysql import connect
from urllib import parse
settings = get_project_settings()


class ImageInfoSpider(Spider):
    name = 'image_info'
    # allowed_domains entries are bare domain names, not full URLs
    allowed_domains = ['m.mm131.net']
    start_url = 'https://m.mm131.net/'
    list_api = 'https://m.mm131.net/more.php'
    db = 'mysql'
    table = 'image_info'

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.conn = self.mysql_connect_init()
        self.cur = self.conn.cursor()
    
    def mysql_init(self):
        self.mysql_db_init()
        self.mysql_table_init()
        self.mysql_init_close()
    
    @staticmethod
    def mysql_connect_init():
        return connect (
            host=settings.get("MYSQL_HOST"),
            port=settings.get("MYSQL_PORT"),
            user=settings.get("MYSQL_USER"),
            password=settings.get("MYSQL_PSD"),
            charset=settings.get("MYSQL_CHARSET")
        )

    def mysql_db_init(self):
        db = settings.get("MYSQL_DB")
        self.cur.execute("""
        create database if not exists {}
        """.format(db))
        self.cur.execute("""
        use {};
        """.format(db))
    
    def mysql_table_init(self):
        self.cur.execute("""
        create table if not exists `{}` (
            `page` smallint,
            `title` varchar(40),
            `time` timestamp,
            `type` varchar(20),
            `label` varchar(50),
            `avid` int PRIMARY KEY,
            `url` varchar(50),
            `type_en` varchar(20),
            `id` int unique auto_increment,
            `update` datetime
        ) ENGINE=InnoDB DEFAULT CHARSET={};
        """.format(
            self.table, 
            settings.get("MYSQL_CHARSET")
        ))
        self.conn.commit()
    
    def mysql_init_close(self):
        self.cur.close()
        self.conn.close()
    
    @staticmethod
    def gbk(req):
        return req.content.decode('gbk')

    def get_list_info(self, page):
        # fetch one page of the "more" list API and decode it from gbk
        cont = self.gbk(
            requests.get(
                url=self.list_api,
                headers={'referer': self.start_url},
                params={'page': page}
            )
        )
        return (cont, self.has_page(cont))
    
    def get_list_url(self, type_url, type_num, page):
        # join the category URL (e.g. https://m.mm131.net/xinggan/) with the
        # paginated list file name
        return parse.urljoin(
            type_url,
            "list_{}_{}.html".format(type_num, page)
        )
    
    @staticmethod
    def has_page(cont):
        return not (cont.strip() == '')

    @staticmethod
    def parse_list_urls(cont):
        parser = etree.HTML(cont)
        return parser.xpath('//article/div[contains(@class, "post-content")]/a/@href')

    def parse_url_info(self, url):
        parts = url.split('/')
        return (parts[3], re.search(r'\d+', parts[4]).group(0))
    
    @staticmethod
    def parse_detail_page(cont):
        items = {}
        parser = etree.HTML(cont)
        page_re = re.search(r'\d+', parser.xpath('//*[@id="content"]/article/div[4]/span/text()')[0].split('/')[1])
        items['page'] = page_re.group(0) if page_re else 0
        items['title'] = parser.xpath('//*[@id="content"]/article/div[1]/h2/text()')[0]
        items['time'] = parser.xpath('//*[@id="content"]/article/div[1]/div/span/text()')[0]
        items['type'] = parser.xpath('//*[@id="content"]/article/div[5]/span[1]/a/text()')[0]
        items['label'] = '/'.join(parser.xpath('//*[@id="content"]/article/div[5]/span[2]//a/text()'))
        return items

    def parse_nav(self):
        items = []
        parser = etree.HTML(
            self.gbk(
                requests.get(self.start_url)
            )
        )
        nav_items = parser.xpath("""
        //nav[@class='slide-menu']/ul/li[@class='dropdown']
        /ul//li[contains(@class,'cat-item')]
        """)
        for nav in nav_items:
            try:
                type_url = nav.xpath("a/@href")[0]
                if type_url.find('app') == -1:
                    type_name = nav.xpath("a/text()")[0]
                    items.append(
                        (
                            type_name, 
                            type_url, 
                            type_url.split('/')[3]
                        )
                    )
            except IndexError as e:
                continue
        return items
    
    def parse_list_info(self, url):
        parser = etree.HTML(
            self.gbk(
                requests.get(url)
            )
        )
        content = parser.xpath("//content[@id='content']")[0]
        pages = content.xpath("nav//span[@id='spn']/text()")[0]
        page_num = int(re.findall(r"\d+", pages)[1])
        types = content.xpath("nav//a[@id='xbtn']/@href")[0]
        type_num = int(re.findall(r"\d+", types)[0])
        return (
            page_num, 
            type_num
        )

    # alternative entry point: walk the nav categories and their list pages
    # (not wired into start_requests, which uses the more.php API instead)
    def navs_requests(self):
        navs = self.parse_nav()
        for nav in navs:
            (pnum, tnum) = self.parse_list_info(nav[1])
            for i in range(1, pnum+1):
                url = self.get_list_url(nav[1], tnum, i)
                urls = self.parse_list_urls(
                    self.gbk(
                        requests.get(url)
                    )
                )
                for url in urls:
                    yield Request(url=url)

    def start_requests(self):
        self.mysql_init()
        page = 1
        while True:
            (cont, stat) = self.get_list_info(page)
            if not stat: break
            urls = self.parse_list_urls(cont)
            for url in urls:
                yield Request(url=url)
            page += 1

    def parse(self, response):
        info = self.parse_detail_page(response.text)
        (avtype, avid) = self.parse_url_info(response.request.url)
        item = Mm131DetailItem()
        item['url'] = response.request.url
        item['avid'] = avid
        item['type_en'] = avtype
        item['page'] = info['page']
        item['title'] = info['title']
        item['time'] = info['time']
        item['type'] = info['type']
        item['label'] = info['label']
        yield item

Downloading a single image set from its detail pages

crawl_image_info.py

import re
from os.path import join

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from mm131.items import Mm131DownloadImageItem


class CrawlImageInfoSpider(CrawlSpider):
    name = 'crawl_image_info'
    allowed_domains = ['m.mm131.net']
    headers = {'referer': 'https://m.mm131.net'}
    file_type = 'img'
    start_urls = []

    def __init__(self, url, path, suffix='jpg', *a, **kw):
        super().__init__(*a, **kw)
        self.path = path
        self.img_suffix = suffix
        self.task_init(url)

    @staticmethod
    def get_url_info(url):
        items = url.split('/')
        info = re.findall(r'\d+', items[4])
        page = 1 if len(info) == 1 else info[1]
        return (items, info, page)

    def task_init(self, url):
        try:
            (items, info, page) = self.get_url_info(url)
            self.img_url_model = '{}//{}/'.format(items[0], items[2]) + '{}/{}_{}.html'
            self.type = items[3]
            self.id = info[0]
            self.start_urls.append(url)
        except (IndexError, AttributeError):
            raise ValueError('url must be a gallery detail-page URL so its type and id can be parsed')

    def parse_start_url(self, response):
        link = LinkExtractor(allow=r'{}'.format(self.img_url_model.format(self.type, self.id, r'\d+')))
        links = link.extract_links(response)
        stat = self.parse_image(response)
        if stat: yield stat
        for link in links:
            next_url = link.url
            yield scrapy.Request(url=next_url, callback=self.parse_start_url)

    def parse_image(self, response):
        try:
            url = response.request.url
            item = Mm131DownloadImageItem()
            item['src'] = response.xpath('//*[@id="content"]/article/div[3]/a/img/@src').extract()[0]
            item['path'] = join(self.path, '{}.{}'.format(self.get_url_info(url)[2], self.img_suffix))
            return item
        except IndexError as ie:
            return False

Item storage classes

items.py

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class Mm131DetailItem(scrapy.Item):
    page = scrapy.Field()
    title = scrapy.Field()
    time = scrapy.Field()
    type = scrapy.Field()
    label = scrapy.Field()
    avid = scrapy.Field()
    type_en = scrapy.Field()
    url = scrapy.Field()

class Mm131DownloadImageItem(scrapy.Item):
    src = scrapy.Field()
    path = scrapy.Field()

Pipelines (MySQL & image download)

pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.utils.project import get_project_settings
import pymysql, requests
settings = get_project_settings()

class MysqlPipeLine:
    host = settings.get("MYSQL_HOST")
    port = settings.get("MYSQL_PORT")
    user = settings.get("MYSQL_USER")
    psd = settings.get("MYSQL_PSD")
    charset = settings.get("MYSQL_CHARSET")
    db = settings.get("MYSQL_DB")
    table_status = False
    db_status = False
    run_status = False
    table = None

    def __init__(self):
        self.connection = pymysql.connect(
            host=self.host,
            port=self.port,
            user=self.user,
            password=self.psd,
            charset=self.charset)
        self.cursor = self.connection.cursor()
        self.check_db()

    def check_db(self):
        self.cursor.execute("""
        CREATE DATABASE IF NOT EXISTS {}
        """.format(self.db))
        self.cursor.execute('use {};'.format(self.db))

    @staticmethod
    def mysql_key_words():
        return ['order', 'time']

    @staticmethod
    def insert_val_reg(name):
        # quote the value and escape embedded quotes/backslashes so titles
        # containing ' or " do not break the generated INSERT statement
        return "'{}'".format(pymysql.converters.escape_string(str(name)))

    def insert_col_reg(self, name):
        if name in self.mysql_key_words():
            return '`{}`'.format(name)
        else:
            return name

    def get_insert_keys_and_values(self, item):
        values = []
        keys = []
        for k in item:
            keys.append(self.insert_col_reg(k))
            values.append(self.insert_val_reg(item[k]))
        return (
            ', '.join(keys),
            ', '.join(values)
        )

    def get_insert_order(self, item):
        keys, values = self.get_insert_keys_and_values(item)
        return """
        INSERT INTO {} ({}) VALUES ({});
        """.format(self.table, keys, values)

    def insert(self, item, spider):
        sql = self.get_insert_order(item)
        self.cursor.execute(sql)
        self.connection.commit()

    def has_table(self, spider):
        try:
            self.table = spider.table
        except AttributeError:
            self.table = None
        self.table_status = (
            self.table is not None
        )
        return self.table_status

    def is_mysql(self, spider):
        try:
            db = spider.db
        except AttributeError:
            db = None
        self.db_status = (
            db is not None and db.lower() == 'mysql'
        )
        return self.db_status

    def open_spider(self, spider):
        if self.is_mysql(spider) and self.has_table(spider):
            self.run_status = True

    def process_item(self, item, spider):
        if self.run_status:
            self.insert(item, spider)
        return item

    def close_spider(self, spider):
        if self.run_status:
            self.cursor.close()
            self.connection.close()

class Mm131ImagePipeLine:
    @staticmethod
    def download_img(src, path, headers):
        try:    
            rsp = requests.get(src, headers=headers)
            if rsp.status_code == 200:
                with open(path, "wb") as f:
                    f.write(rsp.content)
        except Exception as e:
            print(e)
            
    def process_item(self, item, spider):
        try:
            if spider.file_type == 'img':
                self.download_img (
                    src=item.get('src'), 
                    path=item.get('path'),
                    headers=spider.headers
                )
        except AttributeError as ae:
            print(ae)
        return item

Middleware (no changes needed unless you require special handling)

middlewares.py

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class Mm131SpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class Mm131DownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

Configuration file

settings.py

# Scrapy settings for mm131 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'mm131'

SPIDER_MODULES = ['mm131.spiders']
NEWSPIDER_MODULE = 'mm131.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'mm131.middlewares.Mm131SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'mm131.middlewares.Mm131DownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'mm131.pipelines.MysqlPipeLine': 300,
   'mm131.pipelines.Mm131ImagePipeLine': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PSD = 'root'
MYSQL_DB = 'mm131'
MYSQL_CHARSET = 'utf8'

SQL selection and file download (can be run on its own)

sql_match.py

import pymysql, time
import numpy as np
from os import system, mkdir
from os.path import join, exists

base_path = r'F:\新建文件夹\you_know'
conn = pymysql.connect(
    host='localhost',
    port=3306,
    user='root',
    password='root',
    db='mm131',
    charset='utf8'
)
cur = conn.cursor()
cur.execute("""
select title, type, label, url from image_info where title like '%米%';
""")
res = np.array(cur.fetchall()).tolist()

# Download Images
def download():
    for item in res:
        # quote url and path so titles containing spaces survive the shell
        order = 'scrapy crawl crawl_image_info -a "url={}" -a "path={}"'
        path = join(base_path, item[1])
        if not exists(path): mkdir(path)
        path = join(path, item[0])
        if not exists(path): mkdir(path)
        system(order.format(item[3], path))

# Write down some info about what we have downloaded
def log():
    log_path = '{}/log.txt'.format(base_path)
    mode = 'a' if exists(log_path) else 'w'
    with open(log_path, mode, encoding='utf-8') as fp:
        fp.write(f'Count: {len(res)} / Date: {time.ctime()}\n')
        for val in res:
            fp.write('\t'.join(str(v) for v in val) + '\n')

if __name__ == '__main__':
    download()
    log()
