获取来自每个国家的香水品牌名单

目标:获取来自每个国家的香水品牌名单
目标网站:https://www.fragrantica.asia/country/China.html
技术路线:scrapy + mysql
爬虫文件结构:
获取来自每个国家的香水品牌名单_第1张图片
代码:
fragrantica.py
# -*- coding: utf-8 -*-
import scrapy
import re
from Fgt.items import FgtItem


class FragranticaSpider(scrapy.Spider):
    name = 'fragrantica'
    allowed_domains = ['fragrantica.asia']
    start_urls = ['https://www.fragrantica.asia/country/China.html']
    agent = "  "

    def parse(self, response):
        sel = scrapy.Selector(response)
        # 获取国家
        a_list = sel.css('select option ').extract()
        pat = 'value="(.*?.html)"'
        for i in a_list:
            res = re.compile(pat).findall(i)[0]
            #构建网址
            url='https://www.fragrantica.asia'+res
            yield scrapy.Request(url=url, callback=self.parse_info, dont_filter=True)

    def parse_info(self, response):
        sel = scrapy.Selector(response)
        # 获取品牌
        b_list = sel.css('.nduList a ').extract()
        for j in b_list:
            url = 'https://www.fragrantica.asia'
            pat1 = 'href="(.*?.html)"'           #提取品牌网址
            pat2 = 'src="(.*?.jpg)"'             #提取品牌logo
            pat3 = '.html">(.*?) 
item.py
# -*- coding: utf-8 -*-

import scrapy

class FgtItem(scrapy.Item):
    html= scrapy.Field()        #品牌网址
    company = scrapy.Field()    #品牌名称
    logo = scrapy.Field()       #品牌logo
    
    def get_insert_sql(self):
        #具体插入值
        insert_sql = """
            insert into fgt(html,company,logo)
            VALUES (%s, %s, %s)
        """
        params = (self["html"],
                  self["company"],
                  self["logo"],
                  )
        return insert_sql, params
pipeline.py
# -*- coding: utf-8 -*-

import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi

class FgtPipeline(object):
    def process_item(self, item, spider):
        return item

# 异步操作mysql插入
class MysqlTwistedPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
     class
    def from_settings(cls, settings):
        # setting值可以当做字典来取值
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        # 连接池ConnectionPool
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        # 此处相当于实例化pipeline, 要在init中接收。
        return cls(dbpool)

    def process_item(self, item, spider):
        # 使用twisted将mysql插入变成异步执行:参数1:我们自定义一个函数,里面可以写我们的插入逻辑
        query = self.dbpool.runInteraction(self.do_insert, item)
        # 添加自己的处理异常的函数
        query.addErrback(self.handle_error, item, spider)

    def do_insert(self, cursor, item):
        # 执行具体的插入
        # 根据不同的item 构建不同的sql语句并插入到mysql中
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)

    def handle_error(self, failure, item, spider):
        # 处理异步插入的异常
        print(failure)
数据库表设计
CREATE TABLE fgt(
    html VARCHAR(100),
    company VARCHAR(40),
    logo VARCHAR(100)
);
爬取结果,共计3276条数据
获取来自每个国家的香水品牌名单_第2张图片

你可能感兴趣的:(获取来自每个国家的香水品牌名单)