2021-07-22

This Thursday the project continued to move forward in an orderly fashion.

I spent time studying and practising how to crawl information from foreign websites.

Enter the star of the show: Scrapy.

We create a project called abroadwebsite and a spider called abroad (a generic CrawlSpider, generated with -t crawl), roughly as sketched below.
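For reference, creating the project and the CrawlSpider-based spider looks roughly like this (the domain argument comes from the site analysed below):

scrapy startproject abroadwebsite
cd abroadwebsite
scrapy genspider -t crawl abroad www.kanguowai.com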

First, analyse the structure of the site.

You will notice that every site's detail-page URL contains the string "site", so we put that pattern into the allow argument of the Rule's LinkExtractor.
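As a quick illustration of what that allow pattern does (a standalone sketch with made-up HTML, not part of the project code), LinkExtractor keeps only the links whose URL matches the pattern:

from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

# Invented listing HTML that mimics the structure the spider relies on.
body = b'<dl class="picture_lie"><a href="/site/1234.html">detail</a><a href="/about.html">other</a></dl>'
response = HtmlResponse(url='https://www.kanguowai.com/site/', body=body, encoding='utf-8')

extractor = LinkExtractor(allow=r'site', restrict_xpaths='//dl[@class="picture_lie"]')
print([link.url for link in extractor.extract_links(response)])
# Only https://www.kanguowai.com/site/1234.html survives the filter.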

Open one of those detail pages.

It holds the concrete information about the website; we simply use XPath to pull out whichever fields we consider useful.
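To sanity-check such XPath expressions before wiring them into the spider, Scrapy's Selector can be run against a snippet of HTML (again a sketch with invented markup, mirroring what the detail pages are assumed to look like):

from scrapy.selector import Selector

html = '''
<ul class="baseinfo"><li><h1>Example Site</h1></li></ul>
<div class="sitetext"><p>A short introduction.</p></div>
'''
sel = Selector(text=html)
print(sel.xpath('//ul[@class="baseinfo"]/li[1]/h1/text()').get())  # Example Site
print(sel.xpath('//div[@class="sitetext"]/p/text()').get())        # A short introduction.

The same checks can also be done interactively with scrapy shell against a real detail-page URL.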

Finally, we also need to work out the node that leads from each listing page to the next one.

Feeding that next-page link into a second Rule's LinkExtractor lets the spider walk through the listing pages one by one.

Analysis done, on to the code (only the files that were changed).

The spider: abroad

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from abroadwebsite.items import AbroadwebsiteItem

USER_AGENT = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"


class AbroadSpider(CrawlSpider):
    name = 'abroad'
    allowed_domains = ['www.kanguowai.com']
    start_urls = ['https://www.kanguowai.com/site/']

    rules = (
        # Every detail-page URL contains "site"; follow those links and parse them.
        Rule(LinkExtractor(allow=r'site', restrict_xpaths='//dl[@class="picture_lie"]'), callback='parse_item'),
        # Follow the "next page" link so the listing pages are crawled one by one.
        Rule(LinkExtractor(restrict_xpaths='//div[@class="page"]//a[@title="下一页"]')),
    )

    def start_requests(self):
        # Send the initial requests with a desktop User-Agent.
        for url in self.start_urls:
            yield scrapy.Request(url, headers={"User-Agent": USER_AGENT})

    def parse_item(self, response):
        item = AbroadwebsiteItem()
        item['website_name'] = response.xpath('//ul[@class="baseinfo"]/li[1]/h1/text()').extract_first()
        item['country'] = response.xpath('//li[@class="linfo"]/a/text()').extract_first()
        item['url'] = response.xpath('//li[@class="linfo siteurl"]/a/text()').extract_first()
        item['form'] = response.xpath('//li[@class="rinfo"]/a/text()').extract_first()
        item['introduction'] = response.xpath('//div[@class="sitetext"]/p/text()').extract_first()
        item['img_path'] = response.xpath('//div[@class="sitepic"]/img/@src').extract_first()
        yield item

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class AbroadwebsiteItem(scrapy.Item):
    # One record per website listed on kanguowai.com.
    website_name = scrapy.Field()
    country = scrapy.Field()
    url = scrapy.Field()
    form = scrapy.Field()
    introduction = scrapy.Field()
    img_path = scrapy.Field()

settings.py (only the relevant parts)

ITEM_PIPELINES = {
    # Download the site screenshot first, then write the item to MySQL.
    'abroadwebsite.pipelines.ImagesPipeline': 300,
    'abroadwebsite.pipelines.AbroadwebsitePipeline': 301,
}

# MySQL connection info (MYSQL_ROOT is the MySQL user name).
MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'spider'
MYSQL_ROOT = 'root'
MYSQL_PASSWORD = '123'

# SQL statements used by AbroadwebsitePipeline.
USE = 'use spider'
TABLE = 'abroadwebsites'
DROP = 'drop table if exists %s' % TABLE
CREATE = ('create table %s(website_name varchar(255) NOT NULL, country varchar(255), url varchar(255), '
          'form varchar(255), introduction varchar(255), img_path varchar(255))' % TABLE)
SAVEIN = ('insert into ' + TABLE +
          ' (website_name, country, url, form, introduction, img_path) values(%s, %s, %s, %s, %s, %s)')

# Where the downloaded images are stored on disk.
Root_path = 'D:/pics1/'
IMAGES_STORE = 'D:/pics1'

pipelines.py is the most involved part; it also contains the logic for saving the images.

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import logging

import pymysql
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline as ScrapyImagesPipeline

from abroadwebsite.settings import *


class ImagesPipeline(ScrapyImagesPipeline):
    # Downloads the site screenshot for every item.

    def get_media_requests(self, item, info):
        # img_path is a relative path on kanguowai.com, so prepend the domain.
        yield Request('https://www.kanguowai.com' + item['img_path'])

    def file_path(self, request, response=None, info=None):
        # Save each image under its original file name.
        return request.url.split('/')[-1]

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image Downloaded Failed')
        return item


class AbroadwebsitePipeline(object):
    # Writes each item into the MySQL table defined in settings.py.

    def __init__(self):
        self.connect = pymysql.connect(host=MYSQL_HOST, user=MYSQL_ROOT,
                                       password=MYSQL_PASSWORD, database=MYSQL_DATABASE)
        self.cursor = self.connect.cursor()
        self.cursor.execute(USE)   # select the database
        self.cursor.execute(DROP)  # start from a clean table on every run
        self.cursor.execute(CREATE)

    def process_item(self, item, spider):
        try:
            self.cursor.execute(SAVEIN, (item['website_name'], item['country'], item['url'],
                                         item['form'], item['introduction'],
                                         Root_path + item['img_path'].split('/')[-1]))
            self.connect.commit()
        except Exception as error:
            logging.error(error)
        return item

    def close_spider(self, spider):
        self.connect.close()
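A side note rather than part of the code above: instead of star-importing settings.py, a pipeline can also receive its configuration through Scrapy's from_crawler hook. A minimal sketch, assuming the same MYSQL_* setting names (the class name MySQLPipeline is hypothetical):

import pymysql


class MySQLPipeline(object):
    # Same MySQL pipeline idea, configured via crawler settings.

    def __init__(self, host, user, password, database):
        self.connect = pymysql.connect(host=host, user=user,
                                       password=password, database=database)
        self.cursor = self.connect.cursor()

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this hook and hands over the project settings.
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            user=crawler.settings.get('MYSQL_ROOT'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            database=crawler.settings.get('MYSQL_DATABASE'),
        )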

Together these four pieces of code cover the three goals: crawling the data, saving the images, and writing everything into the database.
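Assuming the files above are in place and the spider database already exists in MySQL, the crawl can be kicked off from the project directory with something like:

mysql -u root -p -e "create database if not exists spider"
scrapy crawl abroad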
