This Thursday, the project continued to move forward steadily.
This week I studied and practiced how to crawl information from foreign websites.
Enter the star of the show: Scrapy.
We create a project named abroadwebsite and a spider named abroad (a generic CrawlSpider, generated with the -t crawl template, e.g. scrapy genspider -t crawl abroad www.kanguowai.com).
First, analyze the site structure.
You will notice that every site's detail-page URL contains the string "site", so we put that into the allow pattern of the first Rule's LinkExtractor.
Open one of the detail pages.
It holds the detailed information about the website; we just use XPath to extract whichever fields we consider useful.
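Before writing the spider, these XPath expressions can be sanity-checked in scrapy shell against a real detail page, or against a small local fragment like the sketch below. The HTML here is only my guess at the page layout, reconstructed from the XPaths the spider further down actually uses, so treat it as an illustration rather than the real page:

from scrapy import Selector

# Made-up fragment mimicking the structure implied by the XPaths;
# the real page on www.kanguowai.com may differ.
html = '''
<ul class="baseinfo">
  <li><h1>Example Site</h1></li>
  <li class="linfo"><a>United States</a></li>
  <li class="linfo siteurl"><a>www.example.com</a></li>
  <li class="rinfo"><a>Portal</a></li>
</ul>
<div class="sitetext"><p>A short introduction.</p></div>
<div class="sitepic"><img src="/uploads/example.jpg"/></div>
'''

sel = Selector(text=html)
print(sel.xpath('//ul[@class="baseinfo"]/li[1]/h1/text()').extract_first())  # Example Site
print(sel.xpath('//li[@class="linfo siteurl"]/a/text()').extract_first())    # www.example.com
print(sel.xpath('//div[@class="sitepic"]/img/@src').extract_first())         # /uploads/example.jpg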
Finally, we also need to locate the node that links each listing page to the next one.
By putting that "next page" link into a second Rule's LinkExtractor, the spider can work through the listing page by page.
Analysis done; here is the code (only the files that were changed).
The spider: abroad
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from abroadwebsite.items import AbroadwebsiteItem

USER_AGENT = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"


class AbroadSpider(CrawlSpider):
    name = 'abroad'
    allowed_domains = ['www.kanguowai.com']
    start_urls = ['https://www.kanguowai.com/site/']

    rules = (
        # Detail pages: every site URL contains "site"; restrict extraction to the picture list.
        Rule(LinkExtractor(allow=r'site', restrict_xpaths='//dl[@class="picture_lie"]'),
             callback='parse_item'),
        # Pagination: follow the "下一页" (next page) link; no callback needed.
        Rule(LinkExtractor(restrict_xpaths='//div[@class="page"]//a[@title="下一页"]')),
    )

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, headers={"User-Agent": USER_AGENT})

    def parse_item(self, response):
        item = AbroadwebsiteItem()
        item['website_name'] = response.xpath('//ul[@class="baseinfo"]/li[1]/h1/text()').extract_first()
        item['country'] = response.xpath('//li[@class="linfo"]/a/text()').extract_first()
        item['url'] = response.xpath('//li[@class="linfo siteurl"]/a/text()').extract_first()
        item['form'] = response.xpath('//li[@class="rinfo"]/a/text()').extract_first()
        item['introduction'] = response.xpath('//div[@class="sitetext"]/p/text()').extract_first()
        item['img_path'] = response.xpath('//div[@class="sitepic"]/img/@src').extract_first()
        yield item
items
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class AbroadwebsiteItem(scrapy.Item):
    # One field per column we want to store for each website.
    website_name = scrapy.Field()
    country = scrapy.Field()
    url = scrapy.Field()
    form = scrapy.Field()
    introduction = scrapy.Field()
    img_path = scrapy.Field()
settings (only the parts that matter)
ITEM_PIPELINES = {
    'abroadwebsite.pipelines.ImagesPipeline': 300,          # download images first
    'abroadwebsite.pipelines.AbroadwebsitePipeline': 301,   # then write to MySQL
}

# MySQL connection settings
MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'spider'
MYSQL_ROOT = 'root'
MYSQL_PASSWORD = '123'

# SQL statements used by the MySQL pipeline
USE = 'use spider'
TABLE = 'abroadwebsites'
DROP = 'drop table if exists %s' % TABLE
CREATE = ('create table %s(website_name varchar(255) NOT NULL, country varchar(255), '
          'url varchar(255), form varchar(255), introduction varchar(255), '
          'img_path varchar(255))' % TABLE)
SAVEIN = ('insert into ' + TABLE +
          ' (website_name, country, url, form, introduction, img_path) '
          'values (%s, %s, %s, %s, %s, %s)')

# Where downloaded images are stored
Root_path = 'D:/pics1/'
IMAGES_STORE = 'D:/pics1'
The pipelines are the most involved part: they include both the image-download pipeline and the MySQL pipeline. (Note that Scrapy's built-in ImagesPipeline requires the Pillow library.)
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import logging

import pymysql
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline as ScrapyImagesPipeline

from abroadwebsite.settings import *


class ImagesPipeline(ScrapyImagesPipeline):
    """Downloads each site's screenshot via Scrapy's built-in images pipeline."""

    def get_media_requests(self, item, info):
        # img_path is a relative path on the page, so prepend the site's domain.
        yield Request('https://www.kanguowai.com' + item['img_path'])

    def file_path(self, request, response=None, info=None):
        # Save the image under its original file name.
        return request.url.split('/')[-1]

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image Downloaded Failed')
        return item


class AbroadwebsitePipeline(object):
    """Writes each item into the MySQL table defined in settings.py."""

    def __init__(self):
        self.connect = pymysql.connect(host=MYSQL_HOST, user=MYSQL_ROOT,
                                       password=MYSQL_PASSWORD, database=MYSQL_DATABASE)
        self.cursor = self.connect.cursor()
        self.cursor.execute(USE)    # select the database
        self.cursor.execute(DROP)   # start from a clean table on every run
        self.cursor.execute(CREATE)

    def process_item(self, item, spider):
        try:
            self.cursor.execute(SAVEIN, (item['website_name'], item['country'],
                                         item['url'], item['form'], item['introduction'],
                                         Root_path + item['img_path'].split('/')[-1]))
            self.connect.commit()
        except Exception as error:
            logging.error(error)
        return item

    def close_spider(self, spider):
        self.connect.close()
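A note on the design: the MySQL pipeline above pulls its configuration in through from abroadwebsite.settings import *, which works but couples the pipeline tightly to that module. The more conventional Scrapy pattern is to read values through from_crawler; the following is only a rough sketch of that alternative, not what the code above uses:

import pymysql


class AbroadwebsitePipeline(object):

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this and gives the pipeline access to settings.py.
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            user=crawler.settings.get('MYSQL_ROOT'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            database=crawler.settings.get('MYSQL_DATABASE'),
        )

    def __init__(self, host, user, password, database):
        self.db_params = dict(host=host, user=user, password=password, database=database)

    def open_spider(self, spider):
        # Connect only when the spider actually starts.
        self.connect = pymysql.connect(**self.db_params)
        self.cursor = self.connect.cursor()

    def close_spider(self, spider):
        self.connect.close()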
Together, these four pieces of code accomplish the three goals: crawling the data, saving the images, and storing everything in the database.
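To confirm the last goal after a crawl, a quick standalone check of the MySQL table looks like this (it assumes the same connection settings as in settings.py above; adjust if yours differ):

import pymysql

connect = pymysql.connect(host='localhost', user='root', password='123', database='spider')
cursor = connect.cursor()
cursor.execute('select website_name, country, url from abroadwebsites limit 5')
for row in cursor.fetchall():
    print(row)
connect.close()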