rd.py
import scrapy
from readdang.items import ReaddangItem


class RdSpider(scrapy.Spider):
    name = 'rd'
    allowed_domains = ['category.dangdang.com']
    start_urls = ['http://category.dangdang.com/cp01.49.01.00.00.00.html']
    # Base URL used to build the paginated category pages
    base_url = 'http://category.dangdang.com/pg'
    page = 1

    def parse(self, response):  # Scrapy calls parse automatically once the page is downloaded
        li_list = response.xpath('//ul[@id="component_59"]/li')
        # Walk over every book entry on the page
        for li in li_list:
            # extract_first() is needed to get the text out of a Selector
            # The first cover image keeps its URL in src; the lazy-loaded ones keep it in data-original
            src = li.xpath('.//img/@data-original').extract_first()
            # Fall back to src when data-original is missing
            if src:
                src = 'http:' + src
            else:
                src = 'http:' + li.xpath('.//img/@src').extract_first()
            # Remaining fields
            name = li.xpath('.//img/@alt').extract_first()  # book title
            press = li.xpath('.//span//a[@name="P_cbs"]/text()').extract_first()  # publisher
            author = li.xpath('.//span[1]/a[1]/@title').extract_first()  # author
            date = li.xpath('.//p[@class="search_book_author"]//span[2]/text()').extract_first()  # publication date
            b_href = 'http:' + li.xpath('./a[@class="pic"]/@href').extract_first()  # detail page behind the cover image
            print(src, name, press, author, date, b_href)
            book = ReaddangItem(src=src, name=name, press=press, author=author, date=date, b_href=b_href)
            yield book
        # Follow the next category page up to page 100
        if self.page < 100:
            self.page = self.page + 1
            url = self.base_url + str(self.page) + '-cp01.49.01.00.00.00.html'
            yield scrapy.Request(url=url, callback=self.parse)
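With the project set up, the spider can be launched from the project root with Scrapy's command-line tool (rd is the spider name defined above):

scrapy crawl rd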
items.py
import scrapy


class ReaddangItem(scrapy.Item):
    # Fields to scrape: cover image URL, title, publisher, author
    src = scrapy.Field()
    name = scrapy.Field()
    press = scrapy.Field()
    author = scrapy.Field()
    # publication date
    date = scrapy.Field()
    # detail page link
    b_href = scrapy.Field()
pipelines.py
import urllib.request

import pymysql
from itemadapter import ItemAdapter
# Import settings so the database configuration is available here
from readdang import settings


# Several pipelines are enabled (see ITEM_PIPELINES in settings.py)
class ReaddangPipeline:
    # Runs once before the spider starts: open the output file
    def open_spider(self, spider):
        print("------ crawl started ------")
        self.fp = open('books.json', 'w', encoding='utf-8')

    # item is the book object yielded by the spider
    def process_item(self, item, spider):
        # Append the item to the file
        self.fp.write(str(item))
        return item

    # Runs once after the spider finishes: close the file
    def close_spider(self, spider):
        self.fp.close()
        print("------ crawl finished ------")
# Optional pipeline: keep it only if the cover images should be downloaded locally
class DangDownLoadPipeline:
    def process_item(self, item, spider):
        # Save the cover image into the project's books folder
        # src was already prefixed with http: in the spider, so it can be used as-is
        url = item.get('src')
        filename = './books/' + item.get('name') + '.jpg'
        urllib.request.urlretrieve(url=url, filename=filename)
        return item
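urlretrieve fails if the ./books/ folder does not exist yet. A hedged variant that creates the folder once before any download (os.makedirs is the only addition, the rest is unchanged):

import os
import urllib.request


class DangDownLoadPipeline:
    def open_spider(self, spider):
        # Create the target folder once, before the first image is saved
        os.makedirs('./books', exist_ok=True)

    def process_item(self, item, spider):
        url = item.get('src')
        filename = './books/' + item.get('name') + '.jpg'
        urllib.request.urlretrieve(url=url, filename=filename)
        return item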
class DBPipeline:
    # ==========================================================
    def __init__(self):
        # Connect to MySQL using the values configured in settings.py
        self.connect = pymysql.connect(host=settings.MYSQL_HOST,
                                       port=3306,
                                       user=settings.MYSQL_USER,
                                       passwd=settings.MYSQL_PASSWD,
                                       db=settings.MYSQL_DBNAME,
                                       charset='utf8',
                                       use_unicode=True)
        # Cursor used for every insert
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        # Insert one row per book
        try:
            # Some books have missing fields, so fall back to "N/A"
            name = item.get("name", "N/A")
            author = item.get("author", "N/A")
            src = item.get("src", "N/A")
            press = item.get("press", "N/A")
            date = item.get("date", "N/A")
            b_href = item.get("b_href", "N/A")
            # Parameterised INSERT statement
            sql = "insert into bookclass_tbook(name, author, src, press, date, b_href) VALUES (%s, %s, %s, %s, %s, %s)"
            # Run the statement and commit
            self.cursor.execute(sql, (name, author, src, press, date, b_href))
            self.connect.commit()
        except Exception as e:
            # Print the error instead of failing silently
            print('insert failed:', e)
        return item

    def close_spider(self, spider):  # close the database connection
        self.cursor.close()
        self.connect.close()
    # ==============================================================
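The insert above assumes a bookclass_tbook table already exists. A one-off sketch that creates it (the column names come from the INSERT statement; the id column and the types are assumptions to adjust to the real schema):

import pymysql

from readdang import settings

# Create the target table once before running the spider
connect = pymysql.connect(host=settings.MYSQL_HOST,
                          port=3306,
                          user=settings.MYSQL_USER,
                          passwd=settings.MYSQL_PASSWD,
                          db=settings.MYSQL_DBNAME,
                          charset='utf8')
cursor = connect.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS bookclass_tbook (
        id INT PRIMARY KEY AUTO_INCREMENT,
        name VARCHAR(255),
        author VARCHAR(255),
        src VARCHAR(255),
        press VARCHAR(255),
        date VARCHAR(64),
        b_href VARCHAR(255)
    ) DEFAULT CHARSET=utf8
""")
connect.commit()
cursor.close()
connect.close()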
settings.py
BOT_NAME = 'readdang'

SPIDER_MODULES = ['readdang.spiders']
NEWSPIDER_MODULE = 'readdang.spiders'

# Enable the three pipelines and set their priority (lower numbers run first)
ITEM_PIPELINES = {
    'readdang.pipelines.ReaddangPipeline': 30,
    'readdang.pipelines.DangDownLoadPipeline': 300,
    'readdang.pipelines.DBPipeline': 100,
}

# MySQL connection details used by DBPipeline
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'name of the database to connect to'
MYSQL_USER = 'database account'
MYSQL_PASSWD = 'database password'
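If the crawl comes back empty, two common additions to settings.py may help; they are not part of the original listing, so treat them as assumptions about this project:

# Common additions when a site blocks the default Scrapy client (assumed, not in the original settings)
ROBOTSTXT_OBEY = False
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'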