I. Project Preparation
- Environment: Python 3
- IDE: PyCharm
- Stack: Scrapy + Django + PyMySQL
II. The Book Management System
1. Create the project
django-admin startproject web_book (project name)
cd web_book
python manage.py startapp book (app name)
- Register the app: add it to INSTALLED_APPS in settings.py
INSTALLED_APPS = [
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'book.apps.BookConfig',
]
2. Configure MySQL
- 1. Install the driver:
pip install PyMySQL
- 2. Register PyMySQL as MySQLdb: add the following to the __init__.py of the project package (the directory sharing the project's name).
from pymysql import install_as_MySQLdb
install_as_MySQLdb()
- 3. Configure the connection: edit the DATABASES entry in that same package's settings.py.
DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.mysql',
        'HOST': '127.0.0.1',
        'PORT': 3306,
        'USER': 'root',
        'PASSWORD': '123456',
        'NAME': 'book',
    }
}
- Create the database first; the NAME configured above must exist before migrating:
mysql -u root -p
create database book charset=utf8;
3. Create the book model class (book/models.py)
from django.db import models

class BookInfo(models.Model):
    category = models.CharField(max_length=50, default="大类", verbose_name="图书大类")
    small_category = models.CharField(max_length=50, default="小类", verbose_name="图书小分类")
    name = models.CharField(max_length=100, default="无", verbose_name="书名")
    author = models.CharField(max_length=50, default="无", verbose_name="作者")
    store = models.CharField(max_length=100, default="无", verbose_name="出版社")
    pub_date = models.CharField(max_length=30, null=True, verbose_name="出版时间")
    price = models.DecimalField(decimal_places=2, max_digits=10, default="0.00", verbose_name="价格")
    default_image = models.ImageField(null=True, verbose_name="图片")  # ImageField requires Pillow (pip install Pillow)

    class Meta:
        verbose_name = "图书"
        verbose_name_plural = verbose_name

    def __str__(self):
        return self.name
- Generate and apply the migrations:
cd web_book
python manage.py makemigrations
python manage.py migrate
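To confirm the table was created, insert and query a row from the Django shell; a quick sanity check (the field values here are placeholders):
python manage.py shell
>>> from book.models import BookInfo
>>> BookInfo.objects.create(category='小说', small_category='中国当代小说', name='测试', price='9.90')
>>> BookInfo.objects.count()
1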
4. Admin site configuration
- Localize the admin in settings.py:
LANGUAGE_CODE = 'zh-hans'
TIME_ZONE = 'Asia/Shanghai'
- Create a superuser:
- username
- password: at least 8 characters, mixing letters and digits
- email: any syntactically valid address
python manage.py createsuperuser
- Register the model in book/admin.py:
from django.contrib import admin
from .models import BookInfo

class BookInfoAdmin(admin.ModelAdmin):
    list_display = ['id', "category", "small_category", 'name', "author", "store", "price"]

admin.site.register(BookInfo, BookInfoAdmin)
admin.site.site_header = '小七书城'
admin.site.site_title = '小七书城MIS'
admin.site.index_title = '欢迎使用小七书城MIS'
python manage.py runserver
- Open http://127.0.0.1:8000/admin in a browser
III. Scraping the Book Data
1. Crawl analysis
- Entry point for all JD books:
- https://book.jd.com/booksort.html
- Parse the top-level category names - 52 top-level categories
- Parse the sub-category names and links - 882 sub-categories
- Each sub-category has a book-listing url, e.g. 中国当代小说 (Chinese contemporary fiction):
- https://list.jd.com/list.html?cat=1713,3258,3297
- Paging url; note the commas in cat are URL-encoded (see the sketch after this list):
- https://list.jd.com/list.html?cat=1713%2C3258%2C3297&page=3&s=53&click=0
- Parse the next-page link; if it has no value, the current sub-category is finished
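A minimal sketch of how the paging url is assembled: the comma in the cat value becomes %2C under standard URL encoding (parameter values copied from the example above):
from urllib.parse import urlencode

base = 'https://list.jd.com/list.html'
params = {'cat': '1713,3258,3297', 'page': 3, 's': 53, 'click': 0}
print(base + '?' + urlencode(params))
# -> https://list.jd.com/list.html?cat=1713%2C3258%2C3297&page=3&s=53&click=0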
2. Data extraction - XPath
Selector plan (pseudocode; the runnable spider follows in 3.3):
start_urls = "https://book.jd.com/booksort.html"
dt_list = '//*[@id="booksort"]/div[2]/dl/dt'  # one dt per top-level category
for dt in dt_list:
    category = './a/text()'
    em_list = './following-sibling::*[1]/em'  # the dd element right after each dt
    for em in em_list:
        small_category = './a/text()'
        small_link = 'http:' + './a/@href'  # hrefs are protocol-relative
        # on each sub-category listing page:
        list_book = '//*[@id="J_goodsList"]/ul/li/div'
        for book in list_book:
            name = ".//div[@class='p-name']/a/em/text()"
            author = ".//div[@class='p-bookdetails']/span[@class='p-bi-name']/a/text()"
            store = ".//div[@class='p-bookdetails']/span[@class='p-bi-store']/a/text()"
            price = ".//div[@class='p-price']/strong/i/text()"
            default_image = ".//div[@class='p-img']/a/img/@src"
next_url = ".//a[@class='pn-next']/@href"  # no value means the sub-category is done
3. Build the crawler
- 3.1 Create the Scrapy project:
- Create the project:
scrapy startproject BOOK
- Enter the project directory:
cd BOOK
- Generate the spider:
scrapy genspider book jd.com
- Run the spider:
scrapy crawl book
- 3.2 Configure the project: settings.py
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
ROBOTSTXT_OBEY = False
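Optionally (not part of the original configuration), throttling keeps the crawl polite; both are standard Scrapy settings:
DOWNLOAD_DELAY = 1        # seconds to wait between requests
CONCURRENT_REQUESTS = 8   # down from the default of 16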
- 3.3 Write the spider: BOOK/spiders/book.py
import json
from copy import deepcopy

import scrapy

class BookSpider(scrapy.Spider):
    name = 'book'
    allowed_domains = ['jd.com', 'p.3.cn']
    start_urls = ['https://book.jd.com/booksort.html']
    page = 0

    def parse(self, response):
        # dt[1] and the [:1] slices below limit the crawl to one branch while
        # testing; remove them to crawl all categories
        dt_list = response.xpath('//*[@id="booksort"]/div[2]/dl/dt[1]')
        for dt in dt_list:
            item = {}
            item['category'] = dt.xpath('a/text()').extract_first()
            em_list = dt.xpath('./following-sibling::*[1]/em')
            for em in em_list[:1]:
                item['small_category'] = em.xpath('a/text()').extract_first()
                small_link = 'https:' + em.xpath('a/@href').extract_first()
                yield scrapy.Request(small_link, callback=self.parse_book,
                                     meta={'book': deepcopy(item)})

    def parse_book(self, response):
        item = response.meta['book']
        book_list = response.xpath('//div[@id="plist"]/ul/li')
        for book in book_list[:1]:
            item['default_image'] = "https:" + book.xpath('.//div[@class="p-img"]/a/img/@src').extract_first()
            item['name'] = book.xpath('.//div[@class="p-name"]/a/em/text()').extract_first().strip()
            item['author'] = book.xpath('.//span[@class="author_type_1"]/a/text()').extract_first()
            item['store'] = book.xpath('.//span[@class="p-bi-store"]/a/text()').extract_first()
            item['pub_date'] = book.xpath('.//span[@class="p-bi-date"]/text()').extract_first().strip()
            item['price'] = book.xpath('.//div[@class="p-price"]/strong/i/text()').extract_first()  # fallback; overwritten in parse_price
            # the real price comes from a separate JSON endpoint, e.g.
            # https://p.3.cn/prices/mgets?skuIds=J_11757834
            book_id = book.xpath('./div/@data-sku').extract_first()
            price_url = 'https://p.3.cn/prices/mgets?skuIds=J_{}'.format(book_id)
            yield scrapy.Request(price_url, callback=self.parse_price,
                                 meta={'book': deepcopy(item)})
        # the next-page link lives on this listing page, so follow it here
        # (not in parse_price, whose response is only the price JSON)
        self.page += 1
        if self.page > 4:  # test limit: stop after a few pages
            return
        next_url = response.xpath('//a[@class="pn-next"]/@href').extract_first()
        if next_url:
            yield response.follow(next_url, callback=self.parse_book,
                                  meta={'book': item})

    def parse_price(self, response):
        item = response.meta['book']
        item['price'] = json.loads(response.body.decode())[0]['p']
        yield item
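Why deepcopy(item)? The same dict is mutated on every loop iteration, so passing it by reference would make every pending request see only the last values written; a tiny standalone demonstration:
from copy import deepcopy

item = {'category': '小说'}
shared, copied = [], []
for small in ['中国当代小说', '外国小说']:
    item['small_category'] = small
    shared.append(item)            # every entry aliases the same dict
    copied.append(deepcopy(item))  # each entry is an independent snapshot
print([d['small_category'] for d in shared])  # ['外国小说', '外国小说']
print([d['small_category'] for d in copied])  # ['中国当代小说', '外国小说']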
- 3.4 Store the data in MySQL:
- 3.4.1 Write a pipeline in pipelines.py to insert the scraped items into the database
from pymysql import connect

from BOOK import settings  # the Scrapy project package created in 3.1

class BookPipeline(object):
    def open_spider(self, spider):
        # connection parameters come from the MYSQL_* constants in settings.py (see below)
        self.client = connect(
            host=settings.MYSQL_HOST,
            user=settings.MYSQL_USER,
            password=settings.MYSQL_PASSWORD,
            database=settings.MYSQL_DB_NAME,
            charset="utf8",
        )
        self.cur = self.client.cursor()

    def process_item(self, item, spider):
        try:
            # column order must match the book_bookinfo table Django created:
            # id, category, small_category, name, author, store, pub_date, price, default_image
            values = (
                None,  # id is auto-increment
                item["category"],
                item["small_category"],
                item["name"],
                item["author"],
                item["store"],
                item["pub_date"],
                item["price"],
                item["default_image"],
            )
            sql = "insert into book.book_bookinfo values (%s,%s,%s,%s,%s,%s,%s,%s,%s);"
            self.cur.execute(sql, values)
            self.client.commit()
        except Exception as e:
            print(e)
        return item

    def close_spider(self, spider):
        self.cur.close()
        self.client.close()
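The MYSQL_* constants used above are not built-in Scrapy settings; add them to BOOK/settings.py yourself (values assumed to match the Django configuration from section II):
MYSQL_HOST = '127.0.0.1'
MYSQL_USER = 'root'
MYSQL_PASSWORD = '123456'
MYSQL_DB_NAME = 'book'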
- 3.4.2 Enable the pipeline in settings.py
ITEM_PIPELINES = {
    "BOOK.pipelines.BookPipeline": 300,
}
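With the pipeline enabled, run the spider and spot-check the table (counts stay small while the test limits in the spider are in place):
scrapy crawl book
mysql -u root -p -e "select count(*) from book.book_bookinfo;"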