scrapy startproject crawl_novel
cd crawl_novel/
cd crawl_novel/
cd spiders
scrapy genspider basic www.biquge.info
cd ..
vi items.py
# -*- coding: utf-8 -*-
import scrapy
class CrawlNovelItem(scrapy.Item):
    """Item holding the data scraped for a single novel.

    Fields are grouped by origin: values read straight from the page,
    values computed later in the pipeline, and bookkeeping metadata
    recorded by the spider itself.
    """

    # --- Primary fields: extracted directly from the book index page ---
    title = scrapy.Field()
    author = scrapy.Field()
    classify = scrapy.Field()
    recommend = scrapy.Field()
    chapter_urls = scrapy.Field()

    # --- Calculated fields: filled in by downstream processing ---
    chapter = scrapy.Field()

    # --- Housekeeping fields: provenance of the scrape ---
    url = scrapy.Field()
    project = scrapy.Field()
    spider = scrapy.Field()
    server = scrapy.Field()
    date = scrapy.Field()
vi basic.py
# -*- coding: utf-8 -*-
import datetime
import socket
from urllib.parse import urlparse,urljoin
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, Join
from crawl_novel.items import CrawlNovelItem
class BasicSpider(scrapy.Spider):
    """Spider that scrapes a novel's metadata from its biquge index page.

    Yields one CrawlNovelItem per start URL, populated via ItemLoader
    with primary fields from the page plus housekeeping metadata.
    """

    name = 'basic'
    # Fix: the genspider template left the placeholder domain 'www' here,
    # which is not a real host. Use the actual site so the offsite
    # middleware behaves correctly if follow-up requests are ever added.
    allowed_domains = ['www.biquge.info']
    start_urls = ['http://www.biquge.info/22_22559/']

    def parse(self, response):
        """Load a CrawlNovelItem from the book index page and return it."""
        # Loader bound to this response; add_xpath calls extract against it.
        l = ItemLoader(item=CrawlNovelItem(), response=response)
        # Strip surrounding whitespace and normalize the title to Title Case.
        l.add_xpath('title', '//h1[1]/text()', MapCompose(str.strip, str.title))
        l.add_xpath('author', '//*[@id="info"]/p[1]/text()', MapCompose(str.strip))
        l.add_xpath('classify', '//*[@id="info"]/p[2]/text()', MapCompose(str.strip))
        # Join the recommendation snippets into a single string.
        l.add_xpath('recommend', '//*[@id="listtj"]//text()', Join())
        # Convert each relative chapter href into an absolute URL,
        # using the page URL as the base.
        l.add_xpath('chapter_urls', '//*[@id="list"]//a/@href',
                    MapCompose(lambda href: urljoin(response.url, href)))
        # Housekeeping fields: record where/when/by-whom this was scraped.
        l.add_value('url', response.url)
        l.add_value('project', self.settings.get('BOT_NAME'))
        l.add_value('spider', self.name)
        l.add_value('server', socket.gethostname())
        l.add_value('date', datetime.datetime.now())
        return l.load_item()