本人步骤:
1>settings.py:
# Core Scrapy settings for the "newding" project.
BOT_NAME = "newding"

# Package that holds the project's spiders (used for both lookup and creation).
SPIDER_MODULES = ["newding.spiders"]
NEWSPIDER_MODULE = "newding.spiders"

# Honour robots.txt on the crawled sites.
ROBOTSTXT_OBEY = True

# Item pipelines enabled for the project, mapped to their order value.
ITEM_PIPELINES = {"newding.pipelines.NewdingPipeline": 300}
以上配置在创建项目时会自动生成;
以下是数据入库(MySQL)阶段需要手动添加的配置:
# MySQL connection parameters; sql.py passes them to mysql.connector.connect().
# Where the server listens.
MYSQL_HOST = "127.0.0.1"
MYSQL_PORT = "3306"  # kept as a string, exactly as originally configured

# Credentials.
MYSQL_USER = "root"
MYSQL_PASSWORD = "12345678"

# Database (schema) that contains book_table.
MYSQL_DB = "xiaoshuo"
2>RUN.py
from scrapy.cmdline import execute
# Run the project command, i.e. the same argv as `scrapy crawl newding1s`.
crawl_argv = ['scrapy', 'crawl', 'newding1s']
execute(crawl_argv)
3>items.py
import scrapy
class NewdingItem(scrapy.Item):
    """Fields collected for one novel and later written to the database."""

    # Novel title.
    title = scrapy.Field()
    # Novel category.
    types = scrapy.Field()
    # Presumably the novel's size ("zijie" looks like pinyin for bytes) -- confirm.
    zijie = scrapy.Field()
    # URL associated with the novel; also used as the duplicate-check key.
    book_url = scrapy.Field()
4>sql.py
from newding.settings import *
import mysql.connector
# One shared connection, opened when this module is imported, using the
# MYSQL_* values from the project settings.
db = mysql.connector.connect(
    user=MYSQL_USER,
    password=MYSQL_PASSWORD,
    host=MYSQL_HOST,
    port=MYSQL_PORT,
    db=MYSQL_DB,
)

# Buffered cursor shared by every Sql helper below.
cursor = db.cursor(buffered=True)
class Sql():
    """Query helpers for ``book_table`` that use the module-level ``cursor`` and ``db``."""

    @classmethod
    def insert_book(cls, title, types, zijie, book_url):
        """Insert one row into ``book_table`` and commit it.

        Args:
            title: Value for the ``title`` column.
            types: Value for the ``types`` column.
            zijie: Value for the ``zijie`` column.
            book_url: Value for the ``book_url`` column.
        """
        sql = (
            "insert INTO book_table(`title`,`types`,`zijie`,`book_url`) "
            "VALUES (%s, %s, %s, %s)"
        )
        # Values are bound by the driver instead of being concatenated into the
        # statement, so quotes in scraped text can neither break the query nor
        # inject SQL.
        cursor.execute(sql, (title, types, zijie, book_url))
        db.commit()  # persist the new row

    @classmethod
    def select_book(cls, book_url):
        """Check whether a row with the given ``book_url`` already exists.

        Args:
            book_url: URL to look up in the ``book_url`` column.

        Returns:
            The result of ``cursor.fetchall()`` for a ``SELECT EXISTS`` query:
            a one-row, one-column result whose single value is 1 when a
            matching row exists and 0 otherwise.
        """
        sql = "select EXISTS (select 1 FROM book_table WHERE book_url=%s)"
        cursor.execute(sql, (book_url,))
        return cursor.fetchall()
5>pipelines.py
from .sql import Sql #引入sql.py文件
class NewdingPipeline(object):
    """Item pipeline that stores a scraped book unless its URL is already stored."""

    def process_item(self, item, spider):
        """Insert ``item`` into the database when its ``book_url`` is new.

        Args:
            item: Mapping with the keys ``title``, ``types``, ``zijie`` and
                ``book_url``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so that later pipeline stages and exporters
            still receive it instead of ``None``.
        """
        title = item['title']
        types = item['types']
        zijie = item['zijie']
        book_url = item['book_url']

        # select_book returns a one-row result; its single value is truthy
        # when the URL is already present.
        already_stored = Sql.select_book(book_url)[0][0]
        if already_stored:
            print('该小说已存在')
        else:
            Sql.insert_book(title, types, zijie, book_url)
        return item
6>newding1s.py (项目py文件)
import requests
import scrapy
from scrapy.http import Request
from scrapy.spiders import CrawlSpider, Rule, Request ##CrawlSpider与Rule配合使用可以骑到历遍全站的作用、Request干啥的我就不解释了
from scrapy.linkextractors import LinkExtractor
from newding.items import *
class Newding1sSpider(scrapy.Spider):
    """Spider that walks 23us.so listing pages and emits book metadata.

    Callback chain: ``parse`` (categories) -> ``san`` (pages of one category)
    -> ``si`` (one listing page) -> ``wu`` (one book page, returns the item).
    """

    # name, allowed_domains and start_urls are attribute names with a fixed
    # meaning for the framework; do not rename them.
    name = 'newding1s'
    allowed_domains = ['23us.so']
    start_urls = ['http://www.23us.so/']

    def parse(self, response):
        """Request the first listing page of categories 1 through 9."""
        list_prefix = "http://www.23us.so/list/"
        list_suffix = ".html"
        for category in range(1, 10):
            # e.g. http://www.23us.so/list/1_1.html
            first_page = list_prefix + str(category) + '_1' + list_suffix
            yield Request(first_page, self.san)

    def san(self, response):
        """Request every listing page of the category shown in ``response``.

        The page count is read from the text of the last pagination link and
        the URL prefix from the href of the first pagination link.
        """
        page_labels = response.xpath('//*[@id="pagelink"]/a/text()').extract()
        first_href = response.xpath('//*[@id="pagelink"]/a/@href').extract_first()
        if not page_labels or first_href is None:
            self.logger.warning('No pagination links found on %s', response.url)
            return
        try:
            last_page = int(page_labels[-1])
        except ValueError:
            self.logger.warning(
                'Unexpected page label %r on %s', page_labels[-1], response.url
            )
            return

        # Drop the trailing "1.html" to keep e.g. http://www.23us.so/list/1_
        # urljoin leaves absolute hrefs untouched and resolves relative ones.
        page_prefix = response.urljoin(first_href)[:-6]
        for page in range(1, last_page + 1):
            yield Request(page_prefix + str(page) + ".html", self.si)

    def si(self, response):
        """Follow the book link found in row ``tr[2]`` of the listing table."""
        # NOTE(review): the XPath addresses a single row (tr[2]), so only one
        # book per listing page is followed -- confirm that this is intended.
        book_href = response.xpath(
            '//*[@id="content"]/dd[1]/table/tr[2]/td[1]/a/@href'
        ).extract_first()
        if book_href is None:
            self.logger.warning('No book link found on %s', response.url)
            return
        yield Request(response.urljoin(book_href), self.wu)

    def wu(self, response):
        """Build a ``NewdingItem`` from a book page.

        Returns:
            The populated item, or ``None`` (after logging a warning) when any
            of the expected nodes is missing from the page.
        """
        scraped = {
            'title': response.xpath('//*[@id="content"]/dd[1]/h1/text()').extract_first(),
            # Novel category.
            'types': response.xpath('//*[@id="at"]/tr[1]/td[1]/a/text()').extract_first(),
            'zijie': response.xpath('//*[@id="at"]/tr[2]/td[2]/text()').extract_first(),
            'book_url': response.xpath('//a[@class="read"]/@href').extract_first(),
        }
        missing = [field for field, value in scraped.items() if value is None]
        if missing:
            self.logger.warning('Missing %s on %s', ', '.join(missing), response.url)
            return None

        # Strip non-breaking spaces so the stored value is not garbled.
        scraped['zijie'] = scraped['zijie'].replace('\xa0', '')

        item = NewdingItem()
        item['title'] = scraped['title']
        item['types'] = scraped['types']
        item['zijie'] = scraped['zijie']
        item['book_url'] = scraped['book_url']
        return item
本项目的目标是把顶点小说的信息抓取并入库(scrapy框架的优点是结构清晰明了,缺点是各文件之间的关系比较繁琐)。
可以参考另一篇代码笔记《xpath爬顶点页面信息》,对比后可以看出两种写法之间的明显区别。