这里只需要一个py文件就能实现数据采集
它区别于之前记录的方式,这里没有使用Scrapy框架,直接通过Requests提取
使用 Requests 前,需要提前安装好 requests、lxml、pymysql 这几个第三方库
代码注释我已经写得挺清晰的了~~~
1. 创建普通的python爬虫项目
2. 爬取正确的数据
(1) 对爬取的数据进行格式转换
3. 爬取的数据进行数据库存储
# 文件名
myCrawler.py
# coding:utf-8
import requests
from lxml import etree
import pymysql
# 执行命令:python myCrawler.py
def job():
    """Fetch one chapter page from zongheng.com, extract the chapter
    number, title and body text, and insert the record into the
    `fiction` table (skipping the insert if the chapter already exists).

    Side effects: HTTP GET to a hard-coded chapter URL, reads/writes the
    MySQL database `mypython` on 127.0.0.1, prints status messages.
    """
    # Fetch the page source.
    url = 'http://book.zongheng.com/chapter/885037/58155562.html'
    resp = requests.get(url)
    selector = etree.HTML(resp.content)

    # The title div holds the chapter number and name as one string,
    # e.g. "第一章黑衣剑客".
    title_nodes = selector.xpath("//div[@class='title_txtbox']/text()")
    # Take the first text node directly instead of round-tripping the
    # list through str() and splitting on "['" / "']", which breaks when
    # the title contains quotes/brackets or when the list is empty.
    full_title = title_nodes[0].strip() if title_nodes else ""

    # Split "第一章黑衣剑客" into chapter ("第一章") and title ("黑衣剑客"),
    # cutting at the first "章" character.
    if "章" in full_title:
        cut = full_title.index("章") + 1
        chapter = full_title[:cut]
        title = full_title[cut:]
    else:
        chapter = "-"
        title = full_title

    # Chapter body: join all text nodes, trim surrounding whitespace.
    content_nodes = selector.xpath("//div[@class='content']//text()")
    content = "".join(content_nodes).strip()

    # Connect with keyword arguments: positional connect() arguments
    # were removed in PyMySQL 1.0 and silently meant the wrong thing
    # even before that.
    db = pymysql.connect(host="127.0.0.1", user="root",
                         password="123456", database="mypython",
                         charset="utf8mb4")
    try:
        cursor = db.cursor()
        print(" ---- 数据库连接成功 ---- ")
        # Parameterized queries: never interpolate scraped text into SQL
        # with % formatting (SQL injection + breaks on quotes).
        cursor.execute("SELECT * FROM fiction WHERE chapter = %s", (chapter,))
        if cursor.fetchone():
            print(" ---- 数据已存在 ---- ")
        else:
            # Not present yet — insert the new chapter.
            try:
                cursor.execute(
                    "INSERT INTO fiction(chapter, title, content) "
                    "VALUES (%s, %s, %s)",
                    (chapter, title, content),
                )
                db.commit()  # Commit the insert.
                print(" ---- 新增成功 ---- ")
            except Exception as e:
                db.rollback()  # Roll back on failure.
                print(e)
        cursor.close()
    finally:
        # Always release the connection, not just the cursor.
        db.close()
# Run the crawl only when executed as a script (python myCrawler.py),
# not as a side effect of importing this module.
if __name__ == "__main__":
    job()
python myCrawler.py
-- Create the target database with an explicit utf8mb4 charset so the
-- scraped Chinese text (including 4-byte characters) stores correctly,
-- and select it before creating the table.
CREATE DATABASE IF NOT EXISTS mypython DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
USE mypython;

SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;

-- ----------------------------
-- Table structure for fiction
-- ----------------------------
DROP TABLE IF EXISTS `fiction`;
CREATE TABLE `fiction` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '主键id',
  `chapter` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '小说章节数',
  `title` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '小说标题',
  `content` longtext CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci COMMENT '小说内容',
  PRIMARY KEY (`id`) USING BTREE
-- A utf8mb4 table charset must pair with a utf8mb4_* collation; the
-- original `COLLATE = utf8_general_ci` is a utf8mb3 collation and makes
-- MySQL reject this CREATE TABLE statement.
) ENGINE = InnoDB AUTO_INCREMENT = 1 CHARACTER SET = utf8mb4 COLLATE = utf8mb4_general_ci ROW_FORMAT = Dynamic;

SET FOREIGN_KEY_CHECKS = 1;