    pattern = re.compile(r'<div id="content">(.*?)</div>', re.S) #获取章节内容
    chapter_content = pattern.findall(html)
    for content in chapter_content:
        content = content.replace('&nbsp;', '').replace('<br/>', '')
        #插入mongodb
        novel_col.insert_one({'name':name,'chapter':chapter[1]})
        chapter_col.insert_one({'name':chapter[1],'content':content})
爬取效果,确保安装了mongodb和可视化管理工具Robo3T,打开Robo3T:
novels集合:存储小说名和章节名
chapters集合:存储章节名和章节内容
完整代码:
import os
import re
import time
import requests
from requests import RequestException
import pymongo
def get_page(url):
    """Fetch *url* and return its decoded HTML text, or None on any failure.

    A desktop-browser User-Agent is sent because the target site rejects the
    default requests UA. The response encoding is guessed from the body
    (``apparent_encoding``) since the site serves GBK pages without a
    charset header.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        # timeout added so one stalled connection cannot hang the whole crawl
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            response.encoding = response.apparent_encoding
            return response.text
        return None
    except RequestException:
        # Network-level failure; callers treat None as "page unavailable".
        return None
def get_list(page):
    """Parse the all-novels index page into (relative_url, title) pairs.

    The caller indexes each item as ``item[0]`` (URL) and ``item[1]``
    (title), so the pattern must capture two groups.

    NOTE(review): the regex was reconstructed — the original pattern lost
    its HTML tags when the article was scraped; confirm against the live
    page markup.
    """
    pattern = re.compile(r'<a href="(.*?)">(.*?)</a>', re.S)
    # renamed from ``list`` to avoid shadowing the builtin
    novels = pattern.findall(page)
    # The first 10 anchors on the page are navigation links, not novels.
    return novels[10:]
def get_chapter(novel_url):
    """Fetch a novel's index page and return its first five
    (relative_url, title) chapter pairs.

    Returns an empty list if the page could not be fetched.
    """
    html = get_page(novel_url)
    if html is None:
        return []  # page unavailable; nothing to parse
    # NOTE(review): pattern reconstructed — the HTML tags were stripped from
    # the scraped article; the chapter list markup uses single-quoted hrefs.
    # Verify against the live page. Two groups are required because callers
    # index chapter[0] (URL) and chapter[1] (title).
    pattern = re.compile(r"<a href='(.*?)' >(.*?)</a>", re.S)
    chapters = pattern.findall(html)
    return chapters[:5]  # only the first 5 chapters; drop the slice for all
def get_content(chapter, name):
    """Download one chapter's text and store it in MongoDB.

    chapter: (relative_url, chapter_title) tuple from get_chapter().
    name:    the novel's title.

    Writes one document per extracted content block into the module-level
    ``novel_col`` (novel name + chapter name) and ``chapter_col``
    (chapter name + chapter text) collections.
    """
    chapter_url = 'http://www.xbiquge.la' + chapter[0]
    html = get_page(chapter_url)
    if html is None:
        return  # chapter page could not be fetched; skip it
    # NOTE(review): pattern and replace() arguments reconstructed — the
    # original article's HTML was stripped during extraction (the second
    # replace argument even contained a raw newline where a '<br/>' literal
    # once was). Verify against the live page: the text lives in
    # <div id="content">, padded with &nbsp; entities and <br/> breaks.
    pattern = re.compile(r'<div id="content">(.*?)</div>', re.S)
    chapter_content = pattern.findall(html)
    for content in chapter_content:
        content = content.replace('&nbsp;', '').replace('<br/>', '')
        # Insert into MongoDB
        novel_col.insert_one({'name': name, 'chapter': chapter[1]})
        chapter_col.insert_one({'name': chapter[1], 'content': content})
if __name__ == '__main__':
    # Connect to MongoDB and pick the database and collections.
    client = pymongo.MongoClient('mongodb://localhost:27017')
    db = client.novel
    novel_col = db.novels      # stores novel name + chapter name
    chapter_col = db.chapters  # stores chapter name + chapter content

    # Download and parse the site's full novel index page.
    url = 'http://www.xbiquge.la/xiaoshuodaquan/'
    page = get_page(url)
    novel_list = get_list(page)
    print(novel_list)

    # Walk every novel, then every chapter, persisting each one to MongoDB.
    for item in novel_list:
        novel_chapter = get_chapter(item[0])  # chapter list for this novel
        print(novel_chapter)
        for chapter in novel_chapter:
            get_content(chapter, item[1])