import os
import requests
from bs4 import BeautifulSoup
# Download every chapter linked from a novel's index page and save each
# chapter's text to its own UTF-8 .txt file.

# Request headers: send a browser-like User-Agent with every request.
header = {
    'User-Agent': 'Mozilla/5.0(Windows NT 10.0) Applewebkit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'}

# Index page of the novel. Chapter hrefs are appended to this URL, so they
# are assumed to be relative to it -- TODO confirm against the live page.
BASE_URL = 'http://www.biquw.com/book/16583/'
# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the script forever.
REQUEST_TIMEOUT = 10
# Folder that receives the chapter text files.
SAVE_DIR = 'D:/小说/'
# Characters Windows does not allow in file names; chapter titles become
# file names, so each of these is replaced with an underscore.
INVALID_FILENAME_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

# Create the output folder (no error if it already exists).
os.makedirs(SAVE_DIR, exist_ok=True)

# Fetch the index page and decode it with the detected encoding.
response = requests.get(BASE_URL, headers=header, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
response.encoding = response.apparent_encoding
print(response.text)

# Parse the HTML with the lxml parser and collect the chapter links.
soup = BeautifulSoup(response.text, 'lxml')
book_list_div = soup.find('div', class_='book_list')
if book_list_div is None:
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on None.
    raise RuntimeError("chapter list <div class='book_list'> not found on " + BASE_URL)
book_list = book_list_div.find_all('a')

# find_all() returns a list of <a> tags; visit each chapter in turn.
for book in book_list:
    # The link text is the chapter title; sanitize it for use as a file name.
    book_name = book.text.strip().translate(INVALID_FILENAME_CHARS)
    # The chapter page address lives in the tag's href attribute.
    book_url = book.get('href')
    if not book_name or not book_url:
        print('skipping link without a title or href:', book)
        continue

    # Second request: fetch the chapter page itself.
    # The headers must be passed by keyword -- the second positional
    # argument of requests.get() is `params` (the query string).
    book_info_html = requests.get(BASE_URL + book_url, headers=header,
                                  timeout=REQUEST_TIMEOUT)
    book_info_html.encoding = book_info_html.apparent_encoding
    chapter_soup = BeautifulSoup(book_info_html.text, 'lxml')
    info = chapter_soup.find('div', id='htmlContent')
    if info is None:
        # Error page or changed layout: skip this chapter, keep going.
        print('no chapter content found, skipping:', BASE_URL + book_url)
        continue
    print(info.text)

    # Append mode is kept from the original: re-running the script adds the
    # text to an existing chapter file rather than overwriting it.
    with open(SAVE_DIR + book_name + '.txt', 'a', encoding='utf-8') as f:
        f.write(info.text)