Python 爬取小说(requests + BeautifulSoup)

 

import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Request headers sent with every HTTP request so the site sees a
# browser-like User-Agent.
header = {
    'User-Agent': 'Mozilla/5.0(Windows NT 10.0) Applewebkit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'}

# Index page of the book; chapter links found on it are resolved against it.
BOOK_URL = 'http://www.biquw.com/book/16583/'
# Directory the chapter text files are written to.
SAVE_DIR = 'D:/小说/'
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10


def sanitize_filename(name, replacement='_'):
    """Return *name* made safe for use as a file name.

    Characters that Windows forbids in file names (``\\ / : * ? " < > |``)
    and line breaks/tabs are replaced by *replacement*; surrounding
    whitespace is stripped. If nothing is left, *replacement* itself is
    returned so the result is never empty.
    """
    cleaned = re.sub(r'[\\/:*?"<>|\r\n\t]', replacement, name).strip()
    return cleaned or replacement


def fetch_html(url):
    """Download *url* and return its decoded text.

    The request carries the module-level ``header`` and a timeout. The
    encoding is taken from the response body (``apparent_encoding``), as
    the original script did.

    Raises:
        requests.RequestException: on connection errors, timeouts or a
            non-2xx HTTP status.
    """
    response = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    response.encoding = response.apparent_encoding
    return response.text


def main():
    """Download every chapter linked from the book index into SAVE_DIR."""
    # Create the output directory (and any missing parents) without the
    # check-then-create race.
    os.makedirs(SAVE_DIR, exist_ok=True)

    # Fetch the index page and dump it, as the original script did.
    index_html = fetch_html(BOOK_URL)
    print(index_html)

    # 'lxml' is the parser backend BeautifulSoup uses to build the tree.
    index_soup = BeautifulSoup(index_html, 'lxml')
    chapter_container = index_soup.find('div', class_='book_list')
    if chapter_container is None:
        raise RuntimeError(
            'chapter list <div class="book_list"> not found at ' + BOOK_URL)

    # Only anchors that actually have an href are chapter links.
    for book in chapter_container.find_all('a', href=True):
        book_name = book.text
        # urljoin handles relative as well as absolute hrefs.
        book_url = urljoin(BOOK_URL, book['href'])

        # One failing chapter must not abort the whole download.
        try:
            chapter_html = fetch_html(book_url)
        except requests.RequestException as exc:
            print('skipped {} ({}): {}'.format(book_name, book_url, exc))
            continue

        chapter_soup = BeautifulSoup(chapter_html, 'lxml')
        info = chapter_soup.find('div', id='htmlContent')
        if info is None:
            print('skipped {} ({}): no chapter content found'.format(
                book_name, book_url))
            continue

        print(info.text)
        target = os.path.join(SAVE_DIR, sanitize_filename(book_name) + '.txt')
        with open(target, 'a', encoding='utf-8') as f:
            f.write(info.text)


if __name__ == '__main__':
    main()

你可能感兴趣的:(python,python)