With Python 3.5.1 this script would only crawl about 2.56 MB of content before stopping. Suspecting a version problem, I reinstalled Python 3.6.4, and sure enough the whole novel came down.
from lxml import etree
import requests

url = "https://www.biquge.info/22_22564/"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/......'}
response = requests.get(url, headers=headers)
html = etree.HTML(response.content)
book_url = html.xpath('//div/dl/dd//@href')  # collect the href of every chapter listed on the index page
real_book_url = []
main_url = "https://www.biquge.info/22_22564/"
for i in book_url:
    # the hrefs are relative, so prepend the book's base URL
    real_book_url.append(main_url + i)
print(real_book_url)
# download the content of each chapter
for real_book_url_i in real_book_url:
    book_response = requests.get(real_book_url_i, headers=headers)
    book_html = etree.HTML(book_response.content)
    word_list = book_html.xpath('//div[@id="content"]/text()')  # paragraphs of the chapter body
    bookname = book_html.xpath('//div/h1/text()')                # chapter title
    # append this chapter to the output file
    with open("zz.txt", "a", encoding='utf-8') as f:
        f.write(bookname[0] + "\n")       # title on its own line
        for word in word_list:
            f.write(word + "\r\n")        # one paragraph per line
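
For a longer crawl like this, it can also help to reuse one HTTP session, build the chapter URLs with urljoin, open the output file only once, and pause briefly between requests. The sketch below is a minimal variation on the script above, not a drop-in replacement: it assumes the same index page, the same XPath expressions, and the same zz.txt output file, and the 0.5-second delay is an arbitrary choice. The User-Agent string is left as the same placeholder used above.

import time
from urllib.parse import urljoin

from lxml import etree
import requests

BASE_URL = "https://www.biquge.info/22_22564/"
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/......'}  # same placeholder UA as above

session = requests.Session()            # reuse one connection for all requests
session.headers.update(HEADERS)

index_html = etree.HTML(session.get(BASE_URL).content)
chapter_urls = [urljoin(BASE_URL, href)                     # urljoin handles relative and absolute hrefs
                for href in index_html.xpath('//div/dl/dd//@href')]

with open("zz.txt", "a", encoding="utf-8") as f:            # open the output file once
    for chapter_url in chapter_urls:
        page = etree.HTML(session.get(chapter_url).content)
        title = page.xpath('//div/h1/text()')               # same XPath as above (assumed page layout)
        paragraphs = page.xpath('//div[@id="content"]/text()')
        if title:
            f.write(title[0] + "\n")
        for paragraph in paragraphs:
            f.write(paragraph + "\r\n")
        time.sleep(0.5)                                     # short pause so the site is not hammered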