安装 requests、pyquery 库。
定义了 searchbook 类，初始化时传入小说第一章的 url 和小说名即可，
再调用 all_content 方法即可开始下载。
# -*-coding:utf8-*-
import re
import requests
from requests.exceptions import RequestException
from pyquery import PyQuery as pq
class searchbook:
    """Download a web novel chapter by chapter into ``<bookname>.txt``.

    Construct it with the URL of the first chapter and the book name, then
    call :meth:`all_content`. Each page is expected to expose its title under
    ``.bookname h1``, its text under ``#content`` and its previous / index /
    next navigation links under ``.bottem1 a``.
    """

    # Browser-like User-Agent sent with every request.
    _headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
    }
    # Seconds to wait for the server; without it a stalled connection would
    # block the crawl forever.
    _timeout = 30
    # Link captions that identify the previous / index / next navigation links.
    _nav_labels = ("上一章", "章节目录", "下一章", "章节列表")
    # <meta ... charset=...> detector; same pattern requests uses in
    # requests.utils.get_encodings_from_content.
    _charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)

    def __init__(self, url, bookname):
        """Remember the first-chapter URL, the book name and the site root.

        Args:
            url: Absolute URL of the first chapter (``scheme://host/...``).
            bookname: Output file name without the ``.txt`` suffix.
        """
        self.url = url
        self.bookname = bookname
        # 'http://host/a/b' splits into ['http:', '', 'host', ...].
        parts = url.split('/')
        self.baseurl = parts[0] + '//' + parts[2]

    def __writeintxt__(self, info):
        """Append ``info`` plus a ``=`` separator line to the book's txt file."""
        newname = self.bookname + '.txt'
        with open(newname, 'a', encoding='utf-8') as file:
            file.write(info)
            file.write('\n' + '='*20 + '\n')

    @staticmethod
    def _next_chapter_url(current_url, nav_hrefs):
        """Return the absolute URL of the next chapter, or ``None`` at the end.

        Args:
            current_url: URL of the page the links were taken from.
            nav_hrefs: hrefs of the navigation links in page order; exactly
                three are expected (previous, index, next).

        Returns:
            The resolved next-chapter URL, or ``None`` when the navigation is
            incomplete, the next link has no href, or it points back to the
            chapter index.
        """
        from urllib.parse import urljoin

        if len(nav_hrefs) != 3:
            return None
        index_href, next_href = nav_hrefs[-2], nav_hrefs[-1]
        if not next_href:
            return None
        # urljoin resolves root-relative, page-relative and absolute hrefs the
        # same way a browser would.
        next_url = urljoin(current_url, next_href)
        if index_href and next_url == urljoin(current_url, index_href):
            # "Next" leading back to the index means this was the last chapter.
            return None
        return next_url

    def __getallchapter__(self, url):
        """Fetch ``url`` and every following chapter, appending each to the file.

        Prints the title and URL of each chapter that is followed, ``end`` when
        no further chapter is found, and ``error`` when a request fails or a
        page does not answer with HTTP 200.

        Args:
            url: URL of the chapter to start from.
        """
        # Iterative on purpose: one recursive call per chapter would exceed the
        # interpreter's recursion limit on long books.
        visited = set()
        try:
            while url and url not in visited:
                visited.add(url)
                one_chapter = requests.get(
                    url, headers=self._headers, timeout=self._timeout)
                if one_chapter.status_code != 200:
                    print('error')
                    return
                # Pick a usable text encoding before decoding the body.
                one_chapter.encoding = self.codemode(one_chapter)
                doc = pq(one_chapter.text)

                # Chapter title, then chapter body.
                title = doc('.bookname h1').text()
                self.__writeintxt__(title)
                self.__writeintxt__(doc('#content').text())

                # Collect the previous / index / next links.
                nav_hrefs = [
                    a.attr.href
                    for a in doc('.bottem1 a').items()
                    if a.text() in self._nav_labels
                ]
                url = self._next_chapter_url(url, nav_hrefs)
                if url:
                    print(title, '\n', url)
            print('end')
        except RequestException:
            print('error')

    def getcharset(self, content):
        """Return every charset declared in ``<meta>`` tags of ``content``.

        Mirrors the charset detection in requests; kept so it can be
        customised later. Not used by the crawl itself. The matches are also
        printed.

        Args:
            content: HTML source as a string.

        Returns:
            A list of charset names, empty when none is declared.
        """
        found = self._charset_re.findall(content)
        print(found)
        return found

    def all_content(self):
        """Download the whole book starting from the first-chapter URL."""
        self.__getallchapter__(self.url)

    def getcontent(self, info):
        """Placeholder meant to take over part of ``__getallchapter__``."""
        pass

    def codemode(self, getrequest):
        """Choose the encoding used to decode a response.

        When the response reports ``Windows-1252`` or ``ISO-8859-1`` the page
        is decoded as ``gbk``; otherwise the response's ``apparent_encoding``
        is used.

        Args:
            getrequest: Response object exposing ``encoding`` and
                ``apparent_encoding``.

        Returns:
            The encoding name to assign to the response.
        """
        if getrequest.encoding in ['Windows-1252', 'ISO-8859-1']:
            return 'gbk'
        return getrequest.apparent_encoding