# 现在爬小说越来越难了,受到很多限制,格式也有很多变化。这里是一些用过的旧代码,可以改改再继续用。
# Scrape a novel from biqugexx.com.
# Flow: fetch the table-of-contents page, pull one (href, title) pair per
# chapter link, download each chapter page, strip markup, write one text file.
from urllib.request import urlopen
from requests.exceptions import RequestException
import re
from requests import get
import time
import requests
from lxml import etree

requests.adapters.DEFAULT_RETRIES = 5  # retry dropped connections
_session = requests.session()
_session.keep_alive = False  # close sockets eagerly instead of pooling them

_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
}
_PARAMS = {'enc': 'utf-8'}


def _fetch_decoded(url):
    """GET *url* and return its body re-decoded with the sniffed charset.

    The charset the server declares does not always match the real one, so
    round-trip through bytes using ``response.apparent_encoding``.
    """
    response = _session.get(url=url, params=_PARAMS, headers=_HEADERS)
    return response.text.encode(response.encoding).decode(response.apparent_encoding)


def scrape_biqugexx(index_url="https://www.biqugexx.com/0_427/",
                    out_path="biquge.txt"):
    """Download every chapter linked from *index_url* into *out_path*.

    Chapters whose request fails are logged and skipped; the rest are kept.
    """
    content = _fetch_decoded(index_url)
    # TOC links look like:  <a href='/28/28056/21593229.html' >第六十一章 ...</a>
    # NOTE(review): the original regex was destroyed when HTML markup was
    # stripped from this file; this is a reconstruction with the two capture
    # groups the loop below needs -- verify against a live index page.
    toc_pattern = re.compile(r"<a\s+href='(/[0-9_]+/[0-9]+)\.html'\s*>\s*(第[0-9]+章[^<]*)</a>")
    parts = []  # accumulate pieces; ''.join at the end avoids quadratic +=
    for href, title in toc_pattern.findall(content):
        time.sleep(3)  # throttle: frequent requests get the IP banned
        parts.append("\n" + title + "\n")
        chapter_url = "https://www.biqugexx.com" + href + ".html"
        print(title)
        print(chapter_url)
        try:
            page = _fetch_decoded(chapter_url)
            # Body text runs from the content block up to the author's "ps."
            # footer.  NOTE(review): reconstructed anchor -- confirm on site.
            for raw in re.findall(r'<div id="content">(.*?)ps\.', page, re.S):
                # &nbsp; indents and <br/> breaks -> plain text
                parts.append(raw.replace("&nbsp;", " ").replace("<br/>", "\n"))
        except RequestException as exc:
            print(exc)  # best effort: skip this chapter, keep the rest
    with open(out_path, "w+", encoding="utf8") as f:
        f.write("".join(parts))


if __name__ == "__main__":
    scrape_biqugexx()
# Scrape the same novel layout from bokan.cc (mirror of the biqugexx script).
from urllib.request import urlopen
from requests.exceptions import RequestException
import re
from requests import get
import time
import requests
from lxml import etree

requests.adapters.DEFAULT_RETRIES = 5  # retry dropped connections
_session = requests.session()
_session.keep_alive = False  # close sockets eagerly instead of pooling them

_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
}
_PARAMS = {'enc': 'utf-8'}


def _fetch_decoded(url):
    """GET *url* and return its body re-decoded with the sniffed charset.

    The charset the server declares does not always match the real one, so
    round-trip through bytes using ``response.apparent_encoding``.
    """
    response = _session.get(url=url, params=_PARAMS, headers=_HEADERS)
    return response.text.encode(response.encoding).decode(response.apparent_encoding)


def scrape_bokan(index_url="https://www.bokan.cc/12/12620/",
                 out_path="biquge.txt"):
    """Download every chapter linked from *index_url* into *out_path*.

    NOTE(review): the original script also wrote to ``biquge.txt`` with mode
    ``w+``, so it clobbers the biqugexx script's output -- kept as-is.
    """
    content = _fetch_decoded(index_url)
    # One (relative-href, title) pair per chapter link.
    # NOTE(review): reconstructed regex -- the original was destroyed when
    # HTML markup was stripped from this file; verify on a live index page.
    toc_pattern = re.compile(r"<a\s+href='(/[0-9_]+/[0-9]+)\.html'\s*>\s*(第[0-9]+章[^<]*)</a>")
    parts = []  # accumulate pieces; ''.join at the end avoids quadratic +=
    for href, title in toc_pattern.findall(content):
        time.sleep(3)  # throttle: frequent requests get the IP banned
        parts.append("\n" + title + "\n")
        chapter_url = "https://www.bokan.cc" + href + ".html"
        print(title)
        print(chapter_url)
        try:
            page = _fetch_decoded(chapter_url)
            # NOTE(review): the original chapter-body pattern was lost;
            # reconstructed content-div anchor -- confirm against the site.
            for raw in re.findall(r'<div id="content">(.*?)</div>', page, re.S):
                # &nbsp; indents and <br/> breaks -> plain text
                parts.append(raw.replace("&nbsp;", " ").replace("<br/>", "\n"))
        except RequestException as exc:
            print(exc)  # best effort: skip this chapter, keep the rest
    with open(out_path, "w+", encoding="utf8") as f:
        f.write("".join(parts))


if __name__ == "__main__":
    scrape_bokan()
# Scrape a novel from xsbiquge.com, resuming a partially finished download.
import requests
import re
import time


def _strip_tags(text):
    """Remove every ``<...>`` HTML tag from *text* with a linear scan.

    The original loop called ``str.find`` the same way but looped forever
    when a ``<`` had no closing ``>``; that case now terminates.
    """
    while True:
        start = text.find("<")
        if start == -1:
            break
        end = text.find(">", start + 1)
        if end == -1:  # unmatched '<' -- nothing more to strip safely
            break
        text = text[:start] + text[end + 1:]
    return text


def scrape_xsbiquge(index_url='https://www.xsbiquge.com/91_91879/',
                    out_path='title.txt', resume_after=1000):
    """Append every chapter numbered above *resume_after* to *out_path*.

    The original run had already saved chapters 1..1000, hence the default
    threshold; pass ``resume_after=0`` to fetch everything.
    """
    session = requests.Session()
    index = session.get(index_url)
    index.encoding = 'utf-8'
    # NOTE(review): every regex below is a reconstruction -- the original
    # patterns were destroyed when HTML markup was stripped from this file.
    # Verify them against the live pages before relying on this script.
    hrefs = re.findall(r"<a href='(/91_91879/[0-9]+\.html)'>", index.text)  # chapter links
    numbers = re.findall(r"第([0-9]+)章", index.text)                        # chapter numbers
    print(len(hrefs), len(numbers))
    with open(out_path, 'a', encoding='utf-8') as out:  # 'a': resume, don't clobber
        # zip replaces the original manual index counter k; the original also
        # rebound its own loop list inside the loop, which is avoided here.
        for href, number in zip(hrefs, numbers):
            chapter_no = int(number)
            print(chapter_no)
            if chapter_no <= resume_after:
                continue  # already downloaded on a previous run
            time.sleep(1)  # throttle
            page = session.get('https://www.xsbiquge.com' + href)
            page.encoding = 'utf-8'
            # Chapter title, e.g. <h1>第1001章 ...</h1>
            name = re.findall(r'<h1>(.*?)</h1>', page.text)[0]
            print(name)
            out.write(name + '\n')
            # Chapter body; strip the site's boilerplate and ad injections.
            body = re.findall(r'<div id="content">(.*?)</div>', page.text, re.S)[0]
            body = body.replace('&nbsp;', '')        # indent entities
            body = body.replace('readx();', '')      # injected ad-script call
            body = body.replace('&lt;!--go--&gt;', '')
            body = body.replace('<!--go-->', '')
            body = body.replace('()', '')
            body = body.replace('<br/>', '\n')       # line breaks -> newlines
            body = _strip_tags(body)
            out.write(body + '\n')


if __name__ == "__main__":
    scrape_xsbiquge()