Environment: Python 3.7.0
Required library: requests
Step 1: fetch the page source (test URL: http://quanben5.com/n/yuzui/xiaoshuo.html)
import requests
from requests.exceptions import RequestException

# Fetch the page source
def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def main():
    url = 'http://quanben5.com/n/yuzui/xiaoshuo.html'
    html = get_one_page(url)
    print(html)

main()
Screenshot of the run:
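A small robustness note before moving on: requests guesses the response encoding from the HTTP headers, and Chinese sites are sometimes mis-detected, which shows up as mojibake in response.text. A minimal sketch of pinning the encoding, assuming the site serves UTF-8 (that encoding name is an assumption, and get_one_page_utf8 is a hypothetical variant):

import requests

def get_one_page_utf8(url):
    # Hypothetical variant of get_one_page that pins the encoding
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, timeout=10)
    response.encoding = 'utf-8'  # assumption: the site serves UTF-8
    if response.status_code == 200:
        return response.text
    return None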
Step 2: collect the URL of every chapter
This step also needs the re library.
import re
import requests
from requests.exceptions import RequestException

# Fetch the page source
def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

# Collect the URL of every chapter
def get_url(html):
    # Extract the chapter links with a regular expression.
    # Note: the original pattern was lost when this post was published; the
    # pattern below is an assumption based on the relative chapter links
    # (e.g. /n/yuzui/41935.html) on the contents page.
    URL = re.findall(r'<a href="(/n/[^/"]+/\d+\.html)"', html, re.S)
    list_url = []  # a list to hold every chapter's full URL
    for url_ in URL:
        list_url.append('http://quanben5.com' + url_)
    for url_ in list_url:
        print(url_)

def main():
    url = 'http://quanben5.com/n/yuzui/xiaoshuo.html'
    html = get_one_page(url)
    get_url(html)

main()
Screenshot of the result:
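Because the exact pattern depends on the page markup, it is worth testing the regex against a tiny HTML sample before pointing it at the live site. A minimal sketch (the sample markup below is invented for illustration and only mimics the real contents page):

import re

# Invented sample that mimics the chapter list; the real markup may differ
sample = '''
<li><a href="/n/yuzui/41935.html">Chapter 1</a></li>
<li><a href="/n/yuzui/41936.html">Chapter 2</a></li>
'''
links = re.findall(r'<a href="(/n/[^/"]+/\d+\.html)"', sample)
print(links)  # ['/n/yuzui/41935.html', '/n/yuzui/41936.html']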
Fetching a single chapter's content:
import re
import requests
from requests.exceptions import RequestException

# Fetch the page source
def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

'''
# Collect the URL of every chapter
def get_url(html):
    # Extract the chapter links with a regular expression
    URL = re.findall(r'<a href="(/n/[^/"]+/\d+\.html)"', html, re.S)
    list_url = []  # a list to hold every chapter's full URL
    for url_ in URL:
        list_url.append('http://quanben5.com' + url_)
    for url_ in list_url:
        print(url_)
'''

# Fetch the content of a single chapter
def get_content(html):
    # Note: both patterns were lost when this post was published; they are
    # assumptions based on the chapter-page markup (title in an <h1> tag,
    # body text in <p> tags).
    title = re.findall(r'<h1>(.*?)</h1>', html, re.S)
    title = title[0]
    print(title)
    content = re.findall(r'<p>(.*?)</p>', html, re.S)
    for sentence in content:
        print(sentence)

def main():
    url = 'http://quanben5.com/n/yuzui/41935.html'
    html = get_one_page(url)
    #get_url(html)
    get_content(html)

main()
Screenshot of the result:
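Text pulled out of HTML with a regex often still contains entities such as &nbsp; or &amp;. If the printed sentences show them, the standard library can decode them; a minimal sketch (the sample sentence is invented):

import html

sentence = 'Hello&nbsp;world &amp; welcome'           # invented example
clean = html.unescape(sentence).replace('\xa0', ' ')  # decode the entities
print(clean)  # Hello world & welcome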
Saving a single chapter to disk:
import re
import requests
from requests.exceptions import RequestException

# Fetch the page source
def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

'''
# Collect the URL of every chapter
def get_url(html):
    # Extract the chapter links with a regular expression
    URL = re.findall(r'<a href="(/n/[^/"]+/\d+\.html)"', html, re.S)
    list_url = []  # a list to hold every chapter's full URL
    for url_ in URL:
        list_url.append('http://quanben5.com' + url_)
    for url_ in list_url:
        print(url_)
'''

# Fetch the content of a single chapter
def get_content(html):
    # The patterns are assumptions; see the note in the previous step
    title = re.findall(r'<h1>(.*?)</h1>', html, re.S)
    title = title[0]
    print(title)
    write_to_file(title)
    content = re.findall(r'<p>(.*?)</p>', html, re.S)
    for sentence in content:
        print(sentence)
        write_to_file(sentence)
    write_to_file('\n')

# Save the content to a local file
def write_to_file(content):
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(content + '\n')

def main():
    url = 'http://quanben5.com/n/yuzui/41935.html'
    html = get_one_page(url)
    #get_url(html)
    get_content(html)

main()
Screenshot of the result:
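One side effect of opening the file in 'a' (append) mode is that every re-run keeps appending to the old result.txt. A minimal sketch of clearing the file at startup (an addition of mine, not part of the original script):

import os

# Delete any result from a previous run before downloading again
if os.path.exists('result.txt'):
    os.remove('result.txt')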
Downloading the whole novel:
import re
import requests
from requests.exceptions import RequestException

# Fetch the page source
def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

# Collect the URL of every chapter
def get_url(html):
    # Extract the chapter links with a regular expression (pattern is an
    # assumption; see the note in step 2)
    URL = re.findall(r'<a href="(/n/[^/"]+/\d+\.html)"', html, re.S)
    list_url = []  # a list to hold every chapter's full URL
    for url_ in URL:
        list_url.append('http://quanben5.com' + url_)
    return list_url

# Fetch the content of a single chapter
def get_content(html):
    title = re.findall(r'<h1>(.*?)</h1>', html, re.S)
    title = title[0]
    print(title + ' download started')
    write_to_file(title)
    content = re.findall(r'<p>(.*?)</p>', html, re.S)
    for sentence in content:
        write_to_file(sentence)
    write_to_file('\n')

# Save the content to a local file
def write_to_file(content):
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(content + '\n')

# Save every chapter to the local file
def save_content(list_url):
    for url_ in list_url:
        html_ = get_one_page(url_)
        get_content(html_)

def main():
    url = 'http://quanben5.com/n/yuzui/xiaoshuo.html'
    html = get_one_page(url)
    list_url = get_url(html)
    save_content(list_url)

main()
Screenshot of the result:
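Fetching every chapter in a tight loop hammers the server, and is likely what triggers the connection throttling mentioned at the end of this post. A minimal politeness sketch, reusing get_one_page and get_content from above; the one-second delay is an arbitrary assumption:

import time

def save_content(list_url):
    for url_ in list_url:
        html_ = get_one_page(url_)  # may return None on failure
        if html_ is not None:       # skip chapters that failed to load
            get_content(html_)
        time.sleep(1)               # pause between requests to be polite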
Adding a search feature:
After searching:
Use a regular expression to extract the target URL:
import re
import requests
from requests.exceptions import RequestException

# Define keyword globally so it can be used to name the text file later
keyword = input('Enter the novel you want to download: ')

# Fetch the page source
def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def main():
    url = 'http://quanben5.com/index.php?c=book&a=search&keywords=' + keyword
    html = get_one_page(url)
    # The pattern was lost when this post was published; it is an assumption
    # based on the search-result links (e.g. /n/yuzui)
    url1 = re.findall(r'<a href="(/n/[^/"]+)"', html, re.S)
    url1 = url1[0]
    print(url1)
    url2 = 'http://quanben5.com' + url1 + '/xiaoshuo.html'
    print(url2)

main()
Run it:
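One note on the search URL: the keyword is Chinese, so it must be percent-encoded when it goes into the query string. requests generally handles this on its own, but doing it explicitly with the standard library avoids surprises; a minimal sketch ('余罪' is just an example keyword):

from urllib.parse import quote

keyword = '余罪'  # example keyword
url = 'http://quanben5.com/index.php?c=book&a=search&keywords=' + quote(keyword)
print(url)  # the keyword is now percent-encoded UTF-8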
Sometimes the search finds nothing, and the program exits with an exception.
Next step: add exception handling:
import re
import requests
from requests.exceptions import RequestException

# Define keyword globally so it can be used to name the text file later
keyword = input('Enter the novel you want to download: ')

# Fetch the page source
def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def main():
    url = 'http://quanben5.com/index.php?c=book&a=search&keywords=' + keyword
    html = get_one_page(url)
    # Pattern is an assumption; see the note in the previous step
    url1 = re.findall(r'<a href="(/n/[^/"]+)"', html, re.S)
    if not url1:
        print('Nothing found!!!')
    else:
        url1 = url1[0]
        url2 = 'http://quanben5.com' + url1 + '/xiaoshuo.html'
        print(url2)

main()
Screenshot of the result:
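The same defensive habit applies one step earlier: get_one_page returns None on failure, and passing None to re.findall raises a TypeError. A minimal guard sketch (not in the original code; get_one_page and url as defined above):

html = get_one_page(url)
if html is None:
    print('Failed to fetch the search page')
else:
    url1 = re.findall(r'<a href="(/n/[^/"]+)"', html, re.S)
    if not url1:
        print('Nothing found!!!')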
-------------------------------------------- divider --------------------------------------------
Here is the complete code:
import re
import sys
import requests
from requests.exceptions import RequestException

# Define keyword globally so it can be used to name the text file
keyword = input('Enter the novel you want to download: ')
name = str(keyword) + '.txt'

# Fetch the page source
def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

# Collect the URL of every chapter
def get_url(html):
    # Extract the chapter links with a regular expression (pattern is an
    # assumption; see the note in step 2)
    URL = re.findall(r'<a href="(/n/[^/"]+/\d+\.html)"', html, re.S)
    list_url = []  # a list to hold every chapter's full URL
    for url_ in URL:
        list_url.append('http://quanben5.com' + url_)
    return list_url

# Fetch the content of a single chapter
def get_content(html):
    title = re.findall(r'<h1>(.*?)</h1>', html, re.S)
    title = title[0]
    print(title + ' download started')
    write_to_file(title)
    content = re.findall(r'<p>(.*?)</p>', html, re.S)
    for sentence in content:
        write_to_file(sentence)
    write_to_file('\n')

# Save the content to a local file
def write_to_file(content):
    with open(name, 'a', encoding='utf-8') as f:
        f.write(content + '\n')

# Save every chapter to the local file
def save_content(list_url):
    for url_ in list_url:
        html_ = get_one_page(url_)
        get_content(html_)

def main():
    url = 'http://quanben5.com/index.php?c=book&a=search&keywords=' + keyword
    html = get_one_page(url)
    # Pattern is an assumption; see the note in the search step
    url1 = re.findall(r'<a href="(/n/[^/"]+)"', html, re.S)
    if not url1:
        print('Nothing found!!!')
        flag = input('Quit? (Y or N): ')
        if flag in ('Y', 'y'):
            sys.exit()
        else:
            print("What can I do if it can't be found? I'm helpless too -.-||")
    else:
        url1 = url1[0]  # the novel's URL path
        url2 = 'http://quanben5.com' + url1 + '/xiaoshuo.html'  # the contents-page URL
        html2 = get_one_page(url2)
        list_url = get_url(html2)
        print(name + ' download started!!!')
        save_content(list_url)

main()
Finally, run it! (I hit the site a little too often while writing this post and got my connection throttled, so the later chapters are not shown.)
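If the site does start refusing connections mid-download, a simple retry with backoff is usually enough to resume. A minimal sketch (get_with_retry is a hypothetical helper; the retry count and delays are assumptions):

import time
import requests
from requests.exceptions import RequestException

def get_with_retry(url, headers, retries=3):
    # Try a few times, doubling the wait after each failure
    delay = 2
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return response.text
        except RequestException:
            pass
        time.sleep(delay)
        delay *= 2
    return None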