import urllib.request
import re
import os
import time
'''
@Author:王磊
@Time :2018/11/10 15:39:02
'''
def get_html(url):
    '''Fetch the page at the given URL and return it as decoded text.'''
    page = urllib.request.Request(url)
    html = urllib.request.urlopen(page).read().decode("utf-8")
    return html
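# The site may reject bare requests that carry no browser headers. A minimal
# sketch of a hardened variant (get_html_safe is a hypothetical name, not from
# the original post), adding a User-Agent and a timeout:
def get_html_safe(url, timeout=10):
    '''Like get_html, but sends a browser-style User-Agent and enforces a timeout.'''
    req = urllib.request.Request(
        url,
        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"},
    )
    return urllib.request.urlopen(req, timeout=timeout).read().decode("utf-8")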
def get_all_index():
    '''Get the catalog URL and title of every novel on the site's listing page.'''
    html = get_html("http://book.zongheng.com/store/c0/c0/b0/u0/p1/v9/s1/t0/u0/i1/ALL.html")
    # NOTE: the HTML inside the original regexes was lost when this post was
    # scraped; the patterns below are plausible reconstructions and may need
    # adjusting to the site's current markup.
    reg = re.compile(r'<a href="(http://book\.zongheng\.com/book/\d+\.html)"[^>]*>(.*?)</a>')
    urls_names = re.findall(reg, html)
    res_list = []
    for url_name in urls_names:
        html_in = get_html(url_name[0])
        # Link to the chapter catalog ("showchapter") on the book's page.
        reg_in = re.compile(r'href="(http://book\.zongheng\.com/showchapter/\d+\.html)"')
        # Keep only the first catalog link so the book title always lands at index 1.
        url_mete = re.findall(reg_in, html_in)[:1]
        url_mete.append(url_name[1])
        res_list.append(url_mete)
    return res_list
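# The listing URL above hard-codes page 1 (the "p1" segment). Assuming the
# site paginates via that segment (an assumption, not confirmed by the post),
# a hypothetical helper could walk several listing pages:
def iter_index_pages(pages=5):
    '''Yield listing-page URLs for pages 1..pages (URL scheme assumed).'''
    base = "http://book.zongheng.com/store/c0/c0/b0/u0/p%d/v9/s1/t0/u0/i1/ALL.html"
    for n in range(1, pages + 1):
        yield base % n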
def get_urls_titles_list(html):
    '''From a chapter catalog page, get each chapter's content URL and title.'''
    # Reconstructed pattern (the original was lost to scraping): one group for
    # the chapter URL, one for the chapter title.
    reg = re.compile(r'<a href="(http://book\.zongheng\.com/chapter/\d+/\d+\.html)"[^>]*>(.*?)</a>')
    urls_titles = re.findall(reg, html)
    return urls_titles
def get_content(url):
    '''Fetch a chapter's text from its content URL.'''
    html = get_html(url)
    # Reconstructed pattern (the original was split and lost to scraping):
    # grab each paragraph of the chapter body; adjust to the current markup.
    reg1 = re.compile(r'<p>(.*?)</p>')
    # The original keeps all but the last match (presumably page boilerplate).
    content = re.findall(reg1, html)[0:-1]
    str1 = ""
    for str0 in content:
        str1 += str0 + "\r\n"
    return str1
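# Chapter titles can contain characters Windows forbids in file names
# (? * : " < > | / \), which would make the open() call in run() fail and the
# chapter land in the error list. A small hypothetical helper (not in the
# original post) to scrub them before building the path:
def sanitize_filename(name):
    '''Replace characters that are illegal in Windows file names.'''
    return re.sub(r'[\\/:*?"<>|]', "_", name)
# Usage in run() would be: filename = sanitize_filename(url_title[1]) + ".txt"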
def run():
    # Fetch the site's list of book titles and catalog URLs
    books = get_all_index()
    for book in books:
        # Create the storage directory for this book
        if not os.path.exists("C:\\Users\\asus\\Desktop\\pc\\story\\" + book[1]):
            os.mkdir("C:\\Users\\asus\\Desktop\\pc\\story\\" + book[1])
        # Fetch the catalog page
        html = get_html(book[0])
        # Extract the chapter URLs and titles
        urls_titles_list = get_urls_titles_list(html)
        print("*" * 10 + 'Starting download of book "%s"' % book[1] + "*" * 10)
        if len(urls_titles_list) != 0:
            list_err = []
            print("Starting chapter downloads!")
            for url_title in urls_titles_list:
                print("Downloading chapter:", url_title[1])
                try:
                    content = get_content(url_title[0])
                    with open("C:\\Users\\asus\\Desktop\\pc\\story\\" + book[1] + "\\" + url_title[1] + ".txt", "a", encoding="utf-8") as f:
                        f.write(content)
                    print("Chapter %s downloaded successfully!" % url_title[1])
                except Exception:
                    list_err.append(url_title[1])
                    print("Chapter %s failed to download!" % url_title[1])
                    continue
                # Throttle requests so the IP does not look overly active
                time.sleep(1)
            if len(list_err) == 0:
                print('All chapters of "%s" downloaded; no failures!' % book[1])
            else:
                print('The following chapters of "%s" failed to download:' % book[1])
                for errdownload in list_err:
                    print(errdownload)
        else:
            print("The site may have blacklisted this client for frequent requests, or the network failed. Retry later or change your IP address!")
            time.sleep(3)
if __name__ == "__main__":
    run()
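The script leans on a fixed one-second pause to avoid looking overly active to the site. A randomized delay tends to look less bot-like; here is a minimal sketch of a drop-in replacement for the `time.sleep(1)` call (polite_sleep is a hypothetical helper, not part of the original script):

import random
import time

def polite_sleep(low=1.0, high=3.0):
    '''Sleep for a random interval so request timing is less regular.'''
    time.sleep(random.uniform(low, high))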
More small Python crawler examples will follow in this series; feel free to follow along.