最近新买了kindle,想用kindle看海贼王漫画,看了一些制作mobi的方法都需要先把海贼王的漫画图片下下来,就想用python试试,免得一页一页保存。
找要爬的网站
检查网页源代码,可以找到图片地址
查看网页response返回的html,可以看到图片地址的末尾部分被存储在mhurl里面,下一张图片的则在mhurl1里面。
我都忘了CSS选择器之类的怎么用了,就直接用正则表达式匹配:先把mhurl里的地址找出来下载图片,再根据mhurl1来判断是否有下一张(其实好像都是16页,没啥需要判断的)
import requests
import re
import time
def get_one_page(url, headers, max_retries=5, retry_delay=1.0, timeout=10):
    """Fetch *url* and return the response once the server answers HTTP 200.

    Some pages do not load on the first request, so the request is retried.
    The retries are bounded so that a permanently failing URL cannot recurse
    or loop forever.

    Args:
        url: Address to request.
        headers: HTTP headers passed through to ``requests.get``.
        max_retries: Maximum number of attempts (at least one is made).
        retry_delay: Seconds to wait between two attempts.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        The ``requests.Response`` of the first attempt that returned 200.

    Raises:
        RuntimeError: If no attempt returned status 200.
    """
    attempts = max(1, max_retries)
    last_problem = None
    for attempt in range(1, attempts + 1):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Network-level failure (timeout, connection reset, ...): retry.
            last_problem = exc
        else:
            if response.status_code == 200:
                return response
            last_problem = 'HTTP status %s' % response.status_code
        if attempt < attempts:
            time.sleep(retry_delay)
    raise RuntimeError(
        'giving up on %s after %d attempts (%s)' % (url, attempts, last_problem)
    )
def downloadpic(url, num, filepath):
    """Download the comic image referenced by one reader page.

    The page HTML stores the tail of the image address in a JavaScript
    variable ``mhurl`` and the one for the following image in ``mhurl1``.
    The image is saved as ``<filepath>第<num>张.jpg``.

    Args:
        url: Address of the reader page.
        num: Page index, used in the output file name.
        filepath: Directory prefix for the output file; it is concatenated
            as-is, so it must already end with a path separator.

    Returns:
        True if the page declares ``mhurl1`` (a next image exists), False if
        it does not or if the page could not be fetched or parsed.
    """
    path1 = 'http://www-mipengine-org.mipcdn.com/i/p1.manhuapan.com/'
    # Raw strings: '\s' in a normal string literal is an invalid escape.
    pattern = re.compile(r'var mhurl="(.*)";var\sUrl=')
    pattern2 = re.compile(r'var mhurl1="(.*)"')  # presence => not the last page
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
    }

    page = get_one_page(url, headers)
    if page is None:
        print('could not fetch page: ' + url)
        return False
    html = page.text

    found = pattern.search(html)
    if found is None:
        # No image address on this page; nothing to download.
        print('no image address found on page: ' + url)
        return False

    path = path1 + found.group(1)
    print(path)

    image = get_one_page(path, headers=headers)
    if image is None:
        print('could not fetch image: ' + path)
    else:
        with open(filepath + '第' + str(num) + '张.jpg', 'wb') as fw:
            fw.write(image.content)

    return pattern2.search(html) is not None
def mkdir(path):
    """Create directory *path* (including parents) if it does not exist.

    Surrounding whitespace and trailing backslashes are removed from *path*
    before it is used.

    Args:
        path: Directory path to create.

    Returns:
        True if the directory was created, False if the path already existed.
    """
    import os

    path = path.strip()        # drop surrounding whitespace
    path = path.rstrip('\\')   # drop trailing backslashes
    try:
        # Attempt the creation directly instead of checking first, so there
        # is no window between the existence check and the creation.
        os.makedirs(path)
    except FileExistsError:
        return False
    return True
def downwords(url, num, basicpath=r'D:\manga\Download\海贼王', max_pages=16,
              delay=0.0):
    """Download all pages of one chapter into its own directory.

    The first page lives at *url* itself; page ``i`` (``i >= 1``) lives at
    ``<url>/index_<i>.html``. Downloading stops early as soon as
    ``downloadpic`` reports that there is no next page.

    Args:
        url: Base address of the chapter.
        num: Chapter number, used in the directory name ``第<num>话``.
        basicpath: Root directory under which the chapter directory is made.
        max_pages: Upper bound on the number of pages to try.
        delay: Seconds to sleep between two page downloads (0 disables it).
    """
    totalpath = basicpath + '\\第' + str(num) + '话'
    mkdir(totalpath)
    for i in range(max_pages):
        totalurl = url if i == 0 else url + '/index_' + str(i) + '.html'
        # downloadpic concatenates the prefix as-is, so pass the separator.
        if not downloadpic(totalurl, i, totalpath + '\\'):
            break
        if delay > 0:
            time.sleep(delay)
def main(start=945, stop=966):
    """Download every chapter numbered from *start* up to *stop* - 1.

    Args:
        start: First chapter number to download.
        stop: Chapter number at which to stop (exclusive).
    """
    basicurl = 'https://www.fzdm.com/manhua/2/'
    for chapter in range(start, stop):
        downwords(basicurl + str(chapter), chapter)


# Only start downloading when run as a script, not when imported.
if __name__ == '__main__':
    main()