I implemented scrapers for two sites in total. The 漫画栈 one is at finished-product state; I haven't had time to work on this lately, so I'm writing a blog post to record where things stand.
Step one, import the libraries:
import json
import os
from time import sleep
import requests
from lxml import etree
import re
from RandomUAMiddleware import RandomUAMiddleware
from imgWebInfo import lenON, imgWebInfo
As before, define a function that fetches a web page:
def getHtml(url):
    try:
        h = RandomUAMiddleware()
        header = {
            'User-Agent': h.Agent
        }
        r = requests.get(url, headers=header)
        r.raise_for_status()
        # fix garbled text by guessing the page's real encoding
        r.encoding = r.apparent_encoding
        # sleep(5)  # optionally throttle requests
        return r
    except Exception as e:
        print(e)
        return None
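getHtml only relies on RandomUAMiddleware exposing an Agent attribute that returns a random User-Agent string. If you don't have the earlier post handy, a minimal stand-in could look like the sketch below (the UA strings are just examples, not the original list):

import random

class RandomUAMiddleware:
    # Minimal stand-in: .Agent returns a randomly chosen User-Agent string.
    _AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15',
    ]

    @property
    def Agent(self):
        return random.choice(self._AGENTS)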
Fetch every comic image URL on a chapter page:
def getImgUrl(html, listUrl, id_2):
    x = imgWebInfo(id_2)
    xhtml = etree.HTML(html.content.decode(x.charset))  # parse the page
    if x.flag3 == 0:
        # the image list lives in an inline <script>; split the statements
        # onto separate lines so the regexes stay within one statement
        scriptInfo = str(xhtml.xpath(x.imgScript))
        scriptInfo = scriptInfo.replace(';', '\n')
        imgUrl = re.findall(x.imgUrl, scriptInfo)
        path = re.findall(x.imgPath, scriptInfo)
        for elm in imgUrl:
            listUrl.append(elm)
        return path[0]
    else:
        # the image list comes from a JSON endpoint; the chapter and comic
        # ids are pulled out of the chapter link with a digit regex
        pageTitle = xhtml.xpath(x.pageTitle)
        l = re.findall(r'\d+', pageTitle[0])
        imgJson = x.getJson.replace('[\']', l[1])
        imgJson = imgJson.replace('[/]', l[0])
        jsonHtml = getHtml(imgJson)
        try:
            loadJson = json.loads(jsonHtml.text, strict=False)
            for u in loadJson['data']['page']:
                listUrl.append(u['image'])
        except Exception:
            print("Failed to fetch the image list")
        return ''
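To make the flag3 == 0 branch concrete: the chapter page embeds the image list in an inline script, and the two regexes from the site rules pull out the file names and the shared path prefix. The script content below is made up, but it shows why the replace(';', '\n') matters: splitting statements onto their own lines keeps the greedy path regex from running across statement boundaries, since '.' does not match a newline.

import re

script = "var chapterImages = ['10001.jpg','10002.jpg'];var chapterPath = 'images/comic/1/demo/';"
script = script.replace(';', '\n')
print(re.findall('1.+?.jpg', script))   # ['10001.jpg', '10002.jpg']
print(re.findall('images.+/', script))  # ['images/comic/1/demo/']

The flag3 == 1 branch instead expects the JSON endpoint to answer with a body shaped like {'data': {'page': [{'image': '...'}, ...]}}, which is exactly what the loop above reads.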
Download the images from the list of image URLs:
def downLoad(urlList, minPath, rootPath, id_1):
    x = imgWebInfo(id_1)
    # gufengmh stores bare file names, so prepend the CDN root;
    # mkzhan already returns full URLs
    if x.flag3 == 0:
        prefix = x.downloadRoot
    else:
        prefix = ''
    for u in range(0, len(urlList)):
        path = rootPath + str(u) + '.jpg'
        print(path)
        img_url = prefix + minPath + urlList[u]
        print("Downloading -----")
        try:
            if not os.path.exists(rootPath):
                os.makedirs(rootPath)
            if not os.path.exists(path):
                r = requests.get(img_url)
                with open(path, 'wb') as f:
                    f.write(r.content)
                print("Image downloaded", end='\n\n')
            else:
                print("File already exists", end='\n\n')
        except Exception as e:
            print(e)
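The two os.path.exists checks make re-runs cheap: already-downloaded pages are skipped, so an interrupted run can simply be restarted. For large images you may want to stream the body instead of buffering it whole; a sketch of that variant for a single file (downloadOne is my name for it, not part of the script above):

import os
import requests

def downloadOne(img_url, path):
    # Streamed variant: write the image in chunks instead of r.content at once.
    if os.path.exists(path):
        print("File already exists")
        return
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    with requests.get(img_url, stream=True, timeout=30) as r:
        r.raise_for_status()
        with open(path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    print("Image downloaded")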
Get the page URLs and names of every chapter from the comic's main page:
def getPageAllUrl(cartoon, CList, id_2):
    x = imgWebInfo(id_2)
    xhtml = etree.HTML(cartoon.content.decode(x.charset))
    newU = []
    if x.flag2 == 0:
        # chapter links can be read straight from the xpath
        url = xhtml.xpath(x.pageUrl)
        url = [str(p) for p in url]
        for o in url:
            newU.append(x.root + o)
        url = newU
    else:
        # chapter links hide in a custom data-hreflink attribute,
        # so pull them out of the raw HTML with a regex
        r = re.findall(r'data-hreflink.+?\.html', str(cartoon.text))
        for u in r:
            u = str(u).replace('data-hreflink="', '')
            newU.append(u)
        url = newU
    name = xhtml.xpath(x.pageName)
    name = [str(p) for p in name]
    newL = []
    # strip whitespace from the chapter names and drop empty entries
    for u in name:
        u = u.replace(' ', '')
        u = u.replace('\n', '')
        u = u.strip()
        newL.append(u)
    while '' in newL:
        newL.remove('')
    if x.flag2 == 0:
        for i_1 in range(0, len(newL)):
            CList.append([url[i_1], newL[i_1]])
    else:
        for i_1 in range(0, len(newL)):
            CList.append([x.root + url[i_1], newL[i_1]])
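The regex branch answers the question left in the site-rules comment further down about custom attributes: lxml's xpath can in fact read them directly with @data-hreflink, so the regex is one option rather than the only one. A quick demonstration on made-up markup:

from lxml import etree

html = '<ul><li><a data-hreflink="/208255/1332872.html">Chapter 1</a></li></ul>'
tree = etree.HTML(html)
print(tree.xpath('//a/@data-hreflink'))  # ['/208255/1332872.html']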
Get the comic's main page URL from its name:
def getCartoon(name, id_1):
    global getUrl, i
    x = imgWebInfo(id_1)
    if x.flag1 == 0:
        # the keyword can be appended straight onto the search URL
        getUrl = x.getUrl + name
    else:
        getUrl = x.getUrl.replace('[\']', name)
    html = getHtml(getUrl)
    xhtml = etree.HTML(html.content.decode(x.charset))  # parse the page
    if x.flag4 == 0:
        c = xhtml.xpath(x.cartoon)
        cartoonUrl = c[0]
    else:
        c = xhtml.xpath(x.cartoon)
        cartoonUrl = x.root + c[0]
    name = xhtml.xpath(x.cartoonTitle)
    print(name[0])
    i = int(input("Enter 1 to confirm the download\n"))
    if i != 1:
        return None, None
    return cartoonUrl, name
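flag1 only controls how the search keyword lands in the URL: plain concatenation versus substitution into a [\'] placeholder. A tiny illustration (the second template is hypothetical; neither site in WebList uses that style yet):

getUrl = 'https://www.gufengmh8.com/search/?keywords='
print(getUrl + 'demo')  # flag1 == 0: concatenate

template = 'https://example.com/search/[\']/1/'
print(template.replace('[\']', 'demo'))  # flag1 == 1: fill the placeholder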
To cut down on mistakes, pull the function calls together into one standalone helper:
def downloadCartoon(cList, cHtml, id_1, name=None):
    global i
    getPageAllUrl(cHtml, cList, id_1)
    # name is an xpath result list after a search, or a plain string when
    # the user typed it in; normalise to a string either way
    cName = name if isinstance(name, str) else str(name[0])
    for i in range(0, len(cList)):
        pageUrl = cList[i][0]
        pageName = cList[i][1]
        print(pageName, end='\n---------------------------\n')
        imgList = []
        # customise the output directory here if you like; empty also works
        rootP = '' + cName + '\\' + pageName + '\\'
        path = getImgUrl(getHtml(pageUrl), imgList, id_1)
        downLoad(imgList, path, rootP, id_1)
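With the empty prefix in rootP, the files land in a tree next to the script, one folder per chapter, images numbered in page order, roughly:

<comic name>\
    <chapter name>\
        0.jpg
        1.jpg
        ...

Note that the '\\' separators tie this layout to Windows; os.path.join would make it portable.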
The main function:
if __name__ == '__main__':
    # list the available sites with their ids
    for i in range(lenON().lenON):
        a = imgWebInfo(i)
        if a.flag == 1:
            flag = "poor search"
        else:
            flag = ""
        print(a.name + " id:(" + str(i) + ") " + flag, end=" | ")
        if i % 4 == 0 and i != 0:
            print()
    print("Enter 999 to quit")
    print("For sites flagged with poor search, try removing or swapping characters; if that still fails, go fetch the main-page URL of the comic yourself\nThe program exits automatically once the download finishes")
    i = 0
    while i != 1:
        try:
            cartoonList = []
            d = int(input("Id of the site to scrape:\n"))
            if d == 999:
                break
            elif d in lenON().notGoodId:
                q = int(input("Enter 1 to search by name, 2 to paste the comic's main-page URL\n"))
                # while/else: the else branch only runs when q == 2 up front
                while q != 2:
                    n = input("Name of the comic to search for:\n")
                    mainPage, n = getCartoon(n, d)
                    print(mainPage)
                    if mainPage is None:
                        continue
                    downloadCartoon(cartoonList, getHtml(mainPage), d, n)
                else:
                    cartUrl = str(input("Main-page URL of the comic to scrape:\n"))
                    b = input("Name to save the comic under:\n")
                    downloadCartoon(cartoonList, getHtml(cartUrl), d, b)
                    break
            elif d in lenON().badId:
                cartUrl = str(input("Main-page URL of the comic to scrape:\n"))
                b = input("Name to save the comic under:\n")
                downloadCartoon(cartoonList, getHtml(cartUrl), d, b)
                break
            elif d < 0 or d >= lenON().lenON:
                print("Please enter a valid id!\n")
                continue
            else:
                n = input("Name of the comic to scrape (fewer characters is fine; wrong or extra characters are not):\n")
                mainPage, n = getCartoon(n, d)
                if mainPage is None:
                    continue
                downloadCartoon(cartoonList, getHtml(mainPage), d, n)
                break
        except Exception as e:
            print(e)
            continue
The class that builds random request headers (RandomUAMiddleware) is in an earlier blog post.
The site-rule definitions:
WebList = [
    {
        'name': '古风漫画网',
        'root': 'https://www.gufengmh8.com',
        'getUrl': 'https://www.gufengmh8.com/search/?keywords=',
        'flag1': 0,
        'pageUrl': '//*[@id="chapters"]/div[3]/div[2]/ul/li/a/@href',
        'flag2': 0,
        'pageName': '//*[@id="chapters"]/div[3]/div[2]/ul/li/a/span/text()',
        'flag3': 0,
        'imgScript': '/html/body/script[1]/text()',
        'imgUrl': '1.+?.jpg',
        'imgPath': 'images.+/',
        'cartoon': '//*[@id="contList"]/li[1]/a/@href',
        'cartoonTitle': '//*[@id="contList"]/li[1]/a/@title',
        'flag': 1,
        'downloadRoot': 'https://res.xiaoqinre.com/',
        'flag4': 0,
        'charset': 'utf-8'
    },
    {
        'name': '漫画栈',
        'root': 'https://www.mkzhan.com',
        'getUrl': 'https://www.mkzhan.com/search/?keyword=',
        # custom data-hreflink attribute; extracted with a regex in
        # getPageAllUrl (xpath can also read it, see the note above)
        'pageUrl': '/html/body/div[2]/div[1]/div[2]/a/@hreflink',
        'flag1': 0,
        'flag2': 1,
        'pageName': '/html/body/div[3]/div[1]/div[1]/div[2]/ul/li//a/text()',
        'flag3': 1,
        'pageTitle': '/html/body/div[2]/div[1]/div[2]/h1/a/@href',
        'cartoonUrl': '/html/body/div[2]/div[1]/div[2]/a[2]/@href',
        'getJson': 'https://comic.mkzcdn.com/chapter/content/v1/?chapter_id=[\']&comic_id=[/]&format=1&quality=1&type=1',
        'cartoon': '/html/body/div[2]/div[1]/div[2]/a/@href',
        'cartoonTitle': '/html/body/div[2]/div[1]/div[2]/p[1]/a/text()',
        'flag': 0,
        'flag4': 1,
        'charset': 'utf-8'
    }
]
'''
flag1 == 0: the search keyword can be concatenated straight onto getUrl
flag2 == 0: pageUrl needs no concatenation and can be read directly
flag3 == 0: the image URLs can be read straight out of an inline script
flag  == 0: the site's search works properly
flag4 == 0: no concatenation is needed when fetching the comic's main page
'''
class imgWebInfo(object):
    def __init__(self, id):
        self.cartoonInfo = WebList[id]
        self.name = self.cartoonInfo['name']
        self.root = self.cartoonInfo['root']
        self.getUrl = self.cartoonInfo['getUrl']
        self.flag1 = self.cartoonInfo['flag1']
        self.pageUrl = self.cartoonInfo['pageUrl']
        self.flag2 = self.cartoonInfo['flag2']
        self.pageName = self.cartoonInfo['pageName']
        self.flag3 = self.cartoonInfo['flag3']
        # different sites expose the image URLs in different ways
        if self.flag3 == 0:
            self.imgScript = self.cartoonInfo['imgScript']
            self.imgUrl = self.cartoonInfo['imgUrl']
            self.imgPath = self.cartoonInfo['imgPath']
            self.downloadRoot = self.cartoonInfo['downloadRoot']
        else:
            self.pageTitle = self.cartoonInfo['pageTitle']
            self.cartoonUrl = self.cartoonInfo['cartoonUrl']
            self.getJson = self.cartoonInfo['getJson']
        self.cartoon = self.cartoonInfo['cartoon']
        self.cartoonTitle = self.cartoonInfo['cartoonTitle']
        self.flag = self.cartoonInfo['flag']
        self.flag4 = self.cartoonInfo['flag4']
        self.charset = self.cartoonInfo['charset']
class lenON(object):
    def __init__(self):
        self.lenON = len(WebList)
        self.badId = []         # sites whose search is unusable: main-page URL only
        self.notGoodId = [0]    # sites whose search is flaky: let the user choose
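Since the keys imgWebInfo reads depend on flag3, a small sanity check when adding a new WebList entry catches missing keys before a scrape fails halfway through. A sketch (checkEntry is my addition, not part of the original script):

def checkEntry(entry):
    # Verify the keys that imgWebInfo.__init__ will read for this entry.
    common = ['name', 'root', 'getUrl', 'flag1', 'pageUrl', 'flag2', 'pageName',
              'flag3', 'cartoon', 'cartoonTitle', 'flag', 'flag4', 'charset']
    script_keys = ['imgScript', 'imgUrl', 'imgPath', 'downloadRoot']  # flag3 == 0
    json_keys = ['pageTitle', 'cartoonUrl', 'getJson']                # flag3 != 0
    needed = common + (script_keys if entry['flag3'] == 0 else json_keys)
    missing = [k for k in needed if k not in entry]
    if missing:
        raise KeyError(entry.get('name', '?') + ' is missing keys: ' + str(missing))

for e in WebList:
    checkEntry(e)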