Scraping Comics from Comic Websites (Learning Web Scraping)

Two sites are implemented in total. The 漫画栈 scraper is in a finished state; I haven't had time to work on this lately, so I'm writing a post to record it.

Step 1: import the libraries:

import json
import os
import re
from time import sleep

import requests
from lxml import etree

from RandomUAMiddleware import RandomUAMiddleware
from imgWebInfo import lenON, imgWebInfo

As in earlier posts, define a function that fetches a web page:

def getHtml(url):
    try:
        h = RandomUAMiddleware()
        header = {
            'User-Agent': h.Agent
        }
        r = requests.get(url, headers=header)
        r.raise_for_status()
        # fix garbled text by letting requests guess the real encoding
        r.encoding = r.apparent_encoding
        # sleep(5)
        # print(r.text)
        return r
    except Exception as e:
        print(e)
        return ""   # empty string on failure, so callers can truth-test the result

Get the URLs of all comic images on a page:

def getImgUrl(html, listUrl, id_2):
    x = imgWebInfo(id_2)
    xhtml = etree.HTML(html.content.decode(x.charset))    # parse the page
    if x.flag3 == 0:
        # the image list lives in an inline script; pull it out with regexes
        scriptInfo = str(xhtml.xpath(x.imgScript))
        scriptInfo = scriptInfo.replace(';', '\n')
        imgUrl = re.findall(x.imgUrl, scriptInfo)
        path = re.findall(x.imgPath, scriptInfo)
        for elm in imgUrl:
            listUrl.append(elm)
        return path[0]
    else:
        # the images come from a JSON endpoint; build its URL from the ids in the page link
        pageTitle = xhtml.xpath(x.pageTitle)
        l = re.findall(r'\d+', pageTitle[0])
        imgJson = x.getJson.replace('[\']', l[1])
        imgJson = imgJson.replace('[/]', l[0])
        jsonHtml = getHtml(imgJson)
        # print(jsonHtml.text)
        try:
            loadJson = json.loads(jsonHtml.text, strict=False)
            for u in loadJson['data']['page']:
                listUrl.append(u['image'])
        except Exception:
            print("failed to get the images")
        return ''
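
To make the flag3 == 0 branch concrete, here is a standalone sketch; the two patterns are the ones configured for 古风漫画网 further down, but the script text itself is entirely made up:

import re

# fabricated inline-script content, shaped the way the regexes expect
script_text = ('var chapterImages = ["1504159130.jpg","1504159131.jpg"]\n'
               'var chapterPath = "images/comic/5430/15102/"')
print(re.findall('1.+?.jpg', script_text))    # ['1504159130.jpg', '1504159131.jpg']
print(re.findall('images.+/', script_text))   # ['images/comic/5430/15102/']

# For the flag3 == 1 branch, the parsing code implies the JSON endpoint
# returns roughly this shape (field values made up):
# {"data": {"page": [{"image": "https://example.com/0001.jpg"},
#                    {"image": "https://example.com/0002.jpg"}]}}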

Download the images from the list of image URLs:

def downLoad(urlList, minPath, rootPath, id_1):
    x = imgWebInfo(id_1)
    if x.flag3 == 0:
        base = x.downloadRoot
    else:
        base = ''   # the JSON endpoint already returns full image URLs
    for u in range(0, len(urlList)):
        path = rootPath + str(u) + '.jpg'
        print(path)
        img_url = base + minPath + urlList[u]
        print("downloading-----")
        try:
            if not os.path.exists(rootPath):
                os.makedirs(rootPath)
            if not os.path.exists(path):
                r = requests.get(img_url)
                with open(path, 'wb') as f:
                    f.write(r.content)
                print("image downloaded", end='\n\n')
            else:
                print("file already exists", end='\n\n')
        except Exception as e:
            print(e)
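
For the script-based site, the final image URL is just three strings concatenated; with the made-up values from the sketch above:

downloadRoot = 'https://res.xiaoqinre.com/'   # from the 古风漫画网 rules below
minPath = 'images/comic/5430/15102/'          # fabricated chapter path
filename = '1504159130.jpg'                   # fabricated file name
print(downloadRoot + minPath + filename)
# https://res.xiaoqinre.com/images/comic/5430/15102/1504159130.jpg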

Get the page URL and the name of every chapter from the comic's main page:

def getPageAllUrl(cartoon, CList, id_2):
    x = imgWebInfo(id_2)
    xhtml = etree.HTML(cartoon.content.decode(x.charset))
    newU = []
    if x.flag2 == 0:
        url = xhtml.xpath(x.pageUrl)
        url = [str(p) for p in url]
        for o in url:
            newU.append(x.root + o)
        url = newU
    else:
        # the chapter links sit in a custom data-hreflink attribute,
        # so fall back to a regex over the raw HTML
        r = re.findall(r'data-hreflink.+?\.html', str(cartoon.text))
        for u in r:
            u = str(u).replace('data-hreflink=\"', '')
            newU.append(u)
        url = newU
    name = xhtml.xpath(x.pageName)
    name = [str(p) for p in name]
    newL = []
    # strip whitespace and drop empty entries from the chapter names
    for u in name:
        u = u.replace(' ', '')
        u = u.replace('\n', '')
        u = u.strip()
        newL.append(u)
    while '' in newL:
        newL.remove('')
    if x.flag2 == 0:
        # the URLs were already prefixed with the site root above
        for i_1 in range(0, len(newL)):
            al = [url[i_1], newL[i_1]]
            CList.append(al)
    else:
        for i_1 in range(0, len(newL)):
            al = [x.root + url[i_1], newL[i_1]]
            CList.append(al)
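
The data-hreflink workaround in isolation, against fabricated markup (lxml's XPath could also read the attribute directly with //a/@data-hreflink; the regex is simply the route taken here):

import re

html_text = '<a class="chapter" data-hreflink="/209871/916561.html">Chapter 1</a>'   # fabricated
links = [m.replace('data-hreflink="', '')
         for m in re.findall(r'data-hreflink.+?\.html', html_text)]
print(links)   # ['/209871/916561.html']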

Get the comic's main-page URL from its name:

def getCartoon(name, id_1):
    global i   # i doubles as the exit flag for the main loop below
    x = imgWebInfo(id_1)
    if x.flag1 == 0:
        getUrl = x.getUrl + name
    else:
        getUrl = x.getUrl.replace('[\']', name)
    html = getHtml(getUrl)
    xhtml = etree.HTML(html.content.decode(x.charset))  # parse the page
    if x.flag4 == 0:
        c = xhtml.xpath(x.cartoon)
        cartoonUrl = c[0]
    else:
        c = xhtml.xpath(x.cartoon)
        cartoonUrl = x.root + c[0]
    name = xhtml.xpath(x.cartoonTitle)
    print(name[0])
    i = int(input("Enter 1 to confirm the download\n"))
    if i != 1:
        return None, None   # both values None so the caller's unpacking still works
    return cartoonUrl, name

To cut down on mistakes, the calls are pulled together into one standalone function:

def downloadCartoon(cList, cHtml, id_1, name=None):
    global i
    getPageAllUrl(cHtml, cList, id_1)
    for i in range(0, len(cList)):
        pageUrl = cList[i][0]
        pageName = cList[i][1]
        print(pageName, end='\n---------------------------\n')
        imgList = []
        print(name)
        # name is a string when entered by hand and a list when it comes from getCartoon
        title = name if isinstance(name, str) else str(name[0])
        # customize the save directory here if you like; an empty prefix also works
        rootP = '' + title + '\\' + pageName + '\\'
        path = getImgUrl(getHtml(pageUrl), imgList, id_1)
        downLoad(imgList, path, rootP, id_1)
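
Wiring it together for a single comic would look like this (the URL and name are hypothetical):

cartoonList = []
downloadCartoon(cartoonList, getHtml('https://www.mkzhan.com/209871/'), 1, 'SomeComic')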

The main function:

if __name__ == '__main__':
    # list the configured sites with their ids
    for i in range(lenON().lenON):
        a = imgWebInfo(i)
        if a.flag == 1:
            flag = "poor search"
        else:
            flag = ""
        print(a.name + " id:(" + str(i) + ") " + flag, end=" |  ")
        if i % 4 == 0 and i != 0:
            print()
    print("Enter 999 to quit")
    print("For sites with poor search, try removing or swapping characters; failing that, go grab the comic's main-page URL yourself\nThe program exits automatically once the download finishes")
    i = 0
    while i != 1:
        try:
            cartoonList = []
            d = int(input("id of the site to scrape:\n"))
            if d == 999:
                break
            elif d in lenON().noeGoodId:
                q = int(input("Enter 1 to search by name, or 2 to supply the comic's main-page URL\n"))
                # Python's while/else: entering 2 skips the loop body and runs
                # the else branch; entering 1 keeps looping the name search
                while q != 2:
                    n = input("Name of the comic to search for:\n")
                    mainPage, n = getCartoon(n, d)
                    if mainPage is None:
                        continue
                    print(mainPage)
                    print(n)
                    downloadCartoon(cartoonList, getHtml(mainPage), d, n)
                else:
                    cartUrl = str(input(str(d) + " main-page URL of the comic to scrape:\n"))
                    b = input("Name to save the comic as:\n")
                    downloadCartoon(cartoonList, getHtml(cartUrl), d, b)
                    break
            elif d in lenON().badId:
                cartUrl = str(input("Main-page URL of the comic to scrape:\n"))
                b = input("Name to save the comic as:\n")
                downloadCartoon(cartoonList, getHtml(cartUrl), d, b)
                break
            elif d >= lenON().lenON or d < 0:
                print("Please enter a valid id!\n")
                continue
            else:
                n = input("Name of the comic to scrape (missing characters are fine; wrong or extra ones are not):\n")
                mainPage, n = getCartoon(n, d)
                if mainPage is None:
                    continue
                downloadCartoon(cartoonList, getHtml(mainPage), d, n)
                break
        except Exception as e:
            print(e)
            continue

The class that provides the request headers is in an earlier post.
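
If you don't have that post handy, here is a minimal stand-in that matches how the class is used above (only the .Agent attribute is read); the UA strings are just examples:

import random


class RandomUAMiddleware(object):
    # minimal stand-in: expose a random User-Agent string as .Agent
    UA_POOL = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 '
        '(KHTML, like Gecko) Version/17.0 Safari/605.1.15',
    ]

    def __init__(self):
        self.Agent = random.choice(self.UA_POOL)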

The site-rules class:

WebList = [
    {
        'name': '古风漫画网',
        'root': 'https://www.gufengmh8.com',
        'getUrl': 'https://www.gufengmh8.com/search/?keywords=',
        'flag1': 0,
        'pageUrl': '//*[@id="chapters"]/div[3]/div[2]/ul/li/a/@href',
        'flag2': 0,
        'pageName': '//*[@id="chapters"]/div[3]/div[2]/ul/li/a/span/text()',
        'flag3': 0,
        'imgScript': '/html/body/script[1]/text()',
        'imgUrl': '1.+?.jpg',
        'imgPath': 'images.+/',
        'cartoon': '//*[@id="contList"]/li[1]/a/@href',
        'cartoonTitle': '//*[@id="contList"]/li[1]/a/@title',
        'flag': 1,
        'downloadRoot': 'https://res.xiaoqinre.com/',
        'flag4': 0,
        'charset': 'utf-8'
    },
    {
        'name': '漫画栈',
        'root': 'https://www.mkzhan.com',
        'getUrl': 'https://www.mkzhan.com/search/?keyword=',
        'flag1': 0,
        'pageUrl': '/html/body/div[2]/div[1]/div[2]/a/@hreflink',      # how do you extract a custom attribute like this??????
        'flag2': 1,
        'pageName': '/html/body/div[3]/div[1]/div[1]/div[2]/ul/li//a/text()',
        'flag3': 1,
        'pageTitle': '/html/body/div[2]/div[1]/div[2]/h1/a/@href',
        'cartoonUrl': '/html/body/div[2]/div[1]/div[2]/a[2]/@href',
        'getJson': 'https://comic.mkzcdn.com/chapter/content/v1/?chapter_id=[\']&comic_id=[/]&format=1&quality=1&type=1',
        'cartoon': '/html/body/div[2]/div[1]/div[2]/a/@href',
        'cartoonTitle': '/html/body/div[2]/div[1]/div[2]/p[1]/a/text()',
        'flag': 0,
        'flag4': 1,
        'charset': 'utf-8'
    }
]

'''
    flag1 == 0 means getUrl works by direct string concatenation
    flag2 == 0 means pageUrl needs no concatenation and can be read directly
    flag3 == 0 means the image URLs can be read straight from the page's script
    flag  == 0 means the site's search works properly
    flag4 == 0 means no concatenation is needed when fetching the comic's main page
'''


class imgWebInfo(object):
    def __init__(self, id):
        self.cartoonInfo = WebList[id]
        self.name = self.cartoonInfo['name']
        self.root = self.cartoonInfo['root']
        self.getUrl = self.cartoonInfo['getUrl']
        self.flag1 = self.cartoonInfo['flag1']
        self.pageUrl = self.cartoonInfo['pageUrl']
        self.flag2 = self.cartoonInfo['flag2']
        self.pageName = self.cartoonInfo['pageName']
        self.flag3 = self.cartoonInfo['flag3']
        # different sites expose their image URLs in different ways
        if self.flag3 == 0:
            self.imgScript = self.cartoonInfo['imgScript']
            self.imgUrl = self.cartoonInfo['imgUrl']
            self.imgPath = self.cartoonInfo['imgPath']
            self.downloadRoot = self.cartoonInfo['downloadRoot']
        else:
            self.pageTitle = self.cartoonInfo['pageTitle']
            self.cartoonUrl = self.cartoonInfo['cartoonUrl']
            self.getJson = self.cartoonInfo['getJson']

        self.cartoon = self.cartoonInfo['cartoon']
        self.cartoonTitle = self.cartoonInfo['cartoonTitle']
        self.flag = self.cartoonInfo['flag']
        self.flag4 = self.cartoonInfo['flag4']
        self.charset = self.cartoonInfo['charset']
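
Usage is straightforward:

x = imgWebInfo(0)
print(x.name, x.charset)   # 古风漫画网 utf-8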


class lenON(object):
    def __init__(self):
        self.lenON = len(WebList)   # number of configured sites
        self.badId = []             # ids that only work by pasting a main-page URL
        self.noeGoodId = [0]        # ids whose built-in search is unreliable

 
