python爬漫画,一人之下漫画,多线程快速下载

下载漫画的代码,这个网站本身就是盗版的,也没有什么反爬,就是在下载图片的时候,在请求头里面有个referer需要注意下,需要带上是第几页,其他的就是正常下载了。

下载思路是在目录页里面获取所有章节的url,然后遍历所有章节url,在下载过程中加了多线程来下载每个章节下的图片。

import requests, os, re, threading


class YiRenZhiXia():
    """Multi-threaded downloader for the comic "Yi Ren Zhi Xia" from m.bnmanhua.com.

    Workflow: scrape every chapter URL from the table-of-contents page, then
    for each chapter fetch its image URL list and download the images, one
    thread per image.  The image host validates the Referer header, which
    must point at the chapter page (carrying the page number for pages
    after the first).
    """

    def __init__(self):
        # Mobile UA — m.bnmanhua.com serves the mobile version of the site.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Referer': 'https://www.lbsulu.com/mh/yirenzhixia/160246.html',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Mobile Safari/537.36',
        }
        self.comic_chapter_list_url = 'https://m.bnmanhua.com/comic/15.html'  # table-of-contents URL
        self.chapter_prefix = 'https://m.bnmanhua.com'  # chapter URLs scraped from the ToC are relative
        self.img_prefix = 'https://img.yaoyaoliao.com/'  # image URLs scraped from a chapter are relative too
        self.save_dir = 'E:\一人之下漫画'  # download target folder on disk

    def get_chapter_url(self):
        """Return a list of (relative_chapter_url, chapter_title) tuples
        scraped from the table-of-contents page."""
        response = requests.get(self.comic_chapter_list_url, headers=self.headers).text
        # NOTE(review): the original regex was destroyed when this file was
        # pasted (bullet characters replaced it).  This pattern is
        # reconstructed from the site's <li><a ...> chapter-list markup and
        # from run() unpacking (url, title) pairs — verify it against the
        # live page before relying on it.
        chapter_url_list = re.findall(r'<li><a href="(.*?)"[^>]*>(.*?)</a></li>', response)
        return chapter_url_list

    def get_img_url(self, url):
        """Fetch one chapter page and return its list of raw image-URL entries.

        The page embeds the list as JavaScript:  var z_img='[ ... ]';
        so the regex grabs the array body and splits it on commas.
        """
        response = requests.get(self.chapter_prefix + url, headers=self.headers).text
        img_url_list = re.findall(r" var z_img='\[(.*?)\]';", response)
        return img_url_list[0].split(',')  # one JSON-ish string entry per image

    def save(self, i, url, referer, title):
        """Download image number *i* (0-based) of chapter *title* to disk.

        The image host checks the Referer header, so a per-request header
        dict with the chapter-page referer is used instead of self.headers.
        """
        headers = {
            # BUG FIX: the original dict had spaces inside the header names
            # ('accept - encoding', 'accept - language', ...), which made
            # them invalid/ignored header fields — normalized here.
            'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Referer': referer,
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        }
        response = requests.get(url, headers=headers).content  # raw image bytes
        chapter_dir = self.save_dir + '/' + title
        # exist_ok=True removes the check-then-mkdir race between the many
        # threads that all try to create the chapter folder at once.
        os.makedirs(chapter_dir, exist_ok=True)
        with open(chapter_dir + '/' + str(i + 1) + '.' + title + '.jpg', 'wb') as f:
            f.write(response)
        print('正在下载:' + title + '-' + str(i + 1))

    def run(self):
        """Walk every chapter and download all of its images, one thread per image."""
        # BUG FIX: the original os.mkdir raised FileExistsError on every run
        # after the first; makedirs with exist_ok=True is idempotent.
        os.makedirs(self.save_dir, exist_ok=True)
        chapter_url_list = self.get_chapter_url()
        for chapter_url, chapter_title in chapter_url_list:
            print('-------------------------------------------开始章节:' + chapter_title + '-------------------------------------------')
            img_url_list = self.get_img_url(chapter_url)
            threads = []
            for i in range(len(img_url_list)):
                # Page 1 uses the bare chapter URL; later pages carry the
                # page number.  NOTE(review): the pasted code appended the
                # digit directly onto the '.html' URL, but the original's
                # own commented example showed the '?p=N' query form, which
                # is used here — confirm against the live site.
                if i == 0:
                    referer = self.chapter_prefix + chapter_url
                else:
                    referer = self.chapter_prefix + chapter_url + '?p=' + str(i + 1)
                # Entries look like "\"images\/comic\/...jpg\"" — strip the
                # surrounding quotes and the JS backslash escapes, then
                # prefix the image host.
                url = self.img_prefix + img_url_list[i].split('"')[1].replace('\\', '')
                t = threading.Thread(target=self.save, args=(i, url, referer, chapter_title))
                threads.append(t)
            for t in threads:
                t.start()
            for t in threads:
                t.join()


if __name__ == '__main__':
    yirenzhixia = YiRenZhiXia()
    yirenzhixia.run()

    你可能感兴趣的:(python)