Crawler example: downloading a comic with a single thread and with multiple threads

Due to copyright concerns, the specific website is not named here.

Problems encountered

  • When requesting chapter after chapter, call response.close() on each response; otherwise too many connections stay open to the server and new ones can no longer be made.
  • When saving an image with urllib.request, the image URL must not contain Chinese characters; percent-encode it first with
    request.quote(url, safe=";/?:@&=+$,", encoding="utf-8")
    See https://blog.csdn.net/a12355556/article/details/113726856 for details. Both points are illustrated in the sketch right after this list.
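
A minimal sketch of both workarounds (the page and image URLs below are placeholders, not the real site): it closes each response via requests' context-manager support so the connection is released, and percent-encodes the image URL with urllib's quote before handing it to urlretrieve.

import requests
from urllib import parse, request

# Placeholder URLs for illustration only.
page_url = 'http://comic.example.com/comiclist/1/1/1.htm'
img_url = 'http://img.example.com/漫画/0001.png'   # contains Chinese characters

# Closing the response (here via the context manager) releases the connection,
# so requesting chapter after chapter does not exhaust the server's connections.
with requests.get(page_url) as resp:
    html = resp.content.decode('gbk')

# urlretrieve cannot handle unescaped non-ASCII characters, so percent-encode
# everything except the reserved URL characters first.
safe_url = parse.quote(img_url, safe=";/?:@&=+$,", encoding="utf-8")
request.urlretrieve(safe_url, '0001.png')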

Single-threaded crawl

Use regular expressions to extract each chapter's name, its URL, and all of the image URLs it contains, then save the images with urllib.request.

Code

import os
import re

import requests
from urllib import request


def get_chapters(url):
    # The site serves GBK-encoded HTML.
    r = requests.get(url).content.decode('gbk')
    # NOTE: the HTML tags inside the original regular expressions were lost
    # when the article was published; the patterns below are reconstructions
    # and may need adjusting to the real page markup.
    novel_name = re.findall(r'<title>(.*?)</title>', r)[0]
    if not os.path.exists(novel_name):
        os.mkdir(novel_name)
    # Chapter links look like /comiclist/<id>/<id>/<page>.htm; each one appears
    # several times on the page, so keep only every 4th match.
    chapters_urls = ['http://comic.kkkkdm.com' + i
                     for i in re.findall(r'(/comiclist/\d+/\d+/\d+\.htm)', r)][::4]
    # Chapter names: keep anchor texts longer than two characters.
    names = [i for i in re.findall(r'>(.*?)</a>', r) if isinstance(i, str) and len(i) > 2]
    for i in range(len(names)):
        num = 0
        ch_name = novel_name + '\\' + names[i]
        if not os.path.exists(ch_name):
            print(names[i] + ' downloading...')
            os.mkdir(ch_name)
        try:
            imgs = one_chap_urls(chapters_urls[i])
            for img_url in imgs:
                print(img_url)
                save(ch_name + '\\{}.png'.format(num), img_url)
                num += 1
            print(ch_name + ' finished')
        except Exception:
            print(names[i] + ' failed')
            continue


def one_chap_urls(url):
    """Return the image URL of every page in one chapter."""
    imgs = []
    r = requests.get(url).content.decode('gbk')
    # The chapter page states its page count as "共N页" ("N pages in total").
    num = int(re.search(r'共(\d+)页', r).group(1))
    urls = [re.sub(r'\d+\.htm', '', url) + '{}.htm'.format(i) for i in range(1, num + 1)]
    for url in urls:
        try:
            r = requests.get(url).content.decode('gbk')
            # The image path is assembled in JavaScript ("+m201304d+"); rebuild it.
            img = 'http://ss1.kkkkdm.com' + re.findall('IMG SRC=(.*?)>', r)[0] \
                .replace('"+m201304d+"', '/').replace("'", '')
            imgs.append(img)
        except Exception:
            continue
    return imgs


def save(name, url):
    """Download one image; if the plain URL fails, percent-encode it and retry."""
    try:
        if not os.path.exists(name):
            request.urlretrieve(url, name)
    except Exception:
        # Chinese characters in the URL break urlretrieve, so quote them.
        url = request.quote(url, safe=";/?:@&=+$,", encoding="utf-8")
        if not os.path.exists(name):
            request.urlretrieve(url, name)


if __name__ == '__main__':
    url = input("Enter the comic's table-of-contents URL: ")
    get_chapters(url)

Multi-threaded crawl

First, every chapter URL is put into the chapter_queue queue. Producer threads take a chapter URL, parse it into the image URLs that need downloading, and put those into the img_queue queue;
consumer threads take URLs from img_queue and download them as image files.
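
Before the full script, here is a minimal, self-contained sketch of the two-queue producer/consumer hand-off. parse_chapter, download, and the URLs are placeholders, and the sketch shuts its consumers down with sentinel values, whereas the script below instead polls the queues with empty().

import threading
from queue import Queue, Empty

# Placeholder stand-ins for the real parsing and downloading steps.
def parse_chapter(chapter_url):
    return ['{}/img_{}.png'.format(chapter_url, i) for i in range(3)]

def download(img_url):
    print('downloading ' + img_url)

chapter_queue = Queue()
img_queue = Queue()
for i in range(5):
    chapter_queue.put('http://example.com/chapter/{}'.format(i))

def producer():
    # Turn each chapter URL into image URLs and queue them for the consumers.
    while True:
        try:
            chapter = chapter_queue.get_nowait()
        except Empty:
            break
        for img in parse_chapter(chapter):
            img_queue.put(img)

def consumer():
    # Download image URLs until a sentinel (None) says there is no more work.
    while True:
        img = img_queue.get()
        if img is None:
            break
        download(img)

producers = [threading.Thread(target=producer) for _ in range(2)]
consumers = [threading.Thread(target=consumer) for _ in range(4)]
for t in producers + consumers:
    t.start()
for t in producers:
    t.join()
for _ in consumers:
    img_queue.put(None)          # one sentinel per consumer
for t in consumers:
    t.join()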

Code

import os
import re
import threading
import time
from queue import Queue, Empty
from urllib import request

import requests

novel_name = ''


class Producer(threading.Thread):
    """Takes chapter URLs from chapter_queue, parses them, and puts
    (image URL, target filename) pairs into img_queue."""

    def __init__(self, chapter_queue, img_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.chapter_queue = chapter_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            try:
                # Non-blocking get, so a producer exits cleanly when another
                # thread grabs the last chapter first.
                url, name = self.chapter_queue.get_nowait()
            except Empty:
                break
            self.parse_chapter(url, name)

    def parse_chapter(self, url, name):
        n = 0
        ch_name = os.getcwd() + '\\' + novel_name + '\\' + name + '\\'
        if not os.path.exists(ch_name):
            os.mkdir(ch_name)
        try:
            imgs = self.one_chap_urls(url)
            for img_url in imgs:
                img_name = ch_name + '{}.png'.format(n)
                self.img_queue.put((img_url, img_name))
                n += 1
        except Exception as e:
            print(e)

    def one_chap_urls(self, url):
        imgs = []
        # Close every response explicitly so the server's connection limit is not exhausted.
        response = requests.get(url)
        r = response.content.decode('gbk')
        response.close()
        # The chapter page states its page count as "共N页" ("N pages in total").
        num = int(re.search(r'共(\d+)页', r).group(1))
        urls = [re.sub(r'\d+\.htm', '', url) + '{}.htm'.format(i) for i in range(1, num + 1)]
        for url in urls:
            try:
                r1 = requests.get(url)
                r1.close()
                r1 = r1.content.decode('gbk')
                # The image path is assembled in JavaScript ("+m201304d+"); rebuild it.
                img = 'http://ss1.kkkkdm.com' + re.findall('IMG SRC=(.*?)>', r1)[0] \
                    .replace('"+m201304d+"', '/').replace("'", '')
                imgs.append(img)
            except Exception:
                continue
        return imgs


class Consumer(threading.Thread):
    """Takes (image URL, filename) pairs from img_queue and downloads them."""

    def __init__(self, chapter_queue, img_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.chapter_queue = chapter_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            if self.img_queue.empty():
                # Give the producers time to catch up; exit once both queues are drained.
                time.sleep(20)
                if self.chapter_queue.empty() and self.img_queue.empty():
                    print('queues are empty')
                    return
                continue
            try:
                url, filename = self.img_queue.get_nowait()
            except Empty:
                continue
            time.sleep(0.1)
            # Percent-encode the URL so Chinese characters do not break urlretrieve.
            url0 = request.quote(url, safe=";/?:@&=+$,", encoding="utf-8")
            try:
                request.urlretrieve(url0, filename)
                print(filename + '  downloaded!')
            except Exception as e:
                print(e)
                print(filename + '  download failed!')
                continue


def main():
    global novel_name
    chapter_queue = Queue(1000)
    img_queue = Queue(2000)
    url = input("Enter the comic's table-of-contents URL: ")
    response = requests.get(url)
    r = response.content.decode('gbk')
    response.close()
    # NOTE: the HTML tags inside the original regular expressions were lost
    # when the article was published; the patterns below are reconstructions
    # and may need adjusting to the real page markup.
    novel_name = re.findall(r'<title>(.*?)</title>', r)[0]
    if not os.path.exists(novel_name):
        os.mkdir(novel_name)
    chapters_urls = ['http://comic.kkkkdm.com' + i
                     for i in re.findall(r'(/comiclist/\d+/\d+/\d+\.htm)', r)][::4]
    names = [i for i in re.findall(r'>(.*?)</a>', r) if isinstance(i, str) and len(i) > 2]
    print('Number of chapters: ' + str(len(chapters_urls)))
    for num, ur in enumerate(chapters_urls):
        chapter_queue.put((ur, names[num]))

    print('Producers are collecting image URLs')
    for x in range(20):
        t = Producer(chapter_queue, img_queue)
        t.start()

    print('Consumers are downloading images')
    for x in range(200):
        t = Consumer(chapter_queue, img_queue)
        t.start()
        time.sleep(1)


if __name__ == '__main__':
    print('Find the table of contents at  http://comic.kkkkdm.com/')
    main()
