Scraping Doutula with multithreading and with a plain single-threaded method

(flow chart missing)
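As a stand-in for the missing flow chart, here is the overall flow of both versions, reconstructed from the code below as a comment-style outline:

# Single-threaded version:
#   for page 1..N:
#       GET https://www.doutula.com/photo/list/?page=N with a random User-Agent and proxy
#       XPath out every non-gif <img>, taking data-original (image URL) and alt (file name)
#       urlretrieve each image to disk
#
# Multithreaded version (producer/consumer):
#   page_queue of page URLs -> 5 Producer threads (fetch + parse) -> img_queue of (url, name) pairs
#   img_queue -> 5 Consumer threads (urlretrieve each image)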

1. Plain single-threaded method (XPath)

import requests, random, time
from lxml import etree
from urllib import request
first = time.time()


def parse_html(text):
    html = etree.HTML(text)
    pics = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
    # print(pics)
    for img in pics:
        # img.get() reads an attribute; the real URL is lazy-loaded into data-original
        img_url = img.get('data-original')
        # print(img_url)
        name = list(img.get('alt'))
        # drop characters that are not allowed in file names
        name = [i for i in name if i not in '/|']
        # name = list(filter(lambda x:x not in '/|', name))
        new_name = ''.join(name)
        try:
            request.urlretrieve(img_url, r'D:\pycharm\projects\test\pic/{}.jpg'.format(new_name))
        except:
            raise
        else:
            print(f'Downloaded {new_name}')


def parse_page(url):
    # Pools of User-Agent headers and proxies to rotate through on each request.
    headers_choice = [
        {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50safari 5.1 – Windows'},
        {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)TT'},
        {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)'},
        {'User-Agent': 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'},
        {'User-Agent': 'MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'},
        {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)'},
        {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},
        {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)Firefox 4.0.1 – MAC'},
    ]
    proxy_choice = [
        {'http': '54.214.52.181:80'},
        {'http': '34.93.243.67:80'},
        {'https': '119.27.170.46:8888'},
        {'https': '58.220.95.80:9401'},
    ]
    headers = random.choice(headers_choice)
    proxy = random.choice(proxy_choice)
    try:
        text = requests.get(url, headers=headers, proxies=proxy).content.decode('utf-8')
    except requests.exceptions.ProxyError:
        print('Proxy connection error')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Some of the rotated User-Agent strings contain non-latin-1 characters
        # (the en dash in a couple of the UAs above), which makes requests fail
        # while encoding the headers; retry with a fixed header set and let the
        # detected encoding handle decoding.
        print('Retrying with fallback headers and detected encoding')
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Cookie': '_agep=1601992588; _agfp=4b5864586b360b9b3511ad1fed0a73c1; _agtk=ad0fc9700101cd027f014ae151fe390a; Hm_lvt_2fc12699c699441729d4b335ce117f40=1603343105,1603766984,1603775871,1603790208; Hm_lpvt_2fc12699c699441729d4b335ce117f40=1603795061',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}
        response = requests.get(url, headers=headers)
        # print(response.content)
        # apparent_encoding is the encoding detected by chardet
        response.encoding = response.apparent_encoding
        text = response.text
        # print(text)
        parse_html(text)
    else:
        parse_html(text)


def main(num):
    for page in range(1, num + 1):
        url = f'https://www.doutula.com/photo/list/?page={page}'
        parse_page(url)


if __name__ == '__main__':
    main(10)
    end = time.time()
    print('elapsed:', end - first)
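The key detail in parse_html is that Doutula lazy-loads its images: the real URL sits in the data-original attribute rather than src, and GIF placeholders are skipped via @class!='gif'. A minimal, self-contained sketch of just that parsing step (the HTML snippet below is invented for illustration; the attribute names match the crawler above):

from lxml import etree

sample = '''
<div class="page-content text-center">
    <img class="img-responsive lazy" data-original="http://img.doutula.com/a.jpg" alt="funny/cat">
    <img class="gif" data-original="http://img.doutula.com/b.gif" alt="skipped gif">
</div>
'''

html = etree.HTML(sample)
for img in html.xpath("//div[@class='page-content text-center']//img[@class!='gif']"):
    url = img.get('data-original')   # the lazy-loaded image URL
    # strip characters that are not allowed in file names, as the crawler does
    name = ''.join(c for c in img.get('alt') if c not in '/|')
    print(url, name)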

2. Multithreading

import threading, requests, random, time
from queue import Queue
from lxml import etree
from urllib import request


class Producer(threading.Thread):
    def __init__(self, page_queue, img_queue):
        super().__init__()
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            if self.page_queue.empty():
                break
            url = self.page_queue.get()
            self.parse_page(url)

    def parse_html(self, text):
        html = etree.HTML(text)
        pics = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
        # print(pics)
        for img in pics:
            # img.get() reads an attribute; the real URL is lazy-loaded into data-original
            img_url = img.get('data-original')
            # print(img_url)
            name = list(img.get('alt'))
            # drop characters that are not allowed in file names
            name = [i for i in name if i not in '/|']
            # name = list(filter(lambda x:x not in '/|', name))
            new_name = ''.join(name)
            self.img_queue.put((img_url, new_name))

    def parse_page(self, url):
        headers_choice = [
            {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50safari 5.1 – Windows'},
            {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)TT'},
            {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)'},
            {'User-Agent': 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'},
            {'User-Agent': 'MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'},
            {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)'},
            {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},
            {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)Firefox 4.0.1 – MAC'},
        ]
        proxy_choice = [
            {'http': '54.214.52.181:80'},
            {'http': '34.93.243.67:80'},
            {'https': '119.27.170.46:8888'},
            {'https': '58.220.95.80:9401'},
        ]
        headers = random.choice(headers_choice)
        proxy = random.choice(proxy_choice)
        try:
            text = requests.get(url, headers=headers, proxies=proxy).content.decode('utf-8')
        except requests.exceptions.ProxyError:
            print('Proxy connection error')
        except (UnicodeEncodeError, UnicodeDecodeError):
            # same non-latin-1 header fallback as in the single-threaded parse_page
            print('Retrying with fallback headers and detected encoding')
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                'Cookie': '_agep=1601992588; _agfp=4b5864586b360b9b3511ad1fed0a73c1; _agtk=ad0fc9700101cd027f014ae151fe390a; Hm_lvt_2fc12699c699441729d4b335ce117f40=1603343105,1603766984,1603775871,1603790208; Hm_lpvt_2fc12699c699441729d4b335ce117f40=1603795061',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}
            response = requests.get(url, headers=headers)
            # print(response.content)
            # apparent_encoding is the encoding detected by chardet
            response.encoding = response.apparent_encoding
            text = response.text
            # print(text)
            self.parse_html(text)
        else:
            self.parse_html(text)


class Consumer(threading.Thread):
    def __init__(self, page_queue, img_queue):
        super().__init__()
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            if self.page_queue.empty() and self.img_queue.empty():
                break
            img_url, new_name = self.img_queue.get()
            try:
                request.urlretrieve(img_url, r'D:\pycharm\projects\test\pic/{}.jpg'.format(new_name))
            except:
                raise
            else:
                print(f'Downloaded {new_name}')


def main(num):
    page_queue = Queue(100)
    img_queue = Queue(500)
    for page in range(1, num + 1):
        url = f'https://www.doutula.com/photo/list/?page={page}'
        page_queue.put(url)

    threads = []
    for i in range(5):
        t = Producer(page_queue, img_queue)
        t.start()
        threads.append(t)

    for i in range(5):
        t = Consumer(page_queue, img_queue)
        t.start()
        threads.append(t)

    # join only after every thread has been started, otherwise the
    # consumers would run one at a time instead of in parallel
    for t in threads:
        t.join()


if __name__ == '__main__':
    first = time.time()
    main(10)
    end = time.time()
    print('elapsed:', end - first)
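One caveat with the shutdown logic above: the empty() checks are racy, so a Consumer can exit while a Producer is still parsing its last page, or block on img_queue.get() if the image queue is momentarily empty. A common, more robust pattern is to join the producers first and then push one sentinel per consumer. Below is a minimal, generic sketch of that idea with plain worker functions and made-up names, not the crawler's own classes:

import threading
from queue import Queue, Empty

SENTINEL = None      # marker telling a consumer it can exit
NUM_WORKERS = 3

def producer(task_queue, result_queue):
    while True:
        try:
            page = task_queue.get_nowait()       # non-blocking: stop when no tasks remain
        except Empty:
            break
        # ... fetch and parse `page` here, then enqueue whatever was found ...
        result_queue.put(f'item-from-page-{page}')

def consumer(result_queue):
    while True:
        item = result_queue.get()                # blocks until an item or a sentinel arrives
        if item is SENTINEL:
            break
        print('handling', item)

def main():
    task_queue, result_queue = Queue(), Queue()
    for page in range(1, 11):
        task_queue.put(page)

    producers = [threading.Thread(target=producer, args=(task_queue, result_queue)) for _ in range(NUM_WORKERS)]
    consumers = [threading.Thread(target=consumer, args=(result_queue,)) for _ in range(NUM_WORKERS)]
    for t in producers + consumers:
        t.start()
    for t in producers:                          # wait until every producer is done ...
        t.join()
    for _ in consumers:                          # ... then send one sentinel per consumer
        result_queue.put(SENTINEL)
    for t in consumers:
        t.join()

if __name__ == '__main__':
    main()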
