Python多线程网络爬虫实战:获取豆瓣评价人数超过1w+的书籍

爬取页面:https://www.douban.com/doulist/36708212/

Python版本:3.6.5

主要涉及内容:多线程、网络爬虫、面向对象程序设计 

注释标注为 # 多线程 的部分可以直接复制,并根据需求稍加修改即可实现多线程爬虫。

# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup
import time
import random
import threading

class Spider_url():  # Crawler for the douban doulist pages
    # Shared request headers; hoisted out of get_page so they are built once.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    }

    def get_page(self, url, recount=3):
        """Fetch *url* and return it parsed as a BeautifulSoup object.

        recount -- number of attempts on failure (default 3).
        Returns None when every attempt fails.
        """
        soup = None  # fix: was unbound (UnboundLocalError) when recount < 1
        while recount >= 1:
            try:
                r = requests.get(url, headers=self.HEADERS, timeout=10)
                soup = BeautifulSoup(r.text, 'html.parser')
                break
            except Exception:  # fix: bare except also caught KeyboardInterrupt/SystemExit
                print(url + "获取页面失败!")
                recount -= 1
                soup = None
                time.sleep(random.uniform(1, 2))  # back off briefly before retrying
        return soup

    def maxnum(self, url="https://www.douban.com/doulist/36708212/"):
        """Return the total page count read from the pagination bar.

        url -- doulist URL to inspect (generalized; default keeps old behavior).
        Raises AttributeError if the page could not be fetched or parsed.
        """
        soup = self.get_page(url)  # removed pointless soups -> soup alias
        paginator = soup.find('div', attrs="paginator")
        page_links = paginator.find_all('a')
        # Last link is "后页>" (next); second-to-last holds the highest page number.
        return int(page_links[-2].text)

    def get_book_url(self, soup):
        """Yield each book's detail-page URL found in a listing page soup."""
        for post in soup.find_all('div', attrs="post"):
            yield post.find('a').get('href')

    def page_url(self, num):
        """Yield the listing-page URLs for pages 0..num-1 (25 items per page)."""
        for page in range(num):
            yield ("https://www.douban.com/doulist/36708212/?start="
                   + str(page * 25) + "&sort=seq&playable=0&sub_type=")

class mult_thread(threading.Thread):  # thread wrapper that keeps the return value
    """A Thread subclass that captures the target callable's return value.

    Usage: t = mult_thread(func, args); t.start(); t.join(); t.get_result().
    """

    def __init__(self, func, args=()):
        super().__init__()
        self.func = func
        self.args = args

    def run(self):
        # Executed in the worker thread; stash the result for later retrieval.
        self.result = self.func(*self.args)

    def get_result(self):
        # `result` only exists once run() has completed successfully, so
        # return None if the thread never ran or its target raised.
        return getattr(self, 'result', None)

if __name__ == '__main__':
    ts = time.time()  # start timing
    su = Spider_url()
    max_num = su.maxnum()
    page_urls = su.page_url(max_num)

    # Multithreading: launch one fetch thread per listing page, then join
    # them all and collect the parsed soups in launch order.
    threads = []
    for page_url in page_urls:
        t = mult_thread(su.get_page, args=(page_url,))
        threads.append(t)
        t.start()
    soups = []
    for t in threads:
        t.join()
        soups.append(t.get_result())

    filename = 'threading_all_url.txt'
    # Save the collected book URLs; explicit encoding so output is stable
    # across platforms.
    with open(filename, 'w', encoding='utf-8') as f:
        for soup in soups:
            if soup is None:
                # fix: get_page returns None after repeated failures; the
                # original crashed with AttributeError on such pages.
                continue
            for url in su.get_book_url(soup):
                f.write(url + "\n")
        print("获取书籍的URL链接已全部保存至" + filename)

    te = time.time()  # end timing
    print("共耗时:" + str(te - ts) + "s")
    input()
    # Multithreaded run: ~2.90s; single-threaded: ~15.68s.
    # Timings depend on network conditions and hardware; reference only.

补充说明:

多线程与多进程适用条件:

· 多线程适用于I/O密集型任务

· 多进程适用于计算密集型任务

想了解Python多进程的教程与实战,可以看我另一篇文章《如何提高Python程序运行效率:多进程基于multiprocessing模块的Process方法》,文章链接:https://blog.csdn.net/qq_29750277/article/details/81031468

对关键字yield不了解的,可以看我另一篇文章《Python小知识点——生成器(generator)与关键字yield的运用》,文章链接:https://blog.csdn.net/qq_29750277/article/details/82025506

你可能感兴趣的:(Python)