Target page: https://www.douban.com/doulist/36708212/
Python version: 3.6.5
Topics covered: multithreading, web crawling, object-oriented programming
The part annotated with the # multithreading comment can be copied as-is and, with minor adjustments for your own task, used to build a multithreaded crawler.
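To make that easier to see, here is a minimal, stripped-down sketch of the reusable pattern on its own: a threading.Thread subclass that keeps its function's return value so the caller can collect it after join(). The worker function slow_task is a made-up stand-in for any I/O-bound call; the full crawler script below applies the same idea to page fetching.

# Minimal sketch of the reusable thread-wrapper pattern (slow_task is a hypothetical example worker)
import threading
import time

class ResultThread(threading.Thread):
    def __init__(self, func, args=()):
        super(ResultThread, self).__init__()
        self.func = func
        self.args = args
    def run(self):
        self.result = self.func(*self.args)  # store the return value so the caller can read it after join()

def slow_task(n):  # stand-in for any I/O-bound call, e.g. a network request
    time.sleep(0.1)
    return n * n

threads = [ResultThread(slow_task, args=(i,)) for i in range(5)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print([t.result for t in threads])  # -> [0, 1, 4, 9, 16]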
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import time
import random
import threading
class Spider_url():  # the crawler
    def get_page(self, url, recount=3):  # recount: how many attempts to make when a request fails (default 3)
        soup = None  # stays None if every attempt fails
        while recount >= 1:
            try:
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                }
                r = requests.get(url, headers=headers, timeout=10)
                soup = BeautifulSoup(r.text, 'html.parser')
                break
            except Exception:
                print(url + " failed to fetch the page!")
                recount -= 1
                time.sleep(random.uniform(1, 2))
        return soup
    def maxnum(self):  # get the total number of pages (this part is not strictly necessary)
        url = "https://www.douban.com/doulist/36708212/"
        soup = self.get_page(url)
        page_url_list_text = soup.find('div', attrs="paginator")
        page_url_list = page_url_list_text.find_all('a')
        max_num = int(page_url_list[-2].text)  # the second-to-last pagination link holds the last page number
        return max_num
    def get_book_url(self, soup):  # yield the link of each book on a page
urls = soup.find_all('div', attrs="post")
for url in urls:
url = url.find('a').get('href')
yield url
    def page_url(self, num):  # generator that yields the URL of every list page
for page in range(num):
url = "https://www.douban.com/doulist/36708212/?start="+str(page*25)+"&sort=seq&playable=0&sub_type="
yield url
class mult_thread(threading.Thread):  # multithreading
def __init__(self, func, args=()):
super(mult_thread, self).__init__()
self.func = func
self.args = args
def run(self):
self.result = self.func(*self.args)
    def get_result(self):
        try:
            return self.result
        except AttributeError:  # self.result is missing if func raised inside run()
            return None
# Putting the multithreaded approach to work
if __name__ == '__main__':
    ts = time.time()  # start timing
su = Spider_url()
max_num = su.maxnum()
page_urls = su.page_url(max_num)
    # multithreading
    threads = []
    for page_url in page_urls:
        t = mult_thread(su.get_page, args=(page_url,))
        threads.append(t)
        t.start()
    soups = []
    for t in threads:
        t.join()
        soups.append(t.get_result())
    # (end of the multithreaded part)
    book_urls = []
    for soup in soups:
        if soup is None:  # skip pages that failed to download
            continue
        book_urls.append(su.get_book_url(soup))
    filename = 'threading_all_url.txt'
    with open(filename, 'w') as f:  # save the collected book URLs
        for book_url in book_urls:
            for url in book_url:
                f.write(url + "\n")
    print("All book URLs have been saved to " + filename)
    te = time.time()  # stop timing
    print("Total time: " + str(te - ts) + "s")
    input()  # keep the console window open
# Multithreaded run: 2.8955507278442383s
# Single-threaded run: 15.677197694778442s
# Timings depend on network conditions and machine performance; the numbers above are for reference only
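For reference, the single-threaded figure above corresponds to fetching the pages one after another. A sketch of such a sequential baseline, assuming the Spider_url class defined above and the same timing method, might look like this:

# Sequential baseline (sketch, reuses the Spider_url class defined above)
ts = time.time()
su = Spider_url()
soups = []
for page_url in su.page_url(su.maxnum()):
    soups.append(su.get_page(page_url))  # each request blocks until it finishes
te = time.time()
print("Single-threaded total time: " + str(te - ts) + "s")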
Additional notes:
When to use multithreading vs. multiprocessing:
· Multithreading suits I/O-bound tasks (such as the network requests in this crawler)
· Multiprocessing suits CPU-bound tasks (see the sketch after this list)
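To illustrate the second point: a CPU-bound job is normally parallelized with processes, because CPython's GIL keeps threads from executing bytecode in parallel. Below is a minimal sketch using multiprocessing.Pool with a made-up cpu_heavy function; the article linked below works with the Process class instead, so take this only as an illustration of the idea.

# Minimal multiprocessing sketch for a CPU-bound task (cpu_heavy is a made-up example)
from multiprocessing import Pool

def cpu_heavy(n):  # pure computation, no I/O
    return sum(i * i for i in range(n))

if __name__ == '__main__':
    with Pool(4) as pool:  # 4 worker processes
        results = pool.map(cpu_heavy, [10**6] * 8)
    print(results)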
For a tutorial and hands-on example of multiprocessing in Python, see my other article "How to Improve the Efficiency of Python Programs: Multiprocessing with the Process Class of the multiprocessing Module": https://blog.csdn.net/qq_29750277/article/details/81031468
If you are not familiar with the yield keyword, see my other article "Python Tips: Generators and the yield Keyword": https://blog.csdn.net/qq_29750277/article/details/82025506