逻辑流程
首先是一些主要参数, 其中有需要爬取的站点链接, headers, Queue 的获取超时时间, 以及抓取内容页面的线程数量
# Crawl-target configuration shared by the producer and worker threads.
index = 'https://www.chazidian.com'  # site root, also used to absolutize relative hrefs
list_page = index + '/xiaohua{}/{}'  # list-page URL template: (category segment, page number)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
}
timeout = 1          # seconds a worker blocks on queue.get before assuming the crawl is done
thread_quantity = 5  # number of content-fetching worker threads
get_url: 解析列表页中的url并放入Queue队列中
def get_url(queue, pages=75):
    """Scrape every list page and enqueue each article's absolute URL.

    Generalized: the original hard-coded the 75-page site depth; it is now
    a parameter with the same default, so existing callers are unaffected.

    Args:
        queue: Queue the content workers consume URLs from.
        pages: number of list pages to walk (site currently has 75).
    """
    for page in range(1, pages + 1):
        # First template slot is the (empty) category segment, second the page number.
        r = requests.get(list_page.format('', str(page)), headers=headers)
        pq = PyQuery(r.text)
        # Each list entry is an <a> inside div.arctcot h3; hrefs are site-relative.
        for link in pq('div.arctcot h3 a').items():
            queue.put(index + link.attr('href'))
get_content: 从Queue队列中获取url, 解析内容页面并打印出来, 这里需要用while循环一直获取页面, 使用try/except捕获队列的超时异常
def get_content(queue):
    """Worker: pull article URLs off the queue, fetch each page, and print
    the joke (title + text, or title + absolutized image URL).

    Exits when queue.get times out — i.e. the producer has stopped filling
    the queue — then prints a completion banner.

    Fix: the original wrapped the ENTIRE loop (including the HTTP fetch and
    parsing) in ``try/except Empty``; the try is now narrowed to the only
    statement that can raise Empty, per keep-the-try-minimal practice.
    Dead commented-out prints were removed. Behavior on the Empty path is
    unchanged.
    """
    while True:
        try:
            url = queue.get(timeout=timeout)
        except Empty:
            break  # producer finished and queue drained — stop this worker
        if 'https://' not in url:
            continue  # skip anything that isn't an absolute https URL
        r = requests.get(url, headers=headers)
        doc = PyQuery(r.text)('div.arctcot')
        title = doc('a').text()
        content = doc('div.article_detail').text()
        img = doc('div.article_detail img').attr('src')
        if title and img:
            # Only site-relative image paths are printed, prefixed with the root.
            if 'http://' not in img:
                print(url)
                print({title: index + img})
        elif title and content:
            # Skip entries whose "content" is just the title repeated.
            if content not in title:
                print(url)
                print({title: content})
    print('-' * 100)
    print('抓取完毕')
__main__: 进行线程开启和运行时间的计算
if __name__ == '__main__':
    start = time.time()
    queue_ = Queue(maxsize=1000)
    # Producer thread: fills the queue with article URLs.
    producer = threading.Thread(target=get_url, args=(queue_,))
    producer.start()
    # Worker threads: fetch and print article content concurrently.
    # Bug fix: the original rebound one variable each iteration and joined it
    # once after the loop, so only the LAST worker was joined and the elapsed
    # time could be printed before the others finished. Collect and join all.
    workers = []
    for _ in range(thread_quantity):
        worker = threading.Thread(target=get_content, args=(queue_,))
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()
    producer.join()
    end = time.time()
    # Subtract the final queue-get timeout the workers idle through before exiting.
    print('用时: ', end - start - timeout)
下面是所有代码

"""Multithreaded crawler for the chazidian.com joke pages.

One producer thread (get_url) walks the list pages and enqueues article
URLs; thread_quantity worker threads (get_content) drain the queue, fetch
each article, and print the joke. Workers stop when queue.get stays empty
for `timeout` seconds.

Fixes over the original one-line paste:
- get_content's try/except Empty narrowed to the queue.get call only.
- __main__ now joins ALL worker threads (the original joined only the last
  one, so the elapsed time could print before the crawl finished).
- Dead code (commented-out prints, stray `pass`, `if True:`) removed.
- get_url's 75-page limit generalized to a backward-compatible parameter.
"""
from queue import Queue, Empty
import threading
import time

import requests
from pyquery import PyQuery

# Crawl-target configuration.
index = 'https://www.chazidian.com'
list_page = index + '/xiaohua{}/{}'  # (category segment, page number)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
}
timeout = 1          # seconds a worker waits on the queue before stopping
thread_quantity = 5  # number of content-fetching worker threads


def get_url(queue, pages=75):
    """Scrape every list page and enqueue each article's absolute URL."""
    for page in range(1, pages + 1):
        r = requests.get(list_page.format('', str(page)), headers=headers)
        pq = PyQuery(r.text)
        for link in pq('div.arctcot h3 a').items():
            queue.put(index + link.attr('href'))


def get_content(queue):
    """Worker: pull URLs off the queue, fetch and print each joke until the
    queue stays empty for `timeout` seconds."""
    while True:
        try:
            url = queue.get(timeout=timeout)
        except Empty:
            break  # producer finished and queue drained
        if 'https://' not in url:
            continue
        r = requests.get(url, headers=headers)
        doc = PyQuery(r.text)('div.arctcot')
        title = doc('a').text()
        content = doc('div.article_detail').text()
        img = doc('div.article_detail img').attr('src')
        if title and img:
            if 'http://' not in img:  # only absolutize site-relative image paths
                print(url)
                print({title: index + img})
        elif title and content:
            if content not in title:  # skip content that merely repeats the title
                print(url)
                print({title: content})
    print('-' * 100)
    print('抓取完毕')


if __name__ == '__main__':
    start = time.time()
    queue_ = Queue(maxsize=1000)
    producer = threading.Thread(target=get_url, args=(queue_,))
    producer.start()
    workers = []
    for _ in range(thread_quantity):
        worker = threading.Thread(target=get_content, args=(queue_,))
        worker.start()
        workers.append(worker)
    for worker in workers:  # join ALL workers (original joined only the last)
        worker.join()
    producer.join()
    end = time.time()
    print('用时: ', end - start - timeout)