多线程爬虫,主要依靠多线程库threading和爬虫库requests
import threading
import requests
from tqdm import tqdm
def multi_thread():
    """Crawl 10 course pages concurrently with 2 worker threads.

    Each worker repeatedly pops a course id from a shared work list,
    fetches http://127.0.0.1:8080/courses/<id+1> and, on HTTP 200, saves
    the HTML to download/<id+1>.html, advancing a shared tqdm bar.
    """
    pbar = tqdm(total=10, desc='multi thread')
    # Shared work queue of course ids 0-9, consumed by both workers.
    course_ids = list(range(10))

    def process():
        # Consume ids until the shared list is exhausted.
        while True:
            try:
                # EAFP: checking `while course_ids` and then popping would
                # race — another thread can take the last item between the
                # check and the pop. A single pop() per attempt is atomic
                # in CPython; IndexError signals "no work left".
                course_id = course_ids.pop()
            except IndexError:
                break
            '''-----------------页面请求--------------------------------'''
            response = requests.get(f'http://127.0.0.1:8080/courses/{course_id + 1}')
            '''------------------页面访问成功-------------------------'''
            if response.status_code == 200:
                # Explicit encoding so non-ASCII HTML is written intact.
                with open(f'download/{course_id + 1}.html', 'w', encoding='utf-8') as f:
                    f.write(response.text)
            # One page handled (success or not) — advance the bar.
            pbar.update(1)

    '''---------------开启多线程--------------------------'''
    threads = [threading.Thread(target=process) for _ in range(2)]
    for t in threads:
        t.start()
    # Wait for all downloads to finish before returning, then release
    # the progress bar's terminal resources.
    for t in threads:
        t.join()
    pbar.close()
# Entry point: run the two-thread crawler only when executed as a script.
if __name__=='__main__':
    multi_thread()
开启100个线程爬虫
import threading
import requests
from tqdm import tqdm
def multi_thread():
    """Crawl 100 course pages concurrently with 100 worker threads.

    All workers share one id list and one tqdm progress bar; each worker
    pops an id, fetches http://127.0.0.1:8080/courses/<id+1> and, on
    HTTP 200, writes the HTML to download/<id+1>.html.
    """
    # Initialise a tqdm bar tracking all 100 pages.
    pbar = tqdm(total=100, desc='multi thread, thread num: 100')
    # Shared work queue of course ids 0-99 (range(100) excludes 100).
    course_ids = list(range(100))

    def process():
        # Consume ids until the shared list is exhausted.
        while True:
            try:
                # EAFP: `while course_ids` followed by pop() races — another
                # thread can drain the last item between the emptiness check
                # and the pop. Popping and catching IndexError avoids that.
                course_id = course_ids.pop()
            except IndexError:
                break
            response = requests.get(f'http://127.0.0.1:8080/courses/{course_id + 1}')
            if response.status_code == 200:
                # Explicit encoding so non-ASCII HTML is written intact.
                with open(f'download/{course_id + 1}.html', 'w', encoding='utf-8') as f:
                    f.write(response.text)
            # One page handled (success or not) — advance the bar.
            pbar.update(1)

    threads = []
    for _ in range(100):
        t = threading.Thread(target=process)
        threads.append(t)
    # Start the workers.
    for t in threads:
        t.start()
    # Block until every worker finishes, then release the bar so the
    # terminal line is flushed cleanly.
    for t in threads:
        t.join()
    pbar.close()
# Entry point: run the 100-thread crawler only when executed as a script.
if __name__ == '__main__':
    multi_thread()