多线程爬虫

多线程爬虫,主要依靠多线程库threading和爬虫库requests

import threading 
import requests
from tqdm import tqdm 

def multi_thread():
    """Crawl 10 course pages concurrently using 2 worker threads.

    Each worker pops a course id from a shared work list, requests the
    page from the local server and, on HTTP 200, saves the HTML under
    download/ (the directory is assumed to exist — TODO confirm).
    """
    pbar = tqdm(total=10, desc='multi thread')
    course_ids = list(range(10))

    def process():
        # Consume ids until the shared list is exhausted.
        while True:
            try:
                # list.pop() is atomic under the GIL; catching IndexError
                # avoids the check-then-pop race between the two workers.
                course_id = course_ids.pop()
            except IndexError:
                break
            # ----------------- page request -----------------
            response = requests.get(f'http://127.0.0.1:8080/courses/{course_id + 1}')
            # ----------------- save on success -----------------
            if response.status_code == 200:
                with open(f'download/{course_id+1}.html', 'w', encoding='utf-8') as f:
                    f.write(response.text)
            # One page handled: advance the progress bar.
            pbar.update(1)

    # ----------------- start the worker threads -----------------
    t1 = threading.Thread(target=process)
    t2 = threading.Thread(target=process)
    t1.start()
    t2.start()
    # Join so the function returns only after all pages are processed,
    # then close the progress bar cleanly.
    t1.join()
    t2.join()
    pbar.close()
if __name__=='__main__':
    # Run the crawler only when executed as a script, not on import.
    multi_thread()

开启100个线程爬虫

import threading

import requests
from tqdm import tqdm


def multi_thread():
    """Crawl 100 course pages concurrently using 100 worker threads.

    Each worker pops a course id from a shared work list, requests the
    page from the local server and, on HTTP 200, saves the HTML under
    download/ (the directory is assumed to exist — TODO confirm).
    """
    # Progress bar sized to the number of pages to fetch.
    pbar = tqdm(total=100, desc='multi thread, thread num: 100')
    # Work list of ids 0..99 (one per page).
    course_ids = list(range(100))

    def process():
        # Consume ids until the shared list is exhausted.
        while True:
            try:
                # list.pop() is atomic under the GIL; catching IndexError
                # avoids the check-then-pop race between workers.
                course_id = course_ids.pop()
            except IndexError:
                break
            response = requests.get(f'http://127.0.0.1:8080/courses/{course_id + 1}')
            if response.status_code == 200:
                with open(f'download/{course_id + 1}.html', 'w', encoding='utf-8') as f:
                    f.write(response.text)
            # One page handled: advance the progress bar.
            pbar.update(1)

    threads = []
    for _ in range(100):
        t = threading.Thread(target=process)
        threads.append(t)
    # Start all workers.
    for t in threads:
        t.start()
    # Join so the function returns only after all pages are processed,
    # then close the progress bar cleanly.
    for t in threads:
        t.join()
    pbar.close()


if __name__ == '__main__':
    # Run the crawler only when executed as a script, not on import.
    multi_thread()

你可能感兴趣的:(爬虫,知识图谱,爬虫,python)