# Queue + multithreading + Selenium (队列+多线程+selenium)

import time
import re
import threading
import queue
from selenium import webdriver


# URL template; the page number is substituted for the `{}` placeholder.
baseurl = 'http://www....{}...html'
# Expand the template into the list of page URLs (page numbers 1 through 12).
url_list = list(map(baseurl.format, range(1, 13)))
# Shared work queue holding the URLs still to be fetched.
q = queue.Queue()
# Seed the queue with every page URL, in order.
for i in url_list:
    q.put(i)

class dragen(object):
    """Multi-threaded Selenium crawler that drains a queue of page URLs.

    Worker threads repeatedly take a URL from a queue, load it in a fresh
    Chrome instance, and print every value matched by the ``"url":"...",``
    pattern in the page source.
    """

    def __init__(self):
        self.max_thread = 12  # maximum number of worker threads

    def _fetch_page_source(self, url):
        """Load *url* in a new Chrome instance and return its page source.

        The browser is always shut down, even if loading the page raises;
        the exception itself is propagated to the caller.
        """
        options = webdriver.ChromeOptions()
        options.binary_location = r"C:\....\chrome.exe"
        driver = webdriver.Chrome(options=options)
        try:
            driver.get(url)
            # Give the page time to finish loading so the source is complete.
            time.sleep(10)
            return driver.page_source
        finally:
            # quit() ends the whole driver session; close() would only close
            # the current window.
            driver.quit()

    def getdata(self, que):
        """Process URLs from *que* until it is empty.

        Each URL is printed, fetched, and scanned for ``"url":"...",``
        values, which are printed as a list. A URL whose page yields no
        matches is put back on *que* so it is tried again.

        Args:
            que: a ``queue.Queue`` of URL strings shared between workers.
        """
        while True:
            # get_nowait() makes "check and take" a single step. Testing
            # empty() and then calling a blocking get() lets another worker
            # take the last item in between, leaving this thread blocked
            # forever.
            try:
                url = que.get_nowait()
            except queue.Empty:
                return
            print(url)
            content = self._fetch_page_source(url)
            url_data = re.findall('"url":"(.*?)",', content)
            print(url_data)
            # Re-queue URLs that produced no data, on the queue we were
            # given. NOTE: there is no retry limit, so a URL that never
            # yields matches is retried indefinitely.
            if not url_data:
                que.put(url)

    def many_t(self):
        """Start ``max_thread`` workers on the module-level queue and wait."""
        workers = []
        # Create and start one worker thread per slot.
        for _ in range(self.max_thread):
            worker = threading.Thread(target=self.getdata, args=(q, ))
            worker.start()
            workers.append(worker)
        # Block until every worker has finished.
        for worker in workers:
            worker.join()

    def main(self):
        """Run the crawl."""
        self.many_t()

def main():
    """Script entry point: build a crawler and run it."""
    crawler = dragen()
    crawler.main()

# Only start crawling when executed as a script, not when imported.
if "__main__" == __name__:
    main()

# Related topics: web crawling, selenium, python, testing tools