Python 3: multithreaded crawling of xici proxy IPs with requests, plus validation

Environment: Python 3

Libraries used
requests, lxml, threading, queue, time
Libraries that need to be installed separately
requests, lxml
Install command
pip install requests lxml

Starting the crawler

Workflow:
1. Build the URL list
2. Fetch the response page for each URL
3. Extract the useful data from the page
4. Save the data
5. Clean the data
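Steps 2 to 4 are wired together as a queue-based producer/consumer pipeline: each stage pulls work from one Queue, pushes its result onto the next, the workers run as daemon threads, and the main thread waits on the queues via join() rather than joining the threads. A minimal sketch of just that pattern (with a made-up doubling "worker", not the real crawler) might look like this; the full implementation follows.

import threading
from queue import Queue

in_q = Queue()
out_q = Queue()

def worker():
    # take an item, "process" it, pass it on, and mark the unit of work done
    while True:
        item = in_q.get()      # blocks until an item is available
        out_q.put(item * 2)
        in_q.task_done()

for n in range(5):
    in_q.put(n)

t = threading.Thread(target=worker)
t.daemon = True                # daemon threads are killed when the main thread exits
t.start()

in_q.join()                    # returns once every enqueued item has been task_done()
while not out_q.empty():
    print(out_q.get())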
The code:
import requests
import threading
from queue import Queue
from lxml import etree
import time


class proxySpider():
    def __init__(self):
        self.url_temp = "https://www.xicidaili.com/nn/"
        self.headers = {"User-Agent": "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11"}
        self.url_queue = Queue()
        self.page_queue = Queue()
        self.content_queue = Queue()

    def get_url_list(self):
        # Build the page URLs (pages 1-5) and enqueue them
        for i in range(1, 6):
            self.url_queue.put(self.url_temp + str(i))

    def parse_url(self):
        # Worker: take a URL off the queue, fetch it, and hand the raw page to the next stage
        while True:
            if not self.url_queue.empty():
                url = self.url_queue.get()
                resp = requests.get(url, headers=self.headers)
                if resp.status_code == 200:
                    self.page_queue.put(resp.content)
                self.url_queue.task_done()

    def get_content_list(self):
        # Worker: parse a fetched page and extract ip:port strings
        while True:
            if not self.page_queue.empty():
                page = self.page_queue.get()
                html = etree.HTML(page.decode("utf-8"))
                proxy_list = []
                for each in html.xpath("//tr[@class='odd']"):
                    ip = each.xpath("./td[2]/text()")
                    port = each.xpath("./td[3]/text()")
                    if ip and port:
                        proxy = ip[0] + ":" + port[0]  # join ip and port
                        proxy_list.append(proxy)
                self.content_queue.put(proxy_list)
                self.page_queue.task_done()

    def save(self):
        # Worker: append each batch of extracted proxies to the result file
        while True:
            if not self.content_queue.empty():
                print("Writing...")
                with open("./res2.txt", "a", encoding="utf-8") as f:
                    content = self.content_queue.get()
                    for each in content:
                        f.write(each + "\n")
                self.content_queue.task_done()

    def run(self):
        thread_list = []
        # 1. Build the URL list
        self.get_url_list()
        # 2. Fetch the response pages
        thread_list.extend([threading.Thread(target=self.parse_url) for _ in range(3)])
        # 3. Extract the content
        thread_list.extend([threading.Thread(target=self.get_content_list) for _ in range(3)])
        # 4. Save
        thread_list.extend([threading.Thread(target=self.save) for _ in range(2)])
        for each in thread_list:
            each.daemon = True  # daemon threads are killed when the main thread exits
            each.start()
        # Wait until every queued item has been marked task_done()
        for q in [self.url_queue, self.page_queue, self.content_queue]:
            q.join()


if __name__ == "__main__":
    start_time = time.time()
    spider = proxySpider()
    spider.run()
    end_time = time.time()
    cost_time = end_time - start_time
    print("时间开销: {}".format(cost_time))

Proxy IP validity-check class (it reuses the imports from the crawler above)

class checkIp():
    def __init__(self):
        self.url = 'http://www.baidu.com'
        self.headers = {"User-Agent": "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11"}
        self.ip_queue = Queue()
        self.ip_valid_queue = Queue()

    def get_ip(self):
        # Read ip:port entries from the result file and enqueue them
        f = open("./res2.txt", "r", encoding="utf-8")
        while True:
            ip_temp = f.readline().strip()
            if ip_temp == '':
                # empty line / end of file: stop reading
                f.close()
                break
            elif not self.ip_queue.full():
                self.ip_queue.put(ip_temp)

    def check_ip(self):
        # Worker: try each proxy against a test URL and keep the ones that answer
        url = self.url
        while True:
            ip_temp = self.ip_queue.get()
            proxy_ip = {"https": "https://" + ip_temp, "http": "http://" + ip_temp}
            try:
                resp = requests.get(url, headers=self.headers, proxies=proxy_ip, verify=False, timeout=2)
                if resp.status_code == 200:
                    self.ip_valid_queue.put(ip_temp)
            except Exception:
                pass  # discard proxies that error out or time out
            self.ip_queue.task_done()

    def save(self):
        # Worker: append proxies that passed the check to the result file
        while True:
            if not self.ip_valid_queue.empty():
                with open("./res2.txt", "a", encoding="utf-8") as f:
                    ip = self.ip_valid_queue.get()
                    f.write(ip + '\n')
                self.ip_valid_queue.task_done()

    def run(self):
        thread_list = []
        # 1. Read the proxies from the file
        # Don't change this thread count: the shared-file read issue is unsolved, so a single thread is used
        thread_list.extend([threading.Thread(target=self.get_ip) for _ in range(1)])
        # 2. Validate the proxies
        thread_list.extend([threading.Thread(target=self.check_ip) for _ in range(3)])
        # 3. Save the valid proxies
        thread_list.extend([threading.Thread(target=self.save) for _ in range(4)])
        for each in thread_list:
            each.daemon = True  # mark as a daemon thread
            each.start()
        for i in [self.ip_queue, self.ip_valid_queue]:
            i.join()
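
A minimal way to run the checker, mirroring the spider's entry point, could be:

if __name__ == "__main__":
    start_time = time.time()
    checker = checkIp()
    checker.run()
    print("Time elapsed: {}".format(time.time() - start_time))

Note that save() appends the surviving proxies back onto ./res2.txt, after the raw list written by the spider; pointing it at a separate output file may be the cleaner choice.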

Problems encountered and summary:

  1. While validating the proxies this error came up: requests.exceptions.ProxyError: HTTPConnectionPool(host='113.65.5.186', port...
    Cause: a proxy was enabled on my machine.
    Fix: turn the proxy off in the system settings, then rerun the proxy validation program (an in-code alternative is sketched after this list).
  2. Open issue: the queues should be given an upper bound (especially in larger projects). After setting a limit I could not get the resulting bugs under control, so I gave up on that piece of resource management (acceptable only for programs that handle small text files); a bounded-queue example is also sketched below.
    Finally, suggestions from readers are welcome.
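
For reference, rough sketches for both points (the 1.2.3.4:8080 proxy and the maxsize of 100 are placeholder values, not part of the program above): requests sessions can ignore environment-level proxy settings with trust_env = False, which may avoid having to change the OS settings; and Queue(maxsize=...) creates a bounded queue whose put() blocks when full, while put_nowait() raises queue.Full instead.

import requests
from queue import Queue, Full

# Point 1: a session that ignores HTTP(S)_PROXY / system proxy settings
session = requests.Session()
session.trust_env = False
proxy = {"http": "http://1.2.3.4:8080", "https": "http://1.2.3.4:8080"}  # placeholder proxy
try:
    resp = session.get("http://www.baidu.com", proxies=proxy, timeout=2)
    print(resp.status_code)
except requests.RequestException:
    print("proxy failed")

# Point 2: a bounded queue; put() would block once 100 items are waiting
ip_queue = Queue(maxsize=100)
try:
    ip_queue.put_nowait("1.2.3.4:8080")
except Full:
    pass  # queue is full: drop the item, retry later, or switch to a blocking put()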
