Environment: Python 3
Libraries used
requests, lxml, threading, queue, time
Libraries that need to be installed separately
requests, lxml
Install command
pip install requests lxml
Building the spider
Workflow:
1. Build the URL list
2. Fetch the response page for each URL
3. Extract the useful data from each page
4. Save the data
5. Clean the data
On to the code:
import requests
import threading
from queue import Queue
from lxml import etree
import time


class proxySpider():
    def __init__(self):
        self.url_temp = "https://www.xicidaili.com/nn/"
        self.headers = {"User-Agent": "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11"}
        self.url_queue = Queue()      # URLs waiting to be fetched
        self.page_queue = Queue()     # raw HTML pages waiting to be parsed
        self.content_queue = Queue()  # extracted proxy lists waiting to be saved

    def get_url_list(self):
        # Step 1: build the URL list (pages 1-5 of the listing)
        for i in range(1, 6):
            self.url_queue.put(self.url_temp + str(i))

    def parse_url(self):
        # Step 2: fetch each page; get() blocks until an item is available,
        # so there is no need to busy-wait on empty()
        while True:
            url = self.url_queue.get()
            resp = requests.get(url, headers=self.headers)
            if resp.status_code == 200:
                self.page_queue.put(resp.content)
            self.url_queue.task_done()

    def get_content_list(self):
        # Step 3: extract IP and port from every row of the proxy table
        while True:
            page = self.page_queue.get()
            html = etree.HTML(page.decode("utf-8"))
            proxy_list = []
            for each in html.xpath("//tr[@class='odd']"):
                # xpath() returns a list; keep it as one so the emptiness
                # check below is meaningful before indexing with [0]
                ip = each.xpath("./td[2]/text()")
                port = each.xpath("./td[3]/text()")
                if ip and port:
                    proxy = ip[0] + ":" + port[0]
                    proxy_list.append(proxy)
            self.content_queue.put(proxy_list)
            self.page_queue.task_done()

    def save(self):
        # Step 4: append the extracted proxies to a text file
        while True:
            content = self.content_queue.get()
            print("Writing...")
            with open("./res2.txt", "a", encoding="utf-8") as f:
                for each in content:
                    f.write(each + "\n")
            self.content_queue.task_done()

    def run(self):
        thread_list = []
        self.get_url_list()
        thread_list.extend([threading.Thread(target=self.parse_url) for i in range(3)])
        thread_list.extend([threading.Thread(target=self.get_content_list) for i in range(3)])
        thread_list.extend([threading.Thread(target=self.save) for i in range(2)])
        for each in thread_list:
            each.daemon = True  # setDaemon() is deprecated in Python 3
            each.start()
        # Block until every queue has been fully processed, then exit,
        # taking the daemon worker threads down with the main thread
        for q in [self.url_queue, self.page_queue, self.content_queue]:
            q.join()


if __name__ == "__main__":
    start_time = time.time()
    spider = proxySpider()
    spider.run()
    end_time = time.time()
    cost_time = end_time - start_time
    print("Elapsed time: {}".format(cost_time))
Proxy IP validity checker class
class checkIp():
    def __init__(self):
        self.url = 'http://www.baidu.com'
        self.headers = {"User-Agent": "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11"}
        self.ip_queue = Queue()        # proxies waiting to be checked
        self.ip_valid_queue = Queue()  # proxies that passed the check

    def get_ip(self):
        # Read the proxies harvested by proxySpider, one per line
        with open("./res2.txt", "r", encoding="utf-8") as f:
            for line in f:
                ip_temp = line.strip()
                if ip_temp:
                    self.ip_queue.put(ip_temp)

    def check_ip(self):
        url = self.url
        while True:
            ip_temp = self.ip_queue.get()
            proxy_ip = {"https": "https://" + ip_temp, "http": "http://" + ip_temp}
            try:
                # A proxy counts as valid if the test page loads through it within 2s
                resp = requests.get(url, headers=self.headers, proxies=proxy_ip, verify=False, timeout=2)
                if resp.status_code == 200:
                    self.ip_valid_queue.put(ip_temp)
            except Exception:
                pass
            self.ip_queue.task_done()

    def save(self):
        while True:
            ip = self.ip_valid_queue.get()
            # Write valid proxies to a separate file; appending them back to
            # res2.txt would mix results into the file get_ip() is still reading
            with open("./res2_valid.txt", "a", encoding="utf-8") as f:
                f.write(ip + '\n')
            self.ip_valid_queue.task_done()

    def run(self):
        thread_list = []
        thread_list.extend([threading.Thread(target=self.get_ip) for i in range(1)])
        thread_list.extend([threading.Thread(target=self.check_ip) for i in range(3)])
        thread_list.extend([threading.Thread(target=self.save) for i in range(4)])
        for each in thread_list:
            each.daemon = True
            each.start()
        for i in [self.ip_queue, self.ip_valid_queue]:
            i.join()
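The checker is never invoked in the listing above; presumably it is launched the same way as the spider, e.g.:

if __name__ == "__main__":
    checker = checkIp()
    checker.run()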
Problems encountered and takeaways:
- While validating proxies I hit this error: requests.exceptions.ProxyError: HTTPConnectionPool(host='113.65.5.186', port…
  Cause: my machine had a system-wide proxy enabled.
  Fix: turn the proxy off in the system settings, then rerun the proxy validity checker (a requests-level alternative is shown in the first sketch below).
- Open issue: the queues should be given an upper bound (especially in a larger project). After setting a limit I could not get rid of the resulting bugs, so I gave up on conserving machine resources; that is only acceptable for a program handling small text files. (See the second sketch below for the blocking-put approach.)
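On the ProxyError above: as an alternative to toggling the system setting, a requests Session can be told to ignore OS/environment proxy configuration entirely via its trust_env attribute. A minimal sketch (the proxy address below is a placeholder, not one of the harvested IPs):

import requests

session = requests.Session()
session.trust_env = False  # do not inherit proxy settings from the OS/environment

# proxies passed explicitly still take effect (placeholder address)
proxy_ip = {"http": "http://1.2.3.4:8080", "https": "https://1.2.3.4:8080"}
resp = session.get("http://www.baidu.com", proxies=proxy_ip, timeout=2)
print(resp.status_code)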
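On the bounded-queue problem: Queue(maxsize=N) makes put() block once N items are waiting, which caps memory use without dropping data, so no full() check is needed. A minimal sketch with hypothetical producer/consumer roles:

import threading
from queue import Queue

q = Queue(maxsize=100)  # bounded: put() blocks once 100 items are waiting

def producer():
    for i in range(1000):
        q.put(i)  # blocks (applies backpressure) instead of overflowing

def consumer():
    while True:
        item = q.get()
        # ... process item ...
        q.task_done()

p = threading.Thread(target=producer)
threading.Thread(target=consumer, daemon=True).start()
p.start()
p.join()  # wait until the producer has queued everything
q.join()  # then wait until the consumer has drained the queue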
Finally, suggestions from readers are welcome.