Having seen the value of proxy IPs and their ports, we know that crawling data at scale requires a decent stock of usable IPs.
But where do these proxy IPs come from?
A self-built proxy IP pool can satisfy the vast majority of needs;
for demanding, professional-grade crawlers, it is still advisable to buy a stable service from a reputable vendor.
First, a few sites that publish free proxy IPs:
https://www.kuaidaili.com/
http://www.66ip.cn/index.html
http://www.ip3366.net/
https://www.89ip.cn/index_1
Opening the 89ip free proxy page, we quickly spot the information we need: IPs and their ports.
We parse the page content with XPath to extract them.
The more URLs we feed in, the more IPs we scrape, and one XPath expression works across different pages of the same site,
but each new site needs its XPath written from scratch.
import requests
from lxml import etree

url = 'https://www.89ip.cn/index_1.html'
headers = {"User-Agent": "Mozilla/5.0"}  # pose as a browser
html = requests.get(url=url, headers=headers).text  # .text gives the page source as a string
tree = etree.HTML(html, parser=etree.HTMLParser(encoding="utf-8"))  # parse the HTML document
ip_list = tree.xpath('//div[@class="layui-form"]//tr/td[1]/text()')
post_list = tree.xpath('//div[@class="layui-form"]//tr/td[2]/text()')
That page yields 25 IPs:
180.165.133.13 : 53281
36.137.70.178 : 7777
27.42.168.46 : 55481
47.105.91.226 : 8118
221.122.91.61 : 80
183.247.202.230 : 30001
183.154.220.72 : 9000
171.92.20.37 : 9000
171.92.21.168 : 9000
223.10.18.173 : 8118
183.247.215.218 : 30001
222.174.11.87 : 7890
183.222.217.168 : 9091
182.139.111.125 : 9000
60.211.218.78 : 53281
220.170.145.103 : 7302
183.247.199.114 : 30001
218.1.142.142 : 57114
222.64.153.165 : 9000
61.61.26.181 : 80
218.28.141.66 : 8001
223.94.85.131 : 9091
221.178.239.200 : 7302
182.139.110.124 : 9000
43.248.133.29 : 8080
A dict rather than a list is used for storage here in order to deduplicate:
it keeps the same IP from being written repeatedly and adding overhead.
The IPs that free proxy sites hand out are usually of low quality, so to keep later use efficient,
the initial collection must be followed by an availability check on every IP.
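As a minimal sketch of the dedup idea (the page dicts and addresses below are invented for illustration), merging each page's results into one dict keyed by IP keeps every IP exactly once:

# Hypothetical results from two scraped pages; '1.2.3.4' appears on both
page1 = {'1.2.3.4': '80', '5.6.7.8': '8080'}
page2 = {'1.2.3.4': '80'}

ip_dic = {}
ip_dic = dict(ip_dic, **page1)  # merge page 1
ip_dic = dict(ip_dic, **page2)  # merge page 2: the duplicate key collapses
print(len(ip_dic))              # 2, not 3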
Visiting http://httpbin.org/ip shows the IP the current request is coming from.
For example, my result is:
{
  "origin": "223.104.40.44"
}
So we request that page through each candidate proxy,
and comparing the IP parsed from the response with the proxy IP we passed in tells us whether the proxy worked.
def test(ip, port):
    # If the proxy works, the IP parsed from the page equals the input IP
    # True: proxy works; False: proxy failed
    # getHTMLText and parse are defined in the full program below
    print('Testing ' + str(ip) + '...')
    url = 'http://httpbin.org/ip'
    proxies = {"http": f"http://{ip}:{port}", "https": f"http://{ip}:{port}"}
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.33'
    }
    html = getHTMLText(url=url, headers=headers, data=None, proxies=proxies)
    if html == "GET failed":
        return False
    result = parse(html)
    return bool(result) and result[0] == ip

def test_list(ip_dic):
    ip_list = list(ip_dic.keys())
    for num in range(len(ip_list)):
        if test(ip_list[num], ip_dic[ip_list[num]]):
            print(str(ip_list[num]) + ' is valid')
        else:
            print(str(ip_list[num]) + ' is invalid')
            ip_dic.pop(ip_list[num])
    return ip_dic
A quick test on two random IPs:

ip_dic = {
    '101.200.127.149': '3129',
    '58.220.95.114': '10053'
}
test_list(ip_dic)
Run result:

Testing 101.200.127.149...
101.200.127.149 is valid
Testing 58.220.95.114...
58.220.95.114 is invalid
The verified IPs and ports are stored locally so that other crawler programs can use them.
There are many storage options: MySQL, txt, Excel, and so on; I went with the simplest, a plain text file.
## 4. Results
def save_ip_text(ip_dic):
    # Append each "ip,\tport" pair to a local text file
    with open("IP_Pool.txt", 'a', encoding='utf-8') as fd:
        for ip in list(ip_dic.keys()):
            fd.write(str(ip) + ",\t" + str(ip_dic[ip]) + '\n')
    print('Usable IP pool saved to IP_Pool.txt')

def show_ip(ip_dic):
    # Simple console dump of the pool
    for ip in list(ip_dic.keys()):
        print(str(ip) + ":\t" + str(ip_dic[ip]))
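Since MySQL was mentioned as an option above, here is a purely optional sketch of the same save step backed by the standard-library sqlite3 module instead (save_ip_sqlite and the ip_pool table are names introduced here, not part of the original program):

import sqlite3

def save_ip_sqlite(ip_dic, db_path="IP_Pool.db"):
    # Hypothetical alternative to save_ip_text: keep the pool in SQLite
    con = sqlite3.connect(db_path)
    con.execute("CREATE TABLE IF NOT EXISTS ip_pool (ip TEXT PRIMARY KEY, port TEXT)")
    con.executemany("INSERT OR REPLACE INTO ip_pool VALUES (?, ?)", ip_dic.items())
    con.commit()
    con.close()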
Below is the complete proxy-pool program; both the IP collection and the validity checks run single-threaded.
import random
import time
import re
import requests
from lxml import etree

# 1. requests wrapper that fetches a page's static source
def getHTMLText(url, data, headers, proxies, code='utf-8'):
    try:
        # headers disguise the program as a browser so it is not detected as a bot
        r = requests.get(url=url, params=data, headers=headers, proxies=proxies)
        # t = random.randint(1, 5)  # optional random sleep to look less machine-like
        # time.sleep(t)
        r.raise_for_status()
        r.encoding = code
        return r.text
    # return the static source, or an error marker on any failure
    except Exception:
        return "GET failed"
# 2. The proxy pool: one scraper per source site
# 1
def get_kuaidaili_IP():
    # Scrape IPs and ports from the first three pages of kuaidaili
    print('Scraping the first three pages of kuaidaili for IPs and ports')
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.33'
    }
    parser = etree.HTMLParser(encoding="utf-8")
    ip_dic = {}
    for i in range(1, 4):
        url = 'https://free.kuaidaili.com/free/inha/' + str(i) + '/'
        html = getHTMLText(url=url, headers=headers, data=None, proxies=None)
        tree = etree.HTML(html, parser=parser)  # parse the HTML document
        ip_list = tree.xpath('/html/body/div/div[4]/div[2]/div[2]/div[2]/table/tbody/tr/td[1]/text()')
        post_list = tree.xpath('/html/body/div/div[4]/div[2]/div[2]/div[2]/table/tbody/tr/td[2]/text()')
        dic = dict(zip(ip_list, post_list))
        ip_dic = dict(ip_dic, **dic)
    return ip_dic
# 2
def get_66ip_IP():
    # Scrape IPs and ports from the first three pages of 66ip
    print('Scraping the first three pages of 66ip for IPs and ports')
    ip_dic = {}
    parser = etree.HTMLParser(encoding="utf-8")
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.33'
    }

    def obtain(url):
        html = getHTMLText(url=url, headers=headers, data=None, proxies=None)
        tree = etree.HTML(html, parser=parser)  # parse the HTML document
        ip_list = tree.xpath('//*[@id="main"]/div[1]/div[2]/div[1]//tr/td[1]/text()')
        post_list = tree.xpath('//*[@id="main"]/div[1]/div[2]/div[1]//tr/td[2]/text()')
        return dict(zip(ip_list, post_list))

    url = 'http://www.66ip.cn/index.html'
    ip_dic = dict(ip_dic, **obtain(url))
    for i in range(2, 4):
        url = 'http://www.66ip.cn/' + str(i) + '.html'
        ip_dic = dict(ip_dic, **obtain(url))
    return ip_dic
# 3
def get_ip3366_IP():
    # Scrape IPs and ports from the first three pages of ip3366
    print('Scraping the first three pages of ip3366 for IPs and ports')
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.33'
    }
    parser = etree.HTMLParser(encoding="utf-8")
    ip_dic = {}
    for i in range(1, 4):
        url = 'http://www.ip3366.net/free/?stype=1&page=' + str(i)
        html = getHTMLText(url=url, headers=headers, data=None, proxies=None)
        tree = etree.HTML(html, parser=parser)  # parse the HTML document
        ip_list = tree.xpath('//*[@id="list"]/table/tbody/tr/td[1]/text()')
        post_list = tree.xpath('//*[@id="list"]/table/tbody/tr/td[2]/text()')
        dic = dict(zip(ip_list, post_list))
        ip_dic = dict(ip_dic, **dic)
    return ip_dic
# 4
def get_89ip_IP():
    # Scrape IPs and ports from the first three pages of 89ip
    print('Scraping the first three pages of 89ip for IPs and ports')
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.33'
    }
    parser = etree.HTMLParser(encoding="utf-8")
    ip_dic = {}
    for i in range(1, 4):
        # pages 1-3 are index_1.html .. index_3.html
        url = 'https://www.89ip.cn/index_' + str(i) + '.html'
        html = getHTMLText(url=url, headers=headers, data=None, proxies=None)
        tree = etree.HTML(html, parser=parser)  # parse the HTML document
        ip_list = tree.xpath('//div[@class="layui-form"]//tr/td[1]/text()')
        post_list = tree.xpath('//div[@class="layui-form"]//tr/td[2]/text()')
        dic = dict(zip(ip_list, post_list))
        ip_dic = dict(ip_dic, **dic)
    return ip_dic
# 5
def get_kxdaili_IP():
    # Scrape kxdaili's elite and anonymous proxy pages for IPs and ports
    print('Scraping the elite and anonymous proxy pages of kxdaili for IPs and ports')
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.33'
    }
    parser = etree.HTMLParser(encoding="utf-8")
    ip_dic = {}
    # the two pages differ only in URL, so loop over both instead of duplicating the block
    for url in ('http://www.kxdaili.com/dailiip.html', 'http://www.kxdaili.com/dailiip/2/1.html'):
        html = getHTMLText(url=url, headers=headers, data=None, proxies=None)
        tree = etree.HTML(html, parser=parser)  # parse the HTML document
        ip_list = tree.xpath('//div[@class="hot-product-content"]//tr/td[1]/text()')
        post_list = tree.xpath('//div[@class="hot-product-content"]//tr/td[2]/text()')
        dic = dict(zip(ip_list, post_list))
        ip_dic = dict(ip_dic, **dic)
    return ip_dic
# 3. Testing
def parse(html):
    # Use a regular expression to extract every IP address on the page
    ip_list = re.findall(r'(?:\d{1,3}\.){3}\d{1,3}', html)
    return ip_list
def test(ip, port):
    # If the proxy works, the IP parsed from the page equals the input IP
    # True: proxy works; False: proxy failed
    print('Testing ' + str(ip) + '...')
    url = 'http://httpbin.org/ip'
    proxies = {"http": f"http://{ip}:{port}", "https": f"http://{ip}:{port}"}
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.33'
    }
    html = getHTMLText(url=url, headers=headers, data=None, proxies=proxies)
    if html == "GET failed":
        return False
    result = parse(html)
    return bool(result) and result[0] == ip

def test_list(ip_dic):
    ip_list = list(ip_dic.keys())
    for num in range(len(ip_list)):
        if test(ip_list[num], ip_dic[ip_list[num]]):
            print(str(ip_list[num]) + ' is valid')
        else:
            print(str(ip_list[num]) + ' is invalid')
            ip_dic.pop(ip_list[num])
    return ip_dic
# 4. Saving and displaying the results
def save_ip_text(ip_dic):
    # Append each "ip,\tport" pair to a local text file
    with open("IP_Pool.txt", 'a', encoding='utf-8') as fd:
        for ip in list(ip_dic.keys()):
            fd.write(str(ip) + ",\t" + str(ip_dic[ip]) + '\n')
    print('Usable IP pool saved to IP_Pool.txt')

def show_ip(ip_dic):
    # Simple console dump of the pool
    for ip in list(ip_dic.keys()):
        print(str(ip) + ":\t" + str(ip_dic[ip]))
def main():
    print('------------------------------------------------')
    print('------------------------------------------------')
    print('1. Start initial IP collection')
    ip_dic = {}
    ip_dic = dict(ip_dic, **get_kuaidaili_IP())
    ip_dic = dict(ip_dic, **get_66ip_IP())
    ip_dic = dict(ip_dic, **get_ip3366_IP())
    ip_dic = dict(ip_dic, **get_89ip_IP())
    ip_dic = dict(ip_dic, **get_kxdaili_IP())
    print('2. Initial IP collection finished')
    print('Scraped a total of\t' + str(len(ip_dic)) + ' IPs')
    print('------------------------------------------------')
    print('------------------------------------------------')
    print('3. Start availability testing')
    ip_dic = test_list(ip_dic)
    print('------------------------------------------------')
    print('------------------------------------------------')
    print('4. Save the valid IPs')
    save_ip_text(ip_dic)
    print('Final count of valid IPs:\t' + str(len(ip_dic)))

if __name__ == '__main__':
    main()
Run result:
"D:\Program Files\Python\python.exe"
------------------------------------------------
------------------------------------------------
1. Start initial IP collection
Scraping the first three pages of kuaidaili for IPs and ports
Scraping the first three pages of 66ip for IPs and ports
Scraping the first three pages of ip3366 for IPs and ports
Scraping the first three pages of 89ip for IPs and ports
Scraping the elite and anonymous proxy pages of kxdaili for IPs and ports
2. Initial IP collection finished
Scraped a total of 100 IPs
------------------------------------------------
------------------------------------------------
3. Start availability testing
Testing 117.114.149.66...
117.114.149.66 is invalid
Testing 122.9.101.6...
122.9.101.6 is invalid
Testing 47.113.90.161...
47.113.90.161 is valid
Testing 222.74.73.202...
........
------------------------------------------------
------------------------------------------------
4. Save the valid IPs
Usable IP pool saved to IP_Pool.txt
Final count of valid IPs: 5
Process finished with exit code 0
In the end, we scraped a dozen or so pages of free IPs from several sites, tested them for availability, and saved the survivors to a local text file.
Later crawlers can simply pull a random entry from that file, as in the sketch below.
But the results also show how low the hit rate of free proxies is: after filtering, fewer than 10 IPs remained usable.
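As one possible way to consume the pool (load_random_proxy is a helper name introduced here; it assumes the "ip,\tport" line format that save_ip_text writes), a crawler might do:

import random

def load_random_proxy(path="IP_Pool.txt"):
    # Read the "ip,\tport" lines written by save_ip_text and pick one at random
    with open(path, encoding='utf-8') as fd:
        pairs = [line.strip().split(",\t") for line in fd if line.strip()]
    ip, port = random.choice(pairs)
    return {"http": f"http://{ip}:{port}", "https": f"http://{ip}:{port}"}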
Feeding the initially collected dict into URLs below gives us multithreaded validation,
which is much faster than the single-threaded version above.
import threading
import requests
import time
import queue
import re

start = time.time()

# Candidate proxies used to seed the work queue
URLs = {
    '120.220.220.95': '8085',
    '101.200.127.149': '3129',
    '183.247.199.215': '30001',
    '61.216.185.88': '60808'
}
# Worker threads: each one keeps pulling IPs from the shared queue
class myThread(threading.Thread):
    def __init__(self, name, q):
        threading.Thread.__init__(self)
        self.name = name  # thread name
        self.q = q        # shared work queue
    def run(self):
        print("Starting " + self.name)
        while True:
            try:
                # crawl() is the time-consuming step
                crawl(self.name, self.q)
            except:
                # queue.Empty raised by q.get() lands here and ends the thread
                break
        print("Exiting " + self.name)
def getHTMLText(url, data, headers, proxies, code='utf-8'):
    try:
        r = requests.get(url=url, params=data, headers=headers, proxies=proxies)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except Exception:
        return "GET failed"
def parse(html):
    # Use a regular expression to extract every IP address on the page
    ip_list = re.findall(r'(?:\d{1,3}\.){3}\d{1,3}', html)
    return ip_list
def crawl(threadName, q):
    ip = q.get(timeout=2)  # raises queue.Empty once the queue runs dry
    print(threadName + ' testing ' + str(ip) + '...')
    url = 'http://httpbin.org/ip'
    proxies = {"http": f"http://{ip}:{URLs.get(ip)}", "https": f"http://{ip}:{URLs.get(ip)}"}
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.33'
    }
    html = getHTMLText(url=url, headers=headers, data=None, proxies=proxies)
    if html == "GET failed":
        print(str(ip) + ' is invalid')
        return False
    result = parse(html)
    if result and result[0] == ip:
        print(str(ip) + ' is valid')
        return True
    print(str(ip) + ' is invalid')
    URLs.pop(ip)
    return False
workQueue = queue.Queue(len(URLs.keys()))
for url in URLs.keys():
    workQueue.put(url)

threads = []
for i in range(1, 5):
    # create 4 worker threads
    thread = myThread("Thread-" + str(i), q=workQueue)
    thread.start()
    threads.append(thread)

# wait for all threads to finish
for thread in threads:
    thread.join()

end = time.time()
print("Multithreaded IP validation with Queue took: {} s".format(end - start))
print("Exiting Main Thread")
Run result:
"D:\Program Files\Python\python.exe"
Starting Thread-1
Thread-1 testing 120.220.220.95...
Starting Thread-2
Thread-2 testing 101.200.127.149...
Starting Thread-3
Thread-3 testing 183.247.199.215...
Starting Thread-4
Thread-4 testing 61.216.185.88...
183.247.199.215 is invalid
101.200.127.149 is valid
Exiting Thread-3
Exiting Thread-2
120.220.220.95 is valid
Exiting Thread-1
61.216.185.88 is invalid
Exiting Thread-4
Multithreaded IP validation with Queue took: 23.041887998580933 s
Exiting Main Thread
Process finished with exit code 0
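As an alternative to managing threads by hand, the standard library's multiprocessing.dummy.Pool provides a thread pool with a simple map interface. A rough sketch of the validation step written that way (test_list_pooled is a name introduced here; it reuses the test() function from the single-threaded program) might be:

from multiprocessing.dummy import Pool  # thread pool with the multiprocessing.Pool API

def test_list_pooled(ip_dic, workers=8):
    # Run test() on all proxies concurrently and keep only the ones that pass
    with Pool(workers) as pool:
        results = pool.map(lambda item: (item, test(*item)), list(ip_dic.items()))
    return {ip: port for (ip, port), ok in results if ok}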