爬取ip(IPPool.py)
import requests
from lxml import etree
from fake_useragent import UserAgent
# Disguise the scraper as a browser: the request headers are built once, at
# module level, with a User-Agent string taken from fake_useragent.
ua = UserAgent()
headers = {"User-Agent": ua.random}
def get_ip(timeout=10):
    """Scrape the first page of the xicidaili proxy listing.

    Args:
        timeout: Seconds to wait for the listing page before giving up, so a
            stalled server cannot hang the caller indefinitely.

    Returns:
        A list of proxy URLs built as ``scheme://ip:port`` (scheme
        lower-cased), one for every table row that has all three cells.
    """
    # Proxies expire quickly, so only the first page is fetched.
    url = 'https://www.xicidaili.com/nt/'
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Decode the body with the encoding detected from its content.
    response.encoding = response.apparent_encoding
    page = etree.HTML(response.text)

    ip_list = []
    for row in page.xpath('//tr[@class="odd"]'):
        host = row.xpath('./td[2]/text()')
        port = row.xpath('./td[3]/text()')
        scheme = row.xpath('./td[6]/text()')
        # xpath returns an empty list for a missing/empty cell; skip such rows
        # instead of failing the whole scrape with an IndexError.
        if not (host and port and scheme):
            continue
        # Assemble the full proxy URL.
        ip_list.append(scheme[0].lower() + '://' + host[0] + ':' + port[0])
    return ip_list
if __name__ == "__main__":
    # Run as a script: scrape once and print the resulting proxy list.
    ip_list = get_ip()
    print(ip_list)
测试ip
测试方法一(from multiprocessing.dummy import Pool)
import requests
from multiprocessing.dummy import Pool
#获取爬取到的ip列表
from IPPool import get_ip
# Candidate proxies, scraped once when the module is loaded.
test_list = get_ip()
# Global list that collects the proxies that passed the test.
ip_list = []
# Site the proxies are tested against.
url = 'http://icanhazip.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0'
}
def ip_test(ip):
    """Check one proxy by requesting the test URL through it.

    Prints whether the proxy is usable and, when it is, appends it to the
    module-level ``ip_list``.

    Args:
        ip: Proxy URL string whose part before the first ``:`` is the scheme,
            e.g. ``http://1.2.3.4:80``.
    """
    # The proxies mapping is keyed by the scheme of the proxy URL itself;
    # anything that is not 'http' is registered under 'https'.
    # NOTE(review): requests picks a proxy by the scheme of the *target* URL,
    # and the test site here is http:// — confirm that proxies registered
    # under 'https' are really exercised by this check.
    if ip.split(":")[0] == 'http':
        proxies = {'http': ip}
    else:
        proxies = {'https': ip}

    try:
        requests.get(url=url, headers=headers, proxies=proxies, timeout=3)
    except Exception:
        # Best-effort check: any failure of the request marks the proxy as
        # unusable. Unlike a bare ``except:``, this still lets
        # KeyboardInterrupt and SystemExit propagate.
        print(ip + "不可用")
    else:
        ip_list.append(ip)
        print(ip + "可用")
if __name__ == '__main__':
    # Test the proxies on 4 worker threads. Using the pool as a context
    # manager releases the workers once the (blocking) map call has finished,
    # instead of leaving the pool open until interpreter exit.
    with Pool(4) as pool:
        pool.map(ip_test, test_list)
    print(ip_list)
    # Summary: total scraped, usable, unusable.
    print("总共爬取%s个ip,可用ip为:%s,不可用ip为:%s"%(len(test_list),len(ip_list),len(test_list)-len(ip_list)))
测试方法二(Threading多线程队列)
import threading
import requests
import queue
from fake_useragent import UserAgent
#获取爬取到的ip列表
from IPPool import get_ip
# Candidate proxies, scraped once when the module is loaded.
test_list = get_ip()
# Global list that collects the proxies that passed the test.
ip_pool = []
# Disguise the requests with a User-Agent taken from fake_useragent.
ua = UserAgent()
headers = {"User-Agent": ua.random}
# Site the proxies are tested against (alternative kept for reference).
url = "https://www.csdn.net/"
# url = 'http://icanhazip.com/'
def test_ip(queue_list):
    """Worker loop: take proxies off the queue and test each one.

    Runs until the queue is empty. For every proxy a request to the
    module-level ``url`` is made through it; the outcome is printed together
    with the current thread's name, and proxies that answer with HTTP 200 are
    appended to the module-level ``ip_pool``.

    Args:
        queue_list: ``queue.Queue`` of proxy URL strings whose part before the
            first ``:`` is the scheme, e.g. ``http://1.2.3.4:80``.
    """
    while True:
        try:
            # Non-blocking get: checking empty() and then calling a blocking
            # get() lets another worker take the last item in between, which
            # would leave this thread waiting forever and hang join().
            ip = queue_list.get_nowait()
        except queue.Empty:
            break

        # The proxies mapping is keyed by the scheme of the proxy URL itself;
        # anything that is not 'http' is registered under 'https'.
        if ip.split(":")[0] == 'http':
            proxies = {'http': ip}
        else:
            proxies = {'https': ip}

        worker = threading.current_thread().name
        try:
            response = requests.get(url=url, headers=headers, proxies=proxies, timeout=3)
        except Exception:
            # Best-effort check: any failure of the request marks the proxy as
            # unusable, while KeyboardInterrupt/SystemExit still propagate.
            print("【%s】测试%s,测试结果【不可用】" % (worker, proxies))
            continue

        if response.status_code == 200:
            print("【%s】测试%s,测试结果【可用】" % (worker, proxies))
            ip_pool.append(ip)
        else:
            # A non-200 answer is not counted as usable; report it rather
            # than dropping the proxy without any output.
            print("【%s】测试%s,测试结果【不可用】" % (worker, proxies))
if __name__ == "__main__":
    # Load every scraped proxy into a queue shared by the workers.
    queue_list = queue.Queue()
    for proxy in test_list:
        queue_list.put(proxy)

    # Create five worker threads that all drain the same queue.
    out_thread = []
    for index in range(5):
        worker = threading.Thread(
            target=test_ip,
            args=(queue_list,),
            name="进程%s" % index,
        )
        out_thread.append(worker)

    # Start all workers, then wait for all of them to finish.
    for worker in out_thread:
        worker.start()
    for worker in out_thread:
        worker.join()

    print('测试完成')
    print(ip_pool)
    # Summary: total scraped, usable, unusable.
    total = len(test_list)
    usable = len(ip_pool)
    print("总共爬取%s个ip,可用ip为:%s,不可用ip为:%s" % (total, usable, total - usable))
IPPool2.py
import requests
from lxml import etree
from fake_useragent import UserAgent
# Disguise the scraper as a browser: the request headers are built once, at
# module level, with a User-Agent string taken from fake_useragent.
ua = UserAgent()
headers = {"User-Agent": ua.random}
def get_ip(timeout=10):
    """Scrape the kuaidaili free-proxy listing page.

    Args:
        timeout: Seconds to wait for the listing page before giving up, so a
            stalled server cannot hang the caller indefinitely.

    Returns:
        A list with the text of the first cell of every table row that has
        one (presumably the proxy's IP address — verify against the page).
    """
    url = 'https://www.kuaidaili.com/free/intr/'
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Decode the body with the encoding detected from its content.
    response.encoding = response.apparent_encoding
    page = etree.HTML(response.text)

    ip_list = []
    for row in page.xpath('//*[@id="list"]/table/tbody/tr'):
        first_cell = row.xpath('./td[1]/text()')
        # xpath returns an empty list for a missing/empty cell; skip such rows
        # instead of failing the whole scrape with an IndexError.
        if first_cell:
            ip_list.append(first_cell[0])
    return ip_list
if __name__ == "__main__":
    # Run as a script: scrape once; the result is kept but not printed.
    ip_list = get_ip()
    # print(ip_list)