快代理代理IP爬取
import os
import requests
import time
import random
from fake_useragent import UserAgent
from lxml import etree
import csv
"""
快代理-ip
"""
import os
import requests
import time
import random
from fake_useragent import UserAgent
from lxml import etree
import csv
"""
快代理-ip
"""
class IPSipder(object):
    """Scrape free proxies from kuaidaili.com and keep the ones that respond.

    The pipeline, driven by :meth:`run`, is: download a listing page
    (:meth:`get_html`), extract ip/port pairs (:meth:`parser_html`), probe
    each proxy (:meth:`check_ip`) and append the working ones to a CSV file
    (:meth:`save_ip`).
    """

    def __init__(self):
        # Listing URL template; "{}" is filled with the page number in run().
        self.url = "https://www.kuaidaili.com/free/inha/{}/"
        # A single random User-Agent is picked per spider instance.
        self.headers = {'User-Agent': UserAgent().random}
        # Running total of proxies that passed check_ip().
        self.count = 0

    def get_html(self, url, timeout=10):
        """Download one listing page and return its table rows.

        Args:
            url: Fully formatted listing page URL.
            timeout: Seconds to wait for the listing site before giving up,
                so a stalled server cannot hang the whole run.

        Returns:
            The list of ``//tbody/tr`` elements, or an empty list when the
            response could not be parsed into a document.

        Raises:
            requests.RequestException: If the page cannot be downloaded.
        """
        html = requests.get(url=url, headers=self.headers, timeout=timeout).text
        document = etree.HTML(html)
        # etree.HTML may yield None when there is nothing parseable.
        if document is None:
            return []
        return document.xpath('//tbody/tr')

    def parser_html(self, tr_list):
        """Turn table rows into ``requests``-style proxy dictionaries.

        Args:
            tr_list: Iterable of row elements as returned by get_html().

        Returns:
            A list of ``{"http": ..., "https": ...}`` dicts, one per row that
            carries both an IP and a PORT cell. Incomplete rows are skipped
            instead of raising IndexError.
        """
        proxies_list = []
        for tr in tr_list:
            ip_cells = tr.xpath('./td[@data-title="IP"]/text()')
            port_cells = tr.xpath('./td[@data-title="PORT"]/text()')
            if not ip_cells or not port_cells:
                continue
            ip = ip_cells[0].strip()
            port = port_cells[0].strip()
            if not ip or not port:
                continue
            # NOTE(review): only the "http" entry is exercised by check_ip(),
            # which probes a plain-http URL; the "https" entry is unverified.
            proxies_list.append({
                "http": "http://" + ip + ":" + port,
                "https": "https://" + ip + ":" + port,
            })
        return proxies_list

    def check_ip(self, proxies_list, test_url="http://httpbin.org/", timeout=3):
        """Probe each proxy and return the ones that answer with HTTP 200.

        Args:
            proxies_list: Proxy dicts as produced by parser_html().
            test_url: URL fetched through each proxy.
            timeout: Per-proxy request timeout in seconds.

        Returns:
            The sub-list of proxies that responded with status 200.
            ``self.count`` is incremented once per working proxy.
        """
        use_proxy = []
        for ip in proxies_list:
            try:
                response = requests.get(url=test_url, headers=self.headers,
                                        proxies=ip, timeout=timeout)
            except Exception:
                # Deliberately broad: any failure while talking through an
                # untrusted proxy simply disqualifies that proxy.
                print('当前检测ip', ip, '请求超时,检测不合格')
            else:
                if response.status_code == 200:
                    use_proxy.append(ip)
                    self.count += 1
                    print('当前检测ip', ip, '检测可用')
            # Pause between probes to avoid hammering the test endpoint.
            time.sleep(random.randint(2, 3))
        return use_proxy

    def save_ip(self, proxy, save_filename, save_dir=None):
        """Append working proxies to ``<save_dir>/<save_filename>.csv``.

        Saving is best-effort: file-system and CSV errors are printed, not
        raised. Nothing is written when ``proxy`` is empty.

        Args:
            proxy: List of proxy dicts with ``http`` and ``https`` keys.
            save_filename: File name without the ``.csv`` extension.
            save_dir: Target directory. Defaults to the current user's
                Windows desktop (``c:/Users/<login>/Desktop/``).
        """
        if not proxy:
            return
        try:
            if save_dir is None:
                save_dir = "c:/Users/" + os.getlogin() + "/Desktop/"
            save_file = os.path.join(save_dir, save_filename) + ".csv"
            print("文件保存于:" + save_file)
            # newline='' is required by the csv module; without it every row
            # is followed by a blank line on Windows.
            with open(save_file, 'a+', encoding='utf-8', newline='') as f:
                writer = csv.DictWriter(f, fieldnames=['http', 'https'])
                writer.writerows(proxy)
        except (OSError, ValueError, csv.Error) as e:
            print(e.args)

    def run(self):
        """Interactively crawl a page range, check the proxies and save them."""
        begin = int(input("请输入要抓取的开始页:"))
        end = int(input("请输入要抓取的终止页:"))
        filename = input("请输入保存文件名称:")
        for page in range(begin, end + 1):
            print(f"#################{page}页################################")
            # Build the URL for this page.
            url = self.url.format(page)
            # Fetch the table rows, parse them, then probe each proxy.
            tr_list = self.get_html(url)
            proxies_list = self.parser_html(tr_list)
            proxy = self.check_ip(proxies_list)
            # Persist the working proxies; comment this call out to skip saving.
            self.save_ip(proxy, filename)
            # Pause between listing pages.
            time.sleep(random.randint(2, 3))
if __name__ == "__main__":
    # Time the whole interactive crawl and report the number of usable proxies.
    started_at = time.time()
    spider = IPSipder()
    spider.run()
    elapsed = time.time() - started_at
    print(f'有效IP共({spider.count})个', f'搜索耗时:{elapsed:.2f}')
请输入要抓取的开始页:1
请输入要抓取的终止页:2
请输入保存文件名称:ips
#################1页################################
当前检测ip {'http': 'http://113.121.40.139:9999', 'https': 'https://113.121.40.139:9999'} 请求超时,检测不合格
当前检测ip {'http': 'http://183.236.123.242:8060', 'https': 'https://183.236.123.242:8060'} 请求超时,检测不合格
当前检测ip {'http': 'http://113.121.47.252:9999', 'https': 'https://113.121.47.252:9999'} 请求超时,检测不合格
当前检测ip {'http': 'http://113.121.42.24:9999', 'https': 'https://113.121.42.24:9999'} 请求超时,检测不合格
当前检测ip {'http': 'http://36.134.91.82:8888', 'https': 'https://36.134.91.82:8888'} 请求超时,检测不合格
当前检测ip {'http': 'http://121.233.206.167:8888', 'https': 'https://121.233.206.167:8888'} 请求超时,检测不合格
当前检测ip {'http': 'http://182.34.101.178:9999', 'https': 'https://182.34.101.178:9999'} 请求超时,检测不合格
当前检测ip {'http': 'http://60.170.204.30:8060', 'https': 'https://60.170.204.30:8060'} 请求超时,检测不合格
当前检测ip {'http': 'http://61.216.156.222:60808', 'https': 'https://61.216.156.222:60808'} 请求超时,检测不合格
当前检测ip {'http': 'http://182.140.244.163:8118', 'https': 'https://182.140.244.163:8118'} 请求超时,检测不合格
当前检测ip {'http': 'http://222.74.73.202:42055', 'https': 'https://222.74.73.202:42055'} 请求超时,检测不合格
当前检测ip {'http': 'http://113.121.39.175:9999', 'https': 'https://113.121.39.175:9999'} 请求超时,检测不合格
#################2页################################
当前检测ip {'http': 'http://182.34.21.175:9999', 'https': 'https://182.34.21.175:9999'} 请求超时,检测不合格
当前检测ip {'http': 'http://114.232.109.121:8888', 'https': 'https://114.232.109.121:8888'} 请求超时,检测不合格
当前检测ip {'http': 'http://114.231.42.76:8888', 'https': 'https://114.231.42.76:8888'} 请求超时,检测不合格
当前检测ip {'http': 'http://47.97.191.179:8018', 'https': 'https://47.97.191.179:8018'} 请求超时,检测不合格
当前检测ip {'http': 'http://218.75.69.50:57903', 'https': 'https://218.75.69.50:57903'} 请求超时,检测不合格
当前检测ip {'http': 'http://114.232.109.219:8888', 'https': 'https://114.232.109.219:8888'} 请求超时,检测不合格
当前检测ip {'http': 'http://183.236.232.160:8080', 'https': 'https://183.236.232.160:8080'} 请求超时,检测不合格
当前检测ip {'http': 'http://183.64.239.19:8060', 'https': 'https://183.64.239.19:8060'} 请求超时,检测不合格
当前检测ip {'http': 'http://60.205.132.71:80', 'https': 'https://60.205.132.71:80'} 检测可用
当前检测ip {'http': 'http://114.232.109.19:8888', 'https': 'https://114.232.109.19:8888'} 请求超时,检测不合格
当前检测ip {'http': 'http://182.34.103.235:9999', 'https': 'https://182.34.103.235:9999'} 请求超时,检测不合格
当前检测ip {'http': 'http://113.121.37.103:9999', 'https': 'https://113.121.37.103:9999'} 请求超时,检测不合格
文件保存于:c:/Users/qwy/Desktop/ips.csv
有效IP共(1)个 搜索耗时:132.67