How to Get Free Proxy IPs for Web Scraping (Part 3)

Scraping Proxy IPs from Kuaidaili

Code Implementation

import os

import requests
import time
import random
from fake_useragent import UserAgent
from lxml import etree
import csv

"""
Kuaidaili free proxy scraper
"""


class IPSpider(object):

    def __init__(self):
        self.url = "https://www.kuaidaili.com/free/inha/{}/"
        self.headers = {'User-Agent': UserAgent().random}
        # Running total of usable proxies
        self.count = 0

    # Fetch a listing page and return the table rows that hold the proxies
    def get_html(self, url):
        html = requests.get(url=url, headers=self.headers, timeout=10).text
        parser_html = etree.HTML(html)
        tr_list = parser_html.xpath('//tbody/tr')
        return tr_list
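
    # Note: this XPath assumes Kuaidaili still renders the free-proxy list as a
    # static HTML table; if the markup changes or the rows are rendered by
    # JavaScript, tr_list will come back empty and the selector must be updated.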

    # Parse the IP and port out of each row
    def parser_html(self, tr_list):
        proxies_list = []
        for tr in tr_list:
            # Extract the IP address
            ip = tr.xpath('./td[@data-title="IP"]/text()')[0].strip()
            # Extract the port
            port = tr.xpath('./td[@data-title="PORT"]/text()')[0].strip()
            # Pack the IP and port into a requests-style proxies dict
            ip_dict = {
                "http": "http://" + ip + ":" + port,
                "https": "https://" + ip + ":" + port
            }
            proxies_list.append(ip_dict)

        return proxies_list
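
    # Each entry in proxies_list matches the mapping that requests expects for
    # its `proxies` argument, e.g.
    #   {"http": "http://1.2.3.4:8080", "https": "https://1.2.3.4:8080"},
    # so an entry can be passed straight to requests.get(..., proxies=entry).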

    # Check which proxies actually work
    def check_ip(self, proxies_list):
        use_proxy = []

        for ip in proxies_list:
            try:
                response = requests.get(url="http://httpbin.org/", headers=self.headers, proxies=ip, timeout=3)
                # response = requests.get(url="https://www.baidu.com/", headers=self.headers, proxies=ip, timeout=3)
                # A 200 response through the proxy counts as usable
                if response.status_code == 200:
                    use_proxy.append(ip)
                    self.count += 1
                    print('Checking proxy', ip, '-> usable')
            except Exception:
                print('Checking proxy', ip, '-> request timed out, discarded')

            time.sleep(random.randint(2, 3))
        return use_proxy
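
    # httpbin.org simply echoes requests back, which makes it a cheap liveness
    # probe; the commented-out line above shows https://www.baidu.com/ as an
    # alternative test target.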

    # Save usable proxies to a CSV file; to skip saving, comment out the call in run()
    def save_ip(self, proxy, save_filename):
        try:
            if proxy:
                # Write the file to the current user's desktop
                save_path = "c:/Users/" + os.getlogin() + "/Desktop/"
                save_file = save_path + save_filename + ".csv"
                print("File saved to: " + save_file)
                # Write the header row only when creating the file
                write_header = not os.path.exists(save_file)
                with open(save_file, 'a+', encoding='utf-8', newline='') as f:
                    fieldnames = ['http', 'https']
                    writer = csv.DictWriter(f, fieldnames=fieldnames)
                    if write_header:
                        writer.writeheader()
                    writer.writerows(proxy)
        except Exception as e:
            print(e.args)

    def run(self):
        begin = int(input("Enter the start page: "))
        end = int(input("Enter the end page: "))
        filename = input("Enter the output file name: ")
        for page in range(begin, end + 1):
            print(f"################# Page {page} #################")
            # Build the page URL
            url = self.url.format(page)
            # Fetch the rows that hold the proxies
            parser_html = self.get_html(url)
            # Parse out IP/port pairs
            proxies_list = self.parser_html(parser_html)
            # Keep only the proxies that respond
            proxy = self.check_ip(proxies_list)

            # Persist the usable proxies; comment out the next line to skip saving
            self.save_ip(proxy, filename)
            # Throttle between pages
            time.sleep(random.randint(2, 3))


if __name__ == "__main__":
    start = time.time()
    spider = IPSpider()
    spider.run()
    stop = time.time()
    print('Usable proxies found: ' + str(spider.count), 'elapsed: %.2fs' % (stop - start))
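
Once the script has written ips.csv, the saved proxies can be fed straight back into requests. The snippet below is a minimal sketch, not part of the original script: it assumes the CSV sits in the default desktop location used by save_ip, contains the http/https header row written above, and uses http://httpbin.org/ip purely as a placeholder target.

import os
import csv
import random

import requests

# Load the proxies saved by IPSpider.save_ip (default desktop path assumed)
csv_path = "c:/Users/" + os.getlogin() + "/Desktop/ips.csv"
with open(csv_path, encoding='utf-8', newline='') as f:
    proxies_pool = list(csv.DictReader(f))  # one dict per saved proxy

# Route a request through a randomly chosen proxy
proxy = random.choice(proxies_pool)
try:
    resp = requests.get("http://httpbin.org/ip", proxies=proxy, timeout=5)
    print(resp.json())  # shows the IP address the target server saw
except requests.RequestException:
    print("Proxy failed:", proxy)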


Running the Code

Enter the start page: 1
Enter the end page: 2
Enter the output file name: ips
################# Page 1 #################
Checking proxy {'http': 'http://113.121.40.139:9999', 'https': 'https://113.121.40.139:9999'} -> request timed out, discarded
Checking proxy {'http': 'http://183.236.123.242:8060', 'https': 'https://183.236.123.242:8060'} -> request timed out, discarded
Checking proxy {'http': 'http://113.121.47.252:9999', 'https': 'https://113.121.47.252:9999'} -> request timed out, discarded
Checking proxy {'http': 'http://113.121.42.24:9999', 'https': 'https://113.121.42.24:9999'} -> request timed out, discarded
Checking proxy {'http': 'http://36.134.91.82:8888', 'https': 'https://36.134.91.82:8888'} -> request timed out, discarded
Checking proxy {'http': 'http://121.233.206.167:8888', 'https': 'https://121.233.206.167:8888'} -> request timed out, discarded
Checking proxy {'http': 'http://182.34.101.178:9999', 'https': 'https://182.34.101.178:9999'} -> request timed out, discarded
Checking proxy {'http': 'http://60.170.204.30:8060', 'https': 'https://60.170.204.30:8060'} -> request timed out, discarded
Checking proxy {'http': 'http://61.216.156.222:60808', 'https': 'https://61.216.156.222:60808'} -> request timed out, discarded
Checking proxy {'http': 'http://182.140.244.163:8118', 'https': 'https://182.140.244.163:8118'} -> request timed out, discarded
Checking proxy {'http': 'http://222.74.73.202:42055', 'https': 'https://222.74.73.202:42055'} -> request timed out, discarded
Checking proxy {'http': 'http://113.121.39.175:9999', 'https': 'https://113.121.39.175:9999'} -> request timed out, discarded
################# Page 2 #################
Checking proxy {'http': 'http://182.34.21.175:9999', 'https': 'https://182.34.21.175:9999'} -> request timed out, discarded
Checking proxy {'http': 'http://114.232.109.121:8888', 'https': 'https://114.232.109.121:8888'} -> request timed out, discarded
Checking proxy {'http': 'http://114.231.42.76:8888', 'https': 'https://114.231.42.76:8888'} -> request timed out, discarded
Checking proxy {'http': 'http://47.97.191.179:8018', 'https': 'https://47.97.191.179:8018'} -> request timed out, discarded
Checking proxy {'http': 'http://218.75.69.50:57903', 'https': 'https://218.75.69.50:57903'} -> request timed out, discarded
Checking proxy {'http': 'http://114.232.109.219:8888', 'https': 'https://114.232.109.219:8888'} -> request timed out, discarded
Checking proxy {'http': 'http://183.236.232.160:8080', 'https': 'https://183.236.232.160:8080'} -> request timed out, discarded
Checking proxy {'http': 'http://183.64.239.19:8060', 'https': 'https://183.64.239.19:8060'} -> request timed out, discarded
Checking proxy {'http': 'http://60.205.132.71:80', 'https': 'https://60.205.132.71:80'} -> usable
Checking proxy {'http': 'http://114.232.109.19:8888', 'https': 'https://114.232.109.19:8888'} -> request timed out, discarded
Checking proxy {'http': 'http://182.34.103.235:9999', 'https': 'https://182.34.103.235:9999'} -> request timed out, discarded
Checking proxy {'http': 'http://113.121.37.103:9999', 'https': 'https://113.121.37.103:9999'} -> request timed out, discarded
File saved to: c:/Users/qwy/Desktop/ips.csv
Usable proxies found: 1 elapsed: 132.67s
