HTTP代理池实现

话不多说,代码呈上:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/2/26 13:02
# @Author  : One Fine
__author__ = "One Fine"
# @Site    : 
# @File    : crawl_xici_ip.py
# @Software: PyCharm

import datetime

import MySQLdb
import requests
from scrapy.selector import Selector

# Shared MySQL connection used by every database operation in this module.
# It is opened as soon as the module is imported.
# NOTE(review): host and credentials are hard-coded here; consider loading
# them from configuration or the environment instead.
connect = MySQLdb.connect(
    host='127.0.0.1',
    user='root963',
    passwd='963123',
    db='spider_test',
    charset='utf8',
)


class GetIP(object):
    """Pick working HTTP proxies out of the ``ip_proxy_pool`` table.

    Every method talks to the database through the module-level ``connect``
    connection. Proxies that fail validation are deleted from the table.
    """

    def judeg_ip(self, ip=None, port=None):
        """Check whether a proxy can fetch the test URL.

        When ``ip`` is falsy the request is sent without any proxy.
        A proxy that raises during the request, or that answers with a
        non-2xx status, is removed from the pool through ``delete_ip``.

        :param ip: proxy host, or None to test a direct connection.
        :param port: proxy port.
        :return: True if the response status is 2xx, otherwise False.
        """
        proxy = {}
        if ip:
            # "host:port" is enough; a leading "http://" may also be added.
            proxy = {'http': "{0}:{1}".format(ip, port)}
            print(proxy["http"])

        test_http_url = "http://www.test.onefine.top/ip"

        try:
            # timeout is (connect seconds, read seconds).
            response = requests.get(test_http_url, proxies=proxy, timeout=(0.8, 2))
        except Exception as e:
            # Deliberately broad: any failure at all means the proxy is unusable.
            print("jugeg_ip exception: ", e)
            self.delete_ip(ip)
            return False

        print(">>>status_code: ", response.status_code)
        print(">>>text: ", response.text)
        if 200 <= response.status_code < 300:
            print(str(response.status_code)+", ip可以用!\n")
            return True

        # Reachable but unhealthy: drop this proxy from the database.
        self.delete_ip(ip)
        return False

    def delete_ip(self, ip=None):
        """Delete every row of ``ip_proxy_pool`` whose ``ip`` column equals ``ip``.

        :param ip: address to remove; a falsy value means "nothing to do".
        :return: True when there was nothing to delete or the delete was
            committed, False when the database operation failed.
        """
        if not ip:
            return True

        # Let the driver quote the value instead of formatting it into the SQL.
        delete_sql = "delete from `ip_proxy_pool` where ip=%s"
        cursor = None
        try:
            cursor = connect.cursor()
            cursor.execute(delete_sql, (ip,))
            connect.commit()
        except Exception as e:
            # Best effort: a failed delete must not abort proxy selection.
            print("delete_ip exception: ", e)
            print("IP: %s 移除失败." % ip)
            return False
        finally:
            # cursor stays None when connect.cursor() itself raised.
            if cursor is not None:
                cursor.close()

        print("已移除IP: %s ." % ip)
        return True

    def get_random_ip(self, max_attempts=1000):
        """Return a random proxy from the pool that passes ``judeg_ip``.

        One random row is drawn per attempt. A row that fails validation has
        already been deleted by ``judeg_ip``, so the next attempt draws again.

        :param max_attempts: upper bound on draws. The default is close to
            the depth at which the former recursive implementation would have
            failed with ``RecursionError``.
        :return: ``(ip, port)`` for a working proxy, or False when the table
            is empty or ``max_attempts`` draws were used up.
        """
        random_sql = "select ip, port from ip_proxy_pool order by rand() limit 1;"

        for _ in range(max_attempts):
            print("正在获取ip,请稍后...")

            cursor = connect.cursor()
            try:
                cursor.execute(random_sql)
                # fetchone() yields None on an empty result, which avoids
                # depending on the driver-specific return value of execute().
                ip_info = cursor.fetchone()
            finally:
                cursor.close()

            if ip_info is None:
                print("ip获取失败:数据库为空!")
                return False

            ip, port = ip_info[0], ip_info[1]
            if self.judeg_ip(ip, port):
                # format() also works when the port column is numeric.
                print("可用代理:", "{0}:{1}".format(ip, port))
                return ip, port

        print("ip获取失败:已达到最大重试次数!")
        return False


def get_ip():
    """Return a validated proxy as an ``"ip:port"`` string.

    :return: the proxy address, or None when no proxy could be obtained
        (empty pool or an error while querying/validating).
    """
    try:
        result = GetIP().get_random_ip()
    except Exception as e:
        # Deliberately broad: database or network trouble just means "no proxy".
        print('exception: ', e)
        print('获取失败!')
        return None

    if not result:
        # get_random_ip() reports "no proxy available" with False; check it
        # explicitly instead of relying on a failed tuple-unpacking.
        print('获取失败!')
        return None

    ip, port = result
    # format() also works when the port is not a string.
    address = "{0}:{1}".format(ip, port)
    print("成功获取:", address)
    return address


# Proxy mapping passed to the requests.get() calls made while crawling.
# Built when the module is loaded, so loading the module runs get_ip() once;
# the 'http' entry is None when get_ip() could not supply a proxy.
proxy = dict(http=get_ip())


# Values are bound by the driver, so scraped text is never spliced into SQL.
_INSERT_PROXY_SQL = (
    "INSERT INTO `ip_proxy_pool` VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"
)

# XPath of every cell that must be present, in the order the values are unpacked.
_REQUIRED_CELLS = (
    'td[2]/text()',       # IP address
    'td[3]/text()',       # port
    'td[5]/text()',       # anonymity
    'td[6]/text()',       # type
    'td[7]/div/@title',   # speed
    'td[8]/div/@title',   # connection time
    'td[9]/text()',       # survival time
    'td[10]/text()',      # verify time
)


def _parse_proxy_row(tr):
    """Convert one table row selector into a tuple ready for insertion.

    :param tr: selector for a single ``<tr>`` of the proxy table.
    :return: ``(country, ip, port, server_address, anonymity, type, speed,
        connection_time, survival_time, verify_time)``, or None when a
        required cell is missing or cannot be parsed.
    """
    # Country and server address are optional; a missing one becomes None.
    country = tr.xpath('td[1]/img/@alt').extract_first()
    server_address = tr.xpath('td[4]/a/text()').extract_first()

    cells = [tr.xpath(path).extract_first() for path in _REQUIRED_CELLS]
    if any(cell is None for cell in cells):
        return None
    (ip, port, anonymity, proxy_type,
     speed, connection_time, survival_time, verify_time) = cells

    try:
        # Speed and connection time are a number followed by '秒'.
        speed = float(speed.split('秒')[0])
        connection_time = float(connection_time.split('秒')[0])
        # The verify time lacks the century and the seconds; add both.
        verify_time = datetime.datetime.strptime("20" + verify_time + ":00",
                                                 "%Y-%m-%d %H:%M:%S")
    except ValueError:
        return None

    return (country, ip, port, server_address, anonymity, proxy_type, speed,
            connection_time, survival_time, verify_time)


def _store_proxies(rows):
    """Insert parsed rows into ``ip_proxy_pool``, committing after each row."""
    cursor = connect.cursor()
    try:
        for row in rows:
            print('sql', _INSERT_PROXY_SQL, row)
            try:
                cursor.execute(_INSERT_PROXY_SQL, row)
                connect.commit()
            except Exception as e:
                # Best effort: one rejected row must not stop the others.
                print('insert exception: ', e)
    finally:
        cursor.close()


def crawl_ips(timeout=10):
    """Crawl the proxy listing pages and save every proxy to the database.

    The first request reads the total page count; each page is then fetched
    and its table rows are parsed and inserted. Pages that cannot be fetched
    and rows that cannot be parsed are reported and skipped. A missing
    country or server address is stored as NULL.

    :param timeout: timeout in seconds passed to every ``requests.get`` call.
    :return: None.
    """
    # Listing of domestic high-anonymity proxies.
    url = 'https://www.xicidaili.com/nn/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/63.0.3239.132 Safari/537.36',
    }

    try:
        res = requests.get(url, headers=headers, proxies=proxy, timeout=timeout)
    except requests.RequestException as e:
        print('crawl_ips exception: ', e)
        return
    if res.status_code != 200:
        print('status_code: ', res.status_code)
        return

    # The second-to-last link holds the total number of pages.
    last_page = Selector(text=res.text).xpath('//a[last()-1]/text()').extract_first()
    try:
        page_numbers = int(last_page)
    except (TypeError, ValueError):
        print('crawl_ips exception: ', 'page count not found: %r' % (last_page,))
        return

    for i in range(1, page_numbers + 1):
        try:
            response = requests.get(url + str(i), headers=headers, proxies=proxy,
                                    timeout=timeout)
        except requests.RequestException as e:
            print('crawl_ips exception: ', e)
            continue
        if response.status_code != 200:
            print('status_code: ', response.status_code)
            continue

        all_trs = Selector(text=response.text).xpath('//*[@id="ip_list"]//tr')

        ip_list = []
        # The first row is the table header.
        for tr in all_trs[1:]:
            row = _parse_proxy_row(tr)
            if row is None:
                print("crawl_ips exception 1: ", "malformed row skipped")
                continue
            ip_list.append(row)

        # Save this page's proxies to the database.
        _store_proxies(ip_list)


if __name__ == '__main__':
    crawl_ips()

你可能感兴趣的:(#,Python)