利用爬虫构建IP代理池

  • 准备工作, 安装好我们所要使用到的包.

    # 用于连接MongoDB数据库的包,要事先安装好MongoDB数据库
    pip install pymongo
    # 用于发送请求的包
    pip install requests
    # 用于解析HTML文档,提取我们想要的内容
    pip install lxml
    
  • 源代码(注:免费的代理IP通常不够稳定,偶尔测试用一下还可以;如果需要长期使用,建议选择更稳定的付费代理IP):

    import threading
    import pymongo
    import requests
    from lxml import html
    
    # Listing page the candidate proxies are scraped from.
    url = 'https://www.xicidaili.com/nn/1'

    # Browser-like User-Agent sent with every request made by this script.
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
    }
    etree = html.etree

    # An explicit timeout keeps the script from hanging forever if the
    # listing site never answers; requests applies no timeout by default.
    response = requests.get(url=url, headers=headers, timeout=10)

    # Parsed into its own name so the imported ``lxml.html`` module is not
    # rebound to a document tree.
    page_tree = etree.HTML(response.text)
    # IP address column
    ip = page_tree.xpath("//tr[@class='odd']/td[2]/text()")
    # Port column
    port = page_tree.xpath("//tr[@class='odd']/td[3]/text()")
    # Anonymity level column
    anon = page_tree.xpath("//tr[@class='odd']/td[5]/text()")
    # Protocol column (http or https)
    prot = page_tree.xpath("//tr[@class='odd']/td[6]/text()")
    
    
    # "ip:port" strings of high-anonymity proxies advertised as http
    http_list = []
    # "ip:port" strings of high-anonymity proxies advertised as anything else
    # (treated as https)
    https_list = []
    # Proxies that passed the connectivity test; appended to by save_ip()
    goodip_list = []


    # zip() walks the four scraped columns in lockstep and stops at the
    # shortest one, so columns of unequal length no longer raise IndexError.
    for host, port_no, anonymity, protocol in zip(ip, port, anon, prot):
        # Keep only rows whose anonymity cell is exactly '高匿'.
        if anonymity != '高匿':
            continue
        ip_port = host + ':' + port_no
        # Compare case-insensitively: the page may render the protocol as
        # "HTTP" rather than "http" (assumption - confirm against the live
        # page), which an exact match would misfile as https.
        if protocol.strip().lower() == 'http':
            http_list.append(ip_port)
        else:
            https_list.append(ip_port)
    
    
    # 测试得到的ip是否可用
    def ip_test(proxy):
        """Check whether a request sent through *proxy* appears to come from it.

        Fetches an IP-echo page through the proxy and compares the address
        the page reports with the host part of ``proxy``.

        Args:
            proxy: Proxy address as an ``"ip:port"`` string.

        Returns:
            True if the reported address equals the proxy's host part;
            False on a mismatch or on any failure (timeout, connection
            error, page without the expected element, ...).
        """
        check_url = r'http://ip.tool.chinaz.com/'
        # Split on the last colon instead of slicing off a fixed number of
        # characters, so ports of any length (":80", ":8080", ":53281")
        # are removed correctly.
        proxy_host = proxy.rsplit(':', 1)[0]
        try:
            result = requests.get(
                url=check_url,
                headers=headers,
                proxies={
                    'http': f'http://{proxy}',
                    'https': f'https://{proxy}',
                },
                timeout=5,
            )
            # Parse the echo page and pull out the text of the <dd class="fz24">
            # element(s); xpath returns a list.
            page = etree.HTML(result.text)
            ip_get = page.xpath(r'//dd[@class="fz24"]/text()')
            # An empty list means the expected element was not on the page.
            return bool(ip_get) and ip_get[0].strip() == proxy_host
        except Exception:
            # Best-effort probe: any request or parse failure just marks the
            # proxy as unusable. Unlike a bare ``except:``, this lets
            # KeyboardInterrupt and SystemExit propagate.
            return False
    
    
    def save_ip(proxy, proxy_type):
        """Record *proxy* in ``goodip_list`` if it passes ``ip_test``.

        Args:
            proxy: Proxy address as an ``"ip:port"`` string.
            proxy_type: Protocol label; only used in the printed message.
        """
        # Nothing to do for proxies that fail the connectivity check.
        if not ip_test(proxy):
            return
        print(f'可使用ip:{proxy}, 类型为:{proxy_type}')
        record = {'ip': proxy}
        goodip_list.append(record)
    
    
    if __name__ == '__main__':
        # Test every candidate concurrently, one thread per proxy: all http
        # candidates are started first, then all https candidates.
        tasks = []
        for proxy_type, candidates in (('http', http_list), ('https', https_list)):
            for candidate in candidates:
                task = threading.Thread(target=save_ip, args=(candidate, proxy_type))
                tasks.append(task)
                task.start()
        # Wait until every test has finished so goodip_list is complete.
        for task in tasks:
            task.join()

        # Connect to MongoDB with the client's default settings.
        conn = pymongo.MongoClient()
        try:
            # Database "proxy", collection "proxys". MongoDB creates the
            # collection on first insert, and delete_many() on a collection
            # that does not exist yet removes nothing, so no existence check
            # is needed before clearing it.
            proxys = conn.proxy.proxys
            proxys.delete_many({})
            # insert_many() rejects an empty document list, which is exactly
            # what happens when no proxy passed the test.
            if goodip_list:
                proxys.insert_many(goodip_list)
            else:
                print('没有可用的代理ip, 未写入任何数据')
        finally:
            # Always release the connection, even if a database call failed.
            conn.close()

        print('已完成所有操作!')
    

你可能感兴趣的:(爬虫学习)