Proxy IPs: geolocating by IP and crawling web pages through a proxy IP

Looking up the physical location of an IP

openGPS: https://www.opengps.cn/Data/IP/ipplus.aspx

https://ip.rtbasia.com/

https://www.ipip.net/ip.html
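
If you need the lookup in code rather than through one of these web pages, a public geolocation API works too. Below is a minimal sketch, assuming the free JSON endpoint of ip-api.com (HTTP only on the free tier; the field names follow its documented response format) — an illustration, not part of the original post:

import requests

def locate_ip(ip):
    # Query ip-api.com's free JSON endpoint for a rough location
    r = requests.get('http://ip-api.com/json/%s' % ip, timeout=5)
    info = r.json()
    if info.get('status') != 'success':
        return None
    return '%s, %s, %s (lat %s, lon %s)' % (
        info['country'], info['regionName'], info['city'],
        info['lat'], info['lon'])

print(locate_ip('8.8.8.8'))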

 

Domestic (China) transparent proxy IPs: http://www.xicidaili.com/nt

Free proxy IPs: http://www.xicidaili.com/

 

Fetching free proxy IPs (and verifying which ones actually work)

import re
import requests
from threading import Thread
from threading import Lock
from queue import Queue, Empty

# All proxy IPs scraped from the Xici site
all_find_list = []
# Push every scraped proxy into a queue so the four worker threads can pull from it
gaoni_queue = Queue()
# Proxy IPs that connected successfully
success_list = []

lock = Lock()


def get_proxy(ip):
    # Build the proxies dict that requests expects from an 'ip:port' string
    proxy = {'http': 'http://' + ip, 'https': 'https://' + ip}
    return proxy


def checking_ip():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    }

    while True:
        # If no proxy arrives within 1 second, every proxy has already been
        # handed out, so this worker can exit.
        try:
            ip = gaoni_queue.get(True, 1)
        except Empty:
            break

        proxy = get_proxy(ip)
        url = 'https://www.csdn.net/'
        # Use the URL above to test whether the proxy can actually connect
        try:
            requests.get(url, headers=headers, proxies=proxy, timeout=5)
        except requests.RequestException:
            lock.acquire()
            print(ip, 'failed')
            lock.release()
        else:
            lock.acquire()
            print(ip, 'OK')
            success_list.append(ip)
            lock.release()


def get_all():
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
    global all_find_list
    # Only the first page is fetched here; raise the upper bound for more
    for i in range(1, 2):
        # Fetch IPs from the high-anonymity pages of the xici site
        url = 'http://www.xicidaili.com/nn/%d' % i
        r = requests.get(url, headers=headers)
        data = r.text
        # Capture the IP and port from adjacent <td> cells of each table row
        p = r'<td>(\d+\.\d+\.\d+\.\d+)</td>\s*<td>(\d+)</td>'
        find_list = re.findall(p, data)
        all_find_list += find_list
    # Join each IP address and port into the 'ip:port' format
    for row in all_find_list:
        ip = row[0] + ':' + row[1]
        gaoni_queue.put(ip)


if __name__ == '__main__':
    get_all()
    print(gaoni_queue.qsize())
    # Run four checker threads and wait for all of them to finish
    threads = [Thread(target=checking_ip) for _ in range(4)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    # Save the working proxies, one per line
    with open("E:/ip.txt", "w") as f:
        for row in success_list:
            f.write(row + '\n')
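
The same validation can be written more compactly with the standard library's thread pool instead of four hand-managed threads, a queue, and a lock. A minimal sketch, assuming the get_proxy() helper from above and a list candidate_ips of 'ip:port' strings (candidate_ips is a name invented here for illustration):

from concurrent.futures import ThreadPoolExecutor

import requests

def check_one(ip):
    # Returns the ip if the test request succeeds, None otherwise
    try:
        requests.get('https://www.csdn.net/', proxies=get_proxy(ip), timeout=5)
        return ip
    except requests.RequestException:
        return None

# map() runs check_one on up to 4 threads and yields results in input order
with ThreadPoolExecutor(max_workers=4) as pool:
    working = [ip for ip in pool.map(check_one, candidate_ips) if ip]

The pool hands out work and joins its threads itself, so the timeout-based exit from the queue is no longer needed.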

Using a proxy IP to crawl a web page

import urllib.request
import random

proxy_list = []
url = 'https://blog.csdn.net/cuicanxingchen123456/article/details/84306382'
# Read the proxies saved by the checker above, one 'ip:port' per line
with open("E:/ip.txt") as f:
    for line in f:
        line = line.strip()
        if line:
            proxy_list.append(line)

# iplist = ['115.32.41.100:80','58.30.231.36:80','123.56.90.175:3128']
chosen = random.choice(proxy_list)
# Register the proxy for both schemes; the target URL here is https,
# so an 'http'-only mapping would silently bypass the proxy
proxy_support = urllib.request.ProxyHandler({'http': chosen, 'https': chosen})
opener = urllib.request.build_opener(proxy_support)
# opener.addheaders = [('User-Agent','Test_Proxy_Python3.5_maminyao')]
opener.addheaders = [('User-Agent', 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)')]
urllib.request.install_opener(opener)
response = urllib.request.urlopen(url)
html = response.read().decode('utf-8')
print(html)
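
The same fetch is shorter with requests, which accepts the proxies dict directly. A minimal sketch, reusing url from above and the E:/ip.txt file produced by the checker:

import random
import requests

with open('E:/ip.txt') as f:
    proxy_list = [line.strip() for line in f if line.strip()]

chosen = random.choice(proxy_list)
proxies = {'http': 'http://' + chosen, 'https': 'https://' + chosen}
headers = {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'}
# A timeout keeps a dead proxy from hanging the request indefinitely
r = requests.get(url, headers=headers, proxies=proxies, timeout=10)
print(r.text)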

 
