Python Web Scraping: Building an IP Proxy Pool

Scraping IPs (IPPool.py)

import requests
from lxml import etree
from fake_useragent import UserAgent

# Disguise requests with a random User-Agent
ua = UserAgent()
headers = {'User-Agent': ua.random}

def get_ip():
    ip_list = []
    # Target URL; free proxies go stale quickly, so only the first page is scraped
    url = 'https://www.xicidaili.com/nt/'
    # Send the request
    response = requests.get(url=url, headers=headers)
    # Set the encoding before reading the body
    response.encoding = response.apparent_encoding
    html = etree.HTML(response.text)
    # Note: this matches only rows carrying class="odd"
    tr_list = html.xpath('//tr[@class="odd"]')
    for i in tr_list:
        # IP address
        ip = i.xpath('./td[2]/text()')[0]
        # Port
        port = i.xpath('./td[3]/text()')[0]
        # Protocol (HTTP/HTTPS)
        agreement = i.xpath('./td[6]/text()')[0].lower()
        # Assemble the full proxy URL, e.g. 'http://1.2.3.4:8080'
        ip = agreement + '://' + ip + ':' + port
        ip_list.append(ip)
    return ip_list

if __name__ == '__main__':
    ip_list = get_ip()
    print(ip_list)
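get_ip returns scheme-prefixed strings such as 'http://1.2.3.4:8080', which is exactly the value format requests expects in its proxies mapping. A minimal sketch of plugging one scraped proxy into a request (icanhazip.com, which echoes back the requesting IP, makes a convenient check):

import random
import requests
from IPPool import get_ip

proxy = random.choice(get_ip())   # e.g. 'http://1.2.3.4:8080'
scheme = proxy.split(':')[0]      # the key must match the proxy's scheme
response = requests.get('http://icanhazip.com', proxies={scheme: proxy}, timeout=3)
print(response.text)              # ideally the proxy's IP, not yours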

Testing the IPs

Test method 1 (from multiprocessing.dummy import Pool)

import requests
from multiprocessing.dummy import Pool

# Import the scraper and grab the candidate IP list
from IPPool import get_ip

test_list = get_ip()

# Global list that collects the working proxies
ip_list = []

# IP test site


url = 'http://icanhazip.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0'
}

def ip_test(ip):
    try:
        # Match the proxies key to the proxy's own scheme
        if ip.split(":")[0] == 'http':
            proxies = {'http': ip}
        else:
            proxies = {'https': ip}
        # Any response within the timeout counts the proxy as usable
        requests.get(url=url, headers=headers, proxies=proxies, timeout=3)
        ip_list.append(ip)
        print(ip + " is usable")
    except Exception:
        print(ip + " is not usable")

if __name__ == '__main__':
    # Pool of 4 threads (multiprocessing.dummy wraps threads, not processes)
    pool = Pool(4)
    pool.map(ip_test, test_list)
    print(ip_list)
    print("Scraped %s IPs in total: %s usable, %s unusable"
          % (len(test_list), len(ip_list), len(test_list) - len(ip_list)))

Test method 2 (multi-threading with a queue)

import threading
import queue

import requests
from fake_useragent import UserAgent

# Import the scraper and grab the candidate IP list
from IPPool import get_ip

test_list = get_ip()

# Global list that collects the working proxies
ip_pool = []

# Disguise requests with a random User-Agent
ua = UserAgent()
headers = {'User-Agent': ua.random}

url = 'https://www.csdn.net/'
# url = 'http://icanhazip.com/'

def test_ip(queue_list):
    while True:
        try:
            # Non-blocking get avoids racing between empty() and get()
            ip = queue_list.get(block=False)
        except queue.Empty:
            break
        # Match the proxies key to the proxy's own scheme
        if ip.split(":")[0] == 'http':
            proxies = {'http': ip}
        else:
            proxies = {'https': ip}
        try:
            response = requests.get(url=url, headers=headers, proxies=proxies, timeout=3)
            if response.status_code == 200:
                print("[%s] tested %s: usable" % (threading.current_thread().name, proxies))
                ip_pool.append(ip)
        except Exception:
            print("[%s] tested %s: not usable" % (threading.current_thread().name, proxies))

if __name__ == '__main__':
    # Create the queue and fill it with the scraped IPs
    queue_list = queue.Queue()
    for i in test_list:
        queue_list.put(i)
    # Create and start five worker threads, then wait for them all to finish
    out_thread = [threading.Thread(target=test_ip, args=(queue_list,), name="Thread-%s" % item)
                  for item in range(5)]
    for thread in out_thread:
        thread.start()
    for thread in out_thread:
        thread.join()
    print('Testing finished')
    print(ip_pool)
    print("Scraped %s IPs in total: %s usable, %s unusable"
          % (len(test_list), len(ip_pool), len(test_list) - len(ip_pool)))

IPPool2.py (an alternative source: kuaidaili free proxies)

import requests
from lxml import etree
from fake_useragent import UserAgent

# Disguise requests with a random User-Agent
ua = UserAgent()
headers = {'User-Agent': ua.random}

def get_ip():
    ip_list = []
    # Target URL
    url = 'https://www.kuaidaili.com/free/intr/'
    # Send the request
    response = requests.get(url=url, headers=headers)
    # Set the encoding before reading the body
    response.encoding = response.apparent_encoding
    html = etree.HTML(response.text)
    tr_list = html.xpath('//*[@id="list"]/table/tbody/tr')
    for i in tr_list:
        # This version collects only the bare IP address (first column),
        # without port or scheme
        ip = i.xpath('./td[1]/text()')[0]
        ip_list.append(ip)
    return ip_list

if __name__ == '__main__':
    ip_list = get_ip()
    # print(ip_list)
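Because the bare addresses above lack port and scheme, they cannot be fed straight into the proxies mapping used by the test scripts. A sketch of get_ip's loop extended to build full proxy URLs; the column positions td[2] (port) and td[4] (type) are assumptions about the kuaidaili table layout and should be verified against the live page:

for i in tr_list:
    ip = i.xpath('./td[1]/text()')[0]
    port = i.xpath('./td[2]/text()')[0]               # assumed port column
    agreement = i.xpath('./td[4]/text()')[0].lower()  # assumed type column (HTTP/HTTPS)
    # Same scheme-prefixed format as IPPool.get_ip
    ip_list.append(agreement + '://' + ip + ':' + port)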
