建立自己的ip池

更换IP是使用爬虫时不可避免的一环,建立属于自己的优质IP池可以大大提高我们的爬取效率。

下面是一个非常简单的版本,用来初步了解IP池的建立流程,以后还会继续更新。

import ast

import pymysql
import requests
from lxml import etree
# Module-level database handles shared by save_ip() and use_ip().
# NOTE(review): connect() is called without arguments here — presumably the
# host/user/password/database settings still need to be supplied; confirm.
conn = pymysql.connect()   # connect to the database
cursor = conn.cursor()

# Listing-page URL template; the "{}" placeholder is filled in by get_ip()
# via str.format.
url = "https://www.xicidaili.com/nn/{}"
# HTTP headers sent with every listing-page request made by get_ip().
headers = {
	"Referer": "https://www.xicidaili.com/nn",
	"Sec-Fetch-Dest": "style",
	"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"
}

def get_ip(url, num=1):
	"""Scrape proxy entries from the first ``num`` listing pages.

	Args:
		url: Page URL template containing one ``{}`` placeholder that is
			replaced by the 1-based page number.
		num: How many pages to fetch (pages 1 through ``num``). Defaults to 1.

	Returns:
		A list of single-entry dicts in the shape expected by the ``proxies``
		argument of ``requests.get``, e.g. ``{"http": "http://1.2.3.4:8080"}``.
		Pages that fail to download or parse are reported and skipped.
	"""
	proxies_lst = []
	for page in range(1, num + 1):
		# Format into a fresh name: overwriting ``url`` would consume the
		# "{}" placeholder on the first pass and pin every later request
		# to the same page.
		page_url = url.format(page)
		try:
			# Deliberately best-effort: one bad page must not abort the crawl.
			response = requests.get(page_url, headers=headers, timeout=10)
			response.raise_for_status()
			html = etree.HTML(response.content)
			if html is None:
				raise ValueError("empty or unparsable page")
			for row in html.xpath(".//table[@id='ip_list']//tr"):
				cells = row.xpath("./td")
				if len(cells) < 6:
					# Rows without enough <td> cells (e.g. a header row).
					continue
				# Use the cell text, not the Element object itself.
				ip = (cells[1].text or "").strip()
				port = (cells[2].text or "").strip()
				# requests looks proxies up by lower-case URL scheme, so
				# normalise in case the page lists e.g. "HTTP".
				protocol = (cells[5].text or "").strip().lower()
				if not (ip and port and protocol):
					continue
				agent_ip = "{}://{}:{}".format(protocol, ip, port)
				proxies_lst.append({protocol: agent_ip})
		except Exception as exc:
			print(page_url, "failed request", exc)
	return proxies_lst

def test_ip(lst):
	"""Keep only the proxies through which a test request succeeds.

	Args:
		lst: List of ``proxies`` dicts as accepted by ``requests.get``.

	Returns:
		The same list object, pruned in place so that it only contains the
		proxies that produced a response (whatever its status code).
	"""
	test_url = "http://ip.chinaz.com/"
	usable = []
	# Collect survivors in a separate list: removing from ``lst`` while
	# iterating over it would skip the element after each removal.
	for proxies in lst:
		try:
			# Timeout so that a dead proxy cannot stall the whole run.
			response = requests.get(test_url, proxies=proxies, timeout=10)
		except requests.RequestException:
			print(proxies, "无效")
			continue
		print(proxies, response.status_code)
		usable.append(proxies)
	# Preserve the original in-place contract for callers holding ``lst``.
	lst[:] = usable
	return lst

def save_ip(callable_lst):
	"""Insert every usable proxy into the ``ipool`` table.

	Each proxy dict is stored as its ``str()`` representation in the ``ip``
	column. All rows are written in one transaction: either every row is
	committed or, on error, the transaction is rolled back and the error
	re-raised.

	Args:
		callable_lst: Iterable of proxy dicts to persist.
	"""
	rows = [(str(proxies),) for proxies in callable_lst]
	if not rows:
		return
	try:
		# Parameters are passed as 1-tuples, as the DB-API expects a sequence.
		cursor.executemany("insert into ipool(ip) values(%s)", rows)
	except Exception:
		conn.rollback()
		raise
	conn.commit()
	


def use_ip(num):
	"""Load up to ``num`` stored proxies from the ``ipool`` table.

	Args:
		num: Maximum number of rows to read.

	Returns:
		A list of proxy dicts rebuilt from the text stored in the ``ip``
		column.
	"""
	# Select the column by name rather than relying on its position in "*".
	cursor.execute("select ip from ipool limit %s", (num,))
	# The column holds the str() form of a dict, which dict() cannot parse;
	# literal_eval safely turns that literal text back into a dict.
	return [ast.literal_eval(row[0]) for row in cursor.fetchall()]

if __name__ == "__main__":
	# Pipeline: scrape -> validate -> persist.
	try:
		lst = get_ip(url, 1)   # scrape the first listing page (num is required)
		callable_lst = test_ip(lst)   # drop proxies that do not respond
		save_ip(callable_lst)
	finally:
		# Release the database handles even if a step above raised.
		cursor.close()
		conn.close()

你可能感兴趣的:(爬虫学习)