Using proxy IPs is a very common part of writing crawlers, so let's write a small package that makes our future crawler projects easier.
As before, the proxies come from the same old site: http://www.xicidaili.com/
We scrape the high-anonymity proxy pages. If you look at the page source on Xici, the HTML is not written very cleanly. I use XPath to extract the page content; regular expressions would also work, but in my experience XPath is simpler and not noticeably slower.
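To get a feel for what those XPath expressions return, here is a minimal sketch run against a made-up table row in the same shape as Xici's proxy listing (the row values are illustrative, not a live proxy):

from lxml import etree

# A fabricated row shaped like Xici's proxy table:
# td[2] holds the IP, td[3] the port, td[6] the proxy type.
sample = '''
<table>
  <tr>
    <td><img src="cn.png"/></td>
    <td>183.62.196.10</td>
    <td>3128</td>
    <td>Guangdong</td>
    <td>high anonymity</td>
    <td>HTTP</td>
  </tr>
</table>
'''

selector = etree.HTML(sample)
print(selector.xpath('//td[2]/text()'))  # ['183.62.196.10']
print(selector.xpath('//td[3]/text()'))  # ['3128']
print(selector.xpath('//td[6]/text()'))  # ['HTTP']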
Here is the code for the core part:
def GetIP(nub, HTTPUrl=0, HTTPSUrl=0):
    '''
    Fetch proxy IPs.
    nub is the number of valid proxy IPs to collect.
    HTTPUrl / HTTPSUrl are the target URLs used to test the proxies.
    '''
    print("Fetching proxy IPs...")
    page = 1
    while len(usableIP) < nub:
        # print("Testing page " + str(page))
        url = "http://www.xicidaili.com/nt/" + str(page)
        headers = header()
        sess = requests.Session()
        html = sess.get(url, headers=headers).text
        selector = etree.HTML(html)
        # Extract the IP addresses
        ipList = selector.xpath('//td[2]/text()')
        # Extract the port numbers
        portList = selector.xpath('//td[3]/text()')
        # Extract the proxy types
        typeList = selector.xpath('//td[6]/text()')
        '''Single test:
        c = detectionIP('183.62.196.10', '3128', 'HTTP')
        return c
        '''
        '''Testing the IPs one by one is slow, so multiple threads are used.'''
        # Keep the threads so the main thread can wait for all of them
        threadc = []
        # Verify whether each proxy IP is usable
        for i in range(len(ipList)):
            c = threading.Thread(target=detectionIP, args=(ipList[i], portList[i], typeList[i], HTTPSUrl, HTTPUrl))
            c.start()
            threadc.append(c)
        for c in threadc:
            c.join()
        page += 1
    print("Proxy IPs fetched successfully")
    return usableIP
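As a side note, the same fan-out-and-join pattern can be written with concurrent.futures from the standard library. A pool caps the number of simultaneous connections, and iterating with zip avoids indexing past the end of a short page. This check_page helper is a hypothetical sketch, not part of the package:

from concurrent.futures import ThreadPoolExecutor

def check_page(ipList, portList, typeList, HTTPSUrl, HTTPUrl, workers=20):
    # Submit one detectionIP check per scraped proxy; leaving the
    # `with` block waits for every check to finish, which replaces
    # the manual join() loop above.
    with ThreadPoolExecutor(max_workers=workers) as executor:
        for ip, port, ipType in zip(ipList, portList, typeList):
            executor.submit(detectionIP, ip, port, ipType, HTTPSUrl, HTTPUrl)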
GetIP relies on detectionIP to verify that a proxy IP actually works. This is a validation method I wrote for my own needs, so it is not very general; at the time I wrote it to download audiobook novels.
The validation itself is simple: request a target site through the proxy IP (preferably not Baidu, since it blocks almost all free proxies and the test tells you nothing). I recommend testing against the site you actually plan to crawl, which is a much more meaningful test. If the request to the target site returns code=200, the access counts as a success. However, I found that some IPs are not very healthy: no matter which site you request through them, they redirect to Youdao. So the code below adds one more small check.
Here is the IP validation code:
def detectionIP(ip, port, type, httpsurl, httpurl):
    '''
    Check whether a proxy IP is alive.
    '''
    try:
        if type == 'HTTP' and httpurl != 0:
            IPAgency = {
                "http": "http://" + ip + ":" + port
            }
            r = requests.get(httpurl, headers=header(), proxies=IPAgency, timeout=5)
            if r.status_code == 200:
                try:
                    # Further check that the proxy really reaches the target site
                    content = requests.get(httpurl, headers=header(), proxies=IPAgency, timeout=5)
                    content.encoding = "GBK"  # the target site is GBK-encoded
                    s = etree.HTML(content.text)
                    # xpath returns a list
                    text = s.xpath('//a[@href="//www.wotingpingshu.com"]/text()')[0]
                    if text == "我听评书网":
                        usableIP.append(ip + ":" + port)  # + "@" + type + " " + str(r.status_code)
                except BaseException:
                    pass
        elif type == 'HTTPS' and httpsurl != 0:
            IPAgency = {
                "https": "http://" + ip + ":" + port
            }
            r = requests.get(httpsurl, headers=header(), proxies=IPAgency, timeout=5)
            if r.status_code == 200:
                usableIP.append(ip + ":" + port)  # + "@" + type + " " + str(r.status_code)
    except BaseException:
        # print(ip + " invalid " + type)
        pass
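If you have no fixed target site to test against, a more generic approach (a sketch, not part of the package) is to ask a public echo service such as httpbin.org which IP it sees. If the reported origin is the proxy's IP, the traffic really went out through the proxy instead of being silently redirected:

import requests

def checkProxy(ip, port, timeout=5):
    # Hypothetical helper: http://httpbin.org/ip echoes the caller's
    # IP as JSON, e.g. {"origin": "183.62.196.10"}.
    proxies = {"http": "http://" + ip + ":" + port}
    try:
        r = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=timeout)
        return r.status_code == 200 and ip in r.json().get("origin", "")
    except (requests.RequestException, ValueError):
        return False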
Those are the main parts of the code. The complete code is below:
import requests
import random
import threading
import re
from lxml import etree
headersList = [
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
]
usableIP = []
def header():
    # Pick a random User-Agent for each request
    headers = {"User-Agent": random.choice(headersList)}
    return headers
def detectionIP(ip, port, type, httpsurl, httpurl):
    '''
    Check whether a proxy IP is alive.
    '''
    try:
        if type == 'HTTP' and httpurl != 0:
            IPAgency = {
                "http": "http://" + ip + ":" + port
            }
            r = requests.get(httpurl, headers=header(), proxies=IPAgency, timeout=5)
            if r.status_code == 200:
                try:
                    # Further check that the proxy really reaches the target site
                    content = requests.get(httpurl, headers=header(), proxies=IPAgency, timeout=5)
                    content.encoding = "GBK"  # the target site is GBK-encoded
                    s = etree.HTML(content.text)
                    # xpath returns a list
                    text = s.xpath('//a[@href="//www.wotingpingshu.com"]/text()')[0]
                    if text == "我听评书网":
                        usableIP.append(ip + ":" + port)  # + "@" + type + " " + str(r.status_code)
                except BaseException:
                    pass
        elif type == 'HTTPS' and httpsurl != 0:
            IPAgency = {
                "https": "http://" + ip + ":" + port
            }
            r = requests.get(httpsurl, headers=header(), proxies=IPAgency, timeout=5)
            if r.status_code == 200:
                usableIP.append(ip + ":" + port)  # + "@" + type + " " + str(r.status_code)
    except BaseException:
        # print(ip + " invalid " + type)
        pass
def GetIP(nub, HTTPUrl=0, HTTPSUrl=0):
    '''
    Fetch proxy IPs.
    nub is the number of valid proxy IPs to collect.
    HTTPUrl / HTTPSUrl are the target URLs used to test the proxies.
    '''
    print("Fetching proxy IPs...")
    page = 1
    while len(usableIP) < nub:
        # print("Testing page " + str(page))
        url = "http://www.xicidaili.com/nt/" + str(page)
        headers = header()
        sess = requests.Session()
        html = sess.get(url, headers=headers).text
        selector = etree.HTML(html)
        # Extract the IP addresses
        ipList = selector.xpath('//td[2]/text()')
        # Extract the port numbers
        portList = selector.xpath('//td[3]/text()')
        # Extract the proxy types
        typeList = selector.xpath('//td[6]/text()')
        '''Single test:
        c = detectionIP('183.62.196.10', '3128', 'HTTP')
        return c
        '''
        '''Testing the IPs one by one is slow, so multiple threads are used.'''
        # Keep the threads so the main thread can wait for all of them
        threadc = []
        # Verify whether each proxy IP is usable
        for i in range(len(ipList)):
            c = threading.Thread(target=detectionIP, args=(ipList[i], portList[i], typeList[i], HTTPSUrl, HTTPUrl))
            c.start()
            threadc.append(c)
        for c in threadc:
            c.join()
        page += 1
    print("Proxy IPs fetched successfully")
    '''This was the last step of the program; kept as a comment for reference, no longer used
    # Pick a random index to select an IP and port
    nub = random.randint(0, 99)
    IPAgency = {
        "http": "http://" + ipList[nub] + ":" + portList[nub]
    }
    '''
    '''
    newIP = []
    f = open("IP.txt", "w")
    for i in range(0, 99):
        usableIP.append(ipList[i] + ":" + portList[i])
        f.write(ipList[i] + ":" + portList[i] + "@" + typeList[i] + "\n")
    f.close()
    '''
    return usableIP
if __name__ == "__main__":
    # Baidu was tried for testing first, but it blocks almost all free proxy IPs
    IP = GetIP(3, 'http://www.wotingpingshu.com/')
    # print("http:", ip['http'])
    pattern = re.compile(r'^(.*)\@(.*)$')
    f = open("IP.txt", "w")
    for i in IP:
        print(i)
        # print(pattern.match(i).group(1))
        f.write(i + '\n')
    f.close()
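Finally, a quick sketch of how the harvested proxies might be used in an actual crawl: read one of the saved "ip:port" lines back from IP.txt and hand it to requests through the proxies argument (the target URL here is just the test site from above):

import random
import requests

# Pick one saved proxy at random and route a request through it
with open("IP.txt") as f:
    proxyAddr = random.choice(f.read().splitlines())
proxies = {"http": "http://" + proxyAddr}
resp = requests.get("http://www.wotingpingshu.com/", proxies=proxies, timeout=5)
print(resp.status_code)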