使用爬虫的过程中,发现一般的网站都会对IP的访问有一定的限制,所以我们需要使用IP代理功能。
下面是一个获取西刺免费代理IP的小demo。
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import re
import sys
import time
import datetime
import threading
from random import choice
import requests
import bs4
class Proxy:
    """Scrape proxy addresses from a listing page and send requests through them.

    The constructor stores its arguments unchanged:

    * ``url`` - listing page fetched by :meth:`getIpList`.
    * ``header`` - headers value sent with that fetch.
    * ``user_agent`` - a User-Agent string, or a sequence of them from which
      :meth:`done` picks one at random for every attempt.
    """

    # A table cell counts as an IP / a port only when its *entire* text matches.
    _IP_CELL = re.compile(r'\d+\.\d+\.\d+\.\d+')
    _PORT_CELL = re.compile(r'\d+')

    def __init__(self, url='http://www.xicidaili.com/nn', header='', user_agent=''):
        self.url = url
        self.header = header
        self.user_agent = user_agent

    @staticmethod
    def _pair_addresses(cells):
        """Zip the IP-looking cells with the all-digit cells into "ip:port" strings."""
        hosts = [text for text in cells if Proxy._IP_CELL.fullmatch(text)]
        ports = [text for text in cells if Proxy._PORT_CELL.fullmatch(text)]
        # Positional pairing, as before: the i-th IP goes with the i-th port.
        return [":".join(pair) for pair in zip(hosts, ports)]

    def _pick_user_agent(self):
        """Return one User-Agent string, or None when none is configured."""
        agents = self.user_agent
        if not agents:
            return None
        if isinstance(agents, str):
            # choice() on a plain string would return a single character.
            return agents
        return choice(agents)

    def getIpList(self):
        """Fetch ``self.url`` and return the proxies listed in its first table.

        Returns:
            A list of ``"ip:port"`` strings (e.g. ``"125.135.217.7:808"``);
            empty when the page contains no ``<table>``.

        Any exception raised by ``requests.get`` propagates to the caller.
        """
        response = requests.get(self.url, headers=self.header)
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        table = soup.table
        if table is None:
            return []
        # Match on each cell's own text instead of on the stringified tag
        # list: the previous patterns required a space right after the digits,
        # which the "<td>...</td>" markup never provides.
        cells = [td.get_text(strip=True) for td in table.find_all("td")]
        return self._pair_addresses(cells)

    def done(self, code=0, ips=None):
        """Request the target page through a randomly chosen proxy.

        A proxy that fails with a connection error is removed from ``ips``
        (the list is mutated in place) and another one is tried, until a
        request succeeds or no proxy can be chosen.

        Args:
            code: Sequence number, used only in the progress message.
            ips: Mutable list of ``"ip:port"`` strings; ``None`` or empty
                means there is nothing to try.

        Returns:
            ``False`` when no proxy could be chosen, ``None`` after a
            successful request.

        Raises:
            SystemExit: if ``ips`` is found empty right after a connection
                error (i.e. something else emptied it in the meantime).
        """
        # NOTE: placeholder kept from the original demo - set the page to visit
        # here; with an empty URL requests.get() raises before connecting.
        link = ''
        while True:
            try:
                # Pick a random proxy.
                ip = choice(ips)
            except (IndexError, TypeError):
                # Empty list, None, or something choice() cannot index.
                return False

            proxies = {
                "http": ip,
            }
            headers_ = {
                "Accept": "*/*",
                "Accept-Encoding": "gzip, deflate, sdch",
                "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
                "Referer": "https://best.zhaopin.com/",
            }
            agent = self._pick_user_agent()
            if agent is not None:
                headers_["User-Agent"] = agent

            try:
                requests.get(link, headers=headers_, proxies=proxies, verify=False)
            except requests.exceptions.ConnectionError:
                print("Connection Error")
                if not ips:
                    print("not ip")
                    sys.exit()
                # Drop the unusable proxy; EAFP because the list may be shared
                # and another caller may already have removed it.
                try:
                    ips.remove(ip)
                except ValueError:
                    pass
                # Retry with another proxy (a loop rather than recursion, so a
                # long run of dead proxies cannot exhaust the call stack).
                continue

            date = datetime.datetime.now().strftime('%H:%M:%S')
            print(u"第%s次 [%s] [%s]: (剩余可用代理IP数:%s)" % (code, date, ip, len(ips)))
            return None
if __name__ == '__main__':
    # Script entry point: fetch a proxy list, then start one worker thread
    # every two seconds, each calling Proxy.done with the shared list.
    url = 'http://www.xicidaili.com/nn'
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
        "Referer": "http://www.xicidaili.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36",
    }
    # Pool of User-Agent strings handed to Proxy.
    user_agent = [
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0; Baiduspider-ads) Gecko/17.0 Firefox/17.0",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9b4) Gecko/2008030317 Firefox/3.0b4",
        "Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; BIDUBrowser 7.6)",
        "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64; Trident/7.0; Touch; LCJB; rv:11.0) like Gecko",
    ]

    proxy = Proxy(url, headers, user_agent)
    ips = []

    total_runs = 500       # number of worker threads to start
    refresh_every = 1000   # reload the proxy list when i is a multiple of this
    pause_seconds = 2      # time.sleep() takes seconds (fractions are allowed)

    # Python 3 merged xrange() and range() into a single range().
    for i in range(total_runs):
        # With 500 iterations and a period of 1000 only i == 0 qualifies,
        # so the proxy list is fetched exactly once, before the first thread.
        if not i % refresh_every:
            ips += proxy.getIpList()
        # One thread per iteration; every thread shares the same `ips` list.
        threading.Thread(target=proxy.done, args=(i, ips)).start()
        time.sleep(pause_seconds)