1、需求
需求如图:爬虫里的代理地址写死,更换代理不由爬虫处理,而是由代理层自动更换。
2 方案选型
- 1、之前有个Python版本的正向代理。忘了叫啥了。因为项目建立和写文章不是同时期,所以找不到了。内存占用什么的特别大,速度也不是很快。所以选择了方案2
- 2、openresty:通过nginx做正向代理。lua脚本更换代理,代理由redis读出来。
3、实践
安装openresty
# Install yum-utils, which provides the yum-config-manager command used below
yum install yum-utils
# Register the OpenResty package repository for CentOS
yum-config-manager --add-repo https://openresty.org/package/centos/openresty.repo
# Install OpenResty from the newly added repository
yum install openresty
或者下面这种装法(任选其一)
# Download the OpenResty repository definition for CentOS
wget https://openresty.org/package/centos/openresty.repo
# Move it into yum's repository directory so yum picks it up
sudo mv openresty.repo /etc/yum.repos.d/
# Refresh package metadata (also lists available updates)
sudo yum check-update
# Install OpenResty, answering yes to all prompts
sudo yum install -y openresty
mac安装:
# Install OpenResty on macOS from the openresty/brew Homebrew tap
brew install openresty/brew/openresty
编写配置文件
新建nginx_redis.conf
redis单机版
# OpenResty stream (TCP) forward proxy, single-Redis version.
# Each connection accepted on :3889 is relayed to an upstream proxy host whose
# address is taken from the Redis list "vps" during the preread phase.
worker_processes 16;  # number of nginx worker processes
error_log /data/logs/openresty/error.log;  # error log path

events {
    worker_connections 1024;
}

stream {
    ## Log format for the TCP proxy
    log_format tcp_proxy '$remote_addr [$time_local] '
                         '$protocol $status $bytes_sent $bytes_received '
                         '$session_time "$upstream_addr" '
                         '"$upstream_bytes_sent" "$upstream_bytes_received" "$upstream_connect_time"';

    ## Access log for the TCP proxy
    access_log /data/logs/openresty/tcp-access.log tcp_proxy;
    open_log_file_cache off;

    ## TCP proxy upstream
    upstream backend {
        # Placeholder address: the balancer below selects the real peer.
        server 127.0.0.2:1101;

        balancer_by_lua_block {
            local balancer = require "ngx.balancer"

            -- Port the upstream proxy machines listen on.
            -- (Inside a Lua block comments must use "--"; a "#" comment here
            -- is a Lua syntax error.)
            local port = 3888

            -- Host stored by preread_by_lua_block for this session. Fall back
            -- to the static default when the Redis lookup produced nothing,
            -- instead of handing nil to set_current_peer.
            local host = ngx.ctx.proxy_host or "127.0.0.3"

            local ok, err = balancer.set_current_peer(host, port)
            if not ok then
                ngx.log(ngx.ERR, "failed to set the peer: ", err)
            end
        }
    }

    server {
        preread_by_lua_block {
            local redis = require("resty.redis")

            -- Create the client and set its timeout (milliseconds).
            local red = redis:new()
            red:set_timeout(3000)

            -- Connect, and stop early if the connection itself failed so the
            -- error is reported against the right step.
            local rhost = "10.8.181.1"
            local rport = 6379
            local ok, err = red:connect(rhost, rport)
            if not ok then
                ngx.log(ngx.ERR, "connect to redis error : ", err)
                return
            end

            local selected, select_err = red:select(15)
            if not selected then
                ngx.log(ngx.ERR, "select redis db error : ", select_err)
                return red:close()
            end

            -- RPOPLPUSH with the same source and destination pops the tail of
            -- "vps" and pushes it back on the head, so successive sessions
            -- receive successive entries of the list.
            local res, pop_err = red:rpoplpush("vps", "vps")
            if not res then
                ngx.log(ngx.ERR, "res num error : ", pop_err)
                return red:close()
            end

            -- An empty or missing list yields ngx.null, which is truthy in
            -- Lua and must not be stored as a host.
            if res == ngx.null then
                ngx.log(ngx.ERR, "redis list vps is empty: no proxy host available")
                return red:close()
            end

            ngx.ctx.proxy_host = res
            red:close()
        }

        # Local port that the crawler has hard-coded as its proxy.
        # Listen address and port; use the keepalived VIP when keepalived is in use.
        listen 0.0.0.0:3889;
        proxy_connect_timeout 3s;
        proxy_timeout 10s;
        #set_by_lua_file $backend set.lua;
        #proxy_pass $backend;  # address of the remote end
        proxy_pass backend;  # address of the remote end (the upstream above)
    }
}
redis集群版
# OpenResty stream (TCP) forward proxy, Redis-cluster version.
# Each connection accepted on :3889 is relayed to an upstream proxy host whose
# address is taken from the Redis list "vps" during the preread phase.
worker_processes 16;  # number of nginx worker processes
error_log /data/logs/openresty/error-pa-redis.log;  # error log path

events {
    worker_connections 1024;
}

stream {
    ## Log format for the TCP proxy
    log_format tcp_proxy '$remote_addr [$time_local] '
                         '$protocol $status $bytes_sent $bytes_received '
                         '$session_time "$upstream_addr" '
                         '"$upstream_bytes_sent" "$upstream_bytes_received" "$upstream_connect_time"';

    ## Access log for the TCP proxy
    access_log /data/logs/openresty/tcp-access-pa-redis.log tcp_proxy;
    open_log_file_cache off;

    ## TCP proxy upstream
    upstream backend {
        # Placeholder address: the balancer below selects the real peer.
        server 127.0.0.2:11201;

        balancer_by_lua_block {
            local balancer = require "ngx.balancer"

            -- Port the upstream proxy machines listen on.
            local port = 3888

            -- Host stored by preread_by_lua_block for this session. Fall back
            -- to the static default when the Redis lookup produced nothing,
            -- instead of handing nil to set_current_peer.
            local host = ngx.ctx.proxy_host or "127.0.0.3"

            local ok, err = balancer.set_current_peer(host, port)
            if not ok then
                ngx.log(ngx.ERR, "failed to set the peer: ", err)
            end
        }
    }

    # Shared memory zone declared for the rediscluster module.
    lua_shared_dict redis_cluster_slot_locks 100k;

    server {
        preread_by_lua_block {
            local config = {
                name = "mengmugai",                 --rediscluster name
                serv_list = {                       --redis cluster node list(host and port),
                    { ip = "10.8.181.1", port = 16379 },
                    { ip = "10.8.181.2", port = 16379 },
                    { ip = "10.8.181.3", port = 16379 },
                    { ip = "10.8.181.4", port = 16379 }
                },
                keepalive_timeout = 60000,          --redis connection pool idle timeout
                keepalive_cons = 1000,              --redis connection pool size
                connect_timeout = 1000,             --timeout while connecting
                read_timeout = 1000,                --timeout while reading
                send_timeout = 1000,                --timeout while sending
                max_redirection = 5,                --maximum retry attempts for redirection,
                max_connection_attempts = 1,        --maximum retry attempts for connection
                auth = "renzhengmima"               --set password while setting auth
            }

            local redis_cluster = require "rediscluster"
            local red_c = redis_cluster:new(config)

            -- RPOPLPUSH with the same source and destination pops the tail of
            -- "vps" and pushes it back on the head, so successive sessions
            -- receive successive entries of the list.
            local res, err = red_c:rpoplpush("vps", "vps")
            if err then
                ngx.log(ngx.ERR, "pa redis err: ", err)
            elseif not res or res == ngx.null then
                -- An empty or missing list yields ngx.null; concatenating it
                -- or using it as a host would raise a Lua error.
                ngx.log(ngx.ERR, "redis list vps is empty: no proxy host available")
            else
                -- A successful lookup is not an error: log at INFO. Set the
                -- error_log level to info to see these lines.
                ngx.log(ngx.INFO, "redis data = ", res, ":3888")
                ngx.ctx.proxy_host = res
            end

            -- NOTE(review): kept from the original; confirm that the
            -- rediscluster client actually exposes close().
            red_c:close()
        }

        # Listen address and port; use the keepalived VIP when keepalived is in use.
        listen 0.0.0.0:3889;
        proxy_connect_timeout 3s;
        proxy_timeout 10s;
        #set_by_lua_file $backend set.lua;
        #proxy_pass $backend;  # address of the remote end
        proxy_pass backend;  # address of the remote end (the upstream above)
    }
}
运行
如果是单机版的话
直接/usr/local/openresty/nginx/sbin/nginx -c /data/openresty-proxy/conf/nginx_redis.conf
就行了。具体文件路径 还有代码里的日志路径自己去抉择
如果是集群版的话
连接redis集群需要用到 lua-resty-redis-cluster模块
github地址https://github.com/cuiweixie/lua-resty-redis-cluster
下载之后,需要用2个文件rediscluster.lua和redis_slot.c 都在lib里面
复制包中的 redis_slot.c 和 rediscluster.lua 到 openresty 安装目录的 lualib 下
.c文件无法在Nginx配置文件中引入,需要编译成.so文件,编译命令
# Install gcc, the C++ compiler and the kernel development files
yum -y install gcc gcc-c++ kernel-devel
# Run this before compiling when using the Lua that ships with CentOS; not needed if you installed Lua yourself
yum install lua-devel
# Compile redis_slot.c into a position-independent shared library
gcc redis_slot.c -fPIC -shared -o libredis_slot.so
#查看结果
直接/usr/local/openresty/nginx/sbin/nginx -c /data/openresty-proxy/conf/nginx_redis.conf
就行了
最后
爬虫里把代理地址写成 ip:3889,试一下就行了。文章可能因为脱敏有点改乱了
参考:
https://blog.csdn.net/qq_22494169/article/details/109357667
https://blog.csdn.net/zyt425916200/article/details/78113547
https://github.com/openresty/lua-resty-redis#connect