nodejs实践录:使用pm2实现服务的“平滑更新”

本文介绍如何使用pm2实现业务程序的“平滑更新”。

背景

正常运行的程序难免会遇到升级的问题。如何让客户对升级无感,是一个大问题。本文不涉及CI/CD方面的AB测试、灰度升级等,只是利用pm2的特性来实现“平滑更新”的目的,文末给出结论。

代码

本文的测试程序分服务端和客户端,测试时,先让服务端运行,然后让客户端一直连接服务器,接着使用不同的pm2配置更新服务器,观察客户端的连接状态。

服务端代码(注意:下面贴出的代码与后文的客户端代码完全相同,应为误贴;真正的服务端是pm2配置中的server.js,即监听4000端口的HTTP服务,其源码未在本文中给出):

/*
用于测试http服务器连接性,每0.5秒发送GET请求,如连接不上,返回错误,打印err
*/

var request = require('request');

const log = require('../lib/log.js');

/**
 * Send one HTTP GET request to `url` via the `request` library and log the
 * outcome.
 *
 * `requestData` is passed as the `body` option together with `json: true`.
 * A response without error and with status 200 is logged through `log.print`;
 * every other outcome is logged as 'err' through `log.err`.
 *
 * @param {string} url - Address to request.
 * @param {*} requestData - Value handed to the library as the request body.
 */
function httpGet(url, requestData) {
    const requestHeaders = {
        'content-type': 'application/json',
        'User-Agent': 'git-oschina-hook',
        'X-Gitee-Token': 'webhook password',
        'X-Gitee-Event': 'Push Hook',
    };

    const options = {
        url,
        method: 'GET',
        json: true,
        headers: requestHeaders,
        body: requestData,
    };

    const onResponse = (err, res, payload) => {
        // Failure path first: transport error or any non-200 status.
        if (err || res.statusCode != 200) {
            log.err('err');
            return;
        }
        // Success
        log.print('=============GET content: \n', payload);
    };

    request(options, onResponse);
}

/**
 * Build the fixed test payload and send one GET request to the local test
 * server at http://127.0.0.1:4000/ through httpGet().
 */
function getHttp() {
    const url = 'http://127.0.0.1:4000/';
    const msg = {
        devid: '123456798',
        station: '01',
        // Must be an assignment: a `==` comparison here would be evaluated and
        // discarded, leaving stateCode missing from the payload.
        stateCode: '501',
    };
    httpGet(url, msg);
}

// Handle returned by setInterval for the polling loop; null until the loop is
// first started.
let g_loopId = null;

/**
 * Start the polling loop: schedule loopFunc to run every 500 ms and remember
 * the timer handle in g_loopId.
 *
 * If a handle is already stored, that timer is cancelled first. Otherwise the
 * old handle would be overwritten and the earlier interval could never be
 * stopped.
 */
function startLoop()
{
    if (g_loopId !== null) {
        clearInterval(g_loopId);
    }
    g_loopId = setInterval(loopFunc, 500);
}

/**
 * Stop the polling loop by cancelling the interval timer stored in g_loopId.
 *
 * The handle is reset to null afterwards so g_loopId does not keep pointing
 * at a cancelled timer. When no handle is stored, nothing is done.
 */
function stopLoop()
{
    if (g_loopId === null) {
        return;
    }
    clearInterval(g_loopId);
    g_loopId = null;
}

/**
 * Timer callback for the polling loop: performs one round by issuing a single
 * request through getHttp(). Returns nothing.
 */
function loopFunc() {
    getHttp();
}

/**
 * Script entry point: logs a start-up message, then starts the polling loop.
 */
function main() {
    log.print('start looping...');
    startLoop();
}

// Run as soon as the script is loaded.
main();

客户端代码:

/*
用于测试http服务器连接性,每0.5秒发送GET请求,如连接不上,返回错误,打印err
*/

var request = require('request');

const log = require('../lib/log.js');

/**
 * Send one HTTP GET request to `url` via the `request` library and log the
 * outcome.
 *
 * `requestData` is passed as the `body` option together with `json: true`.
 * A response without error and with status 200 is logged through `log.print`;
 * every other outcome is logged as 'err' through `log.err`.
 *
 * @param {string} url - Address to request.
 * @param {*} requestData - Value handed to the library as the request body.
 */
function httpGet(url, requestData) {
    const requestHeaders = {
        'content-type': 'application/json',
        'User-Agent': 'git-oschina-hook',
        'X-Gitee-Token': 'webhook password',
        'X-Gitee-Event': 'Push Hook',
    };

    const options = {
        url,
        method: 'GET',
        json: true,
        headers: requestHeaders,
        body: requestData,
    };

    const onResponse = (err, res, payload) => {
        // Failure path first: transport error or any non-200 status.
        if (err || res.statusCode != 200) {
            log.err('err');
            return;
        }
        // Success
        log.print('=============GET content: \n', payload);
    };

    request(options, onResponse);
}

/**
 * Build the fixed test payload and send one GET request to the local test
 * server at http://127.0.0.1:4000/ through httpGet().
 */
function getHttp() {
    const url = 'http://127.0.0.1:4000/';
    const msg = {
        devid: '123456798',
        station: '01',
        // Must be an assignment: a `==` comparison here would be evaluated and
        // discarded, leaving stateCode missing from the payload.
        stateCode: '501',
    };
    httpGet(url, msg);
}

// Handle returned by setInterval for the polling loop; null until the loop is
// first started.
let g_loopId = null;

/**
 * Start the polling loop: schedule loopFunc to run every 500 ms and remember
 * the timer handle in g_loopId.
 *
 * If a handle is already stored, that timer is cancelled first. Otherwise the
 * old handle would be overwritten and the earlier interval could never be
 * stopped.
 */
function startLoop()
{
    if (g_loopId !== null) {
        clearInterval(g_loopId);
    }
    g_loopId = setInterval(loopFunc, 500);
}

/**
 * Stop the polling loop by cancelling the interval timer stored in g_loopId.
 *
 * The handle is reset to null afterwards so g_loopId does not keep pointing
 * at a cancelled timer. When no handle is stored, nothing is done.
 */
function stopLoop()
{
    if (g_loopId === null) {
        return;
    }
    clearInterval(g_loopId);
    g_loopId = null;
}

/**
 * Timer callback for the polling loop: performs one round by issuing a single
 * request through getHttp(). Returns nothing.
 */
function loopFunc() {
    getHttp();
}

/**
 * Script entry point: logs a start-up message, then starts the polling loop.
 */
function main() {
    log.print('start looping...');
    startLoop();
}

// Run as soon as the script is loaded.
main();

ecosystem.config.js配置文件:

module.exports = {
  // APP是一个数组,可以有多个
  // 参数参考:https://pm2.io/doc/en/runtime/reference/ecosystem-file/
  apps : [
  {
    name: 'app',
    script: 'server.js',
    args: 'null',
    instances: 2,
    exec_mode: "cluster", // 集群模式,如不指定,默认为fork
    autorestart: false,
    min_uptime: "60s",
    max_restarts: 3,
    watch: false,
    //error_file: "./logs/app-err.log",
    //out_file: "./logs/app-out.log",
    log: "./logs/app.log",
    //log_date_format: "YYYY-MM-DD HH:mm Z", // pm2 log添加日期
    max_memory_restart: '1G',
    //listen_timeout: 3000,
    kill_timeout: 3000,
    // wait_ready: true,
    env:
    {
      NODE_ENV: 'development'
    },
    env_production:
    {
      NODE_ENV: 'production'
    }
  }
  ],

  // 部署,暂未使用,不用理
  deploy : {
    production : {
      user : 'node',
      host : '212.83.163.1',
      ref  : 'origin/master',
      repo : '[email protected]:repo.git',
      path : '/var/www/production',
      'post-deploy' : 'npm install && pm2 reload ecosystem.config.js --env production'
    }
  }
};

测试

使用pm2 start启动服务。此时使用客户端连接,正常连接。此处不表。

使用pm2 reload all重新启动。此时,输出如下日志:

// 1、先启动2个进程,并提示在线
PM2      | App [app:0] starting in -cluster mode-
PM2      | App [app:1] starting in -cluster mode-
PM2      | App [app:1] online
PM2      | App [app:0] online
1|app    | [2019-01-06 15:46:44.628]  httpserver is listening on 4000
0|app    | [2019-01-06 15:46:44.633]  httpserver is listening on 4000
// 2、再停止旧进程
PM2      | -reload- New worker listening
PM2      | Stopping app:app id:_old_1
PM2      | -reload- New worker listening
PM2      | Stopping app:app id:_old_0
// 3、旧进程退出
PM2      | App name:app id:_old_1 disconnected
PM2      | App [app:_old_1] exited with code [1] via signal [SIGINT]
PM2      | App name:app id:_old_0 disconnected
PM2      | App [app:_old_0] exited with code [1] via signal [SIGINT]
PM2      | pid=16952 msg=process killed
PM2      | pid=4504 msg=process killed

// 4、新的进程开始提供服务

下面看看pm2 restart all的结果,输出日志如下:

PM2         | App name:app id:0 disconnected
PM2         | App [app:0] exited with code [1] via signal [SIGINT]
PM2         | App name:app id:1 disconnected
PM2         | App [app:1] exited with code [1] via signal [SIGINT]
PM2         | pid=15528 msg=process killed
PM2         | App [app:0] starting in -cluster mode-
PM2         | pid=11808 msg=process killed
PM2         | App [app:1] starting in -cluster mode-
PM2         | App [app:1] online
PM2         | (node:10372) [DEP0007] DeprecationWarning: worker.suicide is deprecated. Please use worker.exitedAfterDisconnect.
PM2         | App [app:0] online
PM2         | (node:10372) [DEP0007] DeprecationWarning: worker.suicide is deprecated. Please use worker.exitedAfterDisconnect.
1|app       | [2019-01-06 16:04:55.163]  httpserver is listening on 4000
0|app       | [2019-01-06 16:04:55.192]  httpserver is listening on 4000

由于pm2日志没有打印时间(可以配置,但本文没有做),所以无法准确知道重启所花时间,但根据笔者目测,是在3~5秒之间,这个时间值仅限于本例,通过其它实践知道pm2重启速度还是比较快的。

看看这个时候客户端的信息:

[2019-01-06 16:04:50.151]  =============GET content:
 here is my voice: Hello World! now is:
Sun Jan 06 2019 16:04:50 GMT+0800 (中国标准时间)  cnt: 52

[2019-01-06 16:04:51.650]  err
[2019-01-06 16:04:52.154]  err
[2019-01-06 16:04:52.654]  err
[2019-01-06 16:04:53.160]  err
[2019-01-06 16:04:53.657]  err
[2019-01-06 16:04:54.159]  err
[2019-01-06 16:04:54.857]  err
[2019-01-06 16:04:55.241]  =============GET content:
 here is my voice: Hello World! now is:
Sun Jan 06 2019 16:04:55 GMT+0800 (中国标准时间)  cnt: 1

[2019-01-06 16:04:55.677]  =============GET content:
 here is my voice: Hello World! now is:
Sun Jan 06 2019 16:04:55 GMT+0800 (中国标准时间)  cnt: 2

可以看到,在pm2 restart all的过程中,大约有5秒的时间客户端一直打印“err”,即连接不上服务器。

小结

结论:从上面的日志可以看到,集群模式下pm2 reload会先启动新进程,待新进程开始监听后再停止旧进程,因此可以实现“平滑更新”;而pm2 restart是先停掉旧进程再启动新进程,期间服务不可用。对于socket而言,使用短连接的,可以实现“平滑”,因为每次都使用新的socket去连接,比如http请求。但是,如果是长连接,始终存在socket切换的时刻,因为毕竟进程是重新启动了的。这时,socket客户端需要重新连接。所以本方案有一定局限性(但也是没办法的,进程重启,是不能保持socket连接的)。
笔者接触过的socket客户端,都有重连的机制。在实际工作中,不妨考虑本文的方案。

李迟 2019.2.5 周二

你可能感兴趣的:(nodejs)