mysql-proxy源码中,使用了一种进程保活的方法。这种方法的基本原理是:当父进程完成基本的初始化后,创建子进程,由子进程继续后面的主体逻辑。而父进程wait子进程的退出状态。一旦发现子进程是由于收到信号而退出的,则重启子进程。
这种方法的实现代码如下:
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/resource.h>
#include <sys/wait.h>
/**
 * Signal handler used by the supervising parent: pass the received signal on
 * to every process in our own process group.
 *
 * errno is saved on entry and restored on exit, so that a failure of one of
 * the calls made here cannot change the errno value observed by the code
 * that was interrupted by the signal.
 *
 * @param sig number of the signal that was delivered
 */
static void signal_forward(int sig)
{
    const int saved_errno = errno;

    signal(sig, SIG_IGN); /* we don't want to create a loop here */
    kill(0, sig);         /* pid 0 addresses our whole process group, ourselves included */

    errno = saved_errno;
}
/**
 * Keep the real work alive by running it in a supervised child process.
 *
 * The calling process becomes the supervisor: it forks a child, installs
 * signal_forward() for SIGINT, SIGTERM, SIGHUP, SIGUSR1 and SIGUSR2, and then
 * waits for the child to terminate.
 *
 *  - the child exits normally       -> the supervisor returns 1
 *  - the child is killed by a signal -> the supervisor sleeps for one minute
 *                                       and forks a replacement child
 *
 * No signal handlers are installed for the child here; the child has to set
 * up its own.
 *
 * @return  0 in the child process (the caller carries on with the real work),
 *          1 in the supervisor after the child exited normally,
 *         -1 in the supervisor when fork() or the wait call failed.
 */
int chassis_unix_proc_keepalive(void)
{
    enum { RESTART_DELAY_SECS = 60 };

    int nprocs = 0;
    pid_t child_pid = -1;

    /* we ignore SIGINT and SIGTERM and just let it be forwarded to the child instead
     * as we want to collect its PID before we shutdown too
     *
     * the child will have to set its own signal handlers for this
     */
    for (;;) {
        /* try to start the children */
        while (nprocs < 1) {
            pid_t pid;

            /* flush first: otherwise the child gets a copy of our pending
             * stdio buffer and prints the supervisor's messages a second time */
            fflush(stdout);

            pid = fork();
            if (pid == 0) {
                /* child */
                printf("we are the child: %d\n", (int)getpid());
                return 0;
            } else if (pid < 0) {
                /* fork() failed */
                const int fork_errno = errno;

                printf("fork() failed: %s[%d]\n", strerror(fork_errno), fork_errno);
                return -1;
            } else {
                /* we are the angel, let's see what the child did */
                printf("[father]: we try to keep PID=%d alive\n", (int)pid);

                /* forward a few signals that are sent to us to the child instead */
                signal(SIGINT, signal_forward);
                signal(SIGTERM, signal_forward);
                signal(SIGHUP, signal_forward);
                signal(SIGUSR1, signal_forward);
                signal(SIGUSR2, signal_forward);

                child_pid = pid;
                nprocs++;
            }
        }

        if (child_pid != -1) {
            struct rusage rusage;
            int exit_status;
            int wait_errno;
            pid_t exit_pid;

            printf("[father]: waiting for %d\n", (int)child_pid);

            /* make sure everything is zero'ed out, whichever wait call is used */
            memset(&rusage, 0, sizeof(rusage));
#ifdef HAVE_WAIT4
            exit_pid = wait4(child_pid, &exit_status, 0, &rusage);
#else
            exit_pid = waitpid(child_pid, &exit_status, 0);
#endif
            /* capture errno right away: the printf() below may overwrite it */
            wait_errno = errno;

            printf("[father]: %d returned: %d\n", (int)child_pid, (int)exit_pid);

            if (exit_pid == child_pid) {
                /* our child returned, let's see how it went */
                if (WIFEXITED(exit_status)) {
                    printf("[father]: PID=%d exited normally with exit-code = %d (it used %ld kBytes max)\n",
                           (int)child_pid,
                           WEXITSTATUS(exit_status),
                           rusage.ru_maxrss / 1024);
                    return 1;
                } else if (WIFSIGNALED(exit_status)) {
                    unsigned int remaining = RESTART_DELAY_SECS;

                    /* our child died on a signal
                     *
                     * log it and restart */
                    printf("[father]: PID=%d died on signal=%d (it used %ld kBytes max) ... waiting 1min before restart\n",
                           (int)child_pid,
                           WTERMSIG(exit_status),
                           rusage.ru_maxrss / 1024);

                    /* back to the default actions while we pause; the
                     * forwarders are installed again after the next fork() */
                    signal(SIGINT, SIG_DFL);
                    signal(SIGTERM, SIG_DFL);
                    signal(SIGHUP, SIG_DFL);

                    /* to make sure we don't loop as fast as we can, sleep a
                     * bit between restarts; sleep() returns the unslept
                     * seconds when it is interrupted, so keep going */
                    while (remaining > 0) {
                        remaining = sleep(remaining);
                    }

                    nprocs--;
                    child_pid = -1;
                } else if (WIFSTOPPED(exit_status)) {
                    /* nothing to do, the next loop iteration waits again */
                } else {
                    printf("[father]: should not reached\n");
                }
            } else if (-1 == exit_pid) {
                /* EINTR is ok, all others bad */
                if (EINTR != wait_errno) {
                    /* how can this happen ? */
                    printf("[father]: wait4(%d, ...) failed: %s[%d]\n",
                           (int)child_pid,
                           strerror(wait_errno),
                           wait_errno);
                    return -1;
                }
            } else {
                printf("[father]: should not reached\n");
            }
        }
    }
}
/*
 * Demo entry point.
 *
 * chassis_unix_proc_keepalive() returns a positive value or a negative value
 * in the supervising process and 0 in the forked child. The supervisor
 * terminates right here; only the child reaches the work loop below.
 */
int main()
{
    const int role = chassis_unix_proc_keepalive();

    if (role < 0) {
        /* supervisor: keep-alive setup or wait failed */
        exit(-1);
    }
    if (role > 0) {
        /* supervisor: nothing left to supervise */
        exit(0);
    }

    /* role == 0: we are the child, go on with the actual work */
    while (1) {
        printf("hello, world\n");
        sleep(10);
    }
}
这里的主体逻辑,就是每隔10秒打印一次“hello, world”。程序运行结果如下:
[father]: we try to keep PID=1824 alive
[father]: waiting for 1824
we are the child: 1824
hello, world
hello, world
...
(向子进程发送SIGKILL信号)
[father]: 1824 returned: 1824
[father]: PID=1824 died on signal=9 (it used 0 kBytes max) ... waiting 1min before restart
[father]: we try to keep PID=1853 alive
[father]: waiting for 1853
we are the child: 1853
hello, world
hello, world
hello, world
...
(向父进程发送SIGINT信号)
[father]: 1853 returned: 1853
[father]: PID=1853 died on signal=2 (it used 0 kBytes max) ... waiting 1min before restart
[father]: we try to keep PID=1870 alive
[father]: waiting for 1870
we are the child: 1870
hello, world
hello, world
hello, world
hello, world
...