Symptom: when the system server process crashes, the zygote process is killed as well, after which both zygote and system server are restarted.
Analysis: when init parses init.rc, zygote is defined as a service and declared to restart automatically. So once the zygote process exits, init receives the child-exit signal and restarts the zygote service, and zygote in turn starts the system server. Likewise, after zygote forks the system server as a child process, it watches that child via SIGCHLD; if the system server exits, zygote kills itself and waits for init to launch it again. In addition, the system server process watches the service manager process: if service manager exits, system server kills itself, which again causes zygote to be restarted.
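Stripped of the Dalvik specifics, the zygote side of this chain is just a SIGCHLD handler plus a pid comparison. Below is a minimal, self-contained C sketch of that pattern (illustrative only, not AOSP code; important_child stands in for the system server pid that the real zygote keeps in gDvm.systemServerPid). Any supervisor above the parent, init in Android's case, would then notice the parent's death and restart it.

/* Sketch: a parent that kills itself when one specific child terminates. */
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

static pid_t important_child;   /* plays the role of gDvm.systemServerPid */

static void on_sigchld(int sig)
{
    pid_t pid;
    int status;
    (void)sig;

    /* Reap every terminated child; WNOHANG keeps the handler from blocking. */
    while ((pid = waitpid(-1, &status, WNOHANG)) > 0) {
        if (pid == important_child) {
            /* Mirror the zygote's behaviour: bring down the whole process. */
            kill(getpid(), SIGKILL);
        }
    }
}

int main(void)
{
    struct sigaction sa;
    memset(&sa, 0, sizeof(sa));
    sa.sa_handler = on_sigchld;
    sigaction(SIGCHLD, &sa, NULL);

    important_child = fork();
    if (important_child == 0) {
        /* the "system_server": crash after two seconds */
        sleep(2);
        abort();
    }

    for (;;)
        pause();   /* the parent idles until the handler kills it */
}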
The relevant AOSP code follows.
The entry point through which zygote starts the system server:
libcore/dalvik/src/main/java/dalvik/system/Zygote.java
/**
 * Special method to start the system server process.
 * @deprecated use {@link Zygote#forkSystemServer(int, int, int[], int, int[][])}
 */
@Deprecated
public static int forkSystemServer(int uid, int gid, int[] gids,
        boolean enableDebugger, int[][] rlimits) {
    int debugFlags = enableDebugger ? DEBUG_ENABLE_DEBUGGER : 0;
    return forkAndSpecialize(uid, gid, gids, debugFlags, rlimits);
}
forkAndSpecialize is a JNI method implemented in dalvik_system_Zygote.c; the plain native fork path from the same file, Dalvik_dalvik_system_Zygote_fork(), is quoted below. Before forking, it registers a SIGCHLD handler. When a child exits, the handler checks the child's pid and kills the current process (zygote) only if the terminated child is the system server, whose pid the zygote recorded in gDvm.systemServerPid when forking it.
dalvik_system_Zygote.c
/* native public static int fork(); */
static void Dalvik_dalvik_system_Zygote_fork(const u4* args, JValue* pResult)
{
    pid_t pid;

    if (!gDvm.zygote) {
        dvmThrowException("Ljava/lang/IllegalStateException;",
            "VM instance not started with -Xzygote");
        RETURN_VOID();
    }

    if (!dvmGcPreZygoteFork()) {
        LOGE("pre-fork heap failed\n");
        dvmAbort();
    }

    setSignalHandler();  // register the SIGCHLD handler that watches child processes

    dvmDumpLoaderStats("zygote");
    pid = fork();

#ifdef HAVE_ANDROID_OS
    if (pid == 0) {
        /* child process */
        extern int gMallocLeakZygoteChild;
        gMallocLeakZygoteChild = 1;
    }
#endif

    RETURN_INT(pid);
}
/*
 * configure sigchld handler for the zygote process
 * This is configured very late, because earlier in the dalvik lifecycle
 * we can fork() and exec() for the verifier/optimizer, and we
 * want to waitpid() for those rather than have them be harvested immediately.
 *
 * This ends up being called repeatedly before each fork(), but there's
 * no real harm in that.
 */
static void setSignalHandler()
{
    int err;
    struct sigaction sa;

    memset(&sa, 0, sizeof(sa));

    sa.sa_handler = sigchldHandler;       // address of the handler function
    err = sigaction(SIGCHLD, &sa, NULL);  // install the handler that runs when a child terminates

    if (err < 0) {
        LOGW("Error setting SIGCHLD handler: %s", strerror(errno));
    }
}
/*
 * This signal handler is for zygote mode, since the zygote
 * must reap its children
 */
static void sigchldHandler(int s)
{
    pid_t pid;
    int status;

    while ((pid = waitpid(-1, &status, WNOHANG)) > 0) {  // pid of a terminated child
        /* Log process-death status that we care about. In general it is not
           safe to call LOG(...) from a signal handler because of possible
           reentrancy. However, we know a priori that the current implementation
           of LOG() is safe to call from a SIGCHLD handler in the zygote process.
           If the LOG() implementation changes its locking strategy or its use
           of syscalls within the lazy-init critical section, its use here may
           become unsafe. */
        if (WIFEXITED(status)) {
            if (WEXITSTATUS(status)) {
                LOG(LOG_DEBUG, ZYGOTE_LOG_TAG, "Process %d exited cleanly (%d)\n",
                    (int) pid, WEXITSTATUS(status));
            } else {
                IF_LOGV(/*should use ZYGOTE_LOG_TAG*/) {
                    LOG(LOG_VERBOSE, ZYGOTE_LOG_TAG,
                        "Process %d exited cleanly (%d)\n",
                        (int) pid, WEXITSTATUS(status));
                }
            }
        } else if (WIFSIGNALED(status)) {
            if (WTERMSIG(status) != SIGKILL) {
                LOG(LOG_DEBUG, ZYGOTE_LOG_TAG,
                    "Process %d terminated by signal (%d)\n",
                    (int) pid, WTERMSIG(status));
            } else {
                IF_LOGV(/*should use ZYGOTE_LOG_TAG*/) {
                    LOG(LOG_VERBOSE, ZYGOTE_LOG_TAG,
                        "Process %d terminated by signal (%d)\n",
                        (int) pid, WTERMSIG(status));
                }
            }
        }

        /*
         * If the just-crashed process is the system_server, bring down zygote
         * so that it is restarted by init and system server will be restarted
         * from there.
         */
        if (pid == gDvm.systemServerPid) {  // only react if the terminated child is the system server
            LOG(LOG_INFO, ZYGOTE_LOG_TAG,
                "Exit zygote because system server (%d) has terminated\n",
                (int) pid);
            kill(getpid(), SIGKILL);  // kill zygote itself; init then restarts zygote, which restarts system server
        }
    }

    if (pid < 0) {
        LOG(LOG_WARN, ZYGOTE_LOG_TAG,
            "Zygote SIGCHLD error in waitpid: %s\n", strerror(errno));
    }
}
After zygote is killed, i.e. after the following service defined in init.rc dies:
# this service also starts the system server (--start-system-server)
service zygote /system/bin/app_process -Xzygote /system/bin --zygote --start-system-server
    class zygote_services
    socket zygote stream 666
    onrestart write /sys/android_power/request_state wake
    onrestart write /sys/power/state on
    onrestart restart media
    onrestart restart netd
After init starts, it enters an infinite loop that watches the services launched from init.rc; whenever a service exits, init restarts it. The code where init monitors its child processes:
system/core/init/init.c
int main(int argc, char **argv)
{
    int fd_count = 0;
    struct pollfd ufds[4];
    char *tmpdev;
    char* debuggable;
    char tmp[32];
    int property_set_fd_init = 0;
    int signal_fd_init = 0;
    int keychord_fd_init = 0;
    struct rlimit rlim;
    struct rlimit rlim_new;

    if (!strcmp(basename(argv[0]), "ueventd"))
        return ueventd_main(argc, argv);

    /* clear the umask */
    umask(0);

    /* Get the basic filesystem setup we need put
     * together in the initramdisk on / and then we'll
     * let the rc file figure out the rest.
     */
    mkdir("/dev", 0755);
    mkdir("/proc", 0755);
    mkdir("/sys", 0755);

    mount("tmpfs", "/dev", "tmpfs", 0, "mode=0755");
    mkdir("/dev/pts", 0755);
    mkdir("/dev/socket", 0755);
    mount("devpts", "/dev/pts", "devpts", 0, NULL);
    mount("proc", "/proc", "proc", 0, NULL);
    mount("sysfs", "/sys", "sysfs", 0, NULL);

    /* We must have some place other than / to create the
     * device nodes for kmsg and null, otherwise we won't
     * be able to remount / read-only later on.
     * Now that tmpfs is mounted on /dev, we can actually
     * talk to the outside world.
     */
    open_devnull_stdio();
    log_init();

    init_parse_config_file("/init.rc");  // parse /init.rc

    /* pull the kernel commandline and ramdisk properties file in */
    import_kernel_cmdline(0);

    get_hardware_name(hardware, &revision);
    snprintf(tmp, sizeof(tmp), "/init.%s.rc", hardware);  // parse /init.<hardware>.rc, e.g. init.goldfish.rc, which holds the hardware-specific entries
    init_parse_config_file(tmp);

    action_for_each_trigger("early-init", action_add_queue_tail);  // append every action triggered by "early-init" to the action queue

    queue_builtin_action(wait_for_coldboot_done_action, "wait_for_coldboot_done");
    queue_builtin_action(property_init_action, "property_init");
    queue_builtin_action(keychord_init_action, "keychord_init");
    queue_builtin_action(console_init_action, "console_init");
    queue_builtin_action(set_init_properties_action, "set_init_properties");

    if (getrlimit(RLIMIT_CORE, &rlim) == 0) {
        rlim_new.rlim_cur = rlim_new.rlim_max = RLIM_INFINITY;
        if (setrlimit(RLIMIT_CORE, &rlim_new) != 0) {
            /* failed. try raising just to the old max */
            rlim_new.rlim_cur = rlim_new.rlim_max = rlim.rlim_max;
            (void) setrlimit(RLIMIT_CORE, &rlim_new);
        }
    }

    /* execute all the boot actions to get us started */
    action_for_each_trigger("init", action_add_queue_tail);
    action_for_each_trigger("early-fs", action_add_queue_tail);
    action_for_each_trigger("fs", action_add_queue_tail);
    action_for_each_trigger("post-fs", action_add_queue_tail);

    queue_builtin_action(property_service_init_action, "property_service_init");
    queue_builtin_action(signal_init_action, "signal_init");
    queue_builtin_action(check_startup_action, "check_startup");

    /* execute all the boot actions to get us started */
    action_for_each_trigger("early-boot", action_add_queue_tail);
    action_for_each_trigger("boot", action_add_queue_tail);

    queue_all_device_triggers();
    execute_one_command();
    device_triggers_enabled = 1;

    /* run all property triggers based on current state of the properties */
    queue_builtin_action(queue_property_triggers_action, "queue_propety_triggers");

#if BOOTCHART
    queue_builtin_action(bootchart_init_action, "bootchart_init");
#endif

    for (;;) {  // main loop: runs forever
        int nr, i, timeout = -1;

        execute_one_command();
        restart_processes();  // restart any service marked SVC_RESTARTING

        if (!property_set_fd_init && get_property_set_fd() > 0) {
            ufds[fd_count].fd = get_property_set_fd();
            ufds[fd_count].events = POLLIN;
            ufds[fd_count].revents = 0;
            fd_count++;
            property_set_fd_init = 1;
        }
        if (!signal_fd_init && get_signal_fd() > 0) {
            ufds[fd_count].fd = get_signal_fd();
            ufds[fd_count].events = POLLIN;
            ufds[fd_count].revents = 0;
            fd_count++;
            signal_fd_init = 1;
        }
        if (!keychord_fd_init && get_keychord_fd() > 0) {
            ufds[fd_count].fd = get_keychord_fd();
            ufds[fd_count].events = POLLIN;
            ufds[fd_count].revents = 0;
            fd_count++;
            keychord_fd_init = 1;
        }

        if (process_needs_restart) {
            timeout = (process_needs_restart - gettime()) * 1000;
            if (timeout < 0)
                timeout = 0;
        }

        if (!action_queue_empty() || cur_action)
            timeout = 0;

#if BOOTCHART
        if (bootchart_count > 0) {
            if (timeout < 0 || timeout > BOOTCHART_POLLING_MS)
                timeout = BOOTCHART_POLLING_MS;
            if (bootchart_step() < 0 || --bootchart_count == 0) {
                bootchart_finish();
                bootchart_count = 0;
            }
        }
#endif

        nr = poll(ufds, fd_count, timeout);
        if (nr <= 0)
            continue;

        for (i = 0; i < fd_count; i++) {
            if (ufds[i].revents == POLLIN) {
                if (ufds[i].fd == get_property_set_fd())
                    handle_property_set_fd();
                else if (ufds[i].fd == get_keychord_fd())
                    handle_keychord();
                else if (ufds[i].fd == get_signal_fd())
                    handle_signal();  // reap terminated children and mark their services for restart
            }
        }
    }

    return 0;
}
The handler for exited child processes:
system/core/init/signal_handler.c
void handle_signal(void)
{
    char tmp[32];

    /* we got a SIGCHLD - reap and restart as needed */
    read(signal_recv_fd, tmp, sizeof(tmp));  // drain the wake-up bytes written by the SIGCHLD handler; the data itself is never used

    while (!wait_for_one_process(0))  // reap every terminated child
        ;
}
The function below obtains the pid of the terminated process, looks up the corresponding service, executes that service's onrestart commands, and sets the service's SVC_RESTARTING flag; the main loop of init then restarts the service based on that flag.
static int wait_for_one_process(int block)  // called here with block == 0
{
    pid_t pid;
    int status;
    struct service *svc;
    struct socketinfo *si;
    time_t now;
    struct listnode *node;
    struct command *cmd;

    while ((pid = waitpid(-1, &status, block ? 0 : WNOHANG)) == -1 && errno == EINTR);  // pid of a terminated child
    if (pid <= 0) return -1;  // no child to reap, nothing to do

    svc = service_find_by_pid(pid);  // find the service that owns this pid
    if (!svc) {
        ERROR("untracked pid %d exited\n", pid);
        return 0;
    }

    if (!(svc->flags & SVC_ONESHOT)) {  // for services that are not oneshot, kill the whole process group
        kill(-pid, SIGKILL);
        NOTICE("process '%s' killing any children in process group\n", svc->name);
    }

    /* remove any sockets we may have created */
    for (si = svc->sockets; si; si = si->next) {  // remove all sockets created for this service
        char tmp[128];
        snprintf(tmp, sizeof(tmp), ANDROID_SOCKET_DIR"/%s", si->name);
        unlink(tmp);
    }

    svc->pid = 0;
    svc->flags &= (~SVC_RUNNING);

    /* oneshot processes go into the disabled state on exit */
    if (svc->flags & SVC_ONESHOT) {
        svc->flags |= SVC_DISABLED;
    }

    /* disabled processes do not get restarted automatically */
    if (svc->flags & SVC_DISABLED) {
        notify_service_state(svc->name, "stopped");
        return 0;
    }

    now = gettime();
    if (svc->flags & SVC_CRITICAL) {  // services marked "critical" in init.rc
        if (svc->time_crashed + CRITICAL_CRASH_WINDOW >= now) {
            if (++svc->nr_crashed > CRITICAL_CRASH_THRESHOLD) {
                ERROR("critical process '%s' exited %d times in %d minutes; "
                      "rebooting into recovery mode\n", svc->name,
                      CRITICAL_CRASH_THRESHOLD, CRITICAL_CRASH_WINDOW / 60);
                sync();
                __reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2,
                         LINUX_REBOOT_CMD_RESTART2, "recovery");
                return 0;
            }
        } else {
            svc->time_crashed = now;
            svc->nr_crashed = 1;  // the previous crash window has expired: start a new window with this crash as the first one
        }
    }

    svc->flags |= SVC_RESTARTING;  // mark the service so the next restart_processes() call restarts it

    /* Execute all onrestart commands for this service. */
    list_for_each(node, &svc->onrestart.commands) {  // run the commands declared with "onrestart" for this service
        cmd = node_to_item(node, struct command, clist);
        cmd->func(cmd->nargs, cmd->args);
    }

    notify_service_state(svc->name, "restarting");  // publish the service state property as "restarting"
    return 0;
}
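For services marked critical in init.rc, the code above keeps a sliding crash window: the first crash (or a crash after the window has expired) starts a new window with the counter at 1, later crashes inside the window increment the counter, and crossing the threshold reboots the device into recovery. A standalone sketch of that accounting, with illustrative values for the window and threshold rather than the constants from init's source:

/* Sketch of the crash-window logic used for "critical" services. */
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define CRASH_WINDOW_SECS  (4 * 60)  /* illustrative values, */
#define CRASH_THRESHOLD    4         /* not taken from init  */

struct crash_state {
    time_t window_start;  /* plays the role of svc->time_crashed */
    int    count;         /* plays the role of svc->nr_crashed   */
};

/* Record one crash at `now`; return true when the service has crashed too
 * often inside one window and init would fall back to recovery. */
bool record_crash(struct crash_state *cs, time_t now)
{
    if (cs->count > 0 && cs->window_start + CRASH_WINDOW_SECS >= now) {
        if (++cs->count > CRASH_THRESHOLD)   /* still inside the window */
            return true;
    } else {
        cs->window_start = now;              /* window expired (or first crash): */
        cs->count = 1;                       /* start a new window at count 1    */
    }
    return false;
}

int main(void)
{
    struct crash_state cs = { 0, 0 };
    time_t t = time(NULL);

    /* Five crashes a few seconds apart: the fifth one crosses the threshold. */
    for (int i = 0; i < 5; i++)
        printf("crash %d -> reboot to recovery? %d\n", i + 1,
               record_crash(&cs, t + i * 10));
    return 0;
}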
system/core/init/init.c
static void restart_processes()
{
    process_needs_restart = 0;
    service_for_each_flags(SVC_RESTARTING,
                           restart_service_if_needed);
}

void service_for_each_flags(unsigned matchflags,
                            void (*func)(struct service *svc))
{
    struct listnode *node;
    struct service *svc;

    list_for_each(node, &service_list) {
        svc = node_to_item(node, struct service, slist);
        if (svc->flags & matchflags) {  // if the service has SVC_RESTARTING set
            func(svc);                  // call restart_service_if_needed() for it
        }
    }
}
Restarting a given service:
static void restart_service_if_needed(struct service *svc)
{
    time_t next_start_time = svc->time_started + 5;  // earliest allowed restart: 5 seconds after the last start

    if (next_start_time <= gettime()) {  // at least 5 seconds have passed since the service last started
        svc->flags &= (~SVC_RESTARTING);
        service_start(svc, NULL);  // restart the service
        return;
    }
    ...
}
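The elided branch above presumably records the earliest pending start time in process_needs_restart (that variable is reset at the top of restart_processes() and read in main()); the main loop then turns it into a poll() timeout so init wakes up right when the 5-second throttle expires. A compilable sketch of that timeout computation, mirroring the lines already shown in main():

/* Sketch: converting the restart deadline (seconds) into a poll() timeout (ms). */
#include <time.h>

static time_t process_needs_restart;  /* 0 means no restart is pending */

int compute_poll_timeout(time_t now)
{
    int timeout = -1;                 /* block indefinitely by default */

    if (process_needs_restart) {
        timeout = (int)((process_needs_restart - now) * 1000);
        if (timeout < 0)
            timeout = 0;              /* the deadline has already passed */
    }
    return timeout;
}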
In addition, the system server process watches the state of the service manager process. As soon as service manager exits, system server kills itself:
system_init.cpp
class GrimReaper : public IBinder::DeathRecipient {
public:
    GrimReaper() { }

    virtual void binderDied(const wp<IBinder>& who)
    {
        kill(getpid(), SIGKILL);  // kill this process (system_server) itself
    }
};

} // namespace android

extern "C" status_t system_init()
{
    ...
    sp<IServiceManager> sm = defaultServiceManager();
    sp<GrimReaper> grim = new GrimReaper();
    sm->asBinder()->linkToDeath(grim, grim.get(), 0);  // request a death notification for the ServiceManager binder object