现象:当system server进程crash时,发现zygote进程会被杀掉,此后Zygote进程和system server被重新启动。
分析:在init解析init.rc时,Zygote进程作为一个服务被定义,且被声明为自动重启。因此一旦Zygote进程退出,init会收到子进程退出信号,从而重新启动zygote服务,进而由Zygote启动System Server。同样,在System Server被Zygote作为子进程启动后,Zygote通过信号监听该子进程的状态,一旦该子进程退出,Zygote将会杀死自身,等待init将其再次启动。另外,system server进程会监听service manager进程,如果service manager退出,system server会杀掉自身,从而导致zygote被重启。
下面为相关代码:
Zygote启动system server入口:
libcore/dalvik/src/main/java/dalvik/system/Zygote.java
/**
 * Special method to start the system server process.
 *
 * Converts the boolean debugger switch into a debug-flags value
 * ({@code DEBUG_ENABLE_DEBUGGER} when enabled, {@code 0} otherwise) and
 * forwards every other argument unchanged to {@code forkAndSpecialize}.
 *
 * @param uid forwarded unchanged to {@code forkAndSpecialize}
 * @param gid forwarded unchanged to {@code forkAndSpecialize}
 * @param gids forwarded unchanged to {@code forkAndSpecialize}
 * @param enableDebugger whether {@code DEBUG_ENABLE_DEBUGGER} is passed as the debug flags
 * @param rlimits forwarded unchanged to {@code forkAndSpecialize}
 * @return the value returned by {@code forkAndSpecialize}
 * @deprecated use {@link Zygote#forkSystemServer(int, int, int[], int, int[][])}
 */
@Deprecated
public static int forkSystemServer(int uid, int gid, int[] gids,
        boolean enableDebugger, int[][] rlimits) {
    int debugFlags = 0;
    if (enableDebugger) {
        debugFlags = DEBUG_ENABLE_DEBUGGER;
    }
    return forkAndSpecialize(uid, gid, gids, debugFlags, rlimits);
}
forkAndSpecialize是一个JNI函数,其定义见Dalvik_dalvik_system_Zygote_fork(),在其中注册信号处理函数,在有子进程退出时将检查进程pid,仅当中止的子进程pid为system server时才杀掉本进程(zygote进程)。
dalvik_system_Zygote.c
/* native public static int fork(); */ static void Dalvik_dalvik_system_Zygote_fork(const u4* args, JValue* pResult) { pid_t pid; if (!gDvm.zygote) { dvmThrowException("Ljava/lang/IllegalStateException;", "VM instance not started with -Xzygote"); RETURN_VOID(); } if (!dvmGcPreZygoteFork()) { LOGE("pre-fork heap failed\n"); dvmAbort(); } setSignalHandler(); //这里注册信号处理,以监测子进程状态 dvmDumpLoaderStats("zygote"); pid = fork(); #ifdef HAVE_ANDROID_OS if (pid == 0) { /* child process */ extern int gMallocLeakZygoteChild; gMallocLeakZygoteChild = 1; } #endif RETURN_INT(pid); } /* * configure sigchld handler for the zygote process * This is configured very late, because earlier in the dalvik lifecycle * we can fork() and exec() for the verifier/optimizer, and we * want to waitpid() for those rather than have them be harvested immediately. * * This ends up being called repeatedly before each fork(), but there's * no real harm in that. */ static void setSignalHandler() { int err; struct sigaction sa; memset(&sa, 0, sizeof(sa)); sa.sa_handler = sigchldHandler; //信号处理函数地址 err = sigaction (SIGCHLD, &sa, NULL); //设置子进程中止时的信号处理函数 if (err < 0) { LOGW("Error setting SIGCHLD handler: %s", strerror(errno)); } } /* * This signal handler is for zygote mode, since the zygote * must reap its children */ static void sigchldHandler(int s) { pid_t pid; int status; while ((pid = waitpid(-1, &status, WNOHANG)) > 0) { //得到中止的子进程pid /* Log process-death status that we care about. In general it is not safe to call LOG(...) from a signal handler because of possible reentrancy. However, we know a priori that the current implementation of LOG() is safe to call from a SIGCHLD handler in the zygote process. If the LOG() implementation changes its locking strategy or its use of syscalls within the lazy-init critical section, its use here may become unsafe. 
*/ if (WIFEXITED(status)) { if (WEXITSTATUS(status)) { LOG(LOG_DEBUG, ZYGOTE_LOG_TAG, "Process %d exited cleanly (%d)\n", (int) pid, WEXITSTATUS(status)); } else { IF_LOGV(/*should use ZYGOTE_LOG_TAG*/) { LOG(LOG_VERBOSE, ZYGOTE_LOG_TAG, "Process %d exited cleanly (%d)\n", (int) pid, WEXITSTATUS(status)); } } } else if (WIFSIGNALED(status)) { if (WTERMSIG(status) != SIGKILL) { LOG(LOG_DEBUG, ZYGOTE_LOG_TAG, "Process %d terminated by signal (%d)\n", (int) pid, WTERMSIG(status)); } else { IF_LOGV(/*should use ZYGOTE_LOG_TAG*/) { LOG(LOG_VERBOSE, ZYGOTE_LOG_TAG, "Process %d terminated by signal (%d)\n", (int) pid, WTERMSIG(status)); } } } /* * If the just-crashed process is the system_server, bring down zygote * so that it is restarted by init and system server will be restarted * from there. */ if (pid == gDvm.systemServerPid) { //仅当中止的子进程为system server时才杀掉本进程(zygote进程) LOG(LOG_INFO, ZYGOTE_LOG_TAG, "Exit zygote because system server (%d) has terminated\n", (int) pid); kill(getpid(), SIGKILL); //杀掉Zygote进程,将导致system server被init重启 } } if (pid < 0) { LOG(LOG_WARN, ZYGOTE_LOG_TAG, "Zygote SIGCHLD error in waitpid: %s\n",strerror(errno)); } }
在Zygote被杀掉后,即init.rc中下面的service被杀掉:
# zygote service definition; the --start-system-server argument starts SystemServer.
# The onrestart lines are the commands run when this service is restarted.
service zygote /system/bin/app_process -Xzygote /system/bin --zygote --start-system-server
    class zygote_services
    socket zygote stream 666
    onrestart write /sys/android_power/request_state wake
    onrestart write /sys/power/state on
    onrestart restart media
    onrestart restart netd
init进程启动后将进入无限循环以监听init.rc中启动的service状态,如发现有service退出则会重新启动该service。以下为init进程监听子进程的代码:
system/core/init/init.c
/*
 * Entry point of init.
 *
 * When invoked under the name "ueventd" it hands over to ueventd_main().
 * Otherwise it sets up /dev, /proc and /sys, parses /init.rc and the
 * hardware-specific rc file, queues the boot-time actions, and then loops
 * forever: run one queued command, restart services flagged for restart,
 * and poll the property, signal and keychord descriptors.
 */
int main(int argc, char **argv)
{
    int fd_count = 0;
    struct pollfd ufds[4];
    char *tmpdev;
    char* debuggable;
    char tmp[32];
    int property_set_fd_init = 0;
    int signal_fd_init = 0;
    int keychord_fd_init = 0;
    struct rlimit rlim;
    struct rlimit rlim_new;

    if (!strcmp(basename(argv[0]), "ueventd"))
        return ueventd_main(argc, argv);

    /* clear the umask */
    umask(0);

    /* Get the basic filesystem setup we need put
     * together in the initramdisk on / and then we'll
     * let the rc file figure out the rest.
     */
    mkdir("/dev", 0755);
    mkdir("/proc", 0755);
    mkdir("/sys", 0755);

    mount("tmpfs", "/dev", "tmpfs", 0, "mode=0755");
    mkdir("/dev/pts", 0755);
    mkdir("/dev/socket", 0755);
    mount("devpts", "/dev/pts", "devpts", 0, NULL);
    mount("proc", "/proc", "proc", 0, NULL);
    mount("sysfs", "/sys", "sysfs", 0, NULL);

    /* We must have some place other than / to create the
     * device nodes for kmsg and null, otherwise we won't
     * be able to remount / read-only later on.
     * Now that tmpfs is mounted on /dev, we can actually
     * talk to the outside world.
     */
    open_devnull_stdio();
    log_init();

    /* Parse the main configuration file, /init.rc. */
    init_parse_config_file("/init.rc");

    /* pull the kernel commandline and ramdisk properties file in */
    import_kernel_cmdline(0);

    get_hardware_name(hardware, &revision);
    /*
     * Parse /init.<hardware>.rc (for example init.goldfish.rc);
     * presumably this file holds the hardware-specific configuration.
     */
    snprintf(tmp, sizeof(tmp), "/init.%s.rc", hardware);
    init_parse_config_file(tmp);

    /* Append the actions named "early-init" to the tail of the action queue. */
    action_for_each_trigger("early-init", action_add_queue_tail);

    queue_builtin_action(wait_for_coldboot_done_action, "wait_for_coldboot_done");
    queue_builtin_action(property_init_action, "property_init");
    queue_builtin_action(keychord_init_action, "keychord_init");
    queue_builtin_action(console_init_action, "console_init");
    queue_builtin_action(set_init_properties_action, "set_init_properties");

    /* Try to lift the core-dump size limit. */
    if (getrlimit(RLIMIT_CORE, &rlim)==0) {
        rlim_new.rlim_cur = rlim_new.rlim_max = RLIM_INFINITY;
        if (setrlimit(RLIMIT_CORE, &rlim_new)!=0) {
            /* failed. try raising just to the old max */
            rlim_new.rlim_cur = rlim_new.rlim_max = rlim.rlim_max;
            (void) setrlimit(RLIMIT_CORE, &rlim_new);
        }
    }

    /* execute all the boot actions to get us started */
    action_for_each_trigger("init", action_add_queue_tail);
    action_for_each_trigger("early-fs", action_add_queue_tail);
    action_for_each_trigger("fs", action_add_queue_tail);
    action_for_each_trigger("post-fs", action_add_queue_tail);

    queue_builtin_action(property_service_init_action, "property_service_init");
    queue_builtin_action(signal_init_action, "signal_init");
    queue_builtin_action(check_startup_action, "check_startup");

    /* execute all the boot actions to get us started */
    action_for_each_trigger("early-boot", action_add_queue_tail);
    action_for_each_trigger("boot", action_add_queue_tail);

    queue_all_device_triggers();
    execute_one_command();
    device_triggers_enabled = 1;

    /* run all property triggers based on current state of the properties */
    queue_builtin_action(queue_property_triggers_action, "queue_propety_triggers");

#if BOOTCHART
    queue_builtin_action(bootchart_init_action, "bootchart_init");
#endif

    /* Main loop: never exits. */
    for(;;) {
        /* timeout is the poll() timeout in milliseconds; -1 blocks indefinitely. */
        int nr, i, timeout = -1;

        execute_one_command();
        /* Check whether any service needs to be restarted. */
        restart_processes();

        /* Each descriptor is added to the poll set once, as soon as it is available. */
        if (!property_set_fd_init && get_property_set_fd() > 0) {
            ufds[fd_count].fd = get_property_set_fd();
            ufds[fd_count].events = POLLIN;
            ufds[fd_count].revents = 0;
            fd_count++;
            property_set_fd_init = 1;
        }
        if (!signal_fd_init && get_signal_fd() > 0) {
            ufds[fd_count].fd = get_signal_fd();
            ufds[fd_count].events = POLLIN;
            ufds[fd_count].revents = 0;
            fd_count++;
            signal_fd_init = 1;
        }
        if (!keychord_fd_init && get_keychord_fd() > 0) {
            ufds[fd_count].fd = get_keychord_fd();
            ufds[fd_count].events = POLLIN;
            ufds[fd_count].revents = 0;
            fd_count++;
            keychord_fd_init = 1;
        }

        /* A restart is pending: wake up no later than the time it is due. */
        if (process_needs_restart) {
            timeout = (process_needs_restart - gettime()) * 1000;
            if (timeout < 0)
                timeout = 0;
        }

        /* Work is still queued: do not block in poll(). */
        if (!action_queue_empty() || cur_action)
            timeout = 0;

#if BOOTCHART
        if (bootchart_count > 0) {
            if (timeout < 0 || timeout > BOOTCHART_POLLING_MS)
                timeout = BOOTCHART_POLLING_MS;
            if (bootchart_step() < 0 || --bootchart_count == 0) {
                bootchart_finish();
                bootchart_count = 0;
            }
        }
#endif

        nr = poll(ufds, fd_count, timeout);
        if (nr <= 0)
            continue;

        for (i = 0; i < fd_count; i++) {
            if (ufds[i].revents == POLLIN) {
                if (ufds[i].fd == get_property_set_fd())
                    handle_property_set_fd();
                else if (ufds[i].fd == get_keychord_fd())
                    handle_keychord();
                else if (ufds[i].fd == get_signal_fd())
                    /* Check for child processes that have terminated. */
                    handle_signal();
            }
        }
    }

    return 0;
}
子进程退出处理函数
system/core/init/signal_handler.c
/*
 * Handles readiness of the signal descriptor: consumes the pending
 * notification bytes, then processes terminated children one at a time
 * until wait_for_one_process() returns non-zero.
 */
void handle_signal(void)
{
    char drain[32];

    /* we got a SIGCHLD - reap and restart as needed */
    /*
     * NOTE(review): the bytes read are never inspected; the 32-byte buffer
     * presumably only bounds how much is drained per call - confirm against
     * the code that writes to the other end.
     */
    read(signal_recv_fd, drain, sizeof(drain));

    /* Handle every child process that has terminated. */
    for (;;) {
        if (wait_for_one_process(0))
            break;
    }
}
下面的函数得到中止进程的pid并查到对应的Service,然后置位该服务的SVC_RESTARTING标志,并执行该服务的onrestart命令(即重启该服务前需执行的命令)。在init主函数循环中将根据该标志位重新启动服务。
/*
 * Reaps one terminated child and updates the state of the service it
 * belonged to.
 *
 * block: non-zero to wait until a child exits, zero to return immediately
 *        when none has (the caller shown in handle_signal() passes 0).
 *
 * Returns -1 when no child was reaped (waitpid() gave a pid <= 0), and 0
 * after a reaped child has been handled.
 */
static int wait_for_one_process(int block)
{
    pid_t pid;
    int status;
    struct service *svc;
    struct socketinfo *si;
    time_t now;
    struct listnode *node;
    struct command *cmd;

    /* Fetch the pid of a terminated child, retrying when interrupted by a signal. */
    while ( (pid = waitpid(-1, &status, block ? 0 : WNOHANG)) == -1 && errno == EINTR );
    /* No valid pid: nothing to do. */
    if (pid <= 0) return -1;

    /* Look up the service that owns this pid. */
    svc = service_find_by_pid(pid);
    if (!svc) {
        ERROR("untracked pid %d exited\n", pid);
        return 0;
    }

    /*
     * For services that are NOT oneshot, kill whatever is left in the
     * process group of the child that exited.
     */
    if (!(svc->flags & SVC_ONESHOT)) {
        kill(-pid, SIGKILL);
        NOTICE("process '%s' killing any children in process group\n", svc->name);
    }

    /* remove any sockets we may have created */
    for (si = svc->sockets; si; si = si->next) {
        char tmp[128];
        snprintf(tmp, sizeof(tmp), ANDROID_SOCKET_DIR"/%s", si->name);
        unlink(tmp);
    }

    svc->pid = 0;
    svc->flags &= (~SVC_RUNNING);

    /* oneshot processes go into the disabled state on exit */
    if (svc->flags & SVC_ONESHOT) {
        svc->flags |= SVC_DISABLED;
    }

    /* disabled processes do not get restarted automatically */
    if (svc->flags & SVC_DISABLED) {
        notify_service_state(svc->name, "stopped");
        return 0;
    }

    now = gettime();
    /* Crash accounting for critical services. */
    if (svc->flags & SVC_CRITICAL) {
        if (svc->time_crashed + CRITICAL_CRASH_WINDOW >= now) {
            if (++svc->nr_crashed > CRITICAL_CRASH_THRESHOLD) {
                ERROR("critical process '%s' exited %d times in %d minutes; "
                      "rebooting into recovery mode\n", svc->name,
                      CRITICAL_CRASH_THRESHOLD, CRITICAL_CRASH_WINDOW / 60);
                sync();
                __reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2,
                         LINUX_REBOOT_CMD_RESTART2, "recovery");
                return 0;
            }
        } else {
            /*
             * The previous crash window has expired: open a new one that
             * starts now. The count is 1 because the exit being handled
             * is the first crash of the new window.
             */
            svc->time_crashed = now;
            svc->nr_crashed = 1;
        }
    }

    /* Flag the service so that the next restart_processes() pass starts it again. */
    svc->flags |= SVC_RESTARTING;

    /* Execute all onrestart commands for this service. */
    list_for_each(node, &svc->onrestart.commands) {
        cmd = node_to_item(node, struct command, clist);
        cmd->func(cmd->nargs, cmd->args);
    }
    /* Report the service state as "restarting". */
    notify_service_state(svc->name, "restarting");
    return 0;
}
system/core/init/init.c
/*
 * Clears process_needs_restart, then runs restart_service_if_needed() on
 * every service that has SVC_RESTARTING set.
 */
static void restart_processes()
{
    process_needs_restart = 0;
    service_for_each_flags(SVC_RESTARTING,
                           restart_service_if_needed);
}

/*
 * Walks service_list and invokes func on each service whose flags share
 * at least one bit with matchflags.
 */
void service_for_each_flags(unsigned matchflags,
                            void (*func)(struct service *svc))
{
    struct listnode *entry;

    list_for_each(entry, &service_list) {
        struct service *candidate = node_to_item(entry, struct service, slist);

        /* Any overlap between the service flags and matchflags selects it. */
        if (candidate->flags & matchflags) {
            func(candidate);
        }
    }
}
启动service svc
static void restart_service_if_needed(struct service *svc) { time_t next_start_time = svc->time_started + 5; //service上次启动的时间增加5秒 if (next_start_time <= gettime()) { //如果Service上次启动时间距今大于5秒 svc->flags &= (~SVC_RESTARTING); service_start(svc, NULL); //重新启动该service return; }... }
另外,system server进程会监听service manager进程状态。一旦service manager进程退出,system server进程会自动退出:
system_init.cpp
class GrimReaper : public IBinder::DeathRecipient { public: GrimReaper() { } virtual void binderDied(const wp<IBinder>& who) { kill(getpid(), SIGKILL); //杀掉自身进程 } }; } // namespace android
extern "C" status_t system_init() {... sp<IServiceManager> sm = defaultServiceManager(); sp<GrimReaper> grim = new GrimReaper(); sm->asBinder()->linkToDeath(grim, grim.get(), 0); //监听ServiceManager binder对象