system_server crash现象研究

现象:当system server进程crash时,发现zygote进程会被杀掉,此后Zyogote进程和system server被重新启动。


分析:在init解析init.rc时,Zygote进程作为一个服务被定义,且被声明为自动重启。因此一旦Zygote进程退出,则init会收到子进程退出信号从而重新启动zygote服务,进而Zygote启动System Server。同样,在System server被Zygote作为子进程启动后,Zygote通过信号监听该子进程状态,一旦退出Zygote将会杀死自身等待init再次运行。另外system server进程将监听service manager进程,如service manager退出则杀掉自身从而导致zygote被重启。


下面为相关代码:


Zygote启动system server入口:

libcore/dalvik/src/main/java/dalvik/system/Zygote.java

    /**
     * Special method to start the system server process.
     * @deprecated use {@link Zygote#forkSystemServer(int, int, int[], int, int[][])}
     */
    @Deprecated
    public static int forkSystemServer(int uid, int gid, int[] gids,
            boolean enableDebugger, int[][] rlimits) {
        int debugFlags = enableDebugger ? DEBUG_ENABLE_DEBUGGER : 0;
        return forkAndSpecialize(uid, gid, gids, debugFlags, rlimits);
    }

forkAndSpecialize是一个JNI函数,其定义见Dalvik_dalvik_system_Zygote_fork(),在其中注册信号处理函数,在有子进程退出时将检查进程pid,仅当中止的子进程pid为system server时才杀掉本进程(zygote进程)。

dalvik_system_Zygote.c

/* native public static int fork(); */
static void Dalvik_dalvik_system_Zygote_fork(const u4* args, JValue* pResult)
{
    pid_t pid;

    if (!gDvm.zygote) {
        dvmThrowException("Ljava/lang/IllegalStateException;",
            "VM instance not started with -Xzygote");

        RETURN_VOID();
    }

    if (!dvmGcPreZygoteFork()) {
        LOGE("pre-fork heap failed\n");
        dvmAbort();
    }

    setSignalHandler();   //这里注册信号处理,以监测子进程状态

    dvmDumpLoaderStats("zygote");
    pid = fork();

#ifdef HAVE_ANDROID_OS
    if (pid == 0) {
        /* child process */
        extern int gMallocLeakZygoteChild;
        gMallocLeakZygoteChild = 1;
    }
#endif

    RETURN_INT(pid);
}

/*
 * configure sigchld handler for the zygote process
 * This is configured very late, because earlier in the dalvik lifecycle
 * we can fork() and exec() for the verifier/optimizer, and we
 * want to waitpid() for those rather than have them be harvested immediately.
 *
 * This ends up being called repeatedly before each fork(), but there's
 * no real harm in that.
 */
static void setSignalHandler()
{
    int err;
    struct sigaction sa;

    memset(&sa, 0, sizeof(sa));

    sa.sa_handler = sigchldHandler;          //信号处理函数地址 

    err = sigaction (SIGCHLD, &sa, NULL);    //设置子进程中止时的信号处理函数

    if (err < 0) {
        LOGW("Error setting SIGCHLD handler: %s", strerror(errno));
    }
}

/*
 * This signal handler is for zygote mode, since the zygote
 * must reap its children
 */
static void sigchldHandler(int s)
{
    pid_t pid;
    int status;

    while ((pid = waitpid(-1, &status, WNOHANG)) > 0) { //得到中止的子进程pid
        /* Log process-death status that we care about.  In general it is not
           safe to call LOG(...) from a signal handler because of possible
           reentrancy.  However, we know a priori that the current implementation
           of LOG() is safe to call from a SIGCHLD handler in the zygote process.
           If the LOG() implementation changes its locking strategy or its use
           of syscalls within the lazy-init critical section, its use here may
           become unsafe. */
        if (WIFEXITED(status)) {
            if (WEXITSTATUS(status)) {
                LOG(LOG_DEBUG, ZYGOTE_LOG_TAG, "Process %d exited cleanly (%d)\n",
                    (int) pid, WEXITSTATUS(status));
            } else {
                IF_LOGV(/*should use ZYGOTE_LOG_TAG*/) {
                    LOG(LOG_VERBOSE, ZYGOTE_LOG_TAG,
                        "Process %d exited cleanly (%d)\n",
                        (int) pid, WEXITSTATUS(status));
                }
            }
        } else if (WIFSIGNALED(status)) {
            if (WTERMSIG(status) != SIGKILL) {
                LOG(LOG_DEBUG, ZYGOTE_LOG_TAG,
                    "Process %d terminated by signal (%d)\n",
                    (int) pid, WTERMSIG(status));
            } else {
                IF_LOGV(/*should use ZYGOTE_LOG_TAG*/) {
                    LOG(LOG_VERBOSE, ZYGOTE_LOG_TAG,
                        "Process %d terminated by signal (%d)\n",
                        (int) pid, WTERMSIG(status));
                }
            }
        }

        /*
         * If the just-crashed process is the system_server, bring down zygote
         * so that it is restarted by init and system server will be restarted
         * from there.
         */
        if (pid == gDvm.systemServerPid) { //仅当中止的子进程为system server时才杀掉本进程(zygote进程)
            LOG(LOG_INFO, ZYGOTE_LOG_TAG,
                "Exit zygote because system server (%d) has terminated\n",
                (int) pid);
            kill(getpid(), SIGKILL); //杀掉Zygote进程,将导致system server被init重启
        }
    }

    if (pid < 0) {
        LOG(LOG_WARN, ZYGOTE_LOG_TAG,
            "Zygote SIGCHLD error in waitpid: %s\n",strerror(errno));
    }
}


在Zygote被杀掉后,即init.rc中下面的service被杀掉:

service zygote /system/bin/app_process -Xzygote /system/bin --zygote --start-system-server   //启动SystemServer  
    class zygote_services  
    socket zygote stream 666  
    onrestart write /sys/android_power/request_state wake  
    onrestart write /sys/power/state on  
    onrestart restart media  
    onrestart restart netd  

init进程启动后将进入无限循环以监听init.rc中启动的service状态,如发现有service退出则会重新启动该service。以下为init进程监听子进程的代码:


system/core/init/init.c

int main(int argc, char **argv)
{
    int fd_count = 0;
    struct pollfd ufds[4];
    char *tmpdev;
    char* debuggable;
    char tmp[32];
    int property_set_fd_init = 0;
    int signal_fd_init = 0;
    int keychord_fd_init = 0;
    struct rlimit rlim;
    struct rlimit rlim_new;

    if (!strcmp(basename(argv[0]), "ueventd"))
        return ueventd_main(argc, argv);

    /* clear the umask */
    umask(0);

        /* Get the basic filesystem setup we need put
         * together in the initramdisk on / and then we'll
         * let the rc file figure out the rest.
         */
    mkdir("/dev", 0755);
    mkdir("/proc", 0755);
    mkdir("/sys", 0755);

    mount("tmpfs", "/dev", "tmpfs", 0, "mode=0755");
    mkdir("/dev/pts", 0755);
    mkdir("/dev/socket", 0755);
    mount("devpts", "/dev/pts", "devpts", 0, NULL);
    mount("proc", "/proc", "proc", 0, NULL);
    mount("sysfs", "/sys", "sysfs", 0, NULL);

        /* We must have some place other than / to create the
         * device nodes for kmsg and null, otherwise we won't
         * be able to remount / read-only later on.
         * Now that tmpfs is mounted on /dev, we can actually
         * talk to the outside world.
         */
    open_devnull_stdio();
    log_init();
    
    init_parse_config_file("/init.rc"); //解析文件 /init.rc

    /* pull the kernel commandline and ramdisk properties file in */
    import_kernel_cmdline(0);

    get_hardware_name(hardware, &revision);
    snprintf(tmp, sizeof(tmp), "/init.%s.rc", hardware); //解析文件 /init.%hardware%.rc,如:init.goldfish.rc,应该是放硬件相关的内容
    init_parse_config_file(tmp);

    action_for_each_trigger("early-init", action_add_queue_tail); //action列表中名为early-init的,将此action放在列表尾

    queue_builtin_action(wait_for_coldboot_done_action, "wait_for_coldboot_done");
    queue_builtin_action(property_init_action, "property_init");
    queue_builtin_action(keychord_init_action, "keychord_init");
    queue_builtin_action(console_init_action, "console_init");
    queue_builtin_action(set_init_properties_action, "set_init_properties");

    if (getrlimit(RLIMIT_CORE, &rlim)==0) {
            rlim_new.rlim_cur = rlim_new.rlim_max = RLIM_INFINITY;
            if (setrlimit(RLIMIT_CORE, &rlim_new)!=0) {
                 /* failed. try raising just to the old max */
                 rlim_new.rlim_cur = rlim_new.rlim_max =  rlim.rlim_max;
                 (void) setrlimit(RLIMIT_CORE, &rlim_new);
             }
    }
        /* execute all the boot actions to get us started */
    action_for_each_trigger("init", action_add_queue_tail);
    action_for_each_trigger("early-fs", action_add_queue_tail);
    action_for_each_trigger("fs", action_add_queue_tail);
    action_for_each_trigger("post-fs", action_add_queue_tail);

    queue_builtin_action(property_service_init_action, "property_service_init");
    queue_builtin_action(signal_init_action, "signal_init");
    queue_builtin_action(check_startup_action, "check_startup");

    /* execute all the boot actions to get us started */
    action_for_each_trigger("early-boot", action_add_queue_tail);
    action_for_each_trigger("boot", action_add_queue_tail);

    queue_all_device_triggers();
    execute_one_command();
    device_triggers_enabled = 1;

        /* run all property triggers based on current state of the properties */
    queue_builtin_action(queue_property_triggers_action, "queue_propety_triggers");


#if BOOTCHART
    queue_builtin_action(bootchart_init_action, "bootchart_init");
#endif

    for(;;) { //无限循环
        int nr, i, timeout = -1;

        execute_one_command();
        restart_processes();    //检查有无service需要重新启动

        if (!property_set_fd_init && get_property_set_fd() > 0) {
            ufds[fd_count].fd = get_property_set_fd();
            ufds[fd_count].events = POLLIN;
            ufds[fd_count].revents = 0;
            fd_count++;
            property_set_fd_init = 1;
        }
        if (!signal_fd_init && get_signal_fd() > 0) {
            ufds[fd_count].fd = get_signal_fd();
            ufds[fd_count].events = POLLIN;
            ufds[fd_count].revents = 0;
            fd_count++;
            signal_fd_init = 1;
        }
        if (!keychord_fd_init && get_keychord_fd() > 0) {
            ufds[fd_count].fd = get_keychord_fd();
            ufds[fd_count].events = POLLIN;
            ufds[fd_count].revents = 0;
            fd_count++;
            keychord_fd_init = 1;
        }

        if (process_needs_restart) {
            timeout = (process_needs_restart - gettime()) * 1000;
            if (timeout < 0)
                timeout = 0;
        }

        if (!action_queue_empty() || cur_action)
            timeout = 0;

#if BOOTCHART
        if (bootchart_count > 0) {
            if (timeout < 0 || timeout > BOOTCHART_POLLING_MS)
                timeout = BOOTCHART_POLLING_MS;
            if (bootchart_step() < 0 || --bootchart_count == 0) {
                bootchart_finish();
                bootchart_count = 0;
            }
        }
#endif

        nr = poll(ufds, fd_count, timeout);
        if (nr <= 0)
            continue;

        for (i = 0; i < fd_count; i++) {
            if (ufds[i].revents == POLLIN) {
                if (ufds[i].fd == get_property_set_fd())
                    handle_property_set_fd();
                else if (ufds[i].fd == get_keychord_fd())
                    handle_keychord();
                else if (ufds[i].fd == get_signal_fd())
                    handle_signal();               //检查中止的子进程
            }
        }
    }

    return 0;
}

子进程退出处理函数

system/core/init/signal_handler.c

void handle_signal(void)
{
    char tmp[32];

    /* we got a SIGCHLD - reap and restart as needed */
    read(signal_recv_fd, tmp, sizeof(tmp));      //为什么读32字节?
    while (!wait_for_one_process(0)) //处理所有中断的子进程
        ;
}

下面函数得到中止的进程pid并查到对应的Service,然后执行启动该服务前需执行的命令,并置服务标志位的SVC_RESTARTING。在init主函数循环中将根据该标志位启动服务。

static int wait_for_one_process(int block) //block为0
{
    pid_t pid;
    int status;
    struct service *svc;
    struct socketinfo *si;
    time_t now;
    struct listnode *node;
    struct command *cmd;

    while ( (pid = waitpid(-1, &status, block ? 0 : WNOHANG)) == -1 && errno == EINTR ); //得到中止的进程pid
    if (pid <= 0) return -1;      //无效pid,no action

    svc = service_find_by_pid(pid); //查找pid对应service
    if (!svc) {
        ERROR("untracked pid %d exited\n", pid);
        return 0;
    }

    if (!(svc->flags & SVC_ONESHOT)) { //此Service仅需运行一次
        kill(-pid, SIGKILL);
        NOTICE("process '%s' killing any children in process group\n", svc->name);
    }

    /* remove any sockets we may have created */
    for (si = svc->sockets; si; si = si->next) { //关闭service中所有socket
        char tmp[128];
        snprintf(tmp, sizeof(tmp), ANDROID_SOCKET_DIR"/%s", si->name);
        unlink(tmp);
    }

    svc->pid = 0;
    svc->flags &= (~SVC_RUNNING);

        /* oneshot processes go into the disabled state on exit */
    if (svc->flags & SVC_ONESHOT) {
        svc->flags |= SVC_DISABLED;
    }

        /* disabled processes do not get restarted automatically */
    if (svc->flags & SVC_DISABLED) {
        notify_service_state(svc->name, "stopped");
        return 0;
    }

    now = gettime();
    if (svc->flags & SVC_CRITICAL) { //检查关键服务状态
        if (svc->time_crashed + CRITICAL_CRASH_WINDOW >= now) {
            if (++svc->nr_crashed > CRITICAL_CRASH_THRESHOLD) {
                ERROR("critical process '%s' exited %d times in %d minutes; "
                      "rebooting into recovery mode\n", svc->name,
                      CRITICAL_CRASH_THRESHOLD, CRITICAL_CRASH_WINDOW / 60);
                sync();
                __reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2,
                         LINUX_REBOOT_CMD_RESTART2, "recovery");
                return 0;
            }
        } else {
            svc->time_crashed = now;
            svc->nr_crashed = 1;  //为何设为1??
        }
    }

    svc->flags |= SVC_RESTARTING;          //置位,以便下次运行restart_processes时启动该服务

    /* Execute all onrestart commands for this service. */
    list_for_each(node, &svc->onrestart.commands) {     //运行服务启动前应执行的命令
        cmd = node_to_item(node, struct command, clist);
        cmd->func(cmd->nargs, cmd->args);
    }
    notify_service_state(svc->name, "restarting");  //更新此服务状态属性值为restarting
    return 0;
}


system/core/init/init.c

static void restart_processes()
{
    process_needs_restart = 0;
    service_for_each_flags(SVC_RESTARTING,
                           restart_service_if_needed);
}

void service_for_each_flags(unsigned matchflags,
                            void (*func)(struct service *svc))
{
    struct listnode *node;
    struct service *svc;
    list_for_each(node, &service_list) {
        svc = node_to_item(node, struct service, slist);
        if (svc->flags & matchflags) {      //如果某个service的标志位SVC_RESTARTING置位
            func(svc);             //执行函数restart_service_if_needed以启动service
        }
    }
} 

启动service svc

static void restart_service_if_needed(struct service *svc)
{
    time_t next_start_time = svc->time_started + 5;     //service上次启动的时间增加5秒

    if (next_start_time <= gettime()) { //如果Service上次启动时间距今大于5秒
        svc->flags &= (~SVC_RESTARTING);
        service_start(svc, NULL);         //重新启动该service
        return;
    }...
}



另外,system server进程会监听service manager进程状态。一旦service manager进程退出,system server进程会自动退出:

system_init.cpp

class GrimReaper : public IBinder::DeathRecipient {
public: 
    GrimReaper() { }

    virtual void binderDied(const wp<IBinder>& who)
    {
        kill(getpid(), SIGKILL);     //杀掉自身进程
    }
};

} // namespace android

extern "C" status_t system_init()
{...    
    sp<IServiceManager> sm = defaultServiceManager();    
    
    sp<GrimReaper> grim = new GrimReaper();
    sm->asBinder()->linkToDeath(grim, grim.get(), 0);  //监听ServiceManager binder对象


你可能感兴趣的:(server,service,System,action,Crash,Signal)