之前在《Android AMS启动Activity流程》一文中已经写过进程启动的流程,这里不再赘述,这一章主要跟踪杀进程的处理方式和流程。进程启动还涉及到SIGQUIT(3号信号)的处理,因为涉及trace的dump,所以单独在下一章说明。
Process.java 官方的解释Tools for managing OS processes,之前在进程启动的时候AMS就是调用它的方法start然后实现和Zygote的Socket通信,请求fork应用进程的。
这里同样提供了杀进程的方法。
这里提供了三个方法,SIGNAL_KILL 这里定义为9,就是发9信号给对应进程。
killProcess(int pid)
==》sendSignal(pid, SIGNAL_KILL);
killProcessQuiet(int pid)
=》sendSignalQuiet(pid, SIGNAL_KILL);
killProcessGroup(int uid, int pid)
sendSignal是一个native方法。之前我们讲Android启动流程的时候提到过,Zygote启动阶段会调用AndroidRuntime::startReg注册系统Framework层的JNI方法。
AndroidRuntime.cpp
// Excerpt of the JNI registration table; only the entry that registers the
// android.os.Process natives is shown (the remaining entries and the closing
// brace of the array are omitted in this excerpt).
static const RegJNIRec gRegJNI[] = {
REG_JNI(register_android_os_Process),
因此这个调用的是android_util_Process.cpp的android_os_Process_sendSignal方法
==》android_os_Process_sendSignal
/**
 * JNI backend of Process.sendSignal(): logs the request, then delivers
 * signal `sig` to process `pid` via kill().
 *
 * Requests with pid <= 0 are dropped without logging or signalling.
 * The result of kill() is not reported to the caller.
 */
void android_os_Process_sendSignal(JNIEnv* env, jobject clazz, jint pid, jint sig)
{
    // Only a strictly positive pid names a single target process.
    if (pid <= 0) {
        return;
    }

    ALOGI("Sending signal. PID: %" PRId32 " SIG: %" PRId32, pid, sig);
    kill(pid, sig);
}
上面提到的sendSignalQuiet(pid, SIGNAL_KILL);
/**
 * JNI backend of Process.sendSignalQuiet(): delivers signal `sig` to process
 * `pid` via kill() without writing anything to the log.
 *
 * Requests with pid <= 0 are dropped. The result of kill() is not reported.
 */
void android_os_Process_sendSignalQuiet(JNIEnv* env, jobject clazz, jint pid, jint sig)
{
    // Only a strictly positive pid names a single target process.
    if (pid <= 0) {
        return;
    }

    kill(pid, sig);
}
两个方法最终都是调用kill函数,区别只是sendSignalQuiet不打印日志。
接着说killProcessGroup(int uid, int pid),它也是个native方法,对应JNI的实现是
/**
 * JNI backend of Process.killProcessGroup(): asks libprocessgroup to send
 * SIGKILL to the process group identified by (uid, pid).
 *
 * @return whatever killProcessGroup(uid, pid, SIGKILL) returns.
 */
jint android_os_Process_killProcessGroup(JNIEnv* env, jobject clazz, jint uid, jint pid)
{
    const jint killResult = killProcessGroup(uid, pid, SIGKILL);
    return killResult;
}
/system/core/libprocessgroup/processgroup.cpp
这里有killProcessGroup和killProcessGroupOnce两个入口函数,区别是killProcessGroupOnce只杀一次,而killProcessGroup在进程组没有杀干净时最多再重试40次(每次重试前休眠5ms),直到指定进程组杀干净
/**
 * Sends `signal` to every process in the cgroup of (uid, initialPid),
 * retrying up to 40 times while processes remain.
 *
 * @return the result of the four-argument killProcessGroup() overload.
 */
int killProcessGroup(uid_t uid, int initialPid, int signal) {
    const int maxRetries = 40;
    return killProcessGroup(uid, initialPid, signal, maxRetries);
}
/**
 * Sends `signal` to every process in the cgroup of (uid, initialPid) exactly
 * once, with no retries.
 *
 * @return the result of the four-argument killProcessGroup() overload.
 */
int killProcessGroupOnce(uid_t uid, int initialPid, int signal) {
    const int noRetries = 0;
    return killProcessGroup(uid, initialPid, signal, noRetries);
}
下面继续分析killProcessGroup的实现函数,以及杀进程在这个文件里最终的调用函数doKillProcessGroupOnce
killProcessGroup本身并不直接发信号,真正杀进程的工作是通过调用doKillProcessGroupOnce完成的。
这里我们上文提到两个函数传入的retries不同以实现多次下发杀进程的命令以达到杀死进程组的目的。
这里实现一个循环,通过retries值作为计数上限,循环调用doKillProcessGroupOnce。
static int killProcessGroup(uid_t uid, int initialPid, int signal, int retries) {
std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
int retry = retries;
int processes;
//循环调用doKillProcessGroupOnce,直到processes小于等于0或者循环次数到达上限退出循环
while ((processes = doKillProcessGroupOnce(uid, initialPid, signal)) > 0) {
LOG(ERROR) << "Killed " << processes << " processes for processgroup " << initialPid;
if (retry > 0) {
std::this_thread::sleep_for(5ms);//线程休眠5ms
--retry;
} else {
break;
}
}
//processes 小于0
if (processes < 0) {
PLOG(ERROR) << "Error encountered killing process cgroup uid " << uid << " pid "
<< initialPid;
return -1;
}
//获取结束时间
std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
//通过结束时间和起始时间计算杀进程一共耗掉的时间
auto ms = std::chrono::duration_cast
// processes == 0说明进程已经被成功杀了
if (processes == 0) {
if (retries > 0) {
LOG(ERROR) << "Successfully killed process cgroup uid " << uid << " pid " << initialPid
<< " in " << static_cast
}
//成功杀死进程,移除进程组相关目录
return removeProcessGroup(uid, initialPid);
} else {
if (retries > 0) {
LOG(ERROR) << "Failed to kill process cgroup uid " << uid << " pid " << initialPid
<< " in " << static_cast
<< " processes remain";
}
return -1;
}
}
这里是具体执行杀进程的逻辑,最终也是调用kill函数实现
/**
 * Performs one pass over the process cgroup of (uid, initialPid) and sends
 * `signal` to everything found in it.
 *
 * Processes that lead their own process group are signalled group-wide with
 * kill(-pgid); the remaining pids are signalled individually, unless their
 * process group is already covered by a group-wide kill.
 *
 * @return the number of pids read from the cgroup in this pass (0 means the
 *         group is empty), -errno if the cgroup could not be opened, or the
 *         negative value reported by GetOneAppProcess() on a read failure.
 */
static int doKillProcessGroupOnce(uid_t uid, int initialPid, int signal) {
    ProcessGroup process_group;
    // Open the cgroup node for this uid/pid (the original note points at /acct/uid_...).
    if (!process_group.Open(uid, initialPid)) {
        PLOG(WARNING) << "Failed to open process cgroup uid " << uid << " pid " << initialPid;
        return -errno;
    }

    // Process groups to kill as a whole; always includes the initial pid.
    std::set<pid_t> pgids;
    pgids.emplace(initialPid);
    // Individual pids that are not process-group leaders.
    std::set<pid_t> pids;

    int ret;
    pid_t pid;
    int processes = 0;
    // Read the member pids one at a time from the opened cgroup node.
    while ((ret = process_group.GetOneAppProcess(&pid)) > 0 && pid >= 0) {
        processes++;
        if (pid == 0) {
            LOG(ERROR) << "Yikes, we've been told to kill pid 0!  How about we don't do that?";
            continue;
        }
        pid_t pgid = getpgid(pid);
        if (pgid == -1) PLOG(ERROR) << "getpgid(" << pid << ") failed";
        if (pgid == pid) {
            // The process leads its own group: kill the whole group later.
            pgids.emplace(pid);
        } else {
            pids.emplace(pid);
        }
    }

    // Drop every pid whose process group is about to be killed wholesale.
    // Query the group of the element being visited (*it); the loop variable
    // `pid` above only holds the last value read from the cgroup.
    for (auto it = pids.begin(); it != pids.end();) {
        pid_t pgid = getpgid(*it);
        if (pgids.count(pgid) == 1) {
            it = pids.erase(it);
        } else {
            ++it;
        }
    }

    // Signal each collected process group (negative pid == whole group).
    for (const auto pgid : pgids) {
        LOG(ERROR) << "Killing process group " << -pgid << " in uid " << uid
                   << " as part of process cgroup " << initialPid;
        if (kill(-pgid, signal) == -1) {
            PLOG(ERROR) << "kill(" << -pgid << ", " << signal << ") failed";
        }
    }

    // Signal the remaining stand-alone pids.
    for (const auto pid : pids) {
        LOG(ERROR) << "Killing pid " << pid << " in uid " << uid << " as part of process cgroup "
                   << initialPid;
        if (kill(pid, signal) == -1) {
            PLOG(ERROR) << "kill(" << pid << ", " << signal << ") failed";
        }
    }

    return ret >= 0 ? processes : ret;
}
static int removeProcessGroup(uid_t uid, int pid)
{
int ret;
char path[PROCESSGROUP_MAX_PATH_LEN] = {0};
convertUidPidToPath(path, sizeof(path), uid, pid);
ret = rmdir(path);
convertUidToPath(path, sizeof(path), uid);
rmdir(path);
return ret;
}
上一节的3个方法,最终杀进程的实现方法都是调用kill(pid, sig)方法,该方法位于用户空间的Native层,经过系统调用进入到Linux内核的sys_kill方法。
接下来,主要分析下内核态杀进程的过程。
调用kill函数正常会调用到sys_kill()方法,但是目前没有找到它在Kernel的定义。
/kernel/include/linux/syscalls.h
asmlinkage long sys_kill(int pid, int sig);
从网上了解到sys_kill()是通过宏定义SYSCALL_DEFINE2的方式实现的,后续这块的机制会仔细研究一下。
/kernel/kernel/signal.c
这个方法中2指的是传入两个参数,也就是sys_kill的pid和sig
这个方法主要是创建siginfo的结构体,并赋值相关信息,然后调用kill_something_info
/*
 * kill system call: fills in a siginfo describing the signal and its sender
 * (the calling task), then hands it to kill_something_info() together with
 * the target pid.
 */
SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
{
struct siginfo info;
/* Signal number, no errno, and SI_USER as the signal code. */
info.si_signo = sig;
info.si_errno = 0;
info.si_code = SI_USER;
/* Identify the sender: pid and uid of the current task. */
info.si_pid = task_tgid_vnr(current);
info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
return kill_something_info(sig, &info, pid);
}
kill_something_info
/*
 * Dispatches a kill request according to the value of pid:
 *   pid > 0   - signal the single target looked up via find_vpid(pid);
 *   pid == 0  - signal the caller's own process group;
 *   pid < -1  - signal the process group looked up via find_vpid(-pid);
 *   pid == -1 - signal every process except those with pid <= 1 and the
 *               caller's own thread group.
 * Returns the result of the delivery helper; for pid == -1, -ESRCH when no
 * process was signalled.
 */
static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
{
int ret;
if (pid > 0) {
// Enter the RCU read-side critical section around the pid lookup and delivery.
rcu_read_lock();
// pid > 0: send the signal to the process that pid refers to.
ret = kill_pid_info(sig, info, find_vpid(pid));
// Paired with rcu_read_lock(): leave the read-side critical section.
rcu_read_unlock();
return ret;
}
read_lock(&tasklist_lock);
if (pid != -1) {
ret = __kill_pgrp_info(sig, info,
// pid == 0: use the caller's process group; pid < -1: use the group looked up from -pid.
pid ? find_vpid(-pid) : task_pgrp(current));
} else {
int retval = 0, count = 0;
struct task_struct * p;
// pid == -1: walk all processes and signal each eligible one.
for_each_process(p) {
if (task_pid_vnr(p) > 1 &&
!same_thread_group(p, current)) {
int err = group_send_sig_info(sig, info, p);
++count;
// A permission failure on one target does not overwrite the overall result.
if (err != -EPERM)
retval = err;
}
}
ret = count ? retval : -ESRCH;
}
read_unlock(&tasklist_lock);
return ret;
}
/*
 * Sends sig to the task identified by the given struct pid.
 * Returns -ESRCH when no task is found for the pid, otherwise the result of
 * group_send_sig_info().
 */
int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
{
int error = -ESRCH;
struct task_struct *p;
for (;;) {
rcu_read_lock();
// Look up the task_struct that corresponds to this pid.
p = pid_task(pid, PIDTYPE_PID);
if (p)
error = group_send_sig_info(sig, info, p);
rcu_read_unlock();
// Loop again only when a task was found but delivery reported -ESRCH;
// NOTE(review): presumably the task went away between lookup and delivery - confirm.
if (likely(!p || error != -ESRCH))
return error;
}
}
/*
 * Checks that the caller may send sig to task p and, if so, delivers it via
 * do_send_sig_info(). A sig of 0 performs the permission check only.
 * Returns the permission-check error, or the delivery result.
 */
int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
{
int ret;
rcu_read_lock();
// Validate the signal and check the sender's permission to signal p.
ret = check_kill_permission(sig, info, p);
rcu_read_unlock();
if (!ret && sig)
// Deliver the signal; per the article this leads on to send_signal() and then __send_signal().
ret = do_send_sig_info(sig, info, p, true);
return ret;
}
/*
 * Abridged excerpt of __send_signal(): only the tail that records the signal
 * as pending and passes it on to complete_signal() is shown.
 */
static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
int group, int from_ancestor_ns)
{
struct sigpending *pending;
struct sigqueue *q;
int override_rlimit;
int ret = 0, result;
/* (the middle of the function is elided in this excerpt) */
........
out_set:
// Notify any signalfd that is listening for this signal.
signalfd_notify(t, sig);
// Add the signal to the pending signal set.
sigaddset(&pending->signal, sig);
// Continue processing: pick a thread to handle the signal.
complete_signal(sig, t, group);
ret:
trace_signal_generate(sig, info, t, group, result);
return ret;
}
static void complete_signal(int sig, struct task_struct *p, int group)
{
struct signal_struct *signal = p->signal;
struct task_struct *t;
#ifdef CONFIG_BOOST_KILL
cpumask_t new_mask = CPU_MASK_NONE;
#endif
//查找处理该信号的线程
if (wants_signal(sig, p))
t = p;
else if (!group || thread_group_empty(p))
/*
* There is just one thread and it does not need to be woken.
* It will dequeue unblocked signals before it runs again.
*/
return;
else {
/*
* 找到一个适合的线程
*/
t = signal->curr_target;
while (!wants_signal(sig, t)) {
t = next_thread(t);
if (t == signal->curr_target)
/*
* No thread needs to be woken.
* Any eligible threads will see
* the signal in the queue soon.
*/
return;
}
signal->curr_target = t;
}
//找到一个能被杀掉的线程,如果信号是9信号,就杀死整个线程组
if (sig_fatal(p, sig) &&
!(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
!sigismember(&t->real_blocked, sig) &&
(sig == SIGKILL || !t->ptrace)) {
/这个信号将杀死整个线程组
if (!sig_kernel_coredump(sig)) {
/*
* Start a group exit and wake everybody up.
* This way we don't have other threads
* running and doing things after a slower
* thread has the fatal signal pending.
*/
signal->flags = SIGNAL_GROUP_EXIT;
signal->group_exit_code = sig;
signal->group_stop_count = 0;
t = p;
//循环遍历整个线程组,给线程发9信号
do {
#ifdef CONFIG_BOOST_KILL
if (sysctl_boost_killing) {
if (can_nice(t, -20))
set_user_nice(t, -20);
arch_get_fast_cpus(&new_mask);
}
#endif
task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
//向信号集加入SIGKILL信号
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
} while_each_thread(p, t);
return;
}
}
* 信号已经在共享队列里
* 唤醒选择的目标线程,并把该信号移除队列
signal_wake_up(t, sig == SIGKILL);
return;
}
SIGKILL信号也就是9号信号,不能被捕获,因此不会交给目标进程的Signal Catcher线程处理;而3号信号SIGQUIT则会被捕获
信号的捕获则需要先说下进程启动的时候对于捕获信号的相关处理以及如何启动SignalCatcher线程
下面的方法都是进程启动时候涉及的内容,不再赘述。
com_android_internal_os_Zygote.cpp
这里只列出我们需要关注的方法
/**
 * Abridged excerpt of ForkAndSpecializeCommon(): only the signal-handler
 * setup around fork() is shown; every other step of the original function
 * is omitted here.
 *
 * @return the value returned by fork() (0 in the child, the child's pid in
 *         the parent, negative on failure).
 */
static pid_t ForkAndSpecializeCommon(JNIEnv* env, uid_t uid, gid_t gid, jintArray javaGids,
                                     jint debug_flags, jobjectArray javaRlimits,
                                     jlong permittedCapabilities, jlong effectiveCapabilities,
                                     jint mount_external,
                                     jstring java_se_info, jstring java_se_name,
                                     bool is_system_server, jintArray fdsToClose,
                                     jintArray fdsToIgnore,
                                     jstring instructionSet, jstring dataDir) {
    // Install the SIGCHLD handler (and ignore SIGHUP) before forking.
    SetSignalHandlers();

    // Fork the child process.
    pid_t pid = fork();

    if (pid == 0) {
        // Child: restore the default SIGCHLD disposition.
        UnsetChldSignalHandler();

        // Call back into the VM; equivalent to zygote.callPostForkChildHooks().
        env->CallStaticVoidMethod(gZygoteClass, gCallPostForkChildHooks, debug_flags,
                                  is_system_server, instructionSet);
    } else if (pid > 0) {
        // Parent: back in the zygote process.
    }

    // NOTE(review): the excerpt stopped before the end of the function; the
    // closing return is restored so the snippet is well-formed - confirm
    // against the full source.
    return pid;
}
主要注册信号,设置信号和对应的处理动作,
SIGCHLD,在一个进程终止或者停止时,将SIGCHLD信号发送给其父进程
此处信号对应SigChldHandler
/**
 * Installs the process's signal dispositions: SIGCHLD is routed to
 * SigChldHandler and SIGHUP is ignored. A failure to install either one is
 * logged as a warning and otherwise ignored.
 */
static void SetSignalHandlers() {
    // SIGCHLD -> SigChldHandler
    struct sigaction chld_action = {};
    chld_action.sa_handler = SigChldHandler;
    const int chld_rc = sigaction(SIGCHLD, &chld_action, nullptr);
    if (chld_rc < 0) {
        ALOGW("Error setting SIGCHLD handler: %s", strerror(errno));
    }

    // SIGHUP -> ignored
    struct sigaction hup_action = {};
    hup_action.sa_handler = SIG_IGN;
    const int hup_rc = sigaction(SIGHUP, &hup_action, nullptr);
    if (hup_rc < 0) {
        ALOGW("Error setting SIGHUP handler: %s", strerror(errno));
    }
}
/**
 * Restores the default disposition for SIGCHLD. A failure is logged as a
 * warning and otherwise ignored.
 */
static void UnsetChldSignalHandler() {
    struct sigaction default_action;
    memset(&default_action, 0, sizeof(default_action));
    default_action.sa_handler = SIG_DFL;

    // Put SIGCHLD back to its default behaviour.
    const int rc = sigaction(SIGCHLD, &default_action, nullptr);
    if (rc < 0) {
        ALOGW("Error unsetting SIGCHLD handler: %s", strerror(errno));
    }
}
相当于调用zygote.callPostForkChildHooks()
最后会调用
Runtime::Current()->InitNonZygoteOrPostFork
void Runtime::InitNonZygoteOrPostFork(
824 JNIEnv* env, bool is_system_server, NativeBridgeAction action, const char* isa) {
825 is_zygote_ = false;
826
......
839
840 // 创建java堆处理的线程池
841 heap_->CreateThreadPool();
842 // 重置GC性能数据,进程创建前不会GC到当前app
843 // before fork aren't attributed to an app.
844 heap_->ResetGcPerformanceInfo();
845
846 // We may want to collect profiling samples for system server, but we never want to JIT there.
847 if ((!is_system_server || !jit_options_->UseJitCompilation()) &&
848 !safe_mode_ &&
849 (jit_options_->UseJitCompilation() || jit_options_->GetSaveProfilingInfo()) &&
850 jit_ == nullptr) {
851 // Note that when running ART standalone (not zygote, nor zygote fork),
852 // the jit may have already been created.
853 CreateJit();
854 }
855 //设置信号处理函数
856 StartSignalCatcher();
857
858 // 启动JDWP thread. 当命令行 debugger的 flags 设置 "suspend=y",
859 // 就会暂停 runtime,
860 Dbg::StartJdwp();
861}
这里启动SignalCatcher线程
863void Runtime::StartSignalCatcher() {
864 if (!is_zygote_) {
865 signal_catcher_ = new SignalCatcher(stack_trace_file_, use_tombstoned_traces_);
866 }
867}
这里看下signal_catcher的构造函数和run 执行后的情况
/art/runtime/signal_catcher.cc
73SignalCatcher::SignalCatcher(const std::string& stack_trace_file,
74 bool use_tombstoned_stack_trace_fd)
75 : stack_trace_file_(stack_trace_file),
76 use_tombstoned_stack_trace_fd_(use_tombstoned_stack_trace_fd),
77 lock_("SignalCatcher lock"),
78 cond_("SignalCatcher::cond_", lock_),
79 thread_(nullptr) {
80#if !defined(ART_TARGET_ANDROID)
81 // We're not running on Android, so we can't communicate with tombstoned
82 // to ask for an open file.
83 CHECK(!use_tombstoned_stack_trace_fd_);
84#endif
85
86 SetHaltFlag(false);
87
88 // 通过pthread_create命令创建一个线程,线程名为signal catcher thread.
这个就是我们打trace的时候,见到的信号捕捉的线程。
SignalCatcher是一个守护线程,用于捕获SIGQUIT、SIGUSR1信号,并采取相应的行为
89 CHECK_PTHREAD_CALL(pthread_create, (&pthread_, nullptr, &Run, this), "signal catcher thread");
90
91 Thread* self = Thread::Current();
92 MutexLock mu(self, lock_);
93 while (thread_ == nullptr) {
94 cond_.Wait(self);
95 }
96}
void* SignalCatcher::Run(void* arg) {
232 SignalCatcher* signal_catcher = reinterpret_cast
//检查当前线程是否为null
233 CHECK(signal_catcher != nullptr);
234
235 Runtime* runtime = Runtime::Current();
//检查当前线程是否依附在Android Runtime
236 CHECK(runtime->AttachCurrentThread("Signal Catcher", true, runtime->GetSystemThreadGroup(),
237 !runtime->IsAotCompiler()));
238
239 Thread* self = Thread::Current();
240 DCHECK_NE(self->GetState(), kRunnable);
241 {
242 MutexLock mu(self, signal_catcher->lock_);
243 signal_catcher->thread_ = self;
244 signal_catcher->cond_.Broadcast(self);
245 }
246
247 // Set up mask with signals we want to handle.
248 SignalSet signals;
249 signals.Add(SIGQUIT);//添加对SIGQUIT 也就是3信号的处理
250 signals.Add(SIGUSR1);//添加SIGUSR1 也就是10信号的处理
251
252 while (true) {
阻塞线程,等待信号到来
253 int signal_number = signal_catcher->WaitForSignal(self, signals);
当信号捕捉到以后,解除和ART的绑定
254 if (signal_catcher->ShouldHalt()) {
255 runtime->DetachCurrentThread();
256 return nullptr;
257 }
258
259 switch (signal_number) {
260 case SIGQUIT:
261 signal_catcher->HandleSigQuit();//dump线程trace
262 break;
263 case SIGUSR1:
264 signal_catcher->HandleSigUsr1();//强制GC
265 break;
266 default:
267 LOG(ERROR) << "Unexpected signal %d" << signal_number;
268 break;
269 }
270 }
271}