1,问题描述
硬件环境:飞腾S2500(64核)
OS:kylinOS, linux preempt rt, 4.19.90
测试命令:sigwaittest -p 90 -i 1000 -a 1
测试结果:信号混洗值最大超过了80us,与飞腾其他CPU的设备相比较,增大了很多
2,调试过程
1)日志分析
使用trace-cmd命令来抓取ftrace log:trace-cmd start -e all; sigwaittest -p 90 -i 1000 -a 1 -b 80
得到如下日志:
sigwaitt-13091 1....0 1741.963614: sys_enter: NR 131 (3321, 3322, a, 20c49ba5e353f7cf, 3b9ac9ff, 44cc0)
sigwaitt-13091 1...10 1741.963615: sys_enter_tgkill: tgid: 0x00003321, pid: 0x00003322, sig: 0x0000000a
sigwaitt-13091 1d..11 1741.963616: sched_waking: comm=sigwaittest pid=13090 prio=9 target_cpu=001
sigwaitt-13091 1d..21 1741.963617: sched_wakeup: sigwaittest:13090 [9] success=1 CPU:001
sigwaitt-13091 1....1 1741.963618: signal_generate: sig=10 errno=0 code=-6 comm=sigwaittest pid=13090 grp=0 res=0
=====chensong:70us
sigwaitt-13091 1d..21 1741.963689: sched_waking: comm=sigwaittest pid=13089 prio=120 target_cpu=041
sigwaitt-13091 1d..31 1741.963690: sched_wakeup: sigwaittest:13089 [120] success=1 CPU:041
=====
sigwaitt-13091 1....0 1741.963691: kfree: (__audit_syscall_exit+0x1d8) call_site=ffff0000081d26b0 ptr=(nil)
sigwaitt-13091 1....0 1741.963691: sys_exit: NR 131 = 0
sigwaitt-13091 1...10 1741.963692: sys_exit_tgkill: 0x0
对比正常时候的日志:
sigwaitt-13091 1....0 1741.962569: sys_enter: NR 131 (3321, 3322, a, 20c49ba5e353f7cf, 3b9ac9ff, 44cc0)
sigwaitt-13091 1...10 1741.962570: sys_enter_tgkill: tgid: 0x00003321, pid: 0x00003322, sig: 0x0000000a
sigwaitt-13091 1d..11 1741.962571: sched_waking: comm=sigwaittest pid=13090 prio=9 target_cpu=001
sigwaitt-13091 1d..21 1741.962572: sched_wakeup: sigwaittest:13090 [9] success=1 CPU:001
sigwaitt-13091 1....1 1741.962573: signal_generate: sig=10 errno=0 code=-6 comm=sigwaittest pid=13090 grp=0 res=0
sigwaitt-13091 1....0 1741.962574: kfree: (__audit_syscall_exit+0x1d8) call_site=ffff0000081d26b0 ptr=(nil)
sigwaitt-13091 1....0 1741.962574: sys_exit: NR 131 = 0
sigwaitt-13091 1...10 1741.962574: sys_exit_tgkill: 0x0
sigwaitt-13091 1....0 1741.962575: sys_enter: NR 137 (fffdf677e848, fffdf677e6b8, 0, 8, 0, fffdf677f0e0)
sigwaitt-13091 1...10 1741.962575: sys_enter_rt_sigtimedwait: uthese: 0xfffdf677e848, uinfo: 0xfffdf677e6b8, uts: 0x00000000, sigsetsize: 0x00000008
sigwaitt-13091 1d..10 1741.962576: rcu_utilization: Start context switch
sigwaitt-13091 1d..10 1741.962577: rcu_utilization: End context switch
sigwaitt-13091 1d..20 1741.962578: sched_switch: sigwaittest:13091 [9] S ==> sigwaittest:13090 [9]
我们会发现,在signal_generate后,sigwaittest的测试线程又唤醒了一个线程,消耗了70us。
2)过程分析
命令“sigwaittest -p 90 -i 1000 -a 1 -b 80”会创建三个线程:一个是主线程main thread,是一个CFS的普通线程,很大一部分工作是打印测试的结果;一个是sender,是一个实时线程,负责发送信号;还有一个是receiver,负责接收信号。sigwaittest主要就是测试sender发送信号到receiver接收到信号所花费的时间。
3)代码分析
首先看一下发送信号的kill函数,对应内核中系统调用tgkill:
使用function_graph来看一下tgkill的调用过程:trace-cmd start -p function_graph -g sys_tgkill;sigwaittest -p 90 -i 1000 -a 1 -l 100;trace-cmd stop
# tracer: function_graph
#
# CPU DURATION FUNCTION CALLS
# | | | | | | |
1) | sys_tgkill() {
1) | do_tkill() {
1) | __task_pid_nr_ns() {
1) 0.770 us | __rcu_read_lock();
1) 0.500 us | __rcu_read_unlock();
1) 3.062 us | }
1) | from_kuid_munged() {
1) 0.500 us | map_id_up();
1) 1.563 us | }
1) | do_send_specific() {
1) 0.479 us | __rcu_read_lock();
1) 0.666 us | find_task_by_vpid();
1) | __task_pid_nr_ns() {
1) 0.500 us | __rcu_read_lock();
1) 0.479 us | __rcu_read_unlock();
1) 2.542 us | }
1) | check_kill_permission() {
1) | audit_signal_info() {
1) | auditd_test_task() {
1) 0.500 us | __rcu_read_lock();
1) 0.479 us | __rcu_read_unlock();
1) 2.604 us | }
1) | audit_signal_info_syscall() {
1) 0.500 us | __rcu_read_lock();
1) 0.500 us | __rcu_read_unlock();
1) 2.458 us | }
1) 7.042 us | }
1) 0.604 us | security_task_kill();
1) 9.271 us | }
1) | do_send_sig_info() {
1) | __lock_task_sighand() {
1) 0.500 us | __rcu_read_lock();
1) | rt_spin_lock() {
1) 0.500 us | __rcu_read_lock();
1) 0.500 us | migrate_disable();
1) 2.480 us | }
1) 0.500 us | __rcu_read_unlock();
1) 5.459 us | }
1) | send_signal() {
1) 0.563 us | siginfo_layout();
1) 0.479 us | __rcu_read_lock();
1) 0.500 us | __rcu_read_lock();
1) 0.500 us | __rcu_read_unlock();
1) 0.500 us | __rcu_read_unlock();
1) 0.563 us | task_active_pid_ns();
1) | __task_pid_nr_ns() {
1) 0.500 us | __rcu_read_lock();
1) 0.480 us | __rcu_read_unlock();
1) 2.479 us | }
1) | __send_signal() {
1) 0.583 us | prepare_signal();
1) | __sigqueue_do_alloc() {
1) 0.563 us | __rcu_read_lock();
1) 0.480 us | __rcu_read_unlock();
1) | kmem_cache_alloc() {
1) 0.479 us | should_failslab();
1) 1.708 us | }
1) 4.833 us | }
1) | complete_signal() {
1) 0.604 us | _raw_spin_lock_irqsave();
1) 0.500 us | _raw_spin_unlock_irqrestore();
1) | signal_wake_up_state() {
1) | wake_up_state() {
1) | try_to_wake_up() {
1) 0.562 us | _raw_spin_lock_irqsave();
1) 0.542 us | _raw_spin_lock();
1) 0.541 us | update_rq_clock();
1) | ttwu_do_activate() {
1) | activate_task() {
1) | enqueue_task_rt() {
1) | dequeue_rt_stack() {
1) 0.521 us | dequeue_top_rt_rq();
1) 1.521 us | }
1) 0.500 us | update_rt_migration();
1) 0.500 us | _raw_spin_lock();
1) 0.521 us | enqueue_top_rt_rq();
1) 5.604 us | }
1) 6.958 us | }
1) | ttwu_do_wakeup() {
1) | check_preempt_curr() {
1) 0.625 us | check_preempt_curr_rt();
1) 1.625 us | }
1) 0.521 us | task_woken_rt();
1) 4.605 us | }
1) + 13.084 us | }
1) 0.541 us | _raw_spin_unlock_irqrestore();
1) + 19.104 us | }
1) + 20.146 us | }
1) + 21.188 us | }
1) + 24.542 us | }
1) + 32.709 us | }
1) + 42.792 us | }
1) | rt_spin_unlock() {
1) 0.563 us | migrate_enable();
1) 0.521 us | __rcu_read_unlock();
1) 2.750 us | }
1) + 53.271 us | }
1) 0.541 us | __rcu_read_unlock();
1) + 70.500 us | }
1) + 77.562 us | }
1) + 81.333 us | }
------------------------------------------
1) sigwait-31931 => sigwait-31930
------------------------------------------
简化一下调用过程就是:
tgkill
--> do_tkill
-->do_send_specific
-->do_send_sig_info
--> lock_task_sighand //申请锁tsk->sighand->siglock
send_signal //唤醒receiver
unlock_task_sighand(p, &flags); //释放tsk->sighand->siglock
在通常情况下,sender调用send_signal唤醒receiver,这个过程就结束了。但在发生错误的日志中,我们发现sender还唤醒了main thread,那么,很可能main thread也在申请tsk->sighand->siglock,这个时候它正在siglock的等待队列中等待,那么,当sender调用unlock_task_sighand的时候,就会去唤醒main thread。
我们再来看看main thread的代码:
526 while (!mustshutdown) {
527 int printed;
528 int errorlines = 0;
529
530 for (i = 0; i < num_threads; i++)
531 mustshutdown |= receiver[i].shutdown |
532 sender[i].shutdown;
533
534 if (receiver[0].samples > oldsamples || mustshutdown) {
...//打印结果
581 pthread_sigmask(SIG_SETMASK, &sigset, NULL);
582
583 nanosleep(&maindelay, NULL);
584
585 sigemptyset(&sigset);
586 pthread_sigmask(SIG_SETMASK, &sigset, NULL);
}
其中pthread_sigmask(SIG_SETMASK, &sigset, NULL)会进入内核调用:
2870 int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
2871 {
2872 struct task_struct *tsk = current;
2873 sigset_t newset;
2874
2875 /* Lockless, only current can change ->blocked, never from irq */
2876 if (oldset)
2877 *oldset = tsk->blocked;
2878
2879 switch (how) {
2880 case SIG_BLOCK:
2881 sigorsets(&newset, &tsk->blocked, set);
2882 break;
2883 case SIG_UNBLOCK:
2884 sigandnsets(&newset, &tsk->blocked, set);
2885 break;
2886 case SIG_SETMASK:
2887 newset = *set;
2888 break;
2889 default:
2890 return -EINVAL;
2891 }
2892
2893 __set_current_blocked(&newset);
2894 return 0;
2895 }
在函数 __set_current_blocked(&newset)中,也需要申请tsk->sighand->siglock:
2846 void __set_current_blocked(const sigset_t *newset)
2847 {
2848 struct task_struct *tsk = current;
2849
2850 /*
2851 * In case the signal mask hasn't changed, there is nothing we need
2852 * to do. The current->blocked shouldn't be modified by other task.
2853 */
2854 if (sigequalsets(&tsk->blocked, newset))
2855 return;
2856
2857 spin_lock_irq(&tsk->sighand->siglock);
2858 __set_task_blocked(tsk, newset);
2859 spin_unlock_irq(&tsk->sighand->siglock);
2860 }
所形成的关系大概是这样
sender main thread
lock_task_sighand
send_signal(sig, info, p, type) spin_lock_irq(&tsk->sighand->siglock) //被阻塞
unlock_task_sighand(p, &flags)                获取锁,继续执行sigprocmask中后续的工作
__set_task_blocked(tsk, newset);
spin_unlock_irq(&tsk->sighand->siglock);//释放锁
本来这个锁的释放,唤醒进程都是很简短的过程,通常都是几微秒,为什么这个设备上会出现70us的问题呢,我们看sender和receiver都运行在CPU1上,而main thread是运行在CPU41上,是不是不在一个numa node上,对远端的内存访问会消耗很长时间?
3, 解决方案:将main thread与sender,receiver放到同一个node上
taskset -c 2 ./sigwaittest -p 90 -i 1000 -a 1 -b 80 //不再重现
如果将main thread强制放在CPU41上呢:
taskset -c 41 ./sigwaittest -p 90 -i 1000 -a 1 -b 80 // 很快重现
4, 后记
同事后来又反馈了一个问题,该测试在标准内核上运行,就不会出现同样的问题。
我想了一下,发现sender所调用的unlock_task_sighand-->spin_unlock_irqrestore,使用的是spin lock,spin lock在preempt rt和标准的内核有着不同的语义。
在preempt rt内核中,spin lock其实就是rtmutex,是可以休眠的,所以main thread会在申请不到siglock的时候会进入休眠等待的状态,sender在释放siglock的时候,会在等待队列中唤醒main thread,导致后面消耗时间很多的情况。
在标准内核中,spin lock是忙等,sender释放锁之后不需要唤醒main thread,少了这个步骤,至少就不会出现与上面log描述的同样的问题了。