msm8916平台Linux watchdog详解(1)

在msm8610里,有一个debug选项LOCKUP_DETECTOR。之前有过说明。
在msm8916里,又多了CONFIG_MSM_WATCHDOG_V2 和CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU两个宏。
CONFIG_MSM_WATCHDOG_V2后面再说,先说一下LOCKUP_DETECTOR和CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU。

下面来分析一下代码,然后看一下系统怎么检测soft lockup和hard lockup。

接下来我们从具体代码入手分析linux(3.10)是如何实现这两种lockup的探测的:

1.初始化

/*
 * Descriptor of the per-CPU "watchdog/%u" threads.  It is handed to
 * smpboot_register_percpu_thread() by lockup_detector_init().
 */
static struct smp_hotplug_thread watchdog_threads = {
    /* Name template used for the threads. */
    .thread_comm = "watchdog/%u",

    /*
     * Per-CPU slot for the thread's task pointer; watchdog_timer_fn()
     * reads it back to wake the thread up.
     */
    .store = &softlockup_watchdog,

    /* Lifecycle hooks: one enable routine serves both setup and unpark. */
    .setup = watchdog_enable,
    .unpark = watchdog_enable,
    .park = watchdog_disable,

    /*
     * Thread body callbacks.
     * NOTE(review): per the original annotation the thread's real entry
     * point is smpboot_thread_fn(), which in turn calls watchdog(); that
     * function is not shown here -- confirm.
     */
    .thread_should_run = watchdog_should_run,
    .thread_fn = watchdog,
};
/*
 * lockup_detector_init - boot-time setup of the lockup detector.
 *
 * Sets the sampling period and registers the per-CPU watchdog threads
 * described by watchdog_threads.  If registration fails the failure is
 * logged and watchdog_disabled is set to -ENODEV.
 */
void __init lockup_detector_init(void)
{
    int err;

    /*
     * NOTE(review): per the original annotation this sets sample_period
     * to 4 seconds expressed in nanoseconds; set_sample_period() is not
     * shown here -- confirm.
     */
    set_sample_period();

    /*
     * Registration works together with the smp_hotplug_thread descriptor.
     * NOTE(review): the original annotation says the threads are created
     * as CPUs are hot-plugged, and that this plays roughly the same role
     * as the register_cpu_notifier() callback used on msm8610 -- confirm
     * against the smpboot code.
     */
    err = smpboot_register_percpu_thread(&watchdog_threads);
    if (!err)
        return;

    pr_err("Failed to create watchdog threads, disabled\n");
    watchdog_disabled = -ENODEV;
}

每个CPU的watchdog线程在创建(.setup)或从park状态恢复(.unpark)时,都会调用watchdog_enable():

/*
 * watchdog_enable - start lockup detection for one CPU.
 * @cpu: CPU number, passed on to watchdog_nmi_enable().
 *
 * Installed as both the .setup and the .unpark hook of watchdog_threads.
 * It prepares the per-CPU sampling timer; when watchdog_enabled is clear
 * it parks the current thread instead of arming anything.
 */
static void watchdog_enable(unsigned int cpu)
{
    struct hrtimer *timer = &__raw_get_cpu_var(watchdog_hrtimer);

    /* kick off the timer for the hardlockup detector */
    hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
    timer->function = watchdog_timer_fn;

    /* Detector switched off: park this thread and leave the timer idle. */
    if (!watchdog_enabled) {
        kthread_park(current);
        return;
    }

    /* Enable the perf event */
    watchdog_nmi_enable(cpu);

    /*
     * done here because hrtimer_start can only pin to smp_processor_id()
     *
     * From here on watchdog_timer_fn() fires after sample_period, and it
     * re-arms itself on every run.
     * NOTE(review): the original annotation gives the period as 4 seconds;
     * sample_period is set elsewhere -- confirm.
     */
    hrtimer_start(timer, ns_to_ktime(sample_period), HRTIMER_MODE_REL_PINNED);

    watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);

    /* initialize timestamp */
    __touch_watchdog();
}

2.softlockup检测

然后第一次watchdog_enable运行之后,过4秒就会调用watchdog_timer_fn()。

static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
    unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);//watchdog_touch_ts每次在
        //__touch_watchdog里边会赋值成get_timestamp(),get_timestamp()函数在
        //watchdog.c文件里边的定义,返回值是秒数!
        //但touch_ts的值绝大多数时间都是0,因为touch_softlockup_watchdog()函数和
        //touch_nmi_watchdog()频繁被调用,把watchdog_touch_ts的值赋值为0
        //没有仔细看touch_softlockup_watchdog和touch_nmi_watchdog的调用点,但好像都是在
        //调度的时候调用这两个函数去把watchdog_touch_ts清零的。
        //如果这里发现touch_ts不是0,那表明上次watchdog被唤醒,一直到现在都没有调度,应该是有问题的。
        //那怎么判断是否是真的有问题呢?继续往下看
    struct pt_regs *regs = get_irq_regs();
    int duration;

    pr_info("watchdog_timer_fn started\n");
    /* kick the hardlockup detector */
    watchdog_interrupt_count();

    /* test for hardlockups on the next cpu */
    watchdog_check_hardlockup_other_cpu();//这个先不管,后面再说明怎么检测其他cpu上是否有hardlock的

    /* kick the softlockup detector */
    wake_up_process(__this_cpu_read(softlockup_watchdog));//唤醒watchdog进程
    //前面说过softlockup_watchdog这里保存watchdog相关的task_struct指针的指针,
    //所以这里用来唤醒watchdog的task。

    /* .. and repeat */
    hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));//延长当前hrtimer到期的时间!!
        //使用这个函数,下一次hrtimer到期的时间是现在的时间加上ns_to_ktime(sample_period)这个时间!!

    if (touch_ts == 0) {
        if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
            /* * If the time stamp was touched atomically * make sure the scheduler tick is up to date. */
            __this_cpu_write(softlockup_touch_sync, false);
            sched_clock_tick();
        }

        /* Clear the guest paused flag on watchdog reset */
        kvm_check_and_clear_guest_paused();
        __touch_watchdog();
        return HRTIMER_RESTART;
    }

    /* check for a softlockup * This is done by making sure a high priority task is * being scheduled. The task touches the watchdog to * indicate it is getting cpu time. If it hasn't then * this is a good indication some task is hogging the cpu */
    //能跑到这里,像上面说的,上次watchdog进程被唤醒之后,到现在一直是没有调度的。
    //但是怎么判断现在已经有问题呢?下面is_softlockup()函数判断,上次watchdog被唤醒到现在这段时间,是否大于20秒
    //如果大于20秒,那就进入下面的if语句了,表明系统有问题。但具体是hardlock还是softlock还得判断一下
    duration = is_softlockup(touch_ts);
    if (unlikely(duration)) {
        /* * If a virtual machine is stopped by the host it can look to * the watchdog like a soft lockup, check to see if the host * stopped the vm before we issue the warning */
        if (kvm_check_and_clear_guest_paused())
            return HRTIMER_RESTART;

        /* only warn once */
        if (__this_cpu_read(soft_watchdog_warn) == true)
            return HRTIMER_RESTART;

        printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
            smp_processor_id(), duration,
            current->comm, task_pid_nr(current));
        print_modules();
        print_irqtrace_events(current);
        if (regs)
            show_regs(regs);
        else
            dump_stack();

        if (softlockup_panic)
            panic("softlockup: hung tasks");
        __this_cpu_write(soft_watchdog_warn, true);
    } else
        __this_cpu_write(soft_watchdog_warn, false);

    return HRTIMER_RESTART;//重新开始hrtimer!!和上面的
            //hrtimer_forward_now函数配合,可以重新
            //开始hrtimer并设置下一次运行的时间!!
}

你可能感兴趣的:(msm8916平台Linux watchdog详解(1))