SDIO WiFi调试经验总结

最近和硬件同事调SDIO WiFi时遇到一个奇怪的问题:只要启动wlan0网卡,CPU负载就会很高,系统严重卡顿。用top命令查看:

Mem: 9744K used, 16672K free, 0K shrd, 0K buff, 5248K cached
CPU:  0.0% usr 96.9% sys  0.0% nic  3.0% idle  0.0% io  0.0% irq  0.0% sirq
Load average: 1.27 0.32 0.11 2/35 119
  PID  PPID USER     STAT   VSZ %VSZ CPU %CPU COMMAND
   68     2 root     RW       0  0.0   0 96.9 [ksdioirqd/mmc1]
   73    57 root     D     4716 17.8   0  0.0 wpa_supplicant -Dnl80211 -iwlan0 -
   57     1 root     S     1840  6.9   0  0.0 -sh
    1     0 root     S     1824  6.9   0  0.0 {linuxrc} init
  119    57 root     R     1820  6.8   0  0.0 top
   59     1 root     S     1812  6.8   0  0.0 telnetd
   45     2 root     SWN      0  0.0   0  0.0 [jffs2_gcd_mtd2]
   40     2 root     DW       0  0.0   0  0.0 [wl_event_handle]
   35     2 root     SW       0  0.0   0  0.0 [kworker/u2:1]
   15     2 root     SW       0  0.0   0  0.0 [kworker/0:1]
    7     2 root     SW       0  0.0   0  0.0 [rcu_preempt]
   11     2 root     SW<      0  0.0   0  0.0 [khelper]
   13     2 root     SW<      0  0.0   0  0.0 [bioset]
   10     2 root     SW       0  0.0   0  0.0 [watchdog/0]
    2     0 root     SW       0  0.0   0  0.0 [kthreadd]
    3     2 root     SW       0  0.0   0  0.0 [ksoftirqd/0]
    4     2 root     SW       0  0.0   0  0.0 [kworker/0:0]
   14     2 root     SW<      0  0.0   0  0.0 [kblockd]
    6     2 root     SW       0  0.0   0  0.0 [kworker/u2:0]

发现ksdioirqd/mmc1 CPU占用率很高,于是查看内核代码发现:

drivers/mmc/core/sdio_irq.c中的sdio_card_irq_get函数创建了 ksdioirqd/mmc1线程。

/*
 * Take one reference on the host's SDIO IRQ handling.  The first
 * reference spawns the "ksdioirqd/<host>" kernel thread that runs
 * sdio_irq_thread(); later references only bump the counter.
 *
 * Returns 0 on success, or the error encoded in the pointer returned
 * by kthread_run() when the thread could not be created.
 */
static int sdio_card_irq_get(struct mmc_card *card)
{
	struct mmc_host *host = card->host;

	WARN_ON(!host->claimed);

	/* Somebody else already started the thread: nothing more to do. */
	if (host->sdio_irqs++ != 0)
		return 0;

	atomic_set(&host->sdio_irq_thread_abort, 0);
	host->sdio_irq_thread = kthread_run(sdio_irq_thread, host,
					    "ksdioirqd/%s",
					    mmc_hostname(host));
	if (!IS_ERR(host->sdio_irq_thread))
		return 0;

	/* Thread creation failed: give back the reference taken above. */
	host->sdio_irqs--;
	return PTR_ERR(host->sdio_irq_thread);
}

接着追踪代码:

/*
 * Body of the "ksdioirqd/<host>" kernel thread started by
 * sdio_card_irq_get().
 *
 * The thread runs with SCHED_FIFO priority 1 and loops until
 * kthread_should_stop() is true or claiming the host is aborted.  On
 * each pass it claims the host, calls process_sdio_pending_irqs(),
 * releases the host and then sleeps:
 *
 *  - hosts with MMC_CAP_SDIO_IRQ sleep with MAX_SCHEDULE_TIMEOUT after
 *    re-enabling the SDIO interrupt through host->ops->enable_sdio_irq();
 *  - hosts without it poll, sleeping for an adaptive number of jiffies
 *    that never exceeds idle_period (10 ms).
 *
 * @_host: the struct mmc_host this thread serves.
 *
 * Returns the last value of ret (from __mmc_claim_host() or
 * process_sdio_pending_irqs()).
 */
static int sdio_irq_thread(void *_host)
{
	struct mmc_host *host = _host;
	struct sched_param param = { .sched_priority = 1 };
	unsigned long period, idle_period;
	int ret;

	sched_setscheduler(current, SCHED_FIFO, &param);

	/*
	 * We want to allow for SDIO cards to work even on non SDIO
	 * aware hosts.  One thing that non SDIO host cannot do is
	 * asynchronous notification of pending SDIO card interrupts
	 * hence we poll for them in that case.
	 */
	idle_period = msecs_to_jiffies(10);
	period = (host->caps & MMC_CAP_SDIO_IRQ) ?
		MAX_SCHEDULE_TIMEOUT : idle_period;

	pr_debug("%s: IRQ thread started (poll period = %lu jiffies)\n",
		 mmc_hostname(host), period);

	do {
		/*
		 * We claim the host here on drivers behalf for a couple
		 * reasons:
		 *
		 * 1) it is already needed to retrieve the CCCR_INTx;
		 * 2) we want the driver(s) to clear the IRQ condition ASAP;
		 * 3) we need to control the abort condition locally.
		 *
		 * Just like traditional hard IRQ handlers, we expect SDIO
		 * IRQ handlers to be quick and to the point, so that the
		 * holding of the host lock does not cover too much work
		 * that doesn't require that lock to be held.
		 */
		ret = __mmc_claim_host(host, &host->sdio_irq_thread_abort);
		if (ret)
			break;
		/* Check whether any SDIO interrupt is pending. */
		ret = process_sdio_pending_irqs(host);
		host->sdio_irq_pending = false;
		mmc_release_host(host);

		/*
		 * Give other threads a chance to run in the presence of
		 * errors.
		 */
		if (ret < 0) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (!kthread_should_stop())
				schedule_timeout(HZ);
			set_current_state(TASK_RUNNING);
		}

		/*
		 * Adaptive polling frequency based on the assumption
		 * that an interrupt will be closely followed by more.
		 * This has a substantial benefit for network devices.
		 */
		if (!(host->caps & MMC_CAP_SDIO_IRQ)) {
			if (ret > 0) {
				/* Interrupt seen: halve the wake-up interval. */
				period /= 2;
			} else {
				/*
				 * Nothing pending: lengthen the interval by
				 * one jiffy, capped at idle_period.
				 */
				period++;
				if (period > idle_period)
					period = idle_period;
			}
		}

		set_current_state(TASK_INTERRUPTIBLE);
		if (host->caps & MMC_CAP_SDIO_IRQ) {
			mmc_host_clk_hold(host);
			host->ops->enable_sdio_irq(host, 1);
			mmc_host_clk_release(host);
		}
		if (!kthread_should_stop())
			schedule_timeout(period);
		set_current_state(TASK_RUNNING);
	} while (!kthread_should_stop());

	if (host->caps & MMC_CAP_SDIO_IRQ) {
		mmc_host_clk_hold(host);
		host->ops->enable_sdio_irq(host, 0);
		mmc_host_clk_release(host);
	}

	pr_debug("%s: IRQ thread exiting with code %d\n",
		 mmc_hostname(host), ret);

	return ret;
}

对于不支持SDIO 中断的host,kernel会采用polling的方式来实现伪中断:不断地唤醒ksdioirqd来检查SDIO的CCCR的中断标志位。支持SDIO中断的host,则直接在SDIO中断产生的时候由host的驱动来负责通知mmc子系统唤醒ksdioirqd来检查中断。

顺着process_sdio_pending_irqs函数往下跟会发现:

/*
 * Issue one SD_IO_RW_DIRECT command on @host and wait for it.
 *
 * @write: non-zero to write @in, zero to read.
 * @fn:    SDIO function number (must be <= 7).
 * @addr:  register address; only the low 17 bits may be set.
 * @in:    byte placed in the low bits of the command argument.
 * @out:   optional; receives the data byte taken from the response.
 *         On a write, a non-NULL @out also sets bit 27 of the argument.
 *
 * Returns 0 on success, -EINVAL for an out-of-range @addr, the error
 * from mmc_wait_for_cmd(), or (non-SPI hosts only) -EIO / -EINVAL /
 * -ERANGE according to the R5 response flags.
 */
static int mmc_io_rw_direct_host(struct mmc_host *host, int write, unsigned fn,
	unsigned addr, u8 in, u8 *out)
{
	struct mmc_command cmd = {0};
	int err;

	BUG_ON(!host);
	BUG_ON(fn > 7);

	/* Reject addresses that do not fit the 17-bit field. */
	if (addr & ~0x1FFFF)
		return -EINVAL;

	/* Assemble the command argument field by field. */
	cmd.opcode = SD_IO_RW_DIRECT;
	cmd.arg = 0x00000000;
	if (write) {
		cmd.arg |= 0x80000000;
		if (out)
			cmd.arg |= 0x08000000;
	}
	cmd.arg |= fn << 28;
	cmd.arg |= addr << 9;
	cmd.arg |= in;
	cmd.flags = MMC_RSP_SPI_R5 | MMC_RSP_R5 | MMC_CMD_AC;

	err = mmc_wait_for_cmd(host, &cmd, 0);
	if (err)
		return err;

	/* On SPI hosts the host driver has already reported errors. */
	if (!mmc_host_is_spi(host)) {
		if (cmd.resp[0] & R5_ERROR)
			return -EIO;
		if (cmd.resp[0] & R5_FUNCTION_NUMBER)
			return -EINVAL;
		if (cmd.resp[0] & R5_OUT_OF_RANGE)
			return -ERANGE;
	}

	if (!out)
		return 0;

	/* The data byte sits in a different place in the SPI response. */
	if (mmc_host_is_spi(host))
		*out = (cmd.resp[0] >> 8) & 0xFF;
	else
		*out = cmd.resp[0] & 0xFF;

	return 0;
}

其中mmc_wait_for_cmd函数引起了我的注意,因为,听硬件同事提起过SDIO cmd线的事。

接着看mmc_wait_for_cmd函数:

/**
 *	mmc_wait_for_cmd - start a command and wait for completion
 *	@host: MMC host to start command
 *	@cmd: MMC command to start
 *	@retries: maximum number of retries
 *
 *	Start a new MMC command for a host, and wait for the command
 *	to complete.  Return any error that occurred while the command
 *	was executing.  Do not attempt to parse the response.
 */
int mmc_wait_for_cmd(struct mmc_host *host, struct mmc_command *cmd, int retries)
{
	struct mmc_request mrq = {NULL};

	WARN_ON(!host->claimed);

	memset(cmd->resp, 0, sizeof(cmd->resp));
	cmd->retries = retries;

	mrq.cmd = cmd;
	cmd->data = NULL;

	mmc_wait_for_req(host, &mrq);

	return cmd->error;
}

先看注释: 

 *  Start a new MMC command for a host, and wait for the command
 *    to complete.  Return any error that occurred while the command
 *    was executing.  Do not attempt to parse the response.

翻译:为host开启一个新的MMC命令,并等待命令执行完成,返回命令执行时发生的任何错误。不要试图解释响应。

再顺着跟一跟代码:

static void mmc_wait_for_req_done(struct mmc_host *host,
				  struct mmc_request *mrq)
{
	struct mmc_command *cmd;

	while (1) {
		wait_for_completion(&mrq->completion);

		cmd = mrq->cmd;
		if (!cmd->error || !cmd->retries ||
		    mmc_card_removed(host->card))
			break;

		pr_debug("%s: req failed (CMD%u): %d, retrying...\n",
			 mmc_hostname(host), cmd->opcode, cmd->error);
		cmd->retries--;
		cmd->error = 0;
		host->ops->request(host, mrq);
	}
}

其中host->ops->request(host, mrq);回调函数会调到:

/*
 * Host-driver request entry point (invoked through host->ops->request).
 *
 * Fails the request immediately with -ENOMEDIUM when no card is present.
 * Otherwise it records the request in the driver state, prepares any
 * data transfer, arms the request timeout timer and starts the command
 * (and the data phase, if there is one).
 *
 * @mmc: MMC host the request is issued on.
 * @mrq: request to execute; mrq->data may be NULL for command-only
 *       requests.
 */
static void jzmmc_request(struct mmc_host *mmc, struct mmc_request *mrq)
{
	struct jzmmc_host *host = mmc_priv(mmc);

	/* No card: complete the request right away with an error. */
	if (!test_bit(JZMMC_CARD_PRESENT, &host->flags)) {
		dev_vdbg(host->dev, "No card present\n");
		mrq->cmd->error = -ENOMEDIUM;
		mmc_request_done(mmc, mrq);
		return;
	}

	/*
	 * It means that this request may flush cache in interrupt context.
	 * It never happens in design; we only warn here (dev_warn plus
	 * WARN_ON) and carry on with the request.
	 */
	if ((host->state != STATE_IDLE) && (mrq->data != NULL)) {
		dev_warn(host->dev, "operate in non-idle state\n");
		WARN_ON(1);
	}

	/* Remember the current request so later stages can find it. */
	host->mrq = mrq;
	host->data = mrq->data;
	host->cmd = mrq->cmd;

	/* NOTE(review): "sz" is blocks >> 1, presumably KiB for 512-byte blocks — confirm. */
	if (host->data)
		dev_dbg(host->dev, "op:%d arg:0x%08X sz:%uk\n",
			 host->cmd->opcode, host->cmd->arg,
			 host->data->blocks >> 1);
	else
		dev_dbg(host->dev, "op:%d\n", host->cmd->opcode);

	/* Start from the default CMDAT value; per-command bits are OR-ed in later. */
	host->cmdat = host->cmdat_def;

	if(host->data) {
		/* Single scatterlist entry shorter than PIO_THRESHOLD: use PIO mode. */
		if ((host->data->sg_len == 1)
		    && (sg_dma_len(host->data->sg)) < PIO_THRESHOLD) {
			enable_pio_mode(host);
		}

		jzmmc_data_pre(host, host->data);
	}
	/*
	 * We would get mmc_request_done at last, unless some terrible error
	 * occurs such as intensity rebounding of VDD, that maybe result in
	 * no action to complete the request.
	 */
	host->timeout_cnt = 0;
	mod_timer(&host->request_timer, jiffies +
		  msecs_to_jiffies(TIMEOUT_PERIOD));

	jzmmc_command_start(host, host->cmd);
	/*
	 * NOTE(review): in PIO mode jzmmc_command_start() may already have
	 * completed the request (e.g. on a response timeout); the data phase
	 * below is still started in that case — confirm this is intended.
	 */
	if (host->data) {
		jzmmc_data_start(host, host->data);

	}
	/* After the first request following JZMMC_CARD_NEED_INIT, drop CMDAT_INIT from the defaults. */
	if (unlikely(test_and_clear_bit(JZMMC_CARD_NEED_INIT, &host->flags)))
		host->cmdat_def &= ~CMDAT_INIT;
}

    看注释:

    /*
     * We would get mmc_request_done at last, unless some terrible error
     * occurs such as intensity rebounding of VDD, that maybe result in
     * no action to complete the request.
     */

翻译:我们将最终调用mmc_request_done函数完成请求,除非出现一些严重的错误,例如VDD的强烈反弹,这种错误可能导致请求不能完成。

看来,jzmmc_request这个函数在执行过程中出现的某些错误,会导致请求不能完成。这样mmc_wait_for_cmd函数就等不到命令执行完成而返回错误。由于对不支持SDIO中断的host,ksdioirqd线程(其线程函数就是sdio_irq_thread)采用polling的方式来实现伪中断:被不断地唤醒来检查SDIO的CCCR的中断标志位。所以,mmc_wait_for_cmd如果一直等不到命令正常执行完成,ksdioirqd线程就会被一直反复唤醒去重试。

那么,具体是什么原因导致请求不能完成呢?

我们接着分析jzmmc_request函数,其中jzmmc_command_start函数真正为SDIO host开启了一个mmc 命令,我们看一下它的实现:

/*
 * Program the controller registers for @cmd and start the operation.
 *
 * Builds the per-command CMDAT flags (busy, auto CMD12, response type),
 * ORs them into host->cmdat, then writes CMD, ARG, CMDAT and finally
 * CTRL_START_OP to CTRL.
 *
 * In non-PIO mode the response interrupts are enabled and the state is
 * set to STATE_WAITING_RESP before the registers are written.  In PIO
 * mode the function waits for the response itself via
 * wait_cmd_response() and, for command-only requests or on failure,
 * completes the request here.
 *
 * @host: driver state; host->mrq and host->cmdat must already be set.
 * @cmd:  command to start.
 */
static void jzmmc_command_start(struct jzmmc_host *host, struct mmc_command *cmd)
{
	unsigned long cmdat = 0;	/* per-command flags, merged into host->cmdat below */
	unsigned long imsk;

	if (cmd->flags & MMC_RSP_BUSY)
		cmdat |= CMDAT_BUSY;
	if (request_need_stop(host->mrq))
		cmdat |= CMDAT_AUTO_CMD12;


	/* Map the MMC response type onto the controller's response format bits. */
	switch (mmc_resp_type(cmd)) {
#define _CASE(S,D) case MMC_RSP_##S: cmdat |= CMDAT_RESPONSE_##D; break
		_CASE(R1, R1); 	/* r1 = r5,r6,r7 */
		_CASE(R1B, R1);
		_CASE(R2, R2);
		_CASE(R3, R3); 	/* r3 = r4 */
	default:
		break;
#undef _CASE
	}
	host->cmdat |= cmdat;
	if (!is_pio_mode(host)) {
		/* Non-PIO: enable the response-timeout and end-of-response interrupts. */
		imsk = IMASK_TIME_OUT_RES | IMASK_END_CMD_RES;
		enable_msc_irq(host, imsk);
		host->state = STATE_WAITING_RESP;
	}
	/* Load opcode, argument and flags, then write the start bit last. */
	msc_writel(host, CMD, cmd->opcode);
	msc_writel(host, ARG, cmd->arg);
	msc_writel(host, CMDAT, host->cmdat);
	msc_writel(host, CTRL, CTRL_START_OP);
	if (is_pio_mode(host)) {
		/* PIO: wait for the response here; a negative result is reported as a timeout. */
		if (wait_cmd_response(host) < 0) {
			cmd->error = -ETIMEDOUT;
			del_timer_sync(&host->request_timer);
			mmc_request_done(host->mmc, host->mrq);
			return;
		}
		jzmmc_command_done(host, host->cmd);
		/* Command-only request: nothing more to wait for, finish it now. */
		if (!host->data) {
			del_timer_sync(&host->request_timer);
			mmc_request_done(host->mmc, host->mrq);
		}
	}
}

我们看到了msc_writel函数,这个函数开始操作寄存器写命令了。msc_writel只是向控制器寄存器写值,正常情况下它本身不会执行失败;但如果硬件有问题,命令即使发出去了,也可能得不到卡的正确响应。再次想起硬件同事关于SDIO cmd线的话。跟硬件同事确认发现,SDIO cmd线没有接上拉电阻。分析可能是因为SDIO cmd线没有接上拉电阻,导致命令和响应无法在cmd线上可靠传输,进而导致上述jzmmc_request函数发起的请求不能正常完成。

于是,硬件同事接上上拉电阻,一切正常了。

你可能感兴趣的:(其他)