SCSI 有两种常见的故障类型:
一种是卡自身发生故障,主动上报错误中断,或者在正常中断中携带错误信息;
另一种是卡没有响应,中断丢失,最终由定时器超时触发错误处理。
对于第一种故障模型:
硬件中断处理结束后会触发软中断,处理流程如下(“=>”表示展开上一行所调用函数的实现):
/*
 * Annotated call trace (not a compilable unit): the block-layer completion
 * softirq, with each callee's body pasted inline after a "=>" marker.
 *
 * Flow shown here:
 *   blk_done_softirq()
 *     -> rq->q->softirq_done_fn(rq), expanded as scsi_softirq_done()
 *          -> scsi_decide_disposition() selects SUCCESS / NEEDS_RETRY /
 *             ADD_TO_MLQUEUE / default
 *          -> default: scsi_eh_scmd_add() queues the command for the
 *             error-handler thread
 */
void blk_done_softirq(struct softirq_action *h)
{
struct list_head *cpu_list, local_list;
/* Move the blk_cpu_done list into local_list while local interrupts are disabled. */
local_irq_disable();
cpu_list = &__get_cpu_var(blk_cpu_done);
list_replace_init(cpu_list, &local_list);
local_irq_enable();
while (!list_empty(&local_list)) {// walk the list and run each request's completion hook
struct request *rq;
rq = list_entry(local_list.next, struct request, csd.list);
list_del_init(&rq->csd.list);
rq->q->softirq_done_fn(rq);
=>void scsi_softirq_done(struct request *rq)
{
/* Interpret the result produced by the low-level controller's interrupt handling;
 * for a USB controller that result comes from usb_stor_invoke_transport.
 * Per the original note: errors are normally retried through the NEEDS_RETRY
 * branch, at most 5 times; beyond 5 retries the default branch is taken.
 * NOTE(review): the retry limit is not visible in this excerpt - confirm.
 */
disposition = scsi_decide_disposition(cmd);
switch (disposition) {
case SUCCESS:
scsi_finish_command(cmd);
break;
case NEEDS_RETRY:
scsi_queue_insert(cmd, SCSI_MLQUEUE_EH_RETRY);
break;
case ADD_TO_MLQUEUE:
scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY);
break;
default:
/* ret is the negation of scsi_eh_scmd_add()'s result (1 = queued, 0 = not queued). */
ret = !scsi_eh_scmd_add(cmd, 0);
=>int scsi_eh_scmd_add(struct scsi_cmnd *scmd, int eh_flag)
{
struct Scsi_Host *shost = scmd->device->host;
unsigned long flags;
int ret = 0;
/* Without an error-handler thread on this host the command is not queued. */
if (!shost->ehandler)
return 0;
spin_lock_irqsave(shost->host_lock, flags);
/* If both scsi_host_set_state() calls return non-zero, skip queuing and return 0. */
if (scsi_host_set_state(shost, SHOST_RECOVERY))
if (scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY))
goto out_unlock;
ret = 1;
scmd->eh_eflags |= eh_flag;
/* Append the command to the host's eh_cmd_q and count it as failed. */
list_add_tail(&scmd->eh_entry, &shost->eh_cmd_q);
shost->host_failed++;
scsi_eh_wakeup(shost);// wake the error-handler thread
void scsi_eh_wakeup(struct Scsi_Host *shost)// body is long; it is expanded in the next section of these notes
out_unlock:
spin_unlock_irqrestore(shost->host_lock, flags);
return ret;
}
/* Finish the command here only when it was NOT queued for error handling. */
if (ret)
scsi_finish_command(cmd);
}
}
}
}
错误处理线程流程如下:
/*
 * Annotated call trace (not a compilable unit): waking the SCSI error-handler
 * thread and what that thread then does. Each callee's body is pasted inline
 * after a "=>" marker.
 *
 * Flow shown here:
 *   scsi_eh_wakeup()
 *     -> wake_up_process(shost->ehandler), expanded as scsi_error_handler()
 *          -> transportt->eh_strategy_handler() if set, else scsi_unjam_host()
 *               -> scsi_eh_get_sense(), scsi_eh_abort_cmds()
 *                    (USB storage abort path: command_abort() and the URB
 *                     unlink chain)
 *               -> scsi_eh_ready_devs(): escalating resets, then offline
 *               -> scsi_eh_flush_done_q()
 *          -> scsi_restart_operations()
 */
void scsi_eh_wakeup(struct Scsi_Host *shost)
{
/* Wake the handler thread only when host_busy equals host_failed. */
if (shost->host_busy == shost->host_failed)
wake_up_process(shost->ehandler);
=>int scsi_error_handler(void *data)
{
struct Scsi_Host *shost = data;
/*
* We use TASK_INTERRUPTIBLE so that the thread is not
* counted against the load average as a running process.
* We never actually get interrupted because kthread_run
* disables signal delivery for the created thread.
*/
set_current_state(TASK_INTERRUPTIBLE);
while (!kthread_should_stop()) {
/*
 * Go back to sleep when there are no failed commands and nothing is
 * scheduled, or while host_failed still differs from host_busy.
 */
if ((shost->host_failed == 0 && shost->host_eh_scheduled == 0) ||
shost->host_failed != shost->host_busy) {
schedule();
set_current_state(TASK_INTERRUPTIBLE);
continue;
}
__set_current_state(TASK_RUNNING);
/*
* We have a host that is failing for some reason. Figure out
* what we need to do to get it up and online again (if we can).
* If we fail, we end up taking the thing offline.
*/
if (shost->transportt->eh_strategy_handler)// use the transport's custom strategy hook when one is provided
shost->transportt->eh_strategy_handler(shost);
else
scsi_unjam_host(shost);// otherwise use the default handler
=>void scsi_unjam_host(struct Scsi_Host *shost)
{
unsigned long flags;
LIST_HEAD(eh_work_q);
LIST_HEAD(eh_done_q);
/* Take the pending commands off shost->eh_cmd_q onto the local eh_work_q under host_lock. */
spin_lock_irqsave(shost->host_lock, flags);
list_splice_init(&shost->eh_cmd_q, &eh_work_q);
spin_unlock_irqrestore(shost->host_lock, flags);
SCSI_LOG_ERROR_RECOVERY(1, scsi_eh_prt_fail_stats(shost, &eh_work_q));
/*
 * NOTE(review): in this trace the statement guarded by the two nested ifs
 * appears to be the scsi_eh_ready_devs() call that follows the inline
 * expansion of the abort path below - confirm against the kernel source.
 */
if (!scsi_eh_get_sense(&eh_work_q, &eh_done_q))
if (!scsi_eh_abort_cmds(&eh_work_q, &eh_done_q))
/*Scsiglue.c (drivers\usb\storage): .eh_abort_handler = command_abort,*/
=>int command_abort(struct scsi_cmnd *srb)
{
set_bit(US_FLIDX_TIMED_OUT, &us->dflags);// original note: "woken by the timer"; sets the TIMED_OUT flag
/* Abort only if a reset is not already in progress. */
if (!test_bit(US_FLIDX_RESETTING, &us->dflags)) {
set_bit(US_FLIDX_ABORTING, &us->dflags);
usb_stor_stop_transport(us);
/* Stop the current URB transfer */
=>void usb_stor_stop_transport(struct us_data *us)
{
if (test_and_clear_bit(US_FLIDX_URB_ACTIVE, &us->dflags)) {
US_DEBUGP("-- cancelling URB\n");
usb_unlink_urb(us->current_urb);
/*
 * The unlink status (-ECONNRESET) is passed down the chain below and is
 * finally stored in urb->status before the URB's completion callback runs.
 */
=>int usb_unlink_urb(struct urb *urb)
{
return usb_hcd_unlink_urb(urb, -ECONNRESET);
=>int usb_hcd_unlink_urb (struct urb *urb, int status)
{
retval = unlink1(hcd, urb, status);
=>int unlink1(struct usb_hcd *hcd, struct urb *urb, int status)
{
value = usb_rh_urb_dequeue(hcd, urb, status);
=>int usb_rh_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
{
usb_hcd_giveback_urb(hcd, urb, status);
=>void usb_hcd_giveback_urb(struct usb_hcd *hcd, struct urb *urb, int status)
{
urb->status = status;
urb->complete (urb);
}
}
}
}
}
}
}
}
}
scsi_eh_ready_devs(shost, &eh_work_q, &eh_done_q);
=>void scsi_eh_ready_devs(struct Scsi_Host *shost,
struct list_head *work_q,
struct list_head *done_q)
{
/* Each step runs only if the previous one returned zero. */
if (!scsi_eh_stu(shost, work_q, done_q))// escalate step by step from the mildest reset to the most drastic
if (!scsi_eh_bus_device_reset(shost, work_q, done_q))
if (!scsi_eh_target_reset(shost, work_q, done_q))
if (!scsi_eh_bus_reset(shost, work_q, done_q))
if (!scsi_eh_host_reset(work_q, done_q))
/* if nothing worked, take the devices offline */
scsi_eh_offline_sdevs(work_q, done_q);
}
scsi_eh_flush_done_q(&eh_done_q);
}
/*
* Note - if the above fails completely, the action is to take
* individual devices offline and flush the queue of any
* outstanding requests that may have been pending. When we
* restart, we restart any I/O to any other devices on the bus
* which are still online.
*/
scsi_restart_operations(shost);
set_current_state(TASK_INTERRUPTIBLE);
}
/* kthread_should_stop() returned true: clear the host's handler pointer and exit. */
__set_current_state(TASK_RUNNING);
shost->ehandler = NULL;
return 0;
}
}