ide-scsi error handling

public inbox for linux-scsi@vger.kernel.org
 help / color / mirror / Atom feed

* ide-scsi error handling
@ 2003-05-18 23:07 Willem Riede
  2003-05-19 13:01 ` Douglas Gilbert
  2003-05-19 14:36 ` Randy.Dunlap
  0 siblings, 2 replies; 6+ messages in thread
From: Willem Riede @ 2003-05-18 23:07 UTC (permalink / raw)
  To: linux-scsi

Now that current 2.5 kernels boot again for me (haven't been able
to run current kernels for months, but I digress) I've done some
more work on the error handling in ide-scsi.

The patch below works for me "most of the time". I'd appreciate if
others try it, and report results, or inspect the patch and comment.

Against 2.5.69-bk13.

Thanks, Willem Riede.

diff -uwr linux-2.5.69-bk13/drivers/ide/ide-iops.c linux-2.5.69-bk13-wr/drivers/ide/ide-iops.c
--- linux-2.5.69-bk13/drivers/ide/ide-iops.c	2003-05-18 11:10:22.000000000 -0400
+++ linux-2.5.69-bk13-wr/drivers/ide/ide-iops.c	2003-05-18 13:51:02.000000000 -0400
@@ -1134,6 +1134,7 @@
 		if (hwif->reset_poll(drive)) {
 			printk(KERN_ERR "%s: host reset_poll failure for %s.\n",
 				hwif->name, drive->name);
+			hwgroup->busy--;
 			return ide_stopped;
 		}
 	}
@@ -1179,6 +1180,7 @@
 		}
 	}
 	hwgroup->poll_timeout = 0;	/* done polling */
+	hwgroup->busy--;
 	return ide_stopped;
 }
 
@@ -1267,6 +1269,7 @@
 #if OK_TO_RESET_CONTROLLER
 	if (!IDE_CONTROL_REG) {
 		spin_unlock_irqrestore(&ide_lock, flags);
+		hwgroup->busy--;
 		return ide_stopped;
 	}
 
@@ -1315,6 +1318,7 @@
 
 ide_startstop_t ide_do_reset (ide_drive_t *drive)
 {
+	HWGROUP(drive)->busy++;
 	return do_reset1(drive, 0);
 }
 
diff -uwr linux-2.5.69-bk13/drivers/scsi/ide-scsi.c linux-2.5.69-bk13-wr/drivers/scsi/ide-scsi.c
--- linux-2.5.69-bk13/drivers/scsi/ide-scsi.c	2003-05-18 11:11:30.000000000 -0400
+++ linux-2.5.69-bk13-wr/drivers/scsi/ide-scsi.c	2003-05-18 14:23:47.000000000 -0400
@@ -270,7 +270,7 @@
 	printk("]\n");
 }
 
-static int idescsi_check_condition(ide_drive_t *drive, struct request *failed_command)
+static int idescsi_check_condition(ide_drive_t *drive, idescsi_pc_t *failed_command)
 {
 	idescsi_scsi_t *scsi = drive_to_idescsi(drive);
 	idescsi_pc_t   *pc;
@@ -298,8 +298,8 @@
 	rq->flags = REQ_SENSE;
 	pc->timeout = jiffies + WAIT_READY;
 	/* NOTE! Save the failed packet command in "rq->buffer" */
-	rq->buffer = (void *) failed_command->special;
-	pc->scsi_cmd = ((idescsi_pc_t *) failed_command->special)->scsi_cmd;
+	rq->buffer = (void *) failed_command;
+	pc->scsi_cmd = failed_command->scsi_cmd;
 	if (test_bit(IDESCSI_LOG_CMD, &scsi->log)) {
 		printk ("ide-scsi: %s: queue cmd = ", drive->name);
 		hexdump(pc->c, 6);
@@ -307,6 +307,23 @@
 	return ide_do_drive_cmd(drive, rq, ide_preempt);
 }
 
+ide_startstop_t idescsi_atapi_abort (ide_drive_t *drive, const char *msg)
+{
+	struct request *rq;
+
+	if (drive == NULL || (rq = HWGROUP(drive)->rq) == NULL)
+		return ide_stopped;
+	/* retry only "normal" I/O: */
+	if (rq->flags & (REQ_DRIVE_CMD | REQ_DRIVE_TASK)) {
+		rq->errors = 1;
+		ide_end_drive_cmd(drive, BUSY_STAT, 0);
+		return ide_stopped;
+	}
+	rq->errors |= ERROR_RESET;
+	DRIVER(drive)->end_request(drive, 0, 0);
+	return ide_stopped;
+}
+
 static int idescsi_end_request (ide_drive_t *drive, int uptodate, int nrsecs)
 {
 	idescsi_scsi_t *scsi = drive_to_idescsi(drive);
@@ -342,7 +359,7 @@
 	} else if (rq->errors) {
 		if (log)
 			printk ("ide-scsi: %s: check condition for %lu\n", drive->name, pc->scsi_cmd->serial_number);
-		if (!idescsi_check_condition(drive, rq))
+		if (!idescsi_check_condition(drive, pc))
 			/* we started a request sense, so we'll be back, exit for now */
 			return 0;
 		pc->scsi_cmd->result = (CHECK_CONDITION << 1) | (DID_OK << 16);
@@ -536,12 +553,8 @@
 		set_bit(PC_DMA_OK, &pc->flags);
 
 	if (test_bit(IDESCSI_DRQ_INTERRUPT, &scsi->flags)) {
-		if (HWGROUP(drive)->handler != NULL)
-			BUG();
-		ide_set_handler(drive, &idescsi_transfer_pc,
-				get_timeout(pc), NULL);
 		/* Issue the packet command */
-		HWIF(drive)->OUTB(WIN_PACKETCMD, IDE_COMMAND_REG);
+		ide_execute_command(drive, WIN_PACKETCMD, &idescsi_transfer_pc, get_timeout(pc), NULL);
 		return ide_started;
 	} else {
 		/* Issue the packet command */
@@ -633,6 +646,7 @@
 	.cleanup		= idescsi_cleanup,
 	.do_request		= idescsi_do_request,
 	.end_request		= idescsi_end_request,
+	.abort			= idescsi_atapi_abort,
 	.drives			= LIST_HEAD_INIT(idescsi_driver.drives),
 };
 
@@ -664,8 +678,6 @@
 	.ioctl		= idescsi_ide_ioctl,
 };
 
-static int idescsi_attach(ide_drive_t *drive);
-
 static int idescsi_slave_configure(Scsi_Device * sdp)
 {
 	/* Configure detected device */
@@ -846,13 +858,15 @@
 	return 1;
 }
 
-static int idescsi_abort (Scsi_Cmnd *cmd)
+static int idescsi_scsi_eh_abort (Scsi_Cmnd *cmd)
 {
-	int countdown = 8;
+	int countdown = 120;    /* maximum is 12 seconds because ide interrupt timeout is 10 sec. */
 	unsigned long flags;
 	idescsi_scsi_t *scsi = scsihost_to_idescsi(cmd->device->host);
 	ide_drive_t *drive = scsi->drive;
 
+	if (!drive)
+		return FAILED;
 	printk (KERN_ERR "ide-scsi: abort called for %lu\n", cmd->serial_number);
 	while (countdown--) {
 		/* is cmd active?
@@ -863,56 +877,85 @@
 			/* yep - let's give it some more time - 
 			 * we can do that, we're in _our_ error kernel thread */
 			spin_unlock_irqrestore(&ide_lock, flags);
-			scsi_sleep(HZ);
+#if IDESCSI_DEBUG_LOG
+			printk(KERN_WARNING "ide-scsi: waiting in abort\n");
+#endif
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(HZ/10);
 			continue;
 		}
 		/* no, but is it queued in the ide subsystem? */
 		if (elv_queue_empty(&drive->queue)) {
 			spin_unlock_irqrestore(&ide_lock, flags);
+#if IDESCSI_DEBUG_LOG
+			printk(KERN_WARNING "ide-scsi: abort success return\n");
+#endif
 			return SUCCESS;
 		}
 		spin_unlock_irqrestore(&ide_lock, flags);
+		set_current_state(TASK_INTERRUPTIBLE);
 		schedule_timeout(HZ/10);
 	}
+#if IDESCSI_DEBUG_LOG
+	printk(KERN_WARNING "ide-scsi: abort fail return\n");
+#endif
 	return FAILED;
 }
 
-static int idescsi_reset (Scsi_Cmnd *cmd)
+static int idescsi_scsi_eh_reset (Scsi_Cmnd *cmd)
 {
+	int countdown = 10;
 	unsigned long flags;
 	struct request *req;
 	idescsi_scsi_t *idescsi = scsihost_to_idescsi(cmd->device->host);
 	ide_drive_t *drive = idescsi->drive;
 
+	if (!drive)
+		return FAILED;
 	printk (KERN_ERR "ide-scsi: reset called for %lu\n", cmd->serial_number);
-	/* first null the handler for the drive and let any process
-	 * doing IO (on another CPU) run to (partial) completion
-	 * the lock prevents processing new requests */
+        /*
+         * Abort the current command on the group if there is one, taking care not to
+         * allow anything else to be queued and to die on the spot if we miss one somehow
+         */
 	spin_lock_irqsave(&ide_lock, flags);
-	while (HWGROUP(drive)->handler) {
-		HWGROUP(drive)->handler = NULL;
-		schedule_timeout(1);
-	}
-	/* now nuke the drive queue */
-	while ((req = elv_next_request(&drive->queue))) {
-		blkdev_dequeue_request(req);
-		end_that_request_last(req);
-	}
+        DRIVER(drive)->abort(drive, "drive reset");
+        if (HWGROUP(drive)->handler)
+                BUG();
 	/* FIXME - this will probably leak memory */
 	HWGROUP(drive)->rq = NULL;
 	if (drive_to_idescsi(drive))
 		drive_to_idescsi(drive)->pc = NULL;
+        /*
+         * we use the busy flag to reserve the hwgroup for ourselves without holding
+         * the ide lock for a long time during the reset, reset will clear the busy
+         */
+        HWGROUP(drive)->busy = 1;
 	spin_unlock_irqrestore(&ide_lock, flags);
+#if IDESCSI_DEBUG_LOG
+	printk(KERN_WARNING "ide-scsi: drive->abort completed, now do_reset\n");
+#endif
 	/* finally, reset the drive (and its partner on the bus...) */
-	ide_do_reset (drive);	
+	(void) ide_do_reset (drive);
+	/* in theory, this can take 30 seconds, but ide_spin_wait_hwgroup waits only 3,
+	 * usually, that is enough, but we call repeatedly, just to be covered */
+	while (ide_spin_wait_hwgroup(drive) && countdown--)
+		printk (KERN_INFO "ide-scsi: waiting for reset drive to complete\n");
+	/* for some reason when successful ide_spin_wait_hwgroup exits with ide_lock taken */
+	if (countdown) spin_unlock_irq(&ide_lock);
+	if (HWGROUP(drive)->handler) {
+		printk (KERN_CRIT "ide-scsi: reset drive did not complete in time\n");
+		return FAILED;
+	}
+#if IDESCSI_DEBUG_LOG
+	printk(KERN_WARNING "ide-scsi: reset success return\n");
+#endif
 	return SUCCESS;
 }
 
 static int idescsi_bios(struct scsi_device *sdev, struct block_device *bdev,
 		sector_t capacity, int *parm)
 {
-	idescsi_scsi_t *idescsi = scsihost_to_idescsi(sdev->host);
-	ide_drive_t *drive = idescsi->drive;
+	ide_drive_t *drive = scsihost_to_idescsi(sdev->host)->drive;
 
 	if (drive->bios_cyl && drive->bios_head && drive->bios_sect) {
 		parm[0] = drive->bios_head;
@@ -929,8 +972,10 @@
 	.slave_configure        = idescsi_slave_configure,
 	.ioctl			= idescsi_ioctl,
 	.queuecommand		= idescsi_queue,
-	.eh_abort_handler	= idescsi_abort,
-	.eh_device_reset_handler = idescsi_reset,
+	.eh_abort_handler	= idescsi_scsi_eh_abort,
+	.eh_device_reset_handler = idescsi_scsi_eh_reset,
+	.eh_bus_reset_handler   = idescsi_scsi_eh_reset,
+	.eh_host_reset_handler  = idescsi_scsi_eh_reset,
 	.bios_param		= idescsi_bios,
 	.can_queue		= 40,
 	.this_id		= -1,

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: ide-scsi error handling
  2003-05-18 23:07 ide-scsi error handling Willem Riede
@ 2003-05-19 13:01 ` Douglas Gilbert
  2003-05-19 23:42   ` Willem Riede
  2003-05-19 14:36 ` Randy.Dunlap
  1 sibling, 1 reply; 6+ messages in thread
From: Douglas Gilbert @ 2003-05-19 13:01 UTC (permalink / raw)
  To: wrlk; +Cc: linux-scsi, rddunlap

[-- Attachment #1: Type: text/plain, Size: 1011 bytes --]

Willem Riede wrote:
> Now that current 2.5 kernels boot again for me (haven't been able
> to run current kernels for months, but I digress) I've done some
> more work on the error handling in ide-scsi.
> 
> The patch below works for me "most of the time". I'd appreciate if
> others try it, and report results, or inspect the patch and comment.

Willem,
As reported to you earlier I got the attached "sleeping
from illegal context" followed by an oops when I tried
to exercise ide-scsi with your patch on bk13. My setup:
   - SMP kernel running on a UP machine with all kernel
     debugging configured on
   - PIO which is the default (can try with DMA later)
   - cdrecord on an atapi writer running at the same time
     as sg_dd on an atapi reader (i.e. 2 active ide-scsi
     devices)
The machine was still usable after the oops (sg_dd
seg faulted).

This was the same test that broke my patches a few days
back. Randy Dunlap seemed to be able to get ide-scsi
to break with far less provocation.

Doug Gilbert

[-- Attachment #2: ide-scsi2569bk14wr.txt --]
[-- Type: text/plain, Size: 3428 bytes --]

Debug: sleeping function called from illegal context at include/linux/rwsem.h:43
Call Trace:
 [<c011f92c>] __might_sleep+0x5c/0x5e
 [<c011b2d8>] do_page_fault+0x78/0x4a8
 [<e0823c4b>] idescsi_transfer_pc+0xfb/0x130 [ide_scsi]
 [<c0233ce7>] start_request+0x107/0x160
 [<c021ac05>] elv_queue_empty+0x25/0x30
 [<c0233dc6>] ide_do_request+0x56/0x3e0
 [<c021a9e3>] __elv_add_request+0x33/0x50
 [<c011b260>] do_page_fault+0x0/0x4a8
 [<c010a1c9>] error_code+0x2d/0x38
 [<e0824534>] idescsi_queue+0x244/0x6d0 [ide_scsi]
 [<c0246909>] __scsi_get_command+0x29/0xc0
 [<c0246fc8>] scsi_dispatch_cmd+0x218/0x3e0
 [<c0247320>] scsi_done+0x0/0x80
 [<c024a6d0>] scsi_times_out+0x0/0x90
 [<c024d946>] scsi_prep_fn+0xd6/0x150
 [<c024ca70>] scsi_init_cmd_errh+0xa0/0xd0
 [<c024dc06>] scsi_request_fn+0x246/0x410
 [<c021a9e3>] __elv_add_request+0x33/0x50
 [<c021ce1c>] blk_insert_request+0x9c/0xf0
 [<c024c819>] scsi_do_req+0x49/0xa0
 [<c024c673>] scsi_insert_special_req+0x33/0x40
 [<e0893062>] sg_common_write+0x1c2/0x240 [sg]
 [<e08944a0>] sg_cmd_done+0x0/0x330 [sg]
 [<e0892df9>] sg_new_write+0x1e9/0x290 [sg]
 [<c024d9fe>] scsi_request_fn+0x3e/0x410
 [<e0893cbf>] sg_ioctl+0xbdf/0xe00 [sg]
 [<c021aab6>] elv_next_request+0x16/0x100
 [<c024d9fe>] scsi_request_fn+0x3e/0x410
 [<c011da9a>] __wake_up_common+0x3a/0x60
 [<c016f074>] kill_fasync+0x44/0x4d
 [<e0894645>] sg_cmd_done+0x1a5/0x330 [sg]
 [<c0247719>] scsi_finish_command+0xf9/0x150
 [<c024748c>] scsi_softirq+0xec/0x240
 [<c016f42d>] sys_ioctl+0x15d/0x2e6
 [<c0118abd>] smp_apic_timer_interrupt+0xcd/0x140
 [<c010975f>] syscall_call+0x7/0xb

Unable to handle kernel paging request at virtual address 6b6b6b7b
 printing eip:
e0824534
*pde = 00000000
Oops: 0000 [#1]
CPU:    0
EIP:    0060:[<e0824534>]    Tainted: G S
EFLAGS: 00010086
EIP is at idescsi_queue+0x244/0x6d0 [ide_scsi]
eax: 6b6b6b6b   ebx: df65eb1c   ecx: c0406b48   edx: 00000000
esi: dfd94998   edi: de1c3000   ebp: db233c84   esp: db233c3c
ds: 007b   es: 007b   ss: 0068
Process sg_dd (pid: 1900, threadinfo=db232000 task=ddcac080)
Stack: c0406b48 dffe587c 00000003 c0246909 dfdff500 00000020 0011b565 c150b400 
       00000000 dfd949e5 de3d7c38 dffe587c c0406b48 db233c98 0000ea60 de3d7aac 
       dfd94998 de3d7a80 db233cd4 c0246fc8 dfd94998 c0247320 c024a6d0 db233cb8 
Call Trace:
 [<c0246909>] __scsi_get_command+0x29/0xc0
 [<c0246fc8>] scsi_dispatch_cmd+0x218/0x3e0
 [<c0247320>] scsi_done+0x0/0x80
 [<c024a6d0>] scsi_times_out+0x0/0x90
 [<c024d946>] scsi_prep_fn+0xd6/0x150
 [<c024ca70>] scsi_init_cmd_errh+0xa0/0xd0
 [<c024dc06>] scsi_request_fn+0x246/0x410
 [<c021a9e3>] __elv_add_request+0x33/0x50
 [<c021ce1c>] blk_insert_request+0x9c/0xf0
 [<c024c819>] scsi_do_req+0x49/0xa0
 [<c024c673>] scsi_insert_special_req+0x33/0x40
 [<e0893062>] sg_common_write+0x1c2/0x240 [sg]
 [<e08944a0>] sg_cmd_done+0x0/0x330 [sg]
 [<e0892df9>] sg_new_write+0x1e9/0x290 [sg]
 [<c024d9fe>] scsi_request_fn+0x3e/0x410
 [<e0893cbf>] sg_ioctl+0xbdf/0xe00 [sg]
 [<c021aab6>] elv_next_request+0x16/0x100
 [<c024d9fe>] scsi_request_fn+0x3e/0x410
 [<c011da9a>] __wake_up_common+0x3a/0x60
 [<c016f074>] kill_fasync+0x44/0x4d
 [<e0894645>] sg_cmd_done+0x1a5/0x330 [sg]
 [<c0247719>] scsi_finish_command+0xf9/0x150
 [<c024748c>] scsi_softirq+0xec/0x240
 [<c016f42d>] sys_ioctl+0x15d/0x2e6
 [<c0118abd>] smp_apic_timer_interrupt+0xcd/0x140
 [<c010975f>] syscall_call+0x7/0xb

Code: 8b 40 10 8b 70 34 81 7e 04 ad 4e ad de 74 1c c7 44 24 04 0f 

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: ide-scsi error handling
  2003-05-18 23:07 ide-scsi error handling Willem Riede
  2003-05-19 13:01 ` Douglas Gilbert
@ 2003-05-19 14:36 ` Randy.Dunlap
  2003-05-19 14:37   ` Randy.Dunlap
  1 sibling, 1 reply; 6+ messages in thread
From: Randy.Dunlap @ 2003-05-19 14:36 UTC (permalink / raw)
  To: wrlk; +Cc: linux-scsi

On Sun, 18 May 2003 19:07:06 -0400 Willem Riede <wrlk@riede.org> wrote:

| Now that current 2.5 kernels boot again for me (haven't been able
| to run current kernels for months, but I digress) I've done some
| more work on the error handling in ide-scsi.
| 
| The patch below works for me "most of the time". I'd appreciate if
| others try it, and report results, or inspect the patch and comment.
| 
| Against 2.5.69-bk13.

Hi,

Here's what I get after applying this patch.
Machine is dual-proc P4 1.7 GHz IBM Netfinity, with 2 ATA hard
drives (hda, hdb), CD-ROM (hdc), CD-RW (hdd), 1 SCSI hard drive.

scsi2 : SCSI host adapter emulation for IDE ATAPI devices
ide-scsi: abort called for 32
hdd: lost interrupt
ide-scsi: CoD != 0 in idescsi_pc_intr
hdd: DMA disabled
hdd: ATAPI reset complete
hdd: irq timeout: status=0x80 { Busy }
hdd: ATAPI reset complete
hdd: irq timeout: status=0x80 { Busy }
hdd: ATAPI reset complete
hdd: irq timeout: status=0x80 { Busy }

I rebooted after > 5 minutes of nothing else logged.

--
~Randy

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: ide-scsi error handling
  2003-05-19 14:36 ` Randy.Dunlap
@ 2003-05-19 14:37   ` Randy.Dunlap
  0 siblings, 0 replies; 6+ messages in thread
From: Randy.Dunlap @ 2003-05-19 14:37 UTC (permalink / raw)
  To: wrlk; +Cc: linux-scsi

On Mon, 19 May 2003 07:36:26 -0700 "Randy.Dunlap" <rddunlap@osdl.org> wrote:

| On Sun, 18 May 2003 19:07:06 -0400 Willem Riede <wrlk@riede.org> wrote:
| 
| | Now that current 2.5 kernels boot again for me (haven't been able
| | to run current kernels for months, but I digress) I've done some
| | more work on the error handling in ide-scsi.
| | 
| | The patch below works for me "most of the time". I'd appreciate if
| | others try it, and report results, or inspect the patch and comment.
| | 
| | Against 2.5.69-bk13.
| 
| Hi,
| 
| Here's what I get after applying this patch.
| Machine is dual-proc P4 1.7 GHz IBM Netfinity, with 2 ATA hard
| drives (hda, hdb), CD-ROM (hdc), CD-RW (hdd), 1 SCSI hard drive.
| 
| scsi2 : SCSI host adapter emulation for IDE ATAPI devices
| ide-scsi: abort called for 32
| hdd: lost interrupt
| ide-scsi: CoD != 0 in idescsi_pc_intr
| hdd: DMA disabled
| hdd: ATAPI reset complete
| hdd: irq timeout: status=0x80 { Busy }
| hdd: ATAPI reset complete
| hdd: irq timeout: status=0x80 { Busy }
| hdd: ATAPI reset complete
| hdd: irq timeout: status=0x80 { Busy }
| 
| 
| I rebooted after > 5 minutes of nothing else logged.

I didn't make this clear.  This is hanging during boot.
I didn't run any userspace programs.

--
~Randy

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: ide-scsi error handling
  2003-05-19 13:01 ` Douglas Gilbert
@ 2003-05-19 23:42   ` Willem Riede
  2003-05-20 11:24     ` Douglas Gilbert
  0 siblings, 1 reply; 6+ messages in thread
From: Willem Riede @ 2003-05-19 23:42 UTC (permalink / raw)
  To: dougg; +Cc: linux-scsi, rddunlap

On 2003.05.19 09:01, Douglas Gilbert wrote:
> Debug: sleeping function called from illegal context at include/linux/rwsem.h:43
> Call Trace:
>  [<c011f92c>] __might_sleep+0x5c/0x5e
>  [<c011b2d8>] do_page_fault+0x78/0x4a8
>  [<e0823c4b>] idescsi_transfer_pc+0xfb/0x130 [ide_scsi]

A page fault while in idescsi_transfer_pc?!
What memory would be accessed that is allowed to be paged out?

By the way, I have never seen that problem. When ide-scsi fails for me, it
is in the same way Randy reports. While my change improves mean-time-to-hang
significantly on my machine, it obviously doesn't for Randy. Back to the
drawing board :-(

Thanks, Willem Riede.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: ide-scsi error handling
  2003-05-19 23:42   ` Willem Riede
@ 2003-05-20 11:24     ` Douglas Gilbert
  0 siblings, 0 replies; 6+ messages in thread
From: Douglas Gilbert @ 2003-05-20 11:24 UTC (permalink / raw)
  To: wrlk; +Cc: linux-scsi

[-- Attachment #1: Type: text/plain, Size: 1331 bytes --]

Willem Riede wrote:
> On 2003.05.19 09:01, Douglas Gilbert wrote:
> 
>>Debug: sleeping function called from illegal context at include/linux/rwsem.h:43
>>Call Trace:
>> [<c011f92c>] __might_sleep+0x5c/0x5e
>> [<c011b2d8>] do_page_fault+0x78/0x4a8
>> [<e0823c4b>] idescsi_transfer_pc+0xfb/0x130 [ide_scsi]
> 
> 
> A page fault while in idescsi_transfer_pc?!
> What memory would be accessed that is allowed to be paged out?
> 
> By the way, I have never seen that problem. When ide-scsi fails for me, it
> is in the same way Randy reports. While my change improves mean-time-to-hang
> significantly on my machine, it obviously doesn't for Randy. Back to the
> drawing board :-(

Willem,
When I tried today, my test went for a while then
failed with a timeout and an abort lockup (which
you reported as fixed but I don't have that fix):
   hdb: irq timeout: status=0xd0 { Busy }
   ide-scsi: abort called for 330982
   hdb: ATAPI reset complete
  <<machine lockup, alt-sysrq inactive>>

Attached is a patch to idescsi_queue(). Won't fix the
problems we are seeing now. Changes:
   - returns 0 on error (not 1 which means "busy")
   - yields DID_NO_CONNECT for an invalid channel, id or
     lun (this should fix the "responding to multiple
     lun" problem often seen in lk 2.4)
   - memset the whole of pc and rq to zero

Doug Gilbert



[-- Attachment #2: ide-scsi2569bk13wr_d1.diff --]
[-- Type: text/plain, Size: 1741 bytes --]

--- linux/drivers/scsi/ide-scsi.c	2003-05-19 12:30:34.000000000 +1000
+++ linux/drivers/scsi/ide-scsi.c2569bk13wr_d1	2003-05-20 20:30:46.000000000 +1000
@@ -795,11 +795,20 @@
 
 static int idescsi_queue (Scsi_Cmnd *cmd, void (*done)(Scsi_Cmnd *))
 {
+	struct scsi_device * sdev = cmd->device;
 	idescsi_scsi_t *scsi = scsihost_to_idescsi(cmd->device->host);
 	ide_drive_t *drive = scsi->drive;
 	struct request *rq = NULL;
 	idescsi_pc_t *pc = NULL;
 
+	if ((sdev->channel > 0) ||
+	    (sdev->id >= sdev->host->max_id) ||
+	    (sdev->lun >= sdev->host->max_lun)) {
+		printk(KERN_INFO "ide-scsi: channel:id:lun %d:%d:%d not "
+		       "present\n", sdev->channel, sdev->id, sdev->lun);
+		cmd->result = DID_NO_CONNECT << 16;
+		goto abort1;
+	}
 	if (!drive) {
 		printk (KERN_ERR "ide-scsi: drive id %d not present\n", cmd->device->id);
 		goto abort;
@@ -811,9 +820,8 @@
 		printk (KERN_ERR "ide-scsi: %s: out of memory\n", drive->name);
 		goto abort;
 	}
-
-	memset (pc->c, 0, 12);
-	pc->flags = 0;
+	memset(pc, 0, sizeof(idescsi_pc_t));
+	memset(rq, 0, sizeof(struct request));
 	pc->rq = rq;
 	memcpy (pc->c, cmd->cmnd, cmd->cmd_len);
 	if (cmd->use_sg) {
@@ -846,16 +854,17 @@
 	rq->special = (char *) pc;
 	rq->bio = idescsi_dma_bio (drive, pc);
 	rq->flags = REQ_SPECIAL;
-	spin_unlock_irq(cmd->device->host->host_lock);
+	spin_unlock_irq(sdev->host->host_lock);
 	(void) ide_do_drive_cmd (drive, rq, ide_end);
-	spin_lock_irq(cmd->device->host->host_lock);
+	spin_lock_irq(sdev->host->host_lock);
 	return 0;
 abort:
+	cmd->result = DID_ERROR << 16;
+abort1:
 	if (pc) kfree (pc);
 	if (rq) kfree (rq);
-	cmd->result = DID_ERROR << 16;
 	done(cmd);
-	return 1;
+	return 0;
 }
 
 static int idescsi_scsi_eh_abort (Scsi_Cmnd *cmd)

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2003-05-20 11:11 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-05-18 23:07 ide-scsi error handling Willem Riede
2003-05-19 13:01 ` Douglas Gilbert
2003-05-19 23:42   ` Willem Riede
2003-05-20 11:24     ` Douglas Gilbert
2003-05-19 14:36 ` Randy.Dunlap
2003-05-19 14:37   ` Randy.Dunlap

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.