[PATCH] qla2xxx: Fix dpc_thread race on the module unload

public inbox for linux-scsi@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH] qla2xxx: Fix dpc_thread race on the module unload
@ 2008-07-28 17:33 Vladislav Bolkhovitin
  2008-07-28 17:41 ` Andrew Vasquez
                   ` (2 more replies)
  0 siblings, 3 replies; 22+ messages in thread
From: Vladislav Bolkhovitin @ 2008-07-28 17:33 UTC (permalink / raw)
  To: linux-driver; +Cc: linux-scsi, scst-devel

This patch fixes race on dpc_thread field of struct scsi_qla_host,
which can lead to crash on the module unload.

This patch is against 2.6.26

Signed-off-by: Vladislav Bolkhovitin <vst@vlnb.net>

 qla_def.h |    1 +
 qla_mbx.c |    2 +-
 qla_os.c  |   36 ++++++++++++++++++++++++++----------
 3 files changed, 28 insertions(+), 11 deletions(-)

diff -upr linux-2.6.26/drivers/scsi/qla2xxx/qla_def.h linux-2.6.26/drivers/scsi/qla2xxx/qla_def.h
--- linux-2.6.26/drivers/scsi/qla2xxx/qla_def.h	2008-07-14 01:51:29.000000000 +0400
+++ linux-2.6.26/drivers/scsi/qla2xxx/qla_def.h	2008-07-24 10:07:39.000000000 +0400
@@ -2425,6 +2436,7 @@ typedef struct scsi_qla_host {
 	void			*sfp_data;
 	dma_addr_t		sfp_data_dma;
 
+	spinlock_t		dpc_lock;
 	struct task_struct	*dpc_thread;
 	uint8_t dpc_active;                  /* DPC routine is active */
 
diff -upr linux-2.6.26/drivers/scsi/qla2xxx/qla_mbx.c linux-2.6.26/drivers/scsi/qla2xxx/qla_mbx.c
--- linux-2.6.26/drivers/scsi/qla2xxx/qla_mbx.c	2008-07-14 01:51:29.000000000 +0400
+++ linux-2.6.26/drivers/scsi/qla2xxx/qla_mbx.c	2008-07-23 19:27:20.000000000 +0400
@@ -2683,7 +2687,7 @@ qla24xx_report_id_acquisition(scsi_qla_h
 		set_bit(VP_IDX_ACQUIRED, &vha->vp_flags);
 		set_bit(VP_DPC_NEEDED, &ha->dpc_flags);
 
-		wake_up_process(ha->dpc_thread);
+		qla2xxx_wake_dpc(ha);
 	}
 }
 
diff -upr linux-2.6.26/drivers/scsi/qla2xxx/qla_os.c linux-2.6.26/drivers/scsi/qla2xxx/qla_os.c
--- linux-2.6.26/drivers/scsi/qla2xxx/qla_os.c	2008-07-14 01:51:29.000000000 +0400
+++ linux-2.6.26/drivers/scsi/qla2xxx/qla_os.c	2008-07-24 10:13:45.000000000 +0400
@@ -1632,6 +1630,8 @@ qla2x00_probe_one(struct pci_dev *pdev, 
 	/* load the F/W, read paramaters, and init the H/W */
 	ha->instance = num_hosts;
 
+	spin_lock_init(&ha->dpc_lock);
+
 	mutex_init(&ha->vport_lock);
 	init_completion(&ha->mbx_cmd_comp);
 	complete(&ha->mbx_cmd_comp);
@@ -1764,6 +1770,26 @@ qla2x00_remove_one(struct pci_dev *pdev)
 }
 
 static void
+qla2x00_stop_dpc_thread(scsi_qla_host_t *ha)
+{
+	struct task_struct *t = NULL;
+
+	spin_lock_irq(&ha->dpc_lock);
+	if (ha->dpc_thread != NULL) {
+		t = ha->dpc_thread;
+		/*
+		 * qla2xxx_wake_dpc checks for ->dpc_thread
+		 * so we need to zero it out.
+		 */
+		ha->dpc_thread = NULL;
+	}
+	spin_unlock_irq(&ha->dpc_lock);
+
+	if (t != NULL)
+		kthread_stop(t);
+}
+
+static void
 qla2x00_free_device(scsi_qla_host_t *ha)
 {
 	qla2x00_abort_all_cmds(ha, DID_NO_CONNECT << 16);
@@ -1775,16 +1801,7 @@ qla2x00_free_device(scsi_qla_host_t *ha)
 	ha->flags.online = 0;
 
 	/* Kill the kernel thread for this host */
-	if (ha->dpc_thread) {
-		struct task_struct *t = ha->dpc_thread;
-
-		/*
-		 * qla2xxx_wake_dpc checks for ->dpc_thread
-		 * so we need to zero it out.
-		 */
-		ha->dpc_thread = NULL;
-		kthread_stop(t);
-	}
+	qla2x00_stop_dpc_thread(ha);
 
 	if (ha->flags.fce_enabled)
 		qla2x00_disable_fce_trace(ha, NULL, NULL);
@@ -2427,8 +2444,11 @@ qla2x00_do_dpc(void *data)
 void
 qla2xxx_wake_dpc(scsi_qla_host_t *ha)
 {
+	unsigned long flags;
+	spin_lock_irqsave(&ha->dpc_lock, flags);
 	if (ha->dpc_thread)
 		wake_up_process(ha->dpc_thread);
+	spin_unlock_irqrestore(&ha->dpc_lock, flags);
 }
 
 /*


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] qla2xxx: Fix dpc_thread race on the module unload
  2008-07-28 17:33 [PATCH] qla2xxx: Fix dpc_thread race on the module unload Vladislav Bolkhovitin
@ 2008-07-28 17:41 ` Andrew Vasquez
  2008-07-28 17:49   ` Vladislav Bolkhovitin
  2008-07-28 17:56 ` James Bottomley
  2008-07-28 18:07 ` Andrew Vasquez
  2 siblings, 1 reply; 22+ messages in thread
From: Andrew Vasquez @ 2008-07-28 17:41 UTC (permalink / raw)
  To: Vladislav Bolkhovitin; +Cc: linux-driver, linux-scsi, scst-devel

On Mon, 28 Jul 2008, Vladislav Bolkhovitin wrote:

> This patch fixes race on dpc_thread field of struct scsi_qla_host,
> which can lead to crash on the module unload.
> 
> This patch is against 2.6.26
> 
> Signed-off-by: Vladislav Bolkhovitin <vst@vlnb.net>

Vladislav,

Could you provide some details on the failure you encountered which
prompted this patch (backtrace/reproduction method)?

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] qla2xxx: Fix dpc_thread race on the module unload
  2008-07-28 17:41 ` Andrew Vasquez
@ 2008-07-28 17:49   ` Vladislav Bolkhovitin
  0 siblings, 0 replies; 22+ messages in thread
From: Vladislav Bolkhovitin @ 2008-07-28 17:49 UTC (permalink / raw)
  To: Andrew Vasquez; +Cc: linux-driver, linux-scsi, scst-devel

Andrew Vasquez wrote:
> On Mon, 28 Jul 2008, Vladislav Bolkhovitin wrote:
> 
>> This patch fixes race on dpc_thread field of struct scsi_qla_host,
>> which can lead to crash on the module unload.
>>
>> This patch is against 2.6.26
>>
>> Signed-off-by: Vladislav Bolkhovitin <vst@vlnb.net>
> 
> Vladislav,
> 
> Could you provide some details on the failure you encountered which
> prompted this patch (backtrace/reproduction method)?

Sure. Here is the original report from Gal Rosen:

--------------------------------------------------------------------

This issue occurs when rmmod'ing the qla target then scst modules and
then the qla initiator from a script, constantly, but it is not related
to the target specifically, it happened also when in one shell loading
the qla driver and in other shell rmmod it.

Running the script that rmmod'ing the modules cause panic. All modules
except the qla2xxx were unloaded successfully. The panic occur because
the qla2xxx_wake_dpc() is not protected well. If this function called
from one of the dpc threads or from interrupt, and at the same time some
one unloading the module, then the pointer to the task structure (the
dpc_thread) will be changed to NULL while the wake_up_process() try to
use it.

--------------------------------------------------------------------

Vlad

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] qla2xxx: Fix dpc_thread race on the module unload
  2008-07-28 17:33 [PATCH] qla2xxx: Fix dpc_thread race on the module unload Vladislav Bolkhovitin
  2008-07-28 17:41 ` Andrew Vasquez
@ 2008-07-28 17:56 ` James Bottomley
  2008-07-28 18:14   ` Vladislav Bolkhovitin
  2008-07-29  4:27   ` Christoph Hellwig
  2008-07-28 18:07 ` Andrew Vasquez
  2 siblings, 2 replies; 22+ messages in thread
From: James Bottomley @ 2008-07-28 17:56 UTC (permalink / raw)
  To: Vladislav Bolkhovitin; +Cc: linux-driver, linux-scsi, scst-devel

On Mon, 2008-07-28 at 21:33 +0400, Vladislav Bolkhovitin wrote:
> This patch fixes race on dpc_thread field of struct scsi_qla_host,
> which can lead to crash on the module unload.
> 
> This patch is against 2.6.26

I'm afraid adding a lock is almost certainly the wrong way to handle
this type of failure.  What should be done is to make sure the qla is
correctly shut down (i.e. no tasks requiring the dpc_thread can be
performed) *before* killing the thread ... it sounds like shutdown is
slightly broken in the current driver ... could you post the oops
details and we can try to work out what the problem is

James

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] qla2xxx: Fix dpc_thread race on the module unload
  2008-07-28 17:56 ` James Bottomley
@ 2008-07-28 18:14   ` Vladislav Bolkhovitin
  2008-07-29  7:30     ` Gal Rosen
                       ` (2 more replies)
  2008-07-29  4:27   ` Christoph Hellwig
  1 sibling, 3 replies; 22+ messages in thread
From: Vladislav Bolkhovitin @ 2008-07-28 18:14 UTC (permalink / raw)
  To: James Bottomley, Gal Rosen; +Cc: linux-driver, linux-scsi, scst-devel

James Bottomley wrote:
> On Mon, 2008-07-28 at 21:33 +0400, Vladislav Bolkhovitin wrote:
>> This patch fixes race on dpc_thread field of struct scsi_qla_host,
>> which can lead to crash on the module unload.
>>
>> This patch is against 2.6.26
> 
> I'm afraid adding a lock is almost certainly the wrong way to handle
> this type of failure. 

Why? It's simple and fully solves the problem. All the events, which 
left unhandled, because there is nobody to wake up by 
qla2xxx_wake_dpc(), are not relevant after the driver's shutdown.

> What should be done is to make sure the qla is
> correctly shut down (i.e. no tasks requiring the dpc_thread can be
> performed) *before* killing the thread ...

Sure, in ideal it would be the best approach. But, certainly, it would 
be a lot more complicated and error-prone.

 From other side, actually, it doesn't matter much for me how it will be 
fixed, if it's fixed.

> it sounds like shutdown is
> slightly broken in the current driver ... could you post the oops
> details and we can try to work out what the problem is

Gal, can you send the details, please?

> James
> 
> 
> 


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] qla2xxx: Fix dpc_thread race on the module unload
  2008-07-28 18:14   ` Vladislav Bolkhovitin
@ 2008-07-29  7:30     ` Gal Rosen
  2008-07-30  7:10     ` Gal Rosen
  2008-07-31  6:12     ` Gal Rosen
  2 siblings, 0 replies; 22+ messages in thread
From: Gal Rosen @ 2008-07-29  7:30 UTC (permalink / raw)
  To: Vladislav Bolkhovitin
  Cc: James Bottomley, linux-driver, linux-scsi, scst-devel

Hi All,

I used qla initiator driver based on 2.6.23-8 which was modified to
support target mode and virtual port on AL topology.
The issue does not related to the target. As I wrote in other thread it
happens also when in one shell insmod'ing the driver and in other shell
rmmod'ing it.
My Oops is pointing as Andrew stated to the
qla24xx_report_id_acquisition() routine, because after loading the
driver I created vport, then rmmod'ing the driver without deleting the
vport. In that case interrupt comes to report about the deletion of the
vport, and request the dpc thread. So for sure we need to call
qla2xxx_wake_dpc() routine instead of calling wake_up_process() routine,
where we don't check the dpc_thread pointer.
I am not sure this is enough but I can test it, because as I wrote again
in other thread, and Vlad quote here, if someone request wake up of the
dpc thread, check that it is not NULL, call to wake_up_process()
routine, and exactly at this point other task got the CPU, set the
pointer to NULL and stop the dpc thread, then we have a problem.

Below is the Oops with Vlad fix revision 470 in SCST with the addition
of the lock but without the change in qla_mbx.c.  

[67324.049497] [4525]: scst: exit_scst:1652:SCST unloaded
[67324.765692] qla2xxx 0000:06:00.1: LIP reset occured (f8ef).
[67325.295690] qla2xxx 0000:06:00.1: LIP occured (f8ef).
[67325.315689] qla2xxx 0000:06:00.1: LOOP UP detected (4 Gbps).
[67325.415700] Unable to handle kernel NULL pointer dereference at
0000000000000008 RIP: 
[67325.416599] [<ffffffff80226b1e>] task_rq_lock+0x2e/0x90
[67325.418171] PGD 22d737067 PUD 22df2d067 PMD 0 
[67325.419114] Oops: 0000 [1] SMP 
[67325.419736] CPU 0 
[67325.420162] Modules linked in: qla2xxx loop scsi_transport_fc
[67325.421333] Pid: 4382, comm: qla2xxx_12_dpc Tainted: GF
2.6.23.8-64bit-fc-nfs #10
[67325.422998] RIP: 0010:[<ffffffff80226b1e>] [<ffffffff80226b1e>]
task_rq_lock+0x2e/0x90
[67325.424736] RSP: 0018:ffff81022e14bb00 EFLAGS: 00010086
[67325.425870] RAX: 0000000000000086 RBX: 000000000000000f RCX:
0000000000000000
[67325.427379] RDX: 0000000000000000 RSI: ffff81022e14bb80 RDI:
0000000000000000
[67325.428899] RBP: ffff81022e14bb20 R08: 0000000000000000 R09:
ffff81022d160010
[67325.430406] R10: 0000000000000000 R11: ffffffff8050faf0 R12:
ffffffff807a4000
[67325.431925] R13: 0000000000000000 R14: ffff81022e14bb80 R15:
0000000000000246
[67325.433433] FS: 0000000000000000(0000) GS:ffffffff8071d000(0000)
knlGS:0000000000000000
[67325.435147] CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b
[67325.436371] CR2: 0000000000000008 CR3: 000000022dec0000 CR4:
00000000000006e0
[67325.437890] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
0000000000000000
[67325.439398] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7:
0000000000000400
[67325.440921] Process qla2xxx_12_dpc (pid: 4382, threadinfo
ffff81022e14a000, task ffff81022c6040c0)
[67325.442813] Stack: 000000000000000f ffff81022d160000
0000000000000000 ffffc20000038000
[67325.444551] ffff81022e14bbb0 ffffffff80226ddf ffff81022e14bbd0
0000000000000046
[67325.446143] 0000000000000000 ffff810008abc918 000000002c6042e0
ffff81022c6040c0
[67325.447697] Call Trace:
[67325.448252] [<ffffffff80226ddf>] try_to_wake_up+0x2f/0x370
[67325.449485] [<ffffffff80237854>] lock_timer_base+0x34/0x70
[67325.450736]
[<ffffffff8805a9cb>] :qla2xxx:qla24xx_process_response_queue+0x18b/0x1f0
[67325.452442] [<ffffffff8805bdd8>] :qla2xxx:qla24xx_intr_handler
+0x158/0x200
[67325.453976] [<ffffffff80237710>] process_timeout+0x0/0x10
[67325.455196] [<ffffffff88054b0b>] :qla2xxx:qla2x00_mailbox_command
+0x1db/0x5d0
[67325.456787] [<ffffffff80597980>] thread_return+0x0/0x5c0
[67325.457978] [<ffffffff80237854>] lock_timer_base+0x34/0x70
[67325.459214] [<ffffffff88056a17>] :qla2xxx:qla2x00_get_adapter_id
+0x87/0x130
[67325.460765] [<ffffffff80237710>] process_timeout+0x0/0x10
[67325.461983] [<ffffffff8804fa9f>] :qla2xxx:qla2x00_configure_loop
+0x33f/0x17a0
[67325.463573] [<ffffffff80237854>] lock_timer_base+0x34/0x70
[67325.464807] [<ffffffff880568ea>] :qla2xxx:qla2x00_get_retry_cnt
+0x5a/0x100
[67325.466333] [<ffffffff8805882c>] :qla2xxx:__qla2x00_marker
+0xec/0x110
[67325.467781] [<ffffffff880588b0>] :qla2xxx:qla2x00_marker+0x60/0x90
[67325.469157] [<ffffffff88051b3e>] :qla2xxx:qla2x00_abort_isp
+0x24e/0x610
[67325.470638] [<ffffffff8804c290>] :qla2xxx:qla2x00_do_dpc+0x430/0x560
[67325.472052] [<ffffffff8804be60>] :qla2xxx:qla2x00_do_dpc+0x0/0x560
[67325.473431] [<ffffffff8804be60>] :qla2xxx:qla2x00_do_dpc+0x0/0x560
[67325.474820] [<ffffffff80242e8b>] kthread+0x4b/0x80
[67325.475902] [<ffffffff8020c608>] child_rip+0xa/0x12
[67325.477005] [<ffffffff80242e40>] kthread+0x0/0x80
[67325.478069] [<ffffffff8020c5fe>] child_rip+0x0/0x12
[67325.479171] 
[67325.479486] 
[67325.479486] Code: 49 8b 45 08 4c 89 e3 8b 40 18 48 8b 04 c5 40 8c 74
80 48 03 
[67325.481393] RIP [<ffffffff80226b1e>] task_rq_lock+0x2e/0x90
[67325.482617] RSP <ffff81022e14bb00>
[67325.483356] CR2: 0000000000000008

Message from syslogd@HP1b at Tue Jul 22 02:25:48 2008 ...

Message from syslogd@HP1b at Tue Jul 22 02:25:48 2008 ...
HP1b kernel: [67325.419114] Oops: 0000 [1] SMP 
HP1b kernel: [67325.483356] CR2: 0000000000000008

Gal.

On Mon, 2008-07-28 at 22:14 +0400, Vladislav Bolkhovitin wrote:
> James Bottomley wrote:
> > On Mon, 2008-07-28 at 21:33 +0400, Vladislav Bolkhovitin wrote:
> >> This patch fixes race on dpc_thread field of struct scsi_qla_host,
> >> which can lead to crash on the module unload.
> >>
> >> This patch is against 2.6.26
> > 
> > I'm afraid adding a lock is almost certainly the wrong way to handle
> > this type of failure. 
> 
> Why? It's simple and fully solves the problem. All the events, which 
> left unhandled, because there is nobody to wake up by 
> qla2xxx_wake_dpc(), are not relevant after the driver's shutdown.
> 
> > What should be done is to make sure the qla is
> > correctly shut down (i.e. no tasks requiring the dpc_thread can be
> > performed) *before* killing the thread ...
> 
> Sure, in ideal it would be the best approach. But, certainly, it would 
> be a lot more complicated and error-prone.
> 
>  From other side, actually, it doesn't matter much for me how it will be 
> fixed, if it's fixed.
> 
> > it sounds like shutdown is
> > slightly broken in the current driver ... could you post the oops
> > details and we can try to work out what the problem is
> 
> Gal, can you send the details, please?
> 
> > James
> > 
> > 
> > 
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] qla2xxx: Fix dpc_thread race on the module unload
  2008-07-28 18:14   ` Vladislav Bolkhovitin
  2008-07-29  7:30     ` Gal Rosen
@ 2008-07-30  7:10     ` Gal Rosen
  2008-07-31  6:12     ` Gal Rosen
  2 siblings, 0 replies; 22+ messages in thread
From: Gal Rosen @ 2008-07-30  7:10 UTC (permalink / raw)
  To: Vladislav Bolkhovitin
  Cc: James Bottomley, linux-driver, linux-scsi, scst-devel

From: James Bottomley <James.Bottomley@Ha...> - 2008-07-29 15:28

On Tue, 2008-07-29 at 19:13 +0400, Vladislav Bolkhovitin wrote:
> James Bottomley wrote:
> > On Tue, 2008-07-29 at 13:32 +0400, Vladislav Bolkhovitin wrote:
> >> Nope, taking only one that hunk from this patch isn't sufficient.
> >> Around 
> >> dpc_thread there is pretty simple and classical race. You can't do
> >>
> >> if (x != NULL)
> >> y = *x;
> >>
> >> without any protection, if x can be set to NULL by another thread.
It 
> >> can happen exactly between "if" and "*x" and hence lead to a crash,
> >> correct?
> > 
> > No.
> 
> What "No"? The above unlocked "if (x != NULL) y = *x;" is always safe 
> now? ;)

No ... no as in your analysis based on the example is not correct to
conclude protection is required. We have quite a number of examples of
this within the linux kernel (the SCSI error thread would be one).

But the wake up of the SCSI error thread is also called by holding a
spinlock (but not to protect the stopping of the thread).
The difference here is that the assignment of the thread to NULL is in
the thread function, when exiting from the while loop, and not before
calling kthread_stop() routine (like in the qla).
Maybe this would be the solution.  

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] qla2xxx: Fix dpc_thread race on the module unload
  2008-07-28 18:14   ` Vladislav Bolkhovitin
  2008-07-29  7:30     ` Gal Rosen
  2008-07-30  7:10     ` Gal Rosen
@ 2008-07-31  6:12     ` Gal Rosen
  2008-07-31  9:11       ` Vladislav Bolkhovitin
  2 siblings, 1 reply; 22+ messages in thread
From: Gal Rosen @ 2008-07-31  6:12 UTC (permalink / raw)
  To: Vladislav Bolkhovitin
  Cc: James Bottomley, linux-driver, linux-scsi, scst-devel

Andrew,

Add checking of the flag online still does not answer the race that Vlad
stated before, unless you thought about another way without lock to
protect this thread.

> Nope, taking only one that hunk from this patch isn't sufficient.
> Around 
> dpc_thread there is pretty simple and classical race. You can't do
> 
> if (x != NULL)
> y = *x;
> 
> without any protection, if x can be set to NULL by another thread. It 
> can happen exactly between "if" and "*x" and hence lead to a crash,
> correct?

Gal.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] qla2xxx: Fix dpc_thread race on the module unload
  2008-07-31  6:12     ` Gal Rosen
@ 2008-07-31  9:11       ` Vladislav Bolkhovitin
  2008-07-31 16:02         ` Andrew Vasquez
  0 siblings, 1 reply; 22+ messages in thread
From: Vladislav Bolkhovitin @ 2008-07-31  9:11 UTC (permalink / raw)
  To: linux-driver; +Cc: Gal Rosen, James Bottomley, linux-scsi, scst-devel

Gal Rosen wrote:
> Andrew,
> 
> Add checking of the flag online still does not answer the race that Vlad
> stated before,

Yes, that's true.

And, since ha->flags.online set to 0 not only on shutdown, I'm afraid 
you could introduce a new set of subtle bugs, if not for the moment, but 
in the future, because with your patch it gets impossible to wake up the 
DPC thread if HA is offline.

Vlad

> unless you thought about another way without lock to
> protect this thread.
> 
>> Nope, taking only one that hunk from this patch isn't sufficient.
>> Around 
>> dpc_thread there is pretty simple and classical race. You can't do
>>
>> if (x != NULL)
>> y = *x;
>>
>> without any protection, if x can be set to NULL by another thread. It 
>> can happen exactly between "if" and "*x" and hence lead to a crash,
>> correct?
> 
> Gal.
> 


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] qla2xxx: Fix dpc_thread race on the module unload
  2008-07-31  9:11       ` Vladislav Bolkhovitin
@ 2008-07-31 16:02         ` Andrew Vasquez
  2008-07-31 17:41           ` Vladislav Bolkhovitin
  0 siblings, 1 reply; 22+ messages in thread
From: Andrew Vasquez @ 2008-07-31 16:02 UTC (permalink / raw)
  To: Vladislav Bolkhovitin
  Cc: linux-driver, Gal Rosen, James Bottomley, linux-scsi, scst-devel

On Thu, 31 Jul 2008, Vladislav Bolkhovitin wrote:

> Gal Rosen wrote:
>> Andrew,
>>
>> Add checking of the flag online still does not answer the race that Vlad
>> stated before,
>
> Yes, that's true.
>
> And, since ha->flags.online set to 0 not only on shutdown, I'm afraid you 
> could introduce a new set of subtle bugs, if not for the moment, but in the 
> future, because with your patch it gets impossible to wake up the DPC 
> thread if HA is offline.

You are both missing a subtle point, it's incidental if
qla2xxx_wake_dpc() misses a 'wake-up' due the HBA being 'offline', as
the qla2x00_timer(), woken up every second, will do the wake-up if
necessary. The code changes offered, close the window as tightly as
possible without introducing needlessly complex changes.  All this
infrastructure is legacy constucts from a time long-before work-queues
and the like.

This area is one of several where we are moving to clean-up and
modernize in our upstream offering:

* dropping the heavy DPC thread in favor of work-queues.
* using proper life-cycle and reference-handling of fcport objects
  (yes, the objects will be freed after use).
* refactoring the HA respresentations for physical and virtual ports,
  the current memcpy() physical-HA to vport-HA and slight-mods is
  error-prone and doesn't scale.
* drop the illogical single physical-HA maintains all fcports across
  N-vports.

The changes above are large (170k diffs so far), and at this point are
being run-through our testing.  The hope is to get the changes
upstream during one of the next two merge windows.

Given the infrustructure mods and our focus on that front, if there's
something small and contained you can offer above what I've proposed
we'll be interested in reviewing any patches you'd push forward.

Regards,
Andrew Vasquez

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] qla2xxx: Fix dpc_thread race on the module unload
  2008-07-31 16:02         ` Andrew Vasquez
@ 2008-07-31 17:41           ` Vladislav Bolkhovitin
  2008-07-31 17:55             ` Andrew Vasquez
  0 siblings, 1 reply; 22+ messages in thread
From: Vladislav Bolkhovitin @ 2008-07-31 17:41 UTC (permalink / raw)
  To: Andrew Vasquez
  Cc: linux-driver, Gal Rosen, James Bottomley, linux-scsi, scst-devel


Andrew Vasquez wrote:
> On Thu, 31 Jul 2008, Vladislav Bolkhovitin wrote:
> 
>> Gal Rosen wrote:
>>> Andrew,
>>>
>>> Add checking of the flag online still does not answer the race that Vlad
>>> stated before,
>> Yes, that's true.
>>
>> And, since ha->flags.online set to 0 not only on shutdown, I'm afraid you 
>> could introduce a new set of subtle bugs, if not for the moment, but in the 
>> future, because with your patch it gets impossible to wake up the DPC 
>> thread if HA is offline.
> 
> You are both missing a subtle point, it's incidental if
> qla2xxx_wake_dpc() misses a 'wake-up' due the HBA being 'offline', as
> the qla2x00_timer(), woken up every second, will do the wake-up if
> necessary. The code changes offered, close the window as tightly as
> possible without introducing needlessly complex changes.  All this
> infrastructure is legacy constucts from a time long-before work-queues
> and the like.
> 
> This area is one of several where we are moving to clean-up and
> modernize in our upstream offering:
> 
> * dropping the heavy DPC thread in favor of work-queues.
> * using proper life-cycle and reference-handling of fcport objects
>   (yes, the objects will be freed after use).
> * refactoring the HA respresentations for physical and virtual ports,
>   the current memcpy() physical-HA to vport-HA and slight-mods is
>   error-prone and doesn't scale.
> * drop the illogical single physical-HA maintains all fcports across
>   N-vports.

Nice to hear that!

> The changes above are large (170k diffs so far), and at this point are
> being run-through our testing.  The hope is to get the changes
> upstream during one of the next two merge windows.
> 
> Given the infrustructure mods and our focus on that front, if there's
> something small and contained you can offer above what I've proposed
> we'll be interested in reviewing any patches you'd push forward.

Then, I believe, my patch should go in as a temporal measure. I don't 
think we should crash users for 2 more major releases. The same is true 
for my other patch "Proposed protection of fcports field of struct 
scsi_qla_host" as well, because without it there should be no big 
problems to crash the driver via sysfs.

> Regards,
> Andrew Vasquez
> --
> To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] qla2xxx: Fix dpc_thread race on the module unload
  2008-07-31 17:41           ` Vladislav Bolkhovitin
@ 2008-07-31 17:55             ` Andrew Vasquez
  2008-08-01 12:28               ` Vladislav Bolkhovitin
  0 siblings, 1 reply; 22+ messages in thread
From: Andrew Vasquez @ 2008-07-31 17:55 UTC (permalink / raw)
  To: Vladislav Bolkhovitin
  Cc: linux-driver, Gal Rosen, James Bottomley, linux-scsi, scst-devel

On Thu, 31 Jul 2008, Vladislav Bolkhovitin wrote:

>> The changes above are large (170k diffs so far), and at this point are
>> being run-through our testing.  The hope is to get the changes
>> upstream during one of the next two merge windows.
>>
>> Given the infrustructure mods and our focus on that front, if there's
>> something small and contained you can offer above what I've proposed
>> we'll be interested in reviewing any patches you'd push forward.
>
> Then, I believe, my patch should go in as a temporal measure. I don't think 
> we should crash users for 2 more major releases.

So the 'online' check concerns you?  So, add an 'unloading' flag, set
it on remove_one(), save a copy of dpc_thread at qla2xxx_wake_dpc(),
then wake_up() saved value if not 'unloading'.  Of course the direct
wake_up() in request_acqusition should be modded to call
qla2xxx_wake_dpc().  We're not talking about some huge window here.

> The same is true for my 
> other patch "Proposed protection of fcports field of struct scsi_qla_host" 
> as well, because without it there should be no big problems to crash the 
> driver via sysfs.

We're still looking through your patches...  So how exactly would it
be 'be no big problems to crash the driver via sysfs.'???

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] qla2xxx: Fix dpc_thread race on the module unload
  2008-07-31 17:55             ` Andrew Vasquez
@ 2008-08-01 12:28               ` Vladislav Bolkhovitin
  0 siblings, 0 replies; 22+ messages in thread
From: Vladislav Bolkhovitin @ 2008-08-01 12:28 UTC (permalink / raw)
  To: Andrew Vasquez
  Cc: linux-driver, Gal Rosen, James Bottomley, linux-scsi, scst-devel

Andrew Vasquez wrote:
> On Thu, 31 Jul 2008, Vladislav Bolkhovitin wrote:
> 
>>> The changes above are large (170k diffs so far), and at this point are
>>> being run-through our testing.  The hope is to get the changes
>>> upstream during one of the next two merge windows.
>>>
>>> Given the infrustructure mods and our focus on that front, if there's
>>> something small and contained you can offer above what I've proposed
>>> we'll be interested in reviewing any patches you'd push forward.
>> Then, I believe, my patch should go in as a temporal measure. I don't think 
>> we should crash users for 2 more major releases.
> 
> So the 'online' check concerns you?  So, add an 'unloading' flag, set
> it on remove_one(), save a copy of dpc_thread at qla2xxx_wake_dpc(),
> then wake_up() saved value if not 'unloading'.  Of course the direct
> wake_up() in request_acqusition should be modded to call
> qla2xxx_wake_dpc().  We're not talking about some huge window here.

My main concern is not about the 'online' flag, but about that your 
patch doesn't fix the race, which it's intended to fix, because:

1. It doesn't handle possibility of CPU to reorder commands. Hence, it 
could be possible that other CPUs can see 'online' flag set to 0 *after* 
dpc_thread set to NULL, despite in the code "ha->flags.online = 0" 
precede "ha->dpc_thread = NULL". To address that the corresponding 
memory barriers around ha->flags.online assignment and reading are needed.

2. More importantly, it doesn't handle possibility for task_struct, to 
which dpc_thread field refers, to be destroyed exactly between lines

if (ha->flags.online && t)
	wake_up_process(t);

or inside wake_up_process(). Hence, already freed and possibly reused 
data will be accessed by wake_up_process() with all corresponding bad 
consequences.

>> The same is true for my 
>> other patch "Proposed protection of fcports field of struct scsi_qla_host" 
>> as well, because without it there should be no big problems to crash the 
>> driver via sysfs.
> 
> We're still looking through your patches...  So how exactly would it
> be 'be no big problems to crash the driver via sysfs.'???

Example scenario: one or many applications read in a loop any sysfs 
entry, which go over fcports list, and at the same time DPC thread adds 
to it new entries. If CPU can reorder commands (most modern CPUs can) it 
is possible that list_for_each_entry() will access partially inserted 
entry => NULL pointer dereference.

To see better what I mean look at the difference between list_add() and 
list_add_rcu() as well as between list_for_each_entry() and 
list_for_each_entry_rcu().

Vlad

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] qla2xxx: Fix dpc_thread race on the module unload
  2008-07-28 17:56 ` James Bottomley
  2008-07-28 18:14   ` Vladislav Bolkhovitin
@ 2008-07-29  4:27   ` Christoph Hellwig
  2008-07-29  9:32     ` Vladislav Bolkhovitin
  1 sibling, 1 reply; 22+ messages in thread
From: Christoph Hellwig @ 2008-07-29  4:27 UTC (permalink / raw)
  To: James Bottomley
  Cc: Vladislav Bolkhovitin, linux-driver, linux-scsi, scst-devel

On Mon, Jul 28, 2008 at 12:56:16PM -0500, James Bottomley wrote:
> On Mon, 2008-07-28 at 21:33 +0400, Vladislav Bolkhovitin wrote:
> > This patch fixes race on dpc_thread field of struct scsi_qla_host,
> > which can lead to crash on the module unload.
> > 
> > This patch is against 2.6.26
> 
> I'm afraid adding a lock is almost certainly the wrong way to handle
> this type of failure.  What should be done is to make sure the qla is
> correctly shut down (i.e. no tasks requiring the dpc_thread can be
> performed) *before* killing the thread ... it sounds like shutdown is
> slightly broken in the current driver ... could you post the oops
> details and we can try to work out what the problem is

I'm not even sure it's the current driver as he mentions that he using
some out of tree target driver which might be the problem.

Can you reproduce it with a stock 2.6.27-rc tree?


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] qla2xxx: Fix dpc_thread race on the module unload
  2008-07-29  4:27   ` Christoph Hellwig
@ 2008-07-29  9:32     ` Vladislav Bolkhovitin
  0 siblings, 0 replies; 22+ messages in thread
From: Vladislav Bolkhovitin @ 2008-07-29  9:32 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: James Bottomley, linux-driver, linux-scsi, scst-devel

Christoph Hellwig wrote:
> On Mon, Jul 28, 2008 at 12:56:16PM -0500, James Bottomley wrote:
>> On Mon, 2008-07-28 at 21:33 +0400, Vladislav Bolkhovitin wrote:
>>> This patch fixes race on dpc_thread field of struct scsi_qla_host,
>>> which can lead to crash on the module unload.
>>>
>>> This patch is against 2.6.26
>> I'm afraid adding a lock is almost certainly the wrong way to handle
>> this type of failure.  What should be done is to make sure the qla is
>> correctly shut down (i.e. no tasks requiring the dpc_thread can be
>> performed) *before* killing the thread ... it sounds like shutdown is
>> slightly broken in the current driver ... could you post the oops
>> details and we can try to work out what the problem is
> 
> I'm not even sure it's the current driver as he mentions that he using
> some out of tree target driver which might be the problem.
> 
> Can you reproduce it with a stock 2.6.27-rc tree?

The race is still in 2.6.27-rc1.

Vlad



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] qla2xxx: Fix dpc_thread race on the module unload
  2008-07-28 17:33 [PATCH] qla2xxx: Fix dpc_thread race on the module unload Vladislav Bolkhovitin
  2008-07-28 17:41 ` Andrew Vasquez
  2008-07-28 17:56 ` James Bottomley
@ 2008-07-28 18:07 ` Andrew Vasquez
  2008-07-29  9:32   ` Vladislav Bolkhovitin
  2 siblings, 1 reply; 22+ messages in thread
From: Andrew Vasquez @ 2008-07-28 18:07 UTC (permalink / raw)
  To: Vladislav Bolkhovitin; +Cc: linux-driver, linux-scsi, scst-devel

On Mon, 28 Jul 2008, Vladislav Bolkhovitin wrote:

> This patch fixes race on dpc_thread field of struct scsi_qla_host,
> which can lead to crash on the module unload.
> 
> This patch is against 2.6.26
> 
> Signed-off-by: Vladislav Bolkhovitin <vst@vlnb.net>
> 
>  qla_def.h |    1 +
>  qla_mbx.c |    2 +-
>  qla_os.c  |   36 ++++++++++++++++++++++++++----------
>  3 files changed, 28 insertions(+), 11 deletions(-)
> 
> diff -upr linux-2.6.26/drivers/scsi/qla2xxx/qla_def.h linux-2.6.26/drivers/scsi/qla2xxx/qla_def.h
> --- linux-2.6.26/drivers/scsi/qla2xxx/qla_def.h	2008-07-14 01:51:29.000000000 +0400
> +++ linux-2.6.26/drivers/scsi/qla2xxx/qla_def.h	2008-07-24 10:07:39.000000000 +0400
> @@ -2425,6 +2436,7 @@ typedef struct scsi_qla_host {
>  	void			*sfp_data;
>  	dma_addr_t		sfp_data_dma;
>  
> +	spinlock_t		dpc_lock;

As James mentioned in another email, there's something else going on
with qla2xxx shutdown process which should be addressed...

>  	struct task_struct	*dpc_thread;
>  	uint8_t dpc_active;                  /* DPC routine is active */
>  
> diff -upr linux-2.6.26/drivers/scsi/qla2xxx/qla_mbx.c linux-2.6.26/drivers/scsi/qla2xxx/qla_mbx.c
> --- linux-2.6.26/drivers/scsi/qla2xxx/qla_mbx.c	2008-07-14 01:51:29.000000000 +0400
> +++ linux-2.6.26/drivers/scsi/qla2xxx/qla_mbx.c	2008-07-23 19:27:20.000000000 +0400
> @@ -2683,7 +2687,7 @@ qla24xx_report_id_acquisition(scsi_qla_h
>  		set_bit(VP_IDX_ACQUIRED, &vha->vp_flags);
>  		set_bit(VP_DPC_NEEDED, &ha->dpc_flags);
>  
> -		wake_up_process(ha->dpc_thread);
> +		qla2xxx_wake_dpc(ha);

Ok, your change looks correct here...  This is defintely the correct
way the qla24xx_report_id_acquisition() routine should be triggering
the DPC thread.

I'm guessing the backtrace in "Gal Rosen's" report pointed to
qla24xx_report_id_acquisition()?  If, so, I'm not entirely sure we
need anything more to 'protect' tear-down...

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] qla2xxx: Fix dpc_thread race on the module unload
  2008-07-28 18:07 ` Andrew Vasquez
@ 2008-07-29  9:32   ` Vladislav Bolkhovitin
  2008-07-29 14:40     ` James Bottomley
  2008-07-30 10:30     ` Andrew Vasquez
  0 siblings, 2 replies; 22+ messages in thread
From: Vladislav Bolkhovitin @ 2008-07-29  9:32 UTC (permalink / raw)
  To: Andrew Vasquez; +Cc: linux-driver, linux-scsi, scst-devel

Andrew Vasquez wrote:
> On Mon, 28 Jul 2008, Vladislav Bolkhovitin wrote:
> 
>> This patch fixes race on dpc_thread field of struct scsi_qla_host,
>> which can lead to crash on the module unload.
>>
>> This patch is against 2.6.26
>>
>> Signed-off-by: Vladislav Bolkhovitin <vst@vlnb.net>
>>
>>  qla_def.h |    1 +
>>  qla_mbx.c |    2 +-
>>  qla_os.c  |   36 ++++++++++++++++++++++++++----------
>>  3 files changed, 28 insertions(+), 11 deletions(-)
>>
>> diff -upr linux-2.6.26/drivers/scsi/qla2xxx/qla_def.h linux-2.6.26/drivers/scsi/qla2xxx/qla_def.h
>> --- linux-2.6.26/drivers/scsi/qla2xxx/qla_def.h	2008-07-14 01:51:29.000000000 +0400
>> +++ linux-2.6.26/drivers/scsi/qla2xxx/qla_def.h	2008-07-24 10:07:39.000000000 +0400
>> @@ -2425,6 +2436,7 @@ typedef struct scsi_qla_host {
>>  	void			*sfp_data;
>>  	dma_addr_t		sfp_data_dma;
>>  
>> +	spinlock_t		dpc_lock;
> 
> As James mentioned in another email, there's something else going on
> with qla2xxx shutdown process which should be addressed...
> 
>>  	struct task_struct	*dpc_thread;
>>  	uint8_t dpc_active;                  /* DPC routine is active */
>>  
>> diff -upr linux-2.6.26/drivers/scsi/qla2xxx/qla_mbx.c linux-2.6.26/drivers/scsi/qla2xxx/qla_mbx.c
>> --- linux-2.6.26/drivers/scsi/qla2xxx/qla_mbx.c	2008-07-14 01:51:29.000000000 +0400
>> +++ linux-2.6.26/drivers/scsi/qla2xxx/qla_mbx.c	2008-07-23 19:27:20.000000000 +0400
>> @@ -2683,7 +2687,7 @@ qla24xx_report_id_acquisition(scsi_qla_h
>>  		set_bit(VP_IDX_ACQUIRED, &vha->vp_flags);
>>  		set_bit(VP_DPC_NEEDED, &ha->dpc_flags);
>>  
>> -		wake_up_process(ha->dpc_thread);
>> +		qla2xxx_wake_dpc(ha);
> 
> Ok, your change looks correct here...  This is defintely the correct
> way the qla24xx_report_id_acquisition() routine should be triggering
> the DPC thread.
> 
> I'm guessing the backtrace in "Gal Rosen's" report pointed to
> qla24xx_report_id_acquisition()?  If, so, I'm not entirely sure we
> need anything more to 'protect' tear-down...

Nope, taking only one that hunk from this patch isn't sufficient. Around 
dpc_thread there is pretty simple and classical race. You can't do

if (x != NULL)
	y = *x;

without any protection, if x can be set to NULL by another thread. It 
can happen exactly between "if" and "*x" and hence lead to a crash, correct?

What this patch does is adding such protection.

Vlad

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] qla2xxx: Fix dpc_thread race on the module unload
  2008-07-29  9:32   ` Vladislav Bolkhovitin
@ 2008-07-29 14:40     ` James Bottomley
  2008-07-29 15:13       ` Vladislav Bolkhovitin
  2008-07-30 10:30     ` Andrew Vasquez
  1 sibling, 1 reply; 22+ messages in thread
From: James Bottomley @ 2008-07-29 14:40 UTC (permalink / raw)
  To: Vladislav Bolkhovitin
  Cc: Andrew Vasquez, linux-driver, linux-scsi, scst-devel

On Tue, 2008-07-29 at 13:32 +0400, Vladislav Bolkhovitin wrote:
> Nope, taking only one that hunk from this patch isn't sufficient.
> Around 
> dpc_thread there is pretty simple and classical race. You can't do
> 
> if (x != NULL)
>         y = *x;
> 
> without any protection, if x can be set to NULL by another thread. It 
> can happen exactly between "if" and "*x" and hence lead to a crash,
> correct?

No.

Today we go to reasonable lengths to avoid additional locking.
Spinlocks are nasty in most boxes because of the potential for
introducing hot points and cacheline bouncing.

The first question you ask is how hot is the variable ... as in how
constantly changing is it?  This one is set at start of day and cleared
when the thread is killed in shutdown.  Basically there's a single not
NULL -> NULL transition.  Once NULL, it never goes back to not NULL.

The point is that in this particular circumstance, the lock is overkill.
You can either use other indicators to show that the driver is being
removed (and the thread is dying) or you can simply use some type of
refcounting to ensure the thread isn't killed until it's no longer
needed.

Even if it were a constantly changing variable, and therefore a more
ideal candidate for locking, we might still look into atomics and bitops
before adding a lock.

James

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] qla2xxx: Fix dpc_thread race on the module unload
  2008-07-29 14:40     ` James Bottomley
@ 2008-07-29 15:13       ` Vladislav Bolkhovitin
  2008-07-29 15:28         ` James Bottomley
  0 siblings, 1 reply; 22+ messages in thread
From: Vladislav Bolkhovitin @ 2008-07-29 15:13 UTC (permalink / raw)
  To: James Bottomley
  Cc: Vladislav Bolkhovitin, Andrew Vasquez, linux-driver, linux-scsi,
	scst-devel

James Bottomley wrote:
> On Tue, 2008-07-29 at 13:32 +0400, Vladislav Bolkhovitin wrote:
>> Nope, taking only one that hunk from this patch isn't sufficient.
>> Around 
>> dpc_thread there is pretty simple and classical race. You can't do
>>
>> if (x != NULL)
>>         y = *x;
>>
>> without any protection, if x can be set to NULL by another thread. It 
>> can happen exactly between "if" and "*x" and hence lead to a crash,
>> correct?
> 
> No.

What "No"? The above unlocked "if (x != NULL) y = *x;" is always safe 
now? ;)

> Today we go to reasonable lengths to avoid additional locking.
> Spinlocks are nasty in most boxes because of the potential for
> introducing hot points and cacheline bouncing.
> 
> The first question you ask is how hot is the variable ... as in how
> constantly changing is it?  This one is set at start of day and cleared
> when the thread is killed in shutdown.  Basically there's a single not
> NULL -> NULL transition.  Once NULL, it never goes back to not NULL.
> 
> The point is that in this particular circumstance, the lock is overkill.
> You can either use other indicators to show that the driver is being
> removed (and the thread is dying) or you can simply use some type of
> refcounting to ensure the thread isn't killed until it's no longer
> needed.
> 
> Even if it were a constantly changing variable, and therefore a more
> ideal candidate for locking, we might still look into atomics and bitops
> before adding a lock.

Sure, all you wrote above is correct, but in this *particular* case what 
you propose is an overkill, not use of the spinlock. Waking up DPC 
thread isn't on the hot path by any mean, it's quite rare event. So, all 
those lockfree techniques comparing to a simple single lock would 
introduce only additional complicated code without any measurable gain.

But, as I already wrote, I don't care much how this problem will be 
fixed. I care only that it should be fixed. So, my point was only that 
use from my patch only one hunk

-		wake_up_process(ha->dpc_thread);
+		qla2xxx_wake_dpc(ha);

is insufficient to fix the problem.

Vlad


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] qla2xxx: Fix dpc_thread race on the module unload
  2008-07-29 15:13       ` Vladislav Bolkhovitin
@ 2008-07-29 15:28         ` James Bottomley
  2008-07-29 15:36           ` Vladislav Bolkhovitin
  0 siblings, 1 reply; 22+ messages in thread
From: James Bottomley @ 2008-07-29 15:28 UTC (permalink / raw)
  To: Vladislav Bolkhovitin
  Cc: Andrew Vasquez, linux-driver, linux-scsi, scst-devel

On Tue, 2008-07-29 at 19:13 +0400, Vladislav Bolkhovitin wrote:
> James Bottomley wrote:
> > On Tue, 2008-07-29 at 13:32 +0400, Vladislav Bolkhovitin wrote:
> >> Nope, taking only one that hunk from this patch isn't sufficient.
> >> Around 
> >> dpc_thread there is pretty simple and classical race. You can't do
> >>
> >> if (x != NULL)
> >>         y = *x;
> >>
> >> without any protection, if x can be set to NULL by another thread. It 
> >> can happen exactly between "if" and "*x" and hence lead to a crash,
> >> correct?
> > 
> > No.
> 
> What "No"? The above unlocked "if (x != NULL) y = *x;" is always safe 
> now? ;)

No ... no as in your analysis based on the example is not correct to
conclude protection is required.  We have quite a number of examples of
this within the linux kernel (the SCSI error thread would be one).

> > Today we go to reasonable lengths to avoid additional locking.
> > Spinlocks are nasty in most boxes because of the potential for
> > introducing hot points and cacheline bouncing.
> > 
> > The first question you ask is how hot is the variable ... as in how
> > constantly changing is it?  This one is set at start of day and cleared
> > when the thread is killed in shutdown.  Basically there's a single not
> > NULL -> NULL transition.  Once NULL, it never goes back to not NULL.
> > 
> > The point is that in this particular circumstance, the lock is overkill.
> > You can either use other indicators to show that the driver is being
> > removed (and the thread is dying) or you can simply use some type of
> > refcounting to ensure the thread isn't killed until it's no longer
> > needed.
> > 
> > Even if it were a constantly changing variable, and therefore a more
> > ideal candidate for locking, we might still look into atomics and bitops
> > before adding a lock.
> 
> Sure, all you wrote above is correct, but in this *particular* case what 
> you propose is an overkill, not use of the spinlock. Waking up DPC 
> thread isn't on the hot path by any mean, it's quite rare event. So, all 
> those lockfree techniques comparing to a simple single lock would 
> introduce only additional complicated code without any measurable gain.

It's important to think about this kind of thing even in cases where it
doesn't necessarily matter that much just so it becomes standard
practice in cases where it does.

> But, as I already wrote, I don't care much how this problem will be 
> fixed. I care only that it should be fixed. So, my point was only that 
> use from my patch only one hunk
> 
> -		wake_up_process(ha->dpc_thread);
> +		qla2xxx_wake_dpc(ha);
> 
> is insufficient to fix the problem.

Yes, I agree with that ... I just want to see a better fix.

James



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] qla2xxx: Fix dpc_thread race on the module unload
  2008-07-29 15:28         ` James Bottomley
@ 2008-07-29 15:36           ` Vladislav Bolkhovitin
  0 siblings, 0 replies; 22+ messages in thread
From: Vladislav Bolkhovitin @ 2008-07-29 15:36 UTC (permalink / raw)
  To: James Bottomley; +Cc: Andrew Vasquez, linux-driver, linux-scsi, scst-devel


James Bottomley wrote:
> On Tue, 2008-07-29 at 19:13 +0400, Vladislav Bolkhovitin wrote:
>> James Bottomley wrote:
>>> On Tue, 2008-07-29 at 13:32 +0400, Vladislav Bolkhovitin wrote:
>>>> Nope, taking only one that hunk from this patch isn't sufficient.
>>>> Around 
>>>> dpc_thread there is pretty simple and classical race. You can't do
>>>>
>>>> if (x != NULL)
>>>>         y = *x;
>>>>
>>>> without any protection, if x can be set to NULL by another thread. It 
>>>> can happen exactly between "if" and "*x" and hence lead to a crash,
>>>> correct?
>>> No.
>> What "No"? The above unlocked "if (x != NULL) y = *x;" is always safe 
>> now? ;)
> 
> No ... no as in your analysis based on the example is not correct to
> conclude protection is required.  We have quite a number of examples of
> this within the linux kernel (the SCSI error thread would be one).
> 
>>> Today we go to reasonable lengths to avoid additional locking.
>>> Spinlocks are nasty in most boxes because of the potential for
>>> introducing hot points and cacheline bouncing.
>>>
>>> The first question you ask is how hot is the variable ... as in how
>>> constantly changing is it?  This one is set at start of day and cleared
>>> when the thread is killed in shutdown.  Basically there's a single not
>>> NULL -> NULL transition.  Once NULL, it never goes back to not NULL.
>>>
>>> The point is that in this particular circumstance, the lock is overkill.
>>> You can either use other indicators to show that the driver is being
>>> removed (and the thread is dying) or you can simply use some type of
>>> refcounting to ensure the thread isn't killed until it's no longer
>>> needed.
>>>
>>> Even if it were a constantly changing variable, and therefore a more
>>> ideal candidate for locking, we might still look into atomics and bitops
>>> before adding a lock.
>> Sure, all you wrote above is correct, but in this *particular* case what 
>> you propose is an overkill, not use of the spinlock. Waking up DPC 
>> thread isn't on the hot path by any mean, it's quite rare event. So, all 
>> those lockfree techniques comparing to a simple single lock would 
>> introduce only additional complicated code without any measurable gain.
> 
> It's important to think about this kind of thing even in cases where it
> doesn't necessarily matter that much just so it becomes standard
> practice in cases where it does.

Agree. For example, in SCST core translation from LUN to internal device 
is done in a lockless manner. As well as commands serialization.

>> But, as I already wrote, I don't care much how this problem will be 
>> fixed. I care only that it should be fixed. So, my point was only that 
>> use from my patch only one hunk
>>
>> -		wake_up_process(ha->dpc_thread);
>> +		qla2xxx_wake_dpc(ha);
>>
>> is insufficient to fix the problem.
> 
> Yes, I agree with that ... I just want to see a better fix.

Let's see then what Andrew will say.

> James
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] qla2xxx: Fix dpc_thread race on the module unload
  2008-07-29  9:32   ` Vladislav Bolkhovitin
  2008-07-29 14:40     ` James Bottomley
@ 2008-07-30 10:30     ` Andrew Vasquez
  1 sibling, 0 replies; 22+ messages in thread
From: Andrew Vasquez @ 2008-07-30 10:30 UTC (permalink / raw)
  To: Vladislav Bolkhovitin; +Cc: linux-driver, linux-scsi, scst-devel

On Tue, 29 Jul 2008, Vladislav Bolkhovitin wrote:

>> Ok, your change looks correct here...  This is defintely the correct
>> way the qla24xx_report_id_acquisition() routine should be triggering
>> the DPC thread.
>>
>> I'm guessing the backtrace in "Gal Rosen's" report pointed to
>> qla24xx_report_id_acquisition()?  If, so, I'm not entirely sure we
>> need anything more to 'protect' tear-down...
>
> Nope, taking only one that hunk from this patch isn't sufficient. Around 
> dpc_thread there is pretty simple and classical race. You can't do
>
> if (x != NULL)
> 	y = *x;
>
> without any protection, if x can be set to NULL by another thread. It can 
> happen exactly between "if" and "*x" and hence lead to a crash, correct?
>
> What this patch does is adding such protection.

So there's a couple of ways we can address this:

* Drop the qla2xxx_wake_dpc() calls from all paths and let the timer
  handle triggering of the DPC routine...

* Simply extend the qla2xxx_wake_dpc() function to be a bit more
  careful during wakeup. In this case, check that ha->flags.online is
  set before waking-up the process.

The first option seems a bit heavy-handed, the second, a more
reasonable option without introducing any additional locks or the
like.

I'll queue-up the following to my queue, if there's no major
objections...

--
av

---

diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c
index bc90d6b..813bc77 100644
--- a/drivers/scsi/qla2xxx/qla_mbx.c
+++ b/drivers/scsi/qla2xxx/qla_mbx.c
@@ -2686,7 +2686,7 @@ qla24xx_report_id_acquisition(scsi_qla_host_t *ha,
 		set_bit(VP_IDX_ACQUIRED, &vha->vp_flags);
 		set_bit(VP_DPC_NEEDED, &ha->dpc_flags);
 
-		wake_up_process(ha->dpc_thread);
+		qla2xxx_wake_dpc(ha);
 	}
 }
 
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index 7c8af7e..12b127a 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -2451,8 +2451,10 @@ qla2x00_do_dpc(void *data)
 void
 qla2xxx_wake_dpc(scsi_qla_host_t *ha)
 {
-	if (ha->dpc_thread)
-		wake_up_process(ha->dpc_thread);
+	struct task_struct *t = ha->dpc_thread;
+
+	if (ha->flags.online && t)
+		wake_up_process(t);
 }
 
 /*

^ permalink raw reply related	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2008-08-01 12:28 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-07-28 17:33 [PATCH] qla2xxx: Fix dpc_thread race on the module unload Vladislav Bolkhovitin
2008-07-28 17:41 ` Andrew Vasquez
2008-07-28 17:49   ` Vladislav Bolkhovitin
2008-07-28 17:56 ` James Bottomley
2008-07-28 18:14   ` Vladislav Bolkhovitin
2008-07-29  7:30     ` Gal Rosen
2008-07-30  7:10     ` Gal Rosen
2008-07-31  6:12     ` Gal Rosen
2008-07-31  9:11       ` Vladislav Bolkhovitin
2008-07-31 16:02         ` Andrew Vasquez
2008-07-31 17:41           ` Vladislav Bolkhovitin
2008-07-31 17:55             ` Andrew Vasquez
2008-08-01 12:28               ` Vladislav Bolkhovitin
2008-07-29  4:27   ` Christoph Hellwig
2008-07-29  9:32     ` Vladislav Bolkhovitin
2008-07-28 18:07 ` Andrew Vasquez
2008-07-29  9:32   ` Vladislav Bolkhovitin
2008-07-29 14:40     ` James Bottomley
2008-07-29 15:13       ` Vladislav Bolkhovitin
2008-07-29 15:28         ` James Bottomley
2008-07-29 15:36           ` Vladislav Bolkhovitin
2008-07-30 10:30     ` Andrew Vasquez

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox