[PATCH 2/2] sg: fix races with ioctl(SG

public inbox for linux-scsi@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH 2/2] sg: fix races with ioctl(SG_IO)
@ 2009-01-05 19:11 Tony Battersby
  0 siblings, 0 replies; 2+ messages in thread
From: Tony Battersby @ 2009-01-05 19:11 UTC (permalink / raw)
  To: Douglas Gilbert, FUJITA Tomonori, James.Bottomley
  Cc: Christoph Hellwig, linux-scsi

sg_io_owned needs to be set before the command is sent to the midlevel;
otherwise, a quickly-completing command may cause a different CPU
to see "srp->done == 1 && !srp->sg_io_owned", which would lead to
incorrect behavior.

Check srp->done and set srp->orphan while holding rq_list_lock to
prevent races with sg_rq_end_io().

Signed-off-by: Tony Battersby <tonyb@cybernetics.com>
---
 sg.c |   21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)
--- linux-2.6.28/drivers/scsi/sg.c.orig	2009-01-05 11:08:09.000000000 -0500
+++ linux-2.6.28/drivers/scsi/sg.c	2009-01-05 11:10:46.000000000 -0500
@@ -197,7 +197,7 @@ static ssize_t sg_new_read(Sg_fd * sfp, 
 			   Sg_request * srp);
 static ssize_t sg_new_write(Sg_fd *sfp, struct file *file,
 			const char __user *buf, size_t count, int blocking,
-			int read_only, Sg_request **o_srp);
+			int read_only, int sg_io_owned, Sg_request **o_srp);
 static int sg_common_write(Sg_fd * sfp, Sg_request * srp,
 			   unsigned char *cmnd, int timeout, int blocking);
 static int sg_read_oxfer(Sg_request * srp, char __user *outp, int num_read_xfer);
@@ -607,7 +607,8 @@ sg_write(struct file *filp, const char _
 		return -EFAULT;
 	blocking = !(filp->f_flags & O_NONBLOCK);
 	if (old_hdr.reply_len < 0)
-		return sg_new_write(sfp, filp, buf, count, blocking, 0, NULL);
+		return sg_new_write(sfp, filp, buf, count,
+				    blocking, 0, 0, NULL);
 	if (count < (SZ_SG_HEADER + 6))
 		return -EIO;	/* The minimum scsi command length is 6 bytes. */
 
@@ -688,7 +689,7 @@ sg_write(struct file *filp, const char _
 
 static ssize_t
 sg_new_write(Sg_fd *sfp, struct file *file, const char __user *buf,
-		 size_t count, int blocking, int read_only,
+		 size_t count, int blocking, int read_only, int sg_io_owned,
 		 Sg_request **o_srp)
 {
 	int k;
@@ -708,6 +709,7 @@ sg_new_write(Sg_fd *sfp, struct file *fi
 		SCSI_LOG_TIMEOUT(1, printk("sg_new_write: queue full\n"));
 		return -EDOM;
 	}
+	srp->sg_io_owned = sg_io_owned;
 	hp = &srp->header;
 	if (__copy_from_user(hp, buf, SZ_SG_IO_HDR)) {
 		sg_remove_request(sfp, srp);
@@ -854,10 +856,9 @@ sg_ioctl(struct inode *inode, struct fil
 				return -EFAULT;
 			result =
 			    sg_new_write(sfp, filp, p, SZ_SG_IO_HDR,
-					 blocking, read_only, &srp);
+					 blocking, read_only, 1, &srp);
 			if (result < 0)
 				return result;
-			srp->sg_io_owned = 1;
 			while (1) {
 				result = 0;	/* following macro to beat race condition */
 				__wait_event_interruptible(sfp->read_wait,
@@ -867,14 +868,16 @@ sg_ioctl(struct inode *inode, struct fil
 					return -ENODEV;
 				if (sfp->closed)
 					return 0;	/* request packet dropped already */
-				if (0 == result)
+				write_lock_irq(&sfp->rq_list_lock);
+				if (srp->done) {
+					srp->done = 2;
+					write_unlock_irq(&sfp->rq_list_lock);
 					break;
+				}
 				srp->orphan = 1;
+				write_unlock_irq(&sfp->rq_list_lock);
 				return result;	/* -ERESTARTSYS because signal hit process */
 			}
-			write_lock_irqsave(&sfp->rq_list_lock, iflags);
-			srp->done = 2;
-			write_unlock_irqrestore(&sfp->rq_list_lock, iflags);
 			result = sg_new_read(sfp, p, SZ_SG_IO_HDR, srp);
 			return (result < 0) ? result : 0;
 		}



^ permalink raw reply	[flat|nested] 2+ messages in thread

* [PATCH 0/2] sg: fix races during device removal (v2)
@ 2009-01-05 19:07 Tony Battersby
  2009-01-10 17:26 ` FUJITA Tomonori
  0 siblings, 1 reply; 2+ messages in thread
From: Tony Battersby @ 2009-01-05 19:07 UTC (permalink / raw)
  To: Douglas Gilbert, FUJITA Tomonori, James.Bottomley
  Cc: Christoph Hellwig, linux-scsi

This is version 2 of my patch, this time using a kref instead of int.
There are also a lot of other changes since the first version since I
found even more bugs both through testing and source code analysis.
I have split the patch up into two parts: the first patch fixes
races between open, close, device removal, and command completion.
The second patch fixes some races I spotted in ioctl(SG_IO).

Below are a list of test cases fixed by the patch.

----------

test #1

open /dev/sgX
send a command that takes a long time (e.g. any tape drive seek
command)
before command completes, echo 1 >
/sys/class/scsi_generic/sgX/device/delete

without patch:
oops

with patch:
test program gets ENODEV immediately
keventd sleeps until the cmd is complete
this is suboptimal since it starves other users of keventd while
waiting for the command to complete, but it is better than an oops

----------

test #2

open /dev/sgX
send a command that takes a long time (e.g. any tape drive seek
command) without waiting for it to complete
close fd
before command completes, echo 1 >
/sys/class/scsi_generic/sgX/device/delete

without patch:
oops when the command does complete (sg_rq_end_io() bad pointer deref)

with patch:
keventd sleeps until the cmd is complete
this is suboptimal since it starves other users of keventd while
waiting for the command to complete, but it is better than an oops

----------

test #3

open /dev/sgX
send a command that takes a long time (e.g. any tape drive seek
command) without waiting for it to complete
close fd
rmmod sg

without patch:
rmmod succeeds without waiting for the command to complete
oops when the command does complete (sg_rq_end_io() callback no
longer exists)

with patch:
sg module usage count does not drop to 0 until the command completes,
so cannot rmmod

----------

test #4

open /dev/sgX
loop: send commands, check results
unplug SAS cable
mptsas automatically removes the device and fails active commands
the test program detects the failed commands and closes its fds at
the same time
this results in the following sequence:
sg_remove() enter
sg_release() enter
sg_release() exit
sg_remove() exit

without patch:
oops

with patch:
ok

----------

test #5

open /dev/sgX
loop: send commands, check results
unplug SAS cable
mptsas automatically removes the device and fails active commands
the test program detects the failed commands, but sleeps for a second
before closing its fds
this results in the following sequence:
sg_remove() enter
sg_remove() exit
sg_release() enter
sg_release() exit

without patch:
oops

with patch:
ok

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH 0/2] sg: fix races during device removal (v2)
  2009-01-05 19:07 [PATCH 0/2] sg: fix races during device removal (v2) Tony Battersby
@ 2009-01-10 17:26 ` FUJITA Tomonori
  2009-01-14 20:31   ` Tony Battersby
  0 siblings, 1 reply; 2+ messages in thread
From: FUJITA Tomonori @ 2009-01-10 17:26 UTC (permalink / raw)
  To: tonyb; +Cc: dgilbert, fujita.tomonori, James.Bottomley, hch, linux-scsi

On Mon, 05 Jan 2009 14:07:19 -0500
Tony Battersby <tonyb@cybernetics.com> wrote:

> This is version 2 of my patch, this time using a kref instead of int.
> There are also a lot of other changes since the first version since I
> found even more bugs both through testing and source code analysis.
> I have split the patch up into two parts: the first patch fixes
> races between open, close, device removal, and command completion.
> The second patch fixes some races I spotted in ioctl(SG_IO).
> 
> Below are a list of test cases fixed by the patch.

Thanks for the great work,


> ----------
> 
> test #1
> 
> open /dev/sgX
> send a command that takes a long time (e.g. any tape drive seek
> command)
> before command completes, echo 1 >
> /sys/class/scsi_generic/sgX/device/delete
> 
> without patch:
> oops
> 
> with patch:
> test program gets ENODEV immediately
> keventd sleeps until the cmd is complete
> this is suboptimal since it starves other users of keventd while
> waiting for the command to complete, but it is better than an oops

As you said, we can do better. It's not correct to make
class_interface's remove_dev hook, sg_remove wait for the completion
of all the commands.


> test #2
> 
> open /dev/sgX
> send a command that takes a long time (e.g. any tape drive seek
> command) without waiting for it to complete
> close fd
> before command completes, echo 1 >
> /sys/class/scsi_generic/sgX/device/delete
> 
> without patch:
> oops when the command does complete (sg_rq_end_io() bad pointer deref)
> 
> with patch:
> keventd sleeps until the cmd is complete
> this is suboptimal since it starves other users of keventd while
> waiting for the command to complete, but it is better than an oops

Ditto.


The main problem is about the lifetime of sg_dev and sg_fd. I think
that we can fix it in a simpler and better way.

You use kref for sg_dev. We can use kref for sg_fd too. I think that
that's all we need to do.

A sg_request gets the reference of sg_fd. It makes sure that sg_fd
doesn't go away if any outstanding sg_requests exists.

sg_fd gets the reference of sg_dev (with scsi_device) and makes sure
that the sg module doesn't go away.

Then sg_remove doesn't wait for anything. sg_fd gets the reference of
sg_dev so sg_dev doesn't go away. sg_rq_end_io puts the reference of
sg_fd then sg_fd is freed and sg_dev is freed safely.

Here's a patch to do the above. It's even simpler than the current sg
code. I confirmed that sg can pass the first, second and third tests
with this patch.

I think that 4th and 5th tests fail because of the lifetime of sg_dev
and sg_fd. Can you please try these tests with this patch?

=
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Sat, 10 Jan 2009 23:25:24 +0900
Subject: [PATCH] sg: fix the lifetime management of sg_dev and sg_fd

To fix the lifetime of sg_dev and sg_fd, this patch adds kref for
sg_dev and sg_fd.

A sg_request gets the reference of sg_fd. It makes sure that sg_fd
doesn't go away if any outstanding sg_requests exists.

sg_fd gets the reference of sg_dev (with scsi_device) and makes sure
that the sg module doesn't go away.


Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
---
 drivers/scsi/sg.c |  198 +++++++++++++++++++++++++----------------------------
 1 files changed, 93 insertions(+), 105 deletions(-)

diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 5103855..11b4a56 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -158,6 +158,8 @@ typedef struct sg_fd {		/* holds the state of a file descriptor */
 	char next_cmd_len;	/* 0 -> automatic (def), >0 -> use on next write() */
 	char keep_orphan;	/* 0 -> drop orphan (def), 1 -> keep for read() */
 	char mmap_called;	/* 0 -> mmap() never called on this fd */
+	struct kref f_ref;
+	struct execute_work ew;
 } Sg_fd;
 
 typedef struct sg_device { /* holds the state of each scsi generic device */
@@ -171,6 +173,7 @@ typedef struct sg_device { /* holds the state of each scsi generic device */
 	char sgdebug;		/* 0->off, 1->sense, 9->dump dev, 10-> all devs */
 	struct gendisk *disk;
 	struct cdev * cdev;	/* char_dev [sysfs: /sys/cdev/major/sg<n>] */
+	struct kref d_ref;
 } Sg_device;
 
 static int sg_fasync(int fd, struct file *filp, int mode);
@@ -194,13 +197,13 @@ static void sg_build_reserve(Sg_fd * sfp, int req_size);
 static void sg_link_reserve(Sg_fd * sfp, Sg_request * srp, int size);
 static void sg_unlink_reserve(Sg_fd * sfp, Sg_request * srp);
 static Sg_fd *sg_add_sfp(Sg_device * sdp, int dev);
-static int sg_remove_sfp(Sg_device * sdp, Sg_fd * sfp);
-static void __sg_remove_sfp(Sg_device * sdp, Sg_fd * sfp);
+static void sg_remove_sfp(struct kref *);
 static Sg_request *sg_get_rq_mark(Sg_fd * sfp, int pack_id);
 static Sg_request *sg_add_request(Sg_fd * sfp);
 static int sg_remove_request(Sg_fd * sfp, Sg_request * srp);
 static int sg_res_in_use(Sg_fd * sfp);
 static Sg_device *sg_get_dev(int dev);
+static void sg_put_dev(struct sg_device *sdp);
 #ifdef CONFIG_SCSI_PROC_FS
 static int sg_last_dev(void);
 #endif
@@ -238,21 +241,19 @@ sg_open(struct inode *inode, struct file *filp)
 	SCSI_LOG_TIMEOUT(3, printk("sg_open: dev=%d, flags=0x%x\n", dev, flags));
 	sdp = sg_get_dev(dev);
 	if ((!sdp) || (!sdp->device)) {
-		unlock_kernel();
-		return -ENXIO;
+		retval = -ENXIO;
+		goto sg_put;
 	}
 	if (sdp->detached) {
-		unlock_kernel();
-		return -ENODEV;
+		retval = -ENODEV;
+		goto sg_put;
 	}
 
 	/* This driver's module count bumped by fops_get in <linux/fs.h> */
 	/* Prevent the device driver from vanishing while we sleep */
 	retval = scsi_device_get(sdp->device);
-	if (retval) {
-		unlock_kernel();
-		return retval;
-	}
+	if (retval)
+		goto sg_put;
 
 	if (!((flags & O_NONBLOCK) ||
 	      scsi_block_when_processing_errors(sdp->device))) {
@@ -308,11 +309,12 @@ sg_open(struct inode *inode, struct file *filp)
 		retval = -ENOMEM;
 		goto error_out;
 	}
-	unlock_kernel();
-	return 0;
-
-      error_out:
-	scsi_device_put(sdp->device);
+	retval = 0;
+error_out:
+	if (retval)
+		scsi_device_put(sdp->device);
+sg_put:
+	sg_put_dev(sdp);
 	unlock_kernel();
 	return retval;
 }
@@ -327,13 +329,11 @@ sg_release(struct inode *inode, struct file *filp)
 	if ((!(sfp = (Sg_fd *) filp->private_data)) || (!(sdp = sfp->parentdp)))
 		return -ENXIO;
 	SCSI_LOG_TIMEOUT(3, printk("sg_release: %s\n", sdp->disk->disk_name));
-	if (0 == sg_remove_sfp(sdp, sfp)) {	/* Returns 1 when sdp gone */
-		if (!sdp->detached) {
-			scsi_device_put(sdp->device);
-		}
-		sdp->exclude = 0;
-		wake_up_interruptible(&sdp->o_excl_wait);
-	}
+
+	sdp->exclude = 0;
+	wake_up_interruptible(&sdp->o_excl_wait);
+
+	kref_put(&sfp->f_ref, sg_remove_sfp);
 	return 0;
 }
 
@@ -1259,12 +1259,10 @@ static void sg_rq_end_io(struct request *rq, int uptodate)
 		return;
 	}
 	sfp = srp->parentfp;
-	if (sfp)
-		sdp = sfp->parentdp;
-	if ((NULL == sdp) || sdp->detached) {
+	sdp = sfp->parentdp;
+
+	if (sdp->detached)
 		printk(KERN_INFO "sg_cmd_done: device detached\n");
-		return;
-	}
 
 	sense = rq->sense;
 	result = rq->errors;
@@ -1303,18 +1301,7 @@ static void sg_rq_end_io(struct request *rq, int uptodate)
 	}
 	/* Rely on write phase to clean out srp status values, so no "else" */
 
-	if (sfp->closed) {	/* whoops this fd already released, cleanup */
-		SCSI_LOG_TIMEOUT(1, printk("sg_cmd_done: already closed, freeing ...\n"));
-		sg_finish_rem_req(srp);
-		srp = NULL;
-		if (NULL == sfp->headrp) {
-			SCSI_LOG_TIMEOUT(1, printk("sg_cmd_done: already closed, final cleanup\n"));
-			if (0 == sg_remove_sfp(sdp, sfp)) {	/* device still present */
-				scsi_device_put(sdp->device);
-			}
-			sfp = NULL;
-		}
-	} else if (srp && srp->orphan) {
+	if (srp->orphan) {
 		if (sfp->keep_orphan)
 			srp->sg_io_owned = 0;
 		else {
@@ -1322,7 +1309,8 @@ static void sg_rq_end_io(struct request *rq, int uptodate)
 			srp = NULL;
 		}
 	}
-	if (sfp && srp) {
+
+	if (srp) {
 		/* Now wake up any sg_read() that is waiting for this packet. */
 		kill_fasync(&sfp->async_qp, SIGPOLL, POLL_IN);
 		write_lock_irqsave(&sfp->rq_list_lock, iflags);
@@ -1330,6 +1318,8 @@ static void sg_rq_end_io(struct request *rq, int uptodate)
 		wake_up_interruptible(&sfp->read_wait);
 		write_unlock_irqrestore(&sfp->rq_list_lock, iflags);
 	}
+
+	kref_put(&sfp->f_ref, sg_remove_sfp);
 }
 
 static struct file_operations sg_fops = {
@@ -1391,6 +1381,7 @@ static Sg_device *sg_alloc(struct gendisk *disk, struct scsi_device *scsidp)
 	init_waitqueue_head(&sdp->o_excl_wait);
 	sdp->sg_tablesize = min(q->max_hw_segments, q->max_phys_segments);
 	sdp->index = k;
+	kref_init(&sdp->d_ref);
 
 	error = 0;
  out:
@@ -1496,41 +1487,18 @@ sg_remove(struct device *cl_dev, struct class_interface *cl_intf)
 	unsigned long iflags;
 	Sg_fd *sfp;
 	Sg_fd *tsfp;
-	Sg_request *srp;
-	Sg_request *tsrp;
-	int delay;
 
 	if (!sdp)
 		return;
 
-	delay = 0;
+	sdp->detached = 1;
+
 	write_lock_irqsave(&sg_index_lock, iflags);
-	if (sdp->headfp) {
-		sdp->detached = 1;
-		for (sfp = sdp->headfp; sfp; sfp = tsfp) {
-			tsfp = sfp->nextfp;
-			for (srp = sfp->headrp; srp; srp = tsrp) {
-				tsrp = srp->nextrp;
-				if (sfp->closed || (0 == sg_srp_done(srp, sfp)))
-					sg_finish_rem_req(srp);
-			}
-			if (sfp->closed) {
-				scsi_device_put(sdp->device);
-				__sg_remove_sfp(sdp, sfp);
-			} else {
-				delay = 1;
-				wake_up_interruptible(&sfp->read_wait);
-				kill_fasync(&sfp->async_qp, SIGPOLL,
-					    POLL_HUP);
-			}
-		}
-		SCSI_LOG_TIMEOUT(3, printk("sg_remove: dev=%d, dirty\n", sdp->index));
-		if (NULL == sdp->headfp) {
-			idr_remove(&sg_index_idr, sdp->index);
-		}
-	} else {	/* nothing active, simple case */
-		SCSI_LOG_TIMEOUT(3, printk("sg_remove: dev=%d\n", sdp->index));
-		idr_remove(&sg_index_idr, sdp->index);
+	for (sfp = sdp->headfp; sfp; sfp = tsfp) {
+		tsfp = sfp->nextfp;
+
+		wake_up_interruptible(&sfp->read_wait);
+		kill_fasync(&sfp->async_qp, SIGPOLL, POLL_HUP);
 	}
 	write_unlock_irqrestore(&sg_index_lock, iflags);
 
@@ -1540,11 +1508,8 @@ sg_remove(struct device *cl_dev, struct class_interface *cl_intf)
 	sdp->cdev = NULL;
 	put_disk(sdp->disk);
 	sdp->disk = NULL;
-	if (NULL == sdp->headfp)
-		kfree(sdp);
 
-	if (delay)
-		msleep(10);	/* dirty detach so delay device destruction */
+	sg_put_dev(sdp);
 }
 
 module_param_named(scatter_elem_sz, scatter_elem_sz, int, S_IRUGO | S_IWUSR);
@@ -1993,6 +1958,7 @@ sg_add_request(Sg_fd * sfp)
 	if (resp) {
 		resp->nextrp = NULL;
 		resp->header.duration = jiffies_to_msecs(jiffies);
+		kref_get(&sfp->f_ref);
 	}
 	write_unlock_irqrestore(&sfp->rq_list_lock, iflags);
 	return resp;
@@ -2060,6 +2026,7 @@ sg_add_sfp(Sg_device * sdp, int dev)
 	init_waitqueue_head(&sfp->read_wait);
 	rwlock_init(&sfp->rq_list_lock);
 
+	kref_init(&sfp->f_ref);
 	sfp->timeout = SG_DEFAULT_TIMEOUT;
 	sfp->timeout_user = SG_DEFAULT_TIMEOUT_USER;
 	sfp->force_packid = SG_DEF_FORCE_PACK_ID;
@@ -2087,6 +2054,9 @@ sg_add_sfp(Sg_device * sdp, int dev)
 	sg_build_reserve(sfp, bufflen);
 	SCSI_LOG_TIMEOUT(3, printk("sg_add_sfp:   bufflen=%d, k_use_sg=%d\n",
 			   sfp->reserve.bufflen, sfp->reserve.k_use_sg));
+
+	kref_get(&sdp->d_ref);
+	__module_get(THIS_MODULE);
 	return sfp;
 }
 
@@ -2114,48 +2084,38 @@ __sg_remove_sfp(Sg_device * sdp, Sg_fd * sfp)
 			(int) sfp->reserve.bufflen, (int) sfp->reserve.k_use_sg));
 		sg_remove_scat(&sfp->reserve);
 	}
-	sfp->parentdp = NULL;
 	SCSI_LOG_TIMEOUT(6, printk("__sg_remove_sfp:    sfp=0x%p\n", sfp));
 	kfree(sfp);
 }
 
-/* Returns 0 in normal case, 1 when detached and sdp object removed */
-static int
-sg_remove_sfp(Sg_device * sdp, Sg_fd * sfp)
+static void sg_remove_sfp_usercontext(struct work_struct *work)
 {
+	struct sg_fd *sfp = container_of(work, struct sg_fd, ew.work);
+	struct sg_device *sdp = sfp->parentdp;
+	unsigned long iflags;
 	Sg_request *srp;
 	Sg_request *tsrp;
-	int dirty = 0;
-	int res = 0;
 
 	for (srp = sfp->headrp; srp; srp = tsrp) {
 		tsrp = srp->nextrp;
-		if (sg_srp_done(srp, sfp))
-			sg_finish_rem_req(srp);
-		else
-			++dirty;
+		sg_finish_rem_req(srp);
 	}
-	if (0 == dirty) {
-		unsigned long iflags;
 
-		write_lock_irqsave(&sg_index_lock, iflags);
-		__sg_remove_sfp(sdp, sfp);
-		if (sdp->detached && (NULL == sdp->headfp)) {
-			idr_remove(&sg_index_idr, sdp->index);
-			kfree(sdp);
-			res = 1;
-		}
-		write_unlock_irqrestore(&sg_index_lock, iflags);
-	} else {
-		/* MOD_INC's to inhibit unloading sg and associated adapter driver */
-		/* only bump the access_count if we actually succeeded in
-		 * throwing another counter on the host module */
-		scsi_device_get(sdp->device);	/* XXX: retval ignored? */	
-		sfp->closed = 1;	/* flag dirty state on this fd */
-		SCSI_LOG_TIMEOUT(1, printk("sg_remove_sfp: worrisome, %d writes pending\n",
-				  dirty));
-	}
-	return res;
+	write_lock_irqsave(&sg_index_lock, iflags);
+	__sg_remove_sfp(sdp, sfp);
+	write_unlock_irqrestore(&sg_index_lock, iflags);
+
+	scsi_device_put(sdp->device);
+
+	sg_put_dev(sdp);
+
+	module_put(THIS_MODULE);
+}
+
+static void sg_remove_sfp(struct kref *kref)
+{
+	struct sg_fd *sfp = container_of(kref, struct sg_fd, f_ref);
+	execute_in_process_context(sg_remove_sfp_usercontext, &sfp->ew);
 }
 
 static int
@@ -2197,6 +2157,28 @@ sg_last_dev(void)
 }
 #endif
 
+static void sg_device_release(struct kref *kref)
+{
+	struct sg_device *sdp = container_of(kref, struct sg_device, d_ref);
+	unsigned long flags;
+
+	SCSI_LOG_TIMEOUT(3, printk("sg_remove: dev=%d\n", sdp->index));
+
+	write_lock_irqsave(&sg_index_lock, flags);
+	idr_remove(&sg_index_idr, sdp->index);
+	write_unlock_irqrestore(&sg_index_lock, flags);
+
+	kfree(sdp);
+}
+
+static void sg_put_dev(struct sg_device *sdp)
+{
+	if (!sdp)
+		return;
+
+	kref_put(&sdp->d_ref, sg_device_release);
+}
+
 static Sg_device *
 sg_get_dev(int dev)
 {
@@ -2205,6 +2187,8 @@ sg_get_dev(int dev)
 
 	read_lock_irqsave(&sg_index_lock, iflags);
 	sdp = idr_find(&sg_index_idr, dev);
+	if (sdp)
+		kref_get(&sdp->d_ref);
 	read_unlock_irqrestore(&sg_index_lock, iflags);
 
 	return sdp;
@@ -2478,6 +2462,7 @@ static int sg_proc_seq_show_dev(struct seq_file *s, void *v)
 			      (int) scsi_device_online(scsidp));
 	else
 		seq_printf(s, "-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\n");
+	sg_put_dev(sdp);
 	return 0;
 }
 
@@ -2498,6 +2483,7 @@ static int sg_proc_seq_show_devstrs(struct seq_file *s, void *v)
 			   scsidp->vendor, scsidp->model, scsidp->rev);
 	else
 		seq_printf(s, "<no active device>\n");
+	sg_put_dev(sdp);
 	return 0;
 }
 
@@ -2582,7 +2568,7 @@ static int sg_proc_seq_show_debug(struct seq_file *s, void *v)
 		if (NULL == scsidp) {
 			seq_printf(s, "device %d detached ??\n", 
 				   (int)it->index);
-			return 0;
+			goto out;
 		}
 
 		if (sg_get_nth_sfp(sdp, 0)) {
@@ -2602,6 +2588,8 @@ static int sg_proc_seq_show_debug(struct seq_file *s, void *v)
 		}
 		sg_proc_debug_helper(s, sdp);
 	}
+out:
+	sg_put_dev(sdp);
 	return 0;
 }
 
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [PATCH 0/2] sg: fix races during device removal (v2)
  2009-01-10 17:26 ` FUJITA Tomonori
@ 2009-01-14 20:31   ` Tony Battersby
  2009-01-19  6:57     ` FUJITA Tomonori
  0 siblings, 1 reply; 2+ messages in thread
From: Tony Battersby @ 2009-01-14 20:31 UTC (permalink / raw)
  To: FUJITA Tomonori
  Cc: dgilbert, James.Bottomley, hch, linux-scsi, Greg Kroah-Hartman

[CC: Greg Kroah-Hartman <greg@kroah.com>]

Fujita, your patch results in simpler code than my version 2 patch,
but it still leaves some subtle races and other problems.  The crux
of the problem is that kref_put() needs do be done while holding a
lock if there is still a way for some other CPU to find a reference
to the object in between the time the refcount drops to 0 and the
time the destructor is called.

For example, look at sg_get_dev() and sg_put_dev().  In my patch,
I locked sg_index_lock in sg_put_dev() and called kref_put() while
holding the lock.  In your patch, you moved the lock from sg_put_dev()
to the destructor function.  This introduces a subtle race with
sg_get_dev():

CPU 1:
sg_put_dev()
  kref_put(): refcount 1 -> 0

CPU 2:
sg_get_dev()
  lock sg_index_lock
  idr_find()
  kref_get(): refcount 0 -> 1: bug
  unlock sg_index_lock

CPU 1:
sg_device_release()
  lock sg_index_lock
  idr_remove
  unlock sg_index_lock
  kfree(sdp)

CPU 2:
access sdp which has already been freed
sg_put_dev(): double-free bug

Besides holding the lock during kref_put(), I also considered two
other simple ways to avoid this race:
1) Do idr_remove() from sg_remove().
2) Return NULL in sg_get_dev() if sdp->detached.

However, both of these options would have changed the behavior of
the /proc/scsi/sg/* functions that show information for devices that
are in the process of being detached.  I wanted to fix bugs without
changing other behavior, so I chose to call kref_put() under lock in
my previous patches.

A similar problem exists with sg_remove_sfp() in your patch.
The refcount for the sg_fd can drop to 0 while a different CPU can
still find a reference to it by scanning sg_device::headfp.  The good
news is that most places that scan sg_fd's using sdp->headfp never
drop sg_index_lock while still holding a reference, so for most cases
it is safe for the destructor to work the way you have it.  The one
exception is sg_get_nth_sfp(), which needs to kref_get() the sg_fd it
returns (my version 2 patch doesn't fix this problem either since I
just noticed it).  But then the kref_get() would have a race similar
to the one I described above between sg_get_dev() and sg_put_dev().

There are several solutions to this problem:
1) Call kref_put(&sfp->f_ref, sg_remove_sfp) while holding
sg_index_lock, and have the destructor unlink sfp from sdp->headfp
before dropping the lock.  This is ugly since the destructor may want
to drop sg_index_lock (e.g. to call scsi_device_put()), but the saved
irqflags variable is in a different function.
2) Unlink sfp from sdp->headfp in sg_release().  The downside to this
is that the /proc/scsi/sg/* functions no longer show fds that have
been closed but still have commands pending.
3) Set sfp->closed in sg_release() and have sg_get_nth_sfp() return
NULL if sfp->closed.  Same downside as #2.
4) Add a variation of kref_get() that uses atomic_inc_not_zero().

When I was designing my previous patches, I anticipated running into
these complications with kref, which is one reason I avoided it.
Other people have experienced similar problems with the kref interface
(search "kref_get_not_zero" or "cgroups: fix probable race with
put_css_set[_taskexit] and find_css_set").  If you still want to use
kref, I think that we are either going to have to change the behavior
of /proc/scsi/sg/*, or else use atomic_inc_not_zero().  Since I still
prefer not to change valid user-visible behavior in a patch that fixes
so many possible oopses, I will go with atomic_inc_not_zero().  And,
since I am going that route, I will use it for sg_get_dev() also,
so that sg_put_dev() doesn't need to acquire sg_index_lock.

For the purposes of this patch, I added the local function
sg_kref_get_not_zero() to sg.c.  It would be better to add
kref_get_not_zero() to kref.c, but I see in the mailing list archives
that there has been some resistance to that idea because it complicates
the kref interface.  However, since it has come up more than once,
perhaps it would be better to go ahead and make it an official part
of the interface.  If anyone wants to support that idea, I will break
it out into a separate patch.

Your patch also has a problem with management of f_ref.  You do
kref_get() in sg_add_request() and kref_put() in sg_rq_end_io()().
However, an error can occur in between sg_add_request() and sending
the command to the midlevel, so sg_rq_end_io() may never be called.
It is better to do kref_get() just before sending the command to the
midlevel; that way the refcount counts active commands waiting for
the completion callback rather than prepared requests.

Your patch doesn't fix the problem of various functions calling
SCSI_LOG_TIMEOUT with sdp->disk->disk_name after sg_remove() sets
sdp->disk to NULL.  This is why I moved put_disk() from sg_remove()
to sg_device_release().

Your patch doesn't fix the problem of forgetting to do
wake_up_interruptible(&sdp->o_excl_wait) after removing sfp from
sdp->headfp.  This is important if sg_release() is called with
commands still active; the last command that finishes should wake up
any O_EXCL waiters.

So below is version 3 of my patch combining my ideas and your ideas.
For testing with mpt, I commented out mptscsih_synchronize_cache()
in mptscsih.c as a workaround for the bug I mentioned in a previous
message.  This patch still fixes all the test cases previously listed,
but with the additional improvements that it is simpler and it no
longer blocks keventd waiting for commands to complete.

My previously-posted "[PATCH 2/2] sg: fix races with ioctl(SG_IO)"
still needs to be applied after this patch.

=
From: Tony Battersby <tonyb@cybernetics.com>
Date: Wed, 14 Jan 2009 15:30:00 -0500
Subject: [PATCH] sg: fix races during device removal (v3)

sg has the following problems related to device removal:

* opening a sg fd races with removing a device
* closing a sg fd races with removing a device
* /proc/scsi/sg/* access races with removing a device
* command completion races with removing a device
* command completion races with closing a sg fd
* can rmmod sg with active commands

These problems can cause kernel oopses, memory-use-after-free, or
double-free errors.  This patch fixes these problems by using krefs
to manage the lifetime of sg_device and sg_fd.

Each command submitted to the midlevel holds a reference to sg_fd
until the completion callback.  This ensures that sg_fd doesn't go
away if the fd is closed with commands still outstanding.

sg_fd gets the reference of sg_device (with scsi_device) and also
makes sure that the sg module doesn't go away.

/proc functions get and put references to sg_fd and sg_device as
needed.

Signed-off-by: Tony Battersby <tonyb@cybernetics.com>
---
 sg.c |  342 ++++++++++++++++++++++++++++++++++++-------------------------------
 1 file changed, 184 insertions(+), 158 deletions(-)
--- linux-2.6.29-rc1-git4/drivers/scsi/sg.c.orig	2009-01-14 14:16:14.000000000 -0500
+++ linux-2.6.29-rc1-git4/drivers/scsi/sg.c	2009-01-14 14:25:45.000000000 -0500
@@ -158,6 +158,8 @@ typedef struct sg_fd {		/* holds the sta
 	char next_cmd_len;	/* 0 -> automatic (def), >0 -> use on next write() */
 	char keep_orphan;	/* 0 -> drop orphan (def), 1 -> keep for read() */
 	char mmap_called;	/* 0 -> mmap() never called on this fd */
+	struct kref f_ref;
+	struct execute_work ew;
 } Sg_fd;
 
 typedef struct sg_device { /* holds the state of each scsi generic device */
@@ -171,6 +173,7 @@ typedef struct sg_device { /* holds the 
 	char sgdebug;		/* 0->off, 1->sense, 9->dump dev, 10-> all devs */
 	struct gendisk *disk;
 	struct cdev * cdev;	/* char_dev [sysfs: /sys/cdev/major/sg<n>] */
+	struct kref d_ref;
 } Sg_device;
 
 static int sg_fasync(int fd, struct file *filp, int mode);
@@ -194,13 +197,13 @@ static void sg_build_reserve(Sg_fd * sfp
 static void sg_link_reserve(Sg_fd * sfp, Sg_request * srp, int size);
 static void sg_unlink_reserve(Sg_fd * sfp, Sg_request * srp);
 static Sg_fd *sg_add_sfp(Sg_device * sdp, int dev);
-static int sg_remove_sfp(Sg_device * sdp, Sg_fd * sfp);
-static void __sg_remove_sfp(Sg_device * sdp, Sg_fd * sfp);
+static void sg_remove_sfp(struct kref *);
 static Sg_request *sg_get_rq_mark(Sg_fd * sfp, int pack_id);
 static Sg_request *sg_add_request(Sg_fd * sfp);
 static int sg_remove_request(Sg_fd * sfp, Sg_request * srp);
 static int sg_res_in_use(Sg_fd * sfp);
 static Sg_device *sg_get_dev(int dev);
+static void sg_put_dev(struct sg_device *sdp);
 #ifdef CONFIG_SCSI_PROC_FS
 static int sg_last_dev(void);
 #endif
@@ -210,6 +213,37 @@ static int sg_last_dev(void);
 #define SZ_SG_IOVEC sizeof(sg_iovec_t)
 #define SZ_SG_REQ_INFO sizeof(sg_req_info_t)
 
+/**
+ * sg_kref_get_not_zero - increment refcount for object if not already 0.
+ * @kref: object.
+ *
+ * If the object is not being destroyed, then increment the refcount and return
+ * true.  Otherwise, return false without doing anything.
+ *
+ * Depending on how you manage the referenced object, there may be ways of
+ * finding the object in between the time kref_put() decrements the refcount to
+ * 0 and the time the destructor finishes.  For example, if the object can be
+ * found in a lookup table, and the destructor removes the object from that
+ * table, then one CPU may still be able to find the object in that table
+ * for a brief period while a different CPU is running the destructor.  To
+ * prevent problems, either call kref_put() while holding a lock that prevents
+ * another CPU from finding the object, or else use this function instead of
+ * kref_get() when using the lookup table to find the object.
+ *
+ * This function should be called while holding whatever lock is used to
+ * protect the data structure used to find the object.  If this function
+ * returns false, then do not access the object after dropping the lock, and do
+ * not call kref_put().
+ */
+static bool __must_check sg_kref_get_not_zero(struct kref *kref)
+{
+	if (likely(atomic_inc_not_zero(&kref->refcount))) {
+		smp_mb__after_atomic_inc();
+		return true;
+	}
+	return false;
+}
+
 static int sg_allow_access(struct file *filp, unsigned char *cmd)
 {
 	struct sg_fd *sfp = (struct sg_fd *)filp->private_data;
@@ -238,21 +272,19 @@ sg_open(struct inode *inode, struct file
 	SCSI_LOG_TIMEOUT(3, printk("sg_open: dev=%d, flags=0x%x\n", dev, flags));
 	sdp = sg_get_dev(dev);
 	if ((!sdp) || (!sdp->device)) {
-		unlock_kernel();
-		return -ENXIO;
+		retval = -ENXIO;
+		goto sg_put;
 	}
 	if (sdp->detached) {
-		unlock_kernel();
-		return -ENODEV;
+		retval = -ENODEV;
+		goto sg_put;
 	}
 
 	/* This driver's module count bumped by fops_get in <linux/fs.h> */
 	/* Prevent the device driver from vanishing while we sleep */
 	retval = scsi_device_get(sdp->device);
-	if (retval) {
-		unlock_kernel();
-		return retval;
-	}
+	if (retval)
+		goto sg_put;
 
 	if (!((flags & O_NONBLOCK) ||
 	      scsi_block_when_processing_errors(sdp->device))) {
@@ -303,16 +335,19 @@ sg_open(struct inode *inode, struct file
 	if ((sfp = sg_add_sfp(sdp, dev)))
 		filp->private_data = sfp;
 	else {
-		if (flags & O_EXCL)
+		if (flags & O_EXCL) {
 			sdp->exclude = 0;	/* undo if error */
+			wake_up_interruptible(&sdp->o_excl_wait);
+		}
 		retval = -ENOMEM;
 		goto error_out;
 	}
-	unlock_kernel();
-	return 0;
-
-      error_out:
-	scsi_device_put(sdp->device);
+	retval = 0;
+error_out:
+	if (retval)
+		scsi_device_put(sdp->device);
+sg_put:
+	sg_put_dev(sdp);
 	unlock_kernel();
 	return retval;
 }
@@ -327,13 +362,13 @@ sg_release(struct inode *inode, struct f
 	if ((!(sfp = (Sg_fd *) filp->private_data)) || (!(sdp = sfp->parentdp)))
 		return -ENXIO;
 	SCSI_LOG_TIMEOUT(3, printk("sg_release: %s\n", sdp->disk->disk_name));
-	if (0 == sg_remove_sfp(sdp, sfp)) {	/* Returns 1 when sdp gone */
-		if (!sdp->detached) {
-			scsi_device_put(sdp->device);
-		}
-		sdp->exclude = 0;
-		wake_up_interruptible(&sdp->o_excl_wait);
-	}
+
+	sfp->closed = 1;
+
+	sdp->exclude = 0;
+	wake_up_interruptible(&sdp->o_excl_wait);
+
+	kref_put(&sfp->f_ref, sg_remove_sfp);
 	return 0;
 }
 
@@ -755,6 +790,7 @@ sg_common_write(Sg_fd * sfp, Sg_request 
 	hp->duration = jiffies_to_msecs(jiffies);
 
 	srp->rq->timeout = timeout;
+	kref_get(&sfp->f_ref); /* sg_rq_end_io() does kref_put(). */
 	blk_execute_rq_nowait(sdp->device->request_queue, sdp->disk,
 			      srp->rq, 1, sg_rq_end_io);
 	return 0;
@@ -1247,25 +1283,28 @@ sg_mmap(struct file *filp, struct vm_are
 static void sg_rq_end_io(struct request *rq, int uptodate)
 {
 	struct sg_request *srp = rq->end_io_data;
-	Sg_device *sdp = NULL;
+	Sg_device *sdp;
 	Sg_fd *sfp;
 	unsigned long iflags;
 	unsigned int ms;
 	char *sense;
-	int result, resid;
+	int result, resid, done = 1;
 
-	if (NULL == srp) {
-		printk(KERN_ERR "sg_cmd_done: NULL request\n");
+	if (unlikely(NULL == srp)) { /* shouldn't happen */
+		printk(KERN_ERR "sg_rq_end_io: NULL request\n");
 		return;
 	}
+
 	sfp = srp->parentfp;
-	if (sfp)
-		sdp = sfp->parentdp;
-	if ((NULL == sdp) || sdp->detached) {
-		printk(KERN_INFO "sg_cmd_done: device detached\n");
+	if (unlikely(NULL == sfp)) { /* shouldn't happen */
+		printk(KERN_ERR "sg_rq_end_io: NULL sg_fd\n");
 		return;
 	}
 
+	sdp = sfp->parentdp;
+	if (unlikely(sdp->detached))
+		printk(KERN_INFO "sg_rq_end_io: device detached\n");
+
 	sense = rq->sense;
 	result = rq->errors;
 	resid = rq->data_len;
@@ -1303,33 +1342,25 @@ static void sg_rq_end_io(struct request 
 	}
 	/* Rely on write phase to clean out srp status values, so no "else" */
 
-	if (sfp->closed) {	/* whoops this fd already released, cleanup */
-		SCSI_LOG_TIMEOUT(1, printk("sg_cmd_done: already closed, freeing ...\n"));
-		sg_finish_rem_req(srp);
-		srp = NULL;
-		if (NULL == sfp->headrp) {
-			SCSI_LOG_TIMEOUT(1, printk("sg_cmd_done: already closed, final cleanup\n"));
-			if (0 == sg_remove_sfp(sdp, sfp)) {	/* device still present */
-				scsi_device_put(sdp->device);
-			}
-			sfp = NULL;
-		}
-	} else if (srp && srp->orphan) {
+	write_lock_irqsave(&sfp->rq_list_lock, iflags);
+	if (unlikely(srp->orphan)) {
 		if (sfp->keep_orphan)
 			srp->sg_io_owned = 0;
-		else {
-			sg_finish_rem_req(srp);
-			srp = NULL;
-		}
+		else
+			done = 0;
 	}
-	if (sfp && srp) {
-		/* Now wake up any sg_read() that is waiting for this packet. */
-		kill_fasync(&sfp->async_qp, SIGPOLL, POLL_IN);
-		write_lock_irqsave(&sfp->rq_list_lock, iflags);
-		srp->done = 1;
+	srp->done = done;
+	write_unlock_irqrestore(&sfp->rq_list_lock, iflags);
+
+	if (likely(done)) {
+		/* Now wake up any sg_read() that is waiting for this
+		   packet. */
 		wake_up_interruptible(&sfp->read_wait);
-		write_unlock_irqrestore(&sfp->rq_list_lock, iflags);
-	}
+		kill_fasync(&sfp->async_qp, SIGPOLL, POLL_IN);
+	} else
+		sg_finish_rem_req(srp); /* call with srp->done == 0 */
+
+	kref_put(&sfp->f_ref, sg_remove_sfp);
 }
 
 static struct file_operations sg_fops = {
@@ -1391,6 +1422,7 @@ static Sg_device *sg_alloc(struct gendis
 	init_waitqueue_head(&sdp->o_excl_wait);
 	sdp->sg_tablesize = min(q->max_hw_segments, q->max_phys_segments);
 	sdp->index = k;
+	kref_init(&sdp->d_ref);
 
 	error = 0;
  out:
@@ -1488,63 +1520,33 @@ out:
 	return error;
 }
 
-static void
-sg_remove(struct device *cl_dev, struct class_interface *cl_intf)
+static void sg_remove(struct device *cl_dev, struct class_interface *cl_intf)
 {
 	struct scsi_device *scsidp = to_scsi_device(cl_dev->parent);
 	Sg_device *sdp = dev_get_drvdata(cl_dev);
 	unsigned long iflags;
 	Sg_fd *sfp;
-	Sg_fd *tsfp;
-	Sg_request *srp;
-	Sg_request *tsrp;
-	int delay;
 
-	if (!sdp)
+	if (!sdp || sdp->detached)
 		return;
 
-	delay = 0;
-	write_lock_irqsave(&sg_index_lock, iflags);
-	if (sdp->headfp) {
-		sdp->detached = 1;
-		for (sfp = sdp->headfp; sfp; sfp = tsfp) {
-			tsfp = sfp->nextfp;
-			for (srp = sfp->headrp; srp; srp = tsrp) {
-				tsrp = srp->nextrp;
-				if (sfp->closed || (0 == sg_srp_done(srp, sfp)))
-					sg_finish_rem_req(srp);
-			}
-			if (sfp->closed) {
-				scsi_device_put(sdp->device);
-				__sg_remove_sfp(sdp, sfp);
-			} else {
-				delay = 1;
-				wake_up_interruptible(&sfp->read_wait);
-				kill_fasync(&sfp->async_qp, SIGPOLL,
-					    POLL_HUP);
-			}
-		}
-		SCSI_LOG_TIMEOUT(3, printk("sg_remove: dev=%d, dirty\n", sdp->index));
-		if (NULL == sdp->headfp) {
-			idr_remove(&sg_index_idr, sdp->index);
-		}
-	} else {	/* nothing active, simple case */
-		SCSI_LOG_TIMEOUT(3, printk("sg_remove: dev=%d\n", sdp->index));
-		idr_remove(&sg_index_idr, sdp->index);
+	SCSI_LOG_TIMEOUT(3, printk("sg_remove: %s\n", sdp->disk->disk_name));
+
+	sdp->detached = 1;
+
+	read_lock_irqsave(&sg_index_lock, iflags);
+	for (sfp = sdp->headfp; sfp; sfp = sfp->nextfp) {
+		wake_up_interruptible(&sfp->read_wait);
+		kill_fasync(&sfp->async_qp, SIGPOLL, POLL_HUP);
 	}
-	write_unlock_irqrestore(&sg_index_lock, iflags);
+	read_unlock_irqrestore(&sg_index_lock, iflags);
 
 	sysfs_remove_link(&scsidp->sdev_gendev.kobj, "generic");
 	device_destroy(sg_sysfs_class, MKDEV(SCSI_GENERIC_MAJOR, sdp->index));
 	cdev_del(sdp->cdev);
 	sdp->cdev = NULL;
-	put_disk(sdp->disk);
-	sdp->disk = NULL;
-	if (NULL == sdp->headfp)
-		kfree(sdp);
 
-	if (delay)
-		msleep(10);	/* dirty detach so delay device destruction */
+	sg_put_dev(sdp); /* put initial reference from kref_init. */
 }
 
 module_param_named(scatter_elem_sz, scatter_elem_sz, int, S_IRUGO | S_IWUSR);
@@ -2033,18 +2035,21 @@ sg_remove_request(Sg_fd * sfp, Sg_reques
 }
 
 #ifdef CONFIG_SCSI_PROC_FS
-static Sg_fd *
-sg_get_nth_sfp(Sg_device * sdp, int nth)
+static Sg_fd *sg_get_next_sfp(Sg_device *sdp, Sg_fd *prev_sfp)
 {
-	Sg_fd *resp;
+	Sg_fd *sfp;
 	unsigned long iflags;
-	int k;
 
 	read_lock_irqsave(&sg_index_lock, iflags);
-	for (k = 0, resp = sdp->headfp; resp && (k < nth);
-	     ++k, resp = resp->nextfp) ;
+	sfp = (prev_sfp == NULL) ? sdp->headfp : prev_sfp->nextfp;
+	for ( ; sfp != NULL; sfp = sfp->nextfp) {
+		if (sg_kref_get_not_zero(&sfp->f_ref))
+			break;
+	}
 	read_unlock_irqrestore(&sg_index_lock, iflags);
-	return resp;
+	if (prev_sfp != NULL)
+		kref_put(&prev_sfp->f_ref, sg_remove_sfp);
+	return sfp;
 }
 #endif
 
@@ -2062,6 +2067,7 @@ sg_add_sfp(Sg_device * sdp, int dev)
 	init_waitqueue_head(&sfp->read_wait);
 	rwlock_init(&sfp->rq_list_lock);
 
+	kref_init(&sfp->f_ref);
 	sfp->timeout = SG_DEFAULT_TIMEOUT;
 	sfp->timeout_user = SG_DEFAULT_TIMEOUT_USER;
 	sfp->force_packid = SG_DEF_FORCE_PACK_ID;
@@ -2089,15 +2095,49 @@ sg_add_sfp(Sg_device * sdp, int dev)
 	sg_build_reserve(sfp, bufflen);
 	SCSI_LOG_TIMEOUT(3, printk("sg_add_sfp:   bufflen=%d, k_use_sg=%d\n",
 			   sfp->reserve.bufflen, sfp->reserve.k_use_sg));
+
+	kref_get(&sdp->d_ref);
+	__module_get(THIS_MODULE);
 	return sfp;
 }
 
-static void
-__sg_remove_sfp(Sg_device * sdp, Sg_fd * sfp)
+static void sg_remove_sfp_usercontext(struct work_struct *work)
 {
+	struct sg_fd *sfp = container_of(work, struct sg_fd, ew.work);
+	struct sg_device *sdp = sfp->parentdp;
+
+	/* Cleanup any responses which were never read(). */
+	while (sfp->headrp)
+		sg_finish_rem_req(sfp->headrp);
+
+	if (sfp->reserve.bufflen > 0) {
+		SCSI_LOG_TIMEOUT(6,
+			printk("sg_remove_sfp:    bufflen=%d, k_use_sg=%d\n",
+				(int) sfp->reserve.bufflen,
+				(int) sfp->reserve.k_use_sg));
+		sg_remove_scat(&sfp->reserve);
+	}
+
+	SCSI_LOG_TIMEOUT(6,
+		printk("sg_remove_sfp: %s, sfp=0x%p\n",
+			sdp->disk->disk_name,
+			sfp));
+	kfree(sfp);
+
+	scsi_device_put(sdp->device);
+	sg_put_dev(sdp);
+	module_put(THIS_MODULE);
+}
+
+static void sg_remove_sfp(struct kref *kref)
+{
+	struct sg_fd *sfp = container_of(kref, struct sg_fd, f_ref);
+	struct sg_device *sdp = sfp->parentdp;
 	Sg_fd *fp;
 	Sg_fd *prev_fp;
+	unsigned long iflags;
 
+	write_lock_irqsave(&sg_index_lock, iflags);
 	prev_fp = sdp->headfp;
 	if (sfp == prev_fp)
 		sdp->headfp = prev_fp->nextfp;
@@ -2110,54 +2150,10 @@ __sg_remove_sfp(Sg_device * sdp, Sg_fd *
 			prev_fp = fp;
 		}
 	}
-	if (sfp->reserve.bufflen > 0) {
-		SCSI_LOG_TIMEOUT(6, 
-			printk("__sg_remove_sfp:    bufflen=%d, k_use_sg=%d\n",
-			(int) sfp->reserve.bufflen, (int) sfp->reserve.k_use_sg));
-		sg_remove_scat(&sfp->reserve);
-	}
-	sfp->parentdp = NULL;
-	SCSI_LOG_TIMEOUT(6, printk("__sg_remove_sfp:    sfp=0x%p\n", sfp));
-	kfree(sfp);
-}
-
-/* Returns 0 in normal case, 1 when detached and sdp object removed */
-static int
-sg_remove_sfp(Sg_device * sdp, Sg_fd * sfp)
-{
-	Sg_request *srp;
-	Sg_request *tsrp;
-	int dirty = 0;
-	int res = 0;
-
-	for (srp = sfp->headrp; srp; srp = tsrp) {
-		tsrp = srp->nextrp;
-		if (sg_srp_done(srp, sfp))
-			sg_finish_rem_req(srp);
-		else
-			++dirty;
-	}
-	if (0 == dirty) {
-		unsigned long iflags;
+	write_unlock_irqrestore(&sg_index_lock, iflags);
+	wake_up_interruptible(&sdp->o_excl_wait);
 
-		write_lock_irqsave(&sg_index_lock, iflags);
-		__sg_remove_sfp(sdp, sfp);
-		if (sdp->detached && (NULL == sdp->headfp)) {
-			idr_remove(&sg_index_idr, sdp->index);
-			kfree(sdp);
-			res = 1;
-		}
-		write_unlock_irqrestore(&sg_index_lock, iflags);
-	} else {
-		/* MOD_INC's to inhibit unloading sg and associated adapter driver */
-		/* only bump the access_count if we actually succeeded in
-		 * throwing another counter on the host module */
-		scsi_device_get(sdp->device);	/* XXX: retval ignored? */	
-		sfp->closed = 1;	/* flag dirty state on this fd */
-		SCSI_LOG_TIMEOUT(1, printk("sg_remove_sfp: worrisome, %d writes pending\n",
-				  dirty));
-	}
-	return res;
+	execute_in_process_context(sg_remove_sfp_usercontext, &sfp->ew);
 }
 
 static int
@@ -2199,6 +2195,28 @@ sg_last_dev(void)
 }
 #endif
 
+static void sg_device_destroy(struct kref *kref)
+{
+	struct sg_device *sdp = container_of(kref, struct sg_device, d_ref);
+	unsigned long flags;
+
+	SCSI_LOG_TIMEOUT(3,
+		printk("sg_device_destroy: %s\n",
+			sdp->disk->disk_name));
+
+	write_lock_irqsave(&sg_index_lock, flags);
+	idr_remove(&sg_index_idr, sdp->index);
+	write_unlock_irqrestore(&sg_index_lock, flags);
+	put_disk(sdp->disk);
+	kfree(sdp);
+}
+
+static void sg_put_dev(struct sg_device *sdp)
+{
+	if (sdp)
+		kref_put(&sdp->d_ref, sg_device_destroy);
+}
+
 static Sg_device *
 sg_get_dev(int dev)
 {
@@ -2207,6 +2225,9 @@ sg_get_dev(int dev)
 
 	read_lock_irqsave(&sg_index_lock, iflags);
 	sdp = idr_find(&sg_index_idr, dev);
+	if (sdp)
+		if (!sg_kref_get_not_zero(&sdp->d_ref))
+			sdp = NULL;
 	read_unlock_irqrestore(&sg_index_lock, iflags);
 
 	return sdp;
@@ -2480,6 +2501,7 @@ static int sg_proc_seq_show_dev(struct s
 			      (int) scsi_device_online(scsidp));
 	else
 		seq_printf(s, "-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\n");
+	sg_put_dev(sdp);
 	return 0;
 }
 
@@ -2500,19 +2522,19 @@ static int sg_proc_seq_show_devstrs(stru
 			   scsidp->vendor, scsidp->model, scsidp->rev);
 	else
 		seq_printf(s, "<no active device>\n");
+	sg_put_dev(sdp);
 	return 0;
 }
 
-static void sg_proc_debug_helper(struct seq_file *s, Sg_device * sdp)
+static void sg_proc_debug_helper(struct seq_file *s, Sg_device *sdp, Sg_fd *fp)
 {
 	int k, m, new_interface, blen, usg;
 	Sg_request *srp;
-	Sg_fd *fp;
 	const sg_io_hdr_t *hp;
 	const char * cp;
 	unsigned int ms;
 
-	for (k = 0; (fp = sg_get_nth_sfp(sdp, k)); ++k) {
+	for (k = 0; fp != NULL; ++k, fp = sg_get_next_sfp(sdp, fp)) {
 		seq_printf(s, "   FD(%d): timeout=%dms bufflen=%d "
 			   "(res)sgat=%d low_dma=%d\n", k + 1,
 			   jiffies_to_msecs(fp->timeout),
@@ -2580,14 +2602,16 @@ static int sg_proc_seq_show_debug(struct
 	sdp = it ? sg_get_dev(it->index) : NULL;
 	if (sdp) {
 		struct scsi_device *scsidp = sdp->device;
+		Sg_fd *fp;
 
 		if (NULL == scsidp) {
 			seq_printf(s, "device %d detached ??\n", 
 				   (int)it->index);
-			return 0;
+			goto out;
 		}
 
-		if (sg_get_nth_sfp(sdp, 0)) {
+		fp = sg_get_next_sfp(sdp, NULL);
+		if (fp) {
 			seq_printf(s, " >>> device=%s ",
 				sdp->disk->disk_name);
 			if (sdp->detached)
@@ -2601,9 +2625,11 @@ static int sg_proc_seq_show_debug(struct
 				     scsidp->host->hostt->emulated);
 			seq_printf(s, " sg_tablesize=%d excl=%d\n",
 				   sdp->sg_tablesize, sdp->exclude);
+			sg_proc_debug_helper(s, sdp, fp);
 		}
-		sg_proc_debug_helper(s, sdp);
 	}
+out:
+	sg_put_dev(sdp);
 	return 0;
 }
 


^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH 0/2] sg: fix races during device removal (v2)
  2009-01-14 20:31   ` Tony Battersby
@ 2009-01-19  6:57     ` FUJITA Tomonori
  2009-01-19 23:06       ` [PATCH 2/2] sg: fix races with ioctl(SG_IO) Tony Battersby
  0 siblings, 1 reply; 2+ messages in thread
From: FUJITA Tomonori @ 2009-01-19  6:57 UTC (permalink / raw)
  To: tonyb; +Cc: fujita.tomonori, dgilbert, James.Bottomley, hch, linux-scsi, greg

Sorry for the delay,

On Wed, 14 Jan 2009 15:31:15 -0500
Tony Battersby <tonyb@cybernetics.com> wrote:

> [CC: Greg Kroah-Hartman <greg@kroah.com>]
> 
> Fujita, your patch results in simpler code than my version 2 patch,
> but it still leaves some subtle races and other problems.  The crux
> of the problem is that kref_put() needs do be done while holding a
> lock if there is still a way for some other CPU to find a reference
> to the object in between the time the refcount drops to 0 and the
> time the destructor is called.
> 
> For example, look at sg_get_dev() and sg_put_dev().  In my patch,
> I locked sg_index_lock in sg_put_dev() and called kref_put() while
> holding the lock.  In your patch, you moved the lock from sg_put_dev()
> to the destructor function.  This introduces a subtle race with
> sg_get_dev():
> 
> CPU 1:
> sg_put_dev()
>   kref_put(): refcount 1 -> 0
> 
> CPU 2:
> sg_get_dev()
>   lock sg_index_lock
>   idr_find()
>   kref_get(): refcount 0 -> 1: bug
>   unlock sg_index_lock
> 
> CPU 1:
> sg_device_release()
>   lock sg_index_lock
>   idr_remove
>   unlock sg_index_lock
>   kfree(sdp)
> 
> CPU 2:
> access sdp which has already been freed
> sg_put_dev(): double-free bug
> 
> Besides holding the lock during kref_put(), I also considered two
> other simple ways to avoid this race:
> 1) Do idr_remove() from sg_remove().
> 2) Return NULL in sg_get_dev() if sdp->detached.
> 
> However, both of these options would have changed the behavior of
> the /proc/scsi/sg/* functions that show information for devices that
> are in the process of being detached.  I wanted to fix bugs without
> changing other behavior, so I chose to call kref_put() under lock in
> my previous patches.

How about doing 2) and accessing to /proc/scsi/sg/* with
sg_index_lock (don't use sg_get_device for it).

What /proc/scsi/sg/* doing is abnormal from the perspective of the ref
counting (accessing to something that is going away).

>From the perspective of the ref counting, the best way is calling
idr_remove in from sg_remove but as you said, it's not nice to change
the behavior.

If we don't use sg_get_device for /proc/scsi/sg/*, then we use
sg_get_device for only sg_open So we can do 2) without changing the
behavior.


> A similar problem exists with sg_remove_sfp() in your patch.
> The refcount for the sg_fd can drop to 0 while a different CPU can
> still find a reference to it by scanning sg_device::headfp.  The good
> news is that most places that scan sg_fd's using sdp->headfp never
> drop sg_index_lock while still holding a reference, so for most cases
> it is safe for the destructor to work the way you have it.  The one
> exception is sg_get_nth_sfp(), which needs to kref_get() the sg_fd it
> returns (my version 2 patch doesn't fix this problem either since I
> just noticed it).  But then the kref_get() would have a race similar
> to the one I described above between sg_get_dev() and sg_put_dev().

If we go with the above way, we can solve this? sg_get_nth_sfp is used
for only /proc/scsi/sg/*.


> There are several solutions to this problem:
> 1) Call kref_put(&sfp->f_ref, sg_remove_sfp) while holding
> sg_index_lock, and have the destructor unlink sfp from sdp->headfp
> before dropping the lock.  This is ugly since the destructor may want
> to drop sg_index_lock (e.g. to call scsi_device_put()), but the saved
> irqflags variable is in a different function.
> 2) Unlink sfp from sdp->headfp in sg_release().  The downside to this
> is that the /proc/scsi/sg/* functions no longer show fds that have
> been closed but still have commands pending.
> 3) Set sfp->closed in sg_release() and have sg_get_nth_sfp() return
> NULL if sfp->closed.  Same downside as #2.
> 4) Add a variation of kref_get() that uses atomic_inc_not_zero().
> 
> When I was designing my previous patches, I anticipated running into
> these complications with kref, which is one reason I avoided it.
> Other people have experienced similar problems with the kref interface
> (search "kref_get_not_zero" or "cgroups: fix probable race with
> put_css_set[_taskexit] and find_css_set").  If you still want to use
> kref, I think that we are either going to have to change the behavior
> of /proc/scsi/sg/*, or else use atomic_inc_not_zero().  Since I still
> prefer not to change valid user-visible behavior in a patch that fixes
> so many possible oopses, I will go with atomic_inc_not_zero().  And,
> since I am going that route, I will use it for sg_get_dev() also,
> so that sg_put_dev() doesn't need to acquire sg_index_lock.
> 
> For the purposes of this patch, I added the local function
> sg_kref_get_not_zero() to sg.c.  It would be better to add
> kref_get_not_zero() to kref.c, but I see in the mailing list archives
> that there has been some resistance to that idea because it complicates
> the kref interface.  However, since it has come up more than once,
> perhaps it would be better to go ahead and make it an official part
> of the interface.  If anyone wants to support that idea, I will break
> it out into a separate patch.

I've not read the whole discussion, but using the common mechanism in
your original way is not good. It confuses other developers and other
developers don't like it.


> Your patch also has a problem with management of f_ref.  You do
> kref_get() in sg_add_request() and kref_put() in sg_rq_end_io()().
> However, an error can occur in between sg_add_request() and sending
> the command to the midlevel, so sg_rq_end_io() may never be called.
> It is better to do kref_get() just before sending the command to the
> midlevel; that way the refcount counts active commands waiting for
> the completion callback rather than prepared requests.

Oops, yeah, you are right.


> Your patch doesn't fix the problem of various functions calling
> SCSI_LOG_TIMEOUT with sdp->disk->disk_name after sg_remove() sets
> sdp->disk to NULL.  This is why I moved put_disk() from sg_remove()
> to sg_device_release().

As I wrote in the previous mail, I don't try to fix all the race in
this patch. But yeah, we need to clean up sdp->disk in
sg_device_release().


> Your patch doesn't fix the problem of forgetting to do
> wake_up_interruptible(&sdp->o_excl_wait) after removing sfp from
> sdp->headfp.  This is important if sg_release() is called with
> commands still active; the last command that finishes should wake up
> any O_EXCL waiters.

I think we can fix this even with my patch.


Thanks,

^ permalink raw reply	[flat|nested] 2+ messages in thread

* [PATCH 2/2] sg: fix races with ioctl(SG_IO)
  2009-01-19  6:57     ` FUJITA Tomonori
@ 2009-01-19 23:06       ` Tony Battersby
  0 siblings, 0 replies; 2+ messages in thread
From: Tony Battersby @ 2009-01-19 23:06 UTC (permalink / raw)
  To: James.Bottomley; +Cc: FUJITA Tomonori, dgilbert, hch, linux-scsi

sg_io_owned needs to be set before the command is sent to the midlevel;
otherwise, a quickly-completing command may cause a different CPU
to see "srp->done == 1 && !srp->sg_io_owned", which would lead to
incorrect behavior.

Check srp->done and set srp->orphan while holding rq_list_lock to
prevent races with sg_rq_end_io().

Signed-off-by: Tony Battersby <tonyb@cybernetics.com>
---

Same as before, just rediffed against v4 of the previous patch.

 sg.c |   21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

--- linux-2.6.29-rc2/drivers/scsi/sg.c.orig	2009-01-19 16:26:21.000000000 -0500
+++ linux-2.6.29-rc2/drivers/scsi/sg.c	2009-01-19 17:53:28.000000000 -0500
@@ -208,7 +208,7 @@ static ssize_t sg_new_read(Sg_fd * sfp, 
 			   Sg_request * srp);
 static ssize_t sg_new_write(Sg_fd *sfp, struct file *file,
 			const char __user *buf, size_t count, int blocking,
-			int read_only, Sg_request **o_srp);
+			int read_only, int sg_io_owned, Sg_request **o_srp);
 static int sg_common_write(Sg_fd * sfp, Sg_request * srp,
 			   unsigned char *cmnd, int timeout, int blocking);
 static int sg_read_oxfer(Sg_request * srp, char __user *outp, int num_read_xfer);
@@ -590,7 +590,8 @@ sg_write(struct file *filp, const char _
 		return -EFAULT;
 	blocking = !(filp->f_flags & O_NONBLOCK);
 	if (old_hdr.reply_len < 0)
-		return sg_new_write(sfp, filp, buf, count, blocking, 0, NULL);
+		return sg_new_write(sfp, filp, buf, count,
+				    blocking, 0, 0, NULL);
 	if (count < (SZ_SG_HEADER + 6))
 		return -EIO;	/* The minimum scsi command length is 6 bytes. */
 
@@ -671,7 +672,7 @@ sg_write(struct file *filp, const char _
 
 static ssize_t
 sg_new_write(Sg_fd *sfp, struct file *file, const char __user *buf,
-		 size_t count, int blocking, int read_only,
+		 size_t count, int blocking, int read_only, int sg_io_owned,
 		 Sg_request **o_srp)
 {
 	int k;
@@ -691,6 +692,7 @@ sg_new_write(Sg_fd *sfp, struct file *fi
 		SCSI_LOG_TIMEOUT(1, printk("sg_new_write: queue full\n"));
 		return -EDOM;
 	}
+	srp->sg_io_owned = sg_io_owned;
 	hp = &srp->header;
 	if (__copy_from_user(hp, buf, SZ_SG_IO_HDR)) {
 		sg_remove_request(sfp, srp);
@@ -838,10 +840,9 @@ sg_ioctl(struct inode *inode, struct fil
 				return -EFAULT;
 			result =
 			    sg_new_write(sfp, filp, p, SZ_SG_IO_HDR,
-					 blocking, read_only, &srp);
+					 blocking, read_only, 1, &srp);
 			if (result < 0)
 				return result;
-			srp->sg_io_owned = 1;
 			while (1) {
 				result = 0;	/* following macro to beat race condition */
 				__wait_event_interruptible(sfp->read_wait,
@@ -851,14 +852,16 @@ sg_ioctl(struct inode *inode, struct fil
 					return -ENODEV;
 				if (sfp->closed)
 					return 0;	/* request packet dropped already */
-				if (0 == result)
+				write_lock_irq(&sfp->rq_list_lock);
+				if (srp->done) {
+					srp->done = 2;
+					write_unlock_irq(&sfp->rq_list_lock);
 					break;
+				}
 				srp->orphan = 1;
+				write_unlock_irq(&sfp->rq_list_lock);
 				return result;	/* -ERESTARTSYS because signal hit process */
 			}
-			write_lock_irqsave(&sfp->rq_list_lock, iflags);
-			srp->done = 2;
-			write_unlock_irqrestore(&sfp->rq_list_lock, iflags);
 			result = sg_new_read(sfp, p, SZ_SG_IO_HDR, srp);
 			return (result < 0) ? result : 0;
 		}



^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2009-01-19 23:06 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-01-05 19:11 [PATCH 2/2] sg: fix races with ioctl(SG_IO) Tony Battersby
  -- strict thread matches above, loose matches on Subject: below --
2009-01-05 19:07 [PATCH 0/2] sg: fix races during device removal (v2) Tony Battersby
2009-01-10 17:26 ` FUJITA Tomonori
2009-01-14 20:31   ` Tony Battersby
2009-01-19  6:57     ` FUJITA Tomonori
2009-01-19 23:06       ` [PATCH 2/2] sg: fix races with ioctl(SG_IO) Tony Battersby

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox