public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* md versus partition scanning (bd_invalidated)
@ 2010-07-21 19:52 Dan Williams
  2010-07-28 19:23 ` Dan Williams
  0 siblings, 1 reply; 2+ messages in thread
From: Dan Williams @ 2010-07-21 19:52 UTC (permalink / raw)
  To: Jens Axboe, NeilBrown, Tejun Heo
  Cc: linux-kernel, linux-raid, Neubauer, Wojciech, Ed Ciechanowski

Hi,

We (Intel software raid devs) are seeing a problem with the detection of
partitions on md devices.  The trace below shows that the block device
inode is dropped in-between when the array comes up and when it is first
opened (timestamp 655.498875).  When this happens bdev->bd_invalidated
is cleared by bdget() and we never detect partitions.  (Seen on a 2.6.32
based kernel, but I presume it is still present)

# tracer: nop
#
#           TASK-PID    CPU#    TIMESTAMP  FUNCTION
#              | |       |          |         |
           <...>-1114  [000]   655.488730: check_disk_size_change: ffff8800374d3780: (md1) check_disk_size_change:1103 open: 1 invalidated: 0
           <...>-1114  [000]   655.497906: check_disk_size_change: ffff8800374d3780: (md1) check_disk_size_change:1112 open: 1 invalidated: 0
           <...>-1114  [000]   655.497908: flush_disk: ffff8800374d3780: (md1) flush_disk:1084 open: 1 invalidated: 1
           <...>-1114  [000]   655.497909: flush_disk: ffff8800374d3780: (md1) flush_disk:1086 open: 1 invalidated: 1
           <...>-1117  [003]   655.498875: bdget: ffff8800375aec40: () bdget:595 open: 0 invalidated: 0
           <...>-1117  [003]   655.498878: __blkdev_get: ffff8800375aec40: (md1) __blkdev_get:1229 open: 0 invalidated: 0
           <...>-1117  [003]   655.498879: __blkdev_get: ffff8800375aec40: (md1) __blkdev_get:1233 open: 0 invalidated: 0
           <...>-1117  [003]   655.498880: __blkdev_get: ffff8800375aec40: (md1) __blkdev_get:1239 open: 0 invalidated: 0
           <...>-1117  [003]   655.498882: __blkdev_get: ffff8800375aec40: (md1) __blkdev_get:1307 open: 1 invalidated: 0

These traces generated by:

#define dbg(bdev) ({ if (debug_partitions) {\
        char name[BDEVNAME_SIZE] = "";\
        if (bdev->bd_disk)\
                disk_name(bdev->bd_disk, 0, name);\
        trace_printk("%p: (%s) %s:%d open: %d invalidated: %d\n", bdev, name, __func__, __LINE__, bdev->bd_openers, bdev->bd_invalidated); \
        } 0; })

The patch below (2.6.32 based) moves the block_device bd_invalidated
field to a gendisk flag, as it seems this info wants a longer lifetime.

Thoughts on this fix?  Maybe it wants to be a standalone integer flag so
we don't need to add locking to nbd.

Thanks,
Dan

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index cc923a5..de8a4a4 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -607,8 +607,10 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
 			if (S_ISSOCK(inode->i_mode)) {
 				lo->file = file;
 				lo->sock = SOCKET_I(inode);
+				mutex_lock(&bdev->bd_mutex);
 				if (max_part > 0)
-					bdev->bd_invalidated = 1;
+					bdev->bd_disk->flags |= GENHD_FL_INVALIDATED;
+				mutex_unlock(&bdev->bd_mutex);
 				return 0;
 			} else {
 				fput(file);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6dcee88..147f449 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -571,7 +571,6 @@ struct block_device *bdget(dev_t dev)
 		bdev->bd_inode = inode;
 		bdev->bd_block_size = (1 << inode->i_blkbits);
 		bdev->bd_part_count = 0;
-		bdev->bd_invalidated = 0;
 		inode->i_mode = S_IFBLK;
 		inode->i_rdev = dev;
 		inode->i_bdev = bdev;
@@ -1069,7 +1068,7 @@ static void flush_disk(struct block_device *bdev)
 	if (!bdev->bd_disk)
 		return;
 	if (disk_partitionable(bdev->bd_disk))
-		bdev->bd_invalidated = 1;
+		bdev->bd_disk->flags |= GENHD_FL_INVALIDATED;
 }
 
 /**
@@ -1243,7 +1242,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 					bdi = &default_backing_dev_info;
 				bdev->bd_inode->i_data.backing_dev_info = bdi;
 			}
-			if (bdev->bd_invalidated)
+			if (bdev->bd_disk->flags & GENHD_FL_INVALIDATED)
 				rescan_partitions(disk, bdev);
 		} else {
 			struct block_device *whole;
@@ -1276,7 +1275,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				if (ret)
 					goto out_unlock_bdev;
 			}
-			if (bdev->bd_invalidated)
+			if (bdev->bd_disk->flags & GENHD_FL_INVALIDATED)
 				rescan_partitions(bdev->bd_disk, bdev);
 		}
 	}
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index e8865c1..7872269 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -519,7 +519,7 @@ void register_disk(struct gendisk *disk)
 	if (!bdev)
 		goto exit;
 
-	bdev->bd_invalidated = 1;
+	disk->flags |= GENHD_FL_INVALIDATED;
 	err = blkdev_get(bdev, FMODE_READ);
 	if (err < 0)
 		goto exit;
@@ -558,7 +558,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 	if (disk->fops->revalidate_disk)
 		disk->fops->revalidate_disk(disk);
 	check_disk_size_change(disk, bdev);
-	bdev->bd_invalidated = 0;
+	bdev->bd_disk->flags &= ~GENHD_FL_INVALIDATED;
 	if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
 		return 0;
 	if (IS_ERR(state))	/* I/O error reading the partition table */
@@ -609,7 +609,7 @@ try_scan:
 				if (capacity > get_capacity(disk)) {
 					set_capacity(disk, capacity);
 					check_disk_size_change(disk, bdev);
-					bdev->bd_invalidated = 0;
+					bdev->bd_disk->flags &= ~GENHD_FL_INVALIDATED;
 				}
 				goto try_scan;
 			} else {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1e58fc8..367875a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -661,7 +661,6 @@ struct block_device {
 	struct hd_struct *	bd_part;
 	/* number of times partitions within this device have been opened. */
 	unsigned		bd_part_count;
-	int			bd_invalidated;
 	struct gendisk *	bd_disk;
 	struct list_head	bd_list;
 	/*
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index c6c0c41..d97cdec 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -116,6 +116,7 @@ struct hd_struct {
 #define GENHD_FL_SUPPRESS_PARTITION_INFO	32
 #define GENHD_FL_EXT_DEVT			64 /* allow extended devt */
 #define GENHD_FL_NATIVE_CAPACITY		128
+#define GENHD_FL_INVALIDATED			256
 
 #define BLK_SCSI_MAX_CMDS	(256)
 #define BLK_SCSI_CMD_PER_LONG	(BLK_SCSI_MAX_CMDS / (sizeof(long) * 8))



^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: md versus partition scanning (bd_invalidated)
  2010-07-21 19:52 md versus partition scanning (bd_invalidated) Dan Williams
@ 2010-07-28 19:23 ` Dan Williams
  0 siblings, 0 replies; 2+ messages in thread
From: Dan Williams @ 2010-07-28 19:23 UTC (permalink / raw)
  To: Jens Axboe, NeilBrown, Tejun Heo
  Cc: linux-kernel, linux-raid, Neubauer, Wojciech, Ed Ciechanowski

On Wed, Jul 21, 2010 at 12:52 PM, Dan Williams <dan.j.williams@intel.com> wrote:
> Hi,
>
> We (Intel software raid devs) are seeing a problem with the detection of
> partitions on md devices.  The trace below shows that the block device
> inode is dropped in-between when the array comes up and when it is first
> opened (timestamp 655.498875).  When this happens bdev->bd_invalidated
> is cleared by bdget() and we never detect partitions.  (Seen on a 2.6.32
> based kernel, but I presume it is still present)

The kernel in question was missing commit f3b99be1 "Restore partition
detection of newly created md arrays" [1].  With that in place the
problem goes away.  I haven't fully convinced myself that there isn't
a small race that could be triggered by changing the array size via
sysfs, to get the the gendisk size updated and then dropping all inode
references prior to the first open... but I'll put that investigation
on the back burner.

--
Dan

[1]: http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=f3b99be1

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2010-07-28 19:23 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-07-21 19:52 md versus partition scanning (bd_invalidated) Dan Williams
2010-07-28 19:23 ` Dan Williams

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox