public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Dan Williams <dan.j.williams@intel.com>
To: Jens Axboe <axboe@kernel.dk>, NeilBrown <neilb@suse.de>,
	Tejun Heo <htejun@gmail.com>
Cc: linux-kernel <linux-kernel@vger.kernel.org>,
	linux-raid <linux-raid@vger.kernel.org>,
	"Neubauer, Wojciech" <Wojciech.Neubauer@intel.com>,
	Ed Ciechanowski <ed.ciechanowski@intel.com>
Subject: md versus partition scanning (bd_invalidated)
Date: Wed, 21 Jul 2010 12:52:46 -0700	[thread overview]
Message-ID: <1279741966.25713.31.camel@dwillia2-linux> (raw)

Hi,

We (Intel software raid devs) are seeing a problem with the detection of
partitions on md devices.  The trace below shows that the block device
inode is dropped in-between when the array comes up and when it is first
opened (timestamp 655.498875).  When this happens bdev->bd_invalidated
is cleared by bdget() and we never detect partitions.  (Seen on a 2.6.32
based kernel, but I presume it is still present)

# tracer: nop
#
#           TASK-PID    CPU#    TIMESTAMP  FUNCTION
#              | |       |          |         |
           <...>-1114  [000]   655.488730: check_disk_size_change: ffff8800374d3780: (md1) check_disk_size_change:1103 open: 1 invalidated: 0
           <...>-1114  [000]   655.497906: check_disk_size_change: ffff8800374d3780: (md1) check_disk_size_change:1112 open: 1 invalidated: 0
           <...>-1114  [000]   655.497908: flush_disk: ffff8800374d3780: (md1) flush_disk:1084 open: 1 invalidated: 1
           <...>-1114  [000]   655.497909: flush_disk: ffff8800374d3780: (md1) flush_disk:1086 open: 1 invalidated: 1
           <...>-1117  [003]   655.498875: bdget: ffff8800375aec40: () bdget:595 open: 0 invalidated: 0
           <...>-1117  [003]   655.498878: __blkdev_get: ffff8800375aec40: (md1) __blkdev_get:1229 open: 0 invalidated: 0
           <...>-1117  [003]   655.498879: __blkdev_get: ffff8800375aec40: (md1) __blkdev_get:1233 open: 0 invalidated: 0
           <...>-1117  [003]   655.498880: __blkdev_get: ffff8800375aec40: (md1) __blkdev_get:1239 open: 0 invalidated: 0
           <...>-1117  [003]   655.498882: __blkdev_get: ffff8800375aec40: (md1) __blkdev_get:1307 open: 1 invalidated: 0

These traces generated by:

#define dbg(bdev) ({ if (debug_partitions) {\
        char name[BDEVNAME_SIZE] = "";\
        if (bdev->bd_disk)\
                disk_name(bdev->bd_disk, 0, name);\
        trace_printk("%p: (%s) %s:%d open: %d invalidated: %d\n", bdev, name, __func__, __LINE__, bdev->bd_openers, bdev->bd_invalidated); \
        } 0; })

The patch below (2.6.32 based) moves the block_device bd_invalidated
field to a gendisk flag, as it seems this info wants a longer lifetime.

Thoughts on this fix?  Maybe it wants to be a standalone integer flag so
we don't need to add locking to nbd.

Thanks,
Dan

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index cc923a5..de8a4a4 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -607,8 +607,10 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
 			if (S_ISSOCK(inode->i_mode)) {
 				lo->file = file;
 				lo->sock = SOCKET_I(inode);
+				mutex_lock(&bdev->bd_mutex);
 				if (max_part > 0)
-					bdev->bd_invalidated = 1;
+					bdev->bd_disk->flags |= GENHD_FL_INVALIDATED;
+				mutex_unlock(&bdev->bd_mutex);
 				return 0;
 			} else {
 				fput(file);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6dcee88..147f449 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -571,7 +571,6 @@ struct block_device *bdget(dev_t dev)
 		bdev->bd_inode = inode;
 		bdev->bd_block_size = (1 << inode->i_blkbits);
 		bdev->bd_part_count = 0;
-		bdev->bd_invalidated = 0;
 		inode->i_mode = S_IFBLK;
 		inode->i_rdev = dev;
 		inode->i_bdev = bdev;
@@ -1069,7 +1068,7 @@ static void flush_disk(struct block_device *bdev)
 	if (!bdev->bd_disk)
 		return;
 	if (disk_partitionable(bdev->bd_disk))
-		bdev->bd_invalidated = 1;
+		bdev->bd_disk->flags |= GENHD_FL_INVALIDATED;
 }
 
 /**
@@ -1243,7 +1242,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 					bdi = &default_backing_dev_info;
 				bdev->bd_inode->i_data.backing_dev_info = bdi;
 			}
-			if (bdev->bd_invalidated)
+			if (bdev->bd_disk->flags & GENHD_FL_INVALIDATED)
 				rescan_partitions(disk, bdev);
 		} else {
 			struct block_device *whole;
@@ -1276,7 +1275,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				if (ret)
 					goto out_unlock_bdev;
 			}
-			if (bdev->bd_invalidated)
+			if (bdev->bd_disk->flags & GENHD_FL_INVALIDATED)
 				rescan_partitions(bdev->bd_disk, bdev);
 		}
 	}
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index e8865c1..7872269 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -519,7 +519,7 @@ void register_disk(struct gendisk *disk)
 	if (!bdev)
 		goto exit;
 
-	bdev->bd_invalidated = 1;
+	disk->flags |= GENHD_FL_INVALIDATED;
 	err = blkdev_get(bdev, FMODE_READ);
 	if (err < 0)
 		goto exit;
@@ -558,7 +558,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 	if (disk->fops->revalidate_disk)
 		disk->fops->revalidate_disk(disk);
 	check_disk_size_change(disk, bdev);
-	bdev->bd_invalidated = 0;
+	bdev->bd_disk->flags &= ~GENHD_FL_INVALIDATED;
 	if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
 		return 0;
 	if (IS_ERR(state))	/* I/O error reading the partition table */
@@ -609,7 +609,7 @@ try_scan:
 				if (capacity > get_capacity(disk)) {
 					set_capacity(disk, capacity);
 					check_disk_size_change(disk, bdev);
-					bdev->bd_invalidated = 0;
+					bdev->bd_disk->flags &= ~GENHD_FL_INVALIDATED;
 				}
 				goto try_scan;
 			} else {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1e58fc8..367875a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -661,7 +661,6 @@ struct block_device {
 	struct hd_struct *	bd_part;
 	/* number of times partitions within this device have been opened. */
 	unsigned		bd_part_count;
-	int			bd_invalidated;
 	struct gendisk *	bd_disk;
 	struct list_head	bd_list;
 	/*
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index c6c0c41..d97cdec 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -116,6 +116,7 @@ struct hd_struct {
 #define GENHD_FL_SUPPRESS_PARTITION_INFO	32
 #define GENHD_FL_EXT_DEVT			64 /* allow extended devt */
 #define GENHD_FL_NATIVE_CAPACITY		128
+#define GENHD_FL_INVALIDATED			256
 
 #define BLK_SCSI_MAX_CMDS	(256)
 #define BLK_SCSI_CMD_PER_LONG	(BLK_SCSI_MAX_CMDS / (sizeof(long) * 8))



             reply	other threads:[~2010-07-21 19:44 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-07-21 19:52 Dan Williams [this message]
2010-07-28 19:23 ` md versus partition scanning (bd_invalidated) Dan Williams

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1279741966.25713.31.camel@dwillia2-linux \
    --to=dan.j.williams@intel.com \
    --cc=Wojciech.Neubauer@intel.com \
    --cc=axboe@kernel.dk \
    --cc=ed.ciechanowski@intel.com \
    --cc=htejun@gmail.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-raid@vger.kernel.org \
    --cc=neilb@suse.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox