linux-raid.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] md - 1 of 7 - Print "deprecated" warning when START_ARRAY is used.
  2004-02-06  5:35 [PATCH] md - 0 of 7 - Introduction NeilBrown
@ 2004-02-06  5:35 ` NeilBrown
  2004-02-06  5:35 ` [PATCH] md - 3 of 7 - Discard the cmd field from r1_bio structure NeilBrown
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: NeilBrown @ 2004-02-06  5:35 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-raid


The "START_ARRAY" ioctl depends on major/minor numbers (as stored
in the raid superblock) being stable over reboots, which is increasingly
untrue.
There are better ways to start an array (e.g. with mdadm) so we 
mark the ioctl as deprecated for 2.6, and will remove it in 2.7.

 ----------- Diffstat output ------------
 ./drivers/md/md.c |    8 ++++++++
 1 files changed, 8 insertions(+)

diff ./drivers/md/md.c~current~ ./drivers/md/md.c
--- ./drivers/md/md.c~current~	2004-02-06 16:17:55.000000000 +1100
+++ ./drivers/md/md.c	2004-02-06 16:17:55.000000000 +1100
@@ -2419,6 +2419,14 @@ static int md_ioctl(struct inode *inode,
 		/* START_ARRAY doesn't need to lock the array as autostart_array
 		 * does the locking, and it could even be a different array
 		 */
+		static int cnt = 3;
+		if (cnt > 0 ) {
+			printk(KERN_WARNING 
+			       "md: %s(pid %d) used deprecated START_ARRAY ioctl. "
+			       "This will not be supported beyond 2.6\n",
+			       current->comm, current->pid);
+			cnt--;
+		}
 		err = autostart_array(new_decode_dev(arg));
 		if (err) {
 			printk(KERN_WARNING "md: autostart %s failed!\n",

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH] md - 0 of 7 - Introduction
@ 2004-02-06  5:35 NeilBrown
  2004-02-06  5:35 ` [PATCH] md - 1 of 7 - Print "deprecated" warning when START_ARRAY is used NeilBrown
                   ` (6 more replies)
  0 siblings, 7 replies; 8+ messages in thread
From: NeilBrown @ 2004-02-06  5:35 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-raid, Christoph Hellwig, Al Viro

Hi again.

Here are 7 patches for md in 2.6.3-pre-current

The first flags the START_ARRAY as deprecated - I plan to remove it in 2.7.
The comment with that patch explains why.

The next 5 provide some cleanup of raid1 code leading to a patch which
enables raid1 to safely use large requests when resyncing (it uses
bio_add_page to make sure each page is allowed).  This substantially increases
resync speed in at least one test.

The final patch is a new version of the partitioning patch.
It removes the rather ugly code for forcing the partitions to be
re-evaluated and instead simply arranges for them to be reread
on the next open.  This fixes the one bug that was reported against the
patch.
I have also taken on board some of the feedback I got about the patch.

NeilBrown

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH] md - 2 of 7 - Split read and write end_request handlers
  2004-02-06  5:35 [PATCH] md - 0 of 7 - Introduction NeilBrown
  2004-02-06  5:35 ` [PATCH] md - 1 of 7 - Print "deprecated" warning when START_ARRAY is used NeilBrown
  2004-02-06  5:35 ` [PATCH] md - 3 of 7 - Discard the cmd field from r1_bio structure NeilBrown
@ 2004-02-06  5:35 ` NeilBrown
  2004-02-06  5:35 ` [PATCH] md - 5 of 7 - Avoid unnecessary bio allocation during raid1 resync NeilBrown
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: NeilBrown @ 2004-02-06  5:35 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-raid


Instead of having a single end_request handler that must
determine whether it was a read or a write request, we have
two separate handlers, which makes each of them easier to follow.

 ----------- Diffstat output ------------
 ./drivers/md/raid1.c |  102 +++++++++++++++++++++++++++++++++------------------
 1 files changed, 66 insertions(+), 36 deletions(-)

diff ./drivers/md/raid1.c~current~ ./drivers/md/raid1.c
--- ./drivers/md/raid1.c~current~	2004-02-06 16:18:22.000000000 +1100
+++ ./drivers/md/raid1.c	2004-02-06 16:18:22.000000000 +1100
@@ -261,7 +261,7 @@ static inline void update_head_pos(int d
 		r1_bio->sector + (r1_bio->master_bio->bi_size >> 9);
 }
 
-static int raid1_end_request(struct bio *bio, unsigned int bytes_done, int error)
+static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int error)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
@@ -271,13 +271,7 @@ static int raid1_end_request(struct bio 
 	if (bio->bi_size)
 		return 1;
 	
-	if (r1_bio->cmd == READ || r1_bio->cmd == READA)
-		mirror = r1_bio->read_disk;
-	else {
-		for (mirror = 0; mirror < conf->raid_disks; mirror++)
-			if (r1_bio->write_bios[mirror] == bio)
-				break;
-	}
+	mirror = r1_bio->read_disk;
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
@@ -296,42 +290,78 @@ static int raid1_end_request(struct bio 
 		set_bit(R1BIO_Uptodate, &r1_bio->state);
 
 	update_head_pos(mirror, r1_bio);
-	if ((r1_bio->cmd == READ) || (r1_bio->cmd == READA)) {
-		if (!r1_bio->read_bio)
-			BUG();
+
+	if (!r1_bio->read_bio)
+		BUG();
+	/*
+	 * we have only one bio on the read side
+	 */
+	if (uptodate)
+		raid_end_bio_io(r1_bio);
+	else {
 		/*
-		 * we have only one bio on the read side
+		 * oops, read error:
 		 */
-		if (uptodate)
-			raid_end_bio_io(r1_bio);
-		else {
-			/*
-			 * oops, read error:
-			 */
-			char b[BDEVNAME_SIZE];
-			printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n",
-				bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
-			reschedule_retry(r1_bio);
-		}
-	} else {
+		char b[BDEVNAME_SIZE];
+		printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n",
+		       bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
+		reschedule_retry(r1_bio);
+	}
 
-		if (r1_bio->read_bio)
-			BUG();
+	atomic_dec(&conf->mirrors[mirror].rdev->nr_pending);
+	return 0;
+}
+
+static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int error)
+{
+	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
+	int mirror;
+	conf_t *conf = mddev_to_conf(r1_bio->mddev);
+
+	if (bio->bi_size)
+		return 1;
+	
+	for (mirror = 0; mirror < conf->raid_disks; mirror++)
+		if (r1_bio->write_bios[mirror] == bio)
+			break;
+
+	/*
+	 * this branch is our 'one mirror IO has finished' event handler:
+	 */
+	if (!uptodate)
+		md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
+	else
 		/*
-		 * WRITE:
+		 * Set R1BIO_Uptodate in our master bio, so that
+		 * we will return a good error code for to the higher
+		 * levels even if IO on some other mirrored buffer fails.
 		 *
-		 * Let's see if all mirrored write operations have finished
-		 * already.
+		 * The 'master' represents the composite IO operation to
+		 * user-side. So if something waits for IO, then it will
+		 * wait for the 'master' bio.
 		 */
-		if (atomic_dec_and_test(&r1_bio->remaining)) {
-			md_write_end(r1_bio->mddev);
-			raid_end_bio_io(r1_bio);
-		}	
-	}
+		set_bit(R1BIO_Uptodate, &r1_bio->state);
+
+	update_head_pos(mirror, r1_bio);
+
+	if (r1_bio->read_bio)
+		BUG();
+	/*
+	 *
+	 * Let's see if all mirrored write operations have finished
+	 * already.
+	 */
+	if (atomic_dec_and_test(&r1_bio->remaining)) {
+		md_write_end(r1_bio->mddev);
+		raid_end_bio_io(r1_bio);
+	}	
+
 	atomic_dec(&conf->mirrors[mirror].rdev->nr_pending);
 	return 0;
 }
 
+
 /*
  * This routine returns the disk from which the requested read should
  * be done. There is a per-array 'next expected sequential IO' sector
@@ -508,7 +538,7 @@ static int make_request(request_queue_t 
 
 		read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
 		read_bio->bi_bdev = mirror->rdev->bdev;
-		read_bio->bi_end_io = raid1_end_request;
+		read_bio->bi_end_io = raid1_end_read_request;
 		read_bio->bi_rw = r1_bio->cmd;
 		read_bio->bi_private = r1_bio;
 
@@ -546,7 +576,7 @@ static int make_request(request_queue_t 
 
 		mbio->bi_sector	= r1_bio->sector + conf->mirrors[i].rdev->data_offset;
 		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
-		mbio->bi_end_io	= raid1_end_request;
+		mbio->bi_end_io	= raid1_end_write_request;
 		mbio->bi_rw = r1_bio->cmd;
 		mbio->bi_private = r1_bio;
 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH] md - 3 of 7 - Discard the cmd field from r1_bio structure
  2004-02-06  5:35 [PATCH] md - 0 of 7 - Introduction NeilBrown
  2004-02-06  5:35 ` [PATCH] md - 1 of 7 - Print "deprecated" warning when START_ARRAY is used NeilBrown
@ 2004-02-06  5:35 ` NeilBrown
  2004-02-06  5:35 ` [PATCH] md - 2 of 7 - Split read and write end_request handlers NeilBrown
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: NeilBrown @ 2004-02-06  5:35 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-raid


The only time it is really needed is to differentiate a retry-on-fail
from a write-after-read-for-resync request to raid1d.
So we use a bit in 'state' for that.

 ----------- Diffstat output ------------
 ./drivers/md/raid1.c         |   43 +++++++++++++++++++------------------------
 ./include/linux/raid/raid1.h |    5 ++---
 2 files changed, 21 insertions(+), 27 deletions(-)

diff ./drivers/md/raid1.c~current~ ./drivers/md/raid1.c
--- ./drivers/md/raid1.c~current~	2004-02-06 16:18:22.000000000 +1100
+++ ./drivers/md/raid1.c	2004-02-06 16:18:52.000000000 +1100
@@ -523,9 +523,8 @@ static int make_request(request_queue_t 
 
 	r1_bio->mddev = mddev;
 	r1_bio->sector = bio->bi_sector;
-	r1_bio->cmd = bio_data_dir(bio);
 
-	if (r1_bio->cmd == READ) {
+	if (bio_data_dir(bio) == READ) {
 		/*
 		 * read balancing logic:
 		 */
@@ -539,7 +538,7 @@ static int make_request(request_queue_t 
 		read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
 		read_bio->bi_bdev = mirror->rdev->bdev;
 		read_bio->bi_end_io = raid1_end_read_request;
-		read_bio->bi_rw = r1_bio->cmd;
+		read_bio->bi_rw = READ;
 		read_bio->bi_private = r1_bio;
 
 		generic_make_request(read_bio);
@@ -577,7 +576,7 @@ static int make_request(request_queue_t 
 		mbio->bi_sector	= r1_bio->sector + conf->mirrors[i].rdev->data_offset;
 		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
 		mbio->bi_end_io	= raid1_end_write_request;
-		mbio->bi_rw = r1_bio->cmd;
+		mbio->bi_rw = WRITE;
 		mbio->bi_private = r1_bio;
 
 		atomic_inc(&r1_bio->remaining);
@@ -926,30 +925,26 @@ static void raid1d(mddev_t *mddev)
 		mddev = r1_bio->mddev;
 		conf = mddev_to_conf(mddev);
 		bio = r1_bio->master_bio;
-		switch(r1_bio->cmd) {
-		case SPECIAL:
+		if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
 			sync_request_write(mddev, r1_bio);
-			break;
-		case READ:
-		case READA:
+		} else {
 			if (map(mddev, &rdev) == -1) {
 				printk(KERN_ALERT "raid1: %s: unrecoverable I/O"
-				" read error for block %llu\n",
-				bdevname(bio->bi_bdev,b),
-				(unsigned long long)r1_bio->sector);
+				       " read error for block %llu\n",
+				       bdevname(bio->bi_bdev,b),
+				       (unsigned long long)r1_bio->sector);
 				raid_end_bio_io(r1_bio);
-				break;
-			}
-			printk(KERN_ERR "raid1: %s: redirecting sector %llu to"
-				" another mirror\n",
-				bdevname(rdev->bdev,b),
-				(unsigned long long)r1_bio->sector);
-			bio->bi_bdev = rdev->bdev;
-			bio->bi_sector = r1_bio->sector + rdev->data_offset;
-			bio->bi_rw = r1_bio->cmd;
+			} else {
+				printk(KERN_ERR "raid1: %s: redirecting sector %llu to"
+				       " another mirror\n",
+				       bdevname(rdev->bdev,b),
+				       (unsigned long long)r1_bio->sector);
+				bio->bi_bdev = rdev->bdev;
+				bio->bi_sector = r1_bio->sector + rdev->data_offset;
+				bio->bi_rw = READ;
 
-			generic_make_request(bio);
-			break;
+				generic_make_request(bio);
+			}
 		}
 	}
 	spin_unlock_irqrestore(&retry_list_lock, flags);
@@ -1037,7 +1032,7 @@ static int sync_request(mddev_t *mddev, 
 
 	r1_bio->mddev = mddev;
 	r1_bio->sector = sector_nr;
-	r1_bio->cmd = SPECIAL;
+	set_bit(R1BIO_IsSync, &r1_bio->state);
 	r1_bio->read_disk = disk;
 
 	bio = r1_bio->master_bio;

diff ./include/linux/raid/raid1.h~current~ ./include/linux/raid/raid1.h
--- ./include/linux/raid/raid1.h~current~	2004-02-06 16:18:52.000000000 +1100
+++ ./include/linux/raid/raid1.h	2004-02-06 16:18:52.000000000 +1100
@@ -54,7 +54,6 @@ struct r1bio_s {
 	atomic_t		remaining; /* 'have we finished' count,
 					    * used from IRQ handlers
 					    */
-	int			cmd;
 	sector_t		sector;
 	unsigned long		state;
 	mddev_t			*mddev;
@@ -78,6 +77,6 @@ struct r1bio_s {
 };
 
 /* bits for r1bio.state */
-#define	R1BIO_Uptodate	1
-
+#define	R1BIO_Uptodate	0
+#define	R1BIO_IsSync	1
 #endif

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH] md - 4 of 7 - Remove some un-needed fields from r1bio_s
  2004-02-06  5:35 [PATCH] md - 0 of 7 - Introduction NeilBrown
                   ` (3 preceding siblings ...)
  2004-02-06  5:35 ` [PATCH] md - 5 of 7 - Avoid unnecessary bio allocation during raid1 resync NeilBrown
@ 2004-02-06  5:35 ` NeilBrown
  2004-02-06  5:35 ` [PATCH] md - 6 of 7 - Dynamically limit size of bio requests used for raid1 resync NeilBrown
  2004-02-06  5:35 ` [PATCH] md - 7 of 7 - Allow partitioning of MD devices NeilBrown
  6 siblings, 0 replies; 8+ messages in thread
From: NeilBrown @ 2004-02-06  5:35 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-raid


next_r1 is never used, so it can just go.

read_bio isn't needed as we can easily use one of the pointers
in the write_bios array - write_bios[->read_disk].
So rename "write_bios" to "bios" and store the pointer to the
read bio in there.

 ----------- Diffstat output ------------
 ./drivers/md/raid1.c         |   55 ++++++++++++++-----------------------------
 ./include/linux/raid/raid1.h |    6 +---
 2 files changed, 21 insertions(+), 40 deletions(-)

diff ./drivers/md/raid1.c~current~ ./drivers/md/raid1.c
--- ./drivers/md/raid1.c~current~	2004-02-06 16:18:52.000000000 +1100
+++ ./drivers/md/raid1.c	2004-02-06 16:19:15.000000000 +1100
@@ -42,7 +42,7 @@ static void * r1bio_pool_alloc(int gfp_f
 	mddev_t *mddev = data;
 	r1bio_t *r1_bio;
 
-	/* allocate a r1bio with room for raid_disks entries in the write_bios array */
+	/* allocate a r1bio with room for raid_disks entries in the bios array */
 	r1_bio = kmalloc(sizeof(r1bio_t) + sizeof(struct bio*)*mddev->raid_disks,
 			 gfp_flags);
 	if (r1_bio)
@@ -132,19 +132,10 @@ static void put_all_bios(conf_t *conf, r
 {
 	int i;
 
-	if (r1_bio->read_bio) {
-		if (atomic_read(&r1_bio->read_bio->bi_cnt) != 1)
-			BUG();
-		bio_put(r1_bio->read_bio);
-		r1_bio->read_bio = NULL;
-	}
 	for (i = 0; i < conf->raid_disks; i++) {
-		struct bio **bio = r1_bio->write_bios + i;
-		if (*bio) {
-			if (atomic_read(&(*bio)->bi_cnt) != 1)
-				BUG();
+		struct bio **bio = r1_bio->bios + i;
+		if (*bio)
 			bio_put(*bio);
-		}
 		*bio = NULL;
 	}
 }
@@ -291,8 +282,6 @@ static int raid1_end_read_request(struct
 
 	update_head_pos(mirror, r1_bio);
 
-	if (!r1_bio->read_bio)
-		BUG();
 	/*
 	 * we have only one bio on the read side
 	 */
@@ -323,7 +312,7 @@ static int raid1_end_write_request(struc
 		return 1;
 	
 	for (mirror = 0; mirror < conf->raid_disks; mirror++)
-		if (r1_bio->write_bios[mirror] == bio)
+		if (r1_bio->bios[mirror] == bio)
 			break;
 
 	/*
@@ -345,8 +334,6 @@ static int raid1_end_write_request(struc
 
 	update_head_pos(mirror, r1_bio);
 
-	if (r1_bio->read_bio)
-		BUG();
 	/*
 	 *
 	 * Let's see if all mirrored write operations have finished
@@ -531,9 +518,8 @@ static int make_request(request_queue_t 
 		mirror = conf->mirrors + read_balance(conf, bio, r1_bio);
 
 		read_bio = bio_clone(bio, GFP_NOIO);
-		if (r1_bio->read_bio)
-			BUG();
-		r1_bio->read_bio = read_bio;
+
+		r1_bio->bios[r1_bio->read_disk] = read_bio;
 
 		read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
 		read_bio->bi_bdev = mirror->rdev->bdev;
@@ -550,16 +536,16 @@ static int make_request(request_queue_t 
 	 */
 	/* first select target devices under spinlock and
 	 * inc refcount on their rdev.  Record them by setting
-	 * write_bios[x] to bio
+	 * bios[x] to bio
 	 */
 	spin_lock_irq(&conf->device_lock);
 	for (i = 0;  i < disks; i++) {
 		if (conf->mirrors[i].rdev &&
 		    !conf->mirrors[i].rdev->faulty) {
 			atomic_inc(&conf->mirrors[i].rdev->nr_pending);
-			r1_bio->write_bios[i] = bio;
+			r1_bio->bios[i] = bio;
 		} else
-			r1_bio->write_bios[i] = NULL;
+			r1_bio->bios[i] = NULL;
 	}
 	spin_unlock_irq(&conf->device_lock);
 
@@ -567,11 +553,11 @@ static int make_request(request_queue_t 
 	md_write_start(mddev);
 	for (i = 0; i < disks; i++) {
 		struct bio *mbio;
-		if (!r1_bio->write_bios[i])
+		if (!r1_bio->bios[i])
 			continue;
 
 		mbio = bio_clone(bio, GFP_NOIO);
-		r1_bio->write_bios[i] = mbio;
+		r1_bio->bios[i] = mbio;
 
 		mbio->bi_sector	= r1_bio->sector + conf->mirrors[i].rdev->data_offset;
 		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
@@ -773,7 +759,7 @@ static int end_sync_read(struct bio *bio
 	if (bio->bi_size)
 		return 1;
 
-	if (r1_bio->read_bio != bio)
+	if (r1_bio->bios[r1_bio->read_disk] != bio)
 		BUG();
 	update_head_pos(r1_bio->read_disk, r1_bio);
 	/*
@@ -804,7 +790,7 @@ static int end_sync_write(struct bio *bi
 		return 1;
 
 	for (i = 0; i < conf->raid_disks; i++)
-		if (r1_bio->write_bios[i] == bio) {
+		if (r1_bio->bios[i] == bio) {
 			mirror = i;
 			break;
 		}
@@ -850,11 +836,11 @@ static void sync_request_write(mddev_t *
 
 	spin_lock_irq(&conf->device_lock);
 	for (i = 0; i < disks ; i++) {
-		r1_bio->write_bios[i] = NULL;
+		r1_bio->bios[i] = NULL;
 		if (!conf->mirrors[i].rdev || 
 		    conf->mirrors[i].rdev->faulty)
 			continue;
-		if (conf->mirrors[i].rdev->bdev == bio->bi_bdev)
+		if (i == r1_bio->read_disk)
 			/*
 			 * we read from here, no need to write
 			 */
@@ -866,16 +852,16 @@ static void sync_request_write(mddev_t *
 			 */
 			continue;
 		atomic_inc(&conf->mirrors[i].rdev->nr_pending);
-		r1_bio->write_bios[i] = bio;
+		r1_bio->bios[i] = bio;
 	}
 	spin_unlock_irq(&conf->device_lock);
 
 	atomic_set(&r1_bio->remaining, 1);
 	for (i = disks; i-- ; ) {
-		if (!r1_bio->write_bios[i])
+		if (!r1_bio->bios[i])
 			continue;
 		mbio = bio_clone(bio, GFP_NOIO);
-		r1_bio->write_bios[i] = mbio;
+		r1_bio->bios[i] = mbio;
 		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
 		mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
 		mbio->bi_end_io	= end_sync_write;
@@ -1056,10 +1042,7 @@ static int sync_request(mddev_t *mddev, 
 	read_bio->bi_end_io = end_sync_read;
 	read_bio->bi_rw = READ;
 	read_bio->bi_private = r1_bio;
-
-	if (r1_bio->read_bio)
-		BUG();
-	r1_bio->read_bio = read_bio;
+	r1_bio->bios[r1_bio->read_disk] = read_bio;
 
 	md_sync_acct(mirror->rdev, nr_sectors);
 

diff ./include/linux/raid/raid1.h~current~ ./include/linux/raid/raid1.h
--- ./include/linux/raid/raid1.h~current~	2004-02-06 16:18:52.000000000 +1100
+++ ./include/linux/raid/raid1.h	2004-02-06 16:19:15.000000000 +1100
@@ -62,18 +62,16 @@ struct r1bio_s {
 	 */
 	struct bio		*master_bio;
 	/*
-	 * if the IO is in READ direction, then this bio is used:
+	 * if the IO is in READ direction, then this is where we read
 	 */
-	struct bio		*read_bio;
 	int			read_disk;
 
-	r1bio_t			*next_r1; /* next for retry or in free list */
 	struct list_head	retry_list;
 	/*
 	 * if the IO is in WRITE direction, then multiple bios are used.
 	 * We choose the number when they are allocated.
 	 */
-	struct bio		*write_bios[0];
+	struct bio		*bios[0];
 };
 
 /* bits for r1bio.state */

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH] md - 5 of 7 - Avoid unnecessary bio allocation during raid1 resync
  2004-02-06  5:35 [PATCH] md - 0 of 7 - Introduction NeilBrown
                   ` (2 preceding siblings ...)
  2004-02-06  5:35 ` [PATCH] md - 2 of 7 - Split read and write end_request handlers NeilBrown
@ 2004-02-06  5:35 ` NeilBrown
  2004-02-06  5:35 ` [PATCH] md - 4 of 7 - Remove some un-needed fields from r1bio_s NeilBrown
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: NeilBrown @ 2004-02-06  5:35 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-raid


For each resync request, we allocate a "r1_bio" which has a
bio "master_bio" attached that goes largely unused. We also
allocate a read_bio which is used.
This patch removes the read_bio and just uses the master_bio instead.

This fixes a bug wherein bi_bdev of the master_bio wasn't being set,
but was being used.

We also introduce a new "sectors" field into the r1_bio as we can
no longer rely on master_bio->bi_size.

 ----------- Diffstat output ------------
 ./drivers/md/raid1.c         |   37 ++++++++++++++++++-------------------
 ./include/linux/raid/raid1.h |    1 +
 2 files changed, 19 insertions(+), 19 deletions(-)

diff ./drivers/md/raid1.c~current~ ./drivers/md/raid1.c
--- ./drivers/md/raid1.c~current~	2004-02-06 16:19:15.000000000 +1100
+++ ./drivers/md/raid1.c	2004-02-06 16:19:44.000000000 +1100
@@ -77,6 +77,9 @@ static void * r1buf_pool_alloc(int gfp_f
 	if (!bio)
 		goto out_free_r1_bio;
 
+	/*
+	 * Allocate RESYNC_PAGES data pages for this iovec.
+	 */
 	for (i = 0; i < RESYNC_PAGES; i++) {
 		page = alloc_page(gfp_flags);
 		if (unlikely(!page))
@@ -87,9 +90,6 @@ static void * r1buf_pool_alloc(int gfp_f
 		bio->bi_io_vec[i].bv_offset = 0;
 	}
 
-	/*
-	 * Allocate a single data page for this iovec.
-	 */
 	bio->bi_vcnt = RESYNC_PAGES;
 	bio->bi_idx = 0;
 	bio->bi_size = RESYNC_BLOCK_SIZE;
@@ -122,8 +122,6 @@ static void r1buf_pool_free(void *__r1_b
 		__free_page(bio->bi_io_vec[i].bv_page);
 		bio->bi_io_vec[i].bv_page = NULL;
 	}
-	if (atomic_read(&bio->bi_cnt) != 1)
-		BUG();
 	bio_put(bio);
 	r1bio_pool_free(r1bio, conf->mddev);
 }
@@ -249,7 +247,7 @@ static inline void update_head_pos(int d
 	conf_t *conf = mddev_to_conf(r1_bio->mddev);
 
 	conf->mirrors[disk].head_position =
-		r1_bio->sector + (r1_bio->master_bio->bi_size >> 9);
+		r1_bio->sector + (r1_bio->sectors);
 }
 
 static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int error)
@@ -507,6 +505,7 @@ static int make_request(request_queue_t 
 	r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
 
 	r1_bio->master_bio = bio;
+	r1_bio->sectors = bio->bi_size >> 9;
 
 	r1_bio->mddev = mddev;
 	r1_bio->sector = bio->bi_sector;
@@ -799,7 +798,7 @@ static int end_sync_write(struct bio *bi
 	update_head_pos(mirror, r1_bio);
 
 	if (atomic_dec_and_test(&r1_bio->remaining)) {
-		md_done_sync(mddev, r1_bio->master_bio->bi_size >> 9, uptodate);
+		md_done_sync(mddev, r1_bio->sectors, uptodate);
 		put_buf(r1_bio);
 	}
 	atomic_dec(&conf->mirrors[mirror].rdev->nr_pending);
@@ -829,7 +828,7 @@ static void sync_request_write(mddev_t *
 			" for block %llu\n",
 			bdevname(bio->bi_bdev,b), 
 			(unsigned long long)r1_bio->sector);
-		md_done_sync(mddev, r1_bio->master_bio->bi_size >> 9, 0);
+		md_done_sync(mddev, r1_bio->sectors, 0);
 		put_buf(r1_bio);
 		return;
 	}
@@ -874,7 +873,7 @@ static void sync_request_write(mddev_t *
 	}
 
 	if (atomic_dec_and_test(&r1_bio->remaining)) {
-		md_done_sync(mddev, r1_bio->master_bio->bi_size >> 9, 1);
+		md_done_sync(mddev, r1_bio->sectors, 1);
 		put_buf(r1_bio);
 	}
 }
@@ -966,7 +965,7 @@ static int sync_request(mddev_t *mddev, 
 	conf_t *conf = mddev_to_conf(mddev);
 	mirror_info_t *mirror;
 	r1bio_t *r1_bio;
-	struct bio *read_bio, *bio;
+	struct bio *bio;
 	sector_t max_sector, nr_sectors;
 	int disk, partial;
 
@@ -1035,18 +1034,18 @@ static int sync_request(mddev_t *mddev, 
 		bio->bi_io_vec[bio->bi_vcnt-1].bv_len = partial;
 
 
-	read_bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
-
-	read_bio->bi_sector = sector_nr + mirror->rdev->data_offset;
-	read_bio->bi_bdev = mirror->rdev->bdev;
-	read_bio->bi_end_io = end_sync_read;
-	read_bio->bi_rw = READ;
-	read_bio->bi_private = r1_bio;
-	r1_bio->bios[r1_bio->read_disk] = read_bio;
+	bio->bi_sector = sector_nr + mirror->rdev->data_offset;
+	bio->bi_bdev = mirror->rdev->bdev;
+	bio->bi_end_io = end_sync_read;
+	bio->bi_rw = READ;
+	bio->bi_private = r1_bio;
+	bio_get(bio);
+	r1_bio->bios[r1_bio->read_disk] = bio;
+	r1_bio->sectors = nr_sectors;
 
 	md_sync_acct(mirror->rdev, nr_sectors);
 
-	generic_make_request(read_bio);
+	generic_make_request(bio);
 
 	return nr_sectors;
 }

diff ./include/linux/raid/raid1.h~current~ ./include/linux/raid/raid1.h
--- ./include/linux/raid/raid1.h~current~	2004-02-06 16:19:15.000000000 +1100
+++ ./include/linux/raid/raid1.h	2004-02-06 16:19:44.000000000 +1100
@@ -55,6 +55,7 @@ struct r1bio_s {
 					    * used from IRQ handlers
 					    */
 	sector_t		sector;
+	int			sectors;
 	unsigned long		state;
 	mddev_t			*mddev;
 	/*

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH] md - 6 of 7 - Dynamically limit size of bio requests used for raid1 resync
  2004-02-06  5:35 [PATCH] md - 0 of 7 - Introduction NeilBrown
                   ` (4 preceding siblings ...)
  2004-02-06  5:35 ` [PATCH] md - 4 of 7 - Remove some un-needed fields from r1bio_s NeilBrown
@ 2004-02-06  5:35 ` NeilBrown
  2004-02-06  5:35 ` [PATCH] md - 7 of 7 - Allow partitioning of MD devices NeilBrown
  6 siblings, 0 replies; 8+ messages in thread
From: NeilBrown @ 2004-02-06  5:35 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-raid


Currently raid1 uses PAGE_SIZE read/write requests for resync, as it 
doesn't know how to honour per-device restrictions.
This patch uses bio_add_page to honour those restrictions and ups the limit
on request size to 64K.
This has a measurable impact on rebuild speed (25M/s -> 60M/s)


 ----------- Diffstat output ------------
 ./drivers/md/raid1.c |  183 ++++++++++++++++++++++++++-------------------------
 1 files changed, 96 insertions(+), 87 deletions(-)

diff ./drivers/md/raid1.c~current~ ./drivers/md/raid1.c
--- ./drivers/md/raid1.c~current~	2004-02-06 16:19:44.000000000 +1100
+++ ./drivers/md/raid1.c	2004-02-06 16:20:51.000000000 +1100
@@ -56,8 +56,8 @@ static void r1bio_pool_free(void *r1_bio
 	kfree(r1_bio);
 }
 
-//#define RESYNC_BLOCK_SIZE (64*1024)
-#define RESYNC_BLOCK_SIZE PAGE_SIZE
+#define RESYNC_BLOCK_SIZE (64*1024)
+//#define RESYNC_BLOCK_SIZE PAGE_SIZE
 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
 #define RESYNC_WINDOW (2048*1024)
@@ -73,38 +73,39 @@ static void * r1buf_pool_alloc(int gfp_f
 	r1_bio = r1bio_pool_alloc(gfp_flags, conf->mddev);
 	if (!r1_bio)
 		return NULL;
-	bio = bio_alloc(gfp_flags, RESYNC_PAGES);
-	if (!bio)
-		goto out_free_r1_bio;
 
 	/*
-	 * Allocate RESYNC_PAGES data pages for this iovec.
+	 * Allocate bios : 1 for reading, n-1 for writing
 	 */
+	for (j = conf->raid_disks ; j-- ; ) {
+		bio = bio_alloc(gfp_flags, RESYNC_PAGES);
+		if (!bio)
+			goto out_free_bio;
+		r1_bio->bios[j] = bio;
+	}
+	/*
+	 * Allocate RESYNC_PAGES data pages and attach them to
+	 * the first bio;
+	 */
+	bio = r1_bio->bios[0];
 	for (i = 0; i < RESYNC_PAGES; i++) {
 		page = alloc_page(gfp_flags);
 		if (unlikely(!page))
 			goto out_free_pages;
 
 		bio->bi_io_vec[i].bv_page = page;
-		bio->bi_io_vec[i].bv_len = PAGE_SIZE;
-		bio->bi_io_vec[i].bv_offset = 0;
 	}
 
-	bio->bi_vcnt = RESYNC_PAGES;
-	bio->bi_idx = 0;
-	bio->bi_size = RESYNC_BLOCK_SIZE;
-	bio->bi_end_io = NULL;
-	atomic_set(&bio->bi_cnt, 1);
-
 	r1_bio->master_bio = bio;
 
 	return r1_bio;
 
 out_free_pages:
-	for (j = 0; j < i; j++)
-		__free_page(bio->bi_io_vec[j].bv_page);
-	bio_put(bio);
-out_free_r1_bio:
+	for ( ; i > 0 ; i--)
+		__free_page(bio->bi_io_vec[i-1].bv_page);
+out_free_bio:
+	while ( j < conf->raid_disks )
+		bio_put(r1_bio->bios[++j]);
 	r1bio_pool_free(r1_bio, conf->mddev);
 	return NULL;
 }
@@ -114,15 +115,15 @@ static void r1buf_pool_free(void *__r1_b
 	int i;
 	conf_t *conf = data;
 	r1bio_t *r1bio = __r1_bio;
-	struct bio *bio = r1bio->master_bio;
+	struct bio *bio = r1bio->bios[0];
 
-	if (atomic_read(&bio->bi_cnt) != 1)
-		BUG();
 	for (i = 0; i < RESYNC_PAGES; i++) {
 		__free_page(bio->bi_io_vec[i].bv_page);
 		bio->bi_io_vec[i].bv_page = NULL;
 	}
-	bio_put(bio);
+	for (i=0 ; i < conf->raid_disks; i++)
+		bio_put(r1bio->bios[i]);
+
 	r1bio_pool_free(r1bio, conf->mddev);
 }
 
@@ -162,15 +163,8 @@ static inline void free_r1bio(r1bio_t *r
 static inline void put_buf(r1bio_t *r1_bio)
 {
 	conf_t *conf = mddev_to_conf(r1_bio->mddev);
-	struct bio *bio = r1_bio->master_bio;
 	unsigned long flags;
 
-	/*
-	 * undo any possible partial request fixup magic:
-	 */
-	if (bio->bi_size != RESYNC_BLOCK_SIZE)
-		bio->bi_io_vec[bio->bi_vcnt-1].bv_len = PAGE_SIZE;
-	put_all_bios(conf, r1_bio);
 	mempool_free(r1_bio, conf->r1buf_pool);
 
 	spin_lock_irqsave(&conf->resync_lock, flags);
@@ -810,12 +804,11 @@ static void sync_request_write(mddev_t *
 	conf_t *conf = mddev_to_conf(mddev);
 	int i;
 	int disks = conf->raid_disks;
-	struct bio *bio, *mbio;
+	struct bio *bio, *wbio;
 
-	bio = r1_bio->master_bio;
+	bio = r1_bio->bios[r1_bio->read_disk];
 
 	/*
-	 * have to allocate lots of bio structures and
 	 * schedule writes
 	 */
 	if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
@@ -833,43 +826,16 @@ static void sync_request_write(mddev_t *
 		return;
 	}
 
-	spin_lock_irq(&conf->device_lock);
-	for (i = 0; i < disks ; i++) {
-		r1_bio->bios[i] = NULL;
-		if (!conf->mirrors[i].rdev || 
-		    conf->mirrors[i].rdev->faulty)
-			continue;
-		if (i == r1_bio->read_disk)
-			/*
-			 * we read from here, no need to write
-			 */
-			continue;
-		if (conf->mirrors[i].rdev->in_sync && 
-			r1_bio->sector + (bio->bi_size>>9) <= mddev->recovery_cp)
-			/*
-			 * don't need to write this we are just rebuilding
-			 */
-			continue;
-		atomic_inc(&conf->mirrors[i].rdev->nr_pending);
-		r1_bio->bios[i] = bio;
-	}
-	spin_unlock_irq(&conf->device_lock);
-
 	atomic_set(&r1_bio->remaining, 1);
-	for (i = disks; i-- ; ) {
-		if (!r1_bio->bios[i])
+	for (i = 0; i < disks ; i++) {
+		wbio = r1_bio->bios[i];
+		if (wbio->bi_end_io != end_sync_write)
 			continue;
-		mbio = bio_clone(bio, GFP_NOIO);
-		r1_bio->bios[i] = mbio;
-		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
-		mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
-		mbio->bi_end_io	= end_sync_write;
-		mbio->bi_rw = WRITE;
-		mbio->bi_private = r1_bio;
 
+		atomic_inc(&conf->mirrors[i].rdev->nr_pending);
 		atomic_inc(&r1_bio->remaining);
-		md_sync_acct(conf->mirrors[i].rdev, mbio->bi_size >> 9);
-		generic_make_request(mbio);
+		md_sync_acct(conf->mirrors[i].rdev, wbio->bi_size >> 9);
+		generic_make_request(wbio);
 	}
 
 	if (atomic_dec_and_test(&r1_bio->remaining)) {
@@ -967,7 +933,8 @@ static int sync_request(mddev_t *mddev, 
 	r1bio_t *r1_bio;
 	struct bio *bio;
 	sector_t max_sector, nr_sectors;
-	int disk, partial;
+	int disk;
+	int i;
 
 	if (!conf->r1buf_pool)
 		if (init_resync(conf))
@@ -1020,27 +987,69 @@ static int sync_request(mddev_t *mddev, 
 	set_bit(R1BIO_IsSync, &r1_bio->state);
 	r1_bio->read_disk = disk;
 
-	bio = r1_bio->master_bio;
-	nr_sectors = RESYNC_BLOCK_SIZE >> 9;
-	if (max_sector - sector_nr < nr_sectors)
-		nr_sectors = max_sector - sector_nr;
-	bio->bi_size = nr_sectors << 9;
-	bio->bi_vcnt = (bio->bi_size + PAGE_SIZE-1) / PAGE_SIZE;
-	/*
-	 * Is there a partial page at the end of the request?
-	 */
-	partial = bio->bi_size % PAGE_SIZE;
-	if (partial)
-		bio->bi_io_vec[bio->bi_vcnt-1].bv_len = partial;
-
-
-	bio->bi_sector = sector_nr + mirror->rdev->data_offset;
-	bio->bi_bdev = mirror->rdev->bdev;
-	bio->bi_end_io = end_sync_read;
-	bio->bi_rw = READ;
-	bio->bi_private = r1_bio;
-	bio_get(bio);
-	r1_bio->bios[r1_bio->read_disk] = bio;
+	for (i=0; i < conf->raid_disks; i++) {
+		bio = r1_bio->bios[i];
+
+		/* take from bio_init */
+		bio->bi_next = NULL;
+		bio->bi_flags |= 1 << BIO_UPTODATE;
+		bio->bi_rw = 0;
+		bio->bi_vcnt = 0;
+		bio->bi_idx = 0;
+		bio->bi_phys_segments = 0;
+		bio->bi_hw_segments = 0;
+		bio->bi_size = 0;
+		bio->bi_end_io = NULL;
+		bio->bi_private = NULL;
+
+		if (i == disk) {
+			bio->bi_rw = READ;
+			bio->bi_end_io = end_sync_read;
+		} else if (conf->mirrors[i].rdev &&
+			   !conf->mirrors[i].rdev->faulty &&
+			   (!conf->mirrors[i].rdev->in_sync ||
+			    sector_nr + RESYNC_SECTORS > mddev->recovery_cp)) {
+			bio->bi_rw = WRITE;
+			bio->bi_end_io = end_sync_write;
+		} else
+			continue;
+		bio->bi_sector = sector_nr + conf->mirrors[i].rdev->data_offset;
+		bio->bi_bdev = conf->mirrors[i].rdev->bdev;
+		bio->bi_private = r1_bio;
+	}
+	nr_sectors = 0;
+	do {
+		struct page *page;
+		int len = PAGE_SIZE;
+		if (sector_nr + (len>>9) > max_sector)
+			len = (max_sector - sector_nr) << 9;
+		if (len == 0)	
+			break;
+		for (i=0 ; i < conf->raid_disks; i++) {
+			bio = r1_bio->bios[i];
+			if (bio->bi_end_io) {
+				page = r1_bio->bios[0]->bi_io_vec[bio->bi_vcnt].bv_page;
+				if (bio_add_page(bio, page, len, 0) == 0) {
+					/* stop here */
+					r1_bio->bios[0]->bi_io_vec[bio->bi_vcnt].bv_page = page;
+					while (i > 0) {
+						i--;
+						bio = r1_bio->bios[i];
+						if (bio->bi_end_io==NULL) continue;
+						/* remove last page from this bio */
+						bio->bi_vcnt--;
+						bio->bi_size -= len;
+						bio->bi_flags &= ~(1<< BIO_SEG_VALID);
+					}
+					goto bio_full;
+				}
+			}
+		}
+		nr_sectors += len>>9;
+		sector_nr += len>>9;
+	} while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
+ bio_full:
+	bio = r1_bio->bios[disk];
 	r1_bio->sectors = nr_sectors;
 
 	md_sync_acct(mirror->rdev, nr_sectors);

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH] md - 7 of 7 - Allow partitioning of MD devices.
  2004-02-06  5:35 [PATCH] md - 0 of 7 - Introduction NeilBrown
                   ` (5 preceding siblings ...)
  2004-02-06  5:35 ` [PATCH] md - 6 of 7 - Dynamically limit size of bio requests used for raid1 resync NeilBrown
@ 2004-02-06  5:35 ` NeilBrown
  6 siblings, 0 replies; 8+ messages in thread
From: NeilBrown @ 2004-02-06  5:35 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-raid, Christoph Hellwig, Al Viro


With this patch, md uses two major numbers for arrays.

One major is number 9, with name 'md', and has unpartitioned md arrays,
one per minor number.

The other major is allocated dynamically, with name 'mdp', and has
one array for every 64 minors, allowing for up to 63 partitions.

The arrays under one major are completely separate from the arrays
under the other.

The preferred names for devices with the new major are of the form:

  /dev/md/d1p3  # partition 3 of device 1 - minor 67

When a partitioned md device is assembled, the partitions are not recognised
until after the whole-array device is opened again.  A future version of
mdadm will perform this open so that the need will be transparent.

 ----------- Diffstat output ------------
 ./drivers/md/md.c           |  105 +++++++++++++++++++++++++++++++-------------
 ./drivers/md/raid5.c        |    2 
 ./drivers/md/raid6main.c    |    2 
 ./include/linux/raid/md_k.h |   13 +----
 4 files changed, 80 insertions(+), 42 deletions(-)

diff ./drivers/md/md.c~current~ ./drivers/md/md.c
--- ./drivers/md/md.c~current~	2004-02-06 16:17:55.000000000 +1100
+++ ./drivers/md/md.c	2004-02-06 16:22:56.000000000 +1100
@@ -52,6 +52,9 @@
 #define MAJOR_NR MD_MAJOR
 #define MD_DRIVER
 
+/* 63 partitions with the alternate major number (mdp) */
+#define MdpMinorShift 6
+
 #define DEBUG 0
 #define dprintk(x...) ((void)(DEBUG && printk(x)))
 
@@ -178,14 +181,14 @@ static void mddev_put(mddev_t *mddev)
 	spin_unlock(&all_mddevs_lock);
 }
 
-static mddev_t * mddev_find(int unit)
+static mddev_t * mddev_find(dev_t unit)
 {
 	mddev_t *mddev, *new = NULL;
 
  retry:
 	spin_lock(&all_mddevs_lock);
 	list_for_each_entry(mddev, &all_mddevs, all_mddevs)
-		if (mdidx(mddev) == unit) {
+		if (mddev->unit == unit) {
 			mddev_get(mddev);
 			spin_unlock(&all_mddevs_lock);
 			if (new)
@@ -206,7 +209,12 @@ static mddev_t * mddev_find(int unit)
 
 	memset(new, 0, sizeof(*new));
 
-	new->__minor = unit;
+	new->unit = unit;
+	if (MAJOR(unit) == MD_MAJOR)
+		new->md_minor = MINOR(unit);
+	else
+		new->md_minor = MINOR(unit) >> MdpMinorShift;
+
 	init_MUTEX(&new->reconfig_sem);
 	INIT_LIST_HEAD(&new->disks);
 	INIT_LIST_HEAD(&new->all_mddevs);
@@ -660,7 +668,7 @@ static void super_90_sync(mddev_t *mddev
 	sb->level = mddev->level;
 	sb->size  = mddev->size;
 	sb->raid_disks = mddev->raid_disks;
-	sb->md_minor = mddev->__minor;
+	sb->md_minor = mddev->md_minor;
 	sb->not_persistent = !mddev->persistent;
 	sb->utime = mddev->utime;
 	sb->state = 0;
@@ -1442,13 +1450,16 @@ abort:
 	return 1;
 }
 
+static int mdp_major = 0;
 
 static struct kobject *md_probe(dev_t dev, int *part, void *data)
 {
 	static DECLARE_MUTEX(disks_sem);
-	int unit = *part;
-	mddev_t *mddev = mddev_find(unit);
+	mddev_t *mddev = mddev_find(dev);
 	struct gendisk *disk;
+	int partitioned = (MAJOR(dev) != MD_MAJOR);
+	int shift = partitioned ? MdpMinorShift : 0;
+	int unit = MINOR(dev) >> shift;
 
 	if (!mddev)
 		return NULL;
@@ -1459,15 +1470,18 @@ static struct kobject *md_probe(dev_t de
 		mddev_put(mddev);
 		return NULL;
 	}
-	disk = alloc_disk(1);
+	disk = alloc_disk(1 << shift);
 	if (!disk) {
 		up(&disks_sem);
 		mddev_put(mddev);
 		return NULL;
 	}
-	disk->major = MD_MAJOR;
-	disk->first_minor = mdidx(mddev);
-	sprintf(disk->disk_name, "md%d", mdidx(mddev));
+	disk->major = MAJOR(dev);
+	disk->first_minor = unit << shift;
+	if (partitioned)
+		sprintf(disk->disk_name, "md_d%d", unit);
+	else
+		sprintf(disk->disk_name, "md%d", unit);
 	disk->fops = &md_fops;
 	disk->private_data = mddev;
 	disk->queue = mddev->queue;
@@ -1496,7 +1510,6 @@ static int do_md_run(mddev_t * mddev)
 	mdk_rdev_t *rdev;
 	struct gendisk *disk;
 	char b[BDEVNAME_SIZE];
-	int unit;
 
 	if (list_empty(&mddev->disks)) {
 		MD_BUG();
@@ -1588,8 +1601,7 @@ static int do_md_run(mddev_t * mddev)
 		invalidate_bdev(rdev->bdev, 0);
 	}
 
-	unit = mdidx(mddev);
-	md_probe(0, &unit, NULL);
+	md_probe(mddev->unit, NULL, NULL);
 	disk = mddev->gendisk;
 	if (!disk)
 		return -ENOMEM;
@@ -1636,6 +1648,7 @@ static int do_md_run(mddev_t * mddev)
 	mddev->queue->queuedata = mddev;
 	mddev->queue->make_request_fn = mddev->pers->make_request;
 
+	mddev->changed = 1;
 	return 0;
 }
 
@@ -1735,6 +1748,7 @@ static int do_md_stop(mddev_t * mddev, i
 		disk = mddev->gendisk;
 		if (disk)
 			set_capacity(disk, 0);
+		mddev->changed = 1;
 	} else
 		printk(KERN_INFO "md: %s switched to read-only mode.\n",
 			mdname(mddev));
@@ -1791,6 +1805,7 @@ static void autorun_devices(void)
 
 	printk(KERN_INFO "md: autorun ...\n");
 	while (!list_empty(&pending_raid_disks)) {
+		dev_t dev;
 		rdev0 = list_entry(pending_raid_disks.next,
 					 mdk_rdev_t, same_set);
 
@@ -1808,8 +1823,14 @@ static void autorun_devices(void)
 		 * mostly sane superblocks. It's time to allocate the
 		 * mddev.
 		 */
-
-		mddev = mddev_find(rdev0->preferred_minor);
+		if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) {
+			printk(KERN_INFO "md: unit number in %s is bad: %d\n", 
+			       bdevname(rdev0->bdev, b), rdev0->preferred_minor);
+			break;
+		}
+		dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
+		md_probe(dev, NULL, NULL);
+		mddev = mddev_find(dev);
 		if (!mddev) {
 			printk(KERN_ERR 
 				"md: cannot allocate memory for md drive.\n");
@@ -1824,7 +1845,7 @@ static void autorun_devices(void)
 				"md: %s already running, cannot run %s\n",
 				mdname(mddev), bdevname(rdev0->bdev,b));
 			mddev_unlock(mddev);
-		} else if (rdev0->preferred_minor >= 0 && rdev0->preferred_minor < MAX_MD_DEVS) {
+		} else {
 			printk(KERN_INFO "md: created %s\n", mdname(mddev));
 			ITERATE_RDEV_GENERIC(candidates,rdev,tmp) {
 				list_del_init(&rdev->same_set);
@@ -1833,9 +1854,7 @@ static void autorun_devices(void)
 			}
 			autorun_array(mddev);
 			mddev_unlock(mddev);
-		} else
-			printk(KERN_WARNING "md: %s had invalid preferred minor %d\n",
-			       bdevname(rdev->bdev, b), rdev0->preferred_minor);
+		}
 		/* on success, candidates will be empty, on error
 		 * it won't...
 		 */
@@ -1955,7 +1974,7 @@ static int get_array_info(mddev_t * mdde
 	info.size          = mddev->size;
 	info.nr_disks      = nr;
 	info.raid_disks    = mddev->raid_disks;
-	info.md_minor      = mddev->__minor;
+	info.md_minor      = mddev->md_minor;
 	info.not_persistent= !mddev->persistent;
 
 	info.utime         = mddev->utime;
@@ -2326,7 +2345,7 @@ static int set_array_info(mddev_t * mdde
 	mddev->level         = info->level;
 	mddev->size          = info->size;
 	mddev->raid_disks    = info->raid_disks;
-	/* don't set __minor, it is determined by which /dev/md* was
+	/* don't set md_minor, it is determined by which /dev/md* was
 	 * openned
 	 */
 	if (info->state & (1<<MD_SB_CLEAN))
@@ -2366,7 +2385,6 @@ static int md_ioctl(struct inode *inode,
 			unsigned int cmd, unsigned long arg)
 {
 	char b[BDEVNAME_SIZE];
-	unsigned int minor = iminor(inode);
 	int err = 0;
 	struct hd_geometry *loc = (struct hd_geometry *) arg;
 	mddev_t *mddev = NULL;
@@ -2374,11 +2392,6 @@ static int md_ioctl(struct inode *inode,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
 
-	if (minor >= MAX_MD_DEVS) {
-		MD_BUG();
-		return -EINVAL;
-	}
-
 	/*
 	 * Commands dealing with the RAID driver but not any
 	 * particular array:
@@ -2620,6 +2633,7 @@ static int md_open(struct inode *inode, 
 	mddev_get(mddev);
 	mddev_unlock(mddev);
 
+	check_disk_change(inode->i_bdev);
  out:
 	return err;
 }
@@ -2635,12 +2649,28 @@ static int md_release(struct inode *inod
 	return 0;
 }
 
+static int md_media_changed(struct gendisk *disk)
+{
+	mddev_t *mddev = disk->private_data;
+
+	return mddev->changed;
+}
+
+static int md_revalidate(struct gendisk *disk)
+{
+	mddev_t *mddev = disk->private_data;
+
+	mddev->changed = 0;
+	return 0;
+}
 static struct block_device_operations md_fops =
 {
 	.owner		= THIS_MODULE,
 	.open		= md_open,
 	.release	= md_release,
 	.ioctl		= md_ioctl,
+	.media_changed	= md_media_changed,
+	.revalidate_disk= md_revalidate,
 };
 
 int md_thread(void * arg)
@@ -3505,16 +3535,26 @@ int __init md_init(void)
 
 	if (register_blkdev(MAJOR_NR, "md"))
 		return -1;
-
+	if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
+		unregister_blkdev(MAJOR_NR, "md");
+		return -1;
+	}
 	devfs_mk_dir("md");
 	blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE,
 				md_probe, NULL, NULL);
+	blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE,
+			    md_probe, NULL, NULL);
 
-	for (minor=0; minor < MAX_MD_DEVS; ++minor) {
+	for (minor=0; minor < MAX_MD_DEVS; ++minor)
 		devfs_mk_bdev(MKDEV(MAJOR_NR, minor),
 				S_IFBLK|S_IRUSR|S_IWUSR,
 				"md/%d", minor);
-	}
+
+	for (minor=0; minor < MAX_MD_DEVS; ++minor)
+		devfs_mk_bdev(MKDEV(mdp_major, minor<<MdpMinorShift),
+			      S_IFBLK|S_IRUSR|S_IWUSR,
+			      "md/d%d", minor);
+	
 
 	register_reboot_notifier(&md_notifier);
 	raid_table_header = register_sysctl_table(raid_root_table, 1);
@@ -3576,11 +3616,16 @@ static __exit void md_exit(void)
 	struct list_head *tmp;
 	int i;
 	blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS);
+	blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift);
 	for (i=0; i < MAX_MD_DEVS; i++)
 		devfs_remove("md/%d", i);
+	for (i=0; i < MAX_MD_DEVS; i++)
+		devfs_remove("md/d%d", i);
+
 	devfs_remove("md");
 
 	unregister_blkdev(MAJOR_NR,"md");
+	unregister_blkdev(mdp_major, "mdp");
 	unregister_reboot_notifier(&md_notifier);
 	unregister_sysctl_table(raid_table_header);
 	remove_proc_entry("mdstat", NULL);

diff ./drivers/md/raid5.c~current~ ./drivers/md/raid5.c
--- ./drivers/md/raid5.c~current~	2004-02-06 16:22:55.000000000 +1100
+++ ./drivers/md/raid5.c	2004-02-06 16:22:56.000000000 +1100
@@ -284,7 +284,7 @@ static int grow_stripes(raid5_conf_t *co
 	kmem_cache_t *sc;
 	int devs = conf->raid_disks;
 
-	sprintf(conf->cache_name, "md/raid5-%d", conf->mddev->__minor);
+	sprintf(conf->cache_name, "raid5/%s", mdname(conf->mddev));
 
 	sc = kmem_cache_create(conf->cache_name, 
 			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),

diff ./drivers/md/raid6main.c~current~ ./drivers/md/raid6main.c
--- ./drivers/md/raid6main.c~current~	2004-02-06 16:22:55.000000000 +1100
+++ ./drivers/md/raid6main.c	2004-02-06 16:22:56.000000000 +1100
@@ -303,7 +303,7 @@ static int grow_stripes(raid6_conf_t *co
 	kmem_cache_t *sc;
 	int devs = conf->raid_disks;
 
-	sprintf(conf->cache_name, "md/raid6-%d", conf->mddev->__minor);
+	sprintf(conf->cache_name, "raid6/%s", mdname(conf->mddev));
 
 	sc = kmem_cache_create(conf->cache_name,
 			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),

diff ./include/linux/raid/md_k.h~current~ ./include/linux/raid/md_k.h
--- ./include/linux/raid/md_k.h~current~	2004-02-06 16:22:55.000000000 +1100
+++ ./include/linux/raid/md_k.h	2004-02-06 16:22:56.000000000 +1100
@@ -186,7 +186,8 @@ struct mddev_s
 {
 	void				*private;
 	mdk_personality_t		*pers;
-	int				__minor;
+	dev_t				unit;
+	int				md_minor;
 	struct list_head 		disks;
 	int				sb_dirty;
 	int				ro;
@@ -235,6 +236,7 @@ struct mddev_s
 	struct semaphore		reconfig_sem;
 	atomic_t			active;
 
+	int				changed;	/* true if we might need to reread partition info */
 	int				degraded;	/* whether md should consider
 							 * adding a spare
 							 */
@@ -272,15 +274,6 @@ struct mdk_personality_s
 };
 
 
-/*
- * Currently we index md_array directly, based on the minor
- * number. This will have to change to dynamic allocation
- * once we start supporting partitioning of md devices.
- */
-static inline int mdidx (mddev_t * mddev)
-{
-	return mddev->__minor;
-}
 static inline char * mdname (mddev_t * mddev)
 {
 	return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2004-02-06  5:35 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-02-06  5:35 [PATCH] md - 0 of 7 - Introduction NeilBrown
2004-02-06  5:35 ` [PATCH] md - 1 of 7 - Print "deprecated" warning when START_ARRAY is used NeilBrown
2004-02-06  5:35 ` [PATCH] md - 3 of 7 - Discard the cmd field from r1_bio structure NeilBrown
2004-02-06  5:35 ` [PATCH] md - 2 of 7 - Split read and write end_request handlers NeilBrown
2004-02-06  5:35 ` [PATCH] md - 5 of 7 - Avoid unnecessary bio allocation during raid1 resync NeilBrown
2004-02-06  5:35 ` [PATCH] md - 4 of 7 - Remove some un-needed fields from r1bio_s NeilBrown
2004-02-06  5:35 ` [PATCH] md - 6 of 7 - Dynamically limit size of bio requests used for raid1 resync NeilBrown
2004-02-06  5:35 ` [PATCH] md - 7 of 7 - Allow partitioning of MD devices NeilBrown

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).