* [PATCH] md: Use new topology calls to indicate alignment and I/O sizes @ 2009-06-23 4:54 Martin K. Petersen 2009-06-23 21:44 ` Mike Snitzer 0 siblings, 1 reply; 16+ messages in thread From: Martin K. Petersen @ 2009-06-23 4:54 UTC (permalink / raw) To: NeilBrown; +Cc: linux-raid Neil, Looks like this one missed the boat on the first MD pull request. Here's an updated patch that applies to Linus' current tree in case you are planning a second round... Switch MD over to the new disk_stack_limits() function which checks for aligment and adjusts preferred I/O sizes when stacking. Also indicate preferred I/O sizes where applicable. Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com> --- diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 15c8b7b..7df78b9 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -166,8 +166,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) rdev->sectors = sectors * mddev->chunk_sectors; } - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset); /* as we don't honour merge_bvec_fn, we must never risk * violating it, so limit ->max_sector to one PAGE, as * a one page request is never in violation. diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index cbe368f..d48f72a 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -294,7 +294,8 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) for (path = first; path <= last; path++) if ((p=conf->multipaths+path)->rdev == NULL) { q = rdev->bdev->bd_disk->queue; - blk_queue_stack_limits(mddev->queue, q); + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset); /* as we don't honour merge_bvec_fn, we must never risk * violating it, so limit ->max_sector to one PAGE, as @@ -463,9 +464,9 @@ static int multipath_run (mddev_t *mddev) disk = conf->multipaths + disk_idx; disk->rdev = rdev; + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset); - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); /* as we don't honour merge_bvec_fn, we must never risk * violating it, not that we ever expect a device with * a merge_bvec_fn to be involved in multipath */ diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index ab4a489..12cd1eb 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -170,8 +170,8 @@ static int create_strip_zones(mddev_t *mddev) } dev[j] = rdev1; - blk_queue_stack_limits(mddev->queue, - rdev1->bdev->bd_disk->queue); + disk_stack_limits(mddev->gendisk, rdev1->bdev, + rdev1->data_offset); /* as we don't honour merge_bvec_fn, we must never risk * violating it, so limit ->max_sector to one PAGE, as * a one page request is never in violation. 
@@ -250,6 +250,11 @@ static int create_strip_zones(mddev_t *mddev) mddev->chunk_sectors << 9); goto abort; } + + blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); + blk_queue_io_opt(mddev->queue, + (mddev->chunk_sectors << 9) * mddev->raid_disks); + printk(KERN_INFO "raid0: done.\n"); mddev->private = conf; return 0; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 89939a7..174c4dd 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1123,8 +1123,8 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) for (mirror = first; mirror <= last; mirror++) if ( !(p=conf->mirrors+mirror)->rdev) { - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset); /* as we don't honour merge_bvec_fn, we must never risk * violating it, so limit ->max_sector to one PAGE, as * a one page request is never in violation. @@ -1988,9 +1988,8 @@ static int run(mddev_t *mddev) disk = conf->mirrors + disk_idx; disk->rdev = rdev; - - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset); /* as we don't honour merge_bvec_fn, we must never risk * violating it, so limit ->max_sector to one PAGE, as * a one page request is never in violation. diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ae12cea..032e791 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1151,8 +1151,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) for ( ; mirror <= last ; mirror++) if ( !(p=conf->mirrors+mirror)->rdev) { - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset); /* as we don't honour merge_bvec_fn, we must never risk * violating it, so limit ->max_sector to one PAGE, as * a one page request is never in violation. @@ -2044,7 +2044,7 @@ raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks) static int run(mddev_t *mddev) { conf_t *conf; - int i, disk_idx; + int i, disk_idx, chunk_size; mirror_info_t *disk; mdk_rdev_t *rdev; int nc, fc, fo; @@ -2130,6 +2130,14 @@ static int run(mddev_t *mddev) spin_lock_init(&conf->device_lock); mddev->queue->queue_lock = &conf->device_lock; + chunk_size = mddev->chunk_sectors << 9; + blk_queue_io_min(mddev->queue, chunk_size); + if (conf->raid_disks % conf->near_copies) + blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks); + else + blk_queue_io_opt(mddev->queue, chunk_size * + (conf->raid_disks / conf->near_copies)); + list_for_each_entry(rdev, &mddev->disks, same_set) { disk_idx = rdev->raid_disk; if (disk_idx >= mddev->raid_disks @@ -2138,9 +2146,8 @@ static int run(mddev_t *mddev) disk = conf->mirrors + disk_idx; disk->rdev = rdev; - - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset); /* as we don't honour merge_bvec_fn, we must never risk * violating it, so limit ->max_sector to one PAGE, as * a one page request is never in violation. 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f9f991e..d363489 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4452,7 +4452,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) static int run(mddev_t *mddev) { raid5_conf_t *conf; - int working_disks = 0; + int working_disks = 0, chunk_size; mdk_rdev_t *rdev; if (mddev->recovery_cp != MaxSector) @@ -4607,6 +4607,14 @@ static int run(mddev_t *mddev) md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); + chunk_size = mddev->chunk_sectors << 9; + blk_queue_io_min(mddev->queue, chunk_size); + blk_queue_io_opt(mddev->queue, chunk_size * + (conf->raid_disks - conf->max_degraded)); + + list_for_each_entry(rdev, &mddev->disks, same_set) + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset); return 0; abort: ^ permalink raw reply related [flat|nested] 16+ messages in thread
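The raid5.c hunk is representative of the pattern the patch applies to each personality; condensed into one place it amounts to the sketch below (the helper name is illustrative, the calls and field names are the ones used in the hunks above):

static void example_setup_topology(mddev_t *mddev, raid5_conf_t *conf)
{
        mdk_rdev_t *rdev;
        int chunk_size = mddev->chunk_sectors << 9;     /* sectors -> bytes */

        /* smallest write that avoids a read-modify-write cycle: one chunk */
        blk_queue_io_min(mddev->queue, chunk_size);

        /* preferred large write: one full stripe worth of data chunks */
        blk_queue_io_opt(mddev->queue,
                         chunk_size * (conf->raid_disks - conf->max_degraded));

        /* fold each member's limits and data offset into the array's,
         * checking alignment along the way */
        list_for_each_entry(rdev, &mddev->disks, same_set)
                disk_stack_limits(mddev->gendisk, rdev->bdev,
                                  rdev->data_offset);
}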
* Re: [PATCH] md: Use new topology calls to indicate alignment and I/O sizes 2009-06-23 4:54 [PATCH] md: Use new topology calls to indicate alignment and I/O sizes Martin K. Petersen @ 2009-06-23 21:44 ` Mike Snitzer 2009-06-24 4:05 ` Neil Brown 0 siblings, 1 reply; 16+ messages in thread From: Mike Snitzer @ 2009-06-23 21:44 UTC (permalink / raw) To: Martin K. Petersen; +Cc: NeilBrown, linux-raid, snitzer, Linus Torvalds On Tue, Jun 23, 2009 at 12:54 AM, Martin K. Petersen<martin.petersen@oracle.com> wrote: > > Neil, > > Looks like this one missed the boat on the first MD pull request. > Here's an updated patch that applies to Linus' current tree in case you > are planning a second round... > > > Switch MD over to the new disk_stack_limits() function which checks for > aligment and adjusts preferred I/O sizes when stacking. > > Also indicate preferred I/O sizes where applicable. Given that Linus has not yet pulled in the DM changes for 2.6.31 (which includes DM's topology support) I'd imagine it shouldn't be a problem to get this patch in. In fact it would be a real shame if this somehow missed 2.6.31 seeing as MD is the only consumer of the topology infrastructure's disk_stack_limits(). Neil, please push to Linus for 2.6.31, thanks! Mike ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] md: Use new topology calls to indicate alignment and I/O sizes 2009-06-23 21:44 ` Mike Snitzer @ 2009-06-24 4:05 ` Neil Brown 2009-06-24 5:03 ` Martin K. Petersen 2009-06-30 20:24 ` Mike Snitzer 0 siblings, 2 replies; 16+ messages in thread From: Neil Brown @ 2009-06-24 4:05 UTC (permalink / raw) To: Mike Snitzer; +Cc: Martin K. Petersen, linux-raid, snitzer, Linus Torvalds On Tuesday June 23, snitzer@gmail.com wrote: > On Tue, Jun 23, 2009 at 12:54 AM, Martin K. > Petersen<martin.petersen@oracle.com> wrote: > > > > Neil, > > > > Looks like this one missed the boat on the first MD pull request. > > Here's an updated patch that applies to Linus' current tree in case you > > are planning a second round... > > > > > > Switch MD over to the new disk_stack_limits() function which checks for > > aligment and adjusts preferred I/O sizes when stacking. > > > > Also indicate preferred I/O sizes where applicable. > > Given that Linus has not yet pulled in the DM changes for 2.6.31 > (which includes DM's topology support) I'd imagine it shouldn't be a > problem to get this patch in. In fact it would be a real shame if > this somehow missed 2.6.31 seeing as MD is the only consumer of the > topology infrastructure's disk_stack_limits(). > > Neil, please push to Linus for 2.6.31, thanks! > It looks mostly OK and can now be pulled from git://neil.brown.name/md/ for-linus though I'll send a more formal pull request later (if necessary) But while I have your collective attention: 1/ Is there any documentation on exactly what "io_min" etc mean? I can guess that "io_opt" means "try to use this size, or a multiple of this size, whenever possible" and does not differentiate between read and write (despite the fact that I probably care a lot more about write than read). But when io_min is larger than physical_block_size, what does it mean? Maybe I just didn't look hard enough for the documentation?? 2/ Is it too late to discuss moving the sysfs files out of the 'queue' subdirectory? 'queue' has a lot of values that are purely related to the request queue used in the elevator algorithm, and are completely irrelevant to md and other virtual devices (I look forward to the day when md devices don't have a 'queue' at all). However these new fields are part of the interface between the block device and the filesystem (or other client) and so are of a very different character and should - I think - be in a very different place. I'm not sure where that place should be yet. One idea I had was the bdi, but that isn't completely convincing. However a separate directory like bdi but called e.g. topology might make a lot of sense. I would probably like to move the fields out of struct request_queue too, but as that is purely of internal interest, it doesn't matter if that is delayed a release or two. The interface through sysfs is more important to get 'right'. So: can this be open for discussion? Thanks, NeilBrown ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] md: Use new topology calls to indicate alignment and I/O sizes 2009-06-24 4:05 ` Neil Brown @ 2009-06-24 5:03 ` Martin K. Petersen 2009-06-24 6:22 ` Neil Brown 2009-06-30 20:24 ` Mike Snitzer 1 sibling, 1 reply; 16+ messages in thread From: Martin K. Petersen @ 2009-06-24 5:03 UTC (permalink / raw) To: Neil Brown Cc: Mike Snitzer, Martin K. Petersen, linux-raid, snitzer, Linus Torvalds >>>>> "Neil" == Neil Brown <neilb@suse.de> writes: Neil, Neil> 1/ Is there any documentation on exactly what "io_min" etc mean? Neil> I can guess that "io_opt" means "try to use this size, or a Neil> multiple of this size, whenever possible" and does not Neil> differentiate between read and write (despite the fact that I Neil> probably care a lot more about write than read). This was discussed a while back. My first take did provide min + opt parameters for both I/O directions. However, the values provided by the hardware that we feed off are somewhat write biased. And obviously there's no read-modify-write penalty for reads. So there was not much point in providing the knobs for the latter. Neil> But when io_min is larger than physical_block_size, what does it Neil> mean? Maybe I just didn't look hard enough for the Neil> documentation?? Documentation/ABI/testing/sysfs-block.txt The difference is that the io_min parameter can be scaled up by stacking drivers. For RAID5 you may sit on top of disks with 512 byte physical blocks but I/Os that small will cause MD to perform read-modify-write. So you scale io_min up to whatever makes sense given the chunk size. Think of physical_block_size as an indicator of physical atomicity for correctness reasons and io_min as the smallest I/O you'd want to issue for performance reasons. Neil> 2/ Is it too late to discuss moving the sysfs files out of the Neil> 'queue' subdirectory? 'queue' has a lot of values the are purely Neil> related to the request queue used in the elevator algorithm, and Neil> are completely irrelevant to md and other virtual devices (I look Neil> forward to the day when md devices don't have a 'queue' at all). These sat under /sys/block/<dev>/topology for a while but there was overlap with the existing queue params and several apps expected to find the values in queue. Also, at the storage summit several people advocated having the limits in queue instead of introducing a new directory. If you look at the patches that went in through block you'll see that MD devices now have the queue directory exposed in sysfs despite not really having a queue (nor an associated elevator). To me, it's more a matter of the term "queue" being a misnomer rather than the actual values/functions that are contained in struct request_queue. I always implicitly read request_queue as request_handling_goo. That being said I don't have a problem moving the limits somewhere else if that's what people want to do. I agree that the current sysfs location for the device limits is mostly a function of implementation and backwards compatibility. -- Martin K. Petersen Oracle Linux Engineering ^ permalink raw reply [flat|nested] 16+ messages in thread
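To put numbers on that distinction, take a 5-drive RAID5 with a 64KB chunk built on disks with 512-byte physical blocks (the same example that comes up later in the thread):

        member disks:  logical_block_size = 512, physical_block_size = 512
        array io_min:  65536  (one chunk; anything smaller forces MD into read-modify-write)
        array io_opt:  262144 (4 data chunks, i.e. one full stripe)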
* Re: [PATCH] md: Use new topology calls to indicate alignment and I/O sizes 2009-06-24 5:03 ` Martin K. Petersen @ 2009-06-24 6:22 ` Neil Brown 2009-06-24 15:27 ` Mike Snitzer 2009-06-24 17:07 ` [PATCH] " Martin K. Petersen 0 siblings, 2 replies; 16+ messages in thread From: Neil Brown @ 2009-06-24 6:22 UTC (permalink / raw) To: Martin K. Petersen; +Cc: Mike Snitzer, linux-raid, snitzer, Linus Torvalds On Wednesday June 24, martin.petersen@oracle.com wrote: > >>>>> "Neil" == Neil Brown <neilb@suse.de> writes: > Neil> But when io_min is larger than physical_block_size, what does it > Neil> mean? Maybe I just didn't look hard enough for the > Neil> documentation?? > > Documentation/ABI/testing/sysfs-block.txt Ahh, thanks. I searched for "io_min" not "minimum_io" :-) > > The difference is that the io_min parameter can be scaled up by stacking > drivers. For RAID5 you may sit on top of disks with 512 byte physical > blocks but I/Os that small will cause MD to perform read-modify-write. > So you scale io_min up to whatever makes sense given the chunk size. > > Think of physical_block_size as an indicator of physical atomicity for > correctness reasons and io_min as the smallest I/O you'd want to issue > for performance reasons. That correctness/performance distinction is a good one, but is not at all clear from the documentation. Are you saying that if you tried to write a 512byte sector to a SATA drive with 4KB sectors it would corrupt the data? Or it would fail? In either case, the reference to "read-modify-write" in the documentation seems misplaced. So a write MUST be physical_block_size and SHOULD be minimum_io_size Now I don't get the difference between "preferred" and "optimal". Surely we would always prefer everything to be optimal. The definition of "optimal_io_size" from the doco says it is the "preferred unit of receiving I/O". Very confusing. What I can see at present is 5 values: logical_block_size physical_block_size minimum_io_size optimal_io_size read_ahead_kb and only one distinction: "correctness" vs "performance" aka "MUST" vs "SHOULD". Maybe there is another distinction: "SHOULD" for read and "SHOULD" for write. Though reading further about the alignment, it seems that the physical_block_size isn't really a 'MUST', as having a partition that was not properly aligned to a MUST size would be totally broken. Is it possible to get more precise definitions of these? I would like definitions that make strong statements so I can compare these to the actual implementation to see if the implementation is correct or not. My current thought for raid0 for example is that the only way it differs from the max of the underlying devices is that the read-ahead size should be N times the max for N drives. A read_ahead related to optimal_io_size ?? > > > Neil> 2/ Is it too late to discuss moving the sysfs files out of the > Neil> 'queue' subdirectory? 'queue' has a lot of values the are purely > Neil> related to the request queue used in the elevator algorithm, and > Neil> are completely irrelevant to md and other virtual devices (I look > Neil> forward to the day when md devices don't have a 'queue' at all). > > These sat under /sys/block/<dev>/topology for a while but there was > overlap with the existing queue params and several apps expected to find > the values in queue. Also, at the storage summit several people > advocated having the limits in queue instead of introducing a new > directory. 
(not impressed with having summits like that in meat-space - they exclude people who are not in a position to travel.. Maybe we should try on-line summits). I think /sys/block/<dev>/topology is an excellent idea (except that the word always makes me think of rubber sheets with cups and handles - from third year mathematics). I'd go for "metrics" myself. Or bdi. Yes, some values would be duplicated from 'queue', but we already have read_ahead_kb duplicated in queue and bdi. Having a few more duplicated, and then trying to phase out the legacy usage would not be a bad idea. Actually, the more I think about it the more I like the idea that this is all information about the backing device for a filesystem, information that is used by the VM and the filesystem to choose suitable IO sizes (just like read_ahead_kb). So I *really* think it belongs in bdi. > > If you look at the patches that went in through block you'll see that MD > devices now have the queue directory exposed in sysfs despite not really > having a queue (nor an associated elevator). To me, it's more a matter > of the term "queue" being a misnomer rather than the actual > values/functions that are contained in struct request_queue. I always > implicitly read request_queue as request_handling_goo. Agreed, the name 'queue' is part of the problem, and 'goo' might work better. But there is more to it than that. Some fields are of interest only to code that has special knowledge about the particular implementation. These fields will be different for different types of devices. nr_request for the elevator, chunk_size for a raid array. This is 'goo'. Other fields are truly generic. 'size' 'read_ahead_kb' 'hw_sector_size' are relevant to all devices and needed by some filesystems. This is metrics, or bdi. I think the 'particular' and the 'generic' should be in different places. > > That being said I don't have a problem moving the limits somewhere else > if that's what people want to do. I agree that the current sysfs > location for the device limits is mostly a function of implementation > and backwards compatibility. Who do I have to get on side for you to be comfortable moving the various metrics to 'bdi' (leaving legacy duplicates in 'queue' where that is necessary) ?? i.e. which people need to want it? NeilBrown ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: md: Use new topology calls to indicate alignment and I/O sizes 2009-06-24 6:22 ` Neil Brown @ 2009-06-24 15:27 ` Mike Snitzer 2009-06-24 16:27 ` Mike Snitzer 2009-06-24 23:28 ` Neil Brown 2009-06-24 17:07 ` [PATCH] " Martin K. Petersen 1 sibling, 2 replies; 16+ messages in thread From: Mike Snitzer @ 2009-06-24 15:27 UTC (permalink / raw) To: Neil Brown Cc: linux-raid, dm-devel, Linus Torvalds, Alasdair G Kergon, Martin K. Petersen On Wed, Jun 24 2009 at 2:22am -0400, Neil Brown <neilb@suse.de> wrote: > > On Wednesday June 24, martin.petersen@oracle.com wrote: > > >>>>> "Neil" == Neil Brown <neilb@suse.de> writes: > > Neil> But when io_min is larger than physical_block_size, what does it > > Neil> mean? Maybe I just didn't look hard enough for the > > Neil> documentation?? > > > > Documentation/ABI/testing/sysfs-block.txt > > Ahh, thanks. I searched for "io_min" not "minimum_io" :-) > > > > > The difference is that the io_min parameter can be scaled up by stacking > > drivers. For RAID5 you may sit on top of disks with 512 byte physical > > blocks but I/Os that small will cause MD to perform read-modify-write. > > So you scale io_min up to whatever makes sense given the chunk size. > > > > Think of physical_block_size as an indicator of physical atomicity for > > correctness reasons and io_min as the smallest I/O you'd want to issue > > for performance reasons. > > That correctness/performance distinction is a good one, but is not at > all clear from the documentation. > > Are you saying that if you tried to write a 512byte sector to a SATA > drive with 4KB sectors it would corrupt the data? Or it would fail? > In either case, the reference to "read-modify-write" in the > documentation seems misplaced. > > So a write MUST be physical_block_size > and SHOULD be minimum_io_size > > Now I don't get the difference between "preferred" and "optimal". > Surely we would always prefer everything to be optimal. > The definition of "optimal_io_size" from the doco says it is the > "preferred unit of receiving I/O". Very confusing. > > What I can see at present is 5 values: > logical_block_size > physical_block_size > minimum_io_size > optimal_io_size > read_ahead_kb > and only one distinction: "correctness" vs "performance" aka "MUST" vs > "SHOULD". Maybe there is another distinction: "SHOULD" for read and > "SHOULD" for write. > > Though reading further about the alignment, it seems that the > physical_block_size isn't really a 'MUST', as having a partition that > was not properly aligned to a MUST size would be totally broken. > > Is it possible to get more precise definitions of these? > I would like definitions that make strong statements so I can compare > these to the actual implementation to see if the implementation is > correct or not. > > My current thought for raid0 for example is that the only way it > differs from the max of the underlying devices is that the read-ahead > size should be N times the max for N drives. A read_ahead related to > optimal_io_size ?? Hi Neil, For some reason I thought you were aware of what Martin had put together. I assumed as much given you helped sort out some MD interface compile fixes in linux-next relative to topology-motivated changes. Anyway, not your fault that you didn't notice the core topology support.. It is likely a function of Martin having implemented the MD bits; you got this topology support "for free"; whereas I was forced to implement DM's topology support (and a few important changes to the core infrastructure). 
Here is a thread from April that discusses the core of the topology support: http://marc.info/?l=linux-ide&m=124058535512850&w=4 This post touches on naming and how userland tools are expected to consume the topology metrics: http://marc.info/?t=124055146700007&r=1&w=4 This post talks about the use of sysfs: http://marc.info/?l=linux-ide&m=124058543713031&w=4 Martin later shared the following with Alasdair and I and it really helped smooth out my understanding of these new topology metrics (I've updated it to reflect the current naming of these metrics): >>>>> "Alasdair" == Alasdair G Kergon <agk@redhat.com> writes: Alasdair> All I/O to any device MUST be a multiple of the hardsect_size. Correct. Alasdair> I/O need not be aligned to a multiple of the hardsect_size. Incorrect. It will inevitably be aligned to the hardsect_size but not necessarily to the physical block size. Let's stop using the term hardsect_size. It is confusing. I have killed the term. The logical_block_size (what was called hardsect_size) is the block size exposed to the operating system by way of the programming interface (ATA or SCSI Block Commands). The logical block size is the smallest atomic unit we can programmatically access on the device. The physical_block_size is the entity the storage device is using internally to organize data. On contemporary disk drives the logical and physical blocks are both 512 bytes. But even today we have RAID arrays whose internal block size is significantly bigger than the logical ditto. So far this hasn't been a big issue because the arrays have done a decent job at masking their internal block sizes. However, disk drives don't have that luxury because there's only one spindle and no non-volatile cache to mask the effect of accessing the device in a suboptimal fashion. The performance impact of doing read-modify-write on disk drives is huge. Therefore it's imperative that we submit aligned I/O once drive vendors switch to 4KB physical blocks (no later than 2011). So we have: - hardsect_size = logical_block_size = the smallest unit we can address using the programming interface - minimum_io_size = physical_block_size = the smallest unit we can write without incurring read-modify-write penalty - optimal_io_size = the biggest I/O we can submit without incurring a penalty (stall, cache or queue full). A multiple of minimum_io_size. - alignment_offset = padding to the start of the lowest aligned logical block. The actual minimum_io_size at the top of the stack may be scaled up (for RAID5 chunk size, for instance). 
Some common cases will be: Legacy drives: - 512-byte logical_block_size (was hardsect_size) - 512-byte physical_block_size (minimum_io_size) - 0-byte optimal_io_size (not specified, any multiple of minimum_io_size) - 0-byte alignment_offset (lowest logical block aligned to a 512-byte boundary is LBA 0) 4KB desktop-class SATA drives: - 512-byte logical_block_size (was hardsect_size) - 4096-byte physical_block_size (minimum_io_size) - 0-byte optimal_io_size (not specified, any multiple of minimum_io_size) - 3584-byte alignment_offset (lowest logical block aligned to a minimum_io_size boundary is LBA 7) 4KB SCSI drives and 4KB nearline SATA: - 4096-byte logical_block_size (was hardsect_size) - 4096-byte physical_block_size (minimum_io_size) - 0-byte optimal_io_size (not specified, any multiple of minimum_io_size) - 0-byte alignment_offset (lowest logical block aligned to a minimum_io_size boundary is LBA 0) Example 5-disk RAID5 array: - 512-byte logical_block_size (was hardsect_size) - 64-Kbyte physical_block_size (minimum_io_size == chunk size) - 256-Kbyte optimal_io_size (4 * minimum_io_size == full stripe) - 3584-byte alignment_offset (lowest logical block aligned to a 64-Kbyte boundary is LBA 7) Alasdair> There is an alignment_offset - of 63 sectors in your Alasdair> example. IF this is non-zero, I/O SHOULD be offset to a Alasdair> multiple of the minimum_io_size plus this alignment_offset. I/O SHOULD always be submitted in multiples of the minimum_io_size. I/O SHOULD always be aligned on the alignment_offset boundary. if ((bio->bi_sector * logical_block_size) % minimum_io_size == alignment_offset) /* I/O is aligned */
To me, it's more a matter > > of the term "queue" being a misnomer rather than the actual > > values/functions that are contained in struct request_queue. I always > > implicitly read request_queue as request_handling_goo. > > Agreed, the name 'queue' is part of the problem, and 'goo' might work > better. > But there is more to it than that. > Some fields are of interest only to code that has special knowledge > about the particular implementation. These fields will be different > for different types of devices. nr_request for the elevator, > chunk_size for a raid array. This is 'goo'. > Other fields are truly generic. 'size' 'read_ahead_kb' > 'hw_sector_size' are relevant to all devices and needed by some > filesystems. This is metrics, or bdi. > I think the 'particular' and the 'generic' should be in different > places. > > > > > That being said I don't have a problem moving the limits somewhere else > > if that's what people want to do. I agree that the current sysfs > > location for the device limits is mostly a function of implementation > > and backwards compatibility. > > Who do I have to get on side for you to be comfortable moving the > various metrics to 'bdi' (leaving legacy duplicates in 'queue' where > that is necessary) ?? i.e. which people need to want it? While I agree that adding these generic topology metrics to 'queue' may not be the perfect place I don't feel 'bdi' really helps userland understand them any better. Nor would userland really care. But I do agree that 'bdi' is likely a better place. You had mentioned your goal of removing MD's 'queue' entirely. Well DM already had that but Martin exposed a minimalist one as part of preparations for the topology support, see commit: cd43e26f071524647e660706b784ebcbefbd2e44 This 'bdi' vs 'queue' discussion really stands to cause problems for userland. It would be unfortunate to force tools be aware of 2 places. Rather than "phase out legacy usage" of these brand new topology limits it would likely be wise to get it right the first time. Again, I'm OK with 'queue'; but Neil if you feel strongly about 'bdi' we should get a patch to Linus ASAP for 2.6.31. I can take a stab at it now if you don't have time. We _could_ then backout cd43e26f071524647e660706b784ebcbefbd2e44 too? Mike ^ permalink raw reply [flat|nested] 16+ messages in thread
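The alignment test that closes the forwarded notes above, written out as a small compilable helper (the function name and userspace framing are illustrative; the arithmetic and the 4KB desktop-drive numbers are taken from the examples in that message):

#include <stdbool.h>
#include <stdint.h>

/* 'sector' is in logical blocks, as reported by the device */
static bool io_is_aligned(uint64_t sector,
                          unsigned int logical_block_size,
                          unsigned int minimum_io_size,
                          unsigned int alignment_offset)
{
        uint64_t byte_offset = sector * logical_block_size;

        return (byte_offset % minimum_io_size) == alignment_offset;
}

/*
 * 4KB desktop-class SATA example:
 *   logical_block_size = 512, minimum_io_size = 4096, alignment_offset = 3584
 *
 *   io_is_aligned(7, 512, 4096, 3584) -> true   (LBA 7 = byte 3584)
 *   io_is_aligned(0, 512, 4096, 3584) -> false  (LBA 0 = byte 0)
 */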
* Re: md: Use new topology calls to indicate alignment and I/O sizes 2009-06-24 15:27 ` Mike Snitzer @ 2009-06-24 16:27 ` Mike Snitzer 2009-06-24 23:28 ` Neil Brown 1 sibling, 0 replies; 16+ messages in thread From: Mike Snitzer @ 2009-06-24 16:27 UTC (permalink / raw) To: Neil Brown Cc: linux-raid, dm-devel, Linus Torvalds, Martin K. Petersen, Alasdair G Kergon On Wed, Jun 24 2009 at 11:27am -0400, Mike Snitzer <snitzer@redhat.com> wrote: > For some reason I thought you were aware of what Martin had put > together. I assumed as much given you helped sort out some MD interface > compile fixes in linux-next relative to topology-motivated changes. > Anyway, not your fault that you didn't notice the core topology > support.. It is likely a function of Martin having implemented the MD > bits; you got this topology support "for free"; whereas I was forced to > implement DM's topology support (and a few important changes to the core > infrastructure). Got these inverted: > Here is a thread from April that discusses the core of the topology > support: http://marc.info/?t=124055146700007&r=1&w=4 > This post touches on naming and how userland tools are expected to > consume the topology metrics: http://marc.info/?l=linux-ide&m=124058535512850&w=4 > > This post talks about the use of sysfs: > http://marc.info/?l=linux-ide&m=124058543713031&w=4 ... > While I agree that adding these generic topology metrics to 'queue' may > not be the perfect place I don't feel 'bdi' really helps userland > understand them any better. Nor would userland really care. But I do > agree that 'bdi' is likely a better place. > > You had mentioned your goal of removing MD's 'queue' entirely. Well DM > already had that but Martin exposed a minimalist one as part of > preparations for the topology support, see commit: > cd43e26f071524647e660706b784ebcbefbd2e44 > > This 'bdi' vs 'queue' discussion really stands to cause problems for > userland. It would be unfortunate to force tools be aware of 2 places. > Rather than "phase out legacy usage" of these brand new topology limits > it would likely be wise to get it right the first time. Again, I'm OK > with 'queue'; but Neil if you feel strongly about 'bdi' we should get a > patch to Linus ASAP for 2.6.31. > > I can take a stab at it now if you don't have time. On 2nd thought, bdi is inherently tied to mm and doesn't feel 'right'; I'll defer to Martin and Jens on this. Regards, Mike ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: md: Use new topology calls to indicate alignment and I/O sizes 2009-06-24 15:27 ` Mike Snitzer 2009-06-24 16:27 ` Mike Snitzer @ 2009-06-24 23:28 ` Neil Brown 2009-06-25 0:05 ` Martin K. Petersen 1 sibling, 1 reply; 16+ messages in thread From: Neil Brown @ 2009-06-24 23:28 UTC (permalink / raw) To: Mike Snitzer Cc: Martin K. Petersen, linux-raid, Linus Torvalds, Alasdair G Kergon, dm-devel On Wednesday June 24, snitzer@redhat.com wrote: > > > Hi Neil, > > For some reason I thought you were aware of what Martin had put > together. I assumed as much given you helped sort out some MD interface > compile fixes in linux-next relative to topology-motivated changes. > Anyway, not your fault that you didn't notice the core topology > support.. It is likely a function of Martin having implemented the MD > bits; you got this topology support "for free"; whereas I was forced to > implement DM's topology support (and a few important changes to the core > infrastructure). > > Here is a thread from April that discusses the core of the topology > support: > http://marc.info/?l=linux-ide&m=124058535512850&w=4 Thanks for these. I see they were mainly on linux-ide and linux-scsi, neither of which I read. They also eventually made an appearance on linux-kernel, but it is only by luck that I find important stuff there. Yes, I have been aware of it for a while but had my mind on other things and figured other people could probably get the details right without me interfering.... > > This post touches on naming and how userland tools are expected to > consume the topology metrics: > http://marc.info/?t=124055146700007&r=1&w=4 > > This post talks about the use of sysfs: > http://marc.info/?l=linux-ide&m=124058543713031&w=4 > > > Martin later shared the following with Alasdair and I and it really > helped smooth out my understanding of these new topology metrics (I've > updated it to reflect the current naming of these metrics): Thanks for this - quite helpful. I might refer to bit in my reply to Martin. > > > > > > That being said I don't have a problem moving the limits somewhere else > > > if that's what people want to do. I agree that the current sysfs > > > location for the device limits is mostly a function of implementation > > > and backwards compatibility. > > > > Who do I have to get on side for you to be comfortable moving the > > various metrics to 'bdi' (leaving legacy duplicates in 'queue' where > > that is necessary) ?? i.e. which people need to want it? > > While I agree that adding these generic topology metrics to 'queue' may > not be the perfect place I don't feel 'bdi' really helps userland > understand them any better. Nor would userland really care. But I do > agree that 'bdi' is likely a better place. > > You had mentioned your goal of removing MD's 'queue' entirely. Well DM > already had that but Martin exposed a minimalist one as part of > preparations for the topology support, see commit: > cd43e26f071524647e660706b784ebcbefbd2e44 Oh yuck. I wasn't aware of that. This is just horrible. > > We _could_ then backout cd43e26f071524647e660706b784ebcbefbd2e44 too? please please please please please! (I haven't really paid attention, but I thought everything always had a 'queue' - I just ignored in on md devices because it didn't mean anything). NeilBrown ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: md: Use new topology calls to indicate alignment and I/O sizes 2009-06-24 23:28 ` Neil Brown @ 2009-06-25 0:05 ` Martin K. Petersen 0 siblings, 0 replies; 16+ messages in thread From: Martin K. Petersen @ 2009-06-25 0:05 UTC (permalink / raw) To: Neil Brown Cc: Mike Snitzer, Martin K. Petersen, linux-raid, dm-devel, jens.axboe, Linus Torvalds, Alasdair G Kergon >>>>> "Neil" == Neil Brown <neilb@suse.de> writes: Neil> Thanks for this - quite helpful. I might refer to bit in my reply Neil> to Martin. I'm putting Jens on Cc: so he gets a chance to reply... -- Martin K. Petersen Oracle Linux Engineering ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] md: Use new topology calls to indicate alignment and I/O sizes 2009-06-24 6:22 ` Neil Brown 2009-06-24 15:27 ` Mike Snitzer @ 2009-06-24 17:07 ` Martin K. Petersen 2009-06-25 2:35 ` Neil Brown 1 sibling, 1 reply; 16+ messages in thread From: Martin K. Petersen @ 2009-06-24 17:07 UTC (permalink / raw) To: Neil Brown Cc: Martin K. Petersen, Mike Snitzer, linux-raid, snitzer, Linus Torvalds >>>>> "Neil" == Neil Brown <neilb@suse.de> writes: Neil, Neil> Are you saying that if you tried to write a 512byte sector to a Neil> SATA drive with 4KB sectors it would corrupt the data? Or it Neil> would fail? In either case, the reference to "read-modify-write" Neil> in the documentation seems misplaced. The next generation SATA drives will have 512-byte logical block size but use a 4096-byte internal block size. If you submit a 512-byte write to such a drive it will have to first read the 4KB sector, add the 512 bytes of new data, and then write the combined result. This means you have to do an extra rotation for each I/O. I have some drives here and the performance impact is huge for random I/O workloads. Now, the reason I keep the physical_block_size around is because the usual sector atomicity "guarantees" get tweaked with 4KB drives. For instance, assume that your filesystem is journaling in 512-byte increments and rely on a 512-byte write being atomic. On a 4KB drive you could get an I/O error on logical block N within a 4KB physical block. That would cause the previous writes to that sector to "disappear" despite having been acknowledged to the OS. Consequently, it is important that we know the actual atomicity of the underlying storage for correctness reasons. Hence the physical block size parameter. Neil> Now I don't get the difference between "preferred" and "optimal". I don't think there is a difference. Neil> Surely we would always prefer everything to be optimal. The Neil> definition of "optimal_io_size" from the doco says it is the Neil> "preferred unit of receiving I/O". Very confusing. I think that comes out of the SCSI spec. The knobs were driven by hardware RAID arrays that prefer writing in multiples of full stripe widths. I like to think of things this way: Hardware limitations (MUST): - logical_block_size is the smallest unit the device can address. - physical_block_size is the smallest I/O the device can perform atomically. >= logical_block_size. - alignment_offset describes how much LBA 0 is offset from the natural (physical) block alignment. Performance hints (SHOULD): - minimum_io_size is the preferred I/O size for random writes. No R-M-W. >= physical_block_size. - optimal_io_size is the preferred I/O size for large sustained writes. Best utilization of the spindles available (or whatever makes sense given the type of device). Multiple of minimum_io_size. Neil> Though reading further about the alignment, it seems that the Neil> physical_block_size isn't really a 'MUST', as having a partition Neil> that was not properly aligned to a MUST size would be totally Neil> broken. The main reason for exporting these values in sysfs is so that fdisk/parted/dmsetup/mdadm can avoid creating block devices that will cause misaligned I/O. And then libdisk/mkfs.* might use the performance hints to make sure things are aligned to stripe units etc. It is true that we could use the knobs inside the kernel to adjust things at runtime. And we might. 
But the main motivator here is to make sure we lay out things correctly when creating block devices/partitions/filesystems on top of these - ahem - quirky devices coming out. Neil> My current thought for raid0 for example is that the only way it Neil> differs from the max of the underlying devices is that the Neil> read-ahead size should be N times the max for N drives. A Neil> read_ahead related to optimal_io_size ?? Optimal I/O size is mainly aimed at making sure you write in multiples of the stripe size so you can keep all drives equally busy in a RAID setup. The read-ahead size is somewhat orthogonal but I guess we could wire it up to the optimal_io_size for RAID arrays. I haven't done any real life testing to see whether that would improve performance. Neil> Who do I have to get on side for you to be comfortable moving the Neil> various metrics to 'bdi' (leaving legacy duplicates in 'queue' Neil> where that is necessary) ?? i.e. which people need to want it? Jens at the very minimum :) -- Martin K. Petersen Oracle Linux Engineering ^ permalink raw reply [flat|nested] 16+ messages in thread
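As an illustration of what a partitioning tool can do with those values, the sketch below rounds a proposed partition start up to the first LBA that yields aligned I/O on the device underneath; the function name is hypothetical and real fdisk/parted logic will differ:

#include <stdint.h>

/*
 * Smallest LBA >= proposed_lba whose byte offset is congruent to
 * alignment_offset modulo minimum_io_size.
 */
static uint64_t first_aligned_lba(uint64_t proposed_lba,
                                  unsigned int logical_block_size,
                                  unsigned int minimum_io_size,
                                  unsigned int alignment_offset)
{
        uint64_t bytes = proposed_lba * logical_block_size;
        uint64_t delta = (alignment_offset + minimum_io_size -
                          bytes % minimum_io_size) % minimum_io_size;

        return (bytes + delta) / logical_block_size;
}

/*
 * For the 5-disk RAID5 example earlier in the thread
 * (logical_block_size 512, minimum_io_size 65536, alignment_offset 3584):
 *   first_aligned_lba(63, 512, 65536, 3584) == 135   (byte 69120 = 65536 + 3584)
 *   first_aligned_lba(7,  512, 65536, 3584) == 7     (already aligned)
 */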
* Re: [PATCH] md: Use new topology calls to indicate alignment and I/O sizes 2009-06-24 17:07 ` [PATCH] " Martin K. Petersen @ 2009-06-25 2:35 ` Neil Brown 2009-06-25 4:37 ` Martin K. Petersen 0 siblings, 1 reply; 16+ messages in thread From: Neil Brown @ 2009-06-25 2:35 UTC (permalink / raw) To: Martin K. Petersen Cc: Mike Snitzer, linux-raid, snitzer, jens.axboe, Linus Torvalds On Wednesday June 24, martin.petersen@oracle.com wrote: > >>>>> "Neil" == Neil Brown <neilb@suse.de> writes: > > Neil, > > Neil> Are you saying that if you tried to write a 512byte sector to a > Neil> SATA drive with 4KB sectors it would corrupt the data? Or it > Neil> would fail? In either case, the reference to "read-modify-write" > Neil> in the documentation seems misplaced. > > The next generation SATA drives will have 512-byte logical block size > but use a 4096-byte internal block size. > > If you submit a 512-byte write to such a drive it will have to first > read the 4KB sector, add the 512 bytes of new data, and then write the > combined result. This means you have to do an extra rotation for each > I/O. I have some drives here and the performance impact is huge for > random I/O workloads. > > Now, the reason I keep the physical_block_size around is because the > usual sector atomicity "guarantees" get tweaked with 4KB drives. For > instance, assume that your filesystem is journaling in 512-byte > increments and rely on a 512-byte write being atomic. On a 4KB drive > you could get an I/O error on logical block N within a 4KB physical > block. That would cause the previous writes to that sector to > "disappear" despite having been acknowledged to the OS. Consequently, > it is important that we know the actual atomicity of the underlying > storage for correctness reasons. Hence the physical block size > parameter. "atomicity". That is a very useful word that is completely missing from the documentation. I guess is it very similar to "no R-M-W" but the observation about leaking write errors is a very useful one I think that shouldn't be left out. So: physical_block_size : the smallest unit of atomic writes. A write error in one physical block will never corrupt data in a different physical block. That starts to make sense. With RAID5, this definition would make the physical_block_size the same as the stripe size because if the array is degraded (which is the only time a write error can be visible), a write error will potentially corrupt other blocks in the stripe. Does the physical_block_size have to be a power of 2?? > > > Neil> Now I don't get the difference between "preferred" and "optimal". > > I don't think there is a difference. Good. Let's just choose one and clean up the documentation. > > > Neil> Surely we would always prefer everything to be optimal. The > Neil> definition of "optimal_io_size" from the doco says it is the > Neil> "preferred unit of receiving I/O". Very confusing. > > I think that comes out of the SCSI spec. The knobs were driven by > hardware RAID arrays that prefer writing in multiples of full stripe > widths. > > I like to think of things this way: > > Hardware limitations (MUST): > > - logical_block_size is the smallest unit the device can address. This is one definition I have no quibble with at all. > > - physical_block_size is the smallest I/O the device can perform > atomically. >= logical_block_size. Well, the smallest write, 'O', but not 'I'. I guess it is the smallest atomic read is many cases, but I cannot see that being relevant. 
Is it OK to just talk about the 'write' path here? > > - alignment_offset describes how much LBA 0 is offset from the natural > (physical) block alignment. > > > Performance hints (SHOULD): > > - minimum_io_size is the preferred I/O size for random writes. No > R-M-W. >= physical_block_size. Presumably this is not simply >= physical_block_size, but is an integer multiple of physical_block_size ?? This is assumed to be aligned the same as physical_block_size and no further alignment added? i.e. the address should be alignment_offset + N * minimum_io_size ??? I think that is reasonable, but should be documented. And again, as it is for 'writes', let's drop the 'I'. And I'm still not seeing a clear distinction between this and physical_block_size. The difference seems to be simply that there are two different R-M-W scenarios happening: one in the drive and one in a RAID4/5/6 array. Each can cause atomicity problems. Maybe we really want physical_block_size_A and physical_block_size_B. Where B is preferred, but A is better than nothing. You could even add that A must be a power of 2, but B doesn't need to be. > > - optimal_io_size is the preferred I/O size for large sustained writes. > Best utilization of the spindles available (or whatever makes sense > given the type of device). Multiple of minimum_io_size. In the email Mike forwarded, you said: - optimal_io_size = the biggest I/O we can submit without incurring a penalty (stall, cache or queue full). A multiple of minimum_io_size. Which has a subtly different implication. If the queue and cache are configurable (as is the case for e.g. md/raid5) then this is a dynamic value (contrasting with 'spindles' which are much less likely to be dynamic) and so is really of interest only to the VM and filesystem while the device is being accessed. I'm not even sure it is interesting to them. Surely the VM should just throw writes at the device in minimum_io_size chunks until the device sets its "congested" flag - then the VM backs off. Knowing in advance what that size will be doesn't seem very useful (but if it is, it would be good to have that explained in the documentation). This is in strong contrast to minimum_io_size which I think could be very useful. The VM tries to gather writes in chunks of this size, and the filesystem tries to lay out blocks with this sort of alignment. I think it would be a substantial improvement to the documentation to give examples of how the values would be used. As an aside, I can easily imagine devices where the minimum_io_size varies across the address-space of the device - RAID-X being one interesting example. Regular hard drives being another if it helped to make minimum_io_size line up with the track or cylinder size. So maybe it would be best not to export this, but to provide a way for the VM and filesystem to discover it dynamically on a per-address basis?? I guess that is not good for mkfs on a filesystem that is designed to expect a static layout. > > > Neil> Though reading further about the alignment, it seems that the > Neil> physical_block_size isn't really a 'MUST', as having a partition > Neil> that was not properly aligned to a MUST size would be totally > Neil> broken. > > The main reason for exporting these values in sysfs is so that > fdisk/parted/dmsetup/mdadm can avoid creating block devices that will > cause misaligned I/O. > > And then libdisk/mkfs.* might use the performance hints to make sure > things are aligned to stripe units etc.
> > It is true that we could use the knobs inside the kernel to adjust > things at runtime. And we might. But the main motivator here is to > make sure we lay out things correctly when creating block > devices/partitions/filesystems on top of these - ahem - quirky devices > coming out. This clearly justifies logical_block_size, physical_block_size, alignment_offset. And it justifies minimum_io_size as a hint. (Which is really minimum_write_size). I'm not so sure about optimal_io_size.. I guess it is just another hint. There is a bit of a pattern there. We have a number (2) of different write sizes where going below that size risks integrity, and a number (2) where going below that size risks performance. So maybe we should be explicit about that and provide some lists of sizes: safe_write_size: 512 4096 327680 optimal_write_size: 65536 327680 10485760 (which is for a raid5 with 6 drives, 64k chunk size, 4K blocks on underlying device, and a cache which comfortably holds 32 stripes). In each case, the implication is to use the biggest size possible, but if you cannot, use a multiple of the largest size you can reach. safe_write size is for the filesystem to lay out data correctly, optimal_write_size is for the filesystem and the VM to maximise speed. > > > Neil> My current thought for raid0 for example is that the only way it > Neil> differs from the max of the underlying devices is that the > Neil> read-ahead size should be N times the max for N drives. A > Neil> read_ahead related to optimal_io_size ?? > > Optimal I/O size is mainly aimed at making sure you write in multiples > of the stripe size so you can keep all drives equally busy in a RAID > setup. > > The read-ahead size is somewhat orthogonal but I guess we could wire it > up to the optimal_io_size for RAID arrays. I haven't done any real life > testing to see whether that would improve performance. md raid arrays already set the read ahead to a multiple of the stripe size (though possibly not a large enough multiple in some cases as it happens). In some sense it is orthogonal, but in another sense it is part of the same set of data: metrics of the device that improve performance. > > > Neil> Who do I have to get on side for you to be comfortable moving the > Neil> various metrics to 'bdi' (leaving legacy duplicates in 'queue' > Neil> where that is necessary) ?? i.e. which people need to want it? > > Jens at the very minimum :) I'll prepare a missive, though I've already included Jens in the CC on this as you (separately) suggest. Thanks, NeilBrown ^ permalink raw reply [flat|nested] 16+ messages in thread
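For reference, the numbers in those two example lists fall out of the stated configuration (6-drive raid5, 64k chunk, 4K blocks on the members, a cache holding 32 stripes) as follows:

        chunk                     = 65536 bytes (64k)
        full stripe = 5 * 65536   = 327680 bytes (5 data chunks per 6-drive stripe)
        cache       = 32 * 327680 = 10485760 bytes

        safe_write_size:    512 (logical block), 4096 (member physical block), 327680 (full stripe)
        optimal_write_size: 65536 (chunk), 327680 (full stripe), 10485760 (32 stripes)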
* Re: [PATCH] md: Use new topology calls to indicate alignment and I/O sizes 2009-06-25 2:35 ` Neil Brown @ 2009-06-25 4:37 ` Martin K. Petersen 2009-06-25 6:16 ` Neil Brown 0 siblings, 1 reply; 16+ messages in thread From: Martin K. Petersen @ 2009-06-25 4:37 UTC (permalink / raw) To: Neil Brown Cc: Martin K. Petersen, Mike Snitzer, linux-raid, snitzer, jens.axboe, Linus Torvalds >>>>> "Neil" == Neil Brown <neilb@suse.de> writes: Neil, Neil> With RAID5, this definition would make the physical_block_size the Neil> same as the stripe size because if the array is degraded (which is Neil> the only time a write error can be visible), a write error will Neil> potentially corrupt other blocks in the stripe. But with MD RAID5 we know about it. The problem with disk drives (and even some arrays) is that the result is undefined. But see below. Neil> Well, the smallest write, 'O', but not 'I'. I guess it is the Neil> smallest atomic read is many cases, but I cannot see that being Neil> relevant. Is it OK to just talk about he 'write' path here? As I said earlier, these values are write centric. People didn't see much value in providing a similar set of knobs for reads so I removed them. >> - minimum_io_size is the preferred I/O size for random writes. No >> R-M-W. >= physical_block_size. Neil> Presumably this is not simply >= physical_block_size, but is an Neil> integer multiple of physical_block_size ?? Yep. Neil> This is assumed to be aligned the same as physical_block_size and Neil> no further alignment added? i.e. the address should be Neil> alignment_offset + N * minimum_io_size ??? Yep. Neil> Maybe we really want physical_block_size_A and Neil> physical_block_size_B. Where B is preferred, but A is better than Neil> nothing. You could even add that A must be a power of 2, but B Neil> doesn't need to be. The way I view it is that physical_block_size is tied to the low-level hardware. It is called physical_block_size to match the definition in the ATA and SCSI specs. That's also where logical_block_size comes from. Applications that care about the write block size should look at minimum_io_size. Regardless of whether they are sitting on top of the raw disk or MD or DM. Above the physical disk layer pbs mostly has meaning as housekeeping. I.e. it records the max pbs of the component devices. For instance a RAID1 using a 512-byte drive and a 4KB drive will have a pbs of 4KB. There are a few special cases where something may want to look directly at physical_block_size. But that's in the journal padding/cluster heartbeat department. Neil> In the email Mike forwarded, you said: Neil> - optimal_io_size = the biggest I/O we can submit without Neil> incurring a penalty (stall, cache or queue full). A multiple Neil> of minimum_io_size. I believe I corrected that description later in the thread. But anyway... Neil> Which has a subtly different implication. If the queue and cache Neil> are configurable (as is the case for e.g. md/raid5) then this is a Neil> dynamic value (contrasting with 'spindles' which are much less Neil> likely to be dynamic) and so is really of interest only to the VM Neil> and filesystem while the device is being accessed. The SCSI spec says: "The OPTIMAL TRANSFER LENGTH GRANULARITY field indicates the optimal transfer length granularity in blocks for a single [...] command. Transfers with transfer lengths not equal to a multiple of this value may incur significant delays in processing." I tried to provide a set of hints that could: 1. 
Be seeded by the knobs actually provided in the hardware specs 2. Help filesystems lay out things optimally for the underlying storage using a single interface regardless of whether it was disk, array, MD or LVM 3. Make sense when allowing essentially arbitrary stacking of MD and LVM devices (for fun add virtualization to the mix) 4. Allow us to make sure we did not submit misaligned requests And consequently I am deliberately being vague. What makes sense for an spinning disk doesn't make sense for an SSD. Or for a Symmetrix. Or for LVM on top of MD on top of 10 Compact Flash devices. So min and opt are hints that are supposed to make some sort of sense regardless of what your actual storage stack looks like. I'm happy to take a stab at making the documentation clearer. But I am against making it so explicit that the terminology only makes sense in an "MD RAID5 on top of three raw disks" universe. I think "Please don't submit writes smaller than this" and "I prefer big writes in multiples of this" are fairly universal. Neil> As an aside, I can easily imagine devices where the Neil> minimum_io_size varies across the address-space of the device - Neil> RAID-X being one interesting example. Regular hard drives being Neil> another if it helped to make minimum_io_size line up with the Neil> track or cylinder size. So maybe it would be best not to export Neil> this, but to provide a way for the VM and filesystem do discover Neil> it dynamically on a per-address basic?? This is what I presented at the Storage & Filesystems workshop. And people hated the "list of topologies" thing. General consensus was that it was too complex. The original code had: /sys/block/topology/nr_regions /sys/block/topology/0/{offset,length,min_io,opt_io,etc.} /sys/block/topology/1/{offset,length,min_io,opt_io,etc.} Aside from the nightmare of splitting and merging topologies when stacking there were also obvious problems with providing an exact representation. For instance a RAID1 with drives with different topologies. How do you describe that? Or a RAID0 with similar mismatched drives where each - say - 64KB chunk in the stripe has a different topology. That becomes a really long list. And if you make it a mapping function callback then what's mkfs supposed to do? Walk the entire block device to get the picture? There was a long discussion about this at the workshop. My code started out last summer as a tiny patch kit called "I/O hints" and the resulting topology patch series as of this spring had turned into a big, bloated monster thanks to arbitrary stacking. So I was not sad to see heterogeneous topology code go at the workshop. It was complex despite my best efforts to keep things simple. The intent with the changes that are now in the kernel is to: 1. Make sure we align properly by way of the hardware metrics (physical_block_size, alignment_offset) 2. Provide some hints that filesystems may or may not use to lay out things (minimum_io_size, optimal_io_size) We already do (2) and have been for ages by way of libdisk which queries LVM or MD but only understands the top device. So part of fixing (2) involved making stacking do the right thing and making a unified interface for a block device to kill the if (this_is_an_md_dev(foo)) /* Go poke MD with ioctls */ else if (this_is_a_dm_dev(foo) /* Go fork LVM utilities and parse cmd line output */ Everybody agreed that it would be better to have a unified set of hints in /sys/block. So that's why things were done this way. 
Neil> This clearly justifies logical_block_size, physical_block_size,
Neil> alignment_offset.  And it justifies minimum_io_size as a hint.
Neil> (Which is really minimum_write_size).  I'm not so sure about
Neil> optimal_io_size..  I guess it is just another hint.

Absolutely.

Neil> There is a bit of a pattern there.  We have a number (2) of
Neil> different write sizes where going below that size risks integrity,
Neil> and a number (2) where going below that size risks performance.
Neil> So maybe we should be explicit about that and provide some lists
Neil> of sizes:

Neil>   safe_write_size:    512 4096 327680
Neil>   optimal_write_size: 65536 327680 10485760

Violates the one-value-per-sysfs file rule :)

I understand your point but I think it's too much information.

--
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply	[flat|nested] 16+ messages in thread
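[Editor's illustration: a small sketch of the alignment rule Martin
confirms above, i.e. a write should start at
alignment_offset + N * minimum_io_size.  The function and variable
names are hypothetical; the values are assumed to come from the sysfs
attributes discussed in this thread.]

/* Sketch: check that a write obeys the alignment rule. */
#include <stdbool.h>
#include <stdint.h>

static bool write_is_aligned(uint64_t start, uint32_t len,
			     uint32_t alignment_offset,
			     uint32_t minimum_io_size)
{
	if (!minimum_io_size)
		return true;	/* no hint exported: nothing to check */

	if (start < alignment_offset)
		return false;

	/* start must sit on an io_min boundary shifted by alignment_offset */
	if ((start - alignment_offset) % minimum_io_size)
		return false;

	/* and the length should be a multiple of io_min to avoid R-M-W */
	return (len % minimum_io_size) == 0;
}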
* Re: [PATCH] md: Use new topology calls to indicate alignment and I/O sizes
  2009-06-25 4:37 ` Martin K. Petersen
@ 2009-06-25 6:16 ` Neil Brown
  2009-06-25 16:24   ` Martin K. Petersen
  0 siblings, 1 reply; 16+ messages in thread
From: Neil Brown @ 2009-06-25 6:16 UTC (permalink / raw)
To: Martin K. Petersen
Cc: Mike Snitzer, linux-raid, snitzer, jens.axboe, Linus Torvalds

On Thursday June 25, martin.petersen@oracle.com wrote:
> >>>>> "Neil" == Neil Brown <neilb@suse.de> writes:
> Neil> Well, the smallest write, 'O', but not 'I'.  I guess it is the
> Neil> smallest atomic read in many cases, but I cannot see that being
> Neil> relevant.  Is it OK to just talk about the 'write' path here?
>
> As I said earlier, these values are write centric.  People didn't see
> much value in providing a similar set of knobs for reads so I removed
> them.

Agreed.  All I'm asking here is to change the names to reflect this
truth.  minimum_write_size, optimum_write_size

> Neil> Maybe we really want physical_block_size_A and
> Neil> physical_block_size_B.  Where B is preferred, but A is better than
> Neil> nothing.  You could even add that A must be a power of 2, but B
> Neil> doesn't need to be.
>
> The way I view it is that physical_block_size is tied to the low-level
> hardware.  It is called physical_block_size to match the definition in
> the ATA and SCSI specs.  That's also where logical_block_size comes
> from.
>
> Applications that care about the write block size should look at
> minimum_io_size.  Regardless of whether they are sitting on top of the
> raw disk or MD or DM.  Above the physical disk layer pbs mostly has
> meaning as housekeeping.  I.e. it records the max pbs of the component
> devices.  For instance a RAID1 using a 512-byte drive and a 4KB drive
> will have a pbs of 4KB.
>
> There are a few special cases where something may want to look directly
> at physical_block_size.  But that's in the journal padding/cluster
> heartbeat department.

Here your thinking seems to be very specific.  This value comes from
that spec.  That value is used for this particular application.
Yet....

>
> And consequently I am deliberately being vague.  What makes sense for a
> spinning disk doesn't make sense for an SSD.  Or for a Symmetrix.  Or
> for LVM on top of MD on top of 10 Compact Flash devices.  So min and
> opt are hints that are supposed to make some sort of sense regardless
> of what your actual storage stack looks like.

...here you are being deliberately vague.  But with the exact same
values.  I find this confusing.

I think that we need to have values that have very strong and well
defined meanings.  I read your comments above as saying something like:

  When writing, use at least minimum_io_size unless you are journal or
  heartbeat or similar code, then use physical_block_size.

I don't like that definition at all.  Conversely:

  "use the largest of these values that is practical for you, each is
  better than the previous"

does work nicely.  It actually reflects the reality of devices, gives
strong guidance to filesystems and doesn't carry any baggage.

>
> I'm happy to take a stab at making the documentation clearer.  But I am
> against making it so explicit that the terminology only makes sense in
> an "MD RAID5 on top of three raw disks" universe.
>

I often find that when it is hard to document something clearly, it is
because that something itself is not well defined...
My target for this or any similar interface documentation is that the
provider must be guided by the documentation to know exactly what value
to present, and the consumer must be guided by the documentation to
know exactly which value to use.

>
> Neil> There is a bit of a pattern there.  We have a number (2) of
> Neil> different write sizes where going below that size risks integrity,
> Neil> and a number (2) where going below that size risks performance.
> Neil> So maybe we should be explicit about that and provide some lists
> Neil> of sizes:
>
> Neil>   safe_write_size:    512 4096 327680
> Neil>   optimal_write_size: 65536 327680 10485760
>
> Violates the one-value-per-sysfs file rule :)

Since when are array values not values?  It *is* only one value per
file!

>
> I understand your point but I think it's too much information.

I think that it is exactly the same information that you are
presenting, but in a more meaningful and less confusing (to me) way.

  safe_write_size:    logical_block_size physical_block_size minimum_io_size
  optimal_write_size: physical_block_size minimum_io_size optimal_io_size

And it is more flexible if someone has a more complex hierarchy.
The 'alignment' value applies equally to any size value which is larger
than it.  (I wonder if 'alignment' could benefit from being an array
value..... probably not).

Just for completeness, I tried to write documentation for the above two
array values to fit my guidelines described earlier.  I found that I
couldn't find a really convincing case for the distinction.  A
filesystem wants all of its data to be safe, and wants all of its
writes to be fast.  And in all the examples we can think of, a safer
write is faster than a less-safe write.

What that leaves us with is a simple increasing list of sizes, each a
multiple of the previous.  Each safer and/or faster, but also larger.
So I seem to now be suggesting a single array value:

  preferred_write_size: logical_block_size physical_block_size \
                        minimum_io_size optimal_io_size

It would be documented as:

  preferred_write_size: A list of sizes (in bytes) such that a write of
  that size, aligned to a multiple of that size (plus alignment_offset),
  is strongly preferred over a smaller write for reasons of safety
  and/or performance.  All writes *must* be a multiple of the smallest
  listed size.  Due to the hierarchical nature of some storage systems,
  there may be a list of values, each safer and/or faster than the
  previous, but also larger.  Subsequent values will normally be
  multiples of previous values.  Sizes other than the first are not
  constrained to be powers of 2.

  A filesystem or other client should choose the largest listed size
  (or a multiple thereof) which fits within any other constraint it is
  working under.

  Note: the 'safety' aspect only applies to cases where a device error
  or system crash occurs.  In such cases there may be a chance that
  data outside the target area of a write gets corrupted.  When there
  is no such error, all writes are equally safe.

NeilBrown

^ permalink raw reply	[flat|nested] 16+ messages in thread
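[Editor's illustration: a sketch of the consumer side of Neil's
proposed array value.  The preferred_write_size interface was never
merged, so the function, its inputs and the parsing of the list are all
hypothetical.]

/* Sketch: pick the largest listed size that fits a caller constraint,
 * per the draft documentation above.  The list is assumed to be
 * increasing, smallest entry first.
 */
#include <stddef.h>
#include <stdint.h>

static uint64_t choose_write_size(const uint64_t *sizes, size_t n,
				  uint64_t budget)
{
	uint64_t chosen;
	size_t i;

	if (n == 0)
		return 0;

	chosen = sizes[0];		/* mandatory minimum granularity */
	for (i = 1; i < n; i++)
		if (sizes[i] <= budget)
			chosen = sizes[i];	/* larger is safer and/or faster */
	return chosen;
}

With Neil's example list of 512, 4096 and 327680 and a 64KB budget,
this would settle on 4096.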
* Re: [PATCH] md: Use new topology calls to indicate alignment and I/O sizes
  2009-06-25 6:16 ` Neil Brown
@ 2009-06-25 16:24 ` Martin K. Petersen
  0 siblings, 0 replies; 16+ messages in thread
From: Martin K. Petersen @ 2009-06-25 16:24 UTC (permalink / raw)
To: Neil Brown
Cc: Martin K. Petersen, Mike Snitzer, linux-raid, snitzer, jens.axboe, Linus Torvalds

>>>>> "Neil" == Neil Brown <neilb@suse.de> writes:

Neil> Here your thinking seems to be very specific.  This value comes
Neil> from that spec.  That value is used for this particular
Neil> application.  Yet....

[...]

Neil> ...here you are being deliberately vague.  But with the exact same
Neil> values.  I find this confusing.

That's because I distinguish between hardware-imposed limits and
software-imposed ditto.  You don't think there's a difference.  I do.
We don't have any control whatsoever over the hardware limits.  We do
have full control over the software ones.

hw_sector_size is dead and has been replaced with logical_block_size
and physical_block_size to accommodate new hardware devices.  That's
all there is to that.  Full stop.

The presence of lbs and pbs is completely orthogonal to I/O hints
provided by hardware as well as software RAID devices.  The only thing
the two concepts have in common is that we need to make sure that any
hints applied by stacking drivers are compatible with the underlying
storage limitations.

You could argue that an MD device shouldn't have logical and physical
block size exposed at all.  To some extent I agree.  However, some
applications actually *do* require intimate knowledge about the
hardware.  Consequently I am in favor of having the knobs exposed.
Besides, the values are also being used extensively by the stacking
logic.

The desire to have knowledge about the hardware is not exclusively a
storage thing.  Page size, for instance.  Most applications don't care
about the page size.  That doesn't mean we remove getpagesize() or pack
it into something that applies equally well to all applications
(memory_allocation_granularity() !?).  It is a knob that is available
for the few applications that need it.  The same goes for
physical_block_size.

--
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply	[flat|nested] 16+ messages in thread
* Re: md: Use new topology calls to indicate alignment and I/O sizes 2009-06-24 4:05 ` Neil Brown 2009-06-24 5:03 ` Martin K. Petersen @ 2009-06-30 20:24 ` Mike Snitzer 2009-06-30 23:58 ` Neil Brown 1 sibling, 1 reply; 16+ messages in thread From: Mike Snitzer @ 2009-06-30 20:24 UTC (permalink / raw) To: Neil Brown Cc: jens.axboe, Martin K. Petersen, linux-raid, linux-kernel, Linus Torvalds On Wed, Jun 24 2009 at 12:05am -0400, Neil Brown <neilb@suse.de> wrote: > On Tuesday June 23, snitzer@gmail.com wrote: > > > > On Tue, Jun 23, 2009 at 12:54 AM, Martin K. > > Petersen<martin.petersen@oracle.com> wrote: > > > > > > Neil, > > > > > > Looks like this one missed the boat on the first MD pull request. > > > Here's an updated patch that applies to Linus' current tree in case you > > > are planning a second round... > > > > > > > > > Switch MD over to the new disk_stack_limits() function which checks for > > > aligment and adjusts preferred I/O sizes when stacking. > > > > > > Also indicate preferred I/O sizes where applicable. > > > > Given that Linus has not yet pulled in the DM changes for 2.6.31 > > (which includes DM's topology support) I'd imagine it shouldn't be a > > problem to get this patch in. In fact it would be a real shame if > > this somehow missed 2.6.31 seeing as MD is the only consumer of the > > topology infrastructure's disk_stack_limits(). > > > > Neil, please push to Linus for 2.6.31, thanks! > > > > It looks mostly OK and can now be pulled from > git://neil.brown.name/md/ for-linus > though I'll send a more formal pull request later (if necessary) Neil, Any chance you could make a formal pull request for this MD topology support patch for 2.6.31-rc2? NOTE: all the disk_stack_limits() calls must pass the 3rd 'offset' arg in terms of bytes and _not_ sectors. So all of MD's calls to disk_stack_limits() should pass: rdev->data_offset << 9 Here is an updated patch, please advise, thanks! From: Martin K. Petersen <martin.petersen@oracle.com> Switch MD over to the new disk_stack_limits() function which checks for alignment and adjusts preferred I/O sizes when stacking. Also indicate preferred I/O sizes where applicable. Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com> -- diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 15c8b7b..5810fa9 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -166,8 +166,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) rdev->sectors = sectors * mddev->chunk_sectors; } - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); /* as we don't honour merge_bvec_fn, we must never risk * violating it, so limit ->max_sector to one PAGE, as * a one page request is never in violation. 
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index cbe368f..3e320ac 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -294,7 +294,8 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) for (path = first; path <= last; path++) if ((p=conf->multipaths+path)->rdev == NULL) { q = rdev->bdev->bd_disk->queue; - blk_queue_stack_limits(mddev->queue, q); + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); /* as we don't honour merge_bvec_fn, we must never risk * violating it, so limit ->max_sector to one PAGE, as @@ -463,9 +464,9 @@ static int multipath_run (mddev_t *mddev) disk = conf->multipaths + disk_idx; disk->rdev = rdev; + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); /* as we don't honour merge_bvec_fn, we must never risk * violating it, not that we ever expect a device with * a merge_bvec_fn to be involved in multipath */ diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index ab4a489..335f490 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -170,8 +170,8 @@ static int create_strip_zones(mddev_t *mddev) } dev[j] = rdev1; - blk_queue_stack_limits(mddev->queue, - rdev1->bdev->bd_disk->queue); + disk_stack_limits(mddev->gendisk, rdev1->bdev, + rdev1->data_offset << 9); /* as we don't honour merge_bvec_fn, we must never risk * violating it, so limit ->max_sector to one PAGE, as * a one page request is never in violation. @@ -250,6 +250,11 @@ static int create_strip_zones(mddev_t *mddev) mddev->chunk_sectors << 9); goto abort; } + + blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); + blk_queue_io_opt(mddev->queue, + (mddev->chunk_sectors << 9) * mddev->raid_disks); + printk(KERN_INFO "raid0: done.\n"); mddev->private = conf; return 0; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 89939a7..0569efb 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1123,8 +1123,8 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) for (mirror = first; mirror <= last; mirror++) if ( !(p=conf->mirrors+mirror)->rdev) { - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); /* as we don't honour merge_bvec_fn, we must never risk * violating it, so limit ->max_sector to one PAGE, as * a one page request is never in violation. @@ -1988,9 +1988,8 @@ static int run(mddev_t *mddev) disk = conf->mirrors + disk_idx; disk->rdev = rdev; - - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); /* as we don't honour merge_bvec_fn, we must never risk * violating it, so limit ->max_sector to one PAGE, as * a one page request is never in violation. diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ae12cea..7298a5e 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1151,8 +1151,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) for ( ; mirror <= last ; mirror++) if ( !(p=conf->mirrors+mirror)->rdev) { - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); /* as we don't honour merge_bvec_fn, we must never risk * violating it, so limit ->max_sector to one PAGE, as * a one page request is never in violation. 
@@ -2044,7 +2044,7 @@ raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks) static int run(mddev_t *mddev) { conf_t *conf; - int i, disk_idx; + int i, disk_idx, chunk_size; mirror_info_t *disk; mdk_rdev_t *rdev; int nc, fc, fo; @@ -2130,6 +2130,14 @@ static int run(mddev_t *mddev) spin_lock_init(&conf->device_lock); mddev->queue->queue_lock = &conf->device_lock; + chunk_size = mddev->chunk_sectors << 9; + blk_queue_io_min(mddev->queue, chunk_size); + if (conf->raid_disks % conf->near_copies) + blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks); + else + blk_queue_io_opt(mddev->queue, chunk_size * + (conf->raid_disks / conf->near_copies)); + list_for_each_entry(rdev, &mddev->disks, same_set) { disk_idx = rdev->raid_disk; if (disk_idx >= mddev->raid_disks @@ -2138,9 +2146,8 @@ static int run(mddev_t *mddev) disk = conf->mirrors + disk_idx; disk->rdev = rdev; - - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); /* as we don't honour merge_bvec_fn, we must never risk * violating it, so limit ->max_sector to one PAGE, as * a one page request is never in violation. diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f9f991e..92ef9b6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4452,7 +4452,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) static int run(mddev_t *mddev) { raid5_conf_t *conf; - int working_disks = 0; + int working_disks = 0, chunk_size; mdk_rdev_t *rdev; if (mddev->recovery_cp != MaxSector) @@ -4607,6 +4607,14 @@ static int run(mddev_t *mddev) md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); + chunk_size = mddev->chunk_sectors << 9; + blk_queue_io_min(mddev->queue, chunk_size); + blk_queue_io_opt(mddev->queue, chunk_size * + (conf->raid_disks - conf->max_degraded)); + + list_for_each_entry(rdev, &mddev->disks, same_set) + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); return 0; abort: ^ permalink raw reply related [flat|nested] 16+ messages in thread
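[Editor's illustration: the io_min/io_opt arithmetic from the hunks
above, pulled into a standalone sketch.  The chunk size, disk counts
and raid10 layout are made-up example values, not taken from any real
array.]

/* Sketch of the hint calculations the patch installs per RAID level. */
#include <stdio.h>

int main(void)
{
	unsigned int chunk_sectors = 128;              /* 64KB chunks */
	unsigned int chunk_size = chunk_sectors << 9;  /* sectors -> bytes */
	unsigned int raid_disks = 4;
	unsigned int max_degraded = 1;                 /* raid5 */
	unsigned int near_copies = 2;                  /* raid10 near layout */

	/* raid0: every disk carries data */
	printf("raid0:  io_min=%u io_opt=%u\n",
	       chunk_size, chunk_size * raid_disks);

	/* raid5: parity capacity does not count towards the optimal size */
	printf("raid5:  io_min=%u io_opt=%u\n",
	       chunk_size, chunk_size * (raid_disks - max_degraded));

	/* raid10: one stripe of distinct data, unless the layout is uneven */
	if (raid_disks % near_copies)
		printf("raid10: io_min=%u io_opt=%u\n",
		       chunk_size, chunk_size * raid_disks);
	else
		printf("raid10: io_min=%u io_opt=%u\n",
		       chunk_size, chunk_size * (raid_disks / near_copies));
	return 0;
}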
* Re: md: Use new topology calls to indicate alignment and I/O sizes
  2009-06-30 20:24 ` Mike Snitzer
@ 2009-06-30 23:58 ` Neil Brown
  0 siblings, 0 replies; 16+ messages in thread
From: Neil Brown @ 2009-06-30 23:58 UTC (permalink / raw)
To: Mike Snitzer
Cc: jens.axboe, Martin K. Petersen, linux-raid, linux-kernel, Linus Torvalds

On Tuesday June 30, snitzer@redhat.com wrote:
>
> Neil,
>
> Any chance you could make a formal pull request for this MD topology
> support patch for 2.6.31-rc2?

Yes..  I want to include a couple of other patches in the same batch
and I've been a bit distracted lately (by a cold among other things).
I'll try to get this sorted out today.

>
> NOTE: all the disk_stack_limits() calls must pass the 3rd 'offset' arg
> in terms of bytes and _not_ sectors.  So all of MD's calls to
> disk_stack_limits() should pass: rdev->data_offset << 9

Noted, thanks.

NeilBrown

^ permalink raw reply	[flat|nested] 16+ messages in thread
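[Editor's illustration: the NOTE Neil acknowledges is a units issue.
rdev->data_offset is kept in 512-byte sectors while disk_stack_limits()
expects its offset argument in bytes, hence the << 9.  A trivial
standalone sketch of that conversion, with an assumed example offset.]

/* Sketch: sectors-to-bytes conversion behind the "<< 9" in the patch. */
#include <stdio.h>

int main(void)
{
	unsigned long long data_offset_sectors = 2048;	/* example value */
	unsigned long long offset_bytes = data_offset_sectors << 9;

	/* 2048 sectors * 512 bytes/sector = 1048576 bytes (1 MiB) */
	printf("offset in bytes: %llu\n", offset_bytes);
	return 0;
}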
end of thread, other threads:[~2009-06-30 23:58 UTC | newest]

Thread overview: 16+ messages
2009-06-23  4:54 [PATCH] md: Use new topology calls to indicate alignment and I/O sizes Martin K. Petersen
2009-06-23 21:44 ` Mike Snitzer
2009-06-24  4:05 ` Neil Brown
2009-06-24  5:03 ` Martin K. Petersen
2009-06-24  6:22 ` Neil Brown
2009-06-24 15:27 ` Mike Snitzer
2009-06-24 16:27 ` Mike Snitzer
2009-06-24 23:28 ` Neil Brown
2009-06-25  0:05 ` Martin K. Petersen
2009-06-24 17:07 ` [PATCH] " Martin K. Petersen
2009-06-25  2:35 ` Neil Brown
2009-06-25  4:37 ` Martin K. Petersen
2009-06-25  6:16 ` Neil Brown
2009-06-25 16:24 ` Martin K. Petersen
2009-06-30 20:24 ` Mike Snitzer
2009-06-30 23:58 ` Neil Brown