[RFC] 'discard sectors' block request.

linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [RFC] 'discard sectors' block request.
       [not found] ` <20080726130200.f541e604.akpm@linux-foundation.org>
@ 2008-08-05  1:45   ` David Woodhouse
  2008-08-05  1:59     ` Andrew Morton
  2008-08-05 11:42     ` Jens Axboe
  0 siblings, 2 replies; 48+ messages in thread
From: David Woodhouse @ 2008-08-05  1:45 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ric Wheeler, linux-fsdevel, axboe, gilad

On Sat, 2008-07-26 at 13:02 -0700, Andrew Morton wrote:
> I seem to be hearing a lot of silence over support for SSD devices.  I
> have this vague worry that there will be a large rollout of SSD
> hardware and Linux will be found to have pants-around-ankles.
> 
> For example, support for the T13 Trim command.  There will be others,
> but I surely don't know what they are.

Here's a first attempt at that. It implements basic support for
'discard' requests in the block layer, implements that in FTL (the flash
translation layer used on PCMCIA flash cards), and in FAT.

It doesn't yet do any merging of discard requests, and everything in
block/ needs careful review -- I don't know that code well at all. But
it seems to pass at least basic testing: 

 modprobe mtdram
 modprobe mtdchar
 ftl_format /dev/mtd0
 modprobe ftl
 mkfs.msdos /dev/ftla
 mount /dev/ftla /mnt/test
 cp /etc/services /mnt/test
 rm /mnt/test/services

... and (with appropriate debugging added) you can see the sectors being
thrown away. And 'od -t x1 /dev/ftla' will confirm that.

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index a09ead1..e0ac1c8 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -258,7 +258,6 @@ static void bio_end_empty_barrier(struct bio *bio, int err)
 			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
 	}
-
 	complete(bio->bi_private);
 }
 
@@ -315,3 +314,62 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
 	return ret;
 }
 EXPORT_SYMBOL(blkdev_issue_flush);
+
+/**
+ * blkdev_issue_discard - queue a discard
+ * @bdev:	blockdev to issue discard for
+ * @sector:	start sector
+ * @nr_sects:	number of sectors to discard
+ * @error_sector:	error sector
+ *
+ * Description:
+ *    Issue a discard request for the sectors in question. Caller can supply
+ *    room for storing the error offset in case of a discard error, if they
+ *    wish to.
+ */
+int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+			 unsigned nr_sects, sector_t *error_sector)
+{
+	DECLARE_COMPLETION_ONSTACK(wait);
+	struct request_queue *q;
+	struct bio *bio;
+	int ret;
+
+	if (bdev->bd_disk == NULL)
+		return -ENXIO;
+
+	q = bdev_get_queue(bdev);
+	if (!q)
+		return -ENXIO;
+
+	bio = bio_alloc(GFP_KERNEL, 0);
+	if (!bio)
+		return -ENOMEM;
+
+	bio->bi_end_io = bio_end_empty_barrier;
+	bio->bi_private = &wait;
+	bio->bi_bdev = bdev;
+	bio->bi_sector = sector;
+	bio->bi_size = nr_sects << 9;
+	submit_bio(1 << BIO_RW_DISCARD, bio);
+
+	wait_for_completion(&wait);
+
+	/*
+	 * The driver must store the error location in ->bi_sector, if
+	 * it supports it. For non-stacked drivers, this should be copied
+	 * from rq->sector.
+	 */
+	if (error_sector)
+		*error_sector = bio->bi_sector;
+
+	ret = 0;
+	if (bio_flagged(bio, BIO_EOPNOTSUPP))
+		ret = -EOPNOTSUPP;
+	else if (!bio_flagged(bio, BIO_UPTODATE))
+		ret = -EIO;
+
+	bio_put(bio);
+	return ret;
+}
+EXPORT_SYMBOL(blkdev_issue_discard);
diff --git a/block/blk-core.c b/block/blk-core.c
index 4889eb8..dc57c33 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -626,6 +626,7 @@ blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask)
 	 * first three bits are identical in rq->cmd_flags and bio->bi_rw,
 	 * see bio.h and blkdev.h
 	 */
+	/* Er, Are they really? And REQ_ALLOCED isn't one of them anyway */
 	rq->cmd_flags = rw | REQ_ALLOCED;
 
 	if (priv) {
@@ -1081,7 +1082,10 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 	 */
 	if (unlikely(bio_barrier(bio)))
 		req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
-
+	if (unlikely(bio_discard(bio))) {
+		req->cmd_flags |= (REQ_SOFTBARRIER | REQ_DISCARD | REQ_NOMERGE);
+		req->q->discard_fn(req->q, req);
+	}
 	if (bio_sync(bio))
 		req->cmd_flags |= REQ_RW_SYNC;
 	if (bio_rw_meta(bio))
@@ -1097,7 +1101,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 static int __make_request(struct request_queue *q, struct bio *bio)
 {
 	struct request *req;
-	int el_ret, nr_sectors, barrier, err;
+	int el_ret, nr_sectors, barrier, discard, err;
 	const unsigned short prio = bio_prio(bio);
 	const int sync = bio_sync(bio);
 	int rw_flags;
@@ -1117,9 +1121,15 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 		goto end_io;
 	}
 
+	discard = bio_discard(bio);
+	if (unlikely(discard) && !q->discard_fn) {
+		err = -EOPNOTSUPP;
+		goto end_io;
+	}
+
 	spin_lock_irq(q->queue_lock);
 
-	if (unlikely(barrier) || elv_queue_empty(q))
+	if (unlikely(barrier) || unlikely(discard) || elv_queue_empty(q))
 		goto get_rq;
 
 	el_ret = elv_merge(q, &req, bio);
@@ -1411,6 +1421,10 @@ end_io:
 			err = -EOPNOTSUPP;
 			goto end_io;
 		}
+		if (bio_discard(bio) && !q->discard_fn) {
+			err = -EOPNOTSUPP;
+			goto end_io;
+		}
 
 		ret = q->make_request_fn(q, bio);
 	} while (ret);
@@ -1488,7 +1502,7 @@ void submit_bio(int rw, struct bio *bio)
 	 * If it's a regular read/write or a barrier with data attached,
 	 * go through the normal accounting stuff before submission.
 	 */
-	if (!bio_empty_barrier(bio)) {
+	if (!bio_empty_barrier(bio) && !bio_discard(bio)) {
 
 		BIO_BUG_ON(!bio->bi_size);
 		BIO_BUG_ON(!bio->bi_io_vec);
@@ -1562,13 +1576,12 @@ static int __end_that_request_first(struct request *req, int error,
 		int nbytes;
 
 		/*
-		 * For an empty barrier request, the low level driver must
-		 * store a potential error location in ->sector. We pass
-		 * that back up in ->bi_sector.
+		 * For an empty barrier request or sector discard request, the
+		 * low level driver must store a potential error location in 
+		 * ->sector. We pass that back up in ->bi_sector.
 		 */
-		if (blk_empty_barrier(req))
+		if (blk_empty_barrier(req) || blk_discard_rq(req))
 			bio->bi_sector = req->sector;
-
 		if (nr_bytes >= bio->bi_size) {
 			req->bio = bio->bi_next;
 			nbytes = bio->bi_size;
@@ -1886,7 +1899,7 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
 	struct request_queue *q = rq->q;
 	unsigned long flags = 0UL;
 
-	if (blk_fs_request(rq) || blk_pc_request(rq)) {
+	if (blk_fs_request(rq) || blk_pc_request(rq) || blk_discard_rq(rq)) {
 		if (__end_that_request_first(rq, error, nr_bytes))
 			return 1;
 
@@ -1944,7 +1957,7 @@ EXPORT_SYMBOL_GPL(blk_end_request);
  **/
 int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
 {
-	if (blk_fs_request(rq) || blk_pc_request(rq)) {
+	if (blk_fs_request(rq) || blk_pc_request(rq) || blk_discard_rq(rq)) {
 		if (__end_that_request_first(rq, error, nr_bytes))
 			return 1;
 	}
@@ -2015,6 +2028,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 		     struct bio *bio)
 {
 	/* first two bits are identical in rq->cmd_flags and bio->bi_rw */
+	/* Er, Are they really? */
 	rq->cmd_flags |= (bio->bi_rw & 3);
 
 	rq->nr_phys_segments = bio_phys_segments(q, bio);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index dfc7701..6991af0 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -33,6 +33,22 @@ void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
 EXPORT_SYMBOL(blk_queue_prep_rq);
 
 /**
+ * blk_queue_discard - set a discard_sectors function for queue
+ * @q:		queue
+ * @dfn:	discard function
+ *
+ * It's possible for a queue to register a discard callback which is used
+ * to queue a request to discard (or "trim") sectors whose content is no
+ * longer required, if the underlying device supports such an action.
+ *
+ */
+void blk_queue_discard(struct request_queue *q, discard_fn *dfn)
+{
+	q->discard_fn = dfn;
+}
+EXPORT_SYMBOL(blk_queue_discard);
+
+/**
  * blk_queue_merge_bvec - set a merge_bvec function for queue
  * @q:		queue
  * @mbfn:	merge_bvec_fn
diff --git a/block/elevator.c b/block/elevator.c
index ed6f8f3..bb26424 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -675,7 +675,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where,
 	if (q->ordcolor)
 		rq->cmd_flags |= REQ_ORDERED_COLOR;
 
-	if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
+	if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER | REQ_DISCARD)) {
 		/*
 		 * toggle ordered color
 		 */
diff --git a/drivers/mtd/ftl.c b/drivers/mtd/ftl.c
index f34f20c..0f8d886 100644
--- a/drivers/mtd/ftl.c
+++ b/drivers/mtd/ftl.c
@@ -1005,6 +1005,29 @@ static int ftl_writesect(struct mtd_blktrans_dev *dev,
 	return ftl_write((void *)dev, buf, block, 1);
 }
 
+static int ftl_discardsect(struct mtd_blktrans_dev *dev,
+			   unsigned long sector, unsigned nr_sects)
+{
+	partition_t *part = (void *)dev;
+	uint32_t bsize = 1 << part->header.EraseUnitSize;
+
+	printk("FTL erase sector %ld for %d sectors\n", 
+	       sector, nr_sects);
+
+	while (nr_sects) {
+		uint32_t old_addr = part->VirtualBlockMap[sector];
+		if (old_addr != 0xffffffff) {
+			part->VirtualBlockMap[sector] = 0xffffffff;
+			part->EUNInfo[old_addr/bsize].Deleted++;
+			if (set_bam_entry(part, old_addr, 0))
+				return -EIO;
+		}
+		nr_sects--;
+		sector++;
+	}
+
+	return 0;
+}
 /*====================================================================*/
 
 static void ftl_freepart(partition_t *part)
@@ -1069,6 +1092,7 @@ static struct mtd_blktrans_ops ftl_tr = {
 	.blksize 	= SECTOR_SIZE,
 	.readsect	= ftl_readsect,
 	.writesect	= ftl_writesect,
+	.discard	= ftl_discardsect,
 	.getgeo		= ftl_getgeo,
 	.add_mtd	= ftl_add_mtd,
 	.remove_dev	= ftl_remove_dev,
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 9ff007c..0ef1d4b 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -32,6 +32,13 @@ struct mtd_blkcore_priv {
 	spinlock_t queue_lock;
 };
 
+static int blktrans_discard_request(struct request_queue *q,
+				    struct request *req)
+{
+	req->cmd_type = REQ_TYPE_DISCARD;
+	return 0;
+}
+
 static int do_blktrans_request(struct mtd_blktrans_ops *tr,
 			       struct mtd_blktrans_dev *dev,
 			       struct request *req)
@@ -44,6 +51,9 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
 
 	buf = req->buffer;
 
+	if (req->cmd_type == REQ_TYPE_DISCARD)
+		return !tr->discard(dev, block, nsect);
+
 	if (!blk_fs_request(req))
 		return 0;
 
@@ -367,6 +377,9 @@ int register_mtd_blktrans(struct mtd_blktrans_ops *tr)
 
 	tr->blkcore_priv->rq->queuedata = tr;
 	blk_queue_hardsect_size(tr->blkcore_priv->rq, tr->blksize);
+	if (tr->discard)
+		blk_queue_discard(tr->blkcore_priv->rq, blktrans_discard_request);
+
 	tr->blkshift = ffs(tr->blksize) - 1;
 
 	tr->blkcore_priv->thread = kthread_run(mtd_blktrans_thread, tr,
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 302e95c..249e9ec 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -6,6 +6,7 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/msdos_fs.h>
+#include <linux/blkdev.h>
 
 struct fatent_operations {
 	void (*ent_blocknr)(struct super_block *, int, int *, sector_t *);
@@ -527,6 +528,15 @@ out:
 	return err;
 }
 
+static void fat_discard_cluster(struct super_block *sb, int cluster)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	struct block_device *bdev = sb->s_bdev;
+	unsigned long sector = fat_clus_to_blknr(sbi, cluster);
+
+	blkdev_issue_discard(bdev, sector, sbi->sec_per_clus, NULL);
+}
+
 int fat_free_clusters(struct inode *inode, int cluster)
 {
 	struct super_block *sb = inode->i_sb;
@@ -540,6 +550,8 @@ int fat_free_clusters(struct inode *inode, int cluster)
 	fatent_init(&fatent);
 	lock_fat(sbi);
 	do {
+		fat_discard_cluster(sb, cluster);
+
 		cluster = fat_ent_read(inode, &fatent, cluster);
 		if (cluster < 0) {
 			err = cluster;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 0933a14..5480702 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -149,6 +149,8 @@ struct bio {
  * bit 2 -- barrier
  * bit 3 -- fail fast, don't want low level driver retries
  * bit 4 -- synchronous I/O hint: the block layer will unplug immediately
+ * bit 5 -- 
+ * bit 6 -- discard sectors (trim)
  */
 #define BIO_RW		0
 #define BIO_RW_AHEAD	1
@@ -156,6 +158,7 @@ struct bio {
 #define BIO_RW_FAILFAST	3
 #define BIO_RW_SYNC	4
 #define BIO_RW_META	5
+#define BIO_RW_DISCARD	6
 
 /*
  * upper 16 bits of bi_rw define the io priority of this bio
@@ -185,10 +188,14 @@ struct bio {
 #define bio_failfast(bio)	((bio)->bi_rw & (1 << BIO_RW_FAILFAST))
 #define bio_rw_ahead(bio)	((bio)->bi_rw & (1 << BIO_RW_AHEAD))
 #define bio_rw_meta(bio)	((bio)->bi_rw & (1 << BIO_RW_META))
+#define bio_discard(bio)	((bio)->bi_rw & (1 << BIO_RW_DISCARD))
 #define bio_empty_barrier(bio)	(bio_barrier(bio) && !(bio)->bi_size)
 
 static inline unsigned int bio_cur_sectors(struct bio *bio)
 {
+	if (unlikely(bio_discard(bio)))
+		return bio->bi_size >> 9;
+
 	if (bio->bi_vcnt)
 		return bio_iovec(bio)->bv_len >> 9;
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e61f22b..6f00c37 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -55,6 +55,7 @@ enum rq_cmd_type_bits {
 	REQ_TYPE_PM_RESUME,		/* resume request */
 	REQ_TYPE_PM_SHUTDOWN,		/* shutdown request */
 	REQ_TYPE_FLUSH,			/* flush request */
+	REQ_TYPE_DISCARD,		/* discard sectors ('trim') request */
 	REQ_TYPE_SPECIAL,		/* driver defined type */
 	REQ_TYPE_LINUX_BLOCK,		/* generic block layer message */
 	/*
@@ -89,6 +90,7 @@ enum {
 enum rq_flag_bits {
 	__REQ_RW,		/* not set, read. set, write */
 	__REQ_FAILFAST,		/* no low level driver retries */
+	__REQ_DISCARD,		/* request to discard sectors */
 	__REQ_SORTED,		/* elevator knows about this request */
 	__REQ_SOFTBARRIER,	/* may not be passed by ioscheduler */
 	__REQ_HARDBARRIER,	/* may not be passed by drive either */
@@ -111,6 +113,7 @@ enum rq_flag_bits {
 };
 
 #define REQ_RW		(1 << __REQ_RW)
+#define REQ_DISCARD	(1 << __REQ_DISCARD)
 #define REQ_FAILFAST	(1 << __REQ_FAILFAST)
 #define REQ_SORTED	(1 << __REQ_SORTED)
 #define REQ_SOFTBARRIER	(1 << __REQ_SOFTBARRIER)
@@ -252,6 +255,7 @@ typedef void (request_fn_proc) (struct request_queue *q);
 typedef int (make_request_fn) (struct request_queue *q, struct bio *bio);
 typedef int (prep_rq_fn) (struct request_queue *, struct request *);
 typedef void (unplug_fn) (struct request_queue *);
+typedef int (discard_fn) (struct request_queue *, struct request *);
 
 struct bio_vec;
 struct bvec_merge_data {
@@ -298,6 +302,7 @@ struct request_queue
 	make_request_fn		*make_request_fn;
 	prep_rq_fn		*prep_rq_fn;
 	unplug_fn		*unplug_fn;
+	discard_fn		*discard_fn;
 	merge_bvec_fn		*merge_bvec_fn;
 	prepare_flush_fn	*prepare_flush_fn;
 	softirq_done_fn		*softirq_done_fn;
@@ -536,6 +541,7 @@ enum {
 #define blk_sorted_rq(rq)	((rq)->cmd_flags & REQ_SORTED)
 #define blk_barrier_rq(rq)	((rq)->cmd_flags & REQ_HARDBARRIER)
 #define blk_fua_rq(rq)		((rq)->cmd_flags & REQ_FUA)
+#define blk_discard_rq(rq)	((rq)->cmd_flags & REQ_DISCARD)
 #define blk_bidi_rq(rq)		((rq)->next_rq != NULL)
 #define blk_empty_barrier(rq)	(blk_barrier_rq(rq) && blk_fs_request(rq) && !(rq)->hard_nr_sectors)
 /* rq->queuelist of dequeued request must be list_empty() */
@@ -786,6 +792,7 @@ extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *);
 extern void blk_queue_dma_alignment(struct request_queue *, int);
 extern void blk_queue_update_dma_alignment(struct request_queue *, int);
 extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
+extern void blk_queue_discard(struct request_queue *, discard_fn *);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
 extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *);
 extern int blk_do_ordered(struct request_queue *, struct request **);
@@ -829,6 +836,7 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 }
 
 extern int blkdev_issue_flush(struct block_device *, sector_t *);
+extern int blkdev_issue_discard(struct block_device *, sector_t, unsigned, sector_t *);
 
 /*
 * command filter functions
diff --git a/include/linux/mtd/blktrans.h b/include/linux/mtd/blktrans.h
index 310e616..8b4aa05 100644
--- a/include/linux/mtd/blktrans.h
+++ b/include/linux/mtd/blktrans.h
@@ -41,6 +41,8 @@ struct mtd_blktrans_ops {
 		    unsigned long block, char *buffer);
 	int (*writesect)(struct mtd_blktrans_dev *dev,
 		     unsigned long block, char *buffer);
+	int (*discard)(struct mtd_blktrans_dev *dev,
+		       unsigned long block, unsigned nr_blocks);
 
 	/* Block layer ioctls */
 	int (*getgeo)(struct mtd_blktrans_dev *dev, struct hd_geometry *geo);


-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation




^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [RFC] 'discard sectors' block request.
  2008-08-05  1:45   ` [RFC] 'discard sectors' block request David Woodhouse
@ 2008-08-05  1:59     ` Andrew Morton
  2008-08-05  9:01       ` David Woodhouse
  2008-08-05 11:42     ` Jens Axboe
  1 sibling, 1 reply; 48+ messages in thread
From: Andrew Morton @ 2008-08-05  1:59 UTC (permalink / raw)
  To: David Woodhouse; +Cc: Ric Wheeler, linux-fsdevel, axboe, gilad

On Tue, 05 Aug 2008 02:45:16 +0100 David Woodhouse <dwmw2@infradead.org> wrote:

> On Sat, 2008-07-26 at 13:02 -0700, Andrew Morton wrote:
> > I seem to be hearing a lot of silence over support for SSD devices.  I
> > have this vague worry that there will be a large rollout of SSD
> > hardware and Linux will be found to have pants-around-ankles.
> > 
> > For example, support for the T13 Trim command.  There will be others,
> > but I surely don't know what they are.
> 
> Here's a first attempt at that. It implements basic support for
> 'discard' requests in the block layer, implements that in FTL (the flash
> translation layer used on PCMCIA flash cards), and in FAT.

Looks nice and simple.

>
> ...
>
> +int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
> +			 unsigned nr_sects, sector_t *error_sector)
> +{
> +	DECLARE_COMPLETION_ONSTACK(wait);
> +	struct request_queue *q;
> +	struct bio *bio;
> +	int ret;
> +
> +	if (bdev->bd_disk == NULL)
> +		return -ENXIO;
> +
> +	q = bdev_get_queue(bdev);
> +	if (!q)
> +		return -ENXIO;
> +
> +	bio = bio_alloc(GFP_KERNEL, 0);
> +	if (!bio)
> +		return -ENOMEM;
> +
> +	bio->bi_end_io = bio_end_empty_barrier;
> +	bio->bi_private = &wait;
> +	bio->bi_bdev = bdev;
> +	bio->bi_sector = sector;
> +	bio->bi_size = nr_sects << 9;
> +	submit_bio(1 << BIO_RW_DISCARD, bio);
> +
> +	wait_for_completion(&wait);

The synchronous nature might get pretty painful, even after merging is
working?

err, actually, it'll have to become async to be able to merge, won't it?

> +	/*
> +	 * The driver must store the error location in ->bi_sector, if
> +	 * it supports it. For non-stacked drivers, this should be copied
> +	 * from rq->sector.
> +	 */
> +	if (error_sector)
> +		*error_sector = bio->bi_sector;
> +
> +	ret = 0;
> +	if (bio_flagged(bio, BIO_EOPNOTSUPP))
> +		ret = -EOPNOTSUPP;
> +	else if (!bio_flagged(bio, BIO_UPTODATE))
> +		ret = -EIO;
> +
> +	bio_put(bio);
> +	return ret;
> +}
> +EXPORT_SYMBOL(blkdev_issue_discard);
>
> ...
>
> @@ -1411,6 +1421,10 @@ end_io:
>  			err = -EOPNOTSUPP;
>  			goto end_io;
>  		}
> +		if (bio_discard(bio) && !q->discard_fn) {
> +			err = -EOPNOTSUPP;
> +			goto end_io;
> +		}
>  
>  		ret = q->make_request_fn(q, bio);
>  	} while (ret);
> @@ -1488,7 +1502,7 @@ void submit_bio(int rw, struct bio *bio)
>  	 * If it's a regular read/write or a barrier with data attached,
>  	 * go through the normal accounting stuff before submission.
>  	 */
> -	if (!bio_empty_barrier(bio)) {
> +	if (!bio_empty_barrier(bio) && !bio_discard(bio)) {

Perhaps we should have a new bio_doesnt_have_any_data_in_it() helper,
rather than opn-coding it here?  Assuming that new non-data-containing
bios will turn up in the future.

>  
>  		BIO_BUG_ON(!bio->bi_size);
>  		BIO_BUG_ON(!bio->bi_io_vec);
> @@ -1562,13 +1576,12 @@ static int __end_that_request_first(struct request *req, int error,
>  		int nbytes;
>  
>  		/*
> -		 * For an empty barrier request, the low level driver must
> -		 * store a potential error location in ->sector. We pass
> -		 * that back up in ->bi_sector.
> +		 * For an empty barrier request or sector discard request, the
> +		 * low level driver must store a potential error location in 
> +		 * ->sector. We pass that back up in ->bi_sector.
>  		 */
> -		if (blk_empty_barrier(req))
> +		if (blk_empty_barrier(req) || blk_discard_rq(req))

Maybe ditto.

>  			bio->bi_sector = req->sector;
> -
>  		if (nr_bytes >= bio->bi_size) {
>  			req->bio = bio->bi_next;
>  			nbytes = bio->bi_size;
>
> ...
>
>  
> +static int ftl_discardsect(struct mtd_blktrans_dev *dev,
> +			   unsigned long sector, unsigned nr_sects)
> +{
> +	partition_t *part = (void *)dev;

wuh?  We're casting a `struct mtd_blktrans_ops *' into a partition_t?



^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] 'discard sectors' block request.
  2008-08-05  1:59     ` Andrew Morton
@ 2008-08-05  9:01       ` David Woodhouse
  0 siblings, 0 replies; 48+ messages in thread
From: David Woodhouse @ 2008-08-05  9:01 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ric Wheeler, linux-fsdevel, axboe, gilad

On Mon, 2008-08-04 at 18:59 -0700, Andrew Morton wrote:
> On Tue, 05 Aug 2008 02:45:16 +0100 David Woodhouse <dwmw2@infradead.org> wrote:
> 
> > On Sat, 2008-07-26 at 13:02 -0700, Andrew Morton wrote:
> > > I seem to be hearing a lot of silence over support for SSD devices.  I
> > > have this vague worry that there will be a large rollout of SSD
> > > hardware and Linux will be found to have pants-around-ankles.
> > > 
> > > For example, support for the T13 Trim command.  There will be others,
> > > but I surely don't know what they are.
> > 
> > Here's a first attempt at that. It implements basic support for
> > 'discard' requests in the block layer, implements that in FTL (the flash
> > translation layer used on PCMCIA flash cards), and in FAT.
> 
> Looks nice and simple.

> The synchronous nature might get pretty painful, even after merging is
> working?

Yeah. That's just an example, based on blkdev_issue_flush(). When file
systems submit bios directly, that can be different.

> Perhaps we should have a new bio_doesnt_have_any_data_in_it() helper,
> rather than opn-coding it here?  Assuming that new non-data-containing
> bios will turn up in the future.

That would be something like (!bio->bi_io_vec) ?  I'm kind of lost in
block code; the whole of that part of the patch wants someone like Jens
to go over it carefully.

> > +static int ftl_discardsect(struct mtd_blktrans_dev *dev,
> > +			   unsigned long sector, unsigned nr_sects)
> > +{
> > +	partition_t *part = (void *)dev;
> 
> wuh?  We're casting a `struct mtd_blktrans_ops *' into a partition_t?

Yeah, I had to look carefully at that one when I pasted it from
elsewhere. 

typedef struct partition_t {
    struct mtd_blktrans_dev mbd;
    ....

-- 
dwmw2


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] 'discard sectors' block request.
  2008-08-05  1:45   ` [RFC] 'discard sectors' block request David Woodhouse
  2008-08-05  1:59     ` Andrew Morton
@ 2008-08-05 11:42     ` Jens Axboe
  2008-08-05 13:32       ` David Woodhouse
  2008-08-05 16:29       ` David Woodhouse
  1 sibling, 2 replies; 48+ messages in thread
From: Jens Axboe @ 2008-08-05 11:42 UTC (permalink / raw)
  To: David Woodhouse; +Cc: Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Tue, Aug 05 2008, David Woodhouse wrote:
> On Sat, 2008-07-26 at 13:02 -0700, Andrew Morton wrote:
> > I seem to be hearing a lot of silence over support for SSD devices.  I
> > have this vague worry that there will be a large rollout of SSD
> > hardware and Linux will be found to have pants-around-ankles.
> > 
> > For example, support for the T13 Trim command.  There will be others,
> > but I surely don't know what they are.
> 
> Here's a first attempt at that. It implements basic support for
> 'discard' requests in the block layer, implements that in FTL (the flash
> translation layer used on PCMCIA flash cards), and in FAT.
> 
> It doesn't yet do any merging of discard requests, and everything in
> block/ needs careful review -- I don't know that code well at all. But
> it seems to pass at least basic testing: 
> 
>  modprobe mtdram
>  modprobe mtdchar
>  ftl_format /dev/mtd0
>  modprobe ftl
>  mkfs.msdos /dev/ftla
>  mount /dev/ftla /mnt/test
>  cp /etc/services /mnt/test
>  rm /mnt/test/services
> 
> ... and (with appropriate debugging added) you can see the sectors being
> thrown away. And 'od -t x1 /dev/ftla' will confirm that.
> 
> diff --git a/block/blk-barrier.c b/block/blk-barrier.c
> index a09ead1..e0ac1c8 100644
> --- a/block/blk-barrier.c
> +++ b/block/blk-barrier.c
> @@ -258,7 +258,6 @@ static void bio_end_empty_barrier(struct bio *bio, int err)
>  			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
>  		clear_bit(BIO_UPTODATE, &bio->bi_flags);
>  	}
> -
>  	complete(bio->bi_private);
>  }
>  
> @@ -315,3 +314,62 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
>  	return ret;
>  }
>  EXPORT_SYMBOL(blkdev_issue_flush);
> +
> +/**
> + * blkdev_issue_discard - queue a discard
> + * @bdev:	blockdev to issue discard for
> + * @sector:	start sector
> + * @nr_sects:	number of sectors to discard
> + * @error_sector:	error sector
> + *
> + * Description:
> + *    Issue a discard request for the sectors in question. Caller can supply
> + *    room for storing the error offset in case of a discard error, if they
> + *    wish to.
> + */
> +int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
> +			 unsigned nr_sects, sector_t *error_sector)
> +{
> +	DECLARE_COMPLETION_ONSTACK(wait);
> +	struct request_queue *q;
> +	struct bio *bio;
> +	int ret;
> +
> +	if (bdev->bd_disk == NULL)
> +		return -ENXIO;
> +
> +	q = bdev_get_queue(bdev);
> +	if (!q)
> +		return -ENXIO;
> +
> +	bio = bio_alloc(GFP_KERNEL, 0);
> +	if (!bio)
> +		return -ENOMEM;
> +
> +	bio->bi_end_io = bio_end_empty_barrier;
> +	bio->bi_private = &wait;
> +	bio->bi_bdev = bdev;
> +	bio->bi_sector = sector;
> +	bio->bi_size = nr_sects << 9;
> +	submit_bio(1 << BIO_RW_DISCARD, bio);
> +
> +	wait_for_completion(&wait);
> +
> +	/*
> +	 * The driver must store the error location in ->bi_sector, if
> +	 * it supports it. For non-stacked drivers, this should be copied
> +	 * from rq->sector.
> +	 */
> +	if (error_sector)
> +		*error_sector = bio->bi_sector;
> +
> +	ret = 0;
> +	if (bio_flagged(bio, BIO_EOPNOTSUPP))
> +		ret = -EOPNOTSUPP;
> +	else if (!bio_flagged(bio, BIO_UPTODATE))
> +		ret = -EIO;
> +
> +	bio_put(bio);
> +	return ret;
> +}
> +EXPORT_SYMBOL(blkdev_issue_discard);

I was going to suggest that if you want this sync, then you should cut
this code out of blkdev_issue_flush() and reuse that helper for
blkdev_issue_flush and blkdev_issue_discard(). But I don't think the
sync interface makes a lot of sense. Also, get rid of the error_sector
stuff. First of all it isn't really used consistently in the flush part
since most block drivers don't have this information (and/or fill it
in), and secondly you can just store that in bi_sector or whatever
instead for this request.

int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
			 unsigned nr_sects, bio_end_io_t *end_io)
{
	struct request_queue *q;
	struct bio *bio;
	int ret;

	if (bdev->bd_disk == NULL)
		return -ENXIO;

	q = bdev_get_queue(bdev);
	if (!q)
		return -ENXIO;

	bio = bio_alloc(GFP_KERNEL, 0);
	if (!bio)
		return -ENOMEM;

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio->bi_size = nr_sects << 9;
	submit_bio(1 << BIO_RW_DISCARD, bio);
        return 0;
}

That would be the simpler async variant. Let the caller pass in the
end_io function as well, as the fs would definitely want to check for
EOPNOTSUPP or other errors itself on completion. Remember to do the
bio_put() in there as well at the end. Perhaps you want a sync interface
as well? Then I would suggest passing in the bio instead of allocating
it, or just add a blkdev_issue_discard_sync() with the above body moved
to a stsatic __blkdev_issue_discard().

Otherwise that akpm said, he covered basically all my points. Lets do a

static inline int bio_data_less(struct bio *bio)
{
        return !bio->bi_io_vec;
}

like you suggested for checking whether the bio carries data or not,
since we need that for the empty barrier as well (and other future bio
hints we want to pass down).

> @@ -1886,7 +1899,7 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
>  	struct request_queue *q = rq->q;
>  	unsigned long flags = 0UL;
>  
> -	if (blk_fs_request(rq) || blk_pc_request(rq)) {
> +	if (blk_fs_request(rq) || blk_pc_request(rq) || blk_discard_rq(rq)) {
>  		if (__end_that_request_first(rq, error, nr_bytes))
>  			return 1;
>  
> @@ -1944,7 +1957,7 @@ EXPORT_SYMBOL_GPL(blk_end_request);
>   **/
>  int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
>  {
> -	if (blk_fs_request(rq) || blk_pc_request(rq)) {
> +	if (blk_fs_request(rq) || blk_pc_request(rq) || blk_discard_rq(rq)) {
>  		if (__end_that_request_first(rq, error, nr_bytes))
>  			return 1;
>  	}

Do you need to call into __end_request_request_first()? You don't need
to fill error sector now, and there's no bio data to complete.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] 'discard sectors' block request.
  2008-08-05 11:42     ` Jens Axboe
@ 2008-08-05 13:32       ` David Woodhouse
  2008-08-05 14:21         ` Ric Wheeler
  2008-08-05 16:29       ` David Woodhouse
  1 sibling, 1 reply; 48+ messages in thread
From: David Woodhouse @ 2008-08-05 13:32 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Tue, 2008-08-05 at 13:42 +0200, Jens Axboe wrote:
> I was going to suggest that if you want this sync, then you should cut
> this code out of blkdev_issue_flush() and reuse that helper for
> blkdev_issue_flush and blkdev_issue_discard(). But I don't think the
> sync interface makes a lot of sense. 

Yeah, I agree. That was just for the proof of concept, really.

> Also, get rid of the error_sector stuff. First of all it isn't really
> used consistently in the flush part since most block drivers don't
> have this information (and/or fill it in), and secondly you can just
> store that in bi_sector or whatever instead for this request.

OK.

> int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
> 			 unsigned nr_sects, bio_end_io_t *end_io)
> {
  ...

> That would be the simpler async variant. Let the caller pass in the
> end_io function as well, as the fs would definitely want to check for
> EOPNOTSUPP or other errors itself on completion.

Most of the time the fs doesn't care at all. The whole point is that it
no longer gives a monkey's about the contents of these sectors. It's
just giving the underlying storage an _opportunity_ to reclaim them, if
it wants to do so. Most of the time, a file system isn't going to do
_anything_ with an error other than ignore it.

Even -EOPNOTSUPP isn't massively interesting, because it'll want to
check whether the operation is supported _before_ it bothers to queue
the request, rather than queueing the request unconditionally.

>  Remember to do the bio_put() in there as well at the end. 

Are we allowed to just set the end_io function to &bio_put?

> Perhaps you want a sync interface as well? 

I think we can live without a sync interface. As long as the discard
request is guaranteed to finish before any subsequent write, which is
something for the elevator to care about rather than the caller, we
really don't care when it finishes. If the occasional strange caller
wants to do it synchronously, they can arrange that for themselves.

> Otherwise that akpm said, he covered basically all my points. Lets do a
> 
> static inline int bio_data_less(struct bio *bio)
> {
>         return !bio->bi_io_vec;
> }
> 
> like you suggested for checking whether the bio carries data or not,
> since we need that for the empty barrier as well (and other future bio
> hints we want to pass down).

OK. And for blk_dataless_rq() it would be checking !req->buffer? 

> > @@ -1886,7 +1899,7 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
> >  	struct request_queue *q = rq->q;
> >  	unsigned long flags = 0UL;
> >  
> > -	if (blk_fs_request(rq) || blk_pc_request(rq)) {
> > +	if (blk_fs_request(rq) || blk_pc_request(rq) || blk_discard_rq(rq)) {
> >  		if (__end_that_request_first(rq, error, nr_bytes))
> >  			return 1;
> >  
> > @@ -1944,7 +1957,7 @@ EXPORT_SYMBOL_GPL(blk_end_request);
> >   **/
> >  int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
> >  {
> > -	if (blk_fs_request(rq) || blk_pc_request(rq)) {
> > +	if (blk_fs_request(rq) || blk_pc_request(rq) || blk_discard_rq(rq)) {
> >  		if (__end_that_request_first(rq, error, nr_bytes))
> >  			return 1;
> >  	}
> 
> Do you need to call into __end_request_request_first()? You don't need
> to fill error sector now, and there's no bio data to complete.

That's what actually calls the ->end_io function, isn't it? The need to
add that was determined empirically... but at about 2am this morning, so
I'm not going to sulk if you tell me I'm wrong.

We'll probably also want the elevator to merge discard requests. But
that can come later.

-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] 'discard sectors' block request.
  2008-08-05 13:32       ` David Woodhouse
@ 2008-08-05 14:21         ` Ric Wheeler
  0 siblings, 0 replies; 48+ messages in thread
From: Ric Wheeler @ 2008-08-05 14:21 UTC (permalink / raw)
  To: David Woodhouse; +Cc: Jens Axboe, Andrew Morton, linux-fsdevel, gilad


I think that this is a very interesting patch. Just for background, 
NetApp has proposed some extensions for a similar SCSI mechanism aimed 
more at large arrays which want to do "thin provisioned LUNS":

08-149R0 2008/03/11 SBC - Thin Provisioning 

http://www.t10.org/ftp/t10/document.08/08-149r0.pdf

08-070R0 2008/01/14 Thin Provisioning Concept Proposal

http://www.t10.org/ftp/t10/document.08/08-070r0.pdf


ric




^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] 'discard sectors' block request.
  2008-08-05 11:42     ` Jens Axboe
  2008-08-05 13:32       ` David Woodhouse
@ 2008-08-05 16:29       ` David Woodhouse
  2008-08-05 17:25         ` David Woodhouse
  1 sibling, 1 reply; 48+ messages in thread
From: David Woodhouse @ 2008-08-05 16:29 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Tue, 2008-08-05 at 13:42 +0200, Jens Axboe wrote:
 <some stuff>

Incremental patch below; I'll commit it to a git tree in discrete
patches, and publish those.

You can just pass NULL as your end_io function to blkdev_issue_discard()
and it'll set it to bio_put() for you.

We'll probably want to make the elevator capable of merging discard
requests -- and even letting them cross real read/write requests for
other sectors, and just dropping them if there's a subsequent write
request covering the same region, etc.


diff -u b/block/blk-barrier.c b/block/blk-barrier.c
--- b/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -320,17 +321,16 @@
  * @bdev:	blockdev to issue discard for
  * @sector:	start sector
  * @nr_sects:	number of sectors to discard
- * @error_sector:	error sector
+ * @end_io:	end_io function (or NULL)
  *
  * Description:
- *    Issue a discard request for the sectors in question. Caller can supply
- *    room for storing the error offset in case of a discard error, if they
- *    wish to.
+ *    Issue a discard request for the sectors in question. Caller can pass an
+ *    end_io function (which must call bio_put()), if they really want to see
+ *    the outcome. Most callers probably won't, and can just pass NULL.
  */
 int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
-			 unsigned nr_sects, sector_t *error_sector)
+			 unsigned nr_sects, bio_end_io_t end_io)
 {
-	DECLARE_COMPLETION_ONSTACK(wait);
 	struct request_queue *q;
 	struct bio *bio;
 	int ret;
@@ -343,33 +343,24 @@
 		return -ENXIO;
 
+	/* Many callers won't care at all about the outcome. After all,
+	   this is just a hint to the underlying device. They'll just
+	   ignore errors completely. So the end_io function can be just
+	   a call to bio_put() */
+	if (!end_io)
+		end_io = (void *)&bio_put;
+
+	if (!q->discard_fn)
+		return -EOPNOTSUPP;
+
 	bio = bio_alloc(GFP_KERNEL, 0);
 	if (!bio)
 		return -ENOMEM;
 
-	bio->bi_end_io = bio_end_empty_barrier;
-	bio->bi_private = &wait;
+	bio->bi_end_io = end_io;
 	bio->bi_bdev = bdev;
 	bio->bi_sector = sector;
 	bio->bi_size = nr_sects << 9;
 	submit_bio(1 << BIO_RW_DISCARD, bio);
-
-	wait_for_completion(&wait);
-
-	/*
-	 * The driver must store the error location in ->bi_sector, if
-	 * it supports it. For non-stacked drivers, this should be copied
-	 * from rq->sector.
-	 */
-	if (error_sector)
-		*error_sector = bio->bi_sector;
-
-	ret = 0;
-	if (bio_flagged(bio, BIO_EOPNOTSUPP))
-		ret = -EOPNOTSUPP;
-	else if (!bio_flagged(bio, BIO_UPTODATE))
-		ret = -EIO;
-
-	bio_put(bio);
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL(blkdev_issue_discard);
diff -u b/block/blk-core.c b/block/blk-core.c
--- b/block/blk-core.c
+++ b/block/blk-core.c
@@ -1502,10 +1501,9 @@
 	 * If it's a regular read/write or a barrier with data attached,
 	 * go through the normal accounting stuff before submission.
 	 */
-	if (!bio_empty_barrier(bio) && !bio_discard(bio)) {
+	if (!bio_dataless(bio)) {
 
 		BIO_BUG_ON(!bio->bi_size);
-		BIO_BUG_ON(!bio->bi_io_vec);
 
 		if (rw & WRITE) {
 			count_vm_events(PGPGOUT, count);
@@ -1576,12 +1574,13 @@
 		int nbytes;
 
 		/*
-		 * For an empty barrier request or sector discard request, the
-		 * low level driver must store a potential error location in 
-		 * ->sector. We pass that back up in ->bi_sector.
+		 * For an empty barrier request, the low level driver must
+		 * store a potential error location in ->sector. We pass
+		 * that back up in ->bi_sector.
 		 */
-		if (blk_empty_barrier(req) || blk_discard_rq(req))
+		if (blk_empty_barrier(req))
 			bio->bi_sector = req->sector;
+
 		if (nr_bytes >= bio->bi_size) {
 			req->bio = bio->bi_next;
 			nbytes = bio->bi_size;
diff -u b/fs/fat/fatent.c b/fs/fat/fatent.c
--- b/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -528,15 +528,6 @@
 	return err;
 }
 
-static void fat_discard_cluster(struct super_block *sb, int cluster)
-{
-	struct msdos_sb_info *sbi = MSDOS_SB(sb);
-	struct block_device *bdev = sb->s_bdev;
-	unsigned long sector = fat_clus_to_blknr(sbi, cluster);
-
-	blkdev_issue_discard(bdev, sector, sbi->sec_per_clus, NULL);
-}
-
 int fat_free_clusters(struct inode *inode, int cluster)
 {
 	struct super_block *sb = inode->i_sb;
@@ -550,8 +541,11 @@
 	fatent_init(&fatent);
 	lock_fat(sbi);
 	do {
-		fat_discard_cluster(sb, cluster);
-
+		/* Issue discard for the sectors we no longer care about */
+		blkdev_issue_discard(sb->s_bdev,
+				     fat_clus_to_blknr(sbi, cluster),
+				     sbi->sec_per_clus, NULL);
+				     
 		cluster = fat_ent_read(inode, &fatent, cluster);
 		if (cluster < 0) {
 			err = cluster;
diff -u b/include/linux/bio.h b/include/linux/bio.h
--- b/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -191,6 +191,8 @@
 #define bio_discard(bio)	((bio)->bi_rw & (1 << BIO_RW_DISCARD))
 #define bio_empty_barrier(bio)	(bio_barrier(bio) && !(bio)->bi_size)
 
+#define bio_dataless(bio)	(!(bio)->bi_io_vec)
+
 static inline unsigned int bio_cur_sectors(struct bio *bio)
 {
 	if (unlikely(bio_discard(bio)))
diff -u b/include/linux/blkdev.h b/include/linux/blkdev.h
--- b/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -546,6 +546,7 @@
 #define blk_empty_barrier(rq)	(blk_barrier_rq(rq) && blk_fs_request(rq) && !(rq)->hard_nr_sectors)
 /* rq->queuelist of dequeued request must be list_empty() */
 #define blk_queued_rq(rq)	(!list_empty(&(rq)->queuelist))
+#define blk_dataless_rq(rq)	(!(rq)->buffer)
 
 #define list_entry_rq(ptr)	list_entry((ptr), struct request, queuelist)
 
@@ -836,7 +837,7 @@
 }
 
 extern int blkdev_issue_flush(struct block_device *, sector_t *);
-extern int blkdev_issue_discard(struct block_device *, sector_t, unsigned, sector_t *);
+extern int blkdev_issue_discard(struct block_device *, sector_t, unsigned, bio_end_io_t *);
 
 /*
 * command filter functions

-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation




^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] 'discard sectors' block request.
  2008-08-05 16:29       ` David Woodhouse
@ 2008-08-05 17:25         ` David Woodhouse
  2008-08-06  9:25           ` [PATCH 1/5] [BLOCK] Add 'discard' request handling David Woodhouse
                             ` (4 more replies)
  0 siblings, 5 replies; 48+ messages in thread
From: David Woodhouse @ 2008-08-05 17:25 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Tue, 2008-08-05 at 17:29 +0100, David Woodhouse wrote:
> Incremental patch below; I'll commit it to a git tree in discrete
> patches, and publish those.

{http://,git://} git.infradead.org/users/dwmw2/discard-2.6.git

-- 
dwmw2


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH 1/5] [BLOCK] Add 'discard' request handling
  2008-08-05 17:25         ` David Woodhouse
@ 2008-08-06  9:25           ` David Woodhouse
  2008-08-06 16:19             ` OGAWA Hirofumi
                               ` (2 more replies)
  2008-08-06  9:25           ` [PATCH 2/5] [FAT] Let the block device know when sectors can be discarded David Woodhouse
                             ` (3 subsequent siblings)
  4 siblings, 3 replies; 48+ messages in thread
From: David Woodhouse @ 2008-08-06  9:25 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

Some block devices benefit from a hint that they can forget the contents
of certain sectors. Add basic support for this to the block core, along
with a 'blkdev_issue_discard()' helper function which issues such
requests.

Although blkdev_issue_discard() can take an end_io function, it's
acceptable to leave that as NULL and in that case the allocated bio will
be automatically freed. Most of the time, it's expected that callers
won't care about when, or even _if_, the request completes. It's only a
hint to the device anyway. By definition, the file system doesn't _care_
about these sectors any more.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 block/blk-barrier.c    |   49 ++++++++++++++++++++++++++++++++++++++++++++++++
 block/blk-core.c       |   25 ++++++++++++++++++-----
 block/blk-settings.c   |   16 +++++++++++++++
 block/elevator.c       |    2 +-
 include/linux/bio.h    |    9 ++++++++
 include/linux/blkdev.h |   10 +++++++++
 6 files changed, 104 insertions(+), 7 deletions(-)

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index a09ead1..29e60ff 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -315,3 +315,52 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
 	return ret;
 }
 EXPORT_SYMBOL(blkdev_issue_flush);
+
+/**
+ * blkdev_issue_discard - queue a discard
+ * @bdev:	blockdev to issue discard for
+ * @sector:	start sector
+ * @nr_sects:	number of sectors to discard
+ * @end_io:	end_io function (or %NULL)
+ *
+ * Description:
+ *    Issue a discard request for the sectors in question. Caller can pass an
+ *    end_io function (which must call bio_put()), if they really want to see
+ *    the outcome. Most callers probably won't, and can just pass %NULL.
+ */
+int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+			 unsigned nr_sects, bio_end_io_t end_io)
+{
+	struct request_queue *q;
+	struct bio *bio;
+	int ret;
+
+	if (bdev->bd_disk == NULL)
+		return -ENXIO;
+
+	q = bdev_get_queue(bdev);
+	if (!q)
+		return -ENXIO;
+
+	/* Many callers won't care at all about the outcome. After all,
+	   this is just a hint to the underlying device. They'll just
+	   ignore errors completely. So the end_io function can be just
+	   a call to bio_put() */
+	if (!end_io)
+		end_io = (void *)&bio_put;
+
+	if (!q->discard_fn)
+		return -EOPNOTSUPP;
+
+	bio = bio_alloc(GFP_KERNEL, 0);
+	if (!bio)
+		return -ENOMEM;
+
+	bio->bi_end_io = end_io;
+	bio->bi_bdev = bdev;
+	bio->bi_sector = sector;
+	bio->bi_size = nr_sects << 9;
+	submit_bio(1 << BIO_RW_DISCARD, bio);
+	return 0;
+}
+EXPORT_SYMBOL(blkdev_issue_discard);
diff --git a/block/blk-core.c b/block/blk-core.c
index 4889eb8..0cb993a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1081,6 +1081,10 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 	 */
 	if (unlikely(bio_barrier(bio)))
 		req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
+	if (unlikely(bio_discard(bio))) {
+		req->cmd_flags |= (REQ_SOFTBARRIER | REQ_DISCARD | REQ_NOMERGE);
+		req->q->discard_fn(req->q, req);
+	}
 
 	if (bio_sync(bio))
 		req->cmd_flags |= REQ_RW_SYNC;
@@ -1097,7 +1101,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 static int __make_request(struct request_queue *q, struct bio *bio)
 {
 	struct request *req;
-	int el_ret, nr_sectors, barrier, err;
+	int el_ret, nr_sectors, barrier, discard, err;
 	const unsigned short prio = bio_prio(bio);
 	const int sync = bio_sync(bio);
 	int rw_flags;
@@ -1117,9 +1121,15 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 		goto end_io;
 	}
 
+	discard = bio_discard(bio);
+	if (unlikely(discard) && !q->discard_fn) {
+		err = -EOPNOTSUPP;
+		goto end_io;
+	}
+
 	spin_lock_irq(q->queue_lock);
 
-	if (unlikely(barrier) || elv_queue_empty(q))
+	if (unlikely(barrier) || unlikely(discard) || elv_queue_empty(q))
 		goto get_rq;
 
 	el_ret = elv_merge(q, &req, bio);
@@ -1411,6 +1421,10 @@ end_io:
 			err = -EOPNOTSUPP;
 			goto end_io;
 		}
+		if (bio_discard(bio) && !q->discard_fn) {
+			err = -EOPNOTSUPP;
+			goto end_io;
+		}
 
 		ret = q->make_request_fn(q, bio);
 	} while (ret);
@@ -1488,10 +1502,9 @@ void submit_bio(int rw, struct bio *bio)
 	 * If it's a regular read/write or a barrier with data attached,
 	 * go through the normal accounting stuff before submission.
 	 */
-	if (!bio_empty_barrier(bio)) {
+	if (!bio_dataless(bio)) {
 
 		BIO_BUG_ON(!bio->bi_size);
-		BIO_BUG_ON(!bio->bi_io_vec);
 
 		if (rw & WRITE) {
 			count_vm_events(PGPGOUT, count);
@@ -1886,7 +1899,7 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
 	struct request_queue *q = rq->q;
 	unsigned long flags = 0UL;
 
-	if (blk_fs_request(rq) || blk_pc_request(rq)) {
+	if (blk_fs_request(rq) || blk_pc_request(rq) || blk_discard_rq(rq)) {
 		if (__end_that_request_first(rq, error, nr_bytes))
 			return 1;
 
@@ -1944,7 +1957,7 @@ EXPORT_SYMBOL_GPL(blk_end_request);
  **/
 int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
 {
-	if (blk_fs_request(rq) || blk_pc_request(rq)) {
+	if (blk_fs_request(rq) || blk_pc_request(rq) || blk_discard_rq(rq)) {
 		if (__end_that_request_first(rq, error, nr_bytes))
 			return 1;
 	}
diff --git a/block/blk-settings.c b/block/blk-settings.c
index dfc7701..6991af0 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -33,6 +33,22 @@ void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
 EXPORT_SYMBOL(blk_queue_prep_rq);
 
 /**
+ * blk_queue_discard - set a discard_sectors function for queue
+ * @q:		queue
+ * @dfn:	discard function
+ *
+ * It's possible for a queue to register a discard callback which is used
+ * to queue a request to discard (or "trim") sectors whose content is no
+ * longer required, if the underlying device supports such an action.
+ *
+ */
+void blk_queue_discard(struct request_queue *q, discard_fn *dfn)
+{
+	q->discard_fn = dfn;
+}
+EXPORT_SYMBOL(blk_queue_discard);
+
+/**
  * blk_queue_merge_bvec - set a merge_bvec function for queue
  * @q:		queue
  * @mbfn:	merge_bvec_fn
diff --git a/block/elevator.c b/block/elevator.c
index ed6f8f3..bb26424 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -675,7 +675,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where,
 	if (q->ordcolor)
 		rq->cmd_flags |= REQ_ORDERED_COLOR;
 
-	if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
+	if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER | REQ_DISCARD)) {
 		/*
 		 * toggle ordered color
 		 */
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 0933a14..67cb2ee 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -149,6 +149,8 @@ struct bio {
  * bit 2 -- barrier
  * bit 3 -- fail fast, don't want low level driver retries
  * bit 4 -- synchronous I/O hint: the block layer will unplug immediately
+ * bit 5 --
+ * bit 6 -- discard sectors (trim)
  */
 #define BIO_RW		0
 #define BIO_RW_AHEAD	1
@@ -156,6 +158,7 @@ struct bio {
 #define BIO_RW_FAILFAST	3
 #define BIO_RW_SYNC	4
 #define BIO_RW_META	5
+#define BIO_RW_DISCARD	6
 
 /*
  * upper 16 bits of bi_rw define the io priority of this bio
@@ -185,10 +188,16 @@ struct bio {
 #define bio_failfast(bio)	((bio)->bi_rw & (1 << BIO_RW_FAILFAST))
 #define bio_rw_ahead(bio)	((bio)->bi_rw & (1 << BIO_RW_AHEAD))
 #define bio_rw_meta(bio)	((bio)->bi_rw & (1 << BIO_RW_META))
+#define bio_discard(bio)	((bio)->bi_rw & (1 << BIO_RW_DISCARD))
 #define bio_empty_barrier(bio)	(bio_barrier(bio) && !(bio)->bi_size)
 
+#define bio_dataless(bio)	(!(bio)->bi_io_vec)
+
 static inline unsigned int bio_cur_sectors(struct bio *bio)
 {
+	if (unlikely(bio_discard(bio)))
+		return bio->bi_size >> 9;
+
 	if (bio->bi_vcnt)
 		return bio_iovec(bio)->bv_len >> 9;
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e61f22b..9e3adc4 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -55,6 +55,7 @@ enum rq_cmd_type_bits {
 	REQ_TYPE_PM_RESUME,		/* resume request */
 	REQ_TYPE_PM_SHUTDOWN,		/* shutdown request */
 	REQ_TYPE_FLUSH,			/* flush request */
+	REQ_TYPE_DISCARD,		/* discard sectors request */
 	REQ_TYPE_SPECIAL,		/* driver defined type */
 	REQ_TYPE_LINUX_BLOCK,		/* generic block layer message */
 	/*
@@ -89,6 +90,7 @@ enum {
 enum rq_flag_bits {
 	__REQ_RW,		/* not set, read. set, write */
 	__REQ_FAILFAST,		/* no low level driver retries */
+	__REQ_DISCARD,		/* request to discard sectors */
 	__REQ_SORTED,		/* elevator knows about this request */
 	__REQ_SOFTBARRIER,	/* may not be passed by ioscheduler */
 	__REQ_HARDBARRIER,	/* may not be passed by drive either */
@@ -111,6 +113,7 @@ enum rq_flag_bits {
 };
 
 #define REQ_RW		(1 << __REQ_RW)
+#define REQ_DISCARD	(1 << __REQ_DISCARD)
 #define REQ_FAILFAST	(1 << __REQ_FAILFAST)
 #define REQ_SORTED	(1 << __REQ_SORTED)
 #define REQ_SOFTBARRIER	(1 << __REQ_SOFTBARRIER)
@@ -252,6 +255,7 @@ typedef void (request_fn_proc) (struct request_queue *q);
 typedef int (make_request_fn) (struct request_queue *q, struct bio *bio);
 typedef int (prep_rq_fn) (struct request_queue *, struct request *);
 typedef void (unplug_fn) (struct request_queue *);
+typedef int (discard_fn) (struct request_queue *, struct request *);
 
 struct bio_vec;
 struct bvec_merge_data {
@@ -298,6 +302,7 @@ struct request_queue
 	make_request_fn		*make_request_fn;
 	prep_rq_fn		*prep_rq_fn;
 	unplug_fn		*unplug_fn;
+	discard_fn		*discard_fn;
 	merge_bvec_fn		*merge_bvec_fn;
 	prepare_flush_fn	*prepare_flush_fn;
 	softirq_done_fn		*softirq_done_fn;
@@ -536,10 +541,12 @@ enum {
 #define blk_sorted_rq(rq)	((rq)->cmd_flags & REQ_SORTED)
 #define blk_barrier_rq(rq)	((rq)->cmd_flags & REQ_HARDBARRIER)
 #define blk_fua_rq(rq)		((rq)->cmd_flags & REQ_FUA)
+#define blk_discard_rq(rq)	((rq)->cmd_flags & REQ_DISCARD)
 #define blk_bidi_rq(rq)		((rq)->next_rq != NULL)
 #define blk_empty_barrier(rq)	(blk_barrier_rq(rq) && blk_fs_request(rq) && !(rq)->hard_nr_sectors)
 /* rq->queuelist of dequeued request must be list_empty() */
 #define blk_queued_rq(rq)	(!list_empty(&(rq)->queuelist))
+#define blk_dataless_rq(rq)	(!(rq)->buffer)
 
 #define list_entry_rq(ptr)	list_entry((ptr), struct request, queuelist)
 
@@ -786,6 +793,7 @@ extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *);
 extern void blk_queue_dma_alignment(struct request_queue *, int);
 extern void blk_queue_update_dma_alignment(struct request_queue *, int);
 extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
+extern void blk_queue_discard(struct request_queue *, discard_fn *);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
 extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *);
 extern int blk_do_ordered(struct request_queue *, struct request **);
@@ -829,6 +837,8 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 }
 
 extern int blkdev_issue_flush(struct block_device *, sector_t *);
+extern int blkdev_issue_discard(struct block_device *, sector_t sector,
+				unsigned nr_sects, bio_end_io_t *);
 
 /*
 * command filter functions
-- 
1.5.5.1


-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation




^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/5] [BLOCK] Add 'discard' request handling
  2008-08-06  9:25           ` [PATCH 1/5] [BLOCK] Add 'discard' request handling David Woodhouse
@ 2008-08-06 16:19             ` OGAWA Hirofumi
  2008-08-06 18:18               ` David Woodhouse
  2008-08-07 16:32             ` [PATCH 1/5, v2] " David Woodhouse
  2008-08-07 18:41             ` [PATCH 1/5] " Jens Axboe
  2 siblings, 1 reply; 48+ messages in thread
From: OGAWA Hirofumi @ 2008-08-06 16:19 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Jens Axboe, Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

David Woodhouse <dwmw2@infradead.org> writes:

> Some block devices benefit from a hint that they can forget the contents
> of certain sectors. Add basic support for this to the block core, along
> with a 'blkdev_issue_discard()' helper function which issues such
> requests.
>
> Although blkdev_issue_discard() can take an end_io function, it's
> acceptable to leave that as NULL and in that case the allocated bio will
> be automatically freed. Most of the time, it's expected that callers
> won't care about when, or even _if_, the request completes. It's only a
> hint to the device anyway. By definition, the file system doesn't _care_
> about these sectors any more.

Looks like good start. Thanks. Although I'm not quite sure it helps, is
there any plan to merge bios?

> +int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
> +			 unsigned nr_sects, bio_end_io_t end_io)
> +{
> +	struct request_queue *q;

[...]

> +
> +	bio->bi_end_io = end_io;
> +	bio->bi_bdev = bdev;
> +	bio->bi_sector = sector;
> +	bio->bi_size = nr_sects << 9;
> +	submit_bio(1 << BIO_RW_DISCARD, bio);
> +	return 0;
> +}

If fs merges contiguous blocks to one bio, bi_size will overflow.
Will we have to limit (of the device?) and separate for the device?
Or how about to have #define for discard bio for BIO_RW_DISCARD users?

> @@ -185,10 +188,16 @@ struct bio {
>  #define bio_failfast(bio)	((bio)->bi_rw & (1 << BIO_RW_FAILFAST))
>  #define bio_rw_ahead(bio)	((bio)->bi_rw & (1 << BIO_RW_AHEAD))
>  #define bio_rw_meta(bio)	((bio)->bi_rw & (1 << BIO_RW_META))
> +#define bio_discard(bio)	((bio)->bi_rw & (1 << BIO_RW_DISCARD))
>  #define bio_empty_barrier(bio)	(bio_barrier(bio) && !(bio)->bi_size)
>  
> +#define bio_dataless(bio)	(!(bio)->bi_io_vec)

The following seems good to convert to bio_dataless?

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
---

 mm/bounce.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff -puN mm/bounce.c~test mm/bounce.c
--- linux-2.6/mm/bounce.c~test	2008-08-07 00:04:26.000000000 +0900
+++ linux-2.6-hirofumi/mm/bounce.c	2008-08-07 00:04:57.000000000 +0900
@@ -267,7 +267,7 @@ void blk_queue_bounce(struct request_que
 	/*
 	 * Data-less bio, nothing to bounce
 	 */
-	if (bio_empty_barrier(*bio_orig))
+	if (bio_dataless(*bio_orig)) {
 		return;
 
 	/*
_
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/5] [BLOCK] Add 'discard' request handling
  2008-08-06 16:19             ` OGAWA Hirofumi
@ 2008-08-06 18:18               ` David Woodhouse
  2008-08-06 19:28                 ` OGAWA Hirofumi
  0 siblings, 1 reply; 48+ messages in thread
From: David Woodhouse @ 2008-08-06 18:18 UTC (permalink / raw)
  To: OGAWA Hirofumi
  Cc: Jens Axboe, Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Thu, 2008-08-07 at 01:19 +0900, OGAWA Hirofumi wrote:
> David Woodhouse <dwmw2@infradead.org> writes:
> 
> > Some block devices benefit from a hint that they can forget the contents
> > of certain sectors. Add basic support for this to the block core, along
> > with a 'blkdev_issue_discard()' helper function which issues such
> > requests.
> >
> > Although blkdev_issue_discard() can take an end_io function, it's
> > acceptable to leave that as NULL and in that case the allocated bio will
> > be automatically freed. Most of the time, it's expected that callers
> > won't care about when, or even _if_, the request completes. It's only a
> > hint to the device anyway. By definition, the file system doesn't _care_
> > about these sectors any more.
> 
> Looks like good start. Thanks. Although I'm not quite sure it helps, is
> there any plan to merge bios?

Yes, it would be very good to merge bios. In fact, we can also allow the
elevator to let discards cross reads and writes -- there's no _real_
need for them to be a soft barrier. And we can _drop_ a discard request
if there's a later write to the same sectors.

I was planning to do that work in the elevator later. Or preferably,
leave it to someone more familiar with that code.

> If fs merges contiguous blocks to one bio, bi_size will overflow.
> Will we have to limit (of the device?) and separate for the device?

Or just let the device deal with it in multiple requests if it needs to?
I have no strong opinion.

> Or how about to have #define for discard bio for BIO_RW_DISCARD users?

Que?

> The following seems good to convert to bio_dataless?

Yes, that seems appropriate; thanks.
_
-- 
dwmw2


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/5] [BLOCK] Add 'discard' request handling
  2008-08-06 18:18               ` David Woodhouse
@ 2008-08-06 19:28                 ` OGAWA Hirofumi
  0 siblings, 0 replies; 48+ messages in thread
From: OGAWA Hirofumi @ 2008-08-06 19:28 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Jens Axboe, Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

David Woodhouse <dwmw2@infradead.org> writes:

>> If fs merges contiguous blocks to one bio, bi_size will overflow.
>> Will we have to limit (of the device?) and separate for the device?
>
> Or just let the device deal with it in multiple requests if it needs to?
> I have no strong opinion.

Ah, I see.

>> Or how about to have #define for discard bio for BIO_RW_DISCARD users?
>
> Que?

How about to add #define like BIO_MAX_SIZE of limit for one discard bio?
Maybe, it is needed for BIO_RW_DISCARD users...?
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH 1/5, v2] [BLOCK] Add 'discard' request handling
  2008-08-06  9:25           ` [PATCH 1/5] [BLOCK] Add 'discard' request handling David Woodhouse
  2008-08-06 16:19             ` OGAWA Hirofumi
@ 2008-08-07 16:32             ` David Woodhouse
  2008-08-07 18:41             ` [PATCH 1/5] " Jens Axboe
  2 siblings, 0 replies; 48+ messages in thread
From: David Woodhouse @ 2008-08-07 16:32 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

Some block devices benefit from a hint that they can forget the contents
of certain sectors. Add basic support for this to the block core, along
with a 'blkdev_issue_discard()' helper function which issues such
requests.

Although blkdev_issue_discard() can take an end_io function, it's
acceptable to leave that as NULL and in that case the allocated bio will
be automatically freed. Most of the time, it's expected that callers
won't care about when, or even _if_, the request completes. It's only a
hint to the device anyway. By definition, the file system doesn't _care_
about these sectors any more.

Improved-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 block/blk-barrier.c    |   49 ++++++++++++++++++++++++++++++++++++++++++++++++
 block/blk-core.c       |   25 ++++++++++++++++++-----
 block/blk-settings.c   |   16 +++++++++++++++
 block/elevator.c       |    2 +-
 include/linux/bio.h    |    9 ++++++++
 include/linux/blkdev.h |   19 ++++++++++++++++++
 mm/bounce.c            |    2 +-
 7 files changed, 114 insertions(+), 8 deletions(-)

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index a09ead1..29e60ff 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -315,3 +315,52 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
 	return ret;
 }
 EXPORT_SYMBOL(blkdev_issue_flush);
+
+/**
+ * blkdev_issue_discard - queue a discard
+ * @bdev:	blockdev to issue discard for
+ * @sector:	start sector
+ * @nr_sects:	number of sectors to discard
+ * @end_io:	end_io function (or %NULL)
+ *
+ * Description:
+ *    Issue a discard request for the sectors in question. Caller can pass an
+ *    end_io function (which must call bio_put()), if they really want to see
+ *    the outcome. Most callers probably won't, and can just pass %NULL.
+ */
+int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+			 unsigned nr_sects, bio_end_io_t end_io)
+{
+	struct request_queue *q;
+	struct bio *bio;
+	int ret;
+
+	if (bdev->bd_disk == NULL)
+		return -ENXIO;
+
+	q = bdev_get_queue(bdev);
+	if (!q)
+		return -ENXIO;
+
+	/* Many callers won't care at all about the outcome. After all,
+	   this is just a hint to the underlying device. They'll just
+	   ignore errors completely. So the end_io function can be just
+	   a call to bio_put() */
+	if (!end_io)
+		end_io = (void *)&bio_put;
+
+	if (!q->discard_fn)
+		return -EOPNOTSUPP;
+
+	bio = bio_alloc(GFP_KERNEL, 0);
+	if (!bio)
+		return -ENOMEM;
+
+	bio->bi_end_io = end_io;
+	bio->bi_bdev = bdev;
+	bio->bi_sector = sector;
+	bio->bi_size = nr_sects << 9;
+	submit_bio(1 << BIO_RW_DISCARD, bio);
+	return 0;
+}
+EXPORT_SYMBOL(blkdev_issue_discard);
diff --git a/block/blk-core.c b/block/blk-core.c
index 4889eb8..0cb993a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1081,6 +1081,10 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 	 */
 	if (unlikely(bio_barrier(bio)))
 		req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
+	if (unlikely(bio_discard(bio))) {
+		req->cmd_flags |= (REQ_SOFTBARRIER | REQ_DISCARD | REQ_NOMERGE);
+		req->q->discard_fn(req->q, req);
+	}
 
 	if (bio_sync(bio))
 		req->cmd_flags |= REQ_RW_SYNC;
@@ -1097,7 +1101,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 static int __make_request(struct request_queue *q, struct bio *bio)
 {
 	struct request *req;
-	int el_ret, nr_sectors, barrier, err;
+	int el_ret, nr_sectors, barrier, discard, err;
 	const unsigned short prio = bio_prio(bio);
 	const int sync = bio_sync(bio);
 	int rw_flags;
@@ -1117,9 +1121,15 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 		goto end_io;
 	}
 
+	discard = bio_discard(bio);
+	if (unlikely(discard) && !q->discard_fn) {
+		err = -EOPNOTSUPP;
+		goto end_io;
+	}
+
 	spin_lock_irq(q->queue_lock);
 
-	if (unlikely(barrier) || elv_queue_empty(q))
+	if (unlikely(barrier) || unlikely(discard) || elv_queue_empty(q))
 		goto get_rq;
 
 	el_ret = elv_merge(q, &req, bio);
@@ -1411,6 +1421,10 @@ end_io:
 			err = -EOPNOTSUPP;
 			goto end_io;
 		}
+		if (bio_discard(bio) && !q->discard_fn) {
+			err = -EOPNOTSUPP;
+			goto end_io;
+		}
 
 		ret = q->make_request_fn(q, bio);
 	} while (ret);
@@ -1488,10 +1502,9 @@ void submit_bio(int rw, struct bio *bio)
 	 * If it's a regular read/write or a barrier with data attached,
 	 * go through the normal accounting stuff before submission.
 	 */
-	if (!bio_empty_barrier(bio)) {
+	if (!bio_dataless(bio)) {
 
 		BIO_BUG_ON(!bio->bi_size);
-		BIO_BUG_ON(!bio->bi_io_vec);
 
 		if (rw & WRITE) {
 			count_vm_events(PGPGOUT, count);
@@ -1886,7 +1899,7 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
 	struct request_queue *q = rq->q;
 	unsigned long flags = 0UL;
 
-	if (blk_fs_request(rq) || blk_pc_request(rq)) {
+	if (blk_fs_request(rq) || blk_pc_request(rq) || blk_discard_rq(rq)) {
 		if (__end_that_request_first(rq, error, nr_bytes))
 			return 1;
 
@@ -1944,7 +1957,7 @@ EXPORT_SYMBOL_GPL(blk_end_request);
  **/
 int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
 {
-	if (blk_fs_request(rq) || blk_pc_request(rq)) {
+	if (blk_fs_request(rq) || blk_pc_request(rq) || blk_discard_rq(rq)) {
 		if (__end_that_request_first(rq, error, nr_bytes))
 			return 1;
 	}
diff --git a/block/blk-settings.c b/block/blk-settings.c
index dfc7701..6991af0 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -33,6 +33,22 @@ void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
 EXPORT_SYMBOL(blk_queue_prep_rq);
 
 /**
+ * blk_queue_discard - set a discard_sectors function for queue
+ * @q:		queue
+ * @dfn:	discard function
+ *
+ * It's possible for a queue to register a discard callback which is used
+ * to queue a request to discard (or "trim") sectors whose content is no
+ * longer required, if the underlying device supports such an action.
+ *
+ */
+void blk_queue_discard(struct request_queue *q, discard_fn *dfn)
+{
+	q->discard_fn = dfn;
+}
+EXPORT_SYMBOL(blk_queue_discard);
+
+/**
  * blk_queue_merge_bvec - set a merge_bvec function for queue
  * @q:		queue
  * @mbfn:	merge_bvec_fn
diff --git a/block/elevator.c b/block/elevator.c
index ed6f8f3..bb26424 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -675,7 +675,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where,
 	if (q->ordcolor)
 		rq->cmd_flags |= REQ_ORDERED_COLOR;
 
-	if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
+	if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER | REQ_DISCARD)) {
 		/*
 		 * toggle ordered color
 		 */
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 0933a14..67cb2ee 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -149,6 +149,8 @@ struct bio {
  * bit 2 -- barrier
  * bit 3 -- fail fast, don't want low level driver retries
  * bit 4 -- synchronous I/O hint: the block layer will unplug immediately
+ * bit 5 --
+ * bit 6 -- discard sectors (trim)
  */
 #define BIO_RW		0
 #define BIO_RW_AHEAD	1
@@ -156,6 +158,7 @@ struct bio {
 #define BIO_RW_FAILFAST	3
 #define BIO_RW_SYNC	4
 #define BIO_RW_META	5
+#define BIO_RW_DISCARD	6
 
 /*
  * upper 16 bits of bi_rw define the io priority of this bio
@@ -185,10 +188,16 @@ struct bio {
 #define bio_failfast(bio)	((bio)->bi_rw & (1 << BIO_RW_FAILFAST))
 #define bio_rw_ahead(bio)	((bio)->bi_rw & (1 << BIO_RW_AHEAD))
 #define bio_rw_meta(bio)	((bio)->bi_rw & (1 << BIO_RW_META))
+#define bio_discard(bio)	((bio)->bi_rw & (1 << BIO_RW_DISCARD))
 #define bio_empty_barrier(bio)	(bio_barrier(bio) && !(bio)->bi_size)
 
+#define bio_dataless(bio)	(!(bio)->bi_io_vec)
+
 static inline unsigned int bio_cur_sectors(struct bio *bio)
 {
+	if (unlikely(bio_discard(bio)))
+		return bio->bi_size >> 9;
+
 	if (bio->bi_vcnt)
 		return bio_iovec(bio)->bv_len >> 9;
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e61f22b..1ea2d70 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -55,6 +55,7 @@ enum rq_cmd_type_bits {
 	REQ_TYPE_PM_RESUME,		/* resume request */
 	REQ_TYPE_PM_SHUTDOWN,		/* shutdown request */
 	REQ_TYPE_FLUSH,			/* flush request */
+	REQ_TYPE_DISCARD,		/* discard sectors request */
 	REQ_TYPE_SPECIAL,		/* driver defined type */
 	REQ_TYPE_LINUX_BLOCK,		/* generic block layer message */
 	/*
@@ -89,6 +90,7 @@ enum {
 enum rq_flag_bits {
 	__REQ_RW,		/* not set, read. set, write */
 	__REQ_FAILFAST,		/* no low level driver retries */
+	__REQ_DISCARD,		/* request to discard sectors */
 	__REQ_SORTED,		/* elevator knows about this request */
 	__REQ_SOFTBARRIER,	/* may not be passed by ioscheduler */
 	__REQ_HARDBARRIER,	/* may not be passed by drive either */
@@ -111,6 +113,7 @@ enum rq_flag_bits {
 };
 
 #define REQ_RW		(1 << __REQ_RW)
+#define REQ_DISCARD	(1 << __REQ_DISCARD)
 #define REQ_FAILFAST	(1 << __REQ_FAILFAST)
 #define REQ_SORTED	(1 << __REQ_SORTED)
 #define REQ_SOFTBARRIER	(1 << __REQ_SOFTBARRIER)
@@ -252,6 +255,7 @@ typedef void (request_fn_proc) (struct request_queue *q);
 typedef int (make_request_fn) (struct request_queue *q, struct bio *bio);
 typedef int (prep_rq_fn) (struct request_queue *, struct request *);
 typedef void (unplug_fn) (struct request_queue *);
+typedef int (discard_fn) (struct request_queue *, struct request *);
 
 struct bio_vec;
 struct bvec_merge_data {
@@ -298,6 +302,7 @@ struct request_queue
 	make_request_fn		*make_request_fn;
 	prep_rq_fn		*prep_rq_fn;
 	unplug_fn		*unplug_fn;
+	discard_fn		*discard_fn;
 	merge_bvec_fn		*merge_bvec_fn;
 	prepare_flush_fn	*prepare_flush_fn;
 	softirq_done_fn		*softirq_done_fn;
@@ -536,10 +541,12 @@ enum {
 #define blk_sorted_rq(rq)	((rq)->cmd_flags & REQ_SORTED)
 #define blk_barrier_rq(rq)	((rq)->cmd_flags & REQ_HARDBARRIER)
 #define blk_fua_rq(rq)		((rq)->cmd_flags & REQ_FUA)
+#define blk_discard_rq(rq)	((rq)->cmd_flags & REQ_DISCARD)
 #define blk_bidi_rq(rq)		((rq)->next_rq != NULL)
 #define blk_empty_barrier(rq)	(blk_barrier_rq(rq) && blk_fs_request(rq) && !(rq)->hard_nr_sectors)
 /* rq->queuelist of dequeued request must be list_empty() */
 #define blk_queued_rq(rq)	(!list_empty(&(rq)->queuelist))
+#define blk_dataless_rq(rq)	(!(rq)->buffer)
 
 #define list_entry_rq(ptr)	list_entry((ptr), struct request, queuelist)
 
@@ -786,6 +793,7 @@ extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *);
 extern void blk_queue_dma_alignment(struct request_queue *, int);
 extern void blk_queue_update_dma_alignment(struct request_queue *, int);
 extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
+extern void blk_queue_discard(struct request_queue *, discard_fn *);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
 extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *);
 extern int blk_do_ordered(struct request_queue *, struct request **);
@@ -829,6 +837,17 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 }
 
 extern int blkdev_issue_flush(struct block_device *, sector_t *);
+extern int blkdev_issue_discard(struct block_device *, sector_t sector,
+				unsigned nr_sects, bio_end_io_t *);
+
+static inline int sb_issue_discard(struct super_block *sb,
+				   sector_t block, unsigned nr_blocks,
+				   bio_end_io_t *endio)
+{
+	block <<= (sb->s_blocksize_bits - 9);
+	nr_blocks <<= (sb->s_blocksize_bits - 9);
+	return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, endio);
+}
 
 /*
 * command filter functions
diff --git a/mm/bounce.c b/mm/bounce.c
index b6d2d0f..1cfe193 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -267,7 +267,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
 	/*
 	 * Data-less bio, nothing to bounce
 	 */
-	if (bio_empty_barrier(*bio_orig))
+	if (bio_dataless(*bio_orig))
 		return;
 
 	/*
-- 
1.5.5.1


-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation




^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/5] [BLOCK] Add 'discard' request handling
  2008-08-06  9:25           ` [PATCH 1/5] [BLOCK] Add 'discard' request handling David Woodhouse
  2008-08-06 16:19             ` OGAWA Hirofumi
  2008-08-07 16:32             ` [PATCH 1/5, v2] " David Woodhouse
@ 2008-08-07 18:41             ` Jens Axboe
  2008-08-08  9:33               ` David Woodhouse
  2 siblings, 1 reply; 48+ messages in thread
From: Jens Axboe @ 2008-08-07 18:41 UTC (permalink / raw)
  To: David Woodhouse; +Cc: Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Wed, Aug 06 2008, David Woodhouse wrote:
> Some block devices benefit from a hint that they can forget the contents
> of certain sectors. Add basic support for this to the block core, along
> with a 'blkdev_issue_discard()' helper function which issues such
> requests.
> 
> Although blkdev_issue_discard() can take an end_io function, it's
> acceptable to leave that as NULL and in that case the allocated bio will
> be automatically freed. Most of the time, it's expected that callers
> won't care about when, or even _if_, the request completes. It's only a
> hint to the device anyway. By definition, the file system doesn't _care_
> about these sectors any more.

I've queued this up for some testing, I'll add the merge support as
well.

> Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
> ---
>  block/blk-barrier.c    |   49 ++++++++++++++++++++++++++++++++++++++++++++++++
>  block/blk-core.c       |   25 ++++++++++++++++++-----
>  block/blk-settings.c   |   16 +++++++++++++++
>  block/elevator.c       |    2 +-
>  include/linux/bio.h    |    9 ++++++++
>  include/linux/blkdev.h |   10 +++++++++
>  6 files changed, 104 insertions(+), 7 deletions(-)
> 
> diff --git a/block/blk-barrier.c b/block/blk-barrier.c
> index a09ead1..29e60ff 100644
> --- a/block/blk-barrier.c
> +++ b/block/blk-barrier.c

Not sure why you are placing it here, it should probably just go into
blk-core.c

> @@ -315,3 +315,52 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
>  	return ret;
>  }
>  EXPORT_SYMBOL(blkdev_issue_flush);
> +
> +/**
> + * blkdev_issue_discard - queue a discard
> + * @bdev:	blockdev to issue discard for
> + * @sector:	start sector
> + * @nr_sects:	number of sectors to discard
> + * @end_io:	end_io function (or %NULL)
> + *
> + * Description:
> + *    Issue a discard request for the sectors in question. Caller can pass an
> + *    end_io function (which must call bio_put()), if they really want to see
> + *    the outcome. Most callers probably won't, and can just pass %NULL.
> + */
> +int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
> +			 unsigned nr_sects, bio_end_io_t end_io)
> +{
> +	struct request_queue *q;
> +	struct bio *bio;
> +	int ret;
> +
> +	if (bdev->bd_disk == NULL)
> +		return -ENXIO;
> +
> +	q = bdev_get_queue(bdev);
> +	if (!q)
> +		return -ENXIO;
> +
> +	/* Many callers won't care at all about the outcome. After all,
> +	   this is just a hint to the underlying device. They'll just
> +	   ignore errors completely. So the end_io function can be just
> +	   a call to bio_put() */
> +	if (!end_io)
> +		end_io = (void *)&bio_put;

Please don't do that, that's pretty ugly.

> +
> +	if (!q->discard_fn)
> +		return -EOPNOTSUPP;
> +
> +	bio = bio_alloc(GFP_KERNEL, 0);
> +	if (!bio)
> +		return -ENOMEM;
> +
> +	bio->bi_end_io = end_io;
> +	bio->bi_bdev = bdev;
> +	bio->bi_sector = sector;
> +	bio->bi_size = nr_sects << 9;
> +	submit_bio(1 << BIO_RW_DISCARD, bio);
> +	return 0;
> +}
> +EXPORT_SYMBOL(blkdev_issue_discard);
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 4889eb8..0cb993a 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -1081,6 +1081,10 @@ void init_request_from_bio(struct request *req, struct bio *bio)
>  	 */
>  	if (unlikely(bio_barrier(bio)))
>  		req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
> +	if (unlikely(bio_discard(bio))) {
> +		req->cmd_flags |= (REQ_SOFTBARRIER | REQ_DISCARD | REQ_NOMERGE);
> +		req->q->discard_fn(req->q, req);
> +	}

What is the point of putting that request init into a new queue
strategy? Even if you wanted to do it in the driver, the driver should
just do it in its prep function.

>  	if (bio_sync(bio))
>  		req->cmd_flags |= REQ_RW_SYNC;
> @@ -1097,7 +1101,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
>  static int __make_request(struct request_queue *q, struct bio *bio)
>  {
>  	struct request *req;
> -	int el_ret, nr_sectors, barrier, err;
> +	int el_ret, nr_sectors, barrier, discard, err;
>  	const unsigned short prio = bio_prio(bio);
>  	const int sync = bio_sync(bio);
>  	int rw_flags;
> @@ -1117,9 +1121,15 @@ static int __make_request(struct request_queue *q, struct bio *bio)
>  		goto end_io;
>  	}
>  
> +	discard = bio_discard(bio);
> +	if (unlikely(discard) && !q->discard_fn) {
> +		err = -EOPNOTSUPP;
> +		goto end_io;
> +	}
> +
>  	spin_lock_irq(q->queue_lock);
>  
> -	if (unlikely(barrier) || elv_queue_empty(q))
> +	if (unlikely(barrier) || unlikely(discard) || elv_queue_empty(q))
>  		goto get_rq;
>  
>  	el_ret = elv_merge(q, &req, bio);
> @@ -1411,6 +1421,10 @@ end_io:
>  			err = -EOPNOTSUPP;
>  			goto end_io;
>  		}
> +		if (bio_discard(bio) && !q->discard_fn) {
> +			err = -EOPNOTSUPP;
> +			goto end_io;
> +		}
>  
>  		ret = q->make_request_fn(q, bio);
>  	} while (ret);
> @@ -1488,10 +1502,9 @@ void submit_bio(int rw, struct bio *bio)
>  	 * If it's a regular read/write or a barrier with data attached,
>  	 * go through the normal accounting stuff before submission.
>  	 */
> -	if (!bio_empty_barrier(bio)) {
> +	if (!bio_dataless(bio)) {
>  
>  		BIO_BUG_ON(!bio->bi_size);
> -		BIO_BUG_ON(!bio->bi_io_vec);
>  
>  		if (rw & WRITE) {
>  			count_vm_events(PGPGOUT, count);

Zach suggested bio_has_data() instead, which I agree is a lot more
readable.

> @@ -1886,7 +1899,7 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
>  	struct request_queue *q = rq->q;
>  	unsigned long flags = 0UL;
>  
> -	if (blk_fs_request(rq) || blk_pc_request(rq)) {
> +	if (blk_fs_request(rq) || blk_pc_request(rq) || blk_discard_rq(rq)) {
>  		if (__end_that_request_first(rq, error, nr_bytes))
>  			return 1;
>  

Striking my last comment on this, this really just checks whether we
have a bio to complete or not. So a check for that would probably be
better, but lets leave that for later. For now this is fine.

> diff --git a/block/blk-settings.c b/block/blk-settings.c
> index dfc7701..6991af0 100644
> --- a/block/blk-settings.c
> +++ b/block/blk-settings.c
> @@ -33,6 +33,22 @@ void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
>  EXPORT_SYMBOL(blk_queue_prep_rq);
>  
>  /**
> + * blk_queue_discard - set a discard_sectors function for queue
> + * @q:		queue
> + * @dfn:	discard function
> + *
> + * It's possible for a queue to register a discard callback which is used
> + * to queue a request to discard (or "trim") sectors whose content is no
> + * longer required, if the underlying device supports such an action.
> + *
> + */
> +void blk_queue_discard(struct request_queue *q, discard_fn *dfn)
> +{
> +	q->discard_fn = dfn;
> +}
> +EXPORT_SYMBOL(blk_queue_discard);
> +
> +/**
>   * blk_queue_merge_bvec - set a merge_bvec function for queue
>   * @q:		queue
>   * @mbfn:	merge_bvec_fn
> diff --git a/block/elevator.c b/block/elevator.c
> index ed6f8f3..bb26424 100644
> --- a/block/elevator.c
> +++ b/block/elevator.c
> @@ -675,7 +675,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where,
>  	if (q->ordcolor)
>  		rq->cmd_flags |= REQ_ORDERED_COLOR;
>  
> -	if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
> +	if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER | REQ_DISCARD)) {
>  		/*
>  		 * toggle ordered color
>  		 */

Just make REQ_DISCARD set REQ_SOFTBARRIER? Do you care if this acts as a
barrier or not?

> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index 0933a14..67cb2ee 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -149,6 +149,8 @@ struct bio {
>   * bit 2 -- barrier
>   * bit 3 -- fail fast, don't want low level driver retries
>   * bit 4 -- synchronous I/O hint: the block layer will unplug immediately
> + * bit 5 --
> + * bit 6 -- discard sectors (trim)
>   */
>  #define BIO_RW		0
>  #define BIO_RW_AHEAD	1
> @@ -156,6 +158,7 @@ struct bio {
>  #define BIO_RW_FAILFAST	3
>  #define BIO_RW_SYNC	4
>  #define BIO_RW_META	5
> +#define BIO_RW_DISCARD	6
>  
>  /*
>   * upper 16 bits of bi_rw define the io priority of this bio
> @@ -185,10 +188,16 @@ struct bio {
>  #define bio_failfast(bio)	((bio)->bi_rw & (1 << BIO_RW_FAILFAST))
>  #define bio_rw_ahead(bio)	((bio)->bi_rw & (1 << BIO_RW_AHEAD))
>  #define bio_rw_meta(bio)	((bio)->bi_rw & (1 << BIO_RW_META))
> +#define bio_discard(bio)	((bio)->bi_rw & (1 << BIO_RW_DISCARD))
>  #define bio_empty_barrier(bio)	(bio_barrier(bio) && !(bio)->bi_size)
>  
> +#define bio_dataless(bio)	(!(bio)->bi_io_vec)
> +
>  static inline unsigned int bio_cur_sectors(struct bio *bio)
>  {
> +	if (unlikely(bio_discard(bio)))
> +		return bio->bi_size >> 9;
> +
>  	if (bio->bi_vcnt)
>  		return bio_iovec(bio)->bv_len >> 9;

Just make that

        if (bio->bi_vcnt)
  		return bio_iovec(bio)->bv_len >> 9;
        else
                return bio->bi_size >> 9;

and add a comment about it.

> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index e61f22b..9e3adc4 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -55,6 +55,7 @@ enum rq_cmd_type_bits {
>  	REQ_TYPE_PM_RESUME,		/* resume request */
>  	REQ_TYPE_PM_SHUTDOWN,		/* shutdown request */
>  	REQ_TYPE_FLUSH,			/* flush request */
> +	REQ_TYPE_DISCARD,		/* discard sectors request */
>  	REQ_TYPE_SPECIAL,		/* driver defined type */
>  	REQ_TYPE_LINUX_BLOCK,		/* generic block layer message */
>  	/*
> @@ -89,6 +90,7 @@ enum {
>  enum rq_flag_bits {
>  	__REQ_RW,		/* not set, read. set, write */
>  	__REQ_FAILFAST,		/* no low level driver retries */
> +	__REQ_DISCARD,		/* request to discard sectors */
>  	__REQ_SORTED,		/* elevator knows about this request */
>  	__REQ_SOFTBARRIER,	/* may not be passed by ioscheduler */
>  	__REQ_HARDBARRIER,	/* may not be passed by drive either */
> @@ -111,6 +113,7 @@ enum rq_flag_bits {
>  };
>  
>  #define REQ_RW		(1 << __REQ_RW)
> +#define REQ_DISCARD	(1 << __REQ_DISCARD)
>  #define REQ_FAILFAST	(1 << __REQ_FAILFAST)
>  #define REQ_SORTED	(1 << __REQ_SORTED)
>  #define REQ_SOFTBARRIER	(1 << __REQ_SOFTBARRIER)
> @@ -252,6 +255,7 @@ typedef void (request_fn_proc) (struct request_queue *q);
>  typedef int (make_request_fn) (struct request_queue *q, struct bio *bio);
>  typedef int (prep_rq_fn) (struct request_queue *, struct request *);
>  typedef void (unplug_fn) (struct request_queue *);
> +typedef int (discard_fn) (struct request_queue *, struct request *);
>  
>  struct bio_vec;
>  struct bvec_merge_data {
> @@ -298,6 +302,7 @@ struct request_queue
>  	make_request_fn		*make_request_fn;
>  	prep_rq_fn		*prep_rq_fn;
>  	unplug_fn		*unplug_fn;
> +	discard_fn		*discard_fn;
>  	merge_bvec_fn		*merge_bvec_fn;
>  	prepare_flush_fn	*prepare_flush_fn;
>  	softirq_done_fn		*softirq_done_fn;
> @@ -536,10 +541,12 @@ enum {
>  #define blk_sorted_rq(rq)	((rq)->cmd_flags & REQ_SORTED)
>  #define blk_barrier_rq(rq)	((rq)->cmd_flags & REQ_HARDBARRIER)
>  #define blk_fua_rq(rq)		((rq)->cmd_flags & REQ_FUA)
> +#define blk_discard_rq(rq)	((rq)->cmd_flags & REQ_DISCARD)
>  #define blk_bidi_rq(rq)		((rq)->next_rq != NULL)
>  #define blk_empty_barrier(rq)	(blk_barrier_rq(rq) && blk_fs_request(rq) && !(rq)->hard_nr_sectors)
>  /* rq->queuelist of dequeued request must be list_empty() */
>  #define blk_queued_rq(rq)	(!list_empty(&(rq)->queuelist))
> +#define blk_dataless_rq(rq)	(!(rq)->buffer)
>  
>  #define list_entry_rq(ptr)	list_entry((ptr), struct request, queuelist)
>  
> @@ -786,6 +793,7 @@ extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *);
>  extern void blk_queue_dma_alignment(struct request_queue *, int);
>  extern void blk_queue_update_dma_alignment(struct request_queue *, int);
>  extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
> +extern void blk_queue_discard(struct request_queue *, discard_fn *);
>  extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
>  extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *);
>  extern int blk_do_ordered(struct request_queue *, struct request **);
> @@ -829,6 +837,8 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
>  }
>  
>  extern int blkdev_issue_flush(struct block_device *, sector_t *);
> +extern int blkdev_issue_discard(struct block_device *, sector_t sector,
> +				unsigned nr_sects, bio_end_io_t *);
>  
>  /*
>  * command filter functions
> -- 
> 1.5.5.1
> 
> 
> -- 
> David Woodhouse                            Open Source Technology Centre
> David.Woodhouse@intel.com                              Intel Corporation
> 
> 
> 

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/5] [BLOCK] Add 'discard' request handling
  2008-08-07 18:41             ` [PATCH 1/5] " Jens Axboe
@ 2008-08-08  9:33               ` David Woodhouse
  2008-08-08 10:30                 ` Jens Axboe
  0 siblings, 1 reply; 48+ messages in thread
From: David Woodhouse @ 2008-08-08  9:33 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Thu, 2008-08-07 at 20:41 +0200, Jens Axboe wrote:
> I've queued this up for some testing, I'll add the merge support as
> well.

Thanks. Did you pull from my tree? If so I'll provide an incremental
patch. Otherwise I'll go back and recommit it with your suggested
changes.

> > diff --git a/block/blk-barrier.c b/block/blk-barrier.c
> > index a09ead1..29e60ff 100644
> > --- a/block/blk-barrier.c
> > +++ b/block/blk-barrier.c
> 
> Not sure why you are placing it here, it should probably just go into
> blk-core.c

Yeah, I vacillated about that for a while -- and even got as far as
moving it there, and back again. It's not really _core_ code either.
Since it started off almost identical to blkdev_issue_flush() I figured
it might as well sit next to it. But I'm happy to move it too.

> > +	/* Many callers won't care at all about the outcome. After all,
> > +	   this is just a hint to the underlying device. They'll just
> > +	   ignore errors completely. So the end_io function can be just
> > +	   a call to bio_put() */
> > +	if (!end_io)
> > +		end_io = (void *)&bio_put;
> 
> Please don't do that, that's pretty ugly.

OK.

> > @@ -1488,10 +1502,9 @@ void submit_bio(int rw, struct bio *bio)
> >  	 * If it's a regular read/write or a barrier with data attached,
> >  	 * go through the normal accounting stuff before submission.
> >  	 */
> > -	if (!bio_empty_barrier(bio)) {
> > +	if (!bio_dataless(bio)) {
> >  
> >  		BIO_BUG_ON(!bio->bi_size);
> > -		BIO_BUG_ON(!bio->bi_io_vec);
> >  
> >  		if (rw & WRITE) {
> >  			count_vm_events(PGPGOUT, count);
> 
> Zach suggested bio_has_data() instead, which I agree is a lot more
> readable.

OK.

> > diff --git a/block/elevator.c b/block/elevator.c
> > index ed6f8f3..bb26424 100644
> > --- a/block/elevator.c
> > +++ b/block/elevator.c
> > @@ -675,7 +675,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where,
> >  	if (q->ordcolor)
> >  		rq->cmd_flags |= REQ_ORDERED_COLOR;
> >  
> > -	if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
> > +	if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER | REQ_DISCARD)) {
> >  		/*
> >  		 * toggle ordered color
> >  		 */
> 
> Just make REQ_DISCARD set REQ_SOFTBARRIER? Do you care if this acts as a
> barrier or not?

Er, didn't you object to me setting REQ_SOFTBARRIER for discard
requests, a few lines up? Admittedly, I shouldn't need to do _both_, but
I think I need one or the other.

In the long term no, I don't care if it acts as a barrier. I just did
that until we implement the merge support for discard requests.

> >  static inline unsigned int bio_cur_sectors(struct bio *bio)
> >  {
> > +	if (unlikely(bio_discard(bio)))
> > +		return bio->bi_size >> 9;
> > +
> >  	if (bio->bi_vcnt)
> >  		return bio_iovec(bio)->bv_len >> 9;
> 
> Just make that
> 
>         if (bio->bi_vcnt)
>   		return bio_iovec(bio)->bv_len >> 9;
>         else
>                 return bio->bi_size >> 9;
> 
> and add a comment about it.

OK.

-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation




^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/5] [BLOCK] Add 'discard' request handling
  2008-08-08  9:33               ` David Woodhouse
@ 2008-08-08 10:30                 ` Jens Axboe
  2008-08-08 10:49                   ` Jens Axboe
  2008-08-08 10:52                   ` David Woodhouse
  0 siblings, 2 replies; 48+ messages in thread
From: Jens Axboe @ 2008-08-08 10:30 UTC (permalink / raw)
  To: David Woodhouse; +Cc: Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Fri, Aug 08 2008, David Woodhouse wrote:
> On Thu, 2008-08-07 at 20:41 +0200, Jens Axboe wrote:
> > I've queued this up for some testing, I'll add the merge support as
> > well.
> 
> Thanks. Did you pull from my tree? If so I'll provide an incremental
> patch. Otherwise I'll go back and recommit it with your suggested
> changes.

I did not, the final version will be different from the 1st one, so no
point in carrying that history. What I did this morning was add the
bio_has_data() support bits, here:

http://git.kernel.dk/?p=linux-2.6-block.git;a=shortlog;h=refs/heads/for-2.6.28

> > > diff --git a/block/blk-barrier.c b/block/blk-barrier.c
> > > index a09ead1..29e60ff 100644
> > > --- a/block/blk-barrier.c
> > > +++ b/block/blk-barrier.c
> > 
> > Not sure why you are placing it here, it should probably just go into
> > blk-core.c
> 
> Yeah, I vacillated about that for a while -- and even got as far as
> moving it there, and back again. It's not really _core_ code either.
> Since it started off almost identical to blkdev_issue_flush() I figured
> it might as well sit next to it. But I'm happy to move it too.

It doesn't matter a whole lot, so lets just keep it there. None of the
files are a perfect fit, and it would be silly to start a new one for
this.

> > > diff --git a/block/elevator.c b/block/elevator.c
> > > index ed6f8f3..bb26424 100644
> > > --- a/block/elevator.c
> > > +++ b/block/elevator.c
> > > @@ -675,7 +675,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where,
> > >  	if (q->ordcolor)
> > >  		rq->cmd_flags |= REQ_ORDERED_COLOR;
> > >  
> > > -	if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
> > > +	if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER | REQ_DISCARD)) {
> > >  		/*
> > >  		 * toggle ordered color
> > >  		 */
> > 
> > Just make REQ_DISCARD set REQ_SOFTBARRIER? Do you care if this acts as a
> > barrier or not?
> 
> Er, didn't you object to me setting REQ_SOFTBARRIER for discard
> requests, a few lines up? Admittedly, I shouldn't need to do _both_, but
> I think I need one or the other.

Not at all, please re-read that part. I objected to adding a strategy
hook just for setting cmd type and flags for discard. Basically I think
this boils down to the fact that we only really pass fs requests through
submit_bio(). Other types of requests are typically inserted directly
into the queue, if they aren't "normal" file system requests. Since we
want to do merging on discard requests, that isn't a good idea here.

The question here is whether you want discard to be a barrier or not.
Either you mark it as such when the rq is inited or you don't, don't
check for it explicitly in __elv_add_request().

> In the long term no, I don't care if it acts as a barrier. I just did
> that until we implement the merge support for discard requests.

OK

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/5] [BLOCK] Add 'discard' request handling
  2008-08-08 10:30                 ` Jens Axboe
@ 2008-08-08 10:49                   ` Jens Axboe
  2008-08-08 11:04                     ` David Woodhouse
  2008-08-08 10:52                   ` David Woodhouse
  1 sibling, 1 reply; 48+ messages in thread
From: Jens Axboe @ 2008-08-08 10:49 UTC (permalink / raw)
  To: David Woodhouse; +Cc: Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Fri, Aug 08 2008, Jens Axboe wrote:
> On Fri, Aug 08 2008, David Woodhouse wrote:
> > On Thu, 2008-08-07 at 20:41 +0200, Jens Axboe wrote:
> > > I've queued this up for some testing, I'll add the merge support as
> > > well.
> > 
> > Thanks. Did you pull from my tree? If so I'll provide an incremental
> > patch. Otherwise I'll go back and recommit it with your suggested
> > changes.
> 
> I did not, the final version will be different from the 1st one, so no
> point in carrying that history. What I did this morning was add the
> bio_has_data() support bits, here:

This is what I came up with. You don't need the request discard flag,
since you are using a specific request type for this. Additionally, we
can move the request init where it is done for fs requests
(init_req_from_bio) and get it set there. I also added the small bits
for merging. Nothing tested, so it may blow up :-)

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index a09ead1..1f58063 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -315,3 +315,56 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
 	return ret;
 }
 EXPORT_SYMBOL(blkdev_issue_flush);
+
+static void bio_discard_end_io(struct bio *bio, int err)
+{
+	bio_put(bio);
+}
+
+/**
+ * blkdev_issue_discard - queue a discard
+ * @bdev:	blockdev to issue discard for
+ * @sector:	start sector
+ * @nr_sects:	number of sectors to discard
+ * @end_io:	end_io function (or %NULL)
+ *
+ * Description:
+ *    Issue a discard request for the sectors in question. Caller can pass an
+ *    end_io function (which must call bio_put()), if they really want to see
+ *    the outcome. Most callers probably won't, and can just pass %NULL.
+ */
+int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+			 unsigned nr_sects, bio_end_io_t end_io)
+{
+	struct request_queue *q;
+	struct bio *bio;
+
+	if (bdev->bd_disk == NULL)
+		return -ENXIO;
+
+	q = bdev_get_queue(bdev);
+	if (!q)
+		return -ENXIO;
+
+	bio = bio_alloc(GFP_KERNEL, 0);
+	if (!bio)
+		return -ENOMEM;
+
+	/*
+	 * Many callers won't care at all about the outcome. After all,
+	 * this is just a hint to the underlying device. They'll just
+	 * ignore errors completely. So the end_io function can be just
+	 * a call to bio_put()
+	 */
+	if (end_io)
+		bio->bi_end_io = end_io;
+	else
+		bio->bi_end_io = bio_discard_end_io;
+
+	bio->bi_bdev = bdev;
+	bio->bi_sector = sector;
+	bio->bi_size = nr_sects << 9;
+	submit_bio(1 << BIO_RW_DISCARD, bio);
+	return 0;
+}
+EXPORT_SYMBOL(blkdev_issue_discard);
diff --git a/block/blk-core.c b/block/blk-core.c
index ff7ec49..f1344ea 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1068,7 +1068,10 @@ EXPORT_SYMBOL(blk_put_request);
 
 void init_request_from_bio(struct request *req, struct bio *bio)
 {
-	req->cmd_type = REQ_TYPE_FS;
+	if (bio_discard(bio))
+		req->cmd_type = REQ_TYPE_DISCARD;
+	else
+		req->cmd_type = REQ_TYPE_FS;
 
 	/*
 	 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
@@ -2013,12 +2016,15 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 	/* first two bits are identical in rq->cmd_flags and bio->bi_rw */
 	rq->cmd_flags |= (bio->bi_rw & 3);
 
-	rq->nr_phys_segments = bio_phys_segments(q, bio);
-	rq->nr_hw_segments = bio_hw_segments(q, bio);
+	if (bio_has_data(bio)) {
+		rq->nr_phys_segments = bio_phys_segments(q, bio);
+		rq->nr_hw_segments = bio_hw_segments(q, bio);
+		rq->buffer = bio_data(bio);
+	}
+
 	rq->current_nr_sectors = bio_cur_sectors(bio);
 	rq->hard_cur_sectors = rq->current_nr_sectors;
 	rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
-	rq->buffer = bio_data(bio);
 	rq->data_len = bio->bi_size;
 
 	rq->bio = rq->biotail = bio;
diff --git a/block/elevator.c b/block/elevator.c
index ed6f8f3..e704086 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -75,6 +75,13 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
 		return 0;
 
 	/*
+	 * Don't merge file system and non-file system requests
+	 */
+	if ((!bio_has_data(bio) && bio_has_data(rq->bio)) ||
+	    (bio_has_data(bio) && !bio_has_data(rq->bio)))
+		return 0;
+
+	/*
 	 * different data direction or already started, don't merge
 	 */
 	if (bio_data_dir(bio) != rq_data_dir(rq))
@@ -607,7 +614,6 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
 		break;
 
 	case ELEVATOR_INSERT_SORT:
-		BUG_ON(!blk_fs_request(rq));
 		rq->cmd_flags |= REQ_SORTED;
 		q->nr_sorted++;
 		if (rq_mergeable(rq)) {
diff --git a/include/linux/bio.h b/include/linux/bio.h
index dbeb66f..8dbc6c5 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -149,6 +149,8 @@ struct bio {
  * bit 2 -- barrier
  * bit 3 -- fail fast, don't want low level driver retries
  * bit 4 -- synchronous I/O hint: the block layer will unplug immediately
+ * bit 5 -- metadata request
+ * bit 6 -- discard sectors request
  */
 #define BIO_RW		0
 #define BIO_RW_AHEAD	1
@@ -156,6 +158,7 @@ struct bio {
 #define BIO_RW_FAILFAST	3
 #define BIO_RW_SYNC	4
 #define BIO_RW_META	5
+#define BIO_RW_DISCARD	6
 
 /*
  * upper 16 bits of bi_rw define the io priority of this bio
@@ -186,6 +189,7 @@ struct bio {
 #define bio_rw_ahead(bio)	((bio)->bi_rw & (1 << BIO_RW_AHEAD))
 #define bio_rw_meta(bio)	((bio)->bi_rw & (1 << BIO_RW_META))
 #define bio_empty_barrier(bio)	(bio_barrier(bio) && !bio_has_data(bio))
+#define bio_discard(bio)	((bio)->bi_rw & (1 << BIO_RW_DISCARD))
 
 static inline unsigned int bio_cur_sectors(struct bio *bio)
 {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e61f22b..73cd4da 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -56,6 +56,7 @@ enum rq_cmd_type_bits {
 	REQ_TYPE_PM_SHUTDOWN,		/* shutdown request */
 	REQ_TYPE_FLUSH,			/* flush request */
 	REQ_TYPE_SPECIAL,		/* driver defined type */
+	REQ_TYPE_DISCARD,		/* discard sectors request */
 	REQ_TYPE_LINUX_BLOCK,		/* generic block layer message */
 	/*
 	 * for ATA/ATAPI devices. this really doesn't belong here, ide should
@@ -522,6 +523,7 @@ enum {
 #define blk_pc_request(rq)	((rq)->cmd_type == REQ_TYPE_BLOCK_PC)
 #define blk_special_request(rq)	((rq)->cmd_type == REQ_TYPE_SPECIAL)
 #define blk_sense_request(rq)	((rq)->cmd_type == REQ_TYPE_SENSE)
+#define blk_discard_request(rq)	((rq)->cmd_type == REQ_TYPE_DISCARD)
 
 #define blk_noretry_request(rq)	((rq)->cmd_flags & REQ_FAILFAST)
 #define blk_rq_started(rq)	((rq)->cmd_flags & REQ_STARTED)
@@ -829,6 +831,8 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 }
 
 extern int blkdev_issue_flush(struct block_device *, sector_t *);
+extern int blkdev_issue_discard(struct block_device *, sector_t,
+				unsigned int, bio_end_io_t *);
 
 /*
 * command filter functions

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/5] [BLOCK] Add 'discard' request handling
  2008-08-08 10:49                   ` Jens Axboe
@ 2008-08-08 11:04                     ` David Woodhouse
  2008-08-08 11:20                       ` Jens Axboe
  0 siblings, 1 reply; 48+ messages in thread
From: David Woodhouse @ 2008-08-08 11:04 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Fri, 2008-08-08 at 12:49 +0200, Jens Axboe wrote:
> This is what I came up with. You don't need the request discard flag,
> since you are using a specific request type for this. Additionally, we
> can move the request init where it is done for fs requests
> (init_req_from_bio) and get it set there. 

Hmmm. (qv)

> I also added the small bits for merging.

Nice. I was _hoping_ it would be that easy, but didn't want to get
involved with that yet. One tweak we might want to make to that would be
to allow discard requests to be _dropped_ if there's a subsequent write
to the same sectors -- if we expect it to be common that we deallocate
sectors then re-use them again before the discard request hits the disk,
that might be an interesting optimisation.

-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation




^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/5] [BLOCK] Add 'discard' request handling
  2008-08-08 11:04                     ` David Woodhouse
@ 2008-08-08 11:20                       ` Jens Axboe
  0 siblings, 0 replies; 48+ messages in thread
From: Jens Axboe @ 2008-08-08 11:20 UTC (permalink / raw)
  To: David Woodhouse; +Cc: Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Fri, Aug 08 2008, David Woodhouse wrote:
> On Fri, 2008-08-08 at 12:49 +0200, Jens Axboe wrote:
> > This is what I came up with. You don't need the request discard flag,
> > since you are using a specific request type for this. Additionally, we
> > can move the request init where it is done for fs requests
> > (init_req_from_bio) and get it set there. 
> 
> Hmmm. (qv)
> 
> > I also added the small bits for merging.
> 
> Nice. I was _hoping_ it would be that easy, but didn't want to get
> involved with that yet. One tweak we might want to make to that would be
> to allow discard requests to be _dropped_ if there's a subsequent write
> to the same sectors -- if we expect it to be common that we deallocate
> sectors then re-use them again before the discard request hits the disk,
> that might be an interesting optimisation.

Merging should be fairly easy, for discarding discard (ahem) requests,
you would need to check for overlap. The merging doesn't do that, it'll
only look for boundary matches. So it would require some extra logic and
either runtime cost and/or an additional data structure to track that.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/5] [BLOCK] Add 'discard' request handling
  2008-08-08 10:30                 ` Jens Axboe
  2008-08-08 10:49                   ` Jens Axboe
@ 2008-08-08 10:52                   ` David Woodhouse
  2008-08-08 11:09                     ` Jens Axboe
  2008-08-08 14:22                     ` Matthew Wilcox
  1 sibling, 2 replies; 48+ messages in thread
From: David Woodhouse @ 2008-08-08 10:52 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Fri, 2008-08-08 at 12:30 +0200, Jens Axboe wrote:
> Not at all, please re-read that part. I objected to adding a strategy
> hook just for setting cmd type and flags for discard.

Ah, now I understand what you were saying. And I vacillated about that
too -- I actually suggested to Gilad that we didn't want it, but then
went ahead and did it anyway.

Adding the ->discard_fn hook provides two features:

Firstly, it allows an early 'abort' of discard requests where the device
doesn't support them -- there's no point in allocating the request and
passing it down to the device to be discarded there, when we can just
notice that there's no ->discard_fn and return immediately from
blkdev_issue_discard().

Secondly, it allows drivers to insert the appropriate command onto their
queue -- I expect that SCSI and ATA drives won't use REQ_TYPE_DISCARD,
just as they don't use REQ_TYPE_FLUSH. They'll use REQ_TYPE_BLOCK_PC
with whatever opcode is allocated for trim/punch.

We _could_ address those two differently -- maybe we could add a
QUEUE_FLAG_DISCARDS in q->queue_flags to indicate that discard requests
are supported, and we could make the driver 'translate' from
REQ_TYPE_DISCARD to whatever it actually wants to do. But that seems
strangely different to how we handle flushes, which is what I was basing
my implementation on.

-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/5] [BLOCK] Add 'discard' request handling
  2008-08-08 10:52                   ` David Woodhouse
@ 2008-08-08 11:09                     ` Jens Axboe
  2008-08-08 11:18                       ` Jens Axboe
  2008-08-08 14:22                     ` Matthew Wilcox
  1 sibling, 1 reply; 48+ messages in thread
From: Jens Axboe @ 2008-08-08 11:09 UTC (permalink / raw)
  To: David Woodhouse; +Cc: Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Fri, Aug 08 2008, David Woodhouse wrote:
> On Fri, 2008-08-08 at 12:30 +0200, Jens Axboe wrote:
> > Not at all, please re-read that part. I objected to adding a strategy
> > hook just for setting cmd type and flags for discard.
> 
> Ah, now I understand what you were saying. And I vacillated about that
> too -- I actually suggested to Gilad that we didn't want it, but then
> went ahead and did it anyway.
> 
> Adding the ->discard_fn hook provides two features:
> 
> Firstly, it allows an early 'abort' of discard requests where the device
> doesn't support them -- there's no point in allocating the request and
> passing it down to the device to be discarded there, when we can just
> notice that there's no ->discard_fn and return immediately from
> blkdev_issue_discard().
> 
> Secondly, it allows drivers to insert the appropriate command onto their
> queue -- I expect that SCSI and ATA drives won't use REQ_TYPE_DISCARD,
> just as they don't use REQ_TYPE_FLUSH. They'll use REQ_TYPE_BLOCK_PC
> with whatever opcode is allocated for trim/punch.
> 
> We _could_ address those two differently -- maybe we could add a
> QUEUE_FLAG_DISCARDS in q->queue_flags to indicate that discard requests
> are supported, and we could make the driver 'translate' from
> REQ_TYPE_DISCARD to whatever it actually wants to do. But that seems
> strangely different to how we handle flushes, which is what I was basing
> my implementation on.
> 

Given that discard requests may become quite often issued by drivers, it
does make sense to optimize for the case where we don't support them as
well. So perhaps the ->discard_fn() can be tolerated, though I'd rather
get rid of the ->issue_flush_fn instead :-)

For now, lets just go with the discard_fn, I'll update the patch again.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/5] [BLOCK] Add 'discard' request handling
  2008-08-08 11:09                     ` Jens Axboe
@ 2008-08-08 11:18                       ` Jens Axboe
  2008-08-08 11:29                         ` David Woodhouse
  0 siblings, 1 reply; 48+ messages in thread
From: Jens Axboe @ 2008-08-08 11:18 UTC (permalink / raw)
  To: David Woodhouse; +Cc: Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Fri, Aug 08 2008, Jens Axboe wrote:
> On Fri, Aug 08 2008, David Woodhouse wrote:
> > On Fri, 2008-08-08 at 12:30 +0200, Jens Axboe wrote:
> > > Not at all, please re-read that part. I objected to adding a strategy
> > > hook just for setting cmd type and flags for discard.
> > 
> > Ah, now I understand what you were saying. And I vacillated about that
> > too -- I actually suggested to Gilad that we didn't want it, but then
> > went ahead and did it anyway.
> > 
> > Adding the ->discard_fn hook provides two features:
> > 
> > Firstly, it allows an early 'abort' of discard requests where the device
> > doesn't support them -- there's no point in allocating the request and
> > passing it down to the device to be discarded there, when we can just
> > notice that there's no ->discard_fn and return immediately from
> > blkdev_issue_discard().
> > 
> > Secondly, it allows drivers to insert the appropriate command onto their
> > queue -- I expect that SCSI and ATA drives won't use REQ_TYPE_DISCARD,
> > just as they don't use REQ_TYPE_FLUSH. They'll use REQ_TYPE_BLOCK_PC
> > with whatever opcode is allocated for trim/punch.
> > 
> > We _could_ address those two differently -- maybe we could add a
> > QUEUE_FLAG_DISCARDS in q->queue_flags to indicate that discard requests
> > are supported, and we could make the driver 'translate' from
> > REQ_TYPE_DISCARD to whatever it actually wants to do. But that seems
> > strangely different to how we handle flushes, which is what I was basing
> > my implementation on.
> > 
> 
> Given that discard requests may become quite often issued by drivers, it
> does make sense to optimize for the case where we don't support them as
> well. So perhaps the ->discard_fn() can be tolerated, though I'd rather
> get rid of the ->issue_flush_fn instead :-)
> 
> For now, lets just go with the discard_fn, I'll update the patch again.

Actually, my memory is a bit shot, since I got rid of ->issue_flush_fn
in the 2.6.24 cycle. So it's already gone

So, instead, lets check for the appropriate queue flag in
blkdev_issue_flush() and in generic_make_request() when unrolling the
stack. The drive must then do the transformation in the ->prep_fn()
function.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/5] [BLOCK] Add 'discard' request handling
  2008-08-08 11:18                       ` Jens Axboe
@ 2008-08-08 11:29                         ` David Woodhouse
  2008-08-08 11:44                           ` Jens Axboe
  0 siblings, 1 reply; 48+ messages in thread
From: David Woodhouse @ 2008-08-08 11:29 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Fri, 2008-08-08 at 13:18 +0200, Jens Axboe wrote:
> Actually, my memory is a bit shot, since I got rid of ->issue_flush_fn
> in the 2.6.24 cycle. So it's already gone

It was q->prepare_flush_fn which the discard_fn is based on, and that's
still there in the for-2.6.28 tree you just pointed me at.

http://git.kernel.dk/?p=linux-2.6-block.git;a=blob;f=block/blk-barrier.c;h=a09ead19f9c5702a1ad76d709c54969176fe9e94;hb=6dc2b733b2f9605b48fdb7692fce5a3eafe241e4#l149

-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation




^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/5] [BLOCK] Add 'discard' request handling
  2008-08-08 11:29                         ` David Woodhouse
@ 2008-08-08 11:44                           ` Jens Axboe
  2008-08-08 11:47                             ` Jens Axboe
                                               ` (2 more replies)
  0 siblings, 3 replies; 48+ messages in thread
From: Jens Axboe @ 2008-08-08 11:44 UTC (permalink / raw)
  To: David Woodhouse; +Cc: Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Fri, Aug 08 2008, David Woodhouse wrote:
> On Fri, 2008-08-08 at 13:18 +0200, Jens Axboe wrote:
> > Actually, my memory is a bit shot, since I got rid of ->issue_flush_fn
> > in the 2.6.24 cycle. So it's already gone
> 
> It was q->prepare_flush_fn which the discard_fn is based on, and that's
> still there in the for-2.6.28 tree you just pointed me at.
> 
> http://git.kernel.dk/?p=linux-2.6-block.git;a=blob;f=block/blk-barrier.c;h=a09ead19f9c5702a1ad76d709c54969176fe9e94;hb=6dc2b733b2f9605b48fdb7692fce5a3eafe241e4#l149

Sigh indeed, ->issue_flush_fn() was the actual issuer, not the preparer.
Let me send a new diff. This adds the ->prepare_discard_fn() to do the
transformation, and also extends blkdev_issue_flush() to return error if
the IO was never queued because of some device in the stack not
supporting it.

Until we have overlap detection, I think we should make the discard
request a explicit barrier. Otherwise we could have problems with a
discard being passed by a write request and such.

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index a09ead1..b2add39 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -315,3 +315,78 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
 	return ret;
 }
 EXPORT_SYMBOL(blkdev_issue_flush);
+
+static void bio_discard_end_io(struct bio *bio, int err)
+{
+	if (err) {
+		if (err == -EOPNOTSUPP)
+			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+		clear_bit(BIO_UPTODATE, &bio->bi_flags);
+	}
+
+	bio_put(bio);
+}
+
+/**
+ * blkdev_issue_discard - queue a discard
+ * @bdev:	blockdev to issue discard for
+ * @sector:	start sector
+ * @nr_sects:	number of sectors to discard
+ * @end_io:	end_io function (or %NULL)
+ *
+ * Description:
+ *    Issue a discard request for the sectors in question. Caller can pass an
+ *    end_io function (which must call bio_put()), if they really want to see
+ *    the outcome. Most callers probably won't, and can just pass %NULL.
+ */
+int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+			 unsigned nr_sects, bio_end_io_t end_io)
+{
+	struct request_queue *q;
+	struct bio *bio;
+	int ret;
+
+	if (bdev->bd_disk == NULL)
+		return -ENXIO;
+
+	q = bdev_get_queue(bdev);
+	if (!q)
+		return -ENXIO;
+
+	if (!q->prepare_discard_fn)
+		return -EOPNOTSUPP;
+
+	bio = bio_alloc(GFP_KERNEL, 0);
+	if (!bio)
+		return -ENOMEM;
+
+	/*
+	 * Many callers won't care at all about the outcome. After all,
+	 * this is just a hint to the underlying device. They'll just
+	 * ignore errors completely. So the end_io function can be just
+	 * a call to bio_put()
+	 */
+	if (end_io)
+		bio->bi_end_io = end_io;
+	else
+		bio->bi_end_io = bio_discard_end_io;
+
+	bio->bi_bdev = bdev;
+	bio->bi_sector = sector;
+	bio->bi_size = nr_sects << 9;
+	bio_get(bio);
+	submit_bio(1 << BIO_RW_DISCARD, bio);
+
+	/*
+	 * Check if it was failed before being submitted
+	 */
+	ret = 0;
+	if (bio_flagged(bio, BIO_EOPNOTSUPP))
+		ret = -EOPNOTSUPP;
+	else if (!bio_flagged(bio, BIO_UPTODATE))
+		ret = -EIO;
+
+	bio_put(bio);
+	return ret;
+}
+EXPORT_SYMBOL(blkdev_issue_discard);
diff --git a/block/blk-core.c b/block/blk-core.c
index ff7ec49..ad519c9 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1068,7 +1068,10 @@ EXPORT_SYMBOL(blk_put_request);
 
 void init_request_from_bio(struct request *req, struct bio *bio)
 {
-	req->cmd_type = REQ_TYPE_FS;
+	if (bio_discard(bio))
+		req->cmd_type = REQ_TYPE_DISCARD;
+	else
+		req->cmd_type = REQ_TYPE_FS;
 
 	/*
 	 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
@@ -1407,7 +1410,8 @@ end_io:
 
 		if (bio_check_eod(bio, nr_sectors))
 			goto end_io;
-		if (bio_empty_barrier(bio) && !q->prepare_flush_fn) {
+		if ((bio_empty_barrier(bio) && !q->prepare_flush_fn) ||
+		    (bio_discard(bio) && !q->prepare_discard_fn)) {
 			err = -EOPNOTSUPP;
 			goto end_io;
 		}
diff --git a/block/blk-settings.c b/block/blk-settings.c
index dfc7701..c783d84 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -32,6 +32,23 @@ void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
 }
 EXPORT_SYMBOL(blk_queue_prep_rq);
 
+ /**
+ * blk_queue_setdiscard - set a discard_sectors function for queue
+ * @q:		queue
+ * @dfn:	discard function
+ *
+ * It's possible for a queue to register a discard callback which is used
+ * to transform a discard request into the appropriate type for the
+ * hardware. If none is registered, then discard requests are failed
+ * with EOPNOTSUPP.
+ *
+ */
+void blk_queue_set_discard(struct request_queue *q, prepare_discard_fn *dfn)
+{
+	q->prepare_discard_fn = dfn;
+}
+EXPORT_SYMBOL(blk_queue_set_discard);
+
 /**
  * blk_queue_merge_bvec - set a merge_bvec function for queue
  * @q:		queue
diff --git a/block/elevator.c b/block/elevator.c
index ed6f8f3..e704086 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -75,6 +75,13 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
 		return 0;
 
 	/*
+	 * Don't merge file system and non-file system requests
+	 */
+	if ((!bio_has_data(bio) && bio_has_data(rq->bio)) ||
+	    (bio_has_data(bio) && !bio_has_data(rq->bio)))
+		return 0;
+
+	/*
 	 * different data direction or already started, don't merge
 	 */
 	if (bio_data_dir(bio) != rq_data_dir(rq))
@@ -607,7 +614,6 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
 		break;
 
 	case ELEVATOR_INSERT_SORT:
-		BUG_ON(!blk_fs_request(rq));
 		rq->cmd_flags |= REQ_SORTED;
 		q->nr_sorted++;
 		if (rq_mergeable(rq)) {
diff --git a/include/linux/bio.h b/include/linux/bio.h
index dbeb66f..cc678d4 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -149,6 +149,8 @@ struct bio {
  * bit 2 -- barrier
  * bit 3 -- fail fast, don't want low level driver retries
  * bit 4 -- synchronous I/O hint: the block layer will unplug immediately
+ * bit 5 -- metadata request
+ * bit 6 -- discard sectors request
  */
 #define BIO_RW		0
 #define BIO_RW_AHEAD	1
@@ -156,6 +158,7 @@ struct bio {
 #define BIO_RW_FAILFAST	3
 #define BIO_RW_SYNC	4
 #define BIO_RW_META	5
+#define BIO_RW_DISCARD	6
 
 /*
  * upper 16 bits of bi_rw define the io priority of this bio
@@ -186,13 +189,18 @@ struct bio {
 #define bio_rw_ahead(bio)	((bio)->bi_rw & (1 << BIO_RW_AHEAD))
 #define bio_rw_meta(bio)	((bio)->bi_rw & (1 << BIO_RW_META))
 #define bio_empty_barrier(bio)	(bio_barrier(bio) && !bio_has_data(bio))
+#define bio_discard(bio)	((bio)->bi_rw & (1 << BIO_RW_DISCARD))
 
 static inline unsigned int bio_cur_sectors(struct bio *bio)
 {
+	/*
+	 * Discard requests don't have an bio_vec list, yet they still
+	 * refer to a range of data. Just use the given bio size in that case.
+	 */
 	if (bio->bi_vcnt)
 		return bio_iovec(bio)->bv_len >> 9;
-
-	return 0;
+	else
+		return bio_sectors(bio);
 }
 
 static inline void *bio_data(struct bio *bio)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e61f22b..1df6889 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -56,6 +56,7 @@ enum rq_cmd_type_bits {
 	REQ_TYPE_PM_SHUTDOWN,		/* shutdown request */
 	REQ_TYPE_FLUSH,			/* flush request */
 	REQ_TYPE_SPECIAL,		/* driver defined type */
+	REQ_TYPE_DISCARD,		/* discard sectors request */
 	REQ_TYPE_LINUX_BLOCK,		/* generic block layer message */
 	/*
 	 * for ATA/ATAPI devices. this really doesn't belong here, ide should
@@ -263,6 +264,7 @@ struct bvec_merge_data {
 typedef int (merge_bvec_fn) (struct request_queue *, struct bvec_merge_data *,
 			     struct bio_vec *);
 typedef void (prepare_flush_fn) (struct request_queue *, struct request *);
+typedef void (prepare_discard_fn) (struct request_queue *, struct request *);
 typedef void (softirq_done_fn)(struct request *);
 typedef int (dma_drain_needed_fn)(struct request *);
 
@@ -300,6 +302,7 @@ struct request_queue
 	unplug_fn		*unplug_fn;
 	merge_bvec_fn		*merge_bvec_fn;
 	prepare_flush_fn	*prepare_flush_fn;
+	prepare_discard_fn	*prepare_discard_fn;
 	softirq_done_fn		*softirq_done_fn;
 	dma_drain_needed_fn	*dma_drain_needed;
 
@@ -522,6 +525,7 @@ enum {
 #define blk_pc_request(rq)	((rq)->cmd_type == REQ_TYPE_BLOCK_PC)
 #define blk_special_request(rq)	((rq)->cmd_type == REQ_TYPE_SPECIAL)
 #define blk_sense_request(rq)	((rq)->cmd_type == REQ_TYPE_SENSE)
+#define blk_discard_request(rq)	((rq)->cmd_type == REQ_TYPE_DISCARD)
 
 #define blk_noretry_request(rq)	((rq)->cmd_flags & REQ_FAILFAST)
 #define blk_rq_started(rq)	((rq)->cmd_flags & REQ_STARTED)
@@ -788,6 +792,7 @@ extern void blk_queue_update_dma_alignment(struct request_queue *, int);
 extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
 extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *);
+extern void blk_queue_set_discard(struct request_queue *, prepare_discard_fn *);
 extern int blk_do_ordered(struct request_queue *, struct request **);
 extern unsigned blk_ordered_cur_seq(struct request_queue *);
 extern unsigned blk_ordered_req_seq(struct request *);
@@ -829,6 +834,8 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 }
 
 extern int blkdev_issue_flush(struct block_device *, sector_t *);
+extern int blkdev_issue_discard(struct block_device *, sector_t,
+				unsigned int, bio_end_io_t *);
 
 /*
 * command filter functions

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/5] [BLOCK] Add 'discard' request handling
  2008-08-08 11:44                           ` Jens Axboe
@ 2008-08-08 11:47                             ` Jens Axboe
  2008-08-08 12:05                             ` David Woodhouse
  2008-08-08 15:32                             ` David Woodhouse
  2 siblings, 0 replies; 48+ messages in thread
From: Jens Axboe @ 2008-08-08 11:47 UTC (permalink / raw)
  To: David Woodhouse; +Cc: Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Fri, Aug 08 2008, Jens Axboe wrote:
> Until we have overlap detection, I think we should make the discard
> request a explicit barrier. Otherwise we could have problems with a
> discard being passed by a write request and such.

But that wreaks havoc with merging, since a barrier

a) must be inserted at the back of the queue, and
b) always has merging disabled


-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/5] [BLOCK] Add 'discard' request handling
  2008-08-08 11:44                           ` Jens Axboe
  2008-08-08 11:47                             ` Jens Axboe
@ 2008-08-08 12:05                             ` David Woodhouse
  2008-08-08 12:13                               ` Jens Axboe
  2008-08-08 15:32                             ` David Woodhouse
  2 siblings, 1 reply; 48+ messages in thread
From: David Woodhouse @ 2008-08-08 12:05 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Fri, 2008-08-08 at 13:44 +0200, Jens Axboe wrote:
> 
> Sigh indeed, ->issue_flush_fn() was the actual issuer, not the preparer.
> Let me send a new diff. This adds the ->prepare_discard_fn() to do the
> transformation, and also extends blkdev_issue_flush() to return error if
> the IO was never queued because of some device in the stack not
> supporting it.

I think we still want the DISCARD flag in rq->cmd_flags. We can't rely
on rq->cmd_type == REQ_TYPE_DISCARD because that may well be changed to
something else (like REQ_TYPE_BLOCK_PC).

I was thinking of REQ_TYPE_DISCARD as a special case for some block
devices, just as REQ_TYPE_FLUSH is a special case for the ps3disk.c
driver (nobody else uses it). In fact, perhaps both of them should be
switched to REQ_TYPE_LINUX_BLOCK or REQ_TYPE_SPECIAL?

I was actually tempted to leave REQ_TYPE_DISCARD out of the original
patch 1/5 completely, and add it only as part of the FTL patch. I don't
think it's something for the core code to be caring about -- let the
flag in rq->cmd_flags do that.

> Until we have overlap detection, I think we should make the discard
> request a explicit barrier. Otherwise we could have problems with a
> discard being passed by a write request and such.

I think that making it an explicit barrier is a reasonable thing to do
for now. We can work on merges as a next step.

-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/5] [BLOCK] Add 'discard' request handling
  2008-08-08 12:05                             ` David Woodhouse
@ 2008-08-08 12:13                               ` Jens Axboe
  2008-08-08 12:32                                 ` David Woodhouse
  0 siblings, 1 reply; 48+ messages in thread
From: Jens Axboe @ 2008-08-08 12:13 UTC (permalink / raw)
  To: David Woodhouse; +Cc: Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Fri, Aug 08 2008, David Woodhouse wrote:
> On Fri, 2008-08-08 at 13:44 +0200, Jens Axboe wrote:
> > 
> > Sigh indeed, ->issue_flush_fn() was the actual issuer, not the preparer.
> > Let me send a new diff. This adds the ->prepare_discard_fn() to do the
> > transformation, and also extends blkdev_issue_flush() to return error if
> > the IO was never queued because of some device in the stack not
> > supporting it.
> 
> I think we still want the DISCARD flag in rq->cmd_flags. We can't rely
> on rq->cmd_type == REQ_TYPE_DISCARD because that may well be changed to
> something else (like REQ_TYPE_BLOCK_PC).

It's getting a bit nasty at this point. Discard requests should be
treated as file system requests, if you want merging and sorting on
them. For file system requests, drivers do the transformation inside the
driver once it receives the request. They don't change types in-flight,
they always stay the same.

So either we do the same for discard requests, or we treat them as we
would a REQ_TYPE_BLOCK_PC on insert. That means doing the transformation
first, then insert the request with elv_add_request(). That would be a
the end of the queue, so it would not either pass or get passed by file
system requests. You would not get merging in this case.

> I was thinking of REQ_TYPE_DISCARD as a special case for some block
> devices, just as REQ_TYPE_FLUSH is a special case for the ps3disk.c
> driver (nobody else uses it). In fact, perhaps both of them should be
> switched to REQ_TYPE_LINUX_BLOCK or REQ_TYPE_SPECIAL?

SPECIAL should go away, it has no definitive meaning. If we choose the
number two option above, it should just be a REQ_TYPE_LINUX_BLOCK with
the "official" cdb of REQ_LB_OP_DISCARD. It would behave the same that
approach.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/5] [BLOCK] Add 'discard' request handling
  2008-08-08 12:13                               ` Jens Axboe
@ 2008-08-08 12:32                                 ` David Woodhouse
  2008-08-08 12:37                                   ` Jens Axboe
  0 siblings, 1 reply; 48+ messages in thread
From: David Woodhouse @ 2008-08-08 12:32 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Fri, 2008-08-08 at 14:13 +0200, Jens Axboe wrote:
> On Fri, Aug 08 2008, David Woodhouse wrote:
> > On Fri, 2008-08-08 at 13:44 +0200, Jens Axboe wrote:
> > > 
> > > Sigh indeed, ->issue_flush_fn() was the actual issuer, not the preparer.
> > > Let me send a new diff. This adds the ->prepare_discard_fn() to do the
> > > transformation, and also extends blkdev_issue_flush() to return error if
> > > the IO was never queued because of some device in the stack not
> > > supporting it.
> > 
> > I think we still want the DISCARD flag in rq->cmd_flags. We can't rely
> > on rq->cmd_type == REQ_TYPE_DISCARD because that may well be changed to
> > something else (like REQ_TYPE_BLOCK_PC).
> 
> It's getting a bit nasty at this point. Discard requests should be
> treated as file system requests, if you want merging and sorting on
> them. 

Except that they _can't_ be, because file system requests are normal
reads and writes, and have just a single bit to indicate the direction.

Although I suppose we could declare that a discard request is just like
a write but with no buffer attached? I wasn't brave enough to attempt
that though, because I think it'll get painful and lead to oopses in
unexpected places.

So they're _not_ file system requests; they're different -- and they can
be marked by the appropriate bit in rq->cmd_flags, just like a flush
request is. There's no reason that has to prevent merging. The elevator
needs to learn about discard requests anyway, and it doesn't really
matter _how_ it recognises them as such. Surely?

-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/5] [BLOCK] Add 'discard' request handling
  2008-08-08 12:32                                 ` David Woodhouse
@ 2008-08-08 12:37                                   ` Jens Axboe
  2008-08-08 12:49                                     ` David Woodhouse
  0 siblings, 1 reply; 48+ messages in thread
From: Jens Axboe @ 2008-08-08 12:37 UTC (permalink / raw)
  To: David Woodhouse; +Cc: Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Fri, Aug 08 2008, David Woodhouse wrote:
> On Fri, 2008-08-08 at 14:13 +0200, Jens Axboe wrote:
> > On Fri, Aug 08 2008, David Woodhouse wrote:
> > > On Fri, 2008-08-08 at 13:44 +0200, Jens Axboe wrote:
> > > > 
> > > > Sigh indeed, ->issue_flush_fn() was the actual issuer, not the preparer.
> > > > Let me send a new diff. This adds the ->prepare_discard_fn() to do the
> > > > transformation, and also extends blkdev_issue_flush() to return error if
> > > > the IO was never queued because of some device in the stack not
> > > > supporting it.
> > > 
> > > I think we still want the DISCARD flag in rq->cmd_flags. We can't rely
> > > on rq->cmd_type == REQ_TYPE_DISCARD because that may well be changed to
> > > something else (like REQ_TYPE_BLOCK_PC).
> > 
> > It's getting a bit nasty at this point. Discard requests should be
> > treated as file system requests, if you want merging and sorting on
> > them. 
> 
> Except that they _can't_ be, because file system requests are normal
> reads and writes, and have just a single bit to indicate the direction.
> 
> Although I suppose we could declare that a discard request is just like
> a write but with no buffer attached? I wasn't brave enough to attempt
> that though, because I think it'll get painful and lead to oopses in
> unexpected places.

It was a proposition with two options - one of them being that we treat
them as file system requests, which is a pre-requisite if you want to
have merging supported. If merging isn'ta must, then I would advocate
option 2 as out lined in the previous mail.

> So they're _not_ file system requests; they're different -- and they can
> be marked by the appropriate bit in rq->cmd_flags, just like a flush
> request is. There's no reason that has to prevent merging. The elevator
> needs to learn about discard requests anyway, and it doesn't really
> matter _how_ it recognises them as such. Surely?

Sure, I'm well aware that they are different in nature, I'm talking
about how you treat them in the block layer. You can't have it both
ways - either you use the fs submit path and get the merging, or you
don't. Adding merging on the side for these special requests is not an
option, that would be silly. You would dual path that basically, plus it
would exclude eventually doing overlap detection with real fs requests.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/5] [BLOCK] Add 'discard' request handling
  2008-08-08 12:37                                   ` Jens Axboe
@ 2008-08-08 12:49                                     ` David Woodhouse
  2008-08-10  1:05                                       ` Jamie Lokier
  0 siblings, 1 reply; 48+ messages in thread
From: David Woodhouse @ 2008-08-08 12:49 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Fri, 2008-08-08 at 14:37 +0200, Jens Axboe wrote:
> On Fri, Aug 08 2008, David Woodhouse wrote:
> > On Fri, 2008-08-08 at 14:13 +0200, Jens Axboe wrote:
> > > On Fri, Aug 08 2008, David Woodhouse wrote:
> > > > On Fri, 2008-08-08 at 13:44 +0200, Jens Axboe wrote:
> > > > > 
> > > > > Sigh indeed, ->issue_flush_fn() was the actual issuer, not the preparer.
> > > > > Let me send a new diff. This adds the ->prepare_discard_fn() to do the
> > > > > transformation, and also extends blkdev_issue_flush() to return error if
> > > > > the IO was never queued because of some device in the stack not
> > > > > supporting it.
> > > > 
> > > > I think we still want the DISCARD flag in rq->cmd_flags. We can't rely
> > > > on rq->cmd_type == REQ_TYPE_DISCARD because that may well be changed to
> > > > something else (like REQ_TYPE_BLOCK_PC).
> > > 
> > > It's getting a bit nasty at this point. Discard requests should be
> > > treated as file system requests, if you want merging and sorting on
> > > them. 
> > 
> > Except that they _can't_ be, because file system requests are normal
> > reads and writes, and have just a single bit to indicate the direction.
> > 
> > Although I suppose we could declare that a discard request is just like
> > a write but with no buffer attached? I wasn't brave enough to attempt
> > that though, because I think it'll get painful and lead to oopses in
> > unexpected places.
> 
> It was a proposition with two options - one of them being that we treat
> them as file system requests, which is a pre-requisite if you want to
> have merging supported. If merging isn'ta must, then I would advocate
> option 2 as out lined in the previous mail.

It would be nice to have merging and sorting on them. Mostly, they could
be handled just like writes.

That includes the fact that you can discard writes from the queue if
they're going to be obsoleted by a subsequent write.

Can we get away with representing discards as 'writes without buffers'?

> > So they're _not_ file system requests; they're different -- and they can
> > be marked by the appropriate bit in rq->cmd_flags, just like a flush
> > request is. There's no reason that has to prevent merging. The elevator
> > needs to learn about discard requests anyway, and it doesn't really
> > matter _how_ it recognises them as such. Surely?
> 
> Sure, I'm well aware that they are different in nature, I'm talking
> about how you treat them in the block layer. You can't have it both
> ways - either you use the fs submit path and get the merging, or you
> don't.

We definitely want to use the fs submit path and get merging. The only
reason I didn't do that from the beginning is because I wanted to keep
things simple to start with.

My point is that we can use the fs submit path whether we use
REQ_TYPE_DISCARD or whether we use a DISCARD flag in rq->cmd_flags. That
detail really shouldn't matter, surely?

-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation




^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/5] [BLOCK] Add 'discard' request handling
  2008-08-08 12:49                                     ` David Woodhouse
@ 2008-08-10  1:05                                       ` Jamie Lokier
  0 siblings, 0 replies; 48+ messages in thread
From: Jamie Lokier @ 2008-08-10  1:05 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Jens Axboe, Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

David Woodhouse wrote:
> Can we get away with representing discards as 'writes without buffers'?

That seems like a very good idea.  They have the same requirements as
writes w.r.t. barriers and merges/discards.

I can imagine situations where a filesystem wants discards to work
with barrier requests in exactly the same way as writes.

-- Jamie

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/5] [BLOCK] Add 'discard' request handling
  2008-08-08 11:44                           ` Jens Axboe
  2008-08-08 11:47                             ` Jens Axboe
  2008-08-08 12:05                             ` David Woodhouse
@ 2008-08-08 15:32                             ` David Woodhouse
  2 siblings, 0 replies; 48+ messages in thread
From: David Woodhouse @ 2008-08-08 15:32 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Fri, 2008-08-08 at 13:44 +0200, Jens Axboe wrote:
> +       if (err) {
> +               if (err == -EOPNOTSUPP)
> +                       set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
> +               clear_bit(BIO_UPTODATE, &bio->bi_flags);
> +       }

If we're going to do that, we should do it whether the caller provides
their own end_io function or not. So we should probably call their
->end_io function from ours. I'll store it in ->bi_private and do just
that.

-- 
dwmw2


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/5] [BLOCK] Add 'discard' request handling
  2008-08-08 10:52                   ` David Woodhouse
  2008-08-08 11:09                     ` Jens Axboe
@ 2008-08-08 14:22                     ` Matthew Wilcox
  2008-08-08 14:27                       ` David Woodhouse
  1 sibling, 1 reply; 48+ messages in thread
From: Matthew Wilcox @ 2008-08-08 14:22 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Jens Axboe, Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Fri, Aug 08, 2008 at 11:52:11AM +0100, David Woodhouse wrote:
> Secondly, it allows drivers to insert the appropriate command onto their
> queue -- I expect that SCSI and ATA drives won't use REQ_TYPE_DISCARD,
> just as they don't use REQ_TYPE_FLUSH. They'll use REQ_TYPE_BLOCK_PC
> with whatever opcode is allocated for trim/punch.

I hadn't realised that was what you'd intended.  I was setting
REQ_TYPE_DISCARD in the ->prepare and then translating in the prep_fn.

If the current T10 and T13 proposals stand unmodified, it'll be a bit of a
pain to implement fully.  PUNCH currently requires creating a data buffer
and attaching it to the command.  When we get down into libata, we then
want to throw that data buffer away again because TRIM doesn't need a
data buffer (indeed to translate PUNCH to TRIM, we need to send a series
of commands because PUNCH allows you to specify a set of ranges whereas
TRIM only lets you send one range at a time).

Let's hope T10 agrees that only one range is necessary at a time.

-- 
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step."

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/5] [BLOCK] Add 'discard' request handling
  2008-08-08 14:22                     ` Matthew Wilcox
@ 2008-08-08 14:27                       ` David Woodhouse
  2008-08-08 14:34                         ` Matthew Wilcox
  0 siblings, 1 reply; 48+ messages in thread
From: David Woodhouse @ 2008-08-08 14:27 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Jens Axboe, Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Fri, 2008-08-08 at 08:22 -0600, Matthew Wilcox wrote:
> If the current T10 and T13 proposals stand unmodified, it'll be a bit of a
> pain to implement fully.  PUNCH currently requires creating a data buffer
> and attaching it to the command.  When we get down into libata, we then
> want to throw that data buffer away again because TRIM doesn't need a
> data buffer (indeed to translate PUNCH to TRIM, we need to send a series
> of commands because PUNCH allows you to specify a set of ranges whereas
> TRIM only lets you send one range at a time).
> 
> Let's hope T10 agrees that only one range is necessary at a time.

Weird. Does it really make sense for us to be 'translating' from generic
requests to SCSI and then from those to ATA? I hadn't realised our code
was that baroque...

-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation




^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/5] [BLOCK] Add 'discard' request handling
  2008-08-08 14:27                       ` David Woodhouse
@ 2008-08-08 14:34                         ` Matthew Wilcox
  0 siblings, 0 replies; 48+ messages in thread
From: Matthew Wilcox @ 2008-08-08 14:34 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Jens Axboe, Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Fri, Aug 08, 2008 at 03:27:29PM +0100, David Woodhouse wrote:
> On Fri, 2008-08-08 at 08:22 -0600, Matthew Wilcox wrote:
> > If the current T10 and T13 proposals stand unmodified, it'll be a bit of a
> > pain to implement fully.  PUNCH currently requires creating a data buffer
> > and attaching it to the command.  When we get down into libata, we then
> > want to throw that data buffer away again because TRIM doesn't need a
> > data buffer (indeed to translate PUNCH to TRIM, we need to send a series
> > of commands because PUNCH allows you to specify a set of ranges whereas
> > TRIM only lets you send one range at a time).
> > 
> > Let's hope T10 agrees that only one range is necessary at a time.
> 
> Weird. Does it really make sense for us to be 'translating' from generic
> requests to SCSI and then from those to ATA? I hadn't realised our code
> was that baroque...

We could do better by implementing an 'ata-drive' ULD for the scsi stack
that translated generic requests into ATA FIS's.  Possibly we could
do even better by moving ata out from inside the scsi stack entirely
... but that's already a kernelsummit discussion ;-)

-- 
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step."

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH 2/5] [FAT] Let the block device know when sectors can be discarded
  2008-08-05 17:25         ` David Woodhouse
  2008-08-06  9:25           ` [PATCH 1/5] [BLOCK] Add 'discard' request handling David Woodhouse
@ 2008-08-06  9:25           ` David Woodhouse
  2008-08-06 16:40             ` OGAWA Hirofumi
  2008-08-07 16:33             ` [PATCH 2/5, v2] " David Woodhouse
  2008-08-06  9:25           ` [PATCH 3/5] [MTD] Support 'discard sectors' operation in translation layer support core David Woodhouse
                             ` (2 subsequent siblings)
  4 siblings, 2 replies; 48+ messages in thread
From: David Woodhouse @ 2008-08-06  9:25 UTC (permalink / raw)
  To: Jens Axboe, hirofumi; +Cc: Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/fat/fatent.c |    6 ++++++
 1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 302e95c..5970fd6 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -6,6 +6,7 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/msdos_fs.h>
+#include <linux/blkdev.h>
 
 struct fatent_operations {
 	void (*ent_blocknr)(struct super_block *, int, int *, sector_t *);
@@ -540,6 +541,11 @@ int fat_free_clusters(struct inode *inode, int cluster)
 	fatent_init(&fatent);
 	lock_fat(sbi);
 	do {
+		/* Issue discard for the sectors we no longer care about */
+		blkdev_issue_discard(sb->s_bdev,
+				     fat_clus_to_blknr(sbi, cluster),
+				     sbi->sec_per_clus, NULL);
+
 		cluster = fat_ent_read(inode, &fatent, cluster);
 		if (cluster < 0) {
 			err = cluster;
-- 
1.5.5.1


-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation




^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [PATCH 2/5] [FAT] Let the block device know when sectors can be discarded
  2008-08-06  9:25           ` [PATCH 2/5] [FAT] Let the block device know when sectors can be discarded David Woodhouse
@ 2008-08-06 16:40             ` OGAWA Hirofumi
  2008-08-06 17:14               ` OGAWA Hirofumi
  2008-08-06 18:11               ` David Woodhouse
  2008-08-07 16:33             ` [PATCH 2/5, v2] " David Woodhouse
  1 sibling, 2 replies; 48+ messages in thread
From: OGAWA Hirofumi @ 2008-08-06 16:40 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Jens Axboe, Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

David Woodhouse <dwmw2@infradead.org> writes:

> @@ -540,6 +541,11 @@ int fat_free_clusters(struct inode *inode, int cluster)
>  	fatent_init(&fatent);
>  	lock_fat(sbi);
>  	do {
> +		/* Issue discard for the sectors we no longer care about */
> +		blkdev_issue_discard(sb->s_bdev,
> +				     fat_clus_to_blknr(sbi, cluster),
> +				     sbi->sec_per_clus, NULL);
> +
>  		cluster = fat_ent_read(inode, &fatent, cluster);
>  		if (cluster < 0) {
>  			err = cluster;

Perhaps, after checking cluster-chain corruption is better. Thanks.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
[just tweak]
Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
---

 fs/fat/fatent.c |    5 +++++
 1 file changed, 5 insertions(+)

diff -puN fs/fat/fatent.c~test fs/fat/fatent.c
--- linux-2.6/fs/fat/fatent.c~test	2008-08-07 01:24:23.000000000 +0900
+++ linux-2.6-hirofumi/fs/fat/fatent.c	2008-08-07 01:33:07.000000000 +0900
@@ -551,6 +551,11 @@ int fat_free_clusters(struct inode *inod
 			goto error;
 		}
 
+		/* Issue discard for the sectors we no longer care about */
+		blkdev_issue_discard(sb->s_bdev,
+				     fat_clus_to_blknr(sbi, fatent.entry),
+				     sbi->sec_per_clus, NULL);
+
 		ops->ent_put(&fatent, FAT_ENT_FREE);
 		if (sbi->free_clusters != -1) {
 			sbi->free_clusters++;
_
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 2/5] [FAT] Let the block device know when sectors can be discarded
  2008-08-06 16:40             ` OGAWA Hirofumi
@ 2008-08-06 17:14               ` OGAWA Hirofumi
  2008-08-06 18:11               ` David Woodhouse
  1 sibling, 0 replies; 48+ messages in thread
From: OGAWA Hirofumi @ 2008-08-06 17:14 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Jens Axboe, Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> writes:

> diff -puN fs/fat/fatent.c~test fs/fat/fatent.c
> --- linux-2.6/fs/fat/fatent.c~test	2008-08-07 01:24:23.000000000 +0900
> +++ linux-2.6-hirofumi/fs/fat/fatent.c	2008-08-07 01:33:07.000000000 +0900
> @@ -551,6 +551,11 @@ int fat_free_clusters(struct inode *inod
>  			goto error;
>  		}
>  
> +		/* Issue discard for the sectors we no longer care about */
> +		blkdev_issue_discard(sb->s_bdev,
> +				     fat_clus_to_blknr(sbi, fatent.entry),
> +				     sbi->sec_per_clus, NULL);
> +
>  		ops->ent_put(&fatent, FAT_ENT_FREE);
>  		if (sbi->free_clusters != -1) {
>  			sbi->free_clusters++;

Ah, blkdev_issue_discard() assumes blocksize is 512bytes, um...

blkdev_issue_discard() takes bytes, and instead add some helpers for sb
or inode?
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 2/5] [FAT] Let the block device know when sectors can be discarded
  2008-08-06 16:40             ` OGAWA Hirofumi
  2008-08-06 17:14               ` OGAWA Hirofumi
@ 2008-08-06 18:11               ` David Woodhouse
  2008-08-06 19:10                 ` OGAWA Hirofumi
  1 sibling, 1 reply; 48+ messages in thread
From: David Woodhouse @ 2008-08-06 18:11 UTC (permalink / raw)
  To: OGAWA Hirofumi
  Cc: Jens Axboe, Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Thu, 2008-08-07 at 01:40 +0900, OGAWA Hirofumi wrote:
> Perhaps, after checking cluster-chain corruption is better. Thanks.

That can be done, but not like that, I think. At the point you added the
blkdev_issue_discard() call, the value of 'cluster' has already changed.

So if the chain being freed is clusters 10, 11 and 12, your version of
the patch will attempt to discard clusters 11, 12 and 0xFFFF (EOF).

On Thu, 2008-08-07 at 02:14 +0900, OGAWA Hirofumi wrote:
> Ah, blkdev_issue_discard() assumes blocksize is 512bytes, um...

Doesn't everything? I did consider using q->hardsect_size, but decided
that was probably the wrong thing to do. I don't know. Jens?

> blkdev_issue_discard() takes bytes,

-EPARSE. It takes sectors at the moment -- do you mean you _want_ it to
take bytes? 

> and instead add some helpers for sb or inode?

I suppose we could; I'm not really so sure we need them.
inode->i_sb->s_bdev isn't exactly hard...

-- 
dwmw2

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 2/5] [FAT] Let the block device know when sectors can be discarded
  2008-08-06 18:11               ` David Woodhouse
@ 2008-08-06 19:10                 ` OGAWA Hirofumi
  2008-08-06 19:50                   ` OGAWA Hirofumi
  2008-08-06 21:37                   ` David Woodhouse
  0 siblings, 2 replies; 48+ messages in thread
From: OGAWA Hirofumi @ 2008-08-06 19:10 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Jens Axboe, Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

David Woodhouse <dwmw2@infradead.org> writes:

> On Thu, 2008-08-07 at 01:40 +0900, OGAWA Hirofumi wrote:
>> Perhaps, after checking cluster-chain corruption is better. Thanks.
>
> That can be done, but not like that, I think. At the point you added the
> blkdev_issue_discard() call, the value of 'cluster' has already changed.
>
> So if the chain being freed is clusters 10, 11 and 12, your version of
> the patch will attempt to discard clusters 11, 12 and 0xFFFF (EOF).

+				     fat_clus_to_blknr(sbi, fatent.entry),

No, no. I used fatent.entry, not cluster.

> On Thu, 2008-08-07 at 02:14 +0900, OGAWA Hirofumi wrote:
>> Ah, blkdev_issue_discard() assumes blocksize is 512bytes, um...
>
> Doesn't everything? I did consider using q->hardsect_size, but decided
> that was probably the wrong thing to do. I don't know. Jens?

My point is fs-blocksize vs hard sector size.

In FAT patch, it's passing the number of fs-blocks, not number of hard
sectors.

>> blkdev_issue_discard() takes bytes,
>
> -EPARSE. It takes sectors at the moment -- do you mean you _want_ it to
> take bytes? 

Yes and no. I guess bytes or _fs_-blocksize is good interface for fs.

>> and instead add some helpers for sb or inode?
>
> I suppose we could; I'm not really so sure we need them.
> inode->i_sb->s_bdev isn't exactly hard...
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 2/5] [FAT] Let the block device know when sectors can be discarded
  2008-08-06 19:10                 ` OGAWA Hirofumi
@ 2008-08-06 19:50                   ` OGAWA Hirofumi
  2008-08-06 20:10                     ` OGAWA Hirofumi
  2008-08-06 21:37                   ` David Woodhouse
  1 sibling, 1 reply; 48+ messages in thread
From: OGAWA Hirofumi @ 2008-08-06 19:50 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Jens Axboe, Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> writes:

>> On Thu, 2008-08-07 at 02:14 +0900, OGAWA Hirofumi wrote:
>>> Ah, blkdev_issue_discard() assumes blocksize is 512bytes, um...
>>
>> Doesn't everything? I did consider using q->hardsect_size, but decided
>> that was probably the wrong thing to do. I don't know. Jens?
>
> My point is fs-blocksize vs hard sector size.
>
> In FAT patch, it's passing the number of fs-blocks, not number of hard
> sectors.

Ah, bio->bi_sector has same issue too.
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 2/5] [FAT] Let the block device know when sectors can be discarded
  2008-08-06 19:50                   ` OGAWA Hirofumi
@ 2008-08-06 20:10                     ` OGAWA Hirofumi
  0 siblings, 0 replies; 48+ messages in thread
From: OGAWA Hirofumi @ 2008-08-06 20:10 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Jens Axboe, Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> writes:

> OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> writes:
>
>>> On Thu, 2008-08-07 at 02:14 +0900, OGAWA Hirofumi wrote:
>>>> Ah, blkdev_issue_discard() assumes blocksize is 512bytes, um...
>>>
>>> Doesn't everything? I did consider using q->hardsect_size, but decided
>>> that was probably the wrong thing to do. I don't know. Jens?
>>
>> My point is fs-blocksize vs hard sector size.
>>
>> In FAT patch, it's passing the number of fs-blocks, not number of hard
>> sectors.
>
> Ah, bio->bi_sector has same issue too.

I'm not sure whether it's good interface. However, we would have to
convert like following from fs-blocksize to 512...

int sb_issue_discard(sruct super_block *sb, sector_t blocknr, unsigned blocks)
{
	sector_t sector = blocknr << (sb->s_blocksize_bits - 9);
        /*
	 * this should check overflow, or caller should?
         * and there is no point to sectors based on 512 temporarily,
         * so we should pass bytes directly?
         */
	unsigned nr_sects = blocks << (sb->s_blocksize_bits - 9);
	return blkdev_issue_discard(sb->s_bdev, sector, nr_sects);
}
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 2/5] [FAT] Let the block device know when sectors can be discarded
  2008-08-06 19:10                 ` OGAWA Hirofumi
  2008-08-06 19:50                   ` OGAWA Hirofumi
@ 2008-08-06 21:37                   ` David Woodhouse
  2008-08-07 16:09                     ` David Woodhouse
  1 sibling, 1 reply; 48+ messages in thread
From: David Woodhouse @ 2008-08-06 21:37 UTC (permalink / raw)
  To: OGAWA Hirofumi
  Cc: Jens Axboe, Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Thu, 2008-08-07 at 04:10 +0900, OGAWA Hirofumi wrote:
> David Woodhouse <dwmw2@infradead.org> writes:
> 
> > On Thu, 2008-08-07 at 01:40 +0900, OGAWA Hirofumi wrote:
> >> Perhaps, after checking cluster-chain corruption is better. Thanks.
> >
> > That can be done, but not like that, I think. At the point you added the
> > blkdev_issue_discard() call, the value of 'cluster' has already changed.
> >
> > So if the chain being freed is clusters 10, 11 and 12, your version of
> > the patch will attempt to discard clusters 11, 12 and 0xFFFF (EOF).
> 
> +				     fat_clus_to_blknr(sbi, fatent.entry),
> 
> No, no. I used fatent.entry, not cluster.

Ah, OK. Sorry, I misread it :)

> > On Thu, 2008-08-07 at 02:14 +0900, OGAWA Hirofumi wrote:
> >> Ah, blkdev_issue_discard() assumes blocksize is 512bytes, um...
> >
> > Doesn't everything? I did consider using q->hardsect_size, but decided
> > that was probably the wrong thing to do. I don't know. Jens?
> 
> My point is fs-blocksize vs hard sector size.
> 
> In FAT patch, it's passing the number of fs-blocks, not number of hard
> sectors.
> 
> >> blkdev_issue_discard() takes bytes,
> >
> > -EPARSE. It takes sectors at the moment -- do you mean you _want_ it to
> > take bytes? 
> 
> Yes and no. I guess bytes or _fs_-blocksize is good interface for fs.

OK. I'll do the latter; thank you.

-- 
dwmw2


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 2/5] [FAT] Let the block device know when sectors can be discarded
  2008-08-06 21:37                   ` David Woodhouse
@ 2008-08-07 16:09                     ` David Woodhouse
  0 siblings, 0 replies; 48+ messages in thread
From: David Woodhouse @ 2008-08-07 16:09 UTC (permalink / raw)
  To: OGAWA Hirofumi
  Cc: Jens Axboe, Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

On Wed, 2008-08-06 at 22:37 +0100, David Woodhouse wrote:
> > Yes and no. I guess bytes or _fs_-blocksize is good interface for fs.
> 
> OK. I'll do the latter; thank you.

Incremental patch over what I posted before...

diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 5970fd6..a8b10ac 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -541,11 +541,6 @@ int fat_free_clusters(struct inode *inode, int cluster)
 	fatent_init(&fatent);
 	lock_fat(sbi);
 	do {
-		/* Issue discard for the sectors we no longer care about */
-		blkdev_issue_discard(sb->s_bdev,
-				     fat_clus_to_blknr(sbi, cluster),
-				     sbi->sec_per_clus, NULL);
-
 		cluster = fat_ent_read(inode, &fatent, cluster);
 		if (cluster < 0) {
 			err = cluster;
@@ -557,6 +552,10 @@ int fat_free_clusters(struct inode *inode, int cluster)
 			goto error;
 		}
 
+		/* Issue discard for the sectors we no longer care about */
+		sb_issue_discard(sb, fat_clus_to_blknr(sbi, fatent.entry),
+				 sbi->sec_per_clus, NULL);
+
 		ops->ent_put(&fatent, FAT_ENT_FREE);
 		if (sbi->free_clusters != -1) {
 			sbi->free_clusters++;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3f077a5..9b33909 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -840,6 +840,15 @@ extern int blkdev_issue_flush(struct block_device *, sector_t *);
 extern int blkdev_issue_discard(struct block_device *, sector_t sector,
 				unsigned nr_sects, bio_end_io_t *);
 
+static inline int sb_issue_discard(struct super_block *sb,
+				   sector_t block, unsigned nr_blocks,
+				   bio_end_io_t *endio)
+{
+	block <<= (sb->s_blocksize_bits - 9);
+	nr_blocks <<= (sb->s_blocksize_bits - 9);
+	return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, endio);
+}
+
 /*
 * command filter functions
 */
diff --git a/mm/bounce.c b/mm/bounce.c
index b6d2d0f..1cfe193 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -267,7 +267,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
 	/*
 	 * Data-less bio, nothing to bounce
 	 */
-	if (bio_empty_barrier(*bio_orig))
+	if (bio_dataless(*bio_orig))
 		return;
 
 	/*


-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation




^ permalink raw reply related	[flat|nested] 48+ messages in thread

* [PATCH 2/5, v2] [FAT] Let the block device know when sectors can be discarded
  2008-08-06  9:25           ` [PATCH 2/5] [FAT] Let the block device know when sectors can be discarded David Woodhouse
  2008-08-06 16:40             ` OGAWA Hirofumi
@ 2008-08-07 16:33             ` David Woodhouse
  1 sibling, 0 replies; 48+ messages in thread
From: David Woodhouse @ 2008-08-07 16:33 UTC (permalink / raw)
  To: Jens Axboe; +Cc: hirofumi, Andrew Morton, Ric Wheeler, linux-fsdevel, gilad


[hirofumi@mail.parknet.co.jp: discard _after_ checking for corrupt chains]

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
---
 fs/fat/fatent.c |    5 +++++
 1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 302e95c..a8b10ac 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -6,6 +6,7 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/msdos_fs.h>
+#include <linux/blkdev.h>
 
 struct fatent_operations {
 	void (*ent_blocknr)(struct super_block *, int, int *, sector_t *);
@@ -551,6 +552,10 @@ int fat_free_clusters(struct inode *inode, int cluster)
 			goto error;
 		}
 
+		/* Issue discard for the sectors we no longer care about */
+		sb_issue_discard(sb, fat_clus_to_blknr(sbi, fatent.entry),
+				 sbi->sec_per_clus, NULL);
+
 		ops->ent_put(&fatent, FAT_ENT_FREE);
 		if (sbi->free_clusters != -1) {
 			sbi->free_clusters++;
-- 
1.5.5.1


-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation




^ permalink raw reply related	[flat|nested] 48+ messages in thread

* [PATCH 3/5] [MTD] Support 'discard sectors' operation in translation layer support core
  2008-08-05 17:25         ` David Woodhouse
  2008-08-06  9:25           ` [PATCH 1/5] [BLOCK] Add 'discard' request handling David Woodhouse
  2008-08-06  9:25           ` [PATCH 2/5] [FAT] Let the block device know when sectors can be discarded David Woodhouse
@ 2008-08-06  9:25           ` David Woodhouse
  2008-08-06  9:25           ` [PATCH 4/5] [MTD] [FTL] Support 'discard sectors' operation David Woodhouse
  2008-08-06  9:25           ` [PATCH 5/5] [BLOCK] Fix up comments about matching flags between bio and rq David Woodhouse
  4 siblings, 0 replies; 48+ messages in thread
From: David Woodhouse @ 2008-08-06  9:25 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/mtd/mtd_blkdevs.c    |   13 +++++++++++++
 include/linux/mtd/blktrans.h |    2 ++
 2 files changed, 15 insertions(+), 0 deletions(-)

diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 9ff007c..0ef1d4b 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -32,6 +32,13 @@ struct mtd_blkcore_priv {
 	spinlock_t queue_lock;
 };
 
+static int blktrans_discard_request(struct request_queue *q,
+				    struct request *req)
+{
+	req->cmd_type = REQ_TYPE_DISCARD;
+	return 0;
+}
+
 static int do_blktrans_request(struct mtd_blktrans_ops *tr,
 			       struct mtd_blktrans_dev *dev,
 			       struct request *req)
@@ -44,6 +51,9 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
 
 	buf = req->buffer;
 
+	if (req->cmd_type == REQ_TYPE_DISCARD)
+		return !tr->discard(dev, block, nsect);
+
 	if (!blk_fs_request(req))
 		return 0;
 
@@ -367,6 +377,9 @@ int register_mtd_blktrans(struct mtd_blktrans_ops *tr)
 
 	tr->blkcore_priv->rq->queuedata = tr;
 	blk_queue_hardsect_size(tr->blkcore_priv->rq, tr->blksize);
+	if (tr->discard)
+		blk_queue_discard(tr->blkcore_priv->rq, blktrans_discard_request);
+
 	tr->blkshift = ffs(tr->blksize) - 1;
 
 	tr->blkcore_priv->thread = kthread_run(mtd_blktrans_thread, tr,
diff --git a/include/linux/mtd/blktrans.h b/include/linux/mtd/blktrans.h
index 310e616..8b4aa05 100644
--- a/include/linux/mtd/blktrans.h
+++ b/include/linux/mtd/blktrans.h
@@ -41,6 +41,8 @@ struct mtd_blktrans_ops {
 		    unsigned long block, char *buffer);
 	int (*writesect)(struct mtd_blktrans_dev *dev,
 		     unsigned long block, char *buffer);
+	int (*discard)(struct mtd_blktrans_dev *dev,
+		       unsigned long block, unsigned nr_blocks);
 
 	/* Block layer ioctls */
 	int (*getgeo)(struct mtd_blktrans_dev *dev, struct hd_geometry *geo);
-- 
1.5.5.1


-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation




^ permalink raw reply related	[flat|nested] 48+ messages in thread

* [PATCH 4/5] [MTD] [FTL] Support 'discard sectors' operation.
  2008-08-05 17:25         ` David Woodhouse
                             ` (2 preceding siblings ...)
  2008-08-06  9:25           ` [PATCH 3/5] [MTD] Support 'discard sectors' operation in translation layer support core David Woodhouse
@ 2008-08-06  9:25           ` David Woodhouse
  2008-08-06  9:25           ` [PATCH 5/5] [BLOCK] Fix up comments about matching flags between bio and rq David Woodhouse
  4 siblings, 0 replies; 48+ messages in thread
From: David Woodhouse @ 2008-08-06  9:25 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

We can benefit from knowing that the file system no longer cares about
the contents of certain sectors, by throwing them away immediately and
then never having to garbage collect them, and using the extra free
space to make our operations more efficient. Do so.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/mtd/ftl.c |   24 ++++++++++++++++++++++++
 1 files changed, 24 insertions(+), 0 deletions(-)

diff --git a/drivers/mtd/ftl.c b/drivers/mtd/ftl.c
index f34f20c..9bf581c 100644
--- a/drivers/mtd/ftl.c
+++ b/drivers/mtd/ftl.c
@@ -1005,6 +1005,29 @@ static int ftl_writesect(struct mtd_blktrans_dev *dev,
 	return ftl_write((void *)dev, buf, block, 1);
 }
 
+static int ftl_discardsect(struct mtd_blktrans_dev *dev,
+			   unsigned long sector, unsigned nr_sects)
+{
+	partition_t *part = (void *)dev;
+	uint32_t bsize = 1 << part->header.EraseUnitSize;
+
+	DEBUG(1, "FTL erase sector %ld for %d sectors\n",
+	      sector, nr_sects);
+
+	while (nr_sects) {
+		uint32_t old_addr = part->VirtualBlockMap[sector];
+		if (old_addr != 0xffffffff) {
+			part->VirtualBlockMap[sector] = 0xffffffff;
+			part->EUNInfo[old_addr/bsize].Deleted++;
+			if (set_bam_entry(part, old_addr, 0))
+				return -EIO;
+		}
+		nr_sects--;
+		sector++;
+	}
+
+	return 0;
+}
 /*====================================================================*/
 
 static void ftl_freepart(partition_t *part)
@@ -1069,6 +1092,7 @@ static struct mtd_blktrans_ops ftl_tr = {
 	.blksize 	= SECTOR_SIZE,
 	.readsect	= ftl_readsect,
 	.writesect	= ftl_writesect,
+	.discard	= ftl_discardsect,
 	.getgeo		= ftl_getgeo,
 	.add_mtd	= ftl_add_mtd,
 	.remove_dev	= ftl_remove_dev,
-- 
1.5.5.1


-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation




^ permalink raw reply related	[flat|nested] 48+ messages in thread

* [PATCH 5/5] [BLOCK] Fix up comments about matching flags between bio and rq
  2008-08-05 17:25         ` David Woodhouse
                             ` (3 preceding siblings ...)
  2008-08-06  9:25           ` [PATCH 4/5] [MTD] [FTL] Support 'discard sectors' operation David Woodhouse
@ 2008-08-06  9:25           ` David Woodhouse
  4 siblings, 0 replies; 48+ messages in thread
From: David Woodhouse @ 2008-08-06  9:25 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Andrew Morton, Ric Wheeler, linux-fsdevel, gilad

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 block/blk-core.c       |    7 ++-----
 include/linux/bio.h    |    4 ++--
 include/linux/blkdev.h |    6 +++---
 3 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 0cb993a..0754a6f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -622,10 +622,6 @@ blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask)
 
 	blk_rq_init(q, rq);
 
-	/*
-	 * first three bits are identical in rq->cmd_flags and bio->bi_rw,
-	 * see bio.h and blkdev.h
-	 */
 	rq->cmd_flags = rw | REQ_ALLOCED;
 
 	if (priv) {
@@ -2027,7 +2023,8 @@ EXPORT_SYMBOL_GPL(blk_end_request_callback);
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 		     struct bio *bio)
 {
-	/* first two bits are identical in rq->cmd_flags and bio->bi_rw */
+	/* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw, and
+	   we want BIO_RW_AHEAD (bit 1) to imply REQ_FAILFAST (bit 1). */
 	rq->cmd_flags |= (bio->bi_rw & 3);
 
 	rq->nr_phys_segments = bio_phys_segments(q, bio);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 67cb2ee..83f8260 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -152,8 +152,8 @@ struct bio {
  * bit 5 --
  * bit 6 -- discard sectors (trim)
  */
-#define BIO_RW		0
-#define BIO_RW_AHEAD	1
+#define BIO_RW		0	/* Must match RW in req flags (blkdev.h) */
+#define BIO_RW_AHEAD	1	/* Must match FAILFAST in req flags */
 #define BIO_RW_BARRIER	2
 #define BIO_RW_FAILFAST	3
 #define BIO_RW_SYNC	4
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9e3adc4..3f077a5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -85,11 +85,11 @@ enum {
 };
 
 /*
- * request type modified bits. first three bits match BIO_RW* bits, important
+ * request type modified bits. first two bits match BIO_RW* bits, important
  */
 enum rq_flag_bits {
-	__REQ_RW,		/* not set, read. set, write */
-	__REQ_FAILFAST,		/* no low level driver retries */
+	__REQ_RW,		/* not set, read. set, write. SEE ABOVE */
+	__REQ_FAILFAST,		/* no low level driver retries. SEE ABOVE */
 	__REQ_DISCARD,		/* request to discard sectors */
 	__REQ_SORTED,		/* elevator knows about this request */
 	__REQ_SOFTBARRIER,	/* may not be passed by ioscheduler */
-- 
1.5.5.1


-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation




^ permalink raw reply related	[flat|nested] 48+ messages in thread

end of thread, other threads:[~2008-08-10  1:05 UTC | newest]

Thread overview: 48+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <488B7281.4020007@gmail.com>
     [not found] ` <20080726130200.f541e604.akpm@linux-foundation.org>
2008-08-05  1:45   ` [RFC] 'discard sectors' block request David Woodhouse
2008-08-05  1:59     ` Andrew Morton
2008-08-05  9:01       ` David Woodhouse
2008-08-05 11:42     ` Jens Axboe
2008-08-05 13:32       ` David Woodhouse
2008-08-05 14:21         ` Ric Wheeler
2008-08-05 16:29       ` David Woodhouse
2008-08-05 17:25         ` David Woodhouse
2008-08-06  9:25           ` [PATCH 1/5] [BLOCK] Add 'discard' request handling David Woodhouse
2008-08-06 16:19             ` OGAWA Hirofumi
2008-08-06 18:18               ` David Woodhouse
2008-08-06 19:28                 ` OGAWA Hirofumi
2008-08-07 16:32             ` [PATCH 1/5, v2] " David Woodhouse
2008-08-07 18:41             ` [PATCH 1/5] " Jens Axboe
2008-08-08  9:33               ` David Woodhouse
2008-08-08 10:30                 ` Jens Axboe
2008-08-08 10:49                   ` Jens Axboe
2008-08-08 11:04                     ` David Woodhouse
2008-08-08 11:20                       ` Jens Axboe
2008-08-08 10:52                   ` David Woodhouse
2008-08-08 11:09                     ` Jens Axboe
2008-08-08 11:18                       ` Jens Axboe
2008-08-08 11:29                         ` David Woodhouse
2008-08-08 11:44                           ` Jens Axboe
2008-08-08 11:47                             ` Jens Axboe
2008-08-08 12:05                             ` David Woodhouse
2008-08-08 12:13                               ` Jens Axboe
2008-08-08 12:32                                 ` David Woodhouse
2008-08-08 12:37                                   ` Jens Axboe
2008-08-08 12:49                                     ` David Woodhouse
2008-08-10  1:05                                       ` Jamie Lokier
2008-08-08 15:32                             ` David Woodhouse
2008-08-08 14:22                     ` Matthew Wilcox
2008-08-08 14:27                       ` David Woodhouse
2008-08-08 14:34                         ` Matthew Wilcox
2008-08-06  9:25           ` [PATCH 2/5] [FAT] Let the block device know when sectors can be discarded David Woodhouse
2008-08-06 16:40             ` OGAWA Hirofumi
2008-08-06 17:14               ` OGAWA Hirofumi
2008-08-06 18:11               ` David Woodhouse
2008-08-06 19:10                 ` OGAWA Hirofumi
2008-08-06 19:50                   ` OGAWA Hirofumi
2008-08-06 20:10                     ` OGAWA Hirofumi
2008-08-06 21:37                   ` David Woodhouse
2008-08-07 16:09                     ` David Woodhouse
2008-08-07 16:33             ` [PATCH 2/5, v2] " David Woodhouse
2008-08-06  9:25           ` [PATCH 3/5] [MTD] Support 'discard sectors' operation in translation layer support core David Woodhouse
2008-08-06  9:25           ` [PATCH 4/5] [MTD] [FTL] Support 'discard sectors' operation David Woodhouse
2008-08-06  9:25           ` [PATCH 5/5] [BLOCK] Fix up comments about matching flags between bio and rq David Woodhouse

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).