* [PATCH 1/9] ext4: ext4_inode_info diet
2012-09-13 15:01 [PATCH 0/9] ext4: Bunch of DIO/AIO fixes V2 Dmitry Monakhov
@ 2012-09-13 15:01 ` Dmitry Monakhov
2012-09-13 15:01 ` [PATCH 2/9] ext4: completed_io locking cleanup V2 Dmitry Monakhov
` (7 subsequent siblings)
8 siblings, 0 replies; 12+ messages in thread
From: Dmitry Monakhov @ 2012-09-13 15:01 UTC (permalink / raw)
To: linux-ext4; +Cc: tytso, jack, wenqing.lz, Dmitry Monakhov
The generic inode has an unused i_private pointer which can be used as cur_aio_dio
storage.
TODO: if cur_aio_dio is passed as an argument to get_block_t, this will allow
concurrent AIO/DIO requests.
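For readers skimming the series, the pattern the diff below introduces is just a
one-line accessor macro over the generic i_private field plus a cast at the read
sites. A condensed sketch (names taken directly from the patch):

    /* ext4.h: reuse the otherwise unused generic field */
    #define EXT4_CUR_AIO_DIO(inode)  (inode)->i_private

    /* writer side (ext4_ext_direct_IO): publish/clear the current io_end */
    EXT4_CUR_AIO_DIO(inode) = iocb->private;  /* before __blockdev_direct_IO() */
    EXT4_CUR_AIO_DIO(inode) = NULL;           /* once the request is submitted */

    /* reader side (ext4_ext_map_blocks and friends) */
    ext4_io_end_t *io = (ext4_io_end_t *) EXT4_CUR_AIO_DIO(inode);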
Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
fs/ext4/ext4.h | 5 +++--
fs/ext4/extents.c | 4 ++--
fs/ext4/inode.c | 6 +++---
fs/ext4/super.c | 1 -
4 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f9024a6..b3f3c55 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -912,8 +912,6 @@ struct ext4_inode_info {
struct list_head i_completed_io_list;
spinlock_t i_completed_io_lock;
atomic_t i_ioend_count; /* Number of outstanding io_end structs */
- /* current io_end structure for async DIO write*/
- ext4_io_end_t *cur_aio_dio;
atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
spinlock_t i_block_reservation_lock;
@@ -929,6 +927,9 @@ struct ext4_inode_info {
__u32 i_csum_seed;
};
+/* current io_end structure for async DIO write */
+#define EXT4_CUR_AIO_DIO(inode) (inode)->i_private
+
/*
* File system states
*/
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 10f0afd..e993879 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3644,7 +3644,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
{
int ret = 0;
int err = 0;
- ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+ ext4_io_end_t *io = (ext4_io_end_t*) EXT4_CUR_AIO_DIO(inode);
ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
"block %llu, max_blocks %u, flags %x, allocated %u\n",
@@ -3902,7 +3902,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
unsigned int allocated = 0, offset = 0;
unsigned int allocated_clusters = 0;
struct ext4_allocation_request ar;
- ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+ ext4_io_end_t *io = (ext4_io_end_t*) EXT4_CUR_AIO_DIO(inode);
ext4_lblk_t cluster_offset;
ext_debug("blocks %u/%u requested for inode %lu\n",
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d12d30e..202ae3f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3060,7 +3060,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
* hook to the iocb.
*/
iocb->private = NULL;
- EXT4_I(inode)->cur_aio_dio = NULL;
+ EXT4_CUR_AIO_DIO(inode) = NULL;
if (!is_sync_kiocb(iocb)) {
ext4_io_end_t *io_end =
ext4_init_io_end(inode, GFP_NOFS);
@@ -3077,7 +3077,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
* is a unwritten extents needs to be converted
* when IO is completed.
*/
- EXT4_I(inode)->cur_aio_dio = iocb->private;
+ EXT4_CUR_AIO_DIO(inode) = iocb->private;
}
if (overwrite)
@@ -3097,7 +3097,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
NULL,
DIO_LOCKING);
if (iocb->private)
- EXT4_I(inode)->cur_aio_dio = NULL;
+ EXT4_CUR_AIO_DIO(inode) = NULL;
/*
* The io_end structure takes a reference to the inode,
* that structure needs to be destroyed and the
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4a7092b..a6904b3 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -967,7 +967,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->jinode = NULL;
INIT_LIST_HEAD(&ei->i_completed_io_list);
spin_lock_init(&ei->i_completed_io_lock);
- ei->cur_aio_dio = NULL;
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
atomic_set(&ei->i_ioend_count, 0);
--
1.7.7.6
* [PATCH 2/9] ext4: completed_io locking cleanup V2
2012-09-13 15:01 [PATCH 0/9] ext4: Bunch of DIO/AIO fixes V2 Dmitry Monakhov
2012-09-13 15:01 ` [PATCH 1/9] ext4: ext4_inode_info diet Dmitry Monakhov
@ 2012-09-13 15:01 ` Dmitry Monakhov
2012-09-20 18:56 ` Dmitry Monakhov
2012-09-13 15:01 ` [PATCH 3/9] ext4: serialize dio nonlocked reads with defrag workers V3 Dmitry Monakhov
` (6 subsequent siblings)
8 siblings, 1 reply; 12+ messages in thread
From: Dmitry Monakhov @ 2012-09-13 15:01 UTC (permalink / raw)
To: linux-ext4; +Cc: tytso, jack, wenqing.lz, Dmitry Monakhov
The current unwritten extent conversion state machine is very fuzzy.
- For no obvious reason it performs the conversion under i_mutex. What for?
The only thing i_mutex is used for here is to protect io->flags modification,
but only flush_completed_IO and the work item modify these flags, and
we can serialize them via i_completed_io_lock.
All these games with mutex_trylock result in the following deadlock:
truncate:                            kworker:
 ext4_setattr                         ext4_end_io_work
  mutex_lock(i_mutex)
  inode_dio_wait(inode)  ->BLOCK
             DEADLOCK<-               mutex_trylock()
                                      inode_dio_done()
#TEST_CASE1_BEGIN
MNT=/mnt_scrach
unlink $MNT/file
fallocate -l $((1024*1024*1024)) $MNT/file
aio-stress -I 100000 -O -s 100m -n -t 1 -c 10 -o 2 -o 3 $MNT/file
sleep 2
truncate -s 0 $MNT/file
#TEST_CASE1_END
This patch makes the state machine simple and clean:
(1) ext4_flush_completed_IO is responsible for handling all pending
end_io entries from ei->i_completed_io_list (the per-inode list)
NOTE1: i_completed_io_lock is acquired only once
NOTE2: i_mutex is not required because the function does not touch
any data guarded by i_mutex
(2) xxx_end_io schedules end_io context completion simply by pushing it
onto the inode's list.
NOTE1: because of (1), the work item has to be queued only if
->i_completed_io_list was empty at that moment; otherwise the
work is already scheduled.
(3) No one may free an inode's blocks while pending io_completions
exist; otherwise this may result in blocks beyond EOF.
- Remove the now-useless EXT4_IO_END_QUEUED and EXT4_IO_END_IN_FSYNC flags
- Improve SMP scalability by removing the i_mutex locking, which no longer
protects io->flags anyway.
- Reduce lock contention on i_completed_io_lock by optimizing the list walk.
- Move open-coded logic from the various xx_end_xx routines into ext4_add_complete_io()
- Add a BUG_ON to ext4_ext_remove_space() in order to assert (3)
Changes since V1:
Add the BUG_ON assertion according to Jan's comment
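In short, the new scheme is: splice the list under the spinlock, process it with
no locks held, and kick the worker only on the empty-to-non-empty transition. A
condensed sketch of the two central functions from the diff below (debug output
and error bookkeeping trimmed):

    /* page-io.c: every producer funnels through this */
    void ext4_add_complete_io(ext4_io_end_t *io_end)
    {
            struct ext4_inode_info *ei = EXT4_I(io_end->inode);
            unsigned long flags;

            spin_lock_irqsave(&ei->i_completed_io_lock, flags);
            if (list_empty(&ei->i_completed_io_list))   /* first entry ...      */
                    queue_work(EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq,
                               &io_end->work);          /* ... kicks the worker */
            list_add_tail(&io_end->list, &ei->i_completed_io_list);
            spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
    }

    /* fsync.c: the single consumer (also called from the work item) */
    int ext4_flush_completed_IO(struct inode *inode)
    {
            struct ext4_inode_info *ei = EXT4_I(inode);
            struct list_head complete_list;
            unsigned long flags;
            ext4_io_end_t *io;
            int err, ret = 0;

            spin_lock_irqsave(&ei->i_completed_io_lock, flags);
            list_replace_init(&ei->i_completed_io_list, &complete_list);
            spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);

            while (!list_empty(&complete_list)) {
                    io = list_entry(complete_list.next, ext4_io_end_t, list);
                    list_del_init(&io->list);
                    err = ext4_end_io_nolock(io);   /* convert unwritten extents */
                    ext4_free_io_end(io);
                    if (err && !ret)
                            ret = err;
            }
            return ret;
    }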
Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
fs/ext4/ext4.h | 5 +--
fs/ext4/extents.c | 1 +
fs/ext4/fsync.c | 47 +++++++++++---------------------
fs/ext4/indirect.c | 6 +---
fs/ext4/inode.c | 25 +---------------
fs/ext4/page-io.c | 76 ++++++++++++++-------------------------------------
6 files changed, 44 insertions(+), 116 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b3f3c55..be976ca 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -184,9 +184,7 @@ struct mpage_da_data {
*/
#define EXT4_IO_END_UNWRITTEN 0x0001
#define EXT4_IO_END_ERROR 0x0002
-#define EXT4_IO_END_QUEUED 0x0004
-#define EXT4_IO_END_DIRECT 0x0008
-#define EXT4_IO_END_IN_FSYNC 0x0010
+#define EXT4_IO_END_DIRECT 0x0004
struct ext4_io_page {
struct page *p_page;
@@ -2405,6 +2403,7 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
/* page-io.c */
extern int __init ext4_init_pageio(void);
+extern void ext4_add_complete_io(ext4_io_end_t *io_end);
extern void ext4_exit_pageio(void);
extern void ext4_ioend_wait(struct inode *);
extern void ext4_free_io_end(ext4_io_end_t *io);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e993879..44e33b0 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2618,6 +2618,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
handle_t *handle;
int i = 0, err;
+ BUG_ON(atomic_read(&EXT4_I(inode)->i_aiodio_unwritten));
ext_debug("truncate since %u to %u\n", start, end);
/* probably first extent we're gonna free will be last in block */
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 323eb15..24f3719 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -73,46 +73,31 @@ static void dump_completed_IO(struct inode * inode)
* might needs to do the conversion. This function walks through
* the list and convert the related unwritten extents for completed IO
* to written.
- * The function return the number of pending IOs on success.
+ * The function return first error;
*/
int ext4_flush_completed_IO(struct inode *inode)
{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ unsigned long flags;
+ struct list_head complete_list;
+ int err, ret = 0;
ext4_io_end_t *io;
- struct ext4_inode_info *ei = EXT4_I(inode);
- unsigned long flags;
- int ret = 0;
- int ret2 = 0;
dump_completed_IO(inode);
+
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- while (!list_empty(&ei->i_completed_io_list)){
- io = list_entry(ei->i_completed_io_list.next,
- ext4_io_end_t, list);
+ list_replace_init(&ei->i_completed_io_list, &complete_list);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+ while(!list_empty(&complete_list)) {
+ io = list_entry(complete_list.next, ext4_io_end_t, list);
list_del_init(&io->list);
- io->flag |= EXT4_IO_END_IN_FSYNC;
- /*
- * Calling ext4_end_io_nolock() to convert completed
- * IO to written.
- *
- * When ext4_sync_file() is called, run_queue() may already
- * about to flush the work corresponding to this io structure.
- * It will be upset if it founds the io structure related
- * to the work-to-be schedule is freed.
- *
- * Thus we need to keep the io structure still valid here after
- * conversion finished. The io structure has a flag to
- * avoid double converting from both fsync and background work
- * queue work.
- */
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- ret = ext4_end_io_nolock(io);
- if (ret < 0)
- ret2 = ret;
- spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- io->flag &= ~EXT4_IO_END_IN_FSYNC;
+ err = ext4_end_io_nolock(io);
+ ext4_free_io_end(io);
+ if (unlikely(err && !ret))
+ ret = err;
}
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- return (ret2 < 0) ? ret2 : 0;
+ return ret;
}
/*
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 830e1b2..61f13e5 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -807,11 +807,9 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
retry:
if (rw == READ && ext4_should_dioread_nolock(inode)) {
- if (unlikely(!list_empty(&ei->i_completed_io_list))) {
- mutex_lock(&inode->i_mutex);
+ if (unlikely(!list_empty(&ei->i_completed_io_list)))
ext4_flush_completed_IO(inode);
- mutex_unlock(&inode->i_mutex);
- }
+
ret = __blockdev_direct_IO(rw, iocb, inode,
inode->i_sb->s_bdev, iov,
offset, nr_segs,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 202ae3f..762b955 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2885,9 +2885,6 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
{
struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
ext4_io_end_t *io_end = iocb->private;
- struct workqueue_struct *wq;
- unsigned long flags;
- struct ext4_inode_info *ei;
/* if not async direct IO or dio with 0 bytes write, just return */
if (!io_end || !size)
@@ -2916,24 +2913,14 @@ out:
io_end->iocb = iocb;
io_end->result = ret;
}
- wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
-
- /* Add the io_end to per-inode completed aio dio list*/
- ei = EXT4_I(io_end->inode);
- spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- list_add_tail(&io_end->list, &ei->i_completed_io_list);
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- /* queue the work to convert unwritten extents to written */
- queue_work(wq, &io_end->work);
+ ext4_add_complete_io(io_end);
}
static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
{
ext4_io_end_t *io_end = bh->b_private;
- struct workqueue_struct *wq;
struct inode *inode;
- unsigned long flags;
if (!test_clear_buffer_uninit(bh) || !io_end)
goto out;
@@ -2952,15 +2939,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
*/
inode = io_end->inode;
ext4_set_io_unwritten_flag(inode, io_end);
-
- /* Add the io_end to per-inode completed io list*/
- spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
- list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
- spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-
- wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
- /* queue the work to convert unwritten extents to written */
- queue_work(wq, &io_end->work);
+ ext4_add_complete_io(io_end);
out:
bh->b_private = NULL;
bh->b_end_io = NULL;
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index dcdeef1..c369419 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -57,6 +57,22 @@ void ext4_ioend_wait(struct inode *inode)
wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
}
+/* Add the io_end to per-inode completed aio dio list. */
+void ext4_add_complete_io(ext4_io_end_t *io_end)
+{
+ struct ext4_inode_info *ei = EXT4_I(io_end->inode);
+ struct workqueue_struct *wq;
+ unsigned long flags;
+
+ wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ if (list_empty(&ei->i_completed_io_list))
+ queue_work(wq, &io_end->work);
+ list_add_tail(&io_end->list, &ei->i_completed_io_list);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+}
+
static void put_io_page(struct ext4_io_page *io_page)
{
if (atomic_dec_and_test(&io_page->p_count)) {
@@ -81,12 +97,7 @@ void ext4_free_io_end(ext4_io_end_t *io)
kmem_cache_free(io_end_cachep, io);
}
-/*
- * check a range of space and convert unwritten extents to written.
- *
- * Called with inode->i_mutex; we depend on this when we manipulate
- * io->flag, since we could otherwise race with ext4_flush_completed_IO()
- */
+/* check a range of space and convert unwritten extents to written. */
int ext4_end_io_nolock(ext4_io_end_t *io)
{
struct inode *inode = io->inode;
@@ -94,6 +105,8 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
ssize_t size = io->size;
int ret = 0;
+ BUG_ON(!list_empty(&io->list));
+
ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
"list->prev 0x%p\n",
io, inode->i_ino, io->list.next, io->list.prev);
@@ -124,45 +137,7 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
static void ext4_end_io_work(struct work_struct *work)
{
ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
- struct inode *inode = io->inode;
- struct ext4_inode_info *ei = EXT4_I(inode);
- unsigned long flags;
-
- spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- if (io->flag & EXT4_IO_END_IN_FSYNC)
- goto requeue;
- if (list_empty(&io->list)) {
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- goto free;
- }
-
- if (!mutex_trylock(&inode->i_mutex)) {
- bool was_queued;
-requeue:
- was_queued = !!(io->flag & EXT4_IO_END_QUEUED);
- io->flag |= EXT4_IO_END_QUEUED;
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- /*
- * Requeue the work instead of waiting so that the work
- * items queued after this can be processed.
- */
- queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work);
- /*
- * To prevent the ext4-dio-unwritten thread from keeping
- * requeueing end_io requests and occupying cpu for too long,
- * yield the cpu if it sees an end_io request that has already
- * been requeued.
- */
- if (was_queued)
- yield();
- return;
- }
- list_del_init(&io->list);
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- (void) ext4_end_io_nolock(io);
- mutex_unlock(&inode->i_mutex);
-free:
- ext4_free_io_end(io);
+ (void) ext4_flush_completed_IO(io->inode);
}
ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
@@ -195,9 +170,7 @@ static void buffer_io_error(struct buffer_head *bh)
static void ext4_end_bio(struct bio *bio, int error)
{
ext4_io_end_t *io_end = bio->bi_private;
- struct workqueue_struct *wq;
struct inode *inode;
- unsigned long flags;
int i;
sector_t bi_sector = bio->bi_sector;
@@ -255,14 +228,7 @@ static void ext4_end_bio(struct bio *bio, int error)
return;
}
- /* Add the io_end to per-inode completed io list*/
- spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
- list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
- spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
* Re: [PATCH 2/9] ext4: completed_io locking cleanup V2
2012-09-13 15:01 ` [PATCH 2/9] ext4: completed_io locking cleanup V2 Dmitry Monakhov
@ 2012-09-20 18:56 ` Dmitry Monakhov
0 siblings, 0 replies; 12+ messages in thread
From: Dmitry Monakhov @ 2012-09-20 18:56 UTC (permalink / raw)
To: linux-ext4; +Cc: tytso, jack, wenqing.lz
On Thu, 13 Sep 2012 19:01:07 +0400, Dmitry Monakhov <dmonakhov@openvz.org> wrote:
> Current unwritten extent conversion state-machine is very fuzzy.
> - By unknown reason it want perform conversion under i_mutex. What for?
> The only reason it used here is just protect io->flags modification,
> but only flush_completed_IO and work modified this flags and
> we can serialize them via i_completed_io_lock.
> All this games with mutex_trylock result in following deadlock
> truncate: kworker:
> ext4_setattr ext4_end_io_work
> mutex_lock(i_mutex)
> inode_dio_wait(inode) ->BLOCK
> DEADLOCK<- mutex_trylock()
> inode_dio_done()
> #TEST_CASE1_BEGIN
> MNT=/mnt_scrach
> unlink $MNT/file
> fallocate -l $((1024*1024*1024)) $MNT/file
> aio-stress -I 100000 -O -s 100m -n -t 1 -c 10 -o 2 -o 3 $MNT/file
> sleep 2
> truncate -s 0 $MNT/file
> #TEST_CASE1_END
>
> This patch makes state machine simple and clean:
>
> (1) ext4_flush_completed_IO is responsible for handling all pending
> end_io from ei->i_completed_io_list(per inode list)
> NOTE1: i_completed_io_lock is acquired only once
> NOTE2: i_mutex is not required because it does not protect
> any data guarded by i_mutex
>
> (2) xxx_end_io schedule end_io context completion simply by pushing it
> to the inode's list.
> NOTE1: because of (1) work should be queued only if
> ->i_completed_io_list was empty at the moment, otherwise it
> work is scheduled already.
>
> (3) No one is able to free inode's blocks while pented io_completion
> exist othervise may result in blocks beyond EOF.
>
> - remove useless EXT4_IO_END_QUEUED and EXT4_IO_END_FSYNC flags
> - Improve SMP scalability by removing useless i_mutex which does not
> protect io->flags anymore.
> - Reduce lock contention on i_completed_io_lock by optimizing list walk.
> - Move open coded logic from various xx_end_xx routines to ext4_add_complete_io()
> - Add BUG_ON to ext_ext_remove_space() in order to assert (3)
>
> Changes since V1:
> Add bugon assertion according to Jan's comment
>
> Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>
> Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
> ---
> fs/ext4/ext4.h | 5 +--
> fs/ext4/extents.c | 1 +
> fs/ext4/fsync.c | 47 +++++++++++---------------------
> fs/ext4/indirect.c | 6 +---
> fs/ext4/inode.c | 25 +---------------
> fs/ext4/page-io.c | 76 ++++++++++++++-------------------------------------
> 6 files changed, 44 insertions(+), 116 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index b3f3c55..be976ca 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -184,9 +184,7 @@ struct mpage_da_data {
> */
> #define EXT4_IO_END_UNWRITTEN 0x0001
> #define EXT4_IO_END_ERROR 0x0002
> -#define EXT4_IO_END_QUEUED 0x0004
> -#define EXT4_IO_END_DIRECT 0x0008
> -#define EXT4_IO_END_IN_FSYNC 0x0010
> +#define EXT4_IO_END_DIRECT 0x0004
>
> struct ext4_io_page {
> struct page *p_page;
> @@ -2405,6 +2403,7 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
>
> /* page-io.c */
> extern int __init ext4_init_pageio(void);
> +extern void ext4_add_complete_io(ext4_io_end_t *io_end);
> extern void ext4_exit_pageio(void);
> extern void ext4_ioend_wait(struct inode *);
> extern void ext4_free_io_end(ext4_io_end_t *io);
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index e993879..44e33b0 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -2618,6 +2618,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
> handle_t *handle;
> int i = 0, err;
>
> + BUG_ON(atomic_read(&EXT4_I(inode)->i_aiodio_unwritten));
Yeah. I've just triggered this BUG_ON. Even though it is a false trigger,
because it is safe to have i_aiodio_unwritten somewhere inside the
file while file growth is in progress, in order to be on the
safe side let's move inode_dio_wait inside ext4_truncate(),
so that we can have a 100% bulletproof assertion
inside ext4_ext_remove_space:
BUG_ON(atomic_read(&EXT4_I(inode)->i_aiodio_unwritten))
BUG_ON(atomic_read(&(inode)->i_dio_count))
This should not affect performance since we already wait for dio tasks
during ext4_setattr, but now we will also wait on failed writes,
which happen only on the error path.
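A rough sketch of what that follow-up could look like (hypothetical, not part of
the posted series; the wait is simply hoisted into ext4_truncate() so both
counters are guaranteed to be zero by the time the extent tree is modified):

    /* hypothetical follow-up, not the posted patch */
    void ext4_truncate(struct inode *inode)
    {
            /* ... */
            inode_dio_wait(inode);   /* drain DIO before any block is freed */
            /* ... existing truncate path, ending in ext4_ext_remove_space() */
    }

    /* ext4_ext_remove_space() can then assert both counters */
    BUG_ON(atomic_read(&EXT4_I(inode)->i_aiodio_unwritten));
    BUG_ON(atomic_read(&inode->i_dio_count));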
EXT4-fs (dm-1): mounted filesystem with ordered data mode. Opts:
acl,user_xattr
------------[ cut here ]------------
kernel BUG at fs/ext4/extents.c:2621!
invalid opcode: 0000 [#1] SMP
Modules linked in: brd ext4 jbd2 cpufreq_ondemand acpi_cpufreq
freq_table mperf coretemp kvm_intel kvm crc32c_intel ghash_clmulni_intel
microcode sg xhci_hcd ext3 jbd mbcache sd_mod crc_t10dif aesni_intel
ablk_helper cryptd aes_x86_64 aes_generic ahci libahci pata_acpi
ata_generic dm_mirror dm_region_hash dm_log dm_mod
CPU 2
Pid: 11079, comm: fio Not tainted 3.6.0-rc1+ #62
/DQ67SW
RIP: 0010:[<ffffffffa0405799>] [<ffffffffa0405799>]
ext4_ext_remove_space+0x79/0xa80 [ext4]
RSP: 0018:ffff8801bcc87888 EFLAGS: 00010202
RAX: 0000000000000003 RBX: ffff88022b7d62c8 RCX: 000000000000000c
RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffffffffa0481d90
RBP: ffff8801bcc87938 R08: 0000000000000001 R09: 0000000000000000
R10: ffffffffa04546d8 R11: 0000000000000001 R12: ffff8801ff109000
R13: 00000000fffffffe R14: 000000000026c8d0 R15: 0000000000000001
FS: 00007f4c3dd6a700(0000) GS:ffff88023d800000(0000)
knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f4c34f4f172 CR3: 0000000204d80000 CR4: 00000000000407e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process fio (pid: 11079, threadinfo ffff8801bcc86000, task
ffff88018f9e5700)
Stack:
ffff88022b7d62c8 0000000000000000 ffff8801f1595cf8 ffff8801d8ff7000
ffff8801bcc87938 ffffffffa03cd363 ffffffffa0408a15 0000000000000000
ffff8801bcc878f8 0000000000000246 ffff88022b7d6238 ffffffffa0408a3d
Call Trace:
[<ffffffffa03cd363>] ? ext4_mark_inode_dirty+0x293/0x2e0 [ext4]
[<ffffffffa0408a15>] ? ext4_ext_truncate+0x115/0x2a0 [ext4]
[<ffffffffa0408a3d>] ? ext4_ext_truncate+0x13d/0x2a0 [ext4]
[<ffffffffa0408a15>] ? ext4_ext_truncate+0x115/0x2a0 [ext4]
[<ffffffffa0408a5e>] ext4_ext_truncate+0x15e/0x2a0 [ext4]
[<ffffffffa03c9e9a>] ext4_truncate+0x14a/0x240 [ext4]
[<ffffffffa04224b1>] ext4_ind_direct_IO+0x481/0x740 [ext4]
[<ffffffffa03cc970>] ? noalloc_get_block_write+0x90/0x90 [ext4]
[<ffffffff81028965>] ? native_sched_clock+0x65/0xb0
[<ffffffffa03c822e>] ext4_ext_direct_IO+0x26e/0x290 [ext4]
[<ffffffffa03c83cc>] ext4_direct_IO+0x17c/0x2a0 [ext4]
[<ffffffff81184354>] generic_file_direct_write+0x174/0x240
[<ffffffff811849d0>] __generic_file_aio_write+0x5b0/0x820
[<ffffffff81028965>] ? native_sched_clock+0x65/0xb0
[<ffffffffa03c1491>] ext4_file_dio_write+0x3b1/0x550 [ext4]
[<ffffffffa03c1778>] ext4_file_write+0x148/0x190 [ext4]
[<ffffffffa03c1630>] ? ext4_file_dio_write+0x550/0x550 [ext4]
[<ffffffff8127532e>] aio_rw_vect_retry+0xce/0x200
[<ffffffff81275260>] ? aio_advance_iovec+0x130/0x130
[<ffffffff81276246>] aio_run_iocb+0xd6/0x2a0
[<ffffffff8173ce1d>] io_submit_one+0x38a/0x3ff
[<ffffffff81277e1e>] do_io_submit+0x2be/0x3d0
[<ffffffff81277f40>] sys_io_submit+0x10/0x20
[<ffffffff8175e4e9>] system_call_fastpath+0x16/0x1b
Code: c7 c7 90 1d 48 a0 85 c0 41 0f 95 c7 31 d2 44 89 fe e8 fc a5 d5 e0
49 63 c7 48 83 c0 02 48 83 04 c5 d0 09 47 a0 01 45 85 ff 74 02 <0f> 0b
0f b7 75 bc 48 8b 7b 28 83 c6 01 e8 65 11 ff ff 48 89 c7
RIP [<ffffffffa0405799>] ext4_ext_remove_space+0x79/0xa80 [ext4]
RSP <ffff8801bcc87888>
---[ end trace 19c3447cad5485fa ]---
>
> /* probably first extent we're gonna free will be last in block */
> diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
> index 323eb15..24f3719 100644
> --- a/fs/ext4/fsync.c
> +++ b/fs/ext4/fsync.c
> @@ -73,46 +73,31 @@ static void dump_completed_IO(struct inode * inode)
> * might needs to do the conversion. This function walks through
> * the list and convert the related unwritten extents for completed IO
> * to written.
> - * The function return the number of pending IOs on success.
> + * The function return first error;
> */
> int ext4_flush_completed_IO(struct inode *inode)
> {
> + struct ext4_inode_info *ei = EXT4_I(inode);
> + unsigned long flags;
> + struct list_head complete_list;
> + int err, ret = 0;
> ext4_io_end_t *io;
> - struct ext4_inode_info *ei = EXT4_I(inode);
> - unsigned long flags;
> - int ret = 0;
> - int ret2 = 0;
>
> dump_completed_IO(inode);
> +
> spin_lock_irqsave(&ei->i_completed_io_lock, flags);
> - while (!list_empty(&ei->i_completed_io_list)){
> - io = list_entry(ei->i_completed_io_list.next,
> - ext4_io_end_t, list);
> + list_replace_init(&ei->i_completed_io_list, &complete_list);
> + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
> +
> + while(!list_empty(&complete_list)) {
> + io = list_entry(complete_list.next, ext4_io_end_t, list);
> list_del_init(&io->list);
> - io->flag |= EXT4_IO_END_IN_FSYNC;
> - /*
> - * Calling ext4_end_io_nolock() to convert completed
> - * IO to written.
> - *
> - * When ext4_sync_file() is called, run_queue() may already
> - * about to flush the work corresponding to this io structure.
> - * It will be upset if it founds the io structure related
> - * to the work-to-be schedule is freed.
> - *
> - * Thus we need to keep the io structure still valid here after
> - * conversion finished. The io structure has a flag to
> - * avoid double converting from both fsync and background work
> - * queue work.
> - */
> - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
> - ret = ext4_end_io_nolock(io);
> - if (ret < 0)
> - ret2 = ret;
> - spin_lock_irqsave(&ei->i_completed_io_lock, flags);
> - io->flag &= ~EXT4_IO_END_IN_FSYNC;
> + err = ext4_end_io_nolock(io);
> + ext4_free_io_end(io);
> + if (unlikely(err && !ret))
> + ret = err;
> }
> - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
> - return (ret2 < 0) ? ret2 : 0;
> + return ret;
> }
>
> /*
> diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
> index 830e1b2..61f13e5 100644
> --- a/fs/ext4/indirect.c
> +++ b/fs/ext4/indirect.c
> @@ -807,11 +807,9 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
>
> retry:
> if (rw == READ && ext4_should_dioread_nolock(inode)) {
> - if (unlikely(!list_empty(&ei->i_completed_io_list))) {
> - mutex_lock(&inode->i_mutex);
> + if (unlikely(!list_empty(&ei->i_completed_io_list)))
> ext4_flush_completed_IO(inode);
> - mutex_unlock(&inode->i_mutex);
> - }
> +
> ret = __blockdev_direct_IO(rw, iocb, inode,
> inode->i_sb->s_bdev, iov,
> offset, nr_segs,
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 202ae3f..762b955 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -2885,9 +2885,6 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
> {
> struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
> ext4_io_end_t *io_end = iocb->private;
> - struct workqueue_struct *wq;
> - unsigned long flags;
> - struct ext4_inode_info *ei;
>
> /* if not async direct IO or dio with 0 bytes write, just return */
> if (!io_end || !size)
> @@ -2916,24 +2913,14 @@ out:
> io_end->iocb = iocb;
> io_end->result = ret;
> }
> - wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
> -
> - /* Add the io_end to per-inode completed aio dio list*/
> - ei = EXT4_I(io_end->inode);
> - spin_lock_irqsave(&ei->i_completed_io_lock, flags);
> - list_add_tail(&io_end->list, &ei->i_completed_io_list);
> - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
>
> - /* queue the work to convert unwritten extents to written */
> - queue_work(wq, &io_end->work);
> + ext4_add_complete_io(io_end);
> }
>
> static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
> {
> ext4_io_end_t *io_end = bh->b_private;
> - struct workqueue_struct *wq;
> struct inode *inode;
> - unsigned long flags;
>
> if (!test_clear_buffer_uninit(bh) || !io_end)
> goto out;
> @@ -2952,15 +2939,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
> */
> inode = io_end->inode;
> ext4_set_io_unwritten_flag(inode, io_end);
> -
> - /* Add the io_end to per-inode completed io list*/
> - spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
> - list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
> - spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
> -
> - wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
> - /* queue the work to convert unwritten extents to written */
> - queue_work(wq, &io_end->work);
> + ext4_add_complete_io(io_end);
> out:
> bh->b_private = NULL;
> bh->b_end_io = NULL;
> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
> index dcdeef1..c369419 100644
> --- a/fs/ext4/page-io.c
> +++ b/fs/ext4/page-io.c
> @@ -57,6 +57,22 @@ void ext4_ioend_wait(struct inode *inode)
> wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
> }
>
> +/* Add the io_end to per-inode completed aio dio list. */
> +void ext4_add_complete_io(ext4_io_end_t *io_end)
> +{
> + struct ext4_inode_info *ei = EXT4_I(io_end->inode);
> + struct workqueue_struct *wq;
> + unsigned long flags;
> +
> + wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
> +
> + spin_lock_irqsave(&ei->i_completed_io_lock, flags);
> + if (list_empty(&ei->i_completed_io_list))
> + queue_work(wq, &io_end->work);
> + list_add_tail(&io_end->list, &ei->i_completed_io_list);
> + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
> +}
> +
> static void put_io_page(struct ext4_io_page *io_page)
> {
> if (atomic_dec_and_test(&io_page->p_count)) {
> @@ -81,12 +97,7 @@ void ext4_free_io_end(ext4_io_end_t *io)
> kmem_cache_free(io_end_cachep, io);
> }
>
> -/*
> - * check a range of space and convert unwritten extents to written.
> - *
> - * Called with inode->i_mutex; we depend on this when we manipulate
> - * io->flag, since we could otherwise race with ext4_flush_completed_IO()
> - */
> +/* check a range of space and convert unwritten extents to written. */
> int ext4_end_io_nolock(ext4_io_end_t *io)
> {
> struct inode *inode = io->inode;
> @@ -94,6 +105,8 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
> ssize_t size = io->size;
> int ret = 0;
>
> + BUG_ON(!list_empty(&io->list));
> +
> ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
> "list->prev 0x%p\n",
> io, inode->i_ino, io->list.next, io->list.prev);
> @@ -124,45 +137,7 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
> static void ext4_end_io_work(struct work_struct *work)
> {
> ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
> - struct inode *inode = io->inode;
> - struct ext4_inode_info *ei = EXT4_I(inode);
> - unsigned long flags;
> -
> - spin_lock_irqsave(&ei->i_completed_io_lock, flags);
> - if (io->flag & EXT4_IO_END_IN_FSYNC)
> - goto requeue;
> - if (list_empty(&io->list)) {
> - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
> - goto free;
> - }
> -
> - if (!mutex_trylock(&inode->i_mutex)) {
> - bool was_queued;
> -requeue:
> - was_queued = !!(io->flag & EXT4_IO_END_QUEUED);
> - io->flag |= EXT4_IO_END_QUEUED;
> - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
> - /*
> - * Requeue the work instead of waiting so that the work
> - * items queued after this can be processed.
> - */
> - queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work);
> - /*
> - * To prevent the ext4-dio-unwritten thread from keeping
> - * requeueing end_io requests and occupying cpu for too long,
> - * yield the cpu if it sees an end_io request that has already
> - * been requeued.
> - */
> - if (was_queued)
> - yield();
> - return;
> - }
> - list_del_init(&io->list);
> - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
> - (void) ext4_end_io_nolock(io);
> - mutex_unlock(&inode->i_mutex);
> -free:
> - ext4_free_io_end(io);
> + (void) ext4_flush_completed_IO(io->inode);
> }
>
> ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
> @@ -195,9 +170,7 @@ static void buffer_io_error(struct buffer_head *bh)
> static void ext4_end_bio(struct bio *bio, int error)
> {
> ext4_io_end_t *io_end = bio->bi_private;
> - struct workqueue_struct *wq;
> struct inode *inode;
> - unsigned long flags;
> int i;
> sector_t bi_sector = bio->bi_sector;
>
> @@ -255,14 +228,7 @@ static void ext4_end_bio(struct bio *bio, int error)
> return;
> }
>
> - /* Add the io_end to per-inode completed io list*/
> - spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
> - list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
> - spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
> -
> - wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
> - /* queue the work to convert unwritten extents to written */
> - queue_work(wq, &io_end->work);
> + ext4_add_complete_io(io_end);
> }
>
> void ext4_io_submit(struct ext4_io_submit *io)
> --
> 1.7.7.6
>
* [PATCH 3/9] ext4: serialize dio nonlocked reads with defrag workers V3
2012-09-13 15:01 [PATCH 0/9] ext4: Bunch of DIO/AIO fixes V2 Dmitry Monakhov
2012-09-13 15:01 ` [PATCH 1/9] ext4: ext4_inode_info diet Dmitry Monakhov
2012-09-13 15:01 ` [PATCH 2/9] ext4: completed_io locking cleanup V2 Dmitry Monakhov
@ 2012-09-13 15:01 ` Dmitry Monakhov
2012-09-13 15:01 ` [PATCH 4/9] ext4: punch_hole should wait for DIO writers Dmitry Monakhov
` (5 subsequent siblings)
8 siblings, 0 replies; 12+ messages in thread
From: Dmitry Monakhov @ 2012-09-13 15:01 UTC (permalink / raw)
To: linux-ext4; +Cc: tytso, jack, wenqing.lz, Dmitry Monakhov
Inode block defragmentation and ext4_change_inode_journal_flag() may
affect the result of nonlocked DIO reads, so proper synchronization
is required.
- Add missing inode_dio_wait() calls where appropriate
- Check the inode state while holding an extra i_dio_count reference.
Changes from V1:
- add missing memory barriers
- move the DIOREAD_LOCK state check out of the generic should_dioread_nolock,
otherwise it would affect existing DIO readers.
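Condensed, the synchronization added below is: the blocking side sets a per-inode
state bit (with a memory barrier) and then drains DIO, while the nonlocked reader
publishes itself via i_dio_count before re-checking that bit. A sketch assembled
from the hunks below, with the reader's check written in its intended sense (back
off when the DIOREAD_LOCK bit is set):

    /* ext4.h: blocking side -- disable the nolock fast path, then drain */
    static inline void ext4_inode_block_unlocked_dio(struct inode *inode)
    {
            ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
            smp_mb();                       /* pairs with the reader's barrier */
    }
    static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
    {
            smp_mb();
            ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
    }

    /* indirect.c: reader side -- publish first, then re-check the state bit */
    atomic_inc(&inode->i_dio_count);        /* now inode_dio_wait() sees us */
    smp_mb();
    if (ext4_test_inode_state(inode, EXT4_STATE_DIOREAD_LOCK)) {
            inode_dio_done(inode);          /* someone blocked nolock DIO: back off */
            goto retry;
    }
    ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
                               offset, nr_segs, ext4_get_block, NULL, NULL, 0);
    inode_dio_done(inode);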
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
fs/ext4/ext4.h | 17 +++++++++++++++++
fs/ext4/indirect.c | 13 +++++++++++++
fs/ext4/inode.c | 5 +++++
fs/ext4/move_extent.c | 8 ++++++++
4 files changed, 43 insertions(+), 0 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index be976ca..a71e299 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1348,6 +1348,8 @@ enum {
EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
EXT4_STATE_NEWENTRY, /* File just added to dir */
EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */
+ EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read
+ nolocking */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -2456,6 +2458,21 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
}
+/*
+ * Disable DIO read nolock optimization, so new dioreaders will be forced
+ * to grab i_mutex
+ */
+static inline void ext4_inode_block_unlocked_dio(struct inode *inode)
+{
+ ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+ smp_mb();
+}
+static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
+{
+ smp_mb();
+ ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+}
+
#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
/* For ioend & aio unwritten conversion wait queues */
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 61f13e5..e4cc904 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -810,10 +810,23 @@ retry:
if (unlikely(!list_empty(&ei->i_completed_io_list)))
ext4_flush_completed_IO(inode);
+ /*
+ * Nolock dioread optimization may be dynamically disabled
+ * via ext4_inode_block_unlocked_dio(). Check inode's state
+ * while holding extra i_dio_count ref.
+ */
+ atomic_inc(&inode->i_dio_count);
+ smp_mb();
+ if (!unlikely(ext4_test_inode_state(inode,
+ EXT4_STATE_DIOREAD_LOCK))) {
+ inode_dio_done(inode);
+ goto retry;
+ }
ret = __blockdev_direct_IO(rw, iocb, inode,
inode->i_sb->s_bdev, iov,
offset, nr_segs,
ext4_get_block, NULL, NULL, 0);
+ inode_dio_done(inode);
} else {
ret = blockdev_direct_IO(rw, iocb, inode, iov,
offset, nr_segs, ext4_get_block);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 762b955..c34d38c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4720,6 +4720,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return err;
}
+ /* Wait for all existing dio workers */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
jbd2_journal_lock_updates(journal);
/*
@@ -4739,6 +4743,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
ext4_set_aops(inode);
jbd2_journal_unlock_updates(journal);
+ ext4_inode_resume_unlocked_dio(inode);
/* Finally we can mark the inode as dirty. */
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index c5826c6..ae0357b 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -1214,6 +1214,12 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
if (ret1 < 0)
return ret1;
+ /* Wait for all existing dio workers */
+ ext4_inode_block_unlocked_dio(orig_inode);
+ ext4_inode_block_unlocked_dio(donor_inode);
+ inode_dio_wait(orig_inode);
+ inode_dio_wait(donor_inode);
+
/* Protect extent tree against block allocations via delalloc */
double_down_write_data_sem(orig_inode, donor_inode);
/* Check the filesystem environment whether move_extent can be done */
@@ -1412,6 +1418,8 @@ out:
kfree(holecheck_path);
}
double_up_write_data_sem(orig_inode, donor_inode);
+ ext4_inode_block_unlocked_dio(orig_inode);
+ ext4_inode_block_unlocked_dio(donor_inode);
ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
if (ret1)
--
1.7.7.6
* [PATCH 4/9] ext4: punch_hole should wait for DIO writers
2012-09-13 15:01 [PATCH 0/9] ext4: Bunch of DIO/AIO fixes V2 Dmitry Monakhov
` (2 preceding siblings ...)
2012-09-13 15:01 ` [PATCH 3/9] ext4: serialize dio nonlocked reads with defrag workers V3 Dmitry Monakhov
@ 2012-09-13 15:01 ` Dmitry Monakhov
2012-09-13 15:13 ` Lukáš Czerner
2012-09-13 15:01 ` [PATCH 5/9] ext4: serialize unlocked dio reads with truncate Dmitry Monakhov
` (4 subsequent siblings)
8 siblings, 1 reply; 12+ messages in thread
From: Dmitry Monakhov @ 2012-09-13 15:01 UTC (permalink / raw)
To: linux-ext4; +Cc: tytso, jack, wenqing.lz, Dmitry Monakhov
punch_hole is one of the places where we have to wait for all existing writers
(writeback, aio, dio), but currently we simply flush pending end_io requests,
which is not sufficient. Even worse, i_mutex is not held during punch_hole,
which obviously can result in dangerous data corruption due to an
access-after-free issue.
This patch performs the following changes:
- Guard punch_hole with i_mutex
- Block all new dio readers in order to prevent an information leak caused by
a read-after-free pattern.
- punch_hole now waits for all writers in flight
NOTE: XXX a write-after-free race is still possible because there is
no easy way to stop writeback while punch_hole is in progress.
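The resulting ordering in ext4_ext_punch_hole() is, roughly (condensed from the
diff below; the actual extent removal and error handling are elided):

    filemap_write_and_wait_range(mapping, offset, offset + length - 1);

    mutex_lock(&inode->i_mutex);            /* new writers now block on i_mutex */
    if (offset >= inode->i_size)
            goto out_mutex;                 /* nothing to punch beyond i_size   */

    truncate_pagecache_range(inode, first_page_offset, last_page_offset - 1);

    ext4_inode_block_unlocked_dio(inode);   /* stop new nolock DIO readers      */
    inode_dio_wait(inode);                  /* drain DIO already in flight      */
    ext4_flush_completed_IO(inode);         /* finish pending end_io conversion */

    handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
    /* ... punch the hole ... */

    out_dio:
            ext4_inode_resume_unlocked_dio(inode);
    out_mutex:
            mutex_unlock(&inode->i_mutex);
            return err;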
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
fs/ext4/extents.c | 41 +++++++++++++++++++++++++----------------
1 files changed, 25 insertions(+), 16 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 44e33b0..0e94485 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4814,9 +4814,22 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
loff_t first_page_offset, last_page_offset;
int credits, err = 0;
+ /*
+ * Write out all dirty pages to avoid race conditions
+ * Then release them.
+ */
+ if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+ err = filemap_write_and_wait_range(mapping,
+ offset, offset + length - 1);
+
+ if (err)
+ return err;
+ }
+
+ mutex_lock(&inode->i_mutex);
/* No need to punch hole beyond i_size */
if (offset >= inode->i_size)
- return 0;
+ goto out_mutex;
/*
* If the hole extends beyond i_size, set the hole
@@ -4834,31 +4847,23 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
first_page_offset = first_page << PAGE_CACHE_SHIFT;
last_page_offset = last_page << PAGE_CACHE_SHIFT;
- /*
- * Write out all dirty pages to avoid race conditions
- * Then release them.
- */
- if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
- err = filemap_write_and_wait_range(mapping,
- offset, offset + length - 1);
-
- if (err)
- return err;
- }
-
/* Now release the pages */
if (last_page_offset > first_page_offset) {
truncate_pagecache_range(inode, first_page_offset,
last_page_offset - 1);
}
- /* finish any pending end_io work */
+ /* Wait all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
ext4_flush_completed_IO(inode);
credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto out_dio;
+ }
err = ext4_orphan_add(handle, inode);
if (err)
@@ -4952,6 +4957,10 @@ out:
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
+out_dio:
+ ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+ mutex_unlock(&inode->i_mutex);
return err;
}
int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
--
1.7.7.6
* Re: [PATCH 4/9] ext4: punch_hole should wait for DIO writers
2012-09-13 15:01 ` [PATCH 4/9] ext4: punch_hole should wait for DIO writers Dmitry Monakhov
@ 2012-09-13 15:13 ` Lukáš Czerner
0 siblings, 0 replies; 12+ messages in thread
From: Lukáš Czerner @ 2012-09-13 15:13 UTC (permalink / raw)
To: Dmitry Monakhov; +Cc: linux-ext4, tytso, jack, wenqing.lz
On Thu, 13 Sep 2012, Dmitry Monakhov wrote:
> Date: Thu, 13 Sep 2012 19:01:09 +0400
> From: Dmitry Monakhov <dmonakhov@openvz.org>
> To: linux-ext4@vger.kernel.org
> Cc: tytso@mit.edu, jack@suse.cz, wenqing.lz@taobao.com,
> Dmitry Monakhov <dmonakhov@openvz.org>
> Subject: [PATCH 4/9] ext4: punch_hole should wait for DIO writers
>
> punch_hole are the places where we have to wait for all existing writers
> (writeback, aio, dio), but currently we simply flush pended end_io request
> which is not sufficient. Even more i_mutex is not holded while punch_hole
> which obviously result in dangerous data corruption due to
> access-after-free issue.
>
> This patch performs following changes:
> - Guard punch_hole with i_mutex
> - Block all new dio readers in order to prevent information leak caused by
> read-after-free pattern.
> - punch_hole now wait for all writers in flight
> NOTE: XXX write-after-free race is still possible because there is
> no easy way to stop writeback while punch_hole is in progress.
Hi Dmitry,
Just FYI, I am carrying the punch-hole i_mutex change in the patch set "Add
invalidatepage_range address space operation", but I am more than
happy to drop it and let your patch fix this instead.
Thanks!
-Lukas
>
> Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
> ---
> fs/ext4/extents.c | 41 +++++++++++++++++++++++++----------------
> 1 files changed, 25 insertions(+), 16 deletions(-)
>
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 44e33b0..0e94485 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4814,9 +4814,22 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
> loff_t first_page_offset, last_page_offset;
> int credits, err = 0;
>
> + /*
> + * Write out all dirty pages to avoid race conditions
> + * Then release them.
> + */
> + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
> + err = filemap_write_and_wait_range(mapping,
> + offset, offset + length - 1);
> +
> + if (err)
> + return err;
> + }
> +
> + mutex_lock(&inode->i_mutex);
> /* No need to punch hole beyond i_size */
> if (offset >= inode->i_size)
> - return 0;
> + goto out_mutex;
>
> /*
> * If the hole extends beyond i_size, set the hole
> @@ -4834,31 +4847,23 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
> first_page_offset = first_page << PAGE_CACHE_SHIFT;
> last_page_offset = last_page << PAGE_CACHE_SHIFT;
>
> - /*
> - * Write out all dirty pages to avoid race conditions
> - * Then release them.
> - */
> - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
> - err = filemap_write_and_wait_range(mapping,
> - offset, offset + length - 1);
> -
> - if (err)
> - return err;
> - }
> -
> /* Now release the pages */
> if (last_page_offset > first_page_offset) {
> truncate_pagecache_range(inode, first_page_offset,
> last_page_offset - 1);
> }
>
> - /* finish any pending end_io work */
> + /* Wait all existing dio workers, newcomers will block on i_mutex */
> + ext4_inode_block_unlocked_dio(inode);
> + inode_dio_wait(inode);
> ext4_flush_completed_IO(inode);
>
> credits = ext4_writepage_trans_blocks(inode);
> handle = ext4_journal_start(inode, credits);
> - if (IS_ERR(handle))
> - return PTR_ERR(handle);
> + if (IS_ERR(handle)) {
> + err = PTR_ERR(handle);
> + goto out_dio;
> + }
>
> err = ext4_orphan_add(handle, inode);
> if (err)
> @@ -4952,6 +4957,10 @@ out:
> inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
> ext4_mark_inode_dirty(handle, inode);
> ext4_journal_stop(handle);
> +out_dio:
> + ext4_inode_resume_unlocked_dio(inode);
> +out_mutex:
> + mutex_unlock(&inode->i_mutex);
> return err;
> }
> int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>
* [PATCH 5/9] ext4: serialize unlocked dio reads with truncate
2012-09-13 15:01 [PATCH 0/9] ext4: Bunch of DIO/AIO fixes V2 Dmitry Monakhov
` (3 preceding siblings ...)
2012-09-13 15:01 ` [PATCH 4/9] ext4: punch_hole should wait for DIO writers Dmitry Monakhov
@ 2012-09-13 15:01 ` Dmitry Monakhov
2012-09-13 15:01 ` [PATCH 6/9] ext4: endless truncate due to nonlocked dio readers V2 Dmitry Monakhov
` (3 subsequent siblings)
8 siblings, 0 replies; 12+ messages in thread
From: Dmitry Monakhov @ 2012-09-13 15:01 UTC (permalink / raw)
To: linux-ext4; +Cc: tytso, jack, wenqing.lz, Dmitry Monakhov
The current serialization works only for DIO which holds
i_mutex; with nonlocked DIO the following race is possible:
dio_nolock_read_task                  truncate_task
                                       ->ext4_setattr()
                                        ->inode_dio_wait()
 ->ext4_ext_direct_IO
  ->ext4_ind_direct_IO
   ->__blockdev_direct_IO
    ->ext4_get_block
                                        ->truncate_setsize()
                                        ->ext4_truncate()
                                        #alloc truncated blocks
                                        #to other inode
    ->submit_io()
    #INFORMATION LEAK
In order to serialize with unlocked DIO reads we have to
rearrange the wait sequence:
1) update i_size first
2) if i_size is about to be reduced, wait for outstanding DIO requests
3) only after that truncate the inode blocks
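In sketch form, the shrinking-truncate path in ext4_setattr() becomes (matching
the diff below):

    if (attr->ia_valid & ATTR_SIZE) {
            if (attr->ia_size != inode->i_size) {
                    truncate_setsize(inode, attr->ia_size);  /* 1) new i_size visible  */
                    if (orphan)                              /* 2) size is shrinking:  */
                            inode_dio_wait(inode);           /*    drain in-flight DIO */
            }
            ext4_truncate(inode);                            /* 3) free the blocks     */
    }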
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
fs/ext4/inode.c | 7 +++++--
1 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c34d38c..658f649 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4283,7 +4283,6 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
if (attr->ia_valid & ATTR_SIZE) {
- inode_dio_wait(inode);
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -4332,8 +4331,12 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
if (attr->ia_valid & ATTR_SIZE) {
- if (attr->ia_size != i_size_read(inode))
+ if (attr->ia_size != inode->i_size) {
truncate_setsize(inode, attr->ia_size);
+ /* Inode size will be reduced, wait for dio in flight */
+ if (orphan)
+ inode_dio_wait(inode);
+ }
ext4_truncate(inode);
}
--
1.7.7.6
* [PATCH 6/9] ext4: endless truncate due to nonlocked dio readers V2
2012-09-13 15:01 [PATCH 0/9] ext4: Bunch of DIO/AIO fixes V2 Dmitry Monakhov
` (4 preceding siblings ...)
2012-09-13 15:01 ` [PATCH 5/9] ext4: serialize unlocked dio reads with truncate Dmitry Monakhov
@ 2012-09-13 15:01 ` Dmitry Monakhov
2012-09-13 15:01 ` [PATCH 7/9] ext4: serialize truncate with overwrite DIO workers V2 Dmitry Monakhov
` (2 subsequent siblings)
8 siblings, 0 replies; 12+ messages in thread
From: Dmitry Monakhov @ 2012-09-13 15:01 UTC (permalink / raw)
To: linux-ext4; +Cc: tytso, jack, wenqing.lz, Dmitry Monakhov
If we have sufficiently aggressive DIO readers, truncate and other dio
waiters will wait forever inside inode_dio_wait(). It is reasonable
to disable the nonlocked DIO read optimization during truncate.
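With this change the wait introduced in the previous patch is bracketed by the
block/resume helpers from patch 3, so a steady stream of nolock readers can no
longer keep i_dio_count elevated forever (sketch of the resulting code, as in
the diff below):

    if (orphan) {
            ext4_inode_block_unlocked_dio(inode);  /* new nolock readers back off */
            inode_dio_wait(inode);                 /* existing readers drain      */
            ext4_inode_resume_unlocked_dio(inode); /* re-enable the fast path     */
    }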
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
fs/ext4/inode.c | 8 ++++++--
1 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 658f649..a295b3a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4333,9 +4333,13 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_valid & ATTR_SIZE) {
if (attr->ia_size != inode->i_size) {
truncate_setsize(inode, attr->ia_size);
- /* Inode size will be reduced, wait for dio in flight */
- if (orphan)
+ /* Inode size will be reduced, wait for dio in flight.
+ * Temproraly disable unlocked DIO to prevent livelock */
+ if (orphan) {
+ ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
+ ext4_inode_resume_unlocked_dio(inode);
+ }
}
ext4_truncate(inode);
}
--
1.7.7.6
* [PATCH 7/9] ext4: serialize truncate with overwrite DIO workers V2
2012-09-13 15:01 [PATCH 0/9] ext4: Bunch of DIO/AIO fixes V2 Dmitry Monakhov
` (5 preceding siblings ...)
2012-09-13 15:01 ` [PATCH 6/9] ext4: endless truncate due to nonlocked dio readers V2 Dmitry Monakhov
@ 2012-09-13 15:01 ` Dmitry Monakhov
2012-09-13 15:01 ` [PATCH 8/9] ext4: fix ext_remove_space for punch_hole case Dmitry Monakhov
2012-09-13 15:01 ` [PATCH 9/9] ext4: fix ext4_ext_remove_space tree traversal Dmitry Monakhov
8 siblings, 0 replies; 12+ messages in thread
From: Dmitry Monakhov @ 2012-09-13 15:01 UTC (permalink / raw)
To: linux-ext4; +Cc: tytso, jack, wenqing.lz, Dmitry Monakhov
Jan Kara has spotted an interesting issue:
there is a potential data corruption issue with direct IO overwrites
racing with truncate, like the following:
dio write                                 truncate_task
 ->ext4_ext_direct_IO
  ->overwrite == 1
   ->down_read(&EXT4_I(inode)->i_data_sem);
   ->mutex_unlock(&inode->i_mutex);
                                           ->ext4_setattr()
                                            ->inode_dio_wait()
                                            ->truncate_setsize()
                                            ->ext4_truncate()
                                             ->down_write(&EXT4_I(inode)->i_data_sem);
   ->__blockdev_direct_IO
    ->ext4_get_block
    ->submit_io()
   ->up_read(&EXT4_I(inode)->i_data_sem);
                                             # truncate data blocks, allocate them to
                                             # other inode - bad stuff happens because
                                             # dio is still in flight.
In order to serialize with truncate, the dio worker should grab an extra
i_dio_count reference before dropping i_mutex.
Changes against V1:
- wake up dio waiters before retaking i_mutex.
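In sketch form, the overwrite DIO path now pins itself in i_dio_count before
releasing i_mutex, so inode_dio_wait() on the truncate side cannot return while
the overwrite is still mapping and submitting blocks (condensed from the diff
below):

    if (overwrite) {
            atomic_inc(&inode->i_dio_count);        /* visible to inode_dio_wait() */
            down_read(&EXT4_I(inode)->i_data_sem);
            mutex_unlock(&inode->i_mutex);          /* safe: we hold a dio ref     */
    }

    ret = __blockdev_direct_IO(...);                /* may call ext4_get_block()   */

    retake_lock:
    if (overwrite) {
            inode_dio_done(inode);                  /* wake dio waiters first ...  */
            up_read(&EXT4_I(inode)->i_data_sem);
            mutex_lock(&inode->i_mutex);            /* ... then retake i_mutex     */
    }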
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
fs/ext4/inode.c | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a295b3a..ce4d44a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3014,6 +3014,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
overwrite = *((int *)iocb->private);
if (overwrite) {
+ atomic_inc(&inode->i_dio_count);
down_read(&EXT4_I(inode)->i_data_sem);
mutex_unlock(&inode->i_mutex);
}
@@ -3111,6 +3112,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
retake_lock:
/* take i_mutex locking again if we do a ovewrite dio */
if (overwrite) {
+ inode_dio_done(inode);
up_read(&EXT4_I(inode)->i_data_sem);
mutex_lock(&inode->i_mutex);
}
--
1.7.7.6
* [PATCH 8/9] ext4: fix ext_remove_space for punch_hole case
2012-09-13 15:01 [PATCH 0/9] ext4: Bunch of DIO/AIO fixes V2 Dmitry Monakhov
` (6 preceding siblings ...)
2012-09-13 15:01 ` [PATCH 7/9] ext4: serialize truncate with owerwrite DIO workers V2 Dmitry Monakhov
@ 2012-09-13 15:01 ` Dmitry Monakhov
2012-09-13 15:01 ` [PATCH 9/9] ext4: fix ext4_ext_remove_space tree traversal Dmitry Monakhov
8 siblings, 0 replies; 12+ messages in thread
From: Dmitry Monakhov @ 2012-09-13 15:01 UTC (permalink / raw)
To: linux-ext4; +Cc: tytso, jack, wenqing.lz, Dmitry Monakhov
An inode is allowed to have an empty leaf only if it is a blockless inode,
i.e. (depth == 0).
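The check this adds is small enough to show in isolation (taken from the diff
below): a missing leaf is tolerated only at depth 0, anything else is treated as
on-disk corruption:

    ex = path[depth].p_ext;
    if (!ex) {
            if (depth) {
                    /* a leaf may legitimately be absent only for depth == 0 */
                    EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
                    err = -EIO;
            }
            goto out;
    }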
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
fs/ext4/extents.c | 11 +++++++----
1 files changed, 7 insertions(+), 4 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0e94485..af2cc76 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2649,12 +2649,15 @@ again:
return PTR_ERR(path);
}
depth = ext_depth(inode);
+ /* Leaf not may not exist only if inode has no blocks at all */
ex = path[depth].p_ext;
if (!ex) {
- ext4_ext_drop_refs(path);
- kfree(path);
- path = NULL;
- goto cont;
+ if (depth) {
+ EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL",
+ depth);
+ err = -EIO;
+ }
+ goto out;
}
ee_block = le32_to_cpu(ex->ee_block);
--
1.7.7.6
* [PATCH 9/9] ext4: fix ext4_ext_remove_space tree traversal
2012-09-13 15:01 [PATCH 0/9] ext4: Bunch of DIO/AIO fixes V2 Dmitry Monakhov
` (7 preceding siblings ...)
2012-09-13 15:01 ` [PATCH 8/9] ext4: fix ext_remove_space for punch_hole case Dmitry Monakhov
@ 2012-09-13 15:01 ` Dmitry Monakhov
8 siblings, 0 replies; 12+ messages in thread
From: Dmitry Monakhov @ 2012-09-13 15:01 UTC (permalink / raw)
To: linux-ext4; +Cc: tytso, jack, wenqing.lz, Dmitry Monakhov
If ext4_ext_rm_leaf() restarted the transaction, we should restart the loop
from the top because i_data_sem was internally dropped,
but the (i = 0) statement was moved out of the loop in commit
968dee77220768a5, which results in a NULL pointer dereference.
This patch fixes the tree-walking procedure by moving the 'i' and 'depth'
initialization inside the loop body. It also performs a code cleanup in order
to have better code-flow separation for the truncate and punch_hole
cases.
Originally I found this with a very specific test, but it can easily be
reproduced via fsstress.
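Condensed, the restructured walk looks like this (assembled from the diff below;
the per-iteration state is recomputed at the top of the 'again:' label, and the
two entry cases -- punch_hole with an existing path vs. truncate allocating a
fresh one -- are separated cleanly):

    again:
            err = 0;
            depth = ext_depth(inode);       /* re-read: the tree may have changed */
            ext4_ext_invalidate_cache(inode);

            if (end < EXT_MAX_BLOCKS - 1) {
                    /* punch_hole case: find (and maybe split) the end extent */
                    path = ext4_ext_find_extent(inode, end, NULL);
                    /* ... split at the end boundary if needed ... */
                    i = k = depth;          /* start the walk at the leaf level */
                    while (--k > 0)
                            path[k].p_block =
                                    le16_to_cpu(path[k].p_hdr->eh_entries) + 1;
            } else {
                    /* truncate case: scan from the right, depth-wise */
                    i = 0;
                    path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
                                   GFP_NOFS);
                    /* ... initialize path[0] from the inode ... */
            }

            while (i >= 0 && err == 0) {
                    /* walk the tree; if ext4_ext_rm_leaf() restarted the
                     * transaction we 'goto again' and recompute i/depth above */
            }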
BUG: unable to handle kernel NULL pointer dereference at 0000000000000028
IP: [<ffffffffa01b4ebd>] ext4_ext_remove_space+0x8e6/0xc4f [ext4]
PGD fe763c067 PUD 101e5a4067 PMD 0
Oops: 0000 [#1] SMP
Modules linked in: brd netconsole configfs cpufreq_ondemand acpi_cpufreq freq_table mperf ext4 jbd2 kvm_intel kvm microcode lpc_ich mfd_core i7300_idle\
i_transport_fc scsi_tgt
CPU 2
Pid: 9930, comm: unlink Not tainted 3.6.0-rc1+ #35 Intel MP Server/S7000FC4UR
RIP: 0010:[<ffffffffa01b4ebd>] [<ffffffffa01b4ebd>] ext4_ext_remove_space+0x8e6/0xc4f [ext4]
RSP: 0018:ffff88101f0ffcb8 EFLAGS: 00010246
RAX: 0000000000000000 RBX: ffff88100b398190 RCX: ffff88100b398030
RDX: 0000000000000001 RSI: 0000000000000002 RDI: 0000000000a00000
RBP: ffff88101f0ffd98 R08: 0000000000a00000 R09: 00019b0d6466e2b3
R10: 0000000000000367 R11: ffff88101f0ffb38 R12: ffff88101fb13d70
R13: 0000000000000000 R14: ffff88101fb13d40 R15: 0000000000000000
FS: 00007f603bbc7700(0000) GS:ffff88103ba00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000028 CR3: 0000000fe7639000 CR4: 00000000000007e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process unlink (pid: 9930, threadinfo ffff88101f0fe000, task ffff88101b26c620)
Stack:
ffff88101f0ffcc8 ffffffffa018ddd4 ffff88101f0ffd28 ffffffffa0192461
ffff881000007800 00000000fffffff5 ffffffff0b398000 0000000000547fff
ffff88100b398000 ffff88100b398190 ffff8810203d0000 ffff88100ea0900c
Call Trace:
[<ffffffffa018ddd4>] ? brelse+0xe/0x10 [ext4]
[<ffffffffa0192461>] ? ext4_mark_iloc_dirty+0x50c/0x56f [ext4]
[<ffffffffa01b6cb3>] ext4_ext_truncate+0xd8/0x184 [ext4]
[<ffffffffa019416c>] ? ext4_evict_inode+0x1c2/0x358 [ext4]
[<ffffffffa01904fb>] ext4_truncate+0xdb/0x158 [ext4]
[<ffffffffa01941f7>] ext4_evict_inode+0x24d/0x358 [ext4]
[<ffffffffa0193faa>] ? ext4_da_writepages+0x54e/0x54e [ext4]
[<ffffffff8114b2e8>] evict+0xa1/0x15b
[<ffffffff8114b58a>] iput+0x1a3/0x1ac
[<ffffffff811410cd>] do_unlinkat+0xff/0x15a
[<ffffffff8108e48b>] ? trace_hardirqs_on_caller+0x151/0x197
[<ffffffff810b11a5>] ? __audit_syscall_entry+0x11f/0x14b
[<ffffffff8121e4de>] ? trace_hardirqs_on_thunk+0x3a/0x3f
[<ffffffff81142b2d>] sys_unlink+0x16/0x18
[<ffffffff8145bca9>] system_call_fastpath+0x16/0x1b
Code: ff 4c 63 65 b0 4d 6b e4 30 4c 03 65 a8 e9 08 01 00 00 48 63 55 b0 4c 6b e2 30 4c 03 65 a8 49 83 7c 24 20 00 75 0e 49 8b 44 24 28 <48> 8b 40 28 49\
8 85 c0 75 22 49 8b
RIP [<ffffffffa01b4ebd>] ext4_ext_remove_space+0x8e6/0xc4f [ext4]
RSP <ffff88101f0ffcb8>
CR2: 0000000000000028
---[ end trace 07fcb23f8e07b495 ]---
#ORIGINAL_TESTCASE(huge hosts only):
modprobe brd rd_size=$((40*1024*1024)) rd_nr=1
mkfs.ext4 /dev/ram0
mount /dev/ram0 $MNT
fallocate -l $((32*1024*1024*1024)) $MNT/file
fio random_write2.fio
unlink $MNT/file
umount $MNT
fsck.ext4 -f /dev/ram0
### fio random_write2.fio job file
[random-writers]
ioengine=libaio
iodepth=256
rw=randwrite
bs=32k
direct=1
directory=/mnt
nrfiles=1
filename=file
filesize=32G
size=8G
group_reporting
numjobs=24
### fio file end
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
fs/ext4/extents.c | 25 +++++++++++--------------
1 files changed, 11 insertions(+), 14 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index af2cc76..5c1d313 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2616,7 +2616,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
struct ext4_ext_path *path = NULL;
ext4_fsblk_t partial_cluster = 0;
handle_t *handle;
- int i = 0, err;
+ int i, err;
BUG_ON(atomic_read(&EXT4_I(inode)->i_aiodio_unwritten));
ext_debug("truncate since %u to %u\n", start, end);
@@ -2627,8 +2627,9 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
return PTR_ERR(handle);
again:
+ err = 0;
+ depth = ext_depth(inode);
ext4_ext_invalidate_cache(inode);
-
trace_ext4_ext_remove_space(inode, start, depth);
/*
@@ -2641,6 +2642,7 @@ again:
if (end < EXT_MAX_BLOCKS - 1) {
struct ext4_extent *ex;
ext4_lblk_t ee_block;
+ int k;
/* find extent for this block */
path = ext4_ext_find_extent(inode, end, NULL);
@@ -2648,7 +2650,6 @@ again:
ext4_journal_stop(handle);
return PTR_ERR(path);
}
- depth = ext_depth(inode);
/* Leaf not may not exist only if inode has no blocks at all */
ex = path[depth].p_ext;
if (!ex) {
@@ -2688,20 +2689,17 @@ again:
if (err < 0)
goto out;
}
- }
-cont:
-
- /*
- * We start scanning from right side, freeing all the blocks
- * after i_size and walking into the tree depth-wise.
- */
- depth = ext_depth(inode);
- if (path) {
- int k = i = depth;
+ i = k = depth;
while (--k > 0)
path[k].p_block =
le16_to_cpu(path[k].p_hdr->eh_entries)+1;
} else {
+ /*
+ * We start scanning from right side, freeing all the blocks
+ * after i_size and walking into the tree depth-wise.
+ */
+
+ i = 0;
path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
GFP_NOFS);
if (path == NULL) {
@@ -2716,7 +2714,6 @@ cont:
goto out;
}
}
- err = 0;
while (i >= 0 && err == 0) {
if (i == depth) {
--
1.7.7.6