* [PATCH 1/9] ext4: ext4_inode_info diet
2012-09-13 15:01 [PATCH 0/9] ext4: Bunch of DIO/AIO fixes V2 Dmitry Monakhov
@ 2012-09-13 15:01 ` Dmitry Monakhov
2012-09-13 15:01 ` [PATCH 2/9] ext4: completed_io locking cleanup V2 Dmitry Monakhov
` (7 subsequent siblings)
8 siblings, 0 replies; 12+ messages in thread
From: Dmitry Monakhov @ 2012-09-13 15:01 UTC (permalink / raw)
To: linux-ext4; +Cc: tytso, jack, wenqing.lz, Dmitry Monakhov
The generic inode has an unused i_private pointer which can be used as cur_aio_dio
storage.
TODO: if cur_aio_dio is passed as an argument to get_block_t, this will allow
concurrent AIO/DIO requests.
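For readers skimming the series, the pattern the diff below introduces is just a
one-line accessor macro over the generic i_private field plus a cast at the read
sites. A condensed sketch (names taken directly from the patch):

    /* ext4.h: reuse the otherwise unused generic field */
    #define EXT4_CUR_AIO_DIO(inode)  (inode)->i_private

    /* writer side (ext4_ext_direct_IO): publish/clear the current io_end */
    EXT4_CUR_AIO_DIO(inode) = iocb->private;  /* before __blockdev_direct_IO() */
    EXT4_CUR_AIO_DIO(inode) = NULL;           /* once the request is submitted */

    /* reader side (ext4_ext_map_blocks and friends) */
    ext4_io_end_t *io = (ext4_io_end_t *) EXT4_CUR_AIO_DIO(inode);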
Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
fs/ext4/ext4.h | 5 +++--
fs/ext4/extents.c | 4 ++--
fs/ext4/inode.c | 6 +++---
fs/ext4/super.c | 1 -
4 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f9024a6..b3f3c55 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -912,8 +912,6 @@ struct ext4_inode_info {
struct list_head i_completed_io_list;
spinlock_t i_completed_io_lock;
atomic_t i_ioend_count; /* Number of outstanding io_end structs */
- /* current io_end structure for async DIO write*/
- ext4_io_end_t *cur_aio_dio;
atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
spinlock_t i_block_reservation_lock;
@@ -929,6 +927,9 @@ struct ext4_inode_info {
__u32 i_csum_seed;
};
+/* current io_end structure for async DIO write */
+#define EXT4_CUR_AIO_DIO(inode) (inode)->i_private
+
/*
* File system states
*/
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 10f0afd..e993879 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3644,7 +3644,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
{
int ret = 0;
int err = 0;
- ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+ ext4_io_end_t *io = (ext4_io_end_t*) EXT4_CUR_AIO_DIO(inode);
ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
"block %llu, max_blocks %u, flags %x, allocated %u\n",
@@ -3902,7 +3902,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
unsigned int allocated = 0, offset = 0;
unsigned int allocated_clusters = 0;
struct ext4_allocation_request ar;
- ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+ ext4_io_end_t *io = (ext4_io_end_t*) EXT4_CUR_AIO_DIO(inode);
ext4_lblk_t cluster_offset;
ext_debug("blocks %u/%u requested for inode %lu\n",
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d12d30e..202ae3f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3060,7 +3060,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
* hook to the iocb.
*/
iocb->private = NULL;
- EXT4_I(inode)->cur_aio_dio = NULL;
+ EXT4_CUR_AIO_DIO(inode) = NULL;
if (!is_sync_kiocb(iocb)) {
ext4_io_end_t *io_end =
ext4_init_io_end(inode, GFP_NOFS);
@@ -3077,7 +3077,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
* is a unwritten extents needs to be converted
* when IO is completed.
*/
- EXT4_I(inode)->cur_aio_dio = iocb->private;
+ EXT4_CUR_AIO_DIO(inode) = iocb->private;
}
if (overwrite)
@@ -3097,7 +3097,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
NULL,
DIO_LOCKING);
if (iocb->private)
- EXT4_I(inode)->cur_aio_dio = NULL;
+ EXT4_CUR_AIO_DIO(inode) = NULL;
/*
* The io_end structure takes a reference to the inode,
* that structure needs to be destroyed and the
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4a7092b..a6904b3 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -967,7 +967,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->jinode = NULL;
INIT_LIST_HEAD(&ei->i_completed_io_list);
spin_lock_init(&ei->i_completed_io_lock);
- ei->cur_aio_dio = NULL;
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
atomic_set(&ei->i_ioend_count, 0);
--
1.7.7.6
* [PATCH 2/9] ext4: completed_io locking cleanup V2
2012-09-13 15:01 [PATCH 0/9] ext4: Bunch of DIO/AIO fixes V2 Dmitry Monakhov
2012-09-13 15:01 ` [PATCH 1/9] ext4: ext4_inode_info diet Dmitry Monakhov
@ 2012-09-13 15:01 ` Dmitry Monakhov
2012-09-20 18:56 ` Dmitry Monakhov
2012-09-13 15:01 ` [PATCH 3/9] ext4: serialize dio nonlocked reads with defrag workers V3 Dmitry Monakhov
` (6 subsequent siblings)
8 siblings, 1 reply; 12+ messages in thread
From: Dmitry Monakhov @ 2012-09-13 15:01 UTC (permalink / raw)
To: linux-ext4; +Cc: tytso, jack, wenqing.lz, Dmitry Monakhov
The current unwritten extent conversion state machine is very fuzzy.
- For no obvious reason it performs the conversion under i_mutex. What for?
The only thing i_mutex is used for here is to protect io->flags modification,
but only flush_completed_IO and the work item modify these flags, and
we can serialize them via i_completed_io_lock.
All these games with mutex_trylock result in the following deadlock:
truncate:                            kworker:
 ext4_setattr                         ext4_end_io_work
  mutex_lock(i_mutex)
  inode_dio_wait(inode)  ->BLOCK
             DEADLOCK<-               mutex_trylock()
                                      inode_dio_done()
#TEST_CASE1_BEGIN
MNT=/mnt_scrach
unlink $MNT/file
fallocate -l $((1024*1024*1024)) $MNT/file
aio-stress -I 100000 -O -s 100m -n -t 1 -c 10 -o 2 -o 3 $MNT/file
sleep 2
truncate -s 0 $MNT/file
#TEST_CASE1_END
This patch makes the state machine simple and clean:
(1) ext4_flush_completed_IO is responsible for handling all pending
end_io entries from ei->i_completed_io_list (the per-inode list)
NOTE1: i_completed_io_lock is acquired only once
NOTE2: i_mutex is not required because the function does not touch
any data guarded by i_mutex
(2) xxx_end_io schedules end_io context completion simply by pushing it
onto the inode's list.
NOTE1: because of (1), the work item has to be queued only if
->i_completed_io_list was empty at that moment; otherwise the
work is already scheduled.
(3) No one may free an inode's blocks while pending io_completions
exist; otherwise this may result in blocks beyond EOF.
- Remove the now-useless EXT4_IO_END_QUEUED and EXT4_IO_END_IN_FSYNC flags
- Improve SMP scalability by removing the i_mutex locking, which no longer
protects io->flags anyway.
- Reduce lock contention on i_completed_io_lock by optimizing the list walk.
- Move open-coded logic from the various xx_end_xx routines into ext4_add_complete_io()
- Add a BUG_ON to ext4_ext_remove_space() in order to assert (3)
Changes since V1:
Add the BUG_ON assertion according to Jan's comment
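In short, the new scheme is: splice the list under the spinlock, process it with
no locks held, and kick the worker only on the empty-to-non-empty transition. A
condensed sketch of the two central functions from the diff below (debug output
and error bookkeeping trimmed):

    /* page-io.c: every producer funnels through this */
    void ext4_add_complete_io(ext4_io_end_t *io_end)
    {
            struct ext4_inode_info *ei = EXT4_I(io_end->inode);
            unsigned long flags;

            spin_lock_irqsave(&ei->i_completed_io_lock, flags);
            if (list_empty(&ei->i_completed_io_list))   /* first entry ...      */
                    queue_work(EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq,
                               &io_end->work);          /* ... kicks the worker */
            list_add_tail(&io_end->list, &ei->i_completed_io_list);
            spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
    }

    /* fsync.c: the single consumer (also called from the work item) */
    int ext4_flush_completed_IO(struct inode *inode)
    {
            struct ext4_inode_info *ei = EXT4_I(inode);
            struct list_head complete_list;
            unsigned long flags;
            ext4_io_end_t *io;
            int err, ret = 0;

            spin_lock_irqsave(&ei->i_completed_io_lock, flags);
            list_replace_init(&ei->i_completed_io_list, &complete_list);
            spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);

            while (!list_empty(&complete_list)) {
                    io = list_entry(complete_list.next, ext4_io_end_t, list);
                    list_del_init(&io->list);
                    err = ext4_end_io_nolock(io);   /* convert unwritten extents */
                    ext4_free_io_end(io);
                    if (err && !ret)
                            ret = err;
            }
            return ret;
    }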
Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
fs/ext4/ext4.h | 5 +--
fs/ext4/extents.c | 1 +
fs/ext4/fsync.c | 47 +++++++++++---------------------
fs/ext4/indirect.c | 6 +---
fs/ext4/inode.c | 25 +---------------
fs/ext4/page-io.c | 76 ++++++++++++++-------------------------------------
6 files changed, 44 insertions(+), 116 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b3f3c55..be976ca 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -184,9 +184,7 @@ struct mpage_da_data {
*/
#define EXT4_IO_END_UNWRITTEN 0x0001
#define EXT4_IO_END_ERROR 0x0002
-#define EXT4_IO_END_QUEUED 0x0004
-#define EXT4_IO_END_DIRECT 0x0008
-#define EXT4_IO_END_IN_FSYNC 0x0010
+#define EXT4_IO_END_DIRECT 0x0004
struct ext4_io_page {
struct page *p_page;
@@ -2405,6 +2403,7 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
/* page-io.c */
extern int __init ext4_init_pageio(void);
+extern void ext4_add_complete_io(ext4_io_end_t *io_end);
extern void ext4_exit_pageio(void);
extern void ext4_ioend_wait(struct inode *);
extern void ext4_free_io_end(ext4_io_end_t *io);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e993879..44e33b0 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2618,6 +2618,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
handle_t *handle;
int i = 0, err;
+ BUG_ON(atomic_read(&EXT4_I(inode)->i_aiodio_unwritten));
ext_debug("truncate since %u to %u\n", start, end);
/* probably first extent we're gonna free will be last in block */
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 323eb15..24f3719 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -73,46 +73,31 @@ static void dump_completed_IO(struct inode * inode)
* might needs to do the conversion. This function walks through
* the list and convert the related unwritten extents for completed IO
* to written.
- * The function return the number of pending IOs on success.
+ * The function return first error;
*/
int ext4_flush_completed_IO(struct inode *inode)
{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ unsigned long flags;
+ struct list_head complete_list;
+ int err, ret = 0;
ext4_io_end_t *io;
- struct ext4_inode_info *ei = EXT4_I(inode);
- unsigned long flags;
- int ret = 0;
- int ret2 = 0;
dump_completed_IO(inode);
+
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- while (!list_empty(&ei->i_completed_io_list)){
- io = list_entry(ei->i_completed_io_list.next,
- ext4_io_end_t, list);
+ list_replace_init(&ei->i_completed_io_list, &complete_list);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+ while(!list_empty(&complete_list)) {
+ io = list_entry(complete_list.next, ext4_io_end_t, list);
list_del_init(&io->list);
- io->flag |= EXT4_IO_END_IN_FSYNC;
- /*
- * Calling ext4_end_io_nolock() to convert completed
- * IO to written.
- *
- * When ext4_sync_file() is called, run_queue() may already
- * about to flush the work corresponding to this io structure.
- * It will be upset if it founds the io structure related
- * to the work-to-be schedule is freed.
- *
- * Thus we need to keep the io structure still valid here after
- * conversion finished. The io structure has a flag to
- * avoid double converting from both fsync and background work
- * queue work.
- */
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- ret = ext4_end_io_nolock(io);
- if (ret < 0)
- ret2 = ret;
- spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- io->flag &= ~EXT4_IO_END_IN_FSYNC;
+ err = ext4_end_io_nolock(io);
+ ext4_free_io_end(io);
+ if (unlikely(err && !ret))
+ ret = err;
}
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- return (ret2 < 0) ? ret2 : 0;
+ return ret;
}
/*
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 830e1b2..61f13e5 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -807,11 +807,9 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
retry:
if (rw == READ && ext4_should_dioread_nolock(inode)) {
- if (unlikely(!list_empty(&ei->i_completed_io_list))) {
- mutex_lock(&inode->i_mutex);
+ if (unlikely(!list_empty(&ei->i_completed_io_list)))
ext4_flush_completed_IO(inode);
- mutex_unlock(&inode->i_mutex);
- }
+
ret = __blockdev_direct_IO(rw, iocb, inode,
inode->i_sb->s_bdev, iov,
offset, nr_segs,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 202ae3f..762b955 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2885,9 +2885,6 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
{
struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
ext4_io_end_t *io_end = iocb->private;
- struct workqueue_struct *wq;
- unsigned long flags;
- struct ext4_inode_info *ei;
/* if not async direct IO or dio with 0 bytes write, just return */
if (!io_end || !size)
@@ -2916,24 +2913,14 @@ out:
io_end->iocb = iocb;
io_end->result = ret;
}
- wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
-
- /* Add the io_end to per-inode completed aio dio list*/
- ei = EXT4_I(io_end->inode);
- spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- list_add_tail(&io_end->list, &ei->i_completed_io_list);
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- /* queue the work to convert unwritten extents to written */
- queue_work(wq, &io_end->work);
+ ext4_add_complete_io(io_end);
}
static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
{
ext4_io_end_t *io_end = bh->b_private;
- struct workqueue_struct *wq;
struct inode *inode;
- unsigned long flags;
if (!test_clear_buffer_uninit(bh) || !io_end)
goto out;
@@ -2952,15 +2939,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
*/
inode = io_end->inode;
ext4_set_io_unwritten_flag(inode, io_end);
-
- /* Add the io_end to per-inode completed io list*/
- spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
- list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
- spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-
- wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
- /* queue the work to convert unwritten extents to written */
- queue_work(wq, &io_end->work);
+ ext4_add_complete_io(io_end);
out:
bh->b_private = NULL;
bh->b_end_io = NULL;
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index dcdeef1..c369419 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -57,6 +57,22 @@ void ext4_ioend_wait(struct inode *inode)
wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
}
+/* Add the io_end to per-inode completed aio dio list. */
+void ext4_add_complete_io(ext4_io_end_t *io_end)
+{
+ struct ext4_inode_info *ei = EXT4_I(io_end->inode);
+ struct workqueue_struct *wq;
+ unsigned long flags;
+
+ wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ if (list_empty(&ei->i_completed_io_list))
+ queue_work(wq, &io_end->work);
+ list_add_tail(&io_end->list, &ei->i_completed_io_list);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+}
+
static void put_io_page(struct ext4_io_page *io_page)
{
if (atomic_dec_and_test(&io_page->p_count)) {
@@ -81,12 +97,7 @@ void ext4_free_io_end(ext4_io_end_t *io)
kmem_cache_free(io_end_cachep, io);
}
-/*
- * check a range of space and convert unwritten extents to written.
- *
- * Called with inode->i_mutex; we depend on this when we manipulate
- * io->flag, since we could otherwise race with ext4_flush_completed_IO()
- */
+/* check a range of space and convert unwritten extents to written. */
int ext4_end_io_nolock(ext4_io_end_t *io)
{
struct inode *inode = io->inode;
@@ -94,6 +105,8 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
ssize_t size = io->size;
int ret = 0;
+ BUG_ON(!list_empty(&io->list));
+
ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
"list->prev 0x%p\n",
io, inode->i_ino, io->list.next, io->list.prev);
@@ -124,45 +137,7 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
static void ext4_end_io_work(struct work_struct *work)
{
ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
- struct inode *inode = io->inode;
- struct ext4_inode_info *ei = EXT4_I(inode);
- unsigned long flags;
-
- spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- if (io->flag & EXT4_IO_END_IN_FSYNC)
- goto requeue;
- if (list_empty(&io->list)) {
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- goto free;
- }
-
- if (!mutex_trylock(&inode->i_mutex)) {
- bool was_queued;
-requeue:
- was_queued = !!(io->flag & EXT4_IO_END_QUEUED);
- io->flag |= EXT4_IO_END_QUEUED;
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- /*
- * Requeue the work instead of waiting so that the work
- * items queued after this can be processed.
- */
- queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work);
- /*
- * To prevent the ext4-dio-unwritten thread from keeping
- * requeueing end_io requests and occupying cpu for too long,
- * yield the cpu if it sees an end_io request that has already
- * been requeued.
- */
- if (was_queued)
- yield();
- return;
- }
- list_del_init(&io->list);
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- (void) ext4_end_io_nolock(io);
- mutex_unlock(&inode->i_mutex);
-free:
- ext4_free_io_end(io);
+ (void) ext4_flush_completed_IO(io->inode);
}
ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
@@ -195,9 +170,7 @@ static void buffer_io_error(struct buffer_head *bh)
static void ext4_end_bio(struct bio *bio, int error)
{
ext4_io_end_t *io_end = bio->bi_private;
- struct workqueue_struct *wq;
struct inode *inode;
- unsigned long flags;
int i;
sector_t bi_sector = bio->bi_sector;
@@ -255,14 +228,7 @@ static void ext4_end_bio(struct bio *bio, int error)
return;
}
- /* Add the io_end to per-inode completed io list*/
- spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
- list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
- spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
* Re: [PATCH 2/9] ext4: completed_io locking cleanup V2
2012-09-13 15:01 ` [PATCH 2/9] ext4: completed_io locking cleanup V2 Dmitry Monakhov
@ 2012-09-20 18:56 ` Dmitry Monakhov
0 siblings, 0 replies; 12+ messages in thread
From: Dmitry Monakhov @ 2012-09-20 18:56 UTC (permalink / raw)
To: linux-ext4; +Cc: tytso, jack, wenqing.lz
On Thu, 13 Sep 2012 19:01:07 +0400, Dmitry Monakhov <dmonakhov@openvz.org> wrote:
> Current unwritten extent conversion state-machine is very fuzzy.
> - By unknown reason it want perform conversion under i_mutex. What for?
> The only reason it used here is just protect io->flags modification,
> but only flush_completed_IO and work modified this flags and
> we can serialize them via i_completed_io_lock.
> All this games with mutex_trylock result in following deadlock
> truncate: kworker:
> ext4_setattr ext4_end_io_work
> mutex_lock(i_mutex)
> inode_dio_wait(inode) ->BLOCK
> DEADLOCK<- mutex_trylock()
> inode_dio_done()
> #TEST_CASE1_BEGIN
> MNT=/mnt_scrach
> unlink $MNT/file
> fallocate -l $((1024*1024*1024)) $MNT/file
> aio-stress -I 100000 -O -s 100m -n -t 1 -c 10 -o 2 -o 3 $MNT/file
> sleep 2
> truncate -s 0 $MNT/file
> #TEST_CASE1_END
>
> This patch makes state machine simple and clean:
>
> (1) ext4_flush_completed_IO is responsible for handling all pending
> end_io from ei->i_completed_io_list(per inode list)
> NOTE1: i_completed_io_lock is acquired only once
> NOTE2: i_mutex is not required because it does not protect
> any data guarded by i_mutex
>
> (2) xxx_end_io schedule end_io context completion simply by pushing it
> to the inode's list.
> NOTE1: because of (1) work should be queued only if
> ->i_completed_io_list was empty at the moment, otherwise it
> work is scheduled already.
>
> (3) No one is able to free inode's blocks while pented io_completion
> exist othervise may result in blocks beyond EOF.
>
> - remove useless EXT4_IO_END_QUEUED and EXT4_IO_END_FSYNC flags
> - Improve SMP scalability by removing useless i_mutex which does not
> protect io->flags anymore.
> - Reduce lock contention on i_completed_io_lock by optimizing list walk.
> - Move open coded logic from various xx_end_xx routines to ext4_add_complete_io()
> - Add BUG_ON to ext_ext_remove_space() in order to assert (3)
>
> Changes since V1:
> Add bugon assertion according to Jan's comment
>
> Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>
> Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
> ---
> fs/ext4/ext4.h | 5 +--
> fs/ext4/extents.c | 1 +
> fs/ext4/fsync.c | 47 +++++++++++---------------------
> fs/ext4/indirect.c | 6 +---
> fs/ext4/inode.c | 25 +---------------
> fs/ext4/page-io.c | 76 ++++++++++++++-------------------------------------
> 6 files changed, 44 insertions(+), 116 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index b3f3c55..be976ca 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -184,9 +184,7 @@ struct mpage_da_data {
> */
> #define EXT4_IO_END_UNWRITTEN 0x0001
> #define EXT4_IO_END_ERROR 0x0002
> -#define EXT4_IO_END_QUEUED 0x0004
> -#define EXT4_IO_END_DIRECT 0x0008
> -#define EXT4_IO_END_IN_FSYNC 0x0010
> +#define EXT4_IO_END_DIRECT 0x0004
>
> struct ext4_io_page {
> struct page *p_page;
> @@ -2405,6 +2403,7 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
>
> /* page-io.c */
> extern int __init ext4_init_pageio(void);
> +extern void ext4_add_complete_io(ext4_io_end_t *io_end);
> extern void ext4_exit_pageio(void);
> extern void ext4_ioend_wait(struct inode *);
> extern void ext4_free_io_end(ext4_io_end_t *io);
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index e993879..44e33b0 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -2618,6 +2618,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
> handle_t *handle;
> int i = 0, err;
>
> + BUG_ON(atomic_read(&EXT4_I(inode)->i_aiodio_unwritten));
Yeah. I've just triggered this BUG_ON. Even though it is a false trigger,
because it is safe to have i_aiodio_unwritten somewhere inside the
file while file growth is in progress, in order to be on the
safe side let's move inode_dio_wait inside ext4_truncate(),
so that we can have a 100% bulletproof assertion
inside ext4_ext_remove_space:
BUG_ON(atomic_read(&EXT4_I(inode)->i_aiodio_unwritten))
BUG_ON(atomic_read(&(inode)->i_dio_count))
This should not affect performance since we already wait for dio tasks
during ext4_setattr, but now we will also wait on failed writes,
which happen only on the error path.
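A rough sketch of what that follow-up could look like (hypothetical, not part of
the posted series; the wait is simply hoisted into ext4_truncate() so both
counters are guaranteed to be zero by the time the extent tree is modified):

    /* hypothetical follow-up, not the posted patch */
    void ext4_truncate(struct inode *inode)
    {
            /* ... */
            inode_dio_wait(inode);   /* drain DIO before any block is freed */
            /* ... existing truncate path, ending in ext4_ext_remove_space() */
    }

    /* ext4_ext_remove_space() can then assert both counters */
    BUG_ON(atomic_read(&EXT4_I(inode)->i_aiodio_unwritten));
    BUG_ON(atomic_read(&inode->i_dio_count));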
EXT4-fs (dm-1): mounted filesystem with ordered data mode. Opts:
acl,user_xattr
------------[ cut here ]------------
kernel BUG at fs/ext4/extents.c:2621!
invalid opcode: 0000 [#1] SMP
Modules linked in: brd ext4 jbd2 cpufreq_ondemand acpi_cpufreq
freq_table mperf coretemp kvm_intel kvm crc32c_intel ghash_clmulni_intel
microcode sg xhci_hcd ext3 jbd mbcache sd_mod crc_t10dif aesni_intel
ablk_helper cryptd aes_x86_64 aes_generic ahci libahci pata_acpi
ata_generic dm_mirror dm_region_hash dm_log dm_mod
CPU 2
Pid: 11079, comm: fio Not tainted 3.6.0-rc1+ #62
/DQ67SW
RIP: 0010:[<ffffffffa0405799>] [<ffffffffa0405799>]
ext4_ext_remove_space+0x79/0xa80 [ext4]
RSP: 0018:ffff8801bcc87888 EFLAGS: 00010202
RAX: 0000000000000003 RBX: ffff88022b7d62c8 RCX: 000000000000000c
RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffffffffa0481d90
RBP: ffff8801bcc87938 R08: 0000000000000001 R09: 0000000000000000
R10: ffffffffa04546d8 R11: 0000000000000001 R12: ffff8801ff109000
R13: 00000000fffffffe R14: 000000000026c8d0 R15: 0000000000000001
FS: 00007f4c3dd6a700(0000) GS:ffff88023d800000(0000)
knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f4c34f4f172 CR3: 0000000204d80000 CR4: 00000000000407e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process fio (pid: 11079, threadinfo ffff8801bcc86000, task
ffff88018f9e5700)
Stack:
ffff88022b7d62c8 0000000000000000 ffff8801f1595cf8 ffff8801d8ff7000
ffff8801bcc87938 ffffffffa03cd363 ffffffffa0408a15 0000000000000000
ffff8801bcc878f8 0000000000000246 ffff88022b7d6238 ffffffffa0408a3d
Call Trace:
[<ffffffffa03cd363>] ? ext4_mark_inode_dirty+0x293/0x2e0 [ext4]
[<ffffffffa0408a15>] ? ext4_ext_truncate+0x115/0x2a0 [ext4]
[<ffffffffa0408a3d>] ? ext4_ext_truncate+0x13d/0x2a0 [ext4]
[<ffffffffa0408a15>] ? ext4_ext_truncate+0x115/0x2a0 [ext4]
[<ffffffffa0408a5e>] ext4_ext_truncate+0x15e/0x2a0 [ext4]
[<ffffffffa03c9e9a>] ext4_truncate+0x14a/0x240 [ext4]
[<ffffffffa04224b1>] ext4_ind_direct_IO+0x481/0x740 [ext4]
[<ffffffffa03cc970>] ? noalloc_get_block_write+0x90/0x90 [ext4]
[<ffffffff81028965>] ? native_sched_clock+0x65/0xb0
[<ffffffffa03c822e>] ext4_ext_direct_IO+0x26e/0x290 [ext4]
[<ffffffffa03c83cc>] ext4_direct_IO+0x17c/0x2a0 [ext4]
[<ffffffff81184354>] generic_file_direct_write+0x174/0x240
[<ffffffff811849d0>] __generic_file_aio_write+0x5b0/0x820
[<ffffffff81028965>] ? native_sched_clock+0x65/0xb0
[<ffffffffa03c1491>] ext4_file_dio_write+0x3b1/0x550 [ext4]
[<ffffffffa03c1778>] ext4_file_write+0x148/0x190 [ext4]
[<ffffffffa03c1630>] ? ext4_file_dio_write+0x550/0x550 [ext4]
[<ffffffff8127532e>] aio_rw_vect_retry+0xce/0x200
[<ffffffff81275260>] ? aio_advance_iovec+0x130/0x130
[<ffffffff81276246>] aio_run_iocb+0xd6/0x2a0
[<ffffffff8173ce1d>] io_submit_one+0x38a/0x3ff
[<ffffffff81277e1e>] do_io_submit+0x2be/0x3d0
[<ffffffff81277f40>] sys_io_submit+0x10/0x20
[<ffffffff8175e4e9>] system_call_fastpath+0x16/0x1b
Code: c7 c7 90 1d 48 a0 85 c0 41 0f 95 c7 31 d2 44 89 fe e8 fc a5 d5 e0
49 63 c7 48 83 c0 02 48 83 04 c5 d0 09 47 a0 01 45 85 ff 74 02 <0f> 0b
0f b7 75 bc 48 8b 7b 28 83 c6 01 e8 65 11 ff ff 48 89 c7
RIP [<ffffffffa0405799>] ext4_ext_remove_space+0x79/0xa80 [ext4]
RSP <ffff8801bcc87888>
---[ end trace 19c3447cad5485fa ]---
>
> /* probably first extent we're gonna free will be last in block */
> diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
> index 323eb15..24f3719 100644
> --- a/fs/ext4/fsync.c
> +++ b/fs/ext4/fsync.c
> @@ -73,46 +73,31 @@ static void dump_completed_IO(struct inode * inode)
> * might needs to do the conversion. This function walks through
> * the list and convert the related unwritten extents for completed IO
> * to written.
> - * The function return the number of pending IOs on success.
> + * The function return first error;
> */
> int ext4_flush_completed_IO(struct inode *inode)
> {
> + struct ext4_inode_info *ei = EXT4_I(inode);
> + unsigned long flags;
> + struct list_head complete_list;
> + int err, ret = 0;
> ext4_io_end_t *io;
> - struct ext4_inode_info *ei = EXT4_I(inode);
> - unsigned long flags;
> - int ret = 0;
> - int ret2 = 0;
>
> dump_completed_IO(inode);
> +
> spin_lock_irqsave(&ei->i_completed_io_lock, flags);
> - while (!list_empty(&ei->i_completed_io_list)){
> - io = list_entry(ei->i_completed_io_list.next,
> - ext4_io_end_t, list);
> + list_replace_init(&ei->i_completed_io_list, &complete_list);
> + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
> +
> + while(!list_empty(&complete_list)) {
> + io = list_entry(complete_list.next, ext4_io_end_t, list);
> list_del_init(&io->list);
> - io->flag |= EXT4_IO_END_IN_FSYNC;
> - /*
> - * Calling ext4_end_io_nolock() to convert completed
> - * IO to written.
> - *
> - * When ext4_sync_file() is called, run_queue() may already
> - * about to flush the work corresponding to this io structure.
> - * It will be upset if it founds the io structure related
> - * to the work-to-be schedule is freed.
> - *
> - * Thus we need to keep the io structure still valid here after
> - * conversion finished. The io structure has a flag to
> - * avoid double converting from both fsync and background work
> - * queue work.
> - */
> - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
> - ret = ext4_end_io_nolock(io);
> - if (ret < 0)
> - ret2 = ret;
> - spin_lock_irqsave(&ei->i_completed_io_lock, flags);
> - io->flag &= ~EXT4_IO_END_IN_FSYNC;
> + err = ext4_end_io_nolock(io);
> + ext4_free_io_end(io);
> + if (unlikely(err && !ret))
> + ret = err;
> }
> - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
> - return (ret2 < 0) ? ret2 : 0;
> + return ret;
> }
>
> /*
> diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
> index 830e1b2..61f13e5 100644
> --- a/fs/ext4/indirect.c
> +++ b/fs/ext4/indirect.c
> @@ -807,11 +807,9 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
>
> retry:
> if (rw == READ && ext4_should_dioread_nolock(inode)) {
> - if (unlikely(!list_empty(&ei->i_completed_io_list))) {
> - mutex_lock(&inode->i_mutex);
> + if (unlikely(!list_empty(&ei->i_completed_io_list)))
> ext4_flush_completed_IO(inode);
> - mutex_unlock(&inode->i_mutex);
> - }
> +
> ret = __blockdev_direct_IO(rw, iocb, inode,
> inode->i_sb->s_bdev, iov,
> offset, nr_segs,
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 202ae3f..762b955 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -2885,9 +2885,6 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
> {
> struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
> ext4_io_end_t *io_end = iocb->private;
> - struct workqueue_struct *wq;
> - unsigned long flags;
> - struct ext4_inode_info *ei;
>
> /* if not async direct IO or dio with 0 bytes write, just return */
> if (!io_end || !size)
> @@ -2916,24 +2913,14 @@ out:
> io_end->iocb = iocb;
> io_end->result = ret;
> }
> - wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
> -
> - /* Add the io_end to per-inode completed aio dio list*/
> - ei = EXT4_I(io_end->inode);
> - spin_lock_irqsave(&ei->i_completed_io_lock, flags);
> - list_add_tail(&io_end->list, &ei->i_completed_io_list);
> - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
>
> - /* queue the work to convert unwritten extents to written */
> - queue_work(wq, &io_end->work);
> + ext4_add_complete_io(io_end);
> }
>
> static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
> {
> ext4_io_end_t *io_end = bh->b_private;
> - struct workqueue_struct *wq;
> struct inode *inode;
> - unsigned long flags;
>
> if (!test_clear_buffer_uninit(bh) || !io_end)
> goto out;
> @@ -2952,15 +2939,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
> */
> inode = io_end->inode;
> ext4_set_io_unwritten_flag(inode, io_end);
> -
> - /* Add the io_end to per-inode completed io list*/
> - spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
> - list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
> - spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
> -
> - wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
> - /* queue the work to convert unwritten extents to written */
> - queue_work(wq, &io_end->work);
> + ext4_add_complete_io(io_end);
> out:
> bh->b_private = NULL;
> bh->b_end_io = NULL;
> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
> index dcdeef1..c369419 100644
> --- a/fs/ext4/page-io.c
> +++ b/fs/ext4/page-io.c
> @@ -57,6 +57,22 @@ void ext4_ioend_wait(struct inode *inode)
> wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
> }
>
> +/* Add the io_end to per-inode completed aio dio list. */
> +void ext4_add_complete_io(ext4_io_end_t *io_end)
> +{
> + struct ext4_inode_info *ei = EXT4_I(io_end->inode);
> + struct workqueue_struct *wq;
> + unsigned long flags;
> +
> + wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
> +
> + spin_lock_irqsave(&ei->i_completed_io_lock, flags);
> + if (list_empty(&ei->i_completed_io_list))
> + queue_work(wq, &io_end->work);
> + list_add_tail(&io_end->list, &ei->i_completed_io_list);
> + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
> +}
> +
> static void put_io_page(struct ext4_io_page *io_page)
> {
> if (atomic_dec_and_test(&io_page->p_count)) {
> @@ -81,12 +97,7 @@ void ext4_free_io_end(ext4_io_end_t *io)
> kmem_cache_free(io_end_cachep, io);
> }
>
> -/*
> - * check a range of space and convert unwritten extents to written.
> - *
> - * Called with inode->i_mutex; we depend on this when we manipulate
> - * io->flag, since we could otherwise race with ext4_flush_completed_IO()
> - */
> +/* check a range of space and convert unwritten extents to written. */
> int ext4_end_io_nolock(ext4_io_end_t *io)
> {
> struct inode *inode = io->inode;
> @@ -94,6 +105,8 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
> ssize_t size = io->size;
> int ret = 0;
>
> + BUG_ON(!list_empty(&io->list));
> +
> ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
> "list->prev 0x%p\n",
> io, inode->i_ino, io->list.next, io->list.prev);
> @@ -124,45 +137,7 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
> static void ext4_end_io_work(struct work_struct *work)
> {
> ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
> - struct inode *inode = io->inode;
> - struct ext4_inode_info *ei = EXT4_I(inode);
> - unsigned long flags;
> -
> - spin_lock_irqsave(&ei->i_completed_io_lock, flags);
> - if (io->flag & EXT4_IO_END_IN_FSYNC)
> - goto requeue;
> - if (list_empty(&io->list)) {
> - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
> - goto free;
> - }
> -
> - if (!mutex_trylock(&inode->i_mutex)) {
> - bool was_queued;
> -requeue:
> - was_queued = !!(io->flag & EXT4_IO_END_QUEUED);
> - io->flag |= EXT4_IO_END_QUEUED;
> - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
> - /*
> - * Requeue the work instead of waiting so that the work
> - * items queued after this can be processed.
> - */
> - queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work);
> - /*
> - * To prevent the ext4-dio-unwritten thread from keeping
> - * requeueing end_io requests and occupying cpu for too long,
> - * yield the cpu if it sees an end_io request that has already
> - * been requeued.
> - */
> - if (was_queued)
> - yield();
> - return;
> - }
> - list_del_init(&io->list);
> - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
> - (void) ext4_end_io_nolock(io);
> - mutex_unlock(&inode->i_mutex);
> -free:
> - ext4_free_io_end(io);
> + (void) ext4_flush_completed_IO(io->inode);
> }
>
> ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
> @@ -195,9 +170,7 @@ static void buffer_io_error(struct buffer_head *bh)
> static void ext4_end_bio(struct bio *bio, int error)
> {
> ext4_io_end_t *io_end = bio->bi_private;
> - struct workqueue_struct *wq;
> struct inode *inode;
> - unsigned long flags;
> int i;
> sector_t bi_sector = bio->bi_sector;
>
> @@ -255,14 +228,7 @@ static void ext4_end_bio(struct bio *bio, int error)
> return;
> }
>
> - /* Add the io_end to per-inode completed io list*/
> - spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
> - list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
> - spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
> -
> - wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
> - /* queue the work to convert unwritten extents to written */
> - queue_work(wq, &io_end->work);
> + ext4_add_complete_io(io_end);
> }
>
> void ext4_io_submit(struct ext4_io_submit *io)
> --
> 1.7.7.6
>
* [PATCH 3/9] ext4: serialize dio nonlocked reads with defrag workers V3
2012-09-13 15:01 [PATCH 0/9] ext4: Bunch of DIO/AIO fixes V2 Dmitry Monakhov
2012-09-13 15:01 ` [PATCH 1/9] ext4: ext4_inode_info diet Dmitry Monakhov
2012-09-13 15:01 ` [PATCH 2/9] ext4: completed_io locking cleanup V2 Dmitry Monakhov
@ 2012-09-13 15:01 ` Dmitry Monakhov
2012-09-13 15:01 ` [PATCH 4/9] ext4: punch_hole should wait for DIO writers Dmitry Monakhov
` (5 subsequent siblings)
8 siblings, 0 replies; 12+ messages in thread
From: Dmitry Monakhov @ 2012-09-13 15:01 UTC (permalink / raw)
To: linux-ext4; +Cc: tytso, jack, wenqing.lz, Dmitry Monakhov
Inode block defragmentation and ext4_change_inode_journal_flag() may
affect the result of nonlocked DIO reads, so proper synchronization
is required.
- Add missing inode_dio_wait() calls where appropriate
- Check the inode state while holding an extra i_dio_count reference.
Changes from V1:
- add missing memory barriers
- move the DIOREAD_LOCK state check out of the generic should_dioread_nolock,
otherwise it would affect existing DIO readers.
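Condensed, the synchronization added below is: the blocking side sets a per-inode
state bit (with a memory barrier) and then drains DIO, while the nonlocked reader
publishes itself via i_dio_count before re-checking that bit. A sketch assembled
from the hunks below, with the reader's check written in its intended sense (back
off when the DIOREAD_LOCK bit is set):

    /* ext4.h: blocking side -- disable the nolock fast path, then drain */
    static inline void ext4_inode_block_unlocked_dio(struct inode *inode)
    {
            ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
            smp_mb();                       /* pairs with the reader's barrier */
    }
    static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
    {
            smp_mb();
            ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
    }

    /* indirect.c: reader side -- publish first, then re-check the state bit */
    atomic_inc(&inode->i_dio_count);        /* now inode_dio_wait() sees us */
    smp_mb();
    if (ext4_test_inode_state(inode, EXT4_STATE_DIOREAD_LOCK)) {
            inode_dio_done(inode);          /* someone blocked nolock DIO: back off */
            goto retry;
    }
    ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
                               offset, nr_segs, ext4_get_block, NULL, NULL, 0);
    inode_dio_done(inode);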
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
fs/ext4/ext4.h | 17 +++++++++++++++++
fs/ext4/indirect.c | 13 +++++++++++++
fs/ext4/inode.c | 5 +++++
fs/ext4/move_extent.c | 8 ++++++++
4 files changed, 43 insertions(+), 0 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index be976ca..a71e299 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1348,6 +1348,8 @@ enum {
EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
EXT4_STATE_NEWENTRY, /* File just added to dir */
EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */
+ EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read
+ nolocking */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -2456,6 +2458,21 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
}
+/*
+ * Disable DIO read nolock optimization, so new dioreaders will be forced
+ * to grab i_mutex
+ */
+static inline void ext4_inode_block_unlocked_dio(struct inode *inode)
+{
+ ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+ smp_mb();
+}
+static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
+{
+ smp_mb();
+ ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+}
+
#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
/* For ioend & aio unwritten conversion wait queues */
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 61f13e5..e4cc904 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -810,10 +810,23 @@ retry:
if (unlikely(!list_empty(&ei->i_completed_io_list)))
ext4_flush_completed_IO(inode);
+ /*
+ * Nolock dioread optimization may be dynamically disabled
+ * via ext4_inode_block_unlocked_dio(). Check inode's state
+ * while holding extra i_dio_count ref.
+ */
+ atomic_inc(&inode->i_dio_count);
+ smp_mb();
+ if (!unlikely(ext4_test_inode_state(inode,
+ EXT4_STATE_DIOREAD_LOCK))) {
+ inode_dio_done(inode);
+ goto retry;
+ }
ret = __blockdev_direct_IO(rw, iocb, inode,
inode->i_sb->s_bdev, iov,
offset, nr_segs,
ext4_get_block, NULL, NULL, 0);
+ inode_dio_done(inode);
} else {
ret = blockdev_direct_IO(rw, iocb, inode, iov,
offset, nr_segs, ext4_get_block);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 762b955..c34d38c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4720,6 +4720,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return err;
}
+ /* Wait for all existing dio workers */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
jbd2_journal_lock_updates(journal);
/*
@@ -4739,6 +4743,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
ext4_set_aops(inode);
jbd2_journal_unlock_updates(journal);
+ ext4_inode_resume_unlocked_dio(inode);
/* Finally we can mark the inode as dirty. */
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index c5826c6..ae0357b 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -1214,6 +1214,12 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
if (ret1 < 0)
return ret1;
+ /* Wait for all existing dio workers */
+ ext4_inode_block_unlocked_dio(orig_inode);
+ ext4_inode_block_unlocked_dio(donor_inode);
+ inode_dio_wait(orig_inode);
+ inode_dio_wait(donor_inode);
+
/* Protect extent tree against block allocations via delalloc */
double_down_write_data_sem(orig_inode, donor_inode);
/* Check the filesystem environment whether move_extent can be done */
@@ -1412,6 +1418,8 @@ out:
kfree(holecheck_path);
}
double_up_write_data_sem(orig_inode, donor_inode);
+ ext4_inode_block_unlocked_dio(orig_inode);
+ ext4_inode_block_unlocked_dio(donor_inode);
ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
if (ret1)
--
1.7.7.6
* [PATCH 4/9] ext4: punch_hole should wait for DIO writers
2012-09-13 15:01 [PATCH 0/9] ext4: Bunch of DIO/AIO fixes V2 Dmitry Monakhov
` (2 preceding siblings ...)
2012-09-13 15:01 ` [PATCH 3/9] ext4: serialize dio nonlocked reads with defrag workers V3 Dmitry Monakhov
@ 2012-09-13 15:01 ` Dmitry Monakhov
2012-09-13 15:13 ` Lukáš Czerner
2012-09-13 15:01 ` [PATCH 5/9] ext4: serialize unlocked dio reads with truncate Dmitry Monakhov
` (4 subsequent siblings)
8 siblings, 1 reply; 12+ messages in thread
From: Dmitry Monakhov @ 2012-09-13 15:01 UTC (permalink / raw)
To: linux-ext4; +Cc: tytso, jack, wenqing.lz, Dmitry Monakhov
punch_hole is one of the places where we have to wait for all existing writers
(writeback, aio, dio), but currently we simply flush pending end_io requests,
which is not sufficient. Even worse, i_mutex is not held during punch_hole,
which obviously can result in dangerous data corruption due to an
access-after-free issue.
This patch performs the following changes:
- Guard punch_hole with i_mutex
- Block all new dio readers in order to prevent an information leak caused by
a read-after-free pattern.
- punch_hole now waits for all writers in flight
NOTE: XXX a write-after-free race is still possible because there is
no easy way to stop writeback while punch_hole is in progress.
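The resulting ordering in ext4_ext_punch_hole() is, roughly (condensed from the
diff below; the actual extent removal and error handling are elided):

    filemap_write_and_wait_range(mapping, offset, offset + length - 1);

    mutex_lock(&inode->i_mutex);            /* new writers now block on i_mutex */
    if (offset >= inode->i_size)
            goto out_mutex;                 /* nothing to punch beyond i_size   */

    truncate_pagecache_range(inode, first_page_offset, last_page_offset - 1);

    ext4_inode_block_unlocked_dio(inode);   /* stop new nolock DIO readers      */
    inode_dio_wait(inode);                  /* drain DIO already in flight      */
    ext4_flush_completed_IO(inode);         /* finish pending end_io conversion */

    handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
    /* ... punch the hole ... */

    out_dio:
            ext4_inode_resume_unlocked_dio(inode);
    out_mutex:
            mutex_unlock(&inode->i_mutex);
            return err;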
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
fs/ext4/extents.c | 41 +++++++++++++++++++++++++----------------
1 files changed, 25 insertions(+), 16 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 44e33b0..0e94485 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4814,9 +4814,22 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
loff_t first_page_offset, last_page_offset;
int credits, err = 0;
+ /*
+ * Write out all dirty pages to avoid race conditions
+ * Then release them.
+ */
+ if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+ err = filemap_write_and_wait_range(mapping,
+ offset, offset + length - 1);
+
+ if (err)
+ return err;
+ }
+
+ mutex_lock(&inode->i_mutex);
/* No need to punch hole beyond i_size */
if (offset >= inode->i_size)
- return 0;
+ goto out_mutex;
/*
* If the hole extends beyond i_size, set the hole
@@ -4834,31 +4847,23 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
first_page_offset = first_page << PAGE_CACHE_SHIFT;
last_page_offset = last_page << PAGE_CACHE_SHIFT;
- /*
- * Write out all dirty pages to avoid race conditions
- * Then release them.
- */
- if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
- err = filemap_write_and_wait_range(mapping,
- offset, offset + length - 1);
-
- if (err)
- return err;
- }
-
/* Now release the pages */
if (last_page_offset > first_page_offset) {
truncate_pagecache_range(inode, first_page_offset,
last_page_offset - 1);
}
- /* finish any pending end_io work */
+ /* Wait all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
ext4_flush_completed_IO(inode);
credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto out_dio;
+ }
err = ext4_orphan_add(handle, inode);
if (err)
@@ -4952,6 +4957,10 @@ out:
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
+out_dio:
+ ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+ mutex_unlock(&inode->i_mutex);
return err;
}
int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
--
1.7.7.6
* Re: [PATCH 4/9] ext4: punch_hole should wait for DIO writers
2012-09-13 15:01 ` [PATCH 4/9] ext4: punch_hole should wait for DIO writers Dmitry Monakhov
@ 2012-09-13 15:13 ` Lukáš Czerner
0 siblings, 0 replies; 12+ messages in thread
From: Lukáš Czerner @ 2012-09-13 15:13 UTC (permalink / raw)
To: Dmitry Monakhov; +Cc: linux-ext4, tytso, jack, wenqing.lz
On Thu, 13 Sep 2012, Dmitry Monakhov wrote:
> Date: Thu, 13 Sep 2012 19:01:09 +0400
> From: Dmitry Monakhov <dmonakhov@openvz.org>
> To: linux-ext4@vger.kernel.org
> Cc: tytso@mit.edu, jack@suse.cz, wenqing.lz@taobao.com,
> Dmitry Monakhov <dmonakhov@openvz.org>
> Subject: [PATCH 4/9] ext4: punch_hole should wait for DIO writers
>
> punch_hole are the places where we have to wait for all existing writers
> (writeback, aio, dio), but currently we simply flush pended end_io request
> which is not sufficient. Even more i_mutex is not holded while punch_hole
> which obviously result in dangerous data corruption due to
> access-after-free issue.
>
> This patch performs following changes:
> - Guard punch_hole with i_mutex
> - Block all new dio readers in order to prevent information leak caused by
> read-after-free pattern.
> - punch_hole now wait for all writers in flight
> NOTE: XXX write-after-free race is still possible because there is
> no easy way to stop writeback while punch_hole is in progress.
Hi Dmitry,
Just FYI, I am carrying the punch-hole i_mutex change in the patch set "Add
invalidatepage_range address space operation", but I am more than
happy to drop it and let your patch fix this instead.
Thanks!
-Lukas
>
> Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
> ---
> fs/ext4/extents.c | 41 +++++++++++++++++++++++++----------------
> 1 files changed, 25 insertions(+), 16 deletions(-)
>
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 44e33b0..0e94485 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4814,9 +4814,22 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
> loff_t first_page_offset, last_page_offset;
> int credits, err = 0;
>
> + /*
> + * Write out all dirty pages to avoid race conditions
> + * Then release them.
> + */
> + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
> + err = filemap_write_and_wait_range(mapping,
> + offset, offset + length - 1);
> +
> + if (err)
> + return err;
> + }
> +
> + mutex_lock(&inode->i_mutex);
> /* No need to punch hole beyond i_size */
> if (offset >= inode->i_size)
> - return 0;
> + goto out_mutex;
>
> /*
> * If the hole extends beyond i_size, set the hole
> @@ -4834,31 +4847,23 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
> first_page_offset = first_page << PAGE_CACHE_SHIFT;
> last_page_offset = last_page << PAGE_CACHE_SHIFT;
>
> - /*
> - * Write out all dirty pages to avoid race conditions
> - * Then release them.
> - */
> - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
> - err = filemap_write_and_wait_range(mapping,
> - offset, offset + length - 1);
> -
> - if (err)
> - return err;
> - }
> -
> /* Now release the pages */
> if (last_page_offset > first_page_offset) {
> truncate_pagecache_range(inode, first_page_offset,
> last_page_offset - 1);
> }
>
> - /* finish any pending end_io work */
> + /* Wait all existing dio workers, newcomers will block on i_mutex */
> + ext4_inode_block_unlocked_dio(inode);
> + inode_dio_wait(inode);
> ext4_flush_completed_IO(inode);
>
> credits = ext4_writepage_trans_blocks(inode);
> handle = ext4_journal_start(inode, credits);
> - if (IS_ERR(handle))
> - return PTR_ERR(handle);
> + if (IS_ERR(handle)) {
> + err = PTR_ERR(handle);
> + goto out_dio;
> + }
>
> err = ext4_orphan_add(handle, inode);
> if (err)
> @@ -4952,6 +4957,10 @@ out:
> inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
> ext4_mark_inode_dirty(handle, inode);
> ext4_journal_stop(handle);
> +out_dio:
> + ext4_inode_resume_unlocked_dio(inode);
> +out_mutex:
> + mutex_unlock(&inode->i_mutex);
> return err;
> }
> int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>
* [PATCH 5/9] ext4: serialize unlocked dio reads with truncate
2012-09-13 15:01 [PATCH 0/9] ext4: Bunch of DIO/AIO fixes V2 Dmitry Monakhov
` (3 preceding siblings ...)
2012-09-13 15:01 ` [PATCH 4/9] ext4: punch_hole should wait for DIO writers Dmitry Monakhov
@ 2012-09-13 15:01 ` Dmitry Monakhov
2012-09-13 15:01 ` [PATCH 6/9] ext4: endless truncate due to nonlocked dio readers V2 Dmitry Monakhov
` (3 subsequent siblings)
8 siblings, 0 replies; 12+ messages in thread
From: Dmitry Monakhov @ 2012-09-13 15:01 UTC (permalink / raw)
To: linux-ext4; +Cc: tytso, jack, wenqing.lz, Dmitry Monakhov
The current serialization works only for DIO which holds
i_mutex; with nonlocked DIO the following race is possible:
dio_nolock_read_task                  truncate_task
                                       ->ext4_setattr()
                                        ->inode_dio_wait()
 ->ext4_ext_direct_IO
  ->ext4_ind_direct_IO
   ->__blockdev_direct_IO
    ->ext4_get_block
                                        ->truncate_setsize()
                                        ->ext4_truncate()
                                        #alloc truncated blocks
                                        #to other inode
    ->submit_io()
    #INFORMATION LEAK
In order to serialize with unlocked DIO reads we have to
rearrange the wait sequence:
1) update i_size first
2) if i_size is about to be reduced, wait for outstanding DIO requests
3) only after that truncate the inode blocks
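In sketch form, the shrinking-truncate path in ext4_setattr() becomes (matching
the diff below):

    if (attr->ia_valid & ATTR_SIZE) {
            if (attr->ia_size != inode->i_size) {
                    truncate_setsize(inode, attr->ia_size);  /* 1) new i_size visible  */
                    if (orphan)                              /* 2) size is shrinking:  */
                            inode_dio_wait(inode);           /*    drain in-flight DIO */
            }
            ext4_truncate(inode);                            /* 3) free the blocks     */
    }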
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
fs/ext4/inode.c | 7 +++++--
1 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c34d38c..658f649 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4283,7 +4283,6 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
if (attr->ia_valid & ATTR_SIZE) {
- inode_dio_wait(inode);
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -4332,8 +4331,12 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
if (attr->ia_valid & ATTR_SIZE) {
- if (attr->ia_size != i_size_read(inode))
+ if (attr->ia_size != inode->i_size) {
truncate_setsize(inode, attr->ia_size);
+ /* Inode size will be reduced, wait for dio in flight */
+ if (orphan)
+ inode_dio_wait(inode);
+ }
ext4_truncate(inode);
}
--
1.7.7.6
* [PATCH 6/9] ext4: endless truncate due to nonlocked dio readers V2
2012-09-13 15:01 [PATCH 0/9] ext4: Bunch of DIO/AIO fixes V2 Dmitry Monakhov
` (4 preceding siblings ...)
2012-09-13 15:01 ` [PATCH 5/9] ext4: serialize unlocked dio reads with truncate Dmitry Monakhov
@ 2012-09-13 15:01 ` Dmitry Monakhov
2012-09-13 15:01 ` [PATCH 7/9] ext4: serialize truncate with overwrite DIO workers V2 Dmitry Monakhov
` (2 subsequent siblings)
8 siblings, 0 replies; 12+ messages in thread
From: Dmitry Monakhov @ 2012-09-13 15:01 UTC (permalink / raw)
To: linux-ext4; +Cc: tytso, jack, wenqing.lz, Dmitry Monakhov
If we have sufficiently aggressive DIO readers, truncate and other dio
waiters will wait forever inside inode_dio_wait(). It is reasonable
to disable the nonlocked DIO read optimization during truncate.
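With this change the wait introduced in the previous patch is bracketed by the
block/resume helpers from patch 3, so a steady stream of nolock readers can no
longer keep i_dio_count elevated forever (sketch of the resulting code, as in
the diff below):

    if (orphan) {
            ext4_inode_block_unlocked_dio(inode);  /* new nolock readers back off */
            inode_dio_wait(inode);                 /* existing readers drain      */
            ext4_inode_resume_unlocked_dio(inode); /* re-enable the fast path     */
    }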
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
fs/ext4/inode.c | 8 ++++++--
1 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 658f649..a295b3a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4333,9 +4333,13 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_valid & ATTR_SIZE) {
if (attr->ia_size != inode->i_size) {
truncate_setsize(inode, attr->ia_size);
- /* Inode size will be reduced, wait for dio in flight */
- if (orphan)
+ /* Inode size will be reduced, wait for dio in flight.
+ * Temproraly disable unlocked DIO to prevent livelock */
+ if (orphan) {
+ ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
+ ext4_inode_resume_unlocked_dio(inode);
+ }
}
ext4_truncate(inode);
}
--
1.7.7.6
* [PATCH 7/9] ext4: serialize truncate with overwrite DIO workers V2
2012-09-13 15:01 [PATCH 0/9] ext4: Bunch of DIO/AIO fixes V2 Dmitry Monakhov
` (5 preceding siblings ...)
2012-09-13 15:01 ` [PATCH 6/9] ext4: endless truncate due to nonlocked dio readers V2 Dmitry Monakhov
@ 2012-09-13 15:01 ` Dmitry Monakhov
2012-09-13 15:01 ` [PATCH 8/9] ext4: fix ext_remove_space for punch_hole case Dmitry Monakhov
2012-09-13 15:01 ` [PATCH 9/9] ext4: fix ext4_ext_remove_space tree traversal Dmitry Monakhov
8 siblings, 0 replies; 12+ messages in thread
From: Dmitry Monakhov @ 2012-09-13 15:01 UTC (permalink / raw)
To: linux-ext4; +Cc: tytso, jack, wenqing.lz, Dmitry Monakhov
Jan Kara has spotted an interesting issue:
there is a potential data corruption issue with direct IO overwrites
racing with truncate, like the following:
dio write                                 truncate_task
 ->ext4_ext_direct_IO
  ->overwrite == 1
   ->down_read(&EXT4_I(inode)->i_data_sem);
   ->mutex_unlock(&inode->i_mutex);
                                           ->ext4_setattr()
                                            ->inode_dio_wait()
                                            ->truncate_setsize()
                                            ->ext4_truncate()
                                             ->down_write(&EXT4_I(inode)->i_data_sem);
   ->__blockdev_direct_IO
    ->ext4_get_block
    ->submit_io()
   ->up_read(&EXT4_I(inode)->i_data_sem);
                                             # truncate data blocks, allocate them to
                                             # other inode - bad stuff happens because
                                             # dio is still in flight.
In order to serialize with truncate, the dio worker should grab an extra
i_dio_count reference before dropping i_mutex.
Changes against V1:
- wake up dio waiters before retaking i_mutex.
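In sketch form, the overwrite DIO path now pins itself in i_dio_count before
releasing i_mutex, so inode_dio_wait() on the truncate side cannot return while
the overwrite is still mapping and submitting blocks (condensed from the diff
below):

    if (overwrite) {
            atomic_inc(&inode->i_dio_count);        /* visible to inode_dio_wait() */
            down_read(&EXT4_I(inode)->i_data_sem);
            mutex_unlock(&inode->i_mutex);          /* safe: we hold a dio ref     */
    }

    ret = __blockdev_direct_IO(...);                /* may call ext4_get_block()   */

    retake_lock:
    if (overwrite) {
            inode_dio_done(inode);                  /* wake dio waiters first ...  */
            up_read(&EXT4_I(inode)->i_data_sem);
            mutex_lock(&inode->i_mutex);            /* ... then retake i_mutex     */
    }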
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
fs/ext4/inode.c | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a295b3a..ce4d44a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3014,6 +3014,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
overwrite = *((int *)iocb->private);
if (overwrite) {
+ atomic_inc(&inode->i_dio_count);
down_read(&EXT4_I(inode)->i_data_sem);
mutex_unlock(&inode->i_mutex);
}
@@ -3111,6 +3112,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
retake_lock:
/* take i_mutex locking again if we do a ovewrite dio */
if (overwrite) {
+ inode_dio_done(inode);
up_read(&EXT4_I(inode)->i_data_sem);
mutex_lock(&inode->i_mutex);
}
--
1.7.7.6
* [PATCH 8/9] ext4: fix ext_remove_space for punch_hole case
2012-09-13 15:01 [PATCH 0/9] ext4: Bunch of DIO/AIO fixes V2 Dmitry Monakhov
` (6 preceding siblings ...)
2012-09-13 15:01 ` [PATCH 7/9] ext4: serialize truncate with owerwrite DIO workers V2 Dmitry Monakhov
@ 2012-09-13 15:01 ` Dmitry Monakhov
2012-09-13 15:01 ` [PATCH 9/9] ext4: fix ext4_ext_remove_space tree traversal Dmitry Monakhov
8 siblings, 0 replies; 12+ messages in thread
From: Dmitry Monakhov @ 2012-09-13 15:01 UTC (permalink / raw)
To: linux-ext4; +Cc: tytso, jack, wenqing.lz, Dmitry Monakhov
An inode is allowed to have an empty leaf only if it is a blockless inode,
i.e. (depth == 0).
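The check this adds is small enough to show in isolation (taken from the diff
below): a missing leaf is tolerated only at depth 0, anything else is treated as
on-disk corruption:

    ex = path[depth].p_ext;
    if (!ex) {
            if (depth) {
                    /* a leaf may legitimately be absent only for depth == 0 */
                    EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
                    err = -EIO;
            }
            goto out;
    }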
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
fs/ext4/extents.c | 11 +++++++----
1 files changed, 7 insertions(+), 4 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0e94485..af2cc76 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2649,12 +2649,15 @@ again:
return PTR_ERR(path);
}
depth = ext_depth(inode);
+ /* Leaf not may not exist only if inode has no blocks at all */
ex = path[depth].p_ext;
if (!ex) {
- ext4_ext_drop_refs(path);
- kfree(path);
- path = NULL;
- goto cont;
+ if (depth) {
+ EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL",
+ depth);
+ err = -EIO;
+ }
+ goto out;
}
ee_block = le32_to_cpu(ex->ee_block);
--
1.7.7.6
* [PATCH 9/9] ext4: fix ext4_ext_remove_space tree traversal
2012-09-13 15:01 [PATCH 0/9] ext4: Bunch of DIO/AIO fixes V2 Dmitry Monakhov
` (7 preceding siblings ...)
2012-09-13 15:01 ` [PATCH 8/9] ext4: fix ext_remove_space for punch_hole case Dmitry Monakhov
@ 2012-09-13 15:01 ` Dmitry Monakhov
8 siblings, 0 replies; 12+ messages in thread
From: Dmitry Monakhov @ 2012-09-13 15:01 UTC (permalink / raw)
To: linux-ext4; +Cc: tytso, jack, wenqing.lz, Dmitry Monakhov
If ext4_ext_rm_leaf() restarted the transaction, we should restart the loop
from the top because i_data_sem was internally dropped,
but the (i = 0) statement was moved out of the loop in commit
968dee77220768a5, which results in a NULL pointer dereference.
This patch fixes the tree-walking procedure by moving the 'i' and 'depth'
initialization inside the loop body. It also performs a code cleanup in order
to have better code-flow separation for the truncate and punch_hole
cases.
Originally I found this with a very specific test, but it can easily be
reproduced via fsstress.
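Condensed, the restructured walk looks like this (assembled from the diff below;
the per-iteration state is recomputed at the top of the 'again:' label, and the
two entry cases -- punch_hole with an existing path vs. truncate allocating a
fresh one -- are separated cleanly):

    again:
            err = 0;
            depth = ext_depth(inode);       /* re-read: the tree may have changed */
            ext4_ext_invalidate_cache(inode);

            if (end < EXT_MAX_BLOCKS - 1) {
                    /* punch_hole case: find (and maybe split) the end extent */
                    path = ext4_ext_find_extent(inode, end, NULL);
                    /* ... split at the end boundary if needed ... */
                    i = k = depth;          /* start the walk at the leaf level */
                    while (--k > 0)
                            path[k].p_block =
                                    le16_to_cpu(path[k].p_hdr->eh_entries) + 1;
            } else {
                    /* truncate case: scan from the right, depth-wise */
                    i = 0;
                    path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
                                   GFP_NOFS);
                    /* ... initialize path[0] from the inode ... */
            }

            while (i >= 0 && err == 0) {
                    /* walk the tree; if ext4_ext_rm_leaf() restarted the
                     * transaction we 'goto again' and recompute i/depth above */
            }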
BUG: unable to handle kernel NULL pointer dereference at 0000000000000028
IP: [<ffffffffa01b4ebd>] ext4_ext_remove_space+0x8e6/0xc4f [ext4]
PGD fe763c067 PUD 101e5a4067 PMD 0
Oops: 0000 [#1] SMP
Modules linked in: brd netconsole configfs cpufreq_ondemand acpi_cpufreq freq_table mperf ext4 jbd2 kvm_intel kvm microcode lpc_ich mfd_core i7300_idle\
i_transport_fc scsi_tgt
CPU 2
Pid: 9930, comm: unlink Not tainted 3.6.0-rc1+ #35 Intel MP Server/S7000FC4UR
RIP: 0010:[<ffffffffa01b4ebd>] [<ffffffffa01b4ebd>] ext4_ext_remove_space+0x8e6/0xc4f [ext4]
RSP: 0018:ffff88101f0ffcb8 EFLAGS: 00010246
RAX: 0000000000000000 RBX: ffff88100b398190 RCX: ffff88100b398030
RDX: 0000000000000001 RSI: 0000000000000002 RDI: 0000000000a00000
RBP: ffff88101f0ffd98 R08: 0000000000a00000 R09: 00019b0d6466e2b3
R10: 0000000000000367 R11: ffff88101f0ffb38 R12: ffff88101fb13d70
R13: 0000000000000000 R14: ffff88101fb13d40 R15: 0000000000000000
FS: 00007f603bbc7700(0000) GS:ffff88103ba00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000028 CR3: 0000000fe7639000 CR4: 00000000000007e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process unlink (pid: 9930, threadinfo ffff88101f0fe000, task ffff88101b26c620)
Stack:
ffff88101f0ffcc8 ffffffffa018ddd4 ffff88101f0ffd28 ffffffffa0192461
ffff881000007800 00000000fffffff5 ffffffff0b398000 0000000000547fff
ffff88100b398000 ffff88100b398190 ffff8810203d0000 ffff88100ea0900c
Call Trace:
[<ffffffffa018ddd4>] ? brelse+0xe/0x10 [ext4]
[<ffffffffa0192461>] ? ext4_mark_iloc_dirty+0x50c/0x56f [ext4]
[<ffffffffa01b6cb3>] ext4_ext_truncate+0xd8/0x184 [ext4]
[<ffffffffa019416c>] ? ext4_evict_inode+0x1c2/0x358 [ext4]
[<ffffffffa01904fb>] ext4_truncate+0xdb/0x158 [ext4]
[<ffffffffa01941f7>] ext4_evict_inode+0x24d/0x358 [ext4]
[<ffffffffa0193faa>] ? ext4_da_writepages+0x54e/0x54e [ext4]
[<ffffffff8114b2e8>] evict+0xa1/0x15b
[<ffffffff8114b58a>] iput+0x1a3/0x1ac
[<ffffffff811410cd>] do_unlinkat+0xff/0x15a
[<ffffffff8108e48b>] ? trace_hardirqs_on_caller+0x151/0x197
[<ffffffff810b11a5>] ? __audit_syscall_entry+0x11f/0x14b
[<ffffffff8121e4de>] ? trace_hardirqs_on_thunk+0x3a/0x3f
[<ffffffff81142b2d>] sys_unlink+0x16/0x18
[<ffffffff8145bca9>] system_call_fastpath+0x16/0x1b
Code: ff 4c 63 65 b0 4d 6b e4 30 4c 03 65 a8 e9 08 01 00 00 48 63 55 b0 4c 6b e2 30 4c 03 65 a8 49 83 7c 24 20 00 75 0e 49 8b 44 24 28 <48> 8b 40 28 49\
8 85 c0 75 22 49 8b
RIP [<ffffffffa01b4ebd>] ext4_ext_remove_space+0x8e6/0xc4f [ext4]
RSP <ffff88101f0ffcb8>
CR2: 0000000000000028
---[ end trace 07fcb23f8e07b495 ]---
#ORIGINAL_TESTCASE(huge hosts only):
modprobe brd rd_size=$((40*1024*1024)) rd_nr=1
mkfs.ext4 /dev/ram0
mount /dev/ram0 $MNT
fallocate -l $((32*1024*1024*1024)) $MNT/file
fio random_write2.fio
unlink $MNT/file
umount $MNT
fsck.ext4 -f /dev/ram0
### fio random_write2.fio job file
[random-writers]
ioengine=libaio
iodepth=256
rw=randwrite
bs=32k
direct=1
directory=/mnt
nrfiles=1
filename=file
filesize=32G
size=8G
group_reporting
numjobs=24
### fio file end
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
fs/ext4/extents.c | 25 +++++++++++--------------
1 files changed, 11 insertions(+), 14 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index af2cc76..5c1d313 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2616,7 +2616,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
struct ext4_ext_path *path = NULL;
ext4_fsblk_t partial_cluster = 0;
handle_t *handle;
- int i = 0, err;
+ int i, err;
BUG_ON(atomic_read(&EXT4_I(inode)->i_aiodio_unwritten));
ext_debug("truncate since %u to %u\n", start, end);
@@ -2627,8 +2627,9 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
return PTR_ERR(handle);
again:
+ err = 0;
+ depth = ext_depth(inode);
ext4_ext_invalidate_cache(inode);
-
trace_ext4_ext_remove_space(inode, start, depth);
/*
@@ -2641,6 +2642,7 @@ again:
if (end < EXT_MAX_BLOCKS - 1) {
struct ext4_extent *ex;
ext4_lblk_t ee_block;
+ int k;
/* find extent for this block */
path = ext4_ext_find_extent(inode, end, NULL);
@@ -2648,7 +2650,6 @@ again:
ext4_journal_stop(handle);
return PTR_ERR(path);
}
- depth = ext_depth(inode);
/* Leaf not may not exist only if inode has no blocks at all */
ex = path[depth].p_ext;
if (!ex) {
@@ -2688,20 +2689,17 @@ again:
if (err < 0)
goto out;
}
- }
-cont:
-
- /*
- * We start scanning from right side, freeing all the blocks
- * after i_size and walking into the tree depth-wise.
- */
- depth = ext_depth(inode);
- if (path) {
- int k = i = depth;
+ i = k = depth;
while (--k > 0)
path[k].p_block =
le16_to_cpu(path[k].p_hdr->eh_entries)+1;
} else {
+ /*
+ * We start scanning from right side, freeing all the blocks
+ * after i_size and walking into the tree depth-wise.
+ */
+
+ i = 0;
path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
GFP_NOFS);
if (path == NULL) {
@@ -2716,7 +2714,6 @@ cont:
goto out;
}
}
- err = 0;
while (i >= 0 && err == 0) {
if (i == depth) {
--
1.7.7.6