* [PATCH 2/2] block_dev/DIO - cache one bio allocation when caching a DIO.
2015-03-04 23:57 [PATCH 0/2] Avoid memory allocation for O_DIRECT IO NeilBrown
@ 2015-03-04 23:57 ` NeilBrown
2015-03-04 23:57 ` [PATCH 1/2] block_dev/DIO: Optionally allocate single 'struct dio' per file NeilBrown
1 sibling, 0 replies; 3+ messages in thread
From: NeilBrown @ 2015-03-04 23:57 UTC (permalink / raw)
To: Alexander Viro; +Cc: linux-fsdevel, linux-kernel
When performing an O_DIRECT write to a block device, a 'struct bio' is
allocated from a mempool.
There is only one mempool for all block devices so if a single block
device blocked indefinitely, the mempool could in theory be exhausted
and other block devices would be affected.
When mdmon needs to update RAID metadata (see previous patch) it needs
to perform an O_DIRECT write to some block devices while another block
device (the array) is frozen. This could conceivably lead to a
deadlock.
Rather than allocate one mempool per block device (which would be an
effective solution), this patch effects a single-bio pool for each
'struct dio' that is being used by an mlockall(MCL_FUTURE) process.
'cache_bio' is added to 'struct dio' and placed at the end so that it
isn't zeroed out regularly.
When an allocation is needed, the bio is used if it is present and
large enough. When a bio if freed, it is placed here if appropriate.
Naturally it is freed when the file is closed.
All other allocations to serve O_DIRECT writes are further down the
stack and use mempools that cannot be exhausted by a frozen md array.
Signed-off-by: NeilBrown <neilb@suse.de>
---
fs/direct-io.c | 45 +++++++++++++++++++++++++++++++++++++++------
1 file changed, 39 insertions(+), 6 deletions(-)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index ece5e45933d2..554913e9cc30 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -139,12 +139,17 @@ struct dio {
struct page *pages[DIO_PAGES]; /* page buffer */
struct work_struct complete_work;/* deferred AIO completion */
};
+ struct bio *cache_bio;
} ____cacheline_aligned_in_smp;
static struct kmem_cache *dio_cache __read_mostly;
void dio_free(struct dio *dio)
{
+ if (dio->cache_bio) {
+ bio_put(dio->cache_bio);
+ dio->cache_bio = NULL;
+ }
kmem_cache_free(dio_cache, dio);
}
@@ -362,13 +367,24 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
struct block_device *bdev,
sector_t first_sector, int nr_vecs)
{
- struct bio *bio;
+ struct bio *bio = NULL;
+ if ((dio->flags & DIO_PERSISTENT_DIO) && dio->cache_bio) {
+ spin_lock_irq(&dio->bio_lock);
+ if (dio->cache_bio &&
+ dio->cache_bio->bi_max_vecs >= nr_vecs) {
+ bio = dio->cache_bio;
+ dio->cache_bio = NULL;
+ bio_reset(bio);
+ }
+ spin_unlock_irq(&dio->bio_lock);
+ }
/*
* bio_alloc() is guaranteed to return a bio when called with
* __GFP_WAIT and we request a valid number of vectors.
*/
- bio = bio_alloc(GFP_KERNEL, nr_vecs);
+ if (!bio)
+ bio = bio_alloc(GFP_KERNEL, nr_vecs);
bio->bi_bdev = bdev;
bio->bi_iter.bi_sector = first_sector;
@@ -480,7 +496,21 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
set_page_dirty_lock(page);
page_cache_release(page);
}
- bio_put(bio);
+ if (dio->flags & DIO_PERSISTENT_DIO) {
+ spin_lock_irq(&dio->bio_lock);
+ if (dio->cache_bio &&
+ dio->cache_bio->bi_max_vecs < bio->bi_max_vecs) {
+ bio_put(dio->cache_bio);
+ dio->cache_bio = NULL;
+ }
+ if (dio->cache_bio == NULL) {
+ dio->cache_bio = bio;
+ bio = NULL;
+ }
+ spin_unlock_irq(&dio->bio_lock);
+ }
+ if (bio)
+ bio_put(bio);
}
return uptodate ? 0 : -EIO;
}
@@ -1144,8 +1174,11 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
if (cmpxchg(&iocb->ki_filp->private_data, dio, NULL) != dio)
dio = NULL;
}
- if (!dio)
+ if (!dio) {
dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
+ if (dio)
+ dio->cache_bio = NULL;
+ }
retval = -ENOMEM;
if (!dio)
goto out;
@@ -1169,7 +1202,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
end - 1);
if (retval) {
mutex_unlock(&inode->i_mutex);
- kmem_cache_free(dio_cache, dio);
+ dio_free(dio);
goto out;
}
}
@@ -1205,7 +1238,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
* We grab i_mutex only for reads so we don't have
* to release it here
*/
- kmem_cache_free(dio_cache, dio);
+ dio_free(dio);
goto out;
}
}
^ permalink raw reply related [flat|nested] 3+ messages in thread
* [PATCH 1/2] block_dev/DIO: Optionally allocate single 'struct dio' per file.
2015-03-04 23:57 [PATCH 0/2] Avoid memory allocation for O_DIRECT IO NeilBrown
2015-03-04 23:57 ` [PATCH 2/2] block_dev/DIO - cache one bio allocation when caching a DIO NeilBrown
@ 2015-03-04 23:57 ` NeilBrown
1 sibling, 0 replies; 3+ messages in thread
From: NeilBrown @ 2015-03-04 23:57 UTC (permalink / raw)
To: Alexander Viro; +Cc: linux-fsdevel, linux-kernel
To be able to support RAID metadata operations in user-space, mdmon
(part of mdadm) sometimes needs to update the metadata on an array
before any future writes to the array are permitted. This is
particularly needed for recording a device failure.
If that array is being used for swap (and even to some extent when
just used for a filesystem) then any memory allocation performed by
mdmon can cause a deadlock if the allocation waits for data to be
written out to the array.
mdmon uses mlockall(MCL_FUTURE|MCL_CURRENT) and is careful not to
allocate any memory at the wrong time. However the kernel sometimes
allocates memory on its behalf and this can deadlock.
Updating the metadata requires an O_DIRECT write to each of a number
of files (which were previously opened). Each write requires
allocating a 'struct dio'.
To avoid this deadlock risk, this patch caches the 'struct dio' the
first time it is allocated so that future writes on the file do not
require the allocation. It is cached in '->private_data' for the
struct file. Only a single struct is cached so only sequential
accesses are allocation-free.
The caching is only performed if mlockall(MCL_FUTURE) is in effect,
thus limiting the change to only those cases where it will bring a
benefit.
Effectively, the memory allocated for O_DIRECT access is 'locked' in
place for future use.
Signed-off-by: NeilBrown <neilb@suse.de>
---
fs/block_dev.c | 7 ++++++-
fs/direct-io.c | 18 ++++++++++++++++--
include/linux/fs.h | 6 ++++++
3 files changed, 28 insertions(+), 3 deletions(-)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 975266be67d3..ed55e5329563 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -155,7 +155,10 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iter,
offset, blkdev_get_block,
- NULL, NULL, 0);
+ NULL, NULL,
+ current->mm &&
+ (current->mm->def_flags & VM_LOCKED)
+ ? DIO_PERSISTENT_DIO : 0);
}
int __sync_blockdev(struct block_device *bdev, int wait)
@@ -1567,6 +1570,8 @@ EXPORT_SYMBOL(blkdev_put);
static int blkdev_close(struct inode * inode, struct file * filp)
{
struct block_device *bdev = I_BDEV(filp->f_mapping->host);
+ if (filp->private_data)
+ dio_free(filp->private_data);
blkdev_put(bdev, filp->f_mode);
return 0;
}
diff --git a/fs/direct-io.c b/fs/direct-io.c
index e181b6b2e297..ece5e45933d2 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -143,6 +143,11 @@ struct dio {
static struct kmem_cache *dio_cache __read_mostly;
+void dio_free(struct dio *dio)
+{
+ kmem_cache_free(dio_cache, dio);
+}
+
/*
* How many pages are in the queue?
*/
@@ -268,7 +273,9 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
aio_complete(dio->iocb, ret, 0);
}
- kmem_cache_free(dio_cache, dio);
+ if (!(dio->flags & DIO_PERSISTENT_DIO) ||
+ cmpxchg(&dio->iocb->ki_filp->private_data, NULL, dio) != NULL)
+ dio_free(dio);
return ret;
}
@@ -1131,7 +1138,14 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
if (rw == READ && !iov_iter_count(iter))
return 0;
- dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
+ dio = NULL;
+ if ((flags & DIO_PERSISTENT_DIO) &&
+ (dio = iocb->ki_filp->private_data) != NULL) {
+ if (cmpxchg(&iocb->ki_filp->private_data, dio, NULL) != dio)
+ dio = NULL;
+ }
+ if (!dio)
+ dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
retval = -ENOMEM;
if (!dio)
goto out;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b4d71b5e1ff2..b821fa32ba3f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -52,6 +52,7 @@ struct seq_file;
struct workqueue_struct;
struct iov_iter;
struct vm_fault;
+struct dio;
extern void __init inode_init(void);
extern void __init inode_init_early(void);
@@ -2612,9 +2613,14 @@ enum {
/* filesystem can handle aio writes beyond i_size */
DIO_ASYNC_EXTEND = 0x04,
+
+ /* file->private_data is used to store a 'struct dio'
+ * between calls */
+ DIO_PERSISTENT_DIO = 0x08,
};
void dio_end_io(struct bio *bio, int error);
+void dio_free(struct dio *dio);
ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, struct iov_iter *iter, loff_t offset,
^ permalink raw reply related [flat|nested] 3+ messages in thread