From: Chris Mason <chris.mason@oracle.com>
To: linux-fsdevel@vger.kernel.org
Cc: akpm@osdl.org, zach.brown@oracle.com,
Suparna Bhattacharya <suparna@in.ibm.com>
Subject: [PATCH 2 of 7] Change O_DIRECT to use placeholders instead of i_mutex/i_alloc_sem locking
Date: Wed, 01 Nov 2006 11:08:04 -0400 [thread overview]
Message-ID: <4486b1f7011adb925d90.1162397284@opti.oraclecorp.com> (raw)
In-Reply-To: <patchbomb.1162397282@opti.oraclecorp.com>
All mutex and semaphore usage is removed from fs/direct-io.c. Callers
can ask for placeholder pages if they want help protecting against
races with buffered io.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff -r e3fe4a6b9355 -r 4486b1f7011a fs/direct-io.c
--- a/fs/direct-io.c Tue Oct 31 10:01:36 2006 -0500
+++ b/fs/direct-io.c Wed Nov 01 10:18:44 2006 -0500
@@ -35,6 +35,7 @@
#include <linux/rwsem.h>
#include <linux/uio.h>
#include <asm/atomic.h>
+#include <linux/writeback.h>
/*
* How many user pages to map in one call to get_user_pages(). This determines
@@ -94,6 +95,14 @@ struct dio {
struct buffer_head map_bh; /* last get_block() result */
/*
+ * kernel page pinning
+ */
+ struct page fake;
+ struct page *tmppages[DIO_PAGES];
+ unsigned long fspages_start_off;
+ unsigned long fspages_end_off;
+
+ /*
* Deferred addition of a page to the dio. These variables are
* private to dio_send_cur_page(), submit_page_section() and
* dio_bio_add_page().
@@ -190,6 +199,33 @@ out:
return ret;
}
+static void unlock_page_range(struct dio *dio, unsigned long start,
+ unsigned long nr)
+{ /*
+ * Remove the placeholder pages this dio holds in its inode's mapping.
+ * @dio:   the direct-io request; its lock_type, tmppages and fake page are used
+ * @start: first page index of the range (callers pass values derived from
+ *         dio->fspages_start_off, i.e. offset >> PAGE_CACHE_SHIFT)
+ * @nr:    number of pages in the range
+ *
+ * Does nothing for DIO_NO_LOCKING, mirroring lock_page_range(), which
+ * returns early without inserting placeholders for that lock type.
+ * NOTE(review): the bounds are passed as (start, start + nr); presumably the
+ * end is exclusive -- confirm against remove_placeholder_pages().
+ */
+ if (dio->lock_type != DIO_NO_LOCKING) {
+ remove_placeholder_pages(dio->inode->i_mapping, dio->tmppages,
+ &dio->fake,
+ start, start + nr,
+ ARRAY_SIZE(dio->tmppages));
+ }
+}
+
+static int lock_page_range(struct dio *dio, unsigned long start,
+ unsigned long nr)
+{ /*
+ * Ask the page cache for placeholders covering nr pages starting at page
+ * index start in the dio's inode mapping.
+ * @dio:   the direct-io request; supplies the mapping, the tmppages scratch
+ *         array and the embedded fake placeholder page
+ * @start: first page index of the range
+ * @nr:    number of pages in the range
+ *
+ * Returns 0 immediately for DIO_NO_LOCKING; otherwise returns whatever
+ * find_or_insert_placeholders() returns (the caller in get_more_blocks()
+ * treats non-zero as an error).
+ * NOTE(review): the final argument is true for READ requests; its exact
+ * meaning is defined by find_or_insert_placeholders() -- confirm there.
+ */
+ struct address_space *mapping = dio->inode->i_mapping;
+ struct page *fake = &dio->fake;
+ unsigned long end = start + nr;
+
+ if (dio->lock_type == DIO_NO_LOCKING)
+ return 0;
+ return find_or_insert_placeholders(mapping, dio->tmppages, start, end,
+ ARRAY_SIZE(dio->tmppages),
+ GFP_KERNEL, fake,
+ dio->rw == READ);
+}
+
+
/*
* Get another userspace page. Returns an ERR_PTR on error. Pages are
* buffered inside the dio so that we can call get_user_pages() against a
@@ -219,9 +255,9 @@ static void dio_complete(struct dio *dio
{
if (dio->end_io && dio->result)
dio->end_io(dio->iocb, offset, bytes, dio->map_bh.b_private);
- if (dio->lock_type == DIO_LOCKING)
- /* lockdep: non-owner release */
- up_read_non_owner(&dio->inode->i_alloc_sem);
+ unlock_page_range(dio, dio->fspages_start_off,
+ dio->fspages_end_off - dio->fspages_start_off);
+ dio->fspages_end_off = dio->fspages_start_off;
}
/*
@@ -517,6 +553,7 @@ static int get_more_blocks(struct dio *d
unsigned long fs_count; /* Number of filesystem-sized blocks */
unsigned long dio_count;/* Number of dio_block-sized blocks */
unsigned long blkmask;
+ unsigned long index;
int create;
/*
@@ -544,7 +581,19 @@ static int get_more_blocks(struct dio *d
} else if (dio->lock_type == DIO_NO_LOCKING) {
create = 0;
}
-
+ index = fs_startblk >> (PAGE_CACHE_SHIFT -
+ dio->inode->i_blkbits);
+ end = (dio->final_block_in_request >> dio->blkfactor) >>
+ (PAGE_CACHE_SHIFT - dio->inode->i_blkbits);
+ BUG_ON(index > end);
+ while (index >= dio->fspages_end_off) {
+ unsigned long nr;
+ nr = min(end - index + 1, (unsigned long)DIO_PAGES);
+ ret = lock_page_range(dio, dio->fspages_end_off, nr);
+ if (ret)
+ goto error;
+ dio->fspages_end_off += nr;
+ }
/*
* For writes inside i_size we forbid block creations: only
* overwrites are permitted. We fall back to buffered writes
@@ -554,6 +603,7 @@ static int get_more_blocks(struct dio *d
ret = (*dio->get_block)(dio->inode, fs_startblk,
map_bh, create);
}
+error:
return ret;
}
@@ -943,9 +993,6 @@ out:
return ret;
}
-/*
- * Releases both i_mutex and i_alloc_sem
- */
static ssize_t
direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
const struct iovec *iov, loff_t offset, unsigned long nr_segs,
@@ -1074,14 +1121,6 @@ direct_io_worker(int rw, struct kiocb *i
* In that case, we need to release all the pages we got hold on.
*/
dio_cleanup(dio);
-
- /*
- * All block lookups have been performed. For READ requests
- * we can let i_mutex go now that its achieved its purpose
- * of protecting us from looking up uninitialized blocks.
- */
- if ((rw == READ) && (dio->lock_type == DIO_LOCKING))
- mutex_unlock(&dio->inode->i_mutex);
/*
* OK, all BIOs are submitted, so we can decrement bio_count to truly
@@ -1165,8 +1204,6 @@ direct_io_worker(int rw, struct kiocb *i
* DIO_LOCKING (simple locking for regular files)
* For writes we are called under i_mutex and return with i_mutex held, even
* though it is internally dropped.
- * For reads, i_mutex is not held on entry, but it is taken and dropped before
- * returning.
*
* DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
* uninitialised data, allowing parallel direct readers and writers)
@@ -1191,8 +1228,7 @@ __blockdev_direct_IO(int rw, struct kioc
ssize_t retval = -EINVAL;
loff_t end = offset;
struct dio *dio;
- int release_i_mutex = 0;
- int acquire_i_mutex = 0;
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
if (rw & WRITE)
rw = WRITE_SYNC;
@@ -1221,49 +1257,29 @@ __blockdev_direct_IO(int rw, struct kioc
goto out;
}
}
-
dio = kmalloc(sizeof(*dio), GFP_KERNEL);
retval = -ENOMEM;
if (!dio)
goto out;
+ set_page_placeholder(&dio->fake);
+ dio->fspages_start_off = offset >> PAGE_CACHE_SHIFT;
+ dio->fspages_end_off = dio->fspages_start_off;
+
/*
* For block device access DIO_NO_LOCKING is used,
* neither readers nor writers do any locking at all
* For regular files using DIO_LOCKING,
- * readers need to grab i_mutex and i_alloc_sem
- * writers need to grab i_alloc_sem only (i_mutex is already held)
+ * No locks are taken
* For regular files using DIO_OWN_LOCKING,
* neither readers nor writers take any locks here
*/
dio->lock_type = dio_lock_type;
- if (dio_lock_type != DIO_NO_LOCKING) {
- /* watch out for a 0 len io from a tricksy fs */
- if (rw == READ && end > offset) {
- struct address_space *mapping;
-
- mapping = iocb->ki_filp->f_mapping;
- if (dio_lock_type != DIO_OWN_LOCKING) {
- mutex_lock(&inode->i_mutex);
- release_i_mutex = 1;
- }
-
- retval = filemap_write_and_wait_range(mapping, offset,
- end - 1);
- if (retval) {
- kfree(dio);
- goto out;
- }
-
- if (dio_lock_type == DIO_OWN_LOCKING) {
- mutex_unlock(&inode->i_mutex);
- acquire_i_mutex = 1;
- }
- }
-
- if (dio_lock_type == DIO_LOCKING)
- /* lockdep: not the owner will release it */
- down_read_non_owner(&inode->i_alloc_sem);
+
+	/*
+	 * For DIO_NO_LOCKING requests with a non-empty range, write back and
+	 * wait on the byte range [offset, end - 1] before issuing the direct I/O.
+	 */
+	if (dio->lock_type == DIO_NO_LOCKING && end > offset) {
+		retval = filemap_write_and_wait_range(mapping, offset, end - 1);
+		if (retval) {
+			/*
+			 * The dio was kmalloc'ed above and has not been handed
+			 * to direct_io_worker() yet; "out" only returns retval,
+			 * so it must be freed here or it is leaked.
+			 */
+			kfree(dio);
+			goto out;
+		}
 	}
/*
@@ -1277,15 +1293,7 @@ __blockdev_direct_IO(int rw, struct kioc
retval = direct_io_worker(rw, iocb, inode, iov, offset,
nr_segs, blkbits, get_block, end_io, dio);
-
- if (rw == READ && dio_lock_type == DIO_LOCKING)
- release_i_mutex = 0;
-
out:
- if (release_i_mutex)
- mutex_unlock(&inode->i_mutex);
- else if (acquire_i_mutex)
- mutex_lock(&inode->i_mutex);
return retval;
}
EXPORT_SYMBOL(__blockdev_direct_IO);
next prev parent reply other threads:[~2006-11-01 16:25 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2006-11-01 15:08 [PATCH 0 of 7] O_DIRECT locking rework Chris Mason
2006-11-01 15:08 ` [PATCH 1 of 7] Introduce a place holder page for the pagecache Chris Mason
2006-11-01 15:08 ` Chris Mason [this message]
2006-11-01 22:44 ` [PATCH 2 of 7] Change O_DIRECT to use placeholders instead of i_mutex/i_alloc_sem locking David Chinner
2006-11-01 15:08 ` [PATCH 3 of 7] DIO: don't fall back to buffered writes Chris Mason
2006-11-01 15:08 ` [PATCH 4 of 7] Turn the DIO lock_type parameter into a flags field Chris Mason
2006-11-01 22:58 ` David Chinner
2006-11-02 1:02 ` Chris Mason
2006-11-02 2:16 ` David Chinner
2006-11-08 18:48 ` Chris Mason
2006-11-01 15:08 ` [PATCH 5 of 7] Make ext3 safe for the new DIO locking rules Chris Mason
2006-11-01 15:08 ` [PATCH 6 of 7] Make reiserfs safe for " Chris Mason
2006-11-01 15:08 ` [PATCH 7 of 7] Adapt XFS to the new blockdev_direct_IO calls Chris Mason
2006-11-01 23:00 ` David Chinner
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4486b1f7011adb925d90.1162397284@opti.oraclecorp.com \
--to=chris.mason@oracle.com \
--cc=akpm@osdl.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=suparna@in.ibm.com \
--cc=zach.brown@oracle.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).