From: Chris Mason <chris.mason@oracle.com>
To: linux-fsdevel@vger.kernel.org
Cc: akpm@osdl.org, zach.brown@oracle.com
Subject: Re: [RFC PATCH 1/2] page cache locking for O_DIRECT
Date: Fri, 20 Oct 2006 14:41:47 -0400 [thread overview]
Message-ID: <20061020184147.GC8674@think.oraclecorp.com> (raw)
In-Reply-To: <20061020183237.GA8674@think.oraclecorp.com>
This changes O_DIRECT to take page locks or insert placeholder pages to
lock regions under direct io.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff -r 18a9e9f5c707 fs/direct-io.c
--- a/fs/direct-io.c Thu Oct 19 08:30:00 2006 +0700
+++ b/fs/direct-io.c Fri Oct 20 12:38:24 2006 -0400
@@ -35,6 +35,7 @@
#include <linux/rwsem.h>
#include <linux/uio.h>
#include <asm/atomic.h>
+#include <linux/writeback.h>
/*
* How many user pages to map in one call to get_user_pages(). This determines
@@ -94,6 +95,14 @@ struct dio {
struct buffer_head map_bh; /* last get_block() result */
/*
+ * kernel page pinning
+ */
+ struct page fake;
+ struct page **fspages;
+ unsigned long nr_fspages;
+ loff_t fs_start_off;
+
+ /*
* Deferred addition of a page to the dio. These variables are
* private to dio_send_cur_page(), submit_page_section() and
* dio_bio_add_page().
@@ -190,6 +199,66 @@ out:
return ret;
}
+/*
+ * Release the first @nr entries of @pages, where pages[n] stands for page
+ * index @start + n of @mapping.
+ *
+ * Entries that test true for PagePlaceHolder() are taken out of the mapping
+ * with remove_placeholder_page(); every other entry is unlocked and has its
+ * reference dropped.  A single wake_up_placeholder_page() call is made at
+ * the end, on the last placeholder seen, and only if at least one
+ * placeholder was removed.
+ */
+static void unlock_page_range(struct address_space *mapping,
+				struct page **pages,
+				unsigned long start,
+				unsigned long nr)
+{
+	struct page *last_placeholder = NULL;
+	unsigned long idx;
+
+	for (idx = 0; idx < nr; idx++) {
+		struct page *page = pages[idx];
+
+		if (!PagePlaceHolder(page)) {
+			/* ordinary page cache page: drop the lock, then the ref */
+			unlock_page(page);
+			page_cache_release(page);
+			continue;
+		}
+
+		/* remember it so waiters can be woken once the loop is done */
+		last_placeholder = page;
+		remove_placeholder_page(mapping, page, start + idx);
+	}
+
+	if (last_placeholder)
+		wake_up_placeholder_page(last_placeholder);
+}
+
+/*
+ * Lock the @nr page cache slots of @mapping starting at page index @start.
+ *
+ * Each slot is obtained with find_or_insert_page(), passing @fake as its
+ * last argument, and the page it returns is stored in @pages.  On success
+ * pages[n] corresponds to page index @start + n for every n < @nr, which
+ * is the layout unlock_page_range() expects.
+ *
+ * If a returned page is dirty it is unlocked and released, writeback is
+ * started from that page to the end of the region, and the SAME index is
+ * looked up again; the index only advances once a clean page is recorded.
+ *
+ * Returns 0 on success.  Returns -1 if find_or_insert_page() returns NULL;
+ * in that case everything recorded so far has already been handed back to
+ * unlock_page_range().
+ */
+static int lock_page_range(struct address_space *mapping,
+				struct page **pages,
+				unsigned long start,
+				unsigned long nr,
+				struct page *fake)
+{
+	struct page *p;
+	unsigned long numlock = 0;
+	unsigned long end = start + nr;
+	/*
+	 * Widen to loff_t before shifting: shifting the unsigned long page
+	 * index first would truncate the byte offset where unsigned long is
+	 * narrower than loff_t.
+	 */
+	loff_t end_bytes = (loff_t)end << PAGE_CACHE_SHIFT;
+	unsigned long i = start;
+
+	while (i < end) {
+		p = find_or_insert_page(mapping, i, GFP_KERNEL, fake);
+		if (!p)
+			goto fail;
+		if (PageDirty(p)) {
+			/* this page was dirty, so someone raced in and
+			 * did a write.  Start IO on the whole region
+			 * and try again
+			 */
+			unlock_page(p);
+			page_cache_release(p);
+			__filemap_fdatawrite_range(mapping,
+					(loff_t)i << PAGE_CACHE_SHIFT,
+					end_bytes, WB_SYNC_ALL);
+			/* retry index i; advancing here would leave it unlocked */
+			continue;
+		}
+		pages[numlock++] = p;
+		i++;
+	}
+	/*
+	 * now that we have all the pages locked, wait for any io
+	 *
+	 * NOTE(review): if wait_on_page_writeback_range() treats its end
+	 * index as inclusive, this also waits on the page just past the
+	 * locked range -- confirm whether end - 1 is intended.
+	 */
+	wait_on_page_writeback_range(mapping, start, end);
+	return 0;
+fail:
+	unlock_page_range(mapping, pages, start, numlock);
+	return -1;
+}
+
+
/*
* Get another userspace page. Returns an ERR_PTR on error. Pages are
* buffered inside the dio so that we can call get_user_pages() against a
@@ -219,9 +288,8 @@ static void dio_complete(struct dio *dio
{
if (dio->end_io && dio->result)
dio->end_io(dio->iocb, offset, bytes, dio->map_bh.b_private);
- if (dio->lock_type == DIO_LOCKING)
- /* lockdep: non-owner release */
- up_read_non_owner(&dio->inode->i_alloc_sem);
+ unlock_page_range(dio->inode->i_mapping, dio->fspages,
+ dio->fs_start_off, dio->nr_fspages);
}
/*
@@ -944,7 +1012,7 @@ out:
}
/*
- * Releases both i_mutex and i_alloc_sem
+ * Releases i_mutex
*/
static ssize_t
direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
@@ -1191,8 +1259,9 @@ __blockdev_direct_IO(int rw, struct kioc
ssize_t retval = -EINVAL;
loff_t end = offset;
struct dio *dio;
- int release_i_mutex = 0;
int acquire_i_mutex = 0;
+ struct page **pages = NULL;
+ unsigned long nrpages;
if (rw & WRITE)
rw = WRITE_SYNC;
@@ -1221,12 +1290,21 @@ __blockdev_direct_IO(int rw, struct kioc
goto out;
}
}
-
dio = kmalloc(sizeof(*dio), GFP_KERNEL);
retval = -ENOMEM;
if (!dio)
goto out;
+ memset(&dio->fake, 0, sizeof(struct page));
+ SetPagePlaceHolder(&dio->fake);
+ nrpages = (end + PAGE_CACHE_SIZE - 1 - offset) >> PAGE_CACHE_SHIFT;
+ dio->fs_start_off = offset >> PAGE_CACHE_SHIFT;
+ pages = kmalloc(sizeof(struct page *) * nrpages, GFP_KERNEL);
+ dio->fspages = pages;
+ dio->nr_fspages = nrpages;
+ if (lock_page_range(inode->i_mapping, pages, dio->fs_start_off, nrpages,
+ &dio->fake))
+ goto out;
/*
* For block device access DIO_NO_LOCKING is used,
* neither readers nor writers do any locking at all
@@ -1240,30 +1318,11 @@ __blockdev_direct_IO(int rw, struct kioc
if (dio_lock_type != DIO_NO_LOCKING) {
/* watch out for a 0 len io from a tricksy fs */
if (rw == READ && end > offset) {
- struct address_space *mapping;
-
- mapping = iocb->ki_filp->f_mapping;
- if (dio_lock_type != DIO_OWN_LOCKING) {
- mutex_lock(&inode->i_mutex);
- release_i_mutex = 1;
- }
-
- retval = filemap_write_and_wait_range(mapping, offset,
- end - 1);
- if (retval) {
- kfree(dio);
- goto out;
- }
-
if (dio_lock_type == DIO_OWN_LOCKING) {
mutex_unlock(&inode->i_mutex);
acquire_i_mutex = 1;
}
}
-
- if (dio_lock_type == DIO_LOCKING)
- /* lockdep: not the owner will release it */
- down_read_non_owner(&inode->i_alloc_sem);
}
/*
@@ -1278,13 +1337,8 @@ __blockdev_direct_IO(int rw, struct kioc
retval = direct_io_worker(rw, iocb, inode, iov, offset,
nr_segs, blkbits, get_block, end_io, dio);
- if (rw == READ && dio_lock_type == DIO_LOCKING)
- release_i_mutex = 0;
-
out:
- if (release_i_mutex)
- mutex_unlock(&inode->i_mutex);
- else if (acquire_i_mutex)
+ if (acquire_i_mutex)
mutex_lock(&inode->i_mutex);
return retval;
}
next prev parent reply other threads:[~2006-10-20 18:42 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2006-10-20 18:32 [RFC PATCH 0/2] O_DIRECT locking rework Chris Mason
2006-10-20 18:38 ` [RFC PATCH 1/2] placeholder pages Chris Mason
2006-10-20 18:41 ` Chris Mason [this message]
2006-10-20 19:30 ` [RFC PATCH 0/2] O_DIRECT locking rework Andrew Morton
2006-10-20 20:03 ` Zach Brown
2006-10-20 20:12 ` Chris Mason
2006-10-20 20:05 ` Chris Mason
2006-10-20 20:23 ` Andi Kleen
2006-10-24 19:34 ` Chris Mason
2006-10-24 19:38 ` [RFC PATCH 1/2] placeholder pages Chris Mason
2006-10-24 19:40 ` [RFC PATCH 2/2] O_DIRECT locking via placeholders Chris Mason
2006-10-24 20:28 ` [RFC PATCH 0/2] O_DIRECT locking rework Badari Pulavarty
2006-10-24 20:50 ` Chris Mason
2006-10-24 21:52 ` Badari Pulavarty
2006-10-24 22:22 ` Chris Mason
2006-10-24 22:47 ` Badari Pulavarty
2006-10-24 22:37 ` Russell Cattelan
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20061020184147.GC8674@think.oraclecorp.com \
--to=chris.mason@oracle.com \
--cc=akpm@osdl.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=zach.brown@oracle.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).