* [PATCH 1 of 7] Introduce a place holder page for the pagecache
2006-11-01 15:08 [PATCH 0 of 7] O_DIRECT locking rework Chris Mason
@ 2006-11-01 15:08 ` Chris Mason
2006-11-01 15:08 ` [PATCH 2 of 7] Change O_DIRECT to use placeholders instead of i_mutex/i_alloc_sem locking Chris Mason
` (5 subsequent siblings)
6 siblings, 0 replies; 14+ messages in thread
From: Chris Mason @ 2006-11-01 15:08 UTC (permalink / raw)
To: linux-fsdevel; +Cc: akpm, zach.brown, Suparna Bhattacharya
mm/filemap.c is changed to wait on these before adding a page into the page
cache, and truncates are changed to wait for all of the place holder pages to
disappear.
Place holder pages can only be tested or looked at with the mapping lock
held, and only PagePlaceHolder(page) can be trusted. They cannot be locked,
and cannot have references increased or decreased on them.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff -r 18a9e9f5c707 -r e3fe4a6b9355 include/linux/mm.h
--- a/include/linux/mm.h Thu Oct 19 08:30:00 2006 +0700
+++ b/include/linux/mm.h Tue Oct 31 10:01:36 2006 -0500
@@ -276,6 +276,7 @@ static inline void get_page(struct page
if (unlikely(PageCompound(page)))
page = (struct page *)page_private(page);
VM_BUG_ON(atomic_read(&page->_count) == 0);
+ VM_BUG_ON(PagePlaceHolder(page));
atomic_inc(&page->_count);
}
diff -r 18a9e9f5c707 -r e3fe4a6b9355 include/linux/page-flags.h
--- a/include/linux/page-flags.h Thu Oct 19 08:30:00 2006 +0700
+++ b/include/linux/page-flags.h Tue Oct 31 10:01:36 2006 -0500
@@ -267,4 +267,6 @@ static inline void set_page_writeback(st
test_set_page_writeback(page);
}
+void set_page_placeholder(struct page *page);
+
#endif /* PAGE_FLAGS_H */
diff -r 18a9e9f5c707 -r e3fe4a6b9355 include/linux/pagemap.h
--- a/include/linux/pagemap.h Thu Oct 19 08:30:00 2006 +0700
+++ b/include/linux/pagemap.h Tue Oct 31 10:01:36 2006 -0500
@@ -72,6 +72,12 @@ extern struct page * find_get_page(struc
unsigned long index);
extern struct page * find_lock_page(struct address_space *mapping,
unsigned long index);
+int find_or_insert_placeholders(struct address_space *mapping,
+ struct page **pages, unsigned long start,
+ unsigned long end, unsigned long nr,
+ gfp_t gfp_mask,
+ struct page *insert,
+ int wait);
extern __deprecated_for_modules struct page * find_trylock_page(
struct address_space *mapping, unsigned long index);
extern struct page * find_or_create_page(struct address_space *mapping,
@@ -82,6 +88,16 @@ unsigned find_get_pages_contig(struct ad
unsigned int nr_pages, struct page **pages);
unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
int tag, unsigned int nr_pages, struct page **pages);
+void remove_placeholder_pages(struct address_space *mapping,
+ struct page **pages,
+ struct page *placeholder,
+ unsigned long offset,
+ unsigned long end,
+ unsigned long nr);
+void wake_up_placeholder_page(struct page *page);
+void wait_on_placeholder_pages_range(struct address_space *mapping, pgoff_t start,
+ pgoff_t end);
+
/*
* Returns locked page at given index in given cache, creating it if needed.
diff -r 18a9e9f5c707 -r e3fe4a6b9355 include/linux/radix-tree.h
--- a/include/linux/radix-tree.h Thu Oct 19 08:30:00 2006 +0700
+++ b/include/linux/radix-tree.h Tue Oct 31 10:01:36 2006 -0500
@@ -56,6 +56,7 @@ radix_tree_gang_lookup(struct radix_tree
radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
unsigned long first_index, unsigned int max_items);
int radix_tree_preload(gfp_t gfp_mask);
+int radix_tree_needs_preload(void);
void radix_tree_init(void);
void *radix_tree_tag_set(struct radix_tree_root *root,
unsigned long index, unsigned int tag);
diff -r 18a9e9f5c707 -r e3fe4a6b9355 lib/radix-tree.c
--- a/lib/radix-tree.c Thu Oct 19 08:30:00 2006 +0700
+++ b/lib/radix-tree.c Tue Oct 31 10:01:36 2006 -0500
@@ -139,6 +139,19 @@ out:
out:
return ret;
}
+
+/*
+ * Must be called inside the preload pair. This tells you if another
+ * run of radix_tree_preload is required, allowing the caller to decide
+ * if a repeated insert will succeed
+ */
+int radix_tree_needs_preload(void)
+{
+ struct radix_tree_preload *rtp;
+ rtp = &__get_cpu_var(radix_tree_preloads);
+ return rtp->nr < ARRAY_SIZE(rtp->nodes);
+}
+EXPORT_SYMBOL_GPL(radix_tree_needs_preload);
static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
int offset)
diff -r 18a9e9f5c707 -r e3fe4a6b9355 mm/filemap.c
--- a/mm/filemap.c Thu Oct 19 08:30:00 2006 +0700
+++ b/mm/filemap.c Tue Oct 31 10:01:36 2006 -0500
@@ -44,6 +44,25 @@ generic_file_direct_IO(int rw, struct ki
generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
loff_t offset, unsigned long nr_segs);
+static wait_queue_head_t *page_waitqueue(struct page *page);
+static void wait_on_placeholder_page(struct address_space *mapping,
+ struct page *page,
+ int write_lock);
+
+static struct address_space placeholder_address_space;
+#define PagePlaceHolder(page) ((page)->mapping == &placeholder_address_space)
+
+/*
+ * Turns the page struct into a marker for a placeholder page.
+ * This should not be called with a real page, only a struct page stub.
+ */
+void set_page_placeholder(struct page *page)
+{
+ memset(page, 0, sizeof(*page));
+ page->mapping = &placeholder_address_space;
+}
+EXPORT_SYMBOL_GPL(set_page_placeholder);
+
/*
* Shared mappings implemented 30.11.1994. It's not fully working yet,
* though.
@@ -437,12 +456,24 @@ int add_to_page_cache(struct page *page,
int add_to_page_cache(struct page *page, struct address_space *mapping,
pgoff_t offset, gfp_t gfp_mask)
{
- int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+ int error;
+again:
+ error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
if (error == 0) {
write_lock_irq(&mapping->tree_lock);
error = radix_tree_insert(&mapping->page_tree, offset, page);
- if (!error) {
+ if (error == -EEXIST && (gfp_mask & __GFP_WAIT)) {
+ struct page *tmp;
+ tmp = radix_tree_lookup(&mapping->page_tree, offset);
+ if (tmp && PagePlaceHolder(tmp)) {
+ radix_tree_preload_end();
+ /* drops the lock */
+ wait_on_placeholder_page(mapping, tmp, 1);
+ goto again;
+ }
+ }
+ if (!error && !PagePlaceHolder(page)) {
page_cache_get(page);
SetPageLocked(page);
page->mapping = mapping;
@@ -526,6 +557,88 @@ void fastcall wait_on_page_bit(struct pa
}
EXPORT_SYMBOL(wait_on_page_bit);
+/*
+ * Call with either a read lock or a write lock on the mapping tree.
+ *
+ * placeholder pages can't be tested or checked without the tree lock held, and
+ * there's no way to know what offset they live at in the tree. This makes it
+ * difficult to retest if a given placeholder is still in the radix tree.
+ *
+ * In order to wait for the placeholders without losing a wakeup from someone
+ * removing them, we have to prepare_to_wait before dropping the tree lock.
+ *
+ * The lock is dropped just before waiting for the place holder. It is not
+ * retaken before returning.
+ */
+static void wait_on_placeholder_page(struct address_space *mapping,
+ struct page *page,
+ int write_lock)
+{
+ DEFINE_WAIT(wait);
+ wait_queue_head_t *wqh = page_waitqueue(page);
+ prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+ if (write_lock)
+ write_unlock_irq(&mapping->tree_lock);
+ else
+ read_unlock_irq(&mapping->tree_lock);
+ io_schedule();
+ finish_wait(wqh, &wait);
+}
+
+void wake_up_placeholder_page(struct page *page)
+{
+ __wake_up_bit(page_waitqueue(page), &page->flags, PG_locked);
+}
+EXPORT_SYMBOL_GPL(wake_up_placeholder_page);
+
+/**
+ * wait_on_placeholder_pages_range - gang placeholder page waiter
+ * @mapping: The address_space to search
+ * @start: The starting page index
+ * @end: The max page index
+ *
+ * wait_on_placeholder_pages_range() will search for and wait on a range of pages
+ * in the mapping
+ *
+ * On return, the range has no placeholder pages sitting in it.
+ */
+void wait_on_placeholder_pages_range(struct address_space *mapping,
+ pgoff_t start, pgoff_t end)
+{
+ unsigned int i;
+ unsigned int ret;
+ struct page *pages[8];
+ pgoff_t cur = start;
+ pgoff_t highest = start;
+
+ /*
+ * we expect a very small number of place holder pages, so
+ * this code isn't trying to be very fast.
+ */
+again:
+ read_lock_irq(&mapping->tree_lock);
+ ret = radix_tree_gang_lookup(&mapping->page_tree,
+ (void **)pages, cur, ARRAY_SIZE(pages));
+ for (i = 0; i < ret; i++) {
+ if (PagePlaceHolder(pages[i])) {
+ /* drops the lock */
+ wait_on_placeholder_page(mapping, pages[i], 0);
+ goto again;
+ } else if (pages[i]->index > highest)
+ highest = pages[i]->index;
+ }
+ if (highest < end && ret == ARRAY_SIZE(pages)) {
+ cur = highest;
+ if (need_resched()) {
+ read_unlock_irq(&mapping->tree_lock);
+ cond_resched();
+ }
+ goto again;
+ }
+ read_unlock_irq(&mapping->tree_lock);
+}
+EXPORT_SYMBOL_GPL(wait_on_placeholder_pages_range);
+
/**
* unlock_page - unlock a locked page
* @page: the page
@@ -542,6 +655,7 @@ EXPORT_SYMBOL(wait_on_page_bit);
*/
void fastcall unlock_page(struct page *page)
{
+ BUG_ON(PagePlaceHolder(page));
smp_mb__before_clear_bit();
if (!TestClearPageLocked(page))
BUG();
@@ -578,6 +692,7 @@ void fastcall __lock_page(struct page *p
{
DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+ BUG_ON(PagePlaceHolder(page));
__wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
TASK_UNINTERRUPTIBLE);
}
@@ -590,6 +705,7 @@ void fastcall __lock_page_nosync(struct
void fastcall __lock_page_nosync(struct page *page)
{
DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+ BUG_ON(PagePlaceHolder(page));
__wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
TASK_UNINTERRUPTIBLE);
}
@@ -608,12 +724,253 @@ struct page * find_get_page(struct addre
read_lock_irq(&mapping->tree_lock);
page = radix_tree_lookup(&mapping->page_tree, offset);
- if (page)
- page_cache_get(page);
+ if (page) {
+ if (PagePlaceHolder(page))
+ page = NULL;
+ else
+ page_cache_get(page);
+ }
read_unlock_irq(&mapping->tree_lock);
return page;
}
EXPORT_SYMBOL(find_get_page);
+
+/**
+ * remove_placeholder_pages - remove a range of placeholder or locked pages
+ * @mapping: the page's address_space
+ * @pages: an array of page pointers to use for gang lookups
+ * @placeholder: the placeholder page previously inserted (for verification)
+ * @start: the search starting point
+ * @end: the search end point (offsets >= end are not touched)
+ * @nr: the size of the pages array.
+ *
+ * Any placeholder pages in the range specified are removed. Any real
+ * pages are unlocked and released.
+ */
+void remove_placeholder_pages(struct address_space *mapping,
+ struct page **pages,
+ struct page *placeholder,
+ unsigned long start,
+ unsigned long end,
+ unsigned long nr)
+{
+ struct page *page;
+ int ret;
+ int i;
+ unsigned long num;
+
+ write_lock_irq(&mapping->tree_lock);
+ while (start < end) {
+ num = min(nr, end-start);
+ ret = radix_tree_gang_lookup(&mapping->page_tree,
+ (void **)pages, start, num);
+ BUG_ON(ret != num);
+ for (i = 0; i < ret; i++) {
+ page = pages[i];
+ if (PagePlaceHolder(page)) {
+ BUG_ON(page != placeholder);
+ radix_tree_delete(&mapping->page_tree,
+ start + i);
+ } else {
+ BUG_ON(page->index >= end);
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ }
+ start += ret;
+ if (need_resched()) {
+ write_unlock_irq(&mapping->tree_lock);
+ cond_resched();
+ write_lock_irq(&mapping->tree_lock);
+ }
+ }
+ write_unlock_irq(&mapping->tree_lock);
+ if (placeholder)
+ wake_up_placeholder_page(placeholder);
+}
+EXPORT_SYMBOL_GPL(remove_placeholder_pages);
+
+/*
+ * a helper function to insert a placeholder into multiple slots
+ * in the radix tree. This could probably use an optimized version
+ * in the radix code. It may insert fewer than the requested number
+ * of placeholders if we need to reschedule or the radix tree needs to
+ * be preloaded again.
+ *
+ * returns zero on error or the number actually inserted.
+ */
+static unsigned long insert_placeholders(struct address_space *mapping,
+ struct page *insert,
+ unsigned long start, unsigned long nr)
+{
+ int err;
+ unsigned long i;
+ for (i = 0 ; i < nr; i++) {
+ err = radix_tree_insert(&mapping->page_tree, start + i,
+ insert);
+ if (err)
+ return i;
+ if (radix_tree_needs_preload() || (i && need_resched()))
+ return i + 1;
+ }
+ return i;
+}
+
+
+/**
+ * find_or_insert_placeholders - locate a group of pagecache pages or insert one
+ * @mapping: the page's address_space
+ * @pages: an array of page pointers to use for gang lookups
+ * @start: the search starting point
+ * @end: the search end point (offsets >= end are not touched)
+ * @nr: the size of the pages array.
+ * @gfp_mask: page allocation mode
+ * @insert: the page to insert if none is found
+ * @iowait: 1 if you want to wait for dirty or writeback pages.
+ *
+ * This locks down a range of offsets in the address space. Any pages
+ * already present are locked and a placeholder page is inserted into
+ * the radix tree for any offsets without pages.
+ */
+int find_or_insert_placeholders(struct address_space *mapping,
+ struct page **pages, unsigned long start,
+ unsigned long end, unsigned long nr,
+ gfp_t gfp_mask,
+ struct page *insert,
+ int iowait)
+{
+ int err = 0;
+ int i, ret;
+ unsigned long cur = start;
+ unsigned long ins;
+ struct page *page;
+ int restart;
+
+ /*
+ * this gets complicated. Placeholders and page locks need to
+ * be taken in order. We use gang lookup to cut down on the cpu
+ * cost, but we need to keep track of holes in the results and
+ * insert placeholders as appropriate.
+ *
+ * If a locked page or a placeholder is found, we wait for it and
+ * pick up where we left off. If a dirty or PG_writeback page is found
+ * and iowait==1, we have to drop all of our locks, kick/wait for the
+ * io and start over.
+ */
+repeat:
+ if (cur != start )
+ cond_resched();
+ err = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+ if (err)
+ goto fail;
+ write_lock_irq(&mapping->tree_lock);
+repeat_lock:
+ ret = radix_tree_gang_lookup(&mapping->page_tree,
+ (void **)pages, cur,
+ min(nr, end-cur));
+ for (i = 0 ; i < ret ; i++) {
+ restart = 0;
+ page = pages[i];
+
+ if (PagePlaceHolder(page)) {
+ radix_tree_preload_end();
+ /* drops the lock */
+ wait_on_placeholder_page(mapping, page, 1);
+ goto repeat;
+ }
+
+ if (page->index > cur) {
+ unsigned long top = min(end, page->index);
+ unsigned long cnt = top - cur;
+ ins = insert_placeholders(mapping, insert, cur, cnt);
+ cur += ins;
+ if (radix_tree_needs_preload() || ins != cnt) {
+ write_unlock_irq(&mapping->tree_lock);
+ radix_tree_preload_end();
+ if (ins == 0) {
+ err = -1;
+ goto fail;
+ }
+ goto repeat;
+ }
+ }
+ if (page->index >= end) {
+ ret = 0;
+ break;
+ }
+ page_cache_get(page);
+ BUG_ON(page->index !=cur);
+ if (TestSetPageLocked(page)) {
+ unsigned long tmpoff = page->index;
+ page_cache_get(page);
+ write_unlock_irq(&mapping->tree_lock);
+ radix_tree_preload_end();
+ __lock_page(page);
+ /* Has the page been truncated while we slept? */
+ if (unlikely(page->mapping != mapping ||
+ page->index != tmpoff)) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto repeat;
+ } else {
+ /* we've locked the page, but we need to
+ * check it for dirty/writeback
+ */
+ restart = 1;
+ }
+ }
+ if (iowait && (PageDirty(page) || PageWriteback(page))) {
+ unlock_page(page);
+ page_cache_release(page);
+ if (!restart) {
+ write_unlock_irq(&mapping->tree_lock);
+ radix_tree_preload_end();
+ }
+ err = filemap_write_and_wait_range(mapping,
+ cur << PAGE_CACHE_SHIFT,
+ end << PAGE_CACHE_SHIFT);
+ if (err)
+ goto fail;
+ goto repeat;
+ }
+ cur++;
+ if (restart)
+ goto repeat;
+ if (cur >= end)
+ break;
+ }
+
+ /* we haven't yet filled the range */
+ if (cur < end) {
+ unsigned long cnt;
+ /* if the search filled our array, there is more to do. */
+ if (ret && ret == nr)
+ goto repeat_lock;
+
+ /* otherwise insert placeholders for the remaining offsets */
+ cnt = end - cur;
+ ins = insert_placeholders(mapping, insert, cur, cnt);
+ cur += ins;
+ if (ins != cnt) {
+ write_unlock_irq(&mapping->tree_lock);
+ radix_tree_preload_end();
+ if (ins == 0) {
+ err = -1;
+ goto fail;
+ }
+ goto repeat;
+ }
+ }
+ write_unlock_irq(&mapping->tree_lock);
+ radix_tree_preload_end();
+ BUG_ON(cur < end);
+ BUG_ON(cur > end);
+ return err;
+fail:
+ remove_placeholder_pages(mapping, pages, insert, start, cur, nr);
+ return err;
+}
+EXPORT_SYMBOL_GPL(find_or_insert_placeholders);
/**
* find_trylock_page - find and lock a page
@@ -628,7 +985,7 @@ struct page *find_trylock_page(struct ad
read_lock_irq(&mapping->tree_lock);
page = radix_tree_lookup(&mapping->page_tree, offset);
- if (page && TestSetPageLocked(page))
+ if (page && (PagePlaceHolder(page) || TestSetPageLocked(page)))
page = NULL;
read_unlock_irq(&mapping->tree_lock);
return page;
@@ -654,6 +1011,12 @@ repeat:
repeat:
page = radix_tree_lookup(&mapping->page_tree, offset);
if (page) {
+ if (PagePlaceHolder(page)) {
+ /* drops the lock */
+ wait_on_placeholder_page(mapping, page, 0);
+ read_lock_irq(&mapping->tree_lock);
+ goto repeat;
+ }
page_cache_get(page);
if (TestSetPageLocked(page)) {
read_unlock_irq(&mapping->tree_lock);
@@ -737,14 +1100,25 @@ unsigned find_get_pages(struct address_s
unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
unsigned int nr_pages, struct page **pages)
{
- unsigned int i;
+ unsigned int i = 0;
unsigned int ret;
read_lock_irq(&mapping->tree_lock);
ret = radix_tree_gang_lookup(&mapping->page_tree,
(void **)pages, start, nr_pages);
- for (i = 0; i < ret; i++)
- page_cache_get(pages[i]);
+ while(i < ret) {
+ if (PagePlaceHolder(pages[i])) {
+ /* we can't return a place holder, shift it away */
+ if (i + 1 < ret) {
+ memcpy(&pages[i], &pages[i+1],
+ (ret - i - 1) * sizeof(struct page *));
+ }
+ ret--;
+ continue;
+ } else
+ page_cache_get(pages[i]);
+ i++;
+ }
read_unlock_irq(&mapping->tree_lock);
return ret;
}
@@ -771,6 +1145,8 @@ unsigned find_get_pages_contig(struct ad
ret = radix_tree_gang_lookup(&mapping->page_tree,
(void **)pages, index, nr_pages);
for (i = 0; i < ret; i++) {
+ if (PagePlaceHolder(pages[i]))
+ break;
if (pages[i]->mapping == NULL || pages[i]->index != index)
break;
@@ -795,14 +1171,25 @@ unsigned find_get_pages_tag(struct addre
unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
int tag, unsigned int nr_pages, struct page **pages)
{
- unsigned int i;
+ unsigned int i = 0;
unsigned int ret;
read_lock_irq(&mapping->tree_lock);
ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
(void **)pages, *index, nr_pages, tag);
- for (i = 0; i < ret; i++)
- page_cache_get(pages[i]);
+ while(i < ret) {
+ if (PagePlaceHolder(pages[i])) {
+ /* we can't return a place holder, shift it away */
+ if (i + 1 < ret) {
+ memcpy(&pages[i], &pages[i+1],
+ (ret - i - 1) * sizeof(struct page *));
+ }
+ ret--;
+ continue;
+ } else
+ page_cache_get(pages[i]);
+ i++;
+ }
if (ret)
*index = pages[ret - 1]->index + 1;
read_unlock_irq(&mapping->tree_lock);
@@ -2368,18 +2755,15 @@ generic_file_direct_IO(int rw, struct ki
unmap_mapping_range(mapping, offset, write_len, 0);
}
- retval = filemap_write_and_wait(mapping);
- if (retval == 0) {
- retval = mapping->a_ops->direct_IO(rw, iocb, iov,
- offset, nr_segs);
- if (rw == WRITE && mapping->nrpages) {
- pgoff_t end = (offset + write_len - 1)
- >> PAGE_CACHE_SHIFT;
- int err = invalidate_inode_pages2_range(mapping,
- offset >> PAGE_CACHE_SHIFT, end);
- if (err)
- retval = err;
- }
+ retval = mapping->a_ops->direct_IO(rw, iocb, iov,
+ offset, nr_segs);
+ if (rw == WRITE && mapping->nrpages) {
+ pgoff_t end = (offset + write_len - 1)
+ >> PAGE_CACHE_SHIFT;
+ int err = invalidate_inode_pages2_range(mapping,
+ offset >> PAGE_CACHE_SHIFT, end);
+ if (err)
+ retval = err;
}
return retval;
}
diff -r 18a9e9f5c707 -r e3fe4a6b9355 mm/truncate.c
--- a/mm/truncate.c Thu Oct 19 08:30:00 2006 +0700
+++ b/mm/truncate.c Tue Oct 31 10:01:36 2006 -0500
@@ -207,6 +207,7 @@ void truncate_inode_pages_range(struct a
}
pagevec_release(&pvec);
}
+ wait_on_placeholder_pages_range(mapping, start, end);
}
EXPORT_SYMBOL(truncate_inode_pages_range);
^ permalink raw reply [flat|nested] 14+ messages in thread* [PATCH 2 of 7] Change O_DIRECT to use placeholders instead of i_mutex/i_alloc_sem locking
2006-11-01 15:08 [PATCH 0 of 7] O_DIRECT locking rework Chris Mason
2006-11-01 15:08 ` [PATCH 1 of 7] Introduce a place holder page for the pagecache Chris Mason
@ 2006-11-01 15:08 ` Chris Mason
2006-11-01 22:44 ` David Chinner
2006-11-01 15:08 ` [PATCH 3 of 7] DIO: don't fall back to buffered writes Chris Mason
` (4 subsequent siblings)
6 siblings, 1 reply; 14+ messages in thread
From: Chris Mason @ 2006-11-01 15:08 UTC (permalink / raw)
To: linux-fsdevel; +Cc: akpm, zach.brown, Suparna Bhattacharya
All mutex and semaphore usage is removed from fs/direct-io.c. Callers
can ask for placeholder pages if they want help protecting against
races with buffered io.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff -r e3fe4a6b9355 -r 4486b1f7011a fs/direct-io.c
--- a/fs/direct-io.c Tue Oct 31 10:01:36 2006 -0500
+++ b/fs/direct-io.c Wed Nov 01 10:18:44 2006 -0500
@@ -35,6 +35,7 @@
#include <linux/rwsem.h>
#include <linux/uio.h>
#include <asm/atomic.h>
+#include <linux/writeback.h>
/*
* How many user pages to map in one call to get_user_pages(). This determines
@@ -94,6 +95,14 @@ struct dio {
struct buffer_head map_bh; /* last get_block() result */
/*
+ * kernel page pinning
+ */
+ struct page fake;
+ struct page *tmppages[DIO_PAGES];
+ unsigned long fspages_start_off;
+ unsigned long fspages_end_off;
+
+ /*
* Deferred addition of a page to the dio. These variables are
* private to dio_send_cur_page(), submit_page_section() and
* dio_bio_add_page().
@@ -190,6 +199,33 @@ out:
return ret;
}
+static void unlock_page_range(struct dio *dio, unsigned long start,
+ unsigned long nr)
+{
+ if (dio->lock_type != DIO_NO_LOCKING) {
+ remove_placeholder_pages(dio->inode->i_mapping, dio->tmppages,
+ &dio->fake,
+ start, start + nr,
+ ARRAY_SIZE(dio->tmppages));
+ }
+}
+
+static int lock_page_range(struct dio *dio, unsigned long start,
+ unsigned long nr)
+{
+ struct address_space *mapping = dio->inode->i_mapping;
+ struct page *fake = &dio->fake;
+ unsigned long end = start + nr;
+
+ if (dio->lock_type == DIO_NO_LOCKING)
+ return 0;
+ return find_or_insert_placeholders(mapping, dio->tmppages, start, end,
+ ARRAY_SIZE(dio->tmppages),
+ GFP_KERNEL, fake,
+ dio->rw == READ);
+}
+
+
/*
* Get another userspace page. Returns an ERR_PTR on error. Pages are
* buffered inside the dio so that we can call get_user_pages() against a
@@ -219,9 +255,9 @@ static void dio_complete(struct dio *dio
{
if (dio->end_io && dio->result)
dio->end_io(dio->iocb, offset, bytes, dio->map_bh.b_private);
- if (dio->lock_type == DIO_LOCKING)
- /* lockdep: non-owner release */
- up_read_non_owner(&dio->inode->i_alloc_sem);
+ unlock_page_range(dio, dio->fspages_start_off,
+ dio->fspages_end_off - dio->fspages_start_off);
+ dio->fspages_end_off = dio->fspages_start_off;
}
/*
@@ -517,6 +553,7 @@ static int get_more_blocks(struct dio *d
unsigned long fs_count; /* Number of filesystem-sized blocks */
unsigned long dio_count;/* Number of dio_block-sized blocks */
unsigned long blkmask;
+ unsigned long index;
int create;
/*
@@ -544,7 +581,19 @@ static int get_more_blocks(struct dio *d
} else if (dio->lock_type == DIO_NO_LOCKING) {
create = 0;
}
-
+ index = fs_startblk >> (PAGE_CACHE_SHIFT -
+ dio->inode->i_blkbits);
+ end = (dio->final_block_in_request >> dio->blkfactor) >>
+ (PAGE_CACHE_SHIFT - dio->inode->i_blkbits);
+ BUG_ON(index > end);
+ while (index >= dio->fspages_end_off) {
+ unsigned long nr;
+ nr = min(end - index + 1, (unsigned long)DIO_PAGES);
+ ret = lock_page_range(dio, dio->fspages_end_off, nr);
+ if (ret)
+ goto error;
+ dio->fspages_end_off += nr;
+ }
/*
* For writes inside i_size we forbid block creations: only
* overwrites are permitted. We fall back to buffered writes
@@ -554,6 +603,7 @@ static int get_more_blocks(struct dio *d
ret = (*dio->get_block)(dio->inode, fs_startblk,
map_bh, create);
}
+error:
return ret;
}
@@ -943,9 +993,6 @@ out:
return ret;
}
-/*
- * Releases both i_mutex and i_alloc_sem
- */
static ssize_t
direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
const struct iovec *iov, loff_t offset, unsigned long nr_segs,
@@ -1074,14 +1121,6 @@ direct_io_worker(int rw, struct kiocb *i
* In that case, we need to release all the pages we got hold on.
*/
dio_cleanup(dio);
-
- /*
- * All block lookups have been performed. For READ requests
- * we can let i_mutex go now that its achieved its purpose
- * of protecting us from looking up uninitialized blocks.
- */
- if ((rw == READ) && (dio->lock_type == DIO_LOCKING))
- mutex_unlock(&dio->inode->i_mutex);
/*
* OK, all BIOs are submitted, so we can decrement bio_count to truly
@@ -1165,8 +1204,6 @@ direct_io_worker(int rw, struct kiocb *i
* DIO_LOCKING (simple locking for regular files)
* For writes we are called under i_mutex and return with i_mutex held, even
* though it is internally dropped.
- * For reads, i_mutex is not held on entry, but it is taken and dropped before
- * returning.
*
* DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
* uninitialised data, allowing parallel direct readers and writers)
@@ -1191,8 +1228,7 @@ __blockdev_direct_IO(int rw, struct kioc
ssize_t retval = -EINVAL;
loff_t end = offset;
struct dio *dio;
- int release_i_mutex = 0;
- int acquire_i_mutex = 0;
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
if (rw & WRITE)
rw = WRITE_SYNC;
@@ -1221,49 +1257,29 @@ __blockdev_direct_IO(int rw, struct kioc
goto out;
}
}
-
dio = kmalloc(sizeof(*dio), GFP_KERNEL);
retval = -ENOMEM;
if (!dio)
goto out;
+ set_page_placeholder(&dio->fake);
+ dio->fspages_start_off = offset >> PAGE_CACHE_SHIFT;
+ dio->fspages_end_off = dio->fspages_start_off;
+
/*
* For block device access DIO_NO_LOCKING is used,
* neither readers nor writers do any locking at all
* For regular files using DIO_LOCKING,
- * readers need to grab i_mutex and i_alloc_sem
- * writers need to grab i_alloc_sem only (i_mutex is already held)
+ * No locks are taken
* For regular files using DIO_OWN_LOCKING,
* neither readers nor writers take any locks here
*/
dio->lock_type = dio_lock_type;
- if (dio_lock_type != DIO_NO_LOCKING) {
- /* watch out for a 0 len io from a tricksy fs */
- if (rw == READ && end > offset) {
- struct address_space *mapping;
-
- mapping = iocb->ki_filp->f_mapping;
- if (dio_lock_type != DIO_OWN_LOCKING) {
- mutex_lock(&inode->i_mutex);
- release_i_mutex = 1;
- }
-
- retval = filemap_write_and_wait_range(mapping, offset,
- end - 1);
- if (retval) {
- kfree(dio);
- goto out;
- }
-
- if (dio_lock_type == DIO_OWN_LOCKING) {
- mutex_unlock(&inode->i_mutex);
- acquire_i_mutex = 1;
- }
- }
-
- if (dio_lock_type == DIO_LOCKING)
- /* lockdep: not the owner will release it */
- down_read_non_owner(&inode->i_alloc_sem);
+
+ if (dio->lock_type == DIO_NO_LOCKING && end > offset) {
+ retval = filemap_write_and_wait_range(mapping, offset, end - 1);
+ if (retval)
+ goto out;
}
/*
@@ -1277,15 +1293,7 @@ __blockdev_direct_IO(int rw, struct kioc
retval = direct_io_worker(rw, iocb, inode, iov, offset,
nr_segs, blkbits, get_block, end_io, dio);
-
- if (rw == READ && dio_lock_type == DIO_LOCKING)
- release_i_mutex = 0;
-
out:
- if (release_i_mutex)
- mutex_unlock(&inode->i_mutex);
- else if (acquire_i_mutex)
- mutex_lock(&inode->i_mutex);
return retval;
}
EXPORT_SYMBOL(__blockdev_direct_IO);
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: [PATCH 2 of 7] Change O_DIRECT to use placeholders instead of i_mutex/i_alloc_sem locking
2006-11-01 15:08 ` [PATCH 2 of 7] Change O_DIRECT to use placeholders instead of i_mutex/i_alloc_sem locking Chris Mason
@ 2006-11-01 22:44 ` David Chinner
0 siblings, 0 replies; 14+ messages in thread
From: David Chinner @ 2006-11-01 22:44 UTC (permalink / raw)
To: Chris Mason; +Cc: linux-fsdevel, akpm, zach.brown, Suparna Bhattacharya
On Wed, Nov 01, 2006 at 11:08:04AM -0400, Chris Mason wrote:
> All mutex and semaphore usage is removed from fs/direct-io.c. Callers
> can ask for placeholder pages if they want help protecting against
> races with buffered io.
>
> Signed-off-by: Chris Mason <chris.mason@oracle.com>
.....
> +static void unlock_page_range(struct dio *dio, unsigned long start,
> + unsigned long nr)
> +{
> + if (dio->lock_type != DIO_NO_LOCKING) {
> + remove_placeholder_pages(dio->inode->i_mapping, dio->tmppages,
> + &dio->fake,
> + start, start + nr,
> + ARRAY_SIZE(dio->tmppages));
> + }
> +}
> +
> +static int lock_page_range(struct dio *dio, unsigned long start,
> + unsigned long nr)
> +{
> + struct address_space *mapping = dio->inode->i_mapping;
> + struct page *fake = &dio->fake;
> + unsigned long end = start + nr;
> +
> + if (dio->lock_type == DIO_NO_LOCKING)
> + return 0;
> + return find_or_insert_placeholders(mapping, dio->tmppages, start, end,
> + ARRAY_SIZE(dio->tmppages),
> + GFP_KERNEL, fake,
> + dio->rw == READ);
> +}
I'm not sure that these two function are well named -
lock_page_range() could be confused for an extension of lock_page()
and they are very different things. Something like dio_lock_range()
and dio_unlock_range() might be more appropriate...
> +
> +
> /*
> * Get another userspace page. Returns an ERR_PTR on error. Pages are
> * buffered inside the dio so that we can call get_user_pages() against a
> @@ -219,9 +255,9 @@ static void dio_complete(struct dio *dio
> {
> if (dio->end_io && dio->result)
> dio->end_io(dio->iocb, offset, bytes, dio->map_bh.b_private);
> - if (dio->lock_type == DIO_LOCKING)
> - /* lockdep: non-owner release */
> - up_read_non_owner(&dio->inode->i_alloc_sem);
> + unlock_page_range(dio, dio->fspages_start_off,
> + dio->fspages_end_off - dio->fspages_start_off);
> + dio->fspages_end_off = dio->fspages_start_off;
> }
dio->fspages_end_off and dio->fspages_start_off only seem to have
relevance to the DIO_PLACEHOLDER case - can this be wrapped up
inside dio_unlock_range()?
>
> /*
> @@ -517,6 +553,7 @@ static int get_more_blocks(struct dio *d
> unsigned long fs_count; /* Number of filesystem-sized blocks */
> unsigned long dio_count;/* Number of dio_block-sized blocks */
> unsigned long blkmask;
> + unsigned long index;
> int create;
>
> /*
> @@ -544,7 +581,19 @@ static int get_more_blocks(struct dio *d
> } else if (dio->lock_type == DIO_NO_LOCKING) {
> create = 0;
> }
> -
> + index = fs_startblk >> (PAGE_CACHE_SHIFT -
> + dio->inode->i_blkbits);
> + end = (dio->final_block_in_request >> dio->blkfactor) >>
> + (PAGE_CACHE_SHIFT - dio->inode->i_blkbits);
> + BUG_ON(index > end);
> + while (index >= dio->fspages_end_off) {
> + unsigned long nr;
> + nr = min(end - index + 1, (unsigned long)DIO_PAGES);
> + ret = lock_page_range(dio, dio->fspages_end_off, nr);
> + if (ret)
> + goto error;
> + dio->fspages_end_off += nr;
> + }
Ditto for this - if we are not using placeholders, then none of this
is needed and so could be wrapped in:
if (dio->flags & DIO_PLACEHOLDER) {
ret = dio_lock_range(dio, ...)
if (ret)
goto error;
}
> @@ -1221,49 +1257,29 @@ __blockdev_direct_IO(int rw, struct kioc
> goto out;
> }
> }
> -
> dio = kmalloc(sizeof(*dio), GFP_KERNEL);
> retval = -ENOMEM;
> if (!dio)
> goto out;
>
> + set_page_placeholder(&dio->fake);
> + dio->fspages_start_off = offset >> PAGE_CACHE_SHIFT;
> + dio->fspages_end_off = dio->fspages_start_off;
> +
if (flags & DIO_PLACEHOLDER) ?
> /*
> * For block device access DIO_NO_LOCKING is used,
> * neither readers nor writers do any locking at all
> * For regular files using DIO_LOCKING,
> - * readers need to grab i_mutex and i_alloc_sem
> - * writers need to grab i_alloc_sem only (i_mutex is already held)
> + * No locks are taken
> * For regular files using DIO_OWN_LOCKING,
> * neither readers nor writers take any locks here
> */
So nothing does any locking here - comment can be simplified/killed?
Cheers,
Dave.
--
Dave Chinner
Principal Engineer
SGI Australian Software Group
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH 3 of 7] DIO: don't fall back to buffered writes
2006-11-01 15:08 [PATCH 0 of 7] O_DIRECT locking rework Chris Mason
2006-11-01 15:08 ` [PATCH 1 of 7] Introduce a place holder page for the pagecache Chris Mason
2006-11-01 15:08 ` [PATCH 2 of 7] Change O_DIRECT to use placeholders instead of i_mutex/i_alloc_sem locking Chris Mason
@ 2006-11-01 15:08 ` Chris Mason
2006-11-01 15:08 ` [PATCH 4 of 7] Turn the DIO lock_type parameter into a flags field Chris Mason
` (3 subsequent siblings)
6 siblings, 0 replies; 14+ messages in thread
From: Chris Mason @ 2006-11-01 15:08 UTC (permalink / raw)
To: linux-fsdevel; +Cc: akpm, zach.brown, Suparna Bhattacharya
Placeholder pages allow DIO to use locking rules similar to that of
writepage. DIO can now fill holes, and it can extend the file via
expanding truncates. This creates holes that are filled during the DIO
write.
i_mutex can be dropped during writes once DIO decides if the file needs to be
extended.
The expanding truncate may cause some pagecache pages to be dirtied.
The call to find_or_insert_placeholders is changed to wait on dirty
or writeback pages in the case of writes as well as reads.
diff -r 4486b1f7011a -r 3fa8c25ec60f fs/direct-io.c
--- a/fs/direct-io.c Wed Nov 01 10:18:44 2006 -0500
+++ b/fs/direct-io.c Wed Nov 01 10:22:34 2006 -0500
@@ -69,6 +69,7 @@ struct dio {
int rw;
loff_t i_size; /* i_size when submitted */
int lock_type; /* doesn't change */
+ int reacquire_i_mutex; /* should we get i_mutex when done? */
unsigned blkbits; /* doesn't change */
unsigned blkfactor; /* When we're using an alignment which
is finer than the filesystem's soft
@@ -221,8 +222,7 @@ static int lock_page_range(struct dio *d
return 0;
return find_or_insert_placeholders(mapping, dio->tmppages, start, end,
ARRAY_SIZE(dio->tmppages),
- GFP_KERNEL, fake,
- dio->rw == READ);
+ GFP_KERNEL, fake, 1);
}
@@ -258,6 +258,8 @@ static void dio_complete(struct dio *dio
unlock_page_range(dio, dio->fspages_start_off,
dio->fspages_end_off - dio->fspages_start_off);
dio->fspages_end_off = dio->fspages_start_off;
+ if (dio->reacquire_i_mutex)
+ mutex_lock(&dio->inode->i_mutex);
}
/*
@@ -574,13 +576,8 @@ static int get_more_blocks(struct dio *d
map_bh->b_size = fs_count << dio->inode->i_blkbits;
create = dio->rw & WRITE;
- if (dio->lock_type == DIO_LOCKING) {
- if (dio->block_in_file < (i_size_read(dio->inode) >>
- dio->blkbits))
- create = 0;
- } else if (dio->lock_type == DIO_NO_LOCKING) {
+ if (dio->lock_type == DIO_NO_LOCKING)
create = 0;
- }
index = fs_startblk >> (PAGE_CACHE_SHIFT -
dio->inode->i_blkbits);
end = (dio->final_block_in_request >> dio->blkfactor) >>
@@ -1291,6 +1288,33 @@ __blockdev_direct_IO(int rw, struct kioc
dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
(end > i_size_read(inode)));
+ /*
+ * extend the file if needed, and drop i_mutex for non-aio writes.
+ * We extend the file by creating a hole that is later filled in
+ * during the O_DIRECT. Because pages are locked or placeholders
+ * are inserted, the locking rules end up being the same as
+ * mmap'd writes using writepage to fill holes
+ */
+ dio->reacquire_i_mutex = 0;
+ if ((rw & WRITE) && dio_lock_type == DIO_LOCKING) {
+ /* if our write goes past i_size, do an expanding
+ * truncate to fill it before dropping i_mutex
+ */
+ if (end > i_size_read(inode) && iocb->ki_filp) {
+ struct iattr newattrs;
+ newattrs.ia_size = end;
+ newattrs.ia_file = iocb->ki_filp;
+ newattrs.ia_valid = ATTR_SIZE | ATTR_FILE;
+ retval = notify_change(iocb->ki_filp->f_dentry,
+ &newattrs);
+ if (retval)
+ goto out;
+ }
+ if (is_sync_kiocb(iocb)) {
+ dio->reacquire_i_mutex = 1;
+ mutex_unlock(&inode->i_mutex);
+ }
+ }
retval = direct_io_worker(rw, iocb, inode, iov, offset,
nr_segs, blkbits, get_block, end_io, dio);
out:
diff -r 4486b1f7011a -r 3fa8c25ec60f mm/filemap.c
--- a/mm/filemap.c Wed Nov 01 10:18:44 2006 -0500
+++ b/mm/filemap.c Wed Nov 01 10:22:34 2006 -0500
@@ -2758,10 +2758,19 @@ generic_file_direct_IO(int rw, struct ki
retval = mapping->a_ops->direct_IO(rw, iocb, iov,
offset, nr_segs);
if (rw == WRITE && mapping->nrpages) {
+ int err;
pgoff_t end = (offset + write_len - 1)
>> PAGE_CACHE_SHIFT;
- int err = invalidate_inode_pages2_range(mapping,
- offset >> PAGE_CACHE_SHIFT, end);
+
+ /* O_DIRECT is allowed to drop i_mutex, so more data
+ * could have been dirtied by others. Start io one more
+ * time
+ */
+ err = filemap_fdatawrite_range(mapping, offset,
+ offset + write_len - 1);
+ if (!err)
+ err = invalidate_inode_pages2_range(mapping,
+ offset >> PAGE_CACHE_SHIFT, end);
if (err)
retval = err;
}
^ permalink raw reply [flat|nested] 14+ messages in thread* [PATCH 4 of 7] Turn the DIO lock_type parameter into a flags field
2006-11-01 15:08 [PATCH 0 of 7] O_DIRECT locking rework Chris Mason
` (2 preceding siblings ...)
2006-11-01 15:08 ` [PATCH 3 of 7] DIO: don't fall back to buffered writes Chris Mason
@ 2006-11-01 15:08 ` Chris Mason
2006-11-01 22:58 ` David Chinner
2006-11-01 15:08 ` [PATCH 5 of 7] Make ext3 safe for the new DIO locking rules Chris Mason
` (2 subsequent siblings)
6 siblings, 1 reply; 14+ messages in thread
From: Chris Mason @ 2006-11-01 15:08 UTC (permalink / raw)
To: linux-fsdevel; +Cc: akpm, zach.brown, Suparna Bhattacharya
This creates a number of flags so that filesystems can control
blockdev_direct_IO. It is based on code from Russell Cettelan.
The new flags are:
DIO_CREATE -- always pass create=1 to get_block on writes. This allows
DIO to fill holes in the file.
DIO_PLACEHOLDERS -- use placeholder pages to provide locking against buffered
io and truncates.
DIO_EXTEND -- use truncate to grow the file instead of falling back to
buffered io.
DIO_DROP_I_MUTEX -- drop i_mutex before starting the IO on writes
Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff -r 3fa8c25ec60f -r f84d3216430d fs/direct-io.c
--- a/fs/direct-io.c Wed Nov 01 10:22:34 2006 -0500
+++ b/fs/direct-io.c Wed Nov 01 10:24:03 2006 -0500
@@ -53,13 +53,6 @@
*
* If blkfactor is zero then the user's request was aligned to the filesystem's
* blocksize.
- *
- * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
- * This determines whether we need to do the fancy locking which prevents
- * direct-IO from being able to read uninitialised disk blocks. If its zero
- * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is
- * not held for the entire direct write (taken briefly, initially, during a
- * direct read though, but its never held for the duration of a direct-IO).
*/
struct dio {
@@ -68,7 +61,7 @@ struct dio {
struct inode *inode;
int rw;
loff_t i_size; /* i_size when submitted */
- int lock_type; /* doesn't change */
+ unsigned flags; /* doesn't change */
int reacquire_i_mutex; /* should we get i_mutex when done? */
unsigned blkbits; /* doesn't change */
unsigned blkfactor; /* When we're using an alignment which
@@ -203,7 +196,7 @@ static void unlock_page_range(struct dio
static void unlock_page_range(struct dio *dio, unsigned long start,
unsigned long nr)
{
- if (dio->lock_type != DIO_NO_LOCKING) {
+ if (dio->flags & DIO_PLACEHOLDERS) {
remove_placeholder_pages(dio->inode->i_mapping, dio->tmppages,
&dio->fake,
start, start + nr,
@@ -218,11 +211,13 @@ static int lock_page_range(struct dio *d
struct page *fake = &dio->fake;
unsigned long end = start + nr;
- if (dio->lock_type == DIO_NO_LOCKING)
- return 0;
- return find_or_insert_placeholders(mapping, dio->tmppages, start, end,
- ARRAY_SIZE(dio->tmppages),
- GFP_KERNEL, fake, 1);
+ if (dio->flags & DIO_PLACEHOLDERS) {
+ return find_or_insert_placeholders(mapping, dio->tmppages,
+ start, end,
+ ARRAY_SIZE(dio->tmppages),
+ GFP_KERNEL, fake, 1);
+ }
+ return 0;
}
@@ -556,6 +551,7 @@ static int get_more_blocks(struct dio *d
unsigned long dio_count;/* Number of dio_block-sized blocks */
unsigned long blkmask;
unsigned long index;
+ unsigned long end;
int create;
/*
@@ -575,8 +571,9 @@ static int get_more_blocks(struct dio *d
map_bh->b_state = 0;
map_bh->b_size = fs_count << dio->inode->i_blkbits;
- create = dio->rw & WRITE;
- if (dio->lock_type == DIO_NO_LOCKING)
+ if (dio->flags & DIO_CREATE)
+ create = dio->rw & WRITE;
+ else
create = 0;
index = fs_startblk >> (PAGE_CACHE_SHIFT -
dio->inode->i_blkbits);
@@ -1193,28 +1190,17 @@ direct_io_worker(int rw, struct kiocb *i
/*
* This is a library function for use by filesystem drivers.
- * The locking rules are governed by the dio_lock_type parameter.
- *
- * DIO_NO_LOCKING (no locking, for raw block device access)
- * For writes, i_mutex is not held on entry; it is never taken.
- *
- * DIO_LOCKING (simple locking for regular files)
- * For writes we are called under i_mutex and return with i_mutex held, even
- * though it is internally dropped.
- *
- * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
- * uninitialised data, allowing parallel direct readers and writers)
- * For writes we are called without i_mutex, return without it, never touch it.
- * For reads we are called under i_mutex and return with i_mutex held, even
- * though it may be internally dropped.
- *
- * Additional i_alloc_sem locking requirements described inline below.
+ * The flags parameter is a bitmask of:
+ *
+ * DIO_PLACEHOLDERS (use placeholder pages for locking)
+ * DIO_CREATE (pass create=1 to get_block for filling holes)
+ * DIO_DROP_I_MUTEX (drop inode->i_mutex during writes)
*/
ssize_t
__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, const struct iovec *iov, loff_t offset,
unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
- int dio_lock_type)
+ unsigned flags)
{
int seg;
size_t size;
@@ -1225,7 +1211,6 @@ __blockdev_direct_IO(int rw, struct kioc
ssize_t retval = -EINVAL;
loff_t end = offset;
struct dio *dio;
- struct address_space *mapping = iocb->ki_filp->f_mapping;
if (rw & WRITE)
rw = WRITE_SYNC;
@@ -1271,9 +1256,14 @@ __blockdev_direct_IO(int rw, struct kioc
* For regular files using DIO_OWN_LOCKING,
* neither readers nor writers take any locks here
*/
- dio->lock_type = dio_lock_type;
-
- if (dio->lock_type == DIO_NO_LOCKING && end > offset) {
+ dio->flags = flags;
+
+ /*
+ * the placeholder code does filemap_write_and_wait, so if we
+ * aren't using placeholders we have to do it here
+ */
+ if (!(dio->flags & DIO_PLACEHOLDERS) && end > offset) {
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
retval = filemap_write_and_wait_range(mapping, offset, end - 1);
if (retval)
goto out;
@@ -1296,11 +1286,12 @@ __blockdev_direct_IO(int rw, struct kioc
* mmap'd writes using writepage to fill holes
*/
dio->reacquire_i_mutex = 0;
- if ((rw & WRITE) && dio_lock_type == DIO_LOCKING) {
+ if (rw & WRITE) {
/* if our write goes past i_size, do an expanding
* truncate to fill it before dropping i_mutex
*/
- if (end > i_size_read(inode) && iocb->ki_filp) {
+ if ((dio->flags & DIO_EXTEND) && end > i_size_read(inode) &&
+ iocb->ki_filp) {
struct iattr newattrs;
newattrs.ia_size = end;
newattrs.ia_file = iocb->ki_filp;
@@ -1310,7 +1301,7 @@ __blockdev_direct_IO(int rw, struct kioc
if (retval)
goto out;
}
- if (is_sync_kiocb(iocb)) {
+ if ((dio->flags & DIO_DROP_I_MUTEX) && is_sync_kiocb(iocb)) {
dio->reacquire_i_mutex = 1;
mutex_unlock(&inode->i_mutex);
}
diff -r 3fa8c25ec60f -r f84d3216430d include/linux/fs.h
--- a/include/linux/fs.h Wed Nov 01 10:22:34 2006 -0500
+++ b/include/linux/fs.h Wed Nov 01 10:24:03 2006 -0500
@@ -1801,21 +1801,32 @@ ssize_t __blockdev_direct_IO(int rw, str
ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, const struct iovec *iov, loff_t offset,
unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
- int lock_type);
-
-enum {
- DIO_LOCKING = 1, /* need locking between buffered and direct access */
- DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */
- DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */
-};
+ unsigned int dio_flags);
+
+#define DIO_PLACEHOLDERS (1 << 0) /* insert placeholder pages */
+#define DIO_CREATE (1 << 1) /* pass create=1 to get_block when writing */
+#define DIO_DROP_I_MUTEX (1 << 2) /* drop i_mutex during writes */
+#define DIO_EXTEND (1 << 3) /* extend the file w/truncate if needed */
static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
struct inode *inode, struct block_device *bdev, const struct iovec *iov,
loff_t offset, unsigned long nr_segs, get_block_t get_block,
dio_iodone_t end_io)
{
+ /* locking is on, FS wants to fill holes w/get_block */
return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
- nr_segs, get_block, end_io, DIO_LOCKING);
+ nr_segs, get_block, end_io, DIO_PLACEHOLDERS |
+ DIO_CREATE | DIO_DROP_I_MUTEX | DIO_EXTEND);
+}
+
+static inline ssize_t blockdev_direct_IO_flags(int rw, struct kiocb *iocb,
+ struct inode *inode, struct block_device *bdev, const struct iovec *iov,
+ loff_t offset, unsigned long nr_segs, get_block_t get_block,
+ dio_iodone_t end_io, unsigned int flags)
+{
+ /* file system dictates locking and create behavior */
+ return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
+ nr_segs, get_block, end_io, flags);
}
static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
@@ -1823,17 +1834,9 @@ static inline ssize_t blockdev_direct_IO
loff_t offset, unsigned long nr_segs, get_block_t get_block,
dio_iodone_t end_io)
{
+ /* locking is off, create is off */
return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
- nr_segs, get_block, end_io, DIO_NO_LOCKING);
-}
-
-static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb,
- struct inode *inode, struct block_device *bdev, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs, get_block_t get_block,
- dio_iodone_t end_io)
-{
- return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
- nr_segs, get_block, end_io, DIO_OWN_LOCKING);
+ nr_segs, get_block, end_io, 0);
}
#endif
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: [PATCH 4 of 7] Turn the DIO lock_type parameter into a flags field
2006-11-01 15:08 ` [PATCH 4 of 7] Turn the DIO lock_type parameter into a flags field Chris Mason
@ 2006-11-01 22:58 ` David Chinner
2006-11-02 1:02 ` Chris Mason
2006-11-08 18:48 ` Chris Mason
0 siblings, 2 replies; 14+ messages in thread
From: David Chinner @ 2006-11-01 22:58 UTC (permalink / raw)
To: Chris Mason; +Cc: linux-fsdevel, akpm, zach.brown, Suparna Bhattacharya
On Wed, Nov 01, 2006 at 11:08:06AM -0400, Chris Mason wrote:
> This creates a number of flags so that filesystems can control
> blockdev_direct_IO. It is based on code from Russell Cettelan.
>
> The new flags are:
> DIO_CREATE -- always pass create=1 to get_block on writes. This allows
> DIO to fill holes in the file.
> DIO_PLACEHOLDERS -- use placeholder pages to provide locking against buffered
> io and truncates.
> DIO_EXTEND -- use truncate to grow the file instead of falling back to
> buffered io.
> DIO_DROP_I_MUTEX -- drop i_mutex before starting the IO on writes
>
> Signed-off-by: Chris Mason <chris.mason@oracle.com>
....
> @@ -1193,28 +1190,17 @@ direct_io_worker(int rw, struct kiocb *i
>
> /*
> * This is a library function for use by filesystem drivers.
> - * The locking rules are governed by the dio_lock_type parameter.
> - *
> - * DIO_NO_LOCKING (no locking, for raw block device access)
> - * For writes, i_mutex is not held on entry; it is never taken.
> - *
> - * DIO_LOCKING (simple locking for regular files)
> - * For writes we are called under i_mutex and return with i_mutex held, even
> - * though it is internally dropped.
> - *
> - * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
> - * uninitialised data, allowing parallel direct readers and writers)
> - * For writes we are called without i_mutex, return without it, never touch it.
> - * For reads we are called under i_mutex and return with i_mutex held, even
> - * though it may be internally dropped.
> - *
> - * Additional i_alloc_sem locking requirements described inline below.
> + * The flags parameter is a bitmask of:
> + *
> + * DIO_PLACEHOLDERS (use placeholder pages for locking)
> + * DIO_CREATE (pass create=1 to get_block for filling holes)
> + * DIO_DROP_I_MUTEX (drop inode->i_mutex during writes)
DIO_EXTEND?
> @@ -1271,9 +1256,14 @@ __blockdev_direct_IO(int rw, struct kioc
> * For regular files using DIO_OWN_LOCKING,
> * neither readers nor writers take any locks here
> */
> - dio->lock_type = dio_lock_type;
> -
> - if (dio->lock_type == DIO_NO_LOCKING && end > offset) {
> + dio->flags = flags;
> +
> + /*
> + * the placeholder code does filemap_write_and_wait, so if we
> + * aren't using placeholders we have to do it here
> + */
> + if (!(dio->flags & DIO_PLACEHOLDERS) && end > offset) {
> + struct address_space *mapping = iocb->ki_filp->f_mapping;
> retval = filemap_write_and_wait_range(mapping, offset, end - 1);
> if (retval)
> goto out;
So that means XFS will now do three cache flushes on write (one in
xfs_write(), one in generic_file_direct_IO() and yet another here....
Given the unconditional flush in generic_file_direct_IO(), is this
flush here needed at all?
> @@ -1310,7 +1301,7 @@ __blockdev_direct_IO(int rw, struct kioc
> if (retval)
> goto out;
> }
> - if (is_sync_kiocb(iocb)) {
> + if ((dio->flags & DIO_DROP_I_MUTEX) && is_sync_kiocb(iocb)) {
> dio->reacquire_i_mutex = 1;
> mutex_unlock(&inode->i_mutex);
> }
Hmm - do we need dio->reacquire_i_mutex? We could use the DIO_DROP_I_MUTEX
to determine if we need to reacquire the mutex (i.e. we clear the
flag if we don't need to reacquire the mutex.).
Cheers,
Dave.
--
Dave Chinner
Principal Engineer
SGI Australian Software Group
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: [PATCH 4 of 7] Turn the DIO lock_type parameter into a flags field
2006-11-01 22:58 ` David Chinner
@ 2006-11-02 1:02 ` Chris Mason
2006-11-02 2:16 ` David Chinner
2006-11-08 18:48 ` Chris Mason
1 sibling, 1 reply; 14+ messages in thread
From: Chris Mason @ 2006-11-02 1:02 UTC (permalink / raw)
To: David Chinner; +Cc: linux-fsdevel, akpm, zach.brown, Suparna Bhattacharya
On Thu, Nov 02, 2006 at 09:58:58AM +1100, David Chinner wrote:
[ ...]
Thanks for the review, I'll incorporate these suggestions into the next
patch set.
> > +
> > + /*
> > + * the placeholder code does filemap_write_and_wait, so if we
> > + * aren't using placeholders we have to do it here
> > + */
> > + if (!(dio->flags & DIO_PLACEHOLDERS) && end > offset) {
> > + struct address_space *mapping = iocb->ki_filp->f_mapping;
> > retval = filemap_write_and_wait_range(mapping, offset, end - 1);
> > if (retval)
> > goto out;
>
> So that means XFS will now do three cache flushes on write (one in
> xfs_write(), one in generic_file_direct_IO() and yet another here....
I removed the flush before starting the IO in generic_file_direct_IO
because that is now done by the placeholders. When placeholders aren't
used, we need the flush to match the old behavior.
I added the flush in generic_file_direct_IO after calling the direct_IO
func because invalidate_page_range2 requires the data to be flushed
first. Otherwise it fails to free a dirty page with dirty buffers and
spits out -EIO. This only happens when mapping->nrpages > 0, so it
should be very low cost in the common case.
It's messy, I'll try to refactor things a bit to make it cleaner (same
goes for i_mutex drop/lock).
-chris
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: [PATCH 4 of 7] Turn the DIO lock_type parameter into a flags field
2006-11-02 1:02 ` Chris Mason
@ 2006-11-02 2:16 ` David Chinner
0 siblings, 0 replies; 14+ messages in thread
From: David Chinner @ 2006-11-02 2:16 UTC (permalink / raw)
To: Chris Mason
Cc: David Chinner, linux-fsdevel, akpm, zach.brown,
Suparna Bhattacharya
On Wed, Nov 01, 2006 at 08:02:25PM -0500, Chris Mason wrote:
> On Thu, Nov 02, 2006 at 09:58:58AM +1100, David Chinner wrote:
> [ ...]
>
> Thanks for the review, I'll incorporate these suggestions into the next
> patch set.
>
> > > +
> > > + /*
> > > + * the placeholder code does filemap_write_and_wait, so if we
> > > + * aren't using placeholders we have to do it here
> > > + */
> > > + if (!(dio->flags & DIO_PLACEHOLDERS) && end > offset) {
> > > + struct address_space *mapping = iocb->ki_filp->f_mapping;
> > > retval = filemap_write_and_wait_range(mapping, offset, end - 1);
> > > if (retval)
> > > goto out;
> >
> > So that means XFS will now do three cache flushes on write (one in
> > xfs_write(), one in generic_file_direct_IO() and yet another here....
>
> I removed the flush before starting the IO in generic_file_direct_IO
> because that is now done by the placeholders. When placeholders aren't
> used, we need the flush to match the old behavior.
OK - given that XFS does a flush-and-invalidate (with the i_mutex held)
before calling the generic direct I/O code, do we have grounds for another
flag here as it is not needed for XFS?
(FWIW, XFS's fs_flushinval_pages() could be made smarter now that we
have range based functions that can be used....)
> I added the flush in generic_file_direct_IO after calling the direct_IO
> func because invalidate_page_range2 requires the data to be flushed
> first. Otherwise it fails to free a dirty page with dirty buffers and
> spits out -EIO. This only happens when mapping->nrpages > 0, so it
> should be very low cost in the common case.
*nod*. XFS does its flush-inval on the same criteria.
> It's messy, I'll try to refactor things a bit to make it cleaner (same
> goes for i_mutex drop/lock).
True, but it is a lot less messy than the current code ;)
Cheers,
Dave.
--
Dave Chinner
Principal Engineer
SGI Australian Software Group
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 4 of 7] Turn the DIO lock_type parameter into a flags field
2006-11-01 22:58 ` David Chinner
2006-11-02 1:02 ` Chris Mason
@ 2006-11-08 18:48 ` Chris Mason
1 sibling, 0 replies; 14+ messages in thread
From: Chris Mason @ 2006-11-08 18:48 UTC (permalink / raw)
To: David Chinner; +Cc: linux-fsdevel, akpm, zach.brown, Suparna Bhattacharya
On Thu, Nov 02, 2006 at 09:58:58AM +1100, David Chinner wrote:
> > + if (!(dio->flags & DIO_PLACEHOLDERS) && end > offset) {
> > + struct address_space *mapping = iocb->ki_filp->f_mapping;
> > retval = filemap_write_and_wait_range(mapping, offset, end - 1);
> > if (retval)
> > goto out;
>
> So that means XFS will now do three cache flushes on write (one in
> xfs_write(), one in generic_file_direct_IO() and yet another here....
>
> Given the unconditional flush in generic_file_direct_IO(), is this
> flush here needed at all?
Ok, here's a new patch that replaces #4 of 7, and tries to rework the
API a bit to be more flexible....or easier to cut and paste, depending
on how you look at it.
-chris
--
Turn the DIO lock_type parameter into a flags field
This creates a number of flags so that filesystems can control
blockdev_direct_IO. It is based on code from Russell Cettelan.
The new flags are:
DIO_CREATE -- always pass create=1 to get_block on writes. This allows
DIO to fill holes in the file.
DIO_PLACEHOLDERS -- use placeholder pages to provide locking against buffered
io and truncates.
DIO_EXTEND -- use truncate to grow the file instead of falling back to
buffered io.
DIO_DROP_I_MUTEX -- drop i_mutex before starting the mapping, io submission,
or io waiting. The mutex is still dropped for AIO
as well.
Some API changes are made so that filesystems can have more control
over the DIO features.
__blockdev_direct_IO is more or less renamed to blockdev_direct_IO_flags.
All waiting and invalidating of page cache data is pushed down into
blockdev_direct_IO_flags (and removed from mm/filemap.c)
direct_io_worker is exported into the wild. Filesystems that want to be
special can pull out the bits of blockdev_direct_IO_flags they care about
and then call direct_io_worker directly.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff -r 98112f002089 fs/direct-io.c
--- a/fs/direct-io.c Wed Nov 08 10:33:48 2006 -0500
+++ b/fs/direct-io.c Wed Nov 08 11:46:26 2006 -0500
@@ -53,13 +53,6 @@
*
* If blkfactor is zero then the user's request was aligned to the filesystem's
* blocksize.
- *
- * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
- * This determines whether we need to do the fancy locking which prevents
- * direct-IO from being able to read uninitialised disk blocks. If its zero
- * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is
- * not held for the entire direct write (taken briefly, initially, during a
- * direct read though, but its never held for the duration of a direct-IO).
*/
struct dio {
@@ -68,8 +61,7 @@ struct dio {
struct inode *inode;
int rw;
loff_t i_size; /* i_size when submitted */
- int lock_type; /* doesn't change */
- int reacquire_i_mutex; /* should we get i_mutex when done? */
+ unsigned flags; /* doesn't change */
unsigned blkbits; /* doesn't change */
unsigned blkfactor; /* When we're using an alignment which
is finer than the filesystem's soft
@@ -203,7 +195,7 @@ static void unlock_page_range(struct dio
static void unlock_page_range(struct dio *dio, unsigned long start,
unsigned long nr)
{
- if (dio->lock_type != DIO_NO_LOCKING) {
+ if (dio->flags & DIO_PLACEHOLDERS) {
remove_placeholder_pages(dio->inode->i_mapping, dio->tmppages,
&dio->fake,
start, start + nr,
@@ -218,11 +210,13 @@ static int lock_page_range(struct dio *d
struct page *fake = &dio->fake;
unsigned long end = start + nr;
- if (dio->lock_type == DIO_NO_LOCKING)
- return 0;
- return find_or_insert_placeholders(mapping, dio->tmppages, start, end,
- ARRAY_SIZE(dio->tmppages),
- GFP_KERNEL, fake, 1);
+ if (dio->flags & DIO_PLACEHOLDERS) {
+ return find_or_insert_placeholders(mapping, dio->tmppages,
+ start, end,
+ ARRAY_SIZE(dio->tmppages),
+ GFP_KERNEL, fake, 1);
+ }
+ return 0;
}
@@ -258,8 +252,6 @@ static void dio_complete(struct dio *dio
unlock_page_range(dio, dio->fspages_start_off,
dio->fspages_end_off - dio->fspages_start_off);
dio->fspages_end_off = dio->fspages_start_off;
- if (dio->reacquire_i_mutex)
- mutex_lock(&dio->inode->i_mutex);
}
/*
@@ -556,6 +548,7 @@ static int get_more_blocks(struct dio *d
unsigned long dio_count;/* Number of dio_block-sized blocks */
unsigned long blkmask;
unsigned long index;
+ unsigned long end;
int create;
/*
@@ -575,8 +568,9 @@ static int get_more_blocks(struct dio *d
map_bh->b_state = 0;
map_bh->b_size = fs_count << dio->inode->i_blkbits;
- create = dio->rw & WRITE;
- if (dio->lock_type == DIO_NO_LOCKING)
+ if (dio->flags & DIO_CREATE)
+ create = dio->rw & WRITE;
+ else
create = 0;
index = fs_startblk >> (PAGE_CACHE_SHIFT -
dio->inode->i_blkbits);
@@ -990,18 +984,44 @@ out:
return ret;
}
-static ssize_t
-direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
- const struct iovec *iov, loff_t offset, unsigned long nr_segs,
+/*
+ * This does all the real work of the direct io. Most filesystems want to
+ * call blockdev_direct_IO_flags instead, but if you have exotic locking
+ * routines you can call this directly.
+ *
+ * The flags parameter is a bitmask of:
+ *
+ * DIO_PLACEHOLDERS (use placeholder pages for locking)
+ * DIO_CREATE (pass create=1 to get_block for filling holes)
+ * DIO_DROP_I_MUTEX (drop inode->i_mutex during writes)
+ * DIO_EXTEND (use truncate to expand the file if needed)
+ */
+ssize_t
+direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
+ const struct iovec *iov, loff_t offset, unsigned long nr_segs,
unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
- struct dio *dio)
-{
- unsigned long user_addr;
+ int is_async, unsigned flags)
+{
+ unsigned long user_addr;
int seg;
ssize_t ret = 0;
ssize_t ret2;
size_t bytes;
-
+ struct dio *dio;
+
+ if (rw & WRITE)
+ rw = WRITE_SYNC;
+
+ dio = kmalloc(sizeof(*dio), GFP_KERNEL);
+ ret = -ENOMEM;
+ if (!dio)
+ goto out;
+
+ set_page_placeholder(&dio->fake);
+ dio->fspages_start_off = offset >> PAGE_CACHE_SHIFT;
+ dio->fspages_end_off = dio->fspages_start_off;
+ dio->flags = flags;
+ dio->is_async = is_async;
dio->bio = NULL;
dio->inode = inode;
dio->rw = rw;
@@ -1188,33 +1208,24 @@ direct_io_worker(int rw, struct kiocb *i
aio_complete(iocb, ret, 0);
kfree(dio);
}
+out:
return ret;
}
-
-/*
- * This is a library function for use by filesystem drivers.
- * The locking rules are governed by the dio_lock_type parameter.
- *
- * DIO_NO_LOCKING (no locking, for raw block device access)
- * For writes, i_mutex is not held on entry; it is never taken.
- *
- * DIO_LOCKING (simple locking for regular files)
- * For writes we are called under i_mutex and return with i_mutex held, even
- * though it is internally dropped.
- *
- * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
- * uninitialised data, allowing parallel direct readers and writers)
- * For writes we are called without i_mutex, return without it, never touch it.
- * For reads we are called under i_mutex and return with i_mutex held, even
- * though it may be internally dropped.
- *
- * Additional i_alloc_sem locking requirements described inline below.
- */
-ssize_t
-__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
- struct block_device *bdev, const struct iovec *iov, loff_t offset,
- unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
- int dio_lock_type)
+EXPORT_SYMBOL(direct_io_worker);
+
+/*
+ * A utility function for blockdev_direct_IO_flags, this checks
+ * alignment of an O_DIRECT iovec against filesystem and blockdevice
+ * requirements.
+ *
+ * It returns a blkbits value that will work for the io, and returns the
+ * end offset of the io (via blkbits_ret and end_ret).
+ *
+ * The function returns 0 if everything will work or -EINVAL on error
+ */
+int check_dio_alignment(struct inode *inode, struct block_device *bdev,
+ const struct iovec *iov, loff_t offset, unsigned long nr_segs,
+ unsigned *blkbits_ret, loff_t *end_ret)
{
int seg;
size_t size;
@@ -1222,13 +1233,7 @@ __blockdev_direct_IO(int rw, struct kioc
unsigned blkbits = inode->i_blkbits;
unsigned bdev_blkbits = 0;
unsigned blocksize_mask = (1 << blkbits) - 1;
- ssize_t retval = -EINVAL;
loff_t end = offset;
- struct dio *dio;
- struct address_space *mapping = iocb->ki_filp->f_mapping;
-
- if (rw & WRITE)
- rw = WRITE_SYNC;
if (bdev)
bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
@@ -1238,7 +1243,7 @@ __blockdev_direct_IO(int rw, struct kioc
blkbits = bdev_blkbits;
blocksize_mask = (1 << blkbits) - 1;
if (offset & blocksize_mask)
- goto out;
+ return -EINVAL;
}
/* Check the memory alignment. Blocks cannot straddle pages */
@@ -1250,43 +1255,42 @@ __blockdev_direct_IO(int rw, struct kioc
if (bdev)
blkbits = bdev_blkbits;
blocksize_mask = (1 << blkbits) - 1;
- if ((addr & blocksize_mask) || (size & blocksize_mask))
- goto out;
- }
- }
- dio = kmalloc(sizeof(*dio), GFP_KERNEL);
- retval = -ENOMEM;
- if (!dio)
+ if ((addr & blocksize_mask) || (size & blocksize_mask))
+ return -EINVAL;
+ }
+ }
+ *end_ret = end;
+ *blkbits_ret = blkbits;
+ return 0;
+}
+EXPORT_SYMBOL(check_dio_alignment);
+
+/*
+ * This is a library function for use by filesystem drivers.
+ * The flags parameter is a bitmask of:
+ *
+ * DIO_PLACEHOLDERS (use placeholder pages for locking)
+ * DIO_CREATE (pass create=1 to get_block for filling holes)
+ * DIO_DROP_I_MUTEX (drop inode->i_mutex during writes)
+ * DIO_EXTEND (use truncate to expand the file if needed)
+ */
+ssize_t
+blockdev_direct_IO_flags(int rw, struct kiocb *iocb, struct inode *inode,
+ struct block_device *bdev, const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
+ unsigned flags)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ unsigned blkbits = 0;
+ ssize_t retval = -EINVAL;
+ loff_t end = 0;
+ int is_async;
+ int grab_i_mutex = 0;
+
+
+ if (check_dio_alignment(inode, bdev, iov, offset, nr_segs,
+ &blkbits, &end))
goto out;
-
- set_page_placeholder(&dio->fake);
- dio->fspages_start_off = offset >> PAGE_CACHE_SHIFT;
- dio->fspages_end_off = dio->fspages_start_off;
-
- /*
- * For block device access DIO_NO_LOCKING is used,
- * neither readers nor writers do any locking at all
- * For regular files using DIO_LOCKING,
- * No locks are taken
- * For regular files using DIO_OWN_LOCKING,
- * neither readers nor writers take any locks here
- */
- dio->lock_type = dio_lock_type;
-
- if (dio->lock_type == DIO_NO_LOCKING && end > offset) {
- retval = filemap_write_and_wait_range(mapping, offset, end - 1);
- if (retval)
- goto out;
- }
-
- /*
- * For file extending writes updating i_size before data
- * writeouts complete can expose uninitialized blocks. So
- * even for AIO, we need to wait for i/o to complete before
- * returning in this case.
- */
- dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
- (end > i_size_read(inode)));
/*
* extend the file if needed, and drop i_mutex for non-aio writes.
@@ -1295,12 +1299,12 @@ __blockdev_direct_IO(int rw, struct kioc
* are inserted, the locking rules end up being the same as
* mmap'd writes using writepage to fill holes
*/
- dio->reacquire_i_mutex = 0;
- if ((rw & WRITE) && dio_lock_type == DIO_LOCKING) {
+ if (rw & WRITE) {
/* if our write goes past i_size, do an expanding
* truncate to fill it before dropping i_mutex
*/
- if (end > i_size_read(inode) && iocb->ki_filp) {
+ if ((flags & DIO_EXTEND) && end > i_size_read(inode) &&
+ iocb->ki_filp) {
struct iattr newattrs;
newattrs.ia_size = end;
newattrs.ia_file = iocb->ki_filp;
@@ -1310,14 +1314,59 @@ __blockdev_direct_IO(int rw, struct kioc
if (retval)
goto out;
}
- if (is_sync_kiocb(iocb)) {
- dio->reacquire_i_mutex = 1;
+ /*
+ * If it's a write, unmap all mmappings of the file up-front.
+ * This will cause any pte dirty bits to be propagated into
+ * the pageframes for the subsequent filemap_write_and_wait().
+ */
+ if (mapping_mapped(mapping))
+ unmap_mapping_range(mapping, offset, end - offset, 0);
+ if (flags & DIO_DROP_I_MUTEX) {
mutex_unlock(&inode->i_mutex);
- }
- }
+ grab_i_mutex = 1;
+ }
+ }
+
+ /*
+ * the placeholder code does filemap_write_and_wait, so if we
+ * aren't using placeholders we have to do it here
+ */
+ if (!(flags & DIO_PLACEHOLDERS) && end > offset) {
+ retval = filemap_write_and_wait_range(mapping, offset, end - 1);
+ if (retval)
+ goto out;
+ }
+
+ /*
+ * For file extending writes updating i_size before data
+ * writeouts complete can expose uninitialized blocks. So
+ * even for AIO, we need to wait for i/o to complete before
+ * returning in this case.
+ */
+ is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
+ (end > i_size_read(inode)));
+
retval = direct_io_worker(rw, iocb, inode, iov, offset,
- nr_segs, blkbits, get_block, end_io, dio);
+ nr_segs, blkbits, get_block, end_io, is_async,
+ flags);
out:
+ if (grab_i_mutex)
+ mutex_lock(&inode->i_mutex);
+
+ if ((rw & WRITE) && mapping->nrpages) {
+ int err;
+ /* O_DIRECT is allowed to drop i_mutex, so more data
+ * could have been dirtied by others. Start io one more
+ * time
+ */
+ err = filemap_write_and_wait_range(mapping, offset, end - 1);
+ if (!err)
+ err = invalidate_inode_pages2_range(mapping,
+ offset >> PAGE_CACHE_SHIFT,
+ (end - 1) >> PAGE_CACHE_SHIFT);
+ if (!retval && err)
+ retval = err;
+ }
return retval;
}
-EXPORT_SYMBOL(__blockdev_direct_IO);
+EXPORT_SYMBOL(blockdev_direct_IO_flags);
diff -r 98112f002089 include/linux/fs.h
--- a/include/linux/fs.h Wed Nov 08 10:33:48 2006 -0500
+++ b/include/linux/fs.h Wed Nov 08 11:37:36 2006 -0500
@@ -1798,24 +1798,29 @@ static inline void do_generic_file_read(
}
#ifdef CONFIG_BLOCK
-ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+int check_dio_alignment(struct inode *inode, struct block_device *bdev,
+ const struct iovec *iov, loff_t offset, unsigned long nr_segs,
+ unsigned *blkbits_ret, loff_t *end_ret);
+
+ssize_t blockdev_direct_IO_flags(int rw, struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, const struct iovec *iov, loff_t offset,
unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
- int lock_type);
-
-enum {
- DIO_LOCKING = 1, /* need locking between buffered and direct access */
- DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */
- DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */
-};
+ unsigned int dio_flags);
+
+#define DIO_PLACEHOLDERS (1 << 0) /* insert placeholder pages */
+#define DIO_CREATE (1 << 1) /* pass create=1 to get_block when writing */
+#define DIO_DROP_I_MUTEX (1 << 2) /* drop i_mutex during writes */
+#define DIO_EXTEND (1 << 3) /* extend the file w/truncate if needed */
static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
struct inode *inode, struct block_device *bdev, const struct iovec *iov,
loff_t offset, unsigned long nr_segs, get_block_t get_block,
dio_iodone_t end_io)
{
- return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
- nr_segs, get_block, end_io, DIO_LOCKING);
+ /* locking is on, FS wants to fill holes w/get_block */
+ return blockdev_direct_IO_flags(rw, iocb, inode, bdev, iov, offset,
+ nr_segs, get_block, end_io, DIO_PLACEHOLDERS |
+ DIO_CREATE | DIO_DROP_I_MUTEX | DIO_EXTEND);
}
static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
@@ -1823,17 +1828,9 @@ static inline ssize_t blockdev_direct_IO
loff_t offset, unsigned long nr_segs, get_block_t get_block,
dio_iodone_t end_io)
{
- return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
- nr_segs, get_block, end_io, DIO_NO_LOCKING);
-}
-
-static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb,
- struct inode *inode, struct block_device *bdev, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs, get_block_t get_block,
- dio_iodone_t end_io)
-{
- return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
- nr_segs, get_block, end_io, DIO_OWN_LOCKING);
+ /* locking is off, create is off */
+ return blockdev_direct_IO_flags(rw, iocb, inode, bdev, iov, offset,
+ nr_segs, get_block, end_io, 0);
}
#endif
diff -r 98112f002089 mm/filemap.c
--- a/mm/filemap.c Wed Nov 08 10:33:48 2006 -0500
+++ b/mm/filemap.c Wed Nov 08 11:49:14 2006 -0500
@@ -40,7 +40,7 @@
#include <asm/mman.h>
-static ssize_t
+static inline ssize_t
generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
loff_t offset, unsigned long nr_segs);
@@ -2735,46 +2735,12 @@ EXPORT_SYMBOL(generic_file_aio_write);
* Called under i_mutex for writes to S_ISREG files. Returns -EIO if something
* went wrong during pagecache shootdown.
*/
-static ssize_t
+static inline ssize_t
generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
loff_t offset, unsigned long nr_segs)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- ssize_t retval;
- size_t write_len = 0;
-
- /*
- * If it's a write, unmap all mmappings of the file up-front. This
- * will cause any pte dirty bits to be propagated into the pageframes
- * for the subsequent filemap_write_and_wait().
- */
- if (rw == WRITE) {
- write_len = iov_length(iov, nr_segs);
- if (mapping_mapped(mapping))
- unmap_mapping_range(mapping, offset, write_len, 0);
- }
-
- retval = mapping->a_ops->direct_IO(rw, iocb, iov,
- offset, nr_segs);
- if (rw == WRITE && mapping->nrpages) {
- int err;
- pgoff_t end = (offset + write_len - 1)
- >> PAGE_CACHE_SHIFT;
-
- /* O_DIRECT is allowed to drop i_mutex, so more data
- * could have been dirtied by others. Start io one more
- * time
- */
- err = filemap_fdatawrite_range(mapping, offset,
- offset + write_len - 1);
- if (!err)
- err = invalidate_inode_pages2_range(mapping,
- offset >> PAGE_CACHE_SHIFT, end);
- if (err)
- retval = err;
- }
- return retval;
+ return iocb->ki_filp->f_mapping->a_ops->direct_IO(rw, iocb, iov,
+ offset, nr_segs);
}
/**
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH 5 of 7] Make ext3 safe for the new DIO locking rules
2006-11-01 15:08 [PATCH 0 of 7] O_DIRECT locking rework Chris Mason
` (3 preceding siblings ...)
2006-11-01 15:08 ` [PATCH 4 of 7] Turn the DIO lock_type parameter into a flags field Chris Mason
@ 2006-11-01 15:08 ` Chris Mason
2006-11-01 15:08 ` [PATCH 6 of 7] Make reiserfs safe for " Chris Mason
2006-11-01 15:08 ` [PATCH 7 of 7] Adapt XFS to the new blockdev_direct_IO calls Chris Mason
6 siblings, 0 replies; 14+ messages in thread
From: Chris Mason @ 2006-11-01 15:08 UTC (permalink / raw)
To: linux-fsdevel; +Cc: akpm, zach.brown, Suparna Bhattacharya
This changes ext3 to skip the orphan inode handling while
growing the file with DIO. It also creates a version of ext3_get_block
that starts and ends a transaction.
By starting and ending the transaction inside get_block, this is able to
avoid lock inversion problems when the DIO code tries to take page locks
inside blockdev_direct_IO. (transaction locks must always happen after
page locks).
Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff -r f84d3216430d -r 218de24978fc fs/ext3/inode.c
--- a/fs/ext3/inode.c Wed Nov 01 10:24:03 2006 -0500
+++ b/fs/ext3/inode.c Wed Nov 01 10:24:05 2006 -0500
@@ -1608,6 +1608,30 @@ static int ext3_releasepage(struct page
return journal_try_to_free_buffers(journal, page, wait);
}
+static int ext3_get_block_direct_IO(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int ret = 0;
+ handle_t *handle = ext3_journal_start(inode, DIO_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+ ret = ext3_get_block(inode, iblock, bh_result, create);
+ /*
+ * Reacquire the handle: ext3_get_block() can restart the transaction
+ */
+ handle = journal_current_handle();
+ if (handle) {
+ int err;
+ err = ext3_journal_stop(handle);
+ if (!ret)
+ ret = err;
+ }
+out:
+ return ret;
+}
+
/*
* If the O_DIRECT write will extend the file then add this inode to the
* orphan list. So recovery will truncate it back to the original size
@@ -1620,67 +1644,11 @@ static ssize_t ext3_direct_IO(int rw, st
const struct iovec *iov, loff_t offset,
unsigned long nr_segs)
{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- struct ext3_inode_info *ei = EXT3_I(inode);
- handle_t *handle = NULL;
- ssize_t ret;
- int orphan = 0;
- size_t count = iov_length(iov, nr_segs);
-
- if (rw == WRITE) {
- loff_t final_size = offset + count;
-
- handle = ext3_journal_start(inode, DIO_CREDITS);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
- }
- if (final_size > inode->i_size) {
- ret = ext3_orphan_add(handle, inode);
- if (ret)
- goto out_stop;
- orphan = 1;
- ei->i_disksize = inode->i_size;
- }
- }
-
- ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
- offset, nr_segs,
- ext3_get_block, NULL);
-
- /*
- * Reacquire the handle: ext3_get_block() can restart the transaction
- */
- handle = journal_current_handle();
-
-out_stop:
- if (handle) {
- int err;
-
- if (orphan && inode->i_nlink)
- ext3_orphan_del(handle, inode);
- if (orphan && ret > 0) {
- loff_t end = offset + ret;
- if (end > inode->i_size) {
- ei->i_disksize = end;
- i_size_write(inode, end);
- /*
- * We're going to return a positive `ret'
- * here due to non-zero-length I/O, so there's
- * no way of reporting error returns from
- * ext3_mark_inode_dirty() to userspace. So
- * ignore it.
- */
- ext3_mark_inode_dirty(handle, inode);
- }
- }
- err = ext3_journal_stop(handle);
- if (ret == 0)
- ret = err;
- }
-out:
- return ret;
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
+ return blockdev_direct_IO_flags(rw, iocb, inode, inode->i_sb->s_bdev,
+ iov, offset, nr_segs, ext3_get_block_direct_IO,
+ NULL, DIO_PLACEHOLDERS | DIO_CREATE |
+ DIO_EXTEND | DIO_DROP_I_MUTEX);
}
/*
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH 6 of 7] Make reiserfs safe for new DIO locking rules
2006-11-01 15:08 [PATCH 0 of 7] O_DIRECT locking rework Chris Mason
` (4 preceding siblings ...)
2006-11-01 15:08 ` [PATCH 5 of 7] Make ext3 safe for the new DIO locking rules Chris Mason
@ 2006-11-01 15:08 ` Chris Mason
2006-11-01 15:08 ` [PATCH 7 of 7] Adapt XFS to the new blockdev_direct_IO calls Chris Mason
6 siblings, 0 replies; 14+ messages in thread
From: Chris Mason @ 2006-11-01 15:08 UTC (permalink / raw)
To: linux-fsdevel; +Cc: akpm, zach.brown, Suparna Bhattacharya
reiserfs is changed to use a version of reiserfs_get_block that is safe
for filling holes without i_mutex held.
reiserfs_file_write no longer creates a savelink before doing the direct io.
When DIO does the expanding truncate, it also tries to create a savelink,
and this conflicted with the one created by reiserfs_file_write.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff -r 218de24978fc -r b23ba06ac61f fs/reiserfs/file.c
--- a/fs/reiserfs/file.c Wed Nov 01 10:24:05 2006 -0500
+++ b/fs/reiserfs/file.c Wed Nov 01 10:24:08 2006 -0500
@@ -1306,56 +1306,8 @@ static ssize_t reiserfs_file_write(struc
count = MAX_NON_LFS - (unsigned long)*ppos;
}
- if (file->f_flags & O_DIRECT) { // Direct IO needs treatment
- ssize_t result, after_file_end = 0;
- if ((*ppos + count >= inode->i_size)
- || (file->f_flags & O_APPEND)) {
- /* If we are appending a file, we need to put this savelink in here.
- If we will crash while doing direct io, finish_unfinished will
- cut the garbage from the file end. */
- reiserfs_write_lock(inode->i_sb);
- err =
- journal_begin(&th, inode->i_sb,
- JOURNAL_PER_BALANCE_CNT);
- if (err) {
- reiserfs_write_unlock(inode->i_sb);
- return err;
- }
- reiserfs_update_inode_transaction(inode);
- add_save_link(&th, inode, 1 /* Truncate */ );
- after_file_end = 1;
- err =
- journal_end(&th, inode->i_sb,
- JOURNAL_PER_BALANCE_CNT);
- reiserfs_write_unlock(inode->i_sb);
- if (err)
- return err;
- }
- result = do_sync_write(file, buf, count, ppos);
-
- if (after_file_end) { /* Now update i_size and remove the savelink */
- struct reiserfs_transaction_handle th;
- reiserfs_write_lock(inode->i_sb);
- err = journal_begin(&th, inode->i_sb, 1);
- if (err) {
- reiserfs_write_unlock(inode->i_sb);
- return err;
- }
- reiserfs_update_inode_transaction(inode);
- mark_inode_dirty(inode);
- err = journal_end(&th, inode->i_sb, 1);
- if (err) {
- reiserfs_write_unlock(inode->i_sb);
- return err;
- }
- err = remove_save_link(inode, 1 /* truncate */ );
- reiserfs_write_unlock(inode->i_sb);
- if (err)
- return err;
- }
-
- return result;
- }
+ if (file->f_flags & O_DIRECT) // Direct IO needs treatment
+ return do_sync_write(file, buf, count, ppos);
if (unlikely((ssize_t) count < 0))
return -EINVAL;
diff -r 218de24978fc -r b23ba06ac61f fs/reiserfs/inode.c
--- a/fs/reiserfs/inode.c Wed Nov 01 10:24:05 2006 -0500
+++ b/fs/reiserfs/inode.c Wed Nov 01 10:24:08 2006 -0500
@@ -468,7 +468,8 @@ static int reiserfs_get_blocks_direct_io
bh_result->b_size = (1 << inode->i_blkbits);
ret = reiserfs_get_block(inode, iblock, bh_result,
- create | GET_BLOCK_NO_DANGLE);
+ create | GET_BLOCK_NO_DANGLE |
+ GET_BLOCK_NO_IMUX);
if (ret)
goto out;
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH 7 of 7] Adapt XFS to the new blockdev_direct_IO calls
2006-11-01 15:08 [PATCH 0 of 7] O_DIRECT locking rework Chris Mason
` (5 preceding siblings ...)
2006-11-01 15:08 ` [PATCH 6 of 7] Make reiserfs safe for " Chris Mason
@ 2006-11-01 15:08 ` Chris Mason
2006-11-01 23:00 ` David Chinner
6 siblings, 1 reply; 14+ messages in thread
From: Chris Mason @ 2006-11-01 15:08 UTC (permalink / raw)
To: linux-fsdevel; +Cc: akpm, zach.brown, Suparna Bhattacharya
XFS is changed to use blockdev_direct_IO flags instead of DIO_OWN_LOCKING.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff -r b23ba06ac61f -r b23d3ec1929f fs/xfs/linux-2.6/xfs_aops.c
--- a/fs/xfs/linux-2.6/xfs_aops.c Wed Nov 01 10:24:08 2006 -0500
+++ b/fs/xfs/linux-2.6/xfs_aops.c Wed Nov 01 10:24:11 2006 -0500
@@ -1389,19 +1389,16 @@ xfs_vm_direct_IO(
iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
- if (rw == WRITE) {
- ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
- iomap.iomap_target->bt_bdev,
- iov, offset, nr_segs,
- xfs_get_blocks_direct,
- xfs_end_io_direct);
- } else {
- ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
- iomap.iomap_target->bt_bdev,
- iov, offset, nr_segs,
- xfs_get_blocks_direct,
- xfs_end_io_direct);
- }
+ /*
+ * ask DIO not to do any special locking for us, and to always
+ * pass create=1 to get_block on writes
+ */
+ ret = blockdev_direct_IO_flags(rw, iocb, inode,
+ iomap.iomap_target->bt_bdev,
+ iov, offset, nr_segs,
+ xfs_get_blocks_direct,
+ xfs_end_io_direct,
+ DIO_CREATE);
if (unlikely(ret <= 0 && iocb->private))
xfs_destroy_ioend(iocb->private);
^ permalink raw reply [flat|nested] 14+ messages in thread