From: Matthew Wilcox <matthew@wil.cx>
To: Jan Kara <jack@suse.cz>
Cc: linux-fsdevel@vger.kernel.org
Subject: Re: hole-punch vs fault
Date: Thu, 28 Nov 2013 05:24:34 -0700 [thread overview]
Message-ID: <20131128122433.GM24288@parisc-linux.org> (raw)
In-Reply-To: <20131128044454.GL24288@parisc-linux.org>
On Wed, Nov 27, 2013 at 09:44:54PM -0700, Matthew Wilcox wrote:
> Ohh. We can't take i_mutex because that's an inversion with mmap_sem.
Or the simple option is just to use a different mutex. That makes life
a bit more interesting because the filesystem now has to take the mutex
at the appropriate point to ensure that the pages found by fault won't
be damagable. I think this is correct for ext2/4:
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 8a33764..25ce796 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1212,8 +1212,14 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
if (error)
return error;
+ if (mapping_is_xip(inode->i_mapping))
+ damage_lock(inode->i_mapping);
truncate_setsize(inode, newsize);
__ext2_truncate_blocks(inode, newsize);
+ if (mapping_is_xip(inode->i_mapping)) {
+ damage_mapping(inode->i_mapping);
+ damage_unlock(inode->i_mapping);
+ }
inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
if (inode_needs_sync(inode)) {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0757634..4d3d533 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3571,6 +3571,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
first_block_offset = round_up(offset, sb->s_blocksize);
last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
+ damage_lock(mapping);
/* Now release the pages and zero block aligned part of pages*/
if (last_block_offset > first_block_offset)
truncate_pagecache_range(inode, first_block_offset,
@@ -3610,6 +3611,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
ret = ext4_es_remove_extent(inode, first_block,
stop_block - first_block);
if (ret) {
+ damage_unlock(mapping);
up_write(&EXT4_I(inode)->i_data_sem);
goto out_stop;
}
@@ -3622,6 +3624,8 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
stop_block);
ext4_discard_preallocations(inode);
+ damage_mapping(mapping);
+ damage_unlock(mapping);
up_write(&EXT4_I(inode)->i_data_sem);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 121f11f..1ed4fa7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -411,6 +411,8 @@ struct address_space {
struct radix_tree_root page_tree; /* radix tree of all pages */
spinlock_t tree_lock; /* and lock protecting it */
unsigned int i_mmap_writable;/* count VM_SHARED mappings */
+ struct mutex i_damage_lock; /* fs data structures */
+ unsigned i_damaged; /* damage count */
struct rb_root i_mmap; /* tree of private and shared mappings */
struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
struct mutex i_mmap_mutex; /* protect tree, count, list */
@@ -499,6 +501,34 @@ static inline int mapping_writably_mapped(struct address_space *mapping)
}
/*
+ * A mapping is damaged when blocks are removed from the filesystem's
+ * data structures.
+ */
+static inline unsigned mapping_damage(struct address_space *mapping)
+{
+ unsigned seq = ACCESS_ONCE(mapping->i_damaged);
+ smp_rmb(); /* Subsequent reads of damagable data structures */
+ return seq;
+}
+
+#define damage_lock(mapping) mutex_lock(&mapping->i_damage_lock)
+#define damage_unlock(mapping) mutex_unlock(&mapping->i_damage_lock)
+
+/* Must be called with i_damage_lock held */
+static inline bool
+mapping_is_damaged(struct address_space *mapping, unsigned seq)
+{
+ return mapping->i_damaged != seq;
+}
+
+/* Must be called with i_damage_lock held */
+static inline void damage_mapping(struct address_space *mapping)
+{
+ smp_wmb(); /* Prior writes to damagable data structures */
+ mapping->i_damaged++;
+}
+
+/*
* Use sequence counter to get consistent i_size on 32-bit processors.
*/
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
diff --git a/mm/filemap.c b/mm/filemap.c
index b7749a9..f99e831 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1617,6 +1617,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
pgoff_t offset = vmf->pgoff;
struct page *page;
pgoff_t size;
+ unsigned damage = mapping_damage(mapping);
int ret = 0;
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
@@ -1676,6 +1677,19 @@ retry_find:
return VM_FAULT_SIGBUS;
}
+ /*
+ * Check if we were the unlucky victim of a holepunch
+ */
+ damage_lock(mapping);
+ if (unlikely(mapping_is_damaged(mapping, damage))) {
+ damage_unlock(mapping);
+ unlock_page(page);
+ page_cache_release(page);
+ damage = mapping_damage(mapping);
+ goto retry_find;
+ }
+ damage_unlock(mapping);
+
vmf->page = page;
return ret | VM_FAULT_LOCKED;
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index d8d9fe3..dc478eb 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -224,6 +224,7 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
+ unsigned damage;
pgoff_t size;
void *xip_mem;
unsigned long xip_pfn;
@@ -232,6 +233,7 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
/* XXX: are VM_FAULT_ codes OK? */
again:
+ damage = mapping_damage(mapping);
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (vmf->pgoff >= size)
return VM_FAULT_SIGBUS;
@@ -260,8 +262,15 @@ again:
__xip_unmap(mapping, vmf->pgoff);
found:
+ damage_lock(mapping);
+ if (unlikely(mapping_is_damaged(mapping, damage))) {
+ /* must drop i_damage_lock before retrying: "again" re-takes it */
+ damage_unlock(mapping);
+ mutex_unlock(&inode->i_mutex);
+ goto again;
+ }
err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
xip_pfn);
+ damage_unlock(mapping);
if (err == -ENOMEM)
return VM_FAULT_OOM;
/*
--
Matthew Wilcox Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."
next prev parent reply other threads:[~2013-11-28 12:24 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-11-27 13:48 hole-punch vs fault Matthew Wilcox
2013-11-27 22:19 ` Jan Kara
2013-11-28 2:33 ` Matthew Wilcox
2013-11-28 3:30 ` Matthew Wilcox
2013-11-28 4:22 ` Dave Chinner
2013-11-28 4:44 ` Matthew Wilcox
2013-11-28 12:24 ` Matthew Wilcox [this message]
2013-11-28 22:12 ` Dave Chinner
2013-11-29 13:11 ` Matthew Wilcox
2013-12-01 21:52 ` Dave Chinner
2013-12-02 8:33 ` Jan Kara
2013-12-02 15:58 ` Matthew Wilcox
2013-12-02 20:11 ` Jan Kara
2013-12-02 20:13 ` Matthew Wilcox
2013-12-02 23:13 ` Jan Kara
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20131128122433.GM24288@parisc-linux.org \
--to=matthew@wil.cx \
--cc=jack@suse.cz \
--cc=linux-fsdevel@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).