All of lore.kernel.org
 help / color / mirror / Atom feed
From: Matthew Wilcox <matthew@wil.cx>
To: Jan Kara <jack@suse.cz>
Cc: linux-fsdevel@vger.kernel.org
Subject: Re: hole-punch vs fault
Date: Thu, 28 Nov 2013 05:24:34 -0700	[thread overview]
Message-ID: <20131128122433.GM24288@parisc-linux.org> (raw)
In-Reply-To: <20131128044454.GL24288@parisc-linux.org>

On Wed, Nov 27, 2013 at 09:44:54PM -0700, Matthew Wilcox wrote:
> Ohh.  We can't take i_mutex because that's an inversion with mmap_sem.

Or the simple option is just to use a different mutex.  That makes life
a bit more interesting because the filesystem now has to take the mutex
at the appropriate point to ensure that the pages found by fault won't
be damagable.  I think this is correct for ext2/4:


diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 8a33764..25ce796 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1212,8 +1212,14 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
 	if (error)
 		return error;
 
+	if (mapping_is_xip(inode->i_mapping))
+		damage_lock(inode->i_mapping);
 	truncate_setsize(inode, newsize);
 	__ext2_truncate_blocks(inode, newsize);
+	if (mapping_is_xip(inode->i_mapping)) {
+		damage_mapping(inode->i_mapping);
+		damage_unlock(inode->i_mapping);
+	}
 
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
 	if (inode_needs_sync(inode)) {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0757634..4d3d533 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3571,6 +3571,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 	first_block_offset = round_up(offset, sb->s_blocksize);
 	last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
 
+	damage_lock(mapping);
 	/* Now release the pages and zero block aligned part of pages*/
 	if (last_block_offset > first_block_offset)
 		truncate_pagecache_range(inode, first_block_offset,
@@ -3610,6 +3611,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 	ret = ext4_es_remove_extent(inode, first_block,
 				    stop_block - first_block);
 	if (ret) {
+		damage_unlock(mapping);
 		up_write(&EXT4_I(inode)->i_data_sem);
 		goto out_stop;
 	}
@@ -3622,6 +3624,8 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 					    stop_block);
 
 	ext4_discard_preallocations(inode);
+	damage_mapping(mapping);
+	damage_unlock(mapping);
 	up_write(&EXT4_I(inode)->i_data_sem);
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 121f11f..1ed4fa7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -411,6 +411,8 @@ struct address_space {
 	struct radix_tree_root	page_tree;	/* radix tree of all pages */
 	spinlock_t		tree_lock;	/* and lock protecting it */
 	unsigned int		i_mmap_writable;/* count VM_SHARED mappings */
+	struct mutex		i_damage_lock;	/* fs data structures */
+	unsigned		i_damaged;	/* damage count */
 	struct rb_root		i_mmap;		/* tree of private and shared mappings */
 	struct list_head	i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
 	struct mutex		i_mmap_mutex;	/* protect tree, count, list */
@@ -499,6 +501,34 @@ static inline int mapping_writably_mapped(struct address_space *mapping)
 }
 
 /*
+ * A mapping is damaged when blocks are removed from the filesystem's
+ * data structures.
+ */
+static inline unsigned mapping_damage(struct address_space *mapping)
+{
+	unsigned seq = ACCESS_ONCE(mapping->i_damaged);
+	smp_rmb();	/* Subsequent reads of damagable data structures */
+	return seq;
+}
+
+#define damage_lock(mapping)	mutex_lock(&mapping->i_damage_lock)
+#define damage_unlock(mapping)	mutex_unlock(&mapping->i_damage_lock)
+
+/* Must be called with i_damage_lock held */
+static inline bool
+mapping_is_damaged(struct address_space *mapping, unsigned seq)
+{
+	return mapping->i_damaged != seq;
+}
+
+/* Must be called with i_damage_lock held */
+static inline void damage_mapping(struct address_space *mapping)
+{
+	smp_wmb();	/* Prior writes to damagable data structures */
+	mapping->i_damaged++;
+}
+
+/*
  * Use sequence counter to get consistent i_size on 32-bit processors.
  */
 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
diff --git a/mm/filemap.c b/mm/filemap.c
index b7749a9..f99e831 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1617,6 +1617,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	pgoff_t offset = vmf->pgoff;
 	struct page *page;
 	pgoff_t size;
+	unsigned damage = mapping_damage(mapping);
 	int ret = 0;
 
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
@@ -1676,6 +1677,19 @@ retry_find:
 		return VM_FAULT_SIGBUS;
 	}
 
+	/*
+	 * Check if we were the unlucky victim of a holepunch
+	 */
+	damage_lock(mapping);
+	if (unlikely(mapping_is_damaged(mapping, damage))) {
+		damage_unlock(mapping);
+		unlock_page(page);
+		page_cache_release(page);
+		damage = mapping_damage(mapping);
+		goto retry_find;
+	}
+	damage_unlock(mapping);
+
 	vmf->page = page;
 	return ret | VM_FAULT_LOCKED;
 
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index d8d9fe3..dc478eb 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -224,6 +224,7 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct file *file = vma->vm_file;
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
+	unsigned damage;
 	pgoff_t size;
 	void *xip_mem;
 	unsigned long xip_pfn;
@@ -232,6 +233,7 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 	/* XXX: are VM_FAULT_ codes OK? */
 again:
+	damage = mapping_damage(mapping);
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (vmf->pgoff >= size)
 		return VM_FAULT_SIGBUS;
@@ -260,8 +262,14 @@ again:
 		__xip_unmap(mapping, vmf->pgoff);
 
 found:
+		damage_lock(mapping);
+		if (unlikely(mapping_is_damaged(mapping, damage))) {
+			mutex_unlock(&inode->i_mutex);
+			goto again;
+		}
 		err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
 							xip_pfn);
+		damage_unlock(mapping);
 		if (err == -ENOMEM)
 			return VM_FAULT_OOM;
 		/*

-- 
Matthew Wilcox				Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step."

  reply	other threads:[~2013-11-28 12:24 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-11-27 13:48 hole-punch vs fault Matthew Wilcox
2013-11-27 22:19 ` Jan Kara
2013-11-28  2:33   ` Matthew Wilcox
2013-11-28  3:30     ` Matthew Wilcox
2013-11-28  4:22     ` Dave Chinner
2013-11-28  4:44     ` Matthew Wilcox
2013-11-28 12:24       ` Matthew Wilcox [this message]
2013-11-28 22:12       ` Dave Chinner
2013-11-29 13:11         ` Matthew Wilcox
2013-12-01 21:52           ` Dave Chinner
2013-12-02  8:33             ` Jan Kara
2013-12-02 15:58               ` Matthew Wilcox
2013-12-02 20:11                 ` Jan Kara
2013-12-02 20:13                 ` Matthew Wilcox
2013-12-02 23:13                   ` Jan Kara

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20131128122433.GM24288@parisc-linux.org \
    --to=matthew@wil.cx \
    --cc=jack@suse.cz \
    --cc=linux-fsdevel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.