qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: Andrew Barry <abarry@cray.com>
To: "linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>
Cc: kvm <kvm@vger.kernel.org>, Marcelo Tosatti <mtosatti@redhat.com>,
	Hugh Dickins <hughd@google.com>, "agraf@suse.de" <agraf@suse.de>,
	"qemu-devel@nongnu.org" <qemu-devel@nongnu.org>,
	Adam Litke <agl@us.ibm.com>, Minchan Kim <minchan.kim@gmail.com>,
	Jan Kiszka <jan.kiszka@web.de>, Avi Kivity <avi@redhat.com>,
	KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>,
	Andrew Hastings <abh@cray.com>, Paul Mackerras <paulus@samba.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>,
	Mel Gorman <mgorman@suse.de>
Subject: [Qemu-devel] [RFC PATCH] mm/hugepages: Fix race between hugetlbfs umount and quota update.
Date: Tue, 16 Aug 2011 12:45:28 -0500	[thread overview]
Message-ID: <4E4AACB8.3090007@cray.com> (raw)
In-Reply-To: <20110816034748.GG18203@yookeroo.fritz.box>

Discussion was titled:  Fix refcounting in hugetlbfs quota handling

This patch fixes a race between the umount of a hugetlbfs filesystem, and quota
updates in that filesystem, which can result in the update of the filesystem
quota record, after the record structure has been freed.

Rather than an address-space struct pointer, it puts a hugetlbfs_sb_info struct
pointer into page_private of the page struct. A reference count and an active
bit are added to the hugetlbfs_sb_info struct; the reference count is increased
by hugetlb_get_quota and decreased by hugetlb_put_quota. When hugetlbfs is
unmounted, it frees the hugetlbfs_sb_info struct, but only if the reference
count is zero, otherwise it clears the active bit. The last hugetlb_put_quota
then frees the hugetlbfs_sb_info struct.

Signed-off-by: Andrew Barry <abarry@cray.com>

---

 fs/hugetlbfs/inode.c    |   40 ++++++++++++++++++++++++++--------------
 include/linux/hugetlb.h |    9 +++++++--
 mm/hugetlb.c            |   22 +++++++++++-----------
 3 files changed, 44 insertions(+), 27 deletions(-)


diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 87b6e04..2ed1cca 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -615,8 +615,12 @@ static void hugetlbfs_put_super(struct s
 	struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb);

 	if (sbi) {
+		sbi->active = HPAGE_INACTIVE;
 		sb->s_fs_info = NULL;
-		kfree(sbi);
+
+		/*Free only if used quota is zero. */
+		if (sbi->used_blocks == 0)
+			kfree(sbi);
 	}
 }

@@ -851,6 +855,8 @@ hugetlbfs_fill_super(struct super_block
 	sbinfo->free_blocks = config.nr_blocks;
 	sbinfo->max_inodes = config.nr_inodes;
 	sbinfo->free_inodes = config.nr_inodes;
+	sbinfo->used_blocks = 0;
+	sbinfo->active = HPAGE_ACTIVE;
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_blocksize = huge_page_size(config.hstate);
 	sb->s_blocksize_bits = huge_page_shift(config.hstate);

@@ -874,30 +880,36 @@ out_free:
 	return -ENOMEM;
 }

-int hugetlb_get_quota(struct address_space *mapping, long delta)
+int hugetlb_get_quota(struct hugetlbfs_sb_info *sbinfo, long delta)
 {
 	int ret = 0;
-	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);

-	if (sbinfo->free_blocks > -1) {
-		spin_lock(&sbinfo->stat_lock);
-		if (sbinfo->free_blocks - delta >= 0)
+	spin_lock(&sbinfo->stat_lock);
+	if ((sbinfo->free_blocks == -1) || (sbinfo->free_blocks - delta >= 0)) {
+		if (sbinfo->free_blocks != -1)
 			sbinfo->free_blocks -= delta;
-		else
-			ret = -ENOMEM;
-		spin_unlock(&sbinfo->stat_lock);
+		sbinfo->used_blocks += delta;
+		sbinfo->active = HPAGE_ACTIVE;
+	} else {
+		ret = -ENOMEM;
 	}
+	spin_unlock(&sbinfo->stat_lock);

 	return ret;
 }

-void hugetlb_put_quota(struct address_space *mapping, long delta)
+void hugetlb_put_quota(struct hugetlbfs_sb_info *sbinfo, long delta)
 {
-	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
-
-	if (sbinfo->free_blocks > -1) {
-		spin_lock(&sbinfo->stat_lock);
+	spin_lock(&sbinfo->stat_lock);
+	if (sbinfo->free_blocks > -1)
 		sbinfo->free_blocks += delta;
+	sbinfo->used_blocks -= delta;
+	/* If hugetlbfs_put_super couldn't free sbinfo due to
+	* an outstanding quota reference, free it now. */
+	if ((sbinfo->used_blocks == 0) && (sbinfo->active == HPAGE_INACTIVE)) {
+		spin_unlock(&sbinfo->stat_lock);
+		kfree(sbinfo);
+	} else {
 		spin_unlock(&sbinfo->stat_lock);
 	}
 }


diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 19644e0..8780a91 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -142,11 +142,16 @@ struct hugetlbfs_config {
 	struct hstate *hstate;
 };

+#define HPAGE_INACTIVE  0
+#define HPAGE_ACTIVE    1
+
 struct hugetlbfs_sb_info {
 	long	max_blocks;   /* blocks allowed */
 	long	free_blocks;  /* blocks free */
 	long	max_inodes;   /* inodes allowed */
 	long	free_inodes;  /* inodes free */
+	long	used_blocks;  /* blocks used */
+	long	active;		  /* active bit */
 	spinlock_t	stat_lock;
 	struct hstate *hstate;
 };

@@ -171,8 +176,8 @@ extern const struct file_operations huge
 extern const struct vm_operations_struct hugetlb_vm_ops;
 struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
 				struct user_struct **user, int creat_flags);
-int hugetlb_get_quota(struct address_space *mapping, long delta);
-void hugetlb_put_quota(struct address_space *mapping, long delta);
+int hugetlb_get_quota(struct hugetlbfs_sb_info *sbinfo, long delta);
+void hugetlb_put_quota(struct hugetlbfs_sb_info *sbinfo, long delta);

 static inline int is_file_hugepages(struct file *file)
 {


diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dae27ba..cf26ae9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -533,9 +533,9 @@ static void free_huge_page(struct page *
 	 */
 	struct hstate *h = page_hstate(page);
 	int nid = page_to_nid(page);
-	struct address_space *mapping;
+	struct hugetlbfs_sb_info *sbinfo;

-	mapping = (struct address_space *) page_private(page);
+	sbinfo = ( struct hugetlbfs_sb_info *) page_private(page);
 	set_page_private(page, 0);
 	page->mapping = NULL;
 	BUG_ON(page_count(page));

@@ -551,8 +551,8 @@ static void free_huge_page(struct page *
 		enqueue_huge_page(h, page);
 	}
 	spin_unlock(&hugetlb_lock);
-	if (mapping)
-		hugetlb_put_quota(mapping, 1);
+	if (sbinfo)
+		hugetlb_put_quota(sbinfo, 1);
 }

 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)

@@ -1035,7 +1035,7 @@ static struct page *alloc_huge_page(stru
 	if (chg < 0)
 		return ERR_PTR(-VM_FAULT_OOM);
 	if (chg)
-		if (hugetlb_get_quota(inode->i_mapping, chg))
+		if (hugetlb_get_quota(HUGETLBFS_SB(inode->i_mapping->host->i_sb), chg))
 			return ERR_PTR(-VM_FAULT_SIGBUS);

 	spin_lock(&hugetlb_lock);

@@ -1045,12 +1045,12 @@ static struct page *alloc_huge_page(stru
 	if (!page) {
 		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
 		if (!page) {
-			hugetlb_put_quota(inode->i_mapping, chg);
+			hugetlb_put_quota(HUGETLBFS_SB(inode->i_mapping->host->i_sb), chg);
 			return ERR_PTR(-VM_FAULT_SIGBUS);
 		}
 	}

-	set_page_private(page, (unsigned long) mapping);
+	set_page_private(page, (unsigned long)
HUGETLBFS_SB(inode->i_mapping->host->i_sb));

 	vma_commit_reservation(h, vma, addr);

@@ -2086,7 +2086,7 @@ static void hugetlb_vm_op_close(struct v

 		if (reserve) {
 			hugetlb_acct_memory(h, -reserve);
-			hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
+			hugetlb_put_quota(HUGETLBFS_SB(vma->vm_file->f_mapping->host->i_sb), reserve);
 		}
 	}
 }

@@ -2884,7 +2884,7 @@ int hugetlb_reserve_pages(struct inode *
 		return chg;

 	/* There must be enough filesystem quota for the mapping */
-	if (hugetlb_get_quota(inode->i_mapping, chg))
+	if (hugetlb_get_quota(HUGETLBFS_SB(inode->i_mapping->host->i_sb), chg))
 		return -ENOSPC;

 	/*

@@ -2893,7 +2893,7 @@ int hugetlb_reserve_pages(struct inode *
 	 */
 	ret = hugetlb_acct_memory(h, chg);
 	if (ret < 0) {
-		hugetlb_put_quota(inode->i_mapping, chg);
+		hugetlb_put_quota(HUGETLBFS_SB(inode->i_mapping->host->i_sb), chg);
 		return ret;
 	}

@@ -2922,7 +2922,7 @@ void hugetlb_unreserve_pages(struct inod
 	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
 	spin_unlock(&inode->i_lock);

-	hugetlb_put_quota(inode->i_mapping, (chg - freed));
+	hugetlb_put_quota(HUGETLBFS_SB(inode->i_mapping->host->i_sb), (chg - freed));
 	hugetlb_acct_memory(h, -(chg - freed));
 }


On 08/15/2011 10:47 PM, David Gibson wrote:
> On Mon, Aug 15, 2011 at 03:25:35PM -0500, Andrew Barry wrote:
>> I've been doing something similar to this last proposal. I put a
>> hugetlbfs_sb_info pointer into page_private, and dropped a reference counter and
>> an active/inactive bit into the hugetlbfs_sb_info struct. At Umount time, the
>> sbinfo is freed, only if the reference count is zero. Otherwise, the last
>> put_quota frees the sbinfo structure. This fixed the race we were seeing between
>> umount and a put_quota from an rdma transaction. I just gave it a cursory test
>> on a 3.0 kernel; it has seen quite a lot more testing on a 2.6.32-derived
>> kernel, with no more hits of the umount race.
>>
>> Does this address the problems you were thinking about?
> 
> Ah, this looks much better than my patch.  And the fact that you've
> seen your race demonstrates clearly that this isn't just a kvm
> problem.  I hope we can push this upstream very soon - what can I do
> to help?
> 
>> -Andrew Barry
>>

  reply	other threads:[~2011-08-16 17:45 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-08-11  6:40 [Qemu-devel] Fix refcounting in hugetlbfs quota handling David Gibson
2011-08-12  0:48 ` Linus Torvalds
2011-08-12  4:34   ` Minchan Kim
2011-08-12 19:15     ` Hugh Dickins
2011-08-13  1:08       ` David Gibson
2011-08-15 18:00         ` Hugh Dickins
2011-08-15 20:25           ` Andrew Barry
2011-08-16  3:47             ` David Gibson
2011-08-16 17:45               ` Andrew Barry [this message]
2011-08-16 17:45               ` Andrew Barry
2011-08-12 22:20 ` Christoph Hellwig

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4E4AACB8.3090007@cray.com \
    --to=abarry@cray.com \
    --cc=abh@cray.com \
    --cc=agl@us.ibm.com \
    --cc=agraf@suse.de \
    --cc=akpm@linux-foundation.org \
    --cc=avi@redhat.com \
    --cc=hughd@google.com \
    --cc=jan.kiszka@web.de \
    --cc=kamezawa.hiroyu@jp.fujitsu.com \
    --cc=kosaki.motohiro@jp.fujitsu.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mgorman@suse.de \
    --cc=minchan.kim@gmail.com \
    --cc=mtosatti@redhat.com \
    --cc=paulus@samba.org \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).