From: Glauber Costa <glommer@openvz.org>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-fsdevel@vger.kernel.org, Mel Gorman <mgorman@suse.de>,
	Dave Chinner <david@fromorbit.com>,
	linux-mm@kvack.org, cgroups@vger.kernel.org,
	kamezawa.hiroyu@jp.fujitsu.com, Michal Hocko <mhocko@suse.cz>,
	Johannes Weiner <hannes@cmpxchg.org>,
	hughd@google.com, Greg Thelen <gthelen@google.com>,
	Dave Chinner <dchinner@redhat.com>,
	Glauber Costa <glommer@openvz.org>
Subject: [PATCH v10 17/35] xfs: rework buffer dispose list tracking
Date: Mon,  3 Jun 2013 23:29:46 +0400	[thread overview]
Message-ID: <1370287804-3481-18-git-send-email-glommer@openvz.org> (raw)
In-Reply-To: <1370287804-3481-1-git-send-email-glommer@openvz.org>

From: Dave Chinner <dchinner@redhat.com>

In converting the buffer lru lists to use the generic code, the
locking for marking the buffers as on the dispose list was lost.
This results in confusion in LRU buffer tracking and accounting,
resulting in reference counts being mucked up and filesystems
being unmountable.

To fix this, introduce an internal buffer spinlock to protect the
state field that holds the dispose list information. Because there
is now locking needed around xfs_buf_lru_add/del, and they are used
in exactly one place each two lines apart, get rid of the wrappers
and code the logic directly in place.
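
For illustration, the release path now does the LRU add under the new
lock roughly as follows (a simplified sketch of the pattern only; the
actual change is in the diff below):

	spin_lock(&bp->b_lock);
	if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
		/* adding to the LRU takes a new hold reference */
		if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
			bp->b_state &= ~XFS_BSTATE_DISPOSE;
			atomic_inc(&bp->b_hold);
		}
	}
	spin_unlock(&bp->b_lock);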

Further, the LRU emptying code used on unmount is less than optimal.
Convert it to use a dispose list as per a normal shrinker walk, and
repeat the walk that fills the dispose list until the LRU is empty.
This avoids needing to drop and regain the LRU lock for every item
being freed, and allows the same logic as the shrinker isolate call
to be used. Simpler, easier to understand.
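
The unmount path then becomes a walk-and-drain loop along these lines
(again a simplified sketch; the complete version is in the diff below):

	while (list_lru_count(&btp->bt_lru)) {
		list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
			      &dispose, LONG_MAX);

		/* drain the dispose list built up by the walk */
		while (!list_empty(&dispose)) {
			bp = list_first_entry(&dispose, struct xfs_buf,
					      b_lru);
			list_del_init(&bp->b_lru);
			xfs_buf_rele(bp);
		}
	}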

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Glauber Costa <glommer@openvz.org>
---
 fs/xfs/xfs_buf.c | 125 +++++++++++++++++++++++++++++++------------------------
 fs/xfs/xfs_buf.h |  12 ++++--
 2 files changed, 79 insertions(+), 58 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index dcf3c0d..0d7a619 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -80,37 +80,6 @@ xfs_buf_vmap_len(
 }
 
 /*
- * xfs_buf_lru_add - add a buffer to the LRU.
- *
- * The LRU takes a new reference to the buffer so that it will only be freed
- * once the shrinker takes the buffer off the LRU.
- */
-static void
-xfs_buf_lru_add(
-	struct xfs_buf	*bp)
-{
-	if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
-		bp->b_lru_flags &= ~_XBF_LRU_DISPOSE;
-		atomic_inc(&bp->b_hold);
-	}
-}
-
-/*
- * xfs_buf_lru_del - remove a buffer from the LRU
- *
- * The unlocked check is safe here because it only occurs when there are not
- * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there
- * to optimise the shrinker removing the buffer from the LRU and calling
- * xfs_buf_free().
- */
-static void
-xfs_buf_lru_del(
-	struct xfs_buf	*bp)
-{
-	list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
-}
-
-/*
  * When we mark a buffer stale, we remove the buffer from the LRU and clear the
  * b_lru_ref count so that the buffer is freed immediately when the buffer
  * reference count falls to zero. If the buffer is already on the LRU, we need
@@ -133,12 +102,14 @@ xfs_buf_stale(
 	 */
 	bp->b_flags &= ~_XBF_DELWRI_Q;
 
-	atomic_set(&(bp)->b_lru_ref, 0);
-	if (!(bp->b_lru_flags & _XBF_LRU_DISPOSE) &&
+	spin_lock(&bp->b_lock);
+	atomic_set(&bp->b_lru_ref, 0);
+	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
 	    (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
 		atomic_dec(&bp->b_hold);
 
 	ASSERT(atomic_read(&bp->b_hold) >= 1);
+	spin_unlock(&bp->b_lock);
 }
 
 static int
@@ -202,6 +173,7 @@ _xfs_buf_alloc(
 	INIT_LIST_HEAD(&bp->b_list);
 	RB_CLEAR_NODE(&bp->b_rbnode);
 	sema_init(&bp->b_sema, 0); /* held, no waiters */
+	spin_lock_init(&bp->b_lock);
 	XB_SET_OWNER(bp);
 	bp->b_target = target;
 	bp->b_flags = flags;
@@ -891,12 +863,33 @@ xfs_buf_rele(
 
 	ASSERT(atomic_read(&bp->b_hold) > 0);
 	if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
-		if (!(bp->b_flags & XBF_STALE) &&
-			   atomic_read(&bp->b_lru_ref)) {
-			xfs_buf_lru_add(bp);
+		spin_lock(&bp->b_lock);
+		if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
+			/*
+			 * If the buffer is added to the LRU take a new
+			 * reference to the buffer for the LRU and clear the
+			 * (now stale) dispose list state flag
+			 */
+			if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
+				bp->b_state &= ~XFS_BSTATE_DISPOSE;
+				atomic_inc(&bp->b_hold);
+			}
+			spin_unlock(&bp->b_lock);
 			spin_unlock(&pag->pag_buf_lock);
 		} else {
-			xfs_buf_lru_del(bp);
+			/*
+			 * most of the time buffers will already be removed from
+			 * the LRU, so optimise that case by checking for the
+			 * XFS_BSTATE_DISPOSE flag indicating the last list the
+			 * buffer was on was the disposal list
+			 */
+			if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
+				list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
+			} else {
+				ASSERT(list_empty(&bp->b_lru));
+			}
+			spin_unlock(&bp->b_lock);
+
 			ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
 			rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
 			spin_unlock(&pag->pag_buf_lock);
@@ -1485,33 +1478,48 @@ xfs_buftarg_wait_rele(
 
 {
 	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
+	struct list_head	*dispose = arg;
 
 	if (atomic_read(&bp->b_hold) > 1) {
-		/* need to wait */
+		/* need to wait, so skip it this pass */
 		trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
-		spin_unlock(lru_lock);
-		delay(100);
-	} else {
-		/*
-		 * clear the LRU reference count so the buffer doesn't get
-		 * ignored in xfs_buf_rele().
-		 */
-		atomic_set(&bp->b_lru_ref, 0);
-		spin_unlock(lru_lock);
-		xfs_buf_rele(bp);
+		return LRU_SKIP;
 	}
+	if (!spin_trylock(&bp->b_lock))
+		return LRU_SKIP;
 
-	spin_lock(lru_lock);
-	return LRU_RETRY;
+	/*
+	 * clear the LRU reference count so the buffer doesn't get
+	 * ignored in xfs_buf_rele().
+	 */
+	atomic_set(&bp->b_lru_ref, 0);
+	bp->b_state |= XFS_BSTATE_DISPOSE;
+	list_move(item, dispose);
+	spin_unlock(&bp->b_lock);
+	return LRU_REMOVED;
 }
 
 void
 xfs_wait_buftarg(
 	struct xfs_buftarg	*btp)
 {
-	while (list_lru_count(&btp->bt_lru))
+	LIST_HEAD(dispose);
+	int loop = 0;
+
+	/* loop until there is nothing left on the lru list. */
+	while (list_lru_count(&btp->bt_lru)) {
 		list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
-			      NULL, LONG_MAX);
+			      &dispose, LONG_MAX);
+
+		while (!list_empty(&dispose)) {
+			struct xfs_buf *bp;
+			bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
+			list_del_init(&bp->b_lru);
+			xfs_buf_rele(bp);
+		}
+		if (loop++ != 0)
+			delay(100);
+	}
 }
 
 static enum lru_status
@@ -1524,15 +1532,24 @@ xfs_buftarg_isolate(
 	struct list_head	*dispose = arg;
 
 	/*
+	 * we are inverting the lru lock/bp->b_lock here, so use a trylock.
+	 * If we fail to get the lock, just skip it.
+	 */
+	if (!spin_trylock(&bp->b_lock))
+		return LRU_SKIP;
+	/*
 	 * Decrement the b_lru_ref count unless the value is already
 	 * zero. If the value is already zero, we need to reclaim the
 	 * buffer, otherwise it gets another trip through the LRU.
 	 */
-	if (!atomic_add_unless(&bp->b_lru_ref, -1, 0))
+	if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
+		spin_unlock(&bp->b_lock);
 		return LRU_ROTATE;
+	}
 
-	bp->b_lru_flags |= _XBF_LRU_DISPOSE;
+	bp->b_state |= XFS_BSTATE_DISPOSE;
 	list_move(item, dispose);
+	spin_unlock(&bp->b_lock);
 	return LRU_REMOVED;
 }
 
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 5ec7d35..e656833 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -60,7 +60,6 @@ typedef enum {
 #define _XBF_KMEM	 (1 << 21)/* backed by heap memory */
 #define _XBF_DELWRI_Q	 (1 << 22)/* buffer on a delwri queue */
 #define _XBF_COMPOUND	 (1 << 23)/* compound buffer */
-#define _XBF_LRU_DISPOSE (1 << 24)/* buffer being discarded */
 
 typedef unsigned int xfs_buf_flags_t;
 
@@ -79,8 +78,12 @@ typedef unsigned int xfs_buf_flags_t;
 	{ _XBF_PAGES,		"PAGES" }, \
 	{ _XBF_KMEM,		"KMEM" }, \
 	{ _XBF_DELWRI_Q,	"DELWRI_Q" }, \
-	{ _XBF_COMPOUND,	"COMPOUND" }, \
-	{ _XBF_LRU_DISPOSE,	"LRU_DISPOSE" }
+	{ _XBF_COMPOUND,	"COMPOUND" }
+
+/*
+ * Internal state flags.
+ */
+#define XFS_BSTATE_DISPOSE	 (1 << 0)	/* buffer being discarded */
 
 typedef struct xfs_buftarg {
 	dev_t			bt_dev;
@@ -136,7 +139,8 @@ typedef struct xfs_buf {
 	 * bt_lru_lock and not by b_sema
 	 */
 	struct list_head	b_lru;		/* lru list */
-	xfs_buf_flags_t		b_lru_flags;	/* internal lru status flags */
+	spinlock_t		b_lock;		/* internal state lock */
+	unsigned int		b_state;	/* internal state flags */
 	wait_queue_head_t	b_waiters;	/* unpin waiters */
 	struct list_head	b_list;
 	struct xfs_perag	*b_pag;		/* contains rbtree root */
-- 
1.8.1.4
