public inbox for linux-xfs@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] xfs: add a shrinker to background inode reclaim
@ 2010-04-28  4:34 Dave Chinner
  2010-04-28 11:31 ` Christoph Hellwig
  2010-04-29 11:07 ` Alex Elder
  0 siblings, 2 replies; 4+ messages in thread
From: Dave Chinner @ 2010-04-28  4:34 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

On low memory boxes or those with highmem, kernel can OOM before the
background reclaims inodes via xfssyncd. Add a shrinker to run inode
reclaim so that it inode reclaim is expedited when memory is low.

This is more complex than it needs to be because the VM folk don't
want a context added to the shrinker infrastructure. Hence we need
to add a global list of XFS mount structures so the shrinker can
traverse them.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/linux-2.6/xfs_super.c   |    5 ++
 fs/xfs/linux-2.6/xfs_sync.c    |  112 +++++++++++++++++++++++++++++++++++++---
 fs/xfs/linux-2.6/xfs_sync.h    |    7 ++-
 fs/xfs/quota/xfs_qm_syscalls.c |    3 +-
 fs/xfs/xfs_ag.h                |    1 +
 fs/xfs/xfs_mount.h             |    1 +
 6 files changed, 120 insertions(+), 9 deletions(-)

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 1abcacc..a43d09e 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1210,6 +1210,7 @@ xfs_fs_put_super(
 
 	xfs_unmountfs(mp);
 	xfs_freesb(mp);
+	xfs_inode_shrinker_unregister(mp);
 	xfs_icsb_destroy_counters(mp);
 	xfs_close_devices(mp);
 	xfs_dmops_put(mp);
@@ -1623,6 +1624,8 @@ xfs_fs_fill_super(
 	if (error)
 		goto fail_vnrele;
 
+	xfs_inode_shrinker_register(mp);
+
 	kfree(mtpt);
 	return 0;
 
@@ -1868,6 +1871,7 @@ init_xfs_fs(void)
 		goto out_cleanup_procfs;
 
 	vfs_initquota();
+	xfs_inode_shrinker_init();
 
 	error = register_filesystem(&xfs_fs_type);
 	if (error)
@@ -1895,6 +1899,7 @@ exit_xfs_fs(void)
 {
 	vfs_exitquota();
 	unregister_filesystem(&xfs_fs_type);
+	xfs_inode_shrinker_destroy();
 	xfs_sysctl_unregister();
 	xfs_cleanup_procfs();
 	xfs_buf_terminate();
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 3a64179..3884e20 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -95,7 +95,8 @@ xfs_inode_ag_walk(
 					   struct xfs_perag *pag, int flags),
 	int			flags,
 	int			tag,
-	int			exclusive)
+	int			exclusive,
+	int			*nr_to_scan)
 {
 	uint32_t		first_index;
 	int			last_error = 0;
@@ -134,7 +135,7 @@ restart:
 		if (error == EFSCORRUPTED)
 			break;
 
-	} while (1);
+	} while ((*nr_to_scan)--);
 
 	if (skipped) {
 		delay(1);
@@ -150,12 +151,15 @@ xfs_inode_ag_iterator(
 					   struct xfs_perag *pag, int flags),
 	int			flags,
 	int			tag,
-	int			exclusive)
+	int			exclusive,
+	int			*nr_to_scan)
 {
 	int			error = 0;
 	int			last_error = 0;
 	xfs_agnumber_t		ag;
+	int			nr;
 
+	nr = nr_to_scan ? *nr_to_scan : INT_MAX;
 	for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
 		struct xfs_perag	*pag;
 
@@ -165,14 +169,18 @@ xfs_inode_ag_iterator(
 			continue;
 		}
 		error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
-						exclusive);
+						exclusive, &nr);
 		xfs_perag_put(pag);
 		if (error) {
 			last_error = error;
 			if (error == EFSCORRUPTED)
 				break;
 		}
+		if (nr <= 0)
+			break;
 	}
+	if (nr_to_scan)
+		*nr_to_scan = nr;
 	return XFS_ERROR(last_error);
 }
 
@@ -291,7 +299,7 @@ xfs_sync_data(
 	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
 
 	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
-				      XFS_ICI_NO_TAG, 0);
+				      XFS_ICI_NO_TAG, 0, NULL);
 	if (error)
 		return XFS_ERROR(error);
 
@@ -310,7 +318,7 @@ xfs_sync_attr(
 	ASSERT((flags & ~SYNC_WAIT) == 0);
 
 	return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
-				     XFS_ICI_NO_TAG, 0);
+				     XFS_ICI_NO_TAG, 0, NULL);
 }
 
 STATIC int
@@ -636,6 +644,7 @@ __xfs_inode_set_reclaim_tag(
 	radix_tree_tag_set(&pag->pag_ici_root,
 			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
 			   XFS_ICI_RECLAIM_TAG);
+	pag->pag_ici_reclaimable++;
 }
 
 /*
@@ -668,6 +677,7 @@ __xfs_inode_clear_reclaim_tag(
 {
 	radix_tree_tag_clear(&pag->pag_ici_root,
 			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+	pag->pag_ici_reclaimable--;
 }
 
 /*
@@ -817,5 +827,93 @@ xfs_reclaim_inodes(
 	int		mode)
 {
 	return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
-					XFS_ICI_RECLAIM_TAG, 1);
+					XFS_ICI_RECLAIM_TAG, 1, NULL);
+}
+
+/*
+ * Shrinker infrastructure.
+ *
+ * This is all far more complex than it needs to be. It adds a global list of
+ * mounts because the shrinkers can only call a global context. We need to make
+ * the shrinkers pass a context to avoid the need for global state.
+ */
+static LIST_HEAD(xfs_mount_list);
+static struct rw_semaphore xfs_mount_list_lock;
+
+static int
+xfs_reclaim_inode_shrink(
+	int		nr_to_scan,
+	gfp_t		gfp_mask)
+{
+	struct xfs_mount *mp;
+	struct xfs_perag *pag;
+	xfs_agnumber_t	ag;
+	int		reclaimable = 0;
+
+	if (nr_to_scan) {
+		if (!(gfp_mask & __GFP_FS))
+			return -1;
+
+		down_read(&xfs_mount_list_lock);
+		list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
+			xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
+					XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
+			if (nr_to_scan <= 0)
+				break;
+		}
+		up_read(&xfs_mount_list_lock);
+	}
+
+	down_read(&xfs_mount_list_lock);
+	list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
+		for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
+
+			pag = xfs_perag_get(mp, ag);
+			if (!pag->pag_ici_init) {
+				xfs_perag_put(pag);
+				continue;
+			}
+			reclaimable += pag->pag_ici_reclaimable;
+			xfs_perag_put(pag);
+		}
+	}
+	up_read(&xfs_mount_list_lock);
+	return reclaimable;
+}
+
+static struct shrinker xfs_inode_shrinker = {
+	.shrink = xfs_reclaim_inode_shrink,
+	.seeks = DEFAULT_SEEKS,
+};
+
+void __init
+xfs_inode_shrinker_init(void)
+{
+	init_rwsem(&xfs_mount_list_lock);
+	register_shrinker(&xfs_inode_shrinker);
+}
+
+void
+xfs_inode_shrinker_destroy(void)
+{
+	ASSERT(list_empty(&xfs_mount_list));
+	unregister_shrinker(&xfs_inode_shrinker);
+}
+
+void
+xfs_inode_shrinker_register(
+	struct xfs_mount	*mp)
+{
+	down_write(&xfs_mount_list_lock);
+	list_add_tail(&mp->m_mplist, &xfs_mount_list);
+	up_write(&xfs_mount_list_lock);
+}
+
+void
+xfs_inode_shrinker_unregister(
+	struct xfs_mount	*mp)
+{
+	down_write(&xfs_mount_list_lock);
+	list_del(&mp->m_mplist);
+	up_write(&xfs_mount_list_lock);
 }
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index d480c34..cdcbaac 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -53,6 +53,11 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
 int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
 	int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
-	int flags, int tag, int write_lock);
+	int flags, int tag, int write_lock, int *nr_to_scan);
+
+void xfs_inode_shrinker_init(void);
+void xfs_inode_shrinker_destroy(void);
+void xfs_inode_shrinker_register(struct xfs_mount *mp);
+void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
 
 #endif
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index f14408e..26fa431 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -889,7 +889,8 @@ xfs_qm_dqrele_all_inodes(
 	uint		 flags)
 {
 	ASSERT(mp->m_quotainfo);
-	xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0);
+	xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags,
+				XFS_ICI_NO_TAG, 0, NULL);
 }
 
 /*------------------------------------------------------------------------*/
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index b1a5a1f..abb8222 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -223,6 +223,7 @@ typedef struct xfs_perag {
 	int		pag_ici_init;	/* incore inode cache initialised */
 	rwlock_t	pag_ici_lock;	/* incore inode lock */
 	struct radix_tree_root pag_ici_root;	/* incore inode cache root */
+	int		pag_ici_reclaimable;	/* reclaimable inodes */
 #endif
 	int		pagb_count;	/* pagb slots in use */
 	xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS];	/* unstable blocks */
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 4fa0bc7..9ff48a1 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -259,6 +259,7 @@ typedef struct xfs_mount {
 	wait_queue_head_t	m_wait_single_sync_task;
 	__int64_t		m_update_flags;	/* sb flags we need to update
 						   on the next remount,rw */
+	struct list_head	m_mplist;	/* inode shrinker mount list */
 } xfs_mount_t;
 
 /*
-- 
1.5.6.5

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH] xfs: add a shrinker to background inode reclaim
  2010-04-28  4:34 [PATCH] xfs: add a shrinker to background inode reclaim Dave Chinner
@ 2010-04-28 11:31 ` Christoph Hellwig
  2010-04-29 11:07 ` Alex Elder
  1 sibling, 0 replies; 4+ messages in thread
From: Christoph Hellwig @ 2010-04-28 11:31 UTC (permalink / raw)
  To: Dave Chinner; +Cc: npiggin, linux-mm, xfs

This is quite ugly compared to the previous version but looks correct
enough to me.  One problem is that the first filesystem registered will
get an far over-proportional number of shrink requests, which the simple
patch to pass private data to the shrinker would get around easily.

Anyway, we need a fix, so:


Reviewed-by: Christoph Hellwig <hch@lst.de>

On Wed, Apr 28, 2010 at 02:34:08PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> On low memory boxes or those with highmem, kernel can OOM before the
> background reclaims inodes via xfssyncd. Add a shrinker to run inode
> reclaim so that it inode reclaim is expedited when memory is low.
> 
> This is more complex than it needs to be because the VM folk don't
> want a context added to the shrinker infrastructure. Hence we need
> to add a global list of XFS mount structures so the shrinker can
> traverse them.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/linux-2.6/xfs_super.c   |    5 ++
>  fs/xfs/linux-2.6/xfs_sync.c    |  112 +++++++++++++++++++++++++++++++++++++---
>  fs/xfs/linux-2.6/xfs_sync.h    |    7 ++-
>  fs/xfs/quota/xfs_qm_syscalls.c |    3 +-
>  fs/xfs/xfs_ag.h                |    1 +
>  fs/xfs/xfs_mount.h             |    1 +
>  6 files changed, 120 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
> index 1abcacc..a43d09e 100644
> --- a/fs/xfs/linux-2.6/xfs_super.c
> +++ b/fs/xfs/linux-2.6/xfs_super.c
> @@ -1210,6 +1210,7 @@ xfs_fs_put_super(
>  
>  	xfs_unmountfs(mp);
>  	xfs_freesb(mp);
> +	xfs_inode_shrinker_unregister(mp);
>  	xfs_icsb_destroy_counters(mp);
>  	xfs_close_devices(mp);
>  	xfs_dmops_put(mp);
> @@ -1623,6 +1624,8 @@ xfs_fs_fill_super(
>  	if (error)
>  		goto fail_vnrele;
>  
> +	xfs_inode_shrinker_register(mp);
> +
>  	kfree(mtpt);
>  	return 0;
>  
> @@ -1868,6 +1871,7 @@ init_xfs_fs(void)
>  		goto out_cleanup_procfs;
>  
>  	vfs_initquota();
> +	xfs_inode_shrinker_init();
>  
>  	error = register_filesystem(&xfs_fs_type);
>  	if (error)
> @@ -1895,6 +1899,7 @@ exit_xfs_fs(void)
>  {
>  	vfs_exitquota();
>  	unregister_filesystem(&xfs_fs_type);
> +	xfs_inode_shrinker_destroy();
>  	xfs_sysctl_unregister();
>  	xfs_cleanup_procfs();
>  	xfs_buf_terminate();
> diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
> index 3a64179..3884e20 100644
> --- a/fs/xfs/linux-2.6/xfs_sync.c
> +++ b/fs/xfs/linux-2.6/xfs_sync.c
> @@ -95,7 +95,8 @@ xfs_inode_ag_walk(
>  					   struct xfs_perag *pag, int flags),
>  	int			flags,
>  	int			tag,
> -	int			exclusive)
> +	int			exclusive,
> +	int			*nr_to_scan)
>  {
>  	uint32_t		first_index;
>  	int			last_error = 0;
> @@ -134,7 +135,7 @@ restart:
>  		if (error == EFSCORRUPTED)
>  			break;
>  
> -	} while (1);
> +	} while ((*nr_to_scan)--);
>  
>  	if (skipped) {
>  		delay(1);
> @@ -150,12 +151,15 @@ xfs_inode_ag_iterator(
>  					   struct xfs_perag *pag, int flags),
>  	int			flags,
>  	int			tag,
> -	int			exclusive)
> +	int			exclusive,
> +	int			*nr_to_scan)
>  {
>  	int			error = 0;
>  	int			last_error = 0;
>  	xfs_agnumber_t		ag;
> +	int			nr;
>  
> +	nr = nr_to_scan ? *nr_to_scan : INT_MAX;
>  	for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
>  		struct xfs_perag	*pag;
>  
> @@ -165,14 +169,18 @@ xfs_inode_ag_iterator(
>  			continue;
>  		}
>  		error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
> -						exclusive);
> +						exclusive, &nr);
>  		xfs_perag_put(pag);
>  		if (error) {
>  			last_error = error;
>  			if (error == EFSCORRUPTED)
>  				break;
>  		}
> +		if (nr <= 0)
> +			break;
>  	}
> +	if (nr_to_scan)
> +		*nr_to_scan = nr;
>  	return XFS_ERROR(last_error);
>  }
>  
> @@ -291,7 +299,7 @@ xfs_sync_data(
>  	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
>  
>  	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
> -				      XFS_ICI_NO_TAG, 0);
> +				      XFS_ICI_NO_TAG, 0, NULL);
>  	if (error)
>  		return XFS_ERROR(error);
>  
> @@ -310,7 +318,7 @@ xfs_sync_attr(
>  	ASSERT((flags & ~SYNC_WAIT) == 0);
>  
>  	return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
> -				     XFS_ICI_NO_TAG, 0);
> +				     XFS_ICI_NO_TAG, 0, NULL);
>  }
>  
>  STATIC int
> @@ -636,6 +644,7 @@ __xfs_inode_set_reclaim_tag(
>  	radix_tree_tag_set(&pag->pag_ici_root,
>  			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
>  			   XFS_ICI_RECLAIM_TAG);
> +	pag->pag_ici_reclaimable++;
>  }
>  
>  /*
> @@ -668,6 +677,7 @@ __xfs_inode_clear_reclaim_tag(
>  {
>  	radix_tree_tag_clear(&pag->pag_ici_root,
>  			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
> +	pag->pag_ici_reclaimable--;
>  }
>  
>  /*
> @@ -817,5 +827,93 @@ xfs_reclaim_inodes(
>  	int		mode)
>  {
>  	return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
> -					XFS_ICI_RECLAIM_TAG, 1);
> +					XFS_ICI_RECLAIM_TAG, 1, NULL);
> +}
> +
> +/*
> + * Shrinker infrastructure.
> + *
> + * This is all far more complex than it needs to be. It adds a global list of
> + * mounts because the shrinkers can only call a global context. We need to make
> + * the shrinkers pass a context to avoid the need for global state.
> + */
> +static LIST_HEAD(xfs_mount_list);
> +static struct rw_semaphore xfs_mount_list_lock;
> +
> +static int
> +xfs_reclaim_inode_shrink(
> +	int		nr_to_scan,
> +	gfp_t		gfp_mask)
> +{
> +	struct xfs_mount *mp;
> +	struct xfs_perag *pag;
> +	xfs_agnumber_t	ag;
> +	int		reclaimable = 0;
> +
> +	if (nr_to_scan) {
> +		if (!(gfp_mask & __GFP_FS))
> +			return -1;
> +
> +		down_read(&xfs_mount_list_lock);
> +		list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
> +			xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
> +					XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
> +			if (nr_to_scan <= 0)
> +				break;
> +		}
> +		up_read(&xfs_mount_list_lock);
> +	}
> +
> +	down_read(&xfs_mount_list_lock);
> +	list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
> +		for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
> +
> +			pag = xfs_perag_get(mp, ag);
> +			if (!pag->pag_ici_init) {
> +				xfs_perag_put(pag);
> +				continue;
> +			}
> +			reclaimable += pag->pag_ici_reclaimable;
> +			xfs_perag_put(pag);
> +		}
> +	}
> +	up_read(&xfs_mount_list_lock);
> +	return reclaimable;
> +}
> +
> +static struct shrinker xfs_inode_shrinker = {
> +	.shrink = xfs_reclaim_inode_shrink,
> +	.seeks = DEFAULT_SEEKS,
> +};
> +
> +void __init
> +xfs_inode_shrinker_init(void)
> +{
> +	init_rwsem(&xfs_mount_list_lock);
> +	register_shrinker(&xfs_inode_shrinker);
> +}
> +
> +void
> +xfs_inode_shrinker_destroy(void)
> +{
> +	ASSERT(list_empty(&xfs_mount_list));
> +	unregister_shrinker(&xfs_inode_shrinker);
> +}
> +
> +void
> +xfs_inode_shrinker_register(
> +	struct xfs_mount	*mp)
> +{
> +	down_write(&xfs_mount_list_lock);
> +	list_add_tail(&mp->m_mplist, &xfs_mount_list);
> +	up_write(&xfs_mount_list_lock);
> +}
> +
> +void
> +xfs_inode_shrinker_unregister(
> +	struct xfs_mount	*mp)
> +{
> +	down_write(&xfs_mount_list_lock);
> +	list_del(&mp->m_mplist);
> +	up_write(&xfs_mount_list_lock);
>  }
> diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
> index d480c34..cdcbaac 100644
> --- a/fs/xfs/linux-2.6/xfs_sync.h
> +++ b/fs/xfs/linux-2.6/xfs_sync.h
> @@ -53,6 +53,11 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
>  int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
>  int xfs_inode_ag_iterator(struct xfs_mount *mp,
>  	int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
> -	int flags, int tag, int write_lock);
> +	int flags, int tag, int write_lock, int *nr_to_scan);
> +
> +void xfs_inode_shrinker_init(void);
> +void xfs_inode_shrinker_destroy(void);
> +void xfs_inode_shrinker_register(struct xfs_mount *mp);
> +void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
>  
>  #endif
> diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
> index f14408e..26fa431 100644
> --- a/fs/xfs/quota/xfs_qm_syscalls.c
> +++ b/fs/xfs/quota/xfs_qm_syscalls.c
> @@ -889,7 +889,8 @@ xfs_qm_dqrele_all_inodes(
>  	uint		 flags)
>  {
>  	ASSERT(mp->m_quotainfo);
> -	xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0);
> +	xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags,
> +				XFS_ICI_NO_TAG, 0, NULL);
>  }
>  
>  /*------------------------------------------------------------------------*/
> diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
> index b1a5a1f..abb8222 100644
> --- a/fs/xfs/xfs_ag.h
> +++ b/fs/xfs/xfs_ag.h
> @@ -223,6 +223,7 @@ typedef struct xfs_perag {
>  	int		pag_ici_init;	/* incore inode cache initialised */
>  	rwlock_t	pag_ici_lock;	/* incore inode lock */
>  	struct radix_tree_root pag_ici_root;	/* incore inode cache root */
> +	int		pag_ici_reclaimable;	/* reclaimable inodes */
>  #endif
>  	int		pagb_count;	/* pagb slots in use */
>  	xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS];	/* unstable blocks */
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 4fa0bc7..9ff48a1 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -259,6 +259,7 @@ typedef struct xfs_mount {
>  	wait_queue_head_t	m_wait_single_sync_task;
>  	__int64_t		m_update_flags;	/* sb flags we need to update
>  						   on the next remount,rw */
> +	struct list_head	m_mplist;	/* inode shrinker mount list */
>  } xfs_mount_t;
>  
>  /*
> -- 
> 1.5.6.5
> 
> _______________________________________________
> xfs mailing list
> xfs@oss.sgi.com
> http://oss.sgi.com/mailman/listinfo/xfs
---end quoted text---

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] xfs: add a shrinker to background inode reclaim
  2010-04-28  4:34 [PATCH] xfs: add a shrinker to background inode reclaim Dave Chinner
  2010-04-28 11:31 ` Christoph Hellwig
@ 2010-04-29 11:07 ` Alex Elder
  2010-04-30  2:02   ` Dave Chinner
  1 sibling, 1 reply; 4+ messages in thread
From: Alex Elder @ 2010-04-29 11:07 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On Wed, 2010-04-28 at 14:34 +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> On low memory boxes or those with highmem, kernel can OOM before the
> background reclaims inodes via xfssyncd. Add a shrinker to run inode
> reclaim so that it inode reclaim is expedited when memory is low.
> 
> This is more complex than it needs to be because the VM folk don't
> want a context added to the shrinker infrastructure. Hence we need
> to add a global list of XFS mount structures so the shrinker can
> traverse them.

I have some comments; one of them suggests a fairness
change.  But I know this fix is very important and
we're short on time so I'm going to pull this in as-is
and ask Linus to pull it as well.  

> Signed-off-by: Dave Chinner <dchinner@redhat.com>

Reviewed-by: Alex Elder <aelder@sgi.com>

> ---
>  fs/xfs/linux-2.6/xfs_super.c   |    5 ++
>  fs/xfs/linux-2.6/xfs_sync.c    |  112 +++++++++++++++++++++++++++++++++++++---
>  fs/xfs/linux-2.6/xfs_sync.h    |    7 ++-
>  fs/xfs/quota/xfs_qm_syscalls.c |    3 +-
>  fs/xfs/xfs_ag.h                |    1 +
>  fs/xfs/xfs_mount.h             |    1 +
>  6 files changed, 120 insertions(+), 9 deletions(-)
> 

. . .

> diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
> index 3a64179..3884e20 100644
> --- a/fs/xfs/linux-2.6/xfs_sync.c
> +++ b/fs/xfs/linux-2.6/xfs_sync.c
. . .
> @@ -134,7 +135,7 @@ restart:
>  		if (error == EFSCORRUPTED)
>  			break;
>  
> -	} while (1);
> +	} while ((*nr_to_scan)--);

Do you ever plan to return a negative value in *nr_to_scan?
(I know it matches the shrinker->shrink nr_to_scan argument
type.  More explanation on this question below.)

>  
>  	if (skipped) {
>  		delay(1);
. . .
> @@ -165,14 +169,18 @@ xfs_inode_ag_iterator(
>  			continue;
>  		}
>  		error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
> -						exclusive);
> +						exclusive, &nr);
>  		xfs_perag_put(pag);
>  		if (error) {
>  			last_error = error;
>  			if (error == EFSCORRUPTED)
>  				break;
>  		}
> +		if (nr <= 0)
> +			break;

Why check for negative here?  It will never go negative
as currently used.  (Defensive coding is a reasonable
explanation.)

>  	}
> +	if (nr_to_scan)
> +		*nr_to_scan = nr;
>  	return XFS_ERROR(last_error);
>  }
>  
. . .

> @@ -817,5 +827,93 @@ xfs_reclaim_inodes(
>  	int		mode)
>  {
>  	return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
> -					XFS_ICI_RECLAIM_TAG, 1);
> +					XFS_ICI_RECLAIM_TAG, 1, NULL);
> +}
> +
> +/*
> + * Shrinker infrastructure.
> + *
> + * This is all far more complex than it needs to be. It adds a global list of
> + * mounts because the shrinkers can only call a global context. We need to make
> + * the shrinkers pass a context to avoid the need for global state.
> + */
> +static LIST_HEAD(xfs_mount_list);
> +static struct rw_semaphore xfs_mount_list_lock;
> +
> +static int
> +xfs_reclaim_inode_shrink(
> +	int		nr_to_scan,
> +	gfp_t		gfp_mask)
> +{
> +	struct xfs_mount *mp;
> +	struct xfs_perag *pag;
> +	xfs_agnumber_t	ag;
> +	int		reclaimable = 0;
> +
> +	if (nr_to_scan) {
> +		if (!(gfp_mask & __GFP_FS))
> +			return -1;
> +
> +		down_read(&xfs_mount_list_lock);
> +		list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
> +			xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
> +					XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
> +			if (nr_to_scan <= 0)
> +				break;

So we limit our scan using nr_to_scan, and we'll break out as
soon as we have exhausted that many scans (i.e., reclaimed
inodes).  We're enforcing this limit across the global list of
all mounts.  Since we don't update the list, and we traverse
the list from the beginning each time, the first mount always
gets hit, the ones later in the list may never be reached.

I think--given the restricted environment we're working in
here--we could make the scans more fair by rotating the starting
point for scans each time through.  I.e., record the last mount
point we scanned and pick up there next time (or at the front
of the list if that mount went away).

> +		}
> +		up_read(&xfs_mount_list_lock);
> +	}
> +
> +	down_read(&xfs_mount_list_lock);
> +	list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
> +		for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
> +
> +			pag = xfs_perag_get(mp, ag);
> +			if (!pag->pag_ici_init) {
> +				xfs_perag_put(pag);
> +				continue;
> +			}
> +			reclaimable += pag->pag_ici_reclaimable;
> +			xfs_perag_put(pag);
> +		}
> +	}
> +	up_read(&xfs_mount_list_lock);
> +	return reclaimable;
> +}
> +
> +static struct shrinker xfs_inode_shrinker = {
> +	.shrink = xfs_reclaim_inode_shrink,
> +	.seeks = DEFAULT_SEEKS,
> +};
. . .
(The rest is fine)

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] xfs: add a shrinker to background inode reclaim
  2010-04-29 11:07 ` Alex Elder
@ 2010-04-30  2:02   ` Dave Chinner
  0 siblings, 0 replies; 4+ messages in thread
From: Dave Chinner @ 2010-04-30  2:02 UTC (permalink / raw)
  To: Alex Elder; +Cc: xfs

On Thu, Apr 29, 2010 at 06:07:33AM -0500, Alex Elder wrote:
> On Wed, 2010-04-28 at 14:34 +1000, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > On low memory boxes or those with highmem, kernel can OOM before the
> > background reclaims inodes via xfssyncd. Add a shrinker to run inode
> > reclaim so that it inode reclaim is expedited when memory is low.
> > 
> > This is more complex than it needs to be because the VM folk don't
> > want a context added to the shrinker infrastructure. Hence we need
> > to add a global list of XFS mount structures so the shrinker can
> > traverse them.
> 
> I have some comments; one of them suggests a fairness
> change.  But I know this fix is very important and
> we're short on time so I'm going to pull this in as-is
> and ask Linus to pull it as well.  

I tried being fair, it didn't work and I have bigger things to spend
my time on (like getting context based shrinkers accepted ;).

> . . .
> > @@ -134,7 +135,7 @@ restart:
> >  		if (error == EFSCORRUPTED)
> >  			break;
> >  
> > -	} while (1);
> > +	} while ((*nr_to_scan)--);
> 
> Do you ever plan to return a negative value in *nr_to_scan?

No.

> > @@ -165,14 +169,18 @@ xfs_inode_ag_iterator(
> >  			continue;
> >  		}
> >  		error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
> > -						exclusive);
> > +						exclusive, &nr);
> >  		xfs_perag_put(pag);
> >  		if (error) {
> >  			last_error = error;
> >  			if (error == EFSCORRUPTED)
> >  				break;
> >  		}
> > +		if (nr <= 0)
> > +			break;
> 
> Why check for negative here?  It will never go negative
> as currently used.  (Defensive coding is a reasonable
> explanation.)

A negative value is invalid - indicateѕ an overflow - so abort on
the termination case (0) or invalid value (< 0). So yes, i guess you
could say "defensive coding". ;)

> > +/*
> > + * Shrinker infrastructure.
> > + *
> > + * This is all far more complex than it needs to be. It adds a global list of
> > + * mounts because the shrinkers can only call a global context. We need to make
> > + * the shrinkers pass a context to avoid the need for global state.
> > + */
> > +static LIST_HEAD(xfs_mount_list);
> > +static struct rw_semaphore xfs_mount_list_lock;
> > +
> > +static int
> > +xfs_reclaim_inode_shrink(
> > +	int		nr_to_scan,
> > +	gfp_t		gfp_mask)
> > +{
> > +	struct xfs_mount *mp;
> > +	struct xfs_perag *pag;
> > +	xfs_agnumber_t	ag;
> > +	int		reclaimable = 0;
> > +
> > +	if (nr_to_scan) {
> > +		if (!(gfp_mask & __GFP_FS))
> > +			return -1;
> > +
> > +		down_read(&xfs_mount_list_lock);
> > +		list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
> > +			xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
> > +					XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
> > +			if (nr_to_scan <= 0)
> > +				break;
> 
> So we limit our scan using nr_to_scan, and we'll break out as
> soon as we have exhausted that many scans

Yup, that's exactly how shrinkers are supposed to work. If there
is more to do, then the shrinker will be called again with a new
nr_to_scan value.

> (i.e., reclaimed inodes). 

A scan count is not actually freeing/reclaiming an object, it's the
number of objects we've looked at with intent to reclaim. We may not
reclaim any by the time the scan count is exhausted, and that's the
reason that why proportioning across filesystems is not working in
this case.

The reason for this behaviour is that it prevents scanning thousands
(even 10s or 100s of thousands) of objects before finding enough to
reclaim.  If there are that many unreclaimable objects on the end of
the LRU, then there's no point trying to reclaim from it right now
and the time is better spent on reclaiming from somewhere else.

> We're enforcing this limit across the global list of
> all mounts.  Since we don't update the list, and we traverse
> the list from the beginning each time, the first mount always
> gets hit, the ones later in the list may never be reached.

Yes, that may happen. however, if the first filesystem has more
inodes to reclaim, we don't lose anything by reclaiming them first,
as the xfssyncd for the other fielsystems will come by soon and do a
complete reclaim pass on them.

The issue with the shrinker is that if we split the batch up over
all the filesystems, then the shrinker makes much slower progress
because the batches are not large enough to get past the first
cluster of reclaimable inodes on each filesytsem that is currently
under IO.... 

> I think--given the restricted environment we're working in
> here--we could make the scans more fair by rotating the starting
> point for scans each time through.  I.e., record the last mount
> point we scanned and pick up there next time (or at the front
> of the list if that mount went away).

I agree with you in that if the aim was to keep this code then we'd
need to do this. However, it's not worth the effort of designing and
testing a fancy scheme given that I'm going to do my best to remove
this traverse list as quickly as possible. The current stuff works,
but trying to improve it is like polishing a turd - no matter how
shiny you make it, in the end it's still just a smelly pile of crap.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2010-04-30  2:00 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-04-28  4:34 [PATCH] xfs: add a shrinker to background inode reclaim Dave Chinner
2010-04-28 11:31 ` Christoph Hellwig
2010-04-29 11:07 ` Alex Elder
2010-04-30  2:02   ` Dave Chinner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox