public inbox for linux-xfs@vger.kernel.org
 help / color / mirror / Atom feed
* [Patch] unique per-AG inode generation number initialisation
@ 2008-04-01 23:18 David Chinner
  2008-04-02  4:02 ` Niv Sardi
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: David Chinner @ 2008-04-01 23:18 UTC (permalink / raw)
  To: xfs-dev; +Cc: xfs-oss

Don't initialise new inode generation numbers to zero

When we allocation new inode chunks, we initialise the generation
numbers to zero. This works fine until we delete a chunk and then
reallocate it, resulting in the same inode numbers but with a
reset generation count. This can result in inode/generation
pairs of different inodes occurring relatively close together.

Given that the inode/gen pair makes up the "unique" portion of
an NFS filehandle on XFS, this can result in file handles cached
on clients being seen on the wire from the server but refer to
a different file. This causes .... issues for NFS clients.

Hence we need a unique generation number initialisation for
each inode to prevent reuse of a small portion of the generation
number space. Make this initialiser per-allocation group so
that it is not a single point of contention in the filesystem,
and increment it on every allocation within an AG to reduce the
chance that a generation number is reused for a given inode number
if the inode chunk is deleted and reallocated immediately
afterwards.

It is safe to add the agi_newinogen field to the AGI without
using a feature bit. If an older kernel is used, it simply
will not update the field on allocation. If the kernel is
updated and the field has garbage in it, then it's like having a
random seed to the generation number....

Signed-off-by: Dave Chinner <dgc@sgi.com>
---
 fs/xfs/xfs_ag.h     |    4 +++-
 fs/xfs/xfs_ialloc.c |   30 ++++++++++++++++++++++--------
 2 files changed, 25 insertions(+), 9 deletions(-)

Index: 2.6.x-xfs-new/fs/xfs/xfs_ag.h
===================================================================
--- 2.6.x-xfs-new.orig/fs/xfs/xfs_ag.h	2008-01-18 18:30:06.000000000 +1100
+++ 2.6.x-xfs-new/fs/xfs/xfs_ag.h	2008-03-26 13:03:41.122918236 +1100
@@ -121,6 +121,7 @@ typedef struct xfs_agi {
 	 * still being referenced.
 	 */
 	__be32		agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
+	__be32		agi_newinogen;	/* inode cluster generation */
 } xfs_agi_t;
 
 #define	XFS_AGI_MAGICNUM	0x00000001
@@ -134,7 +135,8 @@ typedef struct xfs_agi {
 #define	XFS_AGI_NEWINO		0x00000100
 #define	XFS_AGI_DIRINO		0x00000200
 #define	XFS_AGI_UNLINKED	0x00000400
-#define	XFS_AGI_NUM_BITS	11
+#define	XFS_AGI_NEWINOGEN	0x00000800
+#define	XFS_AGI_NUM_BITS	12
 #define	XFS_AGI_ALL_BITS	((1 << XFS_AGI_NUM_BITS) - 1)
 
 /* disk block (xfs_daddr_t) in the AG */
Index: 2.6.x-xfs-new/fs/xfs/xfs_ialloc.c
===================================================================
--- 2.6.x-xfs-new.orig/fs/xfs/xfs_ialloc.c	2008-03-25 15:41:27.000000000 +1100
+++ 2.6.x-xfs-new/fs/xfs/xfs_ialloc.c	2008-03-26 14:29:47.998554368 +1100
@@ -309,6 +309,8 @@ xfs_ialloc_ag_alloc(
 			free = XFS_MAKE_IPTR(args.mp, fbuf, i);
 			free->di_core.di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
 			free->di_core.di_version = version;
+			free->di_core.di_gen = agi->agi_newinogen;
+			be32_add_cpu(&agi->agi_newinogen, 1);
 			free->di_next_unlinked = cpu_to_be32(NULLAGINO);
 			xfs_ialloc_log_di(tp, fbuf, i,
 				XFS_DI_CORE_BITS | XFS_DI_NEXT_UNLINKED);
@@ -347,7 +349,8 @@ xfs_ialloc_ag_alloc(
 	 * Log allocation group header fields
 	 */
 	xfs_ialloc_log_agi(tp, agbp,
-		XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO);
+		XFS_AGI_COUNT | XFS_AGI_FREECOUNT |
+		XFS_AGI_NEWINO | XFS_AGI_NEWINOGEN);
 	/*
 	 * Modify/log superblock values for inode count and inode free count.
 	 */
@@ -896,11 +899,12 @@ nextag:
 	ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
 	XFS_INOBT_CLR_FREE(&rec, offset);
 	rec.ir_freecount--;
+	be32_add_cpu(&agi->agi_newinogen, 1);
 	if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount,
 			rec.ir_free)))
 		goto error0;
 	be32_add(&agi->agi_freecount, -1);
-	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT | XFS_AGI_NEWINOGEN);
 	down_read(&mp->m_peraglock);
 	mp->m_perag[tagno].pagi_freecount--;
 	up_read(&mp->m_peraglock);
@@ -1320,6 +1324,11 @@ xfs_ialloc_compute_maxlevels(
 
 /*
  * Log specified fields for the ag hdr (inode section)
+ *
+ * We don't log the unlinked inode fields through here; they
+ * get logged directly to the buffer. Hence we have a discontinuity
+ * in the fields we are logging and we need two calls to map all
+ * the dirtied parts of the agi....
  */
 void
 xfs_ialloc_log_agi(
@@ -1342,22 +1351,27 @@ xfs_ialloc_log_agi(
 		offsetof(xfs_agi_t, agi_newino),
 		offsetof(xfs_agi_t, agi_dirino),
 		offsetof(xfs_agi_t, agi_unlinked),
+		offsetof(xfs_agi_t, agi_newinogen),
 		sizeof(xfs_agi_t)
 	};
+	int			log_newino = fields & XFS_AGI_NEWINOGEN;
+
 #ifdef DEBUG
 	xfs_agi_t		*agi;	/* allocation group header */
 
 	agi = XFS_BUF_TO_AGI(bp);
 	ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC);
 #endif
-	/*
-	 * Compute byte offsets for the first and last fields.
-	 */
+	fields &= ~XFS_AGI_NEWINOGEN;
+
+	/* Compute byte offsets for the first and last fields.  */
 	xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS, &first, &last);
-	/*
-	 * Log the allocation group inode header buffer.
-	 */
 	xfs_trans_log_buf(tp, bp, first, last);
+	if (log_newino) {
+		xfs_btree_offsets(XFS_AGI_NEWINOGEN, offsets, XFS_AGI_NUM_BITS,
+					&first, &last);
+		xfs_trans_log_buf(tp, bp, first, last);
+	}
 }
 
 /*

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [Patch] unique per-AG inode generation number initialisation
  2008-04-01 23:18 [Patch] unique per-AG inode generation number initialisation David Chinner
@ 2008-04-02  4:02 ` Niv Sardi
  2008-04-04  9:08 ` Hans-Peter Jansen
  2008-04-07 12:57 ` Christoph Hellwig
  2 siblings, 0 replies; 6+ messages in thread
From: Niv Sardi @ 2008-04-02  4:02 UTC (permalink / raw)
  To: David Chinner; +Cc: xfs-dev, xfs-oss


<HAT type=DMAPI>

David Chinner <dgc@sgi.com> writes:
> Don't initialise new inode generation numbers to zero
>
> When we allocation new inode chunks, we initialise the generation
> numbers to zero. This works fine until we delete a chunk and then
> reallocate it, resulting in the same inode numbers but with a
> reset generation count. This can result in inode/generation
> pairs of different inodes occurring relatively close together.
>
> Given that the inode/gen pair makes up the "unique" portion of
> an NFS filehandle on XFS, this can result in file handles cached
> on clients being seen on the wire from the server but refer to
> a different file. This causes .... issues for NFS clients.
>
> Hence we need a unique generation number initialisation for
> each inode to prevent reuse of a small portion of the generation
> number space. Make this initialiser per-allocation group so
> that it is not a single point of contention in the filesystem,
> and increment it on every allocation within an AG to reduce the
> chance that a generation number is reused for a given inode number
> if the inode chunk is deleted and reallocated immediately
> afterwards.
>
> It is safe to add the agi_newinogen field to the AGI without
> using a feature bit. If an older kernel is used, it simply
> will not update the field on allocation. If the kernel is
> updated and the field has garbage in it, then it's like having a
> random seed to the generation number....
>
> Signed-off-by: Dave Chinner <dgc@sgi.com>
> ---
>  fs/xfs/xfs_ag.h     |    4 +++-
>  fs/xfs/xfs_ialloc.c |   30 ++++++++++++++++++++++--------
>  2 files changed, 25 insertions(+), 9 deletions(-)

Appart from the bit of overhead all seems good.

> Index: 2.6.x-xfs-new/fs/xfs/xfs_ag.h
> ===================================================================
> --- 2.6.x-xfs-new.orig/fs/xfs/xfs_ag.h	2008-01-18 18:30:06.000000000 +1100
> +++ 2.6.x-xfs-new/fs/xfs/xfs_ag.h	2008-03-26 13:03:41.122918236 +1100
> @@ -121,6 +121,7 @@ typedef struct xfs_agi {
>  	 * still being referenced.
>  	 */
>  	__be32		agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
> +	__be32		agi_newinogen;	/* inode cluster generation */
>  } xfs_agi_t;
>  
>  #define	XFS_AGI_MAGICNUM	0x00000001
> @@ -134,7 +135,8 @@ typedef struct xfs_agi {
>  #define	XFS_AGI_NEWINO		0x00000100
>  #define	XFS_AGI_DIRINO		0x00000200
>  #define	XFS_AGI_UNLINKED	0x00000400
> -#define	XFS_AGI_NUM_BITS	11
> +#define	XFS_AGI_NEWINOGEN	0x00000800
> +#define	XFS_AGI_NUM_BITS	12
>  #define	XFS_AGI_ALL_BITS	((1 << XFS_AGI_NUM_BITS) - 1)
>  
>  /* disk block (xfs_daddr_t) in the AG */
> Index: 2.6.x-xfs-new/fs/xfs/xfs_ialloc.c
> ===================================================================
> --- 2.6.x-xfs-new.orig/fs/xfs/xfs_ialloc.c	2008-03-25 15:41:27.000000000 +1100
> +++ 2.6.x-xfs-new/fs/xfs/xfs_ialloc.c	2008-03-26 14:29:47.998554368 +1100
> @@ -309,6 +309,8 @@ xfs_ialloc_ag_alloc(
>  			free = XFS_MAKE_IPTR(args.mp, fbuf, i);
>  			free->di_core.di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
>  			free->di_core.di_version = version;
> +			free->di_core.di_gen = agi->agi_newinogen;
> +			be32_add_cpu(&agi->agi_newinogen, 1);
>  			free->di_next_unlinked = cpu_to_be32(NULLAGINO);
>  			xfs_ialloc_log_di(tp, fbuf, i,
>  				XFS_DI_CORE_BITS | XFS_DI_NEXT_UNLINKED);
> @@ -347,7 +349,8 @@ xfs_ialloc_ag_alloc(
>  	 * Log allocation group header fields
>  	 */
>  	xfs_ialloc_log_agi(tp, agbp,
> -		XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO);
> +		XFS_AGI_COUNT | XFS_AGI_FREECOUNT |
> +		XFS_AGI_NEWINO | XFS_AGI_NEWINOGEN);
>  	/*
>  	 * Modify/log superblock values for inode count and inode free count.
>  	 */
> @@ -896,11 +899,12 @@ nextag:
>  	ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
>  	XFS_INOBT_CLR_FREE(&rec, offset);
>  	rec.ir_freecount--;
> +	be32_add_cpu(&agi->agi_newinogen, 1);
>  	if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount,
>  			rec.ir_free)))
>  		goto error0;
>  	be32_add(&agi->agi_freecount, -1);
> -	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
> +	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT | XFS_AGI_NEWINOGEN);
>  	down_read(&mp->m_peraglock);
>  	mp->m_perag[tagno].pagi_freecount--;
>  	up_read(&mp->m_peraglock);
> @@ -1320,6 +1324,11 @@ xfs_ialloc_compute_maxlevels(
>  
>  /*
>   * Log specified fields for the ag hdr (inode section)
> + *
> + * We don't log the unlinked inode fields through here; they
> + * get logged directly to the buffer. Hence we have a discontinuity
> + * in the fields we are logging and we need two calls to map all
> + * the dirtied parts of the agi....
>   */
>  void
>  xfs_ialloc_log_agi(
> @@ -1342,22 +1351,27 @@ xfs_ialloc_log_agi(
>  		offsetof(xfs_agi_t, agi_newino),
>  		offsetof(xfs_agi_t, agi_dirino),
>  		offsetof(xfs_agi_t, agi_unlinked),
> +		offsetof(xfs_agi_t, agi_newinogen),
>  		sizeof(xfs_agi_t)
>  	};
> +	int			log_newino = fields & XFS_AGI_NEWINOGEN;
> +
>  #ifdef DEBUG
>  	xfs_agi_t		*agi;	/* allocation group header */
>  
>  	agi = XFS_BUF_TO_AGI(bp);
>  	ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC);
>  #endif
> -	/*
> -	 * Compute byte offsets for the first and last fields.
> -	 */
> +	fields &= ~XFS_AGI_NEWINOGEN;
> +
> +	/* Compute byte offsets for the first and last fields.  */
>  	xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS, &first, &last);
> -	/*
> -	 * Log the allocation group inode header buffer.
> -	 */
>  	xfs_trans_log_buf(tp, bp, first, last);
> +	if (log_newino) {
> +		xfs_btree_offsets(XFS_AGI_NEWINOGEN, offsets, XFS_AGI_NUM_BITS,
> +					&first, &last);
> +		xfs_trans_log_buf(tp, bp, first, last);
> +	}
>  }
>  
>  /*

-- 
Niv Sardi

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [Patch] unique per-AG inode generation number initialisation
  2008-04-01 23:18 [Patch] unique per-AG inode generation number initialisation David Chinner
  2008-04-02  4:02 ` Niv Sardi
@ 2008-04-04  9:08 ` Hans-Peter Jansen
  2008-04-07 12:57 ` Christoph Hellwig
  2 siblings, 0 replies; 6+ messages in thread
From: Hans-Peter Jansen @ 2008-04-04  9:08 UTC (permalink / raw)
  To: David Chinner; +Cc: xfs-dev, xfs-oss

Dear David,

Am Mittwoch, 2. April 2008 schrieb David Chinner:
> Don't initialise new inode generation numbers to zero
>
> When we allocation new inode chunks, we initialise the generation
> numbers to zero. This works fine until we delete a chunk and then
> reallocate it, resulting in the same inode numbers but with a
> reset generation count. This can result in inode/generation
> pairs of different inodes occurring relatively close together.
>
> Given that the inode/gen pair makes up the "unique" portion of
> an NFS filehandle on XFS, this can result in file handles cached
> on clients being seen on the wire from the server but refer to
> a different file. This causes .... issues for NFS clients.
>
> Hence we need a unique generation number initialisation for
> each inode to prevent reuse of a small portion of the generation
> number space. Make this initialiser per-allocation group so
> that it is not a single point of contention in the filesystem,
> and increment it on every allocation within an AG to reduce the
> chance that a generation number is reused for a given inode number
> if the inode chunk is deleted and reallocated immediately
> afterwards.
>
> It is safe to add the agi_newinogen field to the AGI without
> using a feature bit. If an older kernel is used, it simply
> will not update the field on allocation. If the kernel is
> updated and the field has garbage in it, then it's like having a
> random seed to the generation number....

Is this patch ready for production systems? Did it went through QA already?

If it's ready, shouldn't it go to the Linux stable team then, also, since it 
sounds like a serious issue. My systems hit file servers with solely xfs 
via NFS3 rather heavily - hence I am concerned..

Thanks,
Pete

> Signed-off-by: Dave Chinner <dgc@sgi.com>
> ---
>  fs/xfs/xfs_ag.h     |    4 +++-
>  fs/xfs/xfs_ialloc.c |   30 ++++++++++++++++++++++--------
>  2 files changed, 25 insertions(+), 9 deletions(-)
>
> Index: 2.6.x-xfs-new/fs/xfs/xfs_ag.h
> ===================================================================
> --- 2.6.x-xfs-new.orig/fs/xfs/xfs_ag.h	2008-01-18 18:30:06.000000000
> +1100 +++ 2.6.x-xfs-new/fs/xfs/xfs_ag.h	2008-03-26 13:03:41.122918236
> +1100 @@ -121,6 +121,7 @@ typedef struct xfs_agi {
>  	 * still being referenced.
>  	 */
>  	__be32		agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
> +	__be32		agi_newinogen;	/* inode cluster generation */
>  } xfs_agi_t;
>
>  #define	XFS_AGI_MAGICNUM	0x00000001
> @@ -134,7 +135,8 @@ typedef struct xfs_agi {
>  #define	XFS_AGI_NEWINO		0x00000100
>  #define	XFS_AGI_DIRINO		0x00000200
>  #define	XFS_AGI_UNLINKED	0x00000400
> -#define	XFS_AGI_NUM_BITS	11
> +#define	XFS_AGI_NEWINOGEN	0x00000800
> +#define	XFS_AGI_NUM_BITS	12
>  #define	XFS_AGI_ALL_BITS	((1 << XFS_AGI_NUM_BITS) - 1)
>
>  /* disk block (xfs_daddr_t) in the AG */
> Index: 2.6.x-xfs-new/fs/xfs/xfs_ialloc.c
> ===================================================================
> --- 2.6.x-xfs-new.orig/fs/xfs/xfs_ialloc.c	2008-03-25 15:41:27.000000000
> +1100 +++ 2.6.x-xfs-new/fs/xfs/xfs_ialloc.c	2008-03-26 14:29:47.998554368
> +1100 @@ -309,6 +309,8 @@ xfs_ialloc_ag_alloc(
>  			free = XFS_MAKE_IPTR(args.mp, fbuf, i);
>  			free->di_core.di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
>  			free->di_core.di_version = version;
> +			free->di_core.di_gen = agi->agi_newinogen;
> +			be32_add_cpu(&agi->agi_newinogen, 1);
>  			free->di_next_unlinked = cpu_to_be32(NULLAGINO);
>  			xfs_ialloc_log_di(tp, fbuf, i,
>  				XFS_DI_CORE_BITS | XFS_DI_NEXT_UNLINKED);
> @@ -347,7 +349,8 @@ xfs_ialloc_ag_alloc(
>  	 * Log allocation group header fields
>  	 */
>  	xfs_ialloc_log_agi(tp, agbp,
> -		XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO);
> +		XFS_AGI_COUNT | XFS_AGI_FREECOUNT |
> +		XFS_AGI_NEWINO | XFS_AGI_NEWINOGEN);
>  	/*
>  	 * Modify/log superblock values for inode count and inode free count.
>  	 */
> @@ -896,11 +899,12 @@ nextag:
>  	ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
>  	XFS_INOBT_CLR_FREE(&rec, offset);
>  	rec.ir_freecount--;
> +	be32_add_cpu(&agi->agi_newinogen, 1);
>  	if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount,
>  			rec.ir_free)))
>  		goto error0;
>  	be32_add(&agi->agi_freecount, -1);
> -	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
> +	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT | XFS_AGI_NEWINOGEN);
>  	down_read(&mp->m_peraglock);
>  	mp->m_perag[tagno].pagi_freecount--;
>  	up_read(&mp->m_peraglock);
> @@ -1320,6 +1324,11 @@ xfs_ialloc_compute_maxlevels(
>
>  /*
>   * Log specified fields for the ag hdr (inode section)
> + *
> + * We don't log the unlinked inode fields through here; they
> + * get logged directly to the buffer. Hence we have a discontinuity
> + * in the fields we are logging and we need two calls to map all
> + * the dirtied parts of the agi....
>   */
>  void
>  xfs_ialloc_log_agi(
> @@ -1342,22 +1351,27 @@ xfs_ialloc_log_agi(
>  		offsetof(xfs_agi_t, agi_newino),
>  		offsetof(xfs_agi_t, agi_dirino),
>  		offsetof(xfs_agi_t, agi_unlinked),
> +		offsetof(xfs_agi_t, agi_newinogen),
>  		sizeof(xfs_agi_t)
>  	};
> +	int			log_newino = fields & XFS_AGI_NEWINOGEN;
> +
>  #ifdef DEBUG
>  	xfs_agi_t		*agi;	/* allocation group header */
>
>  	agi = XFS_BUF_TO_AGI(bp);
>  	ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC);
>  #endif
> -	/*
> -	 * Compute byte offsets for the first and last fields.
> -	 */
> +	fields &= ~XFS_AGI_NEWINOGEN;
> +
> +	/* Compute byte offsets for the first and last fields.  */
>  	xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS, &first, &last);
> -	/*
> -	 * Log the allocation group inode header buffer.
> -	 */
>  	xfs_trans_log_buf(tp, bp, first, last);
> +	if (log_newino) {
> +		xfs_btree_offsets(XFS_AGI_NEWINOGEN, offsets, XFS_AGI_NUM_BITS,
> +					&first, &last);
> +		xfs_trans_log_buf(tp, bp, first, last);
> +	}
>  }
>
>  /*

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [Patch] unique per-AG inode generation number initialisation
  2008-04-01 23:18 [Patch] unique per-AG inode generation number initialisation David Chinner
  2008-04-02  4:02 ` Niv Sardi
  2008-04-04  9:08 ` Hans-Peter Jansen
@ 2008-04-07 12:57 ` Christoph Hellwig
  2008-04-07 21:52   ` David Chinner
  2 siblings, 1 reply; 6+ messages in thread
From: Christoph Hellwig @ 2008-04-07 12:57 UTC (permalink / raw)
  To: David Chinner; +Cc: xfs-dev, xfs-oss

I don't really like this.  The chance to hit a previously used
generation seems to high.  What about making the first few bits
of each generation number a per-ag counter that's incremented anytime
we deallocate an inode cluster?

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [Patch] unique per-AG inode generation number initialisation
  2008-04-07 12:57 ` Christoph Hellwig
@ 2008-04-07 21:52   ` David Chinner
  2008-04-10  4:34     ` David Chinner
  0 siblings, 1 reply; 6+ messages in thread
From: David Chinner @ 2008-04-07 21:52 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: David Chinner, xfs-dev, xfs-oss

On Mon, Apr 07, 2008 at 08:57:38AM -0400, Christoph Hellwig wrote:
> I don't really like this.  The chance to hit a previously used generation
> seems to high.

The chance to hit an existing generation number is almost non-existant.

The counter is incremented on every allocation and not just when
inode chunks are allocated on disk. Hence a series of "allocate
chunk, unlink + free chunk, realloc chunk" is guaranteed to get a
higher generation number on reallocation, as is the "allocate a
chunk, while [1] {allocate; unlink}, unlink chunk, reallocate
chunk." These are the issues that are causing use problems right
now.

The generation number won't get reused at all until it wraps at 2^32
allocations within the AG, and then you've got to have a chunk of inodes
get freed and reallocated at the same time the counter matches an inode
generation number. While not impossible, it'll be pretty rare....

> What about making the first few bits of each generation
> number a per-ag counter that's incremented anytime we deallocate an inode
> cluster?

First thing I considered - increment on chunk freeing is not
sufficient guarantee of short-term uniqueness. To guarantee short
term uniqueness, the generation number used to initialise the inode
chunk if it is immediately reallocated needs to be greater than the
maximum used by any inode in the chunk that got freed. Now the "counter"
becomes a "maximum generation number used in the AG" value. This
also adds significant complexity to xfs_icluster_free() as we have to
look at every inode in the chunk and not just the ones that are
in-core.

FWIW, the biggest complexity with this approach is wrapping - how do
you tell what the highest highest generation number in the inode
chunk being freed is when some have wrapped through zero?

I basically gave up on this approach because of the extra complexity
and nasty, untestable corner cases it introduced into code that is
already complex. A simple incrementing counter solves the short-term
uniqueness problem while still making it very hard to get duplicates in
the long term. If you really, really need long term uniqueness, then
use 'ikeep'.

Cheers,

Dave.
-- 
Dave Chinner
Principal Engineer
SGI Australian Software Group

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [Patch] unique per-AG inode generation number initialisation
  2008-04-07 21:52   ` David Chinner
@ 2008-04-10  4:34     ` David Chinner
  0 siblings, 0 replies; 6+ messages in thread
From: David Chinner @ 2008-04-10  4:34 UTC (permalink / raw)
  To: David Chinner; +Cc: Christoph Hellwig, xfs-dev, xfs-oss

Ping? Any further concerns on this? I'd like to get this
resolved quickly.....

Cheers,

Dave.

On Tue, Apr 08, 2008 at 07:52:03AM +1000, David Chinner wrote:
> On Mon, Apr 07, 2008 at 08:57:38AM -0400, Christoph Hellwig wrote:
> > I don't really like this.  The chance to hit a previously used generation
> > seems to high.
> 
> The chance to hit an existing generation number is almost non-existant.
> 
> The counter is incremented on every allocation and not just when
> inode chunks are allocated on disk. Hence a series of "allocate
> chunk, unlink + free chunk, realloc chunk" is guaranteed to get a
> higher generation number on reallocation, as is the "allocate a
> chunk, while [1] {allocate; unlink}, unlink chunk, reallocate
> chunk." These are the issues that are causing use problems right
> now.
> 
> The generation number won't get reused at all until it wraps at 2^32
> allocations within the AG, and then you've got to have a chunk of inodes
> get freed and reallocated at the same time the counter matches an inode
> generation number. While not impossible, it'll be pretty rare....
> 
> > What about making the first few bits of each generation
> > number a per-ag counter that's incremented anytime we deallocate an inode
> > cluster?
> 
> First thing I considered - increment on chunk freeing is not
> sufficient guarantee of short-term uniqueness. To guarantee short
> term uniqueness, the generation number used to initialise the inode
> chunk if it is immediately reallocated needs to be greater than the
> maximum used by any inode in the chunk that got freed. Now the "counter"
> becomes a "maximum generation number used in the AG" value. This
> also adds significant complexity to xfs_icluster_free() as we have to
> look at every inode in the chunk and not just the ones that are
> in-core.
> 
> FWIW, the biggest complexity with this approach is wrapping - how do
> you tell what the highest highest generation number in the inode
> chunk being freed is when some have wrapped through zero?
> 
> I basically gave up on this approach because of the extra complexity
> and nasty, untestable corner cases it introduced into code that is
> already complex. A simple incrementing counter solves the short-term
> uniqueness problem while still making it very hard to get duplicates in
> the long term. If you really, really need long term uniqueness, then
> use 'ikeep'.
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> Principal Engineer
> SGI Australian Software Group

-- 
Dave Chinner
Principal Engineer
SGI Australian Software Group

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2008-04-10  4:34 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-04-01 23:18 [Patch] unique per-AG inode generation number initialisation David Chinner
2008-04-02  4:02 ` Niv Sardi
2008-04-04  9:08 ` Hans-Peter Jansen
2008-04-07 12:57 ` Christoph Hellwig
2008-04-07 21:52   ` David Chinner
2008-04-10  4:34     ` David Chinner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox