* [PATCH 1/7] repair: parallelise phase 7
2016-02-04 23:05 [PATCH 1/7 v2] repair: big broken filesystems cause pain Dave Chinner
@ 2016-02-04 23:05 ` Dave Chinner
2016-02-08 8:55 ` Christoph Hellwig
2016-02-04 23:05 ` [PATCH 2/7] repair: parallelise uncertain inode processing in phase 3 Dave Chinner
` (5 subsequent siblings)
6 siblings, 1 reply; 20+ messages in thread
From: Dave Chinner @ 2016-02-04 23:05 UTC (permalink / raw)
To: xfs
From: Dave Chinner <dchinner@redhat.com>
It operates on a single AG at a time, sequentially, doing inode updates. All the
data structures accessed and modified are per AG, as are the modifications to
on-disk structures. Hence we can run this phase concurrently across multiple
AGs.
This is important for large, broken filesystem repairs, where there can be
millions of inodes that need link counts updated. One such repair image takes
more than 45 minutes to run phase 7 as a single threaded operation.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
repair/phase7.c | 77 ++++++++++++++++++++++++++++++++++-------------------
repair/progress.c | 4 +--
repair/protos.h | 2 +-
repair/xfs_repair.c | 2 +-
4 files changed, 54 insertions(+), 31 deletions(-)
diff --git a/repair/phase7.c b/repair/phase7.c
index b1e3a55..91dad02 100644
--- a/repair/phase7.c
+++ b/repair/phase7.c
@@ -26,6 +26,7 @@
#include "dinode.h"
#include "versions.h"
#include "progress.h"
+#include "threads.h"
/* dinoc is a pointer to the IN-CORE dinode core */
static void
@@ -108,45 +109,67 @@ update_inode_nlinks(
IRELE(ip);
}
-void
-phase7(xfs_mount_t *mp)
+/*
+ * for each ag, look at each inode 1 at a time. If the number of
+ * links is bad, reset it, log the inode core, commit the transaction
+ */
+static void
+do_link_updates(
+ struct work_queue *wq,
+ xfs_agnumber_t agno,
+ void *arg)
{
ino_tree_node_t *irec;
- int i;
int j;
__uint32_t nrefs;
+ irec = findfirst_inode_rec(agno);
+
+ while (irec != NULL) {
+ for (j = 0; j < XFS_INODES_PER_CHUNK; j++) {
+ ASSERT(is_inode_confirmed(irec, j));
+
+ if (is_inode_free(irec, j))
+ continue;
+
+ ASSERT(no_modify || is_inode_reached(irec, j));
+
+ nrefs = num_inode_references(irec, j);
+ ASSERT(no_modify || nrefs > 0);
+
+ if (get_inode_disk_nlinks(irec, j) != nrefs)
+ update_inode_nlinks(wq->mp,
+ XFS_AGINO_TO_INO(wq->mp, agno,
+ irec->ino_startnum + j),
+ nrefs);
+ }
+ irec = next_ino_rec(irec);
+ }
+
+ PROG_RPT_INC(prog_rpt_done[agno], 1);
+}
+
+void
+phase7(
+ struct xfs_mount *mp,
+ int scan_threads)
+{
+ struct work_queue wq;
+ int agno;
+
if (!no_modify)
do_log(_("Phase 7 - verify and correct link counts...\n"));
else
do_log(_("Phase 7 - verify link counts...\n"));
- /*
- * for each ag, look at each inode 1 at a time. If the number of
- * links is bad, reset it, log the inode core, commit the transaction
- */
- for (i = 0; i < glob_agcount; i++) {
- irec = findfirst_inode_rec(i);
-
- while (irec != NULL) {
- for (j = 0; j < XFS_INODES_PER_CHUNK; j++) {
- ASSERT(is_inode_confirmed(irec, j));
+ set_progress_msg(PROGRESS_FMT_CORR_LINK, (__uint64_t) glob_agcount);
- if (is_inode_free(irec, j))
- continue;
+ create_work_queue(&wq, mp, scan_threads);
- ASSERT(no_modify || is_inode_reached(irec, j));
+ for (agno = 0; agno < mp->m_sb.sb_agcount; agno++)
+ queue_work(&wq, do_link_updates, agno, NULL);
- nrefs = num_inode_references(irec, j);
- ASSERT(no_modify || nrefs > 0);
+ destroy_work_queue(&wq);
- if (get_inode_disk_nlinks(irec, j) != nrefs)
- update_inode_nlinks(mp,
- XFS_AGINO_TO_INO(mp, i,
- irec->ino_startnum + j),
- nrefs);
- }
- irec = next_ino_rec(irec);
- }
- }
+ print_final_rpt();
}
diff --git a/repair/progress.c b/repair/progress.c
index 418b803..2a09b23 100644
--- a/repair/progress.c
+++ b/repair/progress.c
@@ -75,9 +75,9 @@ progress_rpt_t progress_rpt_reports[] = {
{FMT2, N_("moving disconnected inodes to lost+found"), /* 12 */
&rpt_fmts[FMT2], &rpt_types[TYPE_INODE]},
{FMT1, N_("verify and correct link counts"), /* 13 */
- &rpt_fmts[FMT1], &rpt_types[TYPE_INODE]},
+ &rpt_fmts[FMT1], &rpt_types[TYPE_AG]},
{FMT1, N_("verify link counts"), /* 14 */
- &rpt_fmts[FMT1], &rpt_types[TYPE_INODE]}
+ &rpt_fmts[FMT1], &rpt_types[TYPE_AG]}
};
pthread_t report_thread;
diff --git a/repair/protos.h b/repair/protos.h
index 9d5a2a6..b113aca 100644
--- a/repair/protos.h
+++ b/repair/protos.h
@@ -50,7 +50,7 @@ void phase3(struct xfs_mount *);
void phase4(struct xfs_mount *);
void phase5(struct xfs_mount *);
void phase6(struct xfs_mount *);
-void phase7(struct xfs_mount *);
+void phase7(struct xfs_mount *, int);
int verify_set_agheader(struct xfs_mount *, struct xfs_buf *,
struct xfs_sb *, struct xfs_agf *, struct xfs_agi *,
diff --git a/repair/xfs_repair.c b/repair/xfs_repair.c
index 3eaced4..fcdb212 100644
--- a/repair/xfs_repair.c
+++ b/repair/xfs_repair.c
@@ -893,7 +893,7 @@ main(int argc, char **argv)
phase6(mp);
timestamp(PHASE_END, 6, NULL);
- phase7(mp);
+ phase7(mp, phase2_threads);
timestamp(PHASE_END, 7, NULL);
} else {
do_warn(
--
2.5.0
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 20+ messages in thread* Re: [PATCH 1/7] repair: parallelise phase 7
2016-02-04 23:05 ` [PATCH 1/7] repair: parallelise phase 7 Dave Chinner
@ 2016-02-08 8:55 ` Christoph Hellwig
2016-02-09 0:12 ` Dave Chinner
0 siblings, 1 reply; 20+ messages in thread
From: Christoph Hellwig @ 2016-02-08 8:55 UTC (permalink / raw)
To: Dave Chinner; +Cc: xfs
> + irec = findfirst_inode_rec(agno);
> +
> + while (irec != NULL) {
> + for (j = 0; j < XFS_INODES_PER_CHUNK; j++) {
> + ASSERT(is_inode_confirmed(irec, j));
> +
> + if (is_inode_free(irec, j))
> + continue;
> +
> + ASSERT(no_modify || is_inode_reached(irec, j));
> +
> + nrefs = num_inode_references(irec, j);
> + ASSERT(no_modify || nrefs > 0);
> +
> + if (get_inode_disk_nlinks(irec, j) != nrefs)
> + update_inode_nlinks(wq->mp,
> + XFS_AGINO_TO_INO(wq->mp, agno,
> + irec->ino_startnum + j),
> + nrefs);
> + }
> + irec = next_ino_rec(irec);
Wouldn't this loop be slightly cleaner as:
for (irec = findfirst_inode_rec(agno);
irec;
irec = next_ino_rec(irec)) {
?
Otherwise looks fine,
Reviewed-by: Christoph Hellwig <hch@lst.de>
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 1/7] repair: parallelise phase 7
2016-02-08 8:55 ` Christoph Hellwig
@ 2016-02-09 0:12 ` Dave Chinner
0 siblings, 0 replies; 20+ messages in thread
From: Dave Chinner @ 2016-02-09 0:12 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: xfs
On Mon, Feb 08, 2016 at 12:55:55AM -0800, Christoph Hellwig wrote:
> > + irec = findfirst_inode_rec(agno);
> > +
> > + while (irec != NULL) {
> > + for (j = 0; j < XFS_INODES_PER_CHUNK; j++) {
> > + ASSERT(is_inode_confirmed(irec, j));
> > +
> > + if (is_inode_free(irec, j))
> > + continue;
> > +
> > + ASSERT(no_modify || is_inode_reached(irec, j));
> > +
> > + nrefs = num_inode_references(irec, j);
> > + ASSERT(no_modify || nrefs > 0);
> > +
> > + if (get_inode_disk_nlinks(irec, j) != nrefs)
> > + update_inode_nlinks(wq->mp,
> > + XFS_AGINO_TO_INO(wq->mp, agno,
> > + irec->ino_startnum + j),
> > + nrefs);
> > + }
> > + irec = next_ino_rec(irec);
>
> Wouldn't this look be slightly cleaner as:
>
> for (irec = findfirst_inode_rec(agno);
> irec;
> irec = next_ino_rec(irec)) {
Yup, done.
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 2/7] repair: parallelise uncertain inode processing in phase 3
2016-02-04 23:05 [PATCH 1/7 v2] repair: big broken filesystems cause pain Dave Chinner
2016-02-04 23:05 ` [PATCH 1/7] repair: parallelise phase 7 Dave Chinner
@ 2016-02-04 23:05 ` Dave Chinner
2016-02-08 8:58 ` Christoph Hellwig
2016-02-04 23:05 ` [PATCH 3/7] libxfs: directory node splitting does not have an extra block Dave Chinner
` (4 subsequent siblings)
6 siblings, 1 reply; 20+ messages in thread
From: Dave Chinner @ 2016-02-04 23:05 UTC (permalink / raw)
To: xfs
From: Dave Chinner <dchinner@redhat.com>
This can take a long time when there are millions of uncertain inodes in badly
broken filesystems. The processing is per-ag, the data structures are all
per-ag, and the load is mostly CPU time spent checking CRCs on each
uncertain inode. Parallelising reduced the runtime of this phase on a badly
broken filesystem from ~30 minutes to under 5 minutes.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
repair/phase3.c | 59 ++++++++++++++++++++++++++++++++++++++++++++---------
repair/protos.h | 2 +-
repair/xfs_repair.c | 2 +-
3 files changed, 51 insertions(+), 12 deletions(-)
diff --git a/repair/phase3.c b/repair/phase3.c
index 76c9440..0890a27 100644
--- a/repair/phase3.c
+++ b/repair/phase3.c
@@ -28,6 +28,7 @@
#include "dinode.h"
#include "progress.h"
#include "bmap.h"
+#include "threads.h"
static void
process_agi_unlinked(
@@ -87,10 +88,33 @@ process_ags(
do_inode_prefetch(mp, ag_stride, process_ag_func, false, false);
}
+static void
+do_uncertain_aginodes(
+ work_queue_t *wq,
+ xfs_agnumber_t agno,
+ void *arg)
+{
+ int *count = arg;
+
+ *count = process_uncertain_aginodes(wq->mp, agno);
+
+#ifdef XR_INODE_TRACE
+ fprintf(stderr,
+ "\t\t phase 3 - ag %d process_uncertain_inodes returns %d\n",
+ *count, j);
+#endif
+
+ PROG_RPT_INC(prog_rpt_done[agno], 1);
+}
+
void
-phase3(xfs_mount_t *mp)
+phase3(
+ struct xfs_mount *mp,
+ int scan_threads)
{
- int i, j;
+ int i, j;
+ int *counts;
+ work_queue_t wq;
do_log(_("Phase 3 - for each AG...\n"));
if (!no_modify)
@@ -129,20 +153,35 @@ phase3(xfs_mount_t *mp)
*/
do_log(_(" - process newly discovered inodes...\n"));
set_progress_msg(PROG_FMT_NEW_INODES, (__uint64_t) glob_agcount);
+
+ counts = calloc(sizeof(*counts), mp->m_sb.sb_agcount);
+ if (!counts) {
+ do_abort(_("no memory for uncertain inode counts\n"));
+ return;
+ }
+
do {
/*
* have to loop until no ag has any uncertain
* inodes
*/
j = 0;
- for (i = 0; i < mp->m_sb.sb_agcount; i++) {
- j += process_uncertain_aginodes(mp, i);
-#ifdef XR_INODE_TRACE
- fprintf(stderr,
- "\t\t phase 3 - process_uncertain_inodes returns %d\n", j);
-#endif
- PROG_RPT_INC(prog_rpt_done[i], 1);
- }
+ memset(counts, 0, mp->m_sb.sb_agcount * sizeof(*counts));
+
+ create_work_queue(&wq, mp, scan_threads);
+
+ for (i = 0; i < mp->m_sb.sb_agcount; i++)
+ queue_work(&wq, do_uncertain_aginodes, i, &counts[i]);
+
+ destroy_work_queue(&wq);
+
+ /* tally up the counts */
+ for (i = 0; i < mp->m_sb.sb_agcount; i++)
+ j += counts[i];
+
} while (j != 0);
+
+ free(counts);
+
print_final_rpt();
}
diff --git a/repair/protos.h b/repair/protos.h
index b113aca..0290420 100644
--- a/repair/protos.h
+++ b/repair/protos.h
@@ -46,7 +46,7 @@ void thread_init(void);
void phase1(struct xfs_mount *);
void phase2(struct xfs_mount *, int);
-void phase3(struct xfs_mount *);
+void phase3(struct xfs_mount *, int);
void phase4(struct xfs_mount *);
void phase5(struct xfs_mount *);
void phase6(struct xfs_mount *);
diff --git a/repair/xfs_repair.c b/repair/xfs_repair.c
index fcdb212..5d5f3aa 100644
--- a/repair/xfs_repair.c
+++ b/repair/xfs_repair.c
@@ -871,7 +871,7 @@ main(int argc, char **argv)
if (do_prefetch)
init_prefetch(mp);
- phase3(mp);
+ phase3(mp, phase2_threads);
timestamp(PHASE_END, 3, NULL);
phase4(mp);
--
2.5.0
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 20+ messages in thread* [PATCH 3/7] libxfs: directory node splitting does not have an extra block
2016-02-04 23:05 [PATCH 1/7 v2] repair: big broken filesystems cause pain Dave Chinner
2016-02-04 23:05 ` [PATCH 1/7] repair: parallelise phase 7 Dave Chinner
2016-02-04 23:05 ` [PATCH 2/7] repair: parallelise uncertain inode processing in phase 3 Dave Chinner
@ 2016-02-04 23:05 ` Dave Chinner
2016-02-05 14:20 ` Brian Foster
2016-02-08 9:00 ` Christoph Hellwig
2016-02-04 23:05 ` [PATCH 4/7] libxfs: don't discard dirty buffers Dave Chinner
` (3 subsequent siblings)
6 siblings, 2 replies; 20+ messages in thread
From: Dave Chinner @ 2016-02-04 23:05 UTC (permalink / raw)
To: xfs
From: Dave Chinner <dchinner@redhat.com>
xfs_da3_split() has to handle all three versions of the
directory/attribute btree structure. The attr tree is v1, the dir
tre is v2 or v3. The main difference between the v1 and v2/3 trees
is the way tree nodes are split - in the v1 tree we can require a
double split to occur because the object to be inserted may be
larger than the space made by splitting a leaf. In this case we need
to do a double split - one to split the full leaf, then another to
allocate an empty leaf block in the correct location for the new
entry. This does not happen with dir (v2/v3) formats as the objects
being inserted are always guaranteed to fit into the new space in
the split blocks.
Indeed, for directories there *may* be an extra block on this buffer
pointer. However, it's guaranteed not to be a leaf block (i.e. a
directory data block) - the directory code only ever places hash
index or free space blocks in this pointer (as a cursor of
sorts), and so to use it as a directory data block will immediately
corrupt the directory.
The problem is that the code assumes that there may be extra blocks
that we need to link into the tree once we've split the root, but
this is not true for either dir or attr trees, because the extra
attr block is always consumed by the last node split before we split
the root. Hence the linking in an extra block is always wrong at the
root split level, and this manifests itself in repair as a directory
corruption in a repaired directory, leaving the directory rebuild
incomplete.
This is a dir v2 zero-day bug - it was in the initial dir v2 commit
that was made back in February 1998.
Fix this by ensuring the linking of the blocks after the root split
never tries to make use of the extra blocks that may be held in the
cursor. They are held there for other purposes and should never be
touched by the root splitting code.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
libxfs/xfs_da_btree.c | 59 +++++++++++++++++++++++++--------------------------
1 file changed, 29 insertions(+), 30 deletions(-)
diff --git a/libxfs/xfs_da_btree.c b/libxfs/xfs_da_btree.c
index bf5fe21..25072c7 100644
--- a/libxfs/xfs_da_btree.c
+++ b/libxfs/xfs_da_btree.c
@@ -351,7 +351,6 @@ xfs_da3_split(
struct xfs_da_state_blk *newblk;
struct xfs_da_state_blk *addblk;
struct xfs_da_intnode *node;
- struct xfs_buf *bp;
int max;
int action = 0;
int error;
@@ -392,7 +391,9 @@ xfs_da3_split(
break;
}
/*
- * Entry wouldn't fit, split the leaf again.
+ * Entry wouldn't fit, split the leaf again. The new
+ * extrablk will be consumed by xfs_da3_node_split if
+ * the node is split.
*/
state->extravalid = 1;
if (state->inleaf) {
@@ -441,6 +442,14 @@ xfs_da3_split(
return 0;
/*
+ * xfs_da3_node_split() should have consumed any extra blocks we added
+ * during a double leaf split in the attr fork. This is guaranteed as
+ * we can't be here if the attr fork only has a single leaf block.
+ */
+ ASSERT(state->extravalid == 0 ||
+ state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
+
+ /*
* Split the root node.
*/
ASSERT(state->path.active == 0);
@@ -452,43 +461,33 @@ xfs_da3_split(
}
/*
- * Update pointers to the node which used to be block 0 and
- * just got bumped because of the addition of a new root node.
- * There might be three blocks involved if a double split occurred,
- * and the original block 0 could be at any position in the list.
+ * Update pointers to the node which used to be block 0 and just got
+ * bumped because of the addition of a new root node. Note that the
+ * original block 0 could be at any position in the list of blocks in
+ * the tree.
*
- * Note: the magic numbers and sibling pointers are in the same
- * physical place for both v2 and v3 headers (by design). Hence it
- * doesn't matter which version of the xfs_da_intnode structure we use
- * here as the result will be the same using either structure.
+ * Note: the magic numbers and sibling pointers are in the same physical
+ * place for both v2 and v3 headers (by design). Hence it doesn't matter
+ * which version of the xfs_da_intnode structure we use here as the
+ * result will be the same using either structure.
*/
node = oldblk->bp->b_addr;
if (node->hdr.info.forw) {
- if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) {
- bp = addblk->bp;
- } else {
- ASSERT(state->extravalid);
- bp = state->extrablk.bp;
- }
- node = bp->b_addr;
+ ASSERT(be32_to_cpu(node->hdr.info.forw) == addblk->blkno);
+ node = addblk->bp->b_addr;
node->hdr.info.back = cpu_to_be32(oldblk->blkno);
- xfs_trans_log_buf(state->args->trans, bp,
- XFS_DA_LOGRANGE(node, &node->hdr.info,
- sizeof(node->hdr.info)));
+ xfs_trans_log_buf(state->args->trans, addblk->bp,
+ XFS_DA_LOGRANGE(node, &node->hdr.info,
+ sizeof(node->hdr.info)));
}
node = oldblk->bp->b_addr;
if (node->hdr.info.back) {
- if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) {
- bp = addblk->bp;
- } else {
- ASSERT(state->extravalid);
- bp = state->extrablk.bp;
- }
- node = bp->b_addr;
+ ASSERT(be32_to_cpu(node->hdr.info.back) == addblk->blkno);
+ node = addblk->bp->b_addr;
node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
- xfs_trans_log_buf(state->args->trans, bp,
- XFS_DA_LOGRANGE(node, &node->hdr.info,
- sizeof(node->hdr.info)));
+ xfs_trans_log_buf(state->args->trans, addblk->bp,
+ XFS_DA_LOGRANGE(node, &node->hdr.info,
+ sizeof(node->hdr.info)));
}
addblk->bp = NULL;
return 0;
--
2.5.0
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 20+ messages in thread* Re: [PATCH 3/7] libxfs: directory node splitting does not have an extra block
2016-02-04 23:05 ` [PATCH 3/7] libxfs: directory node splitting does not have an extra block Dave Chinner
@ 2016-02-05 14:20 ` Brian Foster
2016-02-08 9:00 ` Christoph Hellwig
1 sibling, 0 replies; 20+ messages in thread
From: Brian Foster @ 2016-02-05 14:20 UTC (permalink / raw)
To: Dave Chinner; +Cc: xfs
On Fri, Feb 05, 2016 at 10:05:04AM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
>
> xfs_da3_split() has to handle all three versions of the
> directory/attribute btree structure. The attr tree is v1, the dir
> tre is v2 or v3. The main difference between the v1 and v2/3 trees
> is the way tree nodes are split - in the v1 tree we can require a
> double split to occur because the object to be inserted may be
> larger than the space made by splitting a leaf. In this case we need
> to do a double split - one to split the full leaf, then another to
> allocate an empty leaf block in the correct location for the new
> entry. This does not happen with dir (v2/v3) formats as the objects
> being inserted are always guaranteed to fit into the new space in
> the split blocks.
>
> Indeed, for directories they *may* be an extra block on this buffer
> pointer. However, it's guaranteed not to be a leaf block (i.e. a
> directory data block) - the directory code only ever places hash
> index or free space blocks in this pointer (as a cursor of
> sorts), and so to use it as a directory data block will immediately
> corrupt the directory.
>
> The problem is that the code assumes that there may be extra blocks
> that we need to link into the tree once we've split the root, but
> this is not true for either dir or attr trees, because the extra
> attr block is always consumed by the last node split before we split
> the root. Hence the linking in an extra block is always wrong at the
> root split level, and this manifests itself in repair as a directory
> corruption in a repaired directory, leaving the directory rebuild
> incomplete.
>
> This is a dir v2 zero-day bug - it was in the initial dir v2 commit
> that was made back in February 1998.
>
> Fix this by ensuring the linking of the blocks after the root split
> never tries to make use of the extra blocks that may be held in the
> cursor. They are held there for other purposes and should never be
> touched by the root splitting code.
>
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
Reviewed-by: Brian Foster <bfoster@redhat.com>
> libxfs/xfs_da_btree.c | 59 +++++++++++++++++++++++++--------------------------
> 1 file changed, 29 insertions(+), 30 deletions(-)
>
> diff --git a/libxfs/xfs_da_btree.c b/libxfs/xfs_da_btree.c
> index bf5fe21..25072c7 100644
> --- a/libxfs/xfs_da_btree.c
> +++ b/libxfs/xfs_da_btree.c
> @@ -351,7 +351,6 @@ xfs_da3_split(
> struct xfs_da_state_blk *newblk;
> struct xfs_da_state_blk *addblk;
> struct xfs_da_intnode *node;
> - struct xfs_buf *bp;
> int max;
> int action = 0;
> int error;
> @@ -392,7 +391,9 @@ xfs_da3_split(
> break;
> }
> /*
> - * Entry wouldn't fit, split the leaf again.
> + * Entry wouldn't fit, split the leaf again. The new
> + * extrablk will be consumed by xfs_da3_node_split if
> + * the node is split.
> */
> state->extravalid = 1;
> if (state->inleaf) {
> @@ -441,6 +442,14 @@ xfs_da3_split(
> return 0;
>
> /*
> + * xfs_da3_node_split() should have consumed any extra blocks we added
> + * during a double leaf split in the attr fork. This is guaranteed as
> + * we can't be here if the attr fork only has a single leaf block.
> + */
> + ASSERT(state->extravalid == 0 ||
> + state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
> +
> + /*
> * Split the root node.
> */
> ASSERT(state->path.active == 0);
> @@ -452,43 +461,33 @@ xfs_da3_split(
> }
>
> /*
> - * Update pointers to the node which used to be block 0 and
> - * just got bumped because of the addition of a new root node.
> - * There might be three blocks involved if a double split occurred,
> - * and the original block 0 could be at any position in the list.
> + * Update pointers to the node which used to be block 0 and just got
> + * bumped because of the addition of a new root node. Note that the
> + * original block 0 could be at any position in the list of blocks in
> + * the tree.
> *
> - * Note: the magic numbers and sibling pointers are in the same
> - * physical place for both v2 and v3 headers (by design). Hence it
> - * doesn't matter which version of the xfs_da_intnode structure we use
> - * here as the result will be the same using either structure.
> + * Note: the magic numbers and sibling pointers are in the same physical
> + * place for both v2 and v3 headers (by design). Hence it doesn't matter
> + * which version of the xfs_da_intnode structure we use here as the
> + * result will be the same using either structure.
> */
> node = oldblk->bp->b_addr;
> if (node->hdr.info.forw) {
> - if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) {
> - bp = addblk->bp;
> - } else {
> - ASSERT(state->extravalid);
> - bp = state->extrablk.bp;
> - }
> - node = bp->b_addr;
> + ASSERT(be32_to_cpu(node->hdr.info.forw) == addblk->blkno);
> + node = addblk->bp->b_addr;
> node->hdr.info.back = cpu_to_be32(oldblk->blkno);
> - xfs_trans_log_buf(state->args->trans, bp,
> - XFS_DA_LOGRANGE(node, &node->hdr.info,
> - sizeof(node->hdr.info)));
> + xfs_trans_log_buf(state->args->trans, addblk->bp,
> + XFS_DA_LOGRANGE(node, &node->hdr.info,
> + sizeof(node->hdr.info)));
> }
> node = oldblk->bp->b_addr;
> if (node->hdr.info.back) {
> - if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) {
> - bp = addblk->bp;
> - } else {
> - ASSERT(state->extravalid);
> - bp = state->extrablk.bp;
> - }
> - node = bp->b_addr;
> + ASSERT(be32_to_cpu(node->hdr.info.back) == addblk->blkno);
> + node = addblk->bp->b_addr;
> node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
> - xfs_trans_log_buf(state->args->trans, bp,
> - XFS_DA_LOGRANGE(node, &node->hdr.info,
> - sizeof(node->hdr.info)));
> + xfs_trans_log_buf(state->args->trans, addblk->bp,
> + XFS_DA_LOGRANGE(node, &node->hdr.info,
> + sizeof(node->hdr.info)));
> }
> addblk->bp = NULL;
> return 0;
> --
> 2.5.0
>
> _______________________________________________
> xfs mailing list
> xfs@oss.sgi.com
> http://oss.sgi.com/mailman/listinfo/xfs
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 3/7] libxfs: directory node splitting does not have an extra block
2016-02-04 23:05 ` [PATCH 3/7] libxfs: directory node splitting does not have an extra block Dave Chinner
2016-02-05 14:20 ` Brian Foster
@ 2016-02-08 9:00 ` Christoph Hellwig
1 sibling, 0 replies; 20+ messages in thread
From: Christoph Hellwig @ 2016-02-08 9:00 UTC (permalink / raw)
To: Dave Chinner; +Cc: xfs
Looks fine,
Reviewed-by: Christoph Hellwig <hch@lst.de>
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 4/7] libxfs: don't discard dirty buffers
2016-02-04 23:05 [PATCH 1/7 v2] repair: big broken filesystems cause pain Dave Chinner
` (2 preceding siblings ...)
2016-02-04 23:05 ` [PATCH 3/7] libxfs: directory node splitting does not have an extra block Dave Chinner
@ 2016-02-04 23:05 ` Dave Chinner
2016-02-08 9:03 ` Christoph Hellwig
2016-02-04 23:05 ` [PATCH 5/7] libxfs: don't repeatedly shake unwritable buffers Dave Chinner
` (2 subsequent siblings)
6 siblings, 1 reply; 20+ messages in thread
From: Dave Chinner @ 2016-02-04 23:05 UTC (permalink / raw)
To: xfs
From: Dave Chinner <dchinner@redhat.com>
When we release a buffer from the cache, if it is dirty we write it
to disk then put the buffer on the free list for recycling. However,
if the write fails (e.g. verifier failure due to unfixed corruption) we
effectively throw the buffer and its contents away. This causes all
sorts of problems for xfs_repair as it then re-reads the buffer from
disk on the next access and hence loses all the corrections that had
previously been made, resulting in tripping over corruptions in code
that assumes the corruptions have already been fixed/flagged in the
buffer it receives.
To fix this, we have to make the cache aware that writes can fail,
and keep the buffer in cache when writes fail. Hence we have to add
an explicit error notification to the flush operation, and we need
to do that before we release the buffer to the free list. This also
means that we need to remove the writeback code from the release
mechanisms, instead replacing them with assertions that the buffers
are already clean.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
include/cache.h | 2 +-
libxfs/cache.c | 15 ++++++++++++++-
libxfs/rdwr.c | 44 +++++++++++++++++++++++++++-----------------
3 files changed, 42 insertions(+), 19 deletions(-)
diff --git a/include/cache.h b/include/cache.h
index 0a84c69..87826be 100644
--- a/include/cache.h
+++ b/include/cache.h
@@ -64,7 +64,7 @@ typedef void *cache_key_t;
typedef void (*cache_walk_t)(struct cache_node *);
typedef struct cache_node * (*cache_node_alloc_t)(cache_key_t);
-typedef void (*cache_node_flush_t)(struct cache_node *);
+typedef int (*cache_node_flush_t)(struct cache_node *);
typedef void (*cache_node_relse_t)(struct cache_node *);
typedef unsigned int (*cache_node_hash_t)(cache_key_t, unsigned int,
unsigned int);
diff --git a/libxfs/cache.c b/libxfs/cache.c
index 4753a1d..a48ebd9 100644
--- a/libxfs/cache.c
+++ b/libxfs/cache.c
@@ -219,6 +219,12 @@ cache_shake(
if (pthread_mutex_trylock(&node->cn_mutex) != 0)
continue;
+ /* can't release dirty objects */
+ if (cache->flush(node)) {
+ pthread_mutex_unlock(&node->cn_mutex);
+ continue;
+ }
+
hash = cache->c_hash + node->cn_hashidx;
if (pthread_mutex_trylock(&hash->ch_mutex) != 0) {
pthread_mutex_unlock(&node->cn_mutex);
@@ -311,6 +317,13 @@ __cache_node_purge(
pthread_mutex_unlock(&node->cn_mutex);
return count;
}
+
+ /* can't purge dirty objects */
+ if (cache->flush(node)) {
+ pthread_mutex_unlock(&node->cn_mutex);
+ return 1;
+ }
+
mru = &cache->c_mrus[node->cn_priority];
pthread_mutex_lock(&mru->cm_mutex);
list_del_init(&node->cn_mru);
@@ -321,7 +334,7 @@ __cache_node_purge(
pthread_mutex_destroy(&node->cn_mutex);
list_del_init(&node->cn_hash);
cache->relse(node);
- return count;
+ return 0;
}
/*
diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c
index 7b23394..37162cd 100644
--- a/libxfs/rdwr.c
+++ b/libxfs/rdwr.c
@@ -659,6 +659,8 @@ __libxfs_getbufr(int blen)
bp = kmem_zone_zalloc(xfs_buf_zone, 0);
pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
bp->b_ops = NULL;
+ if (bp->b_flags & LIBXFS_B_DIRTY)
+ fprintf(stderr, "found dirty buffer (bulk) on free list!");
return bp;
}
@@ -1223,23 +1225,26 @@ libxfs_iomove(xfs_buf_t *bp, uint boff, int len, void *data, int flags)
}
static void
-libxfs_brelse(struct cache_node *node)
+libxfs_brelse(
+ struct cache_node *node)
{
- xfs_buf_t *bp = (xfs_buf_t *)node;
+ struct xfs_buf *bp = (struct xfs_buf *)node;
- if (bp != NULL) {
- if (bp->b_flags & LIBXFS_B_DIRTY)
- libxfs_writebufr(bp);
- pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
- list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list);
- pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
- }
+ if (!bp)
+ return;
+ if (bp->b_flags & LIBXFS_B_DIRTY)
+ fprintf(stderr,
+ "releasing dirty buffer to free list!");
+
+ pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
+ list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list);
+ pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
}
static unsigned int
libxfs_bulkrelse(
- struct cache *cache,
- struct list_head *list)
+ struct cache *cache,
+ struct list_head *list)
{
xfs_buf_t *bp;
int count = 0;
@@ -1249,7 +1254,8 @@ libxfs_bulkrelse(
list_for_each_entry(bp, list, b_node.cn_mru) {
if (bp->b_flags & LIBXFS_B_DIRTY)
- libxfs_writebufr(bp);
+ fprintf(stderr,
+ "releasing dirty buffer (bulk) to free list!");
count++;
}
@@ -1260,18 +1266,22 @@ libxfs_bulkrelse(
return count;
}
-static void
-libxfs_bflush(struct cache_node *node)
+static int
+libxfs_bflush(
+ struct cache_node *node)
{
- xfs_buf_t *bp = (xfs_buf_t *)node;
+ struct xfs_buf *bp = (struct xfs_buf *)node;
- if ((bp != NULL) && (bp->b_flags & LIBXFS_B_DIRTY))
- libxfs_writebufr(bp);
+ if (bp->b_flags & LIBXFS_B_DIRTY)
+ return libxfs_writebufr(bp);
+ return 0;
}
void
libxfs_putbufr(xfs_buf_t *bp)
{
+ if (bp->b_flags & LIBXFS_B_DIRTY)
+ libxfs_writebufr(bp);
libxfs_brelse((struct cache_node *)bp);
}
--
2.5.0
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 20+ messages in thread* [PATCH 5/7] libxfs: don't repeatedly shake unwritable buffers
2016-02-04 23:05 [PATCH 1/7 v2] repair: big broken filesystems cause pain Dave Chinner
` (3 preceding siblings ...)
2016-02-04 23:05 ` [PATCH 4/7] libxfs: don't discard dirty buffers Dave Chinner
@ 2016-02-04 23:05 ` Dave Chinner
2016-02-08 9:03 ` Christoph Hellwig
2016-02-04 23:05 ` [PATCH 6/7] libxfs: keep unflushable buffers off the cache MRUs Dave Chinner
2016-02-04 23:05 ` [PATCH 7/7] libxfs: reset dirty buffer priority on lookup Dave Chinner
6 siblings, 1 reply; 20+ messages in thread
From: Dave Chinner @ 2016-02-04 23:05 UTC (permalink / raw)
To: xfs
From: Dave Chinner <dchinner@redhat.com>
Now that we try to write dirty buffers before we release them, we
can get buildup of unwritable dirty buffers on the LRU lists. This
results in the cache shaker repeatedly trying to write out these
buffers every time the cache fills up. This results in more
corruption warnings, and takes up a lot of time while reclaiming
nothing. This can effectively livelock the processing parts of phase
4.
Fix this by not trying to write buffers with corruption errors on
them. These errors will get cleared when the buffer is re-read and
fixed and then marked dirty again. At which point, we'll be able to
write them and so the cache can reclaim them successfully.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
libxfs/rdwr.c | 27 ++++++++++++++++-----------
1 file changed, 16 insertions(+), 11 deletions(-)
diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c
index 37162cd..2e298c2 100644
--- a/libxfs/rdwr.c
+++ b/libxfs/rdwr.c
@@ -1103,7 +1103,6 @@ int
libxfs_writebufr(xfs_buf_t *bp)
{
int fd = libxfs_device_to_fd(bp->b_target->dev);
- int error = 0;
/*
* we never write buffers that are marked stale. This indicates they
@@ -1134,7 +1133,7 @@ libxfs_writebufr(xfs_buf_t *bp)
}
if (!(bp->b_flags & LIBXFS_B_DISCONTIG)) {
- error = __write_buf(fd, bp->b_addr, bp->b_bcount,
+ bp->b_error = __write_buf(fd, bp->b_addr, bp->b_bcount,
LIBXFS_BBTOOFF64(bp->b_bn), bp->b_flags);
} else {
int i;
@@ -1144,11 +1143,10 @@ libxfs_writebufr(xfs_buf_t *bp)
off64_t offset = LIBXFS_BBTOOFF64(bp->b_map[i].bm_bn);
int len = BBTOB(bp->b_map[i].bm_len);
- error = __write_buf(fd, buf, len, offset, bp->b_flags);
- if (error) {
- bp->b_error = error;
+ bp->b_error = __write_buf(fd, buf, len, offset,
+ bp->b_flags);
+ if (bp->b_error)
break;
- }
buf += len;
}
}
@@ -1157,14 +1155,14 @@ libxfs_writebufr(xfs_buf_t *bp)
printf("%lx: %s: wrote %u bytes, blkno=%llu(%llu), %p, error %d\n",
pthread_self(), __FUNCTION__, bp->b_bcount,
(long long)LIBXFS_BBTOOFF64(bp->b_bn),
- (long long)bp->b_bn, bp, error);
+ (long long)bp->b_bn, bp, bp->b_error);
#endif
- if (!error) {
+ if (!bp->b_error) {
bp->b_flags |= LIBXFS_B_UPTODATE;
bp->b_flags &= ~(LIBXFS_B_DIRTY | LIBXFS_B_EXIT |
LIBXFS_B_UNCHECKED);
}
- return error;
+ return bp->b_error;
}
int
@@ -1266,15 +1264,22 @@ libxfs_bulkrelse(
return count;
}
+/*
+ * When a buffer is marked dirty, the error is cleared. Hence if we are trying
+ * to flush a buffer prior to cache reclaim that has an error on it it means
+ * we've already tried to flush it and it failed. Prevent repeated corruption
+ * errors from being reported by skipping such buffers - when the corruption is
+ * fixed the buffer will be marked dirty again and we can write it again.
+ */
static int
libxfs_bflush(
struct cache_node *node)
{
struct xfs_buf *bp = (struct xfs_buf *)node;
- if (bp->b_flags & LIBXFS_B_DIRTY)
+ if (!bp->b_error && bp->b_flags & LIBXFS_B_DIRTY)
return libxfs_writebufr(bp);
- return 0;
+ return bp->b_error;
}
void
--
2.5.0
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 20+ messages in thread* [PATCH 6/7] libxfs: keep unflushable buffers off the cache MRUs
2016-02-04 23:05 [PATCH 1/7 v2] repair: big broken filesystems cause pain Dave Chinner
` (4 preceding siblings ...)
2016-02-04 23:05 ` [PATCH 5/7] libxfs: don't repeatedly shake unwritable buffers Dave Chinner
@ 2016-02-04 23:05 ` Dave Chinner
2016-02-05 14:22 ` Brian Foster
2016-02-08 10:06 ` Christoph Hellwig
2016-02-04 23:05 ` [PATCH 7/7] libxfs: reset dirty buffer priority on lookup Dave Chinner
6 siblings, 2 replies; 20+ messages in thread
From: Dave Chinner @ 2016-02-04 23:05 UTC (permalink / raw)
To: xfs
From: Dave Chinner <dchinner@redhat.com>
There's no point trying to free buffers that are dirty and return
errors on flush as we have to keep them around until the corruption
is fixed. Hence if we fail to flush an inode during a cache shake,
move the buffer to a special dirty MRU list that the cache does not
shake. This prevents memory pressure from seeing these buffers, but
allows subsequent cache lookups to still find them through the hash.
This ensures we don't waste huge amounts of CPU trying to flush and
reclaim buffers that cannot be flushed or reclaimed.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
include/cache.h | 3 ++-
libxfs/cache.c | 78 ++++++++++++++++++++++++++++++++++++++++++---------------
2 files changed, 60 insertions(+), 21 deletions(-)
diff --git a/include/cache.h b/include/cache.h
index 87826be..55761d3 100644
--- a/include/cache.h
+++ b/include/cache.h
@@ -51,6 +51,7 @@ enum {
#define CACHE_BASE_PRIORITY 0
#define CACHE_PREFETCH_PRIORITY 8
#define CACHE_MAX_PRIORITY 15
+#define CACHE_DIRTY_PRIORITY (CACHE_MAX_PRIORITY + 1)
/*
* Simple, generic implementation of a cache (arbitrary data).
@@ -115,7 +116,7 @@ struct cache {
unsigned int c_hashsize; /* hash bucket count */
unsigned int c_hashshift; /* hash key shift */
struct cache_hash *c_hash; /* hash table buckets */
- struct cache_mru c_mrus[CACHE_MAX_PRIORITY + 1];
+ struct cache_mru c_mrus[CACHE_DIRTY_PRIORITY + 1];
unsigned long long c_misses; /* cache misses */
unsigned long long c_hits; /* cache hits */
unsigned int c_max; /* max nodes ever used */
diff --git a/libxfs/cache.c b/libxfs/cache.c
index a48ebd9..d4b4a4e 100644
--- a/libxfs/cache.c
+++ b/libxfs/cache.c
@@ -81,7 +81,7 @@ cache_init(
pthread_mutex_init(&cache->c_hash[i].ch_mutex, NULL);
}
- for (i = 0; i <= CACHE_MAX_PRIORITY; i++) {
+ for (i = 0; i <= CACHE_DIRTY_PRIORITY; i++) {
list_head_init(&cache->c_mrus[i].cm_list);
cache->c_mrus[i].cm_count = 0;
pthread_mutex_init(&cache->c_mrus[i].cm_mutex, NULL);
@@ -154,7 +154,7 @@ cache_destroy(
list_head_destroy(&cache->c_hash[i].ch_list);
pthread_mutex_destroy(&cache->c_hash[i].ch_mutex);
}
- for (i = 0; i <= CACHE_MAX_PRIORITY; i++) {
+ for (i = 0; i <= CACHE_DIRTY_PRIORITY; i++) {
list_head_destroy(&cache->c_mrus[i].cm_list);
pthread_mutex_destroy(&cache->c_mrus[i].cm_mutex);
}
@@ -183,15 +183,45 @@ cache_generic_bulkrelse(
}
/*
- * We've hit the limit on cache size, so we need to start reclaiming
- * nodes we've used. The MRU specified by the priority is shaken.
- * Returns new priority at end of the call (in case we call again).
+ * Park unflushable nodes on their own special MRU so that cache_shake() doesn't
+ * end up repeatedly scanning them in the futile attempt to clean them before
+ * reclaim.
+ */
+static void
+cache_move_to_dirty_mru(
+ struct cache *cache,
+ struct cache_node *node)
+{
+ struct cache_mru *mru;
+
+ mru = &cache->c_mrus[CACHE_DIRTY_PRIORITY];
+
+ pthread_mutex_lock(&mru->cm_mutex);
+ node->cn_priority = CACHE_DIRTY_PRIORITY;
+ list_move(&node->cn_mru, &mru->cm_list);
+ mru->cm_count++;
+ pthread_mutex_unlock(&mru->cm_mutex);
+}
+
+/*
+ * We've hit the limit on cache size, so we need to start reclaiming nodes we've
+ * used. The MRU specified by the priority is shaken. Returns new priority at
+ * end of the call (in case we call again). We are not allowed to reclaim dirty
+ * objects, so we have to flush them first. If flushing fails, we move them to
+ * the "dirty, unreclaimable" list.
+ *
+ * Hence we skip priorities > CACHE_MAX_PRIORITY unless "purge" is set as we
+ * park unflushable (and hence unreclaimable) buffers at these priorities.
+ * Trying to shake unreclaimable buffer lists whent here is memory pressure is a
+ * waste of time and CPU and greatly slows down cache node recycling operations.
+ * Hence we only try to free them if we are being asked to purge the cache of
+ * all entries.
*/
static unsigned int
cache_shake(
struct cache * cache,
unsigned int priority,
- int all)
+ bool purge)
{
struct cache_mru *mru;
struct cache_hash * hash;
@@ -202,10 +232,11 @@ cache_shake(
struct cache_node * node;
unsigned int count;
- ASSERT(priority <= CACHE_MAX_PRIORITY);
- if (priority > CACHE_MAX_PRIORITY)
+ ASSERT(priority <= CACHE_DIRTY_PRIORITY);
+ if (priority > CACHE_MAX_PRIORITY && !purge)
priority = 0;
+
mru = &cache->c_mrus[priority];
count = 0;
list_head_init(&temp);
@@ -219,8 +250,10 @@ cache_shake(
if (pthread_mutex_trylock(&node->cn_mutex) != 0)
continue;
- /* can't release dirty objects */
- if (cache->flush(node)) {
+ /* memory pressure is not allowed to release dirty objects */
+ if (cache->flush(node) && !purge) {
+ cache_move_to_dirty_mru(cache, node);
+ mru->cm_count--;
pthread_mutex_unlock(&node->cn_mutex);
continue;
}
@@ -242,7 +275,7 @@ cache_shake(
pthread_mutex_unlock(&node->cn_mutex);
count++;
- if (!all && count == CACHE_SHAKE_COUNT)
+ if (!purge && count == CACHE_SHAKE_COUNT)
break;
}
pthread_mutex_unlock(&mru->cm_mutex);
@@ -423,7 +456,7 @@ next_object:
node = cache_node_allocate(cache, key);
if (node)
break;
- priority = cache_shake(cache, priority, 0);
+ priority = cache_shake(cache, priority, false);
/*
* We start at 0; if we free CACHE_SHAKE_COUNT we get
* back the same priority, if not we get back priority+1.
@@ -578,8 +611,8 @@ cache_purge(
{
int i;
- for (i = 0; i <= CACHE_MAX_PRIORITY; i++)
- cache_shake(cache, i, 1);
+ for (i = 0; i <= CACHE_DIRTY_PRIORITY; i++)
+ cache_shake(cache, i, true);
#ifdef CACHE_DEBUG
if (cache->c_count != 0) {
@@ -626,13 +659,13 @@ cache_flush(
#define HASH_REPORT (3 * HASH_CACHE_RATIO)
void
cache_report(
- FILE *fp,
- const char *name,
- struct cache *cache)
+ FILE *fp,
+ const char *name,
+ struct cache *cache)
{
- int i;
- unsigned long count, index, total;
- unsigned long hash_bucket_lengths[HASH_REPORT + 2];
+ int i;
+ unsigned long count, index, total;
+ unsigned long hash_bucket_lengths[HASH_REPORT + 2];
if ((cache->c_hits + cache->c_misses) == 0)
return;
@@ -662,6 +695,11 @@ cache_report(
i, cache->c_mrus[i].cm_count,
cache->c_mrus[i].cm_count * 100 / cache->c_count);
+ i = CACHE_DIRTY_PRIORITY;
+ fprintf(fp, "Dirty MRU %d entries = %6u (%3u%%)\n",
+ i, cache->c_mrus[i].cm_count,
+ cache->c_mrus[i].cm_count * 100 / cache->c_count);
+
/* report hash bucket lengths */
bzero(hash_bucket_lengths, sizeof(hash_bucket_lengths));
--
2.5.0
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 20+ messages in thread* Re: [PATCH 6/7] libxfs: keep unflushable buffers off the cache MRUs
2016-02-04 23:05 ` [PATCH 6/7] libxfs: keep unflushable buffers off the cache MRUs Dave Chinner
@ 2016-02-05 14:22 ` Brian Foster
2016-02-08 10:06 ` Christoph Hellwig
1 sibling, 0 replies; 20+ messages in thread
From: Brian Foster @ 2016-02-05 14:22 UTC (permalink / raw)
To: Dave Chinner; +Cc: xfs
On Fri, Feb 05, 2016 at 10:05:07AM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
>
> There's no point trying to free buffers that are dirty and return
> errors on flush as we have to keep them around until the corruption
> is fixed. Hence if we fail to flush an inode during a cache shake,
> move the buffer to a special dirty MRU list that the cache does not
> shake. This prevents memory pressure from seeing these buffers, but
> allows subsequent cache lookups to still find them through the hash.
> This ensures we don't waste huge amounts of CPU trying to flush and
> reclaim buffers that canot be flushed or reclaimed.
>
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
> include/cache.h | 3 ++-
> libxfs/cache.c | 78 ++++++++++++++++++++++++++++++++++++++++++---------------
> 2 files changed, 60 insertions(+), 21 deletions(-)
>
...
> diff --git a/libxfs/cache.c b/libxfs/cache.c
> index a48ebd9..d4b4a4e 100644
> --- a/libxfs/cache.c
> +++ b/libxfs/cache.c
...
> @@ -183,15 +183,45 @@ cache_generic_bulkrelse(
> }
>
...
> +/*
> + * We've hit the limit on cache size, so we need to start reclaiming nodes we've
> + * used. The MRU specified by the priority is shaken. Returns new priority at
> + * end of the call (in case we call again). We are not allowed to reclaim dirty
> + * objects, so we have to flush them first. If flushing fails, we move them to
> + * the "dirty, unreclaimable" list.
> + *
> + * Hence we skip priorities > CACHE_MAX_PRIORITY unless "purge" is set as we
> + * park unflushable (and hence unreclaimable) buffers at these priorities.
> + * Trying to shake unreclaimable buffer lists whent here is memory pressure is a
typo ^
> + * waste of time and CPU and greatly slows down cache node recycling operations.
> + * Hence we only try to free them if we are being asked to purge the cache of
> + * all entries.
> */
> static unsigned int
> cache_shake(
> struct cache * cache,
> unsigned int priority,
> - int all)
> + bool purge)
> {
> struct cache_mru *mru;
> struct cache_hash * hash;
> @@ -202,10 +232,11 @@ cache_shake(
> struct cache_node * node;
> unsigned int count;
>
> - ASSERT(priority <= CACHE_MAX_PRIORITY);
> - if (priority > CACHE_MAX_PRIORITY)
> + ASSERT(priority <= CACHE_DIRTY_PRIORITY);
> + if (priority > CACHE_MAX_PRIORITY && !purge)
> priority = 0;
>
> +
... and still an extra newline here. Otherwise looks good:
Reviewed-by: Brian Foster <bfoster@redhat.com>
> mru = &cache->c_mrus[priority];
> count = 0;
> list_head_init(&temp);
> @@ -219,8 +250,10 @@ cache_shake(
> if (pthread_mutex_trylock(&node->cn_mutex) != 0)
> continue;
>
> - /* can't release dirty objects */
> - if (cache->flush(node)) {
> + /* memory pressure is not allowed to release dirty objects */
> + if (cache->flush(node) && !purge) {
> + cache_move_to_dirty_mru(cache, node);
> + mru->cm_count--;
> pthread_mutex_unlock(&node->cn_mutex);
> continue;
> }
> @@ -242,7 +275,7 @@ cache_shake(
> pthread_mutex_unlock(&node->cn_mutex);
>
> count++;
> - if (!all && count == CACHE_SHAKE_COUNT)
> + if (!purge && count == CACHE_SHAKE_COUNT)
> break;
> }
> pthread_mutex_unlock(&mru->cm_mutex);
> @@ -423,7 +456,7 @@ next_object:
> node = cache_node_allocate(cache, key);
> if (node)
> break;
> - priority = cache_shake(cache, priority, 0);
> + priority = cache_shake(cache, priority, false);
> /*
> * We start at 0; if we free CACHE_SHAKE_COUNT we get
> * back the same priority, if not we get back priority+1.
> @@ -578,8 +611,8 @@ cache_purge(
> {
> int i;
>
> - for (i = 0; i <= CACHE_MAX_PRIORITY; i++)
> - cache_shake(cache, i, 1);
> + for (i = 0; i <= CACHE_DIRTY_PRIORITY; i++)
> + cache_shake(cache, i, true);
>
> #ifdef CACHE_DEBUG
> if (cache->c_count != 0) {
> @@ -626,13 +659,13 @@ cache_flush(
> #define HASH_REPORT (3 * HASH_CACHE_RATIO)
> void
> cache_report(
> - FILE *fp,
> - const char *name,
> - struct cache *cache)
> + FILE *fp,
> + const char *name,
> + struct cache *cache)
> {
> - int i;
> - unsigned long count, index, total;
> - unsigned long hash_bucket_lengths[HASH_REPORT + 2];
> + int i;
> + unsigned long count, index, total;
> + unsigned long hash_bucket_lengths[HASH_REPORT + 2];
>
> if ((cache->c_hits + cache->c_misses) == 0)
> return;
> @@ -662,6 +695,11 @@ cache_report(
> i, cache->c_mrus[i].cm_count,
> cache->c_mrus[i].cm_count * 100 / cache->c_count);
>
> + i = CACHE_DIRTY_PRIORITY;
> + fprintf(fp, "Dirty MRU %d entries = %6u (%3u%%)\n",
> + i, cache->c_mrus[i].cm_count,
> + cache->c_mrus[i].cm_count * 100 / cache->c_count);
> +
> /* report hash bucket lengths */
> bzero(hash_bucket_lengths, sizeof(hash_bucket_lengths));
>
> --
> 2.5.0
>
> _______________________________________________
> xfs mailing list
> xfs@oss.sgi.com
> http://oss.sgi.com/mailman/listinfo/xfs
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 6/7] libxfs: keep unflushable buffers off the cache MRUs
2016-02-04 23:05 ` [PATCH 6/7] libxfs: keep unflushable buffers off the cache MRUs Dave Chinner
2016-02-05 14:22 ` Brian Foster
@ 2016-02-08 10:06 ` Christoph Hellwig
2016-02-08 19:54 ` Dave Chinner
1 sibling, 1 reply; 20+ messages in thread
From: Christoph Hellwig @ 2016-02-08 10:06 UTC (permalink / raw)
To: Dave Chinner; +Cc: xfs
> --- a/include/cache.h
> +++ b/include/cache.h
> @@ -51,6 +51,7 @@ enum {
> #define CACHE_BASE_PRIORITY 0
> #define CACHE_PREFETCH_PRIORITY 8
> #define CACHE_MAX_PRIORITY 15
> +#define CACHE_DIRTY_PRIORITY (CACHE_MAX_PRIORITY + 1)
Sizing arrays based on, and iterating up to CACHE_DIRTY_PRIORITY seems
rather odd. Maybe add a new
#define CACHE_NR_PRIORITIES CACHE_DIRTY_PRIORITY
and a comment explaining the magic to make it more obvious?
> +cache_move_to_dirty_mru(
> + struct cache *cache,
> + struct cache_node *node)
> +{
> + struct cache_mru *mru;
> +
> + mru = &cache->c_mrus[CACHE_DIRTY_PRIORITY];
> +
> + pthread_mutex_lock(&mru->cm_mutex);
> + node->cn_priority = CACHE_DIRTY_PRIORITY;
> + list_move(&node->cn_mru, &mru->cm_list);
> + mru->cm_count++;
> + pthread_mutex_unlock(&mru->cm_mutex);
> +}
Maybe it would better to just do a list_add here and leave the
list_del to the caller to avoid needing to nest two different
cm_mutex instances.
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 6/7] libxfs: keep unflushable buffers off the cache MRUs
2016-02-08 10:06 ` Christoph Hellwig
@ 2016-02-08 19:54 ` Dave Chinner
0 siblings, 0 replies; 20+ messages in thread
From: Dave Chinner @ 2016-02-08 19:54 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: xfs
On Mon, Feb 08, 2016 at 02:06:36AM -0800, Christoph Hellwig wrote:
> > --- a/include/cache.h
> > +++ b/include/cache.h
> > @@ -51,6 +51,7 @@ enum {
> > #define CACHE_BASE_PRIORITY 0
> > #define CACHE_PREFETCH_PRIORITY 8
> > #define CACHE_MAX_PRIORITY 15
> > +#define CACHE_DIRTY_PRIORITY (CACHE_MAX_PRIORITY + 1)
>
> Sizing arrays based on, and iterating up to CACHE_DIRTY_PRIORITY seems
> rather odd. Maybe add a new
>
> #define CACHE_NR_PRIORITIES CACHE_DIRTY_PRIORITY
>
> and a comment explaining the magic to make it more obvious?
Ok.
> > +cache_move_to_dirty_mru(
> > + struct cache *cache,
> > + struct cache_node *node)
> > +{
> > + struct cache_mru *mru;
> > +
> > + mru = &cache->c_mrus[CACHE_DIRTY_PRIORITY];
> > +
> > + pthread_mutex_lock(&mru->cm_mutex);
> > + node->cn_priority = CACHE_DIRTY_PRIORITY;
> > + list_move(&node->cn_mru, &mru->cm_list);
> > + mru->cm_count++;
> > + pthread_mutex_unlock(&mru->cm_mutex);
> > +}
>
> Maybe it would better to just do a list_add here and leave the
> list_del to the caller to avoid needing to nest two different
> cm_mutex instances.
I'll have a look at it.
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 7/7] libxfs: reset dirty buffer priority on lookup
2016-02-04 23:05 [PATCH 1/7 v2] repair: big broken filesystems cause pain Dave Chinner
` (5 preceding siblings ...)
2016-02-04 23:05 ` [PATCH 6/7] libxfs: keep unflushable buffers off the cache MRUs Dave Chinner
@ 2016-02-04 23:05 ` Dave Chinner
2016-02-05 14:23 ` Brian Foster
2016-02-08 10:08 ` Christoph Hellwig
6 siblings, 2 replies; 20+ messages in thread
From: Dave Chinner @ 2016-02-04 23:05 UTC (permalink / raw)
To: xfs
From: Dave Chinner <dchinner@redhat.com>
When a buffer on the dirty MRU is looked up and found, we remove the
buffer from the MRU. However, we've already set the priority ofthe
buffer to "dirty" so when we are donw with it it will go back on the
dirty buffer MRU regardless of whether it needs to or not.
Hence when we move a buffer to a the dirty MRU, reocrd the old
priority and restore it when we remove the buffer from the MRU on
lookup. This will prevent us from putting fixed, now writeable
buffers back on the dirty MRU and allow the cache routine to write,
shake and reclaim the buffers once they are clean.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
include/cache.h | 1 +
libxfs/cache.c | 7 +++++++
2 files changed, 8 insertions(+)
diff --git a/include/cache.h b/include/cache.h
index 55761d3..3a989d7 100644
--- a/include/cache.h
+++ b/include/cache.h
@@ -99,6 +99,7 @@ struct cache_node {
unsigned int cn_count; /* reference count */
unsigned int cn_hashidx; /* hash chain index */
int cn_priority; /* priority, -1 = free list */
+ int cn_old_priority;/* saved pre-dirty prio */
pthread_mutex_t cn_mutex; /* node mutex */
};
diff --git a/libxfs/cache.c b/libxfs/cache.c
index d4b4a4e..0398be3 100644
--- a/libxfs/cache.c
+++ b/libxfs/cache.c
@@ -197,6 +197,7 @@ cache_move_to_dirty_mru(
mru = &cache->c_mrus[CACHE_DIRTY_PRIORITY];
pthread_mutex_lock(&mru->cm_mutex);
+ node->cn_old_priority = node->cn_priority;
node->cn_priority = CACHE_DIRTY_PRIORITY;
list_move(&node->cn_mru, &mru->cm_list);
mru->cm_count++;
@@ -325,6 +326,7 @@ cache_node_allocate(
list_head_init(&node->cn_mru);
node->cn_count = 1;
node->cn_priority = 0;
+ node->cn_old_priority = -1;
return node;
}
@@ -434,6 +436,10 @@ cache_node_get(
mru->cm_count--;
list_del_init(&node->cn_mru);
pthread_mutex_unlock(&mru->cm_mutex);
+ if (node->cn_old_priority != -1) {
+ node->cn_priority = node->cn_old_priority;
+ node->cn_old_priority = -1;
+ }
}
node->cn_count++;
@@ -534,6 +540,7 @@ cache_node_set_priority(
pthread_mutex_lock(&node->cn_mutex);
ASSERT(node->cn_count > 0);
node->cn_priority = priority;
+ node->cn_old_priority = -1;
pthread_mutex_unlock(&node->cn_mutex);
}
--
2.5.0
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 20+ messages in thread* Re: [PATCH 7/7] libxfs: reset dirty buffer priority on lookup
2016-02-04 23:05 ` [PATCH 7/7] libxfs: reset dirty buffer priority on lookup Dave Chinner
@ 2016-02-05 14:23 ` Brian Foster
2016-02-08 10:08 ` Christoph Hellwig
1 sibling, 0 replies; 20+ messages in thread
From: Brian Foster @ 2016-02-05 14:23 UTC (permalink / raw)
To: Dave Chinner; +Cc: xfs
On Fri, Feb 05, 2016 at 10:05:08AM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
>
> When a buffer on the dirty MRU is looked up and found, we remove the
> buffer from the MRU. However, we've already set the priority ofthe
of the
> buffer to "dirty" so when we are donw with it it will go back on the
done
> dirty buffer MRU regardless of whether it needs to or not.
>
> Hence when we move a buffer to a the dirty MRU, reocrd the old
record
> priority and restore it when we remove the buffer from the MRU on
> lookup. This will prevent us from putting fixed, now writeable
> buffers back on the dirty MRU and allow the cache routine to write,
> shake and reclaim the buffers once they are clean.
>
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
> include/cache.h | 1 +
> libxfs/cache.c | 7 +++++++
> 2 files changed, 8 insertions(+)
>
...
> diff --git a/libxfs/cache.c b/libxfs/cache.c
> index d4b4a4e..0398be3 100644
> --- a/libxfs/cache.c
> +++ b/libxfs/cache.c
...
> @@ -434,6 +436,10 @@ cache_node_get(
> mru->cm_count--;
> list_del_init(&node->cn_mru);
> pthread_mutex_unlock(&mru->cm_mutex);
> + if (node->cn_old_priority != -1) {
Might be good to ASSERT(node->cn_priority == CACHE_DIRTY_PRIORITY) here.
Otherwise looks good:
Reviewed-by: Brian Foster <bfoster@redhat.com>
> + node->cn_priority = node->cn_old_priority;
> + node->cn_old_priority = -1;
> + }
> }
> node->cn_count++;
>
> @@ -534,6 +540,7 @@ cache_node_set_priority(
> pthread_mutex_lock(&node->cn_mutex);
> ASSERT(node->cn_count > 0);
> node->cn_priority = priority;
> + node->cn_old_priority = -1;
> pthread_mutex_unlock(&node->cn_mutex);
> }
>
> --
> 2.5.0
>
> _______________________________________________
> xfs mailing list
> xfs@oss.sgi.com
> http://oss.sgi.com/mailman/listinfo/xfs
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 7/7] libxfs: reset dirty buffer priority on lookup
2016-02-04 23:05 ` [PATCH 7/7] libxfs: reset dirty buffer priority on lookup Dave Chinner
2016-02-05 14:23 ` Brian Foster
@ 2016-02-08 10:08 ` Christoph Hellwig
1 sibling, 0 replies; 20+ messages in thread
From: Christoph Hellwig @ 2016-02-08 10:08 UTC (permalink / raw)
To: Dave Chinner; +Cc: xfs
Looks fine,
Reviewed-by: Christoph Hellwig <hch@lst.de>
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 20+ messages in thread