From: James Simmons <jsimmons@infradead.org>
To: Andreas Dilger <adilger@whamcloud.com>,
Oleg Drokin <green@whamcloud.com>, NeilBrown <neilb@suse.de>
Cc: Vitaly Fertman <c17818@cray.com>,
Lustre Development List <lustre-devel@lists.lustre.org>
Subject: [lustre-devel] [PATCH 34/49] lustre: lov: cancel layout lock on replay deadlock
Date: Thu, 15 Apr 2021 00:02:26 -0400 [thread overview]
Message-ID: <1618459361-17909-35-git-send-email-jsimmons@infradead.org> (raw)
In-Reply-To: <1618459361-17909-1-git-send-email-jsimmons@infradead.org>
From: Vitaly Fertman <c17818@cray.com>
Layout locks are not replayed and are instead cancelled as unused, which
requires taking lov_conf_lock. The semaphore may already be held by
cl_lock_flush(), which prepares a new IO that cannot be sent to the
MDS while it is in recovery.
HPE-bug-id: LUS-9232
WC-bug-id: https://jira.whamcloud.com/browse/LU-14182
Lustre-commit: 68fb53ad4bb2dbc ("LU-14182 lov: cancel layout lock on replay deadlock")
Signed-off-by: Vitaly Fertman <c17818@cray.com>
Reviewed-by: Alexey Lyashkov <c17817@cray.com>
Reviewed-by: Andriy Skulysh <c17819@cray.com>
Reviewed-on: https://review.whamcloud.com/40867
Reviewed-by: Alexey Lyashkov <alexey.lyashkov@hpe.com>
Reviewed-by: Andriy Skulysh <askulysh@gmail.com>
Reviewed-by: Mike Pershin <mpershin@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Signed-off-by: James Simmons <jsimmons@infradead.org>
---
fs/lustre/include/obd_support.h | 1 +
fs/lustre/ldlm/ldlm_request.c | 2 ++
fs/lustre/llite/namei.c | 2 ++
fs/lustre/lov/lov_cl_internal.h | 10 +++++++---
fs/lustre/lov/lov_object.c | 44 ++++++++++++++++++++++++-----------------
5 files changed, 38 insertions(+), 21 deletions(-)
diff --git a/fs/lustre/include/obd_support.h b/fs/lustre/include/obd_support.h
index 152f95c..b2f97f1 100644
--- a/fs/lustre/include/obd_support.h
+++ b/fs/lustre/include/obd_support.h
@@ -308,6 +308,7 @@
#define OBD_FAIL_LDLM_GRANT_CHECK 0x32a
#define OBD_FAIL_LDLM_LOCAL_CANCEL_PAUSE 0x32c
+#define OBD_FAIL_LDLM_REPLAY_PAUSE 0x32e
/* LOCKLESS IO */
#define OBD_FAIL_LDLM_SET_CONTENTION 0x385
diff --git a/fs/lustre/ldlm/ldlm_request.c b/fs/lustre/ldlm/ldlm_request.c
index d8ca744..3527678 100644
--- a/fs/lustre/ldlm/ldlm_request.c
+++ b/fs/lustre/ldlm/ldlm_request.c
@@ -2220,6 +2220,8 @@ static void ldlm_cancel_unused_locks_for_replay(struct ldlm_namespace *ns)
"Dropping as many unused locks as possible before replay for namespace %s (%d)\n",
ldlm_ns_name(ns), ns->ns_nr_unused);
+ OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_REPLAY_PAUSE, cfs_fail_val);
+
/*
* We don't need to care whether or not LRU resize is enabled
* because the LDLM_LRU_FLAG_NO_WAIT policy doesn't use the
diff --git a/fs/lustre/llite/namei.c b/fs/lustre/llite/namei.c
index 1095fa9..654d065 100644
--- a/fs/lustre/llite/namei.c
+++ b/fs/lustre/llite/namei.c
@@ -204,6 +204,8 @@ static int ll_dom_lock_cancel(struct inode *inode, struct ldlm_lock *lock)
if (IS_ERR(env))
return PTR_ERR(env);
+ OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_REPLAY_PAUSE, cfs_fail_val);
+
/* reach MDC layer to flush data under the DoM ldlm lock */
rc = cl_object_flush(env, lli->lli_clob, lock);
if (rc == -ENODATA) {
diff --git a/fs/lustre/lov/lov_cl_internal.h b/fs/lustre/lov/lov_cl_internal.h
index e9ef5aa..f231be9 100644
--- a/fs/lustre/lov/lov_cl_internal.h
+++ b/fs/lustre/lov/lov_cl_internal.h
@@ -251,6 +251,11 @@ struct lov_mirror_entry {
unsigned short lre_end; /* end index of this mirror */
};
+enum lov_object_flags {
+ /* Layout is invalid, set when layout lock is lost */
+ LO_LAYOUT_INVALID = 0x1,
+};
+
/**
* lov-specific file state.
*
@@ -281,10 +286,9 @@ struct lov_object {
*/
enum lov_layout_type lo_type;
/**
- * True if layout is invalid. This bit is cleared when layout lock
- * is lost.
+ * Object flags.
*/
- bool lo_layout_invalid;
+ unsigned long lo_obj_flags;
/**
* How many IOs are on going on this object. Layout can be changed
* only if there is no active IO.
diff --git a/fs/lustre/lov/lov_object.c b/fs/lustre/lov/lov_object.c
index abe1cee..db4070f 100644
--- a/fs/lustre/lov/lov_object.c
+++ b/fs/lustre/lov/lov_object.c
@@ -177,7 +177,7 @@ static int lov_init_sub(const struct lu_env *env, struct lov_object *lov,
old_obj = lu_object_locate(&parent->coh_lu, &lov_device_type);
LASSERT(old_obj);
old_lov = cl2lov(lu2cl(old_obj));
- if (old_lov->lo_layout_invalid) {
+ if (test_bit(LO_LAYOUT_INVALID, &old_lov->lo_obj_flags)) {
/* the object's layout has already changed but isn't
* refreshed
*/
@@ -628,7 +628,7 @@ static int lov_init_composite(const struct lu_env *env, struct lov_device *dev,
LASSERT(lsm->lsm_entry_count > 0);
LASSERT(!lov->lo_lsm);
lov->lo_lsm = lsm_addref(lsm);
- lov->lo_layout_invalid = true;
+ set_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags);
dump_lsm(D_INODE, lsm);
@@ -910,7 +910,8 @@ static void lov_fini_released(const struct lu_env *env, struct lov_object *lov,
static int lov_print_empty(const struct lu_env *env, void *cookie,
lu_printer_t p, const struct lu_object *o)
{
- (*p)(env, cookie, "empty %d\n", lu2lov(o)->lo_layout_invalid);
+ (*p)(env, cookie, "empty %d\n",
+ test_bit(LO_LAYOUT_INVALID, &lu2lov(o)->lo_obj_flags));
return 0;
}
@@ -923,8 +924,8 @@ static int lov_print_composite(const struct lu_env *env, void *cookie,
(*p)(env, cookie, "entries: %d, %s, lsm{%p 0x%08X %d %u}:\n",
lsm->lsm_entry_count,
- lov->lo_layout_invalid ? "invalid" : "valid", lsm,
- lsm->lsm_magic, atomic_read(&lsm->lsm_refc),
+ test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags) ? "invalid" :
+ "valid", lsm, lsm->lsm_magic, atomic_read(&lsm->lsm_refc),
lsm->lsm_layout_gen);
for (i = 0; i < lsm->lsm_entry_count; i++) {
@@ -953,8 +954,8 @@ static int lov_print_released(const struct lu_env *env, void *cookie,
(*p)(env, cookie,
"released: %s, lsm{%p 0x%08X %d %u}:\n",
- lov->lo_layout_invalid ? "invalid" : "valid", lsm,
- lsm->lsm_magic, atomic_read(&lsm->lsm_refc),
+ test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags) ? "invalid" :
+ "valid", lsm, lsm->lsm_magic, atomic_read(&lsm->lsm_refc),
lsm->lsm_layout_gen);
return 0;
}
@@ -967,7 +968,8 @@ static int lov_print_foreign(const struct lu_env *env, void *cookie,
(*p)(env, cookie,
"foreign: %s, lsm{%p 0x%08X %d %u}:\n",
- lov->lo_layout_invalid ? "invalid" : "valid", lsm,
+ test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags) ?
+ "invalid" : "valid", lsm,
lsm->lsm_magic, atomic_read(&lsm->lsm_refc),
lsm->lsm_layout_gen);
(*p)(env, cookie,
@@ -1352,15 +1354,15 @@ static int lov_conf_set(const struct lu_env *env, struct cl_object *obj,
dump_lsm(D_INODE, lsm);
}
- lov_conf_lock(lov);
if (conf->coc_opc == OBJECT_CONF_INVALIDATE) {
- lov->lo_layout_invalid = true;
+ set_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags);
result = 0;
- goto out;
+ goto out_lsm;
}
+ lov_conf_lock(lov);
if (conf->coc_opc == OBJECT_CONF_WAIT) {
- if (lov->lo_layout_invalid &&
+ if (test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags) &&
atomic_read(&lov->lo_active_ios) > 0) {
lov_conf_unlock(lov);
result = lov_layout_wait(env, lov);
@@ -1378,26 +1380,31 @@ static int lov_conf_set(const struct lu_env *env, struct cl_object *obj,
(lov->lo_lsm->lsm_entries[0]->lsme_pattern ==
lsm->lsm_entries[0]->lsme_pattern))) {
/* same version of layout */
- lov->lo_layout_invalid = false;
+ clear_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags);
result = 0;
goto out;
}
/* will change layout - check if there still exists active IO. */
if (atomic_read(&lov->lo_active_ios) > 0) {
- lov->lo_layout_invalid = true;
+ set_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags);
result = -EBUSY;
goto out;
}
result = lov_layout_change(env, lov, lsm, conf);
- lov->lo_layout_invalid = result != 0;
+ if (result)
+ set_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags);
+ else
+ clear_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags);
out:
lov_conf_unlock(lov);
+out_lsm:
lov_lsm_put(lsm);
- CDEBUG(D_INODE, DFID " lo_layout_invalid=%d\n",
- PFID(lu_object_fid(lov2lu(lov))), lov->lo_layout_invalid);
+ CDEBUG(D_INODE, DFID " lo_layout_invalid=%u\n",
+ PFID(lu_object_fid(lov2lu(lov))),
+ test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags));
return result;
}
@@ -2254,7 +2261,8 @@ static struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov)
lsm = lsm_addref(lov->lo_lsm);
CDEBUG(D_INODE, "lsm %p addref %d/%d by %p.\n",
lsm, atomic_read(&lsm->lsm_refc),
- lov->lo_layout_invalid, current);
+ test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags),
+ current);
}
lov_conf_thaw(lov);
return lsm;
--
1.8.3.1
_______________________________________________
lustre-devel mailing list
lustre-devel@lists.lustre.org
http://lists.lustre.org/listinfo.cgi/lustre-devel-lustre.org
next prev parent reply other threads:[~2021-04-15 4:05 UTC|newest]
Thread overview: 50+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-04-15 4:01 [lustre-devel] [PATCH 00/49] lustre: sync to OpenSFS as of March 30 2021 James Simmons
2021-04-15 4:01 ` [lustre-devel] [PATCH 01/49] lnet: libcfs: Fix for unconfigured arch_stackwalk James Simmons
2021-04-15 4:01 ` [lustre-devel] [PATCH 02/49] lustre: lmv: iput() can safely be passed NULL James Simmons
2021-04-15 4:01 ` [lustre-devel] [PATCH 03/49] lustre: llite: mark extended attr and inode flags James Simmons
2021-04-15 4:01 ` [lustre-devel] [PATCH 04/49] lnet: lnet_notify sets route aliveness incorrectly James Simmons
2021-04-15 4:01 ` [lustre-devel] [PATCH 05/49] lnet: Prevent discovery on peer marked deletion James Simmons
2021-04-15 4:01 ` [lustre-devel] [PATCH 06/49] lnet: Prevent discovery on deleted peer James Simmons
2021-04-15 4:01 ` [lustre-devel] [PATCH 07/49] lnet: Transfer disc src NID when merging peers James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 08/49] lnet: Lookup lpni after discovery James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 09/49] lustre: llite: update and fix module loading bug in mounting code James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 10/49] lnet: socklnd: change various ints to bool James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 11/49] lnet: Correct asymmetric route detection James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 12/49] lustre: fixup ldlm_pool and lu_object shrinker failure cases James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 13/49] lustre: log: Add ending newline for some messages James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 14/49] lustre: use with_imp_locked() more broadly James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 15/49] lnet: o2iblnd: change some ints to bool James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 16/49] lustre: lmv: striped directory as subdirectory mount James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 17/49] lustre: llite: create file_operations registration function James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 18/49] lustre: osc: fix performance regression in osc_extent_merge() James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 19/49] lustre: mds: add enums for MDS_ATTR flags James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 20/49] lustre: uapi: remove OBD_IOC_LOV_GET_CONFIG James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 21/49] lustre: sec: fix migrate for encrypted dir James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 22/49] lnet: libcfs: restore LNET_DUMP_ON_PANIC functionality James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 23/49] lustre: ptlrpc: fix ASSERTION on scp_rqbd_posted James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 24/49] lustre: ldlm: not freed req on enqueue James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 25/49] lnet: uapi: move userland only nidstr.h handling James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 26/49] lnet: libcfs: don't depend on sysctl support for debugfs James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 27/49] lustre: ptlrpc: Add a binary heap implementation James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 28/49] lustre: ptlrpc: Implement NRS Delay Policy James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 29/49] lustre: ptlrpc: rename cfs_binheap to simply binheap James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 30/49] lustre: ptlrpc: mark some functions as static James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 31/49] lustre: use tgt_pool for lov layer James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 32/49] lustre: quota: make used for pool correct James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 33/49] lustre: quota: call rhashtable_lookup near params decl James Simmons
2021-04-15 4:02 ` James Simmons [this message]
2021-04-15 4:02 ` [lustre-devel] [PATCH 35/49] lustre: obdclass: Protect cl_env_percpu[] James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 36/49] lnet: libcfs: discard cfs_trace_console_buffers[] James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 37/49] lnet: libcfs: discard cfs_trace_copyin_string() James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 38/49] lustre: lmv: don't use lqr_alloc spinlock in lmv James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 39/49] lustre: lov: fault page update cp_lov_index James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 40/49] lustre: update version to 2.14.51 James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 41/49] lustre: llite: mirror extend/copy keeps sparseness James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 42/49] lustre: ptlrpc: don't use list_for_each_entry_safe unnecessarily James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 43/49] lnet: Age peer NI out of recovery James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 44/49] lnet: Only recover known good peer NIs James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 45/49] lnet: Recover peer NI w/exponential backoff interval James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 46/49] lustre: lov: return valid stripe_count/size for PFL files James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 47/49] lnet: convert lpni_refcount to a kref James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 48/49] lustre: lmv: handle default stripe_count=-1 properly James Simmons
2021-04-15 4:02 ` [lustre-devel] [PATCH 49/49] lnet: libcfs: discard cfs_array_alloc() James Simmons
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1618459361-17909-35-git-send-email-jsimmons@infradead.org \
--to=jsimmons@infradead.org \
--cc=adilger@whamcloud.com \
--cc=c17818@cray.com \
--cc=green@whamcloud.com \
--cc=lustre-devel@lists.lustre.org \
--cc=neilb@suse.de \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).