* [Ocfs2-devel] [patch 4/8] ocfs2/dlm: do not purge lockres that is queued for assert master
@ 2014-06-09 20:04 akpm at linux-foundation.org
2014-06-13 21:31 ` Mark Fasheh
0 siblings, 1 reply; 5+ messages in thread
From: akpm at linux-foundation.org @ 2014-06-09 20:04 UTC (permalink / raw)
To: ocfs2-devel
From: Xue jiufei <xuejiufei@huawei.com>
Subject: ocfs2/dlm: do not purge lockres that is queued for assert master
When workqueue is delayed, it may occur that a lockres is purged while it
is still queued for master assert. it may trigger BUG() as follows.
N1 N2
dlm_get_lockres()
->dlm_do_master_requery
is the master of lockres,
so queue assert_master work
dlm_thread() start running
and purge the lockres
dlm_assert_master_worker()
send assert master message
to other nodes
receiving the assert_master
message, set master to N2
dlmlock_remote() send create_lock message to N2, but receive DLM_IVLOCKID,
if it is RECOVERY lockres, it triggers the BUG().
Another BUG() is triggered when N3 become the new master and send
assert_master to N1, N1 will trigger the BUG() because owner doesn't
match. So we should not purge lockres when it is queued for assert
master.
Signed-off-by: joyce.xue <xuejiufei@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
fs/ocfs2/dlm/dlmcommon.h | 4 +++
fs/ocfs2/dlm/dlmmaster.c | 43 ++++++++++++++++++++++++++++++++++-
fs/ocfs2/dlm/dlmrecovery.c | 3 +-
fs/ocfs2/dlm/dlmthread.c | 11 +++++---
4 files changed, 55 insertions(+), 6 deletions(-)
diff -puN fs/ocfs2/dlm/dlmcommon.h~ocfs2-dlm-do-not-purge-lockres-that-is-queued-for-assert-master fs/ocfs2/dlm/dlmcommon.h
--- a/fs/ocfs2/dlm/dlmcommon.h~ocfs2-dlm-do-not-purge-lockres-that-is-queued-for-assert-master
+++ a/fs/ocfs2/dlm/dlmcommon.h
@@ -332,6 +332,7 @@ struct dlm_lock_resource
u16 state;
char lvb[DLM_LVB_LEN];
unsigned int inflight_locks;
+ unsigned int inflight_assert_workers;
unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
};
@@ -911,6 +912,9 @@ void dlm_lockres_drop_inflight_ref(struc
void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res);
+void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res);
+
void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
diff -puN fs/ocfs2/dlm/dlmmaster.c~ocfs2-dlm-do-not-purge-lockres-that-is-queued-for-assert-master fs/ocfs2/dlm/dlmmaster.c
--- a/fs/ocfs2/dlm/dlmmaster.c~ocfs2-dlm-do-not-purge-lockres-that-is-queued-for-assert-master
+++ a/fs/ocfs2/dlm/dlmmaster.c
@@ -581,6 +581,7 @@ static void dlm_init_lockres(struct dlm_
atomic_set(&res->asts_reserved, 0);
res->migration_pending = 0;
res->inflight_locks = 0;
+ res->inflight_assert_workers = 0;
res->dlm = dlm;
@@ -683,6 +684,43 @@ void dlm_lockres_drop_inflight_ref(struc
wake_up(&res->wq);
}
+void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ assert_spin_locked(&res->spinlock);
+ res->inflight_assert_workers++;
+ mlog(0, "%s:%.*s: inflight assert worker++: now %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ res->inflight_assert_workers);
+}
+
+void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ spin_lock(&res->spinlock);
+ __dlm_lockres_grab_inflight_worker(dlm, res);
+ spin_unlock(&res->spinlock);
+}
+
+void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ assert_spin_locked(&res->spinlock);
+ BUG_ON(res->inflight_assert_workers == 0);
+ res->inflight_assert_workers--;
+ mlog(0, "%s:%.*s: inflight assert worker--: now %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ res->inflight_assert_workers);
+}
+
+void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ spin_lock(&res->spinlock);
+ __dlm_lockres_drop_inflight_worker(dlm, res);
+ spin_unlock(&res->spinlock);
+}
+
/*
* lookup a lock resource by name.
* may already exist in the hashtable.
@@ -1603,7 +1641,8 @@ send_response:
mlog(ML_ERROR, "failed to dispatch assert master work\n");
response = DLM_MASTER_RESP_ERROR;
dlm_lockres_put(res);
- }
+ } else
+ dlm_lockres_grab_inflight_worker(dlm, res);
} else {
if (res)
dlm_lockres_put(res);
@@ -2118,6 +2157,8 @@ static void dlm_assert_master_worker(str
dlm_lockres_release_ast(dlm, res);
put:
+ dlm_lockres_drop_inflight_worker(dlm, res);
+
dlm_lockres_put(res);
mlog(0, "finished with dlm_assert_master_worker\n");
diff -puN fs/ocfs2/dlm/dlmrecovery.c~ocfs2-dlm-do-not-purge-lockres-that-is-queued-for-assert-master fs/ocfs2/dlm/dlmrecovery.c
--- a/fs/ocfs2/dlm/dlmrecovery.c~ocfs2-dlm-do-not-purge-lockres-that-is-queued-for-assert-master
+++ a/fs/ocfs2/dlm/dlmrecovery.c
@@ -1708,7 +1708,8 @@ int dlm_master_requery_handler(struct o2
mlog_errno(-ENOMEM);
/* retry!? */
BUG();
- }
+ } else
+ __dlm_lockres_grab_inflight_worker(dlm, res);
} else /* put.. incase we are not the master */
dlm_lockres_put(res);
spin_unlock(&res->spinlock);
diff -puN fs/ocfs2/dlm/dlmthread.c~ocfs2-dlm-do-not-purge-lockres-that-is-queued-for-assert-master fs/ocfs2/dlm/dlmthread.c
--- a/fs/ocfs2/dlm/dlmthread.c~ocfs2-dlm-do-not-purge-lockres-that-is-queued-for-assert-master
+++ a/fs/ocfs2/dlm/dlmthread.c
@@ -259,11 +259,14 @@ static void dlm_run_purge_list(struct dl
* refs on it. */
unused = __dlm_lockres_unused(lockres);
if (!unused ||
- (lockres->state & DLM_LOCK_RES_MIGRATING)) {
+ (lockres->state & DLM_LOCK_RES_MIGRATING) ||
+ (lockres->inflight_assert_workers != 0)) {
mlog(0, "%s: res %.*s is in use or being remastered, "
- "used %d, state %d\n", dlm->name,
- lockres->lockname.len, lockres->lockname.name,
- !unused, lockres->state);
+ "used %d, state %d, assert master workers %u\n",
+ dlm->name, lockres->lockname.len,
+ lockres->lockname.name,
+ !unused, lockres->state,
+ lockres->inflight_assert_workers);
list_move_tail(&dlm->purge_list, &lockres->purge);
spin_unlock(&lockres->spinlock);
continue;
_
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Ocfs2-devel] [patch 4/8] ocfs2/dlm: do not purge lockres that is queued for assert master
2014-06-09 20:04 [Ocfs2-devel] [patch 4/8] ocfs2/dlm: do not purge lockres that is queued for assert master akpm at linux-foundation.org
@ 2014-06-13 21:31 ` Mark Fasheh
2014-06-16 1:26 ` Xue jiufei
0 siblings, 1 reply; 5+ messages in thread
From: Mark Fasheh @ 2014-06-13 21:31 UTC (permalink / raw)
To: ocfs2-devel
On Mon, Jun 09, 2014 at 01:04:03PM -0700, Andrew Morton wrote:
> @@ -683,6 +684,43 @@ void dlm_lockres_drop_inflight_ref(struc
> wake_up(&res->wq);
> }
>
> +void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
> + struct dlm_lock_resource *res)
> +{
> + assert_spin_locked(&res->spinlock);
> + res->inflight_assert_workers++;
> + mlog(0, "%s:%.*s: inflight assert worker++: now %u\n",
> + dlm->name, res->lockname.len, res->lockname.name,
> + res->inflight_assert_workers);
> +}
> +
> +void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
> + struct dlm_lock_resource *res)
> +{
> + spin_lock(&res->spinlock);
> + __dlm_lockres_grab_inflight_worker(dlm, res);
> + spin_unlock(&res->spinlock);
> +}
> +
> +void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
> + struct dlm_lock_resource *res)
> +{
> + assert_spin_locked(&res->spinlock);
> + BUG_ON(res->inflight_assert_workers == 0);
> + res->inflight_assert_workers--;
> + mlog(0, "%s:%.*s: inflight assert worker--: now %u\n",
> + dlm->name, res->lockname.len, res->lockname.name,
> + res->inflight_assert_workers);
> +}
> +
> +void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
> + struct dlm_lock_resource *res)
> +{
> + spin_lock(&res->spinlock);
> + __dlm_lockres_drop_inflight_worker(dlm, res);
> + spin_unlock(&res->spinlock);
> +}
> +
> /*
> * lookup a lock resource by name.
> * may already exist in the hashtable.
> @@ -1603,7 +1641,8 @@ send_response:
> mlog(ML_ERROR, "failed to dispatch assert master work\n");
> response = DLM_MASTER_RESP_ERROR;
> dlm_lockres_put(res);
> - }
> + } else
> + dlm_lockres_grab_inflight_worker(dlm, res);
> } else {
> if (res)
> dlm_lockres_put(res);
> @@ -2118,6 +2157,8 @@ static void dlm_assert_master_worker(str
> dlm_lockres_release_ast(dlm, res);
>
> put:
> + dlm_lockres_drop_inflight_worker(dlm, res);
> +
> dlm_lockres_put(res);
>
> mlog(0, "finished with dlm_assert_master_worker\n");
> diff -puN fs/ocfs2/dlm/dlmrecovery.c~ocfs2-dlm-do-not-purge-lockres-that-is-queued-for-assert-master fs/ocfs2/dlm/dlmrecovery.c
> --- a/fs/ocfs2/dlm/dlmrecovery.c~ocfs2-dlm-do-not-purge-lockres-that-is-queued-for-assert-master
> +++ a/fs/ocfs2/dlm/dlmrecovery.c
> @@ -1708,7 +1708,8 @@ int dlm_master_requery_handler(struct o2
> mlog_errno(-ENOMEM);
> /* retry!? */
> BUG();
> - }
> + } else
> + __dlm_lockres_grab_inflight_worker(dlm, res);
> } else /* put.. incase we are not the master */
> dlm_lockres_put(res);
> spin_unlock(&res->spinlock);
> diff -puN fs/ocfs2/dlm/dlmthread.c~ocfs2-dlm-do-not-purge-lockres-that-is-queued-for-assert-master fs/ocfs2/dlm/dlmthread.c
> --- a/fs/ocfs2/dlm/dlmthread.c~ocfs2-dlm-do-not-purge-lockres-that-is-queued-for-assert-master
> +++ a/fs/ocfs2/dlm/dlmthread.c
> @@ -259,11 +259,14 @@ static void dlm_run_purge_list(struct dl
> * refs on it. */
> unused = __dlm_lockres_unused(lockres);
> if (!unused ||
> - (lockres->state & DLM_LOCK_RES_MIGRATING)) {
> + (lockres->state & DLM_LOCK_RES_MIGRATING) ||
> + (lockres->inflight_assert_workers != 0)) {
If there's any assert master message we will halt purging *all* lock
resources. That seems extreme to me :/
What about putting a flag on lockres->state to indicate that it's got some
work queued? We would set it before queuing the work, then clear it in the
worker, or if we ever cancel the work.
--Mark
--
Mark Fasheh
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Ocfs2-devel] [patch 4/8] ocfs2/dlm: do not purge lockres that is queued for assert master
2014-06-13 21:31 ` Mark Fasheh
@ 2014-06-16 1:26 ` Xue jiufei
2014-06-20 22:33 ` Andrew Morton
0 siblings, 1 reply; 5+ messages in thread
From: Xue jiufei @ 2014-06-16 1:26 UTC (permalink / raw)
To: ocfs2-devel
On 2014/6/14 5:31, Mark Fasheh wrote:
> On Mon, Jun 09, 2014 at 01:04:03PM -0700, Andrew Morton wrote:
>> @@ -683,6 +684,43 @@ void dlm_lockres_drop_inflight_ref(struc
>> wake_up(&res->wq);
>> }
>>
>> +void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
>> + struct dlm_lock_resource *res)
>> +{
>> + assert_spin_locked(&res->spinlock);
>> + res->inflight_assert_workers++;
>> + mlog(0, "%s:%.*s: inflight assert worker++: now %u\n",
>> + dlm->name, res->lockname.len, res->lockname.name,
>> + res->inflight_assert_workers);
>> +}
>> +
>> +void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
>> + struct dlm_lock_resource *res)
>> +{
>> + spin_lock(&res->spinlock);
>> + __dlm_lockres_grab_inflight_worker(dlm, res);
>> + spin_unlock(&res->spinlock);
>> +}
>> +
>> +void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
>> + struct dlm_lock_resource *res)
>> +{
>> + assert_spin_locked(&res->spinlock);
>> + BUG_ON(res->inflight_assert_workers == 0);
>> + res->inflight_assert_workers--;
>> + mlog(0, "%s:%.*s: inflight assert worker--: now %u\n",
>> + dlm->name, res->lockname.len, res->lockname.name,
>> + res->inflight_assert_workers);
>> +}
>> +
>> +void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
>> + struct dlm_lock_resource *res)
>> +{
>> + spin_lock(&res->spinlock);
>> + __dlm_lockres_drop_inflight_worker(dlm, res);
>> + spin_unlock(&res->spinlock);
>> +}
>> +
>> /*
>> * lookup a lock resource by name.
>> * may already exist in the hashtable.
>> @@ -1603,7 +1641,8 @@ send_response:
>> mlog(ML_ERROR, "failed to dispatch assert master work\n");
>> response = DLM_MASTER_RESP_ERROR;
>> dlm_lockres_put(res);
>> - }
>> + } else
>> + dlm_lockres_grab_inflight_worker(dlm, res);
>> } else {
>> if (res)
>> dlm_lockres_put(res);
>> @@ -2118,6 +2157,8 @@ static void dlm_assert_master_worker(str
>> dlm_lockres_release_ast(dlm, res);
>>
>> put:
>> + dlm_lockres_drop_inflight_worker(dlm, res);
>> +
>> dlm_lockres_put(res);
>>
>> mlog(0, "finished with dlm_assert_master_worker\n");
>> diff -puN fs/ocfs2/dlm/dlmrecovery.c~ocfs2-dlm-do-not-purge-lockres-that-is-queued-for-assert-master fs/ocfs2/dlm/dlmrecovery.c
>> --- a/fs/ocfs2/dlm/dlmrecovery.c~ocfs2-dlm-do-not-purge-lockres-that-is-queued-for-assert-master
>> +++ a/fs/ocfs2/dlm/dlmrecovery.c
>> @@ -1708,7 +1708,8 @@ int dlm_master_requery_handler(struct o2
>> mlog_errno(-ENOMEM);
>> /* retry!? */
>> BUG();
>> - }
>> + } else
>> + __dlm_lockres_grab_inflight_worker(dlm, res);
>> } else /* put.. incase we are not the master */
>> dlm_lockres_put(res);
>> spin_unlock(&res->spinlock);
>> diff -puN fs/ocfs2/dlm/dlmthread.c~ocfs2-dlm-do-not-purge-lockres-that-is-queued-for-assert-master fs/ocfs2/dlm/dlmthread.c
>> --- a/fs/ocfs2/dlm/dlmthread.c~ocfs2-dlm-do-not-purge-lockres-that-is-queued-for-assert-master
>> +++ a/fs/ocfs2/dlm/dlmthread.c
>> @@ -259,11 +259,14 @@ static void dlm_run_purge_list(struct dl
>> * refs on it. */
>> unused = __dlm_lockres_unused(lockres);
>> if (!unused ||
>> - (lockres->state & DLM_LOCK_RES_MIGRATING)) {
>> + (lockres->state & DLM_LOCK_RES_MIGRATING) ||
>> + (lockres->inflight_assert_workers != 0)) {
>
> If there's any assert master message we will halt purging *all* lock
> resources. That seems extreme to me :/
>
Not halt purging *all* lock resource, when one lockres is queued for
master assert, it will be moved to the tail of the purge list, so
dlm_thread can keep purging other lock resources.
-- joyce.xue
> What about putting a flag on lockres->state to indicate that it's got some
> work queued? We would set it before queuing the work, then clear it in the
> worker, or if we ever cancel the work.
> --Mark
>
> --
> Mark Fasheh
> .
>
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Ocfs2-devel] [patch 4/8] ocfs2/dlm: do not purge lockres that is queued for assert master
2014-06-16 1:26 ` Xue jiufei
@ 2014-06-20 22:33 ` Andrew Morton
2014-06-20 22:49 ` Mark Fasheh
0 siblings, 1 reply; 5+ messages in thread
From: Andrew Morton @ 2014-06-20 22:33 UTC (permalink / raw)
To: ocfs2-devel
On Mon, 16 Jun 2014 09:26:53 +0800 Xue jiufei <xuejiufei@huawei.com> wrote:
> >> spin_unlock(&res->spinlock);
> >> diff -puN fs/ocfs2/dlm/dlmthread.c~ocfs2-dlm-do-not-purge-lockres-that-is-queued-for-assert-master fs/ocfs2/dlm/dlmthread.c
> >> --- a/fs/ocfs2/dlm/dlmthread.c~ocfs2-dlm-do-not-purge-lockres-that-is-queued-for-assert-master
> >> +++ a/fs/ocfs2/dlm/dlmthread.c
> >> @@ -259,11 +259,14 @@ static void dlm_run_purge_list(struct dl
> >> * refs on it. */
> >> unused = __dlm_lockres_unused(lockres);
> >> if (!unused ||
> >> - (lockres->state & DLM_LOCK_RES_MIGRATING)) {
> >> + (lockres->state & DLM_LOCK_RES_MIGRATING) ||
> >> + (lockres->inflight_assert_workers != 0)) {
> >
> > If there's any assert master message we will halt purging *all* lock
> > resources. That seems extreme to me :/
> >
> Not halt purging *all* lock resource, when one lockres is queued for
> master assert, it will be moved to the tail of the purge list, so
> dlm_thread can keep purging other lock resources.
Where are we up to with this one?
Thanks.
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Ocfs2-devel] [patch 4/8] ocfs2/dlm: do not purge lockres that is queued for assert master
2014-06-20 22:33 ` Andrew Morton
@ 2014-06-20 22:49 ` Mark Fasheh
0 siblings, 0 replies; 5+ messages in thread
From: Mark Fasheh @ 2014-06-20 22:49 UTC (permalink / raw)
To: ocfs2-devel
On Fri, Jun 20, 2014 at 03:33:46PM -0700, Andrew Morton wrote:
> On Mon, 16 Jun 2014 09:26:53 +0800 Xue jiufei <xuejiufei@huawei.com> wrote:
>
> > >> spin_unlock(&res->spinlock);
> > >> diff -puN fs/ocfs2/dlm/dlmthread.c~ocfs2-dlm-do-not-purge-lockres-that-is-queued-for-assert-master fs/ocfs2/dlm/dlmthread.c
> > >> --- a/fs/ocfs2/dlm/dlmthread.c~ocfs2-dlm-do-not-purge-lockres-that-is-queued-for-assert-master
> > >> +++ a/fs/ocfs2/dlm/dlmthread.c
> > >> @@ -259,11 +259,14 @@ static void dlm_run_purge_list(struct dl
> > >> * refs on it. */
> > >> unused = __dlm_lockres_unused(lockres);
> > >> if (!unused ||
> > >> - (lockres->state & DLM_LOCK_RES_MIGRATING)) {
> > >> + (lockres->state & DLM_LOCK_RES_MIGRATING) ||
> > >> + (lockres->inflight_assert_workers != 0)) {
> > >
> > > If there's any assert master message we will halt purging *all* lock
> > > resources. That seems extreme to me :/
> > >
> > Not halt purging *all* lock resource, when one lockres is queued for
> > master assert, it will be moved to the tail of the purge list, so
> > dlm_thread can keep purging other lock resources.
>
> Where are we up to with this one?
I read the code wrong, Xue is correct:
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Thanks,
--Mark
--
Mark Fasheh
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2014-06-20 22:49 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-06-09 20:04 [Ocfs2-devel] [patch 4/8] ocfs2/dlm: do not purge lockres that is queued for assert master akpm at linux-foundation.org
2014-06-13 21:31 ` Mark Fasheh
2014-06-16 1:26 ` Xue jiufei
2014-06-20 22:33 ` Andrew Morton
2014-06-20 22:49 ` Mark Fasheh
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.