From: "Yan, Zheng" <zheng.z.yan@intel.com>
To: Greg Farnum <greg@inktank.com>
Cc: ceph-devel@vger.kernel.org, sage@inktank.com
Subject: Re: [PATCH 09/39] mds: defer eval gather locks when removing replica
Date: Thu, 21 Mar 2013 10:29:58 +0800 [thread overview]
Message-ID: <514A70A6.7020204@intel.com> (raw)
In-Reply-To: <706819819FC545BA9301A427164F5983@inktank.com>
Will update my git tree.
Thanks
Yan, Zheng
On 03/21/2013 03:36 AM, Greg Farnum wrote:
> On Sunday, March 17, 2013 at 7:51 AM, Yan, Zheng wrote:
>> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>>
>> Locks' states should not change between composing the cache rejoin ack
>> messages and sending the message. If Locker::eval_gather() is called
>> in MDCache::{inode,dentry}_remove_replica(), it may wake requests and
>> change locks' states.
>>
>> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
>> ---
>> src/mds/MDCache.cc | 51 ++++++++++++++++++++++++++++++---------------------
>> src/mds/MDCache.h | 8 +++++---
>> 2 files changed, 35 insertions(+), 24 deletions(-)
>>
>> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
>> index 19dc60b..0f6b842 100644
>> --- a/src/mds/MDCache.cc
>> +++ b/src/mds/MDCache.cc
>> @@ -3729,6 +3729,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
>> // possible response(s)
>> MMDSCacheRejoin *ack = 0; // if survivor
>> set<vinodeno_t> acked_inodes; // if survivor
>> + set<SimpleLock *> gather_locks; // if survivor
>> bool survivor = false; // am i a survivor?
>>
>> if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
>> @@ -3851,7 +3852,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
>> assert(dnl->is_primary());
>>
>> if (survivor && dn->is_replica(from))
>> - dentry_remove_replica(dn, from); // this induces a lock gather completion
>> + dentry_remove_replica(dn, from, gather_locks); // this induces a lock gather completion
>
> This comment is no longer accurate :)
>> int dnonce = dn->add_replica(from);
>> dout(10) << " have " << *dn << dendl;
>> if (ack)
>> @@ -3864,7 +3865,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
>> assert(in);
>>
>> if (survivor && in->is_replica(from))
>> - inode_remove_replica(in, from);
>> + inode_remove_replica(in, from, gather_locks);
>> int inonce = in->add_replica(from);
>> dout(10) << " have " << *in << dendl;
>>
>> @@ -3887,7 +3888,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
>> CInode *in = get_inode(*p);
>> assert(in); // hmm fixme wrt stray?
>> if (survivor && in->is_replica(from))
>> - inode_remove_replica(in, from); // this induces a lock gather completion
>> + inode_remove_replica(in, from, gather_locks); // this induces a lock gather completion
>
> Same here.
>
> Other than those, looks good.
> -Greg
> Software Engineer #42 @ http://inktank.com | http://ceph.com
>
>
>> int inonce = in->add_replica(from);
>> dout(10) << " have base " << *in << dendl;
>>
>> @@ -3909,8 +3910,11 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
>> ack->add_inode_base(in);
>> }
>>
>> - rejoin_scour_survivor_replicas(from, ack, acked_inodes);
>> + rejoin_scour_survivor_replicas(from, ack, gather_locks, acked_inodes);
>> mds->send_message(ack, weak->get_connection());
>> +
>> + for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p)
>> + mds->locker->eval_gather(*p);
>> } else {
>> // done?
>> assert(rejoin_gather.count(from));
>> @@ -4055,7 +4059,9 @@ bool MDCache::parallel_fetch_traverse_dir(inodeno_t ino, filepath& path,
>> * all validated replicas are acked with a strong nonce, etc. if that isn't in the
>> * ack, the replica dne, and we can remove it from our replica maps.
>> */
>> -void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack, set<vinodeno_t>& acked_inodes)
>> +void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack,
>> + set<SimpleLock *>& gather_locks,
>> + set<vinodeno_t>& acked_inodes)
>> {
>> dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl;
>>
>> @@ -4070,7 +4076,7 @@ void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack, set
>> if (in->is_auth() &&
>> in->is_replica(from) &&
>> acked_inodes.count(p->second->vino()) == 0) {
>> - inode_remove_replica(in, from);
>> + inode_remove_replica(in, from, gather_locks);
>> dout(10) << " rem " << *in << dendl;
>> }
>>
>> @@ -4099,7 +4105,7 @@ void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack, set
>> if (dn->is_replica(from) &&
>> (ack->strong_dentries.count(dir->dirfrag()) == 0 ||
>> ack->strong_dentries[dir->dirfrag()].count(string_snap_t(dn->name, dn->last)) == 0)) {
>> - dentry_remove_replica(dn, from);
>> + dentry_remove_replica(dn, from, gather_locks);
>> dout(10) << " rem " << *dn << dendl;
>> }
>> }
>> @@ -6189,6 +6195,7 @@ void MDCache::handle_cache_expire(MCacheExpire *m)
>> return;
>> }
>>
>> + set<SimpleLock *> gather_locks;
>> // loop over realms
>> for (map<dirfrag_t,MCacheExpire::realm>::iterator p = m->realms.begin();
>> p != m->realms.end();
>> @@ -6255,7 +6262,7 @@ void MDCache::handle_cache_expire(MCacheExpire *m)
>> // remove from our cached_by
>> dout(7) << " inode expire on " << *in << " from mds." << from
>> << " cached_by was " << in->get_replicas() << dendl;
>> - inode_remove_replica(in, from);
>> + inode_remove_replica(in, from, gather_locks);
>> }
>> else {
>> // this is an old nonce, ignore expire.
>> @@ -6332,7 +6339,7 @@ void MDCache::handle_cache_expire(MCacheExpire *m)
>>
>> if (nonce == dn->get_replica_nonce(from)) {
>> dout(7) << " dentry_expire on " << *dn << " from mds." << from << dendl;
>> - dentry_remove_replica(dn, from);
>> + dentry_remove_replica(dn, from, gather_locks);
>> }
>> else {
>> dout(7) << " dentry_expire on " << *dn << " from mds." << from
>> @@ -6343,6 +6350,8 @@ void MDCache::handle_cache_expire(MCacheExpire *m)
>> }
>> }
>>
>> + for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p)
>> + mds->locker->eval_gather(*p);
>>
>> // done
>> m->put();
>> @@ -6368,35 +6377,35 @@ void MDCache::discard_delayed_expire(CDir *dir)
>> delayed_expire.erase(dir);
>> }
>>
>> -void MDCache::inode_remove_replica(CInode *in, int from)
>> +void MDCache::inode_remove_replica(CInode *in, int from, set<SimpleLock *>& gather_locks)
>> {
>> in->remove_replica(from);
>> in->mds_caps_wanted.erase(from);
>>
>> // note: this code calls _eval more often than it needs to!
>> // fix lock
>> - if (in->authlock.remove_replica(from)) mds->locker->eval_gather(&in->authlock);
>> - if (in->linklock.remove_replica(from)) mds->locker->eval_gather(&in->linklock);
>> - if (in->dirfragtreelock.remove_replica(from)) mds->locker->eval_gather(&in->dirfragtreelock);
>> - if (in->filelock.remove_replica(from)) mds->locker->eval_gather(&in->filelock);
>> - if (in->snaplock.remove_replica(from)) mds->locker->eval_gather(&in->snaplock);
>> - if (in->xattrlock.remove_replica(from)) mds->locker->eval_gather(&in->xattrlock);
>> + if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock);
>> + if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock);
>> + if (in->dirfragtreelock.remove_replica(from)) gather_locks.insert(&in->dirfragtreelock);
>> + if (in->filelock.remove_replica(from)) gather_locks.insert(&in->filelock);
>> + if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock);
>> + if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock);
>>
>> - if (in->nestlock.remove_replica(from)) mds->locker->eval_gather(&in->nestlock);
>> - if (in->flocklock.remove_replica(from)) mds->locker->eval_gather(&in->flocklock);
>> - if (in->policylock.remove_replica(from)) mds->locker->eval_gather(&in->policylock);
>> + if (in->nestlock.remove_replica(from)) gather_locks.insert(&in->nestlock);
>> + if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
>> + if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);
>>
>> // trim?
>> maybe_eval_stray(in);
>> }
>>
>> -void MDCache::dentry_remove_replica(CDentry *dn, int from)
>> +void MDCache::dentry_remove_replica(CDentry *dn, int from, set<SimpleLock *>& gather_locks)
>> {
>> dn->remove_replica(from);
>>
>> // fix lock
>> if (dn->lock.remove_replica(from))
>> - mds->locker->eval_gather(&dn->lock);
>> + gather_locks.insert(&dn->lock);
>>
>> CDentry::linkage_t *dnl = dn->get_projected_linkage();
>> if (dnl->is_primary())
>> diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
>> index f07ea74..a9f05c6 100644
>> --- a/src/mds/MDCache.h
>> +++ b/src/mds/MDCache.h
>> @@ -406,7 +406,9 @@ protected:
>> CDir* rejoin_invent_dirfrag(dirfrag_t df);
>> bool rejoin_fetch_dirfrags(MMDSCacheRejoin *m);
>> void handle_cache_rejoin_strong(MMDSCacheRejoin *m);
>> - void rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack, set<vinodeno_t>& acked_inodes);
>> + void rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack,
>> + set<SimpleLock *>& gather_locks,
>> + set<vinodeno_t>& acked_inodes);
>> void handle_cache_rejoin_ack(MMDSCacheRejoin *m);
>> void handle_cache_rejoin_purge(MMDSCacheRejoin *m);
>> void handle_cache_rejoin_missing(MMDSCacheRejoin *m);
>> @@ -607,8 +609,8 @@ public:
>> }
>> protected:
>>
>> - void inode_remove_replica(CInode *in, int rep);
>> - void dentry_remove_replica(CDentry *dn, int rep);
>> + void inode_remove_replica(CInode *in, int rep, set<SimpleLock *>& gather_locks);
>> + void dentry_remove_replica(CDentry *dn, int rep, set<SimpleLock *>& gather_locks);
>>
>> void rename_file(CDentry *srcdn, CDentry *destdn);
>>
>> --
>> 1.7.11.7
>
>
next prev parent reply other threads:[~2013-03-21 2:30 UTC|newest]
Thread overview: 117+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
2013-03-17 14:51 ` [PATCH 01/39] mds: preserve subtree bounds until slave commit Yan, Zheng
2013-03-20 18:33 ` Greg Farnum
2013-03-17 14:51 ` [PATCH 02/39] mds: process finished contexts in batch Yan, Zheng
2013-03-20 18:33 ` Greg Farnum
2013-03-17 14:51 ` [PATCH 03/39] mds: fix MDCache::adjust_bounded_subtree_auth() Yan, Zheng
2013-03-20 18:33 ` Greg Farnum
2013-03-17 14:51 ` [PATCH 04/39] mds: make sure table request id unique Yan, Zheng
2013-03-19 23:09 ` Greg Farnum
2013-03-20 5:53 ` Yan, Zheng
2013-03-20 6:15 ` Sage Weil
2013-03-20 6:24 ` Yan, Zheng
2013-03-20 6:49 ` Yan, Zheng
2013-03-20 18:31 ` Greg Farnum
2013-03-21 8:07 ` Yan, Zheng
2013-03-21 22:03 ` Gregory Farnum
2013-03-25 11:30 ` Yan, Zheng
2013-03-29 22:12 ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 05/39] mds: send table request when peer is in proper state Yan, Zheng
2013-03-20 18:34 ` Greg Farnum
2013-03-29 21:58 ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 06/39] mds: make table client/server tolerate duplicated message Yan, Zheng
2013-03-29 22:00 ` Gregory Farnum
2013-03-31 13:21 ` Yan, Zheng
2013-03-17 14:51 ` [PATCH 07/39] mds: mark connection down when MDS fails Yan, Zheng
2013-03-20 18:37 ` Greg Farnum
2013-03-17 14:51 ` [PATCH 08/39] mds: consider MDS as recovered when it reaches clientreply state Yan, Zheng
2013-03-20 18:40 ` Greg Farnum
2013-03-21 2:22 ` Yan, Zheng
2013-03-21 21:43 ` Gregory Farnum
2013-03-20 19:09 ` Greg Farnum
2013-03-17 14:51 ` [PATCH 09/39] mds: defer eval gather locks when removing replica Yan, Zheng
2013-03-20 19:36 ` Greg Farnum
2013-03-21 2:29 ` Yan, Zheng [this message]
2013-03-17 14:51 ` [PATCH 10/39] mds: unify slave request waiting Yan, Zheng
2013-03-20 22:52 ` Sage Weil
2013-03-17 14:51 ` [PATCH 11/39] mds: don't delay processing replica buffer in slave request Yan, Zheng
2013-03-20 21:19 ` Greg Farnum
2013-03-21 2:38 ` Yan, Zheng
2013-03-21 4:15 ` Sage Weil
2013-03-21 21:48 ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 12/39] mds: compose and send resolve messages in batch Yan, Zheng
2013-03-20 21:45 ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 13/39] mds: don't send resolve message between active MDS Yan, Zheng
2013-03-20 21:56 ` Gregory Farnum
2013-03-21 2:55 ` Yan, Zheng
2013-03-21 21:55 ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 14/39] mds: set resolve/rejoin gather MDS set in advance Yan, Zheng
2013-03-20 22:09 ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 15/39] mds: don't send MDentry{Link,Unlink} before receiving cache rejoin Yan, Zheng
2013-03-20 22:17 ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 16/39] mds: send cache rejoin messages after gathering all resolves Yan, Zheng
2013-03-20 22:57 ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 17/39] mds: send resolve acks after master updates are safely logged Yan, Zheng
2013-03-20 22:58 ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 18/39] mds: fix MDS recovery involving cross authority rename Yan, Zheng
2013-03-21 17:59 ` Gregory Farnum
2013-03-22 3:04 ` Yan, Zheng
2013-03-29 22:02 ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 19/39] mds: remove MDCache::rejoin_fetch_dirfrags() Yan, Zheng
2013-03-20 22:58 ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 20/39] mds: include replica nonce in MMDSCacheRejoin::inode_strong Yan, Zheng
2013-03-20 23:26 ` Gregory Farnum
2013-03-20 23:36 ` Sage Weil
2013-03-17 14:51 ` [PATCH 21/39] mds: encode dirfrag base in cache rejoin ack Yan, Zheng
2013-03-20 23:33 ` Gregory Farnum
2013-03-20 23:40 ` Gregory Farnum
2013-03-21 6:41 ` Yan, Zheng
2013-03-21 21:58 ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 22/39] mds: handle linkage mismatch during cache rejoin Yan, Zheng
2013-03-21 21:23 ` Gregory Farnum
2013-03-22 3:05 ` Yan, Zheng
2013-03-25 16:14 ` Gregory Farnum
2013-03-26 7:21 ` Yan, Zheng
2013-03-29 22:09 ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 23/39] mds: reqid for rejoinning authpin/wrlock need to be list Yan, Zheng
2013-03-20 23:59 ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 24/39] mds: take object's versionlock when rejoinning xlock Yan, Zheng
2013-03-21 0:37 ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 25/39] mds: share inode max size after MDS recovers Yan, Zheng
2013-03-21 0:45 ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 26/39] mds: issue caps when lock state in replica become SYNC Yan, Zheng
2013-03-21 0:52 ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 27/39] mds: send lock action message when auth MDS is in proper state Yan, Zheng
2013-03-21 3:12 ` Gregory Farnum
2013-03-21 3:20 ` Yan, Zheng
2013-03-17 14:51 ` [PATCH 28/39] mds: add dirty imported dirfrag to LogSegment Yan, Zheng
2013-03-21 3:14 ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 29/39] mds: avoid double auth pin for file recovery Yan, Zheng
2013-03-21 3:20 ` Gregory Farnum
2013-03-21 3:33 ` Yan, Zheng
2013-03-21 4:20 ` Sage Weil
2013-03-21 21:58 ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 30/39] mds: check MDS peer's state through mdsmap Yan, Zheng
2013-03-21 3:24 ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 31/39] mds: unfreeze subtree if import aborts in PREPPED state Yan, Zheng
2013-03-21 3:27 ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 32/39] mds: fix export cancel notification Yan, Zheng
2013-03-21 3:31 ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 33/39] mds: notify bystanders if export aborts Yan, Zheng
2013-03-21 3:34 ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 34/39] mds: don't open dirfrag while subtree is frozen Yan, Zheng
2013-03-21 3:38 ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 35/39] mds: clear dirty inode rstat if import fails Yan, Zheng
2013-03-21 3:40 ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 36/39] mds: try merging subtree after clear EXPORTBOUND Yan, Zheng
2013-03-21 3:44 ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 37/39] mds: eval inodes with caps imported by cache rejoin message Yan, Zheng
2013-03-21 3:45 ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 38/39] mds: don't replicate purging dentry Yan, Zheng
2013-03-21 3:46 ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 39/39] mds: clear scatter dirty if replica inode has no auth subtree Yan, Zheng
2013-03-21 3:49 ` Gregory Farnum
2013-04-01 8:46 ` [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
2013-04-01 17:00 ` Gregory Farnum
2013-04-01 8:51 ` [PATCH] mds: avoid sending duplicated table prepare/commit Yan, Zheng
2013-04-01 8:51 ` [PATCH] mds: don't roll back prepared table updates Yan, Zheng
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=514A70A6.7020204@intel.com \
--to=zheng.z.yan@intel.com \
--cc=ceph-devel@vger.kernel.org \
--cc=greg@inktank.com \
--cc=sage@inktank.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.