All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/8] misc fixes for mds
@ 2013-06-17 12:10 Yan, Zheng
  2013-06-17 12:10 ` [PATCH 1/8] mds: don't update migrate_seq when importing non-auth cap Yan, Zheng
                   ` (8 more replies)
  0 siblings, 9 replies; 13+ messages in thread
From: Yan, Zheng @ 2013-06-17 12:10 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

these patches are also in:
  git://github.com/ukernel/ceph.git wip-mds

Regards
Yan, Zheng

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH 1/8] mds: don't update migrate_seq when importing non-auth cap
  2013-06-17 12:10 [PATCH 0/8] misc fixes for mds Yan, Zheng
@ 2013-06-17 12:10 ` Yan, Zheng
  2013-06-17 12:10 ` [PATCH 2/8] mds: fix frozen check in Server::try_open_auth_dirfrag() Yan, Zheng
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 13+ messages in thread
From: Yan, Zheng @ 2013-06-17 12:10 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

We use migrate_seq to distinguish old and new auth MDS. So we should
not change migrate_seq when importing non-auth cap.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/Capability.h | 5 +++--
 src/mds/Migrator.cc  | 8 ++++----
 src/mds/Migrator.h   | 3 ++-
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/mds/Capability.h b/src/mds/Capability.h
index 54d2312..fdecb90 100644
--- a/src/mds/Capability.h
+++ b/src/mds/Capability.h
@@ -273,7 +273,7 @@ public:
     return Export(_wanted, issued(), pending(), client_follows, mseq+1, last_issue_stamp);
   }
   void rejoin_import() { mseq++; }
-  void merge(Export& other) {
+  void merge(Export& other, bool auth_cap) {
     // issued + pending
     int newpending = other.pending | pending();
     if (other.issued & ~newpending)
@@ -286,7 +286,8 @@ public:
 
     // wanted
     _wanted = _wanted | other.wanted;
-    mseq = other.mseq;
+    if (auth_cap)
+      mseq = other.mseq;
   }
   void merge(int otherwanted, int otherissued) {
     // issued + pending
diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
index 6ea28c9..0647448 100644
--- a/src/mds/Migrator.cc
+++ b/src/mds/Migrator.cc
@@ -2223,7 +2223,7 @@ void Migrator::import_logged_start(dirfrag_t df, CDir *dir, int from,
   for (map<CInode*, map<client_t,Capability::Export> >::iterator p = import_caps[dir].begin();
        p != import_caps[dir].end();
        ++p) {
-    finish_import_inode_caps(p->first, from, p->second);
+    finish_import_inode_caps(p->first, true, p->second);
   }
   
   // send notify's etc.
@@ -2398,7 +2398,7 @@ void Migrator::decode_import_inode_caps(CInode *in,
   }
 }
 
-void Migrator::finish_import_inode_caps(CInode *in, int from, 
+void Migrator::finish_import_inode_caps(CInode *in, bool auth_cap,
 					map<client_t,Capability::Export> &cap_map)
 {
   for (map<client_t,Capability::Export>::iterator it = cap_map.begin();
@@ -2412,7 +2412,7 @@ void Migrator::finish_import_inode_caps(CInode *in, int from,
     if (!cap) {
       cap = in->add_client_cap(it->first, session);
     }
-    cap->merge(it->second);
+    cap->merge(it->second, auth_cap);
 
     mds->mdcache->do_cap_import(session, in, cap);
   }
@@ -2688,7 +2688,7 @@ void Migrator::logged_import_caps(CInode *in,
   mds->server->finish_force_open_sessions(client_map, sseqmap);
 
   assert(cap_imports.count(in));
-  finish_import_inode_caps(in, from, cap_imports[in]);  
+  finish_import_inode_caps(in, false, cap_imports[in]);
   mds->locker->eval(in, CEPH_CAP_LOCKS, true);
 
   mds->send_message_mds(new MExportCapsAck(in->ino()), from);
diff --git a/src/mds/Migrator.h b/src/mds/Migrator.h
index 70b59bc..afe2e6c 100644
--- a/src/mds/Migrator.h
+++ b/src/mds/Migrator.h
@@ -256,7 +256,8 @@ public:
   void decode_import_inode_caps(CInode *in,
 				bufferlist::iterator &blp,
 				map<CInode*, map<client_t,Capability::Export> >& cap_imports);
-  void finish_import_inode_caps(CInode *in, int from, map<client_t,Capability::Export> &cap_map);
+  void finish_import_inode_caps(CInode *in, bool auth_cap,
+			        map<client_t,Capability::Export> &cap_map);
   int decode_import_dir(bufferlist::iterator& blp,
 			int oldauth,
 			CDir *import_root,
-- 
1.8.1.4


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 2/8] mds: fix frozen check in Server::try_open_auth_dirfrag()
  2013-06-17 12:10 [PATCH 0/8] misc fixes for mds Yan, Zheng
  2013-06-17 12:10 ` [PATCH 1/8] mds: don't update migrate_seq when importing non-auth cap Yan, Zheng
@ 2013-06-17 12:10 ` Yan, Zheng
  2013-06-17 12:10 ` [PATCH 3/8] mds: handle undefined dirfrags when opening inode Yan, Zheng
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 13+ messages in thread
From: Yan, Zheng @ 2013-06-17 12:10 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/Server.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index 253c56d..c3162e7 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -2204,7 +2204,7 @@ CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequest *mdr)
   }
 
   // not open and inode frozen?
-  if (!dir && diri->is_frozen_dir()) {
+  if (!dir && diri->is_frozen()) {
     dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
     assert(diri->get_parent_dir());
     diri->get_parent_dir()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
-- 
1.8.1.4


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 3/8] mds: handle undefined dirfrags when opening inode
  2013-06-17 12:10 [PATCH 0/8] misc fixes for mds Yan, Zheng
  2013-06-17 12:10 ` [PATCH 1/8] mds: don't update migrate_seq when importing non-auth cap Yan, Zheng
  2013-06-17 12:10 ` [PATCH 2/8] mds: fix frozen check in Server::try_open_auth_dirfrag() Yan, Zheng
@ 2013-06-17 12:10 ` Yan, Zheng
  2013-06-17 12:10 ` [PATCH 4/8] mds: try purging stray inode after storing backtrace Yan, Zheng
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 13+ messages in thread
From: Yan, Zheng @ 2013-06-17 12:10 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

When MDS is rejoin stage, cache rejoin message can add undefined
inodes and dirfrags to the cache. These undefined objects can affect
"lookup-by-ino" processes. If an undefined dirfrag is encountered,
we should fetch it from disk.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDCache.cc | 37 +++++++++++++++++++++++++++++++++----
 src/mds/MDCache.h  |  1 +
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 2b7ad71..b9b154d 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -8015,6 +8015,29 @@ void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int r
   do_open_ino(ino, info, ret);
 }
 
+void MDCache::_open_ino_fetch_dir(inodeno_t ino, MMDSOpenIno *m, CDir *dir)
+{
+  if (dir->state_test(CDir::STATE_REJOINUNDEF) && dir->get_frag() == frag_t()) {
+    rejoin_undef_dirfrags.erase(dir);
+    dir->state_clear(CDir::STATE_REJOINUNDEF);
+
+    CInode *diri = dir->get_inode();
+    diri->force_dirfrags();
+    list<CDir*> ls;
+    diri->get_dirfrags(ls);
+
+    C_GatherBuilder gather(g_ceph_context, _open_ino_get_waiter(ino, m));
+    for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
+      rejoin_undef_dirfrags.insert(*p);
+      (*p)->state_set(CDir::STATE_REJOINUNDEF);
+      (*p)->fetch(gather.new_sub());
+    }
+    assert(gather.has_subs());
+    gather.activate();
+  } else
+    dir->fetch(_open_ino_get_waiter(ino, m));
+}
+
 int MDCache::open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m,
 				   vector<inode_backpointer_t>& ancestors,
 				   bool discover, bool want_xlocked, int *hint)
@@ -8032,8 +8055,14 @@ int MDCache::open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m,
       continue;
     }
 
-    if (diri->state_test(CInode::STATE_REJOINUNDEF))
-      continue;
+    if (diri->state_test(CInode::STATE_REJOINUNDEF)) {
+      CDir *dir = diri->get_parent_dir();
+      while (dir->state_test(CDir::STATE_REJOINUNDEF) &&
+	     dir->get_inode()->state_test(CInode::STATE_REJOINUNDEF))
+	dir = dir->get_inode()->get_parent_dir();
+      _open_ino_fetch_dir(ino, m, dir);
+      return 1;
+    }
 
     if (!diri->is_dir()) {
       dout(10) << " " << *diri << " is not dir" << dendl;
@@ -8067,14 +8096,14 @@ int MDCache::open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m,
 	if (dnl && dnl->is_primary() &&
 	    dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) {
 	  dout(10) << " fetching undef " << *dnl->get_inode() << dendl;
-	  dir->fetch(_open_ino_get_waiter(ino, m));
+	  _open_ino_fetch_dir(ino, m, dir);
 	  return 1;
 	}
 
 	if (!dnl && !dir->is_complete() &&
 	    (!dir->has_bloom() || dir->is_in_bloom(name))) {
 	  dout(10) << " fetching incomplete " << *dir << dendl;
-	  dir->fetch(_open_ino_get_waiter(ino, m));
+	  _open_ino_fetch_dir(ino, m, dir);
 	  return 1;
 	}
 
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index 3da8a36..36a322c 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -790,6 +790,7 @@ protected:
   void _open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err);
   void _open_ino_parent_opened(inodeno_t ino, int ret);
   void _open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int err);
+  void _open_ino_fetch_dir(inodeno_t ino, MMDSOpenIno *m, CDir *dir);
   Context* _open_ino_get_waiter(inodeno_t ino, MMDSOpenIno *m);
   int open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m,
 			    vector<inode_backpointer_t>& ancestors,
-- 
1.8.1.4


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 4/8] mds: try purging stray inode after storing backtrace
  2013-06-17 12:10 [PATCH 0/8] misc fixes for mds Yan, Zheng
                   ` (2 preceding siblings ...)
  2013-06-17 12:10 ` [PATCH 3/8] mds: handle undefined dirfrags when opening inode Yan, Zheng
@ 2013-06-17 12:10 ` Yan, Zheng
  2013-06-17 12:10 ` [PATCH 5/8] mds: fix cross-authorty rename race Yan, Zheng
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 13+ messages in thread
From: Yan, Zheng @ 2013-06-17 12:10 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

Inode is auth pinned and can't be purged while storing backtrace,
so we should try purging stray inode after storing backtrace.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/CInode.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index 0e14293..4a592bc 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -1069,11 +1069,12 @@ void CInode::_stored_backtrace(version_t v, Context *fin)
 {
   dout(10) << "_stored_backtrace" << dendl;
 
+  auth_unpin(this);
   if (v == inode.backtrace_version)
     clear_dirty_parent();
-  auth_unpin(this);
   if (fin)
     fin->complete(0);
+  mdcache->maybe_eval_stray(this);
 }
 
 void CInode::_mark_dirty_parent(LogSegment *ls, bool dirty_pool)
-- 
1.8.1.4


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 5/8] mds: fix cross-authorty rename race
  2013-06-17 12:10 [PATCH 0/8] misc fixes for mds Yan, Zheng
                   ` (3 preceding siblings ...)
  2013-06-17 12:10 ` [PATCH 4/8] mds: try purging stray inode after storing backtrace Yan, Zheng
@ 2013-06-17 12:10 ` Yan, Zheng
  2013-06-17 12:10 ` [PATCH 6/8] mds: don't journal bare dirfrag Yan, Zheng
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 13+ messages in thread
From: Yan, Zheng @ 2013-06-17 12:10 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

When doing cross-authorty rename, we need to make sure bystanders
have received all messages sent by inode's original auth MDS,
then change inode's authorty. Otherwise lock messages sent by the
original/new auth MDS can arrive bystanders out of order. The fix
is: inode's original auth MDS sends notify messages to bystanders,
performs slave rename after receiving all bystanders' notify acks.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDCache.cc              | 31 ++++++++++++++++++++-----
 src/mds/Server.cc               | 51 +++++++++++++++++++++++++++++++++++++++--
 src/mds/Server.h                |  1 +
 src/messages/MMDSSlaveRequest.h |  6 +++++
 4 files changed, 81 insertions(+), 8 deletions(-)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index b9b154d..2b0029f 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -2651,6 +2651,15 @@ void MDCache::handle_mds_failure(int who)
     if (p->second->slave_to_mds == who) {
       if (p->second->slave_did_prepare()) {
 	dout(10) << " slave request " << *p->second << " uncommitted, will resolve shortly" << dendl;
+	if (!p->second->more()->waiting_on_slave.empty()) {
+	  assert(p->second->more()->srcdn_auth_mds == mds->get_nodeid());
+	  // will rollback, no need to wait
+	  if (p->second->slave_request) {
+	    p->second->slave_request->put();
+	    p->second->slave_request = 0;
+	  }
+	  p->second->more()->waiting_on_slave.clear();
+	}
       } else {
 	dout(10) << " slave request " << *p->second << " has no prepare, finishing up" << dendl;
 	if (p->second->slave_request)
@@ -2660,12 +2669,22 @@ void MDCache::handle_mds_failure(int who)
       }
     }
 
-    if (p->second->is_slave() &&
-	p->second->slave_did_prepare() && p->second->more()->srcdn_auth_mds == who &&
-	mds->mdsmap->is_clientreplay_or_active_or_stopping(p->second->slave_to_mds)) {
-      // rename srcdn's auth mds failed, resolve even I'm a survivor.
-      dout(10) << " slave request " << *p->second << " uncommitted, will resolve shortly" << dendl;
-      add_ambiguous_slave_update(p->first, p->second->slave_to_mds);
+    if (p->second->is_slave() && p->second->slave_did_prepare()) {
+      if (p->second->more()->waiting_on_slave.count(who)) {
+	assert(p->second->more()->srcdn_auth_mds == mds->get_nodeid());
+	dout(10) << " slave request " << *p->second << " no longer need rename notity ack from mds."
+		 << who << dendl;
+	p->second->more()->waiting_on_slave.erase(who);
+	if (p->second->more()->waiting_on_slave.empty())
+	  mds->queue_waiter(new C_MDS_RetryRequest(this, p->second));
+      }
+
+      if (p->second->more()->srcdn_auth_mds == who &&
+	  mds->mdsmap->is_clientreplay_or_active_or_stopping(p->second->slave_to_mds)) {
+	// rename srcdn's auth mds failed, resolve even I'm a survivor.
+	dout(10) << " slave request " << *p->second << " uncommitted, will resolve shortly" << dendl;
+	add_ambiguous_slave_update(p->first, p->second->slave_to_mds);
+      }
     }
     
     // failed node is slave?
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index c3162e7..00bf018 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -1280,6 +1280,16 @@ void Server::handle_slave_request(MMDSSlaveRequest *m)
   if (m->is_reply())
     return handle_slave_request_reply(m);
 
+  // the purpose of rename notify is enforcing causal message ordering. making sure
+  // bystanders have received all messages from rename srcdn's auth MDS.
+  if (m->get_op() == MMDSSlaveRequest::OP_RENAMENOTIFY) {
+    MMDSSlaveRequest *reply = new MMDSSlaveRequest(m->get_reqid(), m->get_attempt(),
+						   MMDSSlaveRequest::OP_RENAMENOTIFYACK);
+    mds->send_message(reply, m->get_connection());
+    m->put();
+    return;
+  }
+
   CDentry *straydn = NULL;
   if (m->stray.length() > 0) {
     straydn = mdcache->add_replica_stray(m->stray, from);
@@ -1432,6 +1442,10 @@ void Server::handle_slave_request_reply(MMDSSlaveRequest *m)
     handle_slave_rename_prep_ack(mdr, m);
     break;
 
+  case MMDSSlaveRequest::OP_RENAMENOTIFYACK:
+    handle_slave_rename_notify_ack(mdr, m);
+    break;
+
   default:
     assert(0);
   }
@@ -6560,6 +6574,9 @@ void Server::handle_slave_rename_prep(MDRequest *mdr)
 
   // am i srcdn auth?
   if (srcdn->is_auth()) {
+    set<int> srcdnrep;
+    srcdn->list_replicas(srcdnrep);
+
     bool reply_witness = false;
     if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
       // freeze?
@@ -6594,12 +6611,19 @@ void Server::handle_slave_rename_prep(MDRequest *mdr)
       if (mdr->slave_request->witnesses.size() > 1) {
 	dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
 	reply_witness = true;
+	for (set<int>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
+	  if (*p == mdr->slave_to_mds ||
+	      !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p))
+	    continue;
+	  MMDSSlaveRequest *notify = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
+							  MMDSSlaveRequest::OP_RENAMENOTIFY);
+	  mds->send_message_mds(notify, *p);
+	  mdr->more()->waiting_on_slave.insert(*p);
+	}
       }
     }
 
     // is witness list sufficient?
-    set<int> srcdnrep;
-    srcdn->list_replicas(srcdnrep);
     for (set<int>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
       if (*p == mdr->slave_to_mds ||
 	  mdr->slave_request->witnesses.count(*p)) continue;
@@ -6619,6 +6643,11 @@ void Server::handle_slave_rename_prep(MDRequest *mdr)
       return;	
     }
     dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
+    if (!mdr->more()->waiting_on_slave.empty()) {
+      dout(10) << " still waiting for rename notify acks from "
+	       << mdr->more()->waiting_on_slave << dendl;
+      return;
+    }
   } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
     // set ambiguous auth for srci on witnesses
     mdr->set_ambiguous_auth(srcdnl->get_inode());
@@ -7187,6 +7216,24 @@ void Server::handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *ack)
     dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
 }
 
+void Server::handle_slave_rename_notify_ack(MDRequest *mdr, MMDSSlaveRequest *ack)
+{
+  dout(10) << "handle_slave_rename_notify_ack " << *mdr << " from mds."
+	   << ack->get_source() << dendl;
+  assert(mdr->is_slave());
+  int from = ack->get_source().num();
+
+  if (mdr->more()->waiting_on_slave.count(from)) {
+    mdr->more()->waiting_on_slave.erase(from);
+
+    if (mdr->more()->waiting_on_slave.empty()) {
+      if (mdr->slave_request)
+	dispatch_slave_request(mdr);
+    } else 
+      dout(10) << " still waiting for rename notify acks from "
+	       << mdr->more()->waiting_on_slave << dendl;
+  }
+}
 
 
 
diff --git a/src/mds/Server.h b/src/mds/Server.h
index 35a405b..269b286 100644
--- a/src/mds/Server.h
+++ b/src/mds/Server.h
@@ -242,6 +242,7 @@ public:
   // slaving
   void handle_slave_rename_prep(MDRequest *mdr);
   void handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m);
+  void handle_slave_rename_notify_ack(MDRequest *mdr, MMDSSlaveRequest *m);
   void _logged_slave_rename(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn);
   void _commit_slave_rename(MDRequest *mdr, int r, CDentry *srcdn, CDentry *destdn, CDentry *straydn);
   void do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr, bool finish_mdr=false);
diff --git a/src/messages/MMDSSlaveRequest.h b/src/messages/MMDSSlaveRequest.h
index 35af81d..f8f30b2 100644
--- a/src/messages/MMDSSlaveRequest.h
+++ b/src/messages/MMDSSlaveRequest.h
@@ -43,6 +43,9 @@ class MMDSSlaveRequest : public Message {
 
   static const int OP_DROPLOCKS	= 11;
 
+  static const int OP_RENAMENOTIFY = 12;
+  static const int OP_RENAMENOTIFYACK = -12;
+
   static const int OP_FINISH = 17;  
   static const int OP_COMMITTED = -18;  
 
@@ -77,6 +80,9 @@ class MMDSSlaveRequest : public Message {
 
     case OP_DROPLOCKS: return "drop_locks";
 
+    case OP_RENAMENOTIFY: return "reame_notify";
+    case OP_RENAMENOTIFYACK: return "rename_notify_ack";
+
     case OP_ABORT: return "abort";
       //case OP_COMMIT: return "commit";
 
-- 
1.8.1.4


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 6/8] mds: don't journal bare dirfrag
  2013-06-17 12:10 [PATCH 0/8] misc fixes for mds Yan, Zheng
                   ` (4 preceding siblings ...)
  2013-06-17 12:10 ` [PATCH 5/8] mds: fix cross-authorty rename race Yan, Zheng
@ 2013-06-17 12:10 ` Yan, Zheng
  2013-06-19 15:20   ` Sage Weil
  2013-06-17 12:10 ` [PATCH 7/8] mds: fix race between scatter gather and dirfrag export Yan, Zheng
                   ` (2 subsequent siblings)
  8 siblings, 1 reply; 13+ messages in thread
From: Yan, Zheng @ 2013-06-17 12:10 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

don't journal bare dirfrag when starting scatter. also add debug
code for bare dirfrag modification.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/CDir.cc   | 1 +
 src/mds/CInode.cc | 2 ++
 src/mds/Server.cc | 5 +++--
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
index 211cec0..8c83eba 100644
--- a/src/mds/CDir.cc
+++ b/src/mds/CDir.cc
@@ -1211,6 +1211,7 @@ void CDir::finish_waiting(uint64_t mask, int result)
 
 fnode_t *CDir::project_fnode()
 {
+  assert(get_version() != 0);
   fnode_t *p = new fnode_t;
   *p = *get_projected_fnode();
   projected_fnode.push_back(p);
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index 4a592bc..8936acd 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -1625,6 +1625,8 @@ void CInode::finish_scatter_update(ScatterLock *lock, CDir *dir,
 
   if (dir->is_frozen()) {
     dout(10) << "finish_scatter_update " << fg << " frozen, marking " << *lock << " stale " << *dir << dendl;
+  } else if (dir->get_version() == 0) {
+    dout(10) << "finish_scatter_update " << fg << " not loaded, marking " << *lock << " stale " << *dir << dendl;
   } else {
     if (dir_accounted_version != inode_version) {
       dout(10) << "finish_scatter_update " << fg << " journaling accounted scatterstat update v" << inode_version << dendl;
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index 00bf018..1d16d04 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -4010,7 +4010,8 @@ public:
     if (newi->inode.is_dir()) { 
       CDir *dir = newi->get_dirfrag(frag_t());
       assert(dir);
-      dir->mark_dirty(1, mdr->ls);
+      dir->fnode.version--;
+      dir->mark_dirty(dir->fnode.version + 1, mdr->ls);
       dir->mark_new(mdr->ls);
     }
 
@@ -4169,7 +4170,7 @@ void Server::handle_client_mkdir(MDRequest *mdr)
   // ...and that new dir is empty.
   CDir *newdir = newi->get_or_open_dirfrag(mds->mdcache, frag_t());
   newdir->mark_complete();
-  newdir->pre_dirty();
+  newdir->fnode.version = newdir->pre_dirty();
 
   // prepare finisher
   mdr->ls = mdlog->get_current_segment();
-- 
1.8.1.4


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 7/8] mds: fix race between scatter gather and dirfrag export
  2013-06-17 12:10 [PATCH 0/8] misc fixes for mds Yan, Zheng
                   ` (5 preceding siblings ...)
  2013-06-17 12:10 ` [PATCH 6/8] mds: don't journal bare dirfrag Yan, Zheng
@ 2013-06-17 12:10 ` Yan, Zheng
  2013-06-19 15:23   ` Sage Weil
  2013-06-17 12:10 ` [PATCH 8/8] mds: fix remote wrlock rejoin Yan, Zheng
  2013-06-19 15:33 ` [PATCH 0/8] misc fixes for mds Sage Weil
  8 siblings, 1 reply; 13+ messages in thread
From: Yan, Zheng @ 2013-06-17 12:10 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

If we gather dirty scatter lock state while corresponding dirfrag
is been exporting, we may receive different dirfrag states from
two MDS and we need to find which one is the newest. The solution
is adding a new variable "migrate seq" to dirfrag, increase it by
one when dirfrag's auth MDS changes. When gathering dirty scatter
lock state, use "migrate seq" to find the newest dirfrag state.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/CDir.cc    |  4 ++++
 src/mds/CDir.h     |  4 +++-
 src/mds/CInode.cc  | 18 ++++++++++++++++++
 src/mds/MDCache.cc | 13 +++++++++++--
 4 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
index 8c83eba..2b991d7 100644
--- a/src/mds/CDir.cc
+++ b/src/mds/CDir.cc
@@ -154,6 +154,7 @@ ostream& CDir::print_db_line_prefix(ostream& out)
 // CDir
 
 CDir::CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth) :
+  mseq(0),
   dirty_rstat_inodes(member_offset(CInode, dirty_rstat_item)),
   item_dirty(this), item_new(this),
   pop_me(ceph_clock_now(g_ceph_context)),
@@ -2121,6 +2122,8 @@ void CDir::_committed(version_t v)
 void CDir::encode_export(bufferlist& bl)
 {
   assert(!is_projected());
+  ceph_seq_t seq = mseq + 1;
+  ::encode(seq, bl);
   ::encode(first, bl);
   ::encode(fnode, bl);
   ::encode(dirty_old_rstat, bl);
@@ -2150,6 +2153,7 @@ void CDir::finish_export(utime_t now)
 
 void CDir::decode_import(bufferlist::iterator& blp, utime_t now, LogSegment *ls)
 {
+  ::decode(mseq, blp);
   ::decode(first, blp);
   ::decode(fnode, blp);
   ::decode(dirty_old_rstat, blp);
diff --git a/src/mds/CDir.h b/src/mds/CDir.h
index 87c79c2..11f4a76 100644
--- a/src/mds/CDir.h
+++ b/src/mds/CDir.h
@@ -170,6 +170,7 @@ public:
 
   fnode_t fnode;
   snapid_t first;
+  ceph_seq_t mseq; // migrate sequence
   map<snapid_t,old_rstat_t> dirty_old_rstat;  // [value.first,key]
 
   // my inodes with dirty rstat data
@@ -547,7 +548,8 @@ public:
   // -- import/export --
   void encode_export(bufferlist& bl);
   void finish_export(utime_t now);
-  void abort_export() { 
+  void abort_export() {
+    mseq += 2;
     put(PIN_TEMPEXPORTING);
   }
   void decode_import(bufferlist::iterator& blp, utime_t now, LogSegment *ls);
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index 8936acd..c1ce8a1 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -1222,6 +1222,7 @@ void CInode::encode_lock_state(int type, bufferlist& bl)
 	  dout(20) << fg << "           fragstat " << pf->fragstat << dendl;
 	  dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl;
 	  ::encode(fg, tmp);
+	  ::encode(dir->mseq, tmp);
 	  ::encode(dir->first, tmp);
 	  ::encode(pf->fragstat, tmp);
 	  ::encode(pf->accounted_fragstat, tmp);
@@ -1255,6 +1256,7 @@ void CInode::encode_lock_state(int type, bufferlist& bl)
 	  dout(10) << fg << " " << pf->rstat << dendl;
 	  dout(10) << fg << " " << dir->dirty_old_rstat << dendl;
 	  ::encode(fg, tmp);
+	  ::encode(dir->mseq, tmp);
 	  ::encode(dir->first, tmp);
 	  ::encode(pf->rstat, tmp);
 	  ::encode(pf->accounted_rstat, tmp);
@@ -1404,10 +1406,12 @@ void CInode::decode_lock_state(int type, bufferlist& bl)
       dout(10) << " ...got " << n << " fragstats on " << *this << dendl;
       while (n--) {
 	frag_t fg;
+	ceph_seq_t mseq;
 	snapid_t fgfirst;
 	frag_info_t fragstat;
 	frag_info_t accounted_fragstat;
 	::decode(fg, p);
+	::decode(mseq, p);
 	::decode(fgfirst, p);
 	::decode(fragstat, p);
 	::decode(accounted_fragstat, p);
@@ -1420,6 +1424,12 @@ void CInode::decode_lock_state(int type, bufferlist& bl)
 	  assert(dir);                // i am auth; i had better have this dir open
 	  dout(10) << fg << " first " << dir->first << " -> " << fgfirst
 		   << " on " << *dir << dendl;
+	  if (dir->fnode.fragstat.version == inode.dirstat.version &&
+	      ceph_seq_cmp(mseq, dir->mseq) < 0) {
+	    dout(10) << " mseq " << mseq << " < " << dir->mseq << ", ignoring" << dendl;
+	    continue;
+	  }
+	  dir->mseq = mseq;
 	  dir->first = fgfirst;
 	  dir->fnode.fragstat = fragstat;
 	  dir->fnode.accounted_fragstat = accounted_fragstat;
@@ -1462,11 +1472,13 @@ void CInode::decode_lock_state(int type, bufferlist& bl)
       ::decode(n, p);
       while (n--) {
 	frag_t fg;
+	ceph_seq_t mseq;
 	snapid_t fgfirst;
 	nest_info_t rstat;
 	nest_info_t accounted_rstat;
 	map<snapid_t,old_rstat_t> dirty_old_rstat;
 	::decode(fg, p);
+	::decode(mseq, p);
 	::decode(fgfirst, p);
 	::decode(rstat, p);
 	::decode(accounted_rstat, p);
@@ -1481,6 +1493,12 @@ void CInode::decode_lock_state(int type, bufferlist& bl)
 	  assert(dir);                // i am auth; i had better have this dir open
 	  dout(10) << fg << " first " << dir->first << " -> " << fgfirst
 		   << " on " << *dir << dendl;
+	  if (dir->fnode.rstat.version == inode.rstat.version &&
+	      ceph_seq_cmp(mseq, dir->mseq) < 0) {
+	    dout(10) << " mseq " << mseq << " < " << dir->mseq << ", ignoring" << dendl;
+	    continue;
+	  }
+	  dir->mseq = mseq;
 	  dir->first = fgfirst;
 	  dir->fnode.rstat = rstat;
 	  dir->fnode.accounted_rstat = accounted_rstat;
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 2b0029f..f1ebedf 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -3764,6 +3764,7 @@ void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
     dout(15) << " add_strong_dirfrag " << *dir << dendl;
     rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
     dir->state_set(CDir::STATE_REJOINING);
+    dir->mseq = 0;
 
     for (CDir::map_t::iterator p = dir->items.begin();
 	 p != dir->items.end();
@@ -3913,11 +3914,19 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
        ++p) {
     CInode *in = get_inode(p->first);
     assert(in);
+    if (survivor) {
+      list<CDir*> ls;
+      in->get_subtree_dirfrags(ls);
+      for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
+	if ((*q)->get_dir_auth().first == from)
+	  (*q)->mseq = 0;
+      }
+    } else {
+      rejoin_potential_updated_scatterlocks.insert(in);
+    }
     in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file);
     in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest);
     in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft);
-    if (!survivor)
-      rejoin_potential_updated_scatterlocks.insert(in);
   }
 
   // recovering peer may send incorrect dirfrags here.  we need to
-- 
1.8.1.4


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 8/8] mds: fix remote wrlock rejoin
  2013-06-17 12:10 [PATCH 0/8] misc fixes for mds Yan, Zheng
                   ` (6 preceding siblings ...)
  2013-06-17 12:10 ` [PATCH 7/8] mds: fix race between scatter gather and dirfrag export Yan, Zheng
@ 2013-06-17 12:10 ` Yan, Zheng
  2013-06-19 15:33 ` [PATCH 0/8] misc fixes for mds Sage Weil
  8 siblings, 0 replies; 13+ messages in thread
From: Yan, Zheng @ 2013-06-17 12:10 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

remote wrlock's target is not always inode's auth MDS.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDCache.cc | 40 ++++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index f1ebedf..2b127b5 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -4523,25 +4523,29 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
 	mdr->locks.insert(lock);
       }
     }
-    // wrlock(s)?
-    if (strong->wrlocked_inodes.count(in->vino())) {
-      for (map<int, list<MMDSCacheRejoin::slave_reqid> >::iterator q = strong->wrlocked_inodes[in->vino()].begin();
-	   q != strong->wrlocked_inodes[in->vino()].end();
-	   ++q) {
-	SimpleLock *lock = in->get_lock(q->first);
-	for (list<MMDSCacheRejoin::slave_reqid>::iterator r = q->second.begin();
-	     r != q->second.end();
-	     ++r) {
-	  dout(10) << " inode wrlock by " << *r << " on " << *lock << " on " << *in << dendl;
-	  MDRequest *mdr = request_get(r->reqid);  // should have this from auth_pin above.
+  }
+  // wrlock(s)?
+  for (map<vinodeno_t, map<int, list<MMDSCacheRejoin::slave_reqid> > >::iterator p = strong->wrlocked_inodes.begin();
+       p != strong->wrlocked_inodes.end();
+       ++p) {
+    CInode *in = get_inode(p->first);
+    for (map<int, list<MMDSCacheRejoin::slave_reqid> >::iterator q = p->second.begin();
+	 q != p->second.end();
+	++q) {
+      SimpleLock *lock = in->get_lock(q->first);
+      for (list<MMDSCacheRejoin::slave_reqid>::iterator r = q->second.begin();
+	  r != q->second.end();
+	  ++r) {
+	dout(10) << " inode wrlock by " << *r << " on " << *lock << " on " << *in << dendl;
+	MDRequest *mdr = request_get(r->reqid);  // should have this from auth_pin above.
+	if (in->is_auth())
 	  assert(mdr->is_auth_pinned(in));
-	  lock->set_state(LOCK_MIX);
-	  if (lock == &in->filelock)
-	    in->loner_cap = -1;
-	  lock->get_wrlock(true);
-	  mdr->wrlocks.insert(lock);
-	  mdr->locks.insert(lock);
-	}
+	lock->set_state(LOCK_MIX);
+	if (lock == &in->filelock)
+	  in->loner_cap = -1;
+	lock->get_wrlock(true);
+	mdr->wrlocks.insert(lock);
+	mdr->locks.insert(lock);
       }
     }
   }
-- 
1.8.1.4


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH 6/8] mds: don't journal bare dirfrag
  2013-06-17 12:10 ` [PATCH 6/8] mds: don't journal bare dirfrag Yan, Zheng
@ 2013-06-19 15:20   ` Sage Weil
  0 siblings, 0 replies; 13+ messages in thread
From: Sage Weil @ 2013-06-19 15:20 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel

On Mon, 17 Jun 2013, Yan, Zheng wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
> 
> don't journal bare dirfrag when starting scatter. also add debug
> code for bare dirfrag modification.
> 
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/CDir.cc   | 1 +
>  src/mds/CInode.cc | 2 ++
>  src/mds/Server.cc | 5 +++--
>  3 files changed, 6 insertions(+), 2 deletions(-)
> 
> diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
> index 211cec0..8c83eba 100644
> --- a/src/mds/CDir.cc
> +++ b/src/mds/CDir.cc
> @@ -1211,6 +1211,7 @@ void CDir::finish_waiting(uint64_t mask, int result)
>  
>  fnode_t *CDir::project_fnode()
>  {
> +  assert(get_version() != 0);
>    fnode_t *p = new fnode_t;
>    *p = *get_projected_fnode();
>    projected_fnode.push_back(p);
> diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
> index 4a592bc..8936acd 100644
> --- a/src/mds/CInode.cc
> +++ b/src/mds/CInode.cc
> @@ -1625,6 +1625,8 @@ void CInode::finish_scatter_update(ScatterLock *lock, CDir *dir,
>  
>    if (dir->is_frozen()) {
>      dout(10) << "finish_scatter_update " << fg << " frozen, marking " << *lock << " stale " << *dir << dendl;
> +  } else if (dir->get_version() == 0) {
> +    dout(10) << "finish_scatter_update " << fg << " not loaded, marking " << *lock << " stale " << *dir << dendl;
>    } else {
>      if (dir_accounted_version != inode_version) {
>        dout(10) << "finish_scatter_update " << fg << " journaling accounted scatterstat update v" << inode_version << dendl;
> diff --git a/src/mds/Server.cc b/src/mds/Server.cc
> index 00bf018..1d16d04 100644
> --- a/src/mds/Server.cc
> +++ b/src/mds/Server.cc
> @@ -4010,7 +4010,8 @@ public:
>      if (newi->inode.is_dir()) { 
>        CDir *dir = newi->get_dirfrag(frag_t());
>        assert(dir);
> -      dir->mark_dirty(1, mdr->ls);
> +      dir->fnode.version--;
> +      dir->mark_dirty(dir->fnode.version + 1, mdr->ls);

BTW I notice that fnode_t's ctor doesn't initialize version or 
snap_purged_thru (although CDir's ctor does a memset).  Does that mean 
that this is rolling over from (u64)-1 to 0 here?  Would it be simpler to 
make '1' the "don't journal me" value and leave this starting at v1?

>        dir->mark_new(mdr->ls);
>      }
>  
> @@ -4169,7 +4170,7 @@ void Server::handle_client_mkdir(MDRequest *mdr)
>    // ...and that new dir is empty.
>    CDir *newdir = newi->get_or_open_dirfrag(mds->mdcache, frag_t());
>    newdir->mark_complete();
> -  newdir->pre_dirty();
> +  newdir->fnode.version = newdir->pre_dirty();
>  
>    // prepare finisher
>    mdr->ls = mdlog->get_current_segment();
> -- 
> 1.8.1.4
> 
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 7/8] mds: fix race between scatter gather and dirfrag export
  2013-06-17 12:10 ` [PATCH 7/8] mds: fix race between scatter gather and dirfrag export Yan, Zheng
@ 2013-06-19 15:23   ` Sage Weil
  2013-06-19 18:01     ` Yan, Zheng
  0 siblings, 1 reply; 13+ messages in thread
From: Sage Weil @ 2013-06-19 15:23 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel

On Mon, 17 Jun 2013, Yan, Zheng wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
> 
> If we gather dirty scatter lock state while corresponding dirfrag
> is been exporting, we may receive different dirfrag states from
> two MDS and we need to find which one is the newest. The solution
> is adding a new variable "migrate seq" to dirfrag, increase it by
> one when dirfrag's auth MDS changes. When gathering dirty scatter
> lock state, use "migrate seq" to find the newest dirfrag state.

This should bump the CEPH_MDS_PROTOCOL #define in MDS.h

Otherwise, looks godo!
sage

> 
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/CDir.cc    |  4 ++++
>  src/mds/CDir.h     |  4 +++-
>  src/mds/CInode.cc  | 18 ++++++++++++++++++
>  src/mds/MDCache.cc | 13 +++++++++++--
>  4 files changed, 36 insertions(+), 3 deletions(-)
> 
> diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
> index 8c83eba..2b991d7 100644
> --- a/src/mds/CDir.cc
> +++ b/src/mds/CDir.cc
> @@ -154,6 +154,7 @@ ostream& CDir::print_db_line_prefix(ostream& out)
>  // CDir
>  
>  CDir::CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth) :
> +  mseq(0),
>    dirty_rstat_inodes(member_offset(CInode, dirty_rstat_item)),
>    item_dirty(this), item_new(this),
>    pop_me(ceph_clock_now(g_ceph_context)),
> @@ -2121,6 +2122,8 @@ void CDir::_committed(version_t v)
>  void CDir::encode_export(bufferlist& bl)
>  {
>    assert(!is_projected());
> +  ceph_seq_t seq = mseq + 1;
> +  ::encode(seq, bl);
>    ::encode(first, bl);
>    ::encode(fnode, bl);
>    ::encode(dirty_old_rstat, bl);
> @@ -2150,6 +2153,7 @@ void CDir::finish_export(utime_t now)
>  
>  void CDir::decode_import(bufferlist::iterator& blp, utime_t now, LogSegment *ls)
>  {
> +  ::decode(mseq, blp);
>    ::decode(first, blp);
>    ::decode(fnode, blp);
>    ::decode(dirty_old_rstat, blp);
> diff --git a/src/mds/CDir.h b/src/mds/CDir.h
> index 87c79c2..11f4a76 100644
> --- a/src/mds/CDir.h
> +++ b/src/mds/CDir.h
> @@ -170,6 +170,7 @@ public:
>  
>    fnode_t fnode;
>    snapid_t first;
> +  ceph_seq_t mseq; // migrate sequence
>    map<snapid_t,old_rstat_t> dirty_old_rstat;  // [value.first,key]
>  
>    // my inodes with dirty rstat data
> @@ -547,7 +548,8 @@ public:
>    // -- import/export --
>    void encode_export(bufferlist& bl);
>    void finish_export(utime_t now);
> -  void abort_export() { 
> +  void abort_export() {
> +    mseq += 2;
>      put(PIN_TEMPEXPORTING);
>    }
>    void decode_import(bufferlist::iterator& blp, utime_t now, LogSegment *ls);
> diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
> index 8936acd..c1ce8a1 100644
> --- a/src/mds/CInode.cc
> +++ b/src/mds/CInode.cc
> @@ -1222,6 +1222,7 @@ void CInode::encode_lock_state(int type, bufferlist& bl)
>  	  dout(20) << fg << "           fragstat " << pf->fragstat << dendl;
>  	  dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl;
>  	  ::encode(fg, tmp);
> +	  ::encode(dir->mseq, tmp);
>  	  ::encode(dir->first, tmp);
>  	  ::encode(pf->fragstat, tmp);
>  	  ::encode(pf->accounted_fragstat, tmp);
> @@ -1255,6 +1256,7 @@ void CInode::encode_lock_state(int type, bufferlist& bl)
>  	  dout(10) << fg << " " << pf->rstat << dendl;
>  	  dout(10) << fg << " " << dir->dirty_old_rstat << dendl;
>  	  ::encode(fg, tmp);
> +	  ::encode(dir->mseq, tmp);
>  	  ::encode(dir->first, tmp);
>  	  ::encode(pf->rstat, tmp);
>  	  ::encode(pf->accounted_rstat, tmp);
> @@ -1404,10 +1406,12 @@ void CInode::decode_lock_state(int type, bufferlist& bl)
>        dout(10) << " ...got " << n << " fragstats on " << *this << dendl;
>        while (n--) {
>  	frag_t fg;
> +	ceph_seq_t mseq;
>  	snapid_t fgfirst;
>  	frag_info_t fragstat;
>  	frag_info_t accounted_fragstat;
>  	::decode(fg, p);
> +	::decode(mseq, p);
>  	::decode(fgfirst, p);
>  	::decode(fragstat, p);
>  	::decode(accounted_fragstat, p);
> @@ -1420,6 +1424,12 @@ void CInode::decode_lock_state(int type, bufferlist& bl)
>  	  assert(dir);                // i am auth; i had better have this dir open
>  	  dout(10) << fg << " first " << dir->first << " -> " << fgfirst
>  		   << " on " << *dir << dendl;
> +	  if (dir->fnode.fragstat.version == inode.dirstat.version &&
> +	      ceph_seq_cmp(mseq, dir->mseq) < 0) {
> +	    dout(10) << " mseq " << mseq << " < " << dir->mseq << ", ignoring" << dendl;
> +	    continue;
> +	  }
> +	  dir->mseq = mseq;
>  	  dir->first = fgfirst;
>  	  dir->fnode.fragstat = fragstat;
>  	  dir->fnode.accounted_fragstat = accounted_fragstat;
> @@ -1462,11 +1472,13 @@ void CInode::decode_lock_state(int type, bufferlist& bl)
>        ::decode(n, p);
>        while (n--) {
>  	frag_t fg;
> +	ceph_seq_t mseq;
>  	snapid_t fgfirst;
>  	nest_info_t rstat;
>  	nest_info_t accounted_rstat;
>  	map<snapid_t,old_rstat_t> dirty_old_rstat;
>  	::decode(fg, p);
> +	::decode(mseq, p);
>  	::decode(fgfirst, p);
>  	::decode(rstat, p);
>  	::decode(accounted_rstat, p);
> @@ -1481,6 +1493,12 @@ void CInode::decode_lock_state(int type, bufferlist& bl)
>  	  assert(dir);                // i am auth; i had better have this dir open
>  	  dout(10) << fg << " first " << dir->first << " -> " << fgfirst
>  		   << " on " << *dir << dendl;
> +	  if (dir->fnode.rstat.version == inode.rstat.version &&
> +	      ceph_seq_cmp(mseq, dir->mseq) < 0) {
> +	    dout(10) << " mseq " << mseq << " < " << dir->mseq << ", ignoring" << dendl;
> +	    continue;
> +	  }
> +	  dir->mseq = mseq;
>  	  dir->first = fgfirst;
>  	  dir->fnode.rstat = rstat;
>  	  dir->fnode.accounted_rstat = accounted_rstat;
> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> index 2b0029f..f1ebedf 100644
> --- a/src/mds/MDCache.cc
> +++ b/src/mds/MDCache.cc
> @@ -3764,6 +3764,7 @@ void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
>      dout(15) << " add_strong_dirfrag " << *dir << dendl;
>      rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
>      dir->state_set(CDir::STATE_REJOINING);
> +    dir->mseq = 0;
>  
>      for (CDir::map_t::iterator p = dir->items.begin();
>  	 p != dir->items.end();
> @@ -3913,11 +3914,19 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
>         ++p) {
>      CInode *in = get_inode(p->first);
>      assert(in);
> +    if (survivor) {
> +      list<CDir*> ls;
> +      in->get_subtree_dirfrags(ls);
> +      for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
> +	if ((*q)->get_dir_auth().first == from)
> +	  (*q)->mseq = 0;
> +      }
> +    } else {
> +      rejoin_potential_updated_scatterlocks.insert(in);
> +    }
>      in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file);
>      in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest);
>      in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft);
> -    if (!survivor)
> -      rejoin_potential_updated_scatterlocks.insert(in);
>    }
>  
>    // recovering peer may send incorrect dirfrags here.  we need to
> -- 
> 1.8.1.4
> 
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 0/8] misc fixes for mds
  2013-06-17 12:10 [PATCH 0/8] misc fixes for mds Yan, Zheng
                   ` (7 preceding siblings ...)
  2013-06-17 12:10 ` [PATCH 8/8] mds: fix remote wrlock rejoin Yan, Zheng
@ 2013-06-19 15:33 ` Sage Weil
  8 siblings, 0 replies; 13+ messages in thread
From: Sage Weil @ 2013-06-19 15:33 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel

Hi Yan-

On Mon, 17 Jun 2013, Yan, Zheng wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
> 
> these patches are also in:
>   git://github.com/ukernel/ceph.git wip-mds

Aside from the minor nit on the mseq patch, this looks good!  I'll pull in 
the existing branch to wip-mds and make that change on top, then run it 
through the fs suite.

By the way, in the future, these are a bit easier to review if you include 
more detail in the changelog to describe what situations these corner 
cases can happen in.

Thanks!
sage


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 7/8] mds: fix race between scatter gather and dirfrag export
  2013-06-19 15:23   ` Sage Weil
@ 2013-06-19 18:01     ` Yan, Zheng
  0 siblings, 0 replies; 13+ messages in thread
From: Yan, Zheng @ 2013-06-19 18:01 UTC (permalink / raw)
  To: Sage Weil; +Cc: ceph-devel

previous patch is buggy. please review this one.

"git://github.com/ukernel/ceph.git wip-mds" is also updated.

Regards
Yan, Zheng

---
From 15a5d37a90daca11d9dc0a581417d2a07b9d14ad Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Sun, 16 Jun 2013 07:01:52 +0800
Subject: [PATCH 1/2] mds: fix race between scatter gather and dirfrag export

If we gather dirty scatter lock state while corresponding dirfrag
is been exporting, we may receive different dirfrag states from
two MDS and we need to find which one is the newest. The solution
is adding a new variable "migrate seq" to dirfrag, increase it by
one when dirfrag's auth MDS changes. When gathering dirty scatter
lock state, use "migrate seq" to find the newest dirfrag state.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/CDir.cc    |  4 ++++
 src/mds/CDir.h     |  4 +++-
 src/mds/CInode.cc  | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/mds/CInode.h   |  1 +
 src/mds/Locker.cc  | 19 +++++++++++++------
 src/mds/MDCache.cc |  9 +++++++--
 src/mds/MDS.h      |  2 +-
 7 files changed, 77 insertions(+), 10 deletions(-)

diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
index 8c83eba..2b991d7 100644
--- a/src/mds/CDir.cc
+++ b/src/mds/CDir.cc
@@ -154,6 +154,7 @@ ostream& CDir::print_db_line_prefix(ostream& out)
 // CDir
 
 CDir::CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth) :
+  mseq(0),
   dirty_rstat_inodes(member_offset(CInode, dirty_rstat_item)),
   item_dirty(this), item_new(this),
   pop_me(ceph_clock_now(g_ceph_context)),
@@ -2121,6 +2122,8 @@ void CDir::_committed(version_t v)
 void CDir::encode_export(bufferlist& bl)
 {
   assert(!is_projected());
+  ceph_seq_t seq = mseq + 1;
+  ::encode(seq, bl);
   ::encode(first, bl);
   ::encode(fnode, bl);
   ::encode(dirty_old_rstat, bl);
@@ -2150,6 +2153,7 @@ void CDir::finish_export(utime_t now)
 
 void CDir::decode_import(bufferlist::iterator& blp, utime_t now, LogSegment *ls)
 {
+  ::decode(mseq, blp);
   ::decode(first, blp);
   ::decode(fnode, blp);
   ::decode(dirty_old_rstat, blp);
diff --git a/src/mds/CDir.h b/src/mds/CDir.h
index 87c79c2..11f4a76 100644
--- a/src/mds/CDir.h
+++ b/src/mds/CDir.h
@@ -170,6 +170,7 @@ public:
 
   fnode_t fnode;
   snapid_t first;
+  ceph_seq_t mseq; // migrate sequence
   map<snapid_t,old_rstat_t> dirty_old_rstat;  // [value.first,key]
 
   // my inodes with dirty rstat data
@@ -547,7 +548,8 @@ public:
   // -- import/export --
   void encode_export(bufferlist& bl);
   void finish_export(utime_t now);
-  void abort_export() { 
+  void abort_export() {
+    mseq += 2;
     put(PIN_TEMPEXPORTING);
   }
   void decode_import(bufferlist::iterator& blp, utime_t now, LogSegment *ls);
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index 8936acd..4852994 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -1222,6 +1222,7 @@ void CInode::encode_lock_state(int type, bufferlist& bl)
 	  dout(20) << fg << "           fragstat " << pf->fragstat << dendl;
 	  dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl;
 	  ::encode(fg, tmp);
+	  ::encode(dir->mseq, tmp);
 	  ::encode(dir->first, tmp);
 	  ::encode(pf->fragstat, tmp);
 	  ::encode(pf->accounted_fragstat, tmp);
@@ -1255,6 +1256,7 @@ void CInode::encode_lock_state(int type, bufferlist& bl)
 	  dout(10) << fg << " " << pf->rstat << dendl;
 	  dout(10) << fg << " " << dir->dirty_old_rstat << dendl;
 	  ::encode(fg, tmp);
+	  ::encode(dir->mseq, tmp);
 	  ::encode(dir->first, tmp);
 	  ::encode(pf->rstat, tmp);
 	  ::encode(pf->accounted_rstat, tmp);
@@ -1404,10 +1406,12 @@ void CInode::decode_lock_state(int type, bufferlist& bl)
       dout(10) << " ...got " << n << " fragstats on " << *this << dendl;
       while (n--) {
 	frag_t fg;
+	ceph_seq_t mseq;
 	snapid_t fgfirst;
 	frag_info_t fragstat;
 	frag_info_t accounted_fragstat;
 	::decode(fg, p);
+	::decode(mseq, p);
 	::decode(fgfirst, p);
 	::decode(fragstat, p);
 	::decode(accounted_fragstat, p);
@@ -1420,6 +1424,12 @@ void CInode::decode_lock_state(int type, bufferlist& bl)
 	  assert(dir);                // i am auth; i had better have this dir open
 	  dout(10) << fg << " first " << dir->first << " -> " << fgfirst
 		   << " on " << *dir << dendl;
+	  if (dir->fnode.fragstat.version == get_projected_inode()->dirstat.version &&
+	      ceph_seq_cmp(mseq, dir->mseq) < 0) {
+	    dout(10) << " mseq " << mseq << " < " << dir->mseq << ", ignoring" << dendl;
+	    continue;
+	  }
+	  dir->mseq = mseq;
 	  dir->first = fgfirst;
 	  dir->fnode.fragstat = fragstat;
 	  dir->fnode.accounted_fragstat = accounted_fragstat;
@@ -1462,11 +1472,13 @@ void CInode::decode_lock_state(int type, bufferlist& bl)
       ::decode(n, p);
       while (n--) {
 	frag_t fg;
+	ceph_seq_t mseq;
 	snapid_t fgfirst;
 	nest_info_t rstat;
 	nest_info_t accounted_rstat;
 	map<snapid_t,old_rstat_t> dirty_old_rstat;
 	::decode(fg, p);
+	::decode(mseq, p);
 	::decode(fgfirst, p);
 	::decode(rstat, p);
 	::decode(accounted_rstat, p);
@@ -1481,6 +1493,12 @@ void CInode::decode_lock_state(int type, bufferlist& bl)
 	  assert(dir);                // i am auth; i had better have this dir open
 	  dout(10) << fg << " first " << dir->first << " -> " << fgfirst
 		   << " on " << *dir << dendl;
+	  if (dir->fnode.rstat.version == get_projected_inode()->rstat.version &&
+	      ceph_seq_cmp(mseq, dir->mseq) < 0) {
+	    dout(10) << " mseq " << mseq << " < " << dir->mseq << ", ignoring" << dendl;
+	    continue;
+	  }
+	  dir->mseq = mseq;
 	  dir->first = fgfirst;
 	  dir->fnode.rstat = rstat;
 	  dir->fnode.accounted_rstat = accounted_rstat;
@@ -1606,6 +1624,36 @@ void CInode::start_scatter(ScatterLock *lock)
   }
 }
 
+/*
+ * set dirfrag_version to inode_version - 1. so that we can use dirfrag version
+ * to check if we have gathered scatter state for a given dirfrag.
+ */
+void CInode::start_scatter_gather(ScatterLock *lock, int auth)
+{
+  assert(is_auth());
+  inode_t *pi = get_projected_inode();
+
+  for (map<frag_t,CDir*>::iterator p = dirfrags.begin();
+       p != dirfrags.end();
+       ++p) {
+    CDir *dir = p->second;
+
+    if (dir->is_auth())
+      continue;
+    if (auth >= 0 && dir->authority().first != auth)
+      continue;
+
+    switch (lock->get_type()) {
+      case CEPH_LOCK_IFILE:
+	dir->fnode.fragstat.version = pi->dirstat.version - 1;
+	break;
+      case CEPH_LOCK_INEST:
+	dir->fnode.rstat.version = pi->rstat.version - 1;
+	break;
+    }
+  }
+}
+
 struct C_Inode_FragUpdate : public Context {
   CInode *in;
   CDir *dir;
diff --git a/src/mds/CInode.h b/src/mds/CInode.h
index 779bb63..82c2347 100644
--- a/src/mds/CInode.h
+++ b/src/mds/CInode.h
@@ -651,6 +651,7 @@ public:
   void clear_scatter_dirty();  // on rejoin ack
 
   void start_scatter(ScatterLock *lock);
+  void start_scatter_gather(ScatterLock *lock, int auth=-1);
   void finish_scatter_update(ScatterLock *lock, CDir *dir,
 			     version_t inode_version, version_t dir_accounted_version);
   void finish_scatter_gather_update(int type);
diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index 8065207..30e014a 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -731,8 +731,9 @@ void Locker::eval_gather(SimpleLock *lock, bool first, bool *pneed_issue, list<C
 	  lock->get_parent()->is_replicated()) {
 	dout(10) << " finished (local) gather for mix->lock, now gathering from replicas" << dendl;
 	send_lock_message(lock, LOCK_AC_LOCK);
-	lock->init_gather();
 	lock->set_state(LOCK_MIX_LOCK2);
+	lock->init_gather();
+	in->start_scatter_gather(static_cast<ScatterLock *>(lock));
 	return;
       }
 
@@ -3430,7 +3431,7 @@ bool Locker::simple_sync(SimpleLock *lock, bool *need_issue)
   assert(lock->is_stable());
 
   CInode *in = 0;
-  if (lock->get_cap_shift())
+  if (lock->get_type() != CEPH_LOCK_DN)
     in = static_cast<CInode *>(lock->get_parent());
 
   int old_state = lock->get_state();
@@ -3453,10 +3454,11 @@ bool Locker::simple_sync(SimpleLock *lock, bool *need_issue)
     if (lock->get_parent()->is_replicated() && old_state == LOCK_MIX) {
       send_lock_message(lock, LOCK_AC_SYNC);
       lock->init_gather();
+      in->start_scatter_gather(static_cast<ScatterLock *>(lock));
       gather++;
     }
     
-    if (in && in->is_head()) {
+    if (lock->get_cap_shift() && in->is_head()) {
       if (in->issued_caps_need_gather(lock)) {
 	if (need_issue)
 	  *need_issue = true;
@@ -3568,7 +3570,7 @@ void Locker::simple_lock(SimpleLock *lock, bool *need_issue)
   assert(lock->get_state() != LOCK_LOCK);
   
   CInode *in = 0;
-  if (lock->get_cap_shift())
+  if (lock->get_type() != CEPH_LOCK_DN)
     in = static_cast<CInode *>(lock->get_parent());
 
   int old_state = lock->get_state();
@@ -3596,7 +3598,7 @@ void Locker::simple_lock(SimpleLock *lock, bool *need_issue)
   }
   if (lock->is_rdlocked())
     gather++;
-  if (in && in->is_head()) {
+  if (lock->get_cap_shift() && in->is_head()) {
     if (in->issued_caps_need_gather(lock)) {
       if (need_issue)
 	*need_issue = true;
@@ -3629,6 +3631,8 @@ void Locker::simple_lock(SimpleLock *lock, bool *need_issue)
       gather++;
       send_lock_message(lock, LOCK_AC_LOCK);
       lock->init_gather();
+      if (lock->get_state() == LOCK_MIX_LOCK2)
+	in->start_scatter_gather(static_cast<ScatterLock *>(lock));
     }
   }
 
@@ -4034,8 +4038,9 @@ void Locker::scatter_tempsync(ScatterLock *lock, bool *need_issue)
 
   if (lock->get_state() == LOCK_MIX_TSYN &&
       in->is_replicated()) {
-    lock->init_gather();
     send_lock_message(lock, LOCK_AC_LOCK);
+    lock->init_gather();
+    in->start_scatter_gather(static_cast<ScatterLock *>(lock));
     gather++;
   }
 
@@ -4364,6 +4369,8 @@ void Locker::file_excl(ScatterLock *lock, bool *need_issue)
       lock->get_state() != LOCK_XSYN_EXCL) {  // if we were lock, replicas are already lock.
     send_lock_message(lock, LOCK_AC_LOCK);
     lock->init_gather();
+    if (lock->get_state() == LOCK_MIX_EXCL)
+      in->start_scatter_gather(static_cast<ScatterLock *>(lock));
     gather++;
   }
   if (lock->is_leased()) {
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 2b0029f..f2068e5 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -3764,6 +3764,7 @@ void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
     dout(15) << " add_strong_dirfrag " << *dir << dendl;
     rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
     dir->state_set(CDir::STATE_REJOINING);
+    dir->mseq = 0;
 
     for (CDir::map_t::iterator p = dir->items.begin();
 	 p != dir->items.end();
@@ -3913,11 +3914,15 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
        ++p) {
     CInode *in = get_inode(p->first);
     assert(in);
+    if (survivor) {
+      in->start_scatter_gather(&in->filelock, from);
+      in->start_scatter_gather(&in->nestlock, from);
+    } else {
+      rejoin_potential_updated_scatterlocks.insert(in);
+    }
     in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file);
     in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest);
     in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft);
-    if (!survivor)
-      rejoin_potential_updated_scatterlocks.insert(in);
   }
 
   // recovering peer may send incorrect dirfrags here.  we need to
diff --git a/src/mds/MDS.h b/src/mds/MDS.h
index 9e3e2da..b99815d 100644
--- a/src/mds/MDS.h
+++ b/src/mds/MDS.h
@@ -35,7 +35,7 @@
 #include "SessionMap.h"
 
 
-#define CEPH_MDS_PROTOCOL    17 /* cluster internal */
+#define CEPH_MDS_PROTOCOL    18 /* cluster internal */
 
 
 enum {
-- 
1.8.1.4


^ permalink raw reply related	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2013-06-19 18:01 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-06-17 12:10 [PATCH 0/8] misc fixes for mds Yan, Zheng
2013-06-17 12:10 ` [PATCH 1/8] mds: don't update migrate_seq when importing non-auth cap Yan, Zheng
2013-06-17 12:10 ` [PATCH 2/8] mds: fix frozen check in Server::try_open_auth_dirfrag() Yan, Zheng
2013-06-17 12:10 ` [PATCH 3/8] mds: handle undefined dirfrags when opening inode Yan, Zheng
2013-06-17 12:10 ` [PATCH 4/8] mds: try purging stray inode after storing backtrace Yan, Zheng
2013-06-17 12:10 ` [PATCH 5/8] mds: fix cross-authorty rename race Yan, Zheng
2013-06-17 12:10 ` [PATCH 6/8] mds: don't journal bare dirfrag Yan, Zheng
2013-06-19 15:20   ` Sage Weil
2013-06-17 12:10 ` [PATCH 7/8] mds: fix race between scatter gather and dirfrag export Yan, Zheng
2013-06-19 15:23   ` Sage Weil
2013-06-19 18:01     ` Yan, Zheng
2013-06-17 12:10 ` [PATCH 8/8] mds: fix remote wrlock rejoin Yan, Zheng
2013-06-19 15:33 ` [PATCH 0/8] misc fixes for mds Sage Weil

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.