* [PATCH] osd: speedup startup by finishing pending removals in background
@ 2014-05-18 12:54 Alexandre Oliva
2014-05-19 11:32 ` Alexandre Oliva
0 siblings, 1 reply; 2+ messages in thread
From: Alexandre Oliva @ 2014-05-18 12:54 UTC (permalink / raw)
To: ceph-devel
This patch applies on top of the one I just posted, Subject “osd: avoid
flushing every TEMP removal to speedup startup”.
When PG removals are underway and the OSD is restarted, or when
multiple removals are scheduled manually with ceph_filestore_dump
premove, the OSD may take a long time to process all pending removals
before it will join the cluster.
This patch introduces an option that enables the OSD to join the
cluster first, performing the removals in the background while actively
participating in the cluster, as the OSD would if it hadn't been
restarted.
In hindsight, I suppose it might have been wiser to add a data member to
DeletingStateRef to hold the coll_t, instead of having to search for it
again, but this patch is what I tested, and it's likely good enough for
now.
Signed-off-by: Alexandre Oliva <oliva@gnu.org>
---
src/common/config_opts.h | 10 ++++++
src/osd/OSD.cc | 75 ++++++++++++++++++++++++++++++++++++++++++++--
2 files changed, 82 insertions(+), 3 deletions(-)
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 2c65e6c..9baa356 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -583,6 +583,16 @@ OPTION(osd_client_op_priority, OPT_U32, 63)
OPTION(osd_recovery_op_priority, OPT_U32, 10)
OPTION(osd_recovery_op_warn_multiple, OPT_U32, 16)
+// Removal of PGs is done in background, but if the osd is restarted,
+// it will finish all pending removals before joining the cluster.
+// This can take a while. If this option is set to true, then pending
+// removals will be performed in background, while the osd runs
+// normally. This is a bit dangerous if the OSD gets a new copy of
+// the PG before the pending removal is completed: attributes stored
+// in the leveldb may be lost when removal cleans up an object's
+// attributes AFTER the new object is backfilled.
+OPTION(osd_startup_finish_remove_in_background, OPT_BOOL, false)
+
// Max time to wait between notifying mon of shutdown and shutting down
OPTION(osd_mon_shutdown_timeout, OPT_DOUBLE, 5)
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 504cb71..a6d58c1 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -1924,6 +1924,15 @@ OSD::res_result OSD::_try_resurrect_pg(
if (!df)
return RES_NONE; // good to go
+ // If we're background deleting a pg scheduled for removal in an
+ // earlier session, this will be NULL. We can't resurrect this one,
+ // nor should we create a new PG with the same pgid, so we'll fail
+ // this assert and let the user restart the osd without
+ // osd_startup_finish_remove_in_background, so that removal is
+ // completed before the osd gets a chance to try to create or
+ // resurrect the PG.
+ assert(df->old_pg_state);
+
df->old_pg_state->lock();
OSDMapRef create_map = df->old_pg_state->get_osdmap();
df->old_pg_state->unlock();
@@ -2048,6 +2057,7 @@ void OSD::load_pgs()
set<spg_t> head_pgs;
map<spg_t, interval_set<snapid_t> > pgs;
+ map<uint64_t, spg_t> *bgremove = NULL;
bool flush = false;
for (vector<coll_t>::iterator it = ls.begin();
it != ls.end();
@@ -2056,14 +2066,30 @@ void OSD::load_pgs()
snapid_t snap;
uint64_t seq;
- if (it->is_temp(pgid) ||
- it->is_removal(&seq, &pgid)) {
- dout(10) << "load_pgs " << *it << " clearing temp" << dendl;
+ if (it->is_temp(pgid)) {
+ dout(10) << "load_pgs " << *it << " clearing temp " << dendl;
recursive_remove_collection(store, *it, false);
flush = true;
continue;
}
+ if (it->is_removal(&seq, &pgid)) {
+ if (cct->_conf->osd_startup_finish_remove_in_background) {
+ dout(10) << "load_pgs " << *it
+ << " delaying pending removal" << dendl;
+ if (seq >= next_removal_seq)
+ next_removal_seq = seq + 1;
+ if (!bgremove)
+ bgremove = new map<uint64_t, spg_t>();
+ (*bgremove)[seq] = pgid;
+ } else {
+ dout(10) << "load_pgs " << *it << " clearing pending removal " << dendl;
+ recursive_remove_collection(store, *it, false);
+ flush = true;
+ }
+ continue;
+ }
+
if (it->is_pg(pgid, snap)) {
if (snap != CEPH_NOSNAP) {
dout(10) << "load_pgs skipping snapped dir " << *it
@@ -2081,6 +2107,18 @@ void OSD::load_pgs()
if (flush)
store->sync_and_flush();
+ if (bgremove) {
+ for (map<uint64_t, spg_t>::iterator it = bgremove->begin();
+ it != bgremove->end(); it++) {
+ dout(10) << "load_pgs FORREMOVAL_" << it->first << "_" << it->second
+ << " scheduling background removal " << dendl;
+ DeletingStateRef deleting = service.deleting_pgs.lookup_or_create
+ (it->second, make_pair(it->second, PGRef(0)));
+ remove_wq.queue(make_pair(PGRef(0), deleting));
+ }
+ delete bgremove;
+ }
+
bool has_upgraded = false;
for (map<spg_t, interval_set<snapid_t> >::iterator i = pgs.begin();
i != pgs.end();
@@ -3520,6 +3558,37 @@ void OSD::RemoveWQ::_process(
ThreadPool::TPHandle &handle)
{
PGRef pg(item.first);
+
+ if (!pg) {
+ // this is for background live removal of pending FORREMOVAL pgs,
+ // remaining from earlier OSD sessions. This only happens if
+ // osd_startup_finish_remove_in_background is enabled.
+ if (!item.second->start_clearing())
+ return;
+
+ if (!item.second->start_deleting())
+ return;
+
+ vector<coll_t> ls;
+ int r = store->list_collections(ls);
+ assert (!(r < 0));
+
+ for (vector<coll_t>::iterator it = ls.begin();
+ it != ls.end();
+ ++it) {
+ spg_t pgid;
+ uint64_t seq;
+
+ if (it->is_removal(&seq, &pgid) && pgid == item.second->pgid) {
+ recursive_remove_collection(store, *it, false);
+ break;
+ }
+ }
+
+ item.second->finish_deleting();
+ return;
+ }
+
SnapMapper &mapper = pg->snap_mapper;
OSDriver &driver = pg->osdriver;
coll_t coll = coll_t(pg->info.pgid);
--
Alexandre Oliva, freedom fighter http://FSFLA.org/~lxoliva/
You must be the change you wish to see in the world. -- Gandhi
Be Free! -- http://FSFLA.org/ FSF Latin America board member
Free Software Evangelist|Red Hat Brasil GNU Toolchain Engineer
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related [flat|nested] 2+ messages in thread
* Re: [PATCH] osd: speedup startup by finishing pending removals in background
2014-05-18 12:54 [PATCH] osd: speedup startup by finishing pending removals in background Alexandre Oliva
@ 2014-05-19 11:32 ` Alexandre Oliva
0 siblings, 0 replies; 2+ messages in thread
From: Alexandre Oliva @ 2014-05-19 11:32 UTC (permalink / raw)
To: ceph-devel
On May 18, 2014, Alexandre Oliva <oliva@gnu.org> wrote:
> In hindsight, I suppose it might have been wiser to add a data member to
> DeletingStateRef to hold the coll_t, instead of having to search for it
> again, but this patch is what I tested, and it's likely good enough for
> now.
I've now implemented this, and verified that it works.
I've also arranged for attempts to resurrect a pending removal from an
earlier session to be delayed rather than cause an abort. I haven't
tested this scenario, though; the osd might time out and suicide, or
some such.
---
When PG removals are underway and the OSD is restarted, or when
multiple removals are scheduled manually with ceph_filestore_dump
premove, the OSD may take a long time to process all pending removals
before it will join the cluster.
This patch introduces an option that enables the OSD to join the
cluster first, performing the removals in the background while actively
participating in the cluster, as the OSD would if it hadn't been
restarted.
Signed-off-by: Alexandre Oliva <oliva@gnu.org>
---
src/common/config_opts.h | 10 +++++++++
src/osd/OSD.cc | 52 +++++++++++++++++++++++++++++++++++++++++++---
src/osd/OSD.h | 17 +++++++++++++--
3 files changed, 74 insertions(+), 5 deletions(-)
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 2c65e6c..9baa356 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -583,6 +583,16 @@ OPTION(osd_client_op_priority, OPT_U32, 63)
OPTION(osd_recovery_op_priority, OPT_U32, 10)
OPTION(osd_recovery_op_warn_multiple, OPT_U32, 16)
+// Removal of PGs is done in background, but if the osd is restarted,
+// it will finish all pending removals before joining the cluster.
+// This can take a while. If this option is set to true, then pending
+// removals will be performed in background, while the osd runs
+// normally. This is a bit dangerous if the OSD gets a new copy of
+// the PG before the pending removal is completed: attributes stored
+// in the leveldb may be lost when removal cleans up an object's
+// attributes AFTER the new object is backfilled.
+OPTION(osd_startup_finish_remove_in_background, OPT_BOOL, false)
+
// Max time to wait between notifying mon of shutdown and shutting down
OPTION(osd_mon_shutdown_timeout, OPT_DOUBLE, 5)
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 504cb71..a1e3b6f 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -2048,6 +2048,8 @@ void OSD::load_pgs()
set<spg_t> head_pgs;
map<spg_t, interval_set<snapid_t> > pgs;
+ typedef map<uint64_t, pair<spg_t, coll_t> > bgremove_t;
+ bgremove_t *bgremove = NULL;
bool flush = false;
for (vector<coll_t>::iterator it = ls.begin();
it != ls.end();
@@ -2056,14 +2058,30 @@ void OSD::load_pgs()
snapid_t snap;
uint64_t seq;
- if (it->is_temp(pgid) ||
- it->is_removal(&seq, &pgid)) {
- dout(10) << "load_pgs " << *it << " clearing temp" << dendl;
+ if (it->is_temp(pgid)) {
+ dout(10) << "load_pgs " << *it << " clearing temp " << dendl;
recursive_remove_collection(store, *it, false);
flush = true;
continue;
}
+ if (it->is_removal(&seq, &pgid)) {
+ if (cct->_conf->osd_startup_finish_remove_in_background) {
+ dout(10) << "load_pgs " << *it
+ << " delaying pending removal" << dendl;
+ if (seq >= next_removal_seq)
+ next_removal_seq = seq + 1;
+ if (!bgremove)
+ bgremove = new bgremove_t();
+ (*bgremove)[seq] = make_pair(pgid, *it);
+ } else {
+ dout(10) << "load_pgs " << *it << " clearing pending removal " << dendl;
+ recursive_remove_collection(store, *it, false);
+ flush = true;
+ }
+ continue;
+ }
+
if (it->is_pg(pgid, snap)) {
if (snap != CEPH_NOSNAP) {
dout(10) << "load_pgs skipping snapped dir " << *it
@@ -2081,6 +2099,18 @@ void OSD::load_pgs()
if (flush)
store->sync_and_flush();
+ if (bgremove) {
+ for (bgremove_t::iterator it = bgremove->begin();
+ it != bgremove->end(); it++) {
+ dout(10) << "load_pgs FORREMOVAL_" << it->first << "_" << it->second
+ << " scheduling background removal " << dendl;
+ DeletingStateRef deleting = service.deleting_pgs.lookup_or_create
+ (it->second.first, make_pair(it->second.first, it->second.second));
+ remove_wq.queue(make_pair(PGRef(0), deleting));
+ }
+ delete bgremove;
+ }
+
bool has_upgraded = false;
for (map<spg_t, interval_set<snapid_t> >::iterator i = pgs.begin();
i != pgs.end();
@@ -3519,6 +3549,22 @@ void OSD::RemoveWQ::_process(
pair<PGRef, DeletingStateRef> item,
ThreadPool::TPHandle &handle)
{
+ if (!item.second->resurrectable_p()) {
+ // this is for background live removal of pending FORREMOVAL pgs,
+ // remaining from earlier OSD sessions. This only happens if
+ // osd_startup_finish_remove_in_background is enabled.
+ if (!item.second->start_clearing())
+ return;
+
+ if (!item.second->start_deleting())
+ return;
+
+ recursive_remove_collection(store, item.second->get_coll (), false);
+
+ item.second->finish_deleting();
+ return;
+ }
+
PGRef pg(item.first);
SnapMapper &mapper = pg->snap_mapper;
OSDriver &driver = pg->osdriver;
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index f599e43..e81fbe2b 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -217,9 +217,21 @@ class DeletingState {
public:
const spg_t pgid;
const PGRef old_pg_state;
+ const coll_t old_coll; // iff old_pg_state is NULL
DeletingState(const pair<spg_t, PGRef> &in) :
lock("DeletingState::lock"), status(QUEUED), stop_deleting(false),
- pgid(in.first), old_pg_state(in.second) {}
+ pgid(in.first), old_pg_state(in.second), old_coll() {}
+ DeletingState(const pair<spg_t, coll_t> &in) :
+ lock("DeletingState::lock"), status(QUEUED), stop_deleting(false),
+ pgid(in.first), old_pg_state(NULL), old_coll(in.second) {}
+
+ bool resurrectable_p() const {
+ return !!old_pg_state;
+ }
+ coll_t get_coll() const {
+ assert(!resurrectable_p());
+ return old_coll;
+ }
/// transition status to clearing
bool start_clearing() {
@@ -286,7 +298,8 @@ public:
/// try to halt the deletion
bool try_stop_deletion() {
Mutex::Locker l(lock);
- stop_deleting = true;
+ if (resurrectable_p())
+ stop_deleting = true;
/**
* If we are in DELETING_DIR or CLEARING_DIR, there are in progress
* operations we have to wait for before continuing on. States
--
Alexandre Oliva, freedom fighter http://FSFLA.org/~lxoliva/
You must be the change you wish to see in the world. -- Gandhi
Be Free! -- http://FSFLA.org/ FSF Latin America board member
Free Software Evangelist|Red Hat Brasil GNU Toolchain Engineer
^ permalink raw reply related [flat|nested] 2+ messages in thread
end of thread, other threads:[~2014-05-19 11:33 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-05-18 12:54 [PATCH] osd: speedup startup by finishing pending removals in background Alexandre Oliva
2014-05-19 11:32 ` Alexandre Oliva
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).