From: <gregkh@linuxfoundation.org>
To: idryomov@gmail.com, gregkh@linuxfoundation.org, jdurgin@redhat.com
Cc: <stable@vger.kernel.org>, <stable-commits@vger.kernel.org>
Subject: Patch "libceph: apply new_state before new_up_client on incrementals" has been added to the 4.6-stable tree
Date: Mon, 08 Aug 2016 16:23:32 +0200 [thread overview]
Message-ID: <147066621260211@kroah.com> (raw)
This is a note to let you know that I've just added the patch titled
libceph: apply new_state before new_up_client on incrementals
to the 4.6-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
libceph-apply-new_state-before-new_up_client-on-incrementals.patch
and it can be found in the queue-4.6 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@vger.kernel.org> know about it.
>From 930c532869774ebf8af9efe9484c597f896a7d46 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Tue, 19 Jul 2016 03:50:28 +0200
Subject: libceph: apply new_state before new_up_client on incrementals
From: Ilya Dryomov <idryomov@gmail.com>
commit 930c532869774ebf8af9efe9484c597f896a7d46 upstream.
Currently, osd_weight and osd_state fields are updated in the encoding
order. This is wrong, because an incremental map may look like e.g.
new_up_client: { osd=6, addr=... } # set osd_state and addr
new_state: { osd=6, xorstate=EXISTS } # clear osd_state
Suppose osd6's current osd_state is EXISTS (i.e. osd6 is down). After
applying new_up_client, osd_state is changed to EXISTS | UP. Carrying
on with the new_state update, we flip EXISTS and leave osd6 in a weird
"!EXISTS but UP" state. A non-existent OSD is considered down by the
mapping code
2087 for (i = 0; i < pg->pg_temp.len; i++) {
2088 if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
2089 if (ceph_can_shift_osds(pi))
2090 continue;
2091
2092 temp->osds[temp->size++] = CRUSH_ITEM_NONE;
and so requests get directed to the second OSD in the set instead of
the first, resulting in OSD-side errors like:
[WRN] : client.4239 192.168.122.21:0/2444980242 misdirected client.4239.1:2827 pg 2.5df899f2 to osd.4 not [1,4,6] in e680/680
and hung rbds on the client:
[ 493.566367] rbd: rbd0: write 400000 at 11cc00000 (0)
[ 493.566805] rbd: rbd0: result -6 xferred 400000
[ 493.567011] blk_update_request: I/O error, dev rbd0, sector 9330688
The fix is to decouple application from the decoding and:
- apply new_weight first
- apply new_state before new_up_client
- twiddle osd_state flags if marking in
- clear out some of the state if osd is destroyed
Fixes: http://tracker.ceph.com/issues/14901
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Josh Durgin <jdurgin@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
net/ceph/osdmap.c | 156 +++++++++++++++++++++++++++++++++++++++---------------
1 file changed, 113 insertions(+), 43 deletions(-)
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1201,6 +1201,115 @@ struct ceph_osdmap *ceph_osdmap_decode(v
}
/*
+ * Encoding order is (new_up_client, new_state, new_weight). Need to
+ * apply in the (new_weight, new_state, new_up_client) order, because
+ * an incremental map may look like e.g.
+ *
+ * new_up_client: { osd=6, addr=... } # set osd_state and addr
+ * new_state: { osd=6, xorstate=EXISTS } # clear osd_state
+ */
+static int decode_new_up_state_weight(void **p, void *end,
+ struct ceph_osdmap *map)
+{
+ void *new_up_client;
+ void *new_state;
+ void *new_weight_end;
+ u32 len;
+
+ new_up_client = *p;
+ ceph_decode_32_safe(p, end, len, e_inval);
+ len *= sizeof(u32) + sizeof(struct ceph_entity_addr);
+ ceph_decode_need(p, end, len, e_inval);
+ *p += len;
+
+ new_state = *p;
+ ceph_decode_32_safe(p, end, len, e_inval);
+ len *= sizeof(u32) + sizeof(u8);
+ ceph_decode_need(p, end, len, e_inval);
+ *p += len;
+
+ /* new_weight */
+ ceph_decode_32_safe(p, end, len, e_inval);
+ while (len--) {
+ s32 osd;
+ u32 w;
+
+ ceph_decode_need(p, end, 2*sizeof(u32), e_inval);
+ osd = ceph_decode_32(p);
+ w = ceph_decode_32(p);
+ BUG_ON(osd >= map->max_osd);
+ pr_info("osd%d weight 0x%x %s\n", osd, w,
+ w == CEPH_OSD_IN ? "(in)" :
+ (w == CEPH_OSD_OUT ? "(out)" : ""));
+ map->osd_weight[osd] = w;
+
+ /*
+ * If we are marking in, set the EXISTS, and clear the
+ * AUTOOUT and NEW bits.
+ */
+ if (w) {
+ map->osd_state[osd] |= CEPH_OSD_EXISTS;
+ map->osd_state[osd] &= ~(CEPH_OSD_AUTOOUT |
+ CEPH_OSD_NEW);
+ }
+ }
+ new_weight_end = *p;
+
+ /* new_state (up/down) */
+ *p = new_state;
+ len = ceph_decode_32(p);
+ while (len--) {
+ s32 osd;
+ u8 xorstate;
+ int ret;
+
+ osd = ceph_decode_32(p);
+ xorstate = ceph_decode_8(p);
+ if (xorstate == 0)
+ xorstate = CEPH_OSD_UP;
+ BUG_ON(osd >= map->max_osd);
+ if ((map->osd_state[osd] & CEPH_OSD_UP) &&
+ (xorstate & CEPH_OSD_UP))
+ pr_info("osd%d down\n", osd);
+ if ((map->osd_state[osd] & CEPH_OSD_EXISTS) &&
+ (xorstate & CEPH_OSD_EXISTS)) {
+ pr_info("osd%d does not exist\n", osd);
+ map->osd_weight[osd] = CEPH_OSD_IN;
+ ret = set_primary_affinity(map, osd,
+ CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
+ if (ret)
+ return ret;
+ memset(map->osd_addr + osd, 0, sizeof(*map->osd_addr));
+ map->osd_state[osd] = 0;
+ } else {
+ map->osd_state[osd] ^= xorstate;
+ }
+ }
+
+ /* new_up_client */
+ *p = new_up_client;
+ len = ceph_decode_32(p);
+ while (len--) {
+ s32 osd;
+ struct ceph_entity_addr addr;
+
+ osd = ceph_decode_32(p);
+ ceph_decode_copy(p, &addr, sizeof(addr));
+ ceph_decode_addr(&addr);
+ BUG_ON(osd >= map->max_osd);
+ pr_info("osd%d up\n", osd);
+ map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
+ map->osd_addr[osd] = addr;
+ }
+
+ *p = new_weight_end;
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+/*
* decode and apply an incremental map update.
*/
struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
@@ -1299,49 +1408,10 @@ struct ceph_osdmap *osdmap_apply_increme
__remove_pg_pool(&map->pg_pools, pi);
}
- /* new_up */
- ceph_decode_32_safe(p, end, len, e_inval);
- while (len--) {
- u32 osd;
- struct ceph_entity_addr addr;
- ceph_decode_32_safe(p, end, osd, e_inval);
- ceph_decode_copy_safe(p, end, &addr, sizeof(addr), e_inval);
- ceph_decode_addr(&addr);
- pr_info("osd%d up\n", osd);
- BUG_ON(osd >= map->max_osd);
- map->osd_state[osd] |= CEPH_OSD_UP | CEPH_OSD_EXISTS;
- map->osd_addr[osd] = addr;
- }
-
- /* new_state */
- ceph_decode_32_safe(p, end, len, e_inval);
- while (len--) {
- u32 osd;
- u8 xorstate;
- ceph_decode_32_safe(p, end, osd, e_inval);
- xorstate = **(u8 **)p;
- (*p)++; /* clean flag */
- if (xorstate == 0)
- xorstate = CEPH_OSD_UP;
- if (xorstate & CEPH_OSD_UP)
- pr_info("osd%d down\n", osd);
- if (osd < map->max_osd)
- map->osd_state[osd] ^= xorstate;
- }
-
- /* new_weight */
- ceph_decode_32_safe(p, end, len, e_inval);
- while (len--) {
- u32 osd, off;
- ceph_decode_need(p, end, sizeof(u32)*2, e_inval);
- osd = ceph_decode_32(p);
- off = ceph_decode_32(p);
- pr_info("osd%d weight 0x%x %s\n", osd, off,
- off == CEPH_OSD_IN ? "(in)" :
- (off == CEPH_OSD_OUT ? "(out)" : ""));
- if (osd < map->max_osd)
- map->osd_weight[osd] = off;
- }
+ /* new_up_client, new_state, new_weight */
+ err = decode_new_up_state_weight(p, end, map);
+ if (err)
+ goto bad;
/* new_pg_temp */
err = decode_new_pg_temp(p, end, map);
Patches currently in stable-queue which might be from idryomov@gmail.com are
queue-4.6/libceph-apply-new_state-before-new_up_client-on-incrementals.patch
reply other threads:[~2016-08-08 14:23 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=147066621260211@kroah.com \
--to=gregkh@linuxfoundation.org \
--cc=idryomov@gmail.com \
--cc=jdurgin@redhat.com \
--cc=stable-commits@vger.kernel.org \
--cc=stable@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.