stable.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	stable@vger.kernel.org, Ilya Dryomov <idryomov@gmail.com>,
	Josh Durgin <jdurgin@redhat.com>
Subject: [PATCH 3.14 20/21] libceph: apply new_state before new_up_client on incrementals
Date: Mon,  8 Aug 2016 21:09:51 +0200	[thread overview]
Message-ID: <20160808180144.776351837@linuxfoundation.org> (raw)
In-Reply-To: <20160808180143.919366850@linuxfoundation.org>

3.14-stable review patch.  If anyone has any objections, please let me know.

------------------

From: Ilya Dryomov <idryomov@gmail.com>

commit 930c532869774ebf8af9efe9484c597f896a7d46 upstream.

Currently, osd_weight and osd_state fields are updated in the encoding
order.  This is wrong, because an incremental map may look like e.g.

    new_up_client: { osd=6, addr=... } # set osd_state and addr
    new_state: { osd=6, xorstate=EXISTS } # clear osd_state

Suppose osd6's current osd_state is EXISTS (i.e. osd6 is down).  After
applying new_up_client, osd_state is changed to EXISTS | UP.  Carrying
on with the new_state update, we flip EXISTS and leave osd6 in a weird
"!EXISTS but UP" state.  A non-existent OSD is considered down by the
mapping code

2087    for (i = 0; i < pg->pg_temp.len; i++) {
2088            if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
2089                    if (ceph_can_shift_osds(pi))
2090                            continue;
2091
2092                    temp->osds[temp->size++] = CRUSH_ITEM_NONE;

and so requests get directed to the second OSD in the set instead of
the first, resulting in OSD-side errors like:

[WRN] : client.4239 192.168.122.21:0/2444980242 misdirected client.4239.1:2827 pg 2.5df899f2 to osd.4 not [1,4,6] in e680/680

and hung rbds on the client:

[  493.566367] rbd: rbd0: write 400000 at 11cc00000 (0)
[  493.566805] rbd: rbd0:   result -6 xferred 400000
[  493.567011] blk_update_request: I/O error, dev rbd0, sector 9330688

The fix is to decouple application from the decoding and:
- apply new_weight first
- apply new_state before new_up_client
- twiddle osd_state flags if marking in
- clear out some of the state if osd is destroyed

Fixes: http://tracker.ceph.com/issues/14901

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Josh Durgin <jdurgin@redhat.com>
[idryomov@gmail.com: backport to 3.10-3.14: strip primary-affinity]
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

---
 net/ceph/osdmap.c |  152 ++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 108 insertions(+), 44 deletions(-)

--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -825,6 +825,110 @@ bad:
 }
 
 /*
+ * Encoding order is (new_up_client, new_state, new_weight).  Need to
+ * apply in the (new_weight, new_state, new_up_client) order, because
+ * an incremental map may look like e.g.
+ *
+ *     new_up_client: { osd=6, addr=... } # set osd_state and addr
+ *     new_state: { osd=6, xorstate=EXISTS } # clear osd_state
+ */
+static int decode_new_up_state_weight(void **p, void *end,
+				      struct ceph_osdmap *map)
+{
+	void *new_up_client;
+	void *new_state;
+	void *new_weight_end;
+	u32 len;
+
+	new_up_client = *p;
+	ceph_decode_32_safe(p, end, len, e_inval);
+	len *= sizeof(u32) + sizeof(struct ceph_entity_addr);
+	ceph_decode_need(p, end, len, e_inval);
+	*p += len;
+
+	new_state = *p;
+	ceph_decode_32_safe(p, end, len, e_inval);
+	len *= sizeof(u32) + sizeof(u8);
+	ceph_decode_need(p, end, len, e_inval);
+	*p += len;
+
+	/* new_weight */
+	ceph_decode_32_safe(p, end, len, e_inval);
+	while (len--) {
+		s32 osd;
+		u32 w;
+
+		ceph_decode_need(p, end, 2*sizeof(u32), e_inval);
+		osd = ceph_decode_32(p);
+		w = ceph_decode_32(p);
+		BUG_ON(osd >= map->max_osd);
+		pr_info("osd%d weight 0x%x %s\n", osd, w,
+		     w == CEPH_OSD_IN ? "(in)" :
+		     (w == CEPH_OSD_OUT ? "(out)" : ""));
+		map->osd_weight[osd] = w;
+
+		/*
+		 * If we are marking in, set the EXISTS, and clear the
+		 * AUTOOUT and NEW bits.
+		 */
+		if (w) {
+			map->osd_state[osd] |= CEPH_OSD_EXISTS;
+			map->osd_state[osd] &= ~(CEPH_OSD_AUTOOUT |
+						 CEPH_OSD_NEW);
+		}
+	}
+	new_weight_end = *p;
+
+	/* new_state (up/down) */
+	*p = new_state;
+	len = ceph_decode_32(p);
+	while (len--) {
+		s32 osd;
+		u8 xorstate;
+
+		osd = ceph_decode_32(p);
+		xorstate = ceph_decode_8(p);
+		if (xorstate == 0)
+			xorstate = CEPH_OSD_UP;
+		BUG_ON(osd >= map->max_osd);
+		if ((map->osd_state[osd] & CEPH_OSD_UP) &&
+		    (xorstate & CEPH_OSD_UP))
+			pr_info("osd%d down\n", osd);
+		if ((map->osd_state[osd] & CEPH_OSD_EXISTS) &&
+		    (xorstate & CEPH_OSD_EXISTS)) {
+			pr_info("osd%d does not exist\n", osd);
+			map->osd_weight[osd] = CEPH_OSD_IN;
+			memset(map->osd_addr + osd, 0, sizeof(*map->osd_addr));
+			map->osd_state[osd] = 0;
+		} else {
+			map->osd_state[osd] ^= xorstate;
+		}
+	}
+
+	/* new_up_client */
+	*p = new_up_client;
+	len = ceph_decode_32(p);
+	while (len--) {
+		s32 osd;
+		struct ceph_entity_addr addr;
+
+		osd = ceph_decode_32(p);
+		ceph_decode_copy(p, &addr, sizeof(addr));
+		ceph_decode_addr(&addr);
+		BUG_ON(osd >= map->max_osd);
+		pr_info("osd%d up\n", osd);
+		map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
+		map->osd_addr[osd] = addr;
+	}
+
+	*p = new_weight_end;
+	return 0;
+
+e_inval:
+	return -EINVAL;
+}
+
+/*
  * decode and apply an incremental map update.
  */
 struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
@@ -939,50 +1043,10 @@ struct ceph_osdmap *osdmap_apply_increme
 			__remove_pg_pool(&map->pg_pools, pi);
 	}
 
-	/* new_up */
-	err = -EINVAL;
-	ceph_decode_32_safe(p, end, len, bad);
-	while (len--) {
-		u32 osd;
-		struct ceph_entity_addr addr;
-		ceph_decode_32_safe(p, end, osd, bad);
-		ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
-		ceph_decode_addr(&addr);
-		pr_info("osd%d up\n", osd);
-		BUG_ON(osd >= map->max_osd);
-		map->osd_state[osd] |= CEPH_OSD_UP;
-		map->osd_addr[osd] = addr;
-	}
-
-	/* new_state */
-	ceph_decode_32_safe(p, end, len, bad);
-	while (len--) {
-		u32 osd;
-		u8 xorstate;
-		ceph_decode_32_safe(p, end, osd, bad);
-		xorstate = **(u8 **)p;
-		(*p)++;  /* clean flag */
-		if (xorstate == 0)
-			xorstate = CEPH_OSD_UP;
-		if (xorstate & CEPH_OSD_UP)
-			pr_info("osd%d down\n", osd);
-		if (osd < map->max_osd)
-			map->osd_state[osd] ^= xorstate;
-	}
-
-	/* new_weight */
-	ceph_decode_32_safe(p, end, len, bad);
-	while (len--) {
-		u32 osd, off;
-		ceph_decode_need(p, end, sizeof(u32)*2, bad);
-		osd = ceph_decode_32(p);
-		off = ceph_decode_32(p);
-		pr_info("osd%d weight 0x%x %s\n", osd, off,
-		     off == CEPH_OSD_IN ? "(in)" :
-		     (off == CEPH_OSD_OUT ? "(out)" : ""));
-		if (osd < map->max_osd)
-			map->osd_weight[osd] = off;
-	}
+	/* new_up_client, new_state, new_weight */
+	err = decode_new_up_state_weight(p, end, map);
+	if (err)
+		goto bad;
 
 	/* new_pg_temp */
 	ceph_decode_32_safe(p, end, len, bad);



  parent reply	other threads:[~2016-08-08 19:10 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <CGME20160808191004uscas1p26944bddcdda269e11e609e5ab288a7dc@uscas1p2.samsung.com>
2016-08-08 19:09 ` [PATCH 3.14 00/21] 3.14.75-stable review Greg Kroah-Hartman
2016-08-08 19:09   ` [PATCH 3.14 01/21] fs/nilfs2: fix potential underflow in call to crc32_le Greg Kroah-Hartman
2016-08-08 19:09   ` [PATCH 3.14 02/21] arc: unwind: warn only once if DW2_UNWIND is disabled Greg Kroah-Hartman
2016-08-08 19:09   ` [PATCH 3.14 03/21] xen/pciback: Fix conf_space read/write overlap check Greg Kroah-Hartman
2016-08-08 19:09   ` [PATCH 3.14 07/21] Input: wacom_w8001 - w8001_MAX_LENGTH should be 13 Greg Kroah-Hartman
2016-08-08 19:09   ` [PATCH 3.14 08/21] Input: xpad - validate USB endpoint count during probe Greg Kroah-Hartman
2016-08-08 19:09   ` [PATCH 3.14 09/21] pinctrl: single: Fix missing flush of posted write for a wakeirq Greg Kroah-Hartman
2016-08-08 19:09   ` [PATCH 3.14 10/21] Revert "ecryptfs: forbid opening files without mmap handler" Greg Kroah-Hartman
2016-08-08 19:09   ` [PATCH 3.14 11/21] ecryptfs: dont allow mmap when the lower fs doesnt support it Greg Kroah-Hartman
2016-08-08 19:09   ` [PATCH 3.14 12/21] ARC: use ASL assembler mnemonic Greg Kroah-Hartman
2016-08-08 19:09   ` [PATCH 3.14 13/21] ext4: verify extent header depth Greg Kroah-Hartman
2016-08-08 19:09   ` [PATCH 3.14 14/21] qeth: delete napi struct when removing a qeth device Greg Kroah-Hartman
2016-08-08 19:09   ` [PATCH 3.14 15/21] mmc: block: fix packed command header endianness Greg Kroah-Hartman
2016-08-08 19:09   ` [PATCH 3.14 16/21] can: at91_can: RX queue could get stuck at high bus load Greg Kroah-Hartman
2016-08-08 19:09   ` [PATCH 3.14 17/21] can: fix handling of unmodifiable configuration options fix Greg Kroah-Hartman
2016-08-08 19:09   ` [PATCH 3.14 18/21] can: fix oops caused by wrong rtnl dellink usage Greg Kroah-Hartman
2016-08-08 19:09   ` [PATCH 3.14 19/21] ipr: Clear interrupt on croc/crocodile when running with LSI Greg Kroah-Hartman
2016-08-08 19:09   ` Greg Kroah-Hartman [this message]
2016-08-08 19:09   ` [PATCH 3.14 21/21] net: mvneta: set real interrupt per packet for tx_done Greg Kroah-Hartman
2016-08-09  4:16   ` [PATCH 3.14 00/21] 3.14.75-stable review Guenter Roeck
2016-08-09  8:19     ` Greg Kroah-Hartman
2016-08-09 15:09   ` Shuah Khan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20160808180144.776351837@linuxfoundation.org \
    --to=gregkh@linuxfoundation.org \
    --cc=idryomov@gmail.com \
    --cc=jdurgin@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=stable@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).