From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
stable@vger.kernel.org, Ilya Dryomov <idryomov@gmail.com>,
Josh Durgin <jdurgin@redhat.com>
Subject: [PATCH 4.6 90/96] libceph: apply new_state before new_up_client on incrementals
Date: Mon, 8 Aug 2016 21:11:53 +0200 [thread overview]
Message-ID: <20160808180247.868543573@linuxfoundation.org> (raw)
In-Reply-To: <20160808180243.898163389@linuxfoundation.org>
4.6-stable review patch. If anyone has any objections, please let me know.
------------------
From: Ilya Dryomov <idryomov@gmail.com>
commit 930c532869774ebf8af9efe9484c597f896a7d46 upstream.
Currently, osd_weight and osd_state fields are updated in the encoding
order. This is wrong, because an incremental map may look like, e.g.:
new_up_client: { osd=6, addr=... } # set osd_state and addr
new_state: { osd=6, xorstate=EXISTS } # clear osd_state
Suppose osd6's current osd_state is EXISTS (i.e. osd6 is down). After
applying new_up_client, osd_state is changed to EXISTS | UP. Carrying
on with the new_state update, we flip EXISTS and leave osd6 in a weird
"!EXISTS but UP" state. A non-existent OSD is considered down by the
mapping code:
2087 for (i = 0; i < pg->pg_temp.len; i++) {
2088 if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
2089 if (ceph_can_shift_osds(pi))
2090 continue;
2091
2092 temp->osds[temp->size++] = CRUSH_ITEM_NONE;
and so requests get directed to the second OSD in the set instead of
the first, resulting in OSD-side errors like:
[WRN] : client.4239 192.168.122.21:0/2444980242 misdirected client.4239.1:2827 pg 2.5df899f2 to osd.4 not [1,4,6] in e680/680
and hung rbds on the client:
[ 493.566367] rbd: rbd0: write 400000 at 11cc00000 (0)
[ 493.566805] rbd: rbd0: result -6 xferred 400000
[ 493.567011] blk_update_request: I/O error, dev rbd0, sector 9330688
The fix is to decouple applying the updates from decoding them and:
- apply new_weight first
- apply new_state before new_up_client
- twiddle osd_state flags if marking in
- clear out some of the state if osd is destroyed
Fixes: http://tracker.ceph.com/issues/14901
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Josh Durgin <jdurgin@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
net/ceph/osdmap.c | 156 +++++++++++++++++++++++++++++++++++++++---------------
1 file changed, 113 insertions(+), 43 deletions(-)
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1201,6 +1201,115 @@ struct ceph_osdmap *ceph_osdmap_decode(v
}
/*
+ * Encoding order is (new_up_client, new_state, new_weight). Need to
+ * apply in the (new_weight, new_state, new_up_client) order, because
+ * an incremental map may look like e.g.
+ *
+ * new_up_client: { osd=6, addr=... } # set osd_state and addr
+ * new_state: { osd=6, xorstate=EXISTS } # clear osd_state
+ */
+static int decode_new_up_state_weight(void **p, void *end,
+ struct ceph_osdmap *map)
+{
+ void *new_up_client;
+ void *new_state;
+ void *new_weight_end;
+ u32 len;
+
+ new_up_client = *p;
+ ceph_decode_32_safe(p, end, len, e_inval);
+ len *= sizeof(u32) + sizeof(struct ceph_entity_addr);
+ ceph_decode_need(p, end, len, e_inval);
+ *p += len;
+
+ new_state = *p;
+ ceph_decode_32_safe(p, end, len, e_inval);
+ len *= sizeof(u32) + sizeof(u8);
+ ceph_decode_need(p, end, len, e_inval);
+ *p += len;
+
+ /* new_weight */
+ ceph_decode_32_safe(p, end, len, e_inval);
+ while (len--) {
+ s32 osd;
+ u32 w;
+
+ ceph_decode_need(p, end, 2*sizeof(u32), e_inval);
+ osd = ceph_decode_32(p);
+ w = ceph_decode_32(p);
+ BUG_ON(osd >= map->max_osd);
+ pr_info("osd%d weight 0x%x %s\n", osd, w,
+ w == CEPH_OSD_IN ? "(in)" :
+ (w == CEPH_OSD_OUT ? "(out)" : ""));
+ map->osd_weight[osd] = w;
+
+ /*
+ * If we are marking in, set the EXISTS, and clear the
+ * AUTOOUT and NEW bits.
+ */
+ if (w) {
+ map->osd_state[osd] |= CEPH_OSD_EXISTS;
+ map->osd_state[osd] &= ~(CEPH_OSD_AUTOOUT |
+ CEPH_OSD_NEW);
+ }
+ }
+ new_weight_end = *p;
+
+ /* new_state (up/down) */
+ *p = new_state;
+ len = ceph_decode_32(p);
+ while (len--) {
+ s32 osd;
+ u8 xorstate;
+ int ret;
+
+ osd = ceph_decode_32(p);
+ xorstate = ceph_decode_8(p);
+ if (xorstate == 0)
+ xorstate = CEPH_OSD_UP;
+ BUG_ON(osd >= map->max_osd);
+ if ((map->osd_state[osd] & CEPH_OSD_UP) &&
+ (xorstate & CEPH_OSD_UP))
+ pr_info("osd%d down\n", osd);
+ if ((map->osd_state[osd] & CEPH_OSD_EXISTS) &&
+ (xorstate & CEPH_OSD_EXISTS)) {
+ pr_info("osd%d does not exist\n", osd);
+ map->osd_weight[osd] = CEPH_OSD_IN;
+ ret = set_primary_affinity(map, osd,
+ CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
+ if (ret)
+ return ret;
+ memset(map->osd_addr + osd, 0, sizeof(*map->osd_addr));
+ map->osd_state[osd] = 0;
+ } else {
+ map->osd_state[osd] ^= xorstate;
+ }
+ }
+
+ /* new_up_client */
+ *p = new_up_client;
+ len = ceph_decode_32(p);
+ while (len--) {
+ s32 osd;
+ struct ceph_entity_addr addr;
+
+ osd = ceph_decode_32(p);
+ ceph_decode_copy(p, &addr, sizeof(addr));
+ ceph_decode_addr(&addr);
+ BUG_ON(osd >= map->max_osd);
+ pr_info("osd%d up\n", osd);
+ map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
+ map->osd_addr[osd] = addr;
+ }
+
+ *p = new_weight_end;
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+/*
* decode and apply an incremental map update.
*/
struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
@@ -1299,49 +1408,10 @@ struct ceph_osdmap *osdmap_apply_increme
__remove_pg_pool(&map->pg_pools, pi);
}
- /* new_up */
- ceph_decode_32_safe(p, end, len, e_inval);
- while (len--) {
- u32 osd;
- struct ceph_entity_addr addr;
- ceph_decode_32_safe(p, end, osd, e_inval);
- ceph_decode_copy_safe(p, end, &addr, sizeof(addr), e_inval);
- ceph_decode_addr(&addr);
- pr_info("osd%d up\n", osd);
- BUG_ON(osd >= map->max_osd);
- map->osd_state[osd] |= CEPH_OSD_UP | CEPH_OSD_EXISTS;
- map->osd_addr[osd] = addr;
- }
-
- /* new_state */
- ceph_decode_32_safe(p, end, len, e_inval);
- while (len--) {
- u32 osd;
- u8 xorstate;
- ceph_decode_32_safe(p, end, osd, e_inval);
- xorstate = **(u8 **)p;
- (*p)++; /* clean flag */
- if (xorstate == 0)
- xorstate = CEPH_OSD_UP;
- if (xorstate & CEPH_OSD_UP)
- pr_info("osd%d down\n", osd);
- if (osd < map->max_osd)
- map->osd_state[osd] ^= xorstate;
- }
-
- /* new_weight */
- ceph_decode_32_safe(p, end, len, e_inval);
- while (len--) {
- u32 osd, off;
- ceph_decode_need(p, end, sizeof(u32)*2, e_inval);
- osd = ceph_decode_32(p);
- off = ceph_decode_32(p);
- pr_info("osd%d weight 0x%x %s\n", osd, off,
- off == CEPH_OSD_IN ? "(in)" :
- (off == CEPH_OSD_OUT ? "(out)" : ""));
- if (osd < map->max_osd)
- map->osd_weight[osd] = off;
- }
+ /* new_up_client, new_state, new_weight */
+ err = decode_new_up_state_weight(p, end, map);
+ if (err)
+ goto bad;
/* new_pg_temp */
err = decode_new_pg_temp(p, end, map);
next prev parent reply other threads:[~2016-08-08 19:11 UTC|newest]
Thread overview: 99+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <CGME20160808191823uscas1p21b9903f952ca81e8d85ef950478b703e@uscas1p2.samsung.com>
2016-08-08 19:10 ` [PATCH 4.6 00/96] 4.6.6-stable review Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 02/96] x86/quirks: Apply nvidia_bugs quirk only on root bus Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 03/96] x86/quirks: Reintroduce scanning of secondary buses Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 05/96] dmaengine: at_xdmac: align descriptors on 64 bits Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 06/96] dmaengine: at_xdmac: fix residue corruption Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 07/96] dmaengine: at_xdmac: double FIFO flush needed to compute residue Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 08/96] mm, sl[au]b: add __GFP_ATOMIC to the GFP reclaim mask Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 09/96] memcg: mem_cgroup_migrate() may be called with irq disabled Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 10/96] memcg: css_alloc should return an ERR_PTR value on error Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 11/96] mm/swap.c: flush lru pvecs on compound page arrival Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 12/96] mm, compaction: abort free scanner if split fails Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 13/96] fs/nilfs2: fix potential underflow in call to crc32_le Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 14/96] mm, compaction: prevent VM_BUG_ON when terminating freeing scanner Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 15/96] uapi: export lirc.h header Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 16/96] mm, meminit: always return a valid node from early_pfn_to_nid Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 17/96] mm, meminit: ensure node is online before checking whether pages are uninitialised Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 18/96] vmlinux.lds: account for destructor sections Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 19/96] mm: thp: refix false positive BUG in page_move_anon_rmap() Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 20/96] mm: memcontrol: fix cgroup creation failure after many small jobs Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 21/96] radix-tree: fix radix_tree_iter_retry() for tagged iterators Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 22/96] pps: do not crash when failed to register Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 23/96] kernel/sysrq, watchdog, sched/core: Reset watchdog on all CPUs while processing sysrq-w Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 24/96] sched/debug: Fix deadlock when enabling sched events Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 25/96] arc: unwind: warn only once if DW2_UNWIND is disabled Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 26/96] ARC: unwind: ensure that .debug_frame is generated (vs. .eh_frame) Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 27/96] xen/pciback: Fix conf_space read/write overlap check Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 28/96] xen-blkfront: save uncompleted reqs in blkfront_resume() Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 29/96] xenbus: dont BUG() on user mode induced condition Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 30/96] xenbus: dont bail early from xenbus_dev_request_and_reply() Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 31/96] xen-blkfront: fix resume issues after a migration Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 32/96] xen-blkfront: dont call talk_to_blkback when already connected to blkback Greg Kroah-Hartman
2016-08-08 19:10 ` [PATCH 4.6 36/96] Input: vmmouse - remove port reservation Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 37/96] Input: elantech - add more IC body types to the list Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 38/96] Input: xpad - fix oops when attaching an unknown Xbox One gamepad Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 39/96] Input: wacom_w8001 - w8001_MAX_LENGTH should be 13 Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 40/96] Input: wacom_w8001 - ignore invalid pen data packets Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 41/96] Input: xpad - validate USB endpoint count during probe Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 42/96] Revert "Input: wacom_w8001 - drop use of ABS_MT_TOOL_TYPE" Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 43/96] Input: synaptics-rmi4 - fix maximum size check for F12 control register 8 Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 45/96] pvclock: Add CPU barriers to get correct version value Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 46/96] pinctrl: single: Fix missing flush of posted write for a wakeirq Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 47/96] pinctrl: imx: Do not treat a PIN without MUX register as an error Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 48/96] cgroup: remove redundant cleanup in css_create Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 49/96] cgroup: set css->id to -1 during init Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 50/96] cgroup: Disable IRQs while holding css_set_lock Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 51/96] power_supply: power_supply_read_temp only if use_cnt > 0 Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 52/96] locks: use file_inode() Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 53/96] Revert "ecryptfs: forbid opening files without mmap handler" Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 54/96] ecryptfs: dont allow mmap when the lower fs doesnt support it Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 55/96] ext4: verify extent header depth Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 56/96] 9p: use file_dentry() Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 57/96] cpufreq: Avoid false-positive WARN_ON()s in cpufreq_update_policy() Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 58/96] devpts: fix null pointer dereference on failed memory allocation Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 59/96] namespace: update event counter when umounting a deleted dentry Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 60/96] spi: rockchip: Signal unfinished DMA transfers Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 61/96] spi: sunxi: fix transfer timeout Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 62/96] spi: sun4i: fix FIFO limit Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 63/96] clk: rockchip: initialize flags of clk_init_data in mmc-phase clock Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 64/96] clk: at91: fix clk_programmable_set_parent() Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 65/96] lockd: unregister notifier blocks if the service fails to come up completely Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 66/96] platform/chrome: cros_ec_dev - double fetch bug in ioctl Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 67/96] qeth: delete napi struct when removing a qeth device Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 68/96] init/Kconfig: keep Expert users menu together Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 69/96] block: fix use-after-free in sys_ioprio_get() Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 70/96] mmc: block: fix free of uninitialized idata->buf Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 71/96] mmc: block: fix packed command header endianness Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 72/96] sched/fair: Fix effective_load() to consistently use smoothed load Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 73/96] can: at91_can: RX queue could get stuck at high bus load Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 74/96] can: c_can: Update D_CAN TX and RX functions to 32 bit - fix Altera Cyclone access Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 75/96] can: fix handling of unmodifiable configuration options fix Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 76/96] can: fix oops caused by wrong rtnl dellink usage Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 77/96] RDS: fix rds_tcp_init() error path Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 78/96] irqchip/mips-gic: Map to VPs using HW VPNum Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 79/96] irqchip/mips-gic: Match IPI IRQ domain by bus token only Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 81/96] SCSI: fix new bug in scsi_dev_info_list string matching Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 82/96] ipr: Clear interrupt on croc/crocodile when running with LSI Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 83/96] media: fix airspy usb probe error path Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 84/96] posix_cpu_timer: Exit early when process has been reaped Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 85/96] cpu/hotplug: Keep enough storage space if SMP=n to avoid array out of bounds scribble Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 86/96] [media] adv7604: Dont ignore pad number in subdev DV timings pad operations Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 87/96] i2c: qup: Fix wrong value of index variable Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 88/96] i2c: mux: reg: wrong condition checked for of_address_to_resource return value Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 89/96] libata: LITE-ON CX1-JB256-HP needs lower max_sectors Greg Kroah-Hartman
2016-08-08 19:11 ` Greg Kroah-Hartman [this message]
2016-08-08 19:11 ` [PATCH 4.6 91/96] net: mvneta: set real interrupt per packet for tx_done Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 92/96] cfg80211: handle failed skb allocation Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 93/96] intel_th: pci: Add Kaby Lake PCH-H support Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 94/96] intel_th: Fix a deadlock in modprobing Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 95/96] vfs: ioctl: prevent double-fetch in dedupe ioctl Greg Kroah-Hartman
2016-08-08 19:11 ` [PATCH 4.6 96/96] vfs: fix deadlock in file_remove_privs() on overlayfs Greg Kroah-Hartman
2016-08-09 5:03 ` [PATCH 4.6 00/96] 4.6.6-stable review Guenter Roeck
2016-08-09 8:24 ` Greg Kroah-Hartman
2016-08-09 8:33 ` Paul Burton
2016-08-09 8:37 ` Greg Kroah-Hartman
2016-08-09 16:19 ` Guenter Roeck
2016-08-09 17:22 ` Greg Kroah-Hartman
2016-08-10 1:25 ` Guenter Roeck
2016-08-09 15:10 ` Shuah Khan
2016-08-09 17:22 ` Greg Kroah-Hartman
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20160808180247.868543573@linuxfoundation.org \
--to=gregkh@linuxfoundation.org \
--cc=idryomov@gmail.com \
--cc=jdurgin@redhat.com \
--cc=linux-kernel@vger.kernel.org \
--cc=stable@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).