From: Sasha Levin <sashal@kernel.org>
To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
Cc: Xiubo Li <xiubli@redhat.com>, Venky Shankar <vshankar@redhat.com>,
	Ilya Dryomov <idryomov@gmail.com>,
	Sasha Levin <sashal@kernel.org>,
	ceph-devel@vger.kernel.org
Subject: [PATCH AUTOSEL 6.1 38/38] ceph: blocklist the kclient when receiving corrupted snap trace
Date: Thu,  9 Feb 2023 06:14:57 -0500
Message-ID: <20230209111459.1891941-38-sashal@kernel.org>
In-Reply-To: <20230209111459.1891941-1-sashal@kernel.org>

From: Xiubo Li <xiubli@redhat.com>

[ Upstream commit a68e564adcaa69b0930809fb64d9d5f7d9c32ba9 ]

When we receive a corrupted snap trace, we don't know what exactly
has happened on the MDS side, and we shouldn't continue issuing I/O
or metadata requests to the MDS, since they may corrupt data or
return incorrect contents.

This patch blocks all further I/O and MDS requests immediately and
then evicts the kclient itself.

The reason we still need to evict the kclient right after blocking
all further I/O is so that the MDS can revoke the caps faster.

Link: https://tracker.ceph.com/issues/57686
Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Venky Shankar <vshankar@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/ceph/addr.c       | 17 +++++++++++++++--
 fs/ceph/caps.c       | 16 +++++++++++++---
 fs/ceph/file.c       |  3 +++
 fs/ceph/mds_client.c | 30 +++++++++++++++++++++++++++---
 fs/ceph/snap.c       | 36 ++++++++++++++++++++++++++++++++++--
 fs/ceph/super.h      |  1 +
 6 files changed, 93 insertions(+), 10 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index dcf701b05cc1c..b6737ee920b22 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -305,7 +305,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
 	struct inode *inode = rreq->inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
-	struct ceph_osd_request *req;
+	struct ceph_osd_request *req = NULL;
 	struct ceph_vino vino = ceph_vino(inode);
 	struct iov_iter iter;
 	struct page **pages;
@@ -313,6 +313,11 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
 	int err = 0;
 	u64 len = subreq->len;
 
+	if (ceph_inode_is_shutdown(inode)) {
+		err = -EIO;
+		goto out;
+	}
+
 	if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq))
 		return;
 
@@ -563,6 +568,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 
 	dout("writepage %p idx %lu\n", page, page->index);
 
+	if (ceph_inode_is_shutdown(inode))
+		return -EIO;
+
 	/* verify this is a writeable snap context */
 	snapc = page_snap_context(page);
 	if (!snapc) {
@@ -1643,7 +1651,7 @@ int ceph_uninline_data(struct file *file)
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_osd_request *req = NULL;
-	struct ceph_cap_flush *prealloc_cf;
+	struct ceph_cap_flush *prealloc_cf = NULL;
 	struct folio *folio = NULL;
 	u64 inline_version = CEPH_INLINE_NONE;
 	struct page *pages[1];
@@ -1657,6 +1665,11 @@ int ceph_uninline_data(struct file *file)
 	dout("uninline_data %p %llx.%llx inline_version %llu\n",
 	     inode, ceph_vinop(inode), inline_version);
 
+	if (ceph_inode_is_shutdown(inode)) {
+		err = -EIO;
+		goto out;
+	}
+
 	if (inline_version == CEPH_INLINE_NONE)
 		return 0;
 
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index cd69bf267d1b1..795fd6d84bde0 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -4081,6 +4081,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	void *p, *end;
 	struct cap_extra_info extra_info = {};
 	bool queue_trunc;
+	bool close_sessions = false;
 
 	dout("handle_caps from mds%d\n", session->s_mds);
 
@@ -4218,9 +4219,13 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		realm = NULL;
 		if (snaptrace_len) {
 			down_write(&mdsc->snap_rwsem);
-			ceph_update_snap_trace(mdsc, snaptrace,
-					       snaptrace + snaptrace_len,
-					       false, &realm);
+			if (ceph_update_snap_trace(mdsc, snaptrace,
+						   snaptrace + snaptrace_len,
+						   false, &realm)) {
+				up_write(&mdsc->snap_rwsem);
+				close_sessions = true;
+				goto done;
+			}
 			downgrade_write(&mdsc->snap_rwsem);
 		} else {
 			down_read(&mdsc->snap_rwsem);
@@ -4280,6 +4285,11 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	iput(inode);
 out:
 	ceph_put_string(extra_info.pool_ns);
+
+	/* Defer closing the sessions until after the s_mutex lock is released */
+	if (close_sessions)
+		ceph_mdsc_close_sessions(mdsc);
+
 	return;
 
 flush_cap_releases:
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 04fd34557de84..ecdd64780e8cd 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -2004,6 +2004,9 @@ static int ceph_zero_partial_object(struct inode *inode,
 	loff_t zero = 0;
 	int op;
 
+	if (ceph_inode_is_shutdown(inode))
+		return -EIO;
+
 	if (!length) {
 		op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE;
 		length = &zero;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 26a0a8b9975ef..e163f58ff3c50 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -806,6 +806,9 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 {
 	struct ceph_mds_session *s;
 
+	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO)
+		return ERR_PTR(-EIO);
+
 	if (mds >= mdsc->mdsmap->possible_max_rank)
 		return ERR_PTR(-EINVAL);
 
@@ -1478,6 +1481,9 @@ static int __open_session(struct ceph_mds_client *mdsc,
 	int mstate;
 	int mds = session->s_mds;
 
+	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO)
+		return -EIO;
+
 	/* wait for mds to go active? */
 	mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
 	dout("open_session to mds%d (%s)\n", mds,
@@ -2860,6 +2866,11 @@ static void __do_request(struct ceph_mds_client *mdsc,
 		return;
 	}
 
+	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) {
+		dout("do_request metadata corrupted\n");
+		err = -EIO;
+		goto finish;
+	}
 	if (req->r_timeout &&
 	    time_after_eq(jiffies, req->r_started + req->r_timeout)) {
 		dout("do_request timed out\n");
@@ -3245,6 +3256,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 	u64 tid;
 	int err, result;
 	int mds = session->s_mds;
+	bool close_sessions = false;
 
 	if (msg->front.iov_len < sizeof(*head)) {
 		pr_err("mdsc_handle_reply got corrupt (short) reply\n");
@@ -3351,10 +3363,17 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 	realm = NULL;
 	if (rinfo->snapblob_len) {
 		down_write(&mdsc->snap_rwsem);
-		ceph_update_snap_trace(mdsc, rinfo->snapblob,
+		err = ceph_update_snap_trace(mdsc, rinfo->snapblob,
 				rinfo->snapblob + rinfo->snapblob_len,
 				le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP,
 				&realm);
+		if (err) {
+			up_write(&mdsc->snap_rwsem);
+			close_sessions = true;
+			if (err == -EIO)
+				ceph_msg_dump(msg);
+			goto out_err;
+		}
 		downgrade_write(&mdsc->snap_rwsem);
 	} else {
 		down_read(&mdsc->snap_rwsem);
@@ -3412,6 +3431,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 				     req->r_end_latency, err);
 out:
 	ceph_mdsc_put_request(req);
+
+	/* Defer closing the sessions until after the s_mutex lock is released */
+	if (close_sessions)
+		ceph_mdsc_close_sessions(mdsc);
 	return;
 }
 
@@ -5011,7 +5034,7 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
 }
 
 /*
- * called after sb is ro.
+ * called after sb is ro or when metadata is corrupted.
  */
 void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 {
@@ -5301,7 +5324,8 @@ static void mds_peer_reset(struct ceph_connection *con)
 	struct ceph_mds_client *mdsc = s->s_mdsc;
 
 	pr_warn("mds%d closed our session\n", s->s_mds);
-	send_mds_reconnect(mdsc, s);
+	if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO)
+		send_mds_reconnect(mdsc, s);
 }
 
 static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index e4151852184e0..87007203f130e 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/ceph/ceph_debug.h>
 
+#include <linux/fs.h>
 #include <linux/sort.h>
 #include <linux/slab.h>
 #include <linux/iversion.h>
@@ -766,8 +767,10 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
 	struct ceph_snap_realm *realm;
 	struct ceph_snap_realm *first_realm = NULL;
 	struct ceph_snap_realm *realm_to_rebuild = NULL;
+	struct ceph_client *client = mdsc->fsc->client;
 	int rebuild_snapcs;
 	int err = -ENOMEM;
+	int ret;
 	LIST_HEAD(dirty_realms);
 
 	lockdep_assert_held_write(&mdsc->snap_rwsem);
@@ -884,6 +887,27 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
 	if (first_realm)
 		ceph_put_snap_realm(mdsc, first_realm);
 	pr_err("%s error %d\n", __func__, err);
+
+	/*
+	 * When receiving a corrupted snap trace we don't know what
+	 * exactly has happened on the MDS side, and we shouldn't keep
+	 * writing to the OSDs, which may corrupt the snapshot contents.
+	 *
+	 * Just try to blocklist this kclient; it must be remounted to
+	 * continue once the corrupted metadata has been fixed on the
+	 * MDS side.
+	 */
+	WRITE_ONCE(mdsc->fsc->mount_state, CEPH_MOUNT_FENCE_IO);
+	ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr);
+	if (ret)
+		pr_err("%s failed to blocklist %s: %d\n", __func__,
+		       ceph_pr_addr(&client->msgr.inst.addr), ret);
+
+	WARN(1, "%s: %s%sdo remount to continue%s",
+	     __func__, ret ? "" : ceph_pr_addr(&client->msgr.inst.addr),
+	     ret ? "" : " was blocklisted, ",
+	     err == -EIO ? " after corrupted snaptrace is fixed" : "");
+
 	return err;
 }
 
@@ -984,6 +1008,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
 	__le64 *split_inos = NULL, *split_realms = NULL;
 	int i;
 	int locked_rwsem = 0;
+	bool close_sessions = false;
 
 	/* decode */
 	if (msg->front.iov_len < sizeof(*h))
@@ -1092,8 +1117,12 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
 	 * update using the provided snap trace. if we are deleting a
 	 * snap, we can avoid queueing cap_snaps.
 	 */
-	ceph_update_snap_trace(mdsc, p, e,
-			       op == CEPH_SNAP_OP_DESTROY, NULL);
+	if (ceph_update_snap_trace(mdsc, p, e,
+				   op == CEPH_SNAP_OP_DESTROY,
+				   NULL)) {
+		close_sessions = true;
+		goto bad;
+	}
 
 	if (op == CEPH_SNAP_OP_SPLIT)
 		/* we took a reference when we created the realm, above */
@@ -1112,6 +1141,9 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
 out:
 	if (locked_rwsem)
 		up_write(&mdsc->snap_rwsem);
+
+	if (close_sessions)
+		ceph_mdsc_close_sessions(mdsc);
 	return;
 }
 
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 735279b2ceb55..3599fefa91f99 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -108,6 +108,7 @@ enum {
 	CEPH_MOUNT_UNMOUNTED,
 	CEPH_MOUNT_SHUTDOWN,
 	CEPH_MOUNT_RECOVER,
+	CEPH_MOUNT_FENCE_IO,
 };
 
 #define CEPH_ASYNC_CREATE_CONFLICT_BITS 8
-- 
2.39.0

