All of lore.kernel.org
 help / color / mirror / Atom feed
From: Dexuan Cui <decui@microsoft.com>
To: kys@microsoft.com, haiyangz@microsoft.com,
	sthemmin@microsoft.com, sashal@kernel.org,
	linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org,
	mikelley@microsoft.com, vkuznets@redhat.com
Cc: Dexuan Cui <decui@microsoft.com>
Subject: [PATCH v3 1/4] Tools: hv: Reopen the devices if read() or write() returns errors
Date: Sat, 25 Jan 2020 11:53:53 -0800	[thread overview]
Message-ID: <1579982036-121722-2-git-send-email-decui@microsoft.com> (raw)
In-Reply-To: <1579982036-121722-1-git-send-email-decui@microsoft.com>

The state machine in the hv_utils driver can run out of order in some
corner cases, e.g. if the kvp daemon doesn't call write() fast enough
due to some reason, kvp_timeout_func() can run first and move the state
to HVUTIL_READY; next, when kvp_on_msg() is called it returns -EINVAL
since kvp_transaction.state is smaller than HVUTIL_USERSPACE_REQ; later,
the daemon's write() gets an error -EINVAL, and the daemon will exit().

We can reproduce the issue by sending a SIGSTOP signal to the daemon, wait
for 1 minute, and send a SIGCONT signal to the daemon: the daemon will
exit() quickly.

We can fix the issue by forcing a reset of the device (which means the
daemon can close() and open() the device again) and doing extra necessary
clean-up.

Signed-off-by: Dexuan Cui <decui@microsoft.com>

---
Changes in v2:
    This is actually a new patch that makes the daemons more robust.

Changes in v3 (I addressed Michael's comments):
    Don't reset target_fd, since that's unnecessary.
    Reset target_fname by: target_fname[0] = '\0';
    Added the missing "fs_frozen = true;" in vss_operate().
    After reopen_vss_fd: if vss_operate(VSS_OP_THAW) can not clear
        fs_frozen due to an error, we just exit().
    Added comments.

 tools/hv/hv_fcopy_daemon.c | 33 +++++++++++++++++++++----
 tools/hv/hv_kvp_daemon.c   | 36 ++++++++++++++++------------
 tools/hv/hv_vss_daemon.c   | 49 +++++++++++++++++++++++++++++---------
 3 files changed, 88 insertions(+), 30 deletions(-)

diff --git a/tools/hv/hv_fcopy_daemon.c b/tools/hv/hv_fcopy_daemon.c
index aea2d91ab364..48cfa032432c 100644
--- a/tools/hv/hv_fcopy_daemon.c
+++ b/tools/hv/hv_fcopy_daemon.c
@@ -80,6 +80,8 @@ static int hv_start_fcopy(struct hv_start_fcopy *smsg)
 
 	error = 0;
 done:
+	if (error)
+		target_fname[0] = '\0';
 	return error;
 }
 
@@ -108,15 +110,29 @@ static int hv_copy_data(struct hv_do_fcopy *cpmsg)
 	return ret;
 }
 
+/*
+ * Reset target_fname to "" in the two below functions for hibernation: if
+ * the fcopy operation is aborted by hibernation, the daemon should remove the
+ * partially-copied file; to achieve this, the hv_utils driver always fakes a
+ * CANCEL_FCOPY message upon suspend, and later when the VM resumes back,
+ * the daemon calls hv_copy_cancel() to remove the file; if a file is copied
+ * successfully before suspend, hv_copy_finished() must reset target_fname to
+ * avoid that the file can be incorrectly removed upon resume, since the faked
+ * CANCEL_FCOPY message is spurious in this case.
+ */
 static int hv_copy_finished(void)
 {
 	close(target_fd);
+	target_fname[0] = '\0';
 	return 0;
 }
 static int hv_copy_cancel(void)
 {
 	close(target_fd);
-	unlink(target_fname);
+	if (strlen(target_fname) > 0) {
+		unlink(target_fname);
+		target_fname[0] = '\0';
+	}
 	return 0;
 
 }
@@ -141,7 +157,7 @@ int main(int argc, char *argv[])
 		struct hv_do_fcopy copy;
 		__u32 kernel_modver;
 	} buffer = { };
-	int in_handshake = 1;
+	int in_handshake;
 
 	static struct option long_options[] = {
 		{"help",	no_argument,	   0,  'h' },
@@ -170,6 +186,10 @@ int main(int argc, char *argv[])
 	openlog("HV_FCOPY", 0, LOG_USER);
 	syslog(LOG_INFO, "starting; pid is:%d", getpid());
 
+reopen_fcopy_fd:
+	/* Remove any possible partially-copied file on error */
+	hv_copy_cancel();
+	in_handshake = 1;
 	fcopy_fd = open("/dev/vmbus/hv_fcopy", O_RDWR);
 
 	if (fcopy_fd < 0) {
@@ -196,7 +216,7 @@ int main(int argc, char *argv[])
 		len = pread(fcopy_fd, &buffer, sizeof(buffer), 0);
 		if (len < 0) {
 			syslog(LOG_ERR, "pread failed: %s", strerror(errno));
-			exit(EXIT_FAILURE);
+			goto reopen_fcopy_fd;
 		}
 
 		if (in_handshake) {
@@ -231,9 +251,14 @@ int main(int argc, char *argv[])
 
 		}
 
+		/*
+		 * pwrite() may return an error due to the faked CANCEL_FCOPY
+		 * message upon hibernation. Ignore the error by resetting the
+		 * dev file, i.e. closing and re-opening it.
+		 */
 		if (pwrite(fcopy_fd, &error, sizeof(int), 0) != sizeof(int)) {
 			syslog(LOG_ERR, "pwrite failed: %s", strerror(errno));
-			exit(EXIT_FAILURE);
+			goto reopen_fcopy_fd;
 		}
 	}
 }
diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c
index e9ef4ca6a655..ee9c1bb2293e 100644
--- a/tools/hv/hv_kvp_daemon.c
+++ b/tools/hv/hv_kvp_daemon.c
@@ -76,7 +76,7 @@ enum {
 	DNS
 };
 
-static int in_hand_shake = 1;
+static int in_hand_shake;
 
 static char *os_name = "";
 static char *os_major = "";
@@ -1360,7 +1360,7 @@ void print_usage(char *argv[])
 
 int main(int argc, char *argv[])
 {
-	int kvp_fd, len;
+	int kvp_fd = -1, len;
 	int error;
 	struct pollfd pfd;
 	char    *p;
@@ -1400,14 +1400,6 @@ int main(int argc, char *argv[])
 	openlog("KVP", 0, LOG_USER);
 	syslog(LOG_INFO, "KVP starting; pid is:%d", getpid());
 
-	kvp_fd = open("/dev/vmbus/hv_kvp", O_RDWR | O_CLOEXEC);
-
-	if (kvp_fd < 0) {
-		syslog(LOG_ERR, "open /dev/vmbus/hv_kvp failed; error: %d %s",
-			errno, strerror(errno));
-		exit(EXIT_FAILURE);
-	}
-
 	/*
 	 * Retrieve OS release information.
 	 */
@@ -1423,6 +1415,18 @@ int main(int argc, char *argv[])
 		exit(EXIT_FAILURE);
 	}
 
+reopen_kvp_fd:
+	if (kvp_fd != -1)
+		close(kvp_fd);
+	in_hand_shake = 1;
+	kvp_fd = open("/dev/vmbus/hv_kvp", O_RDWR | O_CLOEXEC);
+
+	if (kvp_fd < 0) {
+		syslog(LOG_ERR, "open /dev/vmbus/hv_kvp failed; error: %d %s",
+		       errno, strerror(errno));
+		exit(EXIT_FAILURE);
+	}
+
 	/*
 	 * Register ourselves with the kernel.
 	 */
@@ -1456,9 +1460,7 @@ int main(int argc, char *argv[])
 		if (len != sizeof(struct hv_kvp_msg)) {
 			syslog(LOG_ERR, "read failed; error:%d %s",
 			       errno, strerror(errno));
-
-			close(kvp_fd);
-			return EXIT_FAILURE;
+			goto reopen_kvp_fd;
 		}
 
 		/*
@@ -1617,13 +1619,17 @@ int main(int argc, char *argv[])
 			break;
 		}
 
-		/* Send the value back to the kernel. */
+		/*
+		 * Send the value back to the kernel. Note: the write() may
+		 * return an error due to hibernation; we can ignore the error
+		 * by resetting the dev file, i.e. closing and re-opening it.
+		 */
 kvp_done:
 		len = write(kvp_fd, hv_msg, sizeof(struct hv_kvp_msg));
 		if (len != sizeof(struct hv_kvp_msg)) {
 			syslog(LOG_ERR, "write failed; error: %d %s", errno,
 			       strerror(errno));
-			exit(EXIT_FAILURE);
+			goto reopen_kvp_fd;
 		}
 	}
 
diff --git a/tools/hv/hv_vss_daemon.c b/tools/hv/hv_vss_daemon.c
index 92902a88f671..dd111870beee 100644
--- a/tools/hv/hv_vss_daemon.c
+++ b/tools/hv/hv_vss_daemon.c
@@ -28,6 +28,8 @@
 #include <stdbool.h>
 #include <dirent.h>
 
+static bool fs_frozen;
+
 /* Don't use syslog() in the function since that can cause write to disk */
 static int vss_do_freeze(char *dir, unsigned int cmd)
 {
@@ -155,18 +157,27 @@ static int vss_operate(int operation)
 			continue;
 		}
 		error |= vss_do_freeze(ent->mnt_dir, cmd);
-		if (error && operation == VSS_OP_FREEZE)
-			goto err;
+		if (operation == VSS_OP_FREEZE) {
+			if (error)
+				goto err;
+			fs_frozen = true;
+		}
 	}
 
 	endmntent(mounts);
 
 	if (root_seen) {
 		error |= vss_do_freeze("/", cmd);
-		if (error && operation == VSS_OP_FREEZE)
-			goto err;
+		if (operation == VSS_OP_FREEZE) {
+			if (error)
+				goto err;
+			fs_frozen = true;
+		}
 	}
 
+	if (operation == VSS_OP_THAW && !error)
+		fs_frozen = false;
+
 	goto out;
 err:
 	save_errno = errno;
@@ -175,6 +186,7 @@ static int vss_operate(int operation)
 		endmntent(mounts);
 	}
 	vss_operate(VSS_OP_THAW);
+	fs_frozen = false;
 	/* Call syslog after we thaw all filesystems */
 	if (ent)
 		syslog(LOG_ERR, "FREEZE of %s failed; error:%d %s",
@@ -196,13 +208,13 @@ void print_usage(char *argv[])
 
 int main(int argc, char *argv[])
 {
-	int vss_fd, len;
+	int vss_fd = -1, len;
 	int error;
 	struct pollfd pfd;
 	int	op;
 	struct hv_vss_msg vss_msg[1];
 	int daemonize = 1, long_index = 0, opt;
-	int in_handshake = 1;
+	int in_handshake;
 	__u32 kernel_modver;
 
 	static struct option long_options[] = {
@@ -232,6 +244,18 @@ int main(int argc, char *argv[])
 	openlog("Hyper-V VSS", 0, LOG_USER);
 	syslog(LOG_INFO, "VSS starting; pid is:%d", getpid());
 
+reopen_vss_fd:
+	if (vss_fd != -1)
+		close(vss_fd);
+	if (fs_frozen) {
+		if (vss_operate(VSS_OP_THAW) || fs_frozen) {
+			syslog(LOG_ERR, "failed to thaw file system: err=%d",
+			       errno);
+			exit(EXIT_FAILURE);
+		}
+	}
+
+	in_handshake = 1;
 	vss_fd = open("/dev/vmbus/hv_vss", O_RDWR);
 	if (vss_fd < 0) {
 		syslog(LOG_ERR, "open /dev/vmbus/hv_vss failed; error: %d %s",
@@ -284,8 +308,7 @@ int main(int argc, char *argv[])
 		if (len != sizeof(struct hv_vss_msg)) {
 			syslog(LOG_ERR, "read failed; error:%d %s",
 			       errno, strerror(errno));
-			close(vss_fd);
-			return EXIT_FAILURE;
+			goto reopen_vss_fd;
 		}
 
 		op = vss_msg->vss_hdr.operation;
@@ -312,14 +335,18 @@ int main(int argc, char *argv[])
 		default:
 			syslog(LOG_ERR, "Illegal op:%d\n", op);
 		}
+
+		/*
+		 * The write() may return an error due to the faked VSS_OP_THAW
+		 * message upon hibernation. Ignore the error by resetting the
+		 * dev file, i.e. closing and re-opening it.
+		 */
 		vss_msg->error = error;
 		len = write(vss_fd, vss_msg, sizeof(struct hv_vss_msg));
 		if (len != sizeof(struct hv_vss_msg)) {
 			syslog(LOG_ERR, "write failed; error: %d %s", errno,
 			       strerror(errno));
-
-			if (op == VSS_OP_FREEZE)
-				vss_operate(VSS_OP_THAW);
+			goto reopen_vss_fd;
 		}
 	}
 
-- 
2.19.1


  reply	other threads:[~2020-01-25 19:55 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-01-25 19:53 [PATCH v3 0/4] hv_utils: Add the support of hibernation Dexuan Cui
2020-01-25 19:53 ` Dexuan Cui [this message]
2020-01-26  4:49   ` [PATCH v3 1/4] Tools: hv: Reopen the devices if read() or write() returns errors Michael Kelley
2020-01-26  4:59     ` Dexuan Cui
2020-01-25 19:53 ` [PATCH v3 2/4] hv_utils: Support host-initiated restart request Dexuan Cui
2020-01-26  4:57   ` Michael Kelley
2020-01-26  5:05     ` Dexuan Cui
2020-01-25 19:53 ` [PATCH v3 3/4] hv_utils: Support host-initiated hibernation request Dexuan Cui
2020-01-25 19:53 ` [PATCH v3 4/4] hv_utils: Add the support of hibernation Dexuan Cui
2020-01-26  4:58   ` Michael Kelley

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1579982036-121722-2-git-send-email-decui@microsoft.com \
    --to=decui@microsoft.com \
    --cc=haiyangz@microsoft.com \
    --cc=kys@microsoft.com \
    --cc=linux-hyperv@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mikelley@microsoft.com \
    --cc=sashal@kernel.org \
    --cc=sthemmin@microsoft.com \
    --cc=vkuznets@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.