cluster-devel.redhat.com archive mirror
 help / color / mirror / Atom feed
* [Cluster-devel] [PATCH] dlm_controld: trigger network interface failover if a communications error is detected
@ 2019-06-11 13:21 David Windsor
  0 siblings, 0 replies; only message in thread
From: David Windsor @ 2019-06-11 13:21 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Support for automatic failover in the face of network interruptions
is being added to the DLM kernel component [1].  This patch aids in that
effort by adding a mechanism whereby userspace can request that the DLM
kernel component switch to the next usable network interface.

When --failover is set, dlm_controld will write to a configfs node that
alerts the DLM kernel component to the fact that a communications error
has occurred in userspace.  The kernel then reinitializes the DLM
communications stack, binding to the next usable network interface.
The kernel implements a round-robin mechanism for selecting the next
network interface.  If necessary, other interface selection heuristics
may be added later.

[1] https://www.redhat.com/archives/cluster-devel/2019-January/msg00009.html

Signed-off-by: David Windsor <dwindsor@redhat.com>

diff --git a/dlm_controld/action.c b/dlm_controld/action.c
index ecd0d022..c107092d 100644
--- a/dlm_controld/action.c
+++ b/dlm_controld/action.c
@@ -639,6 +639,28 @@ int add_configfs_node(int nodeid, char *addr, int addrlen, int local)
 	}
 	close(fd);
 
+	/*
+	 * set failover policy
+	 */
+	if opt(failover_ind) {
+		memset(path, 0, PATH_MAX);
+		snprintf(path, PATH_MAX, "%s/failover", COMMS_DIR);
+
+		fd = open(path, O_WRONLY);
+		if (fd < 0) {
+			log_error("%s: open failed: %d", path, errno);
+			return -1;
+		}
+
+		rv = do_write(fd, (void *)"1", strlen("1"));
+		if (rv < 0) {
+			log_error("%s: write failed: %d", path, errno);
+			close(fd);
+			return -1;
+		}
+		close(fd);
+	}
+
 	/*
 	 * set local
 	 */
@@ -681,6 +703,7 @@ int add_configfs_node(int nodeid, char *addr, int addrlen, int local)
 		}
 		close(fd);
 	}
+
  out:
 	return 0;
 }
@@ -907,6 +930,34 @@ int setup_configfs_members(void)
 	return 0;
 }
 
+/*
+ * Write "1" to COMMS_DIR/error so the kernel switches DLM comms to the
+ * next failover network interface.  No-op (returns 0) unless the
+ * failover option is set.  Returns -1 if the open or write fails.
+ */
+int configfs_next_addr(void)
+{
+	char path[PATH_MAX];
+	int fd, rv;
+
+	if (!opt(failover_ind))
+		return 0;
+
+	snprintf(path, PATH_MAX, "%s/error", COMMS_DIR);
+
+	fd = open(path, O_WRONLY);
+	if (fd < 0) {
+		log_error("%s: open failed: %d", path, errno);
+		return -1;
+	}
+
+	rv = do_write(fd, (void *)"1", strlen("1"));
+	if (rv < 0)
+		log_error("%s: write failed: %d", path, errno);
+	close(fd);
+	return rv < 0 ? -1 : 0;
+}
+
 static void find_minors(void)
 {
 	FILE *fl;
diff --git a/dlm_controld/dlm.conf.5 b/dlm_controld/dlm.conf.5
index 09492176..f086dfb1 100644
--- a/dlm_controld/dlm.conf.5
+++ b/dlm_controld/dlm.conf.5
@@ -40,6 +40,8 @@ protocol
 .br
 bind_all
 .br
+failover
+.br
 debug_logfile
 .br
 enable_plock
diff --git a/dlm_controld/dlm_daemon.h b/dlm_controld/dlm_daemon.h
index 3221e19c..9f244fd0 100644
--- a/dlm_controld/dlm_daemon.h
+++ b/dlm_controld/dlm_daemon.h
@@ -96,6 +96,7 @@ enum {
         protocol_ind,
         debug_logfile_ind,
 	bind_all_ind,
+	failover_ind,
         enable_fscontrol_ind,
         enable_plock_ind,
         plock_debug_ind,
@@ -363,6 +364,7 @@ void del_configfs_node(int nodeid);
 void clear_configfs(void);
 int setup_configfs_options(void);
 int setup_configfs_members(void);
+int configfs_next_addr(void);
 int check_uncontrolled_lockspaces(void);
 int setup_misc_devices(void);
 int path_exists(const char *path);
diff --git a/dlm_controld/main.c b/dlm_controld/main.c
index 8be6a4bc..ca19eac9 100644
--- a/dlm_controld/main.c
+++ b/dlm_controld/main.c
@@ -1501,6 +1501,7 @@ static int loop(void)
 			if (pollfd[i].revents & (POLLERR | POLLHUP | POLLNVAL)) {
 				deadfn = client[i].deadfn;
 				deadfn(i);
+				configfs_next_addr();
 			}
 		}
 		query_unlock();
@@ -1732,6 +1733,11 @@ static void set_opt_defaults(void)
 			0, NULL,
 			""); /* do not advertise */
 
+	set_opt_default(failover_ind,
+			"failover", '\0', req_arg_int,
+			0, NULL,
+			""); /* do not advertise */
+
 	set_opt_default(debug_logfile_ind,
 			"debug_logfile", 'L', no_arg,
 			0, NULL,
@@ -2096,4 +2102,3 @@ int main(int argc, char **argv)
 	unlink_lockfile(fd, RUNDIR, RUN_FILE_NAME);
 	return rv < 0 ? 1 : 0;
 }
-
-- 
2.21.0



^ permalink raw reply related	[flat|nested] only message in thread

only message in thread, other threads:[~2019-06-11 13:21 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-06-11 13:21 [Cluster-devel] [PATCH] dlm_controld: trigger network interface failover if a communications error is detected David Windsor

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).