All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 0/1] multipath-tools: fixes for systemd watchdog
@ 2024-11-18 13:04 Martin Wilck
  2024-11-18 13:04 ` [PATCH v2 1/1] multipathd: move systemd watchdog handling into daemon Martin Wilck
  0 siblings, 1 reply; 3+ messages in thread
From: Martin Wilck @ 2024-11-18 13:04 UTC (permalink / raw)
  To: Christophe Varoqui, Benjamin Marzinski; +Cc: dm-devel, Martin Wilck

After Ben Marzinski's feedback to v1 of this series, reworked the
watchdog logic completely, and moved it in to multipathd.
The watchdog interval is now independent of the checker interval.
systemd's watchdog enablement logic is correctly followed, and
better fallbacks are used in case of unreasonable settings in the
unit file.

Martin Wilck (1):
  multipathd: move systemd watchdog handling into daemon

 libmultipath/config.c | 25 ----------------
 libmultipath/config.h |  1 -
 multipathd/main.c     | 67 ++++++++++++++++++++++++++++++++++---------
 3 files changed, 53 insertions(+), 40 deletions(-)

-- 
2.47.0


^ permalink raw reply	[flat|nested] 3+ messages in thread

* [PATCH v2 1/1] multipathd: move systemd watchdog handling into daemon
  2024-11-18 13:04 [PATCH v2 0/1] multipath-tools: fixes for systemd watchdog Martin Wilck
@ 2024-11-18 13:04 ` Martin Wilck
  2024-11-18 20:56   ` Benjamin Marzinski
  0 siblings, 1 reply; 3+ messages in thread
From: Martin Wilck @ 2024-11-18 13:04 UTC (permalink / raw)
  To: Christophe Varoqui, Benjamin Marzinski; +Cc: dm-devel, Martin Wilck

Only multipathd needs to take care of notifying systemd. There's
no need to track this information in struct config, or to limit our
checker interval to it, as checkerloop() wakes up every second anyway.

While at it, fix the watchdog enablement logic:

- the watchdog should only be active if WATCHDOG_PID is either unset,
  or matches the daemon's PID, and if WATCHDOG_USEC is not 0.
- the watchdog should trigger twice per systemd-set interval.
- if WatchdogSec= is set to an unreasonable value, make a smarter
  choice than just disabling the watchdog, and print a more meaningful
  error message.

Signed-off-by: Martin Wilck <mwilck@suse.com>
---
 libmultipath/config.c | 25 ----------------
 libmultipath/config.h |  1 -
 multipathd/main.c     | 67 ++++++++++++++++++++++++++++++++++---------
 3 files changed, 53 insertions(+), 40 deletions(-)

diff --git a/libmultipath/config.c b/libmultipath/config.c
index 0e3a5cc..8b424d1 100644
--- a/libmultipath/config.c
+++ b/libmultipath/config.c
@@ -858,27 +858,6 @@ process_config_dir(struct config *conf, char *dir)
 	pthread_cleanup_pop(1);
 }
 
-#ifdef USE_SYSTEMD
-static void set_max_checkint_from_watchdog(struct config *conf)
-{
-	char *envp = getenv("WATCHDOG_USEC");
-	unsigned long checkint;
-
-	if (envp && sscanf(envp, "%lu", &checkint) == 1) {
-		/* Value is in microseconds */
-		checkint /= 1000000;
-		if (checkint < 1 || checkint > UINT_MAX) {
-			condlog(1, "invalid value for WatchdogSec: \"%s\"", envp);
-			return;
-		}
-		if (conf->max_checkint == 0 || conf->max_checkint > checkint)
-			conf->max_checkint = checkint;
-		condlog(3, "enabling watchdog, interval %ld", checkint);
-		conf->use_watchdog = true;
-	}
-}
-#endif
-
 static int init_config__ (const char *file, struct config *conf);
 
 int init_config(const char *file)
@@ -916,7 +895,6 @@ int init_config__ (const char *file, struct config *conf)
 	conf->attribute_flags = 0;
 	conf->reassign_maps = DEFAULT_REASSIGN_MAPS;
 	conf->checkint = CHECKINT_UNDEF;
-	conf->use_watchdog = false;
 	conf->max_checkint = 0;
 	conf->force_sync = DEFAULT_FORCE_SYNC;
 	conf->partition_delim = (default_partition_delim != NULL ?
@@ -967,9 +945,6 @@ int init_config__ (const char *file, struct config *conf)
 	/*
 	 * fill the voids left in the config file
 	 */
-#ifdef USE_SYSTEMD
-	set_max_checkint_from_watchdog(conf);
-#endif
 	if (conf->max_checkint == 0) {
 		if (conf->checkint == CHECKINT_UNDEF)
 			conf->checkint = DEFAULT_CHECKINT;
diff --git a/libmultipath/config.h b/libmultipath/config.h
index 94cdf25..5b4ebf8 100644
--- a/libmultipath/config.h
+++ b/libmultipath/config.h
@@ -148,7 +148,6 @@ struct config {
 	unsigned int checkint;
 	unsigned int max_checkint;
 	unsigned int adjust_int;
-	bool use_watchdog;
 	int pgfailback;
 	int rr_weight;
 	int no_path_retry;
diff --git a/multipathd/main.c b/multipathd/main.c
index a99da81..1f0d02c 100644
--- a/multipathd/main.c
+++ b/multipathd/main.c
@@ -2153,6 +2153,58 @@ partial_retrigger_tick(vector pathvec)
 	}
 }
 
+#ifdef USE_SYSTEMD
+static int get_watchdog_interval(void)
+{
+	char *envp;
+	long checkint;
+	long pid;
+
+	envp = getenv("WATCHDOG_PID");
+	/* See sd_watchdog_enabled(3) */
+	if (envp && sscanf(envp, "%lu", &pid) == 1 && pid != daemon_pid)
+		return -1;
+
+	envp = getenv("WATCHDOG_USEC");
+	if (!envp || sscanf(envp, "%lu", &checkint) != 1 || checkint == 0)
+		return -1;
+
+	/*
+	 * Value is in microseconds, and the watchdog should be triggered
+	 * twice per interval.
+	 */
+	checkint /= 2000000;
+	if (checkint > INT_MAX / 2) {
+		condlog(1, "WatchdogSec=%ld is too high, assuming %d",
+			checkint * 2, INT_MAX);
+			checkint = INT_MAX / 2;
+	} else if (checkint < 1) {
+		condlog(1, "WatchdogSec=1 is too low, daemon will be killed by systemd!");
+		checkint = 1;
+	}
+
+	condlog(3, "enabling watchdog, interval %lds", checkint);
+	return checkint;
+}
+
+static void watchdog_tick(void) {
+	static int watchdog_interval, watchdog_tick;
+
+	if (watchdog_interval == 0)
+		watchdog_interval = get_watchdog_interval();
+	if (watchdog_interval < 0)
+		return;
+
+	if (--watchdog_tick < 0) {
+		condlog(4, "%s: sending watchdog message", __func__);
+		sd_notify(0, "WATCHDOG=1");
+		watchdog_tick = watchdog_interval;
+	}
+}
+#else
+static void watchdog_tick(void) {}
+#endif
+
 static bool update_prio(struct multipath *mpp, bool refresh_all)
 {
 	int oldpriority;
@@ -2931,9 +2983,6 @@ checkerloop (void *ap)
 	struct timespec last_time;
 	struct config *conf;
 	int foreign_tick = 0;
-#ifdef USE_SYSTEMD
-	bool use_watchdog;
-#endif
 
 	pthread_cleanup_push(rcu_unregister, NULL);
 	rcu_register_thread();
@@ -2944,13 +2993,6 @@ checkerloop (void *ap)
 	get_monotonic_time(&last_time);
 	last_time.tv_sec -= 1;
 
-	/* use_watchdog is set from process environment and never changes */
-	conf = get_multipath_config();
-#ifdef USE_SYSTEMD
-	use_watchdog = conf->use_watchdog;
-#endif
-	put_multipath_config(conf);
-
 	while (1) {
 		struct timespec diff_time, start_time, end_time;
 		int num_paths = 0, strict_timing;
@@ -2967,10 +3009,7 @@ checkerloop (void *ap)
 			(long)diff_time.tv_sec, diff_time.tv_nsec / 1000);
 		last_time = start_time;
 		ticks = diff_time.tv_sec;
-#ifdef USE_SYSTEMD
-		if (use_watchdog)
-			sd_notify(0, "WATCHDOG=1");
-#endif
+		watchdog_tick();
 		while (checker_state != CHECKER_FINISHED) {
 			struct multipath *mpp;
 			int i;
-- 
2.47.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH v2 1/1] multipathd: move systemd watchdog handling into daemon
  2024-11-18 13:04 ` [PATCH v2 1/1] multipathd: move systemd watchdog handling into daemon Martin Wilck
@ 2024-11-18 20:56   ` Benjamin Marzinski
  0 siblings, 0 replies; 3+ messages in thread
From: Benjamin Marzinski @ 2024-11-18 20:56 UTC (permalink / raw)
  To: Martin Wilck; +Cc: Christophe Varoqui, dm-devel, Martin Wilck

On Mon, Nov 18, 2024 at 02:04:47PM +0100, Martin Wilck wrote:
> Only multipathd needs to take care of notifying systemd. There's
> no need to track this information in struct config, or to limit our
> checker interval to it, as checkerloop() wakes up every second anyway.
> 
> While at it, fix the watchdog enablement logic:
> 
> - the watchdog should only be active if WATCHDOG_PID is either unset,
>   or matches the daemon's PID, and if WATCHDOG_USEC is not 0.
> - the watchdog should trigger twice per systemd-set interval.
> - if WatchdogSec= is set to an unreasonable value, make a smarter
>   choice than just disabling the watchdog, and print a more meaningful
>   error message.
> 
> Signed-off-by: Martin Wilck <mwilck@suse.com>
> ---
>  libmultipath/config.c | 25 ----------------
>  libmultipath/config.h |  1 -
>  multipathd/main.c     | 67 ++++++++++++++++++++++++++++++++++---------
>  3 files changed, 53 insertions(+), 40 deletions(-)
> 
> diff --git a/libmultipath/config.c b/libmultipath/config.c
> index 0e3a5cc..8b424d1 100644
> --- a/libmultipath/config.c
> +++ b/libmultipath/config.c
> @@ -858,27 +858,6 @@ process_config_dir(struct config *conf, char *dir)
>  	pthread_cleanup_pop(1);
>  }
>  
> -#ifdef USE_SYSTEMD
> -static void set_max_checkint_from_watchdog(struct config *conf)
> -{
> -	char *envp = getenv("WATCHDOG_USEC");
> -	unsigned long checkint;
> -
> -	if (envp && sscanf(envp, "%lu", &checkint) == 1) {
> -		/* Value is in microseconds */
> -		checkint /= 1000000;
> -		if (checkint < 1 || checkint > UINT_MAX) {
> -			condlog(1, "invalid value for WatchdogSec: \"%s\"", envp);
> -			return;
> -		}
> -		if (conf->max_checkint == 0 || conf->max_checkint > checkint)
> -			conf->max_checkint = checkint;
> -		condlog(3, "enabling watchdog, interval %ld", checkint);
> -		conf->use_watchdog = true;
> -	}
> -}
> -#endif
> -
>  static int init_config__ (const char *file, struct config *conf);
>  
>  int init_config(const char *file)
> @@ -916,7 +895,6 @@ int init_config__ (const char *file, struct config *conf)
>  	conf->attribute_flags = 0;
>  	conf->reassign_maps = DEFAULT_REASSIGN_MAPS;
>  	conf->checkint = CHECKINT_UNDEF;
> -	conf->use_watchdog = false;
>  	conf->max_checkint = 0;
>  	conf->force_sync = DEFAULT_FORCE_SYNC;
>  	conf->partition_delim = (default_partition_delim != NULL ?
> @@ -967,9 +945,6 @@ int init_config__ (const char *file, struct config *conf)
>  	/*
>  	 * fill the voids left in the config file
>  	 */
> -#ifdef USE_SYSTEMD
> -	set_max_checkint_from_watchdog(conf);
> -#endif
>  	if (conf->max_checkint == 0) {
>  		if (conf->checkint == CHECKINT_UNDEF)
>  			conf->checkint = DEFAULT_CHECKINT;
> diff --git a/libmultipath/config.h b/libmultipath/config.h
> index 94cdf25..5b4ebf8 100644
> --- a/libmultipath/config.h
> +++ b/libmultipath/config.h
> @@ -148,7 +148,6 @@ struct config {
>  	unsigned int checkint;
>  	unsigned int max_checkint;
>  	unsigned int adjust_int;
> -	bool use_watchdog;
>  	int pgfailback;
>  	int rr_weight;
>  	int no_path_retry;
> diff --git a/multipathd/main.c b/multipathd/main.c
> index a99da81..1f0d02c 100644
> --- a/multipathd/main.c
> +++ b/multipathd/main.c
> @@ -2153,6 +2153,58 @@ partial_retrigger_tick(vector pathvec)
>  	}
>  }
>  
> +#ifdef USE_SYSTEMD
> +static int get_watchdog_interval(void)
> +{
> +	char *envp;
> +	long checkint;
> +	long pid;
> +
> +	envp = getenv("WATCHDOG_PID");
> +	/* See sd_watchdog_enabled(3) */
> +	if (envp && sscanf(envp, "%lu", &pid) == 1 && pid != daemon_pid)
> +		return -1;
> +
> +	envp = getenv("WATCHDOG_USEC");
> +	if (!envp || sscanf(envp, "%lu", &checkint) != 1 || checkint == 0)
> +		return -1;
> +
> +	/*
> +	 * Value is in microseconds, and the watchdog should be triggered
> +	 * twice per interval.
> +	 */
> +	checkint /= 2000000;
> +	if (checkint > INT_MAX / 2) {

If we're worried that WATCHDOG_USEC might be larger than INT_MAX, we
should probably make checkint a "unsigned long long", because I believe
it's possible to run multipath on a 32-bit system. We can still return
an int, of course.

> +		condlog(1, "WatchdogSec=%ld is too high, assuming %d",
> +			checkint * 2, INT_MAX);
> +			checkint = INT_MAX / 2;
> +	} else if (checkint < 1) {
> +		condlog(1, "WatchdogSec=1 is too low, daemon will be killed by systemd!");
> +		checkint = 1;
> +	}
> +
> +	condlog(3, "enabling watchdog, interval %lds", checkint);
> +	return checkint;
> +}
> +
> +static void watchdog_tick(void) {

Instead of assuming one tick here, we should either pass in ticks or
pass in the current time (start_tile) from checkerloop(). Otherwise, if
the checkers take too long and more than two seconds elapse on average
between loops, we might not send our notification soon enough for the
watchdog timer.

-Ben

> +	static int watchdog_interval, watchdog_tick;
> +
> +	if (watchdog_interval == 0)
> +		watchdog_interval = get_watchdog_interval();
> +	if (watchdog_interval < 0)
> +		return;
> +
> +	if (--watchdog_tick < 0) {
> +		condlog(4, "%s: sending watchdog message", __func__);
> +		sd_notify(0, "WATCHDOG=1");
> +		watchdog_tick = watchdog_interval;
> +	}
> +}
> +#else
> +static void watchdog_tick(void) {}
> +#endif
> +
>  static bool update_prio(struct multipath *mpp, bool refresh_all)
>  {
>  	int oldpriority;
> @@ -2931,9 +2983,6 @@ checkerloop (void *ap)
>  	struct timespec last_time;
>  	struct config *conf;
>  	int foreign_tick = 0;
> -#ifdef USE_SYSTEMD
> -	bool use_watchdog;
> -#endif
>  
>  	pthread_cleanup_push(rcu_unregister, NULL);
>  	rcu_register_thread();
> @@ -2944,13 +2993,6 @@ checkerloop (void *ap)
>  	get_monotonic_time(&last_time);
>  	last_time.tv_sec -= 1;
>  
> -	/* use_watchdog is set from process environment and never changes */
> -	conf = get_multipath_config();
> -#ifdef USE_SYSTEMD
> -	use_watchdog = conf->use_watchdog;
> -#endif
> -	put_multipath_config(conf);
> -
>  	while (1) {
>  		struct timespec diff_time, start_time, end_time;
>  		int num_paths = 0, strict_timing;
> @@ -2967,10 +3009,7 @@ checkerloop (void *ap)
>  			(long)diff_time.tv_sec, diff_time.tv_nsec / 1000);
>  		last_time = start_time;
>  		ticks = diff_time.tv_sec;
> -#ifdef USE_SYSTEMD
> -		if (use_watchdog)
> -			sd_notify(0, "WATCHDOG=1");
> -#endif
> +		watchdog_tick();
>  		while (checker_state != CHECKER_FINISHED) {
>  			struct multipath *mpp;
>  			int i;
> -- 
> 2.47.0


^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2024-11-18 20:56 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-11-18 13:04 [PATCH v2 0/1] multipath-tools: fixes for systemd watchdog Martin Wilck
2024-11-18 13:04 ` [PATCH v2 1/1] multipathd: move systemd watchdog handling into daemon Martin Wilck
2024-11-18 20:56   ` Benjamin Marzinski

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.