All of lore.kernel.org
 help / color / mirror / Atom feed
From: Martin Wilck <martin.wilck@suse.com>
To: Christophe Varoqui <christophe.varoqui@opensvc.com>,
	Benjamin Marzinski <bmarzins@redhat.com>,
	Brian Bunker <brian@purestorage.com>,
	dm-devel@lists.linux.dev
Cc: Martin Wilck <mwilck@suse.com>
Subject: [PATCH 4/4] libmultipath: TUR checker: use runner threads
Date: Thu, 19 Mar 2026 23:13:44 +0100	[thread overview]
Message-ID: <20260319221344.753790-5-mwilck@suse.com> (raw)
In-Reply-To: <20260319221344.753790-1-mwilck@suse.com>

Use the generic runners to simplify the TUR checker code.

The logic is the same as before, with one exception: the runner API does
not allow tracking a thread after it has been cancelled, as our current
code does. The current code used this to determine when a previously
cancelled thread eventually did complete. The use case for doing this in
the current code was the MAX_NR_TIMEOUTS logic: the code waited "forever"
for the "last" checker to complete. This is not possible with runners;
once a runner is cancelled, it is off limits. (Waiting for a zombie thread to
write to memory belonging to the main program, as the current code does,
doesn't seem to be the best idea, anyway).

Therefore this patch implements a different logic for MAX_NR_TIMEOUTS. When
nr_timeouts reaches the limit, the last checker is spawned with an infinite
timeout. Like before, no new checker thread will be spawned until this last
checker eventually completes. One difference from the previous code is that
this last checker thread is never cancelled (after all, it never times
out). But this code is meant for the case where cancellation is not
effective, anyway. Another difference is that the new code will report
PATH_TIMEOUT state as soon as this last thread is started rather than
PATH_PENDING. That seems reasonable given that the checker previously timed
out for this thread.

Signed-off-by: Martin Wilck <mwilck@suse.com>
---
 libmultipath/checkers/tur.c | 341 +++++++++++-------------------------
 1 file changed, 103 insertions(+), 238 deletions(-)

diff --git a/libmultipath/checkers/tur.c b/libmultipath/checkers/tur.c
index ba4ca68..6f22c45 100644
--- a/libmultipath/checkers/tur.c
+++ b/libmultipath/checkers/tur.c
@@ -3,7 +3,6 @@
  *
  * Copyright (c) 2004 Christophe Varoqui
  */
-#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/types.h>
@@ -17,13 +16,13 @@
 #include <pthread.h>
 #include <urcu.h>
 #include <urcu/uatomic.h>
+#include <assert.h>
 
 #include "checkers.h"
 
 #include "debug.h"
 #include "sg_include.h"
-#include "util.h"
-#include "time-util.h"
+#include "runner.h"
 
 #define TUR_CMD_LEN 6
 #define HEAVY_CHECK_COUNT       10
@@ -45,21 +44,20 @@ const char *libcheck_msgtable[] = {
 	NULL,
 };
 
-struct tur_checker_context {
-	dev_t devt;
-	int state;
-	int running; /* uatomic access only */
+struct tur_context {
 	int fd;
+	dev_t devt;
 	unsigned int timeout;
-	time_t time;
-	pthread_t thread;
-	pthread_mutex_t lock;
-	pthread_cond_t active;
-	int holders; /* uatomic access only */
-	int msgid;
+	int state;
+	short msgid;
+};
+
+struct tur_checker_context {
 	struct checker_context ctx;
+	int last_runner_state;
 	unsigned int nr_timeouts;
-	bool checked_state;
+	struct runner_context *rtx;
+	struct tur_context tcx;
 };
 
 int libcheck_init (struct checker * c)
@@ -67,48 +65,26 @@ int libcheck_init (struct checker * c)
 	struct tur_checker_context *ct;
 	struct stat sb;
 
-	ct = malloc(sizeof(struct tur_checker_context));
-	if (!ct)
-		return 1;
-	memset(ct, 0, sizeof(struct tur_checker_context));
-
-	ct->state = PATH_UNCHECKED;
-	ct->fd = -1;
-	uatomic_set(&ct->holders, 1);
-	pthread_cond_init_mono(&ct->active);
-	pthread_mutex_init(&ct->lock, NULL);
+	ct = calloc(1, sizeof(*ct));
+	ct->tcx.state = PATH_UNCHECKED;
+	ct->tcx.fd = -1;
 	if (fstat(c->fd, &sb) == 0)
-		ct->devt = sb.st_rdev;
+		ct->tcx.devt = sb.st_rdev;
 	ct->ctx.cls = c->cls;
 	c->context = ct;
-
 	return 0;
 }
 
-static void cleanup_context(struct tur_checker_context *ct)
-{
-	pthread_mutex_destroy(&ct->lock);
-	pthread_cond_destroy(&ct->active);
-	free(ct);
-}
-
 void libcheck_free (struct checker * c)
 {
-	if (c->context) {
-		struct tur_checker_context *ct = c->context;
-		int holders;
-		int running;
+	struct tur_checker_context *tcc = c->context;
 
-		running = uatomic_xchg(&ct->running, 0);
-		if (running)
-			pthread_cancel(ct->thread);
-		ct->thread = 0;
-		holders = uatomic_sub_return(&ct->holders, 1);
-		if (!holders)
-			cleanup_context(ct);
-		c->context = NULL;
-	}
-	return;
+	if (!tcc)
+		return;
+	c->context = NULL;
+	if (tcc->rtx)
+		cancel_runner(tcc->rtx);
+	free(tcc);
 }
 
 static int
@@ -216,19 +192,6 @@ retry:
 	return PATH_UP;
 }
 
-#define tur_thread_cleanup_push(ct) pthread_cleanup_push(cleanup_func, ct)
-#define tur_thread_cleanup_pop(ct) pthread_cleanup_pop(1)
-
-static void cleanup_func(void *data)
-{
-	int holders;
-	struct tur_checker_context *ct = data;
-
-	holders = uatomic_sub_return(&ct->holders, 1);
-	if (!holders)
-		cleanup_context(ct);
-}
-
 /*
  * Test code for "zombie tur thread" handling.
  * Compile e.g. with CFLAGS=-DTUR_TEST_MAJOR=8
@@ -273,110 +236,82 @@ static void tur_deep_sleep(const struct tur_checker_context *ct)
 #define tur_deep_sleep(x) do {} while (0)
 #endif /* TUR_TEST_MAJOR */
 
-void *libcheck_thread(struct checker_context *ctx)
+void runner_callback(void *arg)
 {
-	struct tur_checker_context *ct =
-		container_of(ctx, struct tur_checker_context, ctx);
-	int state, running;
-	short msgid;
+	struct tur_context *tcx = arg;
+	int state;
 
-	/* This thread can be canceled, so setup clean up */
-	tur_thread_cleanup_push(ct);
+	condlog(4, "%d:%d : tur checker starting up", major(tcx->devt),
+		minor(tcx->devt));
 
-	condlog(4, "%d:%d : tur checker starting up", major(ct->devt),
-		minor(ct->devt));
-
-	tur_deep_sleep(ct);
-	state = tur_check(ct->fd, ct->timeout, &msgid);
+	tur_deep_sleep(tcx);
+	state = tur_check(tcx->fd, tcx->timeout, &tcx->msgid);
+	tcx->state = state;
 	pthread_testcancel();
-
-	/* TUR checker done */
-	pthread_mutex_lock(&ct->lock);
-	ct->state = state;
-	ct->msgid = msgid;
-	pthread_cond_signal(&ct->active);
-	pthread_mutex_unlock(&ct->lock);
-
-	condlog(4, "%d:%d : tur checker finished, state %s", major(ct->devt),
-		minor(ct->devt), checker_state_name(state));
-
-	running = uatomic_xchg(&ct->running, 0);
-	if (!running)
-		pause();
-
-	tur_thread_cleanup_pop(ct);
-
-	return ((void *)0);
+	condlog(4, "%d:%d : tur checker finished, state %s", major(tcx->devt),
+		minor(tcx->devt), checker_state_name(state));
 }
 
-static void tur_set_async_timeout(struct checker *c)
+static int check_runner_state(struct tur_checker_context *tcc)
 {
-	struct tur_checker_context *ct = c->context;
-	struct timespec now;
+	struct runner_context *rtx = tcc->rtx;
+	int rc;
 
-	get_monotonic_time(&now);
-	ct->time = now.tv_sec + c->timeout;
-}
-
-static int tur_check_async_timeout(struct checker *c)
-{
-	struct tur_checker_context *ct = c->context;
-	struct timespec now;
-
-	get_monotonic_time(&now);
-	return (now.tv_sec > ct->time);
-}
-
-int check_pending(struct checker *c)
-{
-	struct tur_checker_context *ct = c->context;
-	int tur_status = PATH_PENDING;
-
-	pthread_mutex_lock(&ct->lock);
-
-	if (ct->state != PATH_PENDING || ct->msgid != MSG_TUR_RUNNING)
-	{
-		tur_status = ct->state;
-		c->msgid = ct->msgid;
-	}
-	pthread_mutex_unlock(&ct->lock);
-	if (tur_status == PATH_PENDING && c->msgid == MSG_TUR_RUNNING) {
+	rc = check_runner(rtx, &tcc->tcx, sizeof(tcc->tcx));
+	switch (rc) {
+	case RUNNER_DONE:
+		tcc->last_runner_state = rc;
+		tcc->rtx = NULL;
+		tcc->nr_timeouts = 0;
+		condlog(3, "%d:%d : tur checker finished, state %s",
+			major(tcc->tcx.devt), minor(tcc->tcx.devt),
+			checker_state_name(tcc->tcx.state));
+		break;
+	case RUNNER_CANCELLED:
+		tcc->last_runner_state = rc;
+		tcc->rtx = NULL;
+		tcc->tcx.state = PATH_TIMEOUT;
+		tcc->tcx.msgid = MSG_TUR_TIMEOUT;
+		if (tcc->nr_timeouts < MAX_NR_TIMEOUTS)
+			tcc->nr_timeouts++;
+		condlog(3, "%d:%d : tur checker timed out",
+			major(tcc->tcx.devt), minor(tcc->tcx.devt));
+		break;
+	case RUNNER_RUNNING:
 		condlog(4, "%d:%d : tur checker still running",
-			major(ct->devt), minor(ct->devt));
-	} else {
-		int running = uatomic_xchg(&ct->running, 0);
-		if (running)
-			pthread_cancel(ct->thread);
-		ct->thread = 0;
+			major(tcc->tcx.devt), minor(tcc->tcx.devt));
+		tcc->tcx.msgid = MSG_TUR_RUNNING;
+		break;
+	default:
+		assert(false);
+		break;
 	}
-
-	ct->checked_state = true;
-	return tur_status;
+	return rc;
 }
 
 bool libcheck_need_wait(struct checker *c)
 {
 	struct tur_checker_context *ct = c->context;
-	return (ct && ct->thread && uatomic_read(&ct->running) != 0 &&
-		!ct->checked_state);
+
+	return ct && ct->rtx;
 }
 
 int libcheck_pending(struct checker *c)
 {
 	struct tur_checker_context *ct = c->context;
-
 	/* The if path checker isn't running, just return the exiting value. */
-	if (!ct || !ct->thread)
+	if (!ct || !ct->rtx)
 		return c->path_state;
 
-	return check_pending(c);
+	/* This may nullify ct->rtx */
+	check_runner_state(ct);
+	c->msgid = ct->tcx.msgid;
+	return ct->tcx.state;
 }
 
 int libcheck_check(struct checker * c)
 {
 	struct tur_checker_context *ct = c->context;
-	pthread_attr_t attr;
-	int tur_status, r;
 
 	if (!ct)
 		return PATH_UNCHECKED;
@@ -384,109 +319,39 @@ int libcheck_check(struct checker * c)
 	if (checker_is_sync(c))
 		return tur_check(c->fd, c->timeout, &c->msgid);
 
-	/*
-	 * Async mode
-	 */
-	if (ct->thread) {
-		ct->checked_state = true;
-		if (tur_check_async_timeout(c)) {
-			int running = uatomic_xchg(&ct->running, 0);
-			if (running) {
-				pthread_cancel(ct->thread);
-				condlog(3, "%d:%d : tur checker timeout",
-					major(ct->devt), minor(ct->devt));
-				c->msgid = MSG_TUR_TIMEOUT;
-				tur_status = PATH_TIMEOUT;
-			} else {
-				pthread_mutex_lock(&ct->lock);
-				tur_status = ct->state;
-				c->msgid = ct->msgid;
-				pthread_mutex_unlock(&ct->lock);
-			}
-			ct->thread = 0;
-		} else if (uatomic_read(&ct->running) != 0) {
+	/* Handle the case that the checker just completed */
+	if (ct->rtx) {
+		if (check_runner_state(ct) == RUNNER_RUNNING)
 			condlog(3, "%d:%d : tur checker not finished",
-				major(ct->devt), minor(ct->devt));
-			tur_status = PATH_PENDING;
-			c->msgid = MSG_TUR_RUNNING;
-		} else {
-			/* TUR checker done */
-			ct->thread = 0;
-			pthread_mutex_lock(&ct->lock);
-			tur_status = ct->state;
-			c->msgid = ct->msgid;
-			pthread_mutex_unlock(&ct->lock);
-		}
-	} else {
-		if (uatomic_read(&ct->holders) > 1) {
-			/* The thread has been cancelled but hasn't quit. */
-			if (ct->nr_timeouts == MAX_NR_TIMEOUTS) {
-				condlog(2, "%d:%d : waiting for stalled tur thread to finish",
-					major(ct->devt), minor(ct->devt));
-				ct->nr_timeouts++;
-			}
-			/*
-			 * Don't start new threads until the last once has
-			 * finished.
-			 */
-			if (ct->nr_timeouts > MAX_NR_TIMEOUTS) {
-				c->msgid = MSG_TUR_TIMEOUT;
-				return PATH_TIMEOUT;
-			}
-			ct->nr_timeouts++;
-			/*
-			 * Start a new thread while the old one is stalled.
-			 * We have to prevent it from interfering with the new
-			 * thread. We create a new context and leave the old
-			 * one with the stale thread, hoping it will clean up
-			 * eventually.
-			 */
-			condlog(3, "%d:%d : tur thread not responding",
-				major(ct->devt), minor(ct->devt));
-
-			/*
-			 * libcheck_init will replace c->context.
-			 * It fails only in OOM situations. In this case, return
-			 * PATH_UNCHECKED to avoid prematurely failing the path.
-			 */
-			if (libcheck_init(c) != 0) {
-				c->msgid = MSG_TUR_FAILED;
-				return PATH_UNCHECKED;
-			}
-			((struct tur_checker_context *)c->context)->nr_timeouts = ct->nr_timeouts;
-
-			if (!uatomic_sub_return(&ct->holders, 1)) {
-				/* It did terminate, eventually */
-				cleanup_context(ct);
-				((struct tur_checker_context *)c->context)->nr_timeouts = 0;
-			}
-
-			ct = c->context;
-		} else
-			ct->nr_timeouts = 0;
-		/* Start new TUR checker */
-		pthread_mutex_lock(&ct->lock);
-		tur_status = ct->state = PATH_PENDING;
-		c->msgid = ct->msgid = MSG_TUR_RUNNING;
-		pthread_mutex_unlock(&ct->lock);
-		ct->fd = c->fd;
-		ct->timeout = c->timeout;
-		ct->checked_state = false;
-		uatomic_add(&ct->holders, 1);
-		uatomic_set(&ct->running, 1);
-		tur_set_async_timeout(c);
-		setup_thread_attr(&attr, 32 * 1024, 1);
-		r = start_checker_thread(&ct->thread, &attr, &ct->ctx);
-		pthread_attr_destroy(&attr);
-		if (r) {
-			uatomic_sub(&ct->holders, 1);
-			uatomic_set(&ct->running, 0);
-			ct->thread = 0;
-			condlog(3, "%d:%d : failed to start tur thread, using"
-				" sync mode", major(ct->devt), minor(ct->devt));
-			return tur_check(c->fd, c->timeout, &c->msgid);
-		}
+				major(ct->tcx.devt), minor(ct->tcx.devt));
+		c->msgid = ct->tcx.msgid;
+		return ct->tcx.state;
 	}
 
-	return tur_status;
+	/* create new checker thread */
+	ct->tcx.fd = c->fd;
+	ct->tcx.timeout = c->timeout;
+
+	if (ct->nr_timeouts < MAX_NR_TIMEOUTS) {
+		condlog(3, "%d:%d : starting checker with timeout",
+			major(ct->tcx.devt), minor(ct->tcx.devt));
+		ct->tcx.state = PATH_PENDING;
+		ct->tcx.msgid = MSG_TUR_RUNNING;
+		ct->rtx = get_runner(runner_callback, &ct->tcx,
+				     sizeof(ct->tcx), 1000000 * c->timeout);
+	} else {
+		condlog(3, "%d:%d : starting checker without timeout",
+			major(ct->tcx.devt), minor(ct->tcx.devt));
+		ct->tcx.state = PATH_TIMEOUT;
+		ct->rtx = get_runner(runner_callback, &ct->tcx, sizeof(ct->tcx), 0);
+	}
+
+	if (ct->rtx) {
+		c->msgid = ct->tcx.msgid;
+		return ct->tcx.state;
+	} else {
+		condlog(3, "%d:%d : failed to start tur thread, using sync mode",
+			major(ct->tcx.devt), minor(ct->tcx.devt));
+		return tur_check(c->fd, c->timeout, &c->msgid);
+	}
 }
-- 
2.53.0


  parent reply	other threads:[~2026-03-19 22:14 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-19 22:13 [PATCH 0/4] multipath-tools: generic async threads for TUR checker Martin Wilck
2026-03-19 22:13 ` [PATCH 1/4] multipathd: get_new_state: map PATH_TIMEOUT to PATH_DOWN Martin Wilck
2026-03-24  5:35   ` Benjamin Marzinski
2026-03-19 22:13 ` [PATCH 2/4] libmpathutil: add generic implementation for checker thread runners Martin Wilck
2026-03-24  5:44   ` Benjamin Marzinski
2026-03-19 22:13 ` [PATCH 3/4] multipath-tools tests: add test program for " Martin Wilck
2026-03-24  5:47   ` Benjamin Marzinski
2026-03-19 22:13 ` Martin Wilck [this message]
2026-03-24  6:38   ` [PATCH 4/4] libmultipath: TUR checker: use runner threads Benjamin Marzinski
2026-03-24 12:24     ` Martin Wilck
2026-03-24 14:46       ` Benjamin Marzinski

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260319221344.753790-5-mwilck@suse.com \
    --to=martin.wilck@suse.com \
    --cc=bmarzins@redhat.com \
    --cc=brian@purestorage.com \
    --cc=christophe.varoqui@opensvc.com \
    --cc=dm-devel@lists.linux.dev \
    --cc=mwilck@suse.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.