cluster-devel.redhat.com archive mirror
 help / color / mirror / Atom feed
From: Alexander Aring <aahringo@redhat.com>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] [PATCH RESEND v5.19-rc3 17/20] fs: dlm: remove warn waiter handling
Date: Wed, 22 Jun 2022 14:45:20 -0400	[thread overview]
Message-ID: <20220622184523.1886869-18-aahringo@redhat.com> (raw)
In-Reply-To: <20220622184523.1886869-1-aahringo@redhat.com>

This patch removes a feature in DLM to warn about if we don't get an
answer back if we send a DLM message out and we wait for a reply
message. There are in my opinion two reasons to add such warning, first
is to know there might be a communication error and a message drop due
communication errors. Second the other side dropped the message on stack
because some reason.

However the first reason the midcomms layer will notify us if a message
was received otherwise midcomms debugfs can tell us if there are still
pending messages around.

For the second reason we can check the other nodes debug messages to see
if there was a drop on application layer. However I think the initial
idea of this feature was to check for the first reason.

This patch also removes the configfs entry waitwarn_us which is not
being set by official dlm user space software, we assume fail to setting
this configfs entry will not end in an critical error.

Signed-off-by: Alexander Aring <aahringo@redhat.com>
---
 fs/dlm/config.c       |  7 ----
 fs/dlm/config.h       |  1 -
 fs/dlm/dlm_internal.h |  1 -
 fs/dlm/lock.c         | 80 -------------------------------------------
 fs/dlm/lock.h         |  1 -
 fs/dlm/lockspace.c    |  1 -
 6 files changed, 91 deletions(-)

diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 42eee2783756..081fd201e3a8 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -76,7 +76,6 @@ struct dlm_cluster {
 	unsigned int cl_protocol;
 	unsigned int cl_mark;
 	unsigned int cl_timewarn_cs;
-	unsigned int cl_waitwarn_us;
 	unsigned int cl_new_rsb_count;
 	unsigned int cl_recover_callbacks;
 	char cl_cluster_name[DLM_LOCKSPACE_LEN];
@@ -103,7 +102,6 @@ enum {
 	CLUSTER_ATTR_PROTOCOL,
 	CLUSTER_ATTR_MARK,
 	CLUSTER_ATTR_TIMEWARN_CS,
-	CLUSTER_ATTR_WAITWARN_US,
 	CLUSTER_ATTR_NEW_RSB_COUNT,
 	CLUSTER_ATTR_RECOVER_CALLBACKS,
 	CLUSTER_ATTR_CLUSTER_NAME,
@@ -225,7 +223,6 @@ CLUSTER_ATTR(log_info, NULL);
 CLUSTER_ATTR(protocol, dlm_check_protocol_and_dlm_running);
 CLUSTER_ATTR(mark, NULL);
 CLUSTER_ATTR(timewarn_cs, dlm_check_zero);
-CLUSTER_ATTR(waitwarn_us, NULL);
 CLUSTER_ATTR(new_rsb_count, NULL);
 CLUSTER_ATTR(recover_callbacks, NULL);
 
@@ -241,7 +238,6 @@ static struct configfs_attribute *cluster_attrs[] = {
 	[CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol,
 	[CLUSTER_ATTR_MARK] = &cluster_attr_mark,
 	[CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs,
-	[CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us,
 	[CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count,
 	[CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks,
 	[CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name,
@@ -433,7 +429,6 @@ static struct config_group *make_cluster(struct config_group *g,
 	cl->cl_log_info = dlm_config.ci_log_info;
 	cl->cl_protocol = dlm_config.ci_protocol;
 	cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs;
-	cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us;
 	cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count;
 	cl->cl_recover_callbacks = dlm_config.ci_recover_callbacks;
 	memcpy(cl->cl_cluster_name, dlm_config.ci_cluster_name,
@@ -955,7 +950,6 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
 #define DEFAULT_PROTOCOL           DLM_PROTO_TCP
 #define DEFAULT_MARK               0
 #define DEFAULT_TIMEWARN_CS      500 /* 5 sec = 500 centiseconds */
-#define DEFAULT_WAITWARN_US	   0
 #define DEFAULT_NEW_RSB_COUNT    128
 #define DEFAULT_RECOVER_CALLBACKS  0
 #define DEFAULT_CLUSTER_NAME      ""
@@ -972,7 +966,6 @@ struct dlm_config_info dlm_config = {
 	.ci_protocol = DEFAULT_PROTOCOL,
 	.ci_mark = DEFAULT_MARK,
 	.ci_timewarn_cs = DEFAULT_TIMEWARN_CS,
-	.ci_waitwarn_us = DEFAULT_WAITWARN_US,
 	.ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT,
 	.ci_recover_callbacks = DEFAULT_RECOVER_CALLBACKS,
 	.ci_cluster_name = DEFAULT_CLUSTER_NAME
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index df92b0a07fc6..cb23d018e863 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -38,7 +38,6 @@ struct dlm_config_info {
 	int ci_protocol;
 	int ci_mark;
 	int ci_timewarn_cs;
-	int ci_waitwarn_us;
 	int ci_new_rsb_count;
 	int ci_recover_callbacks;
 	char ci_cluster_name[DLM_LOCKSPACE_LEN];
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index afec22b1a65f..84dad619081e 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -259,7 +259,6 @@ struct dlm_lkb {
 	struct list_head	lkb_ownqueue;	/* list of locks for a process */
 	struct list_head	lkb_time_list;
 	ktime_t			lkb_timestamp;
-	ktime_t			lkb_wait_time;
 	unsigned long		lkb_timeout_cs;
 
 	struct mutex		lkb_cb_mutex;
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index e80d42ba64ae..080cd216a9a4 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1402,75 +1402,6 @@ static int msg_reply_type(int mstype)
 	return -1;
 }
 
-static int nodeid_warned(int nodeid, int num_nodes, int *warned)
-{
-	int i;
-
-	for (i = 0; i < num_nodes; i++) {
-		if (!warned[i]) {
-			warned[i] = nodeid;
-			return 0;
-		}
-		if (warned[i] == nodeid)
-			return 1;
-	}
-	return 0;
-}
-
-void dlm_scan_waiters(struct dlm_ls *ls)
-{
-	struct dlm_lkb *lkb;
-	s64 us;
-	s64 debug_maxus = 0;
-	u32 debug_scanned = 0;
-	u32 debug_expired = 0;
-	int num_nodes = 0;
-	int *warned = NULL;
-
-	if (!dlm_config.ci_waitwarn_us)
-		return;
-
-	mutex_lock(&ls->ls_waiters_mutex);
-
-	list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
-		if (!lkb->lkb_wait_time)
-			continue;
-
-		debug_scanned++;
-
-		us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_wait_time));
-
-		if (us < dlm_config.ci_waitwarn_us)
-			continue;
-
-		lkb->lkb_wait_time = 0;
-
-		debug_expired++;
-		if (us > debug_maxus)
-			debug_maxus = us;
-
-		if (!num_nodes) {
-			num_nodes = ls->ls_num_nodes;
-			warned = kcalloc(num_nodes, sizeof(int), GFP_KERNEL);
-		}
-		if (!warned)
-			continue;
-		if (nodeid_warned(lkb->lkb_wait_nodeid, num_nodes, warned))
-			continue;
-
-		log_error(ls, "waitwarn %x %lld %d us check connection to "
-			  "node %d", lkb->lkb_id, (long long)us,
-			  dlm_config.ci_waitwarn_us, lkb->lkb_wait_nodeid);
-	}
-	mutex_unlock(&ls->ls_waiters_mutex);
-	kfree(warned);
-
-	if (debug_expired)
-		log_debug(ls, "scan_waiters %u warn %u over %d us max %lld us",
-			  debug_scanned, debug_expired,
-			  dlm_config.ci_waitwarn_us, (long long)debug_maxus);
-}
-
 /* add/remove lkb from global waiters list of lkb's waiting for
    a reply from a remote node */
 
@@ -1514,7 +1445,6 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
 
 	lkb->lkb_wait_count++;
 	lkb->lkb_wait_type = mstype;
-	lkb->lkb_wait_time = ktime_get();
 	lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */
 	hold_lkb(lkb);
 	list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
@@ -1962,16 +1892,6 @@ void dlm_adjust_timeouts(struct dlm_ls *ls)
 	list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list)
 		lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us);
 	mutex_unlock(&ls->ls_timeout_mutex);
-
-	if (!dlm_config.ci_waitwarn_us)
-		return;
-
-	mutex_lock(&ls->ls_waiters_mutex);
-	list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
-		if (ktime_to_us(lkb->lkb_wait_time))
-			lkb->lkb_wait_time = ktime_get();
-	}
-	mutex_unlock(&ls->ls_waiters_mutex);
 }
 
 /* lkb is master or local copy */
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 252a5898f908..40781d9a24df 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -24,7 +24,6 @@ int dlm_put_lkb(struct dlm_lkb *lkb);
 void dlm_scan_rsbs(struct dlm_ls *ls);
 int dlm_lock_recovery_try(struct dlm_ls *ls);
 void dlm_unlock_recovery(struct dlm_ls *ls);
-void dlm_scan_waiters(struct dlm_ls *ls);
 void dlm_scan_timeout(struct dlm_ls *ls);
 void dlm_adjust_timeouts(struct dlm_ls *ls);
 int dlm_master_lookup(struct dlm_ls *ls, int nodeid, char *name, int len,
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 0c3613d09c5e..ca1eca0809d4 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -275,7 +275,6 @@ static int dlm_scand(void *data)
 				ls->ls_scan_time = jiffies;
 				dlm_scan_rsbs(ls);
 				dlm_scan_timeout(ls);
-				dlm_scan_waiters(ls);
 				dlm_unlock_recovery(ls);
 			} else {
 				ls->ls_scan_time += HZ;
-- 
2.31.1


  parent reply	other threads:[~2022-06-22 18:45 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-06-22 18:45 [Cluster-devel] [PATCH RESEND v5.19-rc3 00/20] fs: dlm: plock, recovery and API deprecation Alexander Aring
2022-06-22 18:45 ` [Cluster-devel] [PATCH RESEND v5.19-rc3 01/20] fs: dlm: plock use list_first_entry Alexander Aring
2022-06-22 18:45 ` [Cluster-devel] [PATCH RESEND v5.19-rc3 02/20] fs: dlm: remove may interrupted message Alexander Aring
2022-06-22 18:45 ` [Cluster-devel] [PATCH RESEND v5.19-rc3 03/20] fs: dlm: add pid to debug log Alexander Aring
2022-06-22 18:45 ` [Cluster-devel] [PATCH RESEND v5.19-rc3 04/20] fs: dlm: change log output to debug again Alexander Aring
2022-06-22 18:45 ` [Cluster-devel] [PATCH RESEND v5.19-rc3 05/20] fs: dlm: use dlm_plock_info for do_unlock_close Alexander Aring
2022-06-22 18:45 ` [Cluster-devel] [PATCH RESEND v5.19-rc3 06/20] fs: dlm: change posix lock sigint handling Alexander Aring
2022-06-22 18:45 ` [Cluster-devel] [PATCH RESEND v5.19-rc3 07/20] fs: dlm: change ast and bast trace order Alexander Aring
2022-06-22 18:45 ` [Cluster-devel] [PATCH RESEND v5.19-rc3 08/20] fs: dlm: remove additional dereference of lkbsb Alexander Aring
2022-06-22 18:45 ` [Cluster-devel] [PATCH RESEND v5.19-rc3 09/20] fs: dlm: add resource name to tracepoints Alexander Aring
2022-06-22 18:45 ` [Cluster-devel] [PATCH RESEND v5.19-rc3 10/20] fs: dlm: add notes for recovery and membership handling Alexander Aring
2022-06-22 18:45 ` [Cluster-devel] [PATCH RESEND v5.19-rc3 11/20] fs: dlm: call dlm_lsop_recover_prep once Alexander Aring
2022-06-22 18:45 ` [Cluster-devel] [PATCH RESEND v5.19-rc3 12/20] fs: dlm: let new_lockspace() wait until recovery Alexander Aring
2022-06-22 18:45 ` [Cluster-devel] [PATCH RESEND v5.19-rc3 13/20] fs: dlm: handle recovery result outside of ls_recover Alexander Aring
2022-06-22 18:45 ` [Cluster-devel] [PATCH RESEND v5.19-rc3 14/20] fs: dlm: handle recovery -EAGAIN case as retry Alexander Aring
2022-06-22 18:45 ` [Cluster-devel] [PATCH RESEND v5.19-rc3 15/20] fs: dlm: change -EINVAL recovery error to -EAGAIN Alexander Aring
2022-06-22 18:45 ` [Cluster-devel] [PATCH RESEND v5.19-rc3 16/20] fs: dlm: add comment about lkb IFL flags Alexander Aring
2022-06-22 18:45 ` Alexander Aring [this message]
2022-06-22 18:45 ` [Cluster-devel] [PATCH RESEND v5.19-rc3 18/20] fs: dlm: remove timeout from dlm_user_adopt_orphan Alexander Aring
2022-06-22 18:45 ` [Cluster-devel] [PATCH RESEND v5.19-rc3 19/20] fs: dlm: add API deprecation warning Alexander Aring
2022-06-22 18:45 ` [Cluster-devel] [PATCH RESEND v5.19-rc3 20/20] fs: dlm: don't use deprecated API by default Alexander Aring

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220622184523.1886869-18-aahringo@redhat.com \
    --to=aahringo@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).