[Ocfs2-devel] [patch 0/1] OCFS Configurable timeouts

All of lore.kernel.org
 help / color / mirror / Atom feed

* [Ocfs2-devel] [patch 0/1] OCFS Configurable timeouts - Revision 2
@ 2006-11-29  1:04 abeekhof
  2006-11-29  1:04 ` [Ocfs2-devel] [patch 1/1] ocfs2-timeout-protocol.patch abeekhof
  2006-11-30 17:50 ` [Ocfs2-devel] [patch 0/1] OCFS Configurable timeouts - Revision 2 Joel Becker
  0 siblings, 2 replies; 9+ messages in thread
From: abeekhof @ 2006-11-29  1:04 UTC (permalink / raw)
  To: ocfs2-devel

Hopefully this is the final version :-)

--

Added a global spinlock around modifications to o2net_connected_peers (as discussed with Mark).

I have a separate patch that uses to_o2nm_cluster_from_node(), but since I can't reproduce the problem mentioned in Jeff's comment (it apparently needs the userspace heartbeating modifications), I'd prefer to leave it out.

Andrew

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Ocfs2-devel] [patch 1/1] ocfs2-timeout-protocol.patch
  2006-11-29  1:04 [Ocfs2-devel] [patch 0/1] OCFS Configurable timeouts - Revision 2 abeekhof
@ 2006-11-29  1:04 ` abeekhof
  2006-11-29 15:30   ` Zach Brown
  2006-11-29 15:31   ` Mark Fasheh
  2006-11-30 17:50 ` [Ocfs2-devel] [patch 0/1] OCFS Configurable timeouts - Revision 2 Joel Becker
  1 sibling, 2 replies; 9+ messages in thread
From: abeekhof @ 2006-11-29  1:04 UTC (permalink / raw)
  To: ocfs2-devel


From: Andrew Beekhof <abeekhof@suse.de>
Subject: [patch 1/1] OCFS2 Configurable timeouts - Protocol changes

Modify the OCFS2 handshake to ensure essential timeouts are configured
  identically on all nodes.
Only allow changes when there are no connected peers
Improves the logic in o2net_advance_rx() which broke now that
  sizeof(struct o2net_handshake) is greater than sizeof(struct o2net_msg)
Included is the field for userspace-heartbeat timeout to avoid the need for
  further protocol changes.
Uses a global spinlock to ensure the decisions to update configfs entries
  are made on the correct value.  The region covered by the spinlock when
  incrementing the counter is much larger as this is the more critical case.

Signed-off-by: Andrew Beekhof <abeekhof@suse.de>
---
 fs/ocfs2/cluster/nodemanager.c  |   19 +++++++
 fs/ocfs2/cluster/tcp.c          |   96 +++++++++++++++++++++++++++++++++++-----
 fs/ocfs2/cluster/tcp.h          |    1
 fs/ocfs2/cluster/nodemanager.c  |   55 ++++++++++++++------
 fs/ocfs2/cluster/tcp.c          |  105 +++++++++++++++++++++++++++++++++++-----
 fs/ocfs2/cluster/tcp.h          |    2 
 fs/ocfs2/cluster/tcp_internal.h |    6 +-
 4 files changed, 139 insertions(+), 29 deletions(-)




Index: fs/ocfs2/cluster/nodemanager.c
===================================================================
--- fs/ocfs2/cluster/nodemanager.c.orig	2006-11-20 16:25:58.000000000 +0100
+++ fs/ocfs2/cluster/nodemanager.c	2006-11-27 09:57:56.000000000 +0100
@@ -558,15 +558,14 @@ static ssize_t o2nm_cluster_attr_write(c
 	return count;
 }
 
-static ssize_t o2nm_cluster_attr_idle_timeout_ms_read(struct o2nm_cluster *cluster,
-                                                 char *page)
+static ssize_t o2nm_cluster_attr_idle_timeout_ms_read(
+	struct o2nm_cluster *cluster, char *page)
 {
 	return sprintf(page, "%u\n", cluster->cl_idle_timeout_ms);
 }
 
-static ssize_t o2nm_cluster_attr_idle_timeout_ms_write(struct o2nm_cluster *cluster,
-                                                  const char *page,
-						  size_t count)
+static ssize_t o2nm_cluster_attr_idle_timeout_ms_write(
+	struct o2nm_cluster *cluster, const char *page, size_t count)
 {
 	ssize_t ret;
 	unsigned int val;
@@ -574,10 +573,22 @@ static ssize_t o2nm_cluster_attr_idle_ti
 	ret =  o2nm_cluster_attr_write(page, count, &val);
 
 	if (ret > 0) {
+		if (cluster->cl_idle_timeout_ms != val) {
+			spin_lock(&connected_lock);
+			if(o2net_num_connected_peers()) {
+				mlog(ML_NOTICE,
+				     "o2net: cannot change idle timeout after "
+				     "the first peer has agreed to it."
+				     "  %d connected peers\n",
+				     o2net_num_connected_peers());
+				ret = -EINVAL;
+			}
+			spin_unlock(&connected_lock);
+		}
 		if (val <= cluster->cl_keepalive_delay_ms) {
 			mlog(ML_NOTICE, "o2net: idle timeout must be larger "
 			     "than keepalive delay\n");
-			return -EINVAL;
+			ret = -EINVAL;
 		}
 		cluster->cl_idle_timeout_ms = val;
 	}
@@ -585,15 +596,14 @@ static ssize_t o2nm_cluster_attr_idle_ti
 	return ret;
 }
 
-static ssize_t o2nm_cluster_attr_keepalive_delay_ms_read(struct o2nm_cluster *cluster,
-                                                 char *page)
+static ssize_t o2nm_cluster_attr_keepalive_delay_ms_read(
+	struct o2nm_cluster *cluster, char *page)
 {
 	return sprintf(page, "%u\n", cluster->cl_keepalive_delay_ms);
 }
 
-static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write(struct o2nm_cluster *cluster,
-                                                  const char *page,
-						  size_t count)
+static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write(
+	struct o2nm_cluster *cluster, const char *page, size_t count)
 {
 	ssize_t ret;
 	unsigned int val;
@@ -601,10 +611,22 @@ static ssize_t o2nm_cluster_attr_keepali
 	ret =  o2nm_cluster_attr_write(page, count, &val);
 
 	if (ret > 0) {
+		if (cluster->cl_keepalive_delay_ms != val) {
+			spin_lock(&connected_lock);
+			if(o2net_num_connected_peers()) {
+				mlog(ML_NOTICE,
+				     "o2net: cannot change keepalive delay after"
+				     " the first peer has agreed to it."
+				     "  %d connected peers\n",
+				     o2net_num_connected_peers());
+				ret = -EINVAL;
+			}
+			spin_unlock(&connected_lock);
+		}
 		if (val >= cluster->cl_idle_timeout_ms) {
 			mlog(ML_NOTICE, "o2net: keepalive delay must be "
 			     "smaller than idle timeout\n");
-			return -EINVAL;
+			ret = -EINVAL;
 		}
 		cluster->cl_keepalive_delay_ms = val;
 	}
@@ -612,15 +634,14 @@ static ssize_t o2nm_cluster_attr_keepali
 	return ret;
 }
 
-static ssize_t o2nm_cluster_attr_reconnect_delay_ms_read(struct o2nm_cluster *cluster,
-                                                 char *page)
+static ssize_t o2nm_cluster_attr_reconnect_delay_ms_read(
+	struct o2nm_cluster *cluster, char *page)
 {
 	return sprintf(page, "%u\n", cluster->cl_reconnect_delay_ms);
 }
 
-static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write(struct o2nm_cluster *cluster,
-                                                  const char *page,
-						  size_t count)
+static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write(
+	struct o2nm_cluster *cluster, const char *page, size_t count)
 {
 	return o2nm_cluster_attr_write(page, count,
 	                               &cluster->cl_reconnect_delay_ms);
Index: fs/ocfs2/cluster/tcp.c
===================================================================
--- fs/ocfs2/cluster/tcp.c.orig	2006-11-20 16:19:12.000000000 +0100
+++ fs/ocfs2/cluster/tcp.c	2006-11-27 10:41:20.000000000 +0100
@@ -1121,6 +1121,44 @@ static int o2net_check_handshake(struct 
 		return -1;
 	}
 
+	/*
+	 * Ensure timeouts are consistent with other nodes, otherwise
+	 * we can end up with one node thinking that the other must be down,
+	 * but isn't. This can ultimately cause corruption.
+	 */
+	if (be32_to_cpu(hand->o2net_idle_timeout_ms) !=
+				o2net_idle_timeout(sc->sc_node)) {
+		mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of "
+		     "%u ms, but we use %u ms locally.  disconnecting\n",
+		     SC_NODEF_ARGS(sc),
+		     be32_to_cpu(hand->o2net_idle_timeout_ms),
+		     o2net_idle_timeout(sc->sc_node));
+		o2net_ensure_shutdown(nn, sc, -ENOTCONN);
+		return -1;
+	}
+
+	if (be32_to_cpu(hand->o2net_keepalive_delay_ms) !=
+			o2net_keepalive_delay(sc->sc_node)) {
+		mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of "
+		     "%u ms, but we use %u ms locally.  disconnecting\n",
+		     SC_NODEF_ARGS(sc),
+		     be32_to_cpu(hand->o2net_keepalive_delay_ms),
+		     o2net_keepalive_delay(sc->sc_node));
+		o2net_ensure_shutdown(nn, sc, -ENOTCONN);
+		return -1;
+	}
+
+	if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) !=
+			O2HB_MAX_WRITE_TIMEOUT_MS) {
+		mlog(ML_NOTICE, SC_NODEF_FMT " uses a heartbeat timeout of "
+		     "%u ms, but we use %u ms locally.  disconnecting\n",
+		     SC_NODEF_ARGS(sc),
+		     be32_to_cpu(hand->o2net_keepalive_delay_ms),
+		     O2HB_MAX_WRITE_TIMEOUT_MS);
+		o2net_ensure_shutdown(nn, sc, -ENOTCONN);
+		return -1;
+	}
+
 	sc->sc_handshake_ok = 1;
 
 	spin_lock(&nn->nn_lock);
@@ -1153,6 +1191,26 @@ static int o2net_advance_rx(struct o2net
 	sclog(sc, "receiving\n");
 	do_gettimeofday(&sc->sc_tv_advance_start);
 
+	if(unlikely(sc->sc_handshake_ok == 0)) {
+		if(sc->sc_page_off < sizeof(struct o2net_handshake)) {
+			data = page_address(sc->sc_page) + sc->sc_page_off;
+			datalen = sizeof(struct o2net_handshake) - sc->sc_page_off;
+			ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
+			if (ret > 0)
+				sc->sc_page_off += ret;
+		}
+
+		if (sc->sc_page_off == sizeof(struct o2net_handshake)) {
+			o2net_check_handshake(sc);
+			if(sc->sc_handshake_ok == 0) {
+				BUG_ON(sizeof(struct o2net_handshake)
+				       == sizeof(struct o2net_msg));
+				ret = -EPROTO;
+			}
+			goto out;
+		}
+	}
+
 	/* do we need more header? */
 	if (sc->sc_page_off < sizeof(struct o2net_msg)) {
 		data = page_address(sc->sc_page) + sc->sc_page_off;
@@ -1160,15 +1218,6 @@ static int o2net_advance_rx(struct o2net
 		ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
 		if (ret > 0) {
 			sc->sc_page_off += ret;
-
-			/* this working relies on the handshake being
-			 * smaller than the normal message header */
-			if (sc->sc_page_off >= sizeof(struct o2net_handshake)&&
-			    !sc->sc_handshake_ok && o2net_check_handshake(sc)) {
-				ret = -EPROTO;
-				goto out;
-			}
-
 			/* only swab incoming here.. we can
 			 * only get here once as we cross from
 			 * being under to over */
@@ -1178,8 +1227,7 @@ static int o2net_advance_rx(struct o2net
 				    O2NET_MAX_PAYLOAD_BYTES)
 					ret = -EOVERFLOW;
 			}
-		}
-		if (ret <= 0)
+		} else
 			goto out;
 	}
 
@@ -1269,6 +1317,18 @@ static int o2net_set_nodelay(struct sock
 	return ret;
 }
 
+static void o2net_initialize_handshake(void)
+{
+	o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32(
+		O2HB_MAX_WRITE_TIMEOUT_MS);
+	o2net_hand->o2net_idle_timeout_ms = cpu_to_be32(
+		o2net_idle_timeout(NULL));
+	o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32(
+		o2net_keepalive_delay(NULL));
+	o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32(
+		o2net_reconnect_delay(NULL));
+}
+
 /* ------------------------------------------------------------ */
 
 /* called when a connect completes and after a sock is accepted.  the
@@ -1281,6 +1341,7 @@ static void o2net_sc_connect_completed(v
               (unsigned long long)O2NET_PROTOCOL_VERSION,
 	      (unsigned long long)be64_to_cpu(o2net_hand->connector_id));
 
+	o2net_initialize_handshake();
 	o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
 	sc_put(sc);
 }
@@ -1481,11 +1542,23 @@ static void o2net_still_up(void *arg)
 
 /* ------------------------------------------------------------ */
 
+static int o2net_connected_peers = 0;
+spinlock_t connected_lock;
+
+int o2net_num_connected_peers(void)
+{
+	return o2net_connected_peers;
+}
+
 void o2net_disconnect_node(struct o2nm_node *node)
 {
 	struct o2net_node *nn = o2net_nn_from_num(node->nd_num);
 
 	/* don't reconnect until it's heartbeating again */
+	spin_lock(&connected_lock);
+	o2net_connected_peers--;
+	spin_unlock(&connected_lock);
+
 	spin_lock(&nn->nn_lock);
 	o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
 	spin_unlock(&nn->nn_lock);
@@ -1505,13 +1578,17 @@ static void o2net_hb_node_down_cb(struct
 
 	if (node_num != o2nm_this_node())
 		o2net_disconnect_node(node);
+
+	BUG_ON(o2net_connected_peers < 0);
 }
 
+
 static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
 				void *data)
 {
 	struct o2net_node *nn = o2net_nn_from_num(node_num);
 
+	spin_lock(&connected_lock);
 	o2quo_hb_up(node_num);
 
 	/* ensure an immediate connect attempt */
@@ -1519,6 +1596,8 @@ static void o2net_hb_node_up_cb(struct o
 		(msecs_to_jiffies(o2net_reconnect_delay(node)) + 1);
 
 	if (node_num != o2nm_this_node()) {
+		o2net_connected_peers++;
+
 		/* heartbeat doesn't work unless a local node number is
 		 * configured and doing so brings up the o2net_wq, so we can
 		 * use it.. */
@@ -1534,6 +1613,8 @@ static void o2net_hb_node_up_cb(struct o
 			o2net_set_nn_state(nn, NULL, 0, 0);
 		spin_unlock(&nn->nn_lock);
 	}
+
+	spin_unlock(&connected_lock);
 }
 
 void o2net_unregister_hb_callbacks(void)
@@ -1668,6 +1749,7 @@ static int o2net_accept_one(struct socke
 	o2net_register_callbacks(sc->sc_sock->sk, sc);
 	o2net_sc_queue_work(sc, &sc->sc_rx_work);
 
+	o2net_initialize_handshake();
 	o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
 
 out:
@@ -1834,6 +1916,7 @@ int o2net_init(void)
 	unsigned long i;
 
 	o2quo_init();
+	spin_lock_init(&connected_lock);
 
 	o2net_hand = kcalloc(1, sizeof(struct o2net_handshake), GFP_KERNEL);
 	o2net_keep_req = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL);
Index: fs/ocfs2/cluster/tcp.h
===================================================================
--- fs/ocfs2/cluster/tcp.h.orig	2006-11-20 16:19:12.000000000 +0100
+++ fs/ocfs2/cluster/tcp.h	2006-11-27 09:52:12.000000000 +0100
@@ -103,11 +103,13 @@ int o2net_register_handler(u32 msg_type,
 void o2net_unregister_handler_list(struct list_head *list);
 
 struct o2nm_node;
+extern spinlock_t connected_lock;
 int o2net_register_hb_callbacks(void);
 void o2net_unregister_hb_callbacks(void);
 int o2net_start_listening(struct o2nm_node *node);
 void o2net_stop_listening(struct o2nm_node *node);
 void o2net_disconnect_node(struct o2nm_node *node);
+int o2net_num_connected_peers(void);
 
 int o2net_init(void);
 void o2net_exit(void);
Index: fs/ocfs2/cluster/tcp_internal.h
===================================================================
--- fs/ocfs2/cluster/tcp_internal.h.orig	2006-11-20 16:19:12.000000000 +0100
+++ fs/ocfs2/cluster/tcp_internal.h	2006-11-20 16:25:36.000000000 +0100
@@ -48,10 +48,14 @@
  * 	- full 64 bit i_size in the metadata lock lvbs
  * 	- introduction of "rw" lock and pushing meta/data locking down
  */
-#define O2NET_PROTOCOL_VERSION 4ULL
+#define O2NET_PROTOCOL_VERSION 5ULL
 struct o2net_handshake {
 	__be64	protocol_version;
 	__be64	connector_id;
+	__be32  o2hb_heartbeat_timeout_ms;
+	__be32  o2net_idle_timeout_ms;
+	__be32  o2net_keepalive_delay_ms;
+	__be32  o2net_reconnect_delay_ms;
 };
 
 struct o2net_node {

--

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Ocfs2-devel] [patch 1/1] ocfs2-timeout-protocol.patch
  2006-11-29  1:04 ` [Ocfs2-devel] [patch 1/1] ocfs2-timeout-protocol.patch abeekhof
@ 2006-11-29 15:30   ` Zach Brown
  2006-11-29 15:31   ` Mark Fasheh
  1 sibling, 0 replies; 9+ messages in thread
From: Zach Brown @ 2006-11-29 15:30 UTC (permalink / raw)
  To: ocfs2-devel


> Only allow changes when there are no connected peers

I think we can do this differently.

>  	if (ret > 0) {
> +		if (cluster->cl_idle_timeout_ms != val) {
> +			spin_lock(&connected_lock);
> +			if(o2net_num_connected_peers()) {
> +				mlog(ML_NOTICE,
> +				     "o2net: cannot change idle timeout after "
> +				     "the first peer has agreed to it."
> +				     "  %d connected peers\n",
> +				     o2net_num_connected_peers());
> +				ret = -EINVAL;
> +			}
> +			spin_unlock(&connected_lock);
> +		}

Lose the locking here so this just becomes (paraphrasing) :

  if (cluster != val && connected()) {
   ...
  }

> +static int o2net_connected_peers = 0;
> +spinlock_t connected_lock;
> +
> +int o2net_num_connected_peers(void)
> +{
> +	return o2net_connected_peers;
> +}

Make this an "atomic_t o2net_connected_peers = ATOMIC_INIT(0);" and then
"return atomic_read();".  We probably don't really need a heavy-weight
atomic_t, but it's trivial and this isn't a fast path.

>  void o2net_disconnect_node(struct o2nm_node *node)
>  {
>  	struct o2net_node *nn = o2net_nn_from_num(node->nd_num);
>  
>  	/* don't reconnect until it's heartbeating again */
> +	spin_lock(&connected_lock);
> +	o2net_connected_peers--;
> +	spin_unlock(&connected_lock);
> +

Then don't do this work here and in other places, do it once in
o2net_set_nn_state().  something like:

if (old_sc != sc) {
	if (old_sc)
		atomic_dec(&o2net_connected_peers);
	else
		atomic_inc(&o2net_connected_peers);
}

That's less confusing and catches the place where sockets are in use by
a node.

- z

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Ocfs2-devel] [patch 1/1] ocfs2-timeout-protocol.patch
  2006-11-29  1:04 ` [Ocfs2-devel] [patch 1/1] ocfs2-timeout-protocol.patch abeekhof
  2006-11-29 15:30   ` Zach Brown
@ 2006-11-29 15:31   ` Mark Fasheh
  2006-11-30  4:25     ` Andrew Beekhof
  1 sibling, 1 reply; 9+ messages in thread
From: Mark Fasheh @ 2006-11-29 15:31 UTC (permalink / raw)
  To: ocfs2-devel

Hi Andrew,
	Things are looking much better, but there are still a few issues that
I found while reviewing the patch. I got Zach to look at it too (he's the
original author of the ocfs2 network code), which has generated some good
comments.

On Wed, Nov 29, 2006 at 09:51:31AM +0100, abeekhof@suse.de wrote:
> 
> From: Andrew Beekhof <abeekhof@suse.de>
> Subject: [patch 1/1] OCFS2 Configurable timeouts - Protocol changes
> 
> Modify the OCFS2 handshake to ensure essential timeouts are configured
>   identically on all nodes.
> Only allow changes when there are no connected peers
> Improves the logic in o2net_advance_rx() which broke now that
>   sizeof(struct o2net_handshake) is greater than sizeof(struct o2net_msg)
> Included is the field for userspace-heartbeat timeout to avoid the need for
>   further protocol changes.
> Uses a global spinlock to ensure the decisions to update configfs entries
>   are made on the correct value.  The region covered by the spinlock when
>   incrimenting the counter is much larger as this is the more critical case.
Nitpick: Can you format that commit log to be a bit more in line with
standard kernel commits (the indenting is weird)


> Index: fs/ocfs2/cluster/nodemanager.c
> ===================================================================
> --- fs/ocfs2/cluster/nodemanager.c.orig	2006-11-20 16:25:58.000000000 +0100
> +++ fs/ocfs2/cluster/nodemanager.c	2006-11-27 09:57:56.000000000 +0100
> @@ -558,15 +558,14 @@ static ssize_t o2nm_cluster_attr_write(c
>  	return count;
>  }
>  
> -static ssize_t o2nm_cluster_attr_idle_timeout_ms_read(struct o2nm_cluster *cluster,
> -                                                 char *page)
> +static ssize_t o2nm_cluster_attr_idle_timeout_ms_read(
> +	struct o2nm_cluster *cluster, char *page)
>  {
>  	return sprintf(page, "%u\n", cluster->cl_idle_timeout_ms);
>  }
Can you not re-write the function prototypes unless they're actually
changing please? It clutters up the patch and makes it harder to find the
actual code to check (see below).


> @@ -574,10 +573,22 @@ static ssize_t o2nm_cluster_attr_idle_ti
>  	ret =  o2nm_cluster_attr_write(page, count, &val);
>  
>  	if (ret > 0) {
> +		if (cluster->cl_idle_timeout_ms != val) {
> +			spin_lock(&connected_lock);
> +			if(o2net_num_connected_peers()) {
> +				mlog(ML_NOTICE,
> +				     "o2net: cannot change idle timeout after "
> +				     "the first peer has agreed to it."
> +				     "  %d connected peers\n",
> +				     o2net_num_connected_peers());
> +				ret = -EINVAL;
> +			}
> +			spin_unlock(&connected_lock);
> +		}
>  		if (val <= cluster->cl_keepalive_delay_ms) {
>  			mlog(ML_NOTICE, "o2net: idle timeout must be larger "
>  			     "than keepalive delay\n");
> -			return -EINVAL;
> +			ret = -EINVAL;
>  		}
>  		cluster->cl_idle_timeout_ms = val;
I don't know how I missed this before, but you're erroring with a negative return
value, yet continuing with the work of setting cluster->cl_idle_timeout_ms
anyway. I think we're missing some goto's here and in the similar blocks
below.


> @@ -1121,6 +1121,44 @@ static int o2net_check_handshake(struct 
>  		return -1;
>  	}
>  
> +	/*
> +	 * Ensure timeouts are consistent with other nodes, otherwise
> +	 * we can end up with one node thinking that the other must be down,
> +	 * but isn't. This can ultimately cause corruption.
> +	 */
> +	if (be32_to_cpu(hand->o2net_idle_timeout_ms) !=
> +				o2net_idle_timeout(sc->sc_node)) {
> +		mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of "
> +		     "%u ms, but we use %u ms locally.  disconnecting\n",
> +		     SC_NODEF_ARGS(sc),
> +		     be32_to_cpu(hand->o2net_idle_timeout_ms),
> +		     o2net_idle_timeout(sc->sc_node));
> +		o2net_ensure_shutdown(nn, sc, -ENOTCONN);
> +		return -1;
> +	}
> +
> +	if (be32_to_cpu(hand->o2net_keepalive_delay_ms) !=
> +			o2net_keepalive_delay(sc->sc_node)) {
> +		mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of "
> +		     "%u ms, but we use %u ms locally.  disconnecting\n",
> +		     SC_NODEF_ARGS(sc),
> +		     be32_to_cpu(hand->o2net_keepalive_delay_ms),
> +		     o2net_keepalive_delay(sc->sc_node));
> +		o2net_ensure_shutdown(nn, sc, -ENOTCONN);
> +		return -1;
> +	}
> +
> +	if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) !=
> +			O2HB_MAX_WRITE_TIMEOUT_MS) {
> +		mlog(ML_NOTICE, SC_NODEF_FMT " uses a heartbeat timeout of "
> +		     "%u ms, but we use %u ms locally.  disconnecting\n",
> +		     SC_NODEF_ARGS(sc),
> +		     be32_to_cpu(hand->o2net_keepalive_delay_ms),
We check heartbeat timeout here, but print keepalive delay...


> @@ -1153,6 +1191,26 @@ static int o2net_advance_rx(struct o2net
>  	sclog(sc, "receiving\n");
>  	do_gettimeofday(&sc->sc_tv_advance_start);
>  
> +	if(unlikely(sc->sc_handshake_ok == 0)) {
> +		if(sc->sc_page_off < sizeof(struct o2net_handshake)) {
> +			data = page_address(sc->sc_page) + sc->sc_page_off;
> +			datalen = sizeof(struct o2net_handshake) - sc->sc_page_off;
> +			ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
> +			if (ret > 0)
> +				sc->sc_page_off += ret;
> +		}
> +
> +		if (sc->sc_page_off == sizeof(struct o2net_handshake)) {
> +			o2net_check_handshake(sc);
> +			if(sc->sc_handshake_ok == 0) {
> +				BUG_ON(sizeof(struct o2net_handshake)
> +				       == sizeof(struct o2net_msg));
Is this necessary? Didn't we fix the logic such that the relative sizes
don't matter any more? If it _is_ necessary, then it should be a
BUILD_BUG_ON() in a more visible place, with a nice fat comment explaining
why...


> +				ret = -EPROTO;
> +			}
> +			goto out;
Do you mean to move that goto within the

if (sc->sc_handshake_ok == 0) {

block? I _think_ it's ok for us to continue otherwise...


> @@ -1178,8 +1227,7 @@ static int o2net_advance_rx(struct o2net
>  				    O2NET_MAX_PAYLOAD_BYTES)
>  					ret = -EOVERFLOW;
>  			}
> -		}
> -		if (ret <= 0)
> +		} else
>  			goto out;
>  	}
Why are you doing that? We'll continue now if we want to return -EOVERFLOW
where we would error out before.

Thanks,
	--Mark

--
Mark Fasheh
Senior Software Developer, Oracle
mark.fasheh@oracle.com

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Ocfs2-devel] [patch 1/1] ocfs2-timeout-protocol.patch
  2006-11-29 15:31   ` Mark Fasheh
@ 2006-11-30  4:25     ` Andrew Beekhof
  2006-11-30  9:37       ` Mark Fasheh
  0 siblings, 1 reply; 9+ messages in thread
From: Andrew Beekhof @ 2006-11-30  4:25 UTC (permalink / raw)
  To: ocfs2-devel


On Nov 30, 2006, at 12:31 AM, Mark Fasheh wrote:

> Hi Andrew,
> 	Things are looking much better, but there's still a few issues that
> I found while reviewing the patch. I got Zach to look at it too  
> (he's the
> original author of the ocfs2 network code) which has generated some  
> good
> comments.
>
> On Wed, Nov 29, 2006 at 09:51:31AM +0100, abeekhof@suse.de wrote:
>>
>> From: Andrew Beekhof <abeekhof@suse.de>
>> Subject: [patch 1/1] OCFS2 Configurable timeouts - Protocol changes
>>
>> Modify the OCFS2 handshake to ensure essential timeouts are  
>> configured
>>   identically on all nodes.
>> Only allow changes when there are no connected peers
>> Improves the logic in o2net_advance_rx() which broke now that
>>   sizeof(struct o2net_handshake) is greater than sizeof(struct  
>> o2net_msg)
>> Included is the field for userspace-heartbeat timeout to avoid the  
>> need for
>>   further protocol changes.
>> Uses a global spinlock to ensure the decisions to update configfs  
>> entries
>>   are made on the correct value.  The region covered by the  
>> spinlock when
>>   incrimenting the counter is much larger as this is the more  
>> critical case.
> Nitpick: Can you format that commit log to be a bit more in line with
> standard kernel commits (the indenting is weird)

sure

>
>
>> Index: fs/ocfs2/cluster/nodemanager.c
>> ===================================================================
>> --- fs/ocfs2/cluster/nodemanager.c.orig	2006-11-20  
>> 16:25:58.000000000 +0100
>> +++ fs/ocfs2/cluster/nodemanager.c	2006-11-27 09:57:56.000000000  
>> +0100
>> @@ -558,15 +558,14 @@ static ssize_t o2nm_cluster_attr_write(c
>>  	return count;
>>  }
>>
>> -static ssize_t o2nm_cluster_attr_idle_timeout_ms_read(struct  
>> o2nm_cluster *cluster,
>> -                                                 char *page)
>> +static ssize_t o2nm_cluster_attr_idle_timeout_ms_read(
>> +	struct o2nm_cluster *cluster, char *page)
>>  {
>>  	return sprintf(page, "%u\n", cluster->cl_idle_timeout_ms);
>>  }
> Can you not re-write the function prototypes unless they're actually
> changing please? It clutters up the patch and makes it harder to  
> find the
> actual code to check (see below).

ah, bad habit i picked up working on smaller projects.
is it ok in a separate patch?  or have I got my wrap points set too  
small by default?

>
>
>> @@ -574,10 +573,22 @@ static ssize_t o2nm_cluster_attr_idle_ti
>>  	ret =  o2nm_cluster_attr_write(page, count, &val);
>>
>>  	if (ret > 0) {
>> +		if (cluster->cl_idle_timeout_ms != val) {
>> +			spin_lock(&connected_lock);
>> +			if(o2net_num_connected_peers()) {
>> +				mlog(ML_NOTICE,
>> +				     "o2net: cannot change idle timeout after "
>> +				     "the first peer has agreed to it."
>> +				     "  %d connected peers\n",
>> +				     o2net_num_connected_peers());
>> +				ret = -EINVAL;
>> +			}
>> +			spin_unlock(&connected_lock);
>> +		}
>>  		if (val <= cluster->cl_keepalive_delay_ms) {
>>  			mlog(ML_NOTICE, "o2net: idle timeout must be larger "
>>  			     "than keepalive delay\n");
>> -			return -EINVAL;
>> +			ret = -EINVAL;
>>  		}
>>  		cluster->cl_idle_timeout_ms = val;
> I don't know how I missed this before, but you're erroring with a  
> negative return
> value, yet continuing with the work of setting cluster- 
> >cl_idle_timeout_ms
> anyway. I think we're missing some goto's here and in the similar  
> blocks
> below.

my bad - fixed

>
>> @@ -1121,6 +1121,44 @@ static int o2net_check_handshake(struct
>>  		return -1;
>>  	}
>>
>> +	/*
>> +	 * Ensure timeouts are consistent with other nodes, otherwise
>> +	 * we can end up with one node thinking that the other must be  
>> down,
>> +	 * but isn't. This can ultimately cause corruption.
>> +	 */
>> +	if (be32_to_cpu(hand->o2net_idle_timeout_ms) !=
>> +				o2net_idle_timeout(sc->sc_node)) {
>> +		mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of "
>> +		     "%u ms, but we use %u ms locally.  disconnecting\n",
>> +		     SC_NODEF_ARGS(sc),
>> +		     be32_to_cpu(hand->o2net_idle_timeout_ms),
>> +		     o2net_idle_timeout(sc->sc_node));
>> +		o2net_ensure_shutdown(nn, sc, -ENOTCONN);
>> +		return -1;
>> +	}
>> +
>> +	if (be32_to_cpu(hand->o2net_keepalive_delay_ms) !=
>> +			o2net_keepalive_delay(sc->sc_node)) {
>> +		mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of "
>> +		     "%u ms, but we use %u ms locally.  disconnecting\n",
>> +		     SC_NODEF_ARGS(sc),
>> +		     be32_to_cpu(hand->o2net_keepalive_delay_ms),
>> +		     o2net_keepalive_delay(sc->sc_node));
>> +		o2net_ensure_shutdown(nn, sc, -ENOTCONN);
>> +		return -1;
>> +	}
>> +
>> +	if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) !=
>> +			O2HB_MAX_WRITE_TIMEOUT_MS) {
>> +		mlog(ML_NOTICE, SC_NODEF_FMT " uses a heartbeat timeout of "
>> +		     "%u ms, but we use %u ms locally.  disconnecting\n",
>> +		     SC_NODEF_ARGS(sc),
>> +		     be32_to_cpu(hand->o2net_keepalive_delay_ms),
> We check hearbeat timeout here, but print keepalive delay...
>

fixed

>
>> @@ -1153,6 +1191,26 @@ static int o2net_advance_rx(struct o2net
>>  	sclog(sc, "receiving\n");
>>  	do_gettimeofday(&sc->sc_tv_advance_start);
>>
>> +	if(unlikely(sc->sc_handshake_ok == 0)) {
>> +		if(sc->sc_page_off < sizeof(struct o2net_handshake)) {
>> +			data = page_address(sc->sc_page) + sc->sc_page_off;
>> +			datalen = sizeof(struct o2net_handshake) - sc->sc_page_off;
>> +			ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
>> +			if (ret > 0)
>> +				sc->sc_page_off += ret;
>> +		}
>> +
>> +		if (sc->sc_page_off == sizeof(struct o2net_handshake)) {
>> +			o2net_check_handshake(sc);
>> +			if(sc->sc_handshake_ok == 0) {
>> +				BUG_ON(sizeof(struct o2net_handshake)
>> +				       == sizeof(struct o2net_msg));
> Is this necessary?

I wasn't sure at the time - see below - so I wanted to make sure it at
least died sanely.
Apparently I still needed education on how that is done :)

> Didn't we fix the logic such that the relative sizes
> don't matter any more? If it _is_ necessary, then it should be a
> BUILD_BUG_ON() in a more visible place,

ah, I was not familiar with that macro yet

> with a nice fat comment explaining
> why...
>
>> +				ret = -EPROTO;
>> +			}
>> +			goto out;
> Do you mean to move that goto within the
>
> if (sc->sc_handshake_ok == 0) {
>
> block? I _think_ it's ok for us to continue otherwise...

i did - but if we never want to process an o2net_msg if the handshake  
has not been completed, then i can structure things a little  
differently/clearly

>
>
>> @@ -1178,8 +1227,7 @@ static int o2net_advance_rx(struct o2net
>>  				    O2NET_MAX_PAYLOAD_BYTES)
>>  					ret = -EOVERFLOW;
>>  			}
>> -		}
>> -		if (ret <= 0)
>> +		} else
>>  			goto out;
>>  	}
> Why are you doing that? We'll continue now if we want to return - 
> EOVERFLOW
> where we would error out before.

damn - i should have noticed that


I'll resubmit once we sort out the  o2net_msg / o2net_handshake  
situation

--
Andrew Beekhof

"Would the last person to leave please turn out the enlightenment?" -  
TISM

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Ocfs2-devel] [patch 1/1] ocfs2-timeout-protocol.patch
  2006-11-30  4:25     ` Andrew Beekhof
@ 2006-11-30  9:37       ` Mark Fasheh
  0 siblings, 0 replies; 9+ messages in thread
From: Mark Fasheh @ 2006-11-30  9:37 UTC (permalink / raw)
  To: ocfs2-devel

On Thu, Nov 30, 2006 at 01:24:29PM +0100, Andrew Beekhof wrote:
> >Can you not re-write the function prototypes unless they're actually
> >changing please? It clutters up the patch and makes it harder to  
> >find the
> >actual code to check (see below).
> 
> ah, bad habit i picked up working on smaller projects.
> is it ok in a separate patch?  or have I got my wrap points set too  
> small by default?
In general if you see something that's inconsistent with
Documentation/CodingStyle, then yeah fixing it in a separate patch is fine.

As far as those functions are concerned, if you're not happy with the
indentation, then it would be nicest if you got them right in the patch
which introduced them.


> >>+		if (sc->sc_page_off == sizeof(struct o2net_handshake)) {
> >>+			o2net_check_handshake(sc);
> >>+			if(sc->sc_handshake_ok == 0) {
> >>+				BUG_ON(sizeof(struct o2net_handshake)
> >>+				       == sizeof(struct o2net_msg));
> >Is this necessary?
> 
> I wasnt sure at the time - see below - so i wanted to make sure it at  
> least died sanely
> apparently i still needed education on how that is done :)
> 
> >Didn't we fix the logic such that the relative sizes
> >don't matter any more? If it _is_ necessary, then it should be a
> >BUILD_BUG_ON() in a more visible place,
> 
> ah, I was not familiar with that macro yet
Yeah, I'm mostly going on your description of the patch, and Zach's
description of the problem. I'll have to look more closely to see if this is
still something we need to trap or not.


> >with a nice fat comment explaining
> >why...
> >
> >>+				ret = -EPROTO;
> >>+			}
> >>+			goto out;
> >Do you mean to move that goto within the
> >
> >if (sc->sc_handshake_ok == 0) {
> >
> >block? I _think_ it's ok for us to continue otherwise...
> 
> i did - but if we never want to process an o2net_msg if the handshake  
> has not been completed, then i can structure things a little  
> differently/clearly
Hmm, yeah... It looks like what'd happen if we don't get a properly sized
handshake is that we'd continue to process the o2net_msg, which I don't
think we want to do. If we skipped that, then we don't have to depend on the
sizes not matching... Which is fine because I don't think we ought to be
processing messages from nodes which haven't properly connected to us yet.


> I'll resubmit once we sort out the  o2net_msg / o2net_handshake  
> situation
Great, thanks a lot Andrew!
	--Mark

--
Mark Fasheh
Senior Software Developer, Oracle
mark.fasheh@oracle.com

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Ocfs2-devel] [patch 0/1] OCFS Configurable timeouts - Revision 2
  2006-11-29  1:04 [Ocfs2-devel] [patch 0/1] OCFS Configurable timeouts - Revision 2 abeekhof
  2006-11-29  1:04 ` [Ocfs2-devel] [patch 1/1] ocfs2-timeout-protocol.patch abeekhof
@ 2006-11-30 17:50 ` Joel Becker
  2006-12-01  0:16   ` Andrew Beekhof
  1 sibling, 1 reply; 9+ messages in thread
From: Joel Becker @ 2006-11-30 17:50 UTC (permalink / raw)
  To: ocfs2-devel

On Wed, Nov 29, 2006 at 09:51:30AM +0100, abeekhof@suse.de wrote:
> Added a global spinlock around modifications to o2net_connected_peer (as discussed with Mark).
> 
> I have a separate patch that uses to_o2nm_cluster_from_node() but since I cant reproduce the problem mentioned in Jeff's comment (it apparently needs the userspace heartbeating modifications), I'd prefer to leave it out.

	Which problem do you mean?  I'm trying to know if the callback
changes (->disconnect_notify()) make you happy and are good enough to
push towards mainline.  Have you tested with them, etc?

Joel

-- 

Bram's Law:
	The easier a piece of software is to write, the worse it's
	implemented in practice.

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker@oracle.com
Phone: (650) 506-8127

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Ocfs2-devel] [patch 0/1] OCFS Configurable timeouts - Revision 2
  2006-11-30 17:50 ` [Ocfs2-devel] [patch 0/1] OCFS Configurable timeouts - Revision 2 Joel Becker
@ 2006-12-01  0:16   ` Andrew Beekhof
  0 siblings, 0 replies; 9+ messages in thread
From: Andrew Beekhof @ 2006-12-01  0:16 UTC (permalink / raw)
  To: ocfs2-devel


On Dec 1, 2006, at 2:50 AM, Joel Becker wrote:

> On Wed, Nov 29, 2006 at 09:51:30AM +0100, abeekhof@suse.de wrote:
>> Added a global spinlock around modifications to  
>> o2net_connected_peer (as discussed with Mark).
>>
>> I have a separate patch that uses to_o2nm_cluster_from_node() but  
>> since I cant reproduce the problem mentioned in Jeff's comment (it  
>> apparently needs the userspace heartbeating modifications), I'd  
>> prefer to leave it out.
>
> 	Which problem do you mean?  I'm trying to know if the callback
> changes (->disconnect_notify()) make you happy and are good enough to
> push towards mainline.  Have you tested with them, etc?

I'm referring to this problem noted by Jeff

/*
* FIXME: These should use to_o2nm_cluster_from_node(), but we end up
* losing our parent link to the cluster during shutdown. This can be
* solved by adding a pre-removal callback to configfs, or passing
* around the cluster with the node. -jeffm
*/

Without the userspace heartbeat code I can't seem to create this  
situation and everything works nicely.
So I'm not yet in a position to really comment on  
the .disconnect_notify changes.

--
Andrew Beekhof

"Would the last person to leave please turn out the enlightenment?" -  
TISM

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Ocfs2-devel] [patch 0/1] OCFS Configurable timeouts - Revision 2
@ 2006-11-21  7:12 abeekhof
  2006-11-21  7:12 ` [Ocfs2-devel] [patch 1/1] ocfs2-timeout-protocol.patch abeekhof
  0 siblings, 1 reply; 9+ messages in thread
From: abeekhof @ 2006-11-21  7:12 UTC (permalink / raw)
  To: ocfs2-devel

The following is an update to the 3rd patch in the configurable timeouts series.
Changes to the original patch include:
- setting and checking of o2hb_heartbeat_timeout_ms in handshakes
- better logic in o2net_advance_rx() removing the need for dummy fields in o2net_msg struct
- tracks how many peers are connected
- prevents o2net_idle_timeout_ms and o2net_keepalive_delay_ms from being modified when we have connected peers

The rationale for including these changes in the final patch is that patch 2 creates the capability and patch 3 ensures it is used sanely across all nodes.

Still to come is a patch which makes use of the to_o2nm_cluster_from_node() function as previously suggested by Mark.

Andrew

--

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Ocfs2-devel] [patch 1/1] ocfs2-timeout-protocol.patch
  2006-11-21  7:12 abeekhof
@ 2006-11-21  7:12 ` abeekhof
  0 siblings, 0 replies; 9+ messages in thread
From: abeekhof @ 2006-11-21  7:12 UTC (permalink / raw)
  To: ocfs2-devel


From: Andrew Beekhof <abeekhof@suse.de>
Subject: [patch 1/1] OCFS2 Configurable timeouts - Protocol changes

Modify the OCFS2 handshake to ensure essential timeouts are configured
  identically on all nodes.
Included is the field for userspace-heartbeat timeout to avoid the need for
  further protocol changes.

The addition of two dummy fields is a temporary measure to 
   satisfy the logic in o2net_check_handshake() and will be
   rectified in a future version of this patch

Signed-off-by: Andrew Beekhof <abeekhof@suse.de>
---
 fs/ocfs2/cluster/nodemanager.c  |   19 ++++++-
 fs/ocfs2/cluster/tcp.c          |  102 ++++++++++++++++++++++++++++++++++------
 fs/ocfs2/cluster/tcp.h          |    1 
 fs/ocfs2/cluster/tcp_internal.h |    6 +-
 4 files changed, 111 insertions(+), 17 deletions(-)








Index: fs/ocfs2/cluster/nodemanager.c
===================================================================
--- fs/ocfs2/cluster/nodemanager.c.orig	2006-11-20 16:25:58.000000000 +0100
+++ fs/ocfs2/cluster/nodemanager.c	2006-11-21 12:49:03.000000000 +0100
@@ -574,7 +574,14 @@ static ssize_t o2nm_cluster_attr_idle_ti
 	ret =  o2nm_cluster_attr_write(page, count, &val);
 
 	if (ret > 0) {
-		if (val <= cluster->cl_keepalive_delay_ms) {
+		if(cluster->cl_idle_timeout_ms != val
+		   && o2net_num_connected_peers()) {
+			mlog(ML_NOTICE, "o2net: cannot change idle timeout after "
+			     "the first peer has agreed to it.  %d connected peers\n",
+			     o2net_num_connected_peers());
+			return -EINVAL;
+
+		} else if (val <= cluster->cl_keepalive_delay_ms) {
 			mlog(ML_NOTICE, "o2net: idle timeout must be larger "
 			     "than keepalive delay\n");
 			return -EINVAL;
@@ -601,7 +608,15 @@ static ssize_t o2nm_cluster_attr_keepali
 	ret =  o2nm_cluster_attr_write(page, count, &val);
 
 	if (ret > 0) {
-		if (val >= cluster->cl_idle_timeout_ms) {
+
+		if(cluster->cl_keepalive_delay_ms != val
+		   && o2net_num_connected_peers()) {
+			mlog(ML_NOTICE, "o2net: cannot change keepalive delay after "
+			     "the first peer has agreed to it.  %d connected peers\n",
+			     o2net_num_connected_peers());
+			return -EINVAL;
+
+		} else if (val >= cluster->cl_idle_timeout_ms) {
 			mlog(ML_NOTICE, "o2net: keepalive delay must be "
 			     "smaller than idle timeout\n");
 			return -EINVAL;
Index: fs/ocfs2/cluster/tcp.c
===================================================================
--- fs/ocfs2/cluster/tcp.c.orig	2006-11-20 16:19:12.000000000 +0100
+++ fs/ocfs2/cluster/tcp.c	2006-11-21 15:42:01.000000000 +0100
@@ -1121,6 +1121,44 @@ static int o2net_check_handshake(struct 
 		return -1;
 	}
 
+	/*
+	 * Ensure timeouts are consistent with other nodes, otherwise
+	 * we can end up with one node thinking that the other must be down,
+	 * but isn't. This can ultimately cause corruption.
+	 */
+	if (be32_to_cpu(hand->o2net_idle_timeout_ms) !=
+				o2net_idle_timeout(sc->sc_node)) {
+		mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of "
+		     "%u ms, but we use %u ms locally.  disconnecting\n",
+		     SC_NODEF_ARGS(sc),
+		     be32_to_cpu(hand->o2net_idle_timeout_ms),
+		     o2net_idle_timeout(sc->sc_node));
+		o2net_ensure_shutdown(nn, sc, -ENOTCONN);
+		return -1;
+	}
+
+	if (be32_to_cpu(hand->o2net_keepalive_delay_ms) !=
+			o2net_keepalive_delay(sc->sc_node)) {
+		mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of "
+		     "%u ms, but we use %u ms locally.  disconnecting\n",
+		     SC_NODEF_ARGS(sc),
+		     be32_to_cpu(hand->o2net_keepalive_delay_ms),
+		     o2net_keepalive_delay(sc->sc_node));
+		o2net_ensure_shutdown(nn, sc, -ENOTCONN);
+		return -1;
+	}
+
+	if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) !=
+			O2HB_MAX_WRITE_TIMEOUT_MS) {
+		mlog(ML_NOTICE, SC_NODEF_FMT " uses a heartbeat timeout of "
+		     "%u ms, but we use %u ms locally.  disconnecting\n",
+		     SC_NODEF_ARGS(sc),
+		     be32_to_cpu(hand->o2net_keepalive_delay_ms),
+		     O2HB_MAX_WRITE_TIMEOUT_MS);
+		o2net_ensure_shutdown(nn, sc, -ENOTCONN);
+		return -1;
+	}
+	
 	sc->sc_handshake_ok = 1;
 
 	spin_lock(&nn->nn_lock);
@@ -1153,6 +1191,26 @@ static int o2net_advance_rx(struct o2net
 	sclog(sc, "receiving\n");
 	do_gettimeofday(&sc->sc_tv_advance_start);
 
+	if(unlikely(sc->sc_handshake_ok == 0)) {
+		if(sc->sc_page_off < sizeof(struct o2net_handshake)) {
+			data = page_address(sc->sc_page) + sc->sc_page_off;
+			datalen = sizeof(struct o2net_handshake) - sc->sc_page_off;
+			ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
+			if (ret > 0)
+				sc->sc_page_off += ret;
+		}
+		
+		if (sc->sc_page_off == sizeof(struct o2net_handshake)) {
+			o2net_check_handshake(sc);
+			if(sc->sc_handshake_ok == 0) {
+				BUG_ON(sizeof(struct o2net_handshake)
+				       == sizeof(struct o2net_msg));
+				ret = -EPROTO;
+			}
+			goto out;
+		}
+	}
+
 	/* do we need more header? */
 	if (sc->sc_page_off < sizeof(struct o2net_msg)) {
 		data = page_address(sc->sc_page) + sc->sc_page_off;
@@ -1160,26 +1218,16 @@ static int o2net_advance_rx(struct o2net
 		ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
 		if (ret > 0) {
 			sc->sc_page_off += ret;
-
-			/* this working relies on the handshake being
-			 * smaller than the normal message header */
-			if (sc->sc_page_off >= sizeof(struct o2net_handshake)&&
-			    !sc->sc_handshake_ok && o2net_check_handshake(sc)) {
-				ret = -EPROTO;
-				goto out;
-			}
-
-			/* only swab incoming here.. we can
-			 * only get here once as we cross from
-			 * being under to over */
 			if (sc->sc_page_off == sizeof(struct o2net_msg)) {
+				/* only swab incoming here.. we can
+				 * only get here once as we cross from
+				 * being under to over */
 				hdr = page_address(sc->sc_page);
 				if (be16_to_cpu(hdr->data_len) >
 				    O2NET_MAX_PAYLOAD_BYTES)
 					ret = -EOVERFLOW;
 			}
-		}
-		if (ret <= 0)
+		} else
 			goto out;
 	}
 
@@ -1269,6 +1317,18 @@ static int o2net_set_nodelay(struct sock
 	return ret;
 }
 
+static void o2net_initialize_handshake(void)
+{
+	o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32(
+		O2HB_MAX_WRITE_TIMEOUT_MS);
+	o2net_hand->o2net_idle_timeout_ms = cpu_to_be32(
+		o2net_idle_timeout(NULL));
+	o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32(
+		o2net_keepalive_delay(NULL));
+	o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32(
+		o2net_reconnect_delay(NULL));
+}
+
 /* ------------------------------------------------------------ */
 
 /* called when a connect completes and after a sock is accepted.  the
@@ -1281,6 +1341,7 @@ static void o2net_sc_connect_completed(v
               (unsigned long long)O2NET_PROTOCOL_VERSION,
 	      (unsigned long long)be64_to_cpu(o2net_hand->connector_id));
 
+	o2net_initialize_handshake();
 	o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
 	sc_put(sc);
 }
@@ -1481,12 +1542,20 @@ static void o2net_still_up(void *arg)
 
 /* ------------------------------------------------------------ */
 
+static int o2net_connected_peers = 0;
+
+int o2net_num_connected_peers(void) 
+{
+	return o2net_connected_peers;
+}
+
 void o2net_disconnect_node(struct o2nm_node *node)
 {
 	struct o2net_node *nn = o2net_nn_from_num(node->nd_num);
 
 	/* don't reconnect until it's heartbeating again */
 	spin_lock(&nn->nn_lock);
+	o2net_connected_peers--;
 	o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
 	spin_unlock(&nn->nn_lock);
 
@@ -1505,8 +1574,11 @@ static void o2net_hb_node_down_cb(struct
 
 	if (node_num != o2nm_this_node())
 		o2net_disconnect_node(node);
+
+	BUG_ON(o2net_connected_peers < 0);
 }
 
+
 static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
 				void *data)
 {
@@ -1530,6 +1602,7 @@ static void o2net_hb_node_up_cb(struct o
 		 * only use set_nn_state to clear the persistent error
 		 * if that hasn't already happened */
 		spin_lock(&nn->nn_lock);
+		o2net_connected_peers++;
 		if (nn->nn_persistent_error)
 			o2net_set_nn_state(nn, NULL, 0, 0);
 		spin_unlock(&nn->nn_lock);
@@ -1668,6 +1741,7 @@ static int o2net_accept_one(struct socke
 	o2net_register_callbacks(sc->sc_sock->sk, sc);
 	o2net_sc_queue_work(sc, &sc->sc_rx_work);
 
+	o2net_initialize_handshake();
 	o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
 
 out:
Index: fs/ocfs2/cluster/tcp.h
===================================================================
--- fs/ocfs2/cluster/tcp.h.orig	2006-11-20 16:19:12.000000000 +0100
+++ fs/ocfs2/cluster/tcp.h	2006-11-21 15:32:53.000000000 +0100
@@ -108,6 +108,7 @@ void o2net_unregister_hb_callbacks(void)
 int o2net_start_listening(struct o2nm_node *node);
 void o2net_stop_listening(struct o2nm_node *node);
 void o2net_disconnect_node(struct o2nm_node *node);
+int o2net_num_connected_peers(void);
 
 int o2net_init(void);
 void o2net_exit(void);
Index: fs/ocfs2/cluster/tcp_internal.h
===================================================================
--- fs/ocfs2/cluster/tcp_internal.h.orig	2006-11-20 16:19:12.000000000 +0100
+++ fs/ocfs2/cluster/tcp_internal.h	2006-11-20 16:25:36.000000000 +0100
@@ -48,10 +48,14 @@
  * 	- full 64 bit i_size in the metadata lock lvbs
  * 	- introduction of "rw" lock and pushing meta/data locking down
  */
-#define O2NET_PROTOCOL_VERSION 4ULL
+#define O2NET_PROTOCOL_VERSION 5ULL
 struct o2net_handshake {
 	__be64	protocol_version;
 	__be64	connector_id;
+	__be32  o2hb_heartbeat_timeout_ms;
+	__be32  o2net_idle_timeout_ms;
+	__be32  o2net_keepalive_delay_ms;
+	__be32  o2net_reconnect_delay_ms;
 };
 
 struct o2net_node {

--

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2006-12-01  0:16 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-11-29  1:04 [Ocfs2-devel] [patch 0/1] OCFS Configurable timeouts - Revision 2 abeekhof
2006-11-29  1:04 ` [Ocfs2-devel] [patch 1/1] ocfs2-timeout-protocol.patch abeekhof
2006-11-29 15:30   ` Zach Brown
2006-11-29 15:31   ` Mark Fasheh
2006-11-30  4:25     ` Andrew Beekhof
2006-11-30  9:37       ` Mark Fasheh
2006-11-30 17:50 ` [Ocfs2-devel] [patch 0/1] OCFS Configurable timeouts - Revision 2 Joel Becker
2006-12-01  0:16   ` Andrew Beekhof
  -- strict thread matches above, loose matches on Subject: below --
2006-11-21  7:12 abeekhof
2006-11-21  7:12 ` [Ocfs2-devel] [patch 1/1] ocfs2-timeout-protocol.patch abeekhof

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.