* [PATCH] Add exponential backoff + random delay to MADs when retrying after timeout.
@ 2010-10-11 15:34 Mike Heinz
0 siblings, 0 replies; 8+ messages in thread
From: Mike Heinz @ 2010-10-11 15:34 UTC (permalink / raw)
To: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
[-- Attachment #1: Type: text/plain, Size: 6484 bytes --]
This patch builds upon a discussion we had earlier this year on adding a backoff function when retrying MAD sends after a timeout.
This patch does NOT implement the ABI/API changes that would be needed to take advantage of the new features, but it lays the groundwork for doing so. In addition, it provides a new module parameter that allows the administrator to coerce existing code into using the new capability.
First, I've added a new field called "randomized_wait" to the ib_mad_send_buf structure. If this parameter is set, each time the WR times out, the timeout for the next retry is set to (send_wr->timeout_ms + (511 << send_wr->retries) - (random32() & 511)). In other words, on the first retry, the randomization code will add between 0 and 1/2 second to the timeout. On the second retry, it will add between 1 and 1.5 seconds to the timeout, on the 3rd, between 2 and 2.5 seconds, on the 4th, between 4 and 4.5, et cetera. In addition, a new field, total_timeout has been added to the ib_mad_send_wr_private and is initialized to (send_wr->timeout * send_wr->max_retries). Retries cannot exceed this total time, even though that will mean a lower number of retry attempts.
Finally, I've added a module parameter to coerce all mad work requests to use this feature if desired.
parm: randomized_wait:When true, use a randomized backoff algorithm to control retries for timeouts. (int)
--------
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index ef1304f..3b03f1c 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -42,6 +42,11 @@
#include "smi.h"
#include "agent.h"
+#include "linux/random.h"
+
+#define MAD_MIN_TIMEOUT_MS 511
+#define MAD_RAND_TIMEOUT_MS 511
+
MODULE_LICENSE("Dual BSD/GPL");
MODULE_DESCRIPTION("kernel IB MAD API");
MODULE_AUTHOR("Hal Rosenstock");
@@ -55,6 +60,10 @@ MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests
module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests");
+int mad_randomized_wait = 0;
+module_param_named(randomized_wait, mad_randomized_wait, int, 0444);
+MODULE_PARM_DESC(randomized_wait, "When true, use a randomized backoff algorithm to control retries for timeouts.");
+
static struct kmem_cache *ib_mad_cache;
static struct list_head ib_mad_port_list;
@@ -1102,11 +1111,18 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
}
mad_send_wr->tid = ((struct ib_mad_hdr *) send_buf->mad)->tid;
+
+ mad_send_wr->randomized_wait = mad_randomized_wait || send_buf->randomized_wait;
+ mad_send_wr->total_timeout = msecs_to_jiffies(send_buf->timeout_ms) * send_buf->retries;
+
/* Timeout will be updated after send completes */
mad_send_wr->timeout = msecs_to_jiffies(send_buf->timeout_ms);
+
mad_send_wr->max_retries = send_buf->retries;
mad_send_wr->retries_left = send_buf->retries;
+
send_buf->retries = 0;
+
/* Reference for work request to QP + response */
mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
mad_send_wr->status = IB_WC_SUCCESS;
@@ -1803,6 +1819,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
/* Complete corresponding request */
if (ib_response_mad(mad_recv_wc->recv_buf.mad)) {
+
spin_lock_irqsave(&mad_agent_priv->lock, flags);
mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc);
if (!mad_send_wr) {
@@ -1811,6 +1828,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
deref_mad_agent(mad_agent_priv);
return;
}
+
ib_mark_mad_done(mad_send_wr);
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
@@ -2429,14 +2447,33 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
{
int ret;
- if (!mad_send_wr->retries_left)
+ if (!mad_send_wr->retries_left || (mad_send_wr->total_timeout == 0))
return -ETIMEDOUT;
mad_send_wr->retries_left--;
mad_send_wr->send_buf.retries++;
- mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+ if (mad_send_wr->randomized_wait) {
+ mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms +
+ (MAD_MIN_TIMEOUT_MS<<mad_send_wr->send_buf.retries) -
+ (random32()&MAD_RAND_TIMEOUT_MS));
+ if (mad_send_wr->timeout > mad_send_wr->total_timeout) {
+ mad_send_wr->timeout = mad_send_wr->total_timeout;
+ mad_send_wr->total_timeout = 0;
+ } else {
+ mad_send_wr->total_timeout -= mad_send_wr->timeout;
+ }
+ } else {
+ mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+ }
+ printk(KERN_DEBUG PFX "Retrying send %p: retries: %u, retries_left: %u, timeout: %lu, total_timeout: %lu\n",
+ mad_send_wr,
+ mad_send_wr->send_buf.retries,
+ mad_send_wr->retries_left,
+ mad_send_wr->timeout,
+ mad_send_wr->total_timeout);
+
if (mad_send_wr->mad_agent_priv->agent.rmpp_version) {
ret = ib_retry_rmpp(mad_send_wr);
switch (ret) {
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index 9430ab4..01fb7ed 100644
--- a/drivers/infiniband/core/mad_priv.h
+++ b/drivers/infiniband/core/mad_priv.h
@@ -132,8 +132,10 @@ struct ib_mad_send_wr_private {
struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG];
__be64 tid;
unsigned long timeout;
+ unsigned long total_timeout;
int max_retries;
int retries_left;
+ int randomized_wait;
int retry;
int refcount;
enum ib_wc_status status;
diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
index d3b9401..c3d6efb 100644
--- a/include/rdma/ib_mad.h
+++ b/include/rdma/ib_mad.h
@@ -77,6 +77,15 @@
#define IB_MGMT_MAX_METHODS 128
+/* MAD Status field bit masks */
+#define IB_MGMT_MAD_STATUS_SUCCESS 0x0000
+#define IB_MGMT_MAD_STATUS_BUSY 0x0001
+#define IB_MGMT_MAD_STATUS_REDIRECT_REQD 0x0002
+#define IB_MGMT_MAD_STATUS_BAD_VERERSION 0x0004
+#define IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD 0x0008
+#define IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB 0x000c
+#define IB_MGMT_MAD_STATUS_INVALID_ATTRIB_VALUE 0x001c
+
/* RMPP information */
#define IB_MGMT_RMPP_VERSION 1
@@ -246,6 +255,7 @@ struct ib_mad_send_buf {
int seg_count;
int seg_size;
int timeout_ms;
+ int randomized_wait;
int retries;
};
[-- Attachment #2: randomized_mad_timeout.patch --]
[-- Type: application/octet-stream, Size: 4914 bytes --]
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index ef1304f..3b03f1c 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -42,6 +42,11 @@
#include "smi.h"
#include "agent.h"
+#include "linux/random.h"
+
+#define MAD_MIN_TIMEOUT_MS 511
+#define MAD_RAND_TIMEOUT_MS 511
+
MODULE_LICENSE("Dual BSD/GPL");
MODULE_DESCRIPTION("kernel IB MAD API");
MODULE_AUTHOR("Hal Rosenstock");
@@ -55,6 +60,10 @@ MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests
module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests");
+int mad_randomized_wait = 0;
+module_param_named(randomized_wait, mad_randomized_wait, int, 0444);
+MODULE_PARM_DESC(randomized_wait, "When true, use a randomized backoff algorithm to control retries for timeouts.");
+
static struct kmem_cache *ib_mad_cache;
static struct list_head ib_mad_port_list;
@@ -1102,11 +1111,18 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
}
mad_send_wr->tid = ((struct ib_mad_hdr *) send_buf->mad)->tid;
+
+ mad_send_wr->randomized_wait = mad_randomized_wait || send_buf->randomized_wait;
+ mad_send_wr->total_timeout = msecs_to_jiffies(send_buf->timeout_ms) * send_buf->retries;
+
/* Timeout will be updated after send completes */
mad_send_wr->timeout = msecs_to_jiffies(send_buf->timeout_ms);
+
mad_send_wr->max_retries = send_buf->retries;
mad_send_wr->retries_left = send_buf->retries;
+
send_buf->retries = 0;
+
/* Reference for work request to QP + response */
mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
mad_send_wr->status = IB_WC_SUCCESS;
@@ -1803,6 +1819,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
/* Complete corresponding request */
if (ib_response_mad(mad_recv_wc->recv_buf.mad)) {
+
spin_lock_irqsave(&mad_agent_priv->lock, flags);
mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc);
if (!mad_send_wr) {
@@ -1811,6 +1828,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
deref_mad_agent(mad_agent_priv);
return;
}
+
ib_mark_mad_done(mad_send_wr);
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
@@ -2429,14 +2447,33 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
{
int ret;
- if (!mad_send_wr->retries_left)
+ if (!mad_send_wr->retries_left || (mad_send_wr->total_timeout == 0))
return -ETIMEDOUT;
mad_send_wr->retries_left--;
mad_send_wr->send_buf.retries++;
- mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+ if (mad_send_wr->randomized_wait) {
+ mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms +
+ (MAD_MIN_TIMEOUT_MS<<mad_send_wr->send_buf.retries) -
+ (random32()&MAD_RAND_TIMEOUT_MS));
+ if (mad_send_wr->timeout > mad_send_wr->total_timeout) {
+ mad_send_wr->timeout = mad_send_wr->total_timeout;
+ mad_send_wr->total_timeout = 0;
+ } else {
+ mad_send_wr->total_timeout -= mad_send_wr->timeout;
+ }
+ } else {
+ mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+ }
+ printk(KERN_DEBUG PFX "Retrying send %p: retries: %u, retries_left: %u, timeout: %lu, total_timeout: %lu\n",
+ mad_send_wr,
+ mad_send_wr->send_buf.retries,
+ mad_send_wr->retries_left,
+ mad_send_wr->timeout,
+ mad_send_wr->total_timeout);
+
if (mad_send_wr->mad_agent_priv->agent.rmpp_version) {
ret = ib_retry_rmpp(mad_send_wr);
switch (ret) {
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index 9430ab4..01fb7ed 100644
--- a/drivers/infiniband/core/mad_priv.h
+++ b/drivers/infiniband/core/mad_priv.h
@@ -132,8 +132,10 @@ struct ib_mad_send_wr_private {
struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG];
__be64 tid;
unsigned long timeout;
+ unsigned long total_timeout;
int max_retries;
int retries_left;
+ int randomized_wait;
int retry;
int refcount;
enum ib_wc_status status;
diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
index d3b9401..c3d6efb 100644
--- a/include/rdma/ib_mad.h
+++ b/include/rdma/ib_mad.h
@@ -77,6 +77,15 @@
#define IB_MGMT_MAX_METHODS 128
+/* MAD Status field bit masks */
+#define IB_MGMT_MAD_STATUS_SUCCESS 0x0000
+#define IB_MGMT_MAD_STATUS_BUSY 0x0001
+#define IB_MGMT_MAD_STATUS_REDIRECT_REQD 0x0002
+#define IB_MGMT_MAD_STATUS_BAD_VERERSION 0x0004
+#define IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD 0x0008
+#define IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB 0x000c
+#define IB_MGMT_MAD_STATUS_INVALID_ATTRIB_VALUE 0x001c
+
/* RMPP information */
#define IB_MGMT_RMPP_VERSION 1
@@ -246,6 +255,7 @@ struct ib_mad_send_buf {
int seg_count;
int seg_size;
int timeout_ms;
+ int randomized_wait;
int retries;
};
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH] Add exponential backoff + random delay to MADs when retrying after timeout.
@ 2010-10-26 18:33 Mike Heinz
[not found] ` <4C2744E8AD2982428C5BFE523DF8CDCB49D4675DEB-amwN6d8PyQWXx9kJd3VG2h2eb7JE58TQ@public.gmane.org>
0 siblings, 1 reply; 8+ messages in thread
From: Mike Heinz @ 2010-10-26 18:33 UTC (permalink / raw)
To: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
[-- Attachment #1: Type: text/plain, Size: 6784 bytes --]
Resending. Didn't get any reply after the last posting.
-----Original Message-----
From: Mike Heinz
Sent: Monday, October 11, 2010 11:34 AM
To: 'linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org'
Subject: [PATCH] Add exponential backoff + random delay to MADs when retrying after timeout.
This patch builds upon a discussion we had earlier this year on adding a backoff function when retrying MAD sends after a timeout.
This patch does NOT implement the ABI/API changes that would be needed to take advantage of the new features, but it lays the groundwork for doing so. In addition, it provides a new module parameter that allows the administrator to coerce existing code into using the new capability.
First, I've added a new field called "randomized_wait" to the ib_mad_send_buf structure. If this parameter is set, each time the WR times out, the timeout for the next retry is set to (send_wr->timeout_ms + (511 << send_wr->retries) - (random32() & 511)). In other words, on the first retry, the randomization code will add between 0 and 1/2 second to the timeout. On the second retry, it will add between 1 and 1.5 seconds to the timeout, on the 3rd, between 2 and 2.5 seconds, on the 4th, between 4 and 4.5, et cetera. In addition, a new field, total_timeout has been added to the ib_mad_send_wr_private and is initialized to (send_wr->timeout * send_wr->max_retries). Retries cannot exceed this total time, even though that will mean a lower number of retry attempts.
Finally, I've added a module parameter to coerce all mad work requests to use this feature if desired.
parm: randomized_wait:When true, use a randomized backoff algorithm to control retries for timeouts. (int)
--------
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index ef1304f..3b03f1c 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -42,6 +42,11 @@
#include "smi.h"
#include "agent.h"
+#include "linux/random.h"
+
+#define MAD_MIN_TIMEOUT_MS 511
+#define MAD_RAND_TIMEOUT_MS 511
+
MODULE_LICENSE("Dual BSD/GPL");
MODULE_DESCRIPTION("kernel IB MAD API"); MODULE_AUTHOR("Hal Rosenstock"); @@ -55,6 +60,10 @@ MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests module_param_named(recv_queue_size, mad_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests");
+int mad_randomized_wait = 0;
+module_param_named(randomized_wait, mad_randomized_wait, int, 0444);
+MODULE_PARM_DESC(randomized_wait, "When true, use a randomized backoff
+algorithm to control retries for timeouts.");
+
static struct kmem_cache *ib_mad_cache;
static struct list_head ib_mad_port_list; @@ -1102,11 +1111,18 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
}
mad_send_wr->tid = ((struct ib_mad_hdr *) send_buf->mad)->tid;
+
+ mad_send_wr->randomized_wait = mad_randomized_wait || send_buf->randomized_wait;
+ mad_send_wr->total_timeout = msecs_to_jiffies(send_buf->timeout_ms) *
+send_buf->retries;
+
/* Timeout will be updated after send completes */
mad_send_wr->timeout = msecs_to_jiffies(send_buf->timeout_ms);
+
mad_send_wr->max_retries = send_buf->retries;
mad_send_wr->retries_left = send_buf->retries;
+
send_buf->retries = 0;
+
/* Reference for work request to QP + response */
mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
mad_send_wr->status = IB_WC_SUCCESS;
@@ -1803,6 +1819,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
/* Complete corresponding request */
if (ib_response_mad(mad_recv_wc->recv_buf.mad)) {
+
spin_lock_irqsave(&mad_agent_priv->lock, flags);
mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc);
if (!mad_send_wr) {
@@ -1811,6 +1828,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
deref_mad_agent(mad_agent_priv);
return;
}
+
ib_mark_mad_done(mad_send_wr);
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
@@ -2429,14 +2447,33 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr) {
int ret;
- if (!mad_send_wr->retries_left)
+ if (!mad_send_wr->retries_left || (mad_send_wr->total_timeout == 0))
return -ETIMEDOUT;
mad_send_wr->retries_left--;
mad_send_wr->send_buf.retries++;
- mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+ if (mad_send_wr->randomized_wait) {
+ mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms +
+ (MAD_MIN_TIMEOUT_MS<<mad_send_wr->send_buf.retries) -
+ (random32()&MAD_RAND_TIMEOUT_MS));
+ if (mad_send_wr->timeout > mad_send_wr->total_timeout) {
+ mad_send_wr->timeout = mad_send_wr->total_timeout;
+ mad_send_wr->total_timeout = 0;
+ } else {
+ mad_send_wr->total_timeout -= mad_send_wr->timeout;
+ }
+ } else {
+ mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+ }
+ printk(KERN_DEBUG PFX "Retrying send %p: retries: %u, retries_left: %u, timeout: %lu, total_timeout: %lu\n",
+ mad_send_wr,
+ mad_send_wr->send_buf.retries,
+ mad_send_wr->retries_left,
+ mad_send_wr->timeout,
+ mad_send_wr->total_timeout);
+
if (mad_send_wr->mad_agent_priv->agent.rmpp_version) {
ret = ib_retry_rmpp(mad_send_wr);
switch (ret) {
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index 9430ab4..01fb7ed 100644
--- a/drivers/infiniband/core/mad_priv.h
+++ b/drivers/infiniband/core/mad_priv.h
@@ -132,8 +132,10 @@ struct ib_mad_send_wr_private {
struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG];
__be64 tid;
unsigned long timeout;
+ unsigned long total_timeout;
int max_retries;
int retries_left;
+ int randomized_wait;
int retry;
int refcount;
enum ib_wc_status status;
diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index d3b9401..c3d6efb 100644
--- a/include/rdma/ib_mad.h
+++ b/include/rdma/ib_mad.h
@@ -77,6 +77,15 @@
#define IB_MGMT_MAX_METHODS 128
+/* MAD Status field bit masks */
+#define IB_MGMT_MAD_STATUS_SUCCESS 0x0000
+#define IB_MGMT_MAD_STATUS_BUSY 0x0001
+#define IB_MGMT_MAD_STATUS_REDIRECT_REQD 0x0002
+#define IB_MGMT_MAD_STATUS_BAD_VERERSION 0x0004
+#define IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD 0x0008
+#define IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB 0x000c
+#define IB_MGMT_MAD_STATUS_INVALID_ATTRIB_VALUE 0x001c
+
/* RMPP information */
#define IB_MGMT_RMPP_VERSION 1
@@ -246,6 +255,7 @@ struct ib_mad_send_buf {
int seg_count;
int seg_size;
int timeout_ms;
+ int randomized_wait;
int retries;
};
[-- Attachment #2: randomized_mad_timeout.patch --]
[-- Type: application/octet-stream, Size: 4914 bytes --]
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index ef1304f..3b03f1c 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -42,6 +42,11 @@
#include "smi.h"
#include "agent.h"
+#include "linux/random.h"
+
+#define MAD_MIN_TIMEOUT_MS 511
+#define MAD_RAND_TIMEOUT_MS 511
+
MODULE_LICENSE("Dual BSD/GPL");
MODULE_DESCRIPTION("kernel IB MAD API");
MODULE_AUTHOR("Hal Rosenstock");
@@ -55,6 +60,10 @@ MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests
module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests");
+int mad_randomized_wait = 0;
+module_param_named(randomized_wait, mad_randomized_wait, int, 0444);
+MODULE_PARM_DESC(randomized_wait, "When true, use a randomized backoff algorithm to control retries for timeouts.");
+
static struct kmem_cache *ib_mad_cache;
static struct list_head ib_mad_port_list;
@@ -1102,11 +1111,18 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
}
mad_send_wr->tid = ((struct ib_mad_hdr *) send_buf->mad)->tid;
+
+ mad_send_wr->randomized_wait = mad_randomized_wait || send_buf->randomized_wait;
+ mad_send_wr->total_timeout = msecs_to_jiffies(send_buf->timeout_ms) * send_buf->retries;
+
/* Timeout will be updated after send completes */
mad_send_wr->timeout = msecs_to_jiffies(send_buf->timeout_ms);
+
mad_send_wr->max_retries = send_buf->retries;
mad_send_wr->retries_left = send_buf->retries;
+
send_buf->retries = 0;
+
/* Reference for work request to QP + response */
mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
mad_send_wr->status = IB_WC_SUCCESS;
@@ -1803,6 +1819,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
/* Complete corresponding request */
if (ib_response_mad(mad_recv_wc->recv_buf.mad)) {
+
spin_lock_irqsave(&mad_agent_priv->lock, flags);
mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc);
if (!mad_send_wr) {
@@ -1811,6 +1828,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
deref_mad_agent(mad_agent_priv);
return;
}
+
ib_mark_mad_done(mad_send_wr);
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
@@ -2429,14 +2447,33 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
{
int ret;
- if (!mad_send_wr->retries_left)
+ if (!mad_send_wr->retries_left || (mad_send_wr->total_timeout == 0))
return -ETIMEDOUT;
mad_send_wr->retries_left--;
mad_send_wr->send_buf.retries++;
- mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+ if (mad_send_wr->randomized_wait) {
+ mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms +
+ (MAD_MIN_TIMEOUT_MS<<mad_send_wr->send_buf.retries) -
+ (random32()&MAD_RAND_TIMEOUT_MS));
+ if (mad_send_wr->timeout > mad_send_wr->total_timeout) {
+ mad_send_wr->timeout = mad_send_wr->total_timeout;
+ mad_send_wr->total_timeout = 0;
+ } else {
+ mad_send_wr->total_timeout -= mad_send_wr->timeout;
+ }
+ } else {
+ mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+ }
+ printk(KERN_DEBUG PFX "Retrying send %p: retries: %u, retries_left: %u, timeout: %lu, total_timeout: %lu\n",
+ mad_send_wr,
+ mad_send_wr->send_buf.retries,
+ mad_send_wr->retries_left,
+ mad_send_wr->timeout,
+ mad_send_wr->total_timeout);
+
if (mad_send_wr->mad_agent_priv->agent.rmpp_version) {
ret = ib_retry_rmpp(mad_send_wr);
switch (ret) {
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index 9430ab4..01fb7ed 100644
--- a/drivers/infiniband/core/mad_priv.h
+++ b/drivers/infiniband/core/mad_priv.h
@@ -132,8 +132,10 @@ struct ib_mad_send_wr_private {
struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG];
__be64 tid;
unsigned long timeout;
+ unsigned long total_timeout;
int max_retries;
int retries_left;
+ int randomized_wait;
int retry;
int refcount;
enum ib_wc_status status;
diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
index d3b9401..c3d6efb 100644
--- a/include/rdma/ib_mad.h
+++ b/include/rdma/ib_mad.h
@@ -77,6 +77,15 @@
#define IB_MGMT_MAX_METHODS 128
+/* MAD Status field bit masks */
+#define IB_MGMT_MAD_STATUS_SUCCESS 0x0000
+#define IB_MGMT_MAD_STATUS_BUSY 0x0001
+#define IB_MGMT_MAD_STATUS_REDIRECT_REQD 0x0002
+#define IB_MGMT_MAD_STATUS_BAD_VERERSION 0x0004
+#define IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD 0x0008
+#define IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB 0x000c
+#define IB_MGMT_MAD_STATUS_INVALID_ATTRIB_VALUE 0x001c
+
/* RMPP information */
#define IB_MGMT_RMPP_VERSION 1
@@ -246,6 +255,7 @@ struct ib_mad_send_buf {
int seg_count;
int seg_size;
int timeout_ms;
+ int randomized_wait;
int retries;
};
^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH] Add exponential backoff + random delay to MADs when retrying after timeout.
[not found] ` <4C2744E8AD2982428C5BFE523DF8CDCB49D4675DEB-amwN6d8PyQWXx9kJd3VG2h2eb7JE58TQ@public.gmane.org>
@ 2010-10-28 13:12 ` Hal Rosenstock
[not found] ` <4CC976CC.1080009-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
0 siblings, 1 reply; 8+ messages in thread
From: Hal Rosenstock @ 2010-10-28 13:12 UTC (permalink / raw)
To: Mike Heinz; +Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
On 10/26/2010 2:33 PM, Mike Heinz wrote:
> Resending. Didn't get any reply after the last posting.
>
> -----Original Message-----
> From: Mike Heinz
> Sent: Monday, October 11, 2010 11:34 AM
> To: 'linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org'
> Subject: [PATCH] Add exponential backoff + random delay to MADs when retrying after timeout.
>
> This patch builds upon a discussion we had earlier this year on adding a backoff function when retrying MAD sends after a timeout.
Agreed that the fixed timer retry strategy is b0rken for some scenarios
(e.g. BUSY handling). I think a description of this should be added to
the patch description as background/motivation. You had previously written:
"The current behavior is to simply return the BUSY to the client or ULP,
which is either treated as a permanent error or causes an immediate
retry. This can be a big problem with, for example, ipoib which sets
retries to 15 and (as I understand it) immediately retries to connect
when getting an error response from the SA. Other ulps have similar
settings. Without some kind of delay, starting up ipoib on a large
fabric (at boot time, for example) can cause a real packet storm.
By treating BUSY replies identically to timeouts, this patch at least
introduces a delay between attempts. In the case of the ULPs, the delay
is typically 4 seconds.
This approach encourages applications to adjust their timeouts
appropriately by treating BUSY responses as non-events and forcing the
applications to wait for their request to time out.
Depending on the application developers to take BUSY responses into
account seems to be asking for trouble - it allows one rogue app to
bring the SA to its knees, for example. By enforcing this timeout model
in the kernel, we guarantee that there will be at least some delay
between each message when the SA is reporting a busy status."
Maybe some shorter version of the above should be part of this and/or
your subsequent busy handling patch.
> This patch does NOT implement the ABI/API changes that would be needed to take advantage of the new features, but it lays the groundwork for doing so. In addition, it provides a new module parameter that allows the administrator to coerce existing code into using the new capability.
>
> First, I've added a new field called "randomized_wait" to the ib_mad_send_buf structure. If this parameter is set, each time the WR times out, the timeout for the next retry is set to (send_wr->timeout_ms + (511 << send_wr->retries) - (random32() & 511)). In other words, on the first retry, the randomization code will add between 0 and 1/2 second to the timeout. On the second retry, it will add between 1 and 1.5 seconds to the timeout, on the 3rd, between 2 and 2.5 seconds, on the 4th, between 4 and 4.5, et cetera.
What experience/confidence is there in this (specific) randomization
policy ? On what (how large) IB cluster sizes has this policy been tried
? Is this specific policy modeled from other policies in use elsewhere ?
Also, is this randomized timeout used on RMPP packets if this parameter
is not 0 ?
> In addition, a new field, total_timeout has been added to the ib_mad_send_wr_private and is initialized to (send_wr->timeout * send_wr->max_retries). Retries cannot exceed this total time, even though that will mean a lower number of retry attempts.
Why is the total timeout more important than number of retries in terms
of terminating the transaction ?
Shouldn't there be another parameter as to whether this limits the
retries or whether the number of retries should be the limiting factor ?
I'm not sure about reducing the number of retries on a large fabric. I
think the typical number used is 3 so this would be at most 2 depending
on the per retry timeout. I think the default policy should be to
preserve the number of retries rather than the total timeout.
> Finally, I've added a module parameter to coerce all mad work requests to use this feature if desired.
On one hand, I don't want to introduce unneeded parameters/complexity
but I'm wondering whether more granularity is useful on which requests
(classes ?) this applies to. For example, should SM requests be
randomized ? This feature is primarily an SA thing although busy can be
used for other management classes but its use is mainly GS related.
> parm: randomized_wait:When true, use a randomized backoff algorithm to control retries for timeouts. (int)
>
> --------
>
> diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index ef1304f..3b03f1c 100644
> --- a/drivers/infiniband/core/mad.c
> +++ b/drivers/infiniband/core/mad.c
> @@ -42,6 +42,11 @@
> #include "smi.h"
> #include "agent.h"
>
> +#include "linux/random.h"
> +
> +#define MAD_MIN_TIMEOUT_MS 511
> +#define MAD_RAND_TIMEOUT_MS 511
> +
> MODULE_LICENSE("Dual BSD/GPL");
> MODULE_DESCRIPTION("kernel IB MAD API"); MODULE_AUTHOR("Hal Rosenstock"); @@ -55,6 +60,10 @@ MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests module_param_named(recv_queue_size, mad_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests");
>
> +int mad_randomized_wait = 0;
> +module_param_named(randomized_wait, mad_randomized_wait, int, 0444);
> +MODULE_PARM_DESC(randomized_wait, "When true, use a randomized backoff
> +algorithm to control retries for timeouts.");
> +
> static struct kmem_cache *ib_mad_cache;
>
> static struct list_head ib_mad_port_list; @@ -1102,11 +1111,18 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
> }
>
> mad_send_wr->tid = ((struct ib_mad_hdr *) send_buf->mad)->tid;
> +
> + mad_send_wr->randomized_wait = mad_randomized_wait || send_buf->randomized_wait;
> + mad_send_wr->total_timeout = msecs_to_jiffies(send_buf->timeout_ms) *
> +send_buf->retries;
> +
> /* Timeout will be updated after send completes */
> mad_send_wr->timeout = msecs_to_jiffies(send_buf->timeout_ms);
> +
> mad_send_wr->max_retries = send_buf->retries;
> mad_send_wr->retries_left = send_buf->retries;
> +
> send_buf->retries = 0;
> +
> /* Reference for work request to QP + response */
> mad_send_wr->refcount = 1 + (mad_send_wr->timeout> 0);
> mad_send_wr->status = IB_WC_SUCCESS;
> @@ -1803,6 +1819,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
>
> /* Complete corresponding request */
> if (ib_response_mad(mad_recv_wc->recv_buf.mad)) {
> +
> spin_lock_irqsave(&mad_agent_priv->lock, flags);
> mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc);
> if (!mad_send_wr) {
> @@ -1811,6 +1828,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
> deref_mad_agent(mad_agent_priv);
> return;
> }
> +
Nit: no need for extra lines in this and above.
> ib_mark_mad_done(mad_send_wr);
> spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
>
> @@ -2429,14 +2447,33 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr) {
> int ret;
>
> - if (!mad_send_wr->retries_left)
> + if (!mad_send_wr->retries_left || (mad_send_wr->total_timeout == 0))
> return -ETIMEDOUT;
>
> mad_send_wr->retries_left--;
> mad_send_wr->send_buf.retries++;
>
> - mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
> + if (mad_send_wr->randomized_wait) {
> + mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms +
> + (MAD_MIN_TIMEOUT_MS<<mad_send_wr->send_buf.retries) -
> + (random32()&MAD_RAND_TIMEOUT_MS));
> + if (mad_send_wr->timeout> mad_send_wr->total_timeout) {
> + mad_send_wr->timeout = mad_send_wr->total_timeout;
> + mad_send_wr->total_timeout = 0;
> + } else {
> + mad_send_wr->total_timeout -= mad_send_wr->timeout;
> + }
> + } else {
> + mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
> + }
>
> + printk(KERN_DEBUG PFX "Retrying send %p: retries: %u, retries_left: %u, timeout: %lu, total_timeout: %lu\n",
> + mad_send_wr,
> + mad_send_wr->send_buf.retries,
> + mad_send_wr->retries_left,
> + mad_send_wr->timeout,
> + mad_send_wr->total_timeout);
> +
> if (mad_send_wr->mad_agent_priv->agent.rmpp_version) {
> ret = ib_retry_rmpp(mad_send_wr);
> switch (ret) {
> diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
> index 9430ab4..01fb7ed 100644
> --- a/drivers/infiniband/core/mad_priv.h
> +++ b/drivers/infiniband/core/mad_priv.h
> @@ -132,8 +132,10 @@ struct ib_mad_send_wr_private {
> struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG];
> __be64 tid;
> unsigned long timeout;
> + unsigned long total_timeout;
> int max_retries;
> int retries_left;
> + int randomized_wait;
> int retry;
> int refcount;
> enum ib_wc_status status;
> diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index d3b9401..c3d6efb 100644
> --- a/include/rdma/ib_mad.h
> +++ b/include/rdma/ib_mad.h
> @@ -77,6 +77,15 @@
>
> #define IB_MGMT_MAX_METHODS 128
>
> +/* MAD Status field bit masks */
> +#define IB_MGMT_MAD_STATUS_SUCCESS 0x0000
> +#define IB_MGMT_MAD_STATUS_BUSY 0x0001
> +#define IB_MGMT_MAD_STATUS_REDIRECT_REQD 0x0002
> +#define IB_MGMT_MAD_STATUS_BAD_VERERSION 0x0004
> +#define IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD 0x0008
> +#define IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB 0x000c
> +#define IB_MGMT_MAD_STATUS_INVALID_ATTRIB_VALUE 0x001c
> +
Another nit: Although this does no harm, these don't look needed/used in
this patch so I don't think they should be part of this and moved to
your subsequent busy patch.
-- Hal
> /* RMPP information */
> #define IB_MGMT_RMPP_VERSION 1
>
> @@ -246,6 +255,7 @@ struct ib_mad_send_buf {
> int seg_count;
> int seg_size;
> int timeout_ms;
> + int randomized_wait;
> int retries;
> };
>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH v2] Add exponential backoff + random delay to MADs when retrying after timeout.
[not found] ` <4CC976CC.1080009-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
@ 2010-11-02 21:58 ` Mike Heinz
[not found] ` <4C2744E8AD2982428C5BFE523DF8CDCB4A207A80C5-amwN6d8PyQWXx9kJd3VG2h2eb7JE58TQ@public.gmane.org>
0 siblings, 1 reply; 8+ messages in thread
From: Mike Heinz @ 2010-11-02 21:58 UTC (permalink / raw)
To: Hal Rosenstock, Sean Hefty
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Todd Rimmer
[-- Attachment #1: Type: text/plain, Size: 6169 bytes --]
Hal,
At the bottom of this is a slight rewrite of my previous email (and a tweak to the patch) to address your concerns and to make things more clear. Other items are answered inline.
>What experience/confidence is there in this (specific) randomization
>policy ? On what (how large) IB cluster sizes has this policy been tried
>? Is this specific policy modeled from other policies in use elsewhere ?
To explicitly discuss this: The old Infinicon stack added 1 second on each successive retry, but didn't randomize. I modeled this algorithm after the Ethernet model but I chose the terms to be on the same order of magnitude as we typically use for MAD timeouts. I can't claim to have any special experience showing this particular policy is best except to say that the principles are sound.
>Also, is this randomized timeout used on RMPP packets if this parameter
> is not 0 ?
If the module parameter is non-zero then yes, it will coerce all timeouts for all MAD requests to randomize. Keep in mind that this code doesn't change how packets are processed when they timeout, it just changes how the timeout is calculated.
>> Finally, I've added a module parameter to coerce all mad work requests to use this feature if desired.
>On one hand, I don't want to introduce unneeded parameters/complexity
>but I'm wondering whether more granularity is useful on which requests
>(classes ?) this applies to. For example, should SM requests be
>randomized ? This feature is primarily an SA thing although busy can be
>used for other management classes but its use is mainly GS related.
First, I think we should separate this from the BUSY handling issue - not because they aren't connected but because every time I start focusing on these things I promptly get yanked onto something else. Hopefully we can focus on just the randomization aspect and bring it to a satisfactory agreement first, then I'll re-submit the BUSY handling patch based on that.
That said, there's been some argument over whether the best place for choosing the retry policy is in ib_mad or in the individual ulps and apps. The intent of the module parameter is to provide relief on larger clusters while waiting for the authors of other components to modify their models. I do also think randomizing on retry is just as applicable for SM requests as for SA - if requests are timing out, then the SA/SM is getting overloaded, regardless of the type of request.
-----------------------------
Design notes:
This patch builds upon a discussion we had earlier this year on adding a
backoff function when retrying MAD sends after a timeout.
The current behavior is to retry MAD requests at a fixed interval, specified by
the caller, and no more than the number of times specified by the caller.
The problem with this approach is that if the same application or ulp is
installed on many hundreds (or thousands) of nodes, all using the same retry
interval, they could all end up retrying at roughly the same time, causing
repeatable packet storms. On a large cluster, these storms can effectively act
as a denial of service attack. To get around this, the retry timer should have
a randomization component of a similar order of magnitude as the retries
themselves. Since retries are usually on the order of one second, the patch
defines the randomization component as between zero and roughly 1/2 second
(511 ms) although the upper limit can be tuned by changing a #define.
The other standard method for preventing storms of retries is to implement an
exponential backoff, such as is used in the Ethernet protocol. However, because
the user has also explicitly specified a timeout value, I chose to treat
that value as a minimum delay, then I add an exponential value on top of that,
defined as BASE*2^c, where 'c' is the number of retries already attempted,
minus 1.
Currently, the base value is defined as 511 ms (1/2 second), so that the
retry interval is defined as:
(minimum timeout) + 511<<c - (random value between 0 & 511)
This causes the following retry times:
0: minimum timeout
1: minimum timeout + (random value between 0 & 511)
2: minimum timeout + 1 second - (random value between 0 & 511)
3: minimum timeout + 2 seconds - (random value between 0 & 511)
4: minimum timeout + 4 seconds - (random value between 0 & 511)
.
.
.
c: minimum timeout + (1/2 second)*2^c - (random value between 0 & 511)
(For comparison, the old Silverstorm/Infinicon stack waited 1 second *
the number of retries.)
------------------------
Implementation:
This patch does NOT implement the ABI/API changes that would be needed to take
advantage of the new features, but it lays the groundwork for doing so. In
addition, it provides a new module parameter that allows the administrator to
coerce existing code into using the new capability:
parm: randomized_wait: When true, use a randomized backoff algorithm to control
retries for timeouts. (int)
Note that this parameter will not force retries if the caller specified 0
retries.
Next, I've added a new field called "randomized_wait" to the ib_mad_send_buf
structure. If this parameter is set, each time the WR times out, the timeout
for the next retry is set to
(send_wr->timeout_ms + 511<<(send_wr->retries) - random32()&511).
In other words, on the first retry, the randomization code will add between 0
and 1/2 second to the timeout. On the second retry, it will add between 0.5 and
1.0 seconds to the timeout, on the 3rd, between 1.5 and 2 seconds, on the 4th,
between 3.5 and 4, et cetera. In addition, a new field, total_timeout has been
added to the ib_mad_send_wr_private. My plan is that if the caller specifies
randomized retries, total_timeout will be set to send_wr->timeout and
send_wr->timeout will be set to the base (default) timeout as its initial
value. If randomized_wait is set, total_timeout will instead be set to
(send_wr->timeout * send_wr->max_retries).
In either case, retries cannot exceed this total time, even if that will mean a
lower number of retry attempts.
[-- Attachment #2: randomized_mad_timeout.patch --]
[-- Type: application/octet-stream, Size: 4768 bytes --]
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index d5dff2c..4f2394c 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -42,6 +42,11 @@
#include "agent.h"
#include "linux/utsname.h"
+#include "linux/random.h"
+
+#define MAD_MIN_TIMEOUT_MS 511
+#define MAD_RAND_TIMEOUT_MS 511
+
MODULE_LICENSE("Dual BSD/GPL");
MODULE_DESCRIPTION("kernel IB MAD API");
MODULE_AUTHOR("Hal Rosenstock");
@@ -55,6 +60,10 @@ MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests
module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests");
+int mad_randomized_wait = 0;
+module_param_named(randomized_wait, mad_randomized_wait, int, 0444);
+MODULE_PARM_DESC(randomized_wait, "When true, use a randomized backoff algorithm to control retries for timeouts.");
+
static struct kmem_cache *ib_mad_cache;
static struct list_head ib_mad_port_list;
@@ -1134,11 +1143,29 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
}
mad_send_wr->tid = ((struct ib_mad_hdr *) send_buf->mad)->tid;
+
+ mad_send_wr->randomized_wait = mad_randomized_wait || send_buf->randomized_wait;
+ if (send_buf->randomized_wait) {
+ // In this mode, timeout_ms refers to the total time
+ // the caller is willing to wait for a reply, and
+ // the initial wait time is set to the minimum.
+ mad_send_wr->total_timeout = msecs_to_jiffies(send_buf->timeout_ms);
+ send_buf->timeout_ms = MAD_MIN_TIMEOUT_MS;
+ } else {
+ // In this mode, the caller specified a specific number
+ // of retries and the time to wait for each retry.
+ mad_send_wr->total_timeout = msecs_to_jiffies(send_buf->timeout_ms)
+ * send_buf->retries;
+ }
+
/* Timeout will be updated after send completes */
mad_send_wr->timeout = msecs_to_jiffies(send_buf->timeout_ms);
+
mad_send_wr->max_retries = send_buf->retries;
mad_send_wr->retries_left = send_buf->retries;
+
send_buf->retries = 0;
+
/* Reference for work request to QP + response */
mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
mad_send_wr->status = IB_WC_SUCCESS;
@@ -1846,6 +1873,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
/* Complete corresponding request */
if (ib_response_mad(mad_recv_wc->recv_buf.mad)) {
+
spin_lock_irqsave(&mad_agent_priv->lock, flags);
mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc);
if (mad_send_wr) {
@@ -2477,14 +2505,38 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
{
int ret;
- if (!mad_send_wr->retries_left)
+ if (!mad_send_wr->retries_left || (mad_send_wr->total_timeout == 0))
return -ETIMEDOUT;
mad_send_wr->retries_left--;
mad_send_wr->send_buf.retries++;
- mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+ if (mad_send_wr->randomized_wait) {
+ mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms +
+ (MAD_MIN_TIMEOUT_MS<<mad_send_wr->send_buf.retries) -
+ (random32()&MAD_RAND_TIMEOUT_MS));
+ if (mad_send_wr->timeout > mad_send_wr->total_timeout) {
+ mad_send_wr->timeout = mad_send_wr->total_timeout;
+ mad_send_wr->total_timeout = 0;
+
+ // Make sure we don't fall below the minimum time out.
+ if (mad_send_wr->timeout < msecs_to_jiffies(MAD_MIN_TIMEOUT_MS)) {
+ return -ETIMEDOUT;
+ }
+ } else {
+ mad_send_wr->total_timeout -= mad_send_wr->timeout;
+ }
+ } else {
+ mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+ }
+ printk(KERN_DEBUG PFX "Retrying send %p: retries: %u, retries_left: %u, timeout: %lu, total_timeout: %lu\n",
+ mad_send_wr,
+ mad_send_wr->send_buf.retries,
+ mad_send_wr->retries_left,
+ mad_send_wr->timeout,
+ mad_send_wr->total_timeout);
+
if (mad_send_wr->mad_agent_priv->agent.rmpp_version) {
ret = ib_retry_rmpp(mad_send_wr);
switch (ret) {
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index 8b4df0a..390c8b8 100644
--- a/drivers/infiniband/core/mad_priv.h
+++ b/drivers/infiniband/core/mad_priv.h
@@ -133,8 +133,10 @@ struct ib_mad_send_wr_private {
struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG];
__be64 tid;
unsigned long timeout;
+ unsigned long total_timeout;
int max_retries;
int retries_left;
+ int randomized_wait;
int retry;
int refcount;
enum ib_wc_status status;
diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
index 5916617..851b3ca 100644
--- a/include/rdma/ib_mad.h
+++ b/include/rdma/ib_mad.h
@@ -246,6 +246,7 @@ struct ib_mad_send_buf {
int seg_count;
int seg_size;
int timeout_ms;
+ int randomized_wait;
int retries;
};
^ permalink raw reply related [flat|nested] 8+ messages in thread
* RE: [PATCH v2] Add exponential backoff + random delay to MADs when retrying after timeout.
[not found] ` <4C2744E8AD2982428C5BFE523DF8CDCB4A207A80C5-amwN6d8PyQWXx9kJd3VG2h2eb7JE58TQ@public.gmane.org>
@ 2010-11-02 23:12 ` Hefty, Sean
[not found] ` <CF9C39F99A89134C9CF9C4CCB68B8DDF25B83084D5-osO9UTpF0USkrb+BlOpmy7fspsVTdybXVpNB7YpNyf8@public.gmane.org>
0 siblings, 1 reply; 8+ messages in thread
From: Hefty, Sean @ 2010-11-02 23:12 UTC (permalink / raw)
To: Mike Heinz, Hal Rosenstock
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Todd Rimmer
> The problem with this approach is that if the same application or ulp is
> installed on many hundreds (or thousands) of nodes, all using the same
> retry
> interval, they could all end up retrying at roughly the same time, causing
> repeatable packet storms. On a large cluster, these storms can effectively
> act
> as a denial of service attack. To get around this, the retry timer should
> have
> a randomization component of a similar order of magnitude as the retries
> themselves. Since retries are usually on the order of one second, the patch
> defines the randomization component as between zero and roughly 1/2 second
> (511 ms) although the upper limit can tuned by changing a #define.
>
> The other standard method for prevent storms of retries is to implement an
> exponential backoff, such as is used in the Ethernet protocol. However,
> because
> the user has also explicitly specified a timeout value, I chose to treat
> that value as a minimum delay, then I add an exponential value on top of
> that,
> defined as BASE*2^c, where 'c' is the number of retries already attempted,
> minus 1.
>
> Currently, the base value is defined as 511 ms (1/2 second), so that the
> retry interval is defined as:
>
> (minimum timeout) + 511<<c - (random value between 0 & 511)
>
> This causes the following retry times:
>
> 0: minimum timeout
> 1: minimum timeout + (random value between 0 & 511)
> 2: minimum timeout + 1 second - (random value between 0 & 511)
> 3: minimum timeout + 2 seconds - (random value between 0 & 511)
> 4: minimum timeout + 4 seconds - (random value between 0 & 511)
When you consider RMPP, the timeout/retry values specified by the user are not straightforward in their meaning. I haven't looked at this patch in detail yet, but how do the timeout changes work with RMPP MADs? Is the timeout reset to the minimum after an ACK is received?
My personal preference at this time is to push more intelligence into the timeout/retry algorithm used by the MAD layer, but restricted to SA clients. I'd like to see even more randomization in the retry time, coupled with a TCP-like congestion windowing implementation when issuing SA queries.
For example: Never allow more than, say, 8 SA queries outstanding at a time. If an SA query times out, reduce the number of outstanding queries to 1 until we get a response, then double the number of queries allowed to be outstanding until we reach the max. Have the mad layer calculate the SA query timeout based on the actual SA response time, with randomization based on that. The user specified timeout value can basically be ignored.
The only reason I'm suggesting we restrict the algorithm to SA queries is to avoid storing per endpoint information. That may be better handled by the CM (since CM responses are sends).
Given all this, then I think it would be okay to accept the patch to drop busy responses from the SA until this framework is in place, which wouldn't be until 2.6.38 or 39.
- Sean
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [PATCH v2] Add exponential backoff + random delay to MADs when retrying after timeout.
[not found] ` <CF9C39F99A89134C9CF9C4CCB68B8DDF25B83084D5-osO9UTpF0USkrb+BlOpmy7fspsVTdybXVpNB7YpNyf8@public.gmane.org>
@ 2010-11-03 15:36 ` Mike Heinz
[not found] ` <4C2744E8AD2982428C5BFE523DF8CDCB4A207A813F-amwN6d8PyQWXx9kJd3VG2h2eb7JE58TQ@public.gmane.org>
0 siblings, 1 reply; 8+ messages in thread
From: Mike Heinz @ 2010-11-03 15:36 UTC (permalink / raw)
To: Hefty, Sean, Hal Rosenstock
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Todd Rimmer
Sean said:
> When you consider RMPP, the timeout/retry values specified by
> the user are not straightforward in their meaning. I haven't
> look at this patch in detail yet, but how do the timeout
> changes work with RMPP MADs? Is the timeout reset to the
> minimum after an ACK is received?
Hal asked the same thing - and I'm confused because I thought
that if receiving an RMPP response times out, the entire transaction is
aborted.
First, the existing code - before I patched it - doesn't distinguish
between RMPP and regular MADs when dealing with timeouts.
Second, the spec says (on p 788):
| If the Receiver does not receive all the packets in this transaction within
| its transaction timer, it ABORTs the transaction and terminates.
As far as I can tell, that's what the current ib_mad module implements -
if the entire transaction doesn't complete with the receiver-specified
time out, the entire thing is retried.
> My personal preference at this time is to push more intelligence
> into the timeout/retry algorithm used by the MAD layer, but
> restricted to SA clients. I'd like to see even more randomization
> in the retry time, coupled with a TCP-like congestion windowing
> implementation when issuing SA queries.
> For example: Never allow more than, say, 8 SA queries outstanding
> at a time. If an SA query times out, reduce the number of
> outstanding queries to 1 until we get a response, then double the
> number of queries allowed to be outstanding until we reach the max.
> Have the mad layer calculate the SA query timeout based on the
> actual SA response time, with randomization based on that. The
> user specified timeout value can basically be ignored.
>The only reason I'm suggesting we restrict the algorithm to SA
> queries is to avoid storing per endpoint information. That may
> be better handled by the CM (since CM responses are sends).
> Given all this, then I think it would be okay to accept the
> patch to drop busy responses from the SA until this framework
> is in place, which wouldn't be until 2.6.38 or 39.
I'm open to this, but do we really need TCP/IP level congestion
control? How many nodes are likely to have more than a few SA
queries outstanding at a time?
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [PATCH v2] Add exponential backoff + random delay to MADs when retrying after timeout.
[not found] ` <4C2744E8AD2982428C5BFE523DF8CDCB4A207A813F-amwN6d8PyQWXx9kJd3VG2h2eb7JE58TQ@public.gmane.org>
@ 2010-11-03 16:03 ` Hefty, Sean
[not found] ` <CF9C39F99A89134C9CF9C4CCB68B8DDF25B8308894-osO9UTpF0USkrb+BlOpmy7fspsVTdybXVpNB7YpNyf8@public.gmane.org>
0 siblings, 1 reply; 8+ messages in thread
From: Hefty, Sean @ 2010-11-03 16:03 UTC (permalink / raw)
To: Mike Heinz, Hal Rosenstock
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Todd Rimmer
> Hal asked the same thing - and I'm confused because I thought
> that if receiving an RMPP response times out, the entire transaction is
> aborted.
RMPP still uses retries. If the user specifies a timeout of 1 second, with 3 retries, _each_ RMPP window will be retried up to 3 times, waiting for an ACK. Once an ACK is received, the next window can be retried up to 3 times, with a 1 second timeout per ACK, etc. It looks like your patch increments the timeout, and the increment is maintained across windows.
> I'm open to this, but do we really need TCP/IP level congestion
> control? How many nodes are likely to have more than a few SA
> queries outstanding at a time?
With large MPI job startup, we could have hundreds or thousands of SA queries issued from a single node. Even if the number of requests per node is small, the intent is to have all nodes back off from flooding the SA. So, I would say, yes, we want something like TCP congestion control. A delay in a response seems more likely to be a result of the SA being flooded with requests than an actual packet being dropped.
This would also allow a node to delay sending any SA query after receiving a busy response to one. Caching data can help here, but we get the data from the SA first, plus still be able to handle errors, topology changes, QoS, etc.
- Sean
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [PATCH v2] Add exponential backoff + random delay to MADs when retrying after timeout.
[not found] ` <CF9C39F99A89134C9CF9C4CCB68B8DDF25B8308894-osO9UTpF0USkrb+BlOpmy7fspsVTdybXVpNB7YpNyf8@public.gmane.org>
@ 2010-11-03 17:22 ` Mike Heinz
0 siblings, 0 replies; 8+ messages in thread
From: Mike Heinz @ 2010-11-03 17:22 UTC (permalink / raw)
To: Hefty, Sean, Hal Rosenstock
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Todd Rimmer
Okay - I see that; but it's independent of the code I've patched. Basically, if an RMPP is requested it won't be affected by this change, it will continue to use the existing algorithm - which appears to use a maximum 2 second time out for the first try of each segment (mad_rmpp.c).
Reviewing the code, under the existing code I think that if the segment is retried it will use the value that was provided for the caller, in the patched version it would be the caller's value plus the randomization algorithm. So, the risk I see is that total_timeout could expire while
still sending rmpp packets, if each segment experiences time outs.
On the subject of ignoring the value of the time out passed in by the caller - back in June we had talked about a model where the caller specified a total time to wait, regardless of how many retries are involved. I still think that idea has some merit; it still gives the developer some control over how long they will be waiting.
-----Original Message-----
From: Hefty, Sean [mailto:sean.hefty-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org]
Sent: Wednesday, November 03, 2010 12:03 PM
To: Mike Heinz; Hal Rosenstock
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; Todd Rimmer
Subject: RE: [PATCH v2] Add exponential backoff + random delay to MADs when retrying after timeout.
> Hal asked the same thing - and I'm confused because I thought
> that if receiving an RMPP response times out, the entire transaction is
> aborted.
RMPP still uses retries. If the user specifies a timeout of 1 second, with 3 retries, _each_ RMPP window will be retried up to 3 times, waiting for an ACK. Once an ACK is received, the next window can be retried up to 3 times, with a 1 second timeout per ACK, etc. It looks like your patch increments the timeout, and the increment is maintained across windows.
> I'm open to this, but do we really need TCP/IP level congestion
> control? How many nodes are likely to have more than a few SA
> queries outstanding at a time?
With large MPI job startup, we could have hundreds or thousands of SA queries issued from a single node. Even if the number of requests per node is small, the intent is to have all nodes back off from flooding the SA. So, I would say, yes, we want something like TCP congestion control. A delay in a response seems more likely to be a result of the SA being flooded with requests than an actual packet being dropped.
This would also allow a node to delay sending any SA query after receiving a busy response to one. Caching data can help here, but we get the data from the SA first, plus still be able to handle errors, topology changes, QoS, etc.
- Sean
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2010-11-03 17:22 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-10-26 18:33 [PATCH] Add exponential backoff + random delay to MADs when retrying after timeout Mike Heinz
[not found] ` <4C2744E8AD2982428C5BFE523DF8CDCB49D4675DEB-amwN6d8PyQWXx9kJd3VG2h2eb7JE58TQ@public.gmane.org>
2010-10-28 13:12 ` Hal Rosenstock
[not found] ` <4CC976CC.1080009-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
2010-11-02 21:58 ` [PATCH v2] " Mike Heinz
[not found] ` <4C2744E8AD2982428C5BFE523DF8CDCB4A207A80C5-amwN6d8PyQWXx9kJd3VG2h2eb7JE58TQ@public.gmane.org>
2010-11-02 23:12 ` Hefty, Sean
[not found] ` <CF9C39F99A89134C9CF9C4CCB68B8DDF25B83084D5-osO9UTpF0USkrb+BlOpmy7fspsVTdybXVpNB7YpNyf8@public.gmane.org>
2010-11-03 15:36 ` Mike Heinz
[not found] ` <4C2744E8AD2982428C5BFE523DF8CDCB4A207A813F-amwN6d8PyQWXx9kJd3VG2h2eb7JE58TQ@public.gmane.org>
2010-11-03 16:03 ` Hefty, Sean
[not found] ` <CF9C39F99A89134C9CF9C4CCB68B8DDF25B8308894-osO9UTpF0USkrb+BlOpmy7fspsVTdybXVpNB7YpNyf8@public.gmane.org>
2010-11-03 17:22 ` Mike Heinz
-- strict thread matches above, loose matches on Subject: below --
2010-10-11 15:34 [PATCH] " Mike Heinz
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.