* [PATCH v2] allow passthrough of rmpp packets to user mad clients
@ 2010-06-04 17:14 Mike Heinz
[not found] ` <4C2744E8AD2982428C5BFE523DF8CDCB49A488DAD8-amwN6d8PyQWXx9kJd3VG2h2eb7JE58TQ@public.gmane.org>
0 siblings, 1 reply; 12+ messages in thread
From: Mike Heinz @ 2010-06-04 17:14 UTC (permalink / raw)
To: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
Hal Rosenstock, Hefty, Sean, Roland Dreier
This is an update to the previous version of the patch, based on feedback from Hal.
----
Currently, if a user application calls umad_register() or umad_register_oui() with an rmpp_version of zero, incoming rmpp messages are discarded and if the rmpp_version is 1, incoming rmpp packets are collected by the kernel layer and passed as a group to the user application.
This patch changes this behavior so that rmpp_version of 255 causes incoming rmpp packets to be passed through without alteration, instead.
There are IB users who have requested the ability to perform RMPP transaction handling in user space. This was an option in old proprietary stacks and this is useful to migrate old applications to OFED while containing the scope of their application changes.
Signed-Off-By: Michael Heinz <michael.heinz-h88ZbnxC6KDQT0dZR+AlfA@public.gmane.org>
-------
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index ef1304f..efca783 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -207,12 +207,18 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
int ret2, qpn;
unsigned long flags;
u8 mgmt_class, vclass;
+ u8 rmpp_passthru = 0;
/* Validate parameters */
qpn = get_spl_qp_index(qp_type);
if (qpn == -1)
goto error1;
+ if (rmpp_version == IB_MGMT_RMPP_PASSTHRU) {
+ rmpp_passthru = 255;
+ rmpp_version = 0;
+ }
+
if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION)
goto error1;
@@ -244,6 +250,7 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
if (!is_vendor_oui(mad_reg_req->oui))
goto error1;
}
+
/* Make sure class supplied is consistent with RMPP */
if (!ib_is_mad_class_rmpp(mad_reg_req->mgmt_class)) {
if (rmpp_version)
@@ -302,6 +309,7 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
mad_agent_priv->qp_info = &port_priv->qp_info[qpn];
mad_agent_priv->reg_req = reg_req;
mad_agent_priv->agent.rmpp_version = rmpp_version;
+ mad_agent_priv->agent.rmpp_passthru = rmpp_passthru;
mad_agent_priv->agent.device = device;
mad_agent_priv->agent.recv_handler = recv_handler;
mad_agent_priv->agent.send_handler = send_handler;
@@ -1792,7 +1800,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
INIT_LIST_HEAD(&mad_recv_wc->rmpp_list);
list_add(&mad_recv_wc->recv_buf.list, &mad_recv_wc->rmpp_list);
- if (mad_agent_priv->agent.rmpp_version) {
+ if (mad_agent_priv->agent.rmpp_version && !mad_agent_priv->agent.rmpp_passthru) {
mad_recv_wc = ib_process_rmpp_recv_wc(mad_agent_priv,
mad_recv_wc);
if (!mad_recv_wc) {
@@ -1801,29 +1809,47 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
}
}
+ /*
+ * At this point, the MAD is either not an RMPP or we are passing RMPPs thru to
+ * the client.
+ */
/* Complete corresponding request */
if (ib_response_mad(mad_recv_wc->recv_buf.mad)) {
spin_lock_irqsave(&mad_agent_priv->lock, flags);
mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc);
- if (!mad_send_wr) {
+ if (mad_send_wr) {
+ ib_mark_mad_done(mad_send_wr);
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
- ib_free_recv_mad(mad_recv_wc);
- deref_mad_agent(mad_agent_priv);
- return;
- }
- ib_mark_mad_done(mad_send_wr);
- spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
- /* Defined behavior is to complete response before request */
- mad_recv_wc->wc->wr_id = (unsigned long) &mad_send_wr->send_buf;
- mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
- mad_recv_wc);
- atomic_dec(&mad_agent_priv->refcount);
+ /* Defined behavior is to complete response before request */
+ mad_recv_wc->wc->wr_id = (unsigned long) &mad_send_wr->send_buf;
+ mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
+ mad_recv_wc);
+ atomic_dec(&mad_agent_priv->refcount);
- mad_send_wc.status = IB_WC_SUCCESS;
- mad_send_wc.vendor_err = 0;
- mad_send_wc.send_buf = &mad_send_wr->send_buf;
- ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+ mad_send_wc.status = IB_WC_SUCCESS;
+ mad_send_wc.vendor_err = 0;
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+ } else {
+ if (mad_agent_priv->agent.rmpp_passthru
+ && ib_is_mad_class_rmpp(mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class)
+ && (ib_get_rmpp_flags(&((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) {
+ // user rmpp is in effect
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ mad_recv_wc->wc->wr_id = 0;
+ mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
+ mad_recv_wc);
+ atomic_dec(&mad_agent_priv->refcount);
+ } else {
+ // not user rmpp, revert to normal behavior and drop the mad
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ ib_free_recv_mad(mad_recv_wc);
+ deref_mad_agent(mad_agent_priv);
+ return;
+ }
+ }
} else {
mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
mad_recv_wc);
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index 6babb72..baa11ae 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -501,7 +501,7 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
rmpp_mad = (struct ib_rmpp_mad *) packet->mad.data;
hdr_len = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class);
- if (!ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)) {
+ if (!agent->rmpp_version || !ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)) {
copy_offset = IB_MGMT_MAD_HDR;
rmpp_active = 0;
} else {
@@ -553,14 +553,22 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
rmpp_mad->mad_hdr.tid = *tid;
}
- spin_lock_irq(&file->send_lock);
- ret = is_duplicate(file, packet);
- if (!ret)
+ if (agent->rmpp_passthru
+ && ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)
+ && (ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) {
+ spin_lock_irq(&file->send_lock);
list_add_tail(&packet->list, &file->send_list);
- spin_unlock_irq(&file->send_lock);
- if (ret) {
- ret = -EINVAL;
- goto err_msg;
+ spin_unlock_irq(&file->send_lock);
+ } else {
+ spin_lock_irq(&file->send_lock);
+ ret = is_duplicate(file, packet);
+ if (!ret)
+ list_add_tail(&packet->list, &file->send_list);
+ spin_unlock_irq(&file->send_lock);
+ if (ret) {
+ ret = -EINVAL;
+ goto err_msg;
+ }
}
ret = ib_post_send_mad(packet->msg, NULL);
diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
index d3b9401..2651e93 100644
--- a/include/rdma/ib_mad.h
+++ b/include/rdma/ib_mad.h
@@ -79,6 +79,7 @@
/* RMPP information */
#define IB_MGMT_RMPP_VERSION 1
+#define IB_MGMT_RMPP_PASSTHRU 255
#define IB_MGMT_RMPP_TYPE_DATA 1
#define IB_MGMT_RMPP_TYPE_ACK 2
@@ -360,6 +361,7 @@ struct ib_mad_agent {
u32 hi_tid;
u8 port_num;
u8 rmpp_version;
+ u8 rmpp_passthru;
};
/**
@@ -436,7 +438,9 @@ struct ib_mad_reg_req {
* wishes to receive solicited responses.
* @rmpp_version: If set, indicates that the client will send
* and receive MADs that contain the RMPP header for the given version.
- * If set to 0, indicates that RMPP is not used by this client.
+ * If set to 0, indicates that RMPP is not used by this client. If
+ * set to 255, incoming RMPP MADs are passed through to the client.
+ * Otherwise, RMPP MADs are handled according to the version #.
* @send_handler: The completion callback routine invoked after a send
* request has completed.
* @recv_handler: The completion callback routine invoked for a received
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH v2] allow passthrough of rmpp packets to user mad clients
[not found] ` <4C2744E8AD2982428C5BFE523DF8CDCB49A488DAD8-amwN6d8PyQWXx9kJd3VG2h2eb7JE58TQ@public.gmane.org>
@ 2010-06-04 19:54 ` Roland Dreier
[not found] ` <adask52o9p6.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org>
2010-06-08 16:59 ` [PATCH v2] allow passthrough of rmpp protocol " Mike Heinz
1 sibling, 1 reply; 12+ messages in thread
From: Roland Dreier @ 2010-06-04 19:54 UTC (permalink / raw)
To: Mike Heinz; +Cc: linux-rdma@vger.kernel.org, Hal Rosenstock, Hefty, Sean
> This patch changes this behavior so that rmpp_version of 255 causes incoming rmpp packets to be passed through without alteration, instead.
>
> There are IB users who have requested the ability to perform RMPP transaction handling in user space. This was an option in old proprietary stacks and this is useful to migrate old applications to OFED while containing the scope of their application changes.
I'm a little dubious about this. We have an RMPP implementation in the
kernel, and it seems worthwhile to focus on stability and features
there. Allowing alternate RMPP implementations in userspace seems a bit
iffy -- we don't have a socket option that lets us do TCP in userspace
for a given connection, for example.
--
Roland Dreier <rolandd-FYB4Gu1CFyUAvxtiuMwx3w@public.gmane.org> || For corporate legal information go to:
http://www.cisco.com/web/about/doing_business/legal/cri/index.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [PATCH v2] allow passthrough of rmpp packets to user mad clients
[not found] ` <adask52o9p6.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org>
@ 2010-06-07 17:55 ` Mike Heinz
2010-06-18 13:42 ` Mike Heinz
1 sibling, 0 replies; 12+ messages in thread
From: Mike Heinz @ 2010-06-07 17:55 UTC (permalink / raw)
To: Roland Dreier
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
Hal Rosenstock, Hefty, Sean
We also have some management applications which need these capabilities. For those applications, the use of application RMPP control allows the application to perform some pacing of the RMPP transactions, permits some parts of the RMPP response to be built on the fly and also permits a degree of sharing of the response data between multiple requestors.
-----Original Message-----
From: Roland Dreier [mailto:rdreier-FYB4Gu1CFyUAvxtiuMwx3w@public.gmane.org]
Sent: Friday, June 04, 2010 3:54 PM
To: Mike Heinz
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; Hal Rosenstock; Hefty, Sean
Subject: Re: [PATCH v2] allow passthrough of rmpp packets to user mad clients
> This patch changes this behavior so that rmpp_version of 255 causes incoming rmpp packets to be passed through without alteration, instead.
>
> There are IB users who have requested the ability to perform RMPP transaction handling in user space. This was an option in old proprietary stacks and this is useful to migrate old applications to OFED while containing the scope of their application changes.
I'm a little dubious about this. We have an RMPP implementation in the
kernel, and it seems worthwhile to focus on stability and features
there. Allowing alternate RMPP implementations in userspace seems a bit
iffy -- we don't have a socket option that lets us do TCP in userspace
for a given connection, for example.
--
Roland Dreier <rolandd-FYB4Gu1CFyUAvxtiuMwx3w@public.gmane.org> || For corporate legal information go to:
http://www.cisco.com/web/about/doing_business/legal/cri/index.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [PATCH v2] allow passthrough of rmpp protocol to user mad clients
[not found] ` <4C2744E8AD2982428C5BFE523DF8CDCB49A488DAD8-amwN6d8PyQWXx9kJd3VG2h2eb7JE58TQ@public.gmane.org>
2010-06-04 19:54 ` Roland Dreier
@ 2010-06-08 16:59 ` Mike Heinz
[not found] ` <4C2744E8AD2982428C5BFE523DF8CDCB49A488DCEE-amwN6d8PyQWXx9kJd3VG2h2eb7JE58TQ@public.gmane.org>
1 sibling, 1 reply; 12+ messages in thread
From: Mike Heinz @ 2010-06-08 16:59 UTC (permalink / raw)
To: Mike Heinz, linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
Hal Rosenstock, Hefty, Sean, Roland Dreier
On a different subject - have we come to any conclusions about this patch?
-----Original Message-----
From: linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org [mailto:linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org] On Behalf Of Mike Heinz
Sent: Friday, June 04, 2010 1:14 PM
To: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; Hal Rosenstock; Hefty, Sean; Roland Dreier
Subject: [PATCH v2] allow passthrough of rmpp protocol to user mad clients
This is an update to the previous version of the patch, based on feedback from Hal.
----
Currently, if a user application calls umad_register() or umad_register_oui() with an rmpp_version of zero, incoming rmpp messages are discarded and if the rmpp_version is 1, incoming rmpp packets are collected by the kernel layer and passed as a group to the user application.
This patch changes this behavior so that rmpp_version of 255 causes incoming rmpp packets to be passed through without alteration, instead.
There are IB users who have requested the ability to perform RMPP transaction handling in user space. This was an option in old proprietary stacks and this is useful to migrate old applications to OFED while containing the scope of their application changes.
Signed-Off-By: Michael Heinz <michael.heinz-h88ZbnxC6KDQT0dZR+AlfA@public.gmane.org>
-------
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index ef1304f..efca783 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -207,12 +207,18 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
int ret2, qpn;
unsigned long flags;
u8 mgmt_class, vclass;
+ u8 rmpp_passthru = 0;
/* Validate parameters */
qpn = get_spl_qp_index(qp_type);
if (qpn == -1)
goto error1;
+ if (rmpp_version == IB_MGMT_RMPP_PASSTHRU) {
+ rmpp_passthru = 255;
+ rmpp_version = 0;
+ }
+
if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION)
goto error1;
@@ -244,6 +250,7 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
if (!is_vendor_oui(mad_reg_req->oui))
goto error1;
}
+
/* Make sure class supplied is consistent with RMPP */
if (!ib_is_mad_class_rmpp(mad_reg_req->mgmt_class)) {
if (rmpp_version)
@@ -302,6 +309,7 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
mad_agent_priv->qp_info = &port_priv->qp_info[qpn];
mad_agent_priv->reg_req = reg_req;
mad_agent_priv->agent.rmpp_version = rmpp_version;
+ mad_agent_priv->agent.rmpp_passthru = rmpp_passthru;
mad_agent_priv->agent.device = device;
mad_agent_priv->agent.recv_handler = recv_handler;
mad_agent_priv->agent.send_handler = send_handler;
@@ -1792,7 +1800,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
INIT_LIST_HEAD(&mad_recv_wc->rmpp_list);
list_add(&mad_recv_wc->recv_buf.list, &mad_recv_wc->rmpp_list);
- if (mad_agent_priv->agent.rmpp_version) {
+ if (mad_agent_priv->agent.rmpp_version && !mad_agent_priv->agent.rmpp_passthru) {
mad_recv_wc = ib_process_rmpp_recv_wc(mad_agent_priv,
mad_recv_wc);
if (!mad_recv_wc) {
@@ -1801,29 +1809,47 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
}
}
+ /*
+ * At this point, the MAD is either not an RMPP or we are passing RMPPs thru to
+ * the client.
+ */
/* Complete corresponding request */
if (ib_response_mad(mad_recv_wc->recv_buf.mad)) {
spin_lock_irqsave(&mad_agent_priv->lock, flags);
mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc);
- if (!mad_send_wr) {
+ if (mad_send_wr) {
+ ib_mark_mad_done(mad_send_wr);
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
- ib_free_recv_mad(mad_recv_wc);
- deref_mad_agent(mad_agent_priv);
- return;
- }
- ib_mark_mad_done(mad_send_wr);
- spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
- /* Defined behavior is to complete response before request */
- mad_recv_wc->wc->wr_id = (unsigned long) &mad_send_wr->send_buf;
- mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
- mad_recv_wc);
- atomic_dec(&mad_agent_priv->refcount);
+ /* Defined behavior is to complete response before request */
+ mad_recv_wc->wc->wr_id = (unsigned long) &mad_send_wr->send_buf;
+ mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
+ mad_recv_wc);
+ atomic_dec(&mad_agent_priv->refcount);
- mad_send_wc.status = IB_WC_SUCCESS;
- mad_send_wc.vendor_err = 0;
- mad_send_wc.send_buf = &mad_send_wr->send_buf;
- ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+ mad_send_wc.status = IB_WC_SUCCESS;
+ mad_send_wc.vendor_err = 0;
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+ } else {
+ if (mad_agent_priv->agent.rmpp_passthru
+ && ib_is_mad_class_rmpp(mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class)
+ && (ib_get_rmpp_flags(&((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) {
+ // user rmpp is in effect
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ mad_recv_wc->wc->wr_id = 0;
+ mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
+ mad_recv_wc);
+ atomic_dec(&mad_agent_priv->refcount);
+ } else {
+ // not user rmpp, revert to normal behavior and drop the mad
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ ib_free_recv_mad(mad_recv_wc);
+ deref_mad_agent(mad_agent_priv);
+ return;
+ }
+ }
} else {
mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
mad_recv_wc);
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index 6babb72..baa11ae 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -501,7 +501,7 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
rmpp_mad = (struct ib_rmpp_mad *) packet->mad.data;
hdr_len = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class);
- if (!ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)) {
+ if (!agent->rmpp_version || !ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)) {
copy_offset = IB_MGMT_MAD_HDR;
rmpp_active = 0;
} else {
@@ -553,14 +553,22 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
rmpp_mad->mad_hdr.tid = *tid;
}
- spin_lock_irq(&file->send_lock);
- ret = is_duplicate(file, packet);
- if (!ret)
+ if (agent->rmpp_passthru
+ && ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)
+ && (ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) {
+ spin_lock_irq(&file->send_lock);
list_add_tail(&packet->list, &file->send_list);
- spin_unlock_irq(&file->send_lock);
- if (ret) {
- ret = -EINVAL;
- goto err_msg;
+ spin_unlock_irq(&file->send_lock);
+ } else {
+ spin_lock_irq(&file->send_lock);
+ ret = is_duplicate(file, packet);
+ if (!ret)
+ list_add_tail(&packet->list, &file->send_list);
+ spin_unlock_irq(&file->send_lock);
+ if (ret) {
+ ret = -EINVAL;
+ goto err_msg;
+ }
}
ret = ib_post_send_mad(packet->msg, NULL);
diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
index d3b9401..2651e93 100644
--- a/include/rdma/ib_mad.h
+++ b/include/rdma/ib_mad.h
@@ -79,6 +79,7 @@
/* RMPP information */
#define IB_MGMT_RMPP_VERSION 1
+#define IB_MGMT_RMPP_PASSTHRU 255
#define IB_MGMT_RMPP_TYPE_DATA 1
#define IB_MGMT_RMPP_TYPE_ACK 2
@@ -360,6 +361,7 @@ struct ib_mad_agent {
u32 hi_tid;
u8 port_num;
u8 rmpp_version;
+ u8 rmpp_passthru;
};
/**
@@ -436,7 +438,9 @@ struct ib_mad_reg_req {
* wishes to receive solicited responses.
* @rmpp_version: If set, indicates that the client will send
* and receive MADs that contain the RMPP header for the given version.
- * If set to 0, indicates that RMPP is not used by this client.
+ * If set to 0, indicates that RMPP is not used by this client. If
+ * set to 255, incoming RMPP MADs are passed through to the client.
+ * Otherwise, RMPP MADs are handled according to the version #.
* @send_handler: The completion callback routine invoked after a send
* request has completed.
* @recv_handler: The completion callback routine invoked for a received
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related [flat|nested] 12+ messages in thread
* RE: [PATCH v2] allow passthrough of rmpp protocol to user mad clients
[not found] ` <4C2744E8AD2982428C5BFE523DF8CDCB49A488DCEE-amwN6d8PyQWXx9kJd3VG2h2eb7JE58TQ@public.gmane.org>
@ 2010-06-09 16:03 ` Hefty, Sean
0 siblings, 0 replies; 12+ messages in thread
From: Hefty, Sean @ 2010-06-09 16:03 UTC (permalink / raw)
To: Mike Heinz, linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
Hal Rosenstock, Roland Dreier
> On a different subject - have we come to any conclusions about this patch?
I agree with Roland's response on this. I don't think we want to support a user space implementation of RMPP. The posted receive buffers are ultimately owned by the kernel, so it should really control the windowing. IMO, the other thread is showing that exposing simple things such as timeouts, retries, and BUSY responses to the user leads to issues; exposing the full RMPP implementation can't be better.
- Sean
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [PATCH v2] allow passthrough of rmpp packets to user mad clients
[not found] ` <adask52o9p6.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org>
2010-06-07 17:55 ` Mike Heinz
@ 2010-06-18 13:42 ` Mike Heinz
[not found] ` <4C2744E8AD2982428C5BFE523DF8CDCB49D09E7C2F-amwN6d8PyQWXx9kJd3VG2h2eb7JE58TQ@public.gmane.org>
1 sibling, 1 reply; 12+ messages in thread
From: Mike Heinz @ 2010-06-18 13:42 UTC (permalink / raw)
To: Roland Dreier
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
Hal Rosenstock, Hefty, Sean
Roland wrote:
> I'm a little dubious about this. We have an RMPP implementation in the kernel, and it seems
> worthwhile to focus on stability and features there. Allowing alternate RMPP implementations in
> userspace seems a bit iffy -- we don't have a socket option that lets us do TCP in userspace for a
> given connection, for example.
Sean wrote:
> I agree with Roland's response on this. I don't think we want to support a user space
> implementation of RMPP. The posted receive buffers are ultimately owned by the kernel, so it should
> really control the windowing. IMO, the other thread is showing that exposing simple things such as
> timeouts, retries, and BUSY responses to the user leads to issues; exposing the full RMPP
> implementation can't be better.
Hal wrote:
> I think the simplest change is to use rmpp_version 255 for this mode (and update doc/headers
> accordingly) and preserve existing rmpp_version behavior.
We've redesigned the patch to comply with Hal's feedback, it would seem that Hal is ok with this basic approach.
First addressing Roland's comment, there are in fact TCP socket options which control how much buffering is done in the kernel and hence control message size and segmentation points for TCP. Those options allow the careful balance of window size, kernel memory space and TCP performance to be tuned, the defaults for these options tend to be relatively small. This is possible for TCP since the protocol is defined at the application level as a byte stream protocol, hence it is up to the TCP stack to decide the proper segmentation points and windowing. Applications must be written to assume a recv() could return only part of a corresponding send() and could be at any arbitrary byte boundary.
Unfortunately for IB the size of an RMPP response and buffer cannot be controlled by the kernel. So if an application has a large response to send, the entire buffer must be copied into the kernel and the kernel cannot decide on its own segmentation boundaries. Hence the ability for selected management applications to control and limit the amount of kernel memory space is desirable. These issues become serious at scale when larger RMPP responses are needed and more clients may also be issuing requests. The two can combine and result in N^2 types behavior for kernel memory footprint relative to N=cluster node count or potentially N=cluster CPU core count.
To explain this, let's look at some basic RMPP queries. An end node may issue an RMPP query to a centralized entity. The size of this response can be a function of number of nodes. Let's assume the response had 100 bytes per node. At 1000 nodes, this response would be 100KB. In this case the present OFED RMPP mechanism would transfer the full 100KB into the kernel and then process RMPP out of that kernel copy. Now consider the fact that many nodes, perhaps even all, may want to issue queries at roughly the same time. In this case 1000 nodes could each have 100KB responses active, in which case there would be 100MB of RMPP data stored in the kernel. If this same example is expanded to 2000 nodes, the memory requirement grows to 400MB. At 4000 nodes it's 1.6GB. etc. Other factors
can make this even worse, for example if a given node could issue multiple queries (1 per process), etc.
Granted this is an extreme example. However use of such large amounts of kernel memory tends to be a serious issue. This is made worse by the fact the management application also may need a copy of the response to facilitate error handling, etc. In the applications I mention below, they were able to take advantage of data patterns in the responses to provide RMPP packets directly out of a single copy of the response/database. In which case by managing the RMPP protocol directly they were able to use the single copy to provide each window size worth of packets. In the 4000 node example this saved over 3 GB of RAM (1.6GB in kernel and 1.6GB-100KB in application). Saving this much ram greatly reduced swapping, avoided excessive kernel footprint, and significantly improved the application performance.
For any centralized management application that uses RMPP, the present OFED approach will suffer from this issue.
Rather than require applications with these unique requirements to invent new RMPP-like protocols on special QPs, it seems reasonable to allow applications with special scaling needs to leverage the RMPP protocol standard but have control over the kernel buffering and ack handling.
The approach we are proposing accomplishes this requirement while maintaining backward compatibility and limiting the scope of ib_mad changes. The exposure of RMPP implementation issues is limited to applications which choose to use this approach, and only applications needing such advanced capabilities would even attempt to do so. This is unlike the timeout discussion where all applications are required to select timeouts and retry counts and most, unfortunately, give limited thought to such values and often only pick values which are appropriate to the small clusters in which the development was done.
As we indicated, there are two primary reasons for our implementation of this change.
>There are QLogic customers who have requested the ability to perform
>RMPP transaction handling in user space. This was an option in our old
>proprietary stack and there are a few customers still using it which
>need a way to forward migrate to OFED while containing the scope of
>their application changes. While we have developed appropriate "shim" libraries to allow their applications to migrate, we can't simulate/shim rmpp processing without some kernel support.
The customers in point have had this exact issue and implemented techniques in their applications to manage the RMPP transactions. The old QLogic stack supported this capability and the users needed to take advantage of it. To further propagate OFED adoption we see it as desirable to permit the customer to migrate these applications easily and in a timely manner.
>We also have some management applications which also need these
>capabilities. For those applications, the use of application RMPP
>control allows the application to perform some pacing of the RMPP
>transactions, permits some parts of the RMPP response to be built on the fly and also permits a degree of sharing of the response data between multiple requestors.
We too have run into the exact same issue with our own management applications and have seen that the OFED approach can lead to large memory footprint, timeouts and other issues.
Since the servicing of RMPP requests is typically limited to a small number of nodes, one compromise might be to have a config option to enable/disable the feature. In this way only management nodes would have the feature enabled and other ULPs and applications would hence be discouraged from using it.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v2] allow passthrough of rmpp packets to user mad clients
[not found] ` <4C2744E8AD2982428C5BFE523DF8CDCB49D09E7C2F-amwN6d8PyQWXx9kJd3VG2h2eb7JE58TQ@public.gmane.org>
@ 2010-06-18 15:41 ` Jason Gunthorpe
[not found] ` <20100618154134.GA12884-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2010-06-18 17:23 ` Hefty, Sean
2010-06-23 17:26 ` Roland Dreier
2 siblings, 1 reply; 12+ messages in thread
From: Jason Gunthorpe @ 2010-06-18 15:41 UTC (permalink / raw)
To: Mike Heinz
Cc: Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
Hal Rosenstock, Hefty, Sean
On Fri, Jun 18, 2010 at 08:42:38AM -0500, Mike Heinz wrote:
> First addressing Roland's comment, there are in fact TCP socket
> options which control how much buffering is done in the kernel and
> hence control message size and segmentation points for TCP. Those
> options allow the careful balance of window size, kernel memory
> space and TCP performance to be tuned, the defaults for these
> options tend to be relatively small. This is possible for TCP since
> the protocol is defined at the application level as a byte stream
> protocol, hence it is up to the TCP stack to decide the proper
> segmentation points and windowing. Applications must be written to
> assume a recv() could return only part of a corresponding send() and
> could be at any arbitrary byte boundary.
Umh, dumb question..
Why not just add byte-stream like APIs to the kernel interface for
RMPP?
Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [PATCH v2] allow passthrough of rmpp packets to user mad clients
[not found] ` <20100618154134.GA12884-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2010-06-18 16:33 ` Hefty, Sean
[not found] ` <CF9C39F99A89134C9CF9C4CCB68B8DDF259E5A7B73-osO9UTpF0USkrb+BlOpmy7fspsVTdybXVpNB7YpNyf8@public.gmane.org>
0 siblings, 1 reply; 12+ messages in thread
From: Hefty, Sean @ 2010-06-18 16:33 UTC (permalink / raw)
To: Jason Gunthorpe, Mike Heinz
Cc: Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
Hal Rosenstock
> Why not just add byte-stream like APIs to the kernel interface for
> RMPP?
RMPP is more like IP segmentation than TCP segmentation.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v2] allow passthrough of rmpp packets to user mad clients
[not found] ` <CF9C39F99A89134C9CF9C4CCB68B8DDF259E5A7B73-osO9UTpF0USkrb+BlOpmy7fspsVTdybXVpNB7YpNyf8@public.gmane.org>
@ 2010-06-18 16:38 ` Jason Gunthorpe
[not found] ` <20100618163842.GJ4630-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
0 siblings, 1 reply; 12+ messages in thread
From: Jason Gunthorpe @ 2010-06-18 16:38 UTC (permalink / raw)
To: Hefty, Sean
Cc: Mike Heinz, Roland Dreier,
linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
Hal Rosenstock
On Fri, Jun 18, 2010 at 09:33:01AM -0700, Hefty, Sean wrote:
> > Why not just add byte-stream like APIs to the kernel interface for
> > RMPP?
>
> RMPP is more like IP segmentation than TCP segmentation.
Why do you say that?
Each RMPP session is identified by a unique ID, can transfer data
in both directions, does reassembly, re-ordering, re-transmit, and can
transfer unbounded amounts of data, without knowing how much in
advance.
Sounds exactly like TCP to me..
Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [PATCH v2] allow passthrough of rmpp packets to user mad clients
[not found] ` <20100618163842.GJ4630-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2010-06-18 16:42 ` Hefty, Sean
0 siblings, 0 replies; 12+ messages in thread
From: Hefty, Sean @ 2010-06-18 16:42 UTC (permalink / raw)
To: Jason Gunthorpe
Cc: Mike Heinz, Roland Dreier,
linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
Hal Rosenstock
> > RMPP is more like IP segmentation than TCP segmentation.
>
> Why do you say that?
I was thinking of it more as segmenting a single larger packet in a stream of packets.
> Each RMPP session is identified by a unique ID, can transfer data
> in both directions, does reassembly, re-ordering, re-transmit, and can
> transfer unbounded amounts of data, without knowing how much in
> advance.
Yes, the protocol is closer to TCP.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [PATCH v2] allow passthrough of rmpp packets to user mad clients
[not found] ` <4C2744E8AD2982428C5BFE523DF8CDCB49D09E7C2F-amwN6d8PyQWXx9kJd3VG2h2eb7JE58TQ@public.gmane.org>
2010-06-18 15:41 ` Jason Gunthorpe
@ 2010-06-18 17:23 ` Hefty, Sean
2010-06-23 17:26 ` Roland Dreier
2 siblings, 0 replies; 12+ messages in thread
From: Hefty, Sean @ 2010-06-18 17:23 UTC (permalink / raw)
To: Mike Heinz, Roland Dreier
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
Hal Rosenstock
> Unfortunately for IB the size of an RMPP response and buffer cannot be
> controlled by the kernel. So if an application has a large response to
> send, the entire buffer must be copied into the kernel and the kernel
> cannot decide on its own segmentation boundaries. Hence the ability for
> selected management applications to control and limit the amount of kernel
> memory space is desirable. These issues become serious at scale when
> larger RMPP responses are needed and more clients may also be issuing
> requests. The two can combine and result in N^2 types behavior for kernel
> memory footprint relative to N=cluster node count or potentially N=cluster
> CPU core count.
There's no requirement that an entire RMPP response be copied to the kernel before being sent. The current implementation does this, but that behavior can be modified to copy the data only when needed. The total number of outstanding sends that a client can have can also be restricted to throttle back a client trying to send large messages to everyone on the fabric.
> >There are QLogic customers who have requested the ability to perform
> >RMPP transaction handling in user space. This was an option in our old
> >proprietary stack and there are a few customers still using it which
> >need a way to forward migrate to OFED while containing the scope of
> >their application changes. While we have developed appropriate "shim"
> libraries to allow their applications to migrate, we can't simulate/shim
> rmpp processing without some kernel support.
There's nothing that prevents RMPP from running between an application and a library, with the library exchanging reassembled mads with the kernel. It may not be ideal from your perspective, but I don't see why it's not possible to support existing applications.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v2] allow passthrough of rmpp packets to user mad clients
[not found] ` <4C2744E8AD2982428C5BFE523DF8CDCB49D09E7C2F-amwN6d8PyQWXx9kJd3VG2h2eb7JE58TQ@public.gmane.org>
2010-06-18 15:41 ` Jason Gunthorpe
2010-06-18 17:23 ` Hefty, Sean
@ 2010-06-23 17:26 ` Roland Dreier
2 siblings, 0 replies; 12+ messages in thread
From: Roland Dreier @ 2010-06-23 17:26 UTC (permalink / raw)
To: Mike Heinz; +Cc: linux-rdma@vger.kernel.org, Hal Rosenstock, Hefty, Sean
> Unfortunately for IB the size of an RMPP response and buffer cannot
> be controlled by the kernel. So if an application has a large
> response to send, the entire buffer must be copied into the kernel
> and the kernel cannot decide on its own segmentation boundaries.
> Hence the ability for selected management applications to control and
> limit the amount of kernel memory space is desirable. These issues
> become serious at scale when larger RMPP responses are needed and
> more clients may also be issuing requests. The two can combine and
> result in N^2 types behavior for kernel memory footprint relative to
> N=cluster node count or potentially N=cluster CPU core count.
So this is an interesting point. However I don't think we really want
to solve this by adding some magic value of the version field that
changes the umad interface into a completely different mode and then
have every app implement its own copy of the RMPP protocol in userspace.
It would seem a lot cleaner to me to just fix up the kernel RMPP
implementation to avoid the huge double buffering; is that not possible?
The other argument (proprietary legacy apps) doesn't really carry any
weight with me. We're not going to introduce duplicate APIs just so
someone doesn't have to port old code.
- R.
--
Roland Dreier <rolandd-FYB4Gu1CFyUAvxtiuMwx3w@public.gmane.org> || For corporate legal information go to:
http://www.cisco.com/web/about/doing_business/legal/cri/index.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2010-06-23 17:26 UTC | newest]
Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-06-04 17:14 [PATCH v2] allow passthrough of rmpp packets to user mad clients Mike Heinz
[not found] ` <4C2744E8AD2982428C5BFE523DF8CDCB49A488DAD8-amwN6d8PyQWXx9kJd3VG2h2eb7JE58TQ@public.gmane.org>
2010-06-04 19:54 ` Roland Dreier
[not found] ` <adask52o9p6.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org>
2010-06-07 17:55 ` Mike Heinz
2010-06-18 13:42 ` Mike Heinz
[not found] ` <4C2744E8AD2982428C5BFE523DF8CDCB49D09E7C2F-amwN6d8PyQWXx9kJd3VG2h2eb7JE58TQ@public.gmane.org>
2010-06-18 15:41 ` Jason Gunthorpe
[not found] ` <20100618154134.GA12884-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2010-06-18 16:33 ` Hefty, Sean
[not found] ` <CF9C39F99A89134C9CF9C4CCB68B8DDF259E5A7B73-osO9UTpF0USkrb+BlOpmy7fspsVTdybXVpNB7YpNyf8@public.gmane.org>
2010-06-18 16:38 ` Jason Gunthorpe
[not found] ` <20100618163842.GJ4630-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2010-06-18 16:42 ` Hefty, Sean
2010-06-18 17:23 ` Hefty, Sean
2010-06-23 17:26 ` Roland Dreier
2010-06-08 16:59 ` [PATCH v2] allow passthrough of rmpp protocol " Mike Heinz
[not found] ` <4C2744E8AD2982428C5BFE523DF8CDCB49A488DCEE-amwN6d8PyQWXx9kJd3VG2h2eb7JE58TQ@public.gmane.org>
2010-06-09 16:03 ` Hefty, Sean
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox