From: Krishnamraju Eraparaju <krishna2@chelsio.com>
To: Bernard Metzler <BMT@zurich.ibm.com>
Cc: faisal.latif@intel.com, shiraz.saleem@intel.com,
mkalderon@marvell.com, aelior@marvell.com, dledford@redhat.com,
jgg@ziepe.ca, linux-rdma@vger.kernel.org, bharat@chelsio.com,
nirranjan@chelsio.com
Subject: Re: [RFC PATCH] RDMA/siw: Experimental e2e negotiation of GSO usage.
Date: Wed, 29 Apr 2020 01:30:44 +0530 [thread overview]
Message-ID: <20200428200043.GA930@chelsio.com> (raw)
In-Reply-To: <OFA289A103.141EDDE1-ON0025854B.003ED42A-0025854B.0041DBD8@notes.na.collabserv.com>
[-- Attachment #1: Type: text/plain, Size: 478 bytes --]
On Wednesday, April 15, 2020 at 11:59:21 +0000, Bernard Metzler wrote:
Hi Bernard,
The attached patches enable the GSO negotiation code in SIW with a few
modifications, and also allow hardware iWARP drivers to advertise the
maximum FPDU length (at 16/32/64KB granularity) that they can accept.
The logic is similar to how TCP MSS is announced in the SYN segments of
the three-way handshake.
Please see whether this approach works better for the SoftiWARP <=>
hardware iWARP case.
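For review convenience, the negotiation boils down to the following
(a condensed sketch of what the attached siw.patch does; the helper name
below is made up for illustration and is not the exact patch code):

    /*
     * Map the peer's 2-bit advertisement (MPA reserved bits [4:5]) to the
     * maximum outbound FPDU length in bytes. 0 means the peer did not
     * advertise, so the current behaviour (FPDUs limited to one MSS) is kept.
     */
    static u16 fpdu_len_from_mpa_bits(__be16 bits)
    {
            switch (bits & MPA_FPDU_LEN_MASK) {
            case MPA_FPDU_LEN_16K: return SZ_16K - 1;
            case MPA_FPDU_LEN_32K: return SZ_32K - 1;
            case MPA_FPDU_LEN_64K: return SZ_64K - 1;
            default:               return 0;
            }
    }

On connect/accept SIW sets MPA_FPDU_LEN_64K in its own MPA frame, and the
TX path caps tcp_seglen by the value derived above, so a peer that leaves
these bits zero keeps the existing one-MSS behaviour.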
Thanks,
Krishna.
[-- Attachment #2: siw.patch --]
[-- Type: text/plain, Size: 7925 bytes --]
diff --git a/drivers/infiniband/sw/siw/iwarp.h b/drivers/infiniband/sw/siw/iwarp.h
index e8a04d9c89cb..fa7d93be5d18 100644
--- a/drivers/infiniband/sw/siw/iwarp.h
+++ b/drivers/infiniband/sw/siw/iwarp.h
@@ -18,12 +18,34 @@
#define MPA_KEY_REQ "MPA ID Req Frame"
#define MPA_KEY_REP "MPA ID Rep Frame"
#define MPA_IRD_ORD_MASK 0x3fff
+#define MPA_FPDU_LEN_MASK 0x000c
struct mpa_rr_params {
__be16 bits;
__be16 pd_len;
};
+/*
+ * MPA reserved bits [4:5] are used to advertise the maximum FPDU length that
+ * an endpoint is willing to accept, similar to the TCP SYN MSS announcement.
+ * This is an experimental enhancement to RDMA connection establishment. Both
+ * local and remote endpoints must use these flags if at least one endpoint
+ * wants to advertise its capability to receive large FPDUs.
+ * Since the SIW driver sits on top of the TCP/IP stack, it can use GSO to
+ * send large FPDUs (up to 64K - 1, capped by the 16-bit MPA length field),
+ * which improves performance.
+ * Using bits [4:5] keeps backwards compatibility with the original MPA frame
+ * format, i.e. the rules of RFC 5044 regarding FPDU length apply when these
+ * bits are zero. TODO: make this feature tunable via module parameters, as
+ * the remote endpoint might also be using these reserved bits for experiments.
+ */
+enum {
+ MPA_FPDU_LEN_DEFAULT = cpu_to_be16(0x0000),
+ MPA_FPDU_LEN_16K = cpu_to_be16(0x0400),
+ MPA_FPDU_LEN_32K = cpu_to_be16(0x0800),
+ MPA_FPDU_LEN_64K = cpu_to_be16(0x0c00)
+};
+
/*
* MPA request/response header bits & fields
*/
@@ -32,7 +54,6 @@ enum {
MPA_RR_FLAG_CRC = cpu_to_be16(0x4000),
MPA_RR_FLAG_REJECT = cpu_to_be16(0x2000),
MPA_RR_FLAG_ENHANCED = cpu_to_be16(0x1000),
- MPA_RR_FLAG_GSO_EXP = cpu_to_be16(0x0800),
MPA_RR_MASK_REVISION = cpu_to_be16(0x00ff)
};
diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h
index dba4535494ab..a8e08f675cf4 100644
--- a/drivers/infiniband/sw/siw/siw.h
+++ b/drivers/infiniband/sw/siw/siw.h
@@ -415,7 +415,7 @@ struct siw_iwarp_tx {
u8 orq_fence : 1; /* ORQ full or Send fenced */
u8 in_syscall : 1; /* TX out of user context */
u8 zcopy_tx : 1; /* Use TCP_SENDPAGE if possible */
- u8 gso_seg_limit; /* Maximum segments for GSO, 0 = unbound */
+ u16 large_fpdulen; /* max outbound FPDU len, capped by 16-bit MPA len */
u16 fpdu_len; /* len of FPDU to tx */
unsigned int tcp_seglen; /* remaining tcp seg space */
@@ -505,7 +505,6 @@ struct iwarp_msg_info {
/* Global siw parameters. Currently set in siw_main.c */
extern const bool zcopy_tx;
-extern const bool try_gso;
extern const bool loopback_enabled;
extern const bool mpa_crc_required;
extern const bool mpa_crc_strict;
diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c
index 8c1931a57f4a..abeed777006f 100644
--- a/drivers/infiniband/sw/siw/siw_cm.c
+++ b/drivers/infiniband/sw/siw/siw_cm.c
@@ -712,6 +712,38 @@ static int siw_proc_mpareq(struct siw_cep *cep)
return -EOPNOTSUPP;
}
+/*
+ * Sets/limits max outbound FPDU length based on the length advertised
+ * by the peer.
+ */
+static void siw_set_outbound_fpdulen(struct siw_cep *cep,
+ struct mpa_rr_params *params,
+ u16 *outbound_fpdulen)
+{
+ switch (params->bits & MPA_FPDU_LEN_MASK) {
+ case MPA_FPDU_LEN_16K:
+ *outbound_fpdulen = SZ_16K - 1;
+ break;
+ case MPA_FPDU_LEN_32K:
+ *outbound_fpdulen = SZ_32K - 1;
+ break;
+ case MPA_FPDU_LEN_64K:
+ *outbound_fpdulen = SZ_64K - 1;
+ break;
+ default:
+ WARN(1,
+ "Peer advertised invalid FPDU len:0x%x, proceeding w/o GSO\n",
+ (params->bits & MPA_FPDU_LEN_MASK) & 0xffff);
+
+ /* fpdulen '0' here disables sending large FPDUs */
+ *outbound_fpdulen = 0;
+ return;
+ }
+ siw_dbg_cep(cep, "Max outbound FPDU length set to: %d\n",
+ *outbound_fpdulen);
+ return;
+}
+
static int siw_proc_mpareply(struct siw_cep *cep)
{
struct siw_qp_attrs qp_attrs;
@@ -750,10 +782,12 @@ static int siw_proc_mpareply(struct siw_cep *cep)
return -ECONNRESET;
}
- if (try_gso && rep->params.bits & MPA_RR_FLAG_GSO_EXP) {
- siw_dbg_cep(cep, "peer allows GSO on TX\n");
- qp->tx_ctx.gso_seg_limit = 0;
- }
+ if (rep->params.bits & MPA_FPDU_LEN_MASK)
+ siw_set_outbound_fpdulen(cep, &rep->params,
+ &qp->tx_ctx.large_fpdulen);
+ else
+ qp->tx_ctx.large_fpdulen = 0;
+
if ((rep->params.bits & MPA_RR_FLAG_MARKERS) ||
(mpa_crc_required && !(rep->params.bits & MPA_RR_FLAG_CRC)) ||
(mpa_crc_strict && !mpa_crc_required &&
@@ -1469,8 +1503,7 @@ int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params)
}
__mpa_rr_set_revision(&cep->mpa.hdr.params.bits, version);
- if (try_gso)
- cep->mpa.hdr.params.bits |= MPA_RR_FLAG_GSO_EXP;
+ cep->mpa.hdr.params.bits |= MPA_FPDU_LEN_64K;
if (mpa_crc_required)
cep->mpa.hdr.params.bits |= MPA_RR_FLAG_CRC;
@@ -1602,10 +1635,12 @@ int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params)
}
siw_dbg_cep(cep, "[QP %d]\n", params->qpn);
- if (try_gso && cep->mpa.hdr.params.bits & MPA_RR_FLAG_GSO_EXP) {
- siw_dbg_cep(cep, "peer allows GSO on TX\n");
- qp->tx_ctx.gso_seg_limit = 0;
- }
+ if (cep->mpa.hdr.params.bits & MPA_FPDU_LEN_MASK)
+ siw_set_outbound_fpdulen(cep, &cep->mpa.hdr.params,
+ &qp->tx_ctx.large_fpdulen);
+ else
+ qp->tx_ctx.large_fpdulen = 0;
+
if (params->ord > sdev->attrs.max_ord ||
params->ird > sdev->attrs.max_ird) {
siw_dbg_cep(
@@ -1676,6 +1711,7 @@ int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params)
qp_attrs.sk = cep->sock;
if (cep->mpa.hdr.params.bits & MPA_RR_FLAG_CRC)
qp_attrs.flags = SIW_MPA_CRC;
+ cep->mpa.hdr.params.bits |= MPA_FPDU_LEN_64K;
qp_attrs.state = SIW_QP_STATE_RTS;
siw_dbg_cep(cep, "[QP%u]: moving to rts\n", qp_id(qp));
diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c
index 05a92f997f60..28c256e52454 100644
--- a/drivers/infiniband/sw/siw/siw_main.c
+++ b/drivers/infiniband/sw/siw/siw_main.c
@@ -31,12 +31,6 @@ MODULE_LICENSE("Dual BSD/GPL");
/* transmit from user buffer, if possible */
const bool zcopy_tx = true;
-/* Restrict usage of GSO, if hardware peer iwarp is unable to process
- * large packets. try_gso = true lets siw try to use local GSO,
- * if peer agrees. Not using GSO severly limits siw maximum tx bandwidth.
- */
-const bool try_gso;
-
/* Attach siw also with loopback devices */
const bool loopback_enabled = true;
diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c b/drivers/infiniband/sw/siw/siw_qp_tx.c
index 5d97bba0ce6d..d95bb0ed0d7c 100644
--- a/drivers/infiniband/sw/siw/siw_qp_tx.c
+++ b/drivers/infiniband/sw/siw/siw_qp_tx.c
@@ -662,13 +662,17 @@ static void siw_update_tcpseg(struct siw_iwarp_tx *c_tx,
{
struct tcp_sock *tp = tcp_sk(s->sk);
- if (tp->gso_segs) {
- if (c_tx->gso_seg_limit == 0)
- c_tx->tcp_seglen = tp->mss_cache * tp->gso_segs;
- else
+ if (tp->gso_segs && c_tx->large_fpdulen) {
+ if (tp->mss_cache > c_tx->large_fpdulen) {
+ c_tx->tcp_seglen = c_tx->large_fpdulen;
+ } else {
+ u8 gso_seg_limit;
+ gso_seg_limit = c_tx->large_fpdulen / tp->mss_cache;
+
c_tx->tcp_seglen =
tp->mss_cache *
- min_t(u16, c_tx->gso_seg_limit, tp->gso_segs);
+ min_t(u16, gso_seg_limit, tp->gso_segs);
+ }
} else {
c_tx->tcp_seglen = tp->mss_cache;
}
diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c
index b18a677832e1..208557f91ff4 100644
--- a/drivers/infiniband/sw/siw/siw_verbs.c
+++ b/drivers/infiniband/sw/siw/siw_verbs.c
@@ -444,8 +444,7 @@ struct ib_qp *siw_create_qp(struct ib_pd *pd,
qp->attrs.sq_max_sges = attrs->cap.max_send_sge;
qp->attrs.rq_max_sges = attrs->cap.max_recv_sge;
- /* Make those two tunables fixed for now. */
- qp->tx_ctx.gso_seg_limit = 1;
+ /* Make this tunable fixed for now. */
qp->tx_ctx.zcopy_tx = zcopy_tx;
qp->attrs.state = SIW_QP_STATE_IDLE;
[-- Attachment #3: cxgb4.patch --]
[-- Type: text/plain, Size: 2388 bytes --]
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index ee1182f..8941b64 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -135,6 +135,16 @@
module_param(snd_win, int, 0644);
MODULE_PARM_DESC(snd_win, "TCP send window in bytes (default=128KB)");
+/*
+ * Experimental large FPDU length support.
+ * This improves overall throughput when interoperating with SoftiWARP.
+ * Note that iw_cxgb4 advertises its receive capability only and ignores the
+ * remote peer's large FPDU length announcement, due to a T6 HW limitation.
+ */
+static int large_fpdulen;
+module_param(large_fpdulen, int, 0644);
+MODULE_PARM_DESC(large_fpdulen, "Experimental large FPDU length");
+
static struct workqueue_struct *workq;
static struct sk_buff_head rxq;
@@ -978,6 +988,12 @@ static int send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb,
mpa->flags = 0;
if (crc_enabled)
mpa->flags |= MPA_CRC;
+ if (large_fpdulen) {
+ if (CHELSIO_CHIP_VERSION(ep->com.dev->rdev.lldi.adapter_type) >=
+ CHELSIO_T6)
+ mpa->flags |= MPA_FPDU_LEN_16K;
+ }
+
if (markers_enabled) {
mpa->flags |= MPA_MARKERS;
ep->mpa_attr.recv_marker_enabled = 1;
@@ -1166,6 +1182,11 @@ static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen)
mpa->flags |= MPA_CRC;
if (ep->mpa_attr.recv_marker_enabled)
mpa->flags |= MPA_MARKERS;
+ if (large_fpdulen) {
+ if (CHELSIO_CHIP_VERSION(ep->com.dev->rdev.lldi.adapter_type) >=
+ CHELSIO_T6)
+ mpa->flags |= MPA_FPDU_LEN_16K;
+ }
mpa->revision = ep->mpa_attr.version;
mpa->private_data_size = htons(plen);
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index 7d06b0f..051830e 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -688,6 +688,15 @@ enum c4iw_mmid_state {
#define MPA_V2_RDMA_READ_RTR 0x4000
#define MPA_V2_IRD_ORD_MASK 0x3FFF
+/* The following experimental bits must be kept in sync with SoftiWARP */
+#define MPA_FPDU_LEN_MASK 0x000c
+enum {
+ MPA_FPDU_LEN_DEFAULT = cpu_to_be16(0x0000),
+ MPA_FPDU_LEN_16K = cpu_to_be16(0x0400),
+ MPA_FPDU_LEN_32K = cpu_to_be16(0x0800),
+ MPA_FPDU_LEN_64K = cpu_to_be16(0x0c00)
+};
+
#define c4iw_put_ep(ep) { \
pr_debug("put_ep ep %p refcnt %d\n", \
ep, kref_read(&((ep)->kref))); \