Linux RDMA and InfiniBand development

Linux RDMA and InfiniBand development
 help / color / mirror / Atom feed

* [PATCH net-next] cxgb4: Allocate Tx queues dynamically
From: Atul Gupta @ 2016-11-18 11:07 UTC (permalink / raw)
  To: netdev, linux-scsi, target-devel, linux-rdma, linux-crypto
  Cc: davem, nab, jejb, martin.petersen, dledford, herbert, leedom,
	nirranjan, varun, swise, hariprasad, Atul Gupta

From: Hariprasad Shenai <hariprasad@chelsio.com>

Allocate resources dynamically for Upper layer driver's (ULD) like
cxgbit, iw_cxgb4, cxgb4i and chcr. The resources allocated include Tx
queues which are allocated when ULD register with cxgb4 driver and freed
while un-registering. The Tx queues which are shared by ULD shall be
allocated by first registering driver and un-allocated by last
unregistering driver.

Signed-off-by: Atul Gupta <atul.gupta@chelsio.com>
---
 drivers/crypto/chelsio/chcr_algo.c                 |  16 +--
 drivers/crypto/chelsio/chcr_core.c                 |   3 +-
 drivers/infiniband/hw/cxgb4/device.c               |   1 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h         |  19 +++-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c |  12 --
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c    |  64 +++++++----
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c     | 114 +++++++++++++++++++
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h     |  17 +++
 drivers/net/ethernet/chelsio/cxgb4/sge.c           | 121 +++++++++++++++------
 drivers/scsi/cxgbi/cxgb4i/cxgb4i.c                 |   1 +
 drivers/target/iscsi/cxgbit/cxgbit_main.c          |   1 +
 11 files changed, 287 insertions(+), 82 deletions(-)

diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c
index e4ddb921d7b3..56b153805462 100644
--- a/drivers/crypto/chelsio/chcr_algo.c
+++ b/drivers/crypto/chelsio/chcr_algo.c
@@ -592,16 +592,18 @@ static int chcr_aes_cbc_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
 
 static int cxgb4_is_crypto_q_full(struct net_device *dev, unsigned int idx)
 {
-	int ret = 0;
-	struct sge_ofld_txq *q;
 	struct adapter *adap = netdev2adap(dev);
+	struct sge_uld_txq_info *txq_info =
+		adap->sge.uld_txq_info[CXGB4_TX_CRYPTO];
+	struct sge_uld_txq *txq;
+	int ret = 0;
 
 	local_bh_disable();
-	q = &adap->sge.ofldtxq[idx];
-	spin_lock(&q->sendq.lock);
-	if (q->full)
+	txq = &txq_info->uldtxq[idx];
+	spin_lock(&txq->sendq.lock);
+	if (txq->full)
 		ret = -1;
-	spin_unlock(&q->sendq.lock);
+	spin_unlock(&txq->sendq.lock);
 	local_bh_enable();
 	return ret;
 }
@@ -674,11 +676,11 @@ static int chcr_device_init(struct chcr_context *ctx)
 		}
 		u_ctx = ULD_CTX(ctx);
 		rxq_perchan = u_ctx->lldi.nrxq / u_ctx->lldi.nchan;
-		ctx->dev->tx_channel_id = 0;
 		rxq_idx = ctx->dev->tx_channel_id * rxq_perchan;
 		rxq_idx += id % rxq_perchan;
 		spin_lock(&ctx->dev->lock_chcr_dev);
 		ctx->tx_channel_id = rxq_idx;
+		ctx->dev->tx_channel_id = !ctx->dev->tx_channel_id;
 		spin_unlock(&ctx->dev->lock_chcr_dev);
 	}
 out:
diff --git a/drivers/crypto/chelsio/chcr_core.c b/drivers/crypto/chelsio/chcr_core.c
index fb5f9bbfa09c..4d7f6700fd7e 100644
--- a/drivers/crypto/chelsio/chcr_core.c
+++ b/drivers/crypto/chelsio/chcr_core.c
@@ -42,6 +42,7 @@ static chcr_handler_func work_handlers[NUM_CPL_CMDS] = {
 static struct cxgb4_uld_info chcr_uld_info = {
 	.name = DRV_MODULE_NAME,
 	.nrxq = MAX_ULD_QSETS,
+	.ntxq = MAX_ULD_QSETS,
 	.rxq_size = 1024,
 	.add = chcr_uld_add,
 	.state_change = chcr_uld_state_change,
@@ -126,7 +127,7 @@ static int cpl_fw6_pld_handler(struct chcr_dev *dev,
 
 int chcr_send_wr(struct sk_buff *skb)
 {
-	return cxgb4_ofld_send(skb->dev, skb);
+	return cxgb4_crypto_send(skb->dev, skb);
 }
 
 static void *chcr_uld_add(const struct cxgb4_lld_info *lld)
diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
index 93e3d270a98a..4e5baf4fe15e 100644
--- a/drivers/infiniband/hw/cxgb4/device.c
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -1481,6 +1481,7 @@ static int c4iw_uld_control(void *handle, enum cxgb4_control control, ...)
 static struct cxgb4_uld_info c4iw_uld_info = {
 	.name = DRV_NAME,
 	.nrxq = MAX_ULD_QSETS,
+	.ntxq = MAX_ULD_QSETS,
 	.rxq_size = 511,
 	.ciq = true,
 	.lro = false,
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 2125903043fb..0bce1bf9ca0f 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -635,6 +635,7 @@ struct tx_sw_desc;
 
 struct sge_txq {
 	unsigned int  in_use;       /* # of in-use Tx descriptors */
+	unsigned int  q_type;	    /* Q type Eth/Ctrl/Ofld */
 	unsigned int  size;         /* # of descriptors */
 	unsigned int  cidx;         /* SW consumer index */
 	unsigned int  pidx;         /* producer index */
@@ -665,7 +666,7 @@ struct sge_eth_txq {                /* state for an SGE Ethernet Tx queue */
 	unsigned long mapping_err;  /* # of I/O MMU packet mapping errors */
 } ____cacheline_aligned_in_smp;
 
-struct sge_ofld_txq {               /* state for an SGE offload Tx queue */
+struct sge_uld_txq {               /* state for an SGE offload Tx queue */
 	struct sge_txq q;
 	struct adapter *adap;
 	struct sk_buff_head sendq;  /* list of backpressured packets */
@@ -693,14 +694,20 @@ struct sge_uld_rxq_info {
 	u8 uld;			/* uld type */
 };
 
+struct sge_uld_txq_info {
+	struct sge_uld_txq *uldtxq; /* Txq's for ULD */
+	atomic_t users;		/* num users */
+	u16 ntxq;		/* # of egress uld queues */
+};
+
 struct sge {
 	struct sge_eth_txq ethtxq[MAX_ETH_QSETS];
-	struct sge_ofld_txq ofldtxq[MAX_OFLD_QSETS];
 	struct sge_ctrl_txq ctrlq[MAX_CTRL_QUEUES];
 
 	struct sge_eth_rxq ethrxq[MAX_ETH_QSETS];
 	struct sge_rspq fw_evtq ____cacheline_aligned_in_smp;
 	struct sge_uld_rxq_info **uld_rxq_info;
+	struct sge_uld_txq_info **uld_txq_info;
 
 	struct sge_rspq intrq ____cacheline_aligned_in_smp;
 	spinlock_t intrq_lock;
@@ -1298,8 +1305,9 @@ int t4_sge_alloc_ctrl_txq(struct adapter *adap, struct sge_ctrl_txq *txq,
 			  unsigned int cmplqid);
 int t4_sge_mod_ctrl_txq(struct adapter *adap, unsigned int eqid,
 			unsigned int cmplqid);
-int t4_sge_alloc_ofld_txq(struct adapter *adap, struct sge_ofld_txq *txq,
-			  struct net_device *dev, unsigned int iqid);
+int t4_sge_alloc_uld_txq(struct adapter *adap, struct sge_uld_txq *txq,
+			 struct net_device *dev, unsigned int iqid,
+			 unsigned int uld_type);
 irqreturn_t t4_sge_intr_msix(int irq, void *cookie);
 int t4_sge_init(struct adapter *adap);
 void t4_sge_start(struct adapter *adap);
@@ -1661,4 +1669,7 @@ int t4_uld_mem_alloc(struct adapter *adap);
 void t4_uld_clean_up(struct adapter *adap);
 void t4_register_netevent_notifier(void);
 void free_rspq_fl(struct adapter *adap, struct sge_rspq *rq, struct sge_fl *fl);
+void free_tx_desc(struct adapter *adap, struct sge_txq *q,
+		  unsigned int n, bool unmap);
+void free_txq(struct adapter *adap, struct sge_txq *q);
 #endif /* __CXGB4_H__ */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c
index 20455d082cb8..acc231293e4d 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c
@@ -2512,18 +2512,6 @@ do { \
 		RL("FLLow:", fl.low);
 		RL("FLStarving:", fl.starving);
 
-	} else if (ofld_idx < ofld_entries) {
-		const struct sge_ofld_txq *tx =
-			&adap->sge.ofldtxq[ofld_idx * 4];
-		int n = min(4, adap->sge.ofldqsets - 4 * ofld_idx);
-
-		S("QType:", "OFLD-Txq");
-		T("TxQ ID:", q.cntxt_id);
-		T("TxQ size:", q.size);
-		T("TxQ inuse:", q.in_use);
-		T("TxQ CIDX:", q.cidx);
-		T("TxQ PIDX:", q.pidx);
-
 	} else if (ctrl_idx < ctrl_entries) {
 		const struct sge_ctrl_txq *tx = &adap->sge.ctrlq[ctrl_idx * 4];
 		int n = min(4, adap->params.nports - 4 * ctrl_idx);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index c0cc2ee77be7..449884f8dd67 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -530,15 +530,15 @@ static int fwevtq_handler(struct sge_rspq *q, const __be64 *rsp,
 
 		txq = q->adap->sge.egr_map[qid - q->adap->sge.egr_start];
 		txq->restarts++;
-		if ((u8 *)txq < (u8 *)q->adap->sge.ofldtxq) {
+		if (txq->q_type == CXGB4_TXQ_ETH) {
 			struct sge_eth_txq *eq;
 
 			eq = container_of(txq, struct sge_eth_txq, q);
 			netif_tx_wake_queue(eq->txq);
 		} else {
-			struct sge_ofld_txq *oq;
+			struct sge_uld_txq *oq;
 
-			oq = container_of(txq, struct sge_ofld_txq, q);
+			oq = container_of(txq, struct sge_uld_txq, q);
 			tasklet_schedule(&oq->qresume_tsk);
 		}
 	} else if (opcode == CPL_FW6_MSG || opcode == CPL_FW4_MSG) {
@@ -885,15 +885,6 @@ static int setup_sge_queues(struct adapter *adap)
 		}
 	}
 
-	j = s->ofldqsets / adap->params.nports; /* iscsi queues per channel */
-	for_each_ofldtxq(s, i) {
-		err = t4_sge_alloc_ofld_txq(adap, &s->ofldtxq[i],
-					    adap->port[i / j],
-					    s->fw_evtq.cntxt_id);
-		if (err)
-			goto freeout;
-	}
-
 	for_each_port(adap, i) {
 		/* Note that cmplqid below is 0 if we don't
 		 * have RDMA queues, and that's the right value.
@@ -1922,8 +1913,18 @@ static void disable_dbs(struct adapter *adap)
 
 	for_each_ethrxq(&adap->sge, i)
 		disable_txq_db(&adap->sge.ethtxq[i].q);
-	for_each_ofldtxq(&adap->sge, i)
-		disable_txq_db(&adap->sge.ofldtxq[i].q);
+	if (is_offload(adap)) {
+		struct sge_uld_txq_info *txq_info =
+			adap->sge.uld_txq_info[CXGB4_TX_OFLD];
+
+		if (txq_info) {
+			for_each_ofldtxq(&adap->sge, i) {
+				struct sge_uld_txq *txq = &txq_info->uldtxq[i];
+
+				disable_txq_db(&txq->q);
+			}
+		}
+	}
 	for_each_port(adap, i)
 		disable_txq_db(&adap->sge.ctrlq[i].q);
 }
@@ -1934,8 +1935,18 @@ static void enable_dbs(struct adapter *adap)
 
 	for_each_ethrxq(&adap->sge, i)
 		enable_txq_db(adap, &adap->sge.ethtxq[i].q);
-	for_each_ofldtxq(&adap->sge, i)
-		enable_txq_db(adap, &adap->sge.ofldtxq[i].q);
+	if (is_offload(adap)) {
+		struct sge_uld_txq_info *txq_info =
+			adap->sge.uld_txq_info[CXGB4_TX_OFLD];
+
+		if (txq_info) {
+			for_each_ofldtxq(&adap->sge, i) {
+				struct sge_uld_txq *txq = &txq_info->uldtxq[i];
+
+				enable_txq_db(adap, &txq->q);
+			}
+		}
+	}
 	for_each_port(adap, i)
 		enable_txq_db(adap, &adap->sge.ctrlq[i].q);
 }
@@ -2006,8 +2017,17 @@ static void recover_all_queues(struct adapter *adap)
 
 	for_each_ethrxq(&adap->sge, i)
 		sync_txq_pidx(adap, &adap->sge.ethtxq[i].q);
-	for_each_ofldtxq(&adap->sge, i)
-		sync_txq_pidx(adap, &adap->sge.ofldtxq[i].q);
+	if (is_offload(adap)) {
+		struct sge_uld_txq_info *txq_info =
+			adap->sge.uld_txq_info[CXGB4_TX_OFLD];
+		if (txq_info) {
+			for_each_ofldtxq(&adap->sge, i) {
+				struct sge_uld_txq *txq = &txq_info->uldtxq[i];
+
+				sync_txq_pidx(adap, &txq->q);
+			}
+		}
+	}
 	for_each_port(adap, i)
 		sync_txq_pidx(adap, &adap->sge.ctrlq[i].q);
 }
@@ -3991,7 +4011,7 @@ static inline bool is_x_10g_port(const struct link_config *lc)
 static void cfg_queues(struct adapter *adap)
 {
 	struct sge *s = &adap->sge;
-	int i, n10g = 0, qidx = 0;
+	int i = 0, n10g = 0, qidx = 0;
 #ifndef CONFIG_CHELSIO_T4_DCB
 	int q10g = 0;
 #endif
@@ -4006,8 +4026,7 @@ static void cfg_queues(struct adapter *adap)
 		adap->params.crypto = 0;
 	}
 
-	for_each_port(adap, i)
-		n10g += is_x_10g_port(&adap2pinfo(adap, i)->link_cfg);
+	n10g += is_x_10g_port(&adap2pinfo(adap, i)->link_cfg);
 #ifdef CONFIG_CHELSIO_T4_DCB
 	/* For Data Center Bridging support we need to be able to support up
 	 * to 8 Traffic Priorities; each of which will be assigned to its
@@ -4075,9 +4094,6 @@ static void cfg_queues(struct adapter *adap)
 	for (i = 0; i < ARRAY_SIZE(s->ctrlq); i++)
 		s->ctrlq[i].q.size = 512;
 
-	for (i = 0; i < ARRAY_SIZE(s->ofldtxq); i++)
-		s->ofldtxq[i].q.size = 1024;
-
 	init_rspq(adap, &s->fw_evtq, 0, 1, 1024, 64);
 	init_rspq(adap, &s->intrq, 0, 1, 512, 64);
 }
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c
index 2471ff465d5c..565a6c6bfeaf 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c
@@ -447,6 +447,106 @@ static void quiesce_rx_uld(struct adapter *adap, unsigned int uld_type)
 		quiesce_rx(adap, &rxq_info->uldrxq[idx].rspq);
 }
 
+static void
+free_sge_txq_uld(struct adapter *adap, struct sge_uld_txq_info *txq_info)
+{
+	int nq = txq_info->ntxq;
+	int i;
+
+	for (i = 0; i < nq; i++) {
+		struct sge_uld_txq *txq = &txq_info->uldtxq[i];
+
+		if (txq && txq->q.desc) {
+			tasklet_kill(&txq->qresume_tsk);
+			t4_ofld_eq_free(adap, adap->mbox, adap->pf, 0,
+					txq->q.cntxt_id);
+			free_tx_desc(adap, &txq->q, txq->q.in_use, false);
+			kfree(txq->q.sdesc);
+			__skb_queue_purge(&txq->sendq);
+			free_txq(adap, &txq->q);
+		}
+	}
+}
+
+static int
+alloc_sge_txq_uld(struct adapter *adap, struct sge_uld_txq_info *txq_info,
+		  unsigned int uld_type)
+{
+	struct sge *s = &adap->sge;
+	int nq = txq_info->ntxq;
+	int i, j, err;
+
+	j = nq / adap->params.nports;
+	for (i = 0; i < nq; i++) {
+		struct sge_uld_txq *txq = &txq_info->uldtxq[i];
+
+		txq->q.size = 1024;
+		err = t4_sge_alloc_uld_txq(adap, txq, adap->port[i / j],
+					   s->fw_evtq.cntxt_id, uld_type);
+		if (err)
+			goto freeout;
+	}
+	return 0;
+freeout:
+	free_sge_txq_uld(adap, txq_info);
+	return err;
+}
+
+static void
+release_sge_txq_uld(struct adapter *adap, unsigned int uld_type)
+{
+	struct sge_uld_txq_info *txq_info = NULL;
+	int tx_uld_type = TX_ULD(uld_type);
+
+	txq_info = adap->sge.uld_txq_info[tx_uld_type];
+
+	if (txq_info && atomic_dec_and_test(&txq_info->users)) {
+		free_sge_txq_uld(adap, txq_info);
+		kfree(txq_info->uldtxq);
+		kfree(txq_info);
+		adap->sge.uld_txq_info[tx_uld_type] = NULL;
+	}
+}
+
+static int
+setup_sge_txq_uld(struct adapter *adap, unsigned int uld_type,
+		  const struct cxgb4_uld_info *uld_info)
+{
+	struct sge_uld_txq_info *txq_info = NULL;
+	int tx_uld_type, i;
+
+	tx_uld_type = TX_ULD(uld_type);
+	txq_info = adap->sge.uld_txq_info[tx_uld_type];
+
+	if ((tx_uld_type == CXGB4_TX_OFLD) && txq_info &&
+	    (atomic_inc_return(&txq_info->users) > 1))
+		return 0;
+
+	txq_info = kzalloc(sizeof(*txq_info), GFP_KERNEL);
+	if (!txq_info)
+		return -ENOMEM;
+
+	i = min_t(int, uld_info->ntxq, num_online_cpus());
+	txq_info->ntxq = roundup(i, adap->params.nports);
+
+	txq_info->uldtxq = kcalloc(txq_info->ntxq, sizeof(struct sge_uld_txq),
+				   GFP_KERNEL);
+	if (!txq_info->uldtxq) {
+		kfree(txq_info->uldtxq);
+		return -ENOMEM;
+	}
+
+	if (alloc_sge_txq_uld(adap, txq_info, tx_uld_type)) {
+		kfree(txq_info->uldtxq);
+		kfree(txq_info);
+		return -ENOMEM;
+	}
+
+	atomic_inc(&txq_info->users);
+	adap->sge.uld_txq_info[tx_uld_type] = txq_info;
+	return 0;
+}
+
 static void uld_queue_init(struct adapter *adap, unsigned int uld_type,
 			   struct cxgb4_lld_info *lli)
 {
@@ -472,7 +572,15 @@ int t4_uld_mem_alloc(struct adapter *adap)
 	if (!s->uld_rxq_info)
 		goto err_uld;
 
+	s->uld_txq_info = kzalloc(CXGB4_TX_MAX *
+				  sizeof(struct sge_uld_txq_info *),
+				  GFP_KERNEL);
+	if (!s->uld_txq_info)
+		goto err_uld_rx;
 	return 0;
+
+err_uld_rx:
+	kfree(s->uld_rxq_info);
 err_uld:
 	kfree(adap->uld);
 	return -ENOMEM;
@@ -482,6 +590,7 @@ void t4_uld_mem_free(struct adapter *adap)
 {
 	struct sge *s = &adap->sge;
 
+	kfree(s->uld_txq_info);
 	kfree(s->uld_rxq_info);
 	kfree(adap->uld);
 }
@@ -616,6 +725,9 @@ int cxgb4_register_uld(enum cxgb4_uld type,
 			ret = -EBUSY;
 			goto free_irq;
 		}
+		ret = setup_sge_txq_uld(adap, type, p);
+		if (ret)
+			goto free_irq;
 		adap->uld[type] = *p;
 		uld_attach(adap, type);
 		adap_idx++;
@@ -644,6 +756,7 @@ int cxgb4_register_uld(enum cxgb4_uld type,
 			break;
 		adap->uld[type].handle = NULL;
 		adap->uld[type].add = NULL;
+		release_sge_txq_uld(adap, type);
 		if (adap->flags & FULL_INIT_DONE)
 			quiesce_rx_uld(adap, type);
 		if (adap->flags & USING_MSIX)
@@ -679,6 +792,7 @@ int cxgb4_unregister_uld(enum cxgb4_uld type)
 			continue;
 		adap->uld[type].handle = NULL;
 		adap->uld[type].add = NULL;
+		release_sge_txq_uld(adap, type);
 		if (adap->flags & FULL_INIT_DONE)
 			quiesce_rx_uld(adap, type);
 		if (adap->flags & USING_MSIX)
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
index 2996793b1aaa..4c856605fdfa 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
@@ -77,6 +77,8 @@ enum {
 
 /* Special asynchronous notification message */
 #define CXGB4_MSG_AN ((void *)1)
+#define TX_ULD(uld)(((uld) != CXGB4_ULD_CRYPTO) ? CXGB4_TX_OFLD :\
+		      CXGB4_TX_CRYPTO)
 
 struct serv_entry {
 	void *data;
@@ -223,6 +225,19 @@ enum cxgb4_uld {
 	CXGB4_ULD_MAX
 };
 
+enum cxgb4_tx_uld {
+	CXGB4_TX_OFLD,
+	CXGB4_TX_CRYPTO,
+	CXGB4_TX_MAX
+};
+
+enum cxgb4_txq_type {
+	CXGB4_TXQ_ETH,
+	CXGB4_TXQ_ULD,
+	CXGB4_TXQ_CTRL,
+	CXGB4_TXQ_MAX
+};
+
 enum cxgb4_state {
 	CXGB4_STATE_UP,
 	CXGB4_STATE_START_RECOVERY,
@@ -316,6 +331,7 @@ struct cxgb4_uld_info {
 	void *handle;
 	unsigned int nrxq;
 	unsigned int rxq_size;
+	unsigned int ntxq;
 	bool ciq;
 	bool lro;
 	void *(*add)(const struct cxgb4_lld_info *p);
@@ -333,6 +349,7 @@ struct cxgb4_uld_info {
 int cxgb4_register_uld(enum cxgb4_uld type, const struct cxgb4_uld_info *p);
 int cxgb4_unregister_uld(enum cxgb4_uld type);
 int cxgb4_ofld_send(struct net_device *dev, struct sk_buff *skb);
+int cxgb4_crypto_send(struct net_device *dev, struct sk_buff *skb);
 unsigned int cxgb4_dbfifo_count(const struct net_device *dev, int lpfifo);
 unsigned int cxgb4_port_chan(const struct net_device *dev);
 unsigned int cxgb4_port_viid(const struct net_device *dev);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c
index 1e74fd6085df..b7d0753b9242 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c
@@ -377,8 +377,8 @@ unmap:			dma_unmap_page(dev, be64_to_cpu(p->addr[0]),
  *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
  *	Tx buffers.  Called with the Tx queue lock held.
  */
-static void free_tx_desc(struct adapter *adap, struct sge_txq *q,
-			 unsigned int n, bool unmap)
+void free_tx_desc(struct adapter *adap, struct sge_txq *q,
+		  unsigned int n, bool unmap)
 {
 	struct tx_sw_desc *d;
 	unsigned int cidx = q->cidx;
@@ -1543,7 +1543,7 @@ static inline unsigned int calc_tx_flits_ofld(const struct sk_buff *skb)
  *	inability to map packets.  A periodic timer attempts to restart
  *	queues so marked.
  */
-static void txq_stop_maperr(struct sge_ofld_txq *q)
+static void txq_stop_maperr(struct sge_uld_txq *q)
 {
 	q->mapping_err++;
 	q->q.stops++;
@@ -1559,7 +1559,7 @@ static void txq_stop_maperr(struct sge_ofld_txq *q)
  *	Stops an offload Tx queue that has become full and modifies the packet
  *	being written to request a wakeup.
  */
-static void ofldtxq_stop(struct sge_ofld_txq *q, struct sk_buff *skb)
+static void ofldtxq_stop(struct sge_uld_txq *q, struct sk_buff *skb)
 {
 	struct fw_wr_hdr *wr = (struct fw_wr_hdr *)skb->data;
 
@@ -1586,7 +1586,7 @@ static void ofldtxq_stop(struct sge_ofld_txq *q, struct sk_buff *skb)
  *	boolean "service_ofldq_running" to make sure that only one instance
  *	is ever running at a time ...
  */
-static void service_ofldq(struct sge_ofld_txq *q)
+static void service_ofldq(struct sge_uld_txq *q)
 {
 	u64 *pos, *before, *end;
 	int credits;
@@ -1706,7 +1706,7 @@ static void service_ofldq(struct sge_ofld_txq *q)
  *
  *	Send an offload packet through an SGE offload queue.
  */
-static int ofld_xmit(struct sge_ofld_txq *q, struct sk_buff *skb)
+static int ofld_xmit(struct sge_uld_txq *q, struct sk_buff *skb)
 {
 	skb->priority = calc_tx_flits_ofld(skb);       /* save for restart */
 	spin_lock(&q->sendq.lock);
@@ -1735,7 +1735,7 @@ static int ofld_xmit(struct sge_ofld_txq *q, struct sk_buff *skb)
  */
 static void restart_ofldq(unsigned long data)
 {
-	struct sge_ofld_txq *q = (struct sge_ofld_txq *)data;
+	struct sge_uld_txq *q = (struct sge_uld_txq *)data;
 
 	spin_lock(&q->sendq.lock);
 	q->full = 0;            /* the queue actually is completely empty now */
@@ -1767,17 +1767,23 @@ static inline unsigned int is_ctrl_pkt(const struct sk_buff *skb)
 	return skb->queue_mapping & 1;
 }
 
-static inline int ofld_send(struct adapter *adap, struct sk_buff *skb)
+static inline int uld_send(struct adapter *adap, struct sk_buff *skb,
+			   unsigned int tx_uld_type)
 {
+	struct sge_uld_txq_info *txq_info;
+	struct sge_uld_txq *txq;
 	unsigned int idx = skb_txq(skb);
 
+	txq_info = adap->sge.uld_txq_info[tx_uld_type];
+	txq = &txq_info->uldtxq[idx];
+
 	if (unlikely(is_ctrl_pkt(skb))) {
 		/* Single ctrl queue is a requirement for LE workaround path */
 		if (adap->tids.nsftids)
 			idx = 0;
 		return ctrl_xmit(&adap->sge.ctrlq[idx], skb);
 	}
-	return ofld_xmit(&adap->sge.ofldtxq[idx], skb);
+	return ofld_xmit(txq, skb);
 }
 
 /**
@@ -1794,7 +1800,7 @@ int t4_ofld_send(struct adapter *adap, struct sk_buff *skb)
 	int ret;
 
 	local_bh_disable();
-	ret = ofld_send(adap, skb);
+	ret = uld_send(adap, skb, CXGB4_TX_OFLD);
 	local_bh_enable();
 	return ret;
 }
@@ -1813,6 +1819,39 @@ int cxgb4_ofld_send(struct net_device *dev, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(cxgb4_ofld_send);
 
+/**
+ *	t4_crypto_send - send crypto packet
+ *	@adap: the adapter
+ *	@skb: the packet
+ *
+ *	Sends crypto packet.  We use the packet queue_mapping to select the
+ *	appropriate Tx queue as follows: bit 0 indicates whether the packet
+ *	should be sent as regular or control, bits 1-15 select the queue.
+ */
+static int t4_crypto_send(struct adapter *adap, struct sk_buff *skb)
+{
+	int ret;
+
+	local_bh_disable();
+	ret = uld_send(adap, skb, CXGB4_TX_CRYPTO);
+	local_bh_enable();
+	return ret;
+}
+
+/**
+ *	cxgb4_crypto_send - send crypto packet
+ *	@dev: the net device
+ *	@skb: the packet
+ *
+ *	Sends crypto packet.  This is an exported version of @t4_crypto_send,
+ *	intended for ULDs.
+ */
+int cxgb4_crypto_send(struct net_device *dev, struct sk_buff *skb)
+{
+	return t4_crypto_send(netdev2adap(dev), skb);
+}
+EXPORT_SYMBOL(cxgb4_crypto_send);
+
 static inline void copy_frags(struct sk_buff *skb,
 			      const struct pkt_gl *gl, unsigned int offset)
 {
@@ -2479,7 +2518,7 @@ static void sge_tx_timer_cb(unsigned long data)
 	for (i = 0; i < BITS_TO_LONGS(s->egr_sz); i++)
 		for (m = s->txq_maperr[i]; m; m &= m - 1) {
 			unsigned long id = __ffs(m) + i * BITS_PER_LONG;
-			struct sge_ofld_txq *txq = s->egr_map[id];
+			struct sge_uld_txq *txq = s->egr_map[id];
 
 			clear_bit(id, s->txq_maperr);
 			tasklet_schedule(&txq->qresume_tsk);
@@ -2799,6 +2838,7 @@ int t4_sge_alloc_eth_txq(struct adapter *adap, struct sge_eth_txq *txq,
 		return ret;
 	}
 
+	txq->q.q_type = CXGB4_TXQ_ETH;
 	init_txq(adap, &txq->q, FW_EQ_ETH_CMD_EQID_G(ntohl(c.eqid_pkd)));
 	txq->txq = netdevq;
 	txq->tso = txq->tx_cso = txq->vlan_ins = 0;
@@ -2852,6 +2892,7 @@ int t4_sge_alloc_ctrl_txq(struct adapter *adap, struct sge_ctrl_txq *txq,
 		return ret;
 	}
 
+	txq->q.q_type = CXGB4_TXQ_CTRL;
 	init_txq(adap, &txq->q, FW_EQ_CTRL_CMD_EQID_G(ntohl(c.cmpliqid_eqid)));
 	txq->adap = adap;
 	skb_queue_head_init(&txq->sendq);
@@ -2872,13 +2913,15 @@ int t4_sge_mod_ctrl_txq(struct adapter *adap, unsigned int eqid,
 	return t4_set_params(adap, adap->mbox, adap->pf, 0, 1, &param, &val);
 }
 
-int t4_sge_alloc_ofld_txq(struct adapter *adap, struct sge_ofld_txq *txq,
-			  struct net_device *dev, unsigned int iqid)
+int t4_sge_alloc_uld_txq(struct adapter *adap, struct sge_uld_txq *txq,
+			 struct net_device *dev, unsigned int iqid,
+			 unsigned int uld_type)
 {
 	int ret, nentries;
 	struct fw_eq_ofld_cmd c;
 	struct sge *s = &adap->sge;
 	struct port_info *pi = netdev_priv(dev);
+	int cmd = FW_EQ_OFLD_CMD;
 
 	/* Add status entries */
 	nentries = txq->q.size + s->stat_len / sizeof(struct tx_desc);
@@ -2891,7 +2934,9 @@ int t4_sge_alloc_ofld_txq(struct adapter *adap, struct sge_ofld_txq *txq,
 		return -ENOMEM;
 
 	memset(&c, 0, sizeof(c));
-	c.op_to_vfn = htonl(FW_CMD_OP_V(FW_EQ_OFLD_CMD) | FW_CMD_REQUEST_F |
+	if (unlikely(uld_type == CXGB4_TX_CRYPTO))
+		cmd = FW_EQ_CTRL_CMD;
+	c.op_to_vfn = htonl(FW_CMD_OP_V(cmd) | FW_CMD_REQUEST_F |
 			    FW_CMD_WRITE_F | FW_CMD_EXEC_F |
 			    FW_EQ_OFLD_CMD_PFN_V(adap->pf) |
 			    FW_EQ_OFLD_CMD_VFN_V(0));
@@ -2919,6 +2964,7 @@ int t4_sge_alloc_ofld_txq(struct adapter *adap, struct sge_ofld_txq *txq,
 		return ret;
 	}
 
+	txq->q.q_type = CXGB4_TXQ_ULD;
 	init_txq(adap, &txq->q, FW_EQ_OFLD_CMD_EQID_G(ntohl(c.eqid_pkd)));
 	txq->adap = adap;
 	skb_queue_head_init(&txq->sendq);
@@ -2928,7 +2974,7 @@ int t4_sge_alloc_ofld_txq(struct adapter *adap, struct sge_ofld_txq *txq,
 	return 0;
 }
 
-static void free_txq(struct adapter *adap, struct sge_txq *q)
+void free_txq(struct adapter *adap, struct sge_txq *q)
 {
 	struct sge *s = &adap->sge;
 
@@ -3026,21 +3072,6 @@ void t4_free_sge_resources(struct adapter *adap)
 		}
 	}
 
-	/* clean up offload Tx queues */
-	for (i = 0; i < ARRAY_SIZE(adap->sge.ofldtxq); i++) {
-		struct sge_ofld_txq *q = &adap->sge.ofldtxq[i];
-
-		if (q->q.desc) {
-			tasklet_kill(&q->qresume_tsk);
-			t4_ofld_eq_free(adap, adap->mbox, adap->pf, 0,
-					q->q.cntxt_id);
-			free_tx_desc(adap, &q->q, q->q.in_use, false);
-			kfree(q->q.sdesc);
-			__skb_queue_purge(&q->sendq);
-			free_txq(adap, &q->q);
-		}
-	}
-
 	/* clean up control Tx queues */
 	for (i = 0; i < ARRAY_SIZE(adap->sge.ctrlq); i++) {
 		struct sge_ctrl_txq *cq = &adap->sge.ctrlq[i];
@@ -3093,12 +3124,34 @@ void t4_sge_stop(struct adapter *adap)
 	if (s->tx_timer.function)
 		del_timer_sync(&s->tx_timer);
 
-	for (i = 0; i < ARRAY_SIZE(s->ofldtxq); i++) {
-		struct sge_ofld_txq *q = &s->ofldtxq[i];
+	if (is_offload(adap)) {
+		struct sge_uld_txq_info *txq_info;
+
+		txq_info = adap->sge.uld_txq_info[CXGB4_TX_OFLD];
+		if (txq_info) {
+			struct sge_uld_txq *txq = txq_info->uldtxq;
 
-		if (q->q.desc)
-			tasklet_kill(&q->qresume_tsk);
+			for_each_ofldtxq(&adap->sge, i) {
+				if (txq->q.desc)
+					tasklet_kill(&txq->qresume_tsk);
+			}
+		}
 	}
+
+	if (is_pci_uld(adap)) {
+		struct sge_uld_txq_info *txq_info;
+
+		txq_info = adap->sge.uld_txq_info[CXGB4_TX_CRYPTO];
+		if (txq_info) {
+			struct sge_uld_txq *txq = txq_info->uldtxq;
+
+			for_each_ofldtxq(&adap->sge, i) {
+				if (txq->q.desc)
+					tasklet_kill(&txq->qresume_tsk);
+			}
+		}
+	}
+
 	for (i = 0; i < ARRAY_SIZE(s->ctrlq); i++) {
 		struct sge_ctrl_txq *cq = &s->ctrlq[i];
 
diff --git a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
index 0039bebaa9e2..4655a9f9dcea 100644
--- a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
+++ b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
@@ -85,6 +85,7 @@ static inline int send_tx_flowc_wr(struct cxgbi_sock *);
 static const struct cxgb4_uld_info cxgb4i_uld_info = {
 	.name = DRV_MODULE_NAME,
 	.nrxq = MAX_ULD_QSETS,
+	.ntxq = MAX_ULD_QSETS,
 	.rxq_size = 1024,
 	.lro = false,
 	.add = t4_uld_add,
diff --git a/drivers/target/iscsi/cxgbit/cxgbit_main.c b/drivers/target/iscsi/cxgbit/cxgbit_main.c
index ad26b9372f10..96eedfc49c94 100644
--- a/drivers/target/iscsi/cxgbit/cxgbit_main.c
+++ b/drivers/target/iscsi/cxgbit/cxgbit_main.c
@@ -653,6 +653,7 @@ static struct iscsit_transport cxgbit_transport = {
 static struct cxgb4_uld_info cxgbit_uld_info = {
 	.name		= DRV_NAME,
 	.nrxq		= MAX_ULD_QSETS,
+	.ntxq		= MAX_ULD_QSETS,
 	.rxq_size	= 1024,
 	.lro		= true,
 	.add		= cxgbit_uld_add,
-- 
2.3.4

^ permalink raw reply related

* Re: [PATCH rdma-rc] IB/IPoIB: Remove can't use GFP_NOIO warning
From: Kamal Heib @ 2016-11-18 10:34 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: Leon Romanovsky, Doug Ledford,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Mel Gorman,
	Jiri Kosina
In-Reply-To: <CAJ3xEMgCFmSMA5F-7D89n248K2HovvwHnnMgke7-FSDbROFA4Q-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On Fri, Nov 18, 2016 at 12:30 AM, Or Gerlitz <gerlitz.or-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> On Thu, Nov 10, 2016 at 10:16 AM, Leon Romanovsky wrote:
>> From: Kamal Heib <kamalh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>>
>> Remove the warning print of "can't use of GFP_NOIO" to avoid prints in
>> each QP creation when devices aren't supporting IB_QP_CREATE_USE_GFP_NOIO.
>>
>> This print become more annoying when the IPoIB interface is configured
>> to work in connected mode.
>>
>> Fixes: 09b93088d750 ('IB: Add a QP creation flag to use GFP_NOIO allocations')
>
> Hi Kamal,

Hi Or :)

>
> I don't think you're fixing a bug in this commit... you find the print
> annoying and you remove it, okay, maybe we can do that. Why not leave
> it in rate-limited manner? can you elaborate what was that deeply
> annoying with getting the warning?
>

It's really annoying when you have multiple ipoib interfaces and
pkeys, so the dmesg will look like the following:

[933236.858739] mlx5_ib0: can't use GFP_NOIO for QPs on device mlx5_0,
using GFP_KERNEL
[933407.571911] mlx5_ib0.8002: can't use GFP_NOIO for QPs on device
mlx5_0, using GFP_KERNEL
[933634.121955] mlx5_ib0.8006: can't use GFP_NOIO for QPs on device
mlx5_0, using GFP_KERNEL
[934167.851164] mlx5_ib0.8004: can't use GFP_NOIO for QPs on device
mlx5_0, using GFP_KERNEL
[934936.645002] mlx5_ib0.8002: can't use GFP_NOIO for QPs on device
mlx5_0, using GFP_KERNEL

>> --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
>> +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
>> @@ -1053,8 +1053,6 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_
>>
>>         tx_qp = ib_create_qp(priv->pd, &attr);
>>         if (PTR_ERR(tx_qp) == -EINVAL) {
>> -               ipoib_warn(priv, "can't use GFP_NOIO for QPs on device %s, using GFP_KERNEL\n",
>> -                          priv->ca->name);
>>                 attr.create_flags &= ~IB_QP_CREATE_USE_GFP_NOIO;
>>                 tx_qp = ib_create_qp(priv->pd, &attr);
>>         }
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v4 05/10] IB/isert: Replace semaphore sem with completion
From: Binoy Jayan @ 2016-11-18  9:10 UTC (permalink / raw)
  To: Arnd Bergmann, Mark Brown
  Cc: Sagi Grimberg, Doug Ledford, Sean Hefty, Hal Rosenstock,
	linux-rdma, Linux kernel mailing list
In-Reply-To: <3477625.cZS9dUaHfE@wuerfel>

Hi Arnd,

On 18 November 2016 at 14:28, Arnd Bergmann <arnd@arndb.de> wrote:
> On Friday, November 18, 2016 12:27:32 PM CET Binoy Jayan wrote:
>> Hi Sagi,

> I think you are right. This is behavior is actuallly documented in
> Documentation/scheduler/completion.txt:

Thanking for having a look.

> However, this is fairly unusual behavior and I wasn't immediately aware
> of it either when I read Sagi's reply. While your patch looks correct,
> it's probably a good idea to point out the counting behavior of this
> completion as explicitly as possible, in the changelog text of the patch
> as well as in a code comment and perhaps in the naming of the completion.

Will mention this and resend the patch series.

Thanks,
Binoy

^ permalink raw reply

* Re: [PATCH v4 05/10] IB/isert: Replace semaphore sem with completion
From: Arnd Bergmann @ 2016-11-18  8:58 UTC (permalink / raw)
  To: Binoy Jayan
  Cc: Sagi Grimberg, Doug Ledford, Sean Hefty, Hal Rosenstock,
	linux-rdma, Linux kernel mailing list
In-Reply-To: <CAHv-k__HHRzK7+AZ912GVw9ToxZWMw3OccPqgGt_NBf6+RXdzA@mail.gmail.com>

On Friday, November 18, 2016 12:27:32 PM CET Binoy Jayan wrote:
> Hi Sagi,
> 
> On 31 October 2016 at 02:42, Sagi Grimberg <sagi@grimberg.me> wrote:
> >> The semaphore 'sem' in isert_device is used as completion, so convert
> >> it to struct completion. Semaphores are going away in the future.
> >
> >
> > Umm, this is 100% *not* true. np->sem is designed as a counting to
> > sync the iscsi login thread with the connect requests coming from the
> > initiators. So this is actually a reliable bug insertion :(
> >
> > NAK from me...
> 
> Sorry for the late reply as I was held up in other activities.
> 
> I converted this to a wait_event() implementation but as I was doing it,
> I was wondering how it would have been different if it was a completion
> and not a semaphore.
> 
> File: drivers/infiniband/ulp/isert/ib_isert.c
> 
> If isert_connected_handler() is called multiple times, adding an entry to the
> list, and if that happens while we use completion, 'done' (part of struct
> completion) would be incremented by 1 each time 'complete' is called from
> isert_connected_handler. After 'n' iterations, done will be equal to 'n'. If we
> call wait_for_completion now from isert_accept_np, it would just decrement
> 'done' by one and continue without blocking, consuming one node at a time
> from the list 'isert_np->pending'.
> 
> Alternatively if "done" becomes zero, and the next time wait_for_completion is
> called, the API would add a node at the end of the wait queue 'wait' in 'struct
> completion' and block until "done" is nonzero. (Ref: do_wait_for_common)
> It exists the wait when a call to 'complete' turns 'done' back to 1.
> But if there
> are multiple waits called before calling complete, all the tasks
> calling the wait
> gets queued up and they will all would see "done" set to zero. When complete
> is called now, done turns 1 again and the first task in the queue is woken up
> as it is serialized as FIFO. Now the first wait returns and the done is
> decremented by 1 just before the return.
> 
> Am I missing something here?

I think you are right. This is behavior is actuallly documented in
Documentation/scheduler/completion.txt:

  If complete() is called multiple times then this will allow for that number
  of waiters to continue - each call to complete() will simply increment the
  done element. Calling complete_all() multiple times is a bug though. Both
  complete() and complete_all() can be called in hard-irq/atomic context
  safely.

However, this is fairly unusual behavior and I wasn't immediately aware
of it either when I read Sagi's reply. While your patch looks correct,
it's probably a good idea to point out the counting behavior of this
completion as explicitly as possible, in the changelog text of the patch
as well as in a code comment and perhaps in the naming of the completion.

	Arnd

^ permalink raw reply

* Re: [PATCH v4 05/10] IB/isert: Replace semaphore sem with completion
From: Binoy Jayan @ 2016-11-18  6:57 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Doug Ledford, Sean Hefty, Hal Rosenstock, Arnd Bergmann,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, Linux kernel mailing list
In-Reply-To: <a267cc5e-4d4f-368e-a159-3eecc2187969-NQWnxTmZq1alnMjI0IkVqw@public.gmane.org>

Hi Sagi,

On 31 October 2016 at 02:42, Sagi Grimberg <sagi-NQWnxTmZq1alnMjI0IkVqw@public.gmane.org> wrote:
>> The semaphore 'sem' in isert_device is used as completion, so convert
>> it to struct completion. Semaphores are going away in the future.
>
>
> Umm, this is 100% *not* true. np->sem is designed as a counting to
> sync the iscsi login thread with the connect requests coming from the
> initiators. So this is actually a reliable bug insertion :(
>
> NAK from me...

Sorry for the late reply as I was held up in other activities.

I converted this to a wait_event() implementation but as I was doing it,
I was wondering how it would have been different if it was a completion
and not a semaphore.

File: drivers/infiniband/ulp/isert/ib_isert.c

If isert_connected_handler() is called multiple times, adding an entry to the
list, and if that happens while we use completion, 'done' (part of struct
completion) would be incremented by 1 each time 'complete' is called from
isert_connected_handler. After 'n' iterations, done will be equal to 'n'. If we
call wait_for_completion now from isert_accept_np, it would just decrement
'done' by one and continue without blocking, consuming one node at a time
from the list 'isert_np->pending'.

Alternatively if "done" becomes zero, and the next time wait_for_completion is
called, the API would add a node at the end of the wait queue 'wait' in 'struct
completion' and block until "done" is nonzero. (Ref: do_wait_for_common)
It exists the wait when a call to 'complete' turns 'done' back to 1.
But if there
are multiple waits called before calling complete, all the tasks
calling the wait
gets queued up and they will all would see "done" set to zero. When complete
is called now, done turns 1 again and the first task in the queue is woken up
as it is serialized as FIFO. Now the first wait returns and the done is
decremented by 1 just before the return.

Am I missing something here?

Thanks,
Binoy
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Целевые клиентские базы Skype: prodawez390 Whatsapp: +79139230330 Viber:  +79139230330 Telegram: +79139230330 Email: prodawez391@gmail.com
From: crowther.heine-KK0ffGbhmjU@public.gmane.org @ 2016-11-18  2:05 UTC (permalink / raw)
  To: testuser-O5WfVfzUwx8@public.gmane.org

Целевые клиентские базы Skype: prodawez390 Whatsapp: +79139230330 Viber:  +79139230330 Telegram: +79139230330 Email: prodawez391-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PULL REQUEST] Please pull rdma.git
From: Doug Ledford @ 2016-11-18  2:01 UTC (permalink / raw)
  To: Or Gerlitz; +Cc: linux-rdma, Linux Kernel
In-Reply-To: <CAJ3xEMjXYYnhS6qUzM9F+yjtq8Aahn08MjsSU4OnLS66Cu7mgw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

[-- Attachment #1.1: Type: text/plain, Size: 4002 bytes --]

On 11/17/2016 5:24 PM, Or Gerlitz wrote:
> On Thu, Nov 17, 2016 at 9:44 PM, Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
>> On 11/17/16 1:49 PM, Leon Romanovsky wrote:
>>> On Thu, Nov 17, 2016 at 07:13:54AM -0500, Doug Ledford wrote:
>>>> Hi Linus,
>>>>
>>>> Due to various issues, I've been away and couldn't send a pull request
>>>> for about three weeks.  There were a number of -rc patches that built up
>>>> in the meantime (some where there already from the early -rc stages).
>>>> Obviously, there were way too many to send now, so I tried to pare the
>>>> list down to the more important patches for the -rc cycle.
> 
> Hi Doug,
> 
> Except for the hfi1 patches, all the rest (core, mlx5, mlx4 and rxe)
> are marked now as only 21 hours old in your 4.9-rc branch and they
> seems be made from you picking partial subsets of multiple series,

Correct.

> with none of them acked by you on the list.

I had an all day meeting today and had to get out the door early.  The
patches will be responded to.

> If you agree that I am describing things correctly - how are we
> expected to follow on your patch picking? I find it sort of impossible
> and error prone.

When I started this I said the official, canonical source of information
on patches like this is patchworks.  That still holds true.  In this
case, I pulled the full series of patches into a single bundle, then
reviewed every patch individually.  I checked for importance and
dependence on other patches.  Those that I thought could be moved to
4.10 were moved into a new bundle and then removed from the existing
bundle.  In this way, the patches were always in one or the other.  When
I was done, I used git am on the two bundles and one into the 4.9-rc and
the other into a -next branch.  In that way I made sure I didn't miss
any from the four series that I pulled.  Finally, I used the bundles to
mark the patches as accepted in patchworks.  By marking the entire
bundles as accepted, and not individual patches, it makes sure that what
I mark accepted is the same as what I ran git am on.  So, if the patch
shows in patchworks as accepted, then I got it.  If it doesn't, then I
missed it.

>>> Are you adding the rest to your for-next branch? We would like to have
>>> enough time to check that nothing is lost.
> 
>> Yes, it's already there in the mlx-next branch on github.
> 
> Re the patches there, this one
> 
> IB/mlx4: Set traffic class in AH
> 
> "Set traffic class within sl_tclass_flowlabel when create iboe AH.
> Without this the TOS value will be empty when running VLAN tagged
> traffic, because the TOS value is taken from the traffic class in the
> address handle attributes.
> 
> Fixes: 9106c41 ('IB/mlx4: Fix SL to 802.1Q priority-bits mapping for IBoE')"
> 
> claims to fix my commit, I have approached Leon and Co for
> clarifications/questions over the list on the patch and nothing was
> answered.

I agree with you.  It doesn't fix your patch.  The commit message can
still be fixed up.

> Please do not send it to Linus and wait for them to respond. I
> disagree that it fixes my commit b/c my commit was prior to when
> route-able RoCE  was introduced and on that time TOS had no relation.

I agree.  A better fix tag would be the commit that added RoCEv2 support.

> and this one
> 
> "IB/mlx4: Put non zero value in max_ah device attribute
> 
> Use INT_MAX since this is the max value the attribute can hold, though
> hardware capability is unlimited.
> 
> Fixes: 225c7b1 ('IB/mlx4: Add a driver Mellanox ConnectX InfiniBand adapters')"
> 
> does a tiny enhancement for a 10y old commit of Roland, why you think
> we need it in 4.9-rc6 or 7??

I don't, it's in the mlx-next branch which means I'll queue it up for
the 4.10 merge window.  I have no plan on sending that branch for 4.9-rc.

-- 
Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
    GPG Key ID: 0E572FDD

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 884 bytes --]

^ permalink raw reply

* Re: [PATCH rdma-rc 5/6] IB/core: Save QP in ib_flow structure
From: Or Gerlitz @ 2016-11-17 22:41 UTC (permalink / raw)
  To: Doug Ledford; +Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <CAJ3xEMg=3eOqsKcrfcOoBkdpjiCmEAcPfNixHeLfVy45v_uvnQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On Fri, Nov 18, 2016 at 12:40 AM, Or Gerlitz <gerlitz.or-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
>> Fixes: 319a441d1361 ("IB/core: Add receive flow steering support")
>
> same comment as the other two, probably applies some, most or all
> patches on your git hub branch, lets use what was submitted and not
> something else, okay?

Also, as I commented and you didn't respond, this fix protects against
**future** bug and has nothing to do with intree code. As such there's
no point to put it
into rc fix, it's a -next thing
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH rdma-rc 5/6] IB/core: Save QP in ib_flow structure
From: Or Gerlitz @ 2016-11-17 22:40 UTC (permalink / raw)
  To: Doug Ledford; +Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <1477575391-20134-6-git-send-email-leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>

> Fixes: 319a441d1361 ("IB/core: Add receive flow steering support")

same comment as the other two, probably applies some, most or all
patches on your git hub branch, lets use what was submitted and not
something else, okay?
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH rdma-rc 12/12] IB/mlx5: Limit mkey page size to 2GB
From: Or Gerlitz @ 2016-11-17 22:38 UTC (permalink / raw)
  To: Doug Ledford; +Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <1477575407-20562-13-git-send-email-leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>

> Fixes: e126ba97dba9 ('mlx5: Add driver for Mellanox Connect-IB adapters')

the above line is different @ your git hub copy, bug as well?
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH rdma-rc 10/12] IB/mlx5: Fix reported max SGE calculation
From: Or Gerlitz @ 2016-11-17 22:36 UTC (permalink / raw)
  To: Doug Ledford; +Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <1477575407-20562-11-git-send-email-leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>

On Thu, Oct 27, 2016 at 3:36 PM, Leon Romanovsky <leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org> wrote:
> From: Eli Cohen <eli-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>
> Add the 512 bytes limit of RDMA READ and the size of remote
> address to the max SGE calculation.
>
> Fixes: e126ba97dba9 ('mlx5: Add driver for Mellanox Connect-IB adapters')

Doug, for some reason the above line is different in the commit
present @ your git hub tree, bug somewhere?
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH rdma-rc] IB/IPoIB: Remove can't use GFP_NOIO warning
From: Or Gerlitz @ 2016-11-17 22:30 UTC (permalink / raw)
  To: Kamal Heib, Leon Romanovsky
  Cc: Doug Ledford, linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	Mel Gorman, Jiri Kosina
In-Reply-To: <1478765808-12517-1-git-send-email-leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>

On Thu, Nov 10, 2016 at 10:16 AM, Leon Romanovsky wrote:
> From: Kamal Heib <kamalh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>
> Remove the warning print of "can't use of GFP_NOIO" to avoid prints in
> each QP creation when devices aren't supporting IB_QP_CREATE_USE_GFP_NOIO.
>
> This print become more annoying when the IPoIB interface is configured
> to work in connected mode.
>
> Fixes: 09b93088d750 ('IB: Add a QP creation flag to use GFP_NOIO allocations')

Hi Kamal,

I don't think you're fixing a bug in this commit... you find the print
annoying and you remove it, okay, maybe we can do that. Why not leave
it in rate-limited manner? can you elaborate what was that deeply
annoying with getting the warning?

> --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> @@ -1053,8 +1053,6 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_
>
>         tx_qp = ib_create_qp(priv->pd, &attr);
>         if (PTR_ERR(tx_qp) == -EINVAL) {
> -               ipoib_warn(priv, "can't use GFP_NOIO for QPs on device %s, using GFP_KERNEL\n",
> -                          priv->ca->name);
>                 attr.create_flags &= ~IB_QP_CREATE_USE_GFP_NOIO;
>                 tx_qp = ib_create_qp(priv->pd, &attr);
>         }
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PULL REQUEST] Please pull rdma.git
From: Or Gerlitz @ 2016-11-17 22:24 UTC (permalink / raw)
  To: Doug Ledford; +Cc: linux-rdma, Linux Kernel
In-Reply-To: <20161117200203.GQ4240-2ukJVAZIZ/Y@public.gmane.org>

On Thu, Nov 17, 2016 at 9:44 PM, Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> On 11/17/16 1:49 PM, Leon Romanovsky wrote:
>> On Thu, Nov 17, 2016 at 07:13:54AM -0500, Doug Ledford wrote:
>>> Hi Linus,
>>>
>>> Due to various issues, I've been away and couldn't send a pull request
>>> for about three weeks.  There were a number of -rc patches that built up
>>> in the meantime (some where there already from the early -rc stages).
>>> Obviously, there were way too many to send now, so I tried to pare the
>>> list down to the more important patches for the -rc cycle.

Hi Doug,

Except for the hfi1 patches, all the rest (core, mlx5, mlx4 and rxe)
are marked now as only 21 hours old in your 4.9-rc branch and they
seems be made from you picking partial subsets of multiple series,
with none of them acked by you on the list.

If you agree that I am describing things correctly - how are we
expected to follow on your patch picking? I find it sort of impossible
and error prone.

>> Are you adding the rest to your for-next branch? We would like to have
>> enough time to check that nothing is lost.

> Yes, it's already there in the mlx-next branch on github.

Re the patches there, this one

IB/mlx4: Set traffic class in AH

"Set traffic class within sl_tclass_flowlabel when create iboe AH.
Without this the TOS value will be empty when running VLAN tagged
traffic, because the TOS value is taken from the traffic class in the
address handle attributes.

Fixes: 9106c41 ('IB/mlx4: Fix SL to 802.1Q priority-bits mapping for IBoE')"

claims to fix my commit, I have approached Leon and Co for
clarifications/questions over the list on the patch and nothing was
answered.

Please do not send it to Linus and wait for them to respond. I
disagree that it fixes my commit b/c my commit was prior to when
route-able RoCE  was introduced and on that time TOS had no relation.

and this one

"IB/mlx4: Put non zero value in max_ah device attribute

Use INT_MAX since this is the max value the attribute can hold, though
hardware capability is unlimited.

Fixes: 225c7b1 ('IB/mlx4: Add a driver Mellanox ConnectX InfiniBand adapters')"

does a tiny enhancement for a 10y old commit of Roland, why you think
we need it in 4.9-rc6 or 7??

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: NFSD generic R/W API (sendto path) performance results
From: Chuck Lever @ 2016-11-17 20:42 UTC (permalink / raw)
  To: Sagi Grimberg; +Cc: Christoph Hellwig, Steve Wise, List Linux RDMA Mailing
In-Reply-To: <c6190e4c-9b8e-3937-ba38-7861eebeaaae-NQWnxTmZq1alnMjI0IkVqw@public.gmane.org>

> On Nov 17, 2016, at 3:20 PM, Sagi Grimberg <sagi-NQWnxTmZq1alnMjI0IkVqw@public.gmane.org> wrote:
> 
> 
>>>> Also did you try to always register for > max_sge
>>>> calls?  The code can already register all segments with the
>>>> rdma_rw_force_mr module option, so it would only need a small tweak for
>>>> that behavior.
>>> 
>>> For various reasons I decided the design should build one WR chain for
>>> each RDMA segment provided by the client. Good clients expose just
>>> one RDMA segment for the whole NFS READ payload.
>>> 
>>> Does force_mr make the generic API use FRWR with RDMA Write? I had
>>> assumed it changed only the behavior with RDMA Read. I'll try that
>>> too, if RDMA Write can easily be made to use FRWR.
>> 
>> Unfortunately, some RPC replies are formed from two or three
>> discontiguous buffers. The gap test in ib_sg_to_pages returns
>> a smaller number than sg_nents in this case, and rdma_rw_init_ctx
>> fails.
>> 
>> Thus with my current prototype I'm not able to test with FRWR.
>> 
>> I could fix this in my prototype, but it would be nicer for me if
>> rdma_rw_init_ctx handled this case the same for FRWR as it does
>> for physical addressing, which doesn't seem to have any problem
>> with a discontiguous SGL.
>> 
>> 
>>> But I'd like a better explanation for this result. Could be a bug
>>> in my implementation, my design, or in the driver. Continuing to
>>> investigate.
> 
> Hi Chuck, sorry for the late reply (have been busy lately..)
> 
> I think that the Call-to-first-Write phenomenon you are seeing makes
> perfect sense, the question is, is a QD=1 1M transfers latency that
> interesting? Did you see a positive effect on small (say 4k) transfers?
> both latency and iops scalability should be able to improve especially
> when serving multiple clients.
> 
> If indeed you feel that this is an interesting workload to optimize, I
> think we can come up with something.

I believe 1MB transfers are interesting: NFS is frequently used in
back-up scenarios, for example, and believe it or not, also for
non-linear editing and animation (4K video).

QD=1 exposes the individual components of latency. In this case, we
can clearly see the cost of preparing the data payload for transfer.
It's basically a tweak so we can debug the problem.

In the "Real World" I expect to see larger transfers, where several
1MB I/Os are dispatched in parallel. I don't reach fabric bandwidth
until 10 or more are in flight, which I think should be improved.

Wrt 4KB, I didn't see much change there, though I admit I didn't
expect much change since both cases have to DMA map a page, and post
one Write WR, so I haven't looked too closely. I'm already down
around 32us for a 4KB NFS READ, even without the server changes,
and 30us for 4KB NFS READ all-inline.

I agree, though, that there is a 3:1 reduction in ib_post_send calls
with the generic API in my test harness, and that can only be a good
thing.

> About the First-to-last-Write, thats weird, and sound like a bug
> somewhere. Maybe Mellanox folks can tell us if splitting 1M to multiple
> writes works better (although I cannot comprehend why).
> 
> Question, are the send and receive cqs still in IB_POLL_SOFTIRQ mode?

Yes, both are still in SOFTIRQ. A while back I played with using the
Work Queue mode, and noticed some slowdowns when the Send CQ was
changed to use Work Queue. I didn't explore it further.

Someday, these will need to be changed, so that the server runs
entirely in process context (as it does for TCP, AFAICT). That gets
rid of BH flapping, but adds heavy-weight context switching latencies.

--
Chuck Lever

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: NFSD generic R/W API (sendto path) performance results
From: Chuck Lever @ 2016-11-17 20:20 UTC (permalink / raw)
  To: Steve Wise; +Cc: Christoph Hellwig, Sagi Grimberg, List Linux RDMA Mailing
In-Reply-To: <01f001d2410d$b16c97f0$1445c7d0$@opengridcomputing.com>


> On Nov 17, 2016, at 3:03 PM, Steve Wise <swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org> wrote:
> 
>>> 
>>> On Nov 17, 2016, at 10:04 AM, Chuck Lever <chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org> wrote:
>>> 
>>>> On Nov 17, 2016, at 7:46 AM, Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org> wrote:
>>>> Also did you try to always register for > max_sge
>>>> calls?  The code can already register all segments with the
>>>> rdma_rw_force_mr module option, so it would only need a small tweak for
>>>> that behavior.
>>> 
>>> For various reasons I decided the design should build one WR chain for
>>> each RDMA segment provided by the client. Good clients expose just
>>> one RDMA segment for the whole NFS READ payload.
>>> 
>>> Does force_mr make the generic API use FRWR with RDMA Write? I had
>>> assumed it changed only the behavior with RDMA Read. I'll try that
>>> too, if RDMA Write can easily be made to use FRWR.
>> 
>> Unfortunately, some RPC replies are formed from two or three
>> discontiguous buffers. The gap test in ib_sg_to_pages returns
>> a smaller number than sg_nents in this case, and rdma_rw_init_ctx
>> fails.
>> 
>> Thus with my current prototype I'm not able to test with FRWR.
>> 
>> I could fix this in my prototype, but it would be nicer for me if
>> rdma_rw_init_ctx handled this case the same for FRWR as it does
>> for physical addressing, which doesn't seem to have any problem
>> with a discontiguous SGL.
> 
> Just to make sure I'm understanding you, for rdma-rw to handle this,  it would
> have to use multiple REG_MR registrations, one for each contiguous area in the
> scatter list.  
> 
> Right?

Right, that's the approach the NFS client takes. See
net/sunrpc/xprtrdma/frwr_ops.c :: frwr_op_map.

If the passed-in memory list isn't contiguous, frwr_op_map stops
registering and returns to the caller, who allocates another
MR and calls in again with the remaining part of the list.

I think this would not apply to SG_GAP MRs, which should
already be able to handle discontiguous SGLs?

Note this doesn't apply to most NFS READs, where just the data
payload is going via RDMA Write, and the payload is already in
a contiguous piece of memory. But Reply chunks, which are used
for READDIRs and other requests, can be built from discontiguous
memory.

I haven't looked closely at the RDMA Read logic, but I think
it always reads into a contiguous set of pages, then builds
the xdr_buf out of that. It shouldn't have the same problem
(and it is already known to work with FRWR ;-).


--
Chuck Lever



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: NFSD generic R/W API (sendto path) performance results
From: Sagi Grimberg @ 2016-11-17 20:20 UTC (permalink / raw)
  To: Chuck Lever, Christoph Hellwig; +Cc: Steve Wise, List Linux RDMA Mailing
In-Reply-To: <676323E9-2F30-4DB0-AEF8-CDE38E8A0715-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>


>>> Also did you try to always register for > max_sge
>>> calls?  The code can already register all segments with the
>>> rdma_rw_force_mr module option, so it would only need a small tweak for
>>> that behavior.
>>
>> For various reasons I decided the design should build one WR chain for
>> each RDMA segment provided by the client. Good clients expose just
>> one RDMA segment for the whole NFS READ payload.
>>
>> Does force_mr make the generic API use FRWR with RDMA Write? I had
>> assumed it changed only the behavior with RDMA Read. I'll try that
>> too, if RDMA Write can easily be made to use FRWR.
>
> Unfortunately, some RPC replies are formed from two or three
> discontiguous buffers. The gap test in ib_sg_to_pages returns
> a smaller number than sg_nents in this case, and rdma_rw_init_ctx
> fails.
>
> Thus with my current prototype I'm not able to test with FRWR.
>
> I could fix this in my prototype, but it would be nicer for me if
> rdma_rw_init_ctx handled this case the same for FRWR as it does
> for physical addressing, which doesn't seem to have any problem
> with a discontiguous SGL.
>
>
>> But I'd like a better explanation for this result. Could be a bug
>> in my implementation, my design, or in the driver. Continuing to
>> investigate.

Hi Chuck, sorry for the late reply (have been busy lately..)

I think that the Call-to-first-Write phenomenon you are seeing makes
perfect sense, the question is, is a QD=1 1M transfers latency that
interesting? Did you see a positive effect on small (say 4k) transfers?
both latency and iops scalability should be able to improve especially
when serving multiple clients.

If indeed you feel that this is an interesting workload to optimize, I
think we can come up with something.

About the First-to-last-Write, thats weird, and sound like a bug
somewhere. Maybe Mellanox folks can tell us if splitting 1M to multiple
writes works better (although I cannot comprehend why).

Question, are the send and receive cqs still in IB_POLL_SOFTIRQ mode?
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* RE: NFSD generic R/W API (sendto path) performance results
From: Steve Wise @ 2016-11-17 20:03 UTC (permalink / raw)
  To: 'Chuck Lever', 'Christoph Hellwig'
  Cc: 'Sagi Grimberg', 'List Linux RDMA Mailing'
In-Reply-To: <676323E9-2F30-4DB0-AEF8-CDE38E8A0715-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>

> 
> > On Nov 17, 2016, at 10:04 AM, Chuck Lever <chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org> wrote:
> >
> >> On Nov 17, 2016, at 7:46 AM, Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org> wrote:
> >> Also did you try to always register for > max_sge
> >> calls?  The code can already register all segments with the
> >> rdma_rw_force_mr module option, so it would only need a small tweak for
> >> that behavior.
> >
> > For various reasons I decided the design should build one WR chain for
> > each RDMA segment provided by the client. Good clients expose just
> > one RDMA segment for the whole NFS READ payload.
> >
> > Does force_mr make the generic API use FRWR with RDMA Write? I had
> > assumed it changed only the behavior with RDMA Read. I'll try that
> > too, if RDMA Write can easily be made to use FRWR.
> 
> Unfortunately, some RPC replies are formed from two or three
> discontiguous buffers. The gap test in ib_sg_to_pages returns
> a smaller number than sg_nents in this case, and rdma_rw_init_ctx
> fails.
> 
> Thus with my current prototype I'm not able to test with FRWR.
> 
> I could fix this in my prototype, but it would be nicer for me if
> rdma_rw_init_ctx handled this case the same for FRWR as it does
> for physical addressing, which doesn't seem to have any problem
> with a discontiguous SGL.

Just to make sure I'm understanding you, for rdma-rw to handle this,  it would
have to use multiple REG_MR registrations, one for each contiguous area in the
scatter list.  

Right?

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PULL REQUEST] Please pull rdma.git
From: Leon Romanovsky @ 2016-11-17 20:02 UTC (permalink / raw)
  To: Doug Ledford; +Cc: linux-rdma
In-Reply-To: <582E089A.3040106-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

[-- Attachment #1: Type: text/plain, Size: 1237 bytes --]

On Thu, Nov 17, 2016 at 02:44:26PM -0500, Doug Ledford wrote:
> On 11/17/16 1:49 PM, Leon Romanovsky wrote:
> > On Thu, Nov 17, 2016 at 07:13:54AM -0500, Doug Ledford wrote:
> >> Hi Linus,
> >>
> >> Due to various issues, I've been away and couldn't send a pull request
> >> for about three weeks.  There were a number of -rc patches that built up
> >> in the meantime (some where there already from the early -rc stages).
> >> Obviously, there were way too many to send now, so I tried to pare the
> >> list down to the more important patches for the -rc cycle.
> >
> > Hi Doug,
> > Are you adding the rest to your for-next branch? We would like to have
> > enough time to check that nothing is lost.
> >
> > Thanks
> >
>
> Yes, it's already there in the mlx-next branch on github.

Thanks,
Do I understand you correctly that it is not final version and you
didn't upload general branch yet?

There is patch for the MAD which will be great to see in the list too.
https://patchwork.kernel.org/patch/9382745/
http://marc.info/?l=linux-rdma&m=147681123708587&w=2

Thanks

>
> --
> Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>    GPG Key ID: 0E572FDD
>   Red Hat, Inc.
>   100 E. Davie St
>   Raleigh, NC 27601 USA
>




[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply

* Re: [PULL REQUEST] Please pull rdma.git
From: Doug Ledford @ 2016-11-17 19:44 UTC (permalink / raw)
  To: Leon Romanovsky; +Cc: linux-rdma
In-Reply-To: <20161117184950.GP4240-2ukJVAZIZ/Y@public.gmane.org>


[-- Attachment #1.1: Type: text/plain, Size: 866 bytes --]

On 11/17/16 1:49 PM, Leon Romanovsky wrote:
> On Thu, Nov 17, 2016 at 07:13:54AM -0500, Doug Ledford wrote:
>> Hi Linus,
>>
>> Due to various issues, I've been away and couldn't send a pull request
>> for about three weeks.  There were a number of -rc patches that built up
>> in the meantime (some where there already from the early -rc stages).
>> Obviously, there were way too many to send now, so I tried to pare the
>> list down to the more important patches for the -rc cycle.
> 
> Hi Doug,
> Are you adding the rest to your for-next branch? We would like to have
> enough time to check that nothing is lost.
> 
> Thanks
> 

Yes, it's already there in the mlx-next branch on github.

-- 
Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>    GPG Key ID: 0E572FDD
  Red Hat, Inc.
  100 E. Davie St
  Raleigh, NC 27601 USA


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 907 bytes --]

^ permalink raw reply

* Re: NFSD generic R/W API (sendto path) performance results
From: Chuck Lever @ 2016-11-17 19:20 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Steve Wise, Sagi Grimberg, List Linux RDMA Mailing
In-Reply-To: <84B43CFF-EBF7-4758-8751-8C97102C5BCF-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>


> On Nov 17, 2016, at 10:04 AM, Chuck Lever <chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org> wrote:
> 
>> On Nov 17, 2016, at 7:46 AM, Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org> wrote:
>> Also did you try to always register for > max_sge
>> calls?  The code can already register all segments with the
>> rdma_rw_force_mr module option, so it would only need a small tweak for
>> that behavior.
> 
> For various reasons I decided the design should build one WR chain for
> each RDMA segment provided by the client. Good clients expose just
> one RDMA segment for the whole NFS READ payload.
> 
> Does force_mr make the generic API use FRWR with RDMA Write? I had
> assumed it changed only the behavior with RDMA Read. I'll try that
> too, if RDMA Write can easily be made to use FRWR.

Unfortunately, some RPC replies are formed from two or three
discontiguous buffers. The gap test in ib_sg_to_pages returns
a smaller number than sg_nents in this case, and rdma_rw_init_ctx
fails.

Thus with my current prototype I'm not able to test with FRWR.

I could fix this in my prototype, but it would be nicer for me if
rdma_rw_init_ctx handled this case the same for FRWR as it does
for physical addressing, which doesn't seem to have any problem
with a discontiguous SGL.


> But I'd like a better explanation for this result. Could be a bug
> in my implementation, my design, or in the driver. Continuing to
> investigate.


--
Chuck Lever



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PULL REQUEST] Please pull rdma.git
From: Leon Romanovsky @ 2016-11-17 18:49 UTC (permalink / raw)
  To: Doug Ledford; +Cc: linux-rdma
In-Reply-To: <58466423-c87e-3921-101e-bffab8989fd8-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

[-- Attachment #1: Type: text/plain, Size: 562 bytes --]

On Thu, Nov 17, 2016 at 07:13:54AM -0500, Doug Ledford wrote:
> Hi Linus,
>
> Due to various issues, I've been away and couldn't send a pull request
> for about three weeks.  There were a number of -rc patches that built up
> in the meantime (some where there already from the early -rc stages).
> Obviously, there were way too many to send now, so I tried to pare the
> list down to the more important patches for the -rc cycle.

Hi Doug,
Are you adding the rest to your for-next branch? We would like to have
enough time to check that nothing is lost.

Thanks

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply

* RE: rdma-core release process questions
From: Nikolova, Tatyana E @ 2016-11-17 18:21 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: Jason Gunthorpe, Doug Ledford,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <20161117102846.GJ4240-2ukJVAZIZ/Y@public.gmane.org>

Hi Leon and Jason,

Thank you for the information.

Tatyana

-----Original Message-----
From: Leon Romanovsky [mailto:leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org] 
Sent: Thursday, November 17, 2016 4:29 AM
To: Nikolova, Tatyana E <tatyana.e.nikolova-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Cc: Jason Gunthorpe <jgunthorpe-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>; Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>; linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Subject: Re: rdma-core release process questions

On Thu, Nov 17, 2016 at 04:56:45AM +0000, Nikolova, Tatyana E wrote:
> Hi,
>
> We are submitting patches to the kernel space driver i40iw and to the user space plugin libi40iw, which is currently part of rdma-core. Some of the changes need to be coordinated so that they appear in both kernel space and user space in corresponding releases. We have some questions regarding the process about submitting patches to rdma-core which have dependencies on kernel patches.
>
> 1) Can user space patches target a for-next rdma-core release, if the corresponding kernel patches are queued for the next kernel?

I don't see any problem with that, once the patches accepted for the -next by Doug, they can be accepted to the rdma-core too. Anyway these changes should be compatible with old kernel without such new feature.

> 2) How are ABI changes handled in rdma-core?

Do you have specific thing in mind?
Generally speaking, send to ML pass review and we will apply.

>
> 3) Could you explain the release process for rdma-core?

The process as agreed will be something like that:
a. Review/accept/decline patches in 1-2 weeks time frame.
b. Once kernel released, stop accepting new features.
c. Wait for 1-2 weeks to see no one complains. It is just to be on safe side, because the library is always ready for release and checked constantly.
d. Create new tag and push release.

>
> 4) Is each rdma-core release going to correspond to a specific kernel version?

As Jason wrote, It will be aligned in release time to the kernel, but library should remain backward compatible.

>
> Thank you,
> Tatyana
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH rdma-next 0/4] Add packet pacing support for IB verbs
From: Leon Romanovsky @ 2016-11-17 18:15 UTC (permalink / raw)
  To: dledford-H+wXaHxf7aLQT0dZR+AlfA; +Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1477909297-14491-1-git-send-email-leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>

[-- Attachment #1: Type: text/plain, Size: 2661 bytes --]

On Mon, Oct 31, 2016 at 12:21:33PM +0200, Leon Romanovsky wrote:
> When sending from a 10G host to a 1G host, it is easy to overrun the receiver,
> leading to packet loss and traffic backing off. Similar problems occur when
> a 10G host sends data to a sub-10G virtual circuit, or a 40G host sending
> to a 10G host. Packet pacing could control packet injection rate and reduces
> network congestion to maximize throughput & minimize network latency.
>
> Packet pacing is a rate limiting and shaping for a QP (SQ for RAW QP), set
> and change the rate is done by modifying QP. This series of patch made the
> following high level changes:
>  1. Report rate limit capabilities through user data. Reported capabilities
>     include: The maximum and minimum rate limit in kbps supported by packet
>     pacing; Bitmap showing which QP types are supported by packet pacing
>     operation.
>  2. Extend modify QP interface for growing attributes. Add rate limit support
>     to the extended interface.
>  3. Enable mlx5-based hardware to be able to update the rate limit for
>     RAW QP packet.
>
> Available in the "topic/packet_pacing" topic branch of this git repo:
> git://git.kernel.org/pub/scm/linux/kernel/git/leon/linux-rdma.git
>
> Or for browsing:
> https://git.kernel.org/cgit/linux/kernel/git/leon/linux-rdma.git/log/?h=topic/packet_pacing

Hi Doug,

Please drop this patch series, we discovered an issue with proposed
modify_qp implementation and will respin it.

Sorry for the inconvenience.

Thanks

>
> Thanks,
>   Bodong & Leon
>
> Bodong Wang (4):
>   IB/mlx5: Report mlx5 packet pacing capabilities when querying device
>   IB/core: Support rate limit for packet pacing
>   IB/uverbs: Extend modify_qp and support packet pacing
>   IB/mlx5: Update the rate limit according to user setting for RAW QP
>
>  drivers/infiniband/core/uverbs.h      |   1 +
>  drivers/infiniband/core/uverbs_cmd.c  | 178 +++++++++++++++++++++-------------
>  drivers/infiniband/core/uverbs_main.c |   1 +
>  drivers/infiniband/core/verbs.c       |   2 +
>  drivers/infiniband/hw/mlx5/main.c     |  16 ++-
>  drivers/infiniband/hw/mlx5/mlx5_ib.h  |   1 +
>  drivers/infiniband/hw/mlx5/qp.c       |  71 ++++++++++++--
>  include/rdma/ib_verbs.h               |   2 +
>  include/uapi/rdma/ib_user_verbs.h     |  12 +++
>  include/uapi/rdma/mlx5-abi.h          |  13 +++
>  10 files changed, 219 insertions(+), 78 deletions(-)
>
> --
> 2.7.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply

* Re: [patch] IB/rxe: Remove unneeded cast in rxe_srq_from_attr()
From: Yuval Shaia @ 2016-11-17 16:38 UTC (permalink / raw)
  To: Dan Carpenter
  Cc: Moni Shoua, Doug Ledford, Sean Hefty, Hal Rosenstock,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	kernel-janitors-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20161117121554.GA4292-Hxa29pjIrETlQW142y8m19+IiqhCXseY@public.gmane.org>

Besides the soft-aggressive commit message -:)
Reviewed-by: Yuval Shaia <yuval.shaia-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
 
On Thu, Nov 17, 2016 at 02:00:05PM +0300, Dan Carpenter wrote:
> It makes me nervous when we cast pointer parameters.  I would estimate
> that around 50% of the time, it indicates a bug.  Here the cast is not
> needed becaue u32 and and unsigned int are the same thing.  Removing the
> cast makes the code more robust and future proof in case any of the
> types change.
> 
> Signed-off-by: Dan Carpenter <dan.carpenter-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
> 
> diff --git a/drivers/infiniband/sw/rxe/rxe_srq.c b/drivers/infiniband/sw/rxe/rxe_srq.c
> index 2a6e3cd..efc832a 100644
> --- a/drivers/infiniband/sw/rxe/rxe_srq.c
> +++ b/drivers/infiniband/sw/rxe/rxe_srq.c
> @@ -169,7 +169,7 @@ int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
>  			}
>  		}
>  
> -		err = rxe_queue_resize(q, (unsigned int *)&attr->max_wr,
> +		err = rxe_queue_resize(q, &attr->max_wr,
>  				       rcv_wqe_size(srq->rq.max_sge),
>  				       srq->rq.queue->ip ?
>  						srq->rq.queue->ip->context :
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: NFSD generic R/W API (sendto path) performance results
From: Chuck Lever @ 2016-11-17 15:04 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Steve Wise, Sagi Grimberg, List Linux RDMA Mailing
In-Reply-To: <20161117124602.GA25821-jcswGhMUV9g@public.gmane.org>


> On Nov 17, 2016, at 7:46 AM, Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org> wrote:
> 
> On Wed, Nov 16, 2016 at 02:45:33PM -0500, Chuck Lever wrote:
>> Out of curiosity, I hacked up my NFS client to limit the size of RDMA
>> segments to 30 pages (the server HCA's max_sge).
>> 
>> A 1MB NFS READ now takes 9 segments. That forces the after-conversion
>> server to build single-Write chains and use 9 post_send calls to
>> transmit the READ payload, just like the before-conversion server.
>> 
>> Performance of before- and after-conversion servers is now equivalent.
>> 
>>              kB  reclen    write  rewrite    read    reread
>>         2097152    1024  1061237  1141614  1961410  2000223                                                                                  
> 
> What HCA is this, btw?

ConnectX-3 Pro, f/w 2.31.5050


> Also did you try to always register for > max_sge
> calls?  The code can already register all segments with the
> rdma_rw_force_mr module option, so it would only need a small tweak for
> that behavior.

For various reasons I decided the design should build one WR chain for
each RDMA segment provided by the client. Good clients expose just
one RDMA segment for the whole NFS READ payload.

Does force_mr make the generic API use FRWR with RDMA Write? I had
assumed it changed only the behavior with RDMA Read. I'll try that
too, if RDMA Write can easily be made to use FRWR.

But I'd like a better explanation for this result. Could be a bug
in my implementation, my design, or in the driver. Continuing to
investigate.


--
Chuck Lever



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox