Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next 5/5] qed*: Add support for read/write of eeprom
From: Yuval Mintz @ 2016-04-17 19:26 UTC (permalink / raw)
  To: davem, netdev; +Cc: Sudarsana Reddy Kalluru, Yuval Mintz
In-Reply-To: <1460921195-23352-1-git-send-email-Yuval.Mintz@qlogic.com>

From: Sudarsana Reddy Kalluru <sudarsana.kalluru@qlogic.com>

Add the driver logic needed for supporting the ethtool APIs for reading
and writing from the interface's eeprom.

There's quite a bit of `magic' here [based on the ethtool `magic' field]
which is retained by the user application responsible for interacting
with the driver for the purpose of upgrading the nvram image.

Basically, the interface goes via MFW which is responsible for actually
programming the non-volatile memory. There are 2 supported modes of
operations, one which is `file'-based and one which is based on raw data.

Signed-off-by: Sudarsana Reddy Kalluru <sudarsana.kalluru@qlogic.com>
Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
---
 drivers/net/ethernet/qlogic/qed/qed.h           |  10 +
 drivers/net/ethernet/qlogic/qed/qed_main.c      |  37 ++++
 drivers/net/ethernet/qlogic/qed/qed_mcp.c       | 274 ++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_mcp.h       | 103 +++++++++
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c |  30 +++
 include/linux/qed/qed_if.h                      |  30 +++
 6 files changed, 484 insertions(+)

diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index 33e2ed6..025248f 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -38,6 +38,16 @@ enum qed_coalescing_mode {
 	QED_COAL_MODE_ENABLE
 };
 
+enum qed_nvm_cmd {
+	QED_PUT_FILE_BEGIN	= DRV_MSG_CODE_NVM_PUT_FILE_BEGIN,
+	QED_PUT_FILE_DATA	= DRV_MSG_CODE_NVM_PUT_FILE_DATA,
+	QED_NVM_READ_NVRAM	= DRV_MSG_CODE_NVM_READ_NVRAM,
+	QED_NVM_WRITE_NVRAM	= DRV_MSG_CODE_NVM_WRITE_NVRAM,
+	QED_NVM_DEL_FILE	= DRV_MSG_CODE_NVM_DEL_FILE,
+	QED_NVM_SET_SECURE_MODE = DRV_MSG_CODE_SET_SECURE_MODE,
+	QED_GET_MCP_NVM_RESP	= 0xFFFFFF00
+};
+
 struct qed_eth_cb_ops;
 struct qed_dev_info;
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index 1918b83..78b5966 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -1165,6 +1165,41 @@ static int qed_drain(struct qed_dev *cdev)
 	return 0;
 }
 
+int qed_nvm_get_cmd(struct qed_dev *cdev, u32 cmd, u32 addr,
+		    u8 *buf, u32 len)
+{
+	switch (cmd) {
+	case QED_NVM_READ_NVRAM:
+		return qed_mcp_nvm_read(cdev, addr, buf, len);
+	case QED_GET_MCP_NVM_RESP:
+		return qed_mcp_nvm_resp(cdev, buf);
+	default:
+		cdev->mcp_nvm_resp = FW_MSG_CODE_NVM_OPERATION_FAILED;
+		DP_NOTICE(cdev, "Unknown command %d\n", cmd);
+		return -EOPNOTSUPP;
+	}
+}
+
+int qed_nvm_set_cmd(struct qed_dev *cdev, u32 cmd, u32 addr,
+		    u8 *buf, u32 len)
+{
+	switch (cmd) {
+	case QED_NVM_DEL_FILE:
+		return qed_mcp_nvm_del_file(cdev, addr);
+	case QED_PUT_FILE_BEGIN:
+		return qed_mcp_nvm_put_file_begin(cdev, addr);
+	case QED_PUT_FILE_DATA:
+	case QED_NVM_WRITE_NVRAM:
+		return qed_mcp_nvm_write(cdev, cmd, addr, buf, len);
+	case QED_NVM_SET_SECURE_MODE:
+		return qed_mcp_nvm_set_secure_mode(cdev, addr);
+	default:
+		cdev->mcp_nvm_resp = FW_MSG_CODE_NVM_OPERATION_FAILED;
+		DP_NOTICE(cdev, "Unknown command %d\n", cmd);
+		return -EOPNOTSUPP;
+	}
+}
+
 static int qed_set_led(struct qed_dev *cdev, enum qed_led_mode mode)
 {
 	struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev);
@@ -1203,5 +1238,7 @@ const struct qed_common_ops qed_common_ops_pass = {
 	.update_msglvl = &qed_init_dp,
 	.chain_alloc = &qed_chain_alloc,
 	.chain_free = &qed_chain_free,
+	.nvm_get_cmd = &qed_nvm_get_cmd,
+	.nvm_set_cmd = &qed_nvm_set_cmd,
 	.set_led = &qed_set_led,
 };
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index b89c9a8..1812ff9 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -395,6 +395,60 @@ int qed_mcp_cmd(struct qed_hwfn *p_hwfn,
 	return 0;
 }
 
+static int qed_mcp_nvm_wr_cmd(struct qed_hwfn *p_hwfn,
+			      struct qed_ptt *p_ptt,
+			      u32 cmd, u32 param,
+			      u32 *o_mcp_resp, u32 *o_mcp_param,
+			      u32 i_txn_size, u32 *i_buf)
+{
+	struct qed_mcp_mb_params mb_params;
+	union drv_union_data union_data;
+	int rc;
+
+	memset(&mb_params, 0, sizeof(mb_params));
+	mb_params.cmd = cmd;
+	mb_params.param = param;
+	memcpy(&union_data.raw_data, i_buf, i_txn_size);
+	mb_params.p_data_src = &union_data;
+
+	rc = qed_mcp_cmd_and_union(p_hwfn, p_ptt, &mb_params);
+	if (rc)
+		return rc;
+
+	*o_mcp_resp = mb_params.mcp_resp;
+	*o_mcp_param = mb_params.mcp_param;
+
+	return 0;
+}
+
+static int qed_mcp_nvm_rd_cmd(struct qed_hwfn *p_hwfn,
+			      struct qed_ptt *p_ptt,
+			      u32 cmd, u32 param,
+			      u32 *o_mcp_resp, u32 *o_mcp_param,
+			      u32 *o_txn_size, u32 *o_buf)
+{
+	struct qed_mcp_mb_params mb_params;
+	union drv_union_data union_data;
+	int rc;
+
+	memset(&mb_params, 0, sizeof(mb_params));
+	mb_params.cmd = cmd;
+	mb_params.param = param;
+	mb_params.p_data_dst = &union_data;
+
+	rc = qed_mcp_cmd_and_union(p_hwfn, p_ptt, &mb_params);
+	if (rc)
+		return rc;
+
+	*o_mcp_resp = mb_params.mcp_resp;
+	*o_mcp_param = mb_params.mcp_param;
+
+	*o_txn_size = *o_mcp_param;
+	memcpy(o_buf, &union_data.raw_data, *o_txn_size);
+
+	return 0;
+}
+
 int qed_mcp_load_req(struct qed_hwfn *p_hwfn,
 		     struct qed_ptt *p_ptt,
 		     u32 *p_load_code)
@@ -908,6 +962,43 @@ int qed_mcp_drain(struct qed_hwfn *p_hwfn,
 	return rc;
 }
 
+static int qed_mcp_nvm_command(struct qed_hwfn *p_hwfn,
+			       struct qed_ptt *p_ptt,
+			       struct qed_mcp_nvm_params *params)
+{
+	int rc;
+
+	switch (params->type) {
+	case QED_MCP_NVM_RD:
+		rc = qed_mcp_nvm_rd_cmd(p_hwfn, p_ptt, params->nvm_common.cmd,
+					params->nvm_common.offset,
+					&params->nvm_common.resp,
+					&params->nvm_common.param,
+					params->nvm_rd.buf_size,
+					params->nvm_rd.buf);
+		break;
+	case QED_MCP_CMD:
+		rc = qed_mcp_cmd(p_hwfn, p_ptt, params->nvm_common.cmd,
+				 params->nvm_common.offset,
+				 &params->nvm_common.resp,
+				 &params->nvm_common.param);
+		break;
+	case QED_MCP_NVM_WR:
+		rc = qed_mcp_nvm_wr_cmd(p_hwfn, p_ptt, params->nvm_common.cmd,
+					params->nvm_common.offset,
+					&params->nvm_common.resp,
+					&params->nvm_common.param,
+					params->nvm_wr.buf_size,
+					params->nvm_wr.buf);
+		break;
+	default:
+		rc = -EINVAL;
+		break;
+	}
+
+	return rc;
+}
+
 int qed_mcp_get_flash_size(struct qed_hwfn *p_hwfn,
 			   struct qed_ptt *p_ptt,
 			   u32 *p_flash_size)
@@ -979,3 +1070,186 @@ int qed_mcp_set_led(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
 
 	return rc;
 }
+
+int qed_mcp_nvm_read(struct qed_dev *cdev,
+		     u32 addr,
+		     u8 *p_buf,
+		     u32 len)
+{
+	u32 bytes_left = len, offset = 0, bytes_to_copy, buf_size;
+	struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev);
+	struct qed_mcp_nvm_params params;
+	struct qed_ptt *p_ptt;
+	int rc = 0;
+
+	p_ptt = qed_ptt_acquire(p_hwfn);
+	if (!p_ptt)
+		return -EBUSY;
+
+	memset(&params, 0, sizeof(struct qed_mcp_nvm_params));
+	params.type = QED_MCP_NVM_RD;
+	params.nvm_rd.buf_size = &buf_size;
+	params.nvm_common.cmd = DRV_MSG_CODE_NVM_READ_NVRAM;
+
+	while (bytes_left > 0) {
+		bytes_to_copy = min_t(u32, bytes_left,
+				      MCP_DRV_NVM_BUF_LEN);
+		params.nvm_common.offset = (addr + offset) |
+					   (bytes_to_copy <<
+					    DRV_MB_PARAM_NVM_LEN_SHIFT);
+		params.nvm_rd.buf = (u32 *)(p_buf + offset);
+
+		rc = qed_mcp_nvm_command(p_hwfn, p_ptt, &params);
+		if (rc || (params.nvm_common.resp != FW_MSG_CODE_NVM_OK)) {
+			DP_NOTICE(cdev, "MCP command rc = %d\n",
+				  rc);
+			break;
+		}
+
+		offset += *params.nvm_rd.buf_size;
+		bytes_left -= *params.nvm_rd.buf_size;
+	}
+
+	cdev->mcp_nvm_resp = params.nvm_common.resp;
+	qed_ptt_release(p_hwfn, p_ptt);
+
+	return rc;
+}
+
+int qed_mcp_nvm_resp(struct qed_dev *cdev,
+		     u8 *p_buf)
+{
+	struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev);
+	struct qed_mcp_nvm_params params;
+	struct qed_ptt *p_ptt;
+
+	p_ptt = qed_ptt_acquire(p_hwfn);
+	if (!p_ptt)
+		return -EBUSY;
+
+	memset(&params, 0, sizeof(struct qed_mcp_nvm_params));
+	memcpy(p_buf, &cdev->mcp_nvm_resp, sizeof(cdev->mcp_nvm_resp));
+	qed_ptt_release(p_hwfn, p_ptt);
+
+	return 0;
+}
+
+int qed_mcp_nvm_del_file(struct qed_dev *cdev,
+			 u32 addr)
+{
+	struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev);
+	struct qed_mcp_nvm_params params;
+	struct qed_ptt *p_ptt;
+	int rc;
+
+	p_ptt = qed_ptt_acquire(p_hwfn);
+	if (!p_ptt)
+		return -EBUSY;
+
+	memset(&params, 0, sizeof(struct qed_mcp_nvm_params));
+	params.type = QED_MCP_CMD;
+	params.nvm_common.cmd = DRV_MSG_CODE_NVM_DEL_FILE;
+	params.nvm_common.offset = addr;
+
+	rc = qed_mcp_nvm_command(p_hwfn, p_ptt, &params);
+	cdev->mcp_nvm_resp = params.nvm_common.resp;
+	qed_ptt_release(p_hwfn, p_ptt);
+
+	return rc;
+}
+
+int qed_mcp_nvm_put_file_begin(struct qed_dev *cdev,
+			       u32 addr)
+{
+	struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev);
+	struct qed_mcp_nvm_params params;
+	struct qed_ptt *p_ptt;
+	int rc;
+
+	p_ptt = qed_ptt_acquire(p_hwfn);
+	if (!p_ptt)
+		return -EBUSY;
+
+	memset(&params, 0, sizeof(struct qed_mcp_nvm_params));
+	params.type = QED_MCP_CMD;
+	params.nvm_common.cmd = DRV_MSG_CODE_NVM_PUT_FILE_BEGIN;
+	params.nvm_common.offset = addr;
+
+	rc = qed_mcp_nvm_command(p_hwfn, p_ptt, &params);
+	cdev->mcp_nvm_resp = params.nvm_common.resp;
+	qed_ptt_release(p_hwfn, p_ptt);
+
+	return rc;
+}
+
+int qed_mcp_nvm_write(struct qed_dev *cdev,
+		      u32 cmd,
+		      u32 addr,
+		      u8 *p_buf,
+		      u32 len)
+{
+	struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev);
+	struct qed_mcp_nvm_params params;
+	struct qed_ptt *p_ptt;
+	u32 buf_idx = 0, buf_size;
+	int rc = -EINVAL;
+
+	p_ptt = qed_ptt_acquire(p_hwfn);
+	if (!p_ptt)
+		return -EBUSY;
+
+	memset(&params, 0, sizeof(struct qed_mcp_nvm_params));
+	params.type = QED_MCP_NVM_WR;
+	if (cmd == QED_PUT_FILE_DATA)
+		params.nvm_common.cmd = DRV_MSG_CODE_NVM_PUT_FILE_DATA;
+	else
+		params.nvm_common.cmd = DRV_MSG_CODE_NVM_WRITE_NVRAM;
+
+	while (buf_idx < len) {
+		buf_size = min_t(u32, (len - buf_idx),
+				 MCP_DRV_NVM_BUF_LEN);
+		params.nvm_common.offset = ((buf_size <<
+					     DRV_MB_PARAM_NVM_LEN_SHIFT) |
+					    addr) + buf_idx;
+		params.nvm_wr.buf_size	= buf_size;
+		params.nvm_wr.buf = (u32 *)&p_buf[buf_idx];
+
+		rc = qed_mcp_nvm_command(p_hwfn, p_ptt, &params);
+		if (rc ||
+		    ((params.nvm_common.resp != FW_MSG_CODE_NVM_OK) &&
+		     (params.nvm_common.resp !=
+		      FW_MSG_CODE_NVM_PUT_FILE_FINISH_OK)))
+			DP_NOTICE(cdev, "MCP command rc = %d\n", rc);
+
+		buf_idx += buf_size;
+	}
+
+	cdev->mcp_nvm_resp = params.nvm_common.resp;
+	qed_ptt_release(p_hwfn, p_ptt);
+
+	return rc;
+}
+
+int qed_mcp_nvm_set_secure_mode(struct qed_dev *cdev,
+				u32 addr)
+{
+	struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev);
+	struct qed_mcp_nvm_params params;
+	struct qed_ptt *p_ptt;
+	int rc;
+
+	p_ptt = qed_ptt_acquire(p_hwfn);
+	if (!p_ptt)
+		return -EBUSY;
+
+	memset(&params, 0, sizeof(struct qed_mcp_nvm_params));
+	params.type = QED_MCP_CMD;
+	params.nvm_common.cmd = DRV_MSG_CODE_SET_SECURE_MODE;
+	params.nvm_common.offset = addr;
+
+	rc =  qed_mcp_nvm_command(p_hwfn, p_ptt, &params);
+	cdev->mcp_nvm_resp = params.nvm_common.resp;
+	qed_ptt_release(p_hwfn, p_ptt);
+
+	return rc;
+}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
index 50917a2..ab0f0a5 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
@@ -92,6 +92,33 @@ struct qed_mcp_nvm_common {
 	u32	cmd;
 };
 
+struct qed_mcp_nvm_rd {
+	u32	*buf_size;
+	u32	*buf;
+};
+
+struct qed_mcp_nvm_wr {
+	u32	buf_size;
+	u32	*buf;
+};
+
+enum qed_mcp_num_parameters_type {
+	QED_MCP_CMD,
+	QED_MCP_NVM_RD,
+	QED_MCP_NVM_WR,
+};
+
+struct qed_mcp_nvm_params {
+	enum qed_mcp_num_parameters_type	type;
+
+	struct qed_mcp_nvm_common		nvm_common;
+
+	union {
+		struct qed_mcp_nvm_rd		nvm_rd;
+		struct qed_mcp_nvm_wr		nvm_wr;
+	};
+};
+
 struct qed_mcp_drv_version {
 	u32	version;
 	u8	name[MCP_DRV_VER_STR_SIZE - 4];
@@ -237,6 +264,82 @@ int qed_mcp_set_led(struct qed_hwfn *p_hwfn,
 		    struct qed_ptt *p_ptt,
 		    enum qed_led_mode mode);
 
+/**
+ * @brief Change security mode to allow writing faulty boards
+ *
+ *  @param cdev
+ *  @param addr - nvm offset
+ *
+ * @return int - 0 - operation was successful.
+ */
+int qed_mcp_nvm_set_secure_mode(struct qed_dev *cdev,
+				u32 addr);
+
+/**
+ * @brief Write to nvm
+ *
+ *  @param cdev
+ *  @param addr - nvm offset
+ *  @param cmd - nvm command
+ *  @param p_buf - nvm write buffer
+ *  @param len - buffer len
+ *
+ * @return int - 0 - operation was successful.
+ */
+int qed_mcp_nvm_write(struct qed_dev *cdev,
+		      u32 cmd,
+		      u32 addr,
+		      u8 *p_buf,
+		      u32 len);
+
+/**
+ * @brief Write a file image to nvram
+ *
+ *  @param cdev
+ *  @param addr - nvm offset
+ *
+ * @return int - 0 - operation was successful.
+ */
+int qed_mcp_nvm_put_file_begin(struct qed_dev *cdev,
+			       u32 addr);
+
+/**
+ * @brief Delete a file image from nvram
+ *
+ *  @param cdev
+ *  @param addr - nvm offset
+ *
+ * @return int - 0 - operation was successful.
+ */
+int qed_mcp_nvm_del_file(struct qed_dev *cdev,
+			 u32 addr);
+
+/**
+ * @brief Check latest mfw response in regard to nvm access
+ *
+ *  @param cdev
+ *  @param p_buf - nvm write buffer
+ *
+ * @return int - 0 - operation was successful.
+ */
+int qed_mcp_nvm_resp(struct qed_dev *cdev,
+		     u8 *p_buf);
+
+/**
+ * @brief Read from nvm
+ *
+ *  @param cdev
+ *  @param addr - nvm offset
+ *  @param p_buf - nvm write buffer
+ *  @param len - buffer len
+ *
+ * @return int - 0 - operation was successful.
+ */
+int qed_mcp_nvm_read(struct qed_dev *cdev,
+		     u32 addr,
+		     u8	*p_buf,
+		     u32 len);
+
 /* Using hwfn number (and not pf_num) is required since in CMT mode,
  * same pf_num may be used by two different hwfn
  * TODO - this shouldn't really be in .h file, but until all fields
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index 0f11685..50580d2 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -389,6 +389,33 @@ static u32 qede_get_link(struct net_device *dev)
 	return current_link.link_up;
 }
 
+static int qede_get_eeprom_len(struct net_device *ndev)
+{
+	struct qede_dev *edev = netdev_priv(ndev);
+
+	return edev->dev_info.common.flash_size;
+}
+
+static int qede_get_eeprom(struct net_device *dev,
+			    struct ethtool_eeprom *eeprom, u8 *eebuf)
+{
+	struct qede_dev *edev = netdev_priv(dev);
+
+	return edev->ops->common->nvm_get_cmd(edev->cdev, eeprom->magic,
+					      eeprom->offset, eebuf,
+					      eeprom->len);
+}
+
+static int qede_set_eeprom(struct net_device *dev,
+			    struct ethtool_eeprom *eeprom, u8 *eebuf)
+{
+	struct qede_dev *edev = netdev_priv(dev);
+
+	return edev->ops->common->nvm_set_cmd(edev->cdev, eeprom->magic,
+					      eeprom->offset, eebuf,
+					      eeprom->len);
+}
+
 static void qede_get_ringparam(struct net_device *dev,
 			       struct ethtool_ringparam *ering)
 {
@@ -839,6 +866,9 @@ static const struct ethtool_ops qede_ethtool_ops = {
 	.set_msglevel = qede_set_msglevel,
 	.nway_reset = qede_nway_reset,
 	.get_link = qede_get_link,
+	.get_eeprom_len = qede_get_eeprom_len,
+	.get_eeprom = qede_get_eeprom,
+	.set_eeprom = qede_set_eeprom,
 	.get_ringparam = qede_get_ringparam,
 	.set_ringparam = qede_set_ringparam,
 	.get_pauseparam = qede_get_pauseparam,
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index e5de42b..497cce9 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -270,6 +270,36 @@ struct qed_common_ops {
 				      struct qed_chain *p_chain);
 
 /**
+ * @brief nvm_get_cmd - Invoke mcp nvm get command
+ *
+ * @param cdev
+ * @param cmd - qed_mcp command
+ * @param offset
+ * @param buf - buffer
+ * @param len - buffer length
+ *
+ * @return 0 on success, error otherwise.
+ */
+	int (*nvm_get_cmd)(struct qed_dev *cdev,
+			   u32 cmd, u32 offset,
+			   u8 *buf, u32 len);
+
+/**
+ * @brief nvm_put_cmd - Invoke mcp nvm get command
+ *
+ * @param cdev
+ * @param cmd - qed_mcp command
+ * @param offset
+ * @param buf - buffer
+ * @param len - buffer length
+ *
+ * @return 0 on success, error otherwise.
+ */
+	int (*nvm_set_cmd)(struct qed_dev *cdev,
+			   u32 cmd, u32 offset,
+			   u8 *buf, u32 len);
+
+/**
  * @brief set_led - Configure LED mode
  *
  * @param cdev
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next V2 04/11] net/mlx5e: Use function pointers for RX data path handling
From: Saeed Mahameed @ 2016-04-17 21:31 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Tal Alon, Tariq Toukan, Eran Ben Elisha,
	Achiad Shochat, Saeed Mahameed
In-Reply-To: <1460928725-18741-1-git-send-email-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

In preparation for Striding RQ feature, which will need its own
RX handlers.
This patch does not change any functionality.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Achiad Shochat <achiad@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |   33 ++++++----
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |    2 +
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   |   74 +++++++++++----------
 3 files changed, 62 insertions(+), 47 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 7f19644..61e249d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -72,6 +72,17 @@
 #define MLX5E_SQ_BF_BUDGET             16
 
 #define MLX5E_NUM_MAIN_GROUPS 9
+#define MLX5E_NET_IP_ALIGN 2
+
+struct mlx5e_tx_wqe {
+	struct mlx5_wqe_ctrl_seg ctrl;
+	struct mlx5_wqe_eth_seg  eth;
+};
+
+struct mlx5e_rx_wqe {
+	struct mlx5_wqe_srq_next_seg  next;
+	struct mlx5_wqe_data_seg      data;
+};
 
 #ifdef CONFIG_MLX5_CORE_EN_DCB
 #define MLX5E_MAX_BW_ALLOC 100 /* Max percentage of BW allocation */
@@ -357,6 +368,12 @@ struct mlx5e_cq {
 	struct mlx5_wq_ctrl        wq_ctrl;
 } ____cacheline_aligned_in_smp;
 
+struct mlx5e_rq;
+typedef void (*mlx5e_fp_handle_rx_cqe)(struct mlx5e_rq *rq,
+				       struct mlx5_cqe64 *cqe);
+typedef int (*mlx5e_fp_alloc_wqe)(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe,
+				  u16 ix);
+
 struct mlx5e_rq {
 	/* data path */
 	struct mlx5_wq_ll      wq;
@@ -368,6 +385,8 @@ struct mlx5e_rq {
 	struct mlx5e_tstamp   *tstamp;
 	struct mlx5e_rq_stats  stats;
 	struct mlx5e_cq        cq;
+	mlx5e_fp_handle_rx_cqe handle_rx_cqe;
+	mlx5e_fp_alloc_wqe     alloc_wqe;
 
 	unsigned long          state;
 	int                    ix;
@@ -588,18 +607,6 @@ struct mlx5e_priv {
 	u16 q_counter;
 };
 
-#define MLX5E_NET_IP_ALIGN 2
-
-struct mlx5e_tx_wqe {
-	struct mlx5_wqe_ctrl_seg ctrl;
-	struct mlx5_wqe_eth_seg  eth;
-};
-
-struct mlx5e_rx_wqe {
-	struct mlx5_wqe_srq_next_seg  next;
-	struct mlx5_wqe_data_seg      data;
-};
-
 enum mlx5e_link_mode {
 	MLX5E_1000BASE_CX_SGMII	 = 0,
 	MLX5E_1000BASE_KX	 = 1,
@@ -642,7 +649,9 @@ void mlx5e_cq_error_event(struct mlx5_core_cq *mcq, enum mlx5_event event);
 int mlx5e_napi_poll(struct napi_struct *napi, int budget);
 bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget);
 int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget);
+void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
 bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq);
+int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix);
 struct mlx5_cqe64 *mlx5e_get_cqe(struct mlx5e_cq *cq);
 
 void mlx5e_update_stats(struct mlx5e_priv *priv);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 9b58ef6..23ba12c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -357,6 +357,8 @@ static int mlx5e_create_rq(struct mlx5e_channel *c,
 			cpu_to_be32(byte_count | MLX5_HW_START_PADDING);
 	}
 
+	rq->handle_rx_cqe = mlx5e_handle_rx_cqe;
+	rq->alloc_wqe = mlx5e_alloc_rx_wqe;
 	rq->pdev    = c->pdev;
 	rq->netdev  = c->netdev;
 	rq->tstamp  = &priv->tstamp;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 58d4e2f..d7ccced 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -42,8 +42,7 @@ static inline bool mlx5e_rx_hw_stamp(struct mlx5e_tstamp *tstamp)
 	return tstamp->hwtstamp_config.rx_filter == HWTSTAMP_FILTER_ALL;
 }
 
-static inline int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq,
-				     struct mlx5e_rx_wqe *wqe, u16 ix)
+int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix)
 {
 	struct sk_buff *skb;
 	dma_addr_t dma_addr;
@@ -87,7 +86,7 @@ bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 	while (!mlx5_wq_ll_is_full(wq)) {
 		struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(wq, wq->head);
 
-		if (unlikely(mlx5e_alloc_rx_wqe(rq, wqe, wq->head)))
+		if (unlikely(rq->alloc_wqe(rq, wqe, wq->head)))
 			break;
 
 		mlx5_wq_ll_push(wq, be16_to_cpu(wqe->next.next_wqe_index));
@@ -229,50 +228,55 @@ static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe,
 	skb->mark = be32_to_cpu(cqe->sop_drop_qpn) & MLX5E_TC_FLOW_ID_MASK;
 }
 
+void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
+{
+	struct mlx5e_rx_wqe *wqe;
+	struct sk_buff *skb;
+	__be16 wqe_counter_be;
+	u16 wqe_counter;
+
+	wqe_counter_be = cqe->wqe_counter;
+	wqe_counter    = be16_to_cpu(wqe_counter_be);
+	wqe            = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
+	skb            = rq->skb[wqe_counter];
+	prefetch(skb->data);
+	rq->skb[wqe_counter] = NULL;
+
+	dma_unmap_single(rq->pdev,
+			 *((dma_addr_t *)skb->cb),
+			 rq->wqe_sz,
+			 DMA_FROM_DEVICE);
+
+	if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) {
+		rq->stats.wqe_err++;
+		dev_kfree_skb(skb);
+		goto wq_ll_pop;
+	}
+
+	mlx5e_build_rx_skb(cqe, rq, skb);
+	rq->stats.packets++;
+	rq->stats.bytes += be32_to_cpu(cqe->byte_cnt);
+	napi_gro_receive(rq->cq.napi, skb);
+
+wq_ll_pop:
+	mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
+		       &wqe->next.next_wqe_index);
+}
+
 int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
 {
 	struct mlx5e_rq *rq = container_of(cq, struct mlx5e_rq, cq);
 	int work_done;
 
 	for (work_done = 0; work_done < budget; work_done++) {
-		struct mlx5e_rx_wqe *wqe;
-		struct mlx5_cqe64 *cqe;
-		struct sk_buff *skb;
-		__be16 wqe_counter_be;
-		u16 wqe_counter;
+		struct mlx5_cqe64 *cqe = mlx5e_get_cqe(cq);
 
-		cqe = mlx5e_get_cqe(cq);
 		if (!cqe)
 			break;
 
 		mlx5_cqwq_pop(&cq->wq);
 
-		wqe_counter_be = cqe->wqe_counter;
-		wqe_counter    = be16_to_cpu(wqe_counter_be);
-		wqe            = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
-		skb            = rq->skb[wqe_counter];
-		prefetch(skb->data);
-		rq->skb[wqe_counter] = NULL;
-
-		dma_unmap_single(rq->pdev,
-				 *((dma_addr_t *)skb->cb),
-				 rq->wqe_sz,
-				 DMA_FROM_DEVICE);
-
-		if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) {
-			rq->stats.wqe_err++;
-			dev_kfree_skb(skb);
-			goto wq_ll_pop;
-		}
-
-		mlx5e_build_rx_skb(cqe, rq, skb);
-		rq->stats.packets++;
-		rq->stats.bytes += be32_to_cpu(cqe->byte_cnt);
-		napi_gro_receive(cq->napi, skb);
-
-wq_ll_pop:
-		mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
-			       &wqe->next.next_wqe_index);
+		rq->handle_rx_cqe(rq, cqe);
 	}
 
 	mlx5_cqwq_update_db_record(&cq->wq);
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next V2 00/11] Mellanox 100G mlx5 driver receive path optimizations
From: Saeed Mahameed @ 2016-04-17 21:31 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Tal Alon, Tariq Toukan, Eran Ben Elisha,
	Saeed Mahameed

Hello Dave,

Changes from V1:
	- Rebased to efde611b0afa ("Merge branch 'nfp-next'")
	- Dropped: ("net/mlx5: Refactor mlx5_core_mr to mkey")
                Already merged into 4.6 from rdma tree. 
	- Dropped: ("net/mlx5_core: Add ConnectX-5 to list of supported devices")
                Will be pushed to net as we want it in 4.6 release.
	- Dropped: ("net/mlx5e: Change RX moderation period to be based on CQE")
                Will be pushed in a later series with full software based adaptive moderation.
	- Added: ("net/mlx5e: Delay skb->data access")
		Small trivial optimization.
	- Updated: ("net/mlx5e: Support RX multi-packet WQE (Striding RQ)")
	 	Changed Striding RQ defaults to:
			> 	NUM WQEs = 16
			> 	Strides Per WQE = 1024
			> 	Stride Size = 128 
	- Updated: ("net/mlx5e: Use napi_alloc_skb for RX SKB allocations")
		Consider the IP packet alignment already done in napi_alloc_skb.	

Changes from V0:
	- Fixed a typo in commit message reported by Sergei
	- Align SKB fragments truesize to stride size
	- Use skb_add_rx_frag and remove the use of SKB_TRUESIZE
	- Fix: # MTTs alignment on Power PC
	- Fix: Free original (unaligned) pointer of MTT array
	- Use dev_alloc_pages and dev_alloc_page
	- Extend the stats.buff_alloc_err counter
	- Reform the copying of packet header into skb linear data
	- Add compiler hints for conditional statements
	- Prefetch skd->data prior to copying packet header into it
	- Rework: mlx5e_complete_rx_fragmented_mpwqe
	- Handle SKB fragments before linear data
	- Dropped ("net/mlx5e: Prefetch next RX CQE") for now 
	- Added a small patch that Adds ConnectX-5 devices to the list of supported devices
	- Rebased to 1cdba5505555 ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next")

This series includes Some RX modifications and optimizations for
the mlx5 Ethernet driver. 

>From Rana, we have one patch that adds the support for Connectx-4
queue counters.

>From Tariq, several patches that are centralized around improving
RX path message rate, CPU and Memory utilization, in each patch
commit message you will find the performance improvements numbers
related to that specific patch.

In the 2nd patch we used a queue counter to report "out of buffer" 
dropped packet count, "Dropped packets due to lack of software resources"

3rd patch modifies the driver's to RSS default value to be spread along the
close NUMA node cores only for better out of the box experience.

In the 4th and 5th patches we utilized the use of RX multi-packet WQE
(Striding RQ) for better memory utilization especially in case of hardware
LRO is enabled and for better message rate for small packets.

In the 6th and 7th patches we added a fallback mechanism to use fragmented
memory when allocating large WQE strides fails, using UMR
(User Memory Registration) and ICO (Internal Control Operations) SQs.

In the 8th to 11th patches we did some small modification which show some small
extra improvements.

Thanks,
Saeed

Rana Shahout (1):
  net/mlx5e: Allocate set of queue counters per netdev

Saeed Mahameed (1):
  net/mlx5e: Delay skb->data access

Tariq Toukan (9):
  net/mlx5: Introduce device queue counters
  net/mlx5e: Use only close NUMA node for default RSS
  net/mlx5e: Use function pointers for RX data path handling
  net/mlx5e: Support RX multi-packet WQE (Striding RQ)
  net/mlx5e: Added ICO SQs
  net/mlx5e: Add fragmented memory support for RX multi packet WQE
  net/mlx5e: Use napi_alloc_skb for RX SKB allocations
  net/mlx5e: Remove redundant barrier
  net/mlx5e: Add ethtool counter for RX buffer allocation failures

 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  193 +++++++-
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |   28 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  361 ++++++++++++--
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c    |  511 ++++++++++++++++++--
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c    |    6 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c  |   59 +++-
 drivers/net/ethernet/mellanox/mlx5/core/qp.c       |   68 +++
 include/linux/mlx5/device.h                        |   39 ++-
 include/linux/mlx5/qp.h                            |    6 +
 9 files changed, 1138 insertions(+), 133 deletions(-)

^ permalink raw reply

* [PATCH net-next V2 02/11] net/mlx5e: Allocate set of queue counters per netdev
From: Saeed Mahameed @ 2016-04-17 21:31 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Tal Alon, Tariq Toukan, Eran Ben Elisha,
	Rana Shahout, Saeed Mahameed
In-Reply-To: <1460928725-18741-1-git-send-email-saeedm@mellanox.com>

From: Rana Shahout <ranas@mellanox.com>

Connect all netdev RQs to this set of queue counters.
Also, add an "rx_out_of_buffer" counter to ethtool,
which indicates RX packet drops due to lack of receive
buffers.

Signed-off-by: Rana Shahout <ranas@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |   11 +++++
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |   11 +++++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   42 +++++++++++++++++++-
 3 files changed, 62 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 879e627..c4ddbe8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -236,6 +236,15 @@ struct mlx5e_pport_stats {
 	__be64 RFC_2819_counters[NUM_RFC_2819_COUNTERS];
 };
 
+static const char qcounter_stats_strings[][ETH_GSTRING_LEN] = {
+	"rx_out_of_buffer",
+};
+
+struct mlx5e_qcounter_stats {
+	u32 rx_out_of_buffer;
+#define NUM_Q_COUNTERS 1
+};
+
 static const char rq_stats_strings[][ETH_GSTRING_LEN] = {
 	"packets",
 	"bytes",
@@ -293,6 +302,7 @@ struct mlx5e_sq_stats {
 struct mlx5e_stats {
 	struct mlx5e_vport_stats   vport;
 	struct mlx5e_pport_stats   pport;
+	struct mlx5e_qcounter_stats qcnt;
 };
 
 struct mlx5e_params {
@@ -575,6 +585,7 @@ struct mlx5e_priv {
 	struct net_device         *netdev;
 	struct mlx5e_stats         stats;
 	struct mlx5e_tstamp        tstamp;
+	u16 q_counter;
 };
 
 #define MLX5E_NET_IP_ALIGN 2
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 68834b7..39c1902 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -165,6 +165,8 @@ static const struct {
 	},
 };
 
+#define MLX5E_NUM_Q_CNTRS(priv) (NUM_Q_COUNTERS * (!!priv->q_counter))
+
 static int mlx5e_get_sset_count(struct net_device *dev, int sset)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
@@ -172,6 +174,7 @@ static int mlx5e_get_sset_count(struct net_device *dev, int sset)
 	switch (sset) {
 	case ETH_SS_STATS:
 		return NUM_VPORT_COUNTERS + NUM_PPORT_COUNTERS +
+		       MLX5E_NUM_Q_CNTRS(priv) +
 		       priv->params.num_channels * NUM_RQ_STATS +
 		       priv->params.num_channels * priv->params.num_tc *
 						   NUM_SQ_STATS;
@@ -200,6 +203,11 @@ static void mlx5e_get_strings(struct net_device *dev,
 			strcpy(data + (idx++) * ETH_GSTRING_LEN,
 			       vport_strings[i]);
 
+		/* Q counters */
+		for (i = 0; i < MLX5E_NUM_Q_CNTRS(priv); i++)
+			strcpy(data + (idx++) * ETH_GSTRING_LEN,
+			       qcounter_stats_strings[i]);
+
 		/* PPORT counters */
 		for (i = 0; i < NUM_PPORT_COUNTERS; i++)
 			strcpy(data + (idx++) * ETH_GSTRING_LEN,
@@ -240,6 +248,9 @@ static void mlx5e_get_ethtool_stats(struct net_device *dev,
 	for (i = 0; i < NUM_VPORT_COUNTERS; i++)
 		data[idx++] = ((u64 *)&priv->stats.vport)[i];
 
+	for (i = 0; i < MLX5E_NUM_Q_CNTRS(priv); i++)
+		data[idx++] = ((u32 *)&priv->stats.qcnt)[i];
+
 	for (i = 0; i < NUM_PPORT_COUNTERS; i++)
 		data[idx++] = be64_to_cpu(((__be64 *)&priv->stats.pport)[i]);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index e0adb60..7fbe1ba 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -129,6 +129,17 @@ free_out:
 	kvfree(out);
 }
 
+static void mlx5e_update_q_counter(struct mlx5e_priv *priv)
+{
+	struct mlx5e_qcounter_stats *qcnt = &priv->stats.qcnt;
+
+	if (!priv->q_counter)
+		return;
+
+	mlx5_core_query_out_of_buffer(priv->mdev, priv->q_counter,
+				      &qcnt->rx_out_of_buffer);
+}
+
 void mlx5e_update_stats(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
@@ -250,6 +261,8 @@ void mlx5e_update_stats(struct mlx5e_priv *priv)
 			       s->rx_csum_sw;
 
 	mlx5e_update_pport_counters(priv);
+	mlx5e_update_q_counter(priv);
+
 free_out:
 	kvfree(out);
 }
@@ -1055,6 +1068,7 @@ static void mlx5e_build_rq_param(struct mlx5e_priv *priv,
 	MLX5_SET(wq, wq, log_wq_stride,    ilog2(sizeof(struct mlx5e_rx_wqe)));
 	MLX5_SET(wq, wq, log_wq_sz,        priv->params.log_rq_size);
 	MLX5_SET(wq, wq, pd,               priv->pdn);
+	MLX5_SET(rqc, rqc, counter_set_id, priv->q_counter);
 
 	param->wq.buf_numa_node = dev_to_node(&priv->mdev->pdev->dev);
 	param->wq.linear = 1;
@@ -2442,6 +2456,26 @@ static int mlx5e_create_mkey(struct mlx5e_priv *priv, u32 pdn,
 	return err;
 }
 
+static void mlx5e_create_q_counter(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int err;
+
+	err = mlx5_core_alloc_q_counter(mdev, &priv->q_counter);
+	if (err) {
+		mlx5_core_warn(mdev, "alloc queue counter failed, %d\n", err);
+		priv->q_counter = 0;
+	}
+}
+
+static void mlx5e_destroy_q_counter(struct mlx5e_priv *priv)
+{
+	if (!priv->q_counter)
+		return;
+
+	mlx5_core_dealloc_q_counter(priv->mdev, priv->q_counter);
+}
+
 static void *mlx5e_create_netdev(struct mlx5_core_dev *mdev)
 {
 	struct net_device *netdev;
@@ -2527,13 +2561,15 @@ static void *mlx5e_create_netdev(struct mlx5_core_dev *mdev)
 		goto err_destroy_tirs;
 	}
 
+	mlx5e_create_q_counter(priv);
+
 	mlx5e_init_eth_addr(priv);
 
 	mlx5e_vxlan_init(priv);
 
 	err = mlx5e_tc_init(priv);
 	if (err)
-		goto err_destroy_flow_tables;
+		goto err_dealloc_q_counters;
 
 #ifdef CONFIG_MLX5_CORE_EN_DCB
 	mlx5e_dcbnl_ieee_setets_core(priv, &priv->params.ets);
@@ -2556,7 +2592,8 @@ static void *mlx5e_create_netdev(struct mlx5_core_dev *mdev)
 err_tc_cleanup:
 	mlx5e_tc_cleanup(priv);
 
-err_destroy_flow_tables:
+err_dealloc_q_counters:
+	mlx5e_destroy_q_counter(priv);
 	mlx5e_destroy_flow_tables(priv);
 
 err_destroy_tirs:
@@ -2605,6 +2642,7 @@ static void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, void *vpriv)
 	unregister_netdev(netdev);
 	mlx5e_tc_cleanup(priv);
 	mlx5e_vxlan_cleanup(priv);
+	mlx5e_destroy_q_counter(priv);
 	mlx5e_destroy_flow_tables(priv);
 	mlx5e_destroy_tirs(priv);
 	mlx5e_destroy_rqt(priv, MLX5E_SINGLE_RQ_RQT);
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next V2 03/11] net/mlx5e: Use only close NUMA node for default RSS
From: Saeed Mahameed @ 2016-04-17 21:31 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Tal Alon, Tariq Toukan, Eran Ben Elisha,
	Saeed Mahameed
In-Reply-To: <1460928725-18741-1-git-send-email-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

Distribute default RSS table uniformly over the rings of the
close NUMA node, instead of all available channels.
This way we enforce the preference of close rings over far ones.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |    3 ++-
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |    2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   15 +++++++++++++--
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index c4ddbe8..7f19644 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -671,7 +671,8 @@ void mlx5e_build_tir_ctx_hash(void *tirc, struct mlx5e_priv *priv);
 
 int mlx5e_open_locked(struct net_device *netdev);
 int mlx5e_close_locked(struct net_device *netdev);
-void mlx5e_build_default_indir_rqt(u32 *indirection_rqt, int len,
+void mlx5e_build_default_indir_rqt(struct mlx5_core_dev *mdev,
+				   u32 *indirection_rqt, int len,
 				   int num_channels);
 
 static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 39c1902..6f40ba4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -397,7 +397,7 @@ static int mlx5e_set_channels(struct net_device *dev,
 		mlx5e_close_locked(dev);
 
 	priv->params.num_channels = count;
-	mlx5e_build_default_indir_rqt(priv->params.indirection_rqt,
+	mlx5e_build_default_indir_rqt(priv->mdev, priv->params.indirection_rqt,
 				      MLX5E_INDIR_RQT_SIZE, count);
 
 	if (was_opened)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 7fbe1ba..9b58ef6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2297,11 +2297,22 @@ static void mlx5e_ets_init(struct mlx5e_priv *priv)
 }
 #endif
 
-void mlx5e_build_default_indir_rqt(u32 *indirection_rqt, int len,
+void mlx5e_build_default_indir_rqt(struct mlx5_core_dev *mdev,
+				   u32 *indirection_rqt, int len,
 				   int num_channels)
 {
+	int node = mdev->priv.numa_node;
+	int node_num_of_cores;
 	int i;
 
+	if (node == -1)
+		node = first_online_node;
+
+	node_num_of_cores = cpumask_weight(cpumask_of_node(node));
+
+	if (node_num_of_cores)
+		num_channels = min_t(int, num_channels, node_num_of_cores);
+
 	for (i = 0; i < len; i++)
 		indirection_rqt[i] = i % num_channels;
 }
@@ -2333,7 +2344,7 @@ static void mlx5e_build_netdev_priv(struct mlx5_core_dev *mdev,
 	netdev_rss_key_fill(priv->params.toeplitz_hash_key,
 			    sizeof(priv->params.toeplitz_hash_key));
 
-	mlx5e_build_default_indir_rqt(priv->params.indirection_rqt,
+	mlx5e_build_default_indir_rqt(mdev, priv->params.indirection_rqt,
 				      MLX5E_INDIR_RQT_SIZE, num_channels);
 
 	priv->params.lro_wqe_sz            =
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next V2 01/11] net/mlx5: Introduce device queue counters
From: Saeed Mahameed @ 2016-04-17 21:31 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Tal Alon, Tariq Toukan, Eran Ben Elisha,
	Rana Shahout, Saeed Mahameed
In-Reply-To: <1460928725-18741-1-git-send-email-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

A queue counter can collect several statistics for one or more
hardware queues (QPs, RQs, etc ..) that the counter is attached to.

For Ethernet it will provide an "out of buffer" counter which collects
the number of all packets that are dropped due to lack of software
buffers.

Here we add device commands to alloc/query/dealloc queue counters.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Rana Shahout <ranas@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/qp.c |   68 ++++++++++++++++++++++++++
 include/linux/mlx5/qp.h                      |    6 ++
 2 files changed, 74 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
index def2893..b720a27 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
@@ -538,3 +538,71 @@ void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev,
 	mlx5_core_destroy_sq(dev, sq->qpn);
 }
 EXPORT_SYMBOL(mlx5_core_destroy_sq_tracked);
+
+int mlx5_core_alloc_q_counter(struct mlx5_core_dev *dev, u16 *counter_id)
+{
+	u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)];
+	u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)];
+	int err;
+
+	memset(in, 0, sizeof(in));
+	memset(out, 0, sizeof(out));
+
+	MLX5_SET(alloc_q_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_Q_COUNTER);
+	err = mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out));
+	if (!err)
+		*counter_id = MLX5_GET(alloc_q_counter_out, out,
+				       counter_set_id);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_alloc_q_counter);
+
+int mlx5_core_dealloc_q_counter(struct mlx5_core_dev *dev, u16 counter_id)
+{
+	u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)];
+	u32 out[MLX5_ST_SZ_DW(dealloc_q_counter_out)];
+
+	memset(in, 0, sizeof(in));
+	memset(out, 0, sizeof(out));
+
+	MLX5_SET(dealloc_q_counter_in, in, opcode,
+		 MLX5_CMD_OP_DEALLOC_Q_COUNTER);
+	MLX5_SET(dealloc_q_counter_in, in, counter_set_id, counter_id);
+	return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out,
+					  sizeof(out));
+}
+EXPORT_SYMBOL_GPL(mlx5_core_dealloc_q_counter);
+
+int mlx5_core_query_q_counter(struct mlx5_core_dev *dev, u16 counter_id,
+			      int reset, void *out, int out_size)
+{
+	u32 in[MLX5_ST_SZ_DW(query_q_counter_in)];
+
+	memset(in, 0, sizeof(in));
+
+	MLX5_SET(query_q_counter_in, in, opcode, MLX5_CMD_OP_QUERY_Q_COUNTER);
+	MLX5_SET(query_q_counter_in, in, clear, reset);
+	MLX5_SET(query_q_counter_in, in, counter_set_id, counter_id);
+	return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, out_size);
+}
+EXPORT_SYMBOL_GPL(mlx5_core_query_q_counter);
+
+int mlx5_core_query_out_of_buffer(struct mlx5_core_dev *dev, u16 counter_id,
+				  u32 *out_of_buffer)
+{
+	int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out);
+	void *out;
+	int err;
+
+	out = mlx5_vzalloc(outlen);
+	if (!out)
+		return -ENOMEM;
+
+	err = mlx5_core_query_q_counter(dev, counter_id, 0, out, outlen);
+	if (!err)
+		*out_of_buffer = MLX5_GET(query_q_counter_out, out,
+					  out_of_buffer);
+
+	kfree(out);
+	return err;
+}
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index cf031a3..6422102 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -668,6 +668,12 @@ int mlx5_core_create_sq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen,
 				struct mlx5_core_qp *sq);
 void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev,
 				  struct mlx5_core_qp *sq);
+int mlx5_core_alloc_q_counter(struct mlx5_core_dev *dev, u16 *counter_id);
+int mlx5_core_dealloc_q_counter(struct mlx5_core_dev *dev, u16 counter_id);
+int mlx5_core_query_q_counter(struct mlx5_core_dev *dev, u16 counter_id,
+			      int reset, void *out, int out_size);
+int mlx5_core_query_out_of_buffer(struct mlx5_core_dev *dev, u16 counter_id,
+				  u32 *out_of_buffer);
 
 static inline const char *mlx5_qp_type_str(int type)
 {
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next V2 10/11] net/mlx5e: Delay skb->data access
From: Saeed Mahameed @ 2016-04-17 21:32 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Tal Alon, Tariq Toukan, Eran Ben Elisha,
	Saeed Mahameed
In-Reply-To: <1460928725-18741-1-git-send-email-saeedm@mellanox.com>

Move mlx5e_handle_csum and eth_type_trans to the end of
mlx5e_build_rx_skb to gain some more time before accessing
skb->data, to reduce cache misses.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c |    7 +++----
 1 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index ea90e6f..ade858b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -455,10 +455,6 @@ static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe,
 	if (unlikely(mlx5e_rx_hw_stamp(tstamp)))
 		mlx5e_fill_hwstamp(tstamp, get_cqe_ts(cqe), skb_hwtstamps(skb));
 
-	mlx5e_handle_csum(netdev, cqe, rq, skb, !!lro_num_seg);
-
-	skb->protocol = eth_type_trans(skb, netdev);
-
 	skb_record_rx_queue(skb, rq->ix);
 
 	if (likely(netdev->features & NETIF_F_RXHASH))
@@ -469,6 +465,9 @@ static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe,
 				       be16_to_cpu(cqe->vlan_info));
 
 	skb->mark = be32_to_cpu(cqe->sop_drop_qpn) & MLX5E_TC_FLOW_ID_MASK;
+
+	mlx5e_handle_csum(netdev, cqe, rq, skb, !!lro_num_seg);
+	skb->protocol = eth_type_trans(skb, netdev);
 }
 
 static inline void mlx5e_complete_rx_cqe(struct mlx5e_rq *rq,
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next V2 05/11] net/mlx5e: Support RX multi-packet WQE (Striding RQ)
From: Saeed Mahameed @ 2016-04-17 21:31 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Tal Alon, Tariq Toukan, Eran Ben Elisha,
	Achiad Shochat, Saeed Mahameed
In-Reply-To: <1460928725-18741-1-git-send-email-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

Introduce the feature of multi-packet WQE (RX Work Queue Element)
referred to as (MPWQE or Striding RQ), in which WQEs are larger
and serve multiple packets each.

Every WQE consists of many strides of the same size, every received
packet is aligned to a beginning of a stride and is written to
consecutive strides within a WQE.

In the regular approach, each regular WQE is big enough to be capable
of serving one received packet of any size up to MTU or 64K in case of
device LRO is enabled, making it very wasteful when dealing with
small packets or device LRO is enabled.

For its flexibility, MPWQE allows a better memory utilization and lower
memory footprint (implying improvements in CPU utilization and packet
rate) as packets consume strides according to their size, preserving
the rest of the WQE to be available for other packets.

MPWQE default configuration:
	Num of WQEs	= 16
	Strides Per WQE = 2048
	Stride Size	= 64 byte

The default WQEs memory footprint went from 1024*mtu (~1.5MB) to
16 * 2048 * 64 = 2MB per ring.
However, HW LRO can now be supported at no additional cost in memory
footprint, and hence we turn it on by default and get an even better
performance.

Performance tested on ConnectX4-Lx 50G.
To isolate the feature under test, the numbers below were measured with
HW LRO turned off. We verified that the performance just improves when
LRO is turned back on.

* Netperf single TCP stream:
- BW raised by 10-15% for representative packet sizes:
  default, 64B, 1024B, 1478B, 65536B.

* Netperf multi TCP stream:
- No degradation, line rate reached.

* Pktgen: packet rate raised by 5-10% for traffic of different message
sizes: 64B, 128B, 256B, 1024B, and 1500B.

* Pktgen: packet loss in bursts of small messages (64byte),
single stream:
- | num packets | packets loss before | packets loss after
  |     2K      |       ~ 1K          |       0
  |     8K      |       ~ 6K          |       0
  |     16K     |       ~13K          |       0
  |     32K     |       ~28K          |       0
  |     64K     |       ~57K          |     ~24K

As expected as the driver can receive as many small packets (<=64B) as
the number of total strides in the ring (default = 2048 * 16) vs. 1024
(default ring size regardless of packets size) before this feature.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Achiad Shochat <achiad@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |   74 +++++++++++-
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |   15 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  109 +++++++++++++----
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c    |  128 ++++++++++++++++++--
 include/linux/mlx5/device.h                        |   39 ++++++-
 5 files changed, 321 insertions(+), 44 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 61e249d..bb6aa77 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -57,12 +57,27 @@
 #define MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE                0xa
 #define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE                0xd
 
+#define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW            0x1
+#define MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE_MPW            0x4
+#define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE_MPW            0x6
+
+#define MLX5_MPWRQ_LOG_NUM_STRIDES		11 /* >= 9, HW restriction */
+#define MLX5_MPWRQ_LOG_STRIDE_SIZE		6  /* >= 6, HW restriction */
+#define MLX5_MPWRQ_NUM_STRIDES			BIT(MLX5_MPWRQ_LOG_NUM_STRIDES)
+#define MLX5_MPWRQ_STRIDE_SIZE			BIT(MLX5_MPWRQ_LOG_STRIDE_SIZE)
+#define MLX5_MPWRQ_LOG_WQE_SZ			(MLX5_MPWRQ_LOG_NUM_STRIDES +\
+						 MLX5_MPWRQ_LOG_STRIDE_SIZE)
+#define MLX5_MPWRQ_WQE_PAGE_ORDER  (max_t(int, 0, \
+					  MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT))
+#define MLX5_MPWRQ_SMALL_PACKET_THRESHOLD	(128)
+
 #define MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ                 (64 * 1024)
 #define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC      0x10
 #define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS      0x20
 #define MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC      0x10
 #define MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS      0x20
 #define MLX5E_PARAMS_DEFAULT_MIN_RX_WQES                0x80
+#define MLX5E_PARAMS_DEFAULT_MIN_RX_WQES_MPW            0x2
 
 #define MLX5E_LOG_INDIR_RQT_SIZE       0x7
 #define MLX5E_INDIR_RQT_SIZE           BIT(MLX5E_LOG_INDIR_RQT_SIZE)
@@ -74,6 +89,38 @@
 #define MLX5E_NUM_MAIN_GROUPS 9
 #define MLX5E_NET_IP_ALIGN 2
 
+static inline u16 mlx5_min_rx_wqes(int wq_type, u32 wq_size)
+{
+	switch (wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		return min_t(u16, MLX5E_PARAMS_DEFAULT_MIN_RX_WQES_MPW,
+			     wq_size / 2);
+	default:
+		return min_t(u16, MLX5E_PARAMS_DEFAULT_MIN_RX_WQES,
+			     wq_size / 2);
+	}
+}
+
+static inline int mlx5_min_log_rq_size(int wq_type)
+{
+	switch (wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		return MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW;
+	default:
+		return MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE;
+	}
+}
+
+static inline int mlx5_max_log_rq_size(int wq_type)
+{
+	switch (wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		return MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE_MPW;
+	default:
+		return MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE;
+	}
+}
+
 struct mlx5e_tx_wqe {
 	struct mlx5_wqe_ctrl_seg ctrl;
 	struct mlx5_wqe_eth_seg  eth;
@@ -128,6 +175,7 @@ static const char vport_strings[][ETH_GSTRING_LEN] = {
 	"tx_queue_wake",
 	"tx_queue_dropped",
 	"rx_wqe_err",
+	"rx_mpwqe_filler",
 };
 
 struct mlx5e_vport_stats {
@@ -169,8 +217,9 @@ struct mlx5e_vport_stats {
 	u64 tx_queue_wake;
 	u64 tx_queue_dropped;
 	u64 rx_wqe_err;
+	u64 rx_mpwqe_filler;
 
-#define NUM_VPORT_COUNTERS     35
+#define NUM_VPORT_COUNTERS     36
 };
 
 static const char pport_strings[][ETH_GSTRING_LEN] = {
@@ -263,7 +312,8 @@ static const char rq_stats_strings[][ETH_GSTRING_LEN] = {
 	"csum_sw",
 	"lro_packets",
 	"lro_bytes",
-	"wqe_err"
+	"wqe_err",
+	"mpwqe_filler",
 };
 
 struct mlx5e_rq_stats {
@@ -274,7 +324,8 @@ struct mlx5e_rq_stats {
 	u64 lro_packets;
 	u64 lro_bytes;
 	u64 wqe_err;
-#define NUM_RQ_STATS 7
+	u64 mpwqe_filler;
+#define NUM_RQ_STATS 8
 };
 
 static const char sq_stats_strings[][ETH_GSTRING_LEN] = {
@@ -318,6 +369,7 @@ struct mlx5e_stats {
 
 struct mlx5e_params {
 	u8  log_sq_size;
+	u8  rq_wq_type;
 	u8  log_rq_size;
 	u16 num_channels;
 	u8  num_tc;
@@ -374,11 +426,23 @@ typedef void (*mlx5e_fp_handle_rx_cqe)(struct mlx5e_rq *rq,
 typedef int (*mlx5e_fp_alloc_wqe)(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe,
 				  u16 ix);
 
+struct mlx5e_dma_info {
+	struct page	*page;
+	dma_addr_t	addr;
+};
+
+struct mlx5e_mpw_info {
+	struct mlx5e_dma_info dma_info;
+	u16		     consumed_strides;
+	u16		     skbs_frags;
+};
+
 struct mlx5e_rq {
 	/* data path */
 	struct mlx5_wq_ll      wq;
 	u32                    wqe_sz;
 	struct sk_buff       **skb;
+	struct mlx5e_mpw_info *wqe_info;
 
 	struct device         *pdev;
 	struct net_device     *netdev;
@@ -393,6 +457,7 @@ struct mlx5e_rq {
 
 	/* control */
 	struct mlx5_wq_ctrl    wq_ctrl;
+	u8                     wq_type;
 	u32                    rqn;
 	struct mlx5e_channel  *channel;
 	struct mlx5e_priv     *priv;
@@ -649,9 +714,12 @@ void mlx5e_cq_error_event(struct mlx5_core_cq *mcq, enum mlx5_event event);
 int mlx5e_napi_poll(struct napi_struct *napi, int budget);
 bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget);
 int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget);
+
 void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
+void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
 bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq);
 int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix);
+int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix);
 struct mlx5_cqe64 *mlx5e_get_cqe(struct mlx5e_cq *cq);
 
 void mlx5e_update_stats(struct mlx5e_priv *priv);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 6f40ba4..4077856 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -273,8 +273,9 @@ static void mlx5e_get_ringparam(struct net_device *dev,
 				struct ethtool_ringparam *param)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
+	int rq_wq_type = priv->params.rq_wq_type;
 
-	param->rx_max_pending = 1 << MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE;
+	param->rx_max_pending = 1 << mlx5_max_log_rq_size(rq_wq_type);
 	param->tx_max_pending = 1 << MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE;
 	param->rx_pending     = 1 << priv->params.log_rq_size;
 	param->tx_pending     = 1 << priv->params.log_sq_size;
@@ -285,6 +286,7 @@ static int mlx5e_set_ringparam(struct net_device *dev,
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
 	bool was_opened;
+	int rq_wq_type = priv->params.rq_wq_type;
 	u16 min_rx_wqes;
 	u8 log_rq_size;
 	u8 log_sq_size;
@@ -300,16 +302,16 @@ static int mlx5e_set_ringparam(struct net_device *dev,
 			    __func__);
 		return -EINVAL;
 	}
-	if (param->rx_pending < (1 << MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE)) {
+	if (param->rx_pending < (1 << mlx5_min_log_rq_size(rq_wq_type))) {
 		netdev_info(dev, "%s: rx_pending (%d) < min (%d)\n",
 			    __func__, param->rx_pending,
-			    1 << MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE);
+			    1 << mlx5_min_log_rq_size(rq_wq_type));
 		return -EINVAL;
 	}
-	if (param->rx_pending > (1 << MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE)) {
+	if (param->rx_pending > (1 << mlx5_max_log_rq_size(rq_wq_type))) {
 		netdev_info(dev, "%s: rx_pending (%d) > max (%d)\n",
 			    __func__, param->rx_pending,
-			    1 << MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE);
+			    1 << mlx5_max_log_rq_size(rq_wq_type));
 		return -EINVAL;
 	}
 	if (param->tx_pending < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE)) {
@@ -327,8 +329,7 @@ static int mlx5e_set_ringparam(struct net_device *dev,
 
 	log_rq_size = order_base_2(param->rx_pending);
 	log_sq_size = order_base_2(param->tx_pending);
-	min_rx_wqes = min_t(u16, param->rx_pending - 1,
-			    MLX5E_PARAMS_DEFAULT_MIN_RX_WQES);
+	min_rx_wqes = mlx5_min_rx_wqes(rq_wq_type, param->rx_pending);
 
 	if (log_rq_size == priv->params.log_rq_size &&
 	    log_sq_size == priv->params.log_sq_size &&
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 23ba12c..871f3af 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -175,6 +175,7 @@ void mlx5e_update_stats(struct mlx5e_priv *priv)
 	s->rx_csum_none		= 0;
 	s->rx_csum_sw		= 0;
 	s->rx_wqe_err		= 0;
+	s->rx_mpwqe_filler	= 0;
 	for (i = 0; i < priv->params.num_channels; i++) {
 		rq_stats = &priv->channel[i]->rq.stats;
 
@@ -185,6 +186,7 @@ void mlx5e_update_stats(struct mlx5e_priv *priv)
 		s->rx_csum_none	+= rq_stats->csum_none;
 		s->rx_csum_sw	+= rq_stats->csum_sw;
 		s->rx_wqe_err   += rq_stats->wqe_err;
+		s->rx_mpwqe_filler += rq_stats->mpwqe_filler;
 
 		for (j = 0; j < priv->params.num_tc; j++) {
 			sq_stats = &priv->channel[i]->sq[j].stats;
@@ -323,6 +325,7 @@ static int mlx5e_create_rq(struct mlx5e_channel *c,
 	struct mlx5_core_dev *mdev = priv->mdev;
 	void *rqc = param->rqc;
 	void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
+	u32 byte_count;
 	int wq_sz;
 	int err;
 	int i;
@@ -337,28 +340,47 @@ static int mlx5e_create_rq(struct mlx5e_channel *c,
 	rq->wq.db = &rq->wq.db[MLX5_RCV_DBR];
 
 	wq_sz = mlx5_wq_ll_get_size(&rq->wq);
-	rq->skb = kzalloc_node(wq_sz * sizeof(*rq->skb), GFP_KERNEL,
-			       cpu_to_node(c->cpu));
-	if (!rq->skb) {
-		err = -ENOMEM;
-		goto err_rq_wq_destroy;
-	}
 
-	rq->wqe_sz = (priv->params.lro_en) ? priv->params.lro_wqe_sz :
-					     MLX5E_SW2HW_MTU(priv->netdev->mtu);
-	rq->wqe_sz = SKB_DATA_ALIGN(rq->wqe_sz + MLX5E_NET_IP_ALIGN);
+	switch (priv->params.rq_wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		rq->wqe_info = kzalloc_node(wq_sz * sizeof(*rq->wqe_info),
+					    GFP_KERNEL, cpu_to_node(c->cpu));
+		if (!rq->wqe_info) {
+			err = -ENOMEM;
+			goto err_rq_wq_destroy;
+		}
+		rq->handle_rx_cqe = mlx5e_handle_rx_cqe_mpwrq;
+		rq->alloc_wqe = mlx5e_alloc_rx_mpwqe;
+
+		rq->wqe_sz = MLX5_MPWRQ_NUM_STRIDES * MLX5_MPWRQ_STRIDE_SIZE;
+		byte_count = rq->wqe_sz;
+		break;
+	default: /* MLX5_WQ_TYPE_LINKED_LIST */
+		rq->skb = kzalloc_node(wq_sz * sizeof(*rq->skb), GFP_KERNEL,
+				       cpu_to_node(c->cpu));
+		if (!rq->skb) {
+			err = -ENOMEM;
+			goto err_rq_wq_destroy;
+		}
+		rq->handle_rx_cqe = mlx5e_handle_rx_cqe;
+		rq->alloc_wqe = mlx5e_alloc_rx_wqe;
+
+		rq->wqe_sz = (priv->params.lro_en) ?
+				priv->params.lro_wqe_sz :
+				MLX5E_SW2HW_MTU(priv->netdev->mtu);
+		rq->wqe_sz = SKB_DATA_ALIGN(rq->wqe_sz + MLX5E_NET_IP_ALIGN);
+		byte_count = rq->wqe_sz - MLX5E_NET_IP_ALIGN;
+		byte_count |= MLX5_HW_START_PADDING;
+	}
 
 	for (i = 0; i < wq_sz; i++) {
 		struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
-		u32 byte_count = rq->wqe_sz - MLX5E_NET_IP_ALIGN;
 
 		wqe->data.lkey       = c->mkey_be;
-		wqe->data.byte_count =
-			cpu_to_be32(byte_count | MLX5_HW_START_PADDING);
+		wqe->data.byte_count = cpu_to_be32(byte_count);
 	}
 
-	rq->handle_rx_cqe = mlx5e_handle_rx_cqe;
-	rq->alloc_wqe = mlx5e_alloc_rx_wqe;
+	rq->wq_type = priv->params.rq_wq_type;
 	rq->pdev    = c->pdev;
 	rq->netdev  = c->netdev;
 	rq->tstamp  = &priv->tstamp;
@@ -376,7 +398,14 @@ err_rq_wq_destroy:
 
 static void mlx5e_destroy_rq(struct mlx5e_rq *rq)
 {
-	kfree(rq->skb);
+	switch (rq->wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		kfree(rq->wqe_info);
+		break;
+	default: /* MLX5_WQ_TYPE_LINKED_LIST */
+		kfree(rq->skb);
+	}
+
 	mlx5_wq_destroy(&rq->wq_ctrl);
 }
 
@@ -1065,7 +1094,18 @@ static void mlx5e_build_rq_param(struct mlx5e_priv *priv,
 	void *rqc = param->rqc;
 	void *wq = MLX5_ADDR_OF(rqc, rqc, wq);
 
-	MLX5_SET(wq, wq, wq_type,          MLX5_WQ_TYPE_LINKED_LIST);
+	switch (priv->params.rq_wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		MLX5_SET(wq, wq, log_wqe_num_of_strides,
+			 MLX5_MPWRQ_LOG_NUM_STRIDES - 9);
+		MLX5_SET(wq, wq, log_wqe_stride_size,
+			 MLX5_MPWRQ_LOG_STRIDE_SIZE - 6);
+		MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ);
+		break;
+	default: /* MLX5_WQ_TYPE_LINKED_LIST */
+		MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST);
+	}
+
 	MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN);
 	MLX5_SET(wq, wq, log_wq_stride,    ilog2(sizeof(struct mlx5e_rx_wqe)));
 	MLX5_SET(wq, wq, log_wq_sz,        priv->params.log_rq_size);
@@ -1111,8 +1151,18 @@ static void mlx5e_build_rx_cq_param(struct mlx5e_priv *priv,
 				    struct mlx5e_cq_param *param)
 {
 	void *cqc = param->cqc;
+	u8 log_cq_size;
 
-	MLX5_SET(cqc, cqc, log_cq_size,  priv->params.log_rq_size);
+	switch (priv->params.rq_wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		log_cq_size = priv->params.log_rq_size +
+			MLX5_MPWRQ_LOG_NUM_STRIDES;
+		break;
+	default: /* MLX5_WQ_TYPE_LINKED_LIST */
+		log_cq_size = priv->params.log_rq_size;
+	}
+
+	MLX5_SET(cqc, cqc, log_cq_size, log_cq_size);
 
 	mlx5e_build_common_cq_param(priv, param);
 }
@@ -1983,7 +2033,8 @@ static int mlx5e_set_features(struct net_device *netdev,
 	if (changes & NETIF_F_LRO) {
 		bool was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state);
 
-		if (was_opened)
+		if (was_opened && (priv->params.rq_wq_type ==
+				   MLX5_WQ_TYPE_LINKED_LIST))
 			mlx5e_close_locked(priv->netdev);
 
 		priv->params.lro_en = !!(features & NETIF_F_LRO);
@@ -1992,7 +2043,8 @@ static int mlx5e_set_features(struct net_device *netdev,
 			mlx5_core_warn(priv->mdev, "lro modify failed, %d\n",
 				       err);
 
-		if (was_opened)
+		if (was_opened && (priv->params.rq_wq_type ==
+				   MLX5_WQ_TYPE_LINKED_LIST))
 			err = mlx5e_open_locked(priv->netdev);
 	}
 
@@ -2327,8 +2379,21 @@ static void mlx5e_build_netdev_priv(struct mlx5_core_dev *mdev,
 
 	priv->params.log_sq_size           =
 		MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE;
-	priv->params.log_rq_size           =
-		MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE;
+	priv->params.rq_wq_type = MLX5_CAP_GEN(mdev, striding_rq) ?
+		MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ :
+		MLX5_WQ_TYPE_LINKED_LIST;
+
+	switch (priv->params.rq_wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		priv->params.log_rq_size = MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE_MPW;
+		priv->params.lro_en = true;
+		break;
+	default: /* MLX5_WQ_TYPE_LINKED_LIST */
+		priv->params.log_rq_size = MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE;
+	}
+
+	priv->params.min_rx_wqes = mlx5_min_rx_wqes(priv->params.rq_wq_type,
+					    BIT(priv->params.log_rq_size));
 	priv->params.rx_cq_moderation_usec =
 		MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC;
 	priv->params.rx_cq_moderation_pkts =
@@ -2338,8 +2403,6 @@ static void mlx5e_build_netdev_priv(struct mlx5_core_dev *mdev,
 	priv->params.tx_cq_moderation_pkts =
 		MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS;
 	priv->params.tx_max_inline         = mlx5e_get_max_inline_cap(mdev);
-	priv->params.min_rx_wqes           =
-		MLX5E_PARAMS_DEFAULT_MIN_RX_WQES;
 	priv->params.num_tc                = 1;
 	priv->params.rss_hfunc             = ETH_RSS_HASH_XOR;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index d7ccced..e652604 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -76,6 +76,34 @@ err_free_skb:
 	return -ENOMEM;
 }
 
+int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix)
+{
+	struct mlx5e_mpw_info *wi = &rq->wqe_info[ix];
+	int ret = 0;
+
+	wi->dma_info.page = dev_alloc_pages(MLX5_MPWRQ_WQE_PAGE_ORDER);
+	if (unlikely(!wi->dma_info.page))
+		return -ENOMEM;
+
+	wi->dma_info.addr = dma_map_page(rq->pdev, wi->dma_info.page, 0,
+					 rq->wqe_sz, PCI_DMA_FROMDEVICE);
+	if (unlikely(dma_mapping_error(rq->pdev, wi->dma_info.addr))) {
+		ret = -ENOMEM;
+		goto err_put_page;
+	}
+
+	atomic_add(MLX5_MPWRQ_NUM_STRIDES, &wi->dma_info.page->_count);
+	wi->skbs_frags       = 0;
+	wi->consumed_strides = 0;
+	wqe->data.addr       = cpu_to_be64(wi->dma_info.addr);
+
+	return 0;
+
+err_put_page:
+	put_page(wi->dma_info.page);
+	return ret;
+}
+
 bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 {
 	struct mlx5_wq_ll *wq = &rq->wq;
@@ -100,7 +128,8 @@ bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 	return !mlx5_wq_ll_is_full(wq);
 }
 
-static void mlx5e_lro_update_hdr(struct sk_buff *skb, struct mlx5_cqe64 *cqe)
+static void mlx5e_lro_update_hdr(struct sk_buff *skb, struct mlx5_cqe64 *cqe,
+				 u32 cqe_bcnt)
 {
 	struct ethhdr	*eth	= (struct ethhdr *)(skb->data);
 	struct iphdr	*ipv4	= (struct iphdr *)(skb->data + ETH_HLEN);
@@ -111,7 +140,7 @@ static void mlx5e_lro_update_hdr(struct sk_buff *skb, struct mlx5_cqe64 *cqe)
 	int tcp_ack = ((CQE_L4_HDR_TYPE_TCP_ACK_NO_DATA  == l4_hdr_type) ||
 		       (CQE_L4_HDR_TYPE_TCP_ACK_AND_DATA == l4_hdr_type));
 
-	u16 tot_len = be32_to_cpu(cqe->byte_cnt) - ETH_HLEN;
+	u16 tot_len = cqe_bcnt - ETH_HLEN;
 
 	if (eth->h_proto == htons(ETH_P_IP)) {
 		tcp = (struct tcphdr *)(skb->data + ETH_HLEN +
@@ -191,19 +220,17 @@ csum_none:
 }
 
 static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe,
+				      u32 cqe_bcnt,
 				      struct mlx5e_rq *rq,
 				      struct sk_buff *skb)
 {
 	struct net_device *netdev = rq->netdev;
-	u32 cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 	struct mlx5e_tstamp *tstamp = rq->tstamp;
 	int lro_num_seg;
 
-	skb_put(skb, cqe_bcnt);
-
 	lro_num_seg = be32_to_cpu(cqe->srqn) >> 24;
 	if (lro_num_seg > 1) {
-		mlx5e_lro_update_hdr(skb, cqe);
+		mlx5e_lro_update_hdr(skb, cqe, cqe_bcnt);
 		skb_shinfo(skb)->gso_size = DIV_ROUND_UP(cqe_bcnt, lro_num_seg);
 		rq->stats.lro_packets++;
 		rq->stats.lro_bytes += cqe_bcnt;
@@ -228,12 +255,24 @@ static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe,
 	skb->mark = be32_to_cpu(cqe->sop_drop_qpn) & MLX5E_TC_FLOW_ID_MASK;
 }
 
+static inline void mlx5e_complete_rx_cqe(struct mlx5e_rq *rq,
+					 struct mlx5_cqe64 *cqe,
+					 u32 cqe_bcnt,
+					 struct sk_buff *skb)
+{
+	rq->stats.packets++;
+	rq->stats.bytes += cqe_bcnt;
+	mlx5e_build_rx_skb(cqe, cqe_bcnt, rq, skb);
+	napi_gro_receive(rq->cq.napi, skb);
+}
+
 void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
 	struct mlx5e_rx_wqe *wqe;
 	struct sk_buff *skb;
 	__be16 wqe_counter_be;
 	u16 wqe_counter;
+	u32 cqe_bcnt;
 
 	wqe_counter_be = cqe->wqe_counter;
 	wqe_counter    = be16_to_cpu(wqe_counter_be);
@@ -253,16 +292,85 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 		goto wq_ll_pop;
 	}
 
-	mlx5e_build_rx_skb(cqe, rq, skb);
-	rq->stats.packets++;
-	rq->stats.bytes += be32_to_cpu(cqe->byte_cnt);
-	napi_gro_receive(rq->cq.napi, skb);
+	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
+	skb_put(skb, cqe_bcnt);
+
+	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
 
 wq_ll_pop:
 	mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
 		       &wqe->next.next_wqe_index);
 }
 
+void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
+{
+	u16 cstrides       = mpwrq_get_cqe_consumed_strides(cqe);
+	u16 stride_ix      = mpwrq_get_cqe_stride_index(cqe);
+	u32 consumed_bytes = cstrides  * MLX5_MPWRQ_STRIDE_SIZE;
+	u32 data_offset    = stride_ix * MLX5_MPWRQ_STRIDE_SIZE;
+	u16 wqe_id         = be16_to_cpu(cqe->wqe_id);
+	struct mlx5e_mpw_info *wi = &rq->wqe_info[wqe_id];
+	struct mlx5e_rx_wqe  *wqe = mlx5_wq_ll_get_wqe(&rq->wq, wqe_id);
+	struct sk_buff *skb;
+	u16 byte_cnt;
+	u16 cqe_bcnt;
+	u16 headlen;
+
+	wi->consumed_strides += cstrides;
+
+	if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) {
+		rq->stats.wqe_err++;
+		goto mpwrq_cqe_out;
+	}
+
+	if (unlikely(mpwrq_is_filler_cqe(cqe))) {
+		rq->stats.mpwqe_filler++;
+		goto mpwrq_cqe_out;
+	}
+
+	skb = netdev_alloc_skb(rq->netdev,
+			       ALIGN(MLX5_MPWRQ_SMALL_PACKET_THRESHOLD,
+				     sizeof(long)));
+	if (unlikely(!skb))
+		goto mpwrq_cqe_out;
+
+	prefetch(skb->data);
+	dma_sync_single_for_cpu(rq->pdev, wi->dma_info.addr + data_offset,
+				consumed_bytes, DMA_FROM_DEVICE);
+
+	cqe_bcnt = mpwrq_get_cqe_byte_cnt(cqe);
+	headlen = min_t(u16, MLX5_MPWRQ_SMALL_PACKET_THRESHOLD, cqe_bcnt);
+
+	byte_cnt = cqe_bcnt - headlen;
+	if (byte_cnt) {
+		wi->skbs_frags++;
+		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
+				wi->dma_info.page, data_offset + headlen,
+				byte_cnt,
+				ALIGN(byte_cnt, MLX5_MPWRQ_STRIDE_SIZE));
+	}
+
+	skb_copy_to_linear_data(skb,
+				page_address(wi->dma_info.page) + data_offset,
+				ALIGN(headlen, sizeof(long)));
+	/* skb linear part was allocated with headlen and aligned to long */
+	skb->tail += headlen;
+	skb->len  += headlen;
+
+	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
+
+mpwrq_cqe_out:
+	if (likely(wi->consumed_strides < MLX5_MPWRQ_NUM_STRIDES))
+		return;
+
+	dma_unmap_page(rq->pdev, wi->dma_info.addr, rq->wqe_sz,
+		       PCI_DMA_FROMDEVICE);
+	atomic_sub(MLX5_MPWRQ_NUM_STRIDES - wi->skbs_frags,
+		   &wi->dma_info.page->_count);
+	put_page(wi->dma_info.page);
+	mlx5_wq_ll_pop(&rq->wq, cqe->wqe_id, &wqe->next.next_wqe_index);
+}
+
 int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
 {
 	struct mlx5e_rq *rq = container_of(cq, struct mlx5e_rq, cq);
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 8156e3c..03f8d71 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -644,7 +644,8 @@ struct mlx5_err_cqe {
 };
 
 struct mlx5_cqe64 {
-	u8		rsvd0[4];
+	u8              rsvd0[2];
+	__be16          wqe_id;
 	u8		lro_tcppsh_abort_dupack;
 	u8		lro_min_ttl;
 	__be16		lro_tcp_win;
@@ -696,6 +697,42 @@ static inline u64 get_cqe_ts(struct mlx5_cqe64 *cqe)
 	return (u64)lo | ((u64)hi << 32);
 }
 
+struct mpwrq_cqe_bc {
+	__be16	filler_consumed_strides;
+	__be16	byte_cnt;
+};
+
+static inline u16 mpwrq_get_cqe_byte_cnt(struct mlx5_cqe64 *cqe)
+{
+	struct mpwrq_cqe_bc *bc = (struct mpwrq_cqe_bc *)&cqe->byte_cnt;
+
+	return be16_to_cpu(bc->byte_cnt);
+}
+
+static inline u16 mpwrq_get_cqe_bc_consumed_strides(struct mpwrq_cqe_bc *bc)
+{
+	return 0x7fff & be16_to_cpu(bc->filler_consumed_strides);
+}
+
+static inline u16 mpwrq_get_cqe_consumed_strides(struct mlx5_cqe64 *cqe)
+{
+	struct mpwrq_cqe_bc *bc = (struct mpwrq_cqe_bc *)&cqe->byte_cnt;
+
+	return mpwrq_get_cqe_bc_consumed_strides(bc);
+}
+
+static inline bool mpwrq_is_filler_cqe(struct mlx5_cqe64 *cqe)
+{
+	struct mpwrq_cqe_bc *bc = (struct mpwrq_cqe_bc *)&cqe->byte_cnt;
+
+	return 0x8000 & be16_to_cpu(bc->filler_consumed_strides);
+}
+
+static inline u16 mpwrq_get_cqe_stride_index(struct mlx5_cqe64 *cqe)
+{
+	return be16_to_cpu(cqe->wqe_counter);
+}
+
 enum {
 	CQE_L4_HDR_TYPE_NONE			= 0x0,
 	CQE_L4_HDR_TYPE_TCP_NO_ACK		= 0x1,
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next V2 06/11] net/mlx5e: Added ICO SQs
From: Saeed Mahameed @ 2016-04-17 21:32 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Tal Alon, Tariq Toukan, Eran Ben Elisha,
	Saeed Mahameed
In-Reply-To: <1460928725-18741-1-git-send-email-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

Added ICO (Internal Control Operations) SQ per channel to be used
for driver internal operations such as memory registration for
fragmented memory and nop requests upon ifconfig up.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |    7 +
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |  135 +++++++++++++++++----
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c   |    2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c |   55 +++++++++
 4 files changed, 174 insertions(+), 25 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index bb6aa77..f62495f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -485,6 +485,11 @@ enum {
 	MLX5E_SQ_STATE_BF_ENABLE,
 };
 
+struct mlx5e_ico_wqe_info {
+	u8  opcode;
+	u8  num_wqebbs;
+};
+
 struct mlx5e_sq {
 	/* data path */
 
@@ -526,6 +531,7 @@ struct mlx5e_sq {
 	struct mlx5_uar            uar;
 	struct mlx5e_channel      *channel;
 	int                        tc;
+	struct mlx5e_ico_wqe_info *ico_wqe_info;
 } ____cacheline_aligned_in_smp;
 
 static inline bool mlx5e_sq_has_room_for(struct mlx5e_sq *sq, u16 n)
@@ -542,6 +548,7 @@ struct mlx5e_channel {
 	/* data path */
 	struct mlx5e_rq            rq;
 	struct mlx5e_sq            sq[MLX5E_MAX_NUM_TC];
+	struct mlx5e_sq            icosq;   /* internal control operations */
 	struct napi_struct         napi;
 	struct device             *pdev;
 	struct net_device         *netdev;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 871f3af..b25b429 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -48,6 +48,7 @@ struct mlx5e_sq_param {
 	u32                        sqc[MLX5_ST_SZ_DW(sqc)];
 	struct mlx5_wq_param       wq;
 	u16                        max_inline;
+	bool                       icosq;
 };
 
 struct mlx5e_cq_param {
@@ -59,8 +60,10 @@ struct mlx5e_cq_param {
 struct mlx5e_channel_param {
 	struct mlx5e_rq_param      rq;
 	struct mlx5e_sq_param      sq;
+	struct mlx5e_sq_param      icosq;
 	struct mlx5e_cq_param      rx_cq;
 	struct mlx5e_cq_param      tx_cq;
+	struct mlx5e_cq_param      icosq_cq;
 };
 
 static void mlx5e_update_carrier(struct mlx5e_priv *priv)
@@ -502,6 +505,8 @@ static int mlx5e_open_rq(struct mlx5e_channel *c,
 			 struct mlx5e_rq_param *param,
 			 struct mlx5e_rq *rq)
 {
+	struct mlx5e_sq *sq = &c->icosq;
+	u16 pi = sq->pc & sq->wq.sz_m1;
 	int err;
 
 	err = mlx5e_create_rq(c, param, rq);
@@ -517,7 +522,10 @@ static int mlx5e_open_rq(struct mlx5e_channel *c,
 		goto err_disable_rq;
 
 	set_bit(MLX5E_RQ_STATE_POST_WQES_ENABLE, &rq->state);
-	mlx5e_send_nop(&c->sq[0], true); /* trigger mlx5e_post_rx_wqes() */
+
+	sq->ico_wqe_info[pi].opcode     = MLX5_OPCODE_NOP;
+	sq->ico_wqe_info[pi].num_wqebbs = 1;
+	mlx5e_send_nop(sq, true); /* trigger mlx5e_post_rx_wqes() */
 
 	return 0;
 
@@ -583,7 +591,6 @@ static int mlx5e_create_sq(struct mlx5e_channel *c,
 
 	void *sqc = param->sqc;
 	void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq);
-	int txq_ix;
 	int err;
 
 	err = mlx5_alloc_map_uar(mdev, &sq->uar, true);
@@ -611,8 +618,24 @@ static int mlx5e_create_sq(struct mlx5e_channel *c,
 	if (err)
 		goto err_sq_wq_destroy;
 
-	txq_ix = c->ix + tc * priv->params.num_channels;
-	sq->txq = netdev_get_tx_queue(priv->netdev, txq_ix);
+	if (param->icosq) {
+		u8 wq_sz = mlx5_wq_cyc_get_size(&sq->wq);
+
+		sq->ico_wqe_info = kzalloc_node(sizeof(*sq->ico_wqe_info) *
+						wq_sz,
+						GFP_KERNEL,
+						cpu_to_node(c->cpu));
+		if (!sq->ico_wqe_info) {
+			err = -ENOMEM;
+			goto err_free_sq_db;
+		}
+	} else {
+		int txq_ix;
+
+		txq_ix = c->ix + tc * priv->params.num_channels;
+		sq->txq = netdev_get_tx_queue(priv->netdev, txq_ix);
+		priv->txq_to_sq_map[txq_ix] = sq;
+	}
 
 	sq->pdev      = c->pdev;
 	sq->tstamp    = &priv->tstamp;
@@ -621,10 +644,12 @@ static int mlx5e_create_sq(struct mlx5e_channel *c,
 	sq->tc        = tc;
 	sq->edge      = (sq->wq.sz_m1 + 1) - MLX5_SEND_WQE_MAX_WQEBBS;
 	sq->bf_budget = MLX5E_SQ_BF_BUDGET;
-	priv->txq_to_sq_map[txq_ix] = sq;
 
 	return 0;
 
+err_free_sq_db:
+	mlx5e_free_sq_db(sq);
+
 err_sq_wq_destroy:
 	mlx5_wq_destroy(&sq->wq_ctrl);
 
@@ -639,6 +664,7 @@ static void mlx5e_destroy_sq(struct mlx5e_sq *sq)
 	struct mlx5e_channel *c = sq->channel;
 	struct mlx5e_priv *priv = c->priv;
 
+	kfree(sq->ico_wqe_info);
 	mlx5e_free_sq_db(sq);
 	mlx5_wq_destroy(&sq->wq_ctrl);
 	mlx5_unmap_free_uar(priv->mdev, &sq->uar);
@@ -667,10 +693,10 @@ static int mlx5e_enable_sq(struct mlx5e_sq *sq, struct mlx5e_sq_param *param)
 
 	memcpy(sqc, param->sqc, sizeof(param->sqc));
 
-	MLX5_SET(sqc,  sqc, tis_num_0,		priv->tisn[sq->tc]);
-	MLX5_SET(sqc,  sqc, cqn,		c->sq[sq->tc].cq.mcq.cqn);
+	MLX5_SET(sqc,  sqc, tis_num_0, param->icosq ? 0 : priv->tisn[sq->tc]);
+	MLX5_SET(sqc,  sqc, cqn,		sq->cq.mcq.cqn);
 	MLX5_SET(sqc,  sqc, state,		MLX5_SQC_STATE_RST);
-	MLX5_SET(sqc,  sqc, tis_lst_sz,		1);
+	MLX5_SET(sqc,  sqc, tis_lst_sz,		param->icosq ? 0 : 1);
 	MLX5_SET(sqc,  sqc, flush_in_error_en,	1);
 
 	MLX5_SET(wq,   wq, wq_type,       MLX5_WQ_TYPE_CYCLIC);
@@ -745,9 +771,11 @@ static int mlx5e_open_sq(struct mlx5e_channel *c,
 	if (err)
 		goto err_disable_sq;
 
-	set_bit(MLX5E_SQ_STATE_WAKE_TXQ_ENABLE, &sq->state);
-	netdev_tx_reset_queue(sq->txq);
-	netif_tx_start_queue(sq->txq);
+	if (sq->txq) {
+		set_bit(MLX5E_SQ_STATE_WAKE_TXQ_ENABLE, &sq->state);
+		netdev_tx_reset_queue(sq->txq);
+		netif_tx_start_queue(sq->txq);
+	}
 
 	return 0;
 
@@ -768,15 +796,19 @@ static inline void netif_tx_disable_queue(struct netdev_queue *txq)
 
 static void mlx5e_close_sq(struct mlx5e_sq *sq)
 {
-	clear_bit(MLX5E_SQ_STATE_WAKE_TXQ_ENABLE, &sq->state);
-	napi_synchronize(&sq->channel->napi); /* prevent netif_tx_wake_queue */
-	netif_tx_disable_queue(sq->txq);
+	if (sq->txq) {
+		clear_bit(MLX5E_SQ_STATE_WAKE_TXQ_ENABLE, &sq->state);
+		/* prevent netif_tx_wake_queue */
+		napi_synchronize(&sq->channel->napi);
+		netif_tx_disable_queue(sq->txq);
 
-	/* ensure hw is notified of all pending wqes */
-	if (mlx5e_sq_has_room_for(sq, 1))
-		mlx5e_send_nop(sq, true);
+		/* ensure hw is notified of all pending wqes */
+		if (mlx5e_sq_has_room_for(sq, 1))
+			mlx5e_send_nop(sq, true);
+
+		mlx5e_modify_sq(sq, MLX5_SQC_STATE_RDY, MLX5_SQC_STATE_ERR);
+	}
 
-	mlx5e_modify_sq(sq, MLX5_SQC_STATE_RDY, MLX5_SQC_STATE_ERR);
 	while (sq->cc != sq->pc) /* wait till sq is empty */
 		msleep(20);
 
@@ -1030,10 +1062,14 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
 
 	netif_napi_add(netdev, &c->napi, mlx5e_napi_poll, 64);
 
-	err = mlx5e_open_tx_cqs(c, cparam);
+	err = mlx5e_open_cq(c, &cparam->icosq_cq, &c->icosq.cq, 0, 0);
 	if (err)
 		goto err_napi_del;
 
+	err = mlx5e_open_tx_cqs(c, cparam);
+	if (err)
+		goto err_close_icosq_cq;
+
 	err = mlx5e_open_cq(c, &cparam->rx_cq, &c->rq.cq,
 			    priv->params.rx_cq_moderation_usec,
 			    priv->params.rx_cq_moderation_pkts);
@@ -1042,10 +1078,14 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
 
 	napi_enable(&c->napi);
 
-	err = mlx5e_open_sqs(c, cparam);
+	err = mlx5e_open_sq(c, 0, &cparam->icosq, &c->icosq);
 	if (err)
 		goto err_disable_napi;
 
+	err = mlx5e_open_sqs(c, cparam);
+	if (err)
+		goto err_close_icosq;
+
 	err = mlx5e_open_rq(c, &cparam->rq, &c->rq);
 	if (err)
 		goto err_close_sqs;
@@ -1058,6 +1098,9 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
 err_close_sqs:
 	mlx5e_close_sqs(c);
 
+err_close_icosq:
+	mlx5e_close_sq(&c->icosq);
+
 err_disable_napi:
 	napi_disable(&c->napi);
 	mlx5e_close_cq(&c->rq.cq);
@@ -1065,6 +1108,9 @@ err_disable_napi:
 err_close_tx_cqs:
 	mlx5e_close_tx_cqs(c);
 
+err_close_icosq_cq:
+	mlx5e_close_cq(&c->icosq.cq);
+
 err_napi_del:
 	netif_napi_del(&c->napi);
 	napi_hash_del(&c->napi);
@@ -1077,9 +1123,11 @@ static void mlx5e_close_channel(struct mlx5e_channel *c)
 {
 	mlx5e_close_rq(&c->rq);
 	mlx5e_close_sqs(c);
+	mlx5e_close_sq(&c->icosq);
 	napi_disable(&c->napi);
 	mlx5e_close_cq(&c->rq.cq);
 	mlx5e_close_tx_cqs(c);
+	mlx5e_close_cq(&c->icosq.cq);
 	netif_napi_del(&c->napi);
 
 	napi_hash_del(&c->napi);
@@ -1125,17 +1173,27 @@ static void mlx5e_build_drop_rq_param(struct mlx5e_rq_param *param)
 	MLX5_SET(wq, wq, log_wq_stride,    ilog2(sizeof(struct mlx5e_rx_wqe)));
 }
 
-static void mlx5e_build_sq_param(struct mlx5e_priv *priv,
-				 struct mlx5e_sq_param *param)
+static void mlx5e_build_sq_param_common(struct mlx5e_priv *priv,
+					struct mlx5e_sq_param *param)
 {
 	void *sqc = param->sqc;
 	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
 
-	MLX5_SET(wq, wq, log_wq_sz,     priv->params.log_sq_size);
 	MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
 	MLX5_SET(wq, wq, pd,            priv->pdn);
 
 	param->wq.buf_numa_node = dev_to_node(&priv->mdev->pdev->dev);
+}
+
+static void mlx5e_build_sq_param(struct mlx5e_priv *priv,
+				 struct mlx5e_sq_param *param)
+{
+	void *sqc = param->sqc;
+	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
+
+	mlx5e_build_sq_param_common(priv, param);
+	MLX5_SET(wq, wq, log_wq_sz,     priv->params.log_sq_size);
+
 	param->max_inline = priv->params.tx_max_inline;
 }
 
@@ -1172,20 +1230,49 @@ static void mlx5e_build_tx_cq_param(struct mlx5e_priv *priv,
 {
 	void *cqc = param->cqc;
 
-	MLX5_SET(cqc, cqc, log_cq_size,  priv->params.log_sq_size);
+	MLX5_SET(cqc, cqc, log_cq_size, priv->params.log_sq_size);
 
 	mlx5e_build_common_cq_param(priv, param);
 }
 
+static void mlx5e_build_ico_cq_param(struct mlx5e_priv *priv,
+				     struct mlx5e_cq_param *param,
+				     u8 log_wq_size)
+{
+	void *cqc = param->cqc;
+
+	MLX5_SET(cqc, cqc, log_cq_size, log_wq_size);
+
+	mlx5e_build_common_cq_param(priv, param);
+}
+
+static void mlx5e_build_icosq_param(struct mlx5e_priv *priv,
+				    struct mlx5e_sq_param *param,
+				    u8 log_wq_size)
+{
+	void *sqc = param->sqc;
+	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
+
+	mlx5e_build_sq_param_common(priv, param);
+
+	MLX5_SET(wq, wq, log_wq_sz, log_wq_size);
+
+	param->icosq = true;
+}
+
 static void mlx5e_build_channel_param(struct mlx5e_priv *priv,
 				      struct mlx5e_channel_param *cparam)
 {
+	u8 icosq_log_wq_sz = 0;
+
 	memset(cparam, 0, sizeof(*cparam));
 
 	mlx5e_build_rq_param(priv, &cparam->rq);
 	mlx5e_build_sq_param(priv, &cparam->sq);
+	mlx5e_build_icosq_param(priv, &cparam->icosq, icosq_log_wq_sz);
 	mlx5e_build_rx_cq_param(priv, &cparam->rx_cq);
 	mlx5e_build_tx_cq_param(priv, &cparam->tx_cq);
+	mlx5e_build_ico_cq_param(priv, &cparam->icosq_cq, icosq_log_wq_sz);
 }
 
 static int mlx5e_open_channels(struct mlx5e_priv *priv)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 1ffc7cb..a8d2935 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -54,6 +54,7 @@ void mlx5e_send_nop(struct mlx5e_sq *sq, bool notify_hw)
 
 	sq->skb[pi] = NULL;
 	sq->pc++;
+	sq->stats.nop++;
 
 	if (notify_hw) {
 		cseg->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
@@ -387,7 +388,6 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget)
 			wi = &sq->wqe_info[ci];
 
 			if (unlikely(!skb)) { /* nop */
-				sq->stats.nop++;
 				sqcc++;
 				continue;
 			}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index 9bb4395..ad624cb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -49,6 +49,57 @@ struct mlx5_cqe64 *mlx5e_get_cqe(struct mlx5e_cq *cq)
 	return cqe;
 }
 
+static void mlx5e_poll_ico_cq(struct mlx5e_cq *cq)
+{
+	struct mlx5_wq_cyc *wq;
+	struct mlx5_cqe64 *cqe;
+	struct mlx5e_sq *sq;
+	u16 sqcc;
+
+	cqe = mlx5e_get_cqe(cq);
+	if (likely(!cqe))
+		return;
+
+	sq = container_of(cq, struct mlx5e_sq, cq);
+	wq = &sq->wq;
+
+	/* sq->cc must be updated only after mlx5_cqwq_update_db_record(),
+	 * otherwise a cq overrun may occur
+	 */
+	sqcc = sq->cc;
+
+	do {
+		u16 ci = be16_to_cpu(cqe->wqe_counter) & wq->sz_m1;
+		struct mlx5e_ico_wqe_info *icowi = &sq->ico_wqe_info[ci];
+
+		mlx5_cqwq_pop(&cq->wq);
+		sqcc += icowi->num_wqebbs;
+
+		if (unlikely((cqe->op_own >> 4) != MLX5_CQE_REQ)) {
+			WARN_ONCE(true, "mlx5e: Bad OP in ICOSQ CQE: 0x%x\n",
+				  cqe->op_own);
+			break;
+		}
+
+		switch (icowi->opcode) {
+		case MLX5_OPCODE_NOP:
+			break;
+		default:
+			WARN_ONCE(true,
+				  "mlx5e: Bad OPCODE in ICOSQ WQE info: 0x%x\n",
+				  icowi->opcode);
+		}
+
+	} while ((cqe = mlx5e_get_cqe(cq)));
+
+	mlx5_cqwq_update_db_record(&cq->wq);
+
+	/* ensure cq space is freed before enabling more cqes */
+	wmb();
+
+	sq->cc = sqcc;
+}
+
 int mlx5e_napi_poll(struct napi_struct *napi, int budget)
 {
 	struct mlx5e_channel *c = container_of(napi, struct mlx5e_channel,
@@ -64,6 +115,9 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
 
 	work_done = mlx5e_poll_rx_cq(&c->rq.cq, budget);
 	busy |= work_done == budget;
+
+	mlx5e_poll_ico_cq(&c->icosq.cq);
+
 	busy |= mlx5e_post_rx_wqes(&c->rq);
 
 	if (busy)
@@ -80,6 +134,7 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
 	for (i = 0; i < c->num_tc; i++)
 		mlx5e_cq_arm(&c->sq[i].cq);
 	mlx5e_cq_arm(&c->rq.cq);
+	mlx5e_cq_arm(&c->icosq.cq);
 
 	return work_done;
 }
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next V2 07/11] net/mlx5e: Add fragmented memory support for RX multi packet WQE
From: Saeed Mahameed @ 2016-04-17 21:32 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Tal Alon, Tariq Toukan, Eran Ben Elisha,
	Saeed Mahameed
In-Reply-To: <1460928725-18741-1-git-send-email-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

If the allocation of a linear (physically continuous) MPWQE fails,
we allocate a fragmented MPWQE.

This is implemented via device's UMR (User Memory Registration)
which allows to register multiple memory fragments into ConnectX
hardware as a continuous buffer.
UMR registration is an asynchronous operation and is done via
ICO SQs.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |   78 ++++-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |   64 ++++-
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   |  357 +++++++++++++++++++--
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c   |    4 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c |    3 +
 5 files changed, 458 insertions(+), 48 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index f62495f..1d4e8fe 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -69,6 +69,10 @@
 						 MLX5_MPWRQ_LOG_STRIDE_SIZE)
 #define MLX5_MPWRQ_WQE_PAGE_ORDER  (max_t(int, 0, \
 					  MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT))
+#define MLX5_MPWRQ_WQE_NUM_PAGES		BIT(MLX5_MPWRQ_WQE_PAGE_ORDER)
+#define MLX5_CHANNEL_MAX_NUM_MTTS (ALIGN(MLX5_MPWRQ_WQE_NUM_PAGES, 8) * \
+				   BIT(MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE_MPW))
+#define MLX5_UMR_ALIGN				(2048)
 #define MLX5_MPWRQ_SMALL_PACKET_THRESHOLD	(128)
 
 #define MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ                 (64 * 1024)
@@ -131,6 +135,13 @@ struct mlx5e_rx_wqe {
 	struct mlx5_wqe_data_seg      data;
 };
 
+struct mlx5e_umr_wqe {
+	struct mlx5_wqe_ctrl_seg       ctrl;
+	struct mlx5_wqe_umr_ctrl_seg   uctrl;
+	struct mlx5_mkey_seg           mkc;
+	struct mlx5_wqe_data_seg       data;
+};
+
 #ifdef CONFIG_MLX5_CORE_EN_DCB
 #define MLX5E_MAX_BW_ALLOC 100 /* Max percentage of BW allocation */
 #define MLX5E_MIN_BW_ALLOC 1   /* Min percentage of BW allocation */
@@ -176,6 +187,7 @@ static const char vport_strings[][ETH_GSTRING_LEN] = {
 	"tx_queue_dropped",
 	"rx_wqe_err",
 	"rx_mpwqe_filler",
+	"rx_mpwqe_frag",
 };
 
 struct mlx5e_vport_stats {
@@ -218,8 +230,9 @@ struct mlx5e_vport_stats {
 	u64 tx_queue_dropped;
 	u64 rx_wqe_err;
 	u64 rx_mpwqe_filler;
+	u64 rx_mpwqe_frag;
 
-#define NUM_VPORT_COUNTERS     36
+#define NUM_VPORT_COUNTERS     37
 };
 
 static const char pport_strings[][ETH_GSTRING_LEN] = {
@@ -314,6 +327,7 @@ static const char rq_stats_strings[][ETH_GSTRING_LEN] = {
 	"lro_bytes",
 	"wqe_err",
 	"mpwqe_filler",
+	"mpwqe_frag",
 };
 
 struct mlx5e_rq_stats {
@@ -325,7 +339,8 @@ struct mlx5e_rq_stats {
 	u64 lro_bytes;
 	u64 wqe_err;
 	u64 mpwqe_filler;
-#define NUM_RQ_STATS 8
+	u64 mpwqe_frag;
+#define NUM_RQ_STATS 9
 };
 
 static const char sq_stats_strings[][ETH_GSTRING_LEN] = {
@@ -404,6 +419,7 @@ struct mlx5e_tstamp {
 
 enum {
 	MLX5E_RQ_STATE_POST_WQES_ENABLE,
+	MLX5E_RQ_STATE_UMR_WQE_IN_PROGRESS,
 };
 
 struct mlx5e_cq {
@@ -431,18 +447,14 @@ struct mlx5e_dma_info {
 	dma_addr_t	addr;
 };
 
-struct mlx5e_mpw_info {
-	struct mlx5e_dma_info dma_info;
-	u16		     consumed_strides;
-	u16		     skbs_frags;
-};
-
 struct mlx5e_rq {
 	/* data path */
 	struct mlx5_wq_ll      wq;
 	u32                    wqe_sz;
 	struct sk_buff       **skb;
 	struct mlx5e_mpw_info *wqe_info;
+	__be32                 mkey_be;
+	__be32                 umr_mkey_be;
 
 	struct device         *pdev;
 	struct net_device     *netdev;
@@ -463,6 +475,29 @@ struct mlx5e_rq {
 	struct mlx5e_priv     *priv;
 } ____cacheline_aligned_in_smp;
 
+struct mlx5e_umr_dma_info {
+	__be64                *mtt;
+	__be64                *mtt_no_align;
+	dma_addr_t             mtt_addr;
+	struct mlx5e_dma_info *dma_info;
+};
+
+struct mlx5e_mpw_info {
+	union {
+		struct mlx5e_dma_info     dma_info;
+		struct mlx5e_umr_dma_info umr;
+	};
+	u16 consumed_strides;
+	u16 skbs_frags;
+
+	void (*complete_wqe)(struct mlx5e_rq *rq,
+			     struct mlx5_cqe64 *cqe,
+			     u16 byte_cnt,
+			     struct mlx5e_mpw_info *wi,
+			     struct sk_buff *skb);
+	void (*free_wqe)(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi);
+};
+
 struct mlx5e_tx_wqe_info {
 	u32 num_bytes;
 	u8  num_wqebbs;
@@ -655,6 +690,7 @@ struct mlx5e_priv {
 	u32                        pdn;
 	u32                        tdn;
 	struct mlx5_core_mkey      mkey;
+	struct mlx5_core_mkey      umr_mkey;
 	struct mlx5e_rq            drop_rq;
 
 	struct mlx5e_channel     **channel;
@@ -727,6 +763,21 @@ void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
 bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq);
 int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix);
 int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix);
+void mlx5e_post_rx_fragmented_mpwqe(struct mlx5e_rq *rq);
+void mlx5e_complete_rx_linear_mpwqe(struct mlx5e_rq *rq,
+				    struct mlx5_cqe64 *cqe,
+				    u16 byte_cnt,
+				    struct mlx5e_mpw_info *wi,
+				    struct sk_buff *skb);
+void mlx5e_complete_rx_fragmented_mpwqe(struct mlx5e_rq *rq,
+					struct mlx5_cqe64 *cqe,
+					u16 byte_cnt,
+					struct mlx5e_mpw_info *wi,
+					struct sk_buff *skb);
+void mlx5e_free_rx_linear_mpwqe(struct mlx5e_rq *rq,
+				struct mlx5e_mpw_info *wi);
+void mlx5e_free_rx_fragmented_mpwqe(struct mlx5e_rq *rq,
+				    struct mlx5e_mpw_info *wi);
 struct mlx5_cqe64 *mlx5e_get_cqe(struct mlx5e_cq *cq);
 
 void mlx5e_update_stats(struct mlx5e_priv *priv);
@@ -760,7 +811,7 @@ void mlx5e_build_default_indir_rqt(struct mlx5_core_dev *mdev,
 				   int num_channels);
 
 static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq,
-				      struct mlx5e_tx_wqe *wqe, int bf_sz)
+				      struct mlx5_wqe_ctrl_seg *ctrl, int bf_sz)
 {
 	u16 ofst = MLX5_BF_OFFSET + sq->bf_offset;
 
@@ -774,9 +825,9 @@ static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq,
 	 */
 	wmb();
 	if (bf_sz)
-		__iowrite64_copy(sq->uar_map + ofst, &wqe->ctrl, bf_sz);
+		__iowrite64_copy(sq->uar_map + ofst, ctrl, bf_sz);
 	else
-		mlx5_write64((__be32 *)&wqe->ctrl, sq->uar_map + ofst, NULL);
+		mlx5_write64((__be32 *)ctrl, sq->uar_map + ofst, NULL);
 	/* flush the write-combining mapped buffer */
 	wmb();
 
@@ -797,6 +848,11 @@ static inline int mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev)
 		     MLX5E_MAX_NUM_CHANNELS);
 }
 
+static inline int mlx5e_get_mtt_octw(int npages)
+{
+	return ALIGN(npages, 8) / 2;
+}
+
 extern const struct ethtool_ops mlx5e_ethtool_ops;
 #ifdef CONFIG_MLX5_CORE_EN_DCB
 extern const struct dcbnl_rtnl_ops mlx5e_dcbnl_ops;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index b25b429..942829e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -179,6 +179,7 @@ void mlx5e_update_stats(struct mlx5e_priv *priv)
 	s->rx_csum_sw		= 0;
 	s->rx_wqe_err		= 0;
 	s->rx_mpwqe_filler	= 0;
+	s->rx_mpwqe_frag	= 0;
 	for (i = 0; i < priv->params.num_channels; i++) {
 		rq_stats = &priv->channel[i]->rq.stats;
 
@@ -190,6 +191,7 @@ void mlx5e_update_stats(struct mlx5e_priv *priv)
 		s->rx_csum_sw	+= rq_stats->csum_sw;
 		s->rx_wqe_err   += rq_stats->wqe_err;
 		s->rx_mpwqe_filler += rq_stats->mpwqe_filler;
+		s->rx_mpwqe_frag   += rq_stats->mpwqe_frag;
 
 		for (j = 0; j < priv->params.num_tc; j++) {
 			sq_stats = &priv->channel[i]->sq[j].stats;
@@ -379,7 +381,6 @@ static int mlx5e_create_rq(struct mlx5e_channel *c,
 	for (i = 0; i < wq_sz; i++) {
 		struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
 
-		wqe->data.lkey       = c->mkey_be;
 		wqe->data.byte_count = cpu_to_be32(byte_count);
 	}
 
@@ -390,6 +391,8 @@ static int mlx5e_create_rq(struct mlx5e_channel *c,
 	rq->channel = c;
 	rq->ix      = c->ix;
 	rq->priv    = c->priv;
+	rq->mkey_be = c->mkey_be;
+	rq->umr_mkey_be = cpu_to_be32(c->priv->umr_mkey.key);
 
 	return 0;
 
@@ -1256,6 +1259,7 @@ static void mlx5e_build_icosq_param(struct mlx5e_priv *priv,
 	mlx5e_build_sq_param_common(priv, param);
 
 	MLX5_SET(wq, wq, log_wq_sz, log_wq_size);
+	MLX5_SET(sqc, sqc, reg_umr, MLX5_CAP_ETH(priv->mdev, reg_umr_sq));
 
 	param->icosq = true;
 }
@@ -1263,7 +1267,7 @@ static void mlx5e_build_icosq_param(struct mlx5e_priv *priv,
 static void mlx5e_build_channel_param(struct mlx5e_priv *priv,
 				      struct mlx5e_channel_param *cparam)
 {
-	u8 icosq_log_wq_sz = 0;
+	u8 icosq_log_wq_sz = MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE;
 
 	memset(cparam, 0, sizeof(*cparam));
 
@@ -2458,6 +2462,13 @@ void mlx5e_build_default_indir_rqt(struct mlx5_core_dev *mdev,
 		indirection_rqt[i] = i % num_channels;
 }
 
+static bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev)
+{
+	return MLX5_CAP_GEN(mdev, striding_rq) &&
+		MLX5_CAP_GEN(mdev, umr_ptr_rlky) &&
+		MLX5_CAP_ETH(mdev, reg_umr_sq);
+}
+
 static void mlx5e_build_netdev_priv(struct mlx5_core_dev *mdev,
 				    struct net_device *netdev,
 				    int num_channels)
@@ -2466,7 +2477,7 @@ static void mlx5e_build_netdev_priv(struct mlx5_core_dev *mdev,
 
 	priv->params.log_sq_size           =
 		MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE;
-	priv->params.rq_wq_type = MLX5_CAP_GEN(mdev, striding_rq) ?
+	priv->params.rq_wq_type = mlx5e_check_fragmented_striding_rq_cap(mdev) ?
 		MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ :
 		MLX5_WQ_TYPE_LINKED_LIST;
 
@@ -2639,6 +2650,41 @@ static void mlx5e_destroy_q_counter(struct mlx5e_priv *priv)
 	mlx5_core_dealloc_q_counter(priv->mdev, priv->q_counter);
 }
 
+static int mlx5e_create_umr_mkey(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5_create_mkey_mbox_in *in;
+	struct mlx5_mkey_seg *mkc;
+	int inlen = sizeof(*in);
+	u64 npages =
+		mlx5e_get_max_num_channels(mdev) * MLX5_CHANNEL_MAX_NUM_MTTS;
+	int err;
+
+	in = mlx5_vzalloc(inlen);
+	if (!in)
+		return -ENOMEM;
+
+	mkc = &in->seg;
+	mkc->status = MLX5_MKEY_STATUS_FREE;
+	mkc->flags = MLX5_PERM_UMR_EN |
+		     MLX5_PERM_LOCAL_READ |
+		     MLX5_PERM_LOCAL_WRITE |
+		     MLX5_ACCESS_MODE_MTT;
+
+	mkc->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
+	mkc->flags_pd = cpu_to_be32(priv->pdn);
+	mkc->len = cpu_to_be64(npages << PAGE_SHIFT);
+	mkc->xlt_oct_size = cpu_to_be32(mlx5e_get_mtt_octw(npages));
+	mkc->log2_page_size = PAGE_SHIFT;
+
+	err = mlx5_core_create_mkey(mdev, &priv->umr_mkey, in, inlen, NULL,
+				    NULL, NULL);
+
+	kvfree(in);
+
+	return err;
+}
+
 static void *mlx5e_create_netdev(struct mlx5_core_dev *mdev)
 {
 	struct net_device *netdev;
@@ -2688,10 +2734,16 @@ static void *mlx5e_create_netdev(struct mlx5_core_dev *mdev)
 		goto err_dealloc_transport_domain;
 	}
 
+	err = mlx5e_create_umr_mkey(priv);
+	if (err) {
+		mlx5_core_err(mdev, "create umr mkey failed, %d\n", err);
+		goto err_destroy_mkey;
+	}
+
 	err = mlx5e_create_tises(priv);
 	if (err) {
 		mlx5_core_warn(mdev, "create tises failed, %d\n", err);
-		goto err_destroy_mkey;
+		goto err_destroy_umr_mkey;
 	}
 
 	err = mlx5e_open_drop_rq(priv);
@@ -2774,6 +2826,9 @@ err_close_drop_rq:
 err_destroy_tises:
 	mlx5e_destroy_tises(priv);
 
+err_destroy_umr_mkey:
+	mlx5_core_destroy_mkey(mdev, &priv->umr_mkey);
+
 err_destroy_mkey:
 	mlx5_core_destroy_mkey(mdev, &priv->mkey);
 
@@ -2812,6 +2867,7 @@ static void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, void *vpriv)
 	mlx5e_destroy_rqt(priv, MLX5E_INDIRECTION_RQT);
 	mlx5e_close_drop_rq(priv);
 	mlx5e_destroy_tises(priv);
+	mlx5_core_destroy_mkey(priv->mdev, &priv->umr_mkey);
 	mlx5_core_destroy_mkey(priv->mdev, &priv->mkey);
 	mlx5_core_dealloc_transport_domain(priv->mdev, priv->tdn);
 	mlx5_core_dealloc_pd(priv->mdev, priv->pdn);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index e652604..22c2812 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -65,6 +65,7 @@ int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix)
 
 	*((dma_addr_t *)skb->cb) = dma_addr;
 	wqe->data.addr = cpu_to_be64(dma_addr + MLX5E_NET_IP_ALIGN);
+	wqe->data.lkey = rq->mkey_be;
 
 	rq->skb[ix] = skb;
 
@@ -76,7 +77,190 @@ err_free_skb:
 	return -ENOMEM;
 }
 
-int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix)
+static u16 mlx5e_get_wqe_mtt_offset(u16 rq_ix, u16 wqe_ix)
+{
+	return rq_ix * MLX5_CHANNEL_MAX_NUM_MTTS +
+		wqe_ix * ALIGN(MLX5_MPWRQ_WQE_NUM_PAGES, 8);
+}
+
+static void mlx5e_build_umr_wqe(struct mlx5e_rq *rq,
+				struct mlx5e_sq *sq,
+				struct mlx5e_umr_wqe *wqe,
+				u16 ix)
+{
+	struct mlx5_wqe_ctrl_seg      *cseg = &wqe->ctrl;
+	struct mlx5_wqe_umr_ctrl_seg *ucseg = &wqe->uctrl;
+	struct mlx5_wqe_data_seg      *dseg = &wqe->data;
+	struct mlx5e_mpw_info *wi = &rq->wqe_info[ix];
+	u8 ds_cnt = DIV_ROUND_UP(sizeof(*wqe), MLX5_SEND_WQE_DS);
+	u16 umr_wqe_mtt_offset = mlx5e_get_wqe_mtt_offset(rq->ix, ix);
+
+	memset(wqe, 0, sizeof(*wqe));
+	cseg->opmod_idx_opcode =
+		cpu_to_be32((sq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) |
+			    MLX5_OPCODE_UMR);
+	cseg->qpn_ds    = cpu_to_be32((sq->sqn << MLX5_WQE_CTRL_QPN_SHIFT) |
+				      ds_cnt);
+	cseg->fm_ce_se  = MLX5_WQE_CTRL_CQ_UPDATE;
+	cseg->imm       = rq->umr_mkey_be;
+
+	ucseg->flags = MLX5_UMR_TRANSLATION_OFFSET_EN;
+	ucseg->klm_octowords =
+		cpu_to_be16(mlx5e_get_mtt_octw(MLX5_MPWRQ_WQE_NUM_PAGES));
+	ucseg->bsf_octowords =
+		cpu_to_be16(mlx5e_get_mtt_octw(umr_wqe_mtt_offset));
+	ucseg->mkey_mask     = cpu_to_be64(MLX5_MKEY_MASK_FREE);
+
+	dseg->lkey = sq->mkey_be;
+	dseg->addr = cpu_to_be64(wi->umr.mtt_addr);
+}
+
+static void mlx5e_post_umr_wqe(struct mlx5e_rq *rq, u16 ix)
+{
+	struct mlx5e_sq *sq = &rq->channel->icosq;
+	struct mlx5_wq_cyc *wq = &sq->wq;
+	struct mlx5e_umr_wqe *wqe;
+	u8 num_wqebbs = DIV_ROUND_UP(sizeof(*wqe), MLX5_SEND_WQE_BB);
+	u16 pi;
+
+	/* fill sq edge with nops to avoid wqe wrap around */
+	while ((pi = (sq->pc & wq->sz_m1)) > sq->edge) {
+		sq->ico_wqe_info[pi].opcode = MLX5_OPCODE_NOP;
+		sq->ico_wqe_info[pi].num_wqebbs = 1;
+		mlx5e_send_nop(sq, true);
+	}
+
+	wqe = mlx5_wq_cyc_get_wqe(wq, pi);
+	mlx5e_build_umr_wqe(rq, sq, wqe, ix);
+	sq->ico_wqe_info[pi].opcode = MLX5_OPCODE_UMR;
+	sq->ico_wqe_info[pi].num_wqebbs = num_wqebbs;
+	sq->pc += num_wqebbs;
+	mlx5e_tx_notify_hw(sq, &wqe->ctrl, 0);
+}
+
+static inline int mlx5e_get_wqe_mtt_sz(void)
+{
+	/* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes.
+	 * To avoid copying garbage after the mtt array, we allocate
+	 * a little more.
+	 */
+	return ALIGN(MLX5_MPWRQ_WQE_NUM_PAGES * sizeof(__be64),
+		     MLX5_UMR_MTT_ALIGNMENT);
+}
+
+static int mlx5e_alloc_and_map_page(struct mlx5e_rq *rq,
+				    struct mlx5e_mpw_info *wi,
+				    int i)
+{
+	struct page *page;
+
+	page = dev_alloc_page();
+	if (unlikely(!page))
+		return -ENOMEM;
+
+	wi->umr.dma_info[i].page = page;
+	wi->umr.dma_info[i].addr = dma_map_page(rq->pdev, page, 0, PAGE_SIZE,
+						PCI_DMA_FROMDEVICE);
+	if (unlikely(dma_mapping_error(rq->pdev, wi->umr.dma_info[i].addr))) {
+		put_page(page);
+		return -ENOMEM;
+	}
+	wi->umr.mtt[i] = cpu_to_be64(wi->umr.dma_info[i].addr | MLX5_EN_WR);
+
+	return 0;
+}
+
+static int mlx5e_alloc_rx_fragmented_mpwqe(struct mlx5e_rq *rq,
+					   struct mlx5e_rx_wqe *wqe,
+					   u16 ix)
+{
+	struct mlx5e_mpw_info *wi = &rq->wqe_info[ix];
+	int mtt_sz = mlx5e_get_wqe_mtt_sz();
+	u32 dma_offset = mlx5e_get_wqe_mtt_offset(rq->ix, ix) << PAGE_SHIFT;
+	int i;
+
+	wi->umr.dma_info = kmalloc(sizeof(*wi->umr.dma_info) *
+				   MLX5_MPWRQ_WQE_NUM_PAGES,
+				   GFP_ATOMIC);
+	if (unlikely(!wi->umr.dma_info))
+		goto err_out;
+
+	/* We allocate more than mtt_sz as we will align the pointer */
+	wi->umr.mtt_no_align = kzalloc(mtt_sz + MLX5_UMR_ALIGN - 1,
+				       GFP_ATOMIC);
+	if (unlikely(!wi->umr.mtt_no_align))
+		goto err_free_umr;
+
+	wi->umr.mtt = PTR_ALIGN(wi->umr.mtt_no_align, MLX5_UMR_ALIGN);
+	wi->umr.mtt_addr = dma_map_single(rq->pdev, wi->umr.mtt, mtt_sz,
+					  PCI_DMA_TODEVICE);
+	if (unlikely(dma_mapping_error(rq->pdev, wi->umr.mtt_addr)))
+		goto err_free_mtt;
+
+	for (i = 0; i < MLX5_MPWRQ_WQE_NUM_PAGES; i++)
+		if (unlikely(mlx5e_alloc_and_map_page(rq, wi, i)))
+			goto err_unmap;
+
+	wi->consumed_strides = 0;
+	wi->complete_wqe = mlx5e_complete_rx_fragmented_mpwqe;
+	wi->free_wqe     = mlx5e_free_rx_fragmented_mpwqe;
+	wqe->data.lkey = rq->umr_mkey_be;
+	wqe->data.addr = cpu_to_be64(dma_offset);
+
+	return 0;
+
+err_unmap:
+	while (--i >= 0) {
+		dma_unmap_page(rq->pdev, wi->umr.dma_info[i].addr, PAGE_SIZE,
+			       PCI_DMA_FROMDEVICE);
+		put_page(wi->umr.dma_info[i].page);
+	}
+	dma_unmap_single(rq->pdev, wi->umr.mtt_addr, mtt_sz, PCI_DMA_TODEVICE);
+
+err_free_mtt:
+	kfree(wi->umr.mtt_no_align);
+
+err_free_umr:
+	kfree(wi->umr.dma_info);
+
+err_out:
+	return -ENOMEM;
+}
+
+void mlx5e_free_rx_fragmented_mpwqe(struct mlx5e_rq *rq,
+				    struct mlx5e_mpw_info *wi)
+{
+	int mtt_sz = mlx5e_get_wqe_mtt_sz();
+	int i;
+
+	for (i = 0; i < MLX5_MPWRQ_WQE_NUM_PAGES; i++) {
+		dma_unmap_page(rq->pdev, wi->umr.dma_info[i].addr, PAGE_SIZE,
+			       PCI_DMA_FROMDEVICE);
+		put_page(wi->umr.dma_info[i].page);
+	}
+	dma_unmap_single(rq->pdev, wi->umr.mtt_addr, mtt_sz, PCI_DMA_TODEVICE);
+	kfree(wi->umr.mtt_no_align);
+	kfree(wi->umr.dma_info);
+}
+
+void mlx5e_post_rx_fragmented_mpwqe(struct mlx5e_rq *rq)
+{
+	struct mlx5_wq_ll *wq = &rq->wq;
+	struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(wq, wq->head);
+
+	clear_bit(MLX5E_RQ_STATE_UMR_WQE_IN_PROGRESS, &rq->state);
+	mlx5_wq_ll_push(wq, be16_to_cpu(wqe->next.next_wqe_index));
+	rq->stats.mpwqe_frag++;
+
+	/* ensure wqes are visible to device before updating doorbell record */
+	dma_wmb();
+
+	mlx5_wq_ll_update_db_record(wq);
+}
+
+static int mlx5e_alloc_rx_linear_mpwqe(struct mlx5e_rq *rq,
+				       struct mlx5e_rx_wqe *wqe,
+				       u16 ix)
 {
 	struct mlx5e_mpw_info *wi = &rq->wqe_info[ix];
 	int ret = 0;
@@ -95,7 +279,10 @@ int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix)
 	atomic_add(MLX5_MPWRQ_NUM_STRIDES, &wi->dma_info.page->_count);
 	wi->skbs_frags       = 0;
 	wi->consumed_strides = 0;
-	wqe->data.addr       = cpu_to_be64(wi->dma_info.addr);
+	wi->complete_wqe = mlx5e_complete_rx_linear_mpwqe;
+	wi->free_wqe     = mlx5e_free_rx_linear_mpwqe;
+	wqe->data.lkey = rq->mkey_be;
+	wqe->data.addr = cpu_to_be64(wi->dma_info.addr);
 
 	return 0;
 
@@ -104,11 +291,42 @@ err_put_page:
 	return ret;
 }
 
+void mlx5e_free_rx_linear_mpwqe(struct mlx5e_rq *rq,
+				struct mlx5e_mpw_info *wi)
+{
+	dma_unmap_page(rq->pdev, wi->dma_info.addr, rq->wqe_sz,
+		       PCI_DMA_FROMDEVICE);
+	atomic_sub(MLX5_MPWRQ_NUM_STRIDES - wi->skbs_frags,
+		   &wi->dma_info.page->_count);
+	put_page(wi->dma_info.page);
+}
+
+int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix)
+{
+	int err;
+
+	err = mlx5e_alloc_rx_linear_mpwqe(rq, wqe, ix);
+	if (unlikely(err)) {
+		err = mlx5e_alloc_rx_fragmented_mpwqe(rq, wqe, ix);
+		if (unlikely(err))
+			return err;
+		set_bit(MLX5E_RQ_STATE_UMR_WQE_IN_PROGRESS, &rq->state);
+		mlx5e_post_umr_wqe(rq, ix);
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+#define RQ_CANNOT_POST(rq) \
+		(!test_bit(MLX5E_RQ_STATE_POST_WQES_ENABLE, &rq->state) || \
+		 test_bit(MLX5E_RQ_STATE_UMR_WQE_IN_PROGRESS, &rq->state))
+
 bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 {
 	struct mlx5_wq_ll *wq = &rq->wq;
 
-	if (unlikely(!test_bit(MLX5E_RQ_STATE_POST_WQES_ENABLE, &rq->state)))
+	if (unlikely(RQ_CANNOT_POST(rq)))
 		return false;
 
 	while (!mlx5_wq_ll_is_full(wq)) {
@@ -302,19 +520,119 @@ wq_ll_pop:
 		       &wqe->next.next_wqe_index);
 }
 
-void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
+static inline void mlx5e_sync_copy_linear_data(struct mlx5e_rq *rq,
+					       struct sk_buff *skb,
+					       struct mlx5e_dma_info *dma_info,
+					       unsigned int pg_offset,
+					       unsigned int skb_offset,
+					       unsigned int len)
+{
+	 /* Aligning len to sizeof(long) optimizes memcpy performance */
+	len = ALIGN(len, sizeof(long));
+	dma_sync_single_for_cpu(rq->pdev, dma_info->addr + pg_offset, len,
+				DMA_FROM_DEVICE);
+	skb_copy_to_linear_data_offset(skb, skb_offset,
+				       page_address(dma_info->page) + pg_offset,
+				       len);
+}
+
+void mlx5e_complete_rx_fragmented_mpwqe(struct mlx5e_rq *rq,
+					struct mlx5_cqe64 *cqe,
+					u16 byte_cnt,
+					struct mlx5e_mpw_info *wi,
+					struct sk_buff *skb)
+{
+	u16 stride_ix      = mpwrq_get_cqe_stride_index(cqe);
+	u32 wqe_offset     = stride_ix * MLX5_MPWRQ_STRIDE_SIZE;
+	u32 head_offset    = wqe_offset & (PAGE_SIZE - 1);
+	u32 page_idx       = wqe_offset >> PAGE_SHIFT;
+	struct mlx5e_dma_info *head_dma_info;
+	struct mlx5e_dma_info *dma_info;
+	u16 headlen = min_t(u16, MLX5_MPWRQ_SMALL_PACKET_THRESHOLD, byte_cnt);
+	u16 headlen_pg = min_t(u32, headlen, PAGE_SIZE - head_offset);
+	u32 frag_offset = head_offset + headlen;
+
+	dma_info = &wi->umr.dma_info[page_idx];
+	head_dma_info = dma_info;
+#if (MLX5_MPWRQ_SMALL_PACKET_THRESHOLD >= MLX5_MPWRQ_STRIDE_SIZE)
+	if (unlikely(frag_offset >= PAGE_SIZE)) {
+		dma_info++;
+		frag_offset = headlen - headlen_pg;
+	}
+#endif
+
+	byte_cnt -= headlen;
+
+	while (byte_cnt) {
+		u32 pg_consumed_bytes =
+			min_t(u32, PAGE_SIZE - frag_offset, byte_cnt);
+		unsigned int truesize;
+
+		dma_sync_single_for_cpu(rq->pdev,
+					dma_info->addr + frag_offset,
+					pg_consumed_bytes,
+					DMA_FROM_DEVICE);
+		truesize = ALIGN(pg_consumed_bytes, MLX5_MPWRQ_STRIDE_SIZE);
+		get_page(dma_info->page);
+		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, dma_info->page,
+				frag_offset, pg_consumed_bytes, truesize);
+		byte_cnt -= pg_consumed_bytes;
+		dma_info++;
+		frag_offset = 0;
+	}
+	/* copy header */
+	mlx5e_sync_copy_linear_data(rq, skb, head_dma_info, head_offset, 0,
+				    headlen_pg);
+#if (MLX5_MPWRQ_SMALL_PACKET_THRESHOLD >= MLX5_MPWRQ_STRIDE_SIZE)
+	if (unlikely(headlen - headlen_pg > 0)) {
+		mlx5e_sync_copy_linear_data(rq, skb, ++head_dma_info, 0,
+					    headlen_pg, headlen - headlen_pg);
+	}
+#endif
+	/* skb linear part was allocated with headlen and aligned to long */
+	skb->tail += headlen;
+	skb->len  += headlen;
+}
+
+void mlx5e_complete_rx_linear_mpwqe(struct mlx5e_rq *rq,
+				    struct mlx5_cqe64 *cqe,
+				    u16 byte_cnt,
+				    struct mlx5e_mpw_info *wi,
+				    struct sk_buff *skb)
 {
 	u16 cstrides       = mpwrq_get_cqe_consumed_strides(cqe);
 	u16 stride_ix      = mpwrq_get_cqe_stride_index(cqe);
 	u32 consumed_bytes = cstrides  * MLX5_MPWRQ_STRIDE_SIZE;
 	u32 data_offset    = stride_ix * MLX5_MPWRQ_STRIDE_SIZE;
+	u16 headlen = min_t(u16, MLX5_MPWRQ_SMALL_PACKET_THRESHOLD, byte_cnt);
+	struct mlx5e_dma_info *dma_info;
+
+	dma_info = &wi->dma_info;
+	dma_sync_single_for_cpu(rq->pdev, dma_info->addr + data_offset,
+				consumed_bytes, DMA_FROM_DEVICE);
+
+	byte_cnt -= headlen;
+	if (byte_cnt) {
+		wi->skbs_frags++;
+		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, dma_info->page,
+				data_offset + headlen, byte_cnt,
+				ALIGN(byte_cnt, MLX5_MPWRQ_STRIDE_SIZE));
+	}
+	skb_copy_to_linear_data(skb, page_address(dma_info->page) + data_offset,
+				ALIGN(headlen, sizeof(long)));
+	/* skb linear part was allocated with headlen and aligned to long */
+	skb->tail += headlen;
+	skb->len  += headlen;
+}
+
+void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
+{
+	u16 cstrides       = mpwrq_get_cqe_consumed_strides(cqe);
 	u16 wqe_id         = be16_to_cpu(cqe->wqe_id);
 	struct mlx5e_mpw_info *wi = &rq->wqe_info[wqe_id];
 	struct mlx5e_rx_wqe  *wqe = mlx5_wq_ll_get_wqe(&rq->wq, wqe_id);
 	struct sk_buff *skb;
-	u16 byte_cnt;
 	u16 cqe_bcnt;
-	u16 headlen;
 
 	wi->consumed_strides += cstrides;
 
@@ -335,27 +653,8 @@ void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 		goto mpwrq_cqe_out;
 
 	prefetch(skb->data);
-	dma_sync_single_for_cpu(rq->pdev, wi->dma_info.addr + data_offset,
-				consumed_bytes, DMA_FROM_DEVICE);
-
 	cqe_bcnt = mpwrq_get_cqe_byte_cnt(cqe);
-	headlen = min_t(u16, MLX5_MPWRQ_SMALL_PACKET_THRESHOLD, cqe_bcnt);
-
-	byte_cnt = cqe_bcnt - headlen;
-	if (byte_cnt) {
-		wi->skbs_frags++;
-		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
-				wi->dma_info.page, data_offset + headlen,
-				byte_cnt,
-				ALIGN(byte_cnt, MLX5_MPWRQ_STRIDE_SIZE));
-	}
-
-	skb_copy_to_linear_data(skb,
-				page_address(wi->dma_info.page) + data_offset,
-				ALIGN(headlen, sizeof(long)));
-	/* skb linear part was allocated with headlen and aligned to long */
-	skb->tail += headlen;
-	skb->len  += headlen;
+	wi->complete_wqe(rq, cqe, cqe_bcnt, wi, skb);
 
 	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
 
@@ -363,11 +662,7 @@ mpwrq_cqe_out:
 	if (likely(wi->consumed_strides < MLX5_MPWRQ_NUM_STRIDES))
 		return;
 
-	dma_unmap_page(rq->pdev, wi->dma_info.addr, rq->wqe_sz,
-		       PCI_DMA_FROMDEVICE);
-	atomic_sub(MLX5_MPWRQ_NUM_STRIDES - wi->skbs_frags,
-		   &wi->dma_info.page->_count);
-	put_page(wi->dma_info.page);
+	wi->free_wqe(rq, wi);
 	mlx5_wq_ll_pop(&rq->wq, cqe->wqe_id, &wqe->next.next_wqe_index);
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index a8d2935..229ab16 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -58,7 +58,7 @@ void mlx5e_send_nop(struct mlx5e_sq *sq, bool notify_hw)
 
 	if (notify_hw) {
 		cseg->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
-		mlx5e_tx_notify_hw(sq, wqe, 0);
+		mlx5e_tx_notify_hw(sq, &wqe->ctrl, 0);
 	}
 }
 
@@ -310,7 +310,7 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
 			bf_sz = wi->num_wqebbs << 3;
 
 		cseg->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
-		mlx5e_tx_notify_hw(sq, wqe, bf_sz);
+		mlx5e_tx_notify_hw(sq, &wqe->ctrl, bf_sz);
 	}
 
 	/* fill sq edge with nops to avoid wqe wrap around */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index ad624cb..a3fd0f5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -84,6 +84,9 @@ static void mlx5e_poll_ico_cq(struct mlx5e_cq *cq)
 		switch (icowi->opcode) {
 		case MLX5_OPCODE_NOP:
 			break;
+		case MLX5_OPCODE_UMR:
+			mlx5e_post_rx_fragmented_mpwqe(&sq->channel->rq);
+			break;
 		default:
 			WARN_ONCE(true,
 				  "mlx5e: Bad OPCODE in ICOSQ WQE info: 0x%x\n",
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next V2 08/11] net/mlx5e: Use napi_alloc_skb for RX SKB allocations
From: Saeed Mahameed @ 2016-04-17 21:32 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Tal Alon, Tariq Toukan, Eran Ben Elisha,
	Saeed Mahameed
In-Reply-To: <1460928725-18741-1-git-send-email-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

Instead of netdev_alloc_skb, we use the napi_alloc_skb function
which is designated to allocate skbuff's for RX in a
channel-specific NAPI instance, and implies the IP packet alignment.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |    1 -
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |    4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   |   12 +++++-------
 3 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 1d4e8fe..6127bc7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -91,7 +91,6 @@
 #define MLX5E_SQ_BF_BUDGET             16
 
 #define MLX5E_NUM_MAIN_GROUPS 9
-#define MLX5E_NET_IP_ALIGN 2
 
 static inline u16 mlx5_min_rx_wqes(int wq_type, u32 wq_size)
 {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 942829e..9b17bc0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -373,8 +373,8 @@ static int mlx5e_create_rq(struct mlx5e_channel *c,
 		rq->wqe_sz = (priv->params.lro_en) ?
 				priv->params.lro_wqe_sz :
 				MLX5E_SW2HW_MTU(priv->netdev->mtu);
-		rq->wqe_sz = SKB_DATA_ALIGN(rq->wqe_sz + MLX5E_NET_IP_ALIGN);
-		byte_count = rq->wqe_sz - MLX5E_NET_IP_ALIGN;
+		rq->wqe_sz = SKB_DATA_ALIGN(rq->wqe_sz);
+		byte_count = rq->wqe_sz;
 		byte_count |= MLX5_HW_START_PADDING;
 	}
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 22c2812..ea90e6f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -47,7 +47,7 @@ int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix)
 	struct sk_buff *skb;
 	dma_addr_t dma_addr;
 
-	skb = netdev_alloc_skb(rq->netdev, rq->wqe_sz);
+	skb = napi_alloc_skb(rq->cq.napi, rq->wqe_sz);
 	if (unlikely(!skb))
 		return -ENOMEM;
 
@@ -61,10 +61,8 @@ int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix)
 	if (unlikely(dma_mapping_error(rq->pdev, dma_addr)))
 		goto err_free_skb;
 
-	skb_reserve(skb, MLX5E_NET_IP_ALIGN);
-
 	*((dma_addr_t *)skb->cb) = dma_addr;
-	wqe->data.addr = cpu_to_be64(dma_addr + MLX5E_NET_IP_ALIGN);
+	wqe->data.addr = cpu_to_be64(dma_addr);
 	wqe->data.lkey = rq->mkey_be;
 
 	rq->skb[ix] = skb;
@@ -646,9 +644,9 @@ void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 		goto mpwrq_cqe_out;
 	}
 
-	skb = netdev_alloc_skb(rq->netdev,
-			       ALIGN(MLX5_MPWRQ_SMALL_PACKET_THRESHOLD,
-				     sizeof(long)));
+	skb = napi_alloc_skb(rq->cq.napi,
+			     ALIGN(MLX5_MPWRQ_SMALL_PACKET_THRESHOLD,
+				   sizeof(long)));
 	if (unlikely(!skb))
 		goto mpwrq_cqe_out;
 
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next V2 09/11] net/mlx5e: Remove redundant barrier
From: Saeed Mahameed @ 2016-04-17 21:32 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Tal Alon, Tariq Toukan, Eran Ben Elisha,
	Saeed Mahameed
In-Reply-To: <1460928725-18741-1-git-send-email-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

The bit-op operation one line before is an explicit barrier
by itself.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c |    1 -
 1 files changed, 0 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index a3fd0f5..c38781f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -147,7 +147,6 @@ void mlx5e_completion_event(struct mlx5_core_cq *mcq)
 	struct mlx5e_cq *cq = container_of(mcq, struct mlx5e_cq, mcq);
 
 	set_bit(MLX5E_CHANNEL_NAPI_SCHED, &cq->channel->flags);
-	barrier();
 	napi_schedule(cq->napi);
 }
 
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next V2 11/11] net/mlx5e: Add ethtool counter for RX buffer allocation failures
From: Saeed Mahameed @ 2016-04-17 21:32 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Tal Alon, Tariq Toukan, Eran Ben Elisha,
	Saeed Mahameed
In-Reply-To: <1460928725-18741-1-git-send-email-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

Counts the number of RX buffer allocation failures and shows it
in ethtool statistics.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |    8 ++++++--
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |    2 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   |   11 +++++++++--
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 6127bc7..9aaa23d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -187,6 +187,7 @@ static const char vport_strings[][ETH_GSTRING_LEN] = {
 	"rx_wqe_err",
 	"rx_mpwqe_filler",
 	"rx_mpwqe_frag",
+	"rx_buff_alloc_err",
 };
 
 struct mlx5e_vport_stats {
@@ -230,8 +231,9 @@ struct mlx5e_vport_stats {
 	u64 rx_wqe_err;
 	u64 rx_mpwqe_filler;
 	u64 rx_mpwqe_frag;
+	u64 rx_buff_alloc_err;
 
-#define NUM_VPORT_COUNTERS     37
+#define NUM_VPORT_COUNTERS     38
 };
 
 static const char pport_strings[][ETH_GSTRING_LEN] = {
@@ -327,6 +329,7 @@ static const char rq_stats_strings[][ETH_GSTRING_LEN] = {
 	"wqe_err",
 	"mpwqe_filler",
 	"mpwqe_frag",
+	"buff_alloc_err",
 };
 
 struct mlx5e_rq_stats {
@@ -339,7 +342,8 @@ struct mlx5e_rq_stats {
 	u64 wqe_err;
 	u64 mpwqe_filler;
 	u64 mpwqe_frag;
-#define NUM_RQ_STATS 9
+	u64 buff_alloc_err;
+#define NUM_RQ_STATS 10
 };
 
 static const char sq_stats_strings[][ETH_GSTRING_LEN] = {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 9b17bc0..d485d1e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -180,6 +180,7 @@ void mlx5e_update_stats(struct mlx5e_priv *priv)
 	s->rx_wqe_err		= 0;
 	s->rx_mpwqe_filler	= 0;
 	s->rx_mpwqe_frag	= 0;
+	s->rx_buff_alloc_err	= 0;
 	for (i = 0; i < priv->params.num_channels; i++) {
 		rq_stats = &priv->channel[i]->rq.stats;
 
@@ -192,6 +193,7 @@ void mlx5e_update_stats(struct mlx5e_priv *priv)
 		s->rx_wqe_err   += rq_stats->wqe_err;
 		s->rx_mpwqe_filler += rq_stats->mpwqe_filler;
 		s->rx_mpwqe_frag   += rq_stats->mpwqe_frag;
+		s->rx_buff_alloc_err += rq_stats->buff_alloc_err;
 
 		for (j = 0; j < priv->params.num_tc; j++) {
 			sq_stats = &priv->channel[i]->sq[j].stats;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index ade858b..e4e8c37 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -329,9 +329,14 @@ bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 
 	while (!mlx5_wq_ll_is_full(wq)) {
 		struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(wq, wq->head);
+		int err;
 
-		if (unlikely(rq->alloc_wqe(rq, wqe, wq->head)))
+		err = rq->alloc_wqe(rq, wqe, wq->head);
+		if (unlikely(err)) {
+			if (err != -EBUSY)
+				rq->stats.buff_alloc_err++;
 			break;
+		}
 
 		mlx5_wq_ll_push(wq, be16_to_cpu(wqe->next.next_wqe_index));
 	}
@@ -646,8 +651,10 @@ void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	skb = napi_alloc_skb(rq->cq.napi,
 			     ALIGN(MLX5_MPWRQ_SMALL_PACKET_THRESHOLD,
 				   sizeof(long)));
-	if (unlikely(!skb))
+	if (unlikely(!skb)) {
+		rq->stats.buff_alloc_err++;
 		goto mpwrq_cqe_out;
+	}
 
 	prefetch(skb->data);
 	cqe_bcnt = mpwrq_get_cqe_byte_cnt(cqe);
-- 
1.7.1

^ permalink raw reply related

* Re: [PATCH v3] prism54: isl_38xx: Replace 'struct timeval'
From: Arnd Bergmann @ 2016-04-17 22:10 UTC (permalink / raw)
  To: Johannes Berg
  Cc: y2038, netdev, linux-wireless, Tina Ruchandani, linux-kernel
In-Reply-To: <1460896953.15362.2.camel@sipsolutions.net>

On Sunday 17 April 2016 14:42:33 Johannes Berg wrote:
> 
> I was thinking more restrictively of just the stuff that can't even be
> built without modifying the sources - like the "#if VERBOSE" thing.

All the DEBUG() statements are inside of this kind of check, so if we
remove the #ifdefs, it would be logical to remove the rest of the debugging
infrastructure (DEBUG() macros, SHOW_*, pc_debug, maybe more) as well.

	Arnd
_______________________________________________
Y2038 mailing list
Y2038@lists.linaro.org
https://lists.linaro.org/mailman/listinfo/y2038

^ permalink raw reply

* Re: [PATCHv2] wlcore: spi: add wl18xx support
From: Arnd Bergmann @ 2016-04-17 22:19 UTC (permalink / raw)
  To: Reizer, Eyal
  Cc: Kalle Valo, Eyal Reizer, linux-wireless@vger.kernel.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	devicetree@vger.kernel.org, linux-spi
In-Reply-To: <8665E2433BC68541A24DFFCA87B70F5B360B6C90@DFRE01.ent.ti.com>

On Sunday 10 April 2016 07:37:23 Reizer, Eyal wrote:
> Add support for using with both wl12xx and wl18xx.
> 
> - all wilink family needs special init command for entering wspi mode.
>   extra clock cycles should be sent after the spi init command while the
>   cs pin is high.
> - switch to controling the cs pin from the spi driver for achieveing the
>   above.
> - the selected cs gpio is read from the spi device-tree node using the
>   cs-gpios field and setup as a gpio.
> - See the example below for specifying the cs gpio using the cs-gpios entry
> &spi0   {
>         ...
>         cs-gpios = <&gpio0 5 0>;
>         ...
>         wlcore: wlcore@0 {
>                 compatible = "ti,wl1835";
>         ...
>         ...
>         };
> };
> 
> Signed-off-by: Eyal Reizer <eyalr@ti.com>

I don't think this can work in general: not all SPI hosts uses GPIOs
for controlling CS, so the logic can't work, and it's also a layering
violation for the driver to look at the parent.

I would suggest fixing this using a new API function from the SPI
core, if we don't already have a generic way to do it.

	Arnd

^ permalink raw reply

* [PATCH net-next 0/2] BPF updates
From: Daniel Borkmann @ 2016-04-17 22:29 UTC (permalink / raw)
  To: davem; +Cc: alexei.starovoitov, tgraf, netdev, linux-kernel, Daniel Borkmann

This minor set adds a new helper bpf_skb_event_output() for eBPF cls/act
program types which allows to pass events to user space applications. For
details, please see individual patches.

Thanks!

Daniel Borkmann (2):
  bpf, trace: add BPF_F_CURRENT_CPU flag for bpf_perf_event_output
  bpf: add event output helper for notifications/sampling/logging

 include/linux/bpf.h      |  2 ++
 include/uapi/linux/bpf.h |  4 ++++
 kernel/trace/bpf_trace.c |  7 ++++++-
 net/core/filter.c        | 30 ++++++++++++++++++++++++++++++
 4 files changed, 42 insertions(+), 1 deletion(-)

-- 
1.9.3

^ permalink raw reply

* [PATCH net-next 1/2] bpf, trace: add BPF_F_CURRENT_CPU flag for bpf_perf_event_output
From: Daniel Borkmann @ 2016-04-17 22:29 UTC (permalink / raw)
  To: davem
  Cc: alexei.starovoitov, tgraf, netdev, linux-kernel, Daniel Borkmann,
	Alexei Starovoitov
In-Reply-To: <cover.1460931309.git.daniel@iogearbox.net>

Add a BPF_F_CURRENT_CPU flag to optimize the use-case where user space has
per-CPU ring buffers and the eBPF program pushes the data into the current
CPU's ring buffer which saves us an extra helper function call in eBPF.
Also, make sure to properly reserve the remaining flags which are not used.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 4 ++++
 kernel/trace/bpf_trace.c | 7 ++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 70eda5a..b7b0fb1 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -347,6 +347,10 @@ enum bpf_func_id {
 #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
 #define BPF_F_DONT_FRAGMENT		(1ULL << 2)
 
+/* BPF_FUNC_perf_event_output flags. */
+#define BPF_F_INDEX_MASK		0xffffffffULL
+#define BPF_F_CURRENT_CPU		BPF_F_INDEX_MASK
+
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
  */
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 6855878..6bfe55c 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -225,11 +225,12 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
-static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
+static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
 {
 	struct pt_regs *regs = (struct pt_regs *) (long) r1;
 	struct bpf_map *map = (struct bpf_map *) (long) r2;
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	u64 index = flags & BPF_F_INDEX_MASK;
 	void *data = (void *) (long) r4;
 	struct perf_sample_data sample_data;
 	struct perf_event *event;
@@ -239,6 +240,10 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
 		.data = data,
 	};
 
+	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
+		return -EINVAL;
+	if (index == BPF_F_CURRENT_CPU)
+		index = raw_smp_processor_id();
 	if (unlikely(index >= array->map.max_entries))
 		return -E2BIG;
 
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next 2/2] bpf: add event output helper for notifications/sampling/logging
From: Daniel Borkmann @ 2016-04-17 22:29 UTC (permalink / raw)
  To: davem
  Cc: alexei.starovoitov, tgraf, netdev, linux-kernel, Daniel Borkmann,
	Alexei Starovoitov
In-Reply-To: <cover.1460931309.git.daniel@iogearbox.net>

This patch adds a new helper for cls/act programs that can push events
to user space applications. For networking, this can be f.e. for sampling,
debugging, logging purposes or pushing of arbitrary wake-up events. The
idea is similar to a43eec304259 ("bpf: introduce bpf_perf_event_output()
helper") and 39111695b1b8 ("samples: bpf: add bpf_perf_event_output example").

The eBPF program utilizes a perf event array map that user space populates
with fds from perf_event_open(), the eBPF program calls into the helper
f.e. as skb_event_output(skb, &my_map, BPF_F_CURRENT_CPU, raw, sizeof(raw))
so that the raw data is pushed into the fd f.e. at the map index of the
current CPU.

User space can poll/mmap/etc on this and has a data channel for receiving
events that can be post-processed. The nice thing is that since the eBPF
program and user space application making use of it are tightly coupled,
they can define their own arbitrary raw data format and what/when they
want to push.

While f.e. packet headers could be one part of the meta data that is being
pushed, this is not a substitute for things like packet sockets as whole
packet is not being pushed and push is only done in a single direction.
Intention is more of a generically usable, efficient event pipe to applications.
Workflow is that tc can pin the map and applications can attach themselves
e.g. after cls/act setup to one or multiple map slots, demuxing is done by
the eBPF program.

Adding this facility is with minimal effort, it reuses the helper
introduced in a43eec304259 ("bpf: introduce bpf_perf_event_output() helper")
and we get its functionality for free by overloading its BPF_FUNC_ identifier
for cls/act programs, ctx is currently unused, but will be made use of in
future. Example will be added to iproute2's BPF example files.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h      |  2 ++
 kernel/trace/bpf_trace.c |  2 +-
 net/core/filter.c        | 30 ++++++++++++++++++++++++++++++
 3 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5fb3c61..2116924 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -167,6 +167,8 @@ struct bpf_array {
 
 u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size);
+
 void bpf_fd_array_map_clear(struct bpf_map *map);
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 6bfe55c..7fe2f22 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -225,7 +225,7 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
-static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
+u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
 {
 	struct pt_regs *regs = (struct pt_regs *) (long) r1;
 	struct bpf_map *map = (struct bpf_map *) (long) r2;
diff --git a/net/core/filter.c b/net/core/filter.c
index 5d2ac2b..3521252 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -46,6 +46,7 @@
 #include <linux/seccomp.h>
 #include <linux/if_vlan.h>
 #include <linux/bpf.h>
+#include <linux/perf_event.h>
 #include <net/sch_generic.h>
 #include <net/cls_cgroup.h>
 #include <net/dst_metadata.h>
@@ -1584,6 +1585,33 @@ static const struct bpf_func_proto bpf_csum_diff_proto = {
 	.arg5_type	= ARG_ANYTHING,
 };
 
+u64 __weak bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
+{
+	return -EACCES;
+}
+
+static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
+
+static u64 bpf_skb_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
+{
+	struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
+
+	perf_fetch_caller_regs(regs);
+
+	return bpf_perf_event_output((long)regs, r2, flags, r4, size);
+}
+
+static const struct bpf_func_proto bpf_skb_event_output_proto = {
+	.func		= bpf_skb_event_output,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_STACK,
+	.arg5_type	= ARG_CONST_STACK_SIZE,
+};
+
 static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
 {
 	struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2;
@@ -2039,6 +2067,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 		return &bpf_redirect_proto;
 	case BPF_FUNC_get_route_realm:
 		return &bpf_get_route_realm_proto;
+	case BPF_FUNC_perf_event_output:
+		return &bpf_skb_event_output_proto;
 	default:
 		return sk_filter_func_proto(func_id);
 	}
-- 
1.9.3

^ permalink raw reply related

* Re: bpf: use-after-free in array_map_alloc
From: Sasha Levin @ 2016-04-17 22:45 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: ast, netdev@vger.kernel.org, LKML, Tejun Heo
In-Reply-To: <20160417172943.GA83672@ast-mbp.thefacebook.com>

On 04/17/2016 01:29 PM, Alexei Starovoitov wrote:
> On Sun, Apr 17, 2016 at 12:58:21PM -0400, Sasha Levin wrote:
>> > Hi all,
>> > 
>> > I've hit the following while fuzzing with syzkaller inside a KVM tools guest
>> > running the latest -next kernel:
> thanks for the report. Adding Tejun...
> if I read the report correctly it's not about bpf, but rather points to
> the issue inside percpu logic.
> First __alloc_percpu_gfp() is called, then the memory is freed with
> free_percpu() which triggers async pcpu_balance_work and then
> pcpu_extend_area_map is hitting use-after-free.
> I guess bpf percpu array map is stressing this logic the most.
> Any simpler steps to reproduce ?

No simple way to reproduce. I blamed bpf because I saw a few traces
and it was only bpf that was causing it, there was no other reasoning
behind it.


Thanks,
Sasha

^ permalink raw reply

* Re: [PATCH] macsec: fix crypto Kconfig dependency
From: David Miller @ 2016-04-17 22:52 UTC (permalink / raw)
  To: arnd; +Cc: sd, hannes, johannes, netdev, linux-kernel
In-Reply-To: <1460884918-2879720-1-git-send-email-arnd@arndb.de>

From: Arnd Bergmann <arnd@arndb.de>
Date: Sun, 17 Apr 2016 11:19:55 +0200

> The new MACsec driver uses the AES crypto algorithm, but can be configured
> even if CONFIG_CRYPTO is disabled, leading to a build error:
> 
> warning: (MAC80211 && MACSEC) selects CRYPTO_GCM which has unmet direct dependencies (CRYPTO)
> warning: (BT && CEPH_LIB && INET && MAC802154 && MAC80211 && BLK_DEV_RBD && MACSEC && AIRO_CS && LIBIPW && HOSTAP && USB_WUSB && RTLLIB_CRYPTO_CCMP && FS_ENCRYPTION && EXT4_ENCRYPTION && CEPH_FS && BIG_KEYS && ENCRYPTED_KEYS) selects CRYPTO_AES which has unmet direct dependencies (CRYPTO)
> crypto/built-in.o: In function `gcm_enc_copy_hash':
> aes_generic.c:(.text+0x2b8): undefined reference to `crypto_xor'
> aes_generic.c:(.text+0x2dc): undefined reference to `scatterwalk_map_and_copy'
> 
> This adds an explicit 'select CRYPTO' statement the way that other
> drivers handle it.
> 
> Signed-off-by: Arnd Bergmann <arnd@arndb.de>
> Fixes: c09440f7dcb3 ("macsec: introduce IEEE 802.1AE driver")

Applied, thanks Arnd.

^ permalink raw reply

* Re: [PATCH net-next v4 0/9] net: dsa: mv88e6xxx: factorize switch info
From: David Miller @ 2016-04-17 22:54 UTC (permalink / raw)
  To: vivien.didelot; +Cc: netdev, linux-kernel, kernel, f.fainelli, andrew
In-Reply-To: <1460913843-7459-1-git-send-email-vivien.didelot@savoirfairelinux.com>

From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Sun, 17 Apr 2016 13:23:54 -0400

> This patchset factorizes the mv88e6xxx code by sharing a new extendable
> info structure to store static data such as switch family, product
> number, number of ports, number of databases and the name.
 ...

Series applied, thanks Vivien.

^ permalink raw reply

* Re: [PATCH net-next 5/5] qed*: Add support for read/write of eeprom
From: David Miller @ 2016-04-17 23:18 UTC (permalink / raw)
  To: Yuval.Mintz; +Cc: netdev, sudarsana.kalluru
In-Reply-To: <1460921195-23352-6-git-send-email-Yuval.Mintz@qlogic.com>

From: Yuval Mintz <Yuval.Mintz@qlogic.com>
Date: Sun, 17 Apr 2016 22:26:35 +0300

> There's quite a bit of `magic' here [based on the ethtool `magic'
> field] which is retained by the user application responsible for
> interacting with the driver for the purpose of upgrading the nvram
> image.

This is not how the 'magic' value of the eeprom structure is specified
to be used, please use it the way it is supposed to be used rather
than inventing semantics which only apply to your device.

The entire point of defining a common interface for device driver
interface interaction with the user is exactly so that driver authors
don't do things like this.

^ permalink raw reply

* Re: [PATCH net-next 2/2] bpf: add event output helper for notifications/sampling/logging
From: kbuild test robot @ 2016-04-17 23:55 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: kbuild-all, davem, alexei.starovoitov, tgraf, netdev,
	linux-kernel, Daniel Borkmann, Alexei Starovoitov
In-Reply-To: <3f0cdec313dfaefcc5254a10a58c4e1a380ad79c.1460931309.git.daniel@iogearbox.net>

[-- Attachment #1: Type: text/plain, Size: 1386 bytes --]

Hi Daniel,

[auto build test ERROR on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Daniel-Borkmann/bpf-trace-add-BPF_F_CURRENT_CPU-flag-for-bpf_perf_event_output/20160418-063147
config: m68k-allyesconfig (attached as .config)
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=m68k 

All errors (new ones prefixed by >>):

   net/core/filter.c: In function 'bpf_skb_event_output':
>> net/core/filter.c:1599:2: error: implicit declaration of function 'perf_fetch_caller_regs' [-Werror=implicit-function-declaration]
     perf_fetch_caller_regs(regs);
     ^
   cc1: some warnings being treated as errors

vim +/perf_fetch_caller_regs +1599 net/core/filter.c

  1593	static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
  1594	
  1595	static u64 bpf_skb_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
  1596	{
  1597		struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
  1598	
> 1599		perf_fetch_caller_regs(regs);
  1600	
  1601		return bpf_perf_event_output((long)regs, r2, flags, r4, size);
  1602	}

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/octet-stream, Size: 36318 bytes --]

^ permalink raw reply

* Re: [PATCH net-next V2 05/11] net/mlx5e: Support RX multi-packet WQE (Striding RQ)
From: Eric Dumazet @ 2016-04-18  0:29 UTC (permalink / raw)
  To: Saeed Mahameed
  Cc: David S. Miller, netdev, Or Gerlitz, Tal Alon, Tariq Toukan,
	Eran Ben Elisha, Achiad Shochat
In-Reply-To: <1460928725-18741-6-git-send-email-saeedm@mellanox.com>

On Mon, 2016-04-18 at 00:31 +0300, Saeed Mahameed wrote:

> Performance tested on ConnectX4-Lx 50G.
> To isolate the feature under test, the numbers below were measured with
> HW LRO turned off. We verified that the performance just improves when
> LRO is turned back on.
> 
> * Netperf single TCP stream:
> - BW raised by 10-15% for representative packet sizes:
>   default, 64B, 1024B, 1478B, 65536B.
> 
> * Netperf multi TCP stream:
> - No degradation, line rate reached.
> 
> * Pktgen: packet rate raised by 5-10% for traffic of different message
> sizes: 64B, 128B, 256B, 1024B, and 1500B.
> 
> * Pktgen: packet loss in bursts of small messages (64byte),
> single stream:
> - | num packets | packets loss before | packets loss after
>   |     2K      |       ~ 1K          |       0
>   |     8K      |       ~ 6K          |       0
>   |     16K     |       ~13K          |       0
>   |     32K     |       ~28K          |       0
>   |     64K     |       ~57K          |     ~24K


As I already mentioned, allocated order-5 pages and hoping host only
receives friendly traffic is very optimistic.

A 192 bytes frame, is claiming to consume 192 bytes frag with your new
allocation strategy. (skb->truesize is kind of minimal)

In reality, it can prevent a whole 131072 bytes of memory from being
reclaimed/freed. TCP stack will not consider such skb has a candidate
for collapsing in case of memory pressure or hostile peer.

Your tests are obviously run on a freshly booted host, where all
physical memory can be consumed for networking buffers.

Even with order-3 pages, we have problems (at Facebook and Google) on
hosts that we do not reboot every day.

At the time order-5 allocations fail, it is already too late, as maybe
thousands of out-of-order TCP packets might have consumed all the memory
and the host will die.

/proc/sys/net/ipv4/tcp_mem by default allows TCP to use up to 10% of
hysical memory, assuming skb->truesize is true.

In your schem, TCP might never notice it uses 100% of the ram for
packets stored in out or order queues, since a frag will hold 32 times
more pages than really announced.

If really you need to allocate physically contiguous memory, have you
considered converting the order-5 pages into 32 order-0 ones ?

This way, a 192 bytes frame sitting in one socket would hold one order-0
page in the worst case, and TCP wont be allowed to use all physical
memory.

^ permalink raw reply

* Re: [PATCH net] net: bcmgenet: device stats are unsigned long
From: Florian Fainelli @ 2016-04-18  0:49 UTC (permalink / raw)
  To: Eric Dumazet, David Miller; +Cc: netdev
In-Reply-To: <1460742472.10638.78.camel@edumazet-glaptop3.roam.corp.google.com>

Le 15/04/2016 10:47, Eric Dumazet a écrit :
> From: Eric Dumazet <edumazet@google.com>
> 
> On 64bit kernels, device stats are 64bit wide, not 32bit.
> 
> Fixes: 1c1008c793fa4 ("net: bcmgenet: add main driver file")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Florian Fainelli <f.fainelli@gmail.com>

Late, but:

Acked-by: Florian Fainelli <f.fainelli@gmail.com>

Thanks Eric!
-- 
Florian

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox