* [PATCH V3 for-next 07/11] IB/hns: Modify the macro for the timeout when cmd process
From: Salil Mehta @ 2016-11-23 19:41 UTC (permalink / raw)
To: dledford
Cc: salil.mehta, xavier.huwei, oulijun, xushaobo2, mehta.salil.lnk,
lijun_nudt, linux-rdma, netdev, linux-kernel, linuxarm
In-Reply-To: <20161123194109.420760-1-salil.mehta@huawei.com>
From: "Wei Hu (Xavier)" <xavier.huwei@huawei.com>
This patch modifies the macro used for the timeout while a cmd is
being processed, as follows:
Before modification:
enum {
HNS_ROCE_CMD_TIME_CLASS_A = 10000,
HNS_ROCE_CMD_TIME_CLASS_B = 10000,
HNS_ROCE_CMD_TIME_CLASS_C = 10000,
};
After modification:
#define HNS_ROCE_CMD_TIMEOUT_MSECS 10000
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
---
drivers/infiniband/hw/hns/hns_roce_cmd.h | 7 +------
drivers/infiniband/hw/hns/hns_roce_cq.c | 4 ++--
drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 8 ++++----
drivers/infiniband/hw/hns/hns_roce_mr.c | 4 ++--
4 files changed, 9 insertions(+), 14 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.h b/drivers/infiniband/hw/hns/hns_roce_cmd.h
index e3997d3..ed14ad3 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cmd.h
+++ b/drivers/infiniband/hw/hns/hns_roce_cmd.h
@@ -34,6 +34,7 @@
#define _HNS_ROCE_CMD_H
#define HNS_ROCE_MAILBOX_SIZE 4096
+#define HNS_ROCE_CMD_TIMEOUT_MSECS 10000
enum {
/* TPT commands */
@@ -57,12 +58,6 @@ enum {
HNS_ROCE_CMD_QUERY_QP = 0x22,
};
-enum {
- HNS_ROCE_CMD_TIME_CLASS_A = 10000,
- HNS_ROCE_CMD_TIME_CLASS_B = 10000,
- HNS_ROCE_CMD_TIME_CLASS_C = 10000,
-};
-
struct hns_roce_cmd_mailbox {
void *buf;
dma_addr_t dma;
diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c
index 5dc8d92..461a273 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_cq.c
@@ -77,7 +77,7 @@ static int hns_roce_sw2hw_cq(struct hns_roce_dev *dev,
unsigned long cq_num)
{
return hns_roce_cmd_mbox(dev, mailbox->dma, 0, cq_num, 0,
- HNS_ROCE_CMD_SW2HW_CQ, HNS_ROCE_CMD_TIME_CLASS_A);
+ HNS_ROCE_CMD_SW2HW_CQ, HNS_ROCE_CMD_TIMEOUT_MSECS);
}
static int hns_roce_cq_alloc(struct hns_roce_dev *hr_dev, int nent,
@@ -176,7 +176,7 @@ static int hns_roce_hw2sw_cq(struct hns_roce_dev *dev,
{
return hns_roce_cmd_mbox(dev, 0, mailbox ? mailbox->dma : 0, cq_num,
mailbox ? 0 : 1, HNS_ROCE_CMD_HW2SW_CQ,
- HNS_ROCE_CMD_TIME_CLASS_A);
+ HNS_ROCE_CMD_TIMEOUT_MSECS);
}
static void hns_roce_free_cq(struct hns_roce_dev *hr_dev,
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index b835a55..509ea75 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -1871,12 +1871,12 @@ static int hns_roce_v1_qp_modify(struct hns_roce_dev *hr_dev,
if (op[cur_state][new_state] == HNS_ROCE_CMD_2RST_QP)
return hns_roce_cmd_mbox(hr_dev, 0, 0, hr_qp->qpn, 2,
HNS_ROCE_CMD_2RST_QP,
- HNS_ROCE_CMD_TIME_CLASS_A);
+ HNS_ROCE_CMD_TIMEOUT_MSECS);
if (op[cur_state][new_state] == HNS_ROCE_CMD_2ERR_QP)
return hns_roce_cmd_mbox(hr_dev, 0, 0, hr_qp->qpn, 2,
HNS_ROCE_CMD_2ERR_QP,
- HNS_ROCE_CMD_TIME_CLASS_A);
+ HNS_ROCE_CMD_TIMEOUT_MSECS);
mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
if (IS_ERR(mailbox))
@@ -1886,7 +1886,7 @@ static int hns_roce_v1_qp_modify(struct hns_roce_dev *hr_dev,
ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, hr_qp->qpn, 0,
op[cur_state][new_state],
- HNS_ROCE_CMD_TIME_CLASS_C);
+ HNS_ROCE_CMD_TIMEOUT_MSECS);
hns_roce_free_cmd_mailbox(hr_dev, mailbox);
return ret;
@@ -2681,7 +2681,7 @@ static int hns_roce_v1_query_qpc(struct hns_roce_dev *hr_dev,
ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, hr_qp->qpn, 0,
HNS_ROCE_CMD_QUERY_QP,
- HNS_ROCE_CMD_TIME_CLASS_A);
+ HNS_ROCE_CMD_TIMEOUT_MSECS);
if (!ret)
memcpy(hr_context, mailbox->buf, sizeof(*hr_context));
else
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index d87d189..a5bd645 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -53,7 +53,7 @@ static int hns_roce_sw2hw_mpt(struct hns_roce_dev *hr_dev,
{
return hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, mpt_index, 0,
HNS_ROCE_CMD_SW2HW_MPT,
- HNS_ROCE_CMD_TIME_CLASS_B);
+ HNS_ROCE_CMD_TIMEOUT_MSECS);
}
static int hns_roce_hw2sw_mpt(struct hns_roce_dev *hr_dev,
@@ -62,7 +62,7 @@ static int hns_roce_hw2sw_mpt(struct hns_roce_dev *hr_dev,
{
return hns_roce_cmd_mbox(hr_dev, 0, mailbox ? mailbox->dma : 0,
mpt_index, !mailbox, HNS_ROCE_CMD_HW2SW_MPT,
- HNS_ROCE_CMD_TIME_CLASS_B);
+ HNS_ROCE_CMD_TIMEOUT_MSECS);
}
static int hns_roce_buddy_alloc(struct hns_roce_buddy *buddy, int order,
--
1.7.9.5
^ permalink raw reply related
* [PATCH V3 for-next 06/11] IB/hns: Fix the bug for qp state in hns_roce_v1_m_qp()
From: Salil Mehta @ 2016-11-23 19:41 UTC (permalink / raw)
To: dledford
Cc: salil.mehta, xavier.huwei, oulijun, xushaobo2, mehta.salil.lnk,
lijun_nudt, linux-rdma, netdev, linux-kernel, linuxarm
In-Reply-To: <20161123194109.420760-1-salil.mehta@huawei.com>
From: Lijun Ou <oulijun@huawei.com>
In the old code, attr->qp_state was written into the QP state field
of the QPC. That value may be invalid when (attr_mask & IB_QP_STATE)
is zero, so new_state is used instead.
Signed-off-by: Lijun Ou <oulijun@huawei.com>
Reviewed-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
---
drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index 643a2ff..b835a55 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -2571,7 +2571,7 @@ static int hns_roce_v1_m_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr,
/* Every status migrate must change state */
roce_set_field(context->qpc_bytes_144,
QP_CONTEXT_QPC_BYTES_144_QP_STATE_M,
- QP_CONTEXT_QPC_BYTES_144_QP_STATE_S, attr->qp_state);
+ QP_CONTEXT_QPC_BYTES_144_QP_STATE_S, new_state);
/* SW pass context to HW */
ret = hns_roce_v1_qp_modify(hr_dev, &hr_qp->mtt,
--
1.7.9.5
^ permalink raw reply related
* [PATCH V3 for-next 05/11] IB/hns: Modify the condition of notifying hardware loopback
From: Salil Mehta @ 2016-11-23 19:41 UTC (permalink / raw)
To: dledford
Cc: salil.mehta, xavier.huwei, oulijun, xushaobo2, mehta.salil.lnk,
lijun_nudt, linux-rdma, netdev, linux-kernel, linuxarm
In-Reply-To: <20161123194109.420760-1-salil.mehta@huawei.com>
From: Lijun Ou <oulijun@huawei.com>
This patch modifies the condition for notifying the hardware of loopback.
In hip06, the RoCE Engine has several ports, and each QP is bound to
one port. The hardware only supports loopback within the same port,
not across different ports.
So, if a QP is bound to port N and either the dmac in the QP context
equals the smac of local port N or loop_idc is 1, we should set the
loopback bit in the QP context to notify the hardware.
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
Signed-off-by: Lijun Ou <oulijun@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
---
drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 24 +++++++-----------------
1 file changed, 7 insertions(+), 17 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index e080dd6..643a2ff 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -2244,24 +2244,14 @@ static int hns_roce_v1_m_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr,
QP_CONTEXT_QPC_BYTE_32_SIGNALING_TYPE_S,
hr_qp->sq_signal_bits);
- for (port = 0; port < hr_dev->caps.num_ports; port++) {
- smac = (u8 *)hr_dev->dev_addr[port];
- dev_dbg(dev, "smac: %2x: %2x: %2x: %2x: %2x: %2x\n",
- smac[0], smac[1], smac[2], smac[3], smac[4],
- smac[5]);
- if ((dmac[0] == smac[0]) && (dmac[1] == smac[1]) &&
- (dmac[2] == smac[2]) && (dmac[3] == smac[3]) &&
- (dmac[4] == smac[4]) && (dmac[5] == smac[5])) {
- roce_set_bit(context->qpc_bytes_32,
- QP_CONTEXT_QPC_BYTE_32_LOOPBACK_INDICATOR_S,
- 1);
- break;
- }
- }
-
- if (hr_dev->loop_idc == 0x1)
+ port = (attr_mask & IB_QP_PORT) ? (attr->port_num - 1) :
+ hr_qp->port;
+ smac = (u8 *)hr_dev->dev_addr[port];
+ /* when dmac equals smac or loop_idc is 1, it should loopback */
+ if (ether_addr_equal_unaligned(dmac, smac) ||
+ hr_dev->loop_idc == 0x1)
roce_set_bit(context->qpc_bytes_32,
- QP_CONTEXT_QPC_BYTE_32_LOOPBACK_INDICATOR_S, 1);
+ QP_CONTEXT_QPC_BYTE_32_LOOPBACK_INDICATOR_S, 1);
roce_set_bit(context->qpc_bytes_32,
QP_CONTEXT_QPC_BYTE_32_GLOBAL_HEADER_S,
--
1.7.9.5
^ permalink raw reply related
* [PATCH V3 for-next 04/11] IB/hns: add self loopback for CM
From: Salil Mehta @ 2016-11-23 19:41 UTC (permalink / raw)
To: dledford
Cc: salil.mehta, xavier.huwei, oulijun, xushaobo2, mehta.salil.lnk,
lijun_nudt, linux-rdma, netdev, linux-kernel, linuxarm,
Peter Chen
In-Reply-To: <20161123194109.420760-1-salil.mehta@huawei.com>
From: Lijun Ou <oulijun@huawei.com>
This patch mainly adds self loopback support for CM.
Signed-off-by: Lijun Ou <oulijun@huawei.com>
Signed-off-by: Peter Chen <luck.chen@huawei.com>
Reviewed-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
---
drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 11 +++++++++++
drivers/infiniband/hw/hns/hns_roce_hw_v1.h | 2 ++
2 files changed, 13 insertions(+)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index 959d5ca..e080dd6 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -32,6 +32,7 @@
#include <linux/platform_device.h>
#include <linux/acpi.h>
+#include <linux/etherdevice.h>
#include <rdma/ib_umem.h>
#include "hns_roce_common.h"
#include "hns_roce_device.h"
@@ -72,6 +73,8 @@ int hns_roce_v1_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
int nreq = 0;
u32 ind = 0;
int ret = 0;
+ u8 *smac;
+ int loopback;
if (unlikely(ibqp->qp_type != IB_QPT_GSI &&
ibqp->qp_type != IB_QPT_RC)) {
@@ -129,6 +132,14 @@ int hns_roce_v1_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
UD_SEND_WQE_U32_8_DMAC_5_M,
UD_SEND_WQE_U32_8_DMAC_5_S,
ah->av.mac[5]);
+
+ smac = (u8 *)hr_dev->dev_addr[qp->port];
+ loopback = ether_addr_equal_unaligned(ah->av.mac,
+ smac) ? 1 : 0;
+ roce_set_bit(ud_sq_wqe->u32_8,
+ UD_SEND_WQE_U32_8_LOOPBACK_INDICATOR_S,
+ loopback);
+
roce_set_field(ud_sq_wqe->u32_8,
UD_SEND_WQE_U32_8_OPERATION_TYPE_M,
UD_SEND_WQE_U32_8_OPERATION_TYPE_S,
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.h b/drivers/infiniband/hw/hns/hns_roce_hw_v1.h
index 6004c7f..cf28f1b 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.h
@@ -440,6 +440,8 @@ struct hns_roce_ud_send_wqe {
#define UD_SEND_WQE_U32_8_DMAC_5_M \
(((1UL << 8) - 1) << UD_SEND_WQE_U32_8_DMAC_5_S)
+#define UD_SEND_WQE_U32_8_LOOPBACK_INDICATOR_S 22
+
#define UD_SEND_WQE_U32_8_OPERATION_TYPE_S 16
#define UD_SEND_WQE_U32_8_OPERATION_TYPE_M \
(((1UL << 4) - 1) << UD_SEND_WQE_U32_8_OPERATION_TYPE_S)
--
1.7.9.5
^ permalink raw reply related
* [PATCH V3 for-next 03/11] IB/hns: Optimize the logic of allocating memory using APIs
From: Salil Mehta @ 2016-11-23 19:41 UTC (permalink / raw)
To: dledford
Cc: salil.mehta, xavier.huwei, oulijun, xushaobo2, mehta.salil.lnk,
lijun_nudt, linux-rdma, netdev, linux-kernel, linuxarm,
Ping Zhang
In-Reply-To: <20161123194109.420760-1-salil.mehta@huawei.com>
From: "Wei Hu (Xavier)" <xavier.huwei@huawei.com>
This patch modifies how memory is allocated in the hns RoCE driver.
kcalloc is now used instead of kmalloc_array followed by bitmap_zero,
and when kcalloc fails, vzalloc is called as a fallback to allocate
the memory.
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
Signed-off-by: Ping Zhang <zhangping5@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
---
Change log:
PATCH V2: Addressed comment given by Leon
Link: https://patchwork.kernel.org/patch/9412859/
PATCH V1: Initial Submit
---
drivers/infiniband/hw/hns/hns_roce_mr.c | 16 +++++++++-------
1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index fb87883..d87d189 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -137,11 +137,13 @@ static int hns_roce_buddy_init(struct hns_roce_buddy *buddy, int max_order)
for (i = 0; i <= buddy->max_order; ++i) {
s = BITS_TO_LONGS(1 << (buddy->max_order - i));
- buddy->bits[i] = kmalloc_array(s, sizeof(long), GFP_KERNEL);
- if (!buddy->bits[i])
- goto err_out_free;
-
- bitmap_zero(buddy->bits[i], 1 << (buddy->max_order - i));
+ buddy->bits[i] = kcalloc(s, sizeof(long), GFP_KERNEL |
+ __GFP_NOWARN);
+ if (!buddy->bits[i]) {
+ buddy->bits[i] = vzalloc(s * sizeof(long));
+ if (!buddy->bits[i])
+ goto err_out_free;
+ }
}
set_bit(0, buddy->bits[buddy->max_order]);
@@ -151,7 +153,7 @@ static int hns_roce_buddy_init(struct hns_roce_buddy *buddy, int max_order)
err_out_free:
for (i = 0; i <= buddy->max_order; ++i)
- kfree(buddy->bits[i]);
+ kvfree(buddy->bits[i]);
err_out:
kfree(buddy->bits);
@@ -164,7 +166,7 @@ static void hns_roce_buddy_cleanup(struct hns_roce_buddy *buddy)
int i;
for (i = 0; i <= buddy->max_order; ++i)
- kfree(buddy->bits[i]);
+ kvfree(buddy->bits[i]);
kfree(buddy->bits);
kfree(buddy->num_free);
--
1.7.9.5
^ permalink raw reply related
* [PATCH V3 for-next 02/11] IB/hns: Add code for refreshing CQ CI using TPTR
From: Salil Mehta @ 2016-11-23 19:41 UTC (permalink / raw)
To: dledford
Cc: salil.mehta, xavier.huwei, oulijun, xushaobo2, mehta.salil.lnk,
lijun_nudt, linux-rdma, netdev, linux-kernel, linuxarm,
Dongdong Huang
In-Reply-To: <20161123194109.420760-1-salil.mehta@huawei.com>
From: "Wei Hu (Xavier)" <xavier.huwei@huawei.com>
This patch adds the code for refreshing the CQ CI using TPTR in the
hip06 SoC.
We send a doorbell to the hardware to refresh the CQ CI when the user
successfully polls a cqe. But this fails if the doorbell has been
blocked. So the hardware reads a special buffer called TPTR to get
the latest CI value when the cq is almost full.
This patch supports the special CI buffer as follows:
a) Allocate the memory for TPTR in the hns_roce_tptr_init function and
free it in the hns_roce_tptr_free function; these two functions are
called in the probe function and in the remove function.
b) Add the code for computing the offset (every cq needs 2 bytes) and
write the dma addr to every cq context to notify the hardware, in the
function named hns_roce_v1_write_cqc.
c) Add code for mapping the TPTR buffer to user space in the function
named hns_roce_mmap. The mapping distinguishes TPTR and UAR of user
mode by vm_pgoff (0: UAR, 1: TPTR, others: invalid) in hip06.
d) Add the code for refreshing the CQ CI using TPTR in the function
named hns_roce_v1_poll_cq.
e) Add some variable definitions to the related structures.
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
Signed-off-by: Dongdong Huang(Donald) <hdd.huang@huawei.com>
Signed-off-by: Lijun Ou <oulijun@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
---
drivers/infiniband/hw/hns/hns_roce_common.h | 2 -
drivers/infiniband/hw/hns/hns_roce_cq.c | 9 +++
drivers/infiniband/hw/hns/hns_roce_device.h | 6 +-
drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 79 ++++++++++++++++++++++++---
drivers/infiniband/hw/hns/hns_roce_hw_v1.h | 9 +++
drivers/infiniband/hw/hns/hns_roce_main.c | 13 ++++-
6 files changed, 103 insertions(+), 15 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_common.h b/drivers/infiniband/hw/hns/hns_roce_common.h
index 2970161..0dcb620 100644
--- a/drivers/infiniband/hw/hns/hns_roce_common.h
+++ b/drivers/infiniband/hw/hns/hns_roce_common.h
@@ -253,8 +253,6 @@
#define ROCEE_VENDOR_ID_REG 0x0
#define ROCEE_VENDOR_PART_ID_REG 0x4
-#define ROCEE_HW_VERSION_REG 0x8
-
#define ROCEE_SYS_IMAGE_GUID_L_REG 0xC
#define ROCEE_SYS_IMAGE_GUID_H_REG 0x10
diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c
index 0973659..5dc8d92 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_cq.c
@@ -349,6 +349,15 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev,
goto err_mtt;
}
+ /*
+ * For the QP created by kernel space, tptr value should be initialized
+ * to zero; For the QP created by user space, it will cause synchronous
+ * problems if tptr is set to zero here, so we initialze it in user
+ * space.
+ */
+ if (!context)
+ *hr_cq->tptr_addr = 0;
+
/* Get created cq handler and carry out event */
hr_cq->comp = hns_roce_ib_cq_comp;
hr_cq->event = hns_roce_ib_cq_event;
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 3417315..7242b14 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -37,6 +37,8 @@
#define DRV_NAME "hns_roce"
+#define HNS_ROCE_HW_VER1 ('h' << 24 | 'i' << 16 | '0' << 8 | '6')
+
#define MAC_ADDR_OCTET_NUM 6
#define HNS_ROCE_MAX_MSG_LEN 0x80000000
@@ -296,7 +298,7 @@ struct hns_roce_cq {
u32 cq_depth;
u32 cons_index;
void __iomem *cq_db_l;
- void __iomem *tptr_addr;
+ u16 *tptr_addr;
unsigned long cqn;
u32 vector;
atomic_t refcount;
@@ -553,6 +555,8 @@ struct hns_roce_dev {
int cmd_mod;
int loop_idc;
+ dma_addr_t tptr_dma_addr; /*only for hw v1*/
+ u32 tptr_size; /*only for hw v1*/
struct hns_roce_hw *hw;
};
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index 7485514..959d5ca 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -849,6 +849,45 @@ static void hns_roce_bt_free(struct hns_roce_dev *hr_dev)
priv->bt_table.qpc_buf.buf, priv->bt_table.qpc_buf.map);
}
+static int hns_roce_tptr_init(struct hns_roce_dev *hr_dev)
+{
+ struct device *dev = &hr_dev->pdev->dev;
+ struct hns_roce_buf_list *tptr_buf;
+ struct hns_roce_v1_priv *priv;
+
+ priv = (struct hns_roce_v1_priv *)hr_dev->hw->priv;
+ tptr_buf = &priv->tptr_table.tptr_buf;
+
+ /*
+ * This buffer will be used for CQ's tptr(tail pointer), also
+ * named ci(customer index). Every CQ will use 2 bytes to save
+ * cqe ci in hip06. Hardware will read this area to get new ci
+ * when the queue is almost full.
+ */
+ tptr_buf->buf = dma_alloc_coherent(dev, HNS_ROCE_V1_TPTR_BUF_SIZE,
+ &tptr_buf->map, GFP_KERNEL);
+ if (!tptr_buf->buf)
+ return -ENOMEM;
+
+ hr_dev->tptr_dma_addr = tptr_buf->map;
+ hr_dev->tptr_size = HNS_ROCE_V1_TPTR_BUF_SIZE;
+
+ return 0;
+}
+
+static void hns_roce_tptr_free(struct hns_roce_dev *hr_dev)
+{
+ struct device *dev = &hr_dev->pdev->dev;
+ struct hns_roce_buf_list *tptr_buf;
+ struct hns_roce_v1_priv *priv;
+
+ priv = (struct hns_roce_v1_priv *)hr_dev->hw->priv;
+ tptr_buf = &priv->tptr_table.tptr_buf;
+
+ dma_free_coherent(dev, HNS_ROCE_V1_TPTR_BUF_SIZE,
+ tptr_buf->buf, tptr_buf->map);
+}
+
/**
* hns_roce_v1_reset - reset RoCE
* @hr_dev: RoCE device struct pointer
@@ -906,12 +945,11 @@ void hns_roce_v1_profile(struct hns_roce_dev *hr_dev)
hr_dev->vendor_id = le32_to_cpu(roce_read(hr_dev, ROCEE_VENDOR_ID_REG));
hr_dev->vendor_part_id = le32_to_cpu(roce_read(hr_dev,
ROCEE_VENDOR_PART_ID_REG));
- hr_dev->hw_rev = le32_to_cpu(roce_read(hr_dev, ROCEE_HW_VERSION_REG));
-
hr_dev->sys_image_guid = le32_to_cpu(roce_read(hr_dev,
ROCEE_SYS_IMAGE_GUID_L_REG)) |
((u64)le32_to_cpu(roce_read(hr_dev,
ROCEE_SYS_IMAGE_GUID_H_REG)) << 32);
+ hr_dev->hw_rev = HNS_ROCE_HW_VER1;
caps->num_qps = HNS_ROCE_V1_MAX_QP_NUM;
caps->max_wqes = HNS_ROCE_V1_MAX_WQE_NUM;
@@ -1009,8 +1047,17 @@ int hns_roce_v1_init(struct hns_roce_dev *hr_dev)
goto error_failed_bt_init;
}
+ ret = hns_roce_tptr_init(hr_dev);
+ if (ret) {
+ dev_err(dev, "tptr init failed!\n");
+ goto error_failed_tptr_init;
+ }
+
return 0;
+error_failed_tptr_init:
+ hns_roce_bt_free(hr_dev);
+
error_failed_bt_init:
hns_roce_port_enable(hr_dev, HNS_ROCE_PORT_DOWN);
hns_roce_raq_free(hr_dev);
@@ -1022,6 +1069,7 @@ int hns_roce_v1_init(struct hns_roce_dev *hr_dev)
void hns_roce_v1_exit(struct hns_roce_dev *hr_dev)
{
+ hns_roce_tptr_free(hr_dev);
hns_roce_bt_free(hr_dev);
hns_roce_port_enable(hr_dev, HNS_ROCE_PORT_DOWN);
hns_roce_raq_free(hr_dev);
@@ -1339,14 +1387,21 @@ void hns_roce_v1_write_cqc(struct hns_roce_dev *hr_dev,
dma_addr_t dma_handle, int nent, u32 vector)
{
struct hns_roce_cq_context *cq_context = NULL;
- void __iomem *tptr_addr;
+ struct hns_roce_buf_list *tptr_buf;
+ struct hns_roce_v1_priv *priv;
+ dma_addr_t tptr_dma_addr;
+ int offset;
+
+ priv = (struct hns_roce_v1_priv *)hr_dev->hw->priv;
+ tptr_buf = &priv->tptr_table.tptr_buf;
cq_context = mb_buf;
memset(cq_context, 0, sizeof(*cq_context));
- tptr_addr = 0;
- hr_dev->priv_addr = tptr_addr;
- hr_cq->tptr_addr = tptr_addr;
+ /* Get the tptr for this CQ. */
+ offset = hr_cq->cqn * HNS_ROCE_V1_TPTR_ENTRY_SIZE;
+ tptr_dma_addr = tptr_buf->map + offset;
+ hr_cq->tptr_addr = (u16 *)(tptr_buf->buf + offset);
/* Register cq_context members */
roce_set_field(cq_context->cqc_byte_4,
@@ -1390,10 +1445,10 @@ void hns_roce_v1_write_cqc(struct hns_roce_dev *hr_dev,
roce_set_field(cq_context->cqc_byte_20,
CQ_CONTEXT_CQC_BYTE_20_CQE_TPTR_ADDR_H_M,
CQ_CONTEXT_CQC_BYTE_20_CQE_TPTR_ADDR_H_S,
- (u64)tptr_addr >> 44);
+ tptr_dma_addr >> 44);
cq_context->cqc_byte_20 = cpu_to_le32(cq_context->cqc_byte_20);
- cq_context->cqe_tptr_addr_l = (u32)((u64)tptr_addr >> 12);
+ cq_context->cqe_tptr_addr_l = (u32)(tptr_dma_addr >> 12);
roce_set_field(cq_context->cqc_byte_32,
CQ_CONTEXT_CQC_BYTE_32_CUR_CQE_BA1_H_M,
@@ -1659,8 +1714,14 @@ int hns_roce_v1_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
break;
}
- if (npolled)
+ if (npolled) {
+ *hr_cq->tptr_addr = hr_cq->cons_index &
+ ((hr_cq->cq_depth << 1) - 1);
+
+ /* Memroy barrier */
+ wmb();
hns_roce_v1_cq_set_ci(hr_cq, hr_cq->cons_index);
+ }
spin_unlock_irqrestore(&hr_cq->lock, flags);
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.h b/drivers/infiniband/hw/hns/hns_roce_hw_v1.h
index 2e1878b..6004c7f 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.h
@@ -104,6 +104,10 @@
#define HNS_ROCE_BT_RSV_BUF_SIZE (1 << 17)
+#define HNS_ROCE_V1_TPTR_ENTRY_SIZE 2
+#define HNS_ROCE_V1_TPTR_BUF_SIZE \
+ (HNS_ROCE_V1_TPTR_ENTRY_SIZE * HNS_ROCE_V1_MAX_CQ_NUM)
+
#define HNS_ROCE_ODB_POLL_MODE 0
#define HNS_ROCE_SDB_NORMAL_MODE 0
@@ -983,10 +987,15 @@ struct hns_roce_bt_table {
struct hns_roce_buf_list cqc_buf;
};
+struct hns_roce_tptr_table {
+ struct hns_roce_buf_list tptr_buf;
+};
+
struct hns_roce_v1_priv {
struct hns_roce_db_table db_table;
struct hns_roce_raq_table raq_table;
struct hns_roce_bt_table bt_table;
+ struct hns_roce_tptr_table tptr_table;
};
int hns_dsaf_roce_reset(struct fwnode_handle *dsaf_fwnode, bool dereset);
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index 764e35a..6770171 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -549,6 +549,8 @@ static int hns_roce_dealloc_ucontext(struct ib_ucontext *ibcontext)
static int hns_roce_mmap(struct ib_ucontext *context,
struct vm_area_struct *vma)
{
+ struct hns_roce_dev *hr_dev = to_hr_dev(context->device);
+
if (((vma->vm_end - vma->vm_start) % PAGE_SIZE) != 0)
return -EINVAL;
@@ -558,10 +560,15 @@ static int hns_roce_mmap(struct ib_ucontext *context,
to_hr_ucontext(context)->uar.pfn,
PAGE_SIZE, vma->vm_page_prot))
return -EAGAIN;
-
- } else {
+ } else if (vma->vm_pgoff == 1 && hr_dev->hw_rev == HNS_ROCE_HW_VER1) {
+ /* vm_pgoff: 1 -- TPTR */
+ if (io_remap_pfn_range(vma, vma->vm_start,
+ hr_dev->tptr_dma_addr >> PAGE_SHIFT,
+ hr_dev->tptr_size,
+ vma->vm_page_prot))
+ return -EAGAIN;
+ } else
return -EINVAL;
- }
return 0;
}
--
1.7.9.5
^ permalink raw reply related
* [PATCH V3 for-next 01/11] IB/hns: Add the interface for querying QP1
From: Salil Mehta @ 2016-11-23 19:40 UTC (permalink / raw)
To: dledford-H+wXaHxf7aLQT0dZR+AlfA
Cc: salil.mehta-hv44wF8Li93QT0dZR+AlfA,
xavier.huwei-hv44wF8Li93QT0dZR+AlfA,
oulijun-hv44wF8Li93QT0dZR+AlfA, xushaobo2-hv44wF8Li93QT0dZR+AlfA,
mehta.salil.lnk-Re5JQEeQqe8AvxtiuMwx3w, lijun_nudt-9Onoh4P/yGk,
linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
linuxarm-hv44wF8Li93QT0dZR+AlfA
In-Reply-To: <20161123194109.420760-1-salil.mehta-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
From: Lijun Ou <oulijun-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
The old code only provided an interface for querying ordinary
(non-special) QPs. This patch adds an interface for querying QP1.
Signed-off-by: Lijun Ou <oulijun-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
Reviewed-by: Wei Hu (Xavier) <xavier.huwei-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
Signed-off-by: Salil Mehta <salil.mehta-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
---
Change Log
Patch V2: Addressed the comment provided by Anurup M
Link: https://patchwork.kernel.org/patch/9412855/
Patch V1: Initial Submit
---
drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 83 +++++++++++++++++++++++++++-
drivers/infiniband/hw/hns/hns_roce_hw_v1.h | 6 +-
2 files changed, 86 insertions(+), 3 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index 71232e5..7485514 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -2630,8 +2630,78 @@ static int hns_roce_v1_query_qpc(struct hns_roce_dev *hr_dev,
return ret;
}
-int hns_roce_v1_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
- int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
+static int hns_roce_v1_q_sqp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
+ int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
+ struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
+ struct hns_roce_sqp_context context;
+ u32 addr;
+
+ mutex_lock(&hr_qp->mutex);
+
+ if (hr_qp->state == IB_QPS_RESET) {
+ qp_attr->qp_state = IB_QPS_RESET;
+ goto done;
+ }
+
+ addr = ROCEE_QP1C_CFG0_0_REG +
+ hr_qp->port * sizeof(struct hns_roce_sqp_context);
+ context.qp1c_bytes_4 = roce_read(hr_dev, addr);
+ context.sq_rq_bt_l = roce_read(hr_dev, addr + 1);
+ context.qp1c_bytes_12 = roce_read(hr_dev, addr + 2);
+ context.qp1c_bytes_16 = roce_read(hr_dev, addr + 3);
+ context.qp1c_bytes_20 = roce_read(hr_dev, addr + 4);
+ context.cur_rq_wqe_ba_l = roce_read(hr_dev, addr + 5);
+ context.qp1c_bytes_28 = roce_read(hr_dev, addr + 6);
+ context.qp1c_bytes_32 = roce_read(hr_dev, addr + 7);
+ context.cur_sq_wqe_ba_l = roce_read(hr_dev, addr + 8);
+ context.qp1c_bytes_40 = roce_read(hr_dev, addr + 9);
+
+ hr_qp->state = roce_get_field(context.qp1c_bytes_4,
+ QP1C_BYTES_4_QP_STATE_M,
+ QP1C_BYTES_4_QP_STATE_S);
+ qp_attr->qp_state = hr_qp->state;
+ qp_attr->path_mtu = IB_MTU_256;
+ qp_attr->path_mig_state = IB_MIG_ARMED;
+ qp_attr->qkey = QKEY_VAL;
+ qp_attr->rq_psn = 0;
+ qp_attr->sq_psn = 0;
+ qp_attr->dest_qp_num = 1;
+ qp_attr->qp_access_flags = 6;
+
+ qp_attr->pkey_index = roce_get_field(context.qp1c_bytes_20,
+ QP1C_BYTES_20_PKEY_IDX_M,
+ QP1C_BYTES_20_PKEY_IDX_S);
+ qp_attr->port_num = hr_qp->port + 1;
+ qp_attr->sq_draining = 0;
+ qp_attr->max_rd_atomic = 0;
+ qp_attr->max_dest_rd_atomic = 0;
+ qp_attr->min_rnr_timer = 0;
+ qp_attr->timeout = 0;
+ qp_attr->retry_cnt = 0;
+ qp_attr->rnr_retry = 0;
+ qp_attr->alt_timeout = 0;
+
+done:
+ qp_attr->cur_qp_state = qp_attr->qp_state;
+ qp_attr->cap.max_recv_wr = hr_qp->rq.wqe_cnt;
+ qp_attr->cap.max_recv_sge = hr_qp->rq.max_gs;
+ qp_attr->cap.max_send_wr = hr_qp->sq.wqe_cnt;
+ qp_attr->cap.max_send_sge = hr_qp->sq.max_gs;
+ qp_attr->cap.max_inline_data = 0;
+ qp_init_attr->cap = qp_attr->cap;
+ qp_init_attr->create_flags = 0;
+
+ mutex_unlock(&hr_qp->mutex);
+
+ return 0;
+}
+
+static int hns_roce_v1_q_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
+ int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr)
{
struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
@@ -2767,6 +2837,15 @@ int hns_roce_v1_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
return ret;
}
+int hns_roce_v1_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
+ int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
+{
+ struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
+
+ return hr_qp->doorbell_qpn <= 1 ?
+ hns_roce_v1_q_sqp(ibqp, qp_attr, qp_attr_mask, qp_init_attr) :
+ hns_roce_v1_q_qp(ibqp, qp_attr, qp_attr_mask, qp_init_attr);
+}
static void hns_roce_v1_destroy_qp_common(struct hns_roce_dev *hr_dev,
struct hns_roce_qp *hr_qp,
int is_user)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.h b/drivers/infiniband/hw/hns/hns_roce_hw_v1.h
index 539b0a3b..2e1878b 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.h
@@ -480,13 +480,17 @@ struct hns_roce_sqp_context {
u32 qp1c_bytes_12;
u32 qp1c_bytes_16;
u32 qp1c_bytes_20;
- u32 qp1c_bytes_28;
u32 cur_rq_wqe_ba_l;
+ u32 qp1c_bytes_28;
u32 qp1c_bytes_32;
u32 cur_sq_wqe_ba_l;
u32 qp1c_bytes_40;
};
+#define QP1C_BYTES_4_QP_STATE_S 0
+#define QP1C_BYTES_4_QP_STATE_M \
+ (((1UL << 3) - 1) << QP1C_BYTES_4_QP_STATE_S)
+
#define QP1C_BYTES_4_SQ_WQE_SHIFT_S 8
#define QP1C_BYTES_4_SQ_WQE_SHIFT_M \
(((1UL << 4) - 1) << QP1C_BYTES_4_SQ_WQE_SHIFT_S)
--
1.7.9.5
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH V3 for-next 00/11] Code improvements & fixes for HNS RoCE driver
From: Salil Mehta @ 2016-11-23 19:40 UTC (permalink / raw)
To: dledford
Cc: salil.mehta, xavier.huwei, oulijun, xushaobo2, mehta.salil.lnk,
lijun_nudt, linux-rdma, netdev, linux-kernel, linuxarm
This patchset introduces some code improvements and fixes
for the identified problems in the HNS RoCE driver.
Lijun Ou (4):
IB/hns: Add the interface for querying QP1
IB/hns: add self loopback for CM
IB/hns: Modify the condition of notifying hardware loopback
IB/hns: Fix the bug for qp state in hns_roce_v1_m_qp()
Salil Mehta (1):
IB/hns: Fix for Checkpatch.pl comment style errors
Shaobo Xu (1):
IB/hns: Implement the add_gid/del_gid and optimize the GIDs
management
Wei Hu (Xavier) (5):
IB/hns: Add code for refreshing CQ CI using TPTR
IB/hns: Optimize the logic of allocating memory using APIs
IB/hns: Modify the macro for the timeout when cmd process
IB/hns: Modify query info named port_num when querying RC QP
IB/hns: Change qpn allocation to round-robin mode.
drivers/infiniband/hw/hns/hns_roce_alloc.c | 11 +-
drivers/infiniband/hw/hns/hns_roce_cmd.c | 8 +-
drivers/infiniband/hw/hns/hns_roce_cmd.h | 7 +-
drivers/infiniband/hw/hns/hns_roce_common.h | 2 -
drivers/infiniband/hw/hns/hns_roce_cq.c | 17 +-
drivers/infiniband/hw/hns/hns_roce_device.h | 45 ++--
drivers/infiniband/hw/hns/hns_roce_eq.c | 6 +-
drivers/infiniband/hw/hns/hns_roce_hem.c | 6 +-
drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 267 +++++++++++++++++------
drivers/infiniband/hw/hns/hns_roce_hw_v1.h | 17 +-
drivers/infiniband/hw/hns/hns_roce_main.c | 311 +++++++--------------------
drivers/infiniband/hw/hns/hns_roce_mr.c | 22 +-
drivers/infiniband/hw/hns/hns_roce_pd.c | 5 +-
drivers/infiniband/hw/hns/hns_roce_qp.c | 2 +-
14 files changed, 364 insertions(+), 362 deletions(-)
--
1.7.9.5
^ permalink raw reply
* Re: Enabling peer to peer device transactions for PCIe devices
From: Jason Gunthorpe @ 2016-11-23 19:32 UTC (permalink / raw)
To: Serguei Sagalovitch
Cc: Logan Gunthorpe, Dan Williams, Deucher, Alexander,
linux-nvdimm-hn68Rpc1hR1g9hUCZPvPmw@public.gmane.org,
linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-pci-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
Kuehling, Felix, Bridgman, John,
linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org,
Koenig, Christian, Sander, Ben, Suthikulpanit, Suravee,
Blinzer, Paul,
Linux-media-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <7bc38037-b6ab-943f-59db-6280e16901ab-5C7GfCeVMHo@public.gmane.org>
On Wed, Nov 23, 2016 at 02:14:40PM -0500, Serguei Sagalovitch wrote:
>
> On 2016-11-23 02:05 PM, Jason Gunthorpe wrote:
> >As Bart says, it would be best to be combined with something like
> >Mellanox's ODP MRs, which allows a page to be evicted and then trigger
> >a CPU interrupt if a DMA is attempted so it can be brought back.
> Please note that in the general case (including MR one) we could have
> "page fault" from the different PCIe device. So all PCIe device must
> be synchronized.
Standard RDMA MRs require pinned pages, the DMA address cannot change
while the MR exists (there is no hardware support for this at all), so
page faulting from any other device is out of the question while they
exist. This is the same requirement as typical simple driver DMA which
requires pages pinned until the simple device completes DMA.
ODP RDMA MRs do not require that, they just page fault like the CPU or
really anything and the kernel has to make sense of concurrent page
faults from multiple sources.
The upshot is that GPU scenarios that rely on highly dynamic
virtual->physical translation cannot sanely be combined with standard
long-life RDMA MRs.
Certainly, any solution for GPUs must follow the typical page pinning
semantics, changing the DMA address of a page must be blocked while
any DMA is in progress.
> >Does HMM solve the peer-peer problem? Does it do it generically or
> >only for drivers that are mirroring translation tables?
> In current form HMM doesn't solve peer-peer problem. Currently it allow
> "mirroring" of "malloc" memory on GPU which is not always what needed.
> Additionally there is need to have opportunity to share VRAM allocations
> between different processes.
Humm, so it can be removed from Alexander's list then :\
As Dan suggested, maybe we need to do both. Some kind of fix for
get_user_pages() for smaller mappings (eg ZONE_DEVICE) and a mandatory
API conversion to get_user_dma_sg() for other cases?
Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: Enabling peer to peer device transactions for PCIe devices
From: Serguei Sagalovitch @ 2016-11-23 19:27 UTC (permalink / raw)
To: Christian König, Dan Williams, Dave Hansen,
linux-nvdimm-hn68Rpc1hR1g9hUCZPvPmw@public.gmane.org,
linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-pci-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
Kuehling, Felix,
linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org,
Sander, Ben, Suthikulpanit, Suravee, Deucher, Alexander,
Blinzer, Paul,
Linux-media-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <2a8a6582-f3de-5cda-0c6e-1c93774147e0-5C7GfCeVMHo@public.gmane.org>
On 2016-11-23 03:51 AM, Christian König wrote:
> Am 23.11.2016 um 08:49 schrieb Daniel Vetter:
>> On Tue, Nov 22, 2016 at 01:21:03PM -0800, Dan Williams wrote:
>>> On Tue, Nov 22, 2016 at 1:03 PM, Daniel Vetter <daniel-/w4YWyX8dFk@public.gmane.org> wrote:
>>>> On Tue, Nov 22, 2016 at 9:35 PM, Serguei Sagalovitch
>>>> <serguei.sagalovitch-5C7GfCeVMHo@public.gmane.org> wrote:
>>>>> On 2016-11-22 03:10 PM, Daniel Vetter wrote:
>>>>>> On Tue, Nov 22, 2016 at 9:01 PM, Dan Williams
>>>>>> <dan.j.williams-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
>>>>>> wrote:
>>>>>>> On Tue, Nov 22, 2016 at 10:59 AM, Serguei Sagalovitch
>>>>>>> <serguei.sagalovitch-5C7GfCeVMHo@public.gmane.org> wrote:
>>>>>>>> I personally like "device-DAX" idea but my concerns are:
>>>>>>>>
>>>>>>>> - How well it will co-exists with the DRM infrastructure /
>>>>>>>> implementations
>>>>>>>> in part dealing with CPU pointers?
>>>>>>> Inside the kernel a device-DAX range is "just memory" in the sense
>>>>>>> that you can perform pfn_to_page() on it and issue I/O, but the
>>>>>>> vma is
>>>>>>> not migratable. To be honest I do not know how well that co-exists
>>>>>>> with drm infrastructure.
>>>>>>>
>>>>>>>> - How well we will be able to handle case when we need to
>>>>>>>> "move"/"evict"
>>>>>>>> memory/data to the new location so CPU pointer should
>>>>>>>> point to the
>>>>>>>> new
>>>>>>>> physical location/address
>>>>>>>> (and may be not in PCI device memory at all)?
>>>>>>> So, device-DAX deliberately avoids support for in-kernel
>>>>>>> migration or
>>>>>>> overcommit. Those cases are left to the core mm or drm. The
>>>>>>> device-dax
>>>>>>> interface is for cases where all that is needed is a
>>>>>>> direct-mapping to
>>>>>>> a statically-allocated physical-address range be it persistent
>>>>>>> memory
>>>>>>> or some other special reserved memory range.
>>>>>> For some of the fancy use-cases (e.g. to be comparable to what
>>>>>> HMM can
>>>>>> pull off) I think we want all the magic in core mm, i.e.
>>>>>> migration and
>>>>>> overcommit. At least that seems to be the very strong drive in all
>>>>>> general-purpose gpu abstractions and implementations, where
>>>>>> memory is
>>>>>> allocated with malloc, and then mapped/moved into vram/gpu address
>>>>>> space through some magic,
>>>>> It is possible that there is other way around: memory is requested
>>>>> to be
>>>>> allocated and should be kept in vram for performance reason but due
>>>>> to possible overcommit case we need at least temporally to "move"
>>>>> such
>>>>> allocation to system memory.
>>>> With migration I meant migrating both ways of course. And with stuff
>>>> like numactl we can also influence where exactly the malloc'ed memory
>>>> is allocated originally, at least if we'd expose the vram range as a
>>>> very special numa node that happens to be far away and not hold any
>>>> cpu cores.
>>> I don't think we should be using numa distance to reverse engineer a
>>> certain allocation behavior. The latency data should be truthful, but
>>> you're right we'll need a mechanism to keep general purpose
>>> allocations out of that range by default. Btw, strict isolation is
>>> another design point of device-dax, but I think in this case we're
>>> describing something between the two extremes of full isolation and
>>> full compatibility with existing numactl apis.
>> Yes, agreed. My idea with exposing vram sections using numa nodes wasn't
>> to reuse all the existing allocation policies directly, those won't
>> work.
>> So at boot-up your default numa policy would exclude any vram nodes.
>>
>> But I think (as an -mm layman) that numa gives us a lot of the tools and
>> policy interface that we need to implement what we want for gpus.
>
> Agree completely. From a ten mile high view our GPUs are just command
> processors with local memory as well .
>
> Basically this is also the whole idea of what AMD is pushing with HSA
> for a while.
>
> It's just that a lot of problems start to pop up when you look at all
> the nasty details. For example only part of the GPU memory is usually
> accessible by the CPU.
>
> So even when numa nodes expose a good foundation for this I think
> there is still a lot of code to write.
>
> BTW: I should probably start to read into the numa code of the kernel.
> Any good pointers for that?
I would assume that the "page" allocation logic itself should be inside
the graphics driver, due to possibly different requirements, especially
from graphics: alignment, etc.
>
> Regards,
> Christian.
>
>> Wrt isolation: There's a sliding scale of what different users expect,
>> from full auto everything, including migrating pages around if needed to
>> full isolation all seems to be on the table. As long as we keep vram
>> nodes
>> out of any default allocation numasets, full isolation should be
>> possible.
>> -Daniel
>
>
Sincerely yours,
Serguei Sagalovitch
^ permalink raw reply
* Re: Enabling peer to peer device transactions for PCIe devices
From: Serguei Sagalovitch @ 2016-11-23 19:24 UTC (permalink / raw)
To: Jason Gunthorpe, Dan Williams
Cc: linux-rdma@vger.kernel.org, linux-pci@vger.kernel.org,
Kuehling, Felix, Blinzer, Paul, linux-kernel@vger.kernel.org,
dri-devel@lists.freedesktop.org, Sander, Ben,
Suthikulpanit, Suravee, linux-nvdimm@lists.01.org,
Deucher, Alexander, Bart Van Assche, Logan Gunthorpe,
Koenig, Christian, Linux-media@vger.kernel.org
In-Reply-To: <20161123191215.GB12146@obsidianresearch.com>
On 2016-11-23 02:12 PM, Jason Gunthorpe wrote:
> On Wed, Nov 23, 2016 at 10:40:47AM -0800, Dan Williams wrote:
>
>> I don't think that was designed for the case where the backing memory
>> is a special/static physical address range rather than anonymous
>> "System RAM", right?
> The hardware doesn't care where the memory is. ODP is just a generic
> mechanism to provide demand-fault behavior for a mirrored page table.
>
> ODP has the same issue as everything else, it needs to translate a
> page table entry into a DMA address, and we have no API to do that
> when the page table points to peer-peer memory.
>
> Jason
I would like to note that for graphics applications (especially for VR
support) we should avoid the ODP case at any cost during graphics command
execution, due to the requirement for smooth and predictable playback. We
want to load / "pin" all required resources before the graphics processor
begins to touch them. This is not so critical for compute applications.
Because only the graphics / compute stack knows which resources will be in
use, as well as all the related statistics, only the graphics stack is
capable of making the correct decision about when and _where_ to evict, as
well as when and _where_ to put memory back.
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply
* Re: Enabling peer to peer device transactions for PCIe devices
From: Serguei Sagalovitch @ 2016-11-23 19:14 UTC (permalink / raw)
To: Jason Gunthorpe, Logan Gunthorpe
Cc: Haggai Eran, Bridgman, John,
linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-nvdimm-hn68Rpc1hR1g9hUCZPvPmw@public.gmane.org,
Kuehling, Felix, Blinzer, Paul,
linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org,
Sander, Ben, Suthikulpanit, Suravee,
linux-pci-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
Deucher, Alexander, Koenig, Christian,
Linux-media-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <20161123190515.GA12146-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
On 2016-11-23 02:05 PM, Jason Gunthorpe wrote:
> On Wed, Nov 23, 2016 at 10:13:03AM -0700, Logan Gunthorpe wrote:
>
>> an MR would be very tricky. The MR may be relied upon by another host
>> and the kernel would have to inform user-space the MR was invalid then
>> user-space would have to tell the remote application.
> As Bart says, it would be best to be combined with something like
> Mellanox's ODP MRs, which allows a page to be evicted and then trigger
> a CPU interrupt if a DMA is attempted so it can be brought back.
Please note that in the general case (including the MR one) we could have
a "page fault" from a different PCIe device. So all PCIe devices must
be synchronized.
> includes the usual fencing mechanism so the CPU can block, flush, and
> then evict a page coherently.
>
> This is the general direction the industry is going in: Link PCI DMA
> directly to dynamic user page tabels, including support for demand
> faulting and synchronicity.
>
> Mellanox ODP is a rough implementation of mirroring a process's page
> table via the kernel, while IBM's CAPI (and CCIX, PCI ATS?) is
> probably a good example of where this is ultimately headed.
>
> CAPI allows a PCI DMA to directly target an ASID associated with a
> user process and then use the usual CPU machinery to do the page
> translation for the DMA. This includes page faults for evicted pages,
> and obviously allows eviction and migration..
>
> So, of all the solutions in the original list, I would discard
> anything that isn't VMA focused. Emulating what CAPI does in hardware
> with software is probably the best choice, or we have to do it all
> again when CAPI style hardware broadly rolls out :(
>
> DAX and GPU allocators should create VMAs and manipulate them in the
> usual way to achieve migration, windowing, cache&mirror, movement or
> swap of the potentially peer-peer memory pages. They would have to
> respect the usual rules for a VMA, including pinning.
>
> DMA drivers would use the usual approaches for dealing with DMA from
> a VMA: short term pin or long term coherent translation mirror.
>
> So, to my view (looking from RDMA), the main problem with peer-peer is
> how do you DMA translate VMA's that point at non struct page memory?
>
> Does HMM solve the peer-peer problem? Does it do it generically or
> only for drivers that are mirroring translation tables?
In its current form HMM doesn't solve the peer-peer problem. Currently it
allows "mirroring" of "malloc" memory on the GPU, which is not always what
is needed. Additionally, there is a need to be able to share VRAM
allocations between different processes.
> From a RDMA perspective we could use something other than
> get_user_pages() to pin and DMA translate a VMA if the core community
> could decide on an API. eg get_user_dma_sg() would probably be quite
> usable.
>
> Jason
^ permalink raw reply
* Re: Enabling peer to peer device transactions for PCIe devices
From: Jason Gunthorpe @ 2016-11-23 19:12 UTC (permalink / raw)
To: Dan Williams
Cc: Bridgman, John,
linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-nvdimm-hn68Rpc1hR1g9hUCZPvPmw@public.gmane.org,
Kuehling, Felix, Serguei Sagalovitch, Blinzer, Paul,
linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org,
Sander, Ben, Suthikulpanit, Suravee,
linux-pci-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
Deucher, Alexander, Bart Van Assche, Koenig, Christian,
Linux-media-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <CAPcyv4jsgrsQaeewFedUzcD1XLSQ8vQ5Zyr8EoB_5ORUqmL4nQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
On Wed, Nov 23, 2016 at 10:40:47AM -0800, Dan Williams wrote:
> I don't think that was designed for the case where the backing memory
> is a special/static physical address range rather than anonymous
> "System RAM", right?
The hardware doesn't care where the memory is. ODP is just a generic
mechanism to provide demand-fault behavior for a mirrored page table.
ODP has the same issue as everything else, it needs to translate a
page table entry into a DMA address, and we have no API to do that
when the page table points to peer-peer memory.
Jason
^ permalink raw reply
* Re: Enabling peer to peer device transactions for PCIe devices
From: Serguei Sagalovitch @ 2016-11-23 19:06 UTC (permalink / raw)
To: Bart Van Assche, Logan Gunthorpe, Dan Williams,
Deucher, Alexander
Cc: haggaie-VPRAkNaXOzVWk0Htik3J/w, Bridgman, John,
linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-pci-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
Kuehling, Felix,
linux-nvdimm-hn68Rpc1hR1g9hUCZPvPmw@public.gmane.org,
Blinzer, Paul,
linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org,
Sander, Ben, Suthikulpanit, Suravee, Koenig, Christian,
Linux-media-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <eca737c1-415c-bcd4-80b9-628010638051-XdAiOPVOjttBDgjK7y7TUQ@public.gmane.org>
On 2016-11-23 12:27 PM, Bart Van Assche wrote:
> On 11/23/2016 09:13 AM, Logan Gunthorpe wrote:
>> IMO any memory that has been registered for a P2P transaction should be
>> locked from being evicted. So if there's a get_user_pages call it needs
>> to be pinned until the put_page. The main issue being with the RDMA
>> case: handling an eviction when a chunk of memory has been registered as
>> an MR would be very tricky. The MR may be relied upon by another host
>> and the kernel would have to inform user-space the MR was invalid then
>> user-space would have to tell the remote application.
>
> Hello Logan,
>
> Are you aware that the Linux kernel already supports ODP (On Demand
> Paging)? See also the output of git grep -nHi on.demand.paging. See
> also
> https://www.openfabrics.org/images/eventpresos/workshops2014/DevWorkshop/presos/Tuesday/pdf/04_ODP_update.pdf.
>
> Bart.
My understanding is that the main problems are (a) h/w support and (b)
compatibility with IB Verbs semantics.
^ permalink raw reply
* Re: Enabling peer to peer device transactions for PCIe devices
From: Jason Gunthorpe @ 2016-11-23 19:05 UTC (permalink / raw)
To: Logan Gunthorpe
Cc: Bridgman, John,
linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-nvdimm-hn68Rpc1hR1g9hUCZPvPmw@public.gmane.org,
Kuehling, Felix, Serguei Sagalovitch, Blinzer, Paul,
linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org,
Sander, Ben, Suthikulpanit, Suravee,
linux-pci-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
Deucher, Alexander, Koenig, Christian,
Linux-media-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <45c6e878-bece-7987-aee7-0e940044158c-OTvnGxWRz7hWk0Htik3J/w@public.gmane.org>
On Wed, Nov 23, 2016 at 10:13:03AM -0700, Logan Gunthorpe wrote:
> an MR would be very tricky. The MR may be relied upon by another host
> and the kernel would have to inform user-space the MR was invalid then
> user-space would have to tell the remote application.
As Bart says, it would be best to be combined with something like
Mellanox's ODP MRs, which allows a page to be evicted and then trigger
a CPU interrupt if a DMA is attempted so it can be brought back. This
includes the usual fencing mechanism so the CPU can block, flush, and
then evict a page coherently.
This is the general direction the industry is going in: Link PCI DMA
directly to dynamic user page tables, including support for demand
faulting and synchronicity.
Mellanox ODP is a rough implementation of mirroring a process's page
table via the kernel, while IBM's CAPI (and CCIX, PCI ATS?) is
probably a good example of where this is ultimately headed.
CAPI allows a PCI DMA to directly target an ASID associated with a
user process and then use the usual CPU machinery to do the page
translation for the DMA. This includes page faults for evicted pages,
and obviously allows eviction and migration.
So, of all the solutions in the original list, I would discard
anything that isn't VMA focused. Emulating what CAPI does in hardware
with software is probably the best choice, or we have to do it all
again when CAPI style hardware broadly rolls out :(
DAX and GPU allocators should create VMAs and manipulate them in the
usual way to achieve migration, windowing, cache&mirror, movement or
swap of the potentially peer-peer memory pages. They would have to
respect the usual rules for a VMA, including pinning.
DMA drivers would use the usual approaches for dealing with DMA from
a VMA: short term pin or long term coherent translation mirror.
So, to my view (looking from RDMA), the main problem with peer-peer is
how do you DMA translate VMA's that point at non struct page memory?
Does HMM solve the peer-peer problem? Does it do it generically or
only for drivers that are mirroring translation tables?
From an RDMA perspective we could use something other than
get_user_pages() to pin and DMA translate a VMA if the core community
could decide on an API. eg get_user_dma_sg() would probably be quite
usable.
Jason
^ permalink raw reply
* [PATCH infiniband-diags] ibsendtrap.c: Add support for security traps
From: Hal Rosenstock @ 2016-11-23 18:56 UTC (permalink / raw)
To: Weiny, Ira
Cc: Eitan Zahavi, linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
From: Eitan Zahavi <eitan-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Add support for trap numbers 256, 257, and 258
Signed-off-by: Eitan Zahavi <eitan-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Hal Rosenstock <hal-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
---
diff --git a/src/ibsendtrap.c b/src/ibsendtrap.c
index 659f2d2..7044deb 100644
--- a/src/ibsendtrap.c
+++ b/src/ibsendtrap.c
@@ -121,6 +121,64 @@ static void build_trap129(ib_mad_notice_attr_t * n, ib_portid_t * port)
n->data_details.ntc_129_131.port_num = (uint8_t) error_port;
}
+static void build_trap256_local(ib_mad_notice_attr_t * n, ib_portid_t * port)
+{
+ n->generic_type = 0x80 | IB_NOTICE_TYPE_SECURITY;
+ n->g_or_v.generic.prod_type_lsb = cl_hton16(get_node_type(port));
+ n->g_or_v.generic.trap_num = cl_hton16(256);
+ n->issuer_lid = cl_hton16((uint16_t) port->lid);
+ n->data_details.ntc_256.lid = n->issuer_lid;
+ n->data_details.ntc_256.dr_slid = 0xffff;
+ n->data_details.ntc_256.method = 1;
+ n->data_details.ntc_256.attr_id = cl_ntoh16(0x15);
+ n->data_details.ntc_256.attr_mod = cl_ntoh32(0x12);
+ n->data_details.ntc_256.mkey = cl_ntoh64(0x1234567812345678);
+}
+
+static void build_trap256_lid(ib_mad_notice_attr_t * n, ib_portid_t * port)
+{
+ build_trap256_local(n, port);
+ n->data_details.ntc_256.dr_trunc_hop = 0;
+}
+
+static void build_trap256_dr(ib_mad_notice_attr_t * n, ib_portid_t * port)
+{
+ build_trap256_local(n, port);
+ n->data_details.ntc_256.dr_trunc_hop = 0x80 | 0x4;
+ n->data_details.ntc_256.dr_rtn_path[0] = 5;
+ n->data_details.ntc_256.dr_rtn_path[1] = 6;
+ n->data_details.ntc_256.dr_rtn_path[2] = 7;
+ n->data_details.ntc_256.dr_rtn_path[3] = 8;
+}
+
+static void build_trap257_258(ib_mad_notice_attr_t * n, ib_portid_t * port,
+ uint16_t trap_num)
+{
+ n->generic_type = 0x80 | IB_NOTICE_TYPE_SECURITY;
+ n->g_or_v.generic.prod_type_lsb = cl_hton16(get_node_type(port));
+ n->g_or_v.generic.trap_num = cl_hton16(trap_num);
+ n->issuer_lid = cl_hton16((uint16_t) port->lid);
+ n->data_details.ntc_257_258.lid1 = cl_hton16(1);
+ n->data_details.ntc_257_258.lid2 = cl_hton16(2);
+ n->data_details.ntc_257_258.key = cl_hton32(0x12345678);
+ n->data_details.ntc_257_258.qp1 = cl_hton32(0x010101);
+ n->data_details.ntc_257_258.qp2 = cl_hton32(0x020202);
+ n->data_details.ntc_257_258.gid1.unicast.prefix = cl_ntoh64(0xf8c0000000000001);
+ n->data_details.ntc_257_258.gid1.unicast.interface_id = cl_ntoh64(0x1111222233334444);
+ n->data_details.ntc_257_258.gid2.unicast.prefix = cl_ntoh64(0xf8c0000000000001);
+ n->data_details.ntc_257_258.gid2.unicast.interface_id = cl_ntoh64(0x5678567812341234);
+}
+
+static void build_trap257(ib_mad_notice_attr_t * n, ib_portid_t * port)
+{
+ build_trap257_258(n, port, 257);
+}
+
+static void build_trap258(ib_mad_notice_attr_t * n, ib_portid_t * port)
+{
+ build_trap257_258(n, port, 258);
+}
+
static int send_trap(void (*build) (ib_mad_notice_attr_t *, ib_portid_t *))
{
ib_portid_t sm_port;
@@ -159,6 +217,10 @@ static const trap_def_t traps[] = {
{"link_speed_enabled_change", build_trap144_linkspeed},
{"local_link_integrity", build_trap129},
{"sys_image_guid_change", build_trap145},
+ {"mkey_lid", build_trap256_lid},
+ {"mkey_dr", build_trap256_dr},
+ {"pkey", build_trap257},
+ {"qkey", build_trap258},
{NULL, NULL}
};
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* Re: Enabling peer to peer device transactions for PCIe devices
From: Dan Williams @ 2016-11-23 18:40 UTC (permalink / raw)
To: Bart Van Assche
Cc: Logan Gunthorpe, Serguei Sagalovitch, Deucher, Alexander,
linux-nvdimm@lists.01.org, linux-rdma@vger.kernel.org,
linux-pci@vger.kernel.org, Kuehling, Felix, Bridgman, John,
linux-kernel@vger.kernel.org, dri-devel@lists.freedesktop.org,
Koenig, Christian, Sander, Ben, Suthikulpanit, Suravee,
Blinzer, Paul, Linux-media@vger.kernel.org
In-Reply-To: <eca737c1-415c-bcd4-80b9-628010638051@sandisk.com>
On Wed, Nov 23, 2016 at 9:27 AM, Bart Van Assche
<bart.vanassche@sandisk.com> wrote:
> On 11/23/2016 09:13 AM, Logan Gunthorpe wrote:
>>
>> IMO any memory that has been registered for a P2P transaction should be
>> locked from being evicted. So if there's a get_user_pages call it needs
>> to be pinned until the put_page. The main issue being with the RDMA
>> case: handling an eviction when a chunk of memory has been registered as
>> an MR would be very tricky. The MR may be relied upon by another host
>> and the kernel would have to inform user-space the MR was invalid then
>> user-space would have to tell the remote application.
>
>
> Hello Logan,
>
> Are you aware that the Linux kernel already supports ODP (On Demand Paging)?
> See also the output of git grep -nHi on.demand.paging. See also
> https://www.openfabrics.org/images/eventpresos/workshops2014/DevWorkshop/presos/Tuesday/pdf/04_ODP_update.pdf.
>
I don't think that was designed for the case where the backing memory
is a special/static physical address range rather than anonymous
"System RAM", right?
I think we should handle the graphics P2P concerns separately from the
general P2P-DMA case since the latter does not require the higher
order memory management facilities. Using ZONE_DEVICE/DAX mappings to
avoid changes to every driver that wants to support P2P-DMA separately
from typical DMA still seems the path of least resistance.
^ permalink raw reply
* Re: [PATCH v2 06/11] IB/mad: Ensure DR MADs are correctly specified when using OPA devices
From: Chandramouli, Dasaratharaman @ 2016-11-23 18:29 UTC (permalink / raw)
To: Hal Rosenstock, Ira Weiny, Don Hiatt, linux-rdma, Doug Ledford
In-Reply-To: <4a729fdc-9386-c0b0-a29e-11cbddf66058-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
On 11/23/2016 6:21 AM, Hal Rosenstock wrote:
> On 11/22/2016 2:38 PM, Dasaratharaman Chandramouli wrote:
>> From: Don Hiatt <don.hiatt-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
>>
>> Pure DR MADs do not need OPA GIDs to be specified in the GRH since
>> they do not rely on LID information.
>>
>> Reviewed-by: Ira Weiny <ira.weiny-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
>> Signed-off-by: Dasaratharaman Chandramouli <dasaratharaman.chandramouli-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
>> Signed-off-by: Don Hiatt <don.hiatt-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
>> ---
>> drivers/infiniband/core/mad.c | 104 +++++++++++++++++++++++++++++++++++++-----
>> include/rdma/opa_addr.h | 17 +++++++
>> 2 files changed, 109 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
>> index 40cbd6b..c0ee997 100644
>> --- a/drivers/infiniband/core/mad.c
>> +++ b/drivers/infiniband/core/mad.c
>> @@ -41,6 +41,7 @@
>> #include <linux/slab.h>
>> #include <linux/module.h>
>> #include <rdma/ib_cache.h>
>> +#include <rdma/opa_addr.h>
>>
>> #include "mad_priv.h"
>> #include "mad_rmpp.h"
>> @@ -731,6 +732,80 @@ static size_t mad_priv_dma_size(const struct ib_mad_private *mp)
>> return sizeof(struct ib_grh) + mp->mad_size;
>> }
>>
>> +static int verify_mad_ah(struct ib_mad_agent_private *mad_agent_priv,
>
> I think it would be better if this were named opa_verify_mad_ah to make
> it clearer that this is an OPA only routine.
>
> -- Hal
Thanks. Will rename this to opa_verify_mad_ah
-Dasa
>
>> + struct ib_mad_send_wr_private *mad_send_wr)
>> +{
>> + struct ib_device *ib_dev = mad_agent_priv->qp_info->port_priv->device;
>> + u8 port = mad_agent_priv->qp_info->port_priv->port_num;
>> + struct ib_smp *smp = mad_send_wr->send_buf.mad;
>> + struct opa_smp *opa_smp = (struct opa_smp *)smp;
>> + u32 opa_drslid = be32_to_cpu(opa_smp->route.dr.dr_slid);
>> + u32 opa_drdlid = be32_to_cpu(opa_smp->route.dr.dr_dlid);
>> +
>> + bool dr_slid_is_permissive = (OPA_LID_PERMISSIVE ==
>> + opa_smp->route.dr.dr_slid) ? true : false;
>> + bool dr_dlid_is_permissive = (OPA_LID_PERMISSIVE ==
>> + opa_smp->route.dr.dr_dlid) ? true : false;
>> + bool drslid_is_ib_ucast = (opa_drslid <
>> + be16_to_cpu(IB_MULTICAST_LID_BASE)) ?
>> + true : false;
>> + bool drdlid_is_ib_ucast = (opa_drdlid <
>> + be16_to_cpu(IB_MULTICAST_LID_BASE)) ?
>> + true : false;
>> + bool drslid_is_ext = !drslid_is_ib_ucast && !dr_slid_is_permissive;
>> + bool drdlid_is_ext = !drdlid_is_ib_ucast && !dr_dlid_is_permissive;
>> + bool grh_present = false;
>> + struct ib_ah_attr attr;
>> + union ib_gid sgid;
>> + int ret = 0;
>> +
>> + ret = ib_query_ah(mad_send_wr->send_buf.ah, &attr);
>> + if (ret)
>> + return ret;
>> + grh_present = (attr.ah_flags & IB_AH_GRH);
>> + if (grh_present) {
>> + ret = ib_query_gid(ib_dev, port, attr.grh.sgid_index,
>> + &sgid, NULL);
>> + if (ret)
>> + return ret;
>> + }
>> +
>> + if (smp->class_version == OPA_SMP_CLASS_VERSION) {
>> + /*
>> + * Conditions when GRH info should not be specified
>> + * 1. both dr_slid and dr_dlid are permissive (Pure DR)
>> + * 2. both dr_slid and dr_dlid are less than 0xc000.
>> + *
>> + * Conditions when GRH info should be specified
>> + * 1. dr_dlid is not permissive and above 0xbfff
>> + * OR
>> + * 2. dr_slid is not permissive and above 0xbfff
>> + */
>> + if (grh_present) {
>> + if ((dr_slid_is_permissive &&
>> + dr_dlid_is_permissive) ||
>> + (drslid_is_ib_ucast && drdlid_is_ib_ucast))
>> + if (ib_is_opa_gid(&attr.grh.dgid) &&
>> + ib_is_opa_gid(&sgid))
>> + return -EINVAL;
>> + if (drslid_is_ext && !ib_is_opa_gid(&sgid))
>> + return -EINVAL;
>> + if (drdlid_is_ext &&
>> + !ib_is_opa_gid(&attr.grh.dgid))
>> + return -EINVAL;
>> + } else { /* There is no GRH */
>> + if (drslid_is_ext || drdlid_is_ext)
>> + return -EINVAL;
>> + }
>> + } else {
>> + if (grh_present)
>> + if (ib_is_opa_gid(&attr.grh.dgid) &&
>> + ib_is_opa_gid(&sgid))
>> + return -EINVAL;
>> + }
>> + return ret;
>> +}
>> +
>> /*
>> * Return 0 if SMP is to be sent
>> * Return 1 if SMP was consumed locally (whether or not solicited)
>> @@ -754,8 +829,12 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
>> size_t mad_size = port_mad_size(mad_agent_priv->qp_info->port_priv);
>> u16 out_mad_pkey_index = 0;
>> u16 drslid;
>> - bool opa = rdma_cap_opa_mad(mad_agent_priv->qp_info->port_priv->device,
>> - mad_agent_priv->qp_info->port_priv->port_num);
>> + bool opa_mad =
>> + rdma_cap_opa_mad(mad_agent_priv->qp_info->port_priv->device,
>> + mad_agent_priv->qp_info->port_priv->port_num);
>> + bool opa_ah =
>> + rdma_cap_opa_ah(mad_agent_priv->qp_info->port_priv->device,
>> + mad_agent_priv->qp_info->port_priv->port_num);
>>
>> if (rdma_cap_ib_switch(device) &&
>> smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
>> @@ -763,13 +842,21 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
>> else
>> port_num = mad_agent_priv->agent.port_num;
>>
>> + if (opa_mad && opa_ah) {
>> + ret = verify_mad_ah(mad_agent_priv, mad_send_wr);
>> + if (ret) {
>> + dev_err(&device->dev,
>> + "Error verifying MAD format\n");
>> + goto out;
>> + }
>> + }
>> /*
>> * Directed route handling starts if the initial LID routed part of
>> * a request or the ending LID routed part of a response is empty.
>> * If we are at the start of the LID routed part, don't update the
>> * hop_ptr or hop_cnt. See section 14.2.2, Vol 1 IB spec.
>> */
>> - if (opa && smp->class_version == OPA_SMP_CLASS_VERSION) {
>> + if (opa_mad && smp->class_version == OPA_SMP_CLASS_VERSION) {
>> u32 opa_drslid;
>>
>> if ((opa_get_smp_direction(opa_smp)
>> @@ -783,13 +870,6 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
>> goto out;
>> }
>> opa_drslid = be32_to_cpu(opa_smp->route.dr.dr_slid);
>> - if (opa_drslid != be32_to_cpu(OPA_LID_PERMISSIVE) &&
>> - opa_drslid & 0xffff0000) {
>> - ret = -EINVAL;
>> - dev_err(&device->dev, "OPA Invalid dr_slid 0x%x\n",
>> - opa_drslid);
>> - goto out;
>> - }
>> drslid = (u16)(opa_drslid & 0x0000ffff);
>>
>> /* Check to post send on QP or process locally */
>> @@ -834,7 +914,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
>> send_wr->pkey_index,
>> send_wr->port_num, &mad_wc);
>>
>> - if (opa && smp->base_version == OPA_MGMT_BASE_VERSION) {
>> + if (opa_mad && smp->base_version == OPA_MGMT_BASE_VERSION) {
>> mad_wc.byte_len = mad_send_wr->send_buf.hdr_len
>> + mad_send_wr->send_buf.data_len
>> + sizeof(struct ib_grh);
>> @@ -891,7 +971,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
>> }
>>
>> local->mad_send_wr = mad_send_wr;
>> - if (opa) {
>> + if (opa_mad) {
>> local->mad_send_wr->send_wr.pkey_index = out_mad_pkey_index;
>> local->return_wc_byte_len = mad_size;
>> }
>> diff --git a/include/rdma/opa_addr.h b/include/rdma/opa_addr.h
>> index 142b327..3e22937 100644
>> --- a/include/rdma/opa_addr.h
>> +++ b/include/rdma/opa_addr.h
>> @@ -33,6 +33,23 @@
>> #if !defined(OPA_ADDR_H)
>> #define OPA_ADDR_H
>>
>> +#include <rdma/ib_verbs.h>
>> +
>> #define OPA_TO_IB_UCAST_LID(x) (((x) >= be16_to_cpu(IB_MULTICAST_LID_BASE)) \
>> ? 0 : x)
>> +#define OPA_SPECIAL_OUI (0x00066AULL)
>> +
>> +/**
>> + * ib_is_opa_gid: Returns true if the top 24 bits of the gid
>> + * contain the OPA_SPECIAL_OUI identifier. This identifies that
>> + * the provided gid is a special purpose GID meant to carry
>> + * extended LID information.
>> + *
>> + * @gid: The Global identifier
>> + */
>> +static inline bool ib_is_opa_gid(union ib_gid *gid)
>> +{
>> + return ((be64_to_cpu(gid->global.interface_id) >> 40) ==
>> + OPA_SPECIAL_OUI);
>> +}
>> #endif /* OPA_ADDR_H */
>>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [PATCH v2 0/8] RXE improvements
From: Moni Shoua @ 2016-11-23 17:59 UTC (permalink / raw)
To: Andrew Boyer; +Cc: Yonatan Cohen, linux-rdma
In-Reply-To: <1479922764-13091-1-git-send-email-andrew.boyer-8PEkshWhKlo@public.gmane.org>
> Andrew Boyer (8):
> IB/rxe: Remove buffer used for printing IP address
> IB/rxe: Advance the consumer pointer before posting the CQE
> IB/rxe: Don't update the response PSN unless it's going forwards
> IB/rxe: Unblock loopback by moving skb_out increment
> IB/rxe: Add support for zero-byte operations
> IB/rxe: Add support for IB_CQ_REPORT_MISSED_EVENTS
> IB/rxe: Fix ref leak in rxe_create_qp()
> IB/rxe: Fix ref leak in duplicate_request()
Thanks for the series.
Please see my comment in response to the first patch; apart from that,
Acked-by: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
for everything else
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [PATCH v2 1/8] IB/rxe: Remove buffer used for printing IP address
From: Moni Shoua @ 2016-11-23 17:48 UTC (permalink / raw)
To: Andrew Boyer; +Cc: Yonatan Cohen, linux-rdma
In-Reply-To: <1479922764-13091-2-git-send-email-andrew.boyer-8PEkshWhKlo@public.gmane.org>
> diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c
> index 46f0628..252b4d6 100644
> --- a/drivers/infiniband/sw/rxe/rxe_recv.c
> +++ b/drivers/infiniband/sw/rxe/rxe_recv.c
> @@ -391,16 +391,15 @@ int rxe_rcv(struct sk_buff *skb)
> payload_size(pkt));
> calc_icrc = cpu_to_be32(~calc_icrc);
> if (unlikely(calc_icrc != pack_icrc)) {
> - char saddr[sizeof(struct in6_addr)];
> -
> if (skb->protocol == htons(ETH_P_IPV6))
> - sprintf(saddr, "%pI6", &ipv6_hdr(skb)->saddr);
> + pr_warn_ratelimited("bad ICRC from %pI6c\n",
> + &ipv6_hdr(skb)->saddr);
> else if (skb->protocol == htons(ETH_P_IP))
> - sprintf(saddr, "%pI4", &ip_hdr(skb)->saddr);
> + pr_warn_ratelimited("bad ICRC from %pI4\n",
> + &ip_hdr(skb)->saddr);
> else
> - sprintf(saddr, "unknown");
> + pr_warn_ratelimited("bad ICRC from unknown\n");
>
> - pr_warn_ratelimited("bad ICRC from %s\n", saddr);
> goto drop;
> }
Have you considered Bart's suggestion to use %pIS instead of %pI4 and %pI6?
https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=1067964305df
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* [PATCH v2 8/8] IB/rxe: Fix ref leak in duplicate_request()
From: Andrew Boyer @ 2016-11-23 17:39 UTC (permalink / raw)
To: monis-VPRAkNaXOzVWk0Htik3J/w, yonatanc-VPRAkNaXOzVWk0Htik3J/w,
linux-rdma-u79uwXL29TY76Z2rM5mHXA
Cc: Andrew Boyer
In-Reply-To: <1479922764-13091-1-git-send-email-andrew.boyer-8PEkshWhKlo@public.gmane.org>
A ref was added after the call to skb_clone().
Signed-off-by: Andrew Boyer <andrew.boyer-8PEkshWhKlo@public.gmane.org>
---
drivers/infiniband/sw/rxe/rxe_resp.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
index a5e9ce3..8643797 100644
--- a/drivers/infiniband/sw/rxe/rxe_resp.c
+++ b/drivers/infiniband/sw/rxe/rxe_resp.c
@@ -1145,6 +1145,7 @@ static enum resp_states duplicate_request(struct rxe_qp *qp,
pkt, skb_copy);
if (rc) {
pr_err("Failed resending result. This flow is not handled - skb ignored\n");
+ rxe_drop_ref(qp);
kfree_skb(skb_copy);
rc = RESPST_CLEANUP;
goto out;
--
1.8.3.1
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH v2 7/8] IB/rxe: Fix ref leak in rxe_create_qp()
From: Andrew Boyer @ 2016-11-23 17:39 UTC (permalink / raw)
To: monis-VPRAkNaXOzVWk0Htik3J/w, yonatanc-VPRAkNaXOzVWk0Htik3J/w,
linux-rdma-u79uwXL29TY76Z2rM5mHXA
Cc: Andrew Boyer
In-Reply-To: <1479922764-13091-1-git-send-email-andrew.boyer-8PEkshWhKlo@public.gmane.org>
The udata->inlen error path needs to clean up the ref
added by rxe_alloc().
Signed-off-by: Andrew Boyer <andrew.boyer-8PEkshWhKlo@public.gmane.org>
---
drivers/infiniband/sw/rxe/rxe_verbs.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index de39b0a..071430c 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -564,7 +564,7 @@ static struct ib_qp *rxe_create_qp(struct ib_pd *ibpd,
if (udata) {
if (udata->inlen) {
err = -EINVAL;
- goto err1;
+ goto err2;
}
qp->is_user = 1;
}
@@ -573,12 +573,13 @@ static struct ib_qp *rxe_create_qp(struct ib_pd *ibpd,
err = rxe_qp_from_init(rxe, qp, pd, init, udata, ibpd);
if (err)
- goto err2;
+ goto err3;
return &qp->ibqp;
-err2:
+err3:
rxe_drop_index(qp);
+err2:
rxe_drop_ref(qp);
err1:
return ERR_PTR(err);
--
1.8.3.1
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH v2 6/8] IB/rxe: Add support for IB_CQ_REPORT_MISSED_EVENTS
From: Andrew Boyer @ 2016-11-23 17:39 UTC (permalink / raw)
To: monis-VPRAkNaXOzVWk0Htik3J/w, yonatanc-VPRAkNaXOzVWk0Htik3J/w,
linux-rdma-u79uwXL29TY76Z2rM5mHXA
Cc: Andrew Boyer
In-Reply-To: <1479922764-13091-1-git-send-email-andrew.boyer-8PEkshWhKlo@public.gmane.org>
Peek at the CQ after arming it so that we can return a hint.
This avoids missed completions due to a race between posting
CQEs and arming the CQ.
For example, CM teardown waits on MAD requests to complete with
ib_cq_poll_work(). Without this fix, the last completion might be
left on the CQ, hanging the kthread doing the teardown.
The console backtraces look like this:
[ 4199.911284] Call Trace:
[ 4199.911401] [<ffffffff9657fe95>] schedule+0x35/0x80
[ 4199.911556] [<ffffffff965830df>] schedule_timeout+0x22f/0x2c0
[ 4199.911727] [<ffffffff9657f7a8>] ? __schedule+0x368/0xa20
[ 4199.911891] [<ffffffff96580903>] wait_for_completion+0xb3/0x130
[ 4199.912067] [<ffffffff960a17e0>] ? wake_up_q+0x70/0x70
[ 4199.912243] [<ffffffffc074a06d>] cm_destroy_id+0x13d/0x450 [ib_cm]
[ 4199.912422] [<ffffffff961615d5>] ? printk+0x57/0x73
[ 4199.912578] [<ffffffffc074a390>] ib_destroy_cm_id+0x10/0x20 [ib_cm]
[ 4199.912759] [<ffffffffc076098c>] rdma_destroy_id+0xac/0x340 [rdma_cm]
[ 4199.912941] [<ffffffffc076f2cc>] 0xffffffffc076f2cc
Signed-off-by: Andrew Boyer <andrew.boyer-8PEkshWhKlo@public.gmane.org>
---
drivers/infiniband/sw/rxe/rxe_verbs.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 19841c8..de39b0a 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -1007,11 +1007,19 @@ static int rxe_peek_cq(struct ib_cq *ibcq, int wc_cnt)
static int rxe_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
{
struct rxe_cq *cq = to_rcq(ibcq);
+ unsigned long irq_flags;
+ int ret = 0;
+ spin_lock_irqsave(&cq->cq_lock, irq_flags);
if (cq->notify != IB_CQ_NEXT_COMP)
cq->notify = flags & IB_CQ_SOLICITED_MASK;
- return 0;
+ if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && !queue_empty(cq->queue))
+ ret = 1;
+
+ spin_unlock_irqrestore(&cq->cq_lock, irq_flags);
+
+ return ret;
}
static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access)
--
1.8.3.1
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH v2 5/8] IB/rxe: Add support for zero-byte operations
From: Andrew Boyer @ 2016-11-23 17:39 UTC (permalink / raw)
To: monis-VPRAkNaXOzVWk0Htik3J/w, yonatanc-VPRAkNaXOzVWk0Htik3J/w,
linux-rdma-u79uwXL29TY76Z2rM5mHXA
Cc: Andrew Boyer
In-Reply-To: <1479922764-13091-1-git-send-email-andrew.boyer-8PEkshWhKlo@public.gmane.org>
The last_psn algorithm fails in the zero-byte case: it calculates
first_psn = N, last_psn = N-1. This makes the operation unretryable since
the res structure will fail the (first_psn <= psn <= last_psn) test in
find_resource().
While here, use BTH_PSN_MASK to mask the calculated last_psn.
Signed-off-by: Andrew Boyer <andrew.boyer-8PEkshWhKlo@public.gmane.org>
Reviewed-by: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
---
drivers/infiniband/sw/rxe/rxe_mr.c | 3 +++
drivers/infiniband/sw/rxe/rxe_resp.c | 18 +++++++++++++++---
2 files changed, 18 insertions(+), 3 deletions(-)
diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index 1869152..d0faca2 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -355,6 +355,9 @@ int rxe_mem_copy(struct rxe_mem *mem, u64 iova, void *addr, int length,
size_t offset;
u32 crc = crcp ? (*crcp) : 0;
+ if (length == 0)
+ return 0;
+
if (mem->type == RXE_MEM_TYPE_DMA) {
u8 *src, *dest;
diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
index cb3fd4c..a5e9ce3 100644
--- a/drivers/infiniband/sw/rxe/rxe_resp.c
+++ b/drivers/infiniband/sw/rxe/rxe_resp.c
@@ -444,6 +444,13 @@ static enum resp_states check_rkey(struct rxe_qp *qp,
return RESPST_EXECUTE;
}
+ /* A zero-byte op is not required to set an addr or rkey. */
+ if ((pkt->mask & (RXE_READ_MASK | RXE_WRITE_OR_SEND)) &&
+ (pkt->mask & RXE_RETH_MASK) &&
+ reth_len(pkt) == 0) {
+ return RESPST_EXECUTE;
+ }
+
va = qp->resp.va;
rkey = qp->resp.rkey;
resid = qp->resp.resid;
@@ -680,9 +687,14 @@ static enum resp_states read_reply(struct rxe_qp *qp,
res->read.va_org = qp->resp.va;
res->first_psn = req_pkt->psn;
- res->last_psn = req_pkt->psn +
- (reth_len(req_pkt) + mtu - 1) /
- mtu - 1;
+
+ if (reth_len(req_pkt)) {
+ res->last_psn = (req_pkt->psn +
+ (reth_len(req_pkt) + mtu - 1) /
+ mtu - 1) & BTH_PSN_MASK;
+ } else {
+ res->last_psn = res->first_psn;
+ }
res->cur_psn = req_pkt->psn;
res->read.resid = qp->resp.resid;
--
1.8.3.1
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH v2 4/8] IB/rxe: Unblock loopback by moving skb_out increment
From: Andrew Boyer @ 2016-11-23 17:39 UTC (permalink / raw)
To: monis-VPRAkNaXOzVWk0Htik3J/w, yonatanc-VPRAkNaXOzVWk0Htik3J/w,
linux-rdma-u79uwXL29TY76Z2rM5mHXA
Cc: Andrew Boyer
In-Reply-To: <1479922764-13091-1-git-send-email-andrew.boyer-8PEkshWhKlo@public.gmane.org>
skb_out is decremented in rxe_skb_tx_dtor(), which is not called in the
loopback() path. Move the increment to the send() path rather than
rxe_xmit_packet().
Signed-off-by: Andrew Boyer <andrew.boyer-8PEkshWhKlo@public.gmane.org>
Acked-by: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
---
drivers/infiniband/sw/rxe/rxe_loc.h | 2 --
drivers/infiniband/sw/rxe/rxe_net.c | 2 ++
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
index 73849a5a..efe4c6a 100644
--- a/drivers/infiniband/sw/rxe/rxe_loc.h
+++ b/drivers/infiniband/sw/rxe/rxe_loc.h
@@ -266,8 +266,6 @@ static inline int rxe_xmit_packet(struct rxe_dev *rxe, struct rxe_qp *qp,
return err;
}
- atomic_inc(&qp->skb_out);
-
if ((qp_type(qp) != IB_QPT_RC) &&
(pkt->mask & RXE_END_MASK)) {
pkt->wqe->state = wqe_state_done;
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index ffff5a5..332ce52 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -455,6 +455,8 @@ static int send(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
return -EAGAIN;
}
+ if (pkt->qp)
+ atomic_inc(&pkt->qp->skb_out);
kfree_skb(skb);
return 0;
--
1.8.3.1
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox