* [PATCH for-next 2/6] IB/hns: Fix the bug when free mr
From: Salil Mehta @ 2016-11-29 23:10 UTC (permalink / raw)
To: dledford
Cc: salil.mehta, xavier.huwei, oulijun, xushaobo2, mehta.salil.lnk,
lijun_nudt, linux-rdma, netdev, linux-kernel, linuxarm
In-Reply-To: <20161129231030.1105600-1-salil.mehta@huawei.com>
From: Shaobo Xu <xushaobo2@huawei.com>
If the resources of mr are freed while executing the user case, hardware
can not been notified in hip06 SoC. Then hardware will hold on when it
reads the payload by the PA which has been released.
In order to slove this problem, RoCE driver creates 8 reserved loopback
QPs to ensure zero wqe when free mr. When the mac address is reset, in
order to avoid loopback failure, we need to release the reserved loopback
QPs and recreate them.
Signed-off-by: Shaobo Xu <xushaobo2@huawei.com>
Reviewed-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
---
drivers/infiniband/hw/hns/hns_roce_cmd.h | 5 -
drivers/infiniband/hw/hns/hns_roce_device.h | 10 +
drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 485 +++++++++++++++++++++++++++
drivers/infiniband/hw/hns/hns_roce_hw_v1.h | 34 ++
drivers/infiniband/hw/hns/hns_roce_main.c | 5 +-
drivers/infiniband/hw/hns/hns_roce_mr.c | 21 +-
6 files changed, 545 insertions(+), 15 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.h b/drivers/infiniband/hw/hns/hns_roce_cmd.h
index ed14ad3..f5a9ee2 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cmd.h
+++ b/drivers/infiniband/hw/hns/hns_roce_cmd.h
@@ -58,11 +58,6 @@ enum {
HNS_ROCE_CMD_QUERY_QP = 0x22,
};
-struct hns_roce_cmd_mailbox {
- void *buf;
- dma_addr_t dma;
-};
-
int hns_roce_cmd_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param,
unsigned long in_modifier, u8 op_modifier, u16 op,
unsigned long timeout);
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index e48464d..1050829 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -388,6 +388,11 @@ struct hns_roce_cmdq {
u8 toggle;
};
+struct hns_roce_cmd_mailbox {
+ void *buf;
+ dma_addr_t dma;
+};
+
struct hns_roce_dev;
struct hns_roce_qp {
@@ -522,6 +527,7 @@ struct hns_roce_hw {
struct ib_recv_wr **bad_recv_wr);
int (*req_notify_cq)(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
int (*poll_cq)(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
+ int (*dereg_mr)(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr);
void *priv;
};
@@ -688,6 +694,10 @@ struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt_addr, int access_flags,
struct ib_udata *udata);
int hns_roce_dereg_mr(struct ib_mr *ibmr);
+int hns_roce_hw2sw_mpt(struct hns_roce_dev *hr_dev,
+ struct hns_roce_cmd_mailbox *mailbox,
+ unsigned long mpt_index);
+unsigned long key_to_hw_index(u32 key);
void hns_roce_buf_free(struct hns_roce_dev *hr_dev, u32 size,
struct hns_roce_buf *buf);
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index aee1d01..f67a3bf 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -295,6 +295,8 @@ int hns_roce_v1_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
roce_set_field(sq_db.u32_4, SQ_DOORBELL_U32_4_SQ_HEAD_M,
SQ_DOORBELL_U32_4_SQ_HEAD_S,
(qp->sq.head & ((qp->sq.wqe_cnt << 1) - 1)));
+ roce_set_field(sq_db.u32_4, SQ_DOORBELL_U32_4_SL_M,
+ SQ_DOORBELL_U32_4_SL_S, qp->sl);
roce_set_field(sq_db.u32_4, SQ_DOORBELL_U32_4_PORT_M,
SQ_DOORBELL_U32_4_PORT_S, qp->phy_port);
roce_set_field(sq_db.u32_8, SQ_DOORBELL_U32_8_QPN_M,
@@ -622,6 +624,213 @@ static int hns_roce_db_ext_init(struct hns_roce_dev *hr_dev, u32 sdb_ext_mod,
return ret;
}
+static struct hns_roce_qp *hns_roce_v1_create_lp_qp(struct hns_roce_dev *hr_dev,
+ struct ib_pd *pd)
+{
+ struct device *dev = &hr_dev->pdev->dev;
+ struct ib_qp_init_attr init_attr;
+ struct ib_qp *qp;
+
+ memset(&init_attr, 0, sizeof(struct ib_qp_init_attr));
+ init_attr.qp_type = IB_QPT_RC;
+ init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+ init_attr.cap.max_recv_wr = HNS_ROCE_MIN_WQE_NUM;
+ init_attr.cap.max_send_wr = HNS_ROCE_MIN_WQE_NUM;
+
+ qp = hns_roce_create_qp(pd, &init_attr, NULL);
+ if (IS_ERR(qp)) {
+ dev_err(dev, "Create loop qp for mr free failed!");
+ return NULL;
+ }
+
+ return to_hr_qp(qp);
+}
+
+static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev)
+{
+ struct hns_roce_caps *caps = &hr_dev->caps;
+ struct device *dev = &hr_dev->pdev->dev;
+ struct ib_cq_init_attr cq_init_attr;
+ struct hns_roce_free_mr *free_mr;
+ struct ib_qp_attr attr = { 0 };
+ struct hns_roce_v1_priv *priv;
+ struct hns_roce_qp *hr_qp;
+ struct ib_cq *cq;
+ struct ib_pd *pd;
+ u64 subnet_prefix;
+ int attr_mask = 0;
+ int i;
+ int ret;
+ u8 phy_port;
+ u8 sl;
+
+ priv = (struct hns_roce_v1_priv *)hr_dev->hw->priv;
+ free_mr = &priv->free_mr;
+
+ /* Reserved cq for loop qp */
+ cq_init_attr.cqe = HNS_ROCE_MIN_WQE_NUM * 2;
+ cq_init_attr.comp_vector = 0;
+ cq = hns_roce_ib_create_cq(&hr_dev->ib_dev, &cq_init_attr, NULL, NULL);
+ if (IS_ERR(cq)) {
+ dev_err(dev, "Create cq for reseved loop qp failed!");
+ return -ENOMEM;
+ }
+ free_mr->mr_free_cq = to_hr_cq(cq);
+ free_mr->mr_free_cq->ib_cq.device = &hr_dev->ib_dev;
+ free_mr->mr_free_cq->ib_cq.uobject = NULL;
+ free_mr->mr_free_cq->ib_cq.comp_handler = NULL;
+ free_mr->mr_free_cq->ib_cq.event_handler = NULL;
+ free_mr->mr_free_cq->ib_cq.cq_context = NULL;
+ atomic_set(&free_mr->mr_free_cq->ib_cq.usecnt, 0);
+
+ pd = hns_roce_alloc_pd(&hr_dev->ib_dev, NULL, NULL);
+ if (IS_ERR(pd)) {
+ dev_err(dev, "Create pd for reseved loop qp failed!");
+ ret = -ENOMEM;
+ goto alloc_pd_failed;
+ }
+ free_mr->mr_free_pd = to_hr_pd(pd);
+ free_mr->mr_free_pd->ibpd.device = &hr_dev->ib_dev;
+ free_mr->mr_free_pd->ibpd.uobject = NULL;
+ atomic_set(&free_mr->mr_free_pd->ibpd.usecnt, 0);
+
+ attr.qp_access_flags = IB_ACCESS_REMOTE_WRITE;
+ attr.pkey_index = 0;
+ attr.min_rnr_timer = 0;
+ /* Disable read ability */
+ attr.max_dest_rd_atomic = 0;
+ attr.max_rd_atomic = 0;
+ /* Use arbitrary values as rq_psn and sq_psn */
+ attr.rq_psn = 0x0808;
+ attr.sq_psn = 0x0808;
+ attr.retry_cnt = 7;
+ attr.rnr_retry = 7;
+ attr.timeout = 0x12;
+ attr.path_mtu = IB_MTU_256;
+ attr.ah_attr.ah_flags = 1;
+ attr.ah_attr.static_rate = 3;
+ attr.ah_attr.grh.sgid_index = 0;
+ attr.ah_attr.grh.hop_limit = 1;
+ attr.ah_attr.grh.flow_label = 0;
+ attr.ah_attr.grh.traffic_class = 0;
+
+ subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
+ for (i = 0; i < HNS_ROCE_V1_RESV_QP; i++) {
+ free_mr->mr_free_qp[i] = hns_roce_v1_create_lp_qp(hr_dev, pd);
+ if (IS_ERR(free_mr->mr_free_qp[i])) {
+ dev_err(dev, "Create loop qp failed!\n");
+ goto create_lp_qp_failed;
+ }
+ hr_qp = free_mr->mr_free_qp[i];
+
+ sl = i / caps->num_ports;
+
+ if (caps->num_ports == HNS_ROCE_MAX_PORTS)
+ phy_port = (i >= HNS_ROCE_MAX_PORTS) ? (i - 2) :
+ (i % caps->num_ports);
+ else
+ phy_port = i % caps->num_ports;
+
+ hr_qp->port = phy_port + 1;
+ hr_qp->phy_port = phy_port;
+ hr_qp->ibqp.qp_type = IB_QPT_RC;
+ hr_qp->ibqp.device = &hr_dev->ib_dev;
+ hr_qp->ibqp.uobject = NULL;
+ atomic_set(&hr_qp->ibqp.usecnt, 0);
+ hr_qp->ibqp.pd = pd;
+ hr_qp->ibqp.recv_cq = cq;
+ hr_qp->ibqp.send_cq = cq;
+
+ attr.ah_attr.port_num = phy_port + 1;
+ attr.ah_attr.sl = sl;
+ attr.port_num = phy_port + 1;
+
+ attr.dest_qp_num = hr_qp->qpn;
+ memcpy(attr.ah_attr.dmac, hr_dev->dev_addr[phy_port],
+ MAC_ADDR_OCTET_NUM);
+
+ memcpy(attr.ah_attr.grh.dgid.raw,
+ &subnet_prefix, sizeof(u64));
+ memcpy(&attr.ah_attr.grh.dgid.raw[8],
+ hr_dev->dev_addr[phy_port], 3);
+ memcpy(&attr.ah_attr.grh.dgid.raw[13],
+ hr_dev->dev_addr[phy_port] + 3, 3);
+ attr.ah_attr.grh.dgid.raw[11] = 0xff;
+ attr.ah_attr.grh.dgid.raw[12] = 0xfe;
+ attr.ah_attr.grh.dgid.raw[8] ^= 2;
+
+ attr_mask |= IB_QP_PORT;
+
+ ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, &attr, attr_mask,
+ IB_QPS_RESET, IB_QPS_INIT);
+ if (ret) {
+ dev_err(dev, "modify qp failed(%d)!\n", ret);
+ goto create_lp_qp_failed;
+ }
+
+ ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, &attr, attr_mask,
+ IB_QPS_INIT, IB_QPS_RTR);
+ if (ret) {
+ dev_err(dev, "modify qp failed(%d)!\n", ret);
+ goto create_lp_qp_failed;
+ }
+
+ ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, &attr, attr_mask,
+ IB_QPS_RTR, IB_QPS_RTS);
+ if (ret) {
+ dev_err(dev, "modify qp failed(%d)!\n", ret);
+ goto create_lp_qp_failed;
+ }
+ }
+
+ return 0;
+
+create_lp_qp_failed:
+ for (i -= 1; i >= 0; i--) {
+ hr_qp = free_mr->mr_free_qp[i];
+ if (hns_roce_v1_destroy_qp(&hr_qp->ibqp))
+ dev_err(dev, "Destroy qp %d for mr free failed!\n", i);
+ }
+
+ if (hns_roce_dealloc_pd(pd))
+ dev_err(dev, "Destroy pd for create_lp_qp failed!\n");
+
+alloc_pd_failed:
+ if (hns_roce_ib_destroy_cq(cq))
+ dev_err(dev, "Destroy cq for create_lp_qp failed!\n");
+
+ return -EINVAL;
+}
+
+static void hns_roce_v1_release_lp_qp(struct hns_roce_dev *hr_dev)
+{
+ struct device *dev = &hr_dev->pdev->dev;
+ struct hns_roce_free_mr *free_mr;
+ struct hns_roce_v1_priv *priv;
+ struct hns_roce_qp *hr_qp;
+ int ret;
+ int i;
+
+ priv = (struct hns_roce_v1_priv *)hr_dev->hw->priv;
+ free_mr = &priv->free_mr;
+
+ for (i = 0; i < HNS_ROCE_V1_RESV_QP; i++) {
+ hr_qp = free_mr->mr_free_qp[i];
+ ret = hns_roce_v1_destroy_qp(&hr_qp->ibqp);
+ if (ret)
+ dev_err(dev, "Destroy qp %d for mr free failed(%d)!\n",
+ i, ret);
+ }
+
+ ret = hns_roce_ib_destroy_cq(&free_mr->mr_free_cq->ib_cq);
+ if (ret)
+ dev_err(dev, "Destroy cq for mr_free failed(%d)!\n", ret);
+
+ ret = hns_roce_dealloc_pd(&free_mr->mr_free_pd->ibpd);
+ if (ret)
+ dev_err(dev, "Destroy pd for mr_free failed(%d)!\n", ret);
+}
+
static int hns_roce_db_init(struct hns_roce_dev *hr_dev)
{
struct device *dev = &hr_dev->pdev->dev;
@@ -659,6 +868,223 @@ static int hns_roce_db_init(struct hns_roce_dev *hr_dev)
return 0;
}
+void hns_roce_v1_recreate_lp_qp_work_fn(struct work_struct *work)
+{
+ struct hns_roce_recreate_lp_qp_work *lp_qp_work;
+ struct hns_roce_dev *hr_dev;
+
+ lp_qp_work = container_of(work, struct hns_roce_recreate_lp_qp_work,
+ work);
+ hr_dev = to_hr_dev(lp_qp_work->ib_dev);
+
+ hns_roce_v1_release_lp_qp(hr_dev);
+
+ if (hns_roce_v1_rsv_lp_qp(hr_dev))
+ dev_err(&hr_dev->pdev->dev, "create reserver qp failed\n");
+
+ if (lp_qp_work->comp_flag)
+ complete(lp_qp_work->comp);
+
+ kfree(lp_qp_work);
+}
+
+static int hns_roce_v1_recreate_lp_qp(struct hns_roce_dev *hr_dev)
+{
+ struct device *dev = &hr_dev->pdev->dev;
+ struct hns_roce_recreate_lp_qp_work *lp_qp_work;
+ struct hns_roce_free_mr *free_mr;
+ struct hns_roce_v1_priv *priv;
+ struct completion comp;
+ unsigned long end =
+ msecs_to_jiffies(HNS_ROCE_V1_RECREATE_LP_QP_TIMEOUT_MSECS) + jiffies;
+
+ priv = (struct hns_roce_v1_priv *)hr_dev->hw->priv;
+ free_mr = &priv->free_mr;
+
+ lp_qp_work = kzalloc(sizeof(struct hns_roce_recreate_lp_qp_work),
+ GFP_KERNEL);
+
+ INIT_WORK(&(lp_qp_work->work), hns_roce_v1_recreate_lp_qp_work_fn);
+
+ lp_qp_work->ib_dev = &(hr_dev->ib_dev);
+ lp_qp_work->comp = ∁
+ lp_qp_work->comp_flag = 1;
+
+ init_completion(lp_qp_work->comp);
+
+ queue_work(free_mr->free_mr_wq, &(lp_qp_work->work));
+
+ while (time_before_eq(jiffies, end)) {
+ if (try_wait_for_completion(&comp))
+ return 0;
+ msleep(HNS_ROCE_V1_RECREATE_LP_QP_WAIT_VALUE);
+ }
+
+ lp_qp_work->comp_flag = 0;
+ if (try_wait_for_completion(&comp))
+ return 0;
+
+ dev_warn(dev, "recreate lp qp failed 20s timeout and return failed!\n");
+ return -ETIMEDOUT;
+}
+
+static int hns_roce_v1_send_lp_wqe(struct hns_roce_qp *hr_qp)
+{
+ struct hns_roce_dev *hr_dev = to_hr_dev(hr_qp->ibqp.device);
+ struct device *dev = &hr_dev->pdev->dev;
+ struct ib_send_wr send_wr, *bad_wr;
+ int ret;
+
+ memset(&send_wr, 0, sizeof(send_wr));
+ send_wr.next = NULL;
+ send_wr.num_sge = 0;
+ send_wr.send_flags = 0;
+ send_wr.sg_list = NULL;
+ send_wr.wr_id = (unsigned long long)&send_wr;
+ send_wr.opcode = IB_WR_RDMA_WRITE;
+
+ ret = hns_roce_v1_post_send(&hr_qp->ibqp, &send_wr, &bad_wr);
+ if (ret) {
+ dev_err(dev, "Post write wqe for mr free failed(%d)!", ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void hns_roce_v1_mr_free_work_fn(struct work_struct *work)
+{
+ struct hns_roce_mr_free_work *mr_work;
+ struct ib_wc wc[HNS_ROCE_V1_RESV_QP];
+ struct hns_roce_free_mr *free_mr;
+ struct hns_roce_cq *mr_free_cq;
+ struct hns_roce_v1_priv *priv;
+ struct hns_roce_dev *hr_dev;
+ struct hns_roce_mr *hr_mr;
+ struct hns_roce_qp *hr_qp;
+ struct device *dev;
+ unsigned long end =
+ msecs_to_jiffies(HNS_ROCE_V1_FREE_MR_TIMEOUT_MSECS) + jiffies;
+ int i;
+ int ret;
+ int ne;
+
+ mr_work = container_of(work, struct hns_roce_mr_free_work, work);
+ hr_mr = (struct hns_roce_mr *)mr_work->mr;
+ hr_dev = to_hr_dev(mr_work->ib_dev);
+ dev = &hr_dev->pdev->dev;
+
+ priv = (struct hns_roce_v1_priv *)hr_dev->hw->priv;
+ free_mr = &priv->free_mr;
+ mr_free_cq = free_mr->mr_free_cq;
+
+ for (i = 0; i < HNS_ROCE_V1_RESV_QP; i++) {
+ hr_qp = free_mr->mr_free_qp[i];
+ ret = hns_roce_v1_send_lp_wqe(hr_qp);
+ if (ret) {
+ dev_err(dev,
+ "Send wqe (qp:0x%lx) for mr free failed(%d)!\n",
+ hr_qp->qpn, ret);
+ goto free_work;
+ }
+ }
+
+ ne = HNS_ROCE_V1_RESV_QP;
+ do {
+ ret = hns_roce_v1_poll_cq(&mr_free_cq->ib_cq, ne, wc);
+ if (ret < 0) {
+ dev_err(dev,
+ "(qp:0x%lx) starts, Poll cqe failed(%d) for mr 0x%x free! Remain %d cqe\n",
+ hr_qp->qpn, ret, hr_mr->key, ne);
+ goto free_work;
+ }
+ ne -= ret;
+ msleep(HNS_ROCE_V1_FREE_MR_WAIT_VALUE);
+ } while (ne && time_before_eq(jiffies, end));
+
+ if (ne != 0)
+ dev_err(dev,
+ "Poll cqe for mr 0x%x free timeout! Remain %d cqe\n",
+ hr_mr->key, ne);
+
+free_work:
+ if (mr_work->comp_flag)
+ complete(mr_work->comp);
+ kfree(mr_work);
+}
+
+int hns_roce_v1_dereg_mr(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr)
+{
+ struct device *dev = &hr_dev->pdev->dev;
+ struct hns_roce_mr_free_work *mr_work;
+ struct hns_roce_free_mr *free_mr;
+ struct hns_roce_v1_priv *priv;
+ struct completion comp;
+ unsigned long end =
+ msecs_to_jiffies(HNS_ROCE_V1_FREE_MR_TIMEOUT_MSECS) + jiffies;
+ unsigned long start = jiffies;
+ int npages;
+ int ret = 0;
+
+ priv = (struct hns_roce_v1_priv *)hr_dev->hw->priv;
+ free_mr = &priv->free_mr;
+
+ if (mr->enabled) {
+ if (hns_roce_hw2sw_mpt(hr_dev, NULL, key_to_hw_index(mr->key)
+ & (hr_dev->caps.num_mtpts - 1)))
+ dev_warn(dev, "HW2SW_MPT failed!\n");
+ }
+
+ mr_work = kzalloc(sizeof(*mr_work), GFP_KERNEL);
+ if (!mr_work) {
+ ret = -ENOMEM;
+ goto free_mr;
+ }
+
+ INIT_WORK(&(mr_work->work), hns_roce_v1_mr_free_work_fn);
+
+ mr_work->ib_dev = &(hr_dev->ib_dev);
+ mr_work->comp = ∁
+ mr_work->comp_flag = 1;
+ mr_work->mr = (void *)mr;
+ init_completion(mr_work->comp);
+
+ queue_work(free_mr->free_mr_wq, &(mr_work->work));
+
+ while (time_before_eq(jiffies, end)) {
+ if (try_wait_for_completion(&comp))
+ goto free_mr;
+ msleep(HNS_ROCE_V1_FREE_MR_WAIT_VALUE);
+ }
+
+ mr_work->comp_flag = 0;
+ if (try_wait_for_completion(&comp))
+ goto free_mr;
+
+ dev_warn(dev, "Free mr work 0x%x over 50s and failed!\n", mr->key);
+ ret = -ETIMEDOUT;
+
+free_mr:
+ dev_dbg(dev, "Free mr 0x%x use 0x%x us.\n",
+ mr->key, jiffies_to_usecs(jiffies) - jiffies_to_usecs(start));
+
+ if (mr->size != ~0ULL) {
+ npages = ib_umem_page_count(mr->umem);
+ dma_free_coherent(dev, npages * 8, mr->pbl_buf,
+ mr->pbl_dma_addr);
+ }
+
+ hns_roce_bitmap_free(&hr_dev->mr_table.mtpt_bitmap,
+ key_to_hw_index(mr->key), 0);
+
+ if (mr->umem)
+ ib_umem_release(mr->umem);
+
+ kfree(mr);
+
+ return ret;
+}
+
static void hns_roce_db_free(struct hns_roce_dev *hr_dev)
{
struct device *dev = &hr_dev->pdev->dev;
@@ -899,6 +1325,46 @@ static void hns_roce_tptr_free(struct hns_roce_dev *hr_dev)
tptr_buf->buf, tptr_buf->map);
}
+static int hns_roce_free_mr_init(struct hns_roce_dev *hr_dev)
+{
+ struct device *dev = &hr_dev->pdev->dev;
+ struct hns_roce_free_mr *free_mr;
+ struct hns_roce_v1_priv *priv;
+ int ret = 0;
+
+ priv = (struct hns_roce_v1_priv *)hr_dev->hw->priv;
+ free_mr = &priv->free_mr;
+
+ free_mr->free_mr_wq = create_singlethread_workqueue("hns_roce_free_mr");
+ if (!free_mr->free_mr_wq) {
+ dev_err(dev, "Create free mr workqueue failed!\n");
+ return -ENOMEM;
+ }
+
+ ret = hns_roce_v1_rsv_lp_qp(hr_dev);
+ if (ret) {
+ dev_err(dev, "Reserved loop qp failed(%d)!\n", ret);
+ flush_workqueue(free_mr->free_mr_wq);
+ destroy_workqueue(free_mr->free_mr_wq);
+ }
+
+ return ret;
+}
+
+static void hns_roce_free_mr_free(struct hns_roce_dev *hr_dev)
+{
+ struct hns_roce_free_mr *free_mr;
+ struct hns_roce_v1_priv *priv;
+
+ priv = (struct hns_roce_v1_priv *)hr_dev->hw->priv;
+ free_mr = &priv->free_mr;
+
+ flush_workqueue(free_mr->free_mr_wq);
+ destroy_workqueue(free_mr->free_mr_wq);
+
+ hns_roce_v1_release_lp_qp(hr_dev);
+}
+
/**
* hns_roce_v1_reset - reset RoCE
* @hr_dev: RoCE device struct pointer
@@ -1100,10 +1566,19 @@ int hns_roce_v1_init(struct hns_roce_dev *hr_dev)
goto error_failed_des_qp_init;
}
+ ret = hns_roce_free_mr_init(hr_dev);
+ if (ret) {
+ dev_err(dev, "free mr init failed!\n");
+ goto error_failed_free_mr_init;
+ }
+
hns_roce_port_enable(hr_dev, HNS_ROCE_PORT_UP);
return 0;
+error_failed_free_mr_init:
+ hns_roce_des_qp_free(hr_dev);
+
error_failed_des_qp_init:
hns_roce_tptr_free(hr_dev);
@@ -1121,6 +1596,7 @@ int hns_roce_v1_init(struct hns_roce_dev *hr_dev)
void hns_roce_v1_exit(struct hns_roce_dev *hr_dev)
{
hns_roce_port_enable(hr_dev, HNS_ROCE_PORT_DOWN);
+ hns_roce_free_mr_free(hr_dev);
hns_roce_des_qp_free(hr_dev);
hns_roce_tptr_free(hr_dev);
hns_roce_bt_free(hr_dev);
@@ -1161,6 +1637,14 @@ void hns_roce_v1_set_mac(struct hns_roce_dev *hr_dev, u8 phy_port, u8 *addr)
u32 *p;
u32 val;
+ /*
+ * When mac changed, loopback may fail
+ * because of smac not equal to dmac.
+ * We Need to release and create reserved qp again.
+ */
+ if (hr_dev->hw->dereg_mr && hns_roce_v1_recreate_lp_qp(hr_dev))
+ dev_warn(&hr_dev->pdev->dev, "recreate lp qp timeout!\n");
+
p = (u32 *)(&addr[0]);
reg_smac_l = *p;
roce_raw_write(reg_smac_l, hr_dev->reg_base + ROCEE_SMAC_L_0_REG +
@@ -3299,5 +3783,6 @@ struct hns_roce_hw hns_roce_hw_v1 = {
.post_recv = hns_roce_v1_post_recv,
.req_notify_cq = hns_roce_v1_req_notify_cq,
.poll_cq = hns_roce_v1_poll_cq,
+ .dereg_mr = hns_roce_v1_dereg_mr,
.priv = &hr_v1_priv,
};
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.h b/drivers/infiniband/hw/hns/hns_roce_hw_v1.h
index 1d250c0..b213b5e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.h
@@ -58,6 +58,7 @@
#define HNS_ROCE_V1_PHY_UAR_NUM 8
#define HNS_ROCE_V1_GID_NUM 16
+#define HNS_ROCE_V1_RESV_QP 8
#define HNS_ROCE_V1_NUM_COMP_EQE 0x8000
#define HNS_ROCE_V1_NUM_ASYNC_EQE 0x400
@@ -107,6 +108,10 @@
#define HNS_ROCE_V1_DB_STAGE2 2
#define HNS_ROCE_V1_CHECK_DB_TIMEOUT_MSECS 10000
#define HNS_ROCE_V1_CHECK_DB_SLEEP_MSECS 20
+#define HNS_ROCE_V1_FREE_MR_TIMEOUT_MSECS 50000
+#define HNS_ROCE_V1_RECREATE_LP_QP_TIMEOUT_MSECS 10000
+#define HNS_ROCE_V1_FREE_MR_WAIT_VALUE 5
+#define HNS_ROCE_V1_RECREATE_LP_QP_WAIT_VALUE 20
#define HNS_ROCE_BT_RSV_BUF_SIZE (1 << 17)
@@ -969,6 +974,10 @@ struct hns_roce_sq_db {
#define SQ_DOORBELL_U32_4_SQ_HEAD_M \
(((1UL << 15) - 1) << SQ_DOORBELL_U32_4_SQ_HEAD_S)
+#define SQ_DOORBELL_U32_4_SL_S 16
+#define SQ_DOORBELL_U32_4_SL_M \
+ (((1UL << 2) - 1) << SQ_DOORBELL_U32_4_SL_S)
+
#define SQ_DOORBELL_U32_4_PORT_S 18
#define SQ_DOORBELL_U32_4_PORT_M (((1UL << 3) - 1) << SQ_DOORBELL_U32_4_PORT_S)
@@ -1015,14 +1024,39 @@ struct hns_roce_des_qp {
int requeue_flag;
};
+struct hns_roce_mr_free_work {
+ struct work_struct work;
+ struct ib_device *ib_dev;
+ struct completion *comp;
+ int comp_flag;
+ void *mr;
+};
+
+struct hns_roce_recreate_lp_qp_work {
+ struct work_struct work;
+ struct ib_device *ib_dev;
+ struct completion *comp;
+ int comp_flag;
+};
+
+struct hns_roce_free_mr {
+ struct workqueue_struct *free_mr_wq;
+ struct hns_roce_qp *mr_free_qp[HNS_ROCE_V1_RESV_QP];
+ struct hns_roce_cq *mr_free_cq;
+ struct hns_roce_pd *mr_free_pd;
+};
+
struct hns_roce_v1_priv {
struct hns_roce_db_table db_table;
struct hns_roce_raq_table raq_table;
struct hns_roce_bt_table bt_table;
struct hns_roce_tptr_table tptr_table;
struct hns_roce_des_qp des_qp;
+ struct hns_roce_free_mr free_mr;
};
int hns_dsaf_roce_reset(struct fwnode_handle *dsaf_fwnode, bool dereset);
+int hns_roce_v1_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
+int hns_roce_v1_destroy_qp(struct ib_qp *ibqp);
#endif
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index 914d0ac..0cedec0 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -129,7 +129,6 @@ static int handle_en_event(struct hns_roce_dev *hr_dev, u8 port,
{
struct device *dev = &hr_dev->pdev->dev;
struct net_device *netdev;
- unsigned long flags;
netdev = hr_dev->iboe.netdevs[port];
if (!netdev) {
@@ -137,7 +136,7 @@ static int handle_en_event(struct hns_roce_dev *hr_dev, u8 port,
return -ENODEV;
}
- spin_lock_irqsave(&hr_dev->iboe.lock, flags);
+ spin_lock_bh(&hr_dev->iboe.lock);
switch (event) {
case NETDEV_UP:
@@ -156,7 +155,7 @@ static int handle_en_event(struct hns_roce_dev *hr_dev, u8 port,
break;
}
- spin_unlock_irqrestore(&hr_dev->iboe.lock, flags);
+ spin_unlock_bh(&hr_dev->iboe.lock);
return 0;
}
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index 9b8a1ad..4139abe 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -42,7 +42,7 @@ static u32 hw_index_to_key(unsigned long ind)
return (u32)(ind >> 24) | (ind << 8);
}
-static unsigned long key_to_hw_index(u32 key)
+unsigned long key_to_hw_index(u32 key)
{
return (key << 24) | (key >> 8);
}
@@ -56,7 +56,7 @@ static int hns_roce_sw2hw_mpt(struct hns_roce_dev *hr_dev,
HNS_ROCE_CMD_TIMEOUT_MSECS);
}
-static int hns_roce_hw2sw_mpt(struct hns_roce_dev *hr_dev,
+int hns_roce_hw2sw_mpt(struct hns_roce_dev *hr_dev,
struct hns_roce_cmd_mailbox *mailbox,
unsigned long mpt_index)
{
@@ -607,13 +607,20 @@ struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
int hns_roce_dereg_mr(struct ib_mr *ibmr)
{
+ struct hns_roce_dev *hr_dev = to_hr_dev(ibmr->device);
struct hns_roce_mr *mr = to_hr_mr(ibmr);
+ int ret = 0;
- hns_roce_mr_free(to_hr_dev(ibmr->device), mr);
- if (mr->umem)
- ib_umem_release(mr->umem);
+ if (hr_dev->hw->dereg_mr) {
+ ret = hr_dev->hw->dereg_mr(hr_dev, mr);
+ } else {
+ hns_roce_mr_free(hr_dev, mr);
- kfree(mr);
+ if (mr->umem)
+ ib_umem_release(mr->umem);
- return 0;
+ kfree(mr);
+ }
+
+ return ret;
}
--
1.7.9.5
^ permalink raw reply related
* [PATCH for-next 1/6] IB/hns: Fix the bug when destroy qp
From: Salil Mehta @ 2016-11-29 23:10 UTC (permalink / raw)
To: dledford-H+wXaHxf7aLQT0dZR+AlfA
Cc: salil.mehta-hv44wF8Li93QT0dZR+AlfA,
xavier.huwei-hv44wF8Li93QT0dZR+AlfA,
oulijun-hv44wF8Li93QT0dZR+AlfA, xushaobo2-hv44wF8Li93QT0dZR+AlfA,
mehta.salil.lnk-Re5JQEeQqe8AvxtiuMwx3w, lijun_nudt-9Onoh4P/yGk,
linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
linuxarm-hv44wF8Li93QT0dZR+AlfA, Dongdong Huang
In-Reply-To: <20161129231030.1105600-1-salil.mehta-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
From: "Wei Hu (Xavier)" <xavier.huwei-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
If send queue is still working when qp is in reset state by modify qp
in destroy qp function, hardware will hold on and don't work in hip06
SoC. In current codes, RoCE driver check hardware pointer of sending and
hardware pointer of processing to ensure that hardware has processed all
the dbs of this qp. But while the environment of wire becomes not good,
The checking time maybe too long.
In order to solve this problem, RoCE driver created a workqueue at probe
function. If there is a timeout when checking the status of qp, driver
initialize work entry and push it into the workqueue, Work function will
finish checking and release the related resources later.
Signed-off-by: Wei Hu (Xavier) <xavier.huwei-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
Signed-off-by: Lijun Ou <oulijun-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
Signed-off-by: Dongdong Huang(Donald) <hdd.huang-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
Signed-off-by: Salil Mehta <salil.mehta-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
---
drivers/infiniband/hw/hns/hns_roce_common.h | 40 +++
drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 435 +++++++++++++++++++++------
drivers/infiniband/hw/hns/hns_roce_hw_v1.h | 23 ++
3 files changed, 402 insertions(+), 96 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_common.h b/drivers/infiniband/hw/hns/hns_roce_common.h
index 0dcb620..a055632 100644
--- a/drivers/infiniband/hw/hns/hns_roce_common.h
+++ b/drivers/infiniband/hw/hns/hns_roce_common.h
@@ -57,6 +57,32 @@
#define roce_set_bit(origin, shift, val) \
roce_set_field((origin), (1ul << (shift)), (shift), (val))
+/*
+ * roce_hw_index_cmp_lt - Compare two hardware index values in hisilicon
+ * SOC, check if a is less than b.
+ * @a: hardware index value
+ * @b: hardware index value
+ * @bits: the number of bits of a and b, range: 0~31.
+ *
+ * Hardware index increases continuously till max value, and then restart
+ * from zero, again and again. Because the bits of reg field is often
+ * limited, the reg field can only hold the low bits of the hardware index
+ * in hisilicon SOC.
+ * In some scenes we need to compare two values(a,b) getted from two reg
+ * fields in this driver, for example:
+ * If a equals 0xfffe, b equals 0x1 and bits equals 16, we think b has
+ * incresed from 0xffff to 0x1 and a is less than b.
+ * If a equals 0xfffe, b equals 0x0xf001 and bits equals 16, we think a
+ * is bigger than b.
+ *
+ * Return true on a less than b, otherwise false.
+ */
+#define roce_hw_index_mask(bits) ((1ul << (bits)) - 1)
+#define roce_hw_index_shift(bits) (32 - (bits))
+#define roce_hw_index_cmp_lt(a, b, bits) \
+ ((int)((((a) - (b)) & roce_hw_index_mask(bits)) << \
+ roce_hw_index_shift(bits)) < 0)
+
#define ROCEE_GLB_CFG_ROCEE_DB_SQ_MODE_S 3
#define ROCEE_GLB_CFG_ROCEE_DB_OTH_MODE_S 4
@@ -245,10 +271,22 @@
#define ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M \
(((1UL << 28) - 1) << ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S)
+#define ROCEE_SDB_PTR_CMP_BITS 28
+
#define ROCEE_SDB_INV_CNT_SDB_INV_CNT_S 0
#define ROCEE_SDB_INV_CNT_SDB_INV_CNT_M \
(((1UL << 16) - 1) << ROCEE_SDB_INV_CNT_SDB_INV_CNT_S)
+#define ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_S 0
+#define ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_M \
+ (((1UL << 16) - 1) << ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_S)
+
+#define ROCEE_SDB_CNT_CMP_BITS 16
+
+#define ROCEE_TSP_BP_ST_QH_FIFO_ENTRY_S 20
+
+#define ROCEE_CNT_CLR_CE_CNT_CLR_CE_S 0
+
/*************ROCEE_REG DEFINITION****************/
#define ROCEE_VENDOR_ID_REG 0x0
#define ROCEE_VENDOR_PART_ID_REG 0x4
@@ -317,6 +355,8 @@
#define ROCEE_SDB_ISSUE_PTR_REG 0x758
#define ROCEE_SDB_SEND_PTR_REG 0x75C
#define ROCEE_SDB_INV_CNT_REG 0x9A4
+#define ROCEE_SDB_RETRY_CNT_REG 0x9AC
+#define ROCEE_TSP_BP_ST_REG 0x9EC
#define ROCEE_ECC_UCERR_ALM0_REG 0xB34
#define ROCEE_ECC_CERR_ALM0_REG 0xB40
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index 125ab90..aee1d01 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -948,6 +948,38 @@ int hns_roce_v1_reset(struct hns_roce_dev *hr_dev, bool dereset)
return ret;
}
+static int hns_roce_des_qp_init(struct hns_roce_dev *hr_dev)
+{
+ struct device *dev = &hr_dev->pdev->dev;
+ struct hns_roce_v1_priv *priv;
+ struct hns_roce_des_qp *des_qp;
+
+ priv = (struct hns_roce_v1_priv *)hr_dev->hw->priv;
+ des_qp = &priv->des_qp;
+
+ des_qp->requeue_flag = 1;
+ des_qp->qp_wq = create_singlethread_workqueue("hns_roce_destroy_qp");
+ if (!des_qp->qp_wq) {
+ dev_err(dev, "Create destroy qp workqueue failed!\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void hns_roce_des_qp_free(struct hns_roce_dev *hr_dev)
+{
+ struct hns_roce_v1_priv *priv;
+ struct hns_roce_des_qp *des_qp;
+
+ priv = (struct hns_roce_v1_priv *)hr_dev->hw->priv;
+ des_qp = &priv->des_qp;
+
+ des_qp->requeue_flag = 0;
+ flush_workqueue(des_qp->qp_wq);
+ destroy_workqueue(des_qp->qp_wq);
+}
+
void hns_roce_v1_profile(struct hns_roce_dev *hr_dev)
{
int i = 0;
@@ -1050,8 +1082,6 @@ int hns_roce_v1_init(struct hns_roce_dev *hr_dev)
goto error_failed_raq_init;
}
- hns_roce_port_enable(hr_dev, HNS_ROCE_PORT_UP);
-
ret = hns_roce_bt_init(hr_dev);
if (ret) {
dev_err(dev, "bt init failed!\n");
@@ -1064,13 +1094,23 @@ int hns_roce_v1_init(struct hns_roce_dev *hr_dev)
goto error_failed_tptr_init;
}
+ ret = hns_roce_des_qp_init(hr_dev);
+ if (ret) {
+ dev_err(dev, "des qp init failed!\n");
+ goto error_failed_des_qp_init;
+ }
+
+ hns_roce_port_enable(hr_dev, HNS_ROCE_PORT_UP);
+
return 0;
+error_failed_des_qp_init:
+ hns_roce_tptr_free(hr_dev);
+
error_failed_tptr_init:
hns_roce_bt_free(hr_dev);
error_failed_bt_init:
- hns_roce_port_enable(hr_dev, HNS_ROCE_PORT_DOWN);
hns_roce_raq_free(hr_dev);
error_failed_raq_init:
@@ -1080,9 +1120,10 @@ int hns_roce_v1_init(struct hns_roce_dev *hr_dev)
void hns_roce_v1_exit(struct hns_roce_dev *hr_dev)
{
+ hns_roce_port_enable(hr_dev, HNS_ROCE_PORT_DOWN);
+ hns_roce_des_qp_free(hr_dev);
hns_roce_tptr_free(hr_dev);
hns_roce_bt_free(hr_dev);
- hns_roce_port_enable(hr_dev, HNS_ROCE_PORT_DOWN);
hns_roce_raq_free(hr_dev);
hns_roce_db_free(hr_dev);
}
@@ -2906,132 +2947,334 @@ int hns_roce_v1_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
hns_roce_v1_q_sqp(ibqp, qp_attr, qp_attr_mask, qp_init_attr) :
hns_roce_v1_q_qp(ibqp, qp_attr, qp_attr_mask, qp_init_attr);
}
-static void hns_roce_v1_destroy_qp_common(struct hns_roce_dev *hr_dev,
- struct hns_roce_qp *hr_qp,
- int is_user)
+
+static int check_qp_db_process_status(struct hns_roce_dev *hr_dev,
+ struct hns_roce_qp *hr_qp,
+ u32 sdb_issue_ptr,
+ u32 *sdb_inv_cnt,
+ u32 *wait_stage)
{
- u32 sdbinvcnt;
- unsigned long end = 0;
- u32 sdbinvcnt_val;
- u32 sdbsendptr_val;
- u32 sdbisusepr_val;
- struct hns_roce_cq *send_cq, *recv_cq;
struct device *dev = &hr_dev->pdev->dev;
+ u32 sdb_retry_cnt, old_retry;
+ u32 sdb_send_ptr, old_send;
+ u32 success_flags = 0;
+ u32 cur_cnt, old_cnt;
+ unsigned long end;
+ u32 send_ptr;
+ u32 inv_cnt;
+ u32 tsp_st;
+
+ if (*wait_stage > HNS_ROCE_V1_DB_STAGE2 ||
+ *wait_stage < HNS_ROCE_V1_DB_STAGE1) {
+ dev_err(dev, "QP(0x%lx) db status wait stage(%d) error!\n",
+ hr_qp->qpn, *wait_stage);
+ return -EINVAL;
+ }
- if (hr_qp->ibqp.qp_type == IB_QPT_RC) {
- if (hr_qp->state != IB_QPS_RESET) {
- /*
- * Set qp to ERR,
- * waiting for hw complete processing all dbs
- */
- if (hns_roce_v1_qp_modify(hr_dev, NULL,
- to_hns_roce_state(
- (enum ib_qp_state)hr_qp->state),
- HNS_ROCE_QP_STATE_ERR, NULL,
- hr_qp))
- dev_err(dev, "modify QP %06lx to ERR failed.\n",
- hr_qp->qpn);
-
- /* Record issued doorbell */
- sdbisusepr_val = roce_read(hr_dev,
- ROCEE_SDB_ISSUE_PTR_REG);
- /*
- * Query db process status,
- * until hw process completely
- */
- end = msecs_to_jiffies(
- HNS_ROCE_QP_DESTROY_TIMEOUT_MSECS) + jiffies;
- do {
- sdbsendptr_val = roce_read(hr_dev,
+ /* Calculate the total timeout for the entire verification process */
+ end = msecs_to_jiffies(HNS_ROCE_V1_CHECK_DB_TIMEOUT_MSECS) + jiffies;
+
+ if (*wait_stage == HNS_ROCE_V1_DB_STAGE1) {
+ /* Query db process status, until hw process completely */
+ sdb_send_ptr = roce_read(hr_dev, ROCEE_SDB_SEND_PTR_REG);
+ while (roce_hw_index_cmp_lt(sdb_send_ptr, sdb_issue_ptr,
+ ROCEE_SDB_PTR_CMP_BITS)) {
+ if (!time_before(jiffies, end)) {
+ dev_dbg(dev, "QP(0x%lx) db process stage1 timeout. issue 0x%x send 0x%x.\n",
+ hr_qp->qpn, sdb_issue_ptr,
+ sdb_send_ptr);
+ return 0;
+ }
+
+ msleep(HNS_ROCE_V1_CHECK_DB_SLEEP_MSECS);
+ sdb_send_ptr = roce_read(hr_dev,
ROCEE_SDB_SEND_PTR_REG);
- if (!time_before(jiffies, end)) {
- dev_err(dev, "destroy qp(0x%lx) timeout!!!",
- hr_qp->qpn);
- break;
- }
- } while ((short)(roce_get_field(sdbsendptr_val,
- ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M,
- ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S) -
- roce_get_field(sdbisusepr_val,
- ROCEE_SDB_ISSUE_PTR_SDB_ISSUE_PTR_M,
- ROCEE_SDB_ISSUE_PTR_SDB_ISSUE_PTR_S)
- ) < 0);
+ }
- /* Get list pointer */
- sdbinvcnt = roce_read(hr_dev, ROCEE_SDB_INV_CNT_REG);
+ if (roce_get_field(sdb_issue_ptr,
+ ROCEE_SDB_ISSUE_PTR_SDB_ISSUE_PTR_M,
+ ROCEE_SDB_ISSUE_PTR_SDB_ISSUE_PTR_S) ==
+ roce_get_field(sdb_send_ptr,
+ ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M,
+ ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S)) {
+ old_send = roce_read(hr_dev, ROCEE_SDB_SEND_PTR_REG);
+ old_retry = roce_read(hr_dev, ROCEE_SDB_RETRY_CNT_REG);
- /* Query db's list status, until hw reversal */
do {
- sdbinvcnt_val = roce_read(hr_dev,
- ROCEE_SDB_INV_CNT_REG);
+ tsp_st = roce_read(hr_dev, ROCEE_TSP_BP_ST_REG);
+ if (roce_get_bit(tsp_st,
+ ROCEE_TSP_BP_ST_QH_FIFO_ENTRY_S) == 1) {
+ *wait_stage = HNS_ROCE_V1_DB_WAIT_OK;
+ return 0;
+ }
+
if (!time_before(jiffies, end)) {
- dev_err(dev, "destroy qp(0x%lx) timeout!!!",
- hr_qp->qpn);
- dev_err(dev, "SdbInvCnt = 0x%x\n",
- sdbinvcnt_val);
- break;
+ dev_dbg(dev, "QP(0x%lx) db process stage1 timeout when send ptr equals issue ptr.\n"
+ "issue 0x%x send 0x%x.\n",
+ hr_qp->qpn, sdb_issue_ptr,
+ sdb_send_ptr);
+ return 0;
}
- } while ((short)(roce_get_field(sdbinvcnt_val,
- ROCEE_SDB_INV_CNT_SDB_INV_CNT_M,
- ROCEE_SDB_INV_CNT_SDB_INV_CNT_S) -
- (sdbinvcnt + SDB_INV_CNT_OFFSET)) < 0);
-
- /* Modify qp to reset before destroying qp */
- if (hns_roce_v1_qp_modify(hr_dev, NULL,
- to_hns_roce_state(
- (enum ib_qp_state)hr_qp->state),
- HNS_ROCE_QP_STATE_RST, NULL, hr_qp))
- dev_err(dev, "modify QP %06lx to RESET failed.\n",
- hr_qp->qpn);
+
+ msleep(HNS_ROCE_V1_CHECK_DB_SLEEP_MSECS);
+
+ sdb_send_ptr = roce_read(hr_dev,
+ ROCEE_SDB_SEND_PTR_REG);
+ sdb_retry_cnt = roce_read(hr_dev,
+ ROCEE_SDB_RETRY_CNT_REG);
+ cur_cnt = roce_get_field(sdb_send_ptr,
+ ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M,
+ ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S) +
+ roce_get_field(sdb_retry_cnt,
+ ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_M,
+ ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_S);
+ if (!roce_get_bit(tsp_st,
+ ROCEE_CNT_CLR_CE_CNT_CLR_CE_S)) {
+ old_cnt = roce_get_field(old_send,
+ ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M,
+ ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S) +
+ roce_get_field(old_retry,
+ ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_M,
+ ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_S);
+ if (cur_cnt - old_cnt > SDB_ST_CMP_VAL)
+ success_flags = 1;
+ } else {
+ old_cnt = roce_get_field(old_send,
+ ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M,
+ ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S);
+ if (cur_cnt - old_cnt > SDB_ST_CMP_VAL)
+ success_flags = 1;
+ else {
+ send_ptr = roce_get_field(old_send,
+ ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M,
+ ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S) +
+ roce_get_field(sdb_retry_cnt,
+ ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_M,
+ ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_S);
+ roce_set_field(old_send,
+ ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M,
+ ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S,
+ send_ptr);
+ }
+ }
+ } while (!success_flags);
+ }
+
+ *wait_stage = HNS_ROCE_V1_DB_STAGE2;
+
+ /* Get list pointer */
+ *sdb_inv_cnt = roce_read(hr_dev, ROCEE_SDB_INV_CNT_REG);
+ dev_dbg(dev, "QP(0x%lx) db process stage2. inv cnt = 0x%x.\n",
+ hr_qp->qpn, *sdb_inv_cnt);
+ }
+
+ if (*wait_stage == HNS_ROCE_V1_DB_STAGE2) {
+ /* Query db's list status, until hw reversal */
+ inv_cnt = roce_read(hr_dev, ROCEE_SDB_INV_CNT_REG);
+ while (roce_hw_index_cmp_lt(inv_cnt,
+ *sdb_inv_cnt + SDB_INV_CNT_OFFSET,
+ ROCEE_SDB_CNT_CMP_BITS)) {
+ if (!time_before(jiffies, end)) {
+ dev_dbg(dev, "QP(0x%lx) db process stage2 timeout. inv cnt 0x%x.\n",
+ hr_qp->qpn, inv_cnt);
+ return 0;
+ }
+
+ msleep(HNS_ROCE_V1_CHECK_DB_SLEEP_MSECS);
+ inv_cnt = roce_read(hr_dev, ROCEE_SDB_INV_CNT_REG);
}
+
+ *wait_stage = HNS_ROCE_V1_DB_WAIT_OK;
+ }
+
+ return 0;
+}
+
+static int check_qp_reset_state(struct hns_roce_dev *hr_dev,
+ struct hns_roce_qp *hr_qp,
+ struct hns_roce_qp_work *qp_work_entry,
+ int *is_timeout)
+{
+ struct device *dev = &hr_dev->pdev->dev;
+ u32 sdb_issue_ptr;
+ int ret;
+
+ if (hr_qp->state != IB_QPS_RESET) {
+ /* Set qp to ERR, waiting for hw complete processing all dbs */
+ ret = hns_roce_v1_modify_qp(&hr_qp->ibqp, NULL, 0, hr_qp->state,
+ IB_QPS_ERR);
+ if (ret) {
+ dev_err(dev, "Modify QP(0x%lx) to ERR failed!\n",
+ hr_qp->qpn);
+ return ret;
+ }
+
+ /* Record issued doorbell */
+ sdb_issue_ptr = roce_read(hr_dev, ROCEE_SDB_ISSUE_PTR_REG);
+ qp_work_entry->sdb_issue_ptr = sdb_issue_ptr;
+ qp_work_entry->db_wait_stage = HNS_ROCE_V1_DB_STAGE1;
+
+ /* Query db process status, until hw process completely */
+ ret = check_qp_db_process_status(hr_dev, hr_qp, sdb_issue_ptr,
+ &qp_work_entry->sdb_inv_cnt,
+ &qp_work_entry->db_wait_stage);
+ if (ret) {
+ dev_err(dev, "Check QP(0x%lx) db process status failed!\n",
+ hr_qp->qpn);
+ return ret;
+ }
+
+ if (qp_work_entry->db_wait_stage != HNS_ROCE_V1_DB_WAIT_OK) {
+ qp_work_entry->sche_cnt = 0;
+ *is_timeout = 1;
+ return 0;
+ }
+
+ /* Modify qp to reset before destroying qp */
+ ret = hns_roce_v1_modify_qp(&hr_qp->ibqp, NULL, 0, hr_qp->state,
+ IB_QPS_RESET);
+ if (ret) {
+ dev_err(dev, "Modify QP(0x%lx) to RST failed!\n",
+ hr_qp->qpn);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static void hns_roce_v1_destroy_qp_work_fn(struct work_struct *work)
+{
+ struct hns_roce_qp_work *qp_work_entry;
+ struct hns_roce_v1_priv *priv;
+ struct hns_roce_dev *hr_dev;
+ struct hns_roce_qp *hr_qp;
+ struct device *dev;
+ int ret;
+
+ qp_work_entry = container_of(work, struct hns_roce_qp_work, work);
+ hr_dev = to_hr_dev(qp_work_entry->ib_dev);
+ dev = &hr_dev->pdev->dev;
+ priv = (struct hns_roce_v1_priv *)hr_dev->hw->priv;
+ hr_qp = qp_work_entry->qp;
+
+ dev_dbg(dev, "Schedule destroy QP(0x%lx) work.\n", hr_qp->qpn);
+
+ qp_work_entry->sche_cnt++;
+
+ /* Query db process status, until hw process completely */
+ ret = check_qp_db_process_status(hr_dev, hr_qp,
+ qp_work_entry->sdb_issue_ptr,
+ &qp_work_entry->sdb_inv_cnt,
+ &qp_work_entry->db_wait_stage);
+ if (ret) {
+ dev_err(dev, "Check QP(0x%lx) db process status failed!\n",
+ hr_qp->qpn);
+ return;
+ }
+
+ if (qp_work_entry->db_wait_stage != HNS_ROCE_V1_DB_WAIT_OK &&
+ priv->des_qp.requeue_flag) {
+ queue_work(priv->des_qp.qp_wq, work);
+ return;
+ }
+
+ /* Modify qp to reset before destroying qp */
+ ret = hns_roce_v1_modify_qp(&hr_qp->ibqp, NULL, 0, hr_qp->state,
+ IB_QPS_RESET);
+ if (ret) {
+ dev_err(dev, "Modify QP(0x%lx) to RST failed!\n", hr_qp->qpn);
+ return;
+ }
+
+ hns_roce_qp_remove(hr_dev, hr_qp);
+ hns_roce_qp_free(hr_dev, hr_qp);
+
+ if (hr_qp->ibqp.qp_type == IB_QPT_RC) {
+ /* RC QP, release QPN */
+ hns_roce_release_range_qp(hr_dev, hr_qp->qpn, 1);
+ kfree(hr_qp);
+ } else
+ kfree(hr_to_hr_sqp(hr_qp));
+
+ kfree(qp_work_entry);
+
+ dev_dbg(dev, "Accomplished destroy QP(0x%lx) work.\n", hr_qp->qpn);
+}
+
+int hns_roce_v1_destroy_qp(struct ib_qp *ibqp)
+{
+ struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
+ struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
+ struct device *dev = &hr_dev->pdev->dev;
+ struct hns_roce_qp_work qp_work_entry;
+ struct hns_roce_qp_work *qp_work;
+ struct hns_roce_v1_priv *priv;
+ struct hns_roce_cq *send_cq, *recv_cq;
+ int is_user = !!ibqp->pd->uobject;
+ int is_timeout = 0;
+ int ret;
+
+ ret = check_qp_reset_state(hr_dev, hr_qp, &qp_work_entry, &is_timeout);
+ if (ret) {
+ dev_err(dev, "QP reset state check failed(%d)!\n", ret);
+ return ret;
}
send_cq = to_hr_cq(hr_qp->ibqp.send_cq);
recv_cq = to_hr_cq(hr_qp->ibqp.recv_cq);
hns_roce_lock_cqs(send_cq, recv_cq);
-
if (!is_user) {
__hns_roce_v1_cq_clean(recv_cq, hr_qp->qpn, hr_qp->ibqp.srq ?
to_hr_srq(hr_qp->ibqp.srq) : NULL);
if (send_cq != recv_cq)
__hns_roce_v1_cq_clean(send_cq, hr_qp->qpn, NULL);
}
-
- hns_roce_qp_remove(hr_dev, hr_qp);
-
hns_roce_unlock_cqs(send_cq, recv_cq);
- hns_roce_qp_free(hr_dev, hr_qp);
+ if (!is_timeout) {
+ hns_roce_qp_remove(hr_dev, hr_qp);
+ hns_roce_qp_free(hr_dev, hr_qp);
- /* Not special_QP, free their QPN */
- if ((hr_qp->ibqp.qp_type == IB_QPT_RC) ||
- (hr_qp->ibqp.qp_type == IB_QPT_UC) ||
- (hr_qp->ibqp.qp_type == IB_QPT_UD))
- hns_roce_release_range_qp(hr_dev, hr_qp->qpn, 1);
+ /* RC QP, release QPN */
+ if (hr_qp->ibqp.qp_type == IB_QPT_RC)
+ hns_roce_release_range_qp(hr_dev, hr_qp->qpn, 1);
+ }
hns_roce_mtt_cleanup(hr_dev, &hr_qp->mtt);
- if (is_user) {
+ if (is_user)
ib_umem_release(hr_qp->umem);
- } else {
+ else {
kfree(hr_qp->sq.wrid);
kfree(hr_qp->rq.wrid);
+
hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf);
}
-}
-int hns_roce_v1_destroy_qp(struct ib_qp *ibqp)
-{
- struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
- struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
-
- hns_roce_v1_destroy_qp_common(hr_dev, hr_qp, !!ibqp->pd->uobject);
-
- if (hr_qp->ibqp.qp_type == IB_QPT_GSI)
- kfree(hr_to_hr_sqp(hr_qp));
- else
- kfree(hr_qp);
+ if (!is_timeout) {
+ if (hr_qp->ibqp.qp_type == IB_QPT_RC)
+ kfree(hr_qp);
+ else
+ kfree(hr_to_hr_sqp(hr_qp));
+ } else {
+ qp_work = kzalloc(sizeof(*qp_work), GFP_KERNEL);
+ if (!qp_work)
+ return -ENOMEM;
+
+ INIT_WORK(&qp_work->work, hns_roce_v1_destroy_qp_work_fn);
+ qp_work->ib_dev = &hr_dev->ib_dev;
+ qp_work->qp = hr_qp;
+ qp_work->db_wait_stage = qp_work_entry.db_wait_stage;
+ qp_work->sdb_issue_ptr = qp_work_entry.sdb_issue_ptr;
+ qp_work->sdb_inv_cnt = qp_work_entry.sdb_inv_cnt;
+ qp_work->sche_cnt = qp_work_entry.sche_cnt;
+
+ priv = (struct hns_roce_v1_priv *)hr_dev->hw->priv;
+ queue_work(priv->des_qp.qp_wq, &qp_work->work);
+ dev_dbg(dev, "Begin destroy QP(0x%lx) work.\n", hr_qp->qpn);
+ }
return 0;
}
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.h b/drivers/infiniband/hw/hns/hns_roce_hw_v1.h
index cf28f1b..1d250c0 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.h
@@ -102,6 +102,12 @@
#define HNS_ROCE_V1_EXT_ODB_ALFUL \
(HNS_ROCE_V1_EXT_ODB_DEPTH - HNS_ROCE_V1_DB_RSVD)
+#define HNS_ROCE_V1_DB_WAIT_OK 0
+#define HNS_ROCE_V1_DB_STAGE1 1
+#define HNS_ROCE_V1_DB_STAGE2 2
+#define HNS_ROCE_V1_CHECK_DB_TIMEOUT_MSECS 10000
+#define HNS_ROCE_V1_CHECK_DB_SLEEP_MSECS 20
+
#define HNS_ROCE_BT_RSV_BUF_SIZE (1 << 17)
#define HNS_ROCE_V1_TPTR_ENTRY_SIZE 2
@@ -144,6 +150,7 @@
#define SQ_PSN_SHIFT 8
#define QKEY_VAL 0x80010000
#define SDB_INV_CNT_OFFSET 8
+#define SDB_ST_CMP_VAL 8
struct hns_roce_cq_context {
u32 cqc_byte_4;
@@ -993,11 +1000,27 @@ struct hns_roce_tptr_table {
struct hns_roce_buf_list tptr_buf;
};
+struct hns_roce_qp_work {
+ struct work_struct work;
+ struct ib_device *ib_dev;
+ struct hns_roce_qp *qp;
+ u32 db_wait_stage;
+ u32 sdb_issue_ptr;
+ u32 sdb_inv_cnt;
+ u32 sche_cnt;
+};
+
+struct hns_roce_des_qp {
+ struct workqueue_struct *qp_wq;
+ int requeue_flag;
+};
+
struct hns_roce_v1_priv {
struct hns_roce_db_table db_table;
struct hns_roce_raq_table raq_table;
struct hns_roce_bt_table bt_table;
struct hns_roce_tptr_table tptr_table;
+ struct hns_roce_des_qp des_qp;
};
int hns_dsaf_roce_reset(struct fwnode_handle *dsaf_fwnode, bool dereset);
--
1.7.9.5
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH for-next 0/6] IB/hns: Bug Fixes for HNS RoCE Driver
From: Salil Mehta @ 2016-11-29 23:10 UTC (permalink / raw)
To: dledford-H+wXaHxf7aLQT0dZR+AlfA
Cc: salil.mehta-hv44wF8Li93QT0dZR+AlfA,
xavier.huwei-hv44wF8Li93QT0dZR+AlfA,
oulijun-hv44wF8Li93QT0dZR+AlfA, xushaobo2-hv44wF8Li93QT0dZR+AlfA,
mehta.salil.lnk-Re5JQEeQqe8AvxtiuMwx3w, lijun_nudt-9Onoh4P/yGk,
linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
linuxarm-hv44wF8Li93QT0dZR+AlfA
This patch-set contains bug fixes for the HNS RoCE driver.
Lijun Ou (1):
IB/hns: Fix the IB device name
Shaobo Xu (2):
IB/hns: Fix the bug when free mr
IB/hns: Fix the bug when free cq
Wei Hu (Xavier) (3):
IB/hns: Fix the bug when destroy qp
IB/hns: Fix the bug of setting port mtu
IB/hns: Delete the redundant memset operation
drivers/infiniband/hw/hns/hns_roce_cmd.h | 5 -
drivers/infiniband/hw/hns/hns_roce_common.h | 42 ++
drivers/infiniband/hw/hns/hns_roce_cq.c | 27 +-
drivers/infiniband/hw/hns/hns_roce_device.h | 18 +
drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 967 ++++++++++++++++++++++++---
drivers/infiniband/hw/hns/hns_roce_hw_v1.h | 57 ++
drivers/infiniband/hw/hns/hns_roce_main.c | 26 +-
drivers/infiniband/hw/hns/hns_roce_mr.c | 21 +-
8 files changed, 1026 insertions(+), 137 deletions(-)
--
1.7.9.5
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [bug report] net/mlx5: E-Switch, Add control for inline mode
From: Dan Carpenter @ 2016-11-29 21:25 UTC (permalink / raw)
To: Roi Dayan; +Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <fa981369-2032-eb9e-6307-f3055297e7e6-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
On Tue, Nov 29, 2016 at 05:06:55PM +0200, Roi Dayan wrote:
> I really didn't see this. sparse and smatch didn't catch this. I use
> FC24 and gcc 6.2.1 for compilation and it doesn't give me any
> warning.
> Did another check with an old rhel machine with gcc 4.4.6 and I got
> a warning about it there.
> Can you tell me which tool you used?
Some new unreleased Smatch stuff I'm working on.
>
> This is a false positive because of the first check if vport > 1, we
> actually check prev_mlx5_mode from the second for-loop
> iteration and on the first loop iteration we set a value (in line 952).
> It is used to check if all vports are set with the same inline mode.
Oh crap... Smatch takes a short cut handling loops (I really need to
fix this but I wrote the orginal code a decade ago). I misread how the
loops works as well. Sorry for the noise.
regards,
dan carpenter
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* [PATCH v2] ethernet :mellanox :mlx5: Replace pci_pool_alloc by pci_pool_zalloc
From: Souptick Joarder @ 2016-11-29 21:20 UTC (permalink / raw)
To: sergei.shtylyov-M4DtvfQ/ZS1MRgGoP+s0PdBPR1lH4CV8,
saeedm-VPRAkNaXOzVWk0Htik3J/w, matanb-VPRAkNaXOzVWk0Htik3J/w,
leonro-VPRAkNaXOzVWk0Htik3J/w
Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
sahu.rameshwar73-Re5JQEeQqe8AvxtiuMwx3w
In alloc_cmd_box(), pci_pool_alloc() followed by memset will be
replaced by pci_pool_zalloc()
Signed-off-by: Souptick joarder <jrdr.linux-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
---
v2:
- Address comments from Sergei
Alignment was not proper
drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 1e639f8..94a23f6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -1063,14 +1063,13 @@ static struct mlx5_cmd_mailbox *alloc_cmd_box(struct mlx5_core_dev *dev,
if (!mailbox)
return ERR_PTR(-ENOMEM);
- mailbox->buf = pci_pool_alloc(dev->cmd.pool, flags,
- &mailbox->dma);
+ mailbox->buf = pci_pool_zalloc(dev->cmd.pool, flags,
+ &mailbox->dma);
if (!mailbox->buf) {
mlx5_core_dbg(dev, "failed allocation\n");
kfree(mailbox);
return ERR_PTR(-ENOMEM);
}
- memset(mailbox->buf, 0, sizeof(struct mlx5_cmd_prot_block));
mailbox->next = NULL;
return mailbox;
--
1.9.1
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* Re: [PATCH v3] ethernet :mellanox :mlx4: Replace pci_pool_alloc by pci_pool_zalloc
From: Souptick Joarder @ 2016-11-29 20:29 UTC (permalink / raw)
To: Yuval Shaia
Cc: Sergei Shtylyov, yishaih-VPRAkNaXOzVWk0Htik3J/w,
netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
Rameshwar Sahu
In-Reply-To: <20161129195811.GA4221-Hxa29pjIrETlQW142y8m19+IiqhCXseY@public.gmane.org>
Hi Yuval,
> v3:
> - Fixed alignment issues
You mean remove extra empty line?
Yes, I removed the empty line and also I had fixed one alignment issue
which was mentioned
by Sergei in v2 patch.
On Wed, Nov 30, 2016 at 1:28 AM, Yuval Shaia <yuval.shaia-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org> wrote:
> On Wed, Nov 30, 2016 at 01:16:12AM +0530, Souptick Joarder wrote:
>> In mlx4_alloc_cmd_mailbox(), pci_pool_alloc() followed by memset will be
>> replaced by pci_pool_zalloc()
>>
>> Signed-off-by: Souptick joarder <jrdr.linux-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>> ---
>> v3:
>> - Fixed alignment issues
>
> You mean remove extra empty line?
>
> Reviewed-by: Yuval Shaia <yuval.shaia-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
>
>>
>> v2:
>> - Address comment from sergei
>> Alignment was not proper
>>
>> drivers/net/ethernet/mellanox/mlx4/cmd.c | 6 ++----
>> 1 file changed, 2 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c
>> index e36bebc..a49072b4 100644
>> --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
>> +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
>> @@ -2679,15 +2679,13 @@ struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev)
>> if (!mailbox)
>> return ERR_PTR(-ENOMEM);
>>
>> - mailbox->buf = pci_pool_alloc(mlx4_priv(dev)->cmd.pool, GFP_KERNEL,
>> - &mailbox->dma);
>> + mailbox->buf = pci_pool_zalloc(mlx4_priv(dev)->cmd.pool, GFP_KERNEL,
>> + &mailbox->dma);
>> if (!mailbox->buf) {
>> kfree(mailbox);
>> return ERR_PTR(-ENOMEM);
>> }
>>
>> - memset(mailbox->buf, 0, MLX4_MAILBOX_SIZE);
>> -
>> return mailbox;
>> }
>> EXPORT_SYMBOL_GPL(mlx4_alloc_cmd_mailbox);
>> --
>> 1.9.1
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
>> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [PATCH v3] ethernet :mellanox :mlx4: Replace pci_pool_alloc by pci_pool_zalloc
From: Yuval Shaia @ 2016-11-29 19:58 UTC (permalink / raw)
To: Souptick Joarder
Cc: sergei.shtylyov, yishaih, netdev, linux-rdma, sahu.rameshwar73
In-Reply-To: <20161129194611.GA4088@jordon-HP-15-Notebook-PC>
On Wed, Nov 30, 2016 at 01:16:12AM +0530, Souptick Joarder wrote:
> In mlx4_alloc_cmd_mailbox(), pci_pool_alloc() followed by memset will be
> replaced by pci_pool_zalloc()
>
> Signed-off-by: Souptick joarder <jrdr.linux@gmail.com>
> ---
> v3:
> - Fixed alignment issues
You mean remove extra empty line?
Reviewed-by: Yuval Shaia <yuval.shaia@oracle.com>
>
> v2:
> - Address comment from sergei
> Alignment was not proper
>
> drivers/net/ethernet/mellanox/mlx4/cmd.c | 6 ++----
> 1 file changed, 2 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c
> index e36bebc..a49072b4 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
> @@ -2679,15 +2679,13 @@ struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev)
> if (!mailbox)
> return ERR_PTR(-ENOMEM);
>
> - mailbox->buf = pci_pool_alloc(mlx4_priv(dev)->cmd.pool, GFP_KERNEL,
> - &mailbox->dma);
> + mailbox->buf = pci_pool_zalloc(mlx4_priv(dev)->cmd.pool, GFP_KERNEL,
> + &mailbox->dma);
> if (!mailbox->buf) {
> kfree(mailbox);
> return ERR_PTR(-ENOMEM);
> }
>
> - memset(mailbox->buf, 0, MLX4_MAILBOX_SIZE);
> -
> return mailbox;
> }
> EXPORT_SYMBOL_GPL(mlx4_alloc_cmd_mailbox);
> --
> 1.9.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* [PATCH v3] ethernet :mellanox :mlx4: Replace pci_pool_alloc by pci_pool_zalloc
From: Souptick Joarder @ 2016-11-29 19:46 UTC (permalink / raw)
To: sergei.shtylyov, yishaih; +Cc: netdev, linux-rdma, sahu.rameshwar73
In mlx4_alloc_cmd_mailbox(), pci_pool_alloc() followed by memset will be
replaced by pci_pool_zalloc()
Signed-off-by: Souptick joarder <jrdr.linux@gmail.com>
---
v3:
- Fixed alignment issues
v2:
- Address comment from sergei
Alignment was not proper
drivers/net/ethernet/mellanox/mlx4/cmd.c | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index e36bebc..a49072b4 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -2679,15 +2679,13 @@ struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev)
if (!mailbox)
return ERR_PTR(-ENOMEM);
- mailbox->buf = pci_pool_alloc(mlx4_priv(dev)->cmd.pool, GFP_KERNEL,
- &mailbox->dma);
+ mailbox->buf = pci_pool_zalloc(mlx4_priv(dev)->cmd.pool, GFP_KERNEL,
+ &mailbox->dma);
if (!mailbox->buf) {
kfree(mailbox);
return ERR_PTR(-ENOMEM);
}
- memset(mailbox->buf, 0, MLX4_MAILBOX_SIZE);
-
return mailbox;
}
EXPORT_SYMBOL_GPL(mlx4_alloc_cmd_mailbox);
^ permalink raw reply related
* RE: [PATCH v5 0/8] connect reject event helpers
From: Steve Wise @ 2016-11-29 17:05 UTC (permalink / raw)
To: dledford-H+wXaHxf7aLQT0dZR+AlfA,
sean.hefty-ral2JQCrhuEAvxtiuMwx3w
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
bart.vanassche-XdAiOPVOjttBDgjK7y7TUQ,
linux-nvme-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
sagi-NQWnxTmZq1alnMjI0IkVqw, hch-jcswGhMUV9g, axboe-b10kYP2dOMg,
santosh.shilimkar-QHcLZuEGTsvQT0dZR+AlfA
In-Reply-To: <cover.1477510827.git.swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
Hey Doug,
Are you planning to merge this series for 4.10? They're still marked 'new' on
patchworks.
Thanks,
Steve.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [RFC 02/10] IB/hfi-vnic: Virtual Network Interface Controller (VNIC) Bus driver
From: Jason Gunthorpe @ 2016-11-29 16:50 UTC (permalink / raw)
To: Hefty, Sean
Cc: Vishwanathapura, Niranjana, Weiny, Ira, Doug Ledford,
linux-rdma@vger.kernel.org, netdev@vger.kernel.org,
Dalessandro, Dennis
In-Reply-To: <1828884A29C6694DAF28B7E6B8A82373AB0B9B46@ORSMSX109.amr.corp.intel.com>
On Tue, Nov 29, 2016 at 04:44:37PM +0000, Hefty, Sean wrote:
> > You are not making a subsystem. Don't overcomplicate things. A
> > multi-part device device can just directly link.
>
> The VNIC may be usable over multiple generations of HFIs, but I
> don't know if that is the intent.
If Intel wants to build a HFI subystem within RDMA with multiple
drivers then sure, but they are not there yet, and we don't even know
what that could look like. So it is better to leave it simple for now.
Jason
^ permalink raw reply
* RE: [RFC 02/10] IB/hfi-vnic: Virtual Network Interface Controller (VNIC) Bus driver
From: Hefty, Sean @ 2016-11-29 16:44 UTC (permalink / raw)
To: Jason Gunthorpe, Vishwanathapura, Niranjana
Cc: Weiny, Ira, Doug Ledford,
linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
Dalessandro, Dennis
In-Reply-To: <20161129161950.GB742-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
> You are not making a subsystem. Don't overcomplicate things. A
> multi-part device device can just directly link.
The VNIC may be usable over multiple generations of HFIs, but I don't know if that is the intent.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [PATCH 1/2] SRP transport: Move queuecommand() wait code to SCSI core
From: Martin K. Petersen @ 2016-11-29 16:22 UTC (permalink / raw)
To: Bart Van Assche
Cc: Max Gurtovoy, James Bottomley, Martin K. Petersen, Doug Ledford,
Christoph Hellwig, Sagi Grimberg, linux-scsi@vger.kernel.org,
linux-rdma@vger.kernel.org
In-Reply-To: <BLUPR02MB16836EF63187DE03E7F1DD77818D0-Y8PPn9RqzNfZ9ihocuPUdanrV9Ap65cLvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
>>>>> "Bart" == Bart Van Assche <Bart.VanAssche-XdAiOPVOjttBDgjK7y7TUQ@public.gmane.org> writes:
Bart> On 11/25/16 15:21, Max Gurtovoy wrote:
>> the arg is sdev.
Bart> Good catch.
Bart> Martin, do you want me to repost this patch or do you want me to
Bart> post a fix-up patch?
I fixed it up.
--
Martin K. Petersen Oracle Linux Engineering
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [RFC 02/10] IB/hfi-vnic: Virtual Network Interface Controller (VNIC) Bus driver
From: Jason Gunthorpe @ 2016-11-29 16:21 UTC (permalink / raw)
To: Vishwanathapura, Niranjana
Cc: ira.weiny, Doug Ledford, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
netdev-u79uwXL29TY76Z2rM5mHXA, Dennis Dalessandro
In-Reply-To: <20161129062938.GA84990-wPcXA7LoDC+1XWohqUldA0EOCMrvLtNR@public.gmane.org>
On Mon, Nov 28, 2016 at 10:29:38PM -0800, Vishwanathapura, Niranjana wrote:
> On Thu, Nov 24, 2016 at 09:15:45AM -0700, Jason Gunthorpe wrote:
> >>And will move the hfi_vnic module under
> >>???drivers/infiniband/ulp/hfi_vnic???.
> >
> >I would prefer drivers/net/ethernet
> >
> >This is clearly not a ULP since it doesn't use verbs.
> >
>
> I understand it is not using verbs, but the control path (ib_device client)
> is using verbs (IB MAD).
> Our prefernce is to keep it somewhere under drivers/infiniband. Summarizing
> reasons again here,
>
> - VNIC control driver (ib_device client) is an IB MAD agent.
> - It is purly a software construct, encapsualtes ethernet packets in
> Omni-path packet and depends on hfi1 driver here for HW access.
Is the majority of the code MAD focused or net stack focused?
I'm not sure it matters, it isn't like we can review Intel's
proprietary mad stuff anyhow. :\
Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [RFC 02/10] IB/hfi-vnic: Virtual Network Interface Controller (VNIC) Bus driver
From: Jason Gunthorpe @ 2016-11-29 16:19 UTC (permalink / raw)
To: Vishwanathapura, Niranjana
Cc: ira.weiny, Doug Ledford, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
netdev-u79uwXL29TY76Z2rM5mHXA, Dennis Dalessandro
In-Reply-To: <20161129063106.GB84990-wPcXA7LoDC+1XWohqUldA0EOCMrvLtNR@public.gmane.org>
On Mon, Nov 28, 2016 at 10:31:06PM -0800, Vishwanathapura, Niranjana wrote:
> On Fri, Nov 25, 2016 at 12:05:09PM -0700, Jason Gunthorpe wrote:
> >On Thu, Nov 24, 2016 at 06:13:50PM -0800, Vishwanathapura, Niranjana wrote:
> >
> >>In order to be truely device independent the hfi_vnic ULP should not depend
> >>on a device exported symbol. Instead device should register its functions
> >>with the ULP. Hence the approaches a) and b).
> >
> >It is not device independent, it is hard linked to hfi1, just like our
> >other multi-component drivers.. So don't worry about that.
> >
>
> We would like to keep the design clean and avoid any tight coupling here
> (our original design in this series tackled these).
> Any strong reason not to go with a) or b) ?
You are not making a subsystem. Don't overcomplicate things. A
multi-part device device can just directly link.
Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [PATCH rdma-next 01/10] IB/core: Add raw packet protocol
From: Jason Gunthorpe @ 2016-11-29 16:19 UTC (permalink / raw)
To: Or Gerlitz
Cc: Yishai Hadas, Matan Barak, Doug Ledford,
linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Steve Wise,
Moni Shoua
In-Reply-To: <CAJ3xEMiC3UDujgSL5fwP7ee1=OjhorZ2aeB1k+ptGb9GWaUVkg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
On Tue, Nov 29, 2016 at 08:35:52AM +0200, Or Gerlitz wrote:
> > or at least it should or things are already broken.
>
> Can you elaborate what it broken with mlx4? I suspect we
> even have down there some functionality which depends on that,
> but again, I 1st would like to hear if/what is broken - I copied the maintainer.
The semantic we require is that everything under a struct ib_device is
compatible, you can use an AH on any PD with any QP on any port. As an
example wee absolutely do not allow iwarp and ib on the same
ib_device, (rdma_cm will break horribly, at least).
The fact rocee and ib sort of work together as-is represents a fluke
not a design goal :(
> Jason, patches 8-10 which carry the functional change I want to introduce
> (allow mlx5 IB devices to be created when RoCE is not supported) stand
> for themselves.
I asked for the port_num to be removed, and the uapi for it to be
device not port specific. Don't see why that is such a big deal..
> a re-write of the 10y old IB device-ing of things done by mlx4 just
> to be able and introduce this reduced functionally (raw packet qp
> only) of mlx5 devices.
Seems totally fair to me for patches adding a new uapi. Why do you
guys keep thinking uapis should be easy?
Who else is going to fix this?
Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* [PATCH v3 10/10] svcrdma: Further clean-up of svc_rdma_get_inv_rkey()
From: Chuck Lever @ 2016-11-29 16:05 UTC (permalink / raw)
To: bfields-uC3wQj2KruNg9hUCZPvPmw
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20161129155521.4477.53561.stgit-Hs+gFlyCn65vLzlybtyyYzGyq/o6K9yX@public.gmane.org>
No longer any need for the dprintk().
Signed-off-by: Chuck Lever <chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
---
net/sunrpc/xprtrdma/svc_rdma_sendto.c | 19 +++++--------------
1 file changed, 5 insertions(+), 14 deletions(-)
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 30eeab5..ad4d286 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -199,31 +199,22 @@ static u32 svc_rdma_get_inv_rkey(struct rpcrdma_msg *rdma_argp,
{
struct rpcrdma_read_chunk *rd_ary;
struct rpcrdma_segment *arg_ch;
- u32 inv_rkey;
-
- inv_rkey = 0;
rd_ary = (struct rpcrdma_read_chunk *)&rdma_argp->rm_body.rm_chunks[0];
- if (rd_ary->rc_discrim != xdr_zero) {
- inv_rkey = be32_to_cpu(rd_ary->rc_target.rs_handle);
- goto out;
- }
+ if (rd_ary->rc_discrim != xdr_zero)
+ return be32_to_cpu(rd_ary->rc_target.rs_handle);
if (wr_ary && be32_to_cpu(wr_ary->wc_nchunks)) {
arg_ch = &wr_ary->wc_array[0].wc_target;
- inv_rkey = be32_to_cpu(arg_ch->rs_handle);
- goto out;
+ return be32_to_cpu(arg_ch->rs_handle);
}
if (rp_ary && be32_to_cpu(rp_ary->wc_nchunks)) {
arg_ch = &rp_ary->wc_array[0].wc_target;
- inv_rkey = be32_to_cpu(arg_ch->rs_handle);
- goto out;
+ return be32_to_cpu(arg_ch->rs_handle);
}
-out:
- dprintk("svcrdma: Send With Invalidate rkey=%08x\n", inv_rkey);
- return inv_rkey;
+ return 0;
}
/* Assumptions:
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH v3 09/10] svcrdma: Break up dprintk format in svc_rdma_accept()
From: Chuck Lever @ 2016-11-29 16:05 UTC (permalink / raw)
To: bfields-uC3wQj2KruNg9hUCZPvPmw
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20161129155521.4477.53561.stgit-Hs+gFlyCn65vLzlybtyyYzGyq/o6K9yX@public.gmane.org>
The current code results in:
Nov 7 14:50:19 klimt kernel: svcrdma: newxprt->sc_cm_id=ffff88085590c800,
newxprt->sc_pd=ffff880852a7ce00#012 cm_id->device=ffff88084dd20000,
sc_pd->device=ffff88084dd20000#012 cap.max_send_wr = 272#012
cap.max_recv_wr = 34#012 cap.max_send_sge = 32#012
cap.max_recv_sge = 32
Nov 7 14:50:19 klimt kernel: svcrdma: new connection ffff880855908000
accepted with the following attributes:#012 local_ip :
10.0.0.5#012 local_port#011 : 20049#012 remote_ip :
10.0.0.2#012 remote_port : 59909#012 max_sge : 32#012
max_sge_rd : 30#012 sq_depth : 272#012 max_requests :
32#012 ord : 16
Split up the output over multiple dprintks and take the opportunity
to fix the display of IPv6 addresses.
Signed-off-by: Chuck Lever <chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
---
net/sunrpc/xprtrdma/svc_rdma_transport.c | 55 ++++++++++--------------------
1 file changed, 18 insertions(+), 37 deletions(-)
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 733d6e03..ca2799a 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -41,6 +41,7 @@
*/
#include <linux/sunrpc/svc_xprt.h>
+#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/debug.h>
#include <linux/sunrpc/rpc_rdma.h>
#include <linux/interrupt.h>
@@ -968,6 +969,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
struct rpcrdma_connect_private pmsg;
struct ib_qp_init_attr qp_attr;
struct ib_device *dev;
+ struct sockaddr *sap;
unsigned int i;
int ret = 0;
@@ -1048,18 +1050,12 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
qp_attr.qp_type = IB_QPT_RC;
qp_attr.send_cq = newxprt->sc_sq_cq;
qp_attr.recv_cq = newxprt->sc_rq_cq;
- dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n"
- " cm_id->device=%p, sc_pd->device=%p\n"
- " cap.max_send_wr = %d\n"
- " cap.max_recv_wr = %d\n"
- " cap.max_send_sge = %d\n"
- " cap.max_recv_sge = %d\n",
- newxprt->sc_cm_id, newxprt->sc_pd,
- dev, newxprt->sc_pd->device,
- qp_attr.cap.max_send_wr,
- qp_attr.cap.max_recv_wr,
- qp_attr.cap.max_send_sge,
- qp_attr.cap.max_recv_sge);
+ dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n",
+ newxprt->sc_cm_id, newxprt->sc_pd);
+ dprintk(" cap.max_send_wr = %d, cap.max_recv_wr = %d\n",
+ qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr);
+ dprintk(" cap.max_send_sge = %d, cap.max_recv_sge = %d\n",
+ qp_attr.cap.max_send_sge, qp_attr.cap.max_recv_sge);
ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr);
if (ret) {
@@ -1142,31 +1138,16 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
goto errout;
}
- dprintk("svcrdma: new connection %p accepted with the following "
- "attributes:\n"
- " local_ip : %pI4\n"
- " local_port : %d\n"
- " remote_ip : %pI4\n"
- " remote_port : %d\n"
- " max_sge : %d\n"
- " max_sge_rd : %d\n"
- " sq_depth : %d\n"
- " max_requests : %d\n"
- " ord : %d\n",
- newxprt,
- &((struct sockaddr_in *)&newxprt->sc_cm_id->
- route.addr.src_addr)->sin_addr.s_addr,
- ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
- route.addr.src_addr)->sin_port),
- &((struct sockaddr_in *)&newxprt->sc_cm_id->
- route.addr.dst_addr)->sin_addr.s_addr,
- ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
- route.addr.dst_addr)->sin_port),
- newxprt->sc_max_sge,
- newxprt->sc_max_sge_rd,
- newxprt->sc_sq_depth,
- newxprt->sc_max_requests,
- newxprt->sc_ord);
+ dprintk("svcrdma: new connection %p accepted:\n", newxprt);
+ sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
+ dprintk(" local address : %pIS:%u\n", sap, rpc_get_port(sap));
+ sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
+ dprintk(" remote address : %pIS:%u\n", sap, rpc_get_port(sap));
+ dprintk(" max_sge : %d\n", newxprt->sc_max_sge);
+ dprintk(" max_sge_rd : %d\n", newxprt->sc_max_sge_rd);
+ dprintk(" sq_depth : %d\n", newxprt->sc_sq_depth);
+ dprintk(" max_requests : %d\n", newxprt->sc_max_requests);
+ dprintk(" ord : %d\n", newxprt->sc_ord);
return &newxprt->sc_xprt;
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH v3 08/10] svcrdma: Remove unused variable in rdma_copy_tail()
From: Chuck Lever @ 2016-11-29 16:05 UTC (permalink / raw)
To: bfields-uC3wQj2KruNg9hUCZPvPmw
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20161129155521.4477.53561.stgit-Hs+gFlyCn65vLzlybtyyYzGyq/o6K9yX@public.gmane.org>
Clean up.
linux-2.6/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c: In function
‘rdma_copy_tail’:
linux-2.6/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c:376:6: warning:
variable ‘ret’ set but not used [-Wunused-but-set-variable]
int ret;
^
Fixes: a97c331f9aa9 ("svcrdma: Handle additional inline content")
Signed-off-by: Chuck Lever <chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
---
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 2 --
1 file changed, 2 deletions(-)
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 428ab4ef..57d35fb 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -373,9 +373,7 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
u32 position, u32 byte_count, u32 page_offset, int page_no)
{
char *srcp, *destp;
- int ret;
- ret = 0;
srcp = head->arg.head[0].iov_base + position;
byte_count = head->arg.head[0].iov_len - position;
if (byte_count > PAGE_SIZE) {
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH v3 07/10] svcrdma: Remove unused variables in xprt_rdma_bc_allocate()
From: Chuck Lever @ 2016-11-29 16:05 UTC (permalink / raw)
To: bfields-uC3wQj2KruNg9hUCZPvPmw
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20161129155521.4477.53561.stgit-Hs+gFlyCn65vLzlybtyyYzGyq/o6K9yX@public.gmane.org>
Clean up.
/linux-2.6/net/sunrpc/xprtrdma/svc_rdma_backchannel.c: In function
‘xprt_rdma_bc_allocate’:
linux-2.6/net/sunrpc/xprtrdma/svc_rdma_backchannel.c:169:23: warning:
variable ‘rdma’ set but not used [-Wunused-but-set-variable]
struct svcxprt_rdma *rdma;
^
Fixes: 5d252f90a800 ("svcrdma: Add class for RDMA backwards ...")
Signed-off-by: Chuck Lever <chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
---
net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 4 ----
1 file changed, 4 deletions(-)
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index 6035c5a..288e35c 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -164,13 +164,9 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
xprt_rdma_bc_allocate(struct rpc_task *task)
{
struct rpc_rqst *rqst = task->tk_rqstp;
- struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt;
size_t size = rqst->rq_callsize;
- struct svcxprt_rdma *rdma;
struct page *page;
- rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt);
-
if (size > PAGE_SIZE) {
WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n",
size);
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH v3 06/10] svcrdma: Remove svc_rdma_op_ctxt::wc_status
From: Chuck Lever @ 2016-11-29 16:05 UTC (permalink / raw)
To: bfields-uC3wQj2KruNg9hUCZPvPmw
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20161129155521.4477.53561.stgit-Hs+gFlyCn65vLzlybtyyYzGyq/o6K9yX@public.gmane.org>
Clean up: Completion status is already reported in the individual
completion handlers. Save a few bytes in struct svc_rdma_op_ctxt.
Signed-off-by: Chuck Lever <chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
---
include/linux/sunrpc/svc_rdma.h | 1 -
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 4 ++--
net/sunrpc/xprtrdma/svc_rdma_transport.c | 1 -
3 files changed, 2 insertions(+), 4 deletions(-)
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 43d7c70..757fb96 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -79,7 +79,6 @@ struct svc_rdma_op_ctxt {
struct ib_cqe reg_cqe;
struct ib_cqe inv_cqe;
struct list_head dto_q;
- enum ib_wc_status wc_status;
u32 byte_len;
u32 position;
struct svcxprt_rdma *xprt;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 283246e..428ab4ef 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -640,8 +640,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
goto defer;
goto out;
}
- dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n",
- ctxt, rdma_xprt, rqstp, ctxt->wc_status);
+ dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p\n",
+ ctxt, rdma_xprt, rqstp);
atomic_inc(&rdma_stat_recv);
/* Build up the XDR from the receive buffers. */
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 61ae392..733d6e03 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -395,7 +395,6 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
/* WARNING: Only wc->wr_cqe and wc->status are reliable */
ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe);
- ctxt->wc_status = wc->status;
svc_rdma_unmap_dma(ctxt);
if (wc->status != IB_WC_SUCCESS)
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH v3 05/10] svcrdma: Remove DMA map accounting
From: Chuck Lever @ 2016-11-29 16:04 UTC (permalink / raw)
To: bfields-uC3wQj2KruNg9hUCZPvPmw
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20161129155521.4477.53561.stgit-Hs+gFlyCn65vLzlybtyyYzGyq/o6K9yX@public.gmane.org>
Clean up: sc_dma_used is not required for correct operation. It is
simply a debugging tool to report when svcrdma has leaked DMA maps.
However, manipulating an atomic has a measurable CPU cost, and DMA
map accounting specific to svcrdma will be meaningless once svcrdma
is converted to use the new generic r/w API.
A similar kind of debug accounting can be done simply by enabling
the IOMMU or by using CONFIG_DMA_API_DEBUG, CONFIG_IOMMU_DEBUG, and
CONFIG_IOMMU_LEAK.
Signed-off-by: Chuck Lever <chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
---
include/linux/sunrpc/svc_rdma.h | 2 --
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 1 -
net/sunrpc/xprtrdma/svc_rdma_transport.c | 13 +++----------
3 files changed, 3 insertions(+), 13 deletions(-)
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 601cb07..43d7c70 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -148,7 +148,6 @@ struct svcxprt_rdma {
struct ib_pd *sc_pd;
- atomic_t sc_dma_used;
spinlock_t sc_ctxt_lock;
struct list_head sc_ctxts;
int sc_ctxt_used;
@@ -200,7 +199,6 @@ static inline void svc_rdma_count_mappings(struct svcxprt_rdma *rdma,
struct svc_rdma_op_ctxt *ctxt)
{
ctxt->mapped_sges++;
- atomic_inc(&rdma->sc_dma_used);
}
/* svc_rdma_backchannel.c */
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 873c2a9..283246e 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -279,7 +279,6 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
frmr->sg);
return -ENOMEM;
}
- atomic_inc(&xprt->sc_dma_used);
n = ib_map_mr_sg(frmr->mr, frmr->sg, frmr->sg_nents, NULL, PAGE_SIZE);
if (unlikely(n != frmr->sg_nents)) {
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 3e68e3d..61ae392 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -226,25 +226,22 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
struct svcxprt_rdma *xprt = ctxt->xprt;
struct ib_device *device = xprt->sc_cm_id->device;
u32 lkey = xprt->sc_pd->local_dma_lkey;
- unsigned int i, count;
+ unsigned int i;
- for (count = 0, i = 0; i < ctxt->mapped_sges; i++) {
+ for (i = 0; i < ctxt->mapped_sges; i++) {
/*
* Unmap the DMA addr in the SGE if the lkey matches
* the local_dma_lkey, otherwise, ignore it since it is
* an FRMR lkey and will be unmapped later when the
* last WR that uses it completes.
*/
- if (ctxt->sge[i].lkey == lkey) {
- count++;
+ if (ctxt->sge[i].lkey == lkey)
ib_dma_unmap_page(device,
ctxt->sge[i].addr,
ctxt->sge[i].length,
ctxt->direction);
- }
}
ctxt->mapped_sges = 0;
- atomic_sub(count, &xprt->sc_dma_used);
}
void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
@@ -946,7 +943,6 @@ void svc_rdma_put_frmr(struct svcxprt_rdma *rdma,
if (frmr) {
ib_dma_unmap_sg(rdma->sc_cm_id->device,
frmr->sg, frmr->sg_nents, frmr->direction);
- atomic_dec(&rdma->sc_dma_used);
spin_lock_bh(&rdma->sc_frmr_q_lock);
WARN_ON_ONCE(!list_empty(&frmr->frmr_list));
list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
@@ -1258,9 +1254,6 @@ static void __svc_rdma_free(struct work_struct *work)
if (rdma->sc_ctxt_used != 0)
pr_err("svcrdma: ctxt still in use? (%d)\n",
rdma->sc_ctxt_used);
- if (atomic_read(&rdma->sc_dma_used) != 0)
- pr_err("svcrdma: dma still in use? (%d)\n",
- atomic_read(&rdma->sc_dma_used));
/* Final put of backchannel client transport */
if (xprt->xpt_bc_xprt) {
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH v3 04/10] svcrdma: Remove BH-disabled spin locking in svc_rdma_send()
From: Chuck Lever @ 2016-11-29 16:04 UTC (permalink / raw)
To: bfields-uC3wQj2KruNg9hUCZPvPmw
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20161129155521.4477.53561.stgit-Hs+gFlyCn65vLzlybtyyYzGyq/o6K9yX@public.gmane.org>
svcrdma's current SQ accounting algorithm takes sc_lock and disables
bottom-halves while posting all RDMA Read, Write, and Send WRs.
This is relatively heavyweight serialization. And note that Write and
Send are already fully serialized by the xpt_mutex.
Using a single atomic_t should be all that is necessary to guarantee
that ib_post_send() is called only when there is enough space on the
send queue. This is what the other RDMA-enabled storage targets do.
Signed-off-by: Chuck Lever <chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
---
include/linux/sunrpc/svc_rdma.h | 2 +-
net/sunrpc/xprtrdma/svc_rdma_sendto.c | 7 ++++++-
net/sunrpc/xprtrdma/svc_rdma_transport.c | 25 ++++++++++---------------
3 files changed, 17 insertions(+), 17 deletions(-)
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 6aef63b..601cb07 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -139,7 +139,7 @@ struct svcxprt_rdma {
int sc_max_sge_rd; /* max sge for read target */
bool sc_snd_w_inv; /* OK to use Send With Invalidate */
- atomic_t sc_sq_count; /* Number of SQ WR on queue */
+ atomic_t sc_sq_avail; /* SQEs ready to be consumed */
unsigned int sc_sq_depth; /* Depth of SQ */
unsigned int sc_rq_depth; /* Depth of RQ */
u32 sc_max_requests; /* Forward credits */
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 0a58d40..30eeab5 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -594,7 +594,12 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
goto err0;
inline_bytes = rqstp->rq_res.len;
- /* Create the RDMA response header */
+ /* Create the RDMA response header. xprt->xpt_mutex,
+ * acquired in svc_send(), serializes RPC replies. The
+ * code path below that inserts the credit grant value
+ * into each transport header runs only inside this
+ * critical section.
+ */
ret = -ENOMEM;
res_page = alloc_page(GFP_KERNEL);
if (!res_page)
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 1334de2..3e68e3d 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -436,7 +436,7 @@ static void svc_rdma_send_wc_common(struct svcxprt_rdma *xprt,
goto err;
out:
- atomic_dec(&xprt->sc_sq_count);
+ atomic_inc(&xprt->sc_sq_avail);
wake_up(&xprt->sc_send_wait);
return;
@@ -1010,6 +1010,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
newxprt->sc_rq_depth = newxprt->sc_max_requests +
newxprt->sc_max_bc_requests;
newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth;
+ atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);
if (!svc_rdma_prealloc_ctxts(newxprt))
goto errout;
@@ -1339,15 +1340,13 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
/* If the SQ is full, wait until an SQ entry is available */
while (1) {
- spin_lock_bh(&xprt->sc_lock);
- if (xprt->sc_sq_depth < atomic_read(&xprt->sc_sq_count) + wr_count) {
- spin_unlock_bh(&xprt->sc_lock);
+ if ((atomic_sub_return(wr_count, &xprt->sc_sq_avail) < 0)) {
atomic_inc(&rdma_stat_sq_starve);
/* Wait until SQ WR available if SQ still full */
+ atomic_add(wr_count, &xprt->sc_sq_avail);
wait_event(xprt->sc_send_wait,
- atomic_read(&xprt->sc_sq_count) <
- xprt->sc_sq_depth);
+ atomic_read(&xprt->sc_sq_avail) > wr_count);
if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
return -ENOTCONN;
continue;
@@ -1357,21 +1356,17 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
svc_xprt_get(&xprt->sc_xprt);
/* Bump used SQ WR count and post */
- atomic_add(wr_count, &xprt->sc_sq_count);
ret = ib_post_send(xprt->sc_qp, wr, &bad_wr);
if (ret) {
set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
- atomic_sub(wr_count, &xprt->sc_sq_count);
for (i = 0; i < wr_count; i ++)
svc_xprt_put(&xprt->sc_xprt);
- dprintk("svcrdma: failed to post SQ WR rc=%d, "
- "sc_sq_count=%d, sc_sq_depth=%d\n",
- ret, atomic_read(&xprt->sc_sq_count),
- xprt->sc_sq_depth);
- }
- spin_unlock_bh(&xprt->sc_lock);
- if (ret)
+ dprintk("svcrdma: failed to post SQ WR rc=%d\n", ret);
+ dprintk(" sc_sq_avail=%d, sc_sq_depth=%d\n",
+ atomic_read(&xprt->sc_sq_avail),
+ xprt->sc_sq_depth);
wake_up(&xprt->sc_send_wait);
+ }
break;
}
return ret;
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH v3 03/10] svcrdma: Renovate sendto chunk list parsing
From: Chuck Lever @ 2016-11-29 16:04 UTC (permalink / raw)
To: bfields-uC3wQj2KruNg9hUCZPvPmw
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20161129155521.4477.53561.stgit-Hs+gFlyCn65vLzlybtyyYzGyq/o6K9yX@public.gmane.org>
The current sendto code appears to support clients that provide only
one of a Read list, a Write list, or a Reply chunk. My reading of
that code is that it doesn't support the following cases:
- Read list + Write list
- Read list + Reply chunk
- Write list + Reply chunk
- Read list + Write list + Reply chunk
The protocol allows more than one Read or Write chunk in those
lists. Some clients do send a Read list and Reply chunk
simultaneously. NFSv4 WRITE uses a Read list for the data payload,
and a Reply chunk because the GETATTR result in the reply can
contain a large object like an ACL.
Generalize one of the sendto code paths needed to support all of
the above cases, and attempt to ensure that only one pass is done
through the RPC Call's transport header to gather chunk list
information for building the reply.
Signed-off-by: Chuck Lever <chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
---
include/linux/sunrpc/svc_rdma.h | 2 -
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 14 +++++
net/sunrpc/xprtrdma/svc_rdma_sendto.c | 92 ++++++++-----------------------
3 files changed, 39 insertions(+), 69 deletions(-)
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index cc3ae16..6aef63b 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -236,8 +236,6 @@ extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *,
extern int svc_rdma_map_xdr(struct svcxprt_rdma *, struct xdr_buf *,
struct svc_rdma_req_map *, bool);
extern int svc_rdma_sendto(struct svc_rqst *);
-extern struct rpcrdma_read_chunk *
- svc_rdma_get_read_chunk(struct rpcrdma_msg *);
extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *,
int);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index ad1df97..873c2a9 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -415,6 +415,20 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
return 1;
}
+/* Returns the address of the first read chunk or <nul> if no read chunk
+ * is present
+ */
+static struct rpcrdma_read_chunk *
+svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp)
+{
+ struct rpcrdma_read_chunk *ch =
+ (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+
+ if (ch->rc_discrim == xdr_zero)
+ return NULL;
+ return ch;
+}
+
static int rdma_read_chunks(struct svcxprt_rdma *xprt,
struct rpcrdma_msg *rmsgp,
struct svc_rqst *rqstp,
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index f5a91ed..0a58d40 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -153,76 +153,35 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt,
return dma_addr;
}
-/* Returns the address of the first read chunk or <nul> if no read chunk
- * is present
+/* Parse the RPC Call's transport header.
*/
-struct rpcrdma_read_chunk *
-svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp)
+static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp,
+ struct rpcrdma_write_array **write,
+ struct rpcrdma_write_array **reply)
{
- struct rpcrdma_read_chunk *ch =
- (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+ __be32 *p;
- if (ch->rc_discrim == xdr_zero)
- return NULL;
- return ch;
-}
-
-/* Returns the address of the first read write array element or <nul>
- * if no write array list is present
- */
-static struct rpcrdma_write_array *
-svc_rdma_get_write_array(struct rpcrdma_msg *rmsgp)
-{
- if (rmsgp->rm_body.rm_chunks[0] != xdr_zero ||
- rmsgp->rm_body.rm_chunks[1] == xdr_zero)
- return NULL;
- return (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[1];
-}
+ p = (__be32 *)&rmsgp->rm_body.rm_chunks[0];
-/* Returns the address of the first reply array element or <nul> if no
- * reply array is present
- */
-static struct rpcrdma_write_array *
-svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp,
- struct rpcrdma_write_array *wr_ary)
-{
- struct rpcrdma_read_chunk *rch;
- struct rpcrdma_write_array *rp_ary;
+ /* Read list */
+ while (*p++ != xdr_zero)
+ p += 5;
- /* XXX: Need to fix when reply chunk may occur with read list
- * and/or write list.
- */
- if (rmsgp->rm_body.rm_chunks[0] != xdr_zero ||
- rmsgp->rm_body.rm_chunks[1] != xdr_zero)
- return NULL;
-
- rch = svc_rdma_get_read_chunk(rmsgp);
- if (rch) {
- while (rch->rc_discrim != xdr_zero)
- rch++;
-
- /* The reply chunk follows an empty write array located
- * at 'rc_position' here. The reply array is at rc_target.
- */
- rp_ary = (struct rpcrdma_write_array *)&rch->rc_target;
- goto found_it;
- }
-
- if (wr_ary) {
- int chunk = be32_to_cpu(wr_ary->wc_nchunks);
-
- rp_ary = (struct rpcrdma_write_array *)
- &wr_ary->wc_array[chunk].wc_target.rs_length;
- goto found_it;
+ /* Write list */
+ if (*p != xdr_zero) {
+ *write = (struct rpcrdma_write_array *)p;
+ while (*p++ != xdr_zero)
+ p += 1 + be32_to_cpu(*p) * 4;
+ } else {
+ *write = NULL;
+ p++;
}
- /* No read list, no write list */
- rp_ary = (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[2];
-
- found_it:
- if (rp_ary->wc_discrim == xdr_zero)
- return NULL;
- return rp_ary;
+ /* Reply chunk */
+ if (*p != xdr_zero)
+ *reply = (struct rpcrdma_write_array *)p;
+ else
+ *reply = NULL;
}
/* RPC-over-RDMA Version One private extension: Remote Invalidation.
@@ -244,8 +203,8 @@ static u32 svc_rdma_get_inv_rkey(struct rpcrdma_msg *rdma_argp,
inv_rkey = 0;
- rd_ary = svc_rdma_get_read_chunk(rdma_argp);
- if (rd_ary) {
+ rd_ary = (struct rpcrdma_read_chunk *)&rdma_argp->rm_body.rm_chunks[0];
+ if (rd_ary->rc_discrim != xdr_zero) {
inv_rkey = be32_to_cpu(rd_ary->rc_target.rs_handle);
goto out;
}
@@ -622,8 +581,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
* places this at the start of page 0.
*/
rdma_argp = page_address(rqstp->rq_pages[0]);
- wr_ary = svc_rdma_get_write_array(rdma_argp);
- rp_ary = svc_rdma_get_reply_array(rdma_argp, wr_ary);
+ svc_rdma_get_write_arrays(rdma_argp, &wr_ary, &rp_ary);
inv_rkey = 0;
if (rdma->sc_snd_w_inv)
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH v3 02/10] svcauth_gss: Close connection when dropping an incoming message
From: Chuck Lever @ 2016-11-29 16:04 UTC (permalink / raw)
To: bfields-uC3wQj2KruNg9hUCZPvPmw
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20161129155521.4477.53561.stgit-Hs+gFlyCn65vLzlybtyyYzGyq/o6K9yX@public.gmane.org>
S5.3.3.1 of RFC 2203 requires that an incoming GSS-wrapped message
whose sequence number lies outside the current window is dropped.
The rationale is:
The reason for discarding requests silently is that the server
is unable to determine if the duplicate or out of range request
was due to a sequencing problem in the client, network, or the
operating system, or due to some quirk in routing, or a replay
attack by an intruder. Discarding the request allows the client
to recover after timing out, if indeed the duplication was
unintentional or well intended.
However, clients may rely on the server dropping the connection to
indicate that a retransmit is needed. Without a connection reset, a
client can wait forever without retransmitting, and the workload
just stops dead. I've reproduced this behavior by running xfstests
generic/323 on an NFSv4.0 mount with proto=rdma and sec=krb5i.
To address this issue, have the server close the connection when it
silently discards an incoming message due to a GSS sequence number
problem.
There are a few other places where the server will never reply.
Change those spots in a similar fashion.
Signed-off-by: Chuck Lever <chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
---
net/sunrpc/auth_gss/svcauth_gss.c | 2 +-
net/sunrpc/svc.c | 14 +++++++++-----
2 files changed, 10 insertions(+), 6 deletions(-)
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 45662d7..886e9d38 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1548,7 +1548,7 @@ static void destroy_use_gss_proxy_proc_entry(struct net *net) {}
ret = SVC_COMPLETE;
goto out;
drop:
- ret = SVC_DROP;
+ ret = SVC_CLOSE;
out:
if (rsci)
cache_put(&rsci->h, sn->rsc_cache);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 7c8070e..75f290b 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1155,8 +1155,7 @@ static __printf(2,3) void svc_printk(struct svc_rqst *rqstp, const char *fmt, ..
case SVC_DENIED:
goto err_bad_auth;
case SVC_CLOSE:
- if (test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags))
- svc_close_xprt(rqstp->rq_xprt);
+ goto close;
case SVC_DROP:
goto dropit;
case SVC_COMPLETE:
@@ -1246,7 +1245,7 @@ static __printf(2,3) void svc_printk(struct svc_rqst *rqstp, const char *fmt, ..
sendit:
if (svc_authorise(rqstp))
- goto dropit;
+ goto close;
return 1; /* Caller can now send it */
dropit:
@@ -1254,11 +1253,16 @@ static __printf(2,3) void svc_printk(struct svc_rqst *rqstp, const char *fmt, ..
dprintk("svc: svc_process dropit\n");
return 0;
+ close:
+ if (test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags))
+ svc_close_xprt(rqstp->rq_xprt);
+ dprintk("svc: svc_process close\n");
+ return 0;
+
err_short_len:
svc_printk(rqstp, "short len %Zd, dropping request\n",
argv->iov_len);
-
- goto dropit; /* drop request */
+ goto close;
err_bad_rpc:
serv->sv_stats->rpcbadfmt++;
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH v3 01/10] svcrdma: Clear xpt_bc_xps in xprt_setup_rdma_bc() error exit arm
From: Chuck Lever @ 2016-11-29 16:04 UTC (permalink / raw)
To: bfields-uC3wQj2KruNg9hUCZPvPmw
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20161129155521.4477.53561.stgit-Hs+gFlyCn65vLzlybtyyYzGyq/o6K9yX@public.gmane.org>
Logic copied from xs_setup_bc_tcp().
Fixes: 39a9beab5acb ('rpc: share one xps between all backchannels')
Signed-off-by: Chuck Lever <chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
---
net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index 20027f8..6035c5a 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -359,6 +359,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
out_fail:
xprt_rdma_free_addresses(xprt);
args->bc_xprt->xpt_bc_xprt = NULL;
+ args->bc_xprt->xpt_bc_xps = NULL;
xprt_put(xprt);
xprt_free(xprt);
return ERR_PTR(-EINVAL);
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox