Netdev List
 help / color / mirror / Atom feed
* [RFC 17/19] RDMA/irdma: Add ABI definitions
From: Shiraz Saleem @ 2019-02-12 21:44 UTC (permalink / raw)
  To: dledford, jgg, davem
  Cc: linux-rdma, netdev, mustafa.ismail, jeffrey.t.kirsher,
	Shiraz Saleem
In-Reply-To: <20190212214402.23284-1-shiraz.saleem@intel.com>

From: Mustafa Ismail <mustafa.ismail@intel.com>

Add ABI definitions for irdma.

Signed-off-by: Mustafa Ismail <mustafa.ismail@intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
---
 include/uapi/rdma/irdma-abi.h | 140 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 140 insertions(+)
 create mode 100644 include/uapi/rdma/irdma-abi.h

diff --git a/include/uapi/rdma/irdma-abi.h b/include/uapi/rdma/irdma-abi.h
new file mode 100644
index 0000000..04ce5c2
--- /dev/null
+++ b/include/uapi/rdma/irdma-abi.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2006 - 2019 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef IRDMA_ABI_H
+#define IRDMA_ABI_H
+
+#include <linux/types.h>
+
+#define IRDMA_ABI_VER	6
+
+enum irdma_memreg_type {
+	IW_MEMREG_TYPE_MEM  = 0,
+	IW_MEMREG_TYPE_QP   = 1,
+	IW_MEMREG_TYPE_CQ   = 2,
+	IW_MEMREG_TYPE_RSVD = 3,
+	IW_MEMREG_TYPE_MW   = 4,
+};
+
+struct irdma_alloc_ucontext_req {
+	__u32 rsvd32;
+	__u8 userspace_ver;
+	__u8 rsvd8[3];
+};
+
+struct irdma_alloc_ucontext_resp {
+	__u8 kernel_ver;
+	__u8 rsvd[7];
+	struct irdma_hw_attrs hw_attrs;
+};
+
+struct i40iw_alloc_ucontext_resp {
+	__u32 max_pds;		/* maximum pds allowed for this user process */
+	__u32 max_qps;		/* maximum qps allowed for this user process */
+	__u32 wq_size;		/* size of the WQs (sq+rq) allocated to the mmaped area */
+	__u8 kernel_ver;
+	__u8 reserved[3];
+};
+
+struct irdma_alloc_pd_resp {
+	__u32 pd_id;
+	__u8 rsvd[4];
+};
+
+struct irdma_create_cq_req {
+	__aligned_u64 user_cq_buf;
+	__aligned_u64 user_shadow_area;
+};
+
+struct irdma_create_qp_req {
+	__u64 user_wqe_bufs;
+	__u64 user_compl_ctx;
+};
+
+struct i40iw_create_qp_req {
+	__aligned_u64 user_wqe_buffers;
+	__aligned_u64 user_compl_ctx;
+
+	/* UDA QP PHB */
+	__aligned_u64 user_sq_phb;	/* place for VA of the sq phb buff */
+	__aligned_u64 user_rq_phb;	/* place for VA of the rq phb buff */
+};
+
+struct irdma_mem_reg_req {
+	__u16 reg_type;	 /* Memory, QP or CQ */
+	__u16 cq_pages;
+	__u16 rq_pages;
+	__u16 sq_pages;
+};
+
+struct irdma_create_cq_resp {
+	__u32 cq_id;
+	__u32 cq_size;
+};
+
+struct i40iw_create_cq_resp {
+	__u32 cq_id;
+	__u32 cq_size;
+	__u32 mmap_db_index;
+	__u32 reserved;
+};
+
+struct irdma_create_qp_resp {
+	__u32 qp_id;
+	__u32 actual_sq_size;
+	__u32 actual_rq_size;
+	__u32 irdma_drv_opt;
+	__u16 push_idx;
+	__u8  lsmm;
+	__u8  rsvd;
+	__u32 qp_caps;
+};
+
+struct i40iw_create_qp_resp {
+	__u32 qp_id;
+	__u32 actual_sq_size;
+	__u32 actual_rq_size;
+	__u32 i40iw_drv_opt;
+	__u16 push_idx;
+	__u8  lsmm;
+	__u8  rsvd2;
+};
+
+struct irdma_create_ah_resp {
+	__u32 ah_id;
+	__u32 rsvd[4];
+};
+#endif /* IRDMA_ABI_H */
-- 
1.8.3.1


^ permalink raw reply related

* [RFC 19/19] RDMA/irdma: Update MAINTAINERS file
From: Shiraz Saleem @ 2019-02-12 21:44 UTC (permalink / raw)
  To: dledford, jgg, davem
  Cc: linux-rdma, netdev, mustafa.ismail, jeffrey.t.kirsher,
	Shiraz Saleem
In-Reply-To: <20190212214402.23284-1-shiraz.saleem@intel.com>

Add maintainer entry for irdma driver.

Signed-off-by: Mustafa Ismail <mustafa.ismail@intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
---
 MAINTAINERS | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 8c68de3c..1b4fbc4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7807,7 +7807,15 @@ L:	linux-pm@vger.kernel.org
 S:	Supported
 F:	drivers/cpufreq/intel_pstate.c
 
-INTEL RDMA RNIC DRIVER
+INTEL ETHERNET RDMA DRIVER
+M:	Mustafa Ismail <mustafa.ismail@intel.com>
+M:	Shiraz Saleem <shiraz.saleem@intel.com>
+L:	linux-rdma@vger.kernel.org
+S:	Supported
+F:	drivers/infiniband/hw/irdma/
+F:	include/uapi/rdma/irdma-abi.h
+
+INTEL X722 RDMA RNIC DRIVER
 M:	Faisal Latif <faisal.latif@intel.com>
 M:	Shiraz Saleem <shiraz.saleem@intel.com>
 L:	linux-rdma@vger.kernel.org
-- 
1.8.3.1


^ permalink raw reply related

* [RFC 16/19] RDMA/irdma: Add dynamic tracing for CM
From: Shiraz Saleem @ 2019-02-12 21:43 UTC (permalink / raw)
  To: dledford, jgg, davem
  Cc: linux-rdma, netdev, mustafa.ismail, jeffrey.t.kirsher,
	Michael J. Ruhl, Shiraz Saleem
In-Reply-To: <20190212214402.23284-1-shiraz.saleem@intel.com>

From: "Michael J. Ruhl" <michael.j.ruhl@intel.com>

Add dynamic tracing functionality to debug connection
management issues.

Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
---
 drivers/infiniband/hw/irdma/trace.c    | 113 ++++++++
 drivers/infiniband/hw/irdma/trace.h    |   4 +
 drivers/infiniband/hw/irdma/trace_cm.h | 459 +++++++++++++++++++++++++++++++++
 3 files changed, 576 insertions(+)
 create mode 100644 drivers/infiniband/hw/irdma/trace.c
 create mode 100644 drivers/infiniband/hw/irdma/trace.h
 create mode 100644 drivers/infiniband/hw/irdma/trace_cm.h

diff --git a/drivers/infiniband/hw/irdma/trace.c b/drivers/infiniband/hw/irdma/trace.c
new file mode 100644
index 0000000..0b6fbdb
--- /dev/null
+++ b/drivers/infiniband/hw/irdma/trace.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB
+/* Copyright (c) 2019, Intel Corporation. */
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+const char *print_ip_addr(struct trace_seq *p, u32 *addr, u16 port, bool ipv4)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+
+	if (ipv4) {
+		u32 myaddr = ntohl(*addr);
+
+		trace_seq_printf(p, "%pI4:%d", &myaddr, ntohs(port));
+	} else {
+		trace_seq_printf(p, "%pI6:%d", addr, ntohs(port));
+	}
+	trace_seq_putc(p, 0);
+
+	return ret;
+}
+
+const char *parse_iw_event_type(enum iw_cm_event_type iw_type)
+{
+	switch (iw_type) {
+	case IW_CM_EVENT_CONNECT_REQUEST:
+		return "IwRequest";
+	case IW_CM_EVENT_CONNECT_REPLY:
+		return "IwReply";
+	case IW_CM_EVENT_ESTABLISHED:
+		return "IwEstablished";
+	case IW_CM_EVENT_DISCONNECT:
+		return "IwDisconnect";
+	case IW_CM_EVENT_CLOSE:
+		return "IwClose";
+	}
+
+	return "Unknown";
+}
+
+const char *parse_cm_event_type(enum irdma_cm_event_type cm_type)
+{
+	switch (cm_type) {
+	case IRDMA_CM_EVENT_ESTABLISHED:
+		return "CmEstablished";
+	case IRDMA_CM_EVENT_MPA_REQ:
+		return "CmMPA_REQ";
+	case IRDMA_CM_EVENT_MPA_CONNECT:
+		return "CmMPA_CONNECT";
+	case IRDMA_CM_EVENT_MPA_ACCEPT:
+		return "CmMPA_ACCEPT";
+	case IRDMA_CM_EVENT_MPA_REJECT:
+		return "CmMPA_REJECT";
+	case IRDMA_CM_EVENT_MPA_ESTABLISHED:
+		return "CmMPA_ESTABLISHED";
+	case IRDMA_CM_EVENT_CONNECTED:
+		return "CmConnected";
+	case IRDMA_CM_EVENT_RESET:
+		return "CmReset";
+	case IRDMA_CM_EVENT_ABORTED:
+		return "CmAborted";
+	case IRDMA_CM_EVENT_UNKNOWN:
+		return "none";
+	}
+	return "Unknown";
+}
+
+const char *parse_cm_state(enum irdma_cm_node_state state)
+{
+	switch (state) {
+	case IRDMA_CM_STATE_UNKNOWN:
+		return "UNKNOWN";
+	case IRDMA_CM_STATE_INITED:
+		return "INITED";
+	case IRDMA_CM_STATE_LISTENING:
+		return "LISTENING";
+	case IRDMA_CM_STATE_SYN_RCVD:
+		return "SYN_RCVD";
+	case IRDMA_CM_STATE_SYN_SENT:
+		return "SYN_SENT";
+	case IRDMA_CM_STATE_ONE_SIDE_ESTABLISHED:
+		return "ONE_SIDE_ESTABLISHED";
+	case IRDMA_CM_STATE_ESTABLISHED:
+		return "ESTABLISHED";
+	case IRDMA_CM_STATE_ACCEPTING:
+		return "ACCEPTING";
+	case IRDMA_CM_STATE_MPAREQ_SENT:
+		return "MPAREQ_SENT";
+	case IRDMA_CM_STATE_MPAREQ_RCVD:
+		return "MPAREQ_RCVD";
+	case IRDMA_CM_STATE_MPAREJ_RCVD:
+		return "MPAREJ_RECVD";
+	case IRDMA_CM_STATE_OFFLOADED:
+		return "OFFLOADED";
+	case IRDMA_CM_STATE_FIN_WAIT1:
+		return "FIN_WAIT1";
+	case IRDMA_CM_STATE_FIN_WAIT2:
+		return "FIN_WAIT2";
+	case IRDMA_CM_STATE_CLOSE_WAIT:
+		return "CLOSE_WAIT";
+	case IRDMA_CM_STATE_TIME_WAIT:
+		return "TIME_WAIT";
+	case IRDMA_CM_STATE_LAST_ACK:
+		return "LAST_ACK";
+	case IRDMA_CM_STATE_CLOSING:
+		return "CLOSING";
+	case IRDMA_CM_STATE_LISTENER_DESTROYED:
+		return "LISTENER_DESTROYED";
+	case IRDMA_CM_STATE_CLOSED:
+		return "CLOSED";
+	}
+	return ("Bad state");
+}
diff --git a/drivers/infiniband/hw/irdma/trace.h b/drivers/infiniband/hw/irdma/trace.h
new file mode 100644
index 0000000..40cd65e
--- /dev/null
+++ b/drivers/infiniband/hw/irdma/trace.h
@@ -0,0 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB */
+/* Copyright (c) 2019, Intel Corporation. */
+
+#include "trace_cm.h"
diff --git a/drivers/infiniband/hw/irdma/trace_cm.h b/drivers/infiniband/hw/irdma/trace_cm.h
new file mode 100644
index 0000000..551d1f5
--- /dev/null
+++ b/drivers/infiniband/hw/irdma/trace_cm.h
@@ -0,0 +1,459 @@
+/* SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB */
+/* Copyright (c) 2019, Intel Corporation. */
+
+#if !defined(__TRACE_CM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __TRACE_CM_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "main.h"
+
+const char *print_ip_addr(struct trace_seq *p, u32 *addr, u16 port, bool ivp4);
+const char *parse_iw_event_type(enum iw_cm_event_type iw_type);
+const char *parse_cm_event_type(enum irdma_cm_event_type cm_type);
+const char *parse_cm_state(enum irdma_cm_node_state);
+#define __print_ip_addr(addr, port, ipv4) print_ip_addr(p, addr, port, ipv4)
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM icrdma_cm
+
+TRACE_EVENT(irdma_create_listen,
+	    TP_PROTO(struct irdma_device *iwdev, struct irdma_cm_info *cm_info),
+	    TP_ARGS(iwdev, cm_info),
+	    TP_STRUCT__entry(__field(struct irdma_device *, iwdev)
+			     __dynamic_array(u32, laddr, 4)
+			     __field(u16, lport)
+			     __field(bool, ipv4)
+		    ),
+	    TP_fast_assign(__entry->iwdev = iwdev;
+			   __entry->lport = cm_info->loc_port;
+			   __entry->ipv4 = cm_info->ipv4;
+			   memcpy(__get_dynamic_array(laddr),
+				  cm_info->loc_addr, 4);
+		    ),
+	    TP_printk("iwdev=%p  loc: %s",
+		      __entry->iwdev,
+		      __print_ip_addr(__get_dynamic_array(laddr),
+				      __entry->lport, __entry->ipv4)
+		    )
+);
+
+TRACE_EVENT(irdma_dec_refcnt_listen,
+	    TP_PROTO(struct irdma_cm_listener *listener, void *caller),
+	    TP_ARGS(listener, caller),
+	    TP_STRUCT__entry(__field(struct irdma_device *, iwdev)
+		    __field(u32, refcnt)
+		    __dynamic_array(u32, laddr, 4)
+		    __field(u16, lport)
+		    __field(bool, ipv4)
+		    __field(void *, caller)
+		    ),
+	    TP_fast_assign(__entry->iwdev = listener->iwdev;
+			   __entry->lport = listener->loc_port;
+			   __entry->ipv4 = listener->ipv4;
+			   memcpy(__get_dynamic_array(laddr),
+				  listener->loc_addr, 4);
+		    ),
+	    TP_printk("iwdev=%p  caller=%pS  loc: %s",
+		      __entry->iwdev,
+		      __entry->caller,
+		      __print_ip_addr(__get_dynamic_array(laddr),
+				      __entry->lport, __entry->ipv4)
+		    )
+);
+
+DECLARE_EVENT_CLASS(listener_template,
+		    TP_PROTO(struct irdma_cm_listener *listener),
+		    TP_ARGS(listener),
+		    TP_STRUCT__entry(__field(struct irdma_device *, iwdev)
+				     __field(u16, lport)
+				     __field(u16, vlan_id)
+				     __field(bool, ipv4)
+				     __field(enum irdma_cm_listener_state,
+					     state)
+				     __dynamic_array(u32, laddr, 4)
+			    ),
+		    TP_fast_assign(__entry->iwdev = listener->iwdev;
+				   __entry->lport = listener->loc_port;
+				   __entry->vlan_id = listener->vlan_id;
+				   __entry->ipv4 = listener->ipv4;
+				   __entry->state = listener->listener_state;
+				   memcpy(__get_dynamic_array(laddr),
+					  listener->loc_addr, 4);
+			    ),
+		    TP_printk("iwdev=%p  vlan=%d  loc: %s",
+			      __entry->iwdev,
+			      __entry->vlan_id,
+			      __print_ip_addr(__get_dynamic_array(laddr),
+					      __entry->lport, __entry->ipv4)
+			    )
+);
+
+DEFINE_EVENT(listener_template, irdma_find_listener,
+	     TP_PROTO(struct irdma_cm_listener *listener),
+	     TP_ARGS(listener));
+
+DEFINE_EVENT(listener_template, irdma_del_multiple_qhash,
+	     TP_PROTO(struct irdma_cm_listener *listener),
+	     TP_ARGS(listener));
+
+TRACE_EVENT(irdma_negotiate_mpa_v2,
+	    TP_PROTO(struct irdma_cm_node *cm_node),
+	    TP_ARGS(cm_node),
+	    TP_STRUCT__entry(__field(struct irdma_cm_node *, cm_node)
+			     __field(u16, ord_size)
+			     __field(u16, ird_size)
+		    ),
+	    TP_fast_assign(__entry->cm_node = cm_node;
+			   __entry->ord_size = cm_node->ord_size;
+			   __entry->ird_size = cm_node->ird_size;
+		    ),
+	    TP_printk("MPVA2 Negotiated cm_node=%p ORD:[%d], IRD:[%d]",
+		      __entry->cm_node,
+		      __entry->ord_size,
+		      __entry->ird_size
+		    )
+);
+
+DECLARE_EVENT_CLASS(tos_template,
+		    TP_PROTO(struct irdma_device *iwdev, u8 tos, u8 user_pri),
+		    TP_ARGS(iwdev, tos, user_pri),
+		    TP_STRUCT__entry(__field(struct irdma_device *, iwdev)
+				     __field(u8, tos)
+				     __field(u8, user_pri)
+			    ),
+		    TP_fast_assign(__entry->iwdev = iwdev;
+				   __entry->tos = tos;
+				   __entry->user_pri = user_pri;
+			    ),
+		    TP_printk("iwdev=%p  TOS:[%d]  UP:[%d]",
+			      __entry->iwdev,
+			      __entry->tos,
+			      __entry->user_pri
+			    )
+);
+
+DEFINE_EVENT(tos_template, irdma_listener_tos,
+	     TP_PROTO(struct irdma_device *iwdev, u8 tos, u8 user_pri),
+	     TP_ARGS(iwdev, tos, user_pri));
+
+DEFINE_EVENT(tos_template, irdma_dcb_tos,
+	     TP_PROTO(struct irdma_device *iwdev, u8 tos, u8 user_pri),
+	     TP_ARGS(iwdev, tos, user_pri));
+
+DECLARE_EVENT_CLASS(qhash_template,
+		    TP_PROTO(struct irdma_device *iwdev,
+			     struct irdma_cm_listener *listener,
+			     char *dev_addr),
+		    TP_ARGS(iwdev, listener, dev_addr),
+		    TP_STRUCT__entry(__field(struct irdma_device *, iwdev)
+				     __field(u16, lport)
+				     __field(u16, vlan_id)
+				     __field(bool, ipv4)
+				     __dynamic_array(u32, laddr, 4)
+				     __dynamic_array(u32, mac, ETH_ALEN)
+			    ),
+		    TP_fast_assign(__entry->iwdev = iwdev;
+				   __entry->lport = listener->loc_port;
+				   __entry->vlan_id = listener->vlan_id;
+				   __entry->ipv4 = listener->ipv4;
+				   memcpy(__get_dynamic_array(laddr),
+					  listener->loc_addr, 4);
+				   ether_addr_copy(__get_dynamic_array(mac),
+						   dev_addr);
+			    ),
+		    TP_printk("iwdev=%p  vlan=%d  MAC=%pM  loc: %s",
+			      __entry->iwdev,
+			      __entry->vlan_id,
+			      __get_dynamic_array(mac),
+			      __print_ip_addr(__get_dynamic_array(laddr),
+					      __entry->lport, __entry->ipv4)
+		    )
+);
+
+DEFINE_EVENT(qhash_template, irdma_add_mqh_6,
+	     TP_PROTO(struct irdma_device *iwdev,
+		      struct irdma_cm_listener *listener, char *dev_addr),
+	     TP_ARGS(iwdev, listener, dev_addr));
+
+DEFINE_EVENT(qhash_template, irdma_add_mqh_4,
+	     TP_PROTO(struct irdma_device *iwdev,
+		      struct irdma_cm_listener *listener, char *dev_addr),
+	     TP_ARGS(iwdev, listener, dev_addr));
+
+TRACE_EVENT(irdma_addr_resolve,
+	    TP_PROTO(struct irdma_device *iwdev, char *dev_addr),
+	    TP_ARGS(iwdev, dev_addr),
+	    TP_STRUCT__entry(__field(struct irdma_device *, iwdev)
+		    __dynamic_array(u8, mac, ETH_ALEN)
+		    ),
+	    TP_fast_assign(__entry->iwdev = iwdev;
+		    ether_addr_copy(__get_dynamic_array(mac), dev_addr);
+		    ),
+	    TP_printk("iwdev=%p   MAC=%pM", __entry->iwdev,
+		      __get_dynamic_array(mac)
+		    )
+);
+
+TRACE_EVENT(irdma_send_cm_event,
+	    TP_PROTO(struct irdma_cm_node *cm_node, struct iw_cm_id *cm_id,
+		     enum iw_cm_event_type type, int status, void *caller),
+	    TP_ARGS(cm_node, cm_id, type, status, caller),
+	    TP_STRUCT__entry(__field(struct irdma_device *, iwdev)
+			     __field(struct irdma_cm_node *, cm_node)
+			     __field(struct iw_cm_id *, cm_id)
+			     __field(u32, refcount)
+			     __field(u16, lport)
+			     __field(u16, rport)
+			     __field(enum irdma_cm_node_state, state)
+			     __field(bool, ipv4)
+			     __field(u16, vlan_id)
+			     __field(int, accel)
+			     __field(enum iw_cm_event_type, type)
+			     __field(int, status)
+			     __field(void *, caller)
+			     __dynamic_array(u32, laddr, 4)
+			     __dynamic_array(u32, raddr, 4)
+		    ),
+	    TP_fast_assign(__entry->iwdev = cm_node->iwdev;
+			   __entry->cm_node = cm_node;
+			   __entry->cm_id = cm_id;
+			   __entry->refcount = atomic_read(&cm_node->ref_count);
+			   __entry->state = cm_node->state;
+			   __entry->lport = cm_node->loc_port;
+			   __entry->rport = cm_node->rem_port;
+			   __entry->ipv4 = cm_node->ipv4;
+			   __entry->vlan_id = cm_node->vlan_id;
+			   __entry->accel = cm_node->accelerated;
+			   __entry->type = type;
+			   __entry->status = status;
+			   __entry->caller = caller;
+			   memcpy(__get_dynamic_array(laddr),
+				  cm_node->loc_addr, 4);
+			   memcpy(__get_dynamic_array(raddr),
+				  cm_node->rem_addr, 4);
+		    ),
+	    TP_printk("iwdev=%p  caller=%pS  cm_id=%p  node=%p  refcnt=%d  vlan_id=%d  accel=%d  state=%s  event_type=%s  status=%d  loc: %s  rem: %s",
+		      __entry->iwdev,
+		      __entry->caller,
+		      __entry->cm_id,
+		      __entry->cm_node,
+		      __entry->refcount,
+		      __entry->vlan_id,
+		      __entry->accel,
+		      parse_cm_state(__entry->state),
+		      parse_iw_event_type(__entry->type),
+		      __entry->status,
+		      __print_ip_addr(__get_dynamic_array(laddr),
+				      __entry->lport, __entry->ipv4),
+		      __print_ip_addr(__get_dynamic_array(raddr),
+				      __entry->rport, __entry->ipv4)
+		    )
+);
+
+TRACE_EVENT(irdma_send_cm_event_no_node,
+	    TP_PROTO(struct iw_cm_id *cm_id, enum iw_cm_event_type type,
+		     int status, void *caller),
+	    TP_ARGS(cm_id, type, status, caller),
+	    TP_STRUCT__entry(__field(struct iw_cm_id *, cm_id)
+			     __field(enum iw_cm_event_type, type)
+			     __field(int, status)
+			     __field(void *, caller)
+		    ),
+	    TP_fast_assign(__entry->cm_id = cm_id;
+			   __entry->type = type;
+			   __entry->status = status;
+			   __entry->caller = caller;
+		    ),
+	    TP_printk("cm_id=%p  caller=%pS  event_type=%s  status=%d",
+		      __entry->cm_id,
+		      __entry->caller,
+		      parse_iw_event_type(__entry->type),
+		      __entry->status
+		    )
+);
+
+DECLARE_EVENT_CLASS(cm_node_template,
+		    TP_PROTO(struct irdma_cm_node *cm_node,
+			     enum irdma_cm_event_type type, void *caller),
+		    TP_ARGS(cm_node, type, caller),
+		    TP_STRUCT__entry(__field(struct irdma_device *, iwdev)
+				     __field(struct irdma_cm_node *, cm_node)
+				     __field(u32, refcount)
+				     __field(u16, lport)
+				     __field(u16, rport)
+				     __field(enum irdma_cm_node_state, state)
+				     __field(bool, ipv4)
+				     __field(u16, vlan_id)
+				     __field(int, accel)
+				     __field(enum irdma_cm_event_type, type)
+				     __field(void *, caller)
+				     __dynamic_array(u32, laddr, 4)
+				     __dynamic_array(u32, raddr, 4)
+			    ),
+		    TP_fast_assign(__entry->iwdev = cm_node->iwdev;
+				   __entry->cm_node = cm_node;
+				   __entry->refcount = atomic_read(&cm_node->ref_count);
+				   __entry->state = cm_node->state;
+				   __entry->lport = cm_node->loc_port;
+				   __entry->rport = cm_node->rem_port;
+				   __entry->ipv4 = cm_node->ipv4;
+				   __entry->vlan_id = cm_node->vlan_id;
+				   __entry->accel = cm_node->accelerated;
+				   __entry->type = type;
+				   __entry->caller = caller;
+				   memcpy(__get_dynamic_array(laddr),
+					  cm_node->loc_addr, 4);
+				   memcpy(__get_dynamic_array(raddr),
+					  cm_node->rem_addr, 4);
+			    ),
+		    TP_printk("iwdev=%p  caller=%pS  node=%p  refcnt=%d  vlan_id=%d  accel=%d  state=%s  event_type=%s  loc: %s  rem: %s",
+			      __entry->iwdev,
+			      __entry->caller,
+			      __entry->cm_node,
+			      __entry->refcount,
+			      __entry->vlan_id,
+			      __entry->accel,
+			      parse_cm_state(__entry->state),
+			      parse_cm_event_type(__entry->type),
+			      __print_ip_addr(__get_dynamic_array(laddr),
+					      __entry->lport, __entry->ipv4),
+			      __print_ip_addr(__get_dynamic_array(raddr),
+					      __entry->rport, __entry->ipv4)
+		    )
+);
+
+DEFINE_EVENT(cm_node_template, irdma_create_event,
+	     TP_PROTO(struct irdma_cm_node *cm_node,
+		      enum irdma_cm_event_type type, void *caller),
+	     TP_ARGS(cm_node, type, caller));
+
+DEFINE_EVENT(cm_node_template, irdma_accept,
+	     TP_PROTO(struct irdma_cm_node *cm_node,
+		      enum irdma_cm_event_type type, void *caller),
+	     TP_ARGS(cm_node, type, caller));
+
+DEFINE_EVENT(cm_node_template, irdma_connect,
+	     TP_PROTO(struct irdma_cm_node *cm_node,
+		      enum irdma_cm_event_type type, void *caller),
+	     TP_ARGS(cm_node, type, caller));
+
+DEFINE_EVENT(cm_node_template, irdma_reject,
+	     TP_PROTO(struct irdma_cm_node *cm_node,
+		      enum irdma_cm_event_type type, void *caller),
+	     TP_ARGS(cm_node, type, caller));
+
+DEFINE_EVENT(cm_node_template, irdma_find_node,
+	     TP_PROTO(struct irdma_cm_node *cm_node,
+		      enum irdma_cm_event_type type, void *caller),
+	     TP_ARGS(cm_node, type, caller));
+
+DEFINE_EVENT(cm_node_template, irdma_send_reset,
+	     TP_PROTO(struct irdma_cm_node *cm_node,
+		      enum irdma_cm_event_type type, void *caller),
+	     TP_ARGS(cm_node, type, caller));
+
+DEFINE_EVENT(cm_node_template, irdma_rem_ref_cm_node,
+	     TP_PROTO(struct irdma_cm_node *cm_node,
+		      enum irdma_cm_event_type type, void *caller),
+	     TP_ARGS(cm_node, type, caller));
+
+DEFINE_EVENT(cm_node_template, irdma_cm_event_handler,
+	     TP_PROTO(struct irdma_cm_node *cm_node,
+		      enum irdma_cm_event_type type, void *caller),
+	     TP_ARGS(cm_node, type, caller));
+
+TRACE_EVENT(open_err_template,
+	    TP_PROTO(struct irdma_cm_node *cm_node, bool reset, void *caller),
+	    TP_ARGS(cm_node, reset, caller),
+	    TP_STRUCT__entry(__field(struct irdma_device *, iwdev)
+			     __field(struct irdma_cm_node *, cm_node)
+			     __field(enum irdma_cm_node_state, state)
+			     __field(bool, reset)
+			     __field(void *, caller)
+		    ),
+	    TP_fast_assign(__entry->iwdev = cm_node->iwdev;
+			   __entry->cm_node = cm_node;
+			   __entry->state = cm_node->state;
+			   __entry->reset = reset;
+			   __entry->caller = caller;
+		    ),
+	    TP_printk("iwdev=%p  caller=%pS  node%p reset=%d  state=%s",
+		      __entry->iwdev,
+		      __entry->caller,
+		      __entry->cm_node,
+		      __entry->reset,
+		      parse_cm_state(__entry->state)
+		    )
+);
+
+DEFINE_EVENT(open_err_template, irdma_active_open_err,
+	     TP_PROTO(struct irdma_cm_node *cm_node, bool reset, void *caller),
+	     TP_ARGS(cm_node, reset, caller));
+
+DEFINE_EVENT(open_err_template, irdma_passive_open_err,
+	     TP_PROTO(struct irdma_cm_node *cm_node, bool reset, void *caller),
+	     TP_ARGS(cm_node, reset, caller));
+
+DECLARE_EVENT_CLASS(cm_node_ah_template,
+		    TP_PROTO(struct irdma_cm_node *cm_node),
+		    TP_ARGS(cm_node),
+		    TP_STRUCT__entry(__field(struct irdma_device *, iwdev)
+				     __field(struct irdma_cm_node *, cm_node)
+				     __field(struct irdma_sc_ah *, ah)
+				     __field(u32, refcount)
+				     __field(u16, lport)
+				     __field(u16, rport)
+				     __field(enum irdma_cm_node_state, state)
+				     __field(bool, ipv4)
+				     __field(u16, vlan_id)
+				     __field(int, accel)
+				     __dynamic_array(u32, laddr, 4)
+				     __dynamic_array(u32, raddr, 4)
+			    ),
+		    TP_fast_assign(__entry->iwdev = cm_node->iwdev;
+				   __entry->cm_node = cm_node;
+				   __entry->ah = cm_node->ah;
+				   __entry->refcount = atomic_read(&cm_node->ref_count);
+				   __entry->lport = cm_node->loc_port;
+				   __entry->rport = cm_node->rem_port;
+				   __entry->state = cm_node->state;
+				   __entry->ipv4 = cm_node->ipv4;
+				   __entry->vlan_id = cm_node->vlan_id;
+				   __entry->accel = cm_node->accelerated;
+				   memcpy(__get_dynamic_array(laddr),
+					  cm_node->loc_addr, 4);
+				   memcpy(__get_dynamic_array(raddr),
+					  cm_node->rem_addr, 4);
+			    ),
+		    TP_printk("iwdev=%p  node=%p  ah=%p  refcnt=%d  vlan_id=%d  accel=%d  state=%s loc: %s  rem: %s",
+			      __entry->iwdev,
+			      __entry->cm_node,
+			      __entry->ah,
+			      __entry->refcount,
+			      __entry->vlan_id,
+			      __entry->accel,
+			      parse_cm_state(__entry->state),
+			      __print_ip_addr(__get_dynamic_array(laddr),
+					      __entry->lport, __entry->ipv4),
+			      __print_ip_addr(__get_dynamic_array(raddr),
+					      __entry->rport, __entry->ipv4)
+		    )
+);
+
+DEFINE_EVENT(cm_node_ah_template, irdma_cm_free_ah,
+	     TP_PROTO(struct irdma_cm_node *cm_node),
+	     TP_ARGS(cm_node));
+
+DEFINE_EVENT(cm_node_ah_template, irdma_create_ah,
+	     TP_PROTO(struct irdma_cm_node *cm_node),
+	     TP_ARGS(cm_node));
+
+#endif  /* __TRACE_CM_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_cm
+#include <trace/define_trace.h>
-- 
1.8.3.1


^ permalink raw reply related

* [RFC 14/19] RDMA/irdma: Add user/kernel shared libraries
From: Shiraz Saleem @ 2019-02-12 21:43 UTC (permalink / raw)
  To: dledford, jgg, davem
  Cc: linux-rdma, netdev, mustafa.ismail, jeffrey.t.kirsher,
	Shiraz Saleem
In-Reply-To: <20190212214402.23284-1-shiraz.saleem@intel.com>

From: Mustafa Ismail <mustafa.ismail@intel.com>

Building the WQE descriptors for different verb
operations are similar in kernel and user-space.
Add these shared libraries.

Signed-off-by: Mustafa Ismail <mustafa.ismail@intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
---
 drivers/infiniband/hw/irdma/uk.c   | 1680 ++++++++++++++++++++++++++++++++++++
 drivers/infiniband/hw/irdma/user.h |  463 ++++++++++
 2 files changed, 2143 insertions(+)
 create mode 100644 drivers/infiniband/hw/irdma/uk.c
 create mode 100644 drivers/infiniband/hw/irdma/user.h

diff --git a/drivers/infiniband/hw/irdma/uk.c b/drivers/infiniband/hw/irdma/uk.c
new file mode 100644
index 0000000..d71d0d8
--- /dev/null
+++ b/drivers/infiniband/hw/irdma/uk.c
@@ -0,0 +1,1680 @@
+// SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB
+/* Copyright (c) 2019, Intel Corporation. */
+
+#include "osdep.h"
+#include "status.h"
+#include "defs.h"
+#include "user.h"
+#include "irdma.h"
+
+/**
+ * irdma_set_fragment - set fragment in wqe
+ * @wqe: wqe for setting fragment
+ * @offset: offset value
+ * @sge: sge length and stag
+ */
+static void irdma_set_fragment(__le64 *wqe,
+			       u32 offset,
+			       struct irdma_sge *sge,
+			       u8 valid)
+{
+	if (sge) {
+		set_64bit_val(wqe, offset,
+			      LS_64(sge->tag_off, IRDMAQPSQ_FRAG_TO));
+		set_64bit_val(wqe, offset + 8,
+			      LS_64(valid, IRDMAQPSQ_VALID) |
+			      LS_64(sge->len, IRDMAQPSQ_FRAG_LEN) |
+			      LS_64(sge->stag, IRDMAQPSQ_FRAG_STAG));
+	} else {
+		set_64bit_val(wqe, offset, 0);
+		set_64bit_val(wqe, offset + 8,
+			      LS_64(valid, IRDMAQPSQ_VALID));
+	}
+}
+
+/**
+ * irdma_set_fragment_gen_1 - set fragment in wqe
+ * @wqe: wqe for setting fragment
+ * @offset: offset value
+ * @sge: sge length and stag
+ */
+static void irdma_set_fragment_gen_1(__le64 *wqe,
+				     u32 offset,
+				     struct irdma_sge *sge,
+				     u8 valid)
+{
+	if (sge) {
+		set_64bit_val(wqe, offset,
+			      LS_64(sge->tag_off, IRDMAQPSQ_FRAG_TO));
+		set_64bit_val(wqe, offset + 8,
+			      LS_64(sge->len, IRDMAQPSQ_GEN1_FRAG_LEN) |
+			      LS_64(sge->stag, IRDMAQPSQ_GEN1_FRAG_STAG));
+	} else {
+		set_64bit_val(wqe, offset, 0);
+		set_64bit_val(wqe, offset + 8, 0);
+	}
+}
+
+/**
+ * irdma_nop_1 - insert a NOP wqe
+ * @qp: hw qp ptr
+ */
+static enum irdma_status_code irdma_nop_1(struct irdma_qp_uk *qp)
+{
+	u64 hdr;
+	__le64 *wqe;
+	u32 wqe_idx;
+	bool signaled = false;
+
+	if (!qp->sq_ring.head)
+		return IRDMA_ERR_PARAM;
+
+	wqe_idx = IRDMA_RING_CURRENT_HEAD(qp->sq_ring);
+	wqe = qp->sq_base[wqe_idx].elem;
+
+	qp->sq_wrtrk_array[wqe_idx].quanta = IRDMA_QP_WQE_MIN_QUANTA;
+
+	set_64bit_val(wqe, 0, 0);
+	set_64bit_val(wqe, 8, 0);
+	set_64bit_val(wqe, 16, 0);
+
+	hdr = LS_64(IRDMAQP_OP_NOP, IRDMAQPSQ_OPCODE) |
+	      LS_64(signaled, IRDMAQPSQ_SIGCOMPL) |
+	      LS_64(qp->swqe_polarity, IRDMAQPSQ_VALID);
+
+	/* make sure WQE is written before valid bit is set */
+	wmb();
+
+	set_64bit_val(wqe, 24, hdr);
+
+	return 0;
+}
+
+/**
+ * irdma_qp_post_wr - ring doorbell
+ * @qp: hw qp ptr
+ */
+void irdma_qp_post_wr(struct irdma_qp_uk *qp)
+{
+	u64 temp;
+	u32 hw_sq_tail;
+	u32 sw_sq_head;
+
+	/* valid bit is written and loads completed before reading shadow */
+	mb();
+
+	/* read the doorbell shadow area */
+	get_64bit_val(qp->shadow_area, 0, &temp);
+
+	hw_sq_tail = (u32)RS_64(temp, IRDMA_QP_DBSA_HW_SQ_TAIL);
+	sw_sq_head = IRDMA_RING_CURRENT_HEAD(qp->sq_ring);
+	if (sw_sq_head != qp->initial_ring.head) {
+		if (qp->push_mode) {
+			writel(qp->qp_id, qp->wqe_alloc_db);
+			qp->push_mode = false;
+		} else if (sw_sq_head != hw_sq_tail) {
+			if (sw_sq_head > qp->initial_ring.head) {
+				if (hw_sq_tail >= qp->initial_ring.head &&
+				    hw_sq_tail < sw_sq_head) {
+					writel(qp->qp_id, qp->wqe_alloc_db);
+				}
+			} else {
+				if (hw_sq_tail >= qp->initial_ring.head ||
+				    hw_sq_tail < sw_sq_head) {
+					writel(qp->qp_id, qp->wqe_alloc_db);
+				}
+			}
+		}
+	}
+
+	qp->initial_ring.head = qp->sq_ring.head;
+}
+
+/**
+ * irdma_qp_ring_push_db -  ring qp doorbell
+ * @qp: hw qp ptr
+ * @wqe_idx: wqe index
+ */
+static void irdma_qp_ring_push_db(struct irdma_qp_uk *qp, u32 wqe_idx)
+{
+	set_32bit_val(qp->push_db,
+		      0,
+		      LS_32(wqe_idx >> 3, IRDMA_WQEALLOC_WQE_DESC_INDEX) | qp->qp_id);
+	qp->initial_ring.head = qp->sq_ring.head;
+	qp->push_mode = true;
+}
+
+void irdma_qp_push_wqe(struct irdma_qp_uk *qp,
+		       __le64 *wqe,
+		       u16 quanta,
+		       u32 wqe_idx,
+		       bool post_sq)
+{
+	__le64 *push;
+
+	if (IRDMA_RING_CURRENT_HEAD(qp->initial_ring) !=
+	    IRDMA_RING_CURRENT_TAIL(qp->sq_ring) &&
+	    !(qp->push_mode)) {
+		if (post_sq)
+			irdma_qp_post_wr(qp);
+	} else {
+		push = (__le64 *)((uintptr_t)qp->push_wqe + (wqe_idx & 0x7) * 0x20);
+		memcpy(push, wqe, quanta * IRDMA_QP_WQE_MIN_SIZE);
+		irdma_qp_ring_push_db(qp, wqe_idx);
+	}
+}
+
+/**
+ * irdma_qp_get_next_send_wqe - pad with NOP if needed, return where next WR should go
+ * @qp: hw qp ptr
+ * @wqe_idx: return wqe index
+ * @quanta: size of WR in quanta
+ * @total_size: size of WR in bytes
+ * @info: info on WR
+ */
+__le64 *irdma_qp_get_next_send_wqe(struct irdma_qp_uk *qp,
+				   u32 *wqe_idx,
+				   u16 quanta,
+				   u32 total_size,
+				   struct irdma_post_sq_info *info)
+{
+	__le64 *wqe;
+	u64 *wqe_0 = NULL;
+	u32 nop_wqe_idx;
+	u16 nop_cnt;
+	u16 i;
+
+	nop_cnt = IRDMA_RING_CURRENT_HEAD(qp->sq_ring) % qp->hw_attrs->max_hw_sq_chunk;
+	if (nop_cnt)
+		nop_cnt = qp->hw_attrs->max_hw_sq_chunk - nop_cnt;
+
+	if (quanta > nop_cnt) {
+		/* Need to pad with NOP */
+		/* Make sure SQ has room for nop_cnt + quanta */
+		if ((u32)(quanta + nop_cnt) > IRDMA_RING_FREE_QUANTA(qp->sq_ring))
+			return NULL;
+
+		/* pad with NOP */
+		nop_wqe_idx = IRDMA_RING_CURRENT_HEAD(qp->sq_ring);
+		for (i = 0; i < nop_cnt; i++) {
+			irdma_nop_1(qp);
+			IRDMA_RING_MOVE_HEAD_NOCHECK(qp->sq_ring);
+		}
+		if (qp->push_db && nop_cnt && info->push_wqe)  {
+			irdma_qp_push_wqe(qp,
+					  qp->sq_base[nop_wqe_idx].elem,
+					  nop_cnt, nop_wqe_idx, true);
+		}
+	} else {
+		/* no need to pad with NOP */
+		if (quanta > IRDMA_RING_FREE_QUANTA(qp->sq_ring))
+			return NULL;
+	}
+
+	*wqe_idx = IRDMA_RING_CURRENT_HEAD(qp->sq_ring);
+	if (!*wqe_idx)
+		qp->swqe_polarity = !qp->swqe_polarity;
+
+	IRDMA_RING_MOVE_HEAD_BY_COUNT_NOCHECK(qp->sq_ring, quanta);
+
+	wqe = qp->sq_base[*wqe_idx].elem;
+	if (qp->hw_attrs->hw_rev == IRDMA_GEN_1 &&
+	    quanta == 1 &&
+	    (IRDMA_RING_CURRENT_HEAD(qp->sq_ring) & 1)) {
+		wqe_0 = qp->sq_base[IRDMA_RING_CURRENT_HEAD(qp->sq_ring)].elem;
+		wqe_0[3] = LS_64(!qp->swqe_polarity, IRDMAQPSQ_VALID);
+	}
+	qp->sq_wrtrk_array[*wqe_idx].wrid = info->wr_id;
+	qp->sq_wrtrk_array[*wqe_idx].wr_len = total_size;
+	qp->sq_wrtrk_array[*wqe_idx].quanta = quanta;
+
+	return wqe;
+}
+
+/**
+ * irdma_qp_get_next_recv_wqe - get next qp's rcv wqe
+ * @qp: hw qp ptr
+ * @wqe_idx: return wqe index
+ */
+__le64 *irdma_qp_get_next_recv_wqe(struct irdma_qp_uk *qp, u32 *wqe_idx)
+{
+	__le64 *wqe;
+	enum irdma_status_code ret_code;
+
+	if (IRDMA_RING_FULL_ERR(qp->rq_ring))
+		return NULL;
+
+	IRDMA_ATOMIC_RING_MOVE_HEAD(qp->rq_ring, *wqe_idx, ret_code);
+	if (ret_code)
+		return NULL;
+
+	if (!*wqe_idx)
+		qp->rwqe_polarity = !qp->rwqe_polarity;
+	/* rq_wqe_size_multiplier is no of 32 byte quanta in in one rq wqe */
+	wqe = qp->rq_base[*wqe_idx * (qp->rq_wqe_size_multiplier)].elem;
+
+	return wqe;
+}
+
+/**
+ * irdma_rdma_write - rdma write operation
+ * @qp: hw qp ptr
+ * @info: post sq information
+ * @post_sq: flag to post sq
+ */
+static enum irdma_status_code irdma_rdma_write(struct irdma_qp_uk *qp,
+					       struct irdma_post_sq_info *info,
+					       bool post_sq)
+{
+	u64 hdr;
+	__le64 *wqe;
+	struct irdma_rdma_write *op_info;
+	u32 i, wqe_idx;
+	u32 total_size = 0, byte_off;
+	enum irdma_status_code ret_code;
+	u32 frag_cnt, addl_frag_cnt;
+	bool read_fence = false;
+	u16 quanta;
+
+	info->push_wqe = qp->push_db ? true : false;
+
+	op_info = &info->op.rdma_write;
+	if (op_info->num_lo_sges > qp->max_sq_frag_cnt)
+		return IRDMA_ERR_INVALID_FRAG_COUNT;
+
+	for (i = 0; i < op_info->num_lo_sges; i++)
+		total_size += op_info->lo_sg_list[i].len;
+
+	if (total_size > IRDMA_MAX_OUTBOUND_MSG_SIZE)
+		return IRDMA_ERR_QP_INVALID_MSG_SIZE;
+
+	read_fence |= info->read_fence;
+
+	if (info->imm_data_valid)
+		frag_cnt = op_info->num_lo_sges + 1;
+	else
+		frag_cnt = op_info->num_lo_sges;
+	addl_frag_cnt = frag_cnt > 1 ? (frag_cnt - 1) : 0;
+	ret_code = irdma_fragcnt_to_quanta_sq(frag_cnt, &quanta);
+	if (ret_code)
+		return ret_code;
+
+	wqe = irdma_qp_get_next_send_wqe(qp, &wqe_idx, quanta,
+					 total_size, info);
+	if (!wqe)
+		return IRDMA_ERR_QP_TOOMANY_WRS_POSTED;
+
+	set_64bit_val(wqe,
+		      16,
+		      LS_64(op_info->rem_addr.tag_off, IRDMAQPSQ_FRAG_TO));
+
+	if (info->imm_data_valid) {
+		set_64bit_val(wqe, 0,
+			      LS_64(info->imm_data, IRDMAQPSQ_IMMDATA));
+		i = 0;
+	} else {
+		qp->wqe_ops.iw_set_fragment(wqe, 0, op_info->lo_sg_list,
+					    qp->swqe_polarity);
+		i = 1;
+	}
+
+	for (byte_off = 32; i < op_info->num_lo_sges; i++) {
+		qp->wqe_ops.iw_set_fragment(wqe, byte_off, &op_info->lo_sg_list[i],
+					    qp->swqe_polarity);
+		byte_off += 16;
+	}
+
+	/* if not an odd number set valid bit in next fragment */
+	if (qp->hw_attrs->hw_rev > IRDMA_GEN_1 && !(frag_cnt & 0x01) &&
+	    frag_cnt) {
+		qp->wqe_ops.iw_set_fragment(wqe, byte_off, NULL,
+					    qp->swqe_polarity);
+		if (qp->hw_attrs->hw_rev == IRDMA_GEN_2)
+			++addl_frag_cnt;
+	}
+
+	if (!op_info->rem_addr.stag && !total_size)
+		op_info->rem_addr.stag = 0x1234;
+	hdr = LS_64(op_info->rem_addr.stag, IRDMAQPSQ_REMSTAG) |
+	      LS_64(info->op_type, IRDMAQPSQ_OPCODE) |
+	      LS_64((info->imm_data_valid ? 1 : 0), IRDMAQPSQ_IMMDATAFLAG) |
+	      LS_64((info->report_rtt ? 1 : 0), IRDMAQPSQ_REPORTRTT) |
+	      LS_64(addl_frag_cnt, IRDMAQPSQ_ADDFRAGCNT) |
+	      LS_64((info->push_wqe ? 1 : 0), IRDMAQPSQ_PUSHWQE) |
+	      LS_64(read_fence, IRDMAQPSQ_READFENCE) |
+	      LS_64(info->local_fence, IRDMAQPSQ_LOCALFENCE) |
+	      LS_64(info->signaled, IRDMAQPSQ_SIGCOMPL) |
+	      LS_64(qp->swqe_polarity, IRDMAQPSQ_VALID);
+
+	wmb(); /* make sure WQE is populated before valid bit is set */
+
+	set_64bit_val(wqe, 24, hdr);
+	if (info->push_wqe) {
+		irdma_qp_push_wqe(qp, wqe, quanta, wqe_idx, post_sq);
+	} else {
+		if (post_sq)
+			irdma_qp_post_wr(qp);
+	}
+
+	return 0;
+}
+
+/**
+ * irdma_rdma_read - rdma read command
+ * @qp: hw qp ptr
+ * @info: post sq information
+ * @inv_stag: flag for inv_stag
+ * @post_sq: flag to post sq
+ */
+static enum irdma_status_code irdma_rdma_read(struct irdma_qp_uk *qp,
+					      struct irdma_post_sq_info *info,
+					      bool inv_stag,
+					      bool post_sq)
+{
+	struct irdma_rdma_read *op_info;
+	enum irdma_status_code ret_code;
+	u32 i, byte_off, total_size = 0;
+	bool local_fence = false;
+	u32 addl_frag_cnt;
+	__le64 *wqe;
+	u32 wqe_idx;
+	u16 quanta;
+	u64 hdr;
+
+	info->push_wqe = qp->push_db ? true : false;
+
+	op_info = &info->op.rdma_read;
+	if (qp->max_sq_frag_cnt < op_info->num_lo_sges)
+		return IRDMA_ERR_INVALID_FRAG_COUNT;
+
+	for (i = 0; i < op_info->num_lo_sges; i++)
+		total_size += op_info->lo_sg_list[i].len;
+
+	ret_code = irdma_fragcnt_to_quanta_sq(op_info->num_lo_sges, &quanta);
+	if (ret_code)
+		return ret_code;
+
+	wqe = irdma_qp_get_next_send_wqe(qp, &wqe_idx, quanta,
+					 total_size, info);
+	if (!wqe)
+		return IRDMA_ERR_QP_TOOMANY_WRS_POSTED;
+
+	addl_frag_cnt = op_info->num_lo_sges > 1 ? (op_info->num_lo_sges - 1) : 0;
+	local_fence |= info->local_fence;
+
+	qp->wqe_ops.iw_set_fragment(wqe, 0, op_info->lo_sg_list,
+				    qp->swqe_polarity);
+	for (i = 1, byte_off = 32; i < op_info->num_lo_sges; ++i) {
+		qp->wqe_ops.iw_set_fragment(wqe, byte_off, &op_info->lo_sg_list[i],
+					    qp->swqe_polarity);
+		byte_off += 16;
+	}
+
+	/* if not an odd number set valid bit in next fragment */
+	if (qp->hw_attrs->hw_rev > IRDMA_GEN_1 &&
+	    !(op_info->num_lo_sges & 0x01) && op_info->num_lo_sges) {
+		qp->wqe_ops.iw_set_fragment(wqe, byte_off, NULL,
+					    qp->swqe_polarity);
+		if (qp->hw_attrs->hw_rev == IRDMA_GEN_2)
+			++addl_frag_cnt;
+	}
+	set_64bit_val(wqe, 16, LS_64(op_info->rem_addr.tag_off, IRDMAQPSQ_FRAG_TO));
+	hdr = LS_64(op_info->rem_addr.stag, IRDMAQPSQ_REMSTAG) |
+	      LS_64((info->report_rtt ? 1 : 0), IRDMAQPSQ_REPORTRTT) |
+	      LS_64(addl_frag_cnt, IRDMAQPSQ_ADDFRAGCNT) |
+	      LS_64((inv_stag ? IRDMAQP_OP_RDMA_READ_LOC_INV : IRDMAQP_OP_RDMA_READ), IRDMAQPSQ_OPCODE) |
+	      LS_64((info->push_wqe ? 1 : 0), IRDMAQPSQ_PUSHWQE) |
+	      LS_64(info->read_fence || qp->force_fence ? 1 : 0, IRDMAQPSQ_READFENCE) |
+	      LS_64(local_fence, IRDMAQPSQ_LOCALFENCE) |
+	      LS_64(info->signaled, IRDMAQPSQ_SIGCOMPL) |
+	      LS_64(qp->swqe_polarity, IRDMAQPSQ_VALID);
+
+	wmb(); /* make sure WQE is populated before valid bit is set */
+
+	set_64bit_val(wqe, 24, hdr);
+	if (info->push_wqe) {
+		irdma_qp_push_wqe(qp, wqe, quanta, wqe_idx, post_sq);
+	} else {
+		if (post_sq)
+			irdma_qp_post_wr(qp);
+	}
+
+	return 0;
+}
+
+/**
+ * irdma_send - rdma send command
+ * @qp: hw qp ptr
+ * @info: post sq information
+ * @post_sq: flag to post sq
+ */
+static enum irdma_status_code irdma_send(struct irdma_qp_uk *qp,
+					 struct irdma_post_sq_info *info,
+					 bool post_sq)
+{
+	__le64 *wqe;
+	struct irdma_post_send *op_info;
+	u64 hdr;
+	u32 i, wqe_idx, total_size = 0, byte_off;
+	enum irdma_status_code ret_code;
+	u32 frag_cnt, addl_frag_cnt;
+	bool read_fence = false;
+	u16 quanta;
+
+	info->push_wqe = qp->push_db ? true : false;
+
+	op_info = &info->op.send;
+	if (qp->max_sq_frag_cnt < op_info->num_sges)
+		return IRDMA_ERR_INVALID_FRAG_COUNT;
+
+	for (i = 0; i < op_info->num_sges; i++)
+		total_size += op_info->sg_list[i].len;
+
+	if (info->imm_data_valid)
+		frag_cnt = op_info->num_sges + 1;
+	else
+		frag_cnt = op_info->num_sges;
+	ret_code = irdma_fragcnt_to_quanta_sq(frag_cnt, &quanta);
+	if (ret_code)
+		return ret_code;
+
+	wqe = irdma_qp_get_next_send_wqe(qp, &wqe_idx, quanta,
+					 total_size, info);
+	if (!wqe)
+		return IRDMA_ERR_QP_TOOMANY_WRS_POSTED;
+
+	read_fence |= info->read_fence;
+	addl_frag_cnt = frag_cnt > 1 ? (frag_cnt - 1) : 0;
+	if (info->imm_data_valid) {
+		set_64bit_val(wqe, 0,
+			      LS_64(info->imm_data, IRDMAQPSQ_IMMDATA));
+		i = 0;
+	} else {
+		qp->wqe_ops.iw_set_fragment(wqe, 0, op_info->sg_list,
+					    qp->swqe_polarity);
+		i = 1;
+	}
+
+	for (byte_off = 32; i < op_info->num_sges; i++) {
+		qp->wqe_ops.iw_set_fragment(wqe, byte_off, &op_info->sg_list[i],
+					    qp->swqe_polarity);
+		byte_off += 16;
+	}
+
+	/* if not an odd number set valid bit in next fragment */
+	if (qp->hw_attrs->hw_rev > IRDMA_GEN_1 && !(frag_cnt & 0x01) &&
+	    frag_cnt) {
+		qp->wqe_ops.iw_set_fragment(wqe, byte_off, NULL,
+					    qp->swqe_polarity);
+		if (qp->hw_attrs->hw_rev == IRDMA_GEN_2)
+			++addl_frag_cnt;
+	}
+
+	set_64bit_val(wqe,
+		      16,
+		      LS_64(op_info->qkey, IRDMAQPSQ_DESTQKEY) |
+		      LS_64(op_info->dest_qp, IRDMAQPSQ_DESTQPN));
+	hdr = LS_64(info->stag_to_inv, IRDMAQPSQ_REMSTAG) |
+	      LS_64(op_info->ah_id, IRDMAQPSQ_AHID) |
+	      LS_64((info->imm_data_valid ? 1 : 0), IRDMAQPSQ_IMMDATAFLAG) |
+	      LS_64((info->report_rtt ? 1 : 0), IRDMAQPSQ_REPORTRTT) |
+	      LS_64(info->op_type, IRDMAQPSQ_OPCODE) |
+	      LS_64(addl_frag_cnt, IRDMAQPSQ_ADDFRAGCNT) |
+	      LS_64((info->push_wqe ? 1 : 0), IRDMAQPSQ_PUSHWQE) |
+	      LS_64(read_fence, IRDMAQPSQ_READFENCE) |
+	      LS_64(info->local_fence, IRDMAQPSQ_LOCALFENCE) |
+	      LS_64(info->signaled, IRDMAQPSQ_SIGCOMPL) |
+	      LS_64(info->udp_hdr, IRDMAQPSQ_UDPHEADER) |
+	      LS_64(info->l4len, IRDMAQPSQ_L4LEN) |
+	      LS_64(qp->swqe_polarity, IRDMAQPSQ_VALID);
+
+	wmb(); /* make sure WQE is populated before valid bit is set */
+
+	set_64bit_val(wqe, 24, hdr);
+	if (info->push_wqe) {
+		irdma_qp_push_wqe(qp, wqe, quanta, wqe_idx, post_sq);
+	} else {
+		if (post_sq)
+			irdma_qp_post_wr(qp);
+	}
+
+	return 0;
+}
+
+/**
+ * irdma_set_mw_bind_wqe_gen_1 - set mw bind wqe
+ * @wqe: wqe for setting fragment
+ * @op_info: info for setting bind wqe values
+ */
+static void irdma_set_mw_bind_wqe_gen_1(__le64 *wqe,
+					struct irdma_bind_window *op_info)
+{
+	set_64bit_val(wqe, 0, (uintptr_t)op_info->va);
+	set_64bit_val(wqe, 8,
+		      LS_64(op_info->mw_stag, IRDMAQPSQ_PARENTMRSTAG) |
+		      LS_64(op_info->mr_stag, IRDMAQPSQ_MWSTAG));
+	set_64bit_val(wqe, 16, op_info->bind_len);
+}
+
+/**
+ * irdma_copy_inline_data_gen_1 - Copy inline data to wqe
+ * @dest: pointer to wqe
+ * @src: pointer to inline data
+ * @len: length of inline data to copy
+ * @polarity: compatibility parameter
+ */
+static void irdma_copy_inline_data_gen_1(u8 *dest, u8 *src, u32 len, u8 polarity)
+{
+	if (len <= 16) {
+		memcpy(dest, src, len);
+	} else {
+		memcpy(dest, src, 16);
+		src += 16;
+		dest = dest + 32;
+		memcpy(dest, src, len - 16);
+	}
+}
+
+/**
+ * irdma_inline_data_size_to_quanta_gen_1 - based on inline data, quanta
+ * @data_size: data size for inline
+ * @quanta: size of sq wqe returned
+ * @max_size: maximum allowed inline size
+ *
+ * Gets the quanta based on inline and immediate data.
+ */
+static enum irdma_status_code irdma_inline_data_size_to_quanta_gen_1(u32 data_size,
+								     u16 *quanta,
+								     u32 max_size)
+{
+	if (data_size > max_size)
+		return IRDMA_ERR_INVALID_INLINE_DATA_SIZE;
+
+	if (data_size <= 16)
+		*quanta = IRDMA_QP_WQE_MIN_QUANTA;
+	else
+		*quanta = 2;
+
+	return 0;
+}
+
+/**
+ * irdma_set_mw_bind_wqe - set mw bind in wqe
+ * @wqe: wqe for setting mw bind
+ * @op_info: info for setting wqe values
+ */
+static void irdma_set_mw_bind_wqe(__le64 *wqe,
+				  struct irdma_bind_window *op_info)
+{
+	set_64bit_val(wqe, 0, (uintptr_t)op_info->va);
+	set_64bit_val(wqe, 8,
+		      LS_64(op_info->mr_stag, IRDMAQPSQ_PARENTMRSTAG) |
+		      LS_64(op_info->mw_stag, IRDMAQPSQ_MWSTAG));
+	set_64bit_val(wqe, 16, op_info->bind_len);
+}
+
+/**
+ * irdma_copy_inline_data - Copy inline data to wqe
+ * @dest: pointer to wqe
+ * @src: pointer to inline data
+ * @len: length of inline data to copy
+ * @polarity: polarity of wqe valid bit
+ */
+static void irdma_copy_inline_data(u8 *dest, u8 *src, u32 len, u8 polarity)
+{
+	u8 inline_valid = polarity << IRDMA_INLINE_VALID_S;
+	u32 copy_size;
+
+	dest += 8;
+	if (len <= 8) {
+		memcpy(dest, src, len);
+		return;
+	}
+
+	*((u64 *)dest) = *((u64 *)src);
+	len -= 8;
+	src += 8;
+	dest += 24;	/* point to additional 32 byte quanta */
+
+	while (len) {
+		copy_size = len < 31 ? len : 31;
+		memcpy(dest, src, copy_size);
+		*(dest + 31) = inline_valid;
+		len -= copy_size;
+		dest += 32;
+		src += copy_size;
+	}
+}
+
+/**
+ * irdma_inline_data_size_to_quanta - based on inline data, quanta
+ * @data_size: data size for inline
+ * @quanta: size of sq wqe returned
+ * @max_size: maximum allowed inline size
+ *
+ * Gets the quanta based on inline and immediate data.
+ */
+static enum irdma_status_code irdma_inline_data_size_to_quanta(u32 data_size,
+							       u16 *quanta,
+							       u32 max_size)
+{
+	if (data_size > max_size)
+		return IRDMA_ERR_INVALID_INLINE_DATA_SIZE;
+
+	if (data_size <= 8)
+		*quanta = IRDMA_QP_WQE_MIN_QUANTA;
+	else if (data_size <= 39)
+		*quanta = 2;
+	else if (data_size <= 70)
+		*quanta = 3;
+	else if (data_size <= 101)
+		*quanta = 4;
+	else if (data_size <= 132)
+		*quanta = 5;
+	else if (data_size <= 163)
+		*quanta = 6;
+	else if (data_size <= 194)
+		*quanta = 7;
+	else
+		*quanta = 8;
+
+	return 0;
+}
+
+/**
+ * irdma_inline_rdma_write - inline rdma write operation
+ * @qp: hw qp ptr
+ * @info: post sq information
+ * @post_sq: flag to post sq
+ */
+static enum irdma_status_code
+irdma_inline_rdma_write(struct irdma_qp_uk *qp,
+			struct irdma_post_sq_info *info,
+			bool post_sq)
+{
+	__le64 *wqe;
+	struct irdma_inline_rdma_write *op_info;
+	u64 hdr = 0;
+	u32 wqe_idx;
+	enum irdma_status_code ret_code;
+	bool read_fence = false;
+	u16 quanta;
+
+	info->push_wqe = qp->push_db ? true : false;
+	op_info = &info->op.inline_rdma_write;
+	ret_code = qp->wqe_ops.iw_inline_data_size_to_quanta(op_info->len, &quanta,
+							 qp->hw_attrs->max_hw_inline);
+	if (ret_code)
+		return ret_code;
+
+	wqe = irdma_qp_get_next_send_wqe(qp, &wqe_idx, quanta,
+					 op_info->len, info);
+	if (!wqe)
+		return IRDMA_ERR_QP_TOOMANY_WRS_POSTED;
+
+	read_fence |= info->read_fence;
+	set_64bit_val(wqe, 16,
+		      LS_64(op_info->rem_addr.tag_off, IRDMAQPSQ_FRAG_TO));
+
+	hdr = LS_64(op_info->rem_addr.stag, IRDMAQPSQ_REMSTAG) |
+	      LS_64(info->op_type, IRDMAQPSQ_OPCODE) |
+	      LS_64(op_info->len, IRDMAQPSQ_INLINEDATALEN) |
+	      LS_64((info->report_rtt ? 1 : 0), IRDMAQPSQ_REPORTRTT) |
+	      LS_64(1, IRDMAQPSQ_INLINEDATAFLAG) |
+	      LS_64((info->imm_data_valid ? 1 : 0), IRDMAQPSQ_IMMDATAFLAG) |
+	      LS_64((info->push_wqe ? 1 : 0), IRDMAQPSQ_PUSHWQE) |
+	      LS_64(read_fence, IRDMAQPSQ_READFENCE) |
+	      LS_64(info->local_fence, IRDMAQPSQ_LOCALFENCE) |
+	      LS_64(info->signaled, IRDMAQPSQ_SIGCOMPL) |
+	      LS_64(qp->swqe_polarity, IRDMAQPSQ_VALID);
+
+	if (info->imm_data_valid)
+		set_64bit_val(wqe, 0,
+			      LS_64(info->imm_data, IRDMAQPSQ_IMMDATA));
+
+	qp->wqe_ops.iw_copy_inline_data((u8 *)wqe, op_info->data, op_info->len,
+				    qp->swqe_polarity);
+	wmb(); /* make sure WQE is populated before valid bit is set */
+
+	set_64bit_val(wqe, 24, hdr);
+
+	if (info->push_wqe) {
+		irdma_qp_push_wqe(qp, wqe, quanta, wqe_idx, post_sq);
+	} else {
+		if (post_sq)
+			irdma_qp_post_wr(qp);
+	}
+
+	return 0;
+}
+
+/**
+ * irdma_inline_send - inline send operation
+ * @qp: hw qp ptr
+ * @info: post sq information
+ * @post_sq: flag to post sq
+ */
+static enum irdma_status_code irdma_inline_send(struct irdma_qp_uk *qp,
+						struct irdma_post_sq_info *info,
+						bool post_sq)
+{
+	__le64 *wqe;
+	struct irdma_post_inline_send *op_info;
+	u64 hdr;
+	u32 wqe_idx;
+	enum irdma_status_code ret_code;
+	bool read_fence = false;
+	u16 quanta;
+
+	info->push_wqe = qp->push_db ? true : false;
+	op_info = &info->op.inline_send;
+
+	ret_code = qp->wqe_ops.iw_inline_data_size_to_quanta(op_info->len, &quanta,
+							 qp->hw_attrs->max_hw_inline);
+	if (ret_code)
+		return ret_code;
+
+	wqe = irdma_qp_get_next_send_wqe(qp, &wqe_idx, quanta,
+					 op_info->len, info);
+	if (!wqe)
+		return IRDMA_ERR_QP_TOOMANY_WRS_POSTED;
+
+	set_64bit_val(wqe, 16,
+		      LS_64(op_info->qkey, IRDMAQPSQ_DESTQKEY) |
+		      LS_64(op_info->dest_qp, IRDMAQPSQ_DESTQPN));
+
+	read_fence |= info->read_fence;
+	hdr = LS_64(info->stag_to_inv, IRDMAQPSQ_REMSTAG) |
+	      LS_64(op_info->ah_id, IRDMAQPSQ_AHID) |
+	      LS_64(info->op_type, IRDMAQPSQ_OPCODE) |
+	      LS_64(op_info->len, IRDMAQPSQ_INLINEDATALEN) |
+	      LS_64((info->imm_data_valid ? 1 : 0), IRDMAQPSQ_IMMDATAFLAG) |
+	      LS_64((info->report_rtt ? 1 : 0), IRDMAQPSQ_REPORTRTT) |
+	      LS_64(1, IRDMAQPSQ_INLINEDATAFLAG) |
+	      LS_64((info->push_wqe ? 1 : 0), IRDMAQPSQ_PUSHWQE) |
+	      LS_64(read_fence, IRDMAQPSQ_READFENCE) |
+	      LS_64(info->local_fence, IRDMAQPSQ_LOCALFENCE) |
+	      LS_64(info->signaled, IRDMAQPSQ_SIGCOMPL) |
+	      LS_64(info->udp_hdr, IRDMAQPSQ_UDPHEADER) |
+	      LS_64(info->l4len, IRDMAQPSQ_L4LEN) |
+	      LS_64(qp->swqe_polarity, IRDMAQPSQ_VALID);
+
+	if (info->imm_data_valid)
+		set_64bit_val(wqe, 0,
+			      LS_64(info->imm_data, IRDMAQPSQ_IMMDATA));
+	qp->wqe_ops.iw_copy_inline_data((u8 *)wqe, op_info->data, op_info->len, qp->swqe_polarity);
+
+	wmb(); /* make sure WQE is populated before valid bit is set */
+
+	set_64bit_val(wqe, 24, hdr);
+
+	if (info->push_wqe) {
+		irdma_qp_push_wqe(qp, wqe, quanta, wqe_idx, post_sq);
+	} else {
+		if (post_sq)
+			irdma_qp_post_wr(qp);
+	}
+
+	return 0;
+}
+
+/**
+ * irdma_stag_local_invalidate - stag invalidate operation
+ * @qp: hw qp ptr
+ * @info: post sq information
+ * @post_sq: flag to post sq
+ */
+static enum irdma_status_code
+irdma_stag_local_invalidate(struct irdma_qp_uk *qp,
+			    struct irdma_post_sq_info *info,
+			    bool post_sq)
+{
+	__le64 *wqe;
+	struct irdma_inv_local_stag *op_info;
+	u64 hdr;
+	u32 wqe_idx;
+	bool local_fence = false;
+	struct irdma_sge sge = {};
+
+	info->push_wqe = qp->push_db ? true : false;
+	op_info = &info->op.inv_local_stag;
+	local_fence = info->local_fence;
+
+	wqe = irdma_qp_get_next_send_wqe(qp, &wqe_idx, IRDMA_QP_WQE_MIN_QUANTA,
+					 0, info);
+	if (!wqe)
+		return IRDMA_ERR_QP_TOOMANY_WRS_POSTED;
+
+	sge.stag = op_info->target_stag;
+	qp->wqe_ops.iw_set_fragment(wqe, 0, &sge, 0);
+
+	set_64bit_val(wqe, 16, 0);
+
+	hdr = LS_64(IRDMA_OP_TYPE_INV_STAG, IRDMAQPSQ_OPCODE) |
+	      LS_64((info->push_wqe ? 1 : 0), IRDMAQPSQ_PUSHWQE) |
+	      LS_64(info->read_fence, IRDMAQPSQ_READFENCE) |
+	      LS_64(local_fence, IRDMAQPSQ_LOCALFENCE) |
+	      LS_64(info->signaled, IRDMAQPSQ_SIGCOMPL) |
+	      LS_64(qp->swqe_polarity, IRDMAQPSQ_VALID);
+
+	wmb(); /* make sure WQE is populated before valid bit is set */
+
+	set_64bit_val(wqe, 24, hdr);
+
+	if (info->push_wqe) {
+		irdma_qp_push_wqe(qp, wqe, IRDMA_QP_WQE_MIN_QUANTA, wqe_idx,
+				  post_sq);
+	} else {
+		if (post_sq)
+			irdma_qp_post_wr(qp);
+	}
+
+	return 0;
+}
+
+/**
+ * irdma_mw_bind - bind Memory Window
+ * @qp: hw qp ptr
+ * @info: post sq information
+ * @post_sq: flag to post sq
+ */
+static enum irdma_status_code irdma_mw_bind(struct irdma_qp_uk *qp,
+					    struct irdma_post_sq_info *info,
+					    bool post_sq)
+{
+	__le64 *wqe;
+	struct irdma_bind_window *op_info;
+	u64 hdr;
+	u32 wqe_idx;
+	bool local_fence = false;
+
+	info->push_wqe = qp->push_db ? true : false;
+	op_info = &info->op.bind_window;
+	local_fence |= info->local_fence;
+
+	wqe = irdma_qp_get_next_send_wqe(qp, &wqe_idx,
+					 IRDMA_QP_WQE_MIN_QUANTA, 0, info);
+	if (!wqe)
+		return IRDMA_ERR_QP_TOOMANY_WRS_POSTED;
+
+	qp->wqe_ops.iw_set_mw_bind_wqe(wqe, op_info);
+
+	hdr = LS_64(IRDMA_OP_TYPE_BIND_MW, IRDMAQPSQ_OPCODE) |
+	      LS_64(((op_info->ena_reads << 2) | (op_info->ena_writes << 3)),
+		    IRDMAQPSQ_STAGRIGHTS) |
+	      LS_64((op_info->addressing_type == IRDMA_ADDR_TYPE_VA_BASED ?  1 : 0),
+		    IRDMAQPSQ_VABASEDTO) |
+	      LS_64((op_info->mem_window_type_1 ?  1 : 0), IRDMAQPSQ_MEMWINDOWTYPE) |
+	      LS_64((info->push_wqe ? 1 : 0), IRDMAQPSQ_PUSHWQE) |
+	      LS_64(info->read_fence, IRDMAQPSQ_READFENCE) |
+	      LS_64(local_fence, IRDMAQPSQ_LOCALFENCE) |
+	      LS_64(info->signaled, IRDMAQPSQ_SIGCOMPL) |
+	      LS_64(qp->swqe_polarity, IRDMAQPSQ_VALID);
+
+	wmb(); /* make sure WQE is populated before valid bit is set */
+
+	set_64bit_val(wqe, 24, hdr);
+
+	if (info->push_wqe) {
+		irdma_qp_push_wqe(qp, wqe, IRDMA_QP_WQE_MIN_QUANTA, wqe_idx, post_sq);
+	} else {
+		if (post_sq)
+			irdma_qp_post_wr(qp);
+	}
+
+	return 0;
+}
+
+/**
+ * irdma_post_receive - post receive wqe
+ * @qp: hw qp ptr
+ * @info: post rq information
+ */
+static enum irdma_status_code
+irdma_post_receive(struct irdma_qp_uk *qp, struct irdma_post_rq_info *info)
+{
+	u32 total_size = 0, wqe_idx, i, byte_off;
+	u32 addl_frag_cnt;
+	__le64 *wqe;
+	u64 hdr;
+
+	if (qp->max_rq_frag_cnt < info->num_sges)
+		return IRDMA_ERR_INVALID_FRAG_COUNT;
+
+	for (i = 0; i < info->num_sges; i++)
+		total_size += info->sg_list[i].len;
+
+	wqe = irdma_qp_get_next_recv_wqe(qp, &wqe_idx);
+	if (!wqe)
+		return IRDMA_ERR_QP_TOOMANY_WRS_POSTED;
+
+	qp->rq_wrid_array[wqe_idx] = info->wr_id;
+	addl_frag_cnt = info->num_sges > 1 ? (info->num_sges - 1) : 0;
+	qp->wqe_ops.iw_set_fragment(wqe, 0, info->sg_list,
+				    qp->rwqe_polarity);
+
+	for (i = 1, byte_off = 32; i < info->num_sges; i++) {
+		qp->wqe_ops.iw_set_fragment(wqe, byte_off, &info->sg_list[i],
+					    qp->rwqe_polarity);
+		byte_off += 16;
+	}
+
+	/* if not an odd number set valid bit in next fragment */
+	if (qp->hw_attrs->hw_rev > IRDMA_GEN_1 && !(info->num_sges & 0x01) &&
+	    info->num_sges) {
+		qp->wqe_ops.iw_set_fragment(wqe, byte_off, NULL, qp->rwqe_polarity);
+		if (qp->hw_attrs->hw_rev == IRDMA_GEN_2)
+			++addl_frag_cnt;
+	}
+
+	set_64bit_val(wqe, 16, 0);
+	hdr = LS_64(addl_frag_cnt, IRDMAQPSQ_ADDFRAGCNT) |
+	      LS_64(qp->rwqe_polarity, IRDMAQPSQ_VALID);
+
+	wmb(); /* make sure WQE is populated before valid bit is set */
+
+	set_64bit_val(wqe, 24, hdr);
+
+	return 0;
+}
+
+/**
+ * irdma_cq_request_notification - cq notification request (door bell)
+ * @cq: hw cq
+ * @cq_notify: notification type
+ */
+static void irdma_cq_request_notification(struct irdma_cq_uk *cq,
+					  enum irdma_cmpl_notify cq_notify)
+{
+	u64 temp_val;
+	u16 sw_cq_sel;
+	u8 arm_next_se = 0;
+	u8 arm_next = 0;
+	u8 arm_seq_num;
+
+	get_64bit_val(cq->shadow_area, 32, &temp_val);
+	arm_seq_num = (u8)RS_64(temp_val, IRDMA_CQ_DBSA_ARM_SEQ_NUM);
+	arm_seq_num++;
+	sw_cq_sel = (u16)RS_64(temp_val, IRDMA_CQ_DBSA_SW_CQ_SELECT);
+	arm_next_se = (u8)RS_64(temp_val, IRDMA_CQ_DBSA_ARM_NEXT_SE);
+	arm_next_se |= 1;
+	if (cq_notify == IRDMA_CQ_COMPL_EVENT)
+		arm_next = 1;
+	temp_val = LS_64(arm_seq_num, IRDMA_CQ_DBSA_ARM_SEQ_NUM) |
+		   LS_64(sw_cq_sel, IRDMA_CQ_DBSA_SW_CQ_SELECT) |
+		   LS_64(arm_next_se, IRDMA_CQ_DBSA_ARM_NEXT_SE) |
+		   LS_64(arm_next, IRDMA_CQ_DBSA_ARM_NEXT);
+
+	set_64bit_val(cq->shadow_area, 32, temp_val);
+
+	wmb(); /* make sure WQE is populated before valid bit is set */
+
+	writel(cq->cq_id, cq->cqe_alloc_db);
+}
+
+/**
+ * irdma_cq_post_entries - update tail in shadow memory
+ * @cq: hw cq
+ * @count: # of entries processed
+ */
+static enum irdma_status_code irdma_cq_post_entries(struct irdma_cq_uk *cq,
+						    u8 count)
+{
+	IRDMA_RING_MOVE_TAIL_BY_COUNT(cq->cq_ring, count);
+	set_64bit_val(cq->shadow_area, 0,
+		      IRDMA_RING_CURRENT_HEAD(cq->cq_ring));
+
+	return 0;
+}
+
+/**
+ * irdma_cq_poll_cmpl - get cq completion info
+ * @cq: hw cq
+ * @info: cq poll information returned
+ * @post_cq: update cq tail
+ */
+static enum irdma_status_code
+irdma_cq_poll_cmpl(struct irdma_cq_uk *cq, struct irdma_cq_poll_info *info)
+{
+	u64 comp_ctx, qword0, qword2, qword3, qword4, qword6, qword7, wqe_qword;
+	__le64 *cqe, *sw_wqe;
+	struct irdma_qp_uk *qp;
+	struct irdma_ring *pring = NULL;
+	u32 wqe_idx, q_type, array_idx = 0;
+	enum irdma_status_code ret_code = 0;
+	bool move_cq_head = true;
+	u8 polarity;
+	bool ext_valid;
+	__le64 *ext_cqe;
+	u32 peek_head;
+
+	if (cq->avoid_mem_cflct)
+		cqe = IRDMA_GET_CURRENT_EXTENDED_CQ_ELEM(cq);
+	else
+		cqe = IRDMA_GET_CURRENT_CQ_ELEM(cq);
+
+	get_64bit_val(cqe, 24, &qword3);
+	polarity = (u8)RS_64(qword3, IRDMA_CQ_VALID);
+	if (polarity != cq->polarity)
+		return IRDMA_ERR_Q_EMPTY;
+
+	/* Ensure CQE contents are read after valid bit is checked */
+	rmb();
+
+	ext_valid = (bool)RS_64(qword3, IRDMA_CQ_EXTCQE);
+	if (ext_valid) {
+		if (cq->avoid_mem_cflct) {
+			ext_cqe = (__le64 *)((u8 *)cqe + 32);
+			get_64bit_val(ext_cqe, 24, &qword7);
+			polarity = (u8)RS_64(qword7, IRDMA_CQ_VALID);
+		} else {
+			peek_head = (cq->cq_ring.head + 1) % cq->cq_ring.size;
+			ext_cqe = cq->cq_base[peek_head].buf;
+			get_64bit_val(ext_cqe, 24, &qword7);
+			polarity = (u8)RS_64(qword7, IRDMA_CQ_VALID);
+			if (!peek_head)
+				polarity ^= 1;
+		}
+		if (polarity != cq->polarity)
+			return IRDMA_ERR_Q_EMPTY;
+
+		/* Ensure ext CQE contents are read after ext valid bit is checked */
+		rmb();
+
+		info->imm_valid = (bool)RS_64(qword7, IRDMA_CQ_IMMVALID);
+		if (info->imm_valid) {
+			get_64bit_val(ext_cqe, 0, &qword4);
+			info->imm_data = (u32)RS_64(qword4, IRDMA_CQ_IMMDATALOW32);
+		}
+		info->ud_smac_valid = (bool)RS_64(qword7, IRDMA_CQ_UDSMACVALID);
+		info->ud_vlan_valid = (bool)RS_64(qword7, IRDMA_CQ_UDVLANVALID);
+		if (info->ud_smac_valid || info->ud_vlan_valid) {
+			get_64bit_val(ext_cqe, 16, &qword6);
+			if (info->ud_vlan_valid)
+				info->ud_vlan = (u16)RS_64(qword6, IRDMA_CQ_UDVLAN);
+			if (info->ud_smac_valid) {
+				info->ud_smac[5] = qword6 & 0xFF;
+				info->ud_smac[4] = (qword6 >> 8) & 0xFF;
+				info->ud_smac[3] = (qword6 >> 16) & 0xFF;
+				info->ud_smac[2] = (qword6 >> 24) & 0xFF;
+				info->ud_smac[1] = (qword6 >> 32) & 0xFF;
+				info->ud_smac[0] = (qword6 >> 40) & 0xFF;
+			}
+		}
+	} else {
+		info->imm_valid = false;
+		info->ud_smac_valid = false;
+		info->ud_vlan_valid = false;
+	}
+
+	q_type = (u8)RS_64(qword3, IRDMA_CQ_SQ);
+	info->error = (bool)RS_64(qword3, IRDMA_CQ_ERROR);
+	info->push_dropped = (bool)RS_64(qword3, IRDMACQ_PSHDROP);
+	info->ipv4 = (bool)RS_64(qword3, IRDMACQ_IPV4);
+	if (info->error) {
+		info->major_err = RS_64(qword3, IRDMA_CQ_MAJERR);
+		info->minor_err = RS_64(qword3, IRDMA_CQ_MINERR);
+		if (info->major_err == IRDMA_FLUSH_MAJOR_ERR)
+			info->comp_status = IRDMA_COMPL_STATUS_FLUSHED;
+		else if (info->major_err == IRDMA_LEN_MAJOR_ERR)
+			info->comp_status = IRDMA_COMPL_STATUS_INVALID_LEN;
+		else
+			info->comp_status = IRDMA_COMPL_STATUS_UNKNOWN;
+	} else {
+		info->comp_status = IRDMA_COMPL_STATUS_SUCCESS;
+	}
+
+	get_64bit_val(cqe, 0, &qword0);
+	get_64bit_val(cqe, 16, &qword2);
+
+	info->tcp_seq_num_rtt = (u32)RS_64(qword0, IRDMACQ_TCPSEQNUMRTT);
+	info->qp_id = (u32)RS_64(qword2, IRDMACQ_QPID);
+	info->ud_src_qpn = (u32)RS_64(qword2, IRDMACQ_UDSRCQPN);
+
+	get_64bit_val(cqe, 8, &comp_ctx);
+
+	info->solicited_event = (bool)RS_64(qword3, IRDMACQ_SOEVENT);
+
+	qp = (struct irdma_qp_uk *)(unsigned long)comp_ctx;
+	if (!qp) {
+		ret_code = IRDMA_ERR_Q_DESTROYED;
+		goto exit;
+	}
+	wqe_idx = (u32)RS_64(qword3, IRDMA_CQ_WQEIDX);
+	info->qp_handle = (irdma_qp_handle)(unsigned long)qp;
+
+	if (q_type == IRDMA_CQE_QTYPE_RQ) {
+		array_idx = wqe_idx / qp->rq_wqe_size_multiplier;
+		if (info->comp_status == IRDMA_COMPL_STATUS_FLUSHED ||
+		    info->comp_status == IRDMA_COMPL_STATUS_INVALID_LEN) {
+			if (!IRDMA_RING_MORE_WORK(qp->rq_ring)) {
+				ret_code = IRDMA_ERR_Q_EMPTY;
+				goto exit;
+			}
+
+			info->wr_id = qp->rq_wrid_array[qp->rq_ring.tail];
+			array_idx = qp->rq_ring.tail;
+		} else {
+			info->wr_id = qp->rq_wrid_array[array_idx];
+		}
+
+		if (info->imm_valid)
+			info->op_type = IRDMA_OP_TYPE_REC_IMM;
+		else
+			info->op_type = IRDMA_OP_TYPE_REC;
+		if (qword3 & IRDMACQ_STAG_M) {
+			info->stag_invalid_set = true;
+			info->inv_stag = (u32)RS_64(qword2, IRDMACQ_INVSTAG);
+		} else {
+			info->stag_invalid_set = false;
+		}
+		info->bytes_xfered = (u32)RS_64(qword0, IRDMACQ_PAYLDLEN);
+		IRDMA_RING_SET_TAIL(qp->rq_ring, array_idx + 1);
+		if (!IRDMA_RING_MORE_WORK(qp->rq_ring) &&
+		    info->comp_status == IRDMA_COMPL_STATUS_FLUSHED)
+			qp->rq_flush_complete = true;
+
+		pring = &qp->rq_ring;
+	} else {	/* q_type is IRDMA_CQE_QTYPE_SQ */
+		if (qp->first_sq_wq) {
+			qp->first_sq_wq = false;
+			if (!wqe_idx && qp->sq_ring.head == qp->sq_ring.tail) {
+				IRDMA_RING_MOVE_HEAD_NOCHECK(cq->cq_ring);
+				IRDMA_RING_MOVE_TAIL(cq->cq_ring);
+				set_64bit_val(cq->shadow_area, 0,
+					      IRDMA_RING_CURRENT_HEAD(cq->cq_ring));
+				memset(info, 0, sizeof(struct irdma_cq_poll_info));
+				return irdma_cq_poll_cmpl(cq, info);
+			}
+		}
+		/*cease posting push mode on push drop*/
+		if (info->push_dropped)
+			qp->push_mode = false;
+
+		if (info->comp_status != IRDMA_COMPL_STATUS_FLUSHED) {
+			info->wr_id = qp->sq_wrtrk_array[wqe_idx].wrid;
+			if (!info->comp_status)
+				info->bytes_xfered = qp->sq_wrtrk_array[wqe_idx].wr_len;
+			info->op_type = (u8)RS_64(qword3, IRDMACQ_OP);
+			sw_wqe = qp->sq_base[wqe_idx].elem;
+			get_64bit_val(sw_wqe, 24, &wqe_qword);
+			IRDMA_RING_SET_TAIL(qp->sq_ring,
+					    wqe_idx + qp->sq_wrtrk_array[wqe_idx].quanta);
+		} else {
+			if (!IRDMA_RING_MORE_WORK(qp->sq_ring)) {
+				ret_code = IRDMA_ERR_Q_EMPTY;
+				goto exit;
+			}
+
+			do {
+				u8 op_type;
+				u32 tail;
+
+				tail = qp->sq_ring.tail;
+				sw_wqe = qp->sq_base[tail].elem;
+				get_64bit_val(sw_wqe, 24, &wqe_qword);
+				op_type = (u8)RS_64(wqe_qword, IRDMAQPSQ_OPCODE);
+				info->op_type = op_type;
+				IRDMA_RING_SET_TAIL(qp->sq_ring,
+						    tail + qp->sq_wrtrk_array[tail].quanta);
+				if (op_type != IRDMAQP_OP_NOP) {
+					info->wr_id = qp->sq_wrtrk_array[tail].wrid;
+					info->bytes_xfered = qp->sq_wrtrk_array[tail].wr_len;
+					break;
+				}
+			} while (1);
+			if (!IRDMA_RING_MORE_WORK(qp->sq_ring))
+				qp->sq_flush_complete = true;
+		}
+		pring = &qp->sq_ring;
+	}
+
+	ret_code = 0;
+
+exit:
+	if (!ret_code &&
+	    info->comp_status == IRDMA_COMPL_STATUS_FLUSHED)
+		if (pring && (IRDMA_RING_MORE_WORK(*pring)))
+			move_cq_head = false;
+
+	if (move_cq_head) {
+		IRDMA_RING_MOVE_HEAD_NOCHECK(cq->cq_ring);
+		if (IRDMA_RING_CURRENT_HEAD(cq->cq_ring) == 0)
+			cq->polarity ^= 1;
+
+		if (ext_valid && !cq->avoid_mem_cflct) {
+			IRDMA_RING_MOVE_HEAD_NOCHECK(cq->cq_ring);
+			if (IRDMA_RING_CURRENT_HEAD(cq->cq_ring) == 0)
+				cq->polarity ^= 1;
+		}
+
+		IRDMA_RING_MOVE_TAIL(cq->cq_ring);
+		if (!cq->avoid_mem_cflct && ext_valid)
+			IRDMA_RING_MOVE_TAIL(cq->cq_ring);
+		set_64bit_val(cq->shadow_area, 0,
+			      IRDMA_RING_CURRENT_HEAD(cq->cq_ring));
+	} else {
+		qword3 &= ~IRDMA_CQ_WQEIDX_M;
+		qword3 |= LS_64(pring->tail, IRDMA_CQ_WQEIDX);
+		set_64bit_val(cqe, 24, qword3);
+	}
+
+	return ret_code;
+}
+
+/**
+ * irdma_qp_roundup - return round up qp wq depth
+ * @wqdepth: wq depth in quanta to round up
+ */
+static int irdma_qp_round_up(u32 wqdepth)
+{
+	int scount = 1;
+
+	for (wqdepth--; scount <= 16; scount *= 2)
+		wqdepth |= wqdepth >> scount;
+
+	return ++wqdepth;
+}
+
+/**
+ * irdma_get_wqe_shift - get shift count for maximum wqe size
+ * @sge: Maximum Scatter Gather Elements wqe
+ * @inline_data: Maximum inline data size
+ * @shift: Returns the shift needed based on sge
+ *
+ * Shift can be used to left shift the wqe size based on number of SGEs and inlind data size.
+ * For 1 SGE or inline data <= 8, shift = 0 (wqe size of 32
+ * bytes). For 2 or 3 SGEs or inline data <= 39, shift = 1 (wqe
+ * size of 64 bytes).
+ * For 4-7 SGE's and inline <= 101 Shift of 2 otherwise (wqe
+ * size of 256 bytes).
+ */
+void irdma_get_wqe_shift(struct irdma_hw_attrs *hw_attrs,
+			 u32 sge,
+			 u32 inline_data,
+			 u8 *shift)
+{
+	*shift = 0;
+	if (hw_attrs->hw_rev > IRDMA_GEN_1) {
+		if (sge > 1 || inline_data > 8) {
+			if (sge < 4 && inline_data <= 39)
+				*shift = 1;
+			else if (sge < 8 && inline_data <= 101)
+				*shift = 2;
+			else
+				*shift = 3;
+		}
+	} else if (sge > 1 || inline_data > 16) {
+		*shift = (sge < 4 && inline_data <= 48) ? 1 : 2;
+	}
+}
+
+/*
+ * irdma_get_sqdepth - get SQ depth (quanta)
+ * @sq_size: SQ size
+ * @shift: shift which determines size of WQE
+ * @sqdepth: depth of SQ
+ *
+ */
+enum irdma_status_code irdma_get_sqdepth(struct irdma_hw_attrs *hw_attrs,
+					 u32 sq_size,
+					 u8 shift,
+					 u32 *sqdepth)
+{
+	*sqdepth = irdma_qp_round_up((sq_size << shift) + IRDMA_SQ_RSVD);
+
+	if (*sqdepth < (IRDMA_QP_SW_MIN_WQSIZE << shift))
+		*sqdepth = IRDMA_QP_SW_MIN_WQSIZE << shift;
+	else if (*sqdepth > hw_attrs->max_hw_wq_quanta)
+		return IRDMA_ERR_INVALID_SIZE;
+
+	return 0;
+}
+
+/*
+ * irdma_get_rq_depth - get RQ depth (quanta)
+ * @sq_size: SQ size
+ * @shift: shift which determines size of WQE
+ * @wqdepth: depth of RQ
+ *
+ */
+enum irdma_status_code irdma_get_rqdepth(struct irdma_hw_attrs *hw_attrs,
+					 u32 rq_size,
+					 u8 shift,
+					 u32 *rqdepth)
+{
+	*rqdepth = irdma_qp_round_up((rq_size << shift) + IRDMA_RQ_RSVD);
+
+	if (*rqdepth < (IRDMA_QP_SW_MIN_WQSIZE << shift))
+		*rqdepth = IRDMA_QP_SW_MIN_WQSIZE << shift;
+	else if (*rqdepth > hw_attrs->max_hw_rq_quanta)
+		return IRDMA_ERR_INVALID_SIZE;
+
+	return 0;
+}
+
+static struct irdma_qp_uk_ops iw_qp_uk_ops = {
+	.iw_qp_post_wr = irdma_qp_post_wr,
+	.iw_qp_ring_push_db = irdma_qp_ring_push_db,
+	.iw_rdma_write = irdma_rdma_write,
+	.iw_rdma_read = irdma_rdma_read,
+	.iw_send = irdma_send,
+	.iw_inline_rdma_write = irdma_inline_rdma_write,
+	.iw_inline_send = irdma_inline_send,
+	.iw_stag_local_invalidate = irdma_stag_local_invalidate,
+	.iw_mw_bind = irdma_mw_bind,
+	.iw_post_receive = irdma_post_receive,
+	.iw_post_nop = irdma_nop,
+};
+
+static struct irdma_wqe_uk_ops iw_wqe_uk_ops = {
+	.iw_set_fragment = irdma_set_fragment,
+	.iw_copy_inline_data = irdma_copy_inline_data,
+	.iw_inline_data_size_to_quanta = irdma_inline_data_size_to_quanta,
+	.iw_set_mw_bind_wqe = irdma_set_mw_bind_wqe,
+};
+
+static struct irdma_wqe_uk_ops iw_wqe_uk_ops_gen_1 = {
+	.iw_set_fragment = irdma_set_fragment_gen_1,
+	.iw_copy_inline_data = irdma_copy_inline_data_gen_1,
+	.iw_inline_data_size_to_quanta = irdma_inline_data_size_to_quanta_gen_1,
+	.iw_set_mw_bind_wqe = irdma_set_mw_bind_wqe_gen_1,
+};
+
+static struct irdma_cq_ops iw_cq_ops = {
+	.iw_cq_request_notification = irdma_cq_request_notification,
+	.iw_cq_poll_cmpl = irdma_cq_poll_cmpl,
+	.iw_cq_post_entries = irdma_cq_post_entries,
+	.iw_cq_clean = irdma_clean_cq,
+};
+
+static struct irdma_device_uk_ops iw_device_uk_ops = {
+	.iw_cq_uk_init = irdma_cq_uk_init,
+	.iw_qp_uk_init = irdma_qp_uk_init,
+};
+
+/**
+ * irdma_setup_connection_wqes - setup WQEs necessary to complete
+ * connection.
+ * @qp: hw qp (user and kernel)
+ * @info: qp initialization info
+ * @ret_code: return status of moving ring head
+ */
+static void irdma_setup_connection_wqes(struct irdma_qp_uk *qp,
+					struct irdma_qp_uk_init_info *info,
+					enum irdma_status_code *ret_code)
+{
+	u16 move_cnt = 1;
+
+	if (qp->hw_attrs->feature_flags & IRDMA_FEATURE_RTS_AE)
+		move_cnt = 3;
+
+	IRDMA_RING_MOVE_HEAD_BY_COUNT(qp->sq_ring, move_cnt, *ret_code);
+	IRDMA_RING_MOVE_TAIL_BY_COUNT(qp->sq_ring, move_cnt);
+	IRDMA_RING_MOVE_HEAD_BY_COUNT(qp->initial_ring, move_cnt, *ret_code);
+}
+
+/**
+ * irdma_qp_uk_init - initialize shared qp
+ * @qp: hw qp (user and kernel)
+ * @info: qp initialization info
+ *
+ * initializes the vars used in both user and kernel mode.
+ * size of the wqe depends on numbers of max. fragements
+ * allowed. Then size of wqe * the number of wqes should be the
+ * amount of memory allocated for sq and rq.
+ */
+enum irdma_status_code irdma_qp_uk_init(struct irdma_qp_uk *qp,
+					struct irdma_qp_uk_init_info *info)
+{
+	enum irdma_status_code ret_code = 0;
+	u32 sq_ring_size;
+	u8 sqshift, rqshift;
+
+	qp->hw_attrs = info->hw_attrs;
+	if (info->max_sq_frag_cnt > qp->hw_attrs->max_hw_wq_frags)
+		return IRDMA_ERR_INVALID_FRAG_COUNT;
+
+	if (info->max_rq_frag_cnt > qp->hw_attrs->max_hw_wq_frags)
+		return IRDMA_ERR_INVALID_FRAG_COUNT;
+
+	if (qp->hw_attrs->hw_rev == IRDMA_GEN_1)
+		irdma_get_wqe_shift(qp->hw_attrs, info->max_sq_frag_cnt,
+				    info->max_inline_data, &sqshift);
+	else
+		irdma_get_wqe_shift(qp->hw_attrs, info->max_sq_frag_cnt + 1,
+				    info->max_inline_data, &sqshift);
+	irdma_get_wqe_shift(qp->hw_attrs, info->max_rq_frag_cnt, 0, &rqshift);
+	if (qp->hw_attrs->hw_rev == IRDMA_GEN_1)
+		rqshift = 2;
+	qp->qp_caps = info->qp_caps;
+	qp->sq_base = info->sq;
+	qp->rq_base = info->rq;
+	qp->shadow_area = info->shadow_area;
+	qp->sq_wrtrk_array = info->sq_wrtrk_array;
+	qp->rq_wrid_array = info->rq_wrid_array;
+	qp->wqe_alloc_db = info->wqe_alloc_db;
+	qp->qp_id = info->qp_id;
+	qp->sq_size = info->sq_size;
+	qp->push_db = info->push_db;
+	qp->push_wqe = info->push_wqe;
+	qp->push_mode = false;
+	qp->max_sq_frag_cnt = info->max_sq_frag_cnt;
+	sq_ring_size = qp->sq_size << sqshift;
+	IRDMA_RING_INIT(qp->sq_ring, sq_ring_size);
+	IRDMA_RING_INIT(qp->initial_ring, sq_ring_size);
+	if (info->first_sq_wq) {
+		irdma_setup_connection_wqes(qp, info, &ret_code);
+		qp->swqe_polarity = 1;
+		qp->first_sq_wq = true;
+	} else {
+		qp->swqe_polarity = 0;
+	}
+	qp->swqe_polarity_deferred = 1;
+	qp->rwqe_polarity = 0;
+	qp->rq_size = info->rq_size;
+	qp->max_rq_frag_cnt = info->max_rq_frag_cnt;
+	qp->max_inline_data = info->max_inline_data;
+	qp->rq_wqe_size = rqshift;
+	IRDMA_RING_INIT(qp->rq_ring, qp->rq_size);
+	qp->rq_wqe_size_multiplier = 1 << rqshift;
+	qp->qp_ops = iw_qp_uk_ops;
+	if (qp->hw_attrs->hw_rev == IRDMA_GEN_1)
+		qp->wqe_ops = iw_wqe_uk_ops_gen_1;
+	else
+		qp->wqe_ops = iw_wqe_uk_ops;
+
+	return ret_code;
+}
+
+/**
+ * irdma_cq_uk_init - initialize shared cq (user and kernel)
+ * @cq: hw cq
+ * @info: hw cq initialization info
+ */
+enum irdma_status_code irdma_cq_uk_init(struct irdma_cq_uk *cq,
+					struct irdma_cq_uk_init_info *info)
+{
+	cq->cq_base = (struct irdma_cqe *)info->cq_base;
+	cq->cq_id = info->cq_id;
+	cq->cq_size = info->cq_size;
+	cq->cqe_alloc_db = info->cqe_alloc_db;
+	cq->cq_ack_db = info->cq_ack_db;
+	cq->shadow_area = info->shadow_area;
+	cq->avoid_mem_cflct = info->avoid_mem_cflct;
+	IRDMA_RING_INIT(cq->cq_ring, cq->cq_size);
+	cq->polarity = 1;
+	cq->ops = iw_cq_ops;
+
+	return 0;
+}
+
+/**
+ * irdma_device_init_uk - setup routines for iwarp shared device
+ * @dev: iwarp shared (user and kernel)
+ */
+void irdma_device_init_uk(struct irdma_dev_uk *dev)
+{
+	dev->ops_uk = iw_device_uk_ops;
+}
+
+/**
+ * irdma_clean_cq - clean cq entries
+ * @ queue completion context
+ * @cq: cq to clean
+ */
+void irdma_clean_cq(void *q, struct irdma_cq_uk *cq)
+{
+	__le64 *cqe;
+	u64 qword3, comp_ctx;
+	u32 cq_head;
+	u8 polarity, temp;
+
+	cq_head = cq->cq_ring.head;
+	temp = cq->polarity;
+	do {
+		if (cq->avoid_mem_cflct)
+			cqe = ((struct irdma_extended_cqe *)(cq->cq_base))[cq_head].buf;
+		else
+			cqe = cq->cq_base[cq_head].buf;
+		get_64bit_val(cqe, 24, &qword3);
+		polarity = (u8)RS_64(qword3, IRDMA_CQ_VALID);
+
+		if (polarity != temp)
+			break;
+
+		get_64bit_val(cqe, 8, &comp_ctx);
+		if ((void *)(unsigned long)comp_ctx == q)
+			set_64bit_val(cqe, 8, 0);
+
+		cq_head = (cq_head + 1) % cq->cq_ring.size;
+		if (!cq_head)
+			temp ^= 1;
+	} while (true);
+}
+
+/**
+ * irdma_nop - post a nop
+ * @qp: hw qp ptr
+ * @wr_id: work request id
+ * @signaled: signaled for completion
+ * @post_sq: ring doorbell
+ */
+enum irdma_status_code irdma_nop(struct irdma_qp_uk *qp,
+				 u64 wr_id,
+				 bool signaled,
+				 bool post_sq)
+{
+	__le64 *wqe;
+	u64 hdr;
+	u32 wqe_idx;
+	struct irdma_post_sq_info info = {};
+
+	info.push_wqe = false;
+	info.wr_id = wr_id;
+	wqe = irdma_qp_get_next_send_wqe(qp, &wqe_idx, IRDMA_QP_WQE_MIN_QUANTA,
+					 0, &info);
+	if (!wqe)
+		return IRDMA_ERR_QP_TOOMANY_WRS_POSTED;
+
+	set_64bit_val(wqe, 0, 0);
+	set_64bit_val(wqe, 8, 0);
+	set_64bit_val(wqe, 16, 0);
+
+	hdr = LS_64(IRDMAQP_OP_NOP, IRDMAQPSQ_OPCODE) |
+	      LS_64(signaled, IRDMAQPSQ_SIGCOMPL) |
+	      LS_64(qp->swqe_polarity, IRDMAQPSQ_VALID);
+
+	wmb(); /* make sure WQE is populated before valid bit is set */
+
+	set_64bit_val(wqe, 24, hdr);
+	if (post_sq)
+		irdma_qp_post_wr(qp);
+
+	return 0;
+}
+
+/**
+ * irdma_fragcnt_to_quanta_sq - calculate quanta based on fragment count for SQ
+ * @frag_cnt: number of fragments
+ * @quanta: quanta for frag_cnt
+ */
+enum irdma_status_code irdma_fragcnt_to_quanta_sq(u32 frag_cnt, u16 *quanta)
+{
+	switch (frag_cnt) {
+	case 0:
+	case 1:
+		*quanta = IRDMA_QP_WQE_MIN_QUANTA;
+		break;
+	case 2:
+	case 3:
+		*quanta = 2;
+		break;
+	case 4:
+	case 5:
+		*quanta = 3;
+		break;
+	case 6:
+	case 7:
+		*quanta = 4;
+		break;
+	case 8:
+	case 9:
+		*quanta = 5;
+		break;
+	case 10:
+	case 11:
+		*quanta = 6;
+		break;
+	case 12:
+	case 13:
+		*quanta = 7;
+		break;
+	case 14:
+	case 15: /* when immediate data is present */
+		*quanta = 8;
+		break;
+	default:
+		return IRDMA_ERR_INVALID_FRAG_COUNT;
+	}
+
+	return 0;
+}
+
+/**
+ * irdma_fragcnt_to_wqesize_rq - calculate wqe size based on fragment count for RQ
+ * @frag_cnt: number of fragments
+ * @wqe_size: size in bytes given frag_cnt
+ */
+enum irdma_status_code irdma_fragcnt_to_wqesize_rq(u32 frag_cnt, u16 *wqe_size)
+{
+	switch (frag_cnt) {
+	case 0:
+	case 1:
+		*wqe_size = 32;
+		break;
+	case 2:
+	case 3:
+		*wqe_size = 64;
+		break;
+	case 4:
+	case 5:
+	case 6:
+	case 7:
+		*wqe_size = 128;
+		break;
+	case 8:
+	case 9:
+	case 10:
+	case 11:
+	case 12:
+	case 13:
+	case 14:
+		*wqe_size = 256;
+		break;
+	default:
+		return IRDMA_ERR_INVALID_FRAG_COUNT;
+	}
+
+	return 0;
+}
diff --git a/drivers/infiniband/hw/irdma/user.h b/drivers/infiniband/hw/irdma/user.h
new file mode 100644
index 0000000..8853c5b
--- /dev/null
+++ b/drivers/infiniband/hw/irdma/user.h
@@ -0,0 +1,463 @@
+/* SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB */
+/* Copyright (c) 2019, Intel Corporation. */
+
+#ifndef IRDMA_USER_H
+#define IRDMA_USER_H
+
+#define irdma_handle void *
+#define irdma_adapter_handle irdma_handle
+#define irdma_qp_handle irdma_handle
+#define irdma_cq_handle irdma_handle
+#define irdma_pd_id irdma_handle
+#define irdma_stag_handle irdma_handle
+#define irdma_stag_index u32
+#define irdma_stag u32
+#define irdma_stag_key u8
+#define irdma_tagged_offset u64
+#define irdma_access_privileges u32
+#define irdma_physical_fragment u64
+#define irdma_address_list u64 *
+#define irdma_sgl struct irdma_sge *
+
+#define	IRDMA_MAX_MR_SIZE	0x10000000000L
+
+#define IRDMA_ACCESS_FLAGS_LOCALREAD		0x01
+#define IRDMA_ACCESS_FLAGS_LOCALWRITE		0x02
+#define IRDMA_ACCESS_FLAGS_REMOTEREAD_ONLY	0x04
+#define IRDMA_ACCESS_FLAGS_REMOTEREAD		0x05
+#define IRDMA_ACCESS_FLAGS_REMOTEWRITE_ONLY	0x08
+#define IRDMA_ACCESS_FLAGS_REMOTEWRITE		0x0a
+#define IRDMA_ACCESS_FLAGS_BIND_WINDOW		0x10
+#define IRDMA_ACCESS_FLAGS_ALL			0x1f
+
+#define IRDMA_OP_TYPE_RDMA_WRITE		0x00
+#define IRDMA_OP_TYPE_RDMA_READ			0x01
+#define IRDMA_OP_TYPE_SEND			0x03
+#define IRDMA_OP_TYPE_SEND_INV			0x04
+#define IRDMA_OP_TYPE_SEND_SOL			0x05
+#define IRDMA_OP_TYPE_SEND_SOL_INV		0x06
+#define IRDMA_OP_TYPE_RDMA_WRITE_SOL		0x0d
+#define IRDMA_OP_TYPE_BIND_MW			0x08
+#define IRDMA_OP_TYPE_FAST_REG_NSMR		0x09
+#define IRDMA_OP_TYPE_INV_STAG			0x0a
+#define IRDMA_OP_TYPE_RDMA_READ_INV_STAG	0x0b
+#define IRDMA_OP_TYPE_NOP			0x0c
+#define IRDMA_OP_TYPE_REC	0x3e
+#define IRDMA_OP_TYPE_REC_IMM	0x3f
+
+#define IRDMA_FLUSH_MAJOR_ERR	1
+#define IRDMA_LEN_MAJOR_ERR	2
+
+enum irdma_device_caps_const {
+	IRDMA_WQE_SIZE =			4,
+	IRDMA_CQP_WQE_SIZE =			8,
+	IRDMA_CQE_SIZE =			4,
+	IRDMA_EXTENDED_CQE_SIZE =		8,
+	IRDMA_AEQE_SIZE =			2,
+	IRDMA_CEQE_SIZE =			1,
+	IRDMA_CQP_CTX_SIZE =			8,
+	IRDMA_SHADOW_AREA_SIZE =		8,
+	IRDMA_QUERY_FPM_BUF_SIZE =		176,
+	IRDMA_COMMIT_FPM_BUF_SIZE =		176,
+	IRDMA_GATHER_STATS_BUF_SIZE =		1024,
+	IRDMA_MIN_IW_QP_ID =			0,
+	IRDMA_MAX_IW_QP_ID =			262143,
+	IRDMA_MIN_CEQID =			0,
+	IRDMA_MAX_CEQID =			1023,
+	IRDMA_CEQ_MAX_COUNT =			IRDMA_MAX_CEQID + 1,
+	IRDMA_MIN_CQID =			0,
+	IRDMA_MAX_CQID =			524287,
+	IRDMA_MIN_AEQ_ENTRIES =			1,
+	IRDMA_MAX_AEQ_ENTRIES =			524287,
+	IRDMA_MIN_CEQ_ENTRIES =			1,
+	IRDMA_MAX_CEQ_ENTRIES =			524288,
+	IRDMA_MIN_CQ_SIZE =			1,
+	IRDMA_MAX_CQ_SIZE =			1048575,
+	IRDMA_DB_ID_ZERO =			0,
+	IRDMA_MAX_WQ_FRAGMENT_COUNT =		13,
+	IRDMA_MAX_SGE_RD =			13,
+	IRDMA_MAX_OUTBOUND_MSG_SIZE =		2147483647,
+	IRDMA_MAX_INBOUND_MSG_SIZE =		2147483647,
+	IRDMA_MAX_PUSH_PAGE_COUNT =		4096,
+	IRDMA_MAX_PE_ENA_VF_COUNT =		32,
+	IRDMA_MAX_VF_FPM_ID =			47,
+	IRDMA_MAX_SQ_PAYLOAD_SIZE =		2145386496,
+	IRDMA_MAX_INLINE_DATA_SIZE =		224,
+	IRDMA_MAX_PUSHMODE_INLINE_DATA_SIZE =	224,
+	IRDMA_MAX_IRD_SIZE =			127,
+	IRDMA_MAX_ORD_SIZE =			255,
+	IRDMA_MAX_WQ_ENTRIES =			32768,
+	IRDMA_Q2_BUF_SIZE =			256,
+	IRDMA_QP_CTX_SIZE =			256,
+	IRDMA_MAX_PDS =				262144,
+};
+
+enum irdma_addressing_type {
+	IRDMA_ADDR_TYPE_ZERO_BASED = 0,
+	IRDMA_ADDR_TYPE_VA_BASED = 1,
+};
+
+enum irdma_cmpl_status {
+	IRDMA_COMPL_STATUS_SUCCESS = 0,
+	IRDMA_COMPL_STATUS_FLUSHED,
+	IRDMA_COMPL_STATUS_INVALID_WQE,
+	IRDMA_COMPL_STATUS_QP_CATASTROPHIC,
+	IRDMA_COMPL_STATUS_REMOTE_TERMINATION,
+	IRDMA_COMPL_STATUS_INVALID_STAG,
+	IRDMA_COMPL_STATUS_BASE_BOUND_VIOLATION,
+	IRDMA_COMPL_STATUS_ACCESS_VIOLATION,
+	IRDMA_COMPL_STATUS_INVALID_PD_ID,
+	IRDMA_COMPL_STATUS_WRAP_ERROR,
+	IRDMA_COMPL_STATUS_STAG_INVALID_PDID,
+	IRDMA_COMPL_STATUS_RDMA_READ_ZERO_ORD,
+	IRDMA_COMPL_STATUS_QP_NOT_PRIVLEDGED,
+	IRDMA_COMPL_STATUS_STAG_NOT_INVALID,
+	IRDMA_COMPL_STATUS_INVALID_PHYS_BUF_SIZE,
+	IRDMA_COMPL_STATUS_INVALID_PHYS_BUF_ENTRY,
+	IRDMA_COMPL_STATUS_INVALID_FBO,
+	IRDMA_COMPL_STATUS_INVALID_LEN,
+	IRDMA_COMPL_STATUS_INVALID_ACCESS,
+	IRDMA_COMPL_STATUS_PHYS_BUF_LIST_TOO_LONG,
+	IRDMA_COMPL_STATUS_INVALID_VIRT_ADDRESS,
+	IRDMA_COMPL_STATUS_INVALID_REGION,
+	IRDMA_COMPL_STATUS_INVALID_WINDOW,
+	IRDMA_COMPL_STATUS_INVALID_TOTAL_LEN,
+	IRDMA_COMPL_STATUS_UNKNOWN,
+};
+
+enum irdma_cmpl_notify {
+	IRDMA_CQ_COMPL_EVENT = 0,
+	IRDMA_CQ_COMPL_SOLICITED = 1,
+};
+
+enum irdma_qp_caps {
+	IRDMA_WRITE_WITH_IMM = 1,
+	IRDMA_SEND_WITH_IMM = 2,
+	IRDMA_ROCE = 4,
+};
+
+struct irdma_qp_uk;
+struct irdma_cq_uk;
+struct irdma_qp_uk_init_info;
+struct irdma_cq_uk_init_info;
+
+struct irdma_sge {
+	irdma_tagged_offset tag_off;
+	u32 len;
+	irdma_stag stag;
+};
+
+struct irdma_ring {
+	u32 head;
+	u32 tail;
+	u32 size;
+};
+
+struct irdma_cqe {
+	__le64 buf[IRDMA_CQE_SIZE];
+};
+
+struct irdma_extended_cqe {
+	__le64 buf[IRDMA_EXTENDED_CQE_SIZE];
+};
+
+struct irdma_post_send {
+	irdma_sgl sg_list;
+	u32 num_sges;
+	u32 qkey;
+	u32 dest_qp;
+	u32 ah_id;
+};
+
+struct irdma_post_inline_send {
+	void *data;
+	u32 len;
+	u32 qkey;
+	u32 dest_qp;
+	u32 ah_id;
+};
+
+struct irdma_rdma_write {
+	irdma_sgl lo_sg_list;
+	u32 num_lo_sges;
+	struct irdma_sge rem_addr;
+};
+
+struct irdma_inline_rdma_write {
+	void *data;
+	u32 len;
+	struct irdma_sge rem_addr;
+};
+
+struct irdma_rdma_read {
+	irdma_sgl lo_sg_list;
+	u32 num_lo_sges;
+	struct irdma_sge rem_addr;
+};
+
+struct irdma_bind_window {
+	irdma_stag mr_stag;
+	u64 bind_len;
+	void *va;
+	enum irdma_addressing_type addressing_type;
+	bool ena_reads;
+	bool ena_writes;
+	irdma_stag mw_stag;
+	bool mem_window_type_1;
+};
+
+struct irdma_inv_local_stag {
+	irdma_stag target_stag;
+};
+
+struct irdma_post_sq_info {
+	u64 wr_id;
+	u8 op_type;
+	u8 l4len;
+	bool signaled;
+	bool read_fence;
+	bool local_fence;
+	bool inline_data;
+	bool imm_data_valid;
+	bool push_wqe;
+	bool report_rtt;
+	bool udp_hdr;
+	u32 imm_data;
+	u32 stag_to_inv;
+	bool defer_flag;
+	union {
+		struct irdma_post_send send;
+		struct irdma_rdma_write rdma_write;
+		struct irdma_rdma_read rdma_read;
+		struct irdma_bind_window bind_window;
+		struct irdma_inv_local_stag inv_local_stag;
+		struct irdma_inline_rdma_write inline_rdma_write;
+		struct irdma_post_inline_send inline_send;
+	} op;
+};
+
+struct irdma_post_rq_info {
+	u64 wr_id;
+	irdma_sgl sg_list;
+	u32 num_sges;
+};
+
+struct irdma_cq_poll_info {
+	u64 wr_id;
+	irdma_qp_handle qp_handle;
+	u32 bytes_xfered;
+	u32 tcp_seq_num_rtt;
+	u32 qp_id;
+	u32 ud_src_qpn;
+	u32 imm_data;
+	irdma_stag inv_stag; /* or L_R_Key */
+	enum irdma_cmpl_status comp_status;
+	u16 major_err;
+	u16 minor_err;
+	u16 ud_vlan;
+	u8 ud_smac[6];
+	u8 op_type;
+	bool stag_invalid_set; /* or L_R_Key set */
+	bool push_dropped;
+	bool error;
+	bool solicited_event;
+	bool ipv4;
+	bool ud_vlan_valid;
+	bool ud_smac_valid;
+	bool imm_valid;
+};
+
+struct irdma_qp_uk_ops {
+	void (*iw_qp_post_wr)(struct irdma_qp_uk *qp);
+	void (*iw_qp_ring_push_db)(struct irdma_qp_uk *qp, u32 wqe_index);
+	enum irdma_status_code (*iw_rdma_write)(struct irdma_qp_uk *qp,
+						struct irdma_post_sq_info *info,
+						bool post_sq);
+	enum irdma_status_code (*iw_rdma_read)(struct irdma_qp_uk *qp,
+					       struct irdma_post_sq_info *info,
+					       bool inv_stag, bool post_sq);
+	enum irdma_status_code (*iw_send)(struct irdma_qp_uk *qp,
+					  struct irdma_post_sq_info *info,
+					  bool post_sq);
+	enum irdma_status_code (*iw_inline_rdma_write)(struct irdma_qp_uk *qp,
+						       struct irdma_post_sq_info *info,
+						       bool post_sq);
+	enum irdma_status_code (*iw_inline_send)(struct irdma_qp_uk *qp,
+						 struct irdma_post_sq_info *info,
+						 bool post_sq);
+	enum irdma_status_code (*iw_stag_local_invalidate)(struct irdma_qp_uk *qp,
+							   struct irdma_post_sq_info *info,
+							   bool post_sq);
+	enum irdma_status_code (*iw_mw_bind)(struct irdma_qp_uk *qp,
+					     struct irdma_post_sq_info *info,
+					     bool post_sq);
+	enum irdma_status_code (*iw_post_receive)(struct irdma_qp_uk *qp,
+						  struct irdma_post_rq_info *info);
+	enum irdma_status_code (*iw_post_nop)(struct irdma_qp_uk *qp,
+					      u64 wr_id,
+					      bool signaled,
+					      bool post_sq);
+};
+
+struct irdma_wqe_uk_ops {
+	void (*iw_set_fragment)(__le64 *wqe,
+				u32 offset,
+				struct irdma_sge *sge,
+				u8 valid);
+	void (*iw_copy_inline_data)(u8 *dest, u8 *src, u32 len, u8 polarity);
+	enum irdma_status_code (*iw_inline_data_size_to_quanta)(u32 data_size,
+								u16 *quanta,
+								u32 max_size);
+	void (*iw_set_mw_bind_wqe)(__le64 *wqe, struct irdma_bind_window *op_info);
+};
+
+struct irdma_cq_ops {
+	void (*iw_cq_request_notification)(struct irdma_cq_uk *cq,
+					   enum irdma_cmpl_notify cq_notify);
+	enum irdma_status_code (*iw_cq_poll_cmpl)(struct irdma_cq_uk *cq,
+						  struct irdma_cq_poll_info *info);
+	enum irdma_status_code (*iw_cq_post_entries)(struct irdma_cq_uk *cq,
+						     u8 count);
+	void (*iw_cq_clean)(void *q, struct irdma_cq_uk *cq);
+};
+
+struct irdma_dev_uk;
+
+struct irdma_device_uk_ops {
+	enum irdma_status_code (*iw_cq_uk_init)(struct irdma_cq_uk *cq,
+						struct irdma_cq_uk_init_info *info);
+	enum irdma_status_code (*iw_qp_uk_init)(struct irdma_qp_uk *qp,
+						struct irdma_qp_uk_init_info *info);
+};
+
+struct irdma_dev_uk {
+	struct irdma_device_uk_ops ops_uk;
+};
+
+struct irdma_sq_uk_wr_trk_info {
+	u64 wrid;
+	u32 wr_len;
+	u16 quanta;
+	u8 reserved[2];
+};
+
+struct irdma_qp_quanta {
+	__le64 elem[IRDMA_WQE_SIZE];
+};
+
+struct irdma_qp_uk {
+	struct irdma_qp_quanta *sq_base;
+	struct irdma_qp_quanta *rq_base;
+	struct irdma_hw_attrs *hw_attrs;
+	u32 __iomem *wqe_alloc_db;
+	struct irdma_sq_uk_wr_trk_info *sq_wrtrk_array;
+	u64 *rq_wrid_array;
+	__le64 *shadow_area;
+	u32 *push_db;
+	__le64 *push_wqe;
+	struct irdma_ring sq_ring;
+	struct irdma_ring rq_ring;
+	struct irdma_ring initial_ring;
+	u32 qp_id;
+	u32 qp_caps;
+	u32 sq_size;
+	u32 rq_size;
+	u32 max_sq_frag_cnt;
+	u32 max_rq_frag_cnt;
+	u32 max_inline_data;
+	struct irdma_qp_uk_ops qp_ops;
+	struct irdma_wqe_uk_ops wqe_ops;
+	u8 swqe_polarity;
+	u8 swqe_polarity_deferred;
+	u8 rwqe_polarity;
+	u8 rq_wqe_size;
+	u8 rq_wqe_size_multiplier;
+	bool deferred_flag;
+	bool push_mode; /* whether the last post wqe was pushed */
+	bool first_sq_wq;
+	bool force_fence;
+	bool sq_flush_complete;		/* Indicates flush was seen and SQ was empty after the flush */
+	bool rq_flush_complete;		/* Indicates flush was seen and RQ was empty after the flush */
+	u8 dbg_rq_flushed;
+};
+
+struct irdma_cq_uk {
+	struct irdma_cqe *cq_base;
+	u32 __iomem *cqe_alloc_db;
+	u32 __iomem *cq_ack_db;
+	__le64 *shadow_area;
+	u32 cq_id;
+	u32 cq_size;
+	struct irdma_ring cq_ring;
+	u8 polarity;
+	bool avoid_mem_cflct;
+	struct irdma_cq_ops ops;
+};
+
+struct irdma_qp_uk_init_info {
+	struct irdma_qp_quanta *sq;
+	struct irdma_qp_quanta *rq;
+	struct irdma_hw_attrs *hw_attrs;
+	u32 __iomem *wqe_alloc_db;
+	__le64 *shadow_area;
+	struct irdma_sq_uk_wr_trk_info *sq_wrtrk_array;
+	u64 *rq_wrid_array;
+	u32 *push_db;
+	__le64 *push_wqe;
+	u32 qp_id;
+	u32 qp_caps;
+	u32 sq_size;
+	u32 rq_size;
+	u32 max_sq_frag_cnt;
+	u32 max_rq_frag_cnt;
+	u32 max_inline_data;
+	u8 first_sq_wq;
+};
+
+struct irdma_cq_uk_init_info {
+	u32 __iomem *cqe_alloc_db;
+	u32 __iomem *cq_ack_db;
+	struct irdma_cqe *cq_base;
+	__le64 *shadow_area;
+	u32 cq_size;
+	u32 cq_id;
+	bool avoid_mem_cflct;
+};
+
+void irdma_device_init_uk(struct irdma_dev_uk *dev);
+void irdma_qp_post_wr(struct irdma_qp_uk *qp);
+__le64 *irdma_qp_get_next_send_wqe(struct irdma_qp_uk *qp,
+				   u32 *wqe_idx,
+				   u16 quanta,
+				   u32 total_size,
+				   struct irdma_post_sq_info *info);
+__le64 *irdma_qp_get_next_recv_wqe(struct irdma_qp_uk *qp, u32 *wqe_idx);
+enum irdma_status_code irdma_cq_uk_init(struct irdma_cq_uk *cq,
+					struct irdma_cq_uk_init_info *info);
+enum irdma_status_code irdma_qp_uk_init(struct irdma_qp_uk *qp,
+					struct irdma_qp_uk_init_info *info);
+void irdma_clean_cq(void *q, struct irdma_cq_uk *cq);
+enum irdma_status_code irdma_nop(struct irdma_qp_uk *qp,
+				 u64 wr_id,
+				 bool signaled,
+				 bool post_sq);
+enum irdma_status_code irdma_fragcnt_to_quanta_sq(u32 frag_cnt, u16 *quanta);
+enum irdma_status_code irdma_fragcnt_to_wqesize_rq(u32 frag_cnt, u16 *wqe_size);
+void irdma_get_wqe_shift(struct irdma_hw_attrs *hw_attrs,
+			 u32 sge,
+			 u32 inline_data,
+			 u8 *shift);
+enum irdma_status_code irdma_get_sqdepth(struct irdma_hw_attrs *hw_attrs,
+					 u32 sq_size,
+					 u8 shift,
+					 u32 *wqdepth);
+enum irdma_status_code irdma_get_rqdepth(struct irdma_hw_attrs *hw_attrs,
+					 u32 rq_size,
+					 u8 shift,
+					 u32 *wqdepth);
+void irdma_qp_push_wqe(struct irdma_qp_uk *qp,
+		       __le64 *wqe,
+		       u16 quanta,
+		       u32 wqe_idx,
+		       bool post_sq);
+#endif /* IRDMA_USER_H */
-- 
1.8.3.1


^ permalink raw reply related

* [RFC 13/19] RDMA/irdma: Add RoCEv2 UD OP support
From: Shiraz Saleem @ 2019-02-12 21:43 UTC (permalink / raw)
  To: dledford, jgg, davem
  Cc: linux-rdma, netdev, mustafa.ismail, jeffrey.t.kirsher,
	Shiraz Saleem
In-Reply-To: <20190212214402.23284-1-shiraz.saleem@intel.com>

From: Mustafa Ismail <mustafa.ismail@intel.com>

Add the header, data structures and functions
to populate the WQE descriptors and issue the
Control QP commands that support RoCEv2 UD operations.

Signed-off-by: Mustafa Ismail <mustafa.ismail@intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
---
 drivers/infiniband/hw/irdma/uda.c   | 416 ++++++++++++++++++++++++++++++++++++
 drivers/infiniband/hw/irdma/uda.h   |  87 ++++++++
 drivers/infiniband/hw/irdma/uda_d.h | 383 +++++++++++++++++++++++++++++++++
 3 files changed, 886 insertions(+)
 create mode 100644 drivers/infiniband/hw/irdma/uda.c
 create mode 100644 drivers/infiniband/hw/irdma/uda.h
 create mode 100644 drivers/infiniband/hw/irdma/uda_d.h

diff --git a/drivers/infiniband/hw/irdma/uda.c b/drivers/infiniband/hw/irdma/uda.c
new file mode 100644
index 0000000..af0a8f3
--- /dev/null
+++ b/drivers/infiniband/hw/irdma/uda.c
@@ -0,0 +1,416 @@
+// SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB
+/* Copyright (c) 2019, Intel Corporation. */
+
+#include "osdep.h"
+#include "status.h"
+#include "hmc.h"
+#include "defs.h"
+#include "type.h"
+#include "protos.h"
+#include "uda.h"
+#include "uda_d.h"
+
+/**
+ * irdma_sc_ah_init - initialize sc ah struct
+ * @dev: sc device struct
+ * @ah: sc ah ptr
+ * @ah_id: ah_id for allocated ah
+ */
+static void irdma_sc_init_ah(struct irdma_sc_dev *dev, struct irdma_sc_ah *ah)
+{
+	ah->dev = dev;
+}
+
+/**
+ * irdma_sc_access_ah() - Create, modify or delete AH
+ * @cqp: struct for cqp hw
+ * @info: ah information
+ * @op: Operation
+ * @scratch: u64 saved to be used during cqp completion
+ */
+static enum irdma_status_code irdma_sc_access_ah(struct irdma_sc_cqp *cqp,
+						 struct irdma_ah_info *info,
+						 u32 op,
+						 u64 scratch)
+{
+	__le64 *wqe;
+	u64 qw1, qw2;
+
+	wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch);
+	if (!wqe)
+		return IRDMA_ERR_RING_FULL;
+
+	set_64bit_val(wqe, 0, LS_64_1(info->mac_addr[5], 16) |
+					 LS_64_1(info->mac_addr[4], 24) |
+					 LS_64_1(info->mac_addr[3], 32) |
+					 LS_64_1(info->mac_addr[2], 40) |
+					 LS_64_1(info->mac_addr[1], 48) |
+					 LS_64_1(info->mac_addr[0], 56));
+
+	qw1 = LS_64(info->pd_idx, IRDMA_UDA_CQPSQ_MAV_PDINDEXLO) |
+	      LS_64(info->tc_tos, IRDMA_UDA_CQPSQ_MAV_TC) |
+	      LS_64(info->vlan_tag, IRDMA_UDAQPC_VLANTAG);
+
+	qw2 = LS_64(info->dst_arpindex, IRDMA_UDA_CQPSQ_MAV_ARPINDEX) |
+	      LS_64(info->flow_label, IRDMA_UDA_CQPSQ_MAV_FLOWLABEL) |
+	      LS_64(info->hop_ttl, IRDMA_UDA_CQPSQ_MAV_HOPLIMIT) |
+	      LS_64(info->pd_idx >> 16, IRDMA_UDA_CQPSQ_MAV_PDINDEXHI);
+
+	if (!info->ipv4_valid) {
+		set_64bit_val(wqe,
+			      40,
+			      LS_64(info->dest_ip_addr[0], IRDMA_UDA_CQPSQ_MAV_ADDR0) |
+			      LS_64(info->dest_ip_addr[1], IRDMA_UDA_CQPSQ_MAV_ADDR1));
+		set_64bit_val(wqe,
+			      32,
+			      LS_64(info->dest_ip_addr[2], IRDMA_UDA_CQPSQ_MAV_ADDR2) |
+			      LS_64(info->dest_ip_addr[3], IRDMA_UDA_CQPSQ_MAV_ADDR3));
+
+		set_64bit_val(wqe,
+			      56,
+			      LS_64(info->src_ip_addr[0], IRDMA_UDA_CQPSQ_MAV_ADDR0) |
+			      LS_64(info->src_ip_addr[1], IRDMA_UDA_CQPSQ_MAV_ADDR1));
+		set_64bit_val(wqe,
+			      48,
+			      LS_64(info->src_ip_addr[2], IRDMA_UDA_CQPSQ_MAV_ADDR2) |
+			      LS_64(info->src_ip_addr[3], IRDMA_UDA_CQPSQ_MAV_ADDR3));
+	} else {
+		set_64bit_val(wqe, 32,
+			      LS_64(info->dest_ip_addr[0], IRDMA_UDA_CQPSQ_MAV_ADDR3));
+
+		set_64bit_val(wqe,
+			      48,
+			      LS_64(info->src_ip_addr[0], IRDMA_UDA_CQPSQ_MAV_ADDR3));
+	}
+
+	set_64bit_val(wqe, 8, qw1);
+	set_64bit_val(wqe, 16, qw2);
+
+	wmb(); /* need write block before writing WQE header */
+
+	set_64bit_val(wqe, 24,
+		      LS_64(cqp->polarity, IRDMA_UDA_CQPSQ_MAV_WQEVALID) |
+		      LS_64(op, IRDMA_UDA_CQPSQ_MAV_OPCODE) |
+		      LS_64(info->do_lpbk, IRDMA_UDA_CQPSQ_MAV_DOLOOPBACKK) |
+		      LS_64(info->ipv4_valid, IRDMA_UDA_CQPSQ_MAV_IPV4VALID) |
+		      LS_64(info->ah_idx, IRDMA_UDA_CQPSQ_MAV_AVIDX) |
+		      LS_64(info->insert_vlan_tag, IRDMA_UDA_CQPSQ_MAV_INSERTVLANTAG));
+
+	irdma_debug_buf(cqp->dev, IRDMA_DEBUG_WQE, "MANAGE_AH WQE",
+			wqe, IRDMA_CQP_WQE_SIZE * 8);
+	irdma_sc_cqp_post_sq(cqp);
+
+	return 0;
+}
+
+/**
+ * irdma_sc_create_ah() - Create AH
+ * @cqp: struct for cqp hw
+ * @info: ah information
+ * @scratch: u64 saved to be used during cqp completion
+ */
+static enum irdma_status_code irdma_sc_create_ah(struct irdma_sc_cqp *cqp,
+						 struct irdma_ah_info *info,
+						 u64 scratch)
+{
+	return irdma_sc_access_ah(cqp,
+				  info,
+				  IRDMA_CQP_OP_CREATE_ADDR_HANDLE,
+				  scratch);
+}
+
+/**
+ * irdma_sc_modify_ah() - Modify AH
+ * @cqp: struct for cqp hw
+ * @info: ah information
+ * @scratch: u64 saved to be used during cqp completion
+ */
+static enum irdma_status_code irdma_sc_modify_ah(struct irdma_sc_cqp *cqp,
+						 struct irdma_ah_info *info,
+						 u64 scratch)
+{
+	return irdma_sc_access_ah(cqp,
+				  info,
+				  IRDMA_CQP_OP_MODIFY_ADDR_HANDLE,
+				  scratch);
+}
+
+/**
+ * irdma_sc_destroy_ah() - Delete AH
+ * @cqp: struct for cqp hw
+ * @info: ah information
+ * @scratch: u64 saved to be used during cqp completion
+ */
+static enum irdma_status_code irdma_sc_destroy_ah(struct irdma_sc_cqp *cqp,
+						  struct irdma_ah_info *info,
+						  u64 scratch)
+{
+	return irdma_sc_access_ah(cqp,
+				  info,
+				  IRDMA_CQP_OP_DESTROY_ADDR_HANDLE,
+				  scratch);
+}
+
+/**
+ * create_mg_ctx() - create a mcg context
+ * @info: multicast group context info
+ */
+static enum irdma_status_code irdma_create_mg_ctx(struct irdma_mcast_grp_info *info)
+{
+	struct irdma_mcast_grp_ctx_entry_info *entry_info = NULL;
+	u8 idx = 0;	/* index in the array */
+	u8 ctx_idx = 0;	/* index in the MG context */
+
+	memset(info->dma_mem_mc.va, 0, IRDMA_MAX_MGS_PER_CTX * sizeof(u64));
+
+	for (idx = 0; idx < IRDMA_MAX_MGS_PER_CTX; idx++) {
+		entry_info = &info->mg_ctx_info[idx];
+		if (entry_info->valid_entry) {
+			set_64bit_val((__le64 *)info->dma_mem_mc.va,
+				      ctx_idx * sizeof(u64),
+				      LS_64(entry_info->dest_port, IRDMA_UDA_MGCTX_DESTPORT) |
+				      LS_64(entry_info->valid_entry, IRDMA_UDA_MGCTX_VALIDENT) |
+				      LS_64(entry_info->qp_id, IRDMA_UDA_MGCTX_QPID));
+			ctx_idx++;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * irdma_access_mcast_grp() - Access mcast group based on op
+ * @cqp: Control QP
+ * @info: multicast group context info
+ * @op: operation to perform
+ * @scratch: u64 saved to be used during cqp completion
+ */
+static enum irdma_status_code
+irdma_access_mcast_grp(struct irdma_sc_cqp *cqp,
+		       struct irdma_mcast_grp_info *info,
+		       u32 op,
+		       u64 scratch)
+{
+	__le64 *wqe;
+	enum irdma_status_code ret_code = 0;
+
+	if (info->mg_id >= IRDMA_UDA_MAX_FSI_MGS) {
+		irdma_debug(cqp->dev, IRDMA_DEBUG_WQE, "mg_id out of range\n");
+		return IRDMA_ERR_PARAM;
+	}
+
+	wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch);
+	if (!wqe) {
+		irdma_debug(cqp->dev, IRDMA_DEBUG_WQE, "ring full\n");
+		return IRDMA_ERR_RING_FULL;
+	}
+
+	ret_code = irdma_create_mg_ctx(info);
+	if (ret_code)
+		return ret_code;
+
+	set_64bit_val(wqe, 32, info->dma_mem_mc.pa);
+	set_64bit_val(wqe,
+		      16,
+		      LS_64(info->vlan_id, IRDMA_UDA_CQPSQ_MG_VLANID) |
+		      LS_64(info->qs_handle, IRDMA_UDA_CQPSQ_QS_HANDLE));
+	set_64bit_val(wqe, 0, LS_64_1(info->dest_mac_addr[5], 0) |
+					 LS_64_1(info->dest_mac_addr[4], 8) |
+					 LS_64_1(info->dest_mac_addr[3], 16) |
+					 LS_64_1(info->dest_mac_addr[2], 24) |
+					 LS_64_1(info->dest_mac_addr[1], 32) |
+					 LS_64_1(info->dest_mac_addr[0], 40));
+	set_64bit_val(wqe, 8,
+		      LS_64(info->hmc_fcn_id, IRDMA_UDA_CQPSQ_MG_HMC_FCN_ID));
+
+	if (!info->ipv4_valid) {
+		set_64bit_val(wqe, 56,
+			      LS_64(info->dest_ip_addr[0], IRDMA_UDA_CQPSQ_MAV_ADDR0) |
+			      LS_64(info->dest_ip_addr[1], IRDMA_UDA_CQPSQ_MAV_ADDR1));
+		set_64bit_val(wqe, 48,
+			      LS_64(info->dest_ip_addr[2], IRDMA_UDA_CQPSQ_MAV_ADDR2) |
+			      LS_64(info->dest_ip_addr[3], IRDMA_UDA_CQPSQ_MAV_ADDR3));
+	} else {
+		set_64bit_val(wqe, 48,
+			      LS_64(info->dest_ip_addr[0], IRDMA_UDA_CQPSQ_MAV_ADDR3));
+	}
+
+	wmb(); /* need write memory block before writing the WQE header. */
+
+	set_64bit_val(wqe, 24,
+		      LS_64(cqp->polarity, IRDMA_UDA_CQPSQ_MG_WQEVALID) |
+		      LS_64(op, IRDMA_UDA_CQPSQ_MG_OPCODE) |
+		      LS_64(info->mg_id, IRDMA_UDA_CQPSQ_MG_MGIDX) |
+		      LS_64(info->vlan_valid, IRDMA_UDA_CQPSQ_MG_VLANVALID) |
+		      LS_64(info->ipv4_valid, IRDMA_UDA_CQPSQ_MG_IPV4VALID));
+
+	irdma_debug_buf(cqp->dev, IRDMA_DEBUG_WQE, "MANAGE_MCG WQE",
+			wqe, IRDMA_CQP_WQE_SIZE * 8);
+	irdma_debug_buf(cqp->dev, IRDMA_DEBUG_WQE, "MCG_HOST CTX WQE",
+			info->dma_mem_mc.va, IRDMA_MAX_MGS_PER_CTX * 8);
+	irdma_sc_cqp_post_sq(cqp);
+
+	return 0;
+}
+
+/**
+ * irdma_sc_create_mcast_grp() - Create mcast group.
+ * @cqp: Control QP
+ * @info: multicast group context info
+ * @scratch: u64 saved to be used during cqp completion
+ */
+static enum irdma_status_code
+irdma_sc_create_mcast_grp(struct irdma_sc_cqp *cqp,
+			  struct irdma_mcast_grp_info *info,
+			  u64 scratch)
+{
+	return irdma_access_mcast_grp(cqp,
+				      info,
+				      IRDMA_CQP_OP_CREATE_MCAST_GRP,
+				      scratch);
+}
+
+/**
+ * irdma_sc_modify_mcast_grp() - Modify mcast group
+ * @cqp: Control QP
+ * @info: multicast group context info
+ * @scratch: u64 saved to be used during cqp completion
+ */
+static enum irdma_status_code
+irdma_sc_modify_mcast_grp(struct irdma_sc_cqp *cqp,
+			  struct irdma_mcast_grp_info *info,
+			  u64 scratch)
+{
+	return irdma_access_mcast_grp(cqp,
+				      info,
+				      IRDMA_CQP_OP_MODIFY_MCAST_GRP,
+				      scratch);
+}
+
+/**
+ * irdma_sc_destroy_mcast_grp() - Destroys mcast group
+ * @cqp: Control QP
+ * @info: multicast group context info
+ * @scratch: u64 saved to be used during cqp completion
+ */
+static enum irdma_status_code
+irdma_sc_destroy_mcast_grp(struct irdma_sc_cqp *cqp,
+			   struct irdma_mcast_grp_info *info,
+			   u64 scratch)
+{
+	return irdma_access_mcast_grp(cqp,
+				      info,
+				      IRDMA_CQP_OP_DESTROY_MCAST_GRP,
+				      scratch);
+}
+
+/**
+ * irdma_compare_mgs - Compares two multicast group structures
+ * @entry1: Multcast group info
+ * @entry2: Multcast group info in context
+ */
+static bool irdma_compare_mgs(struct irdma_mcast_grp_ctx_entry_info *entry1,
+			      struct irdma_mcast_grp_ctx_entry_info *entry2)
+{
+	if (entry1->dest_port == entry2->dest_port &&
+	    entry1->qp_id == entry2->qp_id)
+		return true;
+	else
+		return false;
+}
+
+/**
+ * irdma_sc_add_mcast_grp - Allocates mcast group entry in ctx
+ * @ctx: Multcast group context
+ * @mg: Multcast group info
+ */
+static enum irdma_status_code
+irdma_sc_add_mcast_grp(struct irdma_mcast_grp_info *ctx,
+		       struct irdma_mcast_grp_ctx_entry_info *mg)
+{
+	u32 idx;
+	enum irdma_status_code ret_code = IRDMA_ERR_NO_MEMORY;
+	bool free_entry_found = false;
+	u32 free_entry_idx = 0;
+
+	/* find either an identical or a free entry for a multicast group */
+	for (idx = 0; idx < IRDMA_MAX_MGS_PER_CTX; idx++) {
+		if (ctx->mg_ctx_info[idx].valid_entry) {
+			if (irdma_compare_mgs(&ctx->mg_ctx_info[idx], mg)) {
+				ctx->mg_ctx_info[idx].use_cnt++;
+				return 0;
+			}
+			continue;
+		}
+		if (!free_entry_found) {
+			free_entry_found = true;
+			free_entry_idx = idx;
+		}
+	}
+
+	if (free_entry_found) {
+		ctx->mg_ctx_info[free_entry_idx] = *mg;
+		ctx->mg_ctx_info[free_entry_idx].valid_entry = true;
+		ctx->mg_ctx_info[free_entry_idx].use_cnt = 1;
+		ctx->no_of_mgs++;
+		ret_code = 0;
+	}
+
+	return ret_code;
+}
+
+/**
+ * irdma_sc_del_mcast_grp - Delete mcast group
+ * @ctx: Multcast group context
+ * @mg: Multcast group info
+ *
+ * Finds and removes a specific mulicast group from context, all
+ * parameters must match to remove a multicast group.
+ */
+static enum irdma_status_code
+irdma_sc_del_mcast_grp(struct irdma_mcast_grp_info *ctx,
+		       struct irdma_mcast_grp_ctx_entry_info *mg)
+{
+	u32 idx;
+	enum irdma_status_code ret_code = IRDMA_ERR_PARAM;
+
+	/* find an entry in multicast group context */
+	for (idx = 0; idx < IRDMA_MAX_MGS_PER_CTX; idx++) {
+		if (!ctx->mg_ctx_info[idx].valid_entry)
+			continue;
+
+		if (irdma_compare_mgs(mg, &ctx->mg_ctx_info[idx])) {
+			ctx->mg_ctx_info[idx].use_cnt--;
+
+			if (ctx->mg_ctx_info[idx].use_cnt == 0) {
+				ctx->mg_ctx_info[idx].valid_entry = false;
+				ctx->no_of_mgs--;
+				/* Remove gap if element was not the last */
+				if (idx != ctx->no_of_mgs &&
+				    ctx->no_of_mgs > 0) {
+					memcpy(&ctx->mg_ctx_info[idx],
+					       &ctx->mg_ctx_info[ctx->no_of_mgs - 1],
+					       sizeof(ctx->mg_ctx_info[idx]));
+					ctx->mg_ctx_info[ctx->no_of_mgs - 1].valid_entry = false;
+				}
+			}
+
+			ret_code = 0;
+			goto exit;
+		}
+	}
+exit:
+	return ret_code;
+}
+
+struct irdma_uda_ops irdma_uda_ops = {
+	.init_ah = irdma_sc_init_ah,
+	.create_ah = irdma_sc_create_ah,
+	.modify_ah = irdma_sc_modify_ah,
+	.destroy_ah = irdma_sc_destroy_ah,
+	.mcast_grp_create = irdma_sc_create_mcast_grp,
+	.mcast_grp_modify = irdma_sc_modify_mcast_grp,
+	.mcast_grp_destroy = irdma_sc_destroy_mcast_grp,
+	.mcast_grp_add = irdma_sc_add_mcast_grp,
+	.mcast_grp_del = irdma_sc_del_mcast_grp,
+};
diff --git a/drivers/infiniband/hw/irdma/uda.h b/drivers/infiniband/hw/irdma/uda.h
new file mode 100644
index 0000000..3a0477e
--- /dev/null
+++ b/drivers/infiniband/hw/irdma/uda.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB */
+/* Copyright (c) 2019, Intel Corporation. */
+
+#ifndef IRDMA_UDA_H
+#define IRDMA_UDA_H
+
+extern struct irdma_uda_ops irdma_uda_ops;
+
+#define IRDMA_UDA_MAX_FSI_MGS	4096
+#define IRDMA_UDA_MAX_PFS	16
+#define IRDMA_UDA_MAX_VFS	128
+#define IRDMA_MAX_MGS_PER_CTX	8
+
+struct irdma_sc_cqp;
+
+struct irdma_ah_info {
+	struct irdma_sc_ah *ah;
+	struct irdma_sc_vsi *vsi;
+	u32 pd_idx;
+	u32 dst_arpindex;
+	u32 dest_ip_addr[4];
+	u32 src_ip_addr[4];
+	u32 flow_label;
+	u32 ah_idx;
+	bool ipv4_valid;
+	bool do_lpbk;
+	u16 vlan_tag;
+	u8 insert_vlan_tag;
+	u8 tc_tos;
+	u8 hop_ttl;
+	u8 mac_addr[ETH_ALEN];
+	bool ah_valid;
+};
+
+struct irdma_sc_ah {
+	struct irdma_sc_dev *dev;
+	struct irdma_ah_info ah_info;
+};
+
+struct irdma_mcast_grp_ctx_entry_info {
+	u32 qp_id;
+	bool valid_entry;
+	u16 dest_port;
+	u32 use_cnt;
+};
+
+struct irdma_mcast_grp_info {
+	u8 dest_mac_addr[ETH_ALEN];
+	u16 vlan_id;
+	u8 hmc_fcn_id;
+	bool ipv4_valid;
+	bool vlan_valid;
+	u16 mg_id;
+	u32 no_of_mgs;
+	u32 dest_ip_addr[4];
+	u16 qs_handle;
+	struct irdma_dma_mem dma_mem_mc;
+	struct irdma_mcast_grp_ctx_entry_info mg_ctx_info[IRDMA_MAX_MGS_PER_CTX];
+};
+
+struct irdma_uda_ops {
+	void (*init_ah)(struct irdma_sc_dev *dev, struct irdma_sc_ah *ah);
+	enum irdma_status_code (*create_ah)(struct irdma_sc_cqp *cqp,
+					    struct irdma_ah_info *info,
+					    u64 scratch);
+	enum irdma_status_code (*modify_ah)(struct irdma_sc_cqp *cqp,
+					    struct irdma_ah_info *info,
+					    u64 scratch);
+	enum irdma_status_code (*destroy_ah)(struct irdma_sc_cqp *cqp,
+					     struct irdma_ah_info *info,
+					     u64 scratch);
+	/* multicast */
+	enum irdma_status_code (*mcast_grp_create)(struct irdma_sc_cqp *cqp,
+						   struct irdma_mcast_grp_info *info,
+						   u64 scratch);
+	enum irdma_status_code (*mcast_grp_modify)(struct irdma_sc_cqp *cqp,
+						   struct irdma_mcast_grp_info *info,
+						   u64 scratch);
+	enum irdma_status_code (*mcast_grp_destroy)(struct irdma_sc_cqp *cqp,
+						    struct irdma_mcast_grp_info *info,
+						    u64 scratch);
+	enum irdma_status_code (*mcast_grp_add)(struct irdma_mcast_grp_info *ctx,
+						struct irdma_mcast_grp_ctx_entry_info *mg);
+	enum irdma_status_code (*mcast_grp_del)(struct irdma_mcast_grp_info *ctx,
+						struct irdma_mcast_grp_ctx_entry_info *mg);
+};
+#endif /* IRDMA_UDA_H */
diff --git a/drivers/infiniband/hw/irdma/uda_d.h b/drivers/infiniband/hw/irdma/uda_d.h
new file mode 100644
index 0000000..212de00
--- /dev/null
+++ b/drivers/infiniband/hw/irdma/uda_d.h
@@ -0,0 +1,383 @@
+/* SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB */
+/* Copyright (c) 2019, Intel Corporation. */
+
+#ifndef IRDMA_UDA_D_H
+#define IRDMA_UDA_D_H
+
+/* L4 packet type */
+#define IRDMA_E_UDA_SQ_L4T_UNKNOWN	0
+#define IRDMA_E_UDA_SQ_L4T_TCP		1
+#define IRDMA_E_UDA_SQ_L4T_SCTP		2
+#define IRDMA_E_UDA_SQ_L4T_UDP		3
+
+/* Inner IP header type */
+#define IRDMA_E_UDA_SQ_IIPT_UNKNOWN		0
+#define IRDMA_E_UDA_SQ_IIPT_IPV6		1
+#define IRDMA_E_UDA_SQ_IIPT_IPV4_NO_CSUM	2
+#define IRDMA_E_UDA_SQ_IIPT_IPV4_CSUM		3
+
+/* UDA defined fields for transmit descriptors */
+#define IRDMA_UDA_QPSQ_PUSHWQE_S 56
+#define IRDMA_UDA_QPSQ_PUSHWQE_M BIT_ULL(IRDMA_UDA_QPSQ_PUSHWQE_S)
+
+#define IRDMA_UDA_QPSQ_INLINEDATAFLAG_S 57
+#define IRDMA_UDA_QPSQ_INLINEDATAFLAG_M \
+	BIT_ULL(IRDMA_UDA_QPSQ_INLINEDATAFLAG_S)
+
+#define IRDMA_UDA_QPSQ_INLINEDATALEN_S 48
+#define IRDMA_UDA_QPSQ_INLINEDATALEN_M \
+	((u64)0xffULL << IRDMA_UDA_QPSQ_INLINEDATALEN_S)
+
+#define IRDMA_UDA_QPSQ_ADDFRAGCNT_S 38
+#define IRDMA_UDA_QPSQ_ADDFRAGCNT_M \
+	((u64)0x0F << IRDMA_UDA_QPSQ_ADDFRAGCNT_S)
+
+#define IRDMA_UDA_QPSQ_IPFRAGFLAGS_S 42
+#define IRDMA_UDA_QPSQ_IPFRAGFLAGS_M \
+	((u64)0x3 << IRDMA_UDA_QPSQ_IPFRAGFLAGS_S)
+
+#define IRDMA_UDA_QPSQ_NOCHECKSUM_S 45
+#define IRDMA_UDA_QPSQ_NOCHECKSUM_M \
+	BIT_ULL(IRDMA_UDA_QPSQ_NOCHECKSUM_S)
+
+#define IRDMA_UDA_QPSQ_AHIDXVALID_S 46
+#define IRDMA_UDA_QPSQ_AHIDXVALID_M \
+	BIT_ULL(IRDMA_UDA_QPSQ_AHIDXVALID_S)
+
+#define IRDMA_UDA_QPSQ_LOCAL_FENCE_S 61
+#define IRDMA_UDA_QPSQ_LOCAL_FENCE_M \
+	BIT_ULL(IRDMA_UDA_QPSQ_LOCAL_FENCE_S)
+
+#define IRDMA_UDA_QPSQ_AHIDX_S 0
+#define IRDMA_UDA_QPSQ_AHIDX_M ((u64)0x1ffff << IRDMA_UDA_QPSQ_AHIDX_S)
+
+#define IRDMA_UDA_QPSQ_PROTOCOL_S 16
+#define IRDMA_UDA_QPSQ_PROTOCOL_M \
+	((u64)0xff << IRDMA_UDA_QPSQ_PROTOCOL_S)
+
+#define IRDMA_UDA_QPSQ_EXTHDRLEN_S 32
+#define IRDMA_UDA_QPSQ_EXTHDRLEN_M \
+	((u64)0x1ff << IRDMA_UDA_QPSQ_EXTHDRLEN_S)
+
+#define IRDMA_UDA_QPSQ_MULTICAST_S 63
+#define IRDMA_UDA_QPSQ_MULTICAST_M \
+	BIT_ULL(IRDMA_UDA_QPSQ_MULTICAST_S)
+
+#define IRDMA_UDA_QPSQ_MACLEN_S 56
+#define IRDMA_UDA_QPSQ_MACLEN_M \
+	((u64)0x7f << IRDMA_UDA_QPSQ_MACLEN_S)
+#define IRDMA_UDA_QPSQ_MACLEN_LINE 2
+
+#define IRDMA_UDA_QPSQ_IPLEN_S 48
+#define IRDMA_UDA_QPSQ_IPLEN_M \
+	((u64)0x7f << IRDMA_UDA_QPSQ_IPLEN_S)
+#define IRDMA_UDA_QPSQ_IPLEN_LINE 2
+
+#define IRDMA_UDA_QPSQ_L4T_S 30
+#define IRDMA_UDA_QPSQ_L4T_M ((u64)0x3 << IRDMA_UDA_QPSQ_L4T_S)
+#define IRDMA_UDA_QPSQ_L4T_LINE 2
+
+#define IRDMA_UDA_QPSQ_IIPT_S 28
+#define IRDMA_UDA_QPSQ_IIPT_M ((u64)0x3 << IRDMA_UDA_QPSQ_IIPT_S)
+#define IRDMA_UDA_QPSQ_IIPT_LINE 2
+
+#define IRDMA_UDA_QPSQ_DO_LPB_LINE 3
+
+#define IRDMA_UDA_QPSQ_FWD_PROG_CONFIRM_S 45
+#define IRDMA_UDA_QPSQ_FWD_PROG_CONFIRM_M \
+	BIT_ULL(IRDMA_UDA_QPSQ_FWD_PROG_CONFIRM_S)
+#define IRDMA_UDA_QPSQ_FWD_PROG_CONFIRM_LINE 3
+
+#define IRDMA_UDA_QPSQ_IMMDATA_S 0
+#define IRDMA_UDA_QPSQ_IMMDATA_M \
+	((u64)0xffffffffffffffffULL << IRDMA_UDA_QPSQ_IMMDATA_S)
+
+/* Byte Offset 0 */
+#define IRDMA_UDAQPC_IPV4_S 3
+#define IRDMA_UDAQPC_IPV4_M BIT_ULL(IRDMAQPC_IPV4_S)
+
+#define IRDMA_UDAQPC_INSERTVLANTAG_S 5
+#define IRDMA_UDAQPC_INSERTVLANTAG_M BIT_ULL(IRDMA_UDAQPC_INSERTVLANTAG_S)
+
+#define IRDMA_UDAQPC_ISQP1_S 6
+#define IRDMA_UDAQPC_ISQP1_M BIT_ULL(IRDMA_UDAQPC_ISQP1_S)
+
+#define IRDMA_UDAQPC_RQWQESIZE_S IRDMAQPC_RQWQESIZE_S
+#define IRDMA_UDAQPC_RQWQESIZE_M IRDMAQPC_RQWQESIZE_M
+
+#define IRDMA_UDAQPC_ECNENABLE_S 14
+#define IRDMA_UDAQPC_ECNENABLE_M BIT_ULL(IRDMA_UDAQPC_ECNENABLE_S)
+
+#define IRDMA_UDAQPC_PDINDEXHI_S 20
+#define IRDMA_UDAQPC_PDINDEXHI_M ((u64)3 << IRDMA_UDAQPC_PDINDEXHI_S)
+
+#define IRDMA_UDAQPC_DCTCPENABLE_S 25
+#define IRDMA_UDAQPC_DCTCPENABLE_M BIT_ULL(IRDMA_UDAQPC_DCTCPENABLE_S)
+
+#define IRDMA_UDAQPC_RCVTPHEN_S IRDMAQPC_RCVTPHEN_S
+#define IRDMA_UDAQPC_RCVTPHEN_M IRDMAQPC_RCVTPHEN_M
+
+#define IRDMA_UDAQPC_XMITTPHEN_S IRDMAQPC_XMITTPHEN_S
+#define IRDMA_UDAQPC_XMITTPHEN_M IRDMAQPC_XMITTPHEN_M
+
+#define IRDMA_UDAQPC_RQTPHEN_S IRDMAQPC_RQTPHEN_S
+#define IRDMA_UDAQPC_RQTPHEN_M IRDMAQPC_RQTPHEN_M
+
+#define IRDMA_UDAQPC_SQTPHEN_S IRDMAQPC_SQTPHEN_S
+#define IRDMA_UDAQPC_SQTPHEN_M IRDMAQPC_SQTPHEN_M
+
+#define IRDMA_UDAQPC_PPIDX_S IRDMAQPC_PPIDX_S
+#define IRDMA_UDAQPC_PPIDX_M IRDMAQPC_PPIDX_M
+
+#define IRDMA_UDAQPC_PMENA_S IRDMAQPC_PMENA_S
+#define IRDMA_UDAQPC_PMENA_M IRDMAQPC_PMENA_M
+
+#define IRDMA_UDAQPC_INSERTTAG2_S 11
+#define IRDMA_UDAQPC_INSERTTAG2_M BIT_ULL(IRDMA_UDAQPC_INSERTTAG2_S)
+
+#define IRDMA_UDAQPC_INSERTTAG3_S 14
+#define IRDMA_UDAQPC_INSERTTAG3_M BIT_ULL(IRDMA_UDAQPC_INSERTTAG3_S)
+
+#define IRDMA_UDAQPC_RQSIZE_S IRDMAQPC_RQSIZE_S
+#define IRDMA_UDAQPC_RQSIZE_M IRDMAQPC_RQSIZE_M
+
+#define IRDMA_UDAQPC_SQSIZE_S IRDMAQPC_SQSIZE_S
+#define IRDMA_UDAQPC_SQSIZE_M IRDMAQPC_SQSIZE_M
+
+#define IRDMA_UDAQPC_TXCQNUM_S IRDMAQPC_TXCQNUM_S
+#define IRDMA_UDAQPC_TXCQNUM_M IRDMAQPC_TXCQNUM_M
+
+#define IRDMA_UDAQPC_RXCQNUM_S IRDMAQPC_RXCQNUM_S
+#define IRDMA_UDAQPC_RXCQNUM_M IRDMAQPC_RXCQNUM_M
+
+#define IRDMA_UDAQPC_QPCOMPCTX_S IRDMAQPC_QPCOMPCTX_S
+#define IRDMA_UDAQPC_QPCOMPCTX_M IRDMAQPC_QPCOMPCTX_M
+
+#define IRDMA_UDAQPC_SQTPHVAL_S IRDMAQPC_SQTPHVAL_S
+#define IRDMA_UDAQPC_SQTPHVAL_M IRDMAQPC_SQTPHVAL_M
+
+#define IRDMA_UDAQPC_RQTPHVAL_S IRDMAQPC_RQTPHVAL_S
+#define IRDMA_UDAQPC_RQTPHVAL_M IRDMAQPC_RQTPHVAL_M
+
+#define IRDMA_UDAQPC_QSHANDLE_S IRDMAQPC_QSHANDLE_S
+#define IRDMA_UDAQPC_QSHANDLE_M IRDMAQPC_QSHANDLE_M
+
+#define IRDMA_UDAQPC_RQHDRRINGBUFSIZE_S 48
+#define IRDMA_UDAQPC_RQHDRRINGBUFSIZE_M \
+	((u64)0x3 << IRDMA_UDAQPC_RQHDRRINGBUFSIZE_S)
+
+#define IRDMA_UDAQPC_SQHDRRINGBUFSIZE_S 32
+#define IRDMA_UDAQPC_SQHDRRINGBUFSIZE_M \
+	((u64)0x3 << IRDMA_UDAQPC_SQHDRRINGBUFSIZE_S)
+
+#define IRDMA_UDAQPC_PRIVILEGEENABLE_S 25
+#define IRDMA_UDAQPC_PRIVILEGEENABLE_M \
+	((u64)1 << IRDMA_UDAQPC_PRIVILEGEENABLE_S)
+
+#define IRDMA_UDAQPC_USE_STATISTICS_INSTANCE_S 26
+#define IRDMA_UDAQPC_USE_STATISTICS_INSTANCE_M \
+	((u64)1 << IRDMA_UDAQPC_USE_STATISTICS_INSTANCE_S)
+
+#define IRDMA_UDAQPC_STATISTICS_INSTANCE_INDEX_S 0
+#define IRDMA_UDAQPC_STATISTICS_INSTANCE_INDEX_M \
+	((u64)0x7F << IRDMA_UDAQPC_STATISTICS_INSTANCE_INDEX_S)
+
+#define IRDMA_UDAQPC_PRIVHDRGENENABLE_S 0
+#define IRDMA_UDAQPC_PRIVHDRGENENABLE_M \
+	BIT_ULL(IRDMA_UDAQPC_PRIVHDRGENENABLE_S)
+
+#define IRDMA_UDAQPC_RQHDRSPLITENABLE_S 3
+#define IRDMA_UDAQPC_RQHDRSPLITENABLE_M \
+	BIT_ULL(IRDMA_UDAQPC_RQHDRSPLITENABLE_S)
+
+#define IRDMA_UDAQPC_RQHDRRINGBUFENABLE_S 2
+#define IRDMA_UDAQPC_RQHDRRINGBUFENABLE_M \
+	BIT_ULL(IRDMA_UDAQPC_RQHDRRINGBUFENABLE_S)
+
+#define IRDMA_UDAQPC_SQHDRRINGBUFENABLE_S 1
+#define IRDMA_UDAQPC_SQHDRRINGBUFENABLE_M \
+	BIT_ULL(IRDMA_UDAQPC_SQHDRRINGBUFENABLE_S)
+
+#define IRDMA_UDAQPC_IPID_S 32
+#define IRDMA_UDAQPC_IPID_M ((u64)0xffff << IRDMA_UDAQPC_IPID_S)
+
+#define IRDMA_UDAQPC_SNDMSS_S 16
+#define IRDMA_UDAQPC_SNDMSS_M ((u64)0x3fff << IRDMA_UDAQPC_SNDMSS_S)
+
+#define IRDMA_UDAQPC_VLANTAG_S 0
+#define IRDMA_UDAQPC_VLANTAG_M  ((u64)0xffff << IRDMA_UDAQPC_VLANTAG_S)
+
+/* Address Handle */
+#define IRDMA_UDA_CQPSQ_MAV_PDINDEXHI_S 20
+#define IRDMA_UDA_CQPSQ_MAV_PDINDEXHI_M \
+	((u64)0x3 << IRDMA_UDA_CQPSQ_MAV_PDINDEXHI_S)
+
+#define IRDMA_UDA_CQPSQ_MAV_PDINDEXLO_S 48
+#define IRDMA_UDA_CQPSQ_MAV_PDINDEXLO_M \
+	((u64)0xffff << IRDMA_UDA_CQPSQ_MAV_PDINDEXLO_S)
+
+#define IRDMA_UDA_CQPSQ_MAV_SRCMACADDRINDEX_S 24
+#define IRDMA_UDA_CQPSQ_MAV_SRCMACADDRINDEX_M \
+	((u64)0x3f << IRDMA_UDA_CQPSQ_MAV_SRCMACADDRINDEX_S)
+
+#define IRDMA_UDA_CQPSQ_MAV_ARPINDEX_S 48
+#define IRDMA_UDA_CQPSQ_MAV_ARPINDEX_M \
+	((u64)0xffff << IRDMA_UDA_CQPSQ_MAV_ARPINDEX_S)
+
+#define IRDMA_UDA_CQPSQ_MAV_TC_S 32
+#define IRDMA_UDA_CQPSQ_MAV_TC_M ((u64)0xff << IRDMA_UDA_CQPSQ_MAV_TC_S)
+
+#define IRDMA_UDA_CQPSQ_MAV_HOPLIMIT_S 32
+#define IRDMA_UDA_CQPSQ_MAV_HOPLIMIT_M \
+	((u64)0xff << IRDMA_UDA_CQPSQ_MAV_HOPLIMIT_S)
+
+#define IRDMA_UDA_CQPSQ_MAV_FLOWLABEL_S 0
+#define IRDMA_UDA_CQPSQ_MAV_FLOWLABEL_M \
+	((u64)0xfffff << IRDMA_UDA_CQPSQ_MAV_FLOWLABEL_S)
+
+#define IRDMA_UDA_CQPSQ_MAV_ADDR0_S 32
+#define IRDMA_UDA_CQPSQ_MAV_ADDR0_M \
+	((u64)0xffffffff << IRDMA_UDA_CQPSQ_MAV_ADDR0_S)
+
+#define IRDMA_UDA_CQPSQ_MAV_ADDR1_S 0
+#define IRDMA_UDA_CQPSQ_MAV_ADDR1_M \
+	((u64)0xffffffff << IRDMA_UDA_CQPSQ_MAV_ADDR1_S)
+
+#define IRDMA_UDA_CQPSQ_MAV_ADDR2_S 32
+#define IRDMA_UDA_CQPSQ_MAV_ADDR2_M \
+	((u64)0xffffffff << IRDMA_UDA_CQPSQ_MAV_ADDR2_S)
+
+#define IRDMA_UDA_CQPSQ_MAV_ADDR3_S 0
+#define IRDMA_UDA_CQPSQ_MAV_ADDR3_M \
+	((u64)0xffffffff << IRDMA_UDA_CQPSQ_MAV_ADDR3_S)
+
+#define IRDMA_UDA_CQPSQ_MAV_WQEVALID_S 63
+#define IRDMA_UDA_CQPSQ_MAV_WQEVALID_M \
+	BIT_ULL(IRDMA_UDA_CQPSQ_MAV_WQEVALID_S)
+
+#define IRDMA_UDA_CQPSQ_MAV_OPCODE_S 32
+#define IRDMA_UDA_CQPSQ_MAV_OPCODE_M \
+	((u64)0x3f << IRDMA_UDA_CQPSQ_MAV_OPCODE_S)
+
+#define IRDMA_UDA_CQPSQ_MAV_DOLOOPBACKK_S 62
+#define IRDMA_UDA_CQPSQ_MAV_DOLOOPBACKK_M \
+	BIT_ULL(IRDMA_UDA_CQPSQ_MAV_DOLOOPBACKK_S)
+
+#define IRDMA_UDA_CQPSQ_MAV_IPV4VALID_S 59
+#define IRDMA_UDA_CQPSQ_MAV_IPV4VALID_M \
+	BIT_ULL(IRDMA_UDA_CQPSQ_MAV_IPV4VALID_S)
+
+#define IRDMA_UDA_CQPSQ_MAV_AVIDX_S 0
+#define IRDMA_UDA_CQPSQ_MAV_AVIDX_M \
+	((u64)0x1ffff << IRDMA_UDA_CQPSQ_MAV_AVIDX_S)
+
+#define IRDMA_UDA_CQPSQ_MAV_INSERTVLANTAG_S 60
+#define IRDMA_UDA_CQPSQ_MAV_INSERTVLANTAG_M BIT_ULL(IRDMA_UDA_CQPSQ_MAV_INSERTVLANTAG_S)
+
+/* UDA multicast group */
+
+#define IRDMA_UDA_MGCTX_VFFLAG_S 29
+#define IRDMA_UDA_MGCTX_VFFLAG_M BIT_ULL(IRDMA_UDA_MGCTX_VFFLAG_S)
+
+#define IRDMA_UDA_MGCTX_DESTPORT_S 32
+#define IRDMA_UDA_MGCTX_DESTPORT_M ((u64)0xffff << IRDMA_UDA_MGCTX_DESTPORT_S)
+
+#define IRDMA_UDA_MGCTX_VFID_S 22
+#define IRDMA_UDA_MGCTX_VFID_M ((u64)0x7f << IRDMA_UDA_MGCTX_VFID_S)
+
+#define IRDMA_UDA_MGCTX_VALIDENT_S 31
+#define IRDMA_UDA_MGCTX_VALIDENT_M BIT_ULL(IRDMA_UDA_MGCTX_VALIDENT_S)
+
+#define IRDMA_UDA_MGCTX_PFID_S 18
+#define IRDMA_UDA_MGCTX_PFID_M ((u64)0xf << IRDMA_UDA_MGCTX_PFID_S)
+
+#define IRDMA_UDA_MGCTX_FLAGIGNOREDPORT_S 30
+#define IRDMA_UDA_MGCTX_FLAGIGNOREDPORT_M \
+	BIT_ULL(IRDMA_UDA_MGCTX_FLAGIGNOREDPORT_S)
+
+#define IRDMA_UDA_MGCTX_QPID_S 0
+#define IRDMA_UDA_MGCTX_QPID_M ((u64)0x3ffff << IRDMA_UDA_MGCTX_QPID_S)
+
+/* multicast group create CQP command */
+
+#define IRDMA_UDA_CQPSQ_MG_WQEVALID_S 63
+#define IRDMA_UDA_CQPSQ_MG_WQEVALID_M \
+	BIT_ULL(IRDMA_UDA_CQPSQ_MG_WQEVALID_S)
+
+#define IRDMA_UDA_CQPSQ_MG_OPCODE_S 32
+#define IRDMA_UDA_CQPSQ_MG_OPCODE_M ((u64)0x3f << IRDMA_UDA_CQPSQ_MG_OPCODE_S)
+
+#define IRDMA_UDA_CQPSQ_MG_MGIDX_S 0
+#define IRDMA_UDA_CQPSQ_MG_MGIDX_M ((u64)0x1fff << IRDMA_UDA_CQPSQ_MG_MGIDX_S)
+
+#define IRDMA_UDA_CQPSQ_MG_IPV4VALID_S 60
+#define IRDMA_UDA_CQPSQ_MG_IPV4VALID_M BIT_ULL(IRDMA_UDA_CQPSQ_MG_IPV4VALID_S)
+
+#define IRDMA_UDA_CQPSQ_MG_VLANVALID_S 59
+#define IRDMA_UDA_CQPSQ_MG_VLANVALID_M BIT_ULL(IRDMA_UDA_CQPSQ_MG_VLANVALID_S)
+
+#define IRDMA_UDA_CQPSQ_MG_HMC_FCN_ID_S 0
+#define IRDMA_UDA_CQPSQ_MG_HMC_FCN_ID_M ((u64)0x3F << IRDMA_UDA_CQPSQ_MG_HMC_FCN_ID_S)
+
+#define IRDMA_UDA_CQPSQ_MG_VLANID_S 32
+#define IRDMA_UDA_CQPSQ_MG_VLANID_M ((u64)0xFFF << IRDMA_UDA_CQPSQ_MG_VLANID_S)
+
+#define IRDMA_UDA_CQPSQ_QS_HANDLE_S 0
+#define IRDMA_UDA_CQPSQ_QS_HANDLE_M ((u64)0x3FF << IRDMA_UDA_CQPSQ_QS_HANDLE_S)
+
+/* Quad hash table */
+#define IRDMA_UDA_CQPSQ_QHASH_QPN_S 32
+#define IRDMA_UDA_CQPSQ_QHASH_QPN_M \
+	((u64)0x3ffff << IRDMA_UDA_CQPSQ_QHASH_QPN_S)
+
+#define IRDMA_UDA_CQPSQ_QHASH__S 0
+#define IRDMA_UDA_CQPSQ_QHASH__M BIT_ULL(IRDMA_UDA_CQPSQ_QHASH__S)
+
+#define IRDMA_UDA_CQPSQ_QHASH_SRC_PORT_S 16
+#define IRDMA_UDA_CQPSQ_QHASH_SRC_PORT_M \
+	((u64)0xffff << IRDMA_UDA_CQPSQ_QHASH_SRC_PORT_S)
+
+#define IRDMA_UDA_CQPSQ_QHASH_DEST_PORT_S 0
+#define IRDMA_UDA_CQPSQ_QHASH_DEST_PORT_M \
+	((u64)0xffff << IRDMA_UDA_CQPSQ_QHASH_DEST_PORT_S)
+
+#define IRDMA_UDA_CQPSQ_QHASH_ADDR0_S 32
+#define IRDMA_UDA_CQPSQ_QHASH_ADDR0_M \
+	((u64)0xffffffff << IRDMA_UDA_CQPSQ_QHASH_ADDR0_S)
+
+#define IRDMA_UDA_CQPSQ_QHASH_ADDR1_S 0
+#define IRDMA_UDA_CQPSQ_QHASH_ADDR1_M \
+	((u64)0xffffffff << IRDMA_UDA_CQPSQ_QHASH_ADDR1_S)
+
+#define IRDMA_UDA_CQPSQ_QHASH_ADDR2_S 32
+#define IRDMA_UDA_CQPSQ_QHASH_ADDR2_M \
+	((u64)0xffffffff << IRDMA_UDA_CQPSQ_QHASH_ADDR2_S)
+
+#define IRDMA_UDA_CQPSQ_QHASH_ADDR3_S 0
+#define IRDMA_UDA_CQPSQ_QHASH_ADDR3_M \
+	((u64)0xffffffff << IRDMA_UDA_CQPSQ_QHASH_ADDR3_S)
+
+#define IRDMA_UDA_CQPSQ_QHASH_WQEVALID_S 63
+#define IRDMA_UDA_CQPSQ_QHASH_WQEVALID_M \
+	BIT_ULL(IRDMA_UDA_CQPSQ_QHASH_WQEVALID_S)
+
+#define IRDMA_UDA_CQPSQ_QHASH_OPCODE_S 32
+#define IRDMA_UDA_CQPSQ_QHASH_OPCODE_M \
+	((u64)0x3f << IRDMA_UDA_CQPSQ_QHASH_OPCODE_S)
+
+#define IRDMA_UDA_CQPSQ_QHASH_MANAGE_S 61
+#define IRDMA_UDA_CQPSQ_QHASH_MANAGE_M \
+	((u64)0x3 << IRDMA_UDA_CQPSQ_QHASH_MANAGE_S)
+
+#define IRDMA_UDA_CQPSQ_QHASH_IPV4VALID_S 60
+#define IRDMA_UDA_CQPSQ_QHASH_IPV4VALID_M \
+	((u64)0x1 << IRDMA_UDA_CQPSQ_QHASH_IPV4VALID_S)
+
+#define IRDMA_UDA_CQPSQ_QHASH_LANFWD_S 59
+#define IRDMA_UDA_CQPSQ_QHASH_LANFWD_M \
+	((u64)0x1 << IRDMA_UDA_CQPSQ_QHASH_LANFWD_S)
+
+#define IRDMA_UDA_CQPSQ_QHASH_ENTRYTYPE_S 42
+#define IRDMA_UDA_CQPSQ_QHASH_ENTRYTYPE_M \
+	((u64)0x7 << IRDMA_UDA_CQPSQ_QHASH_ENTRYTYPE_S)
+#endif /* IRDMA_UDA_D_H */
-- 
1.8.3.1


^ permalink raw reply related

* [RFC 11/19] RDMA/irdma: Add PBLE resource manager
From: Shiraz Saleem @ 2019-02-12 21:43 UTC (permalink / raw)
  To: dledford, jgg, davem
  Cc: linux-rdma, netdev, mustafa.ismail, jeffrey.t.kirsher,
	Shiraz Saleem
In-Reply-To: <20190212214402.23284-1-shiraz.saleem@intel.com>

From: Mustafa Ismail <mustafa.ismail@intel.com>

Implement a Physical Buffer List Entry (PBLE) resource manager
to manage a pool of PBLE HMC resource objects.

Signed-off-by: Mustafa Ismail <mustafa.ismail@intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
---
 drivers/infiniband/hw/irdma/pble.c | 520 +++++++++++++++++++++++++++++++++++++
 drivers/infiniband/hw/irdma/pble.h | 135 ++++++++++
 2 files changed, 655 insertions(+)
 create mode 100644 drivers/infiniband/hw/irdma/pble.c
 create mode 100644 drivers/infiniband/hw/irdma/pble.h

diff --git a/drivers/infiniband/hw/irdma/pble.c b/drivers/infiniband/hw/irdma/pble.c
new file mode 100644
index 0000000..66fab69
--- /dev/null
+++ b/drivers/infiniband/hw/irdma/pble.c
@@ -0,0 +1,520 @@
+// SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB
+/* Copyright (c) 2019, Intel Corporation. */
+
+#include "osdep.h"
+#include "status.h"
+#include "hmc.h"
+#include "defs.h"
+#include "type.h"
+#include "protos.h"
+#include "pble.h"
+
+static enum irdma_status_code add_pble_prm(struct irdma_hmc_pble_rsrc *pble_rsrc);
+
+/**
+ * irdma_destroy_pble_prm - destroy prm during module unload
+ * @pble_rsrc:	pble resources
+ */
+void irdma_destroy_pble_prm(struct irdma_hmc_pble_rsrc *pble_rsrc)
+{
+	struct irdma_sc_dev *dev = pble_rsrc->dev;
+	struct irdma_chunk *chunk;
+	struct irdma_pble_prm *pinfo = &pble_rsrc->pinfo;
+
+	while (!list_empty(&pinfo->clist)) {
+		chunk = (struct irdma_chunk *)pinfo->clist.next;
+		list_del(&chunk->list);
+		if (chunk->type == PBLE_SD_PAGED)
+			irdma_pble_free_paged_mem(chunk);
+		if (chunk->bitmapbuf)
+			irdma_free_virt_mem(dev->hw, &chunk->bitmapmem);
+		irdma_free_virt_mem(dev->hw, &chunk->chunkmem);
+	}
+}
+
+/**
+ * irdma_hmc_init_pble - Initialize pble resources during module load
+ * @dev: irdma_sc_dev struct
+ * @pble_rsrc:	pble resources
+ */
+enum irdma_status_code
+irdma_hmc_init_pble(struct irdma_sc_dev *dev,
+		    struct irdma_hmc_pble_rsrc *pble_rsrc)
+{
+	struct irdma_hmc_info *hmc_info;
+	u32 fpm_idx = 0;
+	enum irdma_status_code status = 0;
+
+	hmc_info = dev->hmc_info;
+	pble_rsrc->dev = dev;
+	pble_rsrc->fpm_base_addr = hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].base;
+	/* Start pble' on 4k boundary */
+	if (pble_rsrc->fpm_base_addr & 0xfff)
+		fpm_idx = (PAGE_SIZE - (pble_rsrc->fpm_base_addr & 0xfff)) >> 3;
+	pble_rsrc->unallocated_pble =
+		hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].cnt - fpm_idx;
+	pble_rsrc->next_fpm_addr = pble_rsrc->fpm_base_addr + (fpm_idx << 3);
+	pble_rsrc->pinfo.pble_shift = PBLE_SHIFT;
+
+	spin_lock_init(&pble_rsrc->pinfo.prm_lock);
+	INIT_LIST_HEAD(&pble_rsrc->pinfo.clist);
+	if (add_pble_prm(pble_rsrc)) {
+		irdma_destroy_pble_prm(pble_rsrc);
+		status = IRDMA_ERR_NO_MEMORY;
+	}
+
+	return status;
+}
+
+/**
+ * get_sd_pd_idx -  Returns sd index, pd index and rel_pd_idx from fpm address
+ * @ pble_rsrc:	structure containing fpm address
+ * @ idx: where to return indexes
+ */
+static void get_sd_pd_idx(struct irdma_hmc_pble_rsrc *pble_rsrc,
+			  struct sd_pd_idx *idx)
+{
+	idx->sd_idx = (u32)(pble_rsrc->next_fpm_addr) /
+		      IRDMA_HMC_DIRECT_BP_SIZE;
+	idx->pd_idx = (u32)(pble_rsrc->next_fpm_addr) / IRDMA_HMC_PAGED_BP_SIZE;
+	idx->rel_pd_idx = (idx->pd_idx % IRDMA_HMC_PD_CNT_IN_SD);
+}
+
+/**
+ * add_sd_direct - add sd direct for pble
+ * @pble_rsrc: pble resource ptr
+ * @info: page info for sd
+ */
+static enum irdma_status_code
+add_sd_direct(struct irdma_hmc_pble_rsrc *pble_rsrc,
+	      struct irdma_add_page_info *info)
+{
+	struct irdma_sc_dev *dev = pble_rsrc->dev;
+	enum irdma_status_code ret_code = 0;
+	struct sd_pd_idx *idx = &info->idx;
+	struct irdma_chunk *chunk = info->chunk;
+	struct irdma_hmc_info *hmc_info = info->hmc_info;
+	struct irdma_hmc_sd_entry *sd_entry = info->sd_entry;
+	u32 offset = 0;
+
+	if (!sd_entry->valid) {
+		ret_code = irdma_add_sd_table_entry(dev->hw, hmc_info,
+						    info->idx.sd_idx,
+						    IRDMA_SD_TYPE_DIRECT,
+						    IRDMA_HMC_DIRECT_BP_SIZE);
+		if (ret_code)
+			return ret_code;
+
+		chunk->type = PBLE_SD_CONTIGOUS;
+	}
+
+	offset = idx->rel_pd_idx << HMC_PAGED_BP_SHIFT;
+	chunk->size = info->pages << HMC_PAGED_BP_SHIFT;
+	chunk->vaddr = (u64)((u8 *)sd_entry->u.bp.addr.va + offset);
+	chunk->fpm_addr = pble_rsrc->next_fpm_addr;
+	irdma_debug(dev, IRDMA_DEBUG_PBLE,
+		    "chunk_size[%lld] = 0x%llx vaddr=0x%llx fpm_addr = %llx\n",
+		    chunk->size, chunk->size, chunk->vaddr, chunk->fpm_addr);
+
+	return 0;
+}
+
+/**
+ * fpm_to_idx - given fpm address, get pble index
+ * @pble_rsrc: pble resource management
+ * @addr: fpm address for index
+ */
+static u32 fpm_to_idx(struct irdma_hmc_pble_rsrc *pble_rsrc, u64 addr)
+{
+	u64 idx;
+
+	idx = (addr - (pble_rsrc->fpm_base_addr)) >> 3;
+
+	return (u32)idx;
+}
+
+/**
+ * add_bp_pages - add backing pages for sd
+ * @pble_rsrc: pble resource management
+ * @info: page info for sd
+ */
+static enum irdma_status_code
+add_bp_pages(struct irdma_hmc_pble_rsrc *pble_rsrc,
+	     struct irdma_add_page_info *info)
+{
+	struct irdma_sc_dev *dev = pble_rsrc->dev;
+	u8 *addr;
+	struct irdma_dma_mem mem;
+	struct irdma_hmc_pd_entry *pd_entry;
+	struct irdma_hmc_sd_entry *sd_entry = info->sd_entry;
+	struct irdma_hmc_info *hmc_info = info->hmc_info;
+	struct irdma_chunk *chunk = info->chunk;
+	enum irdma_status_code status = 0;
+	u32 rel_pd_idx = info->idx.rel_pd_idx;
+	u32 pd_idx = info->idx.pd_idx;
+	u32 i;
+
+	if (irdma_pble_get_paged_mem(chunk, info->pages))
+		return IRDMA_ERR_NO_MEMORY;
+
+	status = irdma_add_sd_table_entry(dev->hw, hmc_info,
+					  info->idx.sd_idx, IRDMA_SD_TYPE_PAGED,
+					  IRDMA_HMC_DIRECT_BP_SIZE);
+
+	if (status)
+		goto error;
+
+	addr = (u8 *)chunk->vaddr;
+	for (i = 0; i < info->pages; i++) {
+		mem.pa = (u64)chunk->dmainfo.dmaaddrs[i];
+		mem.size = PAGE_SIZE;
+		mem.va = (void *)(addr);
+		pd_entry = &sd_entry->u.pd_table.pd_entry[rel_pd_idx++];
+		if (!pd_entry->valid) {
+			status = irdma_add_pd_table_entry(dev, hmc_info,
+							  pd_idx++, &mem);
+			if (status)
+				goto error;
+
+			addr += PAGE_SIZE;
+		}
+	}
+
+	chunk->fpm_addr = pble_rsrc->next_fpm_addr;
+	return 0;
+
+error:
+	irdma_pble_free_paged_mem(chunk);
+
+	return status;
+}
+
+/**
+ * add_pble_prm - add a sd entry for pble resoure
+ * @pble_rsrc: pble resource management
+ */
+static enum irdma_status_code add_pble_prm(struct irdma_hmc_pble_rsrc *pble_rsrc)
+{
+	struct irdma_sc_dev *dev = pble_rsrc->dev;
+	struct irdma_hmc_sd_entry *sd_entry;
+	struct irdma_hmc_info *hmc_info;
+	struct irdma_chunk *chunk;
+	struct irdma_add_page_info info;
+	struct sd_pd_idx *idx = &info.idx;
+	enum irdma_status_code ret_code = 0;
+	enum irdma_sd_entry_type sd_entry_type;
+	u64 sd_reg_val = 0;
+	struct irdma_virt_mem chunkmem;
+	u32 pages;
+
+	if (pble_rsrc->unallocated_pble < PBLE_PER_PAGE)
+		return IRDMA_ERR_NO_MEMORY;
+
+	if (pble_rsrc->next_fpm_addr & 0xfff)
+		return IRDMA_ERR_INVALID_PAGE_DESC_INDEX;
+
+	ret_code = irdma_allocate_virt_mem(dev->hw, &chunkmem, sizeof(*chunk));
+	if (ret_code)
+		return IRDMA_ERR_NO_MEMORY;
+
+	chunk = (struct irdma_chunk *)chunkmem.va;
+	chunk->chunkmem = chunkmem;
+	hmc_info = dev->hmc_info;
+	chunk->dev = dev;
+	chunk->fpm_addr = pble_rsrc->next_fpm_addr;
+	get_sd_pd_idx(pble_rsrc, idx);
+	sd_entry = &hmc_info->sd_table.sd_entry[idx->sd_idx];
+	pages = (idx->rel_pd_idx) ? (IRDMA_HMC_PD_CNT_IN_SD -
+				     idx->rel_pd_idx) : IRDMA_HMC_PD_CNT_IN_SD;
+	pages = min(pages, pble_rsrc->unallocated_pble >> PBLE_512_SHIFT);
+	info.chunk = chunk;
+	info.hmc_info = hmc_info;
+	info.pages = pages;
+	info.sd_entry = sd_entry;
+	if (!sd_entry->valid)
+		sd_entry_type = (!idx->rel_pd_idx &&
+				 (pages == IRDMA_HMC_PD_CNT_IN_SD) &&
+				 dev->is_pf) ?
+				 IRDMA_SD_TYPE_DIRECT : IRDMA_SD_TYPE_PAGED;
+	else
+		sd_entry_type = sd_entry->entry_type;
+
+	irdma_debug(dev, IRDMA_DEBUG_PBLE,
+		    "pages = %d, unallocated_pble[%d] current_fpm_addr = %llx\n",
+		    pages, pble_rsrc->unallocated_pble, pble_rsrc->next_fpm_addr);
+	irdma_debug(dev, IRDMA_DEBUG_PBLE, "sd_entry_type = %d\n",
+		    sd_entry_type);
+	if (sd_entry_type == IRDMA_SD_TYPE_DIRECT)
+		ret_code = add_sd_direct(pble_rsrc, &info);
+
+	if (ret_code)
+		sd_entry_type = IRDMA_SD_TYPE_PAGED;
+	else
+		pble_rsrc->stats_direct_sds++;
+
+	if (sd_entry_type == IRDMA_SD_TYPE_PAGED) {
+		ret_code = add_bp_pages(pble_rsrc, &info);
+		if (ret_code)
+			goto error;
+		else
+			pble_rsrc->stats_paged_sds++;
+	}
+
+	ret_code = irdma_prm_add_pble_mem(&pble_rsrc->pinfo, chunk);
+	if (ret_code)
+		goto error;
+
+	pble_rsrc->next_fpm_addr += chunk->size;
+	irdma_debug(dev, IRDMA_DEBUG_PBLE,
+		    "next_fpm_addr = %llx chunk_size[%llu] = 0x%llx\n",
+		    pble_rsrc->next_fpm_addr, chunk->size, chunk->size);
+	pble_rsrc->unallocated_pble -= (u32)(chunk->size >> 3);
+	list_add(&chunk->list, &pble_rsrc->pinfo.clist);
+	sd_reg_val = (sd_entry_type == IRDMA_SD_TYPE_PAGED) ?
+		     sd_entry->u.pd_table.pd_page_addr.pa : sd_entry->u.bp.addr.pa;
+	if (sd_entry->valid)
+		return 0;
+
+	if (dev->is_pf) {
+		ret_code = irdma_hmc_sd_one(dev, hmc_info->hmc_fn_id,
+					    sd_reg_val, idx->sd_idx,
+					    sd_entry->entry_type, true);
+		if (ret_code)
+			goto error;
+	}
+
+	sd_entry->valid = true;
+	return 0;
+
+error:
+	if (chunk->bitmapbuf)
+		irdma_free_virt_mem(dev->hw, &chunk->bitmapmem);
+
+	irdma_free_virt_mem(dev->hw, &chunk->chunkmem);
+
+	return ret_code;
+}
+
+/**
+ * free_lvl2 - fee level 2 pble
+ * @pble_rsrc: pble resource management
+ * @palloc: level 2 pble allocation
+ */
+static void free_lvl2(struct irdma_hmc_pble_rsrc *pble_rsrc,
+		      struct irdma_pble_alloc *palloc)
+{
+	u32 i;
+	struct irdma_sc_dev *dev = pble_rsrc->dev;
+	struct irdma_pble_level2 *lvl2 = &palloc->level2;
+	struct irdma_pble_info *root = &lvl2->root;
+	struct irdma_pble_info *leaf = lvl2->leaf;
+
+	for (i = 0; i < lvl2->leaf_cnt; i++, leaf++) {
+		if (leaf->addr)
+			irdma_prm_return_pbles(&pble_rsrc->pinfo,
+					       &leaf->chunkinfo);
+		else
+			break;
+	}
+
+	if (root->addr)
+		irdma_prm_return_pbles(&pble_rsrc->pinfo, &root->chunkinfo);
+
+	irdma_free_virt_mem(dev->hw, &lvl2->leafmem);
+	lvl2->leaf = NULL;
+}
+
+/**
+ * get_lvl2_pble - get level 2 pble resource
+ * @pble_rsrc: pble resource management
+ * @palloc: level 2 pble allocation
+ */
+static enum irdma_status_code
+get_lvl2_pble(struct irdma_hmc_pble_rsrc *pble_rsrc,
+	      struct irdma_pble_alloc *palloc)
+{
+	u32 lf4k, lflast, total, i;
+	u32 pblcnt = PBLE_PER_PAGE;
+	u64 *addr;
+	struct irdma_sc_dev *dev = pble_rsrc->dev;
+	struct irdma_pble_level2 *lvl2 = &palloc->level2;
+	struct irdma_pble_info *root = &lvl2->root;
+	struct irdma_pble_info *leaf;
+	enum irdma_status_code ret_code;
+	u64 fpm_addr;
+
+	/* number of full 512 (4K) leafs) */
+	lf4k = palloc->total_cnt >> 9;
+	lflast = palloc->total_cnt % PBLE_PER_PAGE;
+	total = (lflast == 0) ? lf4k : lf4k + 1;
+	lvl2->leaf_cnt = total;
+
+	ret_code = irdma_allocate_virt_mem(dev->hw, &lvl2->leafmem,
+					   (sizeof(*leaf) * total));
+	if (ret_code)
+		return ret_code;
+
+	lvl2->leaf = (struct irdma_pble_info *)lvl2->leafmem.va;
+	leaf = lvl2->leaf;
+	ret_code = irdma_prm_get_pbles(&pble_rsrc->pinfo,
+				       &root->chunkinfo,
+				       total << 3,
+				       &root->addr,
+				       &fpm_addr);
+	if (ret_code) {
+		irdma_free_virt_mem(dev->hw, &lvl2->leafmem);
+		lvl2->leaf = NULL;
+		return IRDMA_ERR_NO_MEMORY;
+	}
+
+	root->idx = fpm_to_idx(pble_rsrc, fpm_addr);
+	root->cnt = total;
+	addr = (u64 *)root->addr;
+	for (i = 0; i < total; i++, leaf++) {
+		pblcnt = (lflast && ((i + 1) == total)) ?
+			 lflast : PBLE_PER_PAGE;
+		ret_code = irdma_prm_get_pbles(&pble_rsrc->pinfo,
+					       &leaf->chunkinfo,
+					       pblcnt << 3,
+					       &leaf->addr,
+					       &fpm_addr);
+		if (ret_code)
+			goto error;
+
+		leaf->idx = fpm_to_idx(pble_rsrc, fpm_addr);
+
+		leaf->cnt = pblcnt;
+		*addr = (u64)leaf->idx;
+		addr++;
+	}
+
+	palloc->level = PBLE_LEVEL_2;
+	pble_rsrc->stats_lvl2++;
+	return 0;
+
+error:
+	free_lvl2(pble_rsrc, palloc);
+
+	return IRDMA_ERR_NO_MEMORY;
+}
+
+/**
+ * get_lvl1_pble - get level 1 pble resource
+ * @pble_rsrc: pble resource management
+ * @palloc: level 1 pble allocation
+ */
+static enum irdma_status_code
+get_lvl1_pble(struct irdma_hmc_pble_rsrc *pble_rsrc,
+	      struct irdma_pble_alloc *palloc)
+{
+	enum irdma_status_code ret_code;
+	u64 fpm_addr, vaddr;
+	struct irdma_pble_info *lvl1 = &palloc->level1;
+
+	ret_code = irdma_prm_get_pbles(&pble_rsrc->pinfo,
+				       &lvl1->chunkinfo,
+				       palloc->total_cnt << 3,
+				       &vaddr,
+				       &fpm_addr);
+	if (ret_code)
+		return IRDMA_ERR_NO_MEMORY;
+
+	lvl1->addr = vaddr;
+	palloc->level = PBLE_LEVEL_1;
+	lvl1->idx = fpm_to_idx(pble_rsrc, fpm_addr);
+	lvl1->cnt = palloc->total_cnt;
+	pble_rsrc->stats_lvl1++;
+
+	return 0;
+}
+
+/**
+ * get_lvl1_lvl2_pble - calls get_lvl1 and get_lvl2 pble routine
+ * @pble_rsrc:	pble resources
+ * @palloc: contains all inforamtion regarding pble (idx + pble addr)
+ * @prm: pointer to general purpose special memory prm descriptor
+ */
+static enum irdma_status_code
+get_lvl1_lvl2_pble(struct irdma_hmc_pble_rsrc *pble_rsrc,
+		   struct irdma_pble_alloc *palloc,
+		   bool level1_only)
+{
+	enum irdma_status_code status = 0;
+
+	status = get_lvl1_pble(pble_rsrc, palloc);
+	if (!status || level1_only || palloc->total_cnt <= PBLE_PER_PAGE)
+		return status;
+
+	status = get_lvl2_pble(pble_rsrc, palloc);
+
+	return status;
+}
+
+/**
+ * irdma_get_pble - allocate pbles from the prm
+ * @pble_rsrc:	pble resources
+ * @palloc: contains all inforamtion regarding pble (idx + pble addr)
+ * @pble_cnt: #of pbles requested
+ * @level1_only: true if only pble level 1 to acquire
+ */
+enum irdma_status_code irdma_get_pble(struct irdma_hmc_pble_rsrc *pble_rsrc,
+				      struct irdma_pble_alloc *palloc,
+				      u32 pble_cnt,
+				      bool level1_only)
+{
+	enum irdma_status_code status = 0;
+	unsigned long flags;
+	int max_sds = 0;
+	int i;
+
+	palloc->total_cnt = pble_cnt;
+	palloc->level = PBLE_LEVEL_0;
+	spin_lock_irqsave(&pble_rsrc->pble_lock, flags);
+	/*check first to see if we can get pble's without acquiring
+	 * additional sd's
+	 */
+	status = get_lvl1_lvl2_pble(pble_rsrc, palloc, level1_only);
+	if (!status)
+		goto exit;
+
+	max_sds = (palloc->total_cnt >> 18) + 1;
+	for (i = 0; i < max_sds; i++) {
+		status = add_pble_prm(pble_rsrc);
+		if (status)
+			break;
+
+		status = get_lvl1_lvl2_pble(pble_rsrc, palloc, level1_only);
+		/* if level1_only, only go through it once */
+		if (!status || level1_only)
+			break;
+	}
+
+exit:
+	if (!status) {
+		pble_rsrc->allocdpbles += pble_cnt;
+		pble_rsrc->stats_alloc_ok++;
+	} else {
+		pble_rsrc->stats_alloc_fail++;
+	}
+	spin_unlock_irqrestore(&pble_rsrc->pble_lock, flags);
+
+	return status;
+}
+
+/**
+ * irdma_free_pble - put pbles back into prm
+ * @pble_rsrc: pble resources
+ * @palloc: contains all information regarding pble resource being freed
+ */
+void irdma_free_pble(struct irdma_hmc_pble_rsrc *pble_rsrc,
+		     struct irdma_pble_alloc *palloc)
+{
+	pble_rsrc->freedpbles += palloc->total_cnt;
+
+	if (palloc->level == PBLE_LEVEL_2)
+		free_lvl2(pble_rsrc, palloc);
+	else
+		irdma_prm_return_pbles(&pble_rsrc->pinfo,
+				       &palloc->level1.chunkinfo);
+	pble_rsrc->stats_alloc_freed++;
+}
diff --git a/drivers/infiniband/hw/irdma/pble.h b/drivers/infiniband/hw/irdma/pble.h
new file mode 100644
index 0000000..f9ecc87
--- /dev/null
+++ b/drivers/infiniband/hw/irdma/pble.h
@@ -0,0 +1,135 @@
+/* SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB */
+/* Copyright (c) 2019, Intel Corporation. */
+
+#ifndef IRDMA_PBLE_H
+#define IRDMA_PBLE_H
+
+#define PBLE_SHIFT		6
+#define PBLE_PER_PAGE		512
+#define HMC_PAGED_BP_SHIFT	12
+#define PBLE_512_SHIFT		9
+#define PBLE_INVALID_IDX	0xffffffff
+
+enum irdma_pble_level {
+	PBLE_LEVEL_0 = 0,
+	PBLE_LEVEL_1 = 1,
+	PBLE_LEVEL_2 = 2,
+};
+
+enum irdma_alloc_type {
+	PBLE_NO_ALLOC     = 0,
+	PBLE_SD_CONTIGOUS = 1,
+	PBLE_SD_PAGED     = 2,
+};
+
+struct irdma_chunk;
+
+struct irdma_pble_chunkinfo {
+	struct irdma_chunk	*pchunk;
+	u64			bit_idx;
+	u64			bits_used;
+};
+
+struct irdma_pble_info {
+	u64				addr;
+	u32				idx;
+	u32				cnt;
+	struct irdma_pble_chunkinfo	chunkinfo;
+};
+
+struct irdma_pble_level2 {
+	struct irdma_pble_info	root;
+	struct irdma_pble_info *leaf;
+	struct irdma_virt_mem	leafmem;
+	u32			leaf_cnt;
+};
+
+struct irdma_pble_alloc {
+	u32			total_cnt;
+	enum irdma_pble_level	level;
+	union {
+		struct irdma_pble_info		level1;
+		struct irdma_pble_level2	level2;
+	};
+};
+
+struct sd_pd_idx {
+	u32	sd_idx;
+	u32	pd_idx;
+	u32	rel_pd_idx;
+};
+
+struct irdma_add_page_info {
+	struct irdma_chunk		*chunk;
+	struct irdma_hmc_sd_entry	*sd_entry;
+	struct irdma_hmc_info		*hmc_info;
+	struct sd_pd_idx		idx;
+	u32				pages;
+};
+
+struct irdma_chunk {
+	struct list_head	list;
+	struct irdma_dma_info	dmainfo;
+	void			*bitmapbuf;
+
+	u32			sizeofbitmap;
+	u64			size;
+	u64			vaddr;
+	u64			fpm_addr;
+	u32			pg_cnt;
+	enum irdma_alloc_type	type;
+	struct irdma_sc_dev	*dev;
+	struct irdma_virt_mem	bitmapmem;
+	struct irdma_virt_mem	chunkmem;
+};
+
+struct irdma_pble_prm {
+	struct list_head	clist;
+	spinlock_t		prm_lock; /* protect prm bitmap */
+	u64			total_pble_alloc;
+	u64			free_pble_cnt;
+	u8			pble_shift;
+};
+
+struct irdma_hmc_pble_rsrc {
+	u32			unallocated_pble;
+	spinlock_t		pble_lock; /* to serialize PBLE resource acquisition */
+	struct irdma_sc_dev	*dev;
+	u64			fpm_base_addr;
+	u64			next_fpm_addr;
+	struct irdma_pble_prm	pinfo;
+	u64			allocdpbles;
+	u64			freedpbles;
+	u32			stats_direct_sds;
+	u32			stats_paged_sds;
+	u64			stats_alloc_ok;
+	u64			stats_alloc_fail;
+	u64			stats_alloc_freed;
+	u64			stats_lvl1;
+	u64			stats_lvl2;
+};
+
+void irdma_destroy_pble_prm(struct irdma_hmc_pble_rsrc *pble_rsrc);
+enum irdma_status_code irdma_hmc_init_pble(struct irdma_sc_dev *dev,
+					   struct irdma_hmc_pble_rsrc *pble_rsrc);
+void irdma_free_pble(struct irdma_hmc_pble_rsrc *pble_rsrc,
+		     struct irdma_pble_alloc *palloc);
+enum irdma_status_code irdma_get_pble(struct irdma_hmc_pble_rsrc *pble_rsrc,
+				      struct irdma_pble_alloc *palloc,
+				      u32 pble_cnt,
+				      bool level1_only);
+enum irdma_status_code irdma_prm_add_pble_mem(struct irdma_pble_prm *pprm,
+					      struct irdma_chunk *pchunk);
+enum irdma_status_code irdma_prm_get_pbles(struct irdma_pble_prm *pprm,
+					   struct irdma_pble_chunkinfo *chunkinfo,
+					   u32 mem_size,
+					   u64 *vaddr,
+					   u64 *fpm_addr);
+void irdma_prm_return_pbles(struct irdma_pble_prm *pprm,
+			    struct irdma_pble_chunkinfo *chunkinfo);
+void irdma_pble_acquire_lock(struct irdma_hmc_pble_rsrc *pble_rsrc, unsigned long *flags);
+void irdma_pble_release_lock(struct irdma_hmc_pble_rsrc *pble_rsrc, unsigned long *flags);
+void irdma_pble_free_paged_mem(struct irdma_chunk *chunk);
+enum irdma_status_code irdma_pble_get_paged_mem(struct irdma_chunk *chunk,
+						int pg_cnt);
+#endif /* IRDMA_PBLE_H */
-- 
1.8.3.1


^ permalink raw reply related

* [RFC 08/19] RDMA/irdma: Add privileged UDA queue implementation
From: Shiraz Saleem @ 2019-02-12 21:43 UTC (permalink / raw)
  To: dledford, jgg, davem
  Cc: linux-rdma, netdev, mustafa.ismail, jeffrey.t.kirsher,
	Shiraz Saleem
In-Reply-To: <20190212214402.23284-1-shiraz.saleem@intel.com>

From: Mustafa Ismail <mustafa.ismail@intel.com>

Implement privileged UDA queues to handle iWARP connection
packets and receive exceptions.

Signed-off-by: Mustafa Ismail <mustafa.ismail@intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
---
 drivers/infiniband/hw/irdma/puda.c | 1694 ++++++++++++++++++++++++++++++++++++
 drivers/infiniband/hw/irdma/puda.h |  186 ++++
 2 files changed, 1880 insertions(+)
 create mode 100644 drivers/infiniband/hw/irdma/puda.c
 create mode 100644 drivers/infiniband/hw/irdma/puda.h

diff --git a/drivers/infiniband/hw/irdma/puda.c b/drivers/infiniband/hw/irdma/puda.c
new file mode 100644
index 0000000..24bcdf4
--- /dev/null
+++ b/drivers/infiniband/hw/irdma/puda.c
@@ -0,0 +1,1694 @@
+// SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB
+/* Copyright (c) 2019, Intel Corporation. */
+
+#include "osdep.h"
+#include "status.h"
+#include "hmc.h"
+#include "defs.h"
+#include "type.h"
+#include "protos.h"
+#include "puda.h"
+#include "ws.h"
+
+static void irdma_ieq_receive(struct irdma_sc_vsi *vsi,
+			      struct irdma_puda_buf *buf);
+static void irdma_ieq_tx_compl(struct irdma_sc_vsi *vsi, void *sqwrid);
+static void irdma_ilq_putback_rcvbuf(struct irdma_sc_qp *qp, u32 wqe_idx);
+/**
+ * irdma_puda_get_listbuf - get buffer from puda list
+ * @list: list to use for buffers (ILQ or IEQ)
+ */
+static struct irdma_puda_buf *irdma_puda_get_listbuf(struct list_head *list)
+{
+	struct irdma_puda_buf *buf = NULL;
+
+	if (!list_empty(list)) {
+		buf = (struct irdma_puda_buf *)list->next;
+		list_del((struct list_head *)&buf->list);
+	}
+
+	return buf;
+}
+
+/**
+ * irdma_puda_get_bufpool - return buffer from resource
+ * @rsrc: resource to use for buffer
+ */
+struct irdma_puda_buf *irdma_puda_get_bufpool(struct irdma_puda_rsrc *rsrc)
+{
+	struct irdma_puda_buf *buf = NULL;
+	struct list_head *list = &rsrc->bufpool;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rsrc->bufpool_lock, flags);
+	buf = irdma_puda_get_listbuf(list);
+	if (buf) {
+		rsrc->avail_buf_count--;
+		buf->vsi = rsrc->vsi;
+	} else {
+		rsrc->stats_buf_alloc_fail++;
+	}
+	spin_unlock_irqrestore(&rsrc->bufpool_lock, flags);
+
+	return buf;
+}
+
+/**
+ * irdma_puda_ret_bufpool - return buffer to rsrc list
+ * @rsrc: resource to use for buffer
+ * @buf: buffer to return to resource
+ */
+void irdma_puda_ret_bufpool(struct irdma_puda_rsrc *rsrc,
+			    struct irdma_puda_buf *buf)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rsrc->bufpool_lock, flags);
+	list_add(&buf->list, &rsrc->bufpool);
+	spin_unlock_irqrestore(&rsrc->bufpool_lock, flags);
+	rsrc->avail_buf_count++;
+}
+
+/**
+ * irdma_puda_post_recvbuf - set wqe for rcv buffer
+ * @rsrc: resource ptr
+ * @wqe_idx: wqe index to use
+ * @buf: puda buffer for rcv q
+ * @initial: flag if during init time
+ */
+static void irdma_puda_post_recvbuf(struct irdma_puda_rsrc *rsrc,
+				    u32 wqe_idx,
+				    struct irdma_puda_buf *buf,
+				    bool initial)
+{
+	__le64 *wqe;
+	struct irdma_sc_qp *qp = &rsrc->qp;
+	u64 offset24 = 0;
+
+	qp->qp_uk.rq_wrid_array[wqe_idx] = (uintptr_t)buf;
+	wqe = qp->qp_uk.rq_base[wqe_idx].elem;
+	if (!initial)
+		get_64bit_val(wqe, 24, &offset24);
+
+	offset24 = (offset24) ? 0 : LS_64(1, IRDMAQPSQ_VALID);
+
+	set_64bit_val(wqe, 16, 0);
+	set_64bit_val(wqe, 0, buf->mem.pa);
+	if (qp->qp_uk.hw_attrs->hw_rev == IRDMA_GEN_1) {
+		set_64bit_val(wqe,
+			      8,
+			      LS_64(buf->mem.size, IRDMAQPSQ_GEN1_FRAG_LEN));
+	} else {
+		set_64bit_val(wqe,
+			      8,
+			      LS_64(buf->mem.size,
+				    IRDMAQPSQ_FRAG_LEN) | (offset24 & IRDMAQPSQ_VALID_M));
+	}
+	irdma_insert_wqe_hdr(wqe, offset24);
+}
+
+/**
+ * irdma_puda_replenish_rq - post rcv buffers
+ * @rsrc: resource to use for buffer
+ * @initial: flag if during init time
+ */
+static enum irdma_status_code
+irdma_puda_replenish_rq(struct irdma_puda_rsrc *rsrc, bool initial)
+{
+	u32 i;
+	u32 invalid_cnt = rsrc->rxq_invalid_cnt;
+	struct irdma_puda_buf *buf = NULL;
+
+	for (i = 0; i < invalid_cnt; i++) {
+		buf = irdma_puda_get_bufpool(rsrc);
+		if (!buf)
+			return IRDMA_ERR_list_empty;
+		irdma_puda_post_recvbuf(rsrc, rsrc->rx_wqe_idx, buf,
+					initial);
+		rsrc->rx_wqe_idx =
+		    ((rsrc->rx_wqe_idx + 1) % rsrc->rq_size);
+		rsrc->rxq_invalid_cnt--;
+	}
+
+	return 0;
+}
+
+/**
+ * irdma_puda_alloc_buf - allocate mem for buffer
+ * @dev: iwarp device
+ * @length: length of buffer
+ */
+static struct irdma_puda_buf *irdma_puda_alloc_buf(struct irdma_sc_dev *dev,
+						   u32 len)
+{
+	struct irdma_puda_buf *buf = NULL;
+	struct irdma_virt_mem buf_mem;
+	enum irdma_status_code ret;
+
+	ret = irdma_allocate_virt_mem(dev->hw, &buf_mem,
+				      sizeof(struct irdma_puda_buf));
+	if (ret) {
+		irdma_debug(dev, IRDMA_DEBUG_PUDA,
+			    "error virt_mem for buf\n");
+		return NULL;
+	}
+
+	buf = (struct irdma_puda_buf *)buf_mem.va;
+	ret = irdma_allocate_dma_mem(dev->hw, &buf->mem, len, 1);
+	if (ret) {
+		irdma_debug(dev, IRDMA_DEBUG_PUDA,
+			    "error dma_mem for buf\n");
+		irdma_free_virt_mem(dev->hw, &buf_mem);
+		return NULL;
+	}
+
+	buf->buf_mem.va = buf_mem.va;
+	buf->buf_mem.size = buf_mem.size;
+
+	return buf;
+}
+
+/**
+ * irdma_puda_dele_buf - delete buffer back to system
+ * @dev: iwarp device
+ * @buf: buffer to free
+ */
+static void irdma_puda_dele_buf(struct irdma_sc_dev *dev,
+				struct irdma_puda_buf *buf)
+{
+	irdma_free_dma_mem(dev->hw, &buf->mem);
+	irdma_free_virt_mem(dev->hw, &buf->buf_mem);
+}
+
+/**
+ * irdma_puda_get_next_send_wqe - return next wqe for processing
+ * @qp: puda qp for wqe
+ * @wqe_idx: wqe index for caller
+ */
+static __le64 *irdma_puda_get_next_send_wqe(struct irdma_qp_uk *qp, u32 *wqe_idx)
+{
+	__le64 *wqe = NULL;
+	enum irdma_status_code ret_code = 0;
+
+	*wqe_idx = IRDMA_RING_CURRENT_HEAD(qp->sq_ring);
+	if (!*wqe_idx)
+		qp->swqe_polarity = !qp->swqe_polarity;
+	IRDMA_RING_MOVE_HEAD(qp->sq_ring, ret_code);
+	if (ret_code)
+		return wqe;
+
+	wqe = qp->sq_base[*wqe_idx].elem;
+
+	return wqe;
+}
+
+/**
+ * irdma_puda_poll_info - poll cq for completion
+ * @cq: cq for poll
+ * @info: info return for successful completion
+ */
+static enum irdma_status_code
+irdma_puda_poll_info(struct irdma_sc_cq *cq, struct irdma_puda_cmpl_info *info)
+{
+	struct irdma_cq_uk *cq_uk = &cq->cq_uk;
+	u64 qword0, qword2, qword3, qword6;
+	__le64 *cqe;
+	__le64 *ext_cqe = NULL;
+	u64 qword7 = 0;
+	u64 comp_ctx;
+	bool valid_bit;
+	bool ext_valid = 0;
+	u32 major_err, minor_err;
+	u32 peek_head;
+	bool error;
+	u8 polarity;
+
+	cqe = IRDMA_GET_CURRENT_CQ_ELEM(&cq->cq_uk);
+	get_64bit_val(cqe, 24, &qword3);
+	valid_bit = (bool)RS_64(qword3, IRDMA_CQ_VALID);
+	if (valid_bit != cq_uk->polarity)
+		return IRDMA_ERR_Q_EMPTY;
+
+	if (cq->dev->hw_attrs.hw_rev > IRDMA_GEN_1)
+		ext_valid = (bool)RS_64(qword3, IRDMA_CQ_EXTCQE);
+
+	if (ext_valid) {
+		enum irdma_status_code ret = 0;
+
+		peek_head = (cq_uk->cq_ring.head + 1) % cq_uk->cq_ring.size;
+		ext_cqe = cq_uk->cq_base[peek_head].buf;
+		get_64bit_val(ext_cqe, 24, &qword7);
+		polarity = (u8)RS_64(qword7, IRDMA_CQ_VALID);
+		if (!peek_head)
+			polarity ^= 1;
+		if (polarity != cq_uk->polarity)
+			return IRDMA_ERR_Q_EMPTY;
+
+		IRDMA_RING_MOVE_HEAD(cq_uk->cq_ring, ret);
+		if (IRDMA_RING_CURRENT_HEAD(cq_uk->cq_ring) == 0)
+			cq_uk->polarity = !cq_uk->polarity;
+		/* update cq tail in cq shadow memory also */
+		IRDMA_RING_MOVE_TAIL(cq_uk->cq_ring);
+	}
+
+	irdma_debug_buf(cq->dev, IRDMA_DEBUG_PUDA, "PUDA CQE", cqe, 32);
+	if (ext_valid)
+		irdma_debug_buf(cq->dev, IRDMA_DEBUG_PUDA, "PUDA EXT-CQE",
+				ext_cqe, 32);
+
+	error = (bool)RS_64(qword3, IRDMA_CQ_ERROR);
+	if (error) {
+		irdma_debug(cq->dev, IRDMA_DEBUG_PUDA, "receive error\n");
+		major_err = (u32)(RS_64(qword3, IRDMA_CQ_MAJERR));
+		minor_err = (u32)(RS_64(qword3, IRDMA_CQ_MINERR));
+		info->compl_error = major_err << 16 | minor_err;
+		return IRDMA_ERR_CQ_COMPL_ERROR;
+	}
+
+	get_64bit_val(cqe, 0, &qword0);
+	get_64bit_val(cqe, 16, &qword2);
+
+	info->q_type = (u8)RS_64(qword3, IRDMA_CQ_SQ);
+	info->qp_id = (u32)RS_64(qword2, IRDMACQ_QPID);
+	if (cq->dev->hw_attrs.hw_rev > IRDMA_GEN_1)
+		info->ipv4 = (bool)RS_64(qword3, IRDMACQ_IPV4);
+
+	get_64bit_val(cqe, 8, &comp_ctx);
+	info->qp = (struct irdma_qp_uk *)(unsigned long)comp_ctx;
+	info->wqe_idx = (u32)RS_64(qword3, IRDMA_CQ_WQEIDX);
+
+	if (info->q_type == IRDMA_CQE_QTYPE_RQ) {
+		if (ext_valid) {
+			info->vlan_valid = (bool)RS_64(qword7, IRDMA_CQ_UDVLANVALID);
+			if (info->vlan_valid) {
+				get_64bit_val(ext_cqe, 16, &qword6);
+				info->vlan = (u16)RS_64(qword6, IRDMA_CQ_UDVLAN);
+			}
+			info->smac_valid = (bool)RS_64(qword7, IRDMA_CQ_UDSMACVALID);
+			if (info->smac_valid) {
+				get_64bit_val(ext_cqe, 16, &qword6);
+				info->smac[0] = (u8)((qword6 >> 40) & 0xFF);
+				info->smac[1] = (u8)((qword6 >> 32) & 0xFF);
+				info->smac[2] = (u8)((qword6 >> 24) & 0xFF);
+				info->smac[3] = (u8)((qword6 >> 16) & 0xFF);
+				info->smac[4] = (u8)((qword6 >> 8) & 0xFF);
+				info->smac[5] = (u8)(qword6 & 0xFF);
+			}
+		}
+
+		if (cq->dev->hw_attrs.hw_rev == IRDMA_GEN_1) {
+			info->vlan_valid = (bool)RS_64(qword3, IRDMA_VLAN_TAG_VALID);
+			info->l4proto = (u8)RS_64(qword2, IRDMA_UDA_L4PROTO);
+			info->l3proto = (u8)RS_64(qword2, IRDMA_UDA_L3PROTO);
+		}
+
+		info->payload_len = (u32)RS_64(qword0, IRDMACQ_PAYLDLEN);
+	}
+
+	return 0;
+}
+
+/**
+ * irdma_puda_poll_completion - processes completion for cq
+ * @dev: iwarp device
+ * @cq: cq getting interrupt
+ * @compl_err: return any completion err
+ */
+enum irdma_status_code irdma_puda_poll_cmpl(struct irdma_sc_dev *dev,
+					    struct irdma_sc_cq *cq,
+					    u32 *compl_err)
+{
+	struct irdma_qp_uk *qp;
+	struct irdma_cq_uk *cq_uk = &cq->cq_uk;
+	struct irdma_puda_cmpl_info info = {};
+	enum irdma_status_code ret = 0;
+	struct irdma_puda_buf *buf;
+	struct irdma_puda_rsrc *rsrc;
+	void *sqwrid;
+	u8 cq_type = cq->cq_type;
+	unsigned long flags;
+
+	if (cq_type == IRDMA_CQ_TYPE_ILQ || cq_type == IRDMA_CQ_TYPE_IEQ) {
+		rsrc = (cq_type == IRDMA_CQ_TYPE_ILQ) ?
+			cq->vsi->ilq : cq->vsi->ieq;
+	} else {
+		irdma_debug(dev, IRDMA_DEBUG_PUDA, "qp_type error\n");
+		return IRDMA_ERR_BAD_PTR;
+	}
+
+	ret = irdma_puda_poll_info(cq, &info);
+	*compl_err = info.compl_error;
+	if (ret == IRDMA_ERR_Q_EMPTY)
+		return ret;
+	if (ret)
+		goto done;
+
+	qp = info.qp;
+	if (!qp || !rsrc) {
+		ret = IRDMA_ERR_BAD_PTR;
+		goto done;
+	}
+
+	if (qp->qp_id != rsrc->qp_id) {
+		ret = IRDMA_ERR_BAD_PTR;
+		goto done;
+	}
+
+	if (info.q_type == IRDMA_CQE_QTYPE_RQ) {
+		buf = (struct irdma_puda_buf *)
+		      (uintptr_t)qp->rq_wrid_array[info.wqe_idx];
+		/* Get all the tcpip information in the buf header */
+		ret = irdma_puda_get_tcpip_info(&info, buf);
+		if (ret) {
+			rsrc->stats_rcvd_pkt_err++;
+			if (cq_type == IRDMA_CQ_TYPE_ILQ) {
+				irdma_ilq_putback_rcvbuf(&rsrc->qp,
+							 info.wqe_idx);
+			} else {
+				irdma_puda_ret_bufpool(rsrc, buf);
+				irdma_puda_replenish_rq(rsrc, false);
+			}
+			goto done;
+		}
+
+		rsrc->stats_pkt_rcvd++;
+		rsrc->compl_rxwqe_idx = info.wqe_idx;
+		irdma_debug(dev, IRDMA_DEBUG_PUDA, "RQ completion\n");
+		rsrc->receive(rsrc->vsi, buf);
+		if (cq_type == IRDMA_CQ_TYPE_ILQ)
+			irdma_ilq_putback_rcvbuf(&rsrc->qp, info.wqe_idx);
+		else
+			irdma_puda_replenish_rq(rsrc, false);
+
+	} else {
+		irdma_debug(dev, IRDMA_DEBUG_PUDA, "SQ completion\n");
+		sqwrid = (void *)(uintptr_t)
+			 qp->sq_wrtrk_array[info.wqe_idx].wrid;
+		IRDMA_RING_SET_TAIL(qp->sq_ring, info.wqe_idx);
+		rsrc->xmit_complete(rsrc->vsi, sqwrid);
+		spin_lock_irqsave(&rsrc->bufpool_lock, flags);
+		rsrc->tx_wqe_avail_cnt++;
+		spin_unlock_irqrestore(&rsrc->bufpool_lock, flags);
+		if (!list_empty(&rsrc->txpend))
+			irdma_puda_send_buf(rsrc, NULL);
+	}
+
+done:
+	IRDMA_RING_MOVE_HEAD(cq_uk->cq_ring, ret);
+	if (IRDMA_RING_CURRENT_HEAD(cq_uk->cq_ring) == 0)
+		cq_uk->polarity = !cq_uk->polarity;
+	/* update cq tail in cq shadow memory also */
+	IRDMA_RING_MOVE_TAIL(cq_uk->cq_ring);
+	set_64bit_val(cq_uk->shadow_area,
+		      0,
+		      IRDMA_RING_CURRENT_HEAD(cq_uk->cq_ring));
+
+	return 0;
+}
+
+/**
+ * irdma_puda_send - complete send wqe for transmit
+ * @qp: puda qp for send
+ * @info: buffer information for transmit
+ */
+enum irdma_status_code irdma_puda_send(struct irdma_sc_qp *qp,
+				       struct irdma_puda_send_info *info)
+{
+	__le64 *wqe;
+	u32 iplen, l4len;
+	u64 hdr[2];
+	u32 wqe_idx;
+	u8 iipt;
+
+	/* number of 32 bits DWORDS in header */
+	l4len = info->tcplen >> 2;
+	if (info->ipv4) {
+		iipt = 3;
+		iplen = 5;
+	} else {
+		iipt = 1;
+		iplen = 10;
+	}
+
+	wqe = irdma_puda_get_next_send_wqe(&qp->qp_uk, &wqe_idx);
+	if (!wqe)
+		return IRDMA_ERR_QP_TOOMANY_WRS_POSTED;
+
+	qp->qp_uk.sq_wrtrk_array[wqe_idx].wrid = (uintptr_t)info->scratch;
+	/* Third line of WQE descriptor */
+	/* maclen is in words */
+
+	if (qp->dev->hw_attrs.hw_rev > IRDMA_GEN_1) {
+		hdr[0] = 0; /* Dest_QPN and Dest_QKey only for UD */
+		hdr[1] = LS_64(IRDMA_OP_TYPE_SEND, IRDMA_UDA_QPSQ_OPCODE) |
+			LS_64(l4len, IRDMA_UDA_QPSQ_L4LEN) |
+			LS_64(info->ah_id, IRDMAQPSQ_AHID) |
+			LS_64(1, IRDMA_UDA_QPSQ_SIGCOMPL) |
+			LS_64(qp->qp_uk.swqe_polarity, IRDMA_UDA_QPSQ_VALID);
+
+		/* Forth line of WQE descriptor */
+
+		set_64bit_val(wqe, 0, info->paddr);
+		set_64bit_val(wqe,
+			      8,
+			      LS_64(info->len, IRDMAQPSQ_FRAG_LEN) |
+			      LS_64(qp->qp_uk.swqe_polarity, IRDMA_UDA_QPSQ_VALID));
+	} else {
+		hdr[0] = LS_64((info->maclen >> 1), IRDMA_UDA_QPSQ_MACLEN) |
+			 LS_64(iplen, IRDMA_UDA_QPSQ_IPLEN) |
+			 LS_64(1, IRDMA_UDA_QPSQ_L4T) |
+			 LS_64(iipt, IRDMA_UDA_QPSQ_IIPT) |
+			 LS_64(l4len, IRDMA_GEN1_UDA_QPSQ_L4LEN);
+
+		hdr[1] = LS_64(IRDMA_OP_TYPE_SEND, IRDMA_UDA_QPSQ_OPCODE) |
+			 LS_64(1, IRDMA_UDA_QPSQ_SIGCOMPL) |
+			 LS_64(info->do_lpb, IRDMA_UDA_QPSQ_DOLOOPBACK) |
+			 LS_64(qp->qp_uk.swqe_polarity, IRDMA_UDA_QPSQ_VALID);
+
+		/* Forth line of WQE descriptor */
+
+		set_64bit_val(wqe, 0, info->paddr);
+		set_64bit_val(wqe, 8,
+			      LS_64(info->len, IRDMAQPSQ_GEN1_FRAG_LEN));
+	}
+
+	set_64bit_val(wqe, 16, hdr[0]);
+	irdma_insert_wqe_hdr(wqe, hdr[1]);
+
+	irdma_debug_buf(qp->dev, IRDMA_DEBUG_PUDA, "PUDA SEND WQE", wqe, 32);
+	irdma_qp_post_wr(&qp->qp_uk);
+	return 0;
+}
+
+/**
+ * irdma_puda_send_buf - transmit puda buffer
+ * @rsrc: resource to use for buffer
+ * @buf: puda buffer to transmit
+ */
+void irdma_puda_send_buf(struct irdma_puda_rsrc *rsrc,
+			 struct irdma_puda_buf *buf)
+{
+	struct irdma_puda_send_info info;
+	enum irdma_status_code ret = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rsrc->bufpool_lock, flags);
+	/* if no wqe available or not from a completion and we have
+	 * pending buffers, we must queue new buffer
+	 */
+	if (!rsrc->tx_wqe_avail_cnt || (buf && !list_empty(&rsrc->txpend))) {
+		list_add_tail(&buf->list, &rsrc->txpend);
+		spin_unlock_irqrestore(&rsrc->bufpool_lock, flags);
+		rsrc->stats_sent_pkt_q++;
+		if (rsrc->type == IRDMA_PUDA_RSRC_TYPE_ILQ)
+			irdma_debug(rsrc->dev, IRDMA_DEBUG_PUDA,
+				    "adding to txpend\n");
+		return;
+	}
+	rsrc->tx_wqe_avail_cnt--;
+	/* if we are coming from a completion and have pending buffers
+	 * then Get one from pending list
+	 */
+	if (!buf) {
+		buf = irdma_puda_get_listbuf(&rsrc->txpend);
+		if (!buf)
+			goto done;
+	}
+
+	info.scratch = (void *)buf;
+	info.paddr = buf->mem.pa;
+	info.len = buf->totallen;
+	info.tcplen = buf->tcphlen;
+	info.ipv4 = buf->ipv4;
+
+	if (rsrc->dev->hw_attrs.hw_rev > IRDMA_GEN_1) {
+		info.ah_id = buf->ah_id;
+	} else {
+		info.maclen = buf->maclen;
+		info.do_lpb = (rsrc->type == IRDMA_PUDA_RSRC_TYPE_IEQ);
+	}
+
+	ret = irdma_puda_send(&rsrc->qp, &info);
+	if (ret) {
+		rsrc->tx_wqe_avail_cnt++;
+		rsrc->stats_sent_pkt_q++;
+		list_add(&buf->list, &rsrc->txpend);
+		if (rsrc->type == IRDMA_PUDA_RSRC_TYPE_ILQ)
+			irdma_debug(rsrc->dev, IRDMA_DEBUG_PUDA,
+				    "adding to puda_send\n");
+	} else {
+		rsrc->stats_pkt_sent++;
+	}
+done:
+	spin_unlock_irqrestore(&rsrc->bufpool_lock, flags);
+}
+
+/**
+ * irdma_puda_qp_setctx - during init, set qp's context
+ * @rsrc: qp's resource
+ */
+static void irdma_puda_qp_setctx(struct irdma_puda_rsrc *rsrc)
+{
+	struct irdma_sc_qp *qp = &rsrc->qp;
+	__le64 *qp_ctx = qp->hw_host_ctx;
+
+	set_64bit_val(qp_ctx, 8, qp->sq_pa);
+	set_64bit_val(qp_ctx, 16, qp->rq_pa);
+	set_64bit_val(qp_ctx, 24,
+		      LS_64(qp->hw_rq_size, IRDMAQPC_RQSIZE) |
+		      LS_64(qp->hw_sq_size, IRDMAQPC_SQSIZE));
+	set_64bit_val(qp_ctx, 48,
+		      LS_64(rsrc->buf_size, IRDMAQPC_SNDMSS));
+	set_64bit_val(qp_ctx, 56, 0);
+	if (qp->dev->hw_attrs.hw_rev == IRDMA_GEN_1)
+		set_64bit_val(qp_ctx, 64, 1);
+	set_64bit_val(qp_ctx, 136,
+		      LS_64(rsrc->cq_id, IRDMAQPC_TXCQNUM) |
+		      LS_64(rsrc->cq_id, IRDMAQPC_RXCQNUM));
+	set_64bit_val(qp_ctx, 144,
+		      LS_64(rsrc->stats_idx, IRDMAQPC_STAT_INDEX));
+	set_64bit_val(qp_ctx, 160,
+		      LS_64(1, IRDMAQPC_PRIVEN) |
+		      LS_64(rsrc->stats_idx_valid, IRDMAQPC_USESTATSINSTANCE));
+	set_64bit_val(qp_ctx, 168,
+		      LS_64((uintptr_t)qp, IRDMAQPC_QPCOMPCTX));
+	set_64bit_val(qp_ctx, 176,
+		      LS_64(qp->sq_tph_val, IRDMAQPC_SQTPHVAL) |
+		      LS_64(qp->rq_tph_val, IRDMAQPC_RQTPHVAL) |
+		      LS_64(qp->qs_handle, IRDMAQPC_QSHANDLE));
+
+	irdma_debug_buf(rsrc->dev, IRDMA_DEBUG_PUDA, "PUDA QP CONTEXT",
+			qp_ctx, IRDMA_QP_CTX_SIZE);
+}
+
+/**
+ * irdma_puda_qp_wqe - setup wqe for qp create
+ * @dev: Device
+ * @qp: Resource qp
+ */
+static enum irdma_status_code irdma_puda_qp_wqe(struct irdma_sc_dev *dev,
+						struct irdma_sc_qp *qp)
+{
+	struct irdma_sc_cqp *cqp;
+	__le64 *wqe;
+	u64 hdr;
+	struct irdma_ccq_cqe_info compl_info;
+	enum irdma_status_code status = 0;
+
+	cqp = dev->cqp;
+	wqe = irdma_sc_cqp_get_next_send_wqe(cqp, 0);
+	if (!wqe)
+		return IRDMA_ERR_RING_FULL;
+
+	set_64bit_val(wqe, 16, qp->hw_host_ctx_pa);
+	set_64bit_val(wqe, 40, qp->shadow_area_pa);
+
+	hdr = qp->qp_uk.qp_id |
+	      LS_64(IRDMA_CQP_OP_CREATE_QP, IRDMA_CQPSQ_OPCODE) |
+	      LS_64(IRDMA_QP_TYPE_UDA, IRDMA_CQPSQ_QP_QPTYPE) |
+	      LS_64(1, IRDMA_CQPSQ_QP_CQNUMVALID) |
+	      LS_64(2, IRDMA_CQPSQ_QP_NEXTIWSTATE) |
+	      LS_64(cqp->polarity, IRDMA_CQPSQ_WQEVALID);
+	irdma_insert_wqe_hdr(wqe, hdr);
+
+	irdma_debug_buf(cqp->dev, IRDMA_DEBUG_PUDA, "PUDA QP CREATE", wqe, 40);
+	irdma_sc_cqp_post_sq(cqp);
+	status = dev->cqp_ops->poll_for_cqp_op_done(dev->cqp,
+						    IRDMA_CQP_OP_CREATE_QP,
+						    &compl_info);
+
+	return status;
+}
+
+/**
+ * irdma_puda_qp_create - create qp for resource
+ * @rsrc: resource to use for buffer
+ */
+static enum irdma_status_code irdma_puda_qp_create(struct irdma_puda_rsrc *rsrc)
+{
+	struct irdma_sc_qp *qp = &rsrc->qp;
+	struct irdma_qp_uk *ukqp = &qp->qp_uk;
+	enum irdma_status_code ret = 0;
+	u32 sq_size, rq_size, t_size;
+	struct irdma_dma_mem *mem;
+
+	sq_size = rsrc->sq_size * IRDMA_QP_WQE_MIN_SIZE;
+	rq_size = rsrc->rq_size * IRDMA_QP_WQE_MIN_SIZE;
+	t_size = (sq_size + rq_size + (IRDMA_SHADOW_AREA_SIZE << 3) +
+		  IRDMA_QP_CTX_SIZE);
+	/* Get page aligned memory */
+	ret = irdma_allocate_dma_mem(rsrc->dev->hw, &rsrc->qpmem, t_size,
+				     IRDMA_HW_PAGE_SIZE);
+	if (ret) {
+		irdma_debug(rsrc->dev, IRDMA_DEBUG_PUDA, "error dma mem\n");
+		return ret;
+	}
+
+	mem = &rsrc->qpmem;
+	memset(mem->va, 0, t_size);
+	qp->hw_sq_size = irdma_get_encoded_wqe_size(rsrc->sq_size, false);
+	qp->hw_rq_size = irdma_get_encoded_wqe_size(rsrc->rq_size, false);
+	qp->pd = &rsrc->sc_pd;
+	qp->qp_type = IRDMA_QP_TYPE_UDA;
+	qp->dev = rsrc->dev;
+	qp->back_qp = (void *)rsrc;
+	qp->sq_pa = mem->pa;
+	qp->rq_pa = qp->sq_pa + sq_size;
+	qp->vsi = rsrc->vsi;
+	ukqp->sq_base = mem->va;
+	ukqp->rq_base = &ukqp->sq_base[rsrc->sq_size];
+	ukqp->shadow_area = ukqp->rq_base[rsrc->rq_size].elem;
+	ukqp->hw_attrs = &qp->dev->hw_attrs;
+	qp->shadow_area_pa = qp->rq_pa + rq_size;
+	qp->hw_host_ctx = ukqp->shadow_area + IRDMA_SHADOW_AREA_SIZE;
+	qp->hw_host_ctx_pa = qp->shadow_area_pa + (IRDMA_SHADOW_AREA_SIZE << 3);
+	ukqp->qp_id = rsrc->qp_id;
+	ukqp->sq_wrtrk_array = rsrc->sq_wrtrk_array;
+	ukqp->rq_wrid_array = rsrc->rq_wrid_array;
+	ukqp->sq_size = rsrc->sq_size;
+	ukqp->rq_size = rsrc->rq_size;
+
+	IRDMA_RING_INIT(ukqp->sq_ring, ukqp->sq_size);
+	IRDMA_RING_INIT(ukqp->initial_ring, ukqp->sq_size);
+	IRDMA_RING_INIT(ukqp->rq_ring, ukqp->rq_size);
+	ukqp->wqe_alloc_db = qp->pd->dev->wqe_alloc_db;
+
+	ret = rsrc->dev->ws_add(qp->vsi, qp->user_pri);
+	if (ret) {
+		irdma_free_dma_mem(rsrc->dev->hw, &rsrc->qpmem);
+		return ret;
+	}
+
+	irdma_qp_add_qos(qp);
+	irdma_puda_qp_setctx(rsrc);
+
+	if (rsrc->dev->ceq_valid)
+		ret = irdma_cqp_qp_create_cmd(rsrc->dev, qp);
+	else
+		ret = irdma_puda_qp_wqe(rsrc->dev, qp);
+	if (ret) {
+		irdma_qp_rem_qos(qp);
+		rsrc->dev->ws_remove(qp->vsi, qp->user_pri);
+		irdma_free_dma_mem(rsrc->dev->hw, &rsrc->qpmem);
+	}
+
+	return ret;
+}
+
+/**
+ * irdma_puda_cq_wqe - setup wqe for cp create
+ * @dev: Device
+ * @cq: resource for cq
+ */
+static enum irdma_status_code irdma_puda_cq_wqe(struct irdma_sc_dev *dev,
+						struct irdma_sc_cq *cq)
+{
+	__le64 *wqe;
+	struct irdma_sc_cqp *cqp;
+	u64 hdr;
+	struct irdma_ccq_cqe_info compl_info;
+	enum irdma_status_code status = 0;
+
+	cqp = dev->cqp;
+	wqe = irdma_sc_cqp_get_next_send_wqe(cqp, 0);
+	if (!wqe)
+		return IRDMA_ERR_RING_FULL;
+
+	set_64bit_val(wqe, 0, cq->cq_uk.cq_size);
+	set_64bit_val(wqe, 8, RS_64_1(cq, 1));
+	set_64bit_val(wqe,
+		      16,
+		      LS_64(cq->shadow_read_threshold,
+			    IRDMA_CQPSQ_CQ_SHADOW_READ_THRESHOLD));
+	set_64bit_val(wqe, 32, cq->cq_pa);
+	set_64bit_val(wqe, 40, cq->shadow_area_pa);
+	set_64bit_val(wqe,
+		      56,
+		      LS_64(cq->tph_val, IRDMA_CQPSQ_TPHVAL) |
+		      LS_64(cq->vsi->vsi_idx, IRDMA_CQPSQ_VSIIDX));
+
+	hdr = cq->cq_uk.cq_id |
+	      LS_64(IRDMA_CQP_OP_CREATE_CQ, IRDMA_CQPSQ_OPCODE) |
+	      LS_64(1, IRDMA_CQPSQ_CQ_CHKOVERFLOW) |
+	      LS_64(1, IRDMA_CQPSQ_CQ_ENCEQEMASK) |
+	      LS_64(1, IRDMA_CQPSQ_CQ_CEQIDVALID) |
+	      LS_64(cqp->polarity, IRDMA_CQPSQ_WQEVALID);
+	irdma_insert_wqe_hdr(wqe, hdr);
+
+	irdma_debug_buf(dev, IRDMA_DEBUG_PUDA, "PUDA CREATE CQ",
+			wqe, IRDMA_CQP_WQE_SIZE * 8);
+	irdma_sc_cqp_post_sq(dev->cqp);
+	status = dev->cqp_ops->poll_for_cqp_op_done(dev->cqp,
+						    IRDMA_CQP_OP_CREATE_CQ,
+						    &compl_info);
+
+	return status;
+}
+
+/**
+ * irdma_puda_cq_create - create cq for resource
+ * @rsrc: resource for which cq to create
+ */
+static enum irdma_status_code irdma_puda_cq_create(struct irdma_puda_rsrc *rsrc)
+{
+	struct irdma_sc_dev *dev = rsrc->dev;
+	struct irdma_sc_cq *cq = &rsrc->cq;
+	enum irdma_status_code ret = 0;
+	u32 tsize, cqsize;
+	struct irdma_dma_mem *mem;
+	struct irdma_cq_init_info info = {};
+	struct irdma_cq_uk_init_info *init_info = &info.cq_uk_init_info;
+
+	cq->vsi = rsrc->vsi;
+	cqsize = rsrc->cq_size * (sizeof(struct irdma_cqe));
+	tsize = cqsize + sizeof(struct irdma_cq_shadow_area);
+	ret = irdma_allocate_dma_mem(dev->hw, &rsrc->cqmem, tsize,
+				     IRDMA_CQ0_ALIGNMENT);
+	if (ret)
+		return ret;
+
+	mem = &rsrc->cqmem;
+	info.dev = dev;
+	info.type = (rsrc->type == IRDMA_PUDA_RSRC_TYPE_ILQ) ?
+		    IRDMA_CQ_TYPE_ILQ : IRDMA_CQ_TYPE_IEQ;
+	info.shadow_read_threshold = rsrc->cq_size >> 2;
+	info.cq_base_pa = mem->pa;
+	info.shadow_area_pa = mem->pa + cqsize;
+	init_info->cq_base = mem->va;
+	init_info->shadow_area = (__le64 *)((u8 *)mem->va + cqsize);
+	init_info->cq_size = rsrc->cq_size;
+	init_info->cq_id = rsrc->cq_id;
+	info.ceqe_mask = true;
+	info.ceq_id_valid = true;
+	info.vsi = rsrc->vsi;
+
+	ret = dev->iw_priv_cq_ops->cq_init(cq, &info);
+	if (ret)
+		goto error;
+
+	if (rsrc->dev->ceq_valid)
+		ret = irdma_cqp_cq_create_cmd(dev, cq);
+	else
+		ret = irdma_puda_cq_wqe(dev, cq);
+error:
+	if (ret)
+		irdma_free_dma_mem(dev->hw, &rsrc->cqmem);
+
+	return ret;
+}
+
+/**
+ * irdma_puda_free_qp - free qp for resource
+ * @rsrc: resource for which qp to free
+ */
+static void irdma_puda_free_qp(struct irdma_puda_rsrc *rsrc)
+{
+	enum irdma_status_code ret;
+	struct irdma_ccq_cqe_info compl_info;
+	struct irdma_sc_dev *dev = rsrc->dev;
+
+	if (rsrc->dev->ceq_valid) {
+		irdma_cqp_qp_destroy_cmd(dev, &rsrc->qp);
+		rsrc->dev->ws_remove(rsrc->qp.vsi, rsrc->qp.user_pri);
+		return;
+	}
+
+	ret = dev->iw_priv_qp_ops->qp_destroy(&rsrc->qp, 0, false,
+					      true, true);
+	if (ret)
+		irdma_debug(dev, IRDMA_DEBUG_PUDA,
+			    "error puda qp destroy wqe, status = %d\n",
+			    ret);
+	if (!ret) {
+		ret = dev->cqp_ops->poll_for_cqp_op_done(dev->cqp,
+							 IRDMA_CQP_OP_DESTROY_QP,
+							 &compl_info);
+		if (ret)
+			irdma_debug(dev, IRDMA_DEBUG_PUDA,
+				    "error puda qp destroy failed, status = %d\n",
+				    ret);
+	}
+	irdma_qp_rem_qos(&rsrc->qp);
+	rsrc->dev->ws_remove(rsrc->qp.vsi, rsrc->qp.user_pri);
+}
+
+/**
+ * irdma_puda_free_cq - free cq for resource
+ * @rsrc: resource for which cq to free
+ */
+static void irdma_puda_free_cq(struct irdma_puda_rsrc *rsrc)
+{
+	enum irdma_status_code ret;
+	struct irdma_ccq_cqe_info compl_info;
+	struct irdma_sc_dev *dev = rsrc->dev;
+
+	if (rsrc->dev->ceq_valid) {
+		irdma_cqp_cq_destroy_cmd(dev, &rsrc->cq);
+		return;
+	}
+
+	ret = dev->iw_priv_cq_ops->cq_destroy(&rsrc->cq, 0, true);
+	if (ret)
+		irdma_debug(dev, IRDMA_DEBUG_PUDA,
+			    "error ieq cq destroy\n");
+	if (!ret) {
+		ret = dev->cqp_ops->poll_for_cqp_op_done(dev->cqp,
+							 IRDMA_CQP_OP_DESTROY_CQ,
+							 &compl_info);
+		if (ret)
+			irdma_debug(dev, IRDMA_DEBUG_PUDA,
+				    "error ieq qp destroy done\n");
+	}
+}
+
+/**
+ * irdma_puda_dele_resources - delete all resources during close
+ * @dev: iwarp device
+ * @type: type of resource to dele
+ * @reset: true if reset chip
+ */
+void irdma_puda_dele_rsrc(struct irdma_sc_vsi *vsi,
+			  enum puda_rsrc_type type,
+			  bool reset)
+{
+	struct irdma_sc_dev *dev = vsi->dev;
+	struct irdma_puda_rsrc *rsrc;
+	struct irdma_puda_buf *buf = NULL;
+	struct irdma_puda_buf *nextbuf = NULL;
+	struct irdma_virt_mem *vmem;
+	struct irdma_sc_ceq *ceq;
+
+	ceq = vsi->dev->ceq[0];
+	switch (type) {
+	case IRDMA_PUDA_RSRC_TYPE_ILQ:
+		rsrc = vsi->ilq;
+		vmem = &vsi->ilq_mem;
+		vsi->ilq = NULL;
+		if (ceq && ceq->reg_cq)
+			irdma_sc_remove_cq_ctx(ceq, &rsrc->cq);
+		break;
+	case IRDMA_PUDA_RSRC_TYPE_IEQ:
+		rsrc = vsi->ieq;
+		vmem = &vsi->ieq_mem;
+		vsi->ieq = NULL;
+		if (ceq && ceq->reg_cq)
+			irdma_sc_remove_cq_ctx(ceq, &rsrc->cq);
+		break;
+	default:
+		irdma_debug(dev, IRDMA_DEBUG_PUDA, "error resource type = 0x%x\n",
+			    type);
+		return;
+	}
+
+	switch (rsrc->cmpl) {
+	case PUDA_HASH_CRC_COMPLETE:
+		irdma_free_hash_desc(rsrc->hash_desc);
+	case PUDA_QP_CREATED:
+		if (!reset)
+			irdma_puda_free_qp(rsrc);
+
+		irdma_free_dma_mem(dev->hw, &rsrc->qpmem);
+		/* fallthrough */
+	case PUDA_CQ_CREATED:
+		if (!reset)
+			irdma_puda_free_cq(rsrc);
+
+		irdma_free_dma_mem(dev->hw, &rsrc->cqmem);
+		break;
+	default:
+		irdma_debug(rsrc->dev, IRDMA_DEBUG_PUDA, "error no resources\n");
+		break;
+	}
+	/* Free all allocated puda buffers for both tx and rx */
+	buf = rsrc->alloclist;
+	while (buf) {
+		nextbuf = buf->next;
+		irdma_puda_dele_buf(dev, buf);
+		buf = nextbuf;
+		rsrc->alloc_buf_count--;
+	}
+
+	irdma_free_virt_mem(dev->hw, vmem);
+}
+
+/**
+ * irdma_puda_allocbufs - allocate buffers for resource
+ * @rsrc: resource for buffer allocation
+ * @count: number of buffers to create
+ */
+static enum irdma_status_code irdma_puda_allocbufs(struct irdma_puda_rsrc *rsrc,
+						   u32 count)
+{
+	u32 i;
+	struct irdma_puda_buf *buf;
+	struct irdma_puda_buf *nextbuf;
+
+	for (i = 0; i < count; i++) {
+		buf = irdma_puda_alloc_buf(rsrc->dev, rsrc->buf_size);
+		if (!buf) {
+			rsrc->stats_buf_alloc_fail++;
+			return IRDMA_ERR_NO_MEMORY;
+		}
+		irdma_puda_ret_bufpool(rsrc, buf);
+		rsrc->alloc_buf_count++;
+		if (!rsrc->alloclist) {
+			rsrc->alloclist = buf;
+		} else {
+			nextbuf = rsrc->alloclist;
+			rsrc->alloclist = buf;
+			buf->next = nextbuf;
+		}
+	}
+
+	rsrc->avail_buf_count = rsrc->alloc_buf_count;
+
+	return 0;
+}
+
+/**
+ * irdma_puda_create_rsrc - create resource (ilq or ieq)
+ * @vsi: sc VSI struct
+ * @info: resource information
+ */
+enum irdma_status_code irdma_puda_create_rsrc(struct irdma_sc_vsi *vsi,
+					      struct irdma_puda_rsrc_info *info)
+{
+	struct irdma_sc_dev *dev = vsi->dev;
+	enum irdma_status_code ret = 0;
+	struct irdma_puda_rsrc *rsrc;
+	u32 pudasize;
+	u32 sqwridsize, rqwridsize;
+	struct irdma_virt_mem *vmem;
+	struct irdma_sc_ceq *ceq;
+
+	info->count = 1;
+	pudasize = sizeof(struct irdma_puda_rsrc);
+	sqwridsize = info->sq_size * sizeof(struct irdma_sq_uk_wr_trk_info);
+	rqwridsize = info->rq_size * 8;
+	switch (info->type) {
+	case IRDMA_PUDA_RSRC_TYPE_ILQ:
+		vmem = &vsi->ilq_mem;
+		break;
+	case IRDMA_PUDA_RSRC_TYPE_IEQ:
+		vmem = &vsi->ieq_mem;
+		break;
+	default:
+		return IRDMA_NOT_SUPPORTED;
+	}
+	ret = irdma_allocate_virt_mem(dev->hw, vmem,
+				      pudasize + sqwridsize + rqwridsize);
+	if (ret)
+		return ret;
+
+	rsrc = (struct irdma_puda_rsrc *)vmem->va;
+	spin_lock_init(&rsrc->bufpool_lock);
+	switch (info->type) {
+	case IRDMA_PUDA_RSRC_TYPE_ILQ:
+		vsi->ilq = (struct irdma_puda_rsrc *)vmem->va;
+		vsi->ilq_count = info->count;
+		rsrc->receive = info->receive;
+		rsrc->xmit_complete = info->xmit_complete;
+		break;
+	case IRDMA_PUDA_RSRC_TYPE_IEQ:
+		vsi->ieq_count = info->count;
+		vsi->ieq = (struct irdma_puda_rsrc *)vmem->va;
+		rsrc->receive = irdma_ieq_receive;
+		rsrc->xmit_complete = irdma_ieq_tx_compl;
+		break;
+	default:
+		return IRDMA_NOT_SUPPORTED;
+	}
+
+	rsrc->type = info->type;
+	rsrc->sq_wrtrk_array = (struct irdma_sq_uk_wr_trk_info *)
+			       ((u8 *)vmem->va + pudasize);
+	rsrc->rq_wrid_array = (u64 *)((u8 *)vmem->va + pudasize + sqwridsize);
+	/* Initialize all ieq lists */
+	INIT_LIST_HEAD(&rsrc->bufpool);
+	INIT_LIST_HEAD(&rsrc->txpend);
+
+	rsrc->tx_wqe_avail_cnt = info->sq_size - 1;
+	dev->iw_pd_ops->pd_init(dev, &rsrc->sc_pd, info->pd_id, -1);
+	rsrc->qp_id = info->qp_id;
+	rsrc->cq_id = info->cq_id;
+	rsrc->sq_size = info->sq_size;
+	rsrc->rq_size = info->rq_size;
+	rsrc->cq_size = info->rq_size + info->sq_size;
+	if (dev->hw_attrs.hw_rev > IRDMA_GEN_1) {
+		if (rsrc->type == IRDMA_PUDA_RSRC_TYPE_ILQ)
+			rsrc->cq_size += info->rq_size;
+	}
+	rsrc->buf_size = info->buf_size;
+	rsrc->dev = dev;
+	rsrc->vsi = vsi;
+	rsrc->stats_idx = info->stats_idx;
+	rsrc->stats_idx_valid = info->stats_idx_valid;
+
+	ret = irdma_puda_cq_create(rsrc);
+	if (!ret) {
+		rsrc->cmpl = PUDA_CQ_CREATED;
+		ret = irdma_puda_qp_create(rsrc);
+	}
+	if (ret) {
+		irdma_debug(dev, IRDMA_DEBUG_PUDA,
+			    "error qp_create type=%d, status=%d\n",
+			    rsrc->type, ret);
+		goto error;
+	}
+	rsrc->cmpl = PUDA_QP_CREATED;
+
+	ceq = vsi->dev->ceq[0];
+	if (ceq->reg_cq)
+		ret = irdma_sc_add_cq_ctx(ceq, &rsrc->cq);
+
+	if (ret) {
+		irdma_debug(dev, IRDMA_DEBUG_PUDA, "error unable to add to cq_ctx\n");
+		goto error;
+	}
+
+	ret = irdma_puda_allocbufs(rsrc, info->tx_buf_cnt + info->rq_size);
+	if (ret) {
+		irdma_debug(dev, IRDMA_DEBUG_PUDA, "error allloc_buf\n");
+		goto error;
+	}
+
+	rsrc->rxq_invalid_cnt = info->rq_size;
+	ret = irdma_puda_replenish_rq(rsrc, true);
+	if (ret)
+		goto error;
+
+	if (info->type == IRDMA_PUDA_RSRC_TYPE_IEQ) {
+		if (!irdma_init_hash_desc(&rsrc->hash_desc)) {
+			rsrc->check_crc = true;
+			rsrc->cmpl = PUDA_HASH_CRC_COMPLETE;
+			ret = 0;
+		}
+	}
+
+	dev->ccq_ops->ccq_arm(&rsrc->cq);
+	return ret;
+
+error:
+	irdma_puda_dele_rsrc(vsi, info->type, false);
+
+	return ret;
+}
+
+/**
+ * irdma_ilq_putback_rcvbuf - ilq buffer to put back on rq
+ * @qp: ilq's qp resource
+ * @wqe_idx:  wqe index of completed rcvbuf
+ */
+static void irdma_ilq_putback_rcvbuf(struct irdma_sc_qp *qp, u32 wqe_idx)
+{
+	__le64 *wqe;
+	u64 offset8, offset24;
+
+	wqe = qp->qp_uk.rq_base[wqe_idx].elem;
+	get_64bit_val(wqe, 8, &offset8);
+	get_64bit_val(wqe, 24, &offset24);
+	if (offset24) {
+		offset24 = 0;
+		offset8 &= ~LS_64(1, IRDMAQPSQ_VALID);
+	} else {
+		offset24 = LS_64(1, IRDMAQPSQ_VALID);
+		offset8 |= LS_64(1, IRDMAQPSQ_VALID);
+	}
+	set_64bit_val(wqe, 8, offset8);
+	irdma_insert_wqe_hdr(wqe, offset24);
+}
+
+/**
+ * irdma_ieq_get_fpdu - given length return fpdu length
+ * @length: length if fpdu
+ */
+static u16 irdma_ieq_get_fpdu_len(u16 len)
+{
+	u16 fpdu_len;
+
+	fpdu_len = len + IRDMA_IEQ_MPA_FRAMING;
+	fpdu_len = (fpdu_len + 3) & 0xfffffffc;
+
+	return fpdu_len;
+}
+
+/**
+ * irdma_ieq_copy_to_txbuf - copydata from rcv buf to tx buf
+ * @buf: rcv buffer with partial
+ * @txbuf: tx buffer for sending back
+ * @buf_offset: rcv buffer offset to copy from
+ * @txbuf_offset: at offset in tx buf to copy
+ * @length: length of data to copy
+ */
+static void irdma_ieq_copy_to_txbuf(struct irdma_puda_buf *buf,
+				    struct irdma_puda_buf *txbuf,
+				    u16 buf_offset,
+				    u32 txbuf_offset,
+				    u32 len)
+{
+	void *mem1 = (u8 *)buf->mem.va + buf_offset;
+	void *mem2 = (u8 *)txbuf->mem.va + txbuf_offset;
+
+	memcpy(mem2, mem1, len);
+}
+
+/**
+ * irdma_ieq_setup_tx_buf - setup tx buffer for partial handling
+ * @buf: reeive buffer with partial
+ * @txbuf: buffer to prepare
+ */
+static void irdma_ieq_setup_tx_buf(struct irdma_puda_buf *buf,
+				   struct irdma_puda_buf *txbuf)
+{
+	txbuf->tcphlen = buf->tcphlen;
+	txbuf->ipv4 = buf->ipv4;
+
+	if (buf->vsi->dev->hw_attrs.hw_rev > IRDMA_GEN_1) {
+		txbuf->hdrlen = txbuf->tcphlen;
+		irdma_ieq_copy_to_txbuf(buf, txbuf, IRDMA_TCP_OFFSET, 0, txbuf->hdrlen);
+	} else {
+		txbuf->maclen = buf->maclen;
+		txbuf->hdrlen = buf->hdrlen;
+		irdma_ieq_copy_to_txbuf(buf, txbuf, 0, 0, buf->hdrlen);
+	}
+}
+
+/**
+ * irdma_ieq_check_first_buf - check if rcv buffer's seq is in range
+ * @buf: receive exception buffer
+ * @fps: first partial sequence number
+ */
+static void irdma_ieq_check_first_buf(struct irdma_puda_buf *buf, u32 fps)
+{
+	u32 offset;
+
+	if (buf->seqnum < fps) {
+		offset = fps - buf->seqnum;
+		if (offset > buf->datalen)
+			return;
+		buf->data += offset;
+		buf->datalen -= (u16)offset;
+		buf->seqnum = fps;
+	}
+}
+
+/**
+ * irdma_ieq_compl_pfpdu - write txbuf with full fpdu
+ * @ieq: ieq resource
+ * @rxlist: ieq's received buffer list
+ * @pbufl: temporary list for buffers for fpddu
+ * @txbuf: tx buffer for fpdu
+ * @fpdu_len: total length of fpdu
+ */
+static void  irdma_ieq_compl_pfpdu(struct irdma_puda_rsrc *ieq,
+				   struct list_head *rxlist,
+				   struct list_head *pbufl,
+				   struct irdma_puda_buf *txbuf,
+				   u16 fpdu_len)
+{
+	struct irdma_puda_buf *buf;
+	u32 nextseqnum;
+	u16 txoffset, bufoffset;
+
+	buf = irdma_puda_get_listbuf(pbufl);
+	if (!buf)
+		return;
+
+	nextseqnum = buf->seqnum + fpdu_len;
+	irdma_ieq_setup_tx_buf(buf, txbuf);
+	if (buf->vsi->dev->hw_attrs.hw_rev > IRDMA_GEN_1) {
+		txoffset = txbuf->hdrlen;
+		txbuf->totallen = txbuf->hdrlen + fpdu_len;
+		txbuf->data = (u8 *)txbuf->mem.va + txoffset;
+	} else {
+		txoffset = buf->hdrlen;
+		txbuf->totallen = buf->hdrlen + fpdu_len;
+		txbuf->data = (u8 *)txbuf->mem.va + buf->hdrlen;
+	}
+	bufoffset = (u16)(buf->data - (u8 *)buf->mem.va);
+
+	do {
+		if (buf->datalen >= fpdu_len) {
+			/* copied full fpdu */
+			irdma_ieq_copy_to_txbuf(buf, txbuf, bufoffset, txoffset,
+						fpdu_len);
+			buf->datalen -= fpdu_len;
+			buf->data += fpdu_len;
+			buf->seqnum = nextseqnum;
+			break;
+		}
+		/* copy partial fpdu */
+		irdma_ieq_copy_to_txbuf(buf, txbuf, bufoffset, txoffset,
+					buf->datalen);
+		txoffset += buf->datalen;
+		fpdu_len -= buf->datalen;
+		irdma_puda_ret_bufpool(ieq, buf);
+		buf = irdma_puda_get_listbuf(pbufl);
+		if (!buf)
+			return;
+
+		bufoffset = (u16)(buf->data - (u8 *)buf->mem.va);
+	} while (1);
+
+	/* last buffer on the list*/
+	if (buf->datalen)
+		list_add(&buf->list, rxlist);
+	else
+		irdma_puda_ret_bufpool(ieq, buf);
+}
+
+/**
+ * irdma_ieq_create_pbufl - create buffer list for single fpdu
+ * @rxlist: resource list for receive ieq buffes
+ * @pbufl: temp. list for buffers for fpddu
+ * @buf: first receive buffer
+ * @fpdu_len: total length of fpdu
+ */
+static enum irdma_status_code irdma_ieq_create_pbufl(struct irdma_pfpdu *pfpdu,
+						     struct list_head *rxlist,
+						     struct list_head *pbufl,
+						     struct irdma_puda_buf *buf,
+						     u16 fpdu_len)
+{
+	enum irdma_status_code status = 0;
+	struct irdma_puda_buf *nextbuf;
+	u32	nextseqnum;
+	u16 plen = fpdu_len - buf->datalen;
+	bool done = false;
+
+	nextseqnum = buf->seqnum + buf->datalen;
+	do {
+		nextbuf = irdma_puda_get_listbuf(rxlist);
+		if (!nextbuf) {
+			status = IRDMA_ERR_list_empty;
+			break;
+		}
+		list_add_tail(&nextbuf->list, pbufl);
+		if (nextbuf->seqnum != nextseqnum) {
+			pfpdu->bad_seq_num++;
+			status = IRDMA_ERR_SEQ_NUM;
+			break;
+		}
+		if (nextbuf->datalen >= plen) {
+			done = true;
+		} else {
+			plen -= nextbuf->datalen;
+			nextseqnum = nextbuf->seqnum + nextbuf->datalen;
+		}
+
+	} while (!done);
+
+	return status;
+}
+
+/**
+ * irdma_ieq_handle_partial - process partial fpdu buffer
+ * @ieq: ieq resource
+ * @pfpdu: partial management per user qp
+ * @buf: receive buffer
+ * @fpdu_len: fpdu len in the buffer
+ */
+static enum irdma_status_code
+irdma_ieq_handle_partial(struct irdma_puda_rsrc *ieq,
+			 struct irdma_pfpdu *pfpdu,
+			 struct irdma_puda_buf *buf,
+			 u16 fpdu_len)
+{
+	enum irdma_status_code status = 0;
+	u8 *crcptr;
+	u32 mpacrc;
+	u32 seqnum = buf->seqnum;
+	struct list_head pbufl;	/* partial buffer list */
+	struct irdma_puda_buf *txbuf = NULL;
+	struct list_head *rxlist = &pfpdu->rxlist;
+
+	INIT_LIST_HEAD(&pbufl);
+	list_add(&buf->list, &pbufl);
+
+	status = irdma_ieq_create_pbufl(pfpdu, rxlist, &pbufl, buf, fpdu_len);
+	if (status)
+		goto error;
+
+	txbuf = irdma_puda_get_bufpool(ieq);
+	if (!txbuf) {
+		pfpdu->no_tx_bufs++;
+		status = IRDMA_ERR_NO_TXBUFS;
+		goto error;
+	}
+
+	irdma_ieq_compl_pfpdu(ieq, rxlist, &pbufl, txbuf, fpdu_len);
+	irdma_ieq_update_tcpip_info(txbuf, fpdu_len, seqnum);
+
+	crcptr = txbuf->data + fpdu_len - 4;
+	mpacrc = *(u32 *)crcptr;
+	if (ieq->check_crc) {
+		status = irdma_ieq_check_mpacrc(ieq->hash_desc, txbuf->data,
+						(fpdu_len - 4), mpacrc);
+		if (status) {
+			irdma_debug(ieq->dev, IRDMA_DEBUG_IEQ,
+				    "error bad crc\n");
+			goto error;
+		}
+	}
+
+	irdma_debug_buf(ieq->dev, IRDMA_DEBUG_IEQ, "IEQ TX BUFFER",
+			txbuf->mem.va, txbuf->totallen);
+	if (ieq->dev->hw_attrs.hw_rev > IRDMA_GEN_1)
+		txbuf->ah_id = pfpdu->ah->ah_info.ah_idx;
+	irdma_puda_send_buf(ieq, txbuf);
+	pfpdu->rcv_nxt = seqnum + fpdu_len;
+	return status;
+
+ error:
+	while (!list_empty(&pbufl)) {
+		buf = (struct irdma_puda_buf *)(pbufl.prev);
+		list_del(&buf->list);
+		list_add(&buf->list, rxlist);
+	}
+	if (txbuf)
+		irdma_puda_ret_bufpool(ieq, txbuf);
+
+	return status;
+}
+
+/**
+ * irdma_ieq_process_buf - process buffer rcvd for ieq
+ * @ieq: ieq resource
+ * @pfpdu: partial management per user qp
+ * @buf: receive buffer
+ */
+static enum irdma_status_code irdma_ieq_process_buf(struct irdma_puda_rsrc *ieq,
+						    struct irdma_pfpdu *pfpdu,
+						    struct irdma_puda_buf *buf)
+{
+	u16 fpdu_len = 0;
+	u16 datalen = buf->datalen;
+	u8 *datap = buf->data;
+	u8 *crcptr;
+	u16 ioffset = 0;
+	u32 mpacrc;
+	u32 seqnum = buf->seqnum;
+	u16 len = 0;
+	u16 full = 0;
+	bool partial = false;
+	struct irdma_puda_buf *txbuf;
+	struct list_head *rxlist = &pfpdu->rxlist;
+	enum irdma_status_code ret = 0;
+	enum irdma_status_code status = 0;
+
+	ioffset = (u16)(buf->data - (u8 *)buf->mem.va);
+	while (datalen) {
+		fpdu_len = irdma_ieq_get_fpdu_len(ntohs(*(__be16 *)datap));
+		if (fpdu_len > pfpdu->max_fpdu_data) {
+			irdma_debug(ieq->dev, IRDMA_DEBUG_IEQ, "error bad fpdu_len\n");
+			status = IRDMA_ERR_MPA_CRC;
+			list_add(&buf->list, rxlist);
+			return status;
+		}
+
+		if (datalen < fpdu_len) {
+			partial = true;
+			break;
+		}
+		crcptr = datap + fpdu_len - 4;
+		mpacrc = *(u32 *)crcptr;
+		if (ieq->check_crc)
+			ret = irdma_ieq_check_mpacrc(ieq->hash_desc,
+						     datap,
+						     fpdu_len - 4,
+						     mpacrc);
+		if (ret) {
+			status = IRDMA_ERR_MPA_CRC;
+			list_add(&buf->list, rxlist);
+			irdma_debug(ieq->dev, IRDMA_DEBUG_ERR, "IRDMA_ERR_MPA_CRC\n");
+			return status;
+		}
+		full++;
+		pfpdu->fpdu_processed++;
+		datap += fpdu_len;
+		len += fpdu_len;
+		datalen -= fpdu_len;
+	}
+	if (full) {
+		/* copy full pdu's in the txbuf and send them out */
+		txbuf = irdma_puda_get_bufpool(ieq);
+		if (!txbuf) {
+			pfpdu->no_tx_bufs++;
+			status = IRDMA_ERR_NO_TXBUFS;
+			list_add(&buf->list, rxlist);
+			return status;
+		}
+		/* modify txbuf's buffer header */
+		irdma_ieq_setup_tx_buf(buf, txbuf);
+		/* copy full fpdu's to new buffer */
+		if (ieq->dev->hw_attrs.hw_rev > IRDMA_GEN_1) {
+			irdma_ieq_copy_to_txbuf(buf, txbuf, ioffset,
+						txbuf->hdrlen,
+						len);
+			txbuf->totallen = txbuf->hdrlen + len;
+			txbuf->ah_id = pfpdu->ah->ah_info.ah_idx;
+		} else {
+			irdma_ieq_copy_to_txbuf(buf, txbuf, ioffset, buf->hdrlen, len);
+			txbuf->totallen = buf->hdrlen + len;
+		}
+		irdma_ieq_update_tcpip_info(txbuf, len, buf->seqnum);
+		irdma_debug_buf(ieq->dev, IRDMA_DEBUG_IEQ, "IEQ TX BUFFER",
+				txbuf->mem.va, txbuf->totallen);
+		irdma_puda_send_buf(ieq, txbuf);
+
+		if (!datalen) {
+			pfpdu->rcv_nxt = buf->seqnum + len;
+			irdma_puda_ret_bufpool(ieq, buf);
+			return status;
+		}
+		buf->data = datap;
+		buf->seqnum = seqnum + len;
+		buf->datalen = datalen;
+		pfpdu->rcv_nxt = buf->seqnum;
+	}
+	if (partial)
+		status = irdma_ieq_handle_partial(ieq, pfpdu, buf, fpdu_len);
+
+	return status;
+}
+
+/**
+ * irdma_ieq_process_fpdus - process fpdu's buffers on its list
+ * @qp: qp for which partial fpdus
+ * @ieq: ieq resource
+ */
+void irdma_ieq_process_fpdus(struct irdma_sc_qp *qp,
+			     struct irdma_puda_rsrc *ieq)
+{
+	struct irdma_pfpdu *pfpdu = &qp->pfpdu;
+	struct list_head *rxlist = &pfpdu->rxlist;
+	struct irdma_puda_buf *buf;
+	enum irdma_status_code status;
+
+	do {
+		if (list_empty(rxlist))
+			break;
+		buf = irdma_puda_get_listbuf(rxlist);
+		if (!buf) {
+			irdma_debug(ieq->dev, IRDMA_DEBUG_IEQ,
+				    "error no buf\n");
+			break;
+		}
+		if (buf->seqnum != pfpdu->rcv_nxt) {
+			/* This could be out of order or missing packet */
+			pfpdu->out_of_order++;
+			list_add(&buf->list, rxlist);
+			break;
+		}
+		/* keep processing buffers from the head of the list */
+		status = irdma_ieq_process_buf(ieq, pfpdu, buf);
+		if (status == IRDMA_ERR_MPA_CRC) {
+			pfpdu->mpa_crc_err = true;
+			while (!list_empty(rxlist)) {
+				buf = irdma_puda_get_listbuf(rxlist);
+				irdma_puda_ret_bufpool(ieq, buf);
+				pfpdu->crc_err++;
+			}
+			/* create CQP for AE */
+			irdma_ieq_mpa_crc_ae(ieq->dev, qp);
+		}
+	} while (!status);
+}
+
+/**
+ * irdma_ieq_create_ah - create an address handle for IEQ
+ * @qp: qp pointer
+ * @buf: buf received on IEQ used to create AH
+ */
+static enum irdma_status_code irdma_ieq_create_ah(struct irdma_sc_qp *qp,
+						  struct irdma_puda_buf *buf)
+{
+	struct irdma_ah_info ah_info = {};
+
+	qp->pfpdu.ah_buf = buf;
+	irdma_puda_ieq_get_ah_info(qp, &ah_info);
+	return irdma_puda_create_ah(qp->vsi->dev,
+				    &ah_info,
+				    false,
+				    IRDMA_PUDA_RSRC_TYPE_IEQ,
+				    qp,
+				    &qp->pfpdu.ah);
+}
+
+/**
+ * irdma_ieq_handle_exception - handle qp's exception
+ * @ieq: ieq resource
+ * @qp: qp receiving excpetion
+ * @buf: receive buffer
+ */
+static void irdma_ieq_handle_exception(struct irdma_puda_rsrc *ieq,
+				       struct irdma_sc_qp *qp,
+				       struct irdma_puda_buf *buf)
+{
+	struct irdma_pfpdu *pfpdu = &qp->pfpdu;
+	u32 *hw_host_ctx = (u32 *)qp->hw_host_ctx;
+	u32 rcv_wnd = hw_host_ctx[23];
+	/* first partial seq # in q2 */
+	u32 fps = *(u32 *)(qp->q2_buf + Q2_FPSN_OFFSET);
+	struct list_head *rxlist = &pfpdu->rxlist;
+	unsigned long flags = 0;
+	u8 hw_rev = qp->dev->hw_attrs.hw_rev;
+
+	irdma_debug_buf(ieq->dev, IRDMA_DEBUG_IEQ, "IEQ RX BUFFER",
+			buf->mem.va, buf->totallen);
+
+	if (hw_rev > IRDMA_GEN_1)
+		spin_lock_irqsave(&pfpdu->lock, flags);
+
+	pfpdu->total_ieq_bufs++;
+	if (pfpdu->mpa_crc_err) {
+		pfpdu->crc_err++;
+		goto error;
+	}
+	if (pfpdu->mode && fps != pfpdu->fps) {
+		/* clean up qp as it is new partial sequence */
+		irdma_ieq_cleanup_qp(ieq, qp);
+		irdma_debug(ieq->dev, IRDMA_DEBUG_IEQ,
+			    "restarting new partial\n");
+		pfpdu->mode = false;
+	}
+
+	if (!pfpdu->mode) {
+		irdma_debug_buf(ieq->dev, IRDMA_DEBUG_IEQ, "Q2 BUFFER",
+				(u64 *)qp->q2_buf, 128);
+		/* First_Partial_Sequence_Number check */
+		pfpdu->rcv_nxt = fps;
+		pfpdu->fps = fps;
+		pfpdu->mode = true;
+		pfpdu->max_fpdu_data = (buf->ipv4) ?
+				       (ieq->vsi->mtu - IRDMA_MTU_TO_MSS_IPV4) :
+				       (ieq->vsi->mtu - IRDMA_MTU_TO_MSS_IPV6);
+		pfpdu->pmode_count++;
+		INIT_LIST_HEAD(rxlist);
+		irdma_ieq_check_first_buf(buf, fps);
+	}
+
+	if (!(rcv_wnd >= (buf->seqnum - pfpdu->rcv_nxt))) {
+		pfpdu->bad_seq_num++;
+		goto error;
+	}
+
+	if (!list_empty(rxlist)) {
+		if (buf->seqnum != pfpdu->nextseqnum) {
+			irdma_send_ieq_ack(qp);
+			/* throw away out-of-order, duplicates*/
+			goto error;
+		}
+	}
+	/* Insert buf before head */
+	list_add_tail(&buf->list, rxlist);
+	pfpdu->nextseqnum = buf->seqnum + buf->datalen;
+	pfpdu->lastrcv_buf = buf;
+	if (hw_rev > IRDMA_GEN_1 &&
+	    !pfpdu->ah) {
+		irdma_ieq_create_ah(qp, buf);
+		if (!pfpdu->ah)
+			goto error;
+		goto exit;
+	}
+	if (qp->dev->hw_attrs.hw_rev == IRDMA_GEN_1)
+		irdma_ieq_process_fpdus(qp, ieq);
+	else if (pfpdu->ah && pfpdu->ah->ah_info.ah_valid)
+		irdma_ieq_process_fpdus(qp, ieq);
+exit:
+	if (hw_rev > IRDMA_GEN_1)
+		spin_unlock_irqrestore(&pfpdu->lock, flags);
+	return;
+
+error:
+	irdma_puda_ret_bufpool(ieq, buf);
+	if (qp->dev->hw_attrs.hw_rev > IRDMA_GEN_1)
+		spin_unlock_irqrestore(&pfpdu->lock, flags);
+}
+
+/**
+ * irdma_ieq_receive - received exception buffer
+ * @dev: iwarp device
+ * @buf: exception buffer received
+ */
+static void irdma_ieq_receive(struct irdma_sc_vsi *vsi,
+			      struct irdma_puda_buf *buf)
+{
+	struct irdma_puda_rsrc *ieq = vsi->ieq;
+	struct irdma_sc_qp *qp = NULL;
+	u32 wqe_idx = ieq->compl_rxwqe_idx;
+
+	qp = irdma_ieq_get_qp(vsi->dev, buf);
+	if (!qp) {
+		ieq->stats_bad_qp_id++;
+		irdma_puda_ret_bufpool(ieq, buf);
+	} else {
+		irdma_ieq_handle_exception(ieq, qp, buf);
+	}
+	/*
+	 * ieq->rx_wqe_idx is used by irdma_puda_replenish_rq()
+	 * on which wqe_idx to start replenish rq
+	 */
+	if (!ieq->rxq_invalid_cnt)
+		ieq->rx_wqe_idx = wqe_idx;
+	ieq->rxq_invalid_cnt++;
+}
+
+/**
+ * irdma_ieq_tx_compl - put back after sending completed exception buffer
+ * @vsi: sc VSI struct
+ * @sqwrid: pointer to puda buffer
+ */
+static void irdma_ieq_tx_compl(struct irdma_sc_vsi *vsi, void *sqwrid)
+{
+	struct irdma_puda_rsrc *ieq = vsi->ieq;
+	struct irdma_puda_buf *buf = (struct irdma_puda_buf *)sqwrid;
+
+	irdma_puda_ret_bufpool(ieq, buf);
+}
+
+/**
+ * irdma_ieq_cleanup_qp - qp is being destroyed
+ * @ieq: ieq resource
+ * @qp: all pending fpdu buffers
+ */
+void irdma_ieq_cleanup_qp(struct irdma_puda_rsrc *ieq, struct irdma_sc_qp *qp)
+{
+	struct irdma_puda_buf *buf;
+	struct irdma_pfpdu *pfpdu = &qp->pfpdu;
+	struct list_head *rxlist = &pfpdu->rxlist;
+
+	if (!pfpdu->mode)
+		return;
+
+	while (!list_empty(rxlist)) {
+		buf = irdma_puda_get_listbuf(rxlist);
+		irdma_puda_ret_bufpool(ieq, buf);
+	}
+
+	if (qp->pfpdu.ah) {
+		irdma_puda_free_ah(ieq->dev, qp->pfpdu.ah);
+		qp->pfpdu.ah = NULL;
+		qp->pfpdu.ah_buf = NULL;
+	}
+}
diff --git a/drivers/infiniband/hw/irdma/puda.h b/drivers/infiniband/hw/irdma/puda.h
new file mode 100644
index 0000000..4216ce6
--- /dev/null
+++ b/drivers/infiniband/hw/irdma/puda.h
@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB */
+/* Copyright (c) 2019, Intel Corporation. */
+
+#ifndef IRDMA_PUDA_H
+#define IRDMA_PUDA_H
+
+#define IRDMA_IEQ_MPA_FRAMING	6
+#define IRDMA_TCP_OFFSET	40
+
+enum puda_rsrc_type {
+	IRDMA_PUDA_RSRC_TYPE_ILQ = 1,
+	IRDMA_PUDA_RSRC_TYPE_IEQ,
+	IRDMA_PUDA_RSRC_TYPE_MAX, /* Must be last entry */
+};
+
+enum puda_rsrc_complete {
+	PUDA_CQ_CREATED = 1,
+	PUDA_QP_CREATED,
+	PUDA_TX_COMPLETE,
+	PUDA_RX_COMPLETE,
+	PUDA_HASH_CRC_COMPLETE,
+};
+
+struct irdma_sc_dev;
+struct irdma_sc_qp;
+struct irdma_sc_cq;
+
+struct irdma_puda_cmpl_info {
+	struct irdma_qp_uk *qp;
+	u8 q_type;
+	bool vlan_valid;
+	u8 l3proto;
+	u8 l4proto;
+	u16 vlan;
+	u32 payload_len;
+	u32 compl_error;	/* No_err=0, else major and minor err code */
+	u32 qp_id;
+	u32 wqe_idx;
+	bool ipv4;
+	bool smac_valid;
+	u8 smac[ETH_ALEN];
+};
+
+struct irdma_puda_send_info {
+	u64 paddr;		/* Physical address */
+	u32 len;
+	u32 ah_id;
+	u8 tcplen;
+	u8 maclen;
+	bool ipv4;
+	bool do_lpb;
+	void *scratch;
+};
+
+struct irdma_puda_buf {
+	struct list_head list;	/* MUST be first entry */
+	struct irdma_dma_mem mem;	/* DMA memory for the buffer */
+	struct irdma_puda_buf *next;	/* for alloclist in rsrc struct */
+	struct irdma_virt_mem buf_mem;	/* Buffer memory for this buffer */
+	void *scratch;
+	u8 *iph;
+	u8 *tcph;
+	u8 *data;
+	u16 datalen;
+	u16 vlan_id;
+	u8 tcphlen;		/* tcp length in bytes */
+	u8 maclen;		/* mac length in bytes */
+	u32 totallen;		/* machlen+iphlen+tcphlen+datalen */
+	atomic_t refcount;
+	u8 hdrlen;
+	bool ipv4;
+	bool vlan_valid;
+	u32 seqnum;
+	u32 ah_id;
+	bool smac_valid;
+	u8 smac[ETH_ALEN];
+	struct irdma_sc_vsi *vsi;
+};
+
+struct irdma_puda_rsrc_info {
+	void (*receive)(struct irdma_sc_vsi *vsi, struct irdma_puda_buf *buf);
+	void (*xmit_complete)(struct irdma_sc_vsi *vsi, void *sqwrid);
+	enum puda_rsrc_type type;	/* ILQ or IEQ */
+	u32 count;
+	u32 pd_id;
+	u32 cq_id;
+	u32 qp_id;
+	u32 sq_size;
+	u32 rq_size;
+	u32 tx_buf_cnt; /* total bufs allocated will be rq_size + tx_buf_cnt */
+	u16 buf_size;
+	u8 stats_idx;
+	bool stats_idx_valid;
+};
+
+struct irdma_puda_rsrc {
+	struct irdma_sc_cq cq;
+	struct irdma_sc_qp qp;
+	struct irdma_sc_pd sc_pd;
+	struct irdma_sc_dev *dev;
+	struct irdma_sc_vsi *vsi;
+	struct irdma_dma_mem cqmem;
+	struct irdma_dma_mem qpmem;
+	struct irdma_virt_mem ilq_mem;
+	enum puda_rsrc_complete cmpl;
+	enum puda_rsrc_type type;
+	u16 buf_size;		/*buf must be max datalen + tcpip hdr + mac */
+	u32 cq_id;
+	u32 qp_id;
+	u32 sq_size;
+	u32 rq_size;
+	u32 cq_size;
+	struct irdma_sq_uk_wr_trk_info *sq_wrtrk_array;
+	u64 *rq_wrid_array;
+	u32 compl_rxwqe_idx;
+	u32 rx_wqe_idx;
+	u32 rxq_invalid_cnt;
+	u32 tx_wqe_avail_cnt;
+	bool check_crc;
+	struct shash_desc *hash_desc;
+	struct list_head txpend;
+	struct list_head bufpool;	/* free buffers pool list for recv and xmit */
+	u32 alloc_buf_count;
+	u32 avail_buf_count;	/* snapshot of currently available buffers */
+	spinlock_t bufpool_lock;
+	struct irdma_puda_buf *alloclist;
+	void (*receive)(struct irdma_sc_vsi *vsi, struct irdma_puda_buf *buf);
+	void (*xmit_complete)(struct irdma_sc_vsi *vsi, void *sqwrid);
+	/* puda stats */
+	u64 stats_buf_alloc_fail;
+	u64 stats_pkt_rcvd;
+	u64 stats_pkt_sent;
+	u64 stats_rcvd_pkt_err;
+	u64 stats_sent_pkt_q;
+	u64 stats_bad_qp_id;
+	u8 stats_idx;
+	bool stats_idx_valid;
+};
+
+struct irdma_puda_buf *irdma_puda_get_bufpool(struct irdma_puda_rsrc *rsrc);
+void irdma_puda_ret_bufpool(struct irdma_puda_rsrc *rsrc,
+			    struct irdma_puda_buf *buf);
+void irdma_puda_send_buf(struct irdma_puda_rsrc *rsrc,
+			 struct irdma_puda_buf *buf);
+enum irdma_status_code irdma_puda_send(struct irdma_sc_qp *qp,
+				       struct irdma_puda_send_info *info);
+enum irdma_status_code irdma_puda_create_rsrc(struct irdma_sc_vsi *vsi,
+					      struct irdma_puda_rsrc_info *info);
+void irdma_puda_dele_rsrc(struct irdma_sc_vsi *vsi, enum puda_rsrc_type type,
+			  bool reset);
+enum irdma_status_code irdma_puda_poll_cmpl(struct irdma_sc_dev *dev,
+					    struct irdma_sc_cq *cq,
+					    u32 *compl_err);
+
+struct irdma_sc_qp *irdma_ieq_get_qp(struct irdma_sc_dev *dev,
+				     struct irdma_puda_buf *buf);
+enum irdma_status_code irdma_puda_get_tcpip_info(struct irdma_puda_cmpl_info *info,
+						 struct irdma_puda_buf *buf);
+enum irdma_status_code irdma_ieq_check_mpacrc(struct shash_desc *desc,
+					      void *addr,
+					      u32 len,
+					      u32 val);
+enum irdma_status_code irdma_init_hash_desc(struct shash_desc **desc);
+void irdma_ieq_mpa_crc_ae(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp);
+void irdma_free_hash_desc(struct shash_desc *desc);
+void irdma_ieq_update_tcpip_info(struct irdma_puda_buf *buf, u16 len,
+				 u32 seqnum);
+enum irdma_status_code irdma_cqp_qp_create_cmd(struct irdma_sc_dev *dev,
+					       struct irdma_sc_qp *qp);
+enum irdma_status_code irdma_cqp_cq_create_cmd(struct irdma_sc_dev *dev,
+					       struct irdma_sc_cq *cq);
+void irdma_cqp_qp_destroy_cmd(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp);
+void irdma_cqp_cq_destroy_cmd(struct irdma_sc_dev *dev, struct irdma_sc_cq *cq);
+void irdma_puda_ieq_get_ah_info(struct irdma_sc_qp *qp,
+				struct irdma_ah_info *ah_info);
+enum irdma_status_code irdma_puda_create_ah(struct irdma_sc_dev *dev,
+					    struct irdma_ah_info *ah_info,
+					    bool wait,
+					    enum puda_rsrc_type type,
+					    void *cb_param,
+					    struct irdma_sc_ah **ah);
+void irdma_puda_free_ah(struct irdma_sc_dev *dev, struct irdma_sc_ah *ah);
+void irdma_ieq_process_fpdus(struct irdma_sc_qp *qp,
+			     struct irdma_puda_rsrc *ieq);
+void irdma_ieq_cleanup_qp(struct irdma_puda_rsrc *ieq, struct irdma_sc_qp *qp);
+#endif /*IRDMA_PROTOS_H */
-- 
1.8.3.1


^ permalink raw reply related

* [RFC 09/19] RDMA/irdma: Add QoS definitions
From: Shiraz Saleem @ 2019-02-12 21:43 UTC (permalink / raw)
  To: dledford, jgg, davem
  Cc: linux-rdma, netdev, mustafa.ismail, jeffrey.t.kirsher,
	Shiraz Saleem
In-Reply-To: <20190212214402.23284-1-shiraz.saleem@intel.com>

From: Mustafa Ismail <mustafa.ismail@intel.com>

Add definitions for managing the RDMA HW work scheduler (WS) tree.

A WS node is created via a control QP operation with the bandwidth
allocation, arbitration scheme, and traffic class of the QP specified.
The Qset handle returned associates the QoS parameters for the QP.
The Qset is registered with the LAN and a equivalent node is created
in the LAN packet scheduler tree.

Signed-off-by: Mustafa Ismail <mustafa.ismail@intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
---
 drivers/infiniband/hw/irdma/ws.c | 449 +++++++++++++++++++++++++++++++++++++++
 drivers/infiniband/hw/irdma/ws.h |  40 ++++
 2 files changed, 489 insertions(+)
 create mode 100644 drivers/infiniband/hw/irdma/ws.c
 create mode 100644 drivers/infiniband/hw/irdma/ws.h

diff --git a/drivers/infiniband/hw/irdma/ws.c b/drivers/infiniband/hw/irdma/ws.c
new file mode 100644
index 0000000..f9c0beb
--- /dev/null
+++ b/drivers/infiniband/hw/irdma/ws.c
@@ -0,0 +1,449 @@
+// SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB
+/* Copyright (c) 2019, Intel Corporation. */
+
+#include "osdep.h"
+#include "status.h"
+#include "hmc.h"
+#include "defs.h"
+#include "type.h"
+#include "protos.h"
+
+#include "ws.h"
+
+/**
+ * irdma_alloc_node - Allocate a WS node and init
+ * @vsi: vsi pointer
+ * @user_pri: user priority
+ * @node_type: Type of node, leaf or parent
+ * @parent: parent node pointer
+ */
+static struct irdma_ws_node *irdma_alloc_node(struct irdma_sc_vsi *vsi,
+					      u8 user_pri,
+					      enum irdma_ws_node_type node_type,
+					      struct irdma_ws_node *parent)
+{
+	struct irdma_virt_mem ws_mem;
+	struct irdma_ws_node *node;
+	u16 node_index = 0;
+
+	if (irdma_allocate_virt_mem(vsi->dev->hw, &ws_mem,
+				    sizeof(struct irdma_ws_node)))
+		return NULL;
+
+	if (parent || vsi->vm_vf_type == IRDMA_VF_TYPE) {
+		node_index = irdma_alloc_ws_node_id(vsi->dev);
+		if (node_index == IRDMA_WS_NODE_INVALID) {
+			irdma_free_virt_mem(vsi->dev->hw, &ws_mem);
+			return NULL;
+		}
+	}
+
+	node = (struct irdma_ws_node *)ws_mem.va;
+	node->index = node_index;
+	node->vsi_index = vsi->vsi_idx;
+	INIT_LIST_HEAD(&node->siblings);
+	node->first_child = NULL;
+	if (node_type == WS_NODE_TYPE_LEAF) {
+		node->type_leaf = true;
+		node->traffic_class = vsi->qos[user_pri].traffic_class;
+		node->user_pri = user_pri;
+		node->rel_bw =
+			vsi->qos[user_pri].rel_bw;
+		node->lan_qs_handle =
+			vsi->qos[user_pri].lan_qos_handle;
+		node->prio_type = IRDMA_PRIO_WEIGHTED_RR;
+	} else {
+		node->rel_bw = 1;
+		node->prio_type = IRDMA_PRIO_WEIGHTED_RR;
+	}
+
+	node->parent = parent;
+
+	return node;
+}
+
+/**
+ * irdma_free_node - Free a WS node
+ * @node: Pointer to node to free
+ */
+static void irdma_free_node(struct irdma_sc_vsi *vsi,
+			    struct irdma_ws_node *node)
+{
+	struct irdma_virt_mem ws_mem;
+
+	if (node->index)
+		irdma_free_ws_node_id(vsi->dev, node->index);
+
+	ws_mem.va = node;
+	ws_mem.size = sizeof(struct irdma_ws_node);
+	irdma_free_virt_mem(vsi->dev->hw, &ws_mem);
+}
+
+/**
+ * irdma_ws_cqp_cmd - Post CQP work scheduler node cmd
+ * @vsi: vsi pointer
+ * @node: pointer to node
+ * @cmd: add, remove or modify
+ * @user_pri: User priority for a leaf node
+ */
+static enum irdma_status_code
+irdma_ws_cqp_cmd(struct irdma_sc_vsi *vsi,
+		 struct irdma_ws_node *node,
+		 u8 cmd)
+{
+	struct irdma_ws_node_info node_info = {};
+
+	node_info.id = node->index;
+	node_info.vsi = node->vsi_index;
+	if (node->parent)
+		node_info.parent_id = node->parent->index;
+	else
+		node_info.parent_id = node_info.id;
+
+	node_info.weight = node->rel_bw;
+	node_info.tc = node->traffic_class;
+	node_info.prio_type = node->prio_type;
+	node_info.type_leaf = node->type_leaf;
+	node_info.enable = node->enable;
+	if (irdma_cqp_ws_node_cmd(vsi->dev, cmd, &node_info)) {
+		irdma_debug(vsi->dev, IRDMA_DEBUG_WS, "CQP WS CMD failed\n");
+		return IRDMA_ERR_NO_MEMORY;
+	}
+
+	if (node->type_leaf && cmd == IRDMA_OP_WS_ADD_NODE) {
+		node->qs_handle = node_info.qs_handle;
+		vsi->qos[node->user_pri].qs_handle = node_info.qs_handle;
+	}
+
+	return 0;
+}
+
+/**
+ * irdma_node_from_entry - Given entry, get pointer to ws node struct
+ * @entry: Points to sibling list of a node
+ */
+static struct irdma_ws_node *irdma_node_from_entry(struct list_head *entry)
+{
+	if (entry)
+		return (struct irdma_ws_node *)((char *)entry
+				- offsetof(struct irdma_ws_node, siblings));
+
+	return NULL;
+}
+
+/**
+ * ws_find_node - Find SC WS node based on VSI id or TC
+ * @node: pointer to first node in level corresponding to VSI/TC
+ */
+static struct irdma_ws_node *ws_find_node(struct irdma_ws_node *root,
+					  u16 match_val,
+					  enum irdma_ws_match_type type)
+{
+	struct irdma_ws_node *node = root;
+	struct list_head *entry;
+
+	if (!root)
+		return NULL;
+
+	switch (type) {
+	case WS_MATCH_TYPE_VSI:
+		while (node && node->vsi_index != match_val) {
+			entry = node->siblings.next;
+			node = irdma_node_from_entry(entry);
+			if (node == root)
+				return NULL;
+		}
+		break;
+	case WS_MATCH_TYPE_TC:
+		while (node && node->traffic_class != match_val) {
+			entry = node->siblings.next;
+			node = irdma_node_from_entry(entry);
+			if (node == root)
+				return NULL;
+		}
+		break;
+	default:
+		break;
+	}
+
+	return node;
+}
+
+/**
+ * ws_tree_del_node - Delete node from SW WS tree structure
+ * @node: shared code node pointer
+ */
+static void ws_tree_del_node(struct irdma_ws_node *node)
+{
+	struct irdma_ws_node *parent = node->parent;
+	struct irdma_ws_node *next_entry;
+
+	if (parent) {
+		if (parent->first_child == node && list_empty(&node->siblings)) {
+			parent->first_child = NULL;
+		} else if (parent->first_child == node) {
+			next_entry = irdma_node_from_entry(node->siblings.next);
+			list_del(&node->siblings);
+			parent->first_child = next_entry;
+		} else {
+			list_del(&node->siblings);
+		}
+	}
+}
+
+/**
+ * irdma_tc_in_use - Checks to see if a leaf node is in use
+ * @vsi: vsi pointer
+ * @user_pri: user priority
+ */
+static bool irdma_tc_in_use(struct irdma_sc_vsi *vsi, u8 user_pri)
+{
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&vsi->qos[user_pri].lock, flags);
+	if (!list_empty(&vsi->qos[user_pri].qplist)) {
+		spin_unlock_irqrestore(&vsi->qos[user_pri].lock, flags);
+		return true;
+	}
+
+	/* Check if the traffic class associated with the given user priority
+	 * is in use by any other user priority. If so, nothing left to do
+	 */
+	for (i = 0; i < IRDMA_MAX_USER_PRIORITY; i++) {
+		if (vsi->qos[i].traffic_class == vsi->qos[user_pri].traffic_class &&
+		    !list_empty(&vsi->qos[i].qplist)) {
+			spin_unlock_irqrestore(&vsi->qos[user_pri].lock, flags);
+			return true;
+		}
+	}
+	spin_unlock_irqrestore(&vsi->qos[user_pri].lock, flags);
+
+	return false;
+}
+
+/**
+ * irdma_remove_leaf - Remove leaf node unconditionally
+ * @vsi: vsi pointer
+ * @user_pri: user priority
+ */
+static void irdma_remove_leaf(struct irdma_sc_vsi *vsi, u8 user_pri)
+{
+	struct irdma_ws_node *ws_tree_root, *vsi_node, *tc_node;
+
+	ws_tree_root = vsi->dev->ws_tree_root;
+	if (!ws_tree_root)
+		return;
+
+	vsi_node = ws_find_node(ws_tree_root->first_child, vsi->vsi_idx,
+				WS_MATCH_TYPE_VSI);
+	if (!vsi_node)
+		return;
+
+	tc_node = ws_find_node(vsi_node->first_child,
+			       vsi->qos[user_pri].traffic_class,
+			       WS_MATCH_TYPE_TC);
+	if (!tc_node)
+		return;
+
+	irdma_ws_cqp_cmd(vsi, tc_node, IRDMA_OP_WS_DELETE_NODE);
+	irdma_lan_unregister_qset(vsi, tc_node);
+	ws_tree_del_node(tc_node);
+	irdma_free_node(vsi, tc_node);
+	/* Check if VSI node can be freed */
+	if (!vsi_node->first_child) {
+		irdma_ws_cqp_cmd(vsi, vsi_node, IRDMA_OP_WS_DELETE_NODE);
+		ws_tree_del_node(vsi_node);
+		irdma_free_node(vsi, vsi_node);
+		/* Free head node there are no remaining VSI nodes */
+		if (!ws_tree_root->first_child) {
+			irdma_ws_cqp_cmd(vsi, ws_tree_root,
+					 IRDMA_OP_WS_DELETE_NODE);
+			irdma_free_node(vsi, ws_tree_root);
+			vsi->dev->ws_tree_root = NULL;
+		}
+	}
+}
+
+/**
+ * irdma_ws_add - Build work scheduler tree, set RDMA qs_handle
+ * @vsi: vsi pointer
+ * @user_pri: user priority
+ */
+enum irdma_status_code irdma_ws_add(struct irdma_sc_vsi *vsi, u8 user_pri)
+{
+	struct irdma_ws_node *ws_tree_root;
+	struct irdma_ws_node *vsi_node;
+	struct irdma_ws_node *tc_node;
+	u16 traffic_class;
+	enum irdma_status_code ret = 0;
+	int i;
+
+	mutex_lock(&vsi->dev->ws_mutex);
+	if (vsi->tc_change_pending) {
+		mutex_unlock(&vsi->dev->ws_mutex);
+		return IRDMA_ERR_NOT_READY;
+	}
+
+	ws_tree_root = vsi->dev->ws_tree_root;
+	if (!ws_tree_root) {
+		irdma_debug(vsi->dev, IRDMA_DEBUG_WS, "Creating root node\n");
+		ws_tree_root = irdma_alloc_node(vsi, user_pri,
+						WS_NODE_TYPE_PARENT, NULL);
+		if (!ws_tree_root) {
+			ret = IRDMA_ERR_NO_MEMORY;
+			goto exit;
+		}
+
+		ret = irdma_ws_cqp_cmd(vsi, ws_tree_root, IRDMA_OP_WS_ADD_NODE);
+		if (ret) {
+			irdma_free_node(vsi, ws_tree_root);
+			goto exit;
+		}
+
+		vsi->dev->ws_tree_root = ws_tree_root;
+	}
+
+	/* Find a second tier node that matches the VSI */
+	vsi_node = ws_find_node(ws_tree_root->first_child, vsi->vsi_idx,
+				WS_MATCH_TYPE_VSI);
+
+	/* If VSI node doesn't exist, add one */
+	if (!vsi_node) {
+		irdma_debug(vsi->dev, IRDMA_DEBUG_WS,
+			    "Node not found matching VSI %d\n", vsi->vsi_idx);
+		vsi_node = irdma_alloc_node(vsi, user_pri, WS_NODE_TYPE_PARENT,
+					    ws_tree_root);
+		if (!vsi_node) {
+			ret = IRDMA_ERR_NO_MEMORY;
+			goto vsi_add_err;
+		}
+
+		ret = irdma_ws_cqp_cmd(vsi, vsi_node, IRDMA_OP_WS_ADD_NODE);
+		if (ret) {
+			irdma_free_node(vsi, vsi_node);
+			goto vsi_add_err;
+		}
+
+		if (ws_tree_root->first_child)
+			list_add(&vsi_node->siblings,
+				 &ws_tree_root->first_child->siblings);
+		else /* Connect to head node if this is the first VSI node */
+			ws_tree_root->first_child = vsi_node;
+	}
+
+	irdma_debug(vsi->dev, IRDMA_DEBUG_WS,
+		    "Using node %d which represents VSI %d\n",
+		    vsi_node->index, vsi->vsi_idx);
+	traffic_class = vsi->qos[user_pri].traffic_class;
+	tc_node = ws_find_node(vsi_node->first_child, traffic_class,
+			       WS_MATCH_TYPE_TC);
+	if (!tc_node) {
+		/* Add leaf node */
+		irdma_debug(vsi->dev, IRDMA_DEBUG_WS,
+			    "Node not found matching VSI %d and TC %d\n",
+			    vsi->vsi_idx, traffic_class);
+		tc_node = irdma_alloc_node(vsi, user_pri, WS_NODE_TYPE_LEAF,
+					   vsi_node);
+		if (!tc_node) {
+			ret = IRDMA_ERR_NO_MEMORY;
+			goto leaf_add_err;
+		}
+
+		ret = irdma_ws_cqp_cmd(vsi, tc_node, IRDMA_OP_WS_ADD_NODE);
+		if (ret) {
+			irdma_free_node(vsi, tc_node);
+			goto leaf_add_err;
+		}
+
+		if (vsi_node->first_child)
+			list_add(&tc_node->siblings,
+				 &vsi_node->first_child->siblings);
+		else
+			vsi_node->first_child = tc_node;
+
+		/*
+		 * callback to LAN to update the LAN tree with our node
+		 */
+		ret = irdma_lan_register_qset(vsi, tc_node);
+		if (ret)
+			goto reg_err;
+
+		tc_node->enable = true;
+		ret = irdma_ws_cqp_cmd(vsi, tc_node, IRDMA_OP_WS_MODIFY_NODE);
+		if (ret)
+			goto reg_err;
+	}
+	irdma_debug(vsi->dev, IRDMA_DEBUG_WS,
+		    "Using node %d which represents VSI %d TC %d\n",
+		    tc_node->index, vsi->vsi_idx, traffic_class);
+	/*
+	 * Iterate through other UPs and update the QS handle if they have
+	 * a matching traffic class.
+	 */
+	for (i = 0; i < IRDMA_MAX_USER_PRIORITY; i++) {
+		if (vsi->qos[i].traffic_class == traffic_class) {
+			vsi->qos[i].qs_handle = tc_node->qs_handle;
+			vsi->qos[i].lan_qos_handle = tc_node->lan_qs_handle;
+			vsi->qos[i].l2_sched_node_id = tc_node->l2_sched_node_id;
+		}
+	}
+	goto exit;
+
+leaf_add_err:
+	if (!vsi_node->first_child) {
+		if (irdma_ws_cqp_cmd(vsi, vsi_node, IRDMA_OP_WS_DELETE_NODE))
+			goto exit;
+		ws_tree_del_node(vsi_node);
+		irdma_free_node(vsi, vsi_node);
+	}
+
+vsi_add_err:
+	/* Free head node there are no remaining VSI nodes */
+	if (!ws_tree_root->first_child) {
+		irdma_ws_cqp_cmd(vsi, ws_tree_root,
+				 IRDMA_OP_WS_DELETE_NODE);
+		vsi->dev->ws_tree_root = NULL;
+		ws_tree_del_node(ws_tree_root);
+		irdma_free_node(vsi, ws_tree_root);
+	}
+
+exit:
+	mutex_unlock(&vsi->dev->ws_mutex);
+	return ret;
+
+reg_err:
+	mutex_unlock(&vsi->dev->ws_mutex);
+	irdma_ws_remove(vsi, user_pri);
+	return ret;
+}
+
+/**
+ * irdma_ws_remove - Free WS scheduler node, update WS tree
+ * @vsi: vsi pointer
+ * @user_pri: user priority
+ */
+void irdma_ws_remove(struct irdma_sc_vsi *vsi, u8 user_pri)
+{
+	mutex_lock(&vsi->dev->ws_mutex);
+	if (irdma_tc_in_use(vsi, user_pri))
+		goto exit;
+
+	irdma_remove_leaf(vsi, user_pri);
+exit:
+	mutex_unlock(&vsi->dev->ws_mutex);
+}
+
+/**
+ * irdma_ws_reset - Reset entire WS tree
+ * @vsi: vsi pointer
+ */
+void irdma_ws_reset(struct irdma_sc_vsi *vsi)
+{
+	u8 i;
+
+	mutex_lock(&vsi->dev->ws_mutex);
+	for (i = 0; i < IRDMA_MAX_USER_PRIORITY; ++i)
+		irdma_remove_leaf(vsi, i);
+	mutex_unlock(&vsi->dev->ws_mutex);
+}
diff --git a/drivers/infiniband/hw/irdma/ws.h b/drivers/infiniband/hw/irdma/ws.h
new file mode 100644
index 0000000..fe0474d
--- /dev/null
+++ b/drivers/infiniband/hw/irdma/ws.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB */
+/* Copyright (c) 2019, Intel Corporation. */
+
+#ifndef IRDMA_WS_H
+#define IRDMA_WS_H
+
+#include "osdep.h"
+
+enum irdma_ws_node_type {
+	WS_NODE_TYPE_PARENT,
+	WS_NODE_TYPE_LEAF,
+};
+
+enum irdma_ws_match_type {
+	WS_MATCH_TYPE_VSI,
+	WS_MATCH_TYPE_TC,
+};
+
+struct irdma_ws_node {
+	struct list_head siblings;
+	struct irdma_ws_node *first_child;
+	struct irdma_ws_node *parent;
+	u64 lan_qs_handle;	  /* opaque handle used by LAN */
+	u32 l2_sched_node_id;
+	u16 index;
+	u16 qs_handle;
+	u16 vsi_index;
+	u8 traffic_class;
+	u8 user_pri;
+	u8 rel_bw;
+	u8 abstraction_layer; /* used for splitting a TC */
+	u8 prio_type;
+	bool type_leaf;
+	bool enable;
+};
+
+enum irdma_status_code irdma_ws_add(struct irdma_sc_vsi *vsi, u8 user_pri);
+void irdma_ws_remove(struct irdma_sc_vsi *vsi, u8 user_pri);
+void irdma_ws_reset(struct irdma_sc_vsi *vsi);
+#endif  /* IRDMA_WS_H */
-- 
1.8.3.1


^ permalink raw reply related

* [RFC 07/19] RDMA/irdma: Add HMC backing store setup functions
From: Shiraz Saleem @ 2019-02-12 21:43 UTC (permalink / raw)
  To: dledford, jgg, davem
  Cc: linux-rdma, netdev, mustafa.ismail, jeffrey.t.kirsher,
	Shiraz Saleem
In-Reply-To: <20190212214402.23284-1-shiraz.saleem@intel.com>

From: Mustafa Ismail <mustafa.ismail@intel.com>

HW uses host memory as a backing store for a number of
protocol context objects and queue state tracking.
The Host Memory Cache (HMC) is a component responsible for
managing these objects stored in host memory.

Add the functions and data structures to manage the allocation
of backing pages used by the HMC for the various objects

Signed-off-by: Mustafa Ismail <mustafa.ismail@intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
---
 drivers/infiniband/hw/irdma/hmc.c | 716 ++++++++++++++++++++++++++++++++++++++
 drivers/infiniband/hw/irdma/hmc.h | 221 ++++++++++++
 2 files changed, 937 insertions(+)
 create mode 100644 drivers/infiniband/hw/irdma/hmc.c
 create mode 100644 drivers/infiniband/hw/irdma/hmc.h

diff --git a/drivers/infiniband/hw/irdma/hmc.c b/drivers/infiniband/hw/irdma/hmc.c
new file mode 100644
index 0000000..c896e44
--- /dev/null
+++ b/drivers/infiniband/hw/irdma/hmc.c
@@ -0,0 +1,716 @@
+// SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB
+/* Copyright (c) 2019, Intel Corporation. */
+
+#include "osdep.h"
+#include "status.h"
+#include "hmc.h"
+#include "defs.h"
+#include "type.h"
+#include "protos.h"
+
+/**
+ * irdma_find_sd_index_limit - finds segment descriptor index limit
+ * @hmc_info: pointer to the HMC configuration information structure
+ * @type: type of HMC resources we're searching
+ * @index: starting index for the object
+ * @cnt: number of objects we're trying to create
+ * @sd_idx: pointer to return index of the segment descriptor in question
+ * @sd_limit: pointer to return the maximum number of segment descriptors
+ *
+ * This function calculates the segment descriptor index and index limit
+ * for the resource defined by irdma_hmc_rsrc_type.
+ */
+
+static void irdma_find_sd_index_limit(struct irdma_hmc_info *hmc_info,
+				      u32 type,
+				      u32 idx,
+				      u32 cnt,
+				      u32 *sd_idx,
+				      u32 *sd_limit)
+{
+	u64 fpm_addr, fpm_limit;
+
+	fpm_addr = hmc_info->hmc_obj[(type)].base +
+		   hmc_info->hmc_obj[type].size * idx;
+	fpm_limit = fpm_addr + hmc_info->hmc_obj[type].size * cnt;
+	*sd_idx = (u32)(fpm_addr / IRDMA_HMC_DIRECT_BP_SIZE);
+	*sd_limit = (u32)((fpm_limit - 1) / IRDMA_HMC_DIRECT_BP_SIZE);
+	*sd_limit += 1;
+}
+
+/**
+ * irdma_find_pd_index_limit - finds page descriptor index limit
+ * @hmc_info: pointer to the HMC configuration information struct
+ * @type: HMC resource type we're examining
+ * @idx: starting index for the object
+ * @cnt: number of objects we're trying to create
+ * @pd_index: pointer to return page descriptor index
+ * @pd_limit: pointer to return page descriptor index limit
+ *
+ * Calculates the page descriptor index and index limit for the resource
+ * defined by irdma_hmc_rsrc_type.
+ */
+
+static void irdma_find_pd_index_limit(struct irdma_hmc_info *hmc_info,
+				      u32 type,
+				      u32 idx,
+				      u32 cnt,
+				      u32 *pd_idx,
+				      u32 *pd_limit)
+{
+	u64 fpm_adr, fpm_limit;
+
+	fpm_adr = hmc_info->hmc_obj[type].base +
+		  hmc_info->hmc_obj[type].size * idx;
+	fpm_limit = fpm_adr + (hmc_info)->hmc_obj[(type)].size * (cnt);
+	*(pd_idx) = (u32)(fpm_adr / IRDMA_HMC_PAGED_BP_SIZE);
+	*(pd_limit) = (u32)((fpm_limit - 1) / IRDMA_HMC_PAGED_BP_SIZE);
+	*(pd_limit) += 1;
+}
+
+/**
+ * irdma_set_sd_entry - setup entry for sd programming
+ * @pa: physical addr
+ * @idx: sd index
+ * @type: paged or direct sd
+ * @entry: sd entry ptr
+ */
+static void irdma_set_sd_entry(u64 pa,
+			       u32 idx,
+			       enum irdma_sd_entry_type type,
+			       struct irdma_update_sd_entry *entry)
+{
+	entry->data = pa | (IRDMA_HMC_MAX_BP_COUNT << IRDMA_PFHMC_SDDATALOW_PMSDBPCOUNT_S) |
+		      (((type == IRDMA_SD_TYPE_PAGED) ? 0 : 1) << IRDMA_PFHMC_SDDATALOW_PMSDTYPE_S) |
+		      (1 << IRDMA_PFHMC_SDDATALOW_PMSDVALID_S);
+	entry->cmd = (idx | (1 << IRDMA_PFHMC_SDCMD_PMSDWR_S) | (1 << 15));
+}
+
+/**
+ * irdma_clr_sd_entry - setup entry for sd clear
+ * @idx: sd index
+ * @type: paged or direct sd
+ * @entry: sd entry ptr
+ */
+static void irdma_clr_sd_entry(u32 idx,
+			       enum irdma_sd_entry_type type,
+			       struct irdma_update_sd_entry *entry)
+{
+	entry->data = (IRDMA_HMC_MAX_BP_COUNT << IRDMA_PFHMC_SDDATALOW_PMSDBPCOUNT_S) |
+		      (((type == IRDMA_SD_TYPE_PAGED) ? 0 : 1) << IRDMA_PFHMC_SDDATALOW_PMSDTYPE_S);
+	entry->cmd = (idx | (1 << IRDMA_PFHMC_SDCMD_PMSDWR_S) | (1 << 15));
+}
+
+/**
+ * irdma_hmc_sd_one - setup 1 sd entry for cqp
+ * @dev: pointer to the device structure
+ * @hmc_fn_id: hmc's function id
+ * @pa: physical addr
+ * @sd_idx: sd index
+ * @type: paged or direct sd
+ * @setsd: flag to set or clear sd
+ */
+enum irdma_status_code irdma_hmc_sd_one(struct irdma_sc_dev *dev,
+					u8 hmc_fn_id,
+					u64 pa, u32 sd_idx,
+					enum irdma_sd_entry_type type,
+					bool setsd)
+{
+	struct irdma_update_sds_info sdinfo;
+
+	sdinfo.cnt = 1;
+	sdinfo.hmc_fn_id = hmc_fn_id;
+	if (setsd)
+		irdma_set_sd_entry(pa, sd_idx, type, sdinfo.entry);
+	else
+		irdma_clr_sd_entry(sd_idx, type, sdinfo.entry);
+	return dev->cqp->process_cqp_sds(dev, &sdinfo);
+}
+
+/**
+ * irdma_hmc_sd_grp - setup group od sd entries for cqp
+ * @dev: pointer to the device structure
+ * @hmc_info: pointer to the HMC configuration information struct
+ * @sd_index: sd index
+ * @sd_cnt: number of sd entries
+ * @setsd: flag to set or clear sd
+ */
+static enum irdma_status_code irdma_hmc_sd_grp(struct irdma_sc_dev *dev,
+					       struct irdma_hmc_info *hmc_info,
+					       u32 sd_index,
+					       u32 sd_cnt,
+					       bool setsd)
+{
+	struct irdma_hmc_sd_entry *sd_entry;
+	struct irdma_update_sds_info sdinfo = {};
+	u64 pa;
+	u32 i;
+	enum irdma_status_code ret_code = 0;
+
+	sdinfo.hmc_fn_id = hmc_info->hmc_fn_id;
+	for (i = sd_index; i < sd_index + sd_cnt; i++) {
+		sd_entry = &hmc_info->sd_table.sd_entry[i];
+		if (!sd_entry ||
+		    (!sd_entry->valid && setsd) ||
+		    (sd_entry->valid && !setsd))
+			continue;
+		if (setsd) {
+			pa = (sd_entry->entry_type == IRDMA_SD_TYPE_PAGED) ?
+			    sd_entry->u.pd_table.pd_page_addr.pa :
+			    sd_entry->u.bp.addr.pa;
+			irdma_set_sd_entry(pa, i, sd_entry->entry_type,
+					   &sdinfo.entry[sdinfo.cnt]);
+		} else {
+			irdma_clr_sd_entry(i, sd_entry->entry_type,
+					   &sdinfo.entry[sdinfo.cnt]);
+		}
+		sdinfo.cnt++;
+		if (sdinfo.cnt == IRDMA_MAX_SD_ENTRIES) {
+			ret_code = dev->cqp->process_cqp_sds(dev, &sdinfo);
+			if (ret_code) {
+				irdma_debug(dev, IRDMA_DEBUG_HMC,
+					    "sd_programming failed err=%d\n",
+					    ret_code);
+				return ret_code;
+			}
+
+			sdinfo.cnt = 0;
+		}
+	}
+	if (sdinfo.cnt)
+		ret_code = dev->cqp->process_cqp_sds(dev, &sdinfo);
+
+	return ret_code;
+}
+
+/**
+ * irdma_hmc_finish_add_sd_reg - program sd entries for objects
+ * @dev: pointer to the device structure
+ * @info: create obj info
+ */
+static enum irdma_status_code
+irdma_hmc_finish_add_sd_reg(struct irdma_sc_dev *dev,
+			    struct irdma_hmc_create_obj_info *info)
+{
+	if (info->start_idx >= info->hmc_info->hmc_obj[info->rsrc_type].cnt)
+		return IRDMA_ERR_INVALID_HMC_OBJ_INDEX;
+
+	if ((info->start_idx + info->count) >
+			info->hmc_info->hmc_obj[info->rsrc_type].cnt)
+		return IRDMA_ERR_INVALID_HMC_OBJ_COUNT;
+
+	if (!info->add_sd_cnt)
+		return 0;
+	return irdma_hmc_sd_grp(dev, info->hmc_info,
+				 info->hmc_info->sd_indexes[0],
+				 info->add_sd_cnt, true);
+}
+
+/**
+ * irdma_create_iw_hmc_obj - allocate backing store for hmc objects
+ * @dev: pointer to the device structure
+ * @info: pointer to irdma_hmc_iw_create_obj_info struct
+ *
+ * This will allocate memory for PDs and backing pages and populate
+ * the sd and pd entries.
+ */
+enum irdma_status_code
+irdma_sc_create_hmc_obj(struct irdma_sc_dev *dev,
+			struct irdma_hmc_create_obj_info *info)
+{
+	struct irdma_hmc_sd_entry *sd_entry;
+	u32 sd_idx, sd_lmt;
+	u32 pd_idx = 0, pd_lmt = 0;
+	u32 pd_idx1 = 0, pd_lmt1 = 0;
+	u32 i, j;
+	bool pd_error = false;
+	enum irdma_status_code ret_code = 0;
+
+	if (info->start_idx >= info->hmc_info->hmc_obj[info->rsrc_type].cnt)
+		return IRDMA_ERR_INVALID_HMC_OBJ_INDEX;
+
+	if ((info->start_idx + info->count) >
+	    info->hmc_info->hmc_obj[info->rsrc_type].cnt) {
+		irdma_debug(dev, IRDMA_DEBUG_HMC,
+			    "error type %u, start = %u, req cnt %u, cnt = %u\n",
+			    info->rsrc_type, info->start_idx, info->count,
+			    info->hmc_info->hmc_obj[info->rsrc_type].cnt);
+		return IRDMA_ERR_INVALID_HMC_OBJ_COUNT;
+	}
+
+	irdma_find_sd_index_limit(info->hmc_info, info->rsrc_type,
+				  info->start_idx, info->count,
+				  &sd_idx, &sd_lmt);
+	if (sd_idx >= info->hmc_info->sd_table.sd_cnt ||
+	    sd_lmt > info->hmc_info->sd_table.sd_cnt) {
+		return IRDMA_ERR_INVALID_SD_INDEX;
+	}
+
+	irdma_find_pd_index_limit(info->hmc_info, info->rsrc_type,
+				  info->start_idx, info->count,
+				  &pd_idx, &pd_lmt);
+
+	for (j = sd_idx; j < sd_lmt; j++) {
+		ret_code = irdma_add_sd_table_entry(dev->hw,
+						    info->hmc_info,
+						    j,
+						    info->entry_type,
+						    IRDMA_HMC_DIRECT_BP_SIZE);
+		if (ret_code)
+			goto exit_sd_error;
+
+		sd_entry = &info->hmc_info->sd_table.sd_entry[j];
+		if (sd_entry->entry_type == IRDMA_SD_TYPE_PAGED &&
+		    (dev->hmc_info == info->hmc_info &&
+		     info->rsrc_type != IRDMA_HMC_IW_PBLE)) {
+			pd_idx1 = max(pd_idx, (j * IRDMA_HMC_MAX_BP_COUNT));
+			pd_lmt1 = min(pd_lmt,
+				      (j + 1) * IRDMA_HMC_MAX_BP_COUNT);
+			for (i = pd_idx1; i < pd_lmt1; i++) {
+				/* update the pd table entry */
+				ret_code = irdma_add_pd_table_entry(dev,
+								    info->hmc_info,
+								    i,
+								    NULL);
+				if (ret_code) {
+					pd_error = true;
+					break;
+				}
+			}
+			if (pd_error) {
+				while (i && (i > pd_idx1)) {
+					irdma_remove_pd_bp(dev, info->hmc_info,
+							   i - 1);
+					i--;
+				}
+			}
+		}
+		if (sd_entry->valid)
+			continue;
+
+		info->hmc_info->sd_indexes[info->add_sd_cnt] = (u16)j;
+		info->add_sd_cnt++;
+		sd_entry->valid = true;
+	}
+	return irdma_hmc_finish_add_sd_reg(dev, info);
+
+exit_sd_error:
+	while (j && (j > sd_idx)) {
+		sd_entry = &info->hmc_info->sd_table.sd_entry[j - 1];
+		switch (sd_entry->entry_type) {
+		case IRDMA_SD_TYPE_PAGED:
+			pd_idx1 = max(pd_idx,
+				      (j - 1) * IRDMA_HMC_MAX_BP_COUNT);
+			pd_lmt1 = min(pd_lmt, (j * IRDMA_HMC_MAX_BP_COUNT));
+			for (i = pd_idx1; i < pd_lmt1; i++)
+				irdma_prep_remove_pd_page(info->hmc_info, i);
+			break;
+		case IRDMA_SD_TYPE_DIRECT:
+			irdma_prep_remove_pd_page(info->hmc_info, (j - 1));
+			break;
+		default:
+			ret_code = IRDMA_ERR_INVALID_SD_TYPE;
+			break;
+		}
+		j--;
+	}
+
+	return ret_code;
+}
+
+/**
+ * irdma_finish_del_sd_reg - delete sd entries for objects
+ * @dev: pointer to the device structure
+ * @info: dele obj info
+ * @reset: true if called before reset
+ */
+static enum irdma_status_code
+irdma_finish_del_sd_reg(struct irdma_sc_dev *dev,
+			struct irdma_hmc_del_obj_info *info,
+			bool reset)
+{
+	struct irdma_hmc_sd_entry *sd_entry;
+	enum irdma_status_code ret_code = 0;
+	u32 i, sd_idx;
+	struct irdma_dma_mem *mem;
+
+	if (dev->is_pf && !reset)
+		ret_code = irdma_hmc_sd_grp(dev, info->hmc_info,
+					    info->hmc_info->sd_indexes[0],
+					    info->del_sd_cnt, false);
+
+	if (ret_code)
+		irdma_debug(dev, IRDMA_DEBUG_HMC, "error cqp sd sd_grp\n");
+	for (i = 0; i < info->del_sd_cnt; i++) {
+		sd_idx = info->hmc_info->sd_indexes[i];
+		sd_entry = &info->hmc_info->sd_table.sd_entry[sd_idx];
+		if (!sd_entry)
+			continue;
+		mem = (sd_entry->entry_type == IRDMA_SD_TYPE_PAGED) ?
+			&sd_entry->u.pd_table.pd_page_addr :
+			&sd_entry->u.bp.addr;
+
+		if (!mem || !mem->va)
+			irdma_debug(dev, IRDMA_DEBUG_HMC, "error cqp sd mem\n");
+		else
+			irdma_free_dma_mem(dev->hw, mem);
+	}
+
+	return ret_code;
+}
+
+/**
+ * irdma_del_iw_hmc_obj - remove pe hmc objects
+ * @dev: pointer to the device structure
+ * @info: pointer to irdma_hmc_del_obj_info struct
+ * @reset: true if called before reset
+ *
+ * This will de-populate the SDs and PDs.  It frees
+ * the memory for PDS and backing storage.  After this function is returned,
+ * caller should deallocate memory allocated previously for
+ * book-keeping information about PDs and backing storage.
+ */
+enum irdma_status_code irdma_sc_del_hmc_obj(struct irdma_sc_dev *dev,
+					    struct irdma_hmc_del_obj_info *info,
+					    bool reset)
+{
+	struct irdma_hmc_pd_table *pd_table;
+	u32 sd_idx, sd_lmt;
+	u32 pd_idx, pd_lmt, rel_pd_idx;
+	u32 i, j;
+	enum irdma_status_code ret_code = 0;
+
+	if (info->start_idx >= info->hmc_info->hmc_obj[info->rsrc_type].cnt) {
+		irdma_debug(dev, IRDMA_DEBUG_HMC,
+			    "error start_idx[%04d]  >= [type %04d].cnt[%04d]\n",
+			    info->start_idx, info->rsrc_type,
+			    info->hmc_info->hmc_obj[info->rsrc_type].cnt);
+		return IRDMA_ERR_INVALID_HMC_OBJ_INDEX;
+	}
+
+	if ((info->start_idx + info->count) >
+	    info->hmc_info->hmc_obj[info->rsrc_type].cnt) {
+		irdma_debug(dev, IRDMA_DEBUG_HMC,
+			    "error start_idx[%04d] + count %04d  >= [type %04d].cnt[%04d]\n",
+			    info->start_idx, info->count,
+			    info->rsrc_type,
+			    info->hmc_info->hmc_obj[info->rsrc_type].cnt);
+		return IRDMA_ERR_INVALID_HMC_OBJ_COUNT;
+	}
+
+	irdma_find_pd_index_limit(info->hmc_info, info->rsrc_type,
+				  info->start_idx, info->count,
+				  &pd_idx, &pd_lmt);
+
+	for (j = pd_idx; j < pd_lmt; j++) {
+		sd_idx = j / IRDMA_HMC_PD_CNT_IN_SD;
+
+		if (!info->hmc_info->sd_table.sd_entry[sd_idx].valid)
+			continue;
+
+		if (info->hmc_info->sd_table.sd_entry[sd_idx].entry_type !=
+		    IRDMA_SD_TYPE_PAGED)
+			continue;
+
+		rel_pd_idx = j % IRDMA_HMC_PD_CNT_IN_SD;
+		pd_table = &info->hmc_info->sd_table.sd_entry[sd_idx].u.pd_table;
+		if (pd_table->pd_entry && pd_table->pd_entry[rel_pd_idx].valid) {
+			ret_code = irdma_remove_pd_bp(dev, info->hmc_info, j);
+			if (ret_code) {
+				irdma_debug(dev, IRDMA_DEBUG_HMC, "remove_pd_bp error\n");
+				return ret_code;
+			}
+		}
+	}
+
+	irdma_find_sd_index_limit(info->hmc_info, info->rsrc_type,
+				  info->start_idx, info->count,
+				  &sd_idx, &sd_lmt);
+	if (sd_idx >= info->hmc_info->sd_table.sd_cnt ||
+	    sd_lmt > info->hmc_info->sd_table.sd_cnt) {
+		irdma_debug(dev, IRDMA_DEBUG_HMC, "invalid sd_idx\n");
+		return IRDMA_ERR_INVALID_SD_INDEX;
+	}
+
+	for (i = sd_idx; i < sd_lmt; i++) {
+		pd_table = &info->hmc_info->sd_table.sd_entry[i].u.pd_table;
+		if (!info->hmc_info->sd_table.sd_entry[i].valid)
+			continue;
+		switch (info->hmc_info->sd_table.sd_entry[i].entry_type) {
+		case IRDMA_SD_TYPE_DIRECT:
+			ret_code = irdma_prep_remove_sd_bp(info->hmc_info, i);
+			if (!ret_code) {
+				info->hmc_info->sd_indexes[info->del_sd_cnt] = (u16)i;
+				info->del_sd_cnt++;
+			}
+			break;
+		case IRDMA_SD_TYPE_PAGED:
+			ret_code = irdma_prep_remove_pd_page(info->hmc_info, i);
+			if (ret_code)
+				break;
+			if (dev->hmc_info != info->hmc_info &&
+			    info->rsrc_type == IRDMA_HMC_IW_PBLE &&
+			    pd_table->pd_entry) {
+				irdma_free_virt_mem(dev->hw,
+						    &pd_table->pd_entry_virt_mem);
+				pd_table->pd_entry = NULL;
+			}
+			info->hmc_info->sd_indexes[info->del_sd_cnt] = (u16)i;
+			info->del_sd_cnt++;
+			break;
+		default:
+			break;
+		}
+	}
+
+	return irdma_finish_del_sd_reg(dev, info, reset);
+}
+
+/**
+ * irdma_add_sd_table_entry - Adds a segment descriptor to the table
+ * @hw: pointer to our hw struct
+ * @hmc_info: pointer to the HMC configuration information struct
+ * @sd_index: segment descriptor index to manipulate
+ * @type: what type of segment descriptor we're manipulating
+ * @direct_mode_sz: size to alloc in direct mode
+ */
+enum irdma_status_code irdma_add_sd_table_entry(struct irdma_hw *hw,
+						struct irdma_hmc_info *hmc_info,
+						u32 sd_index,
+						enum irdma_sd_entry_type type,
+						u64 direct_mode_sz)
+{
+	enum irdma_status_code ret_code = 0;
+	struct irdma_hmc_sd_entry *sd_entry;
+	bool dma_mem_alloc_done = false;
+	struct irdma_dma_mem mem;
+	u64 alloc_len;
+
+	sd_entry = &hmc_info->sd_table.sd_entry[sd_index];
+	if (!sd_entry->valid) {
+		if (type == IRDMA_SD_TYPE_PAGED)
+			alloc_len = IRDMA_HMC_PAGED_BP_SIZE;
+		else
+			alloc_len = direct_mode_sz;
+
+		/* allocate a 4K pd page or 2M backing page */
+		ret_code = irdma_allocate_dma_mem(hw, &mem, alloc_len,
+						  IRDMA_HMC_PD_BP_BUF_ALIGNMENT);
+		if (ret_code)
+			goto exit;
+
+		dma_mem_alloc_done = true;
+		if (type == IRDMA_SD_TYPE_PAGED) {
+			ret_code = irdma_allocate_virt_mem(hw,
+							   &sd_entry->u.pd_table.pd_entry_virt_mem,
+							   sizeof(struct irdma_hmc_pd_entry) * 512);
+			if (ret_code)
+				goto exit;
+			sd_entry->u.pd_table.pd_entry = (struct irdma_hmc_pd_entry *)
+							sd_entry->u.pd_table.pd_entry_virt_mem.va;
+
+			memcpy(&sd_entry->u.pd_table.pd_page_addr, &mem,
+			       sizeof(sd_entry->u.pd_table.pd_page_addr));
+		} else {
+			memcpy(&sd_entry->u.bp.addr, &mem,
+			       sizeof(sd_entry->u.bp.addr));
+
+			    sd_entry->u.bp.sd_pd_index = sd_index;
+		}
+
+		hmc_info->sd_table.sd_entry[sd_index].entry_type = type;
+		IRDMA_INC_SD_REFCNT(&hmc_info->sd_table);
+	}
+	if (sd_entry->entry_type == IRDMA_SD_TYPE_DIRECT)
+		IRDMA_INC_BP_REFCNT(&sd_entry->u.bp);
+exit:
+	if (ret_code)
+		if (dma_mem_alloc_done)
+			irdma_free_dma_mem(hw, &mem);
+
+	return ret_code;
+}
+
+/**
+ * irdma_add_pd_table_entry - Adds page descriptor to the specified table
+ * @hw: pointer to our HW structure
+ * @hmc_info: pointer to the HMC configuration information structure
+ * @pd_index: which page descriptor index to manipulate
+ * @rsrc_pg: if not NULL, use preallocated page instead of allocating new one.
+ *
+ * This function:
+ *	1. Initializes the pd entry
+ *	2. Adds pd_entry in the pd_table
+ *	3. Mark the entry valid in irdma_hmc_pd_entry structure
+ *	4. Initializes the pd_entry's ref count to 1
+ * assumptions:
+ *	1. The memory for pd should be pinned down, physically contiguous and
+ *	   aligned on 4K boundary and zeroed memory.
+ *	2. It should be 4K in size.
+ */
+enum irdma_status_code irdma_add_pd_table_entry(struct irdma_sc_dev *dev,
+						struct irdma_hmc_info *hmc_info,
+						u32 pd_index,
+						struct irdma_dma_mem *rsrc_pg)
+{
+	enum irdma_status_code ret_code = 0;
+	struct irdma_hmc_pd_table *pd_table;
+	struct irdma_hmc_pd_entry *pd_entry;
+	struct irdma_dma_mem mem;
+	struct irdma_dma_mem *page = &mem;
+	u32 sd_idx, rel_pd_idx;
+	u64 *pd_addr;
+	u64 page_desc;
+
+	if (pd_index / IRDMA_HMC_PD_CNT_IN_SD >= hmc_info->sd_table.sd_cnt)
+		return IRDMA_ERR_INVALID_PAGE_DESC_INDEX;
+
+	sd_idx = (pd_index / IRDMA_HMC_PD_CNT_IN_SD);
+	if (hmc_info->sd_table.sd_entry[sd_idx].entry_type != IRDMA_SD_TYPE_PAGED)
+		return 0;
+
+	rel_pd_idx = (pd_index % IRDMA_HMC_PD_CNT_IN_SD);
+	pd_table = &hmc_info->sd_table.sd_entry[sd_idx].u.pd_table;
+	pd_entry = &pd_table->pd_entry[rel_pd_idx];
+	if (!pd_entry->valid) {
+		if (rsrc_pg) {
+			pd_entry->rsrc_pg = true;
+			page = rsrc_pg;
+		} else {
+			ret_code = irdma_allocate_dma_mem(dev->hw, page,
+							  IRDMA_HMC_PAGED_BP_SIZE,
+							  IRDMA_HMC_PD_BP_BUF_ALIGNMENT);
+			if (ret_code)
+				return ret_code;
+			pd_entry->rsrc_pg = false;
+		}
+
+		memcpy(&pd_entry->bp.addr, page, sizeof(pd_entry->bp.addr));
+		pd_entry->bp.sd_pd_index = pd_index;
+		pd_entry->bp.entry_type = IRDMA_SD_TYPE_PAGED;
+		page_desc = page->pa | 0x1;
+		pd_addr = (u64 *)pd_table->pd_page_addr.va;
+		pd_addr += rel_pd_idx;
+		memcpy(pd_addr, &page_desc, sizeof(*pd_addr));
+		pd_entry->sd_index = sd_idx;
+		pd_entry->valid = true;
+		IRDMA_INC_PD_REFCNT(pd_table);
+		if (hmc_info->hmc_fn_id < dev->hw_attrs.first_hw_vf_fpm_id)
+			IRDMA_INVALIDATE_PF_HMC_PD(dev, sd_idx, rel_pd_idx);
+		else if (dev->hw->hmc.hmc_fn_id != hmc_info->hmc_fn_id)
+			IRDMA_INVALIDATE_VF_HMC_PD(dev, sd_idx, rel_pd_idx,
+						   hmc_info->hmc_fn_id);
+	}
+	IRDMA_INC_BP_REFCNT(&pd_entry->bp);
+
+	return 0;
+}
+
+/**
+ * irdma_remove_pd_bp - remove a backing page from a page descriptor
+ * @hw: pointer to our HW structure
+ * @hmc_info: pointer to the HMC configuration information structure
+ * @idx: the page index
+ *
+ * This function:
+ *	1. Marks the entry in pd table (for paged address mode) or in sd table
+ *	   (for direct address mode) invalid.
+ *	2. Write to register PMPDINV to invalidate the backing page in FV cache
+ *	3. Decrement the ref count for the pd _entry
+ * assumptions:
+ *	1. Caller can deallocate the memory used by backing storage after this
+ *	   function returns.
+ */
+enum irdma_status_code irdma_remove_pd_bp(struct irdma_sc_dev *dev,
+					  struct irdma_hmc_info *hmc_info,
+					  u32 idx)
+{
+	struct irdma_hmc_pd_entry *pd_entry;
+	struct irdma_hmc_pd_table *pd_table;
+	struct irdma_hmc_sd_entry *sd_entry;
+	u32 sd_idx, rel_pd_idx;
+	struct irdma_dma_mem *mem;
+	u64 *pd_addr;
+
+	sd_idx = idx / IRDMA_HMC_PD_CNT_IN_SD;
+	rel_pd_idx = idx % IRDMA_HMC_PD_CNT_IN_SD;
+	if (sd_idx >= hmc_info->sd_table.sd_cnt)
+		return IRDMA_ERR_INVALID_PAGE_DESC_INDEX;
+
+	sd_entry = &hmc_info->sd_table.sd_entry[sd_idx];
+	if (sd_entry->entry_type != IRDMA_SD_TYPE_PAGED)
+		return IRDMA_ERR_INVALID_SD_TYPE;
+
+	pd_table = &hmc_info->sd_table.sd_entry[sd_idx].u.pd_table;
+	pd_entry = &pd_table->pd_entry[rel_pd_idx];
+	IRDMA_DEC_BP_REFCNT(&pd_entry->bp);
+	if (pd_entry->bp.ref_cnt)
+		return 0;
+
+	pd_entry->valid = false;
+	IRDMA_DEC_PD_REFCNT(pd_table);
+	pd_addr = (u64 *)pd_table->pd_page_addr.va;
+	pd_addr += rel_pd_idx;
+	memset(pd_addr, 0, sizeof(u64));
+	if (dev->is_pf) {
+		if (dev->hmc_fn_id == hmc_info->hmc_fn_id)
+			IRDMA_INVALIDATE_PF_HMC_PD(dev, sd_idx, idx);
+		else
+			IRDMA_INVALIDATE_VF_HMC_PD(dev, sd_idx, idx,
+						   hmc_info->hmc_fn_id);
+	}
+
+	if (!pd_entry->rsrc_pg) {
+		mem = &pd_entry->bp.addr;
+		if (!mem || !mem->va)
+			return IRDMA_ERR_PARAM;
+
+		irdma_free_dma_mem(dev->hw, mem);
+	}
+	if (!pd_table->ref_cnt)
+		irdma_free_virt_mem(dev->hw, &pd_table->pd_entry_virt_mem);
+
+	return 0;
+}
+
+/**
+ * irdma_prep_remove_sd_bp - Prepares to remove a backing page from a sd entry
+ * @hmc_info: pointer to the HMC configuration information structure
+ * @idx: the page index
+ */
+enum irdma_status_code irdma_prep_remove_sd_bp(struct irdma_hmc_info *hmc_info,
+					       u32 idx)
+{
+	struct irdma_hmc_sd_entry *sd_entry;
+
+	sd_entry = &hmc_info->sd_table.sd_entry[idx];
+	IRDMA_DEC_BP_REFCNT(&sd_entry->u.bp);
+	if (sd_entry->u.bp.ref_cnt)
+		return IRDMA_ERR_NOT_READY;
+
+	IRDMA_DEC_SD_REFCNT(&hmc_info->sd_table);
+	sd_entry->valid = false;
+
+	return 0;
+}
+
+/**
+ * irdma_prep_remove_pd_page - Prepares to remove a PD page from sd entry.
+ * @hmc_info: pointer to the HMC configuration information structure
+ * @idx: segment descriptor index to find the relevant page descriptor
+ */
+enum irdma_status_code
+irdma_prep_remove_pd_page(struct irdma_hmc_info *hmc_info, u32 idx)
+{
+	struct irdma_hmc_sd_entry *sd_entry;
+
+	sd_entry = &hmc_info->sd_table.sd_entry[idx];
+
+	if (sd_entry->u.pd_table.ref_cnt)
+		return IRDMA_ERR_NOT_READY;
+
+	sd_entry->valid = false;
+	IRDMA_DEC_SD_REFCNT(&hmc_info->sd_table);
+
+	return 0;
+}
diff --git a/drivers/infiniband/hw/irdma/hmc.h b/drivers/infiniband/hw/irdma/hmc.h
new file mode 100644
index 0000000..25c8a02
--- /dev/null
+++ b/drivers/infiniband/hw/irdma/hmc.h
@@ -0,0 +1,221 @@
+/* SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB */
+/* Copyright (c) 2019, Intel Corporation. */
+
+#ifndef IRDMA_HMC_H
+#define IRDMA_HMC_H
+
+#include "defs.h"
+
+#define IRDMA_HMC_MAX_BP_COUNT			512
+#define IRDMA_MAX_SD_ENTRIES			11
+#define IRDMA_HW_DBG_HMC_INVALID_BP_MARK	0xca
+#define IRDMA_HMC_INFO_SIGNATURE		0x484d5347
+#define IRDMA_HMC_PD_CNT_IN_SD			512
+#define IRDMA_HMC_DIRECT_BP_SIZE		0x200000
+#define IRDMA_HMC_MAX_SD_COUNT			4096
+#define IRDMA_HMC_PAGED_BP_SIZE			4096
+#define IRDMA_HMC_PD_BP_BUF_ALIGNMENT		4096
+#define IRDMA_FIRST_VF_FPM_ID			8
+#define FPM_MULTIPLIER				1024
+
+#define IRDMA_INC_SD_REFCNT(sd_table)	((sd_table)->ref_cnt++)
+#define IRDMA_INC_PD_REFCNT(pd_table)	((pd_table)->ref_cnt++)
+#define IRDMA_INC_BP_REFCNT(bp)		((bp)->ref_cnt++)
+
+#define IRDMA_DEC_SD_REFCNT(sd_table)	((sd_table)->ref_cnt--)
+#define IRDMA_DEC_PD_REFCNT(pd_table)	((pd_table)->ref_cnt--)
+#define IRDMA_DEC_BP_REFCNT(bp)		((bp)->ref_cnt--)
+
+/**
+ * IRDMA_INVALIDATE_PF_HMC_PD - Invalidates the pd cache in the hardware
+ * @hw: pointer to our hw struct
+ * @sd_idx: segment descriptor index
+ * @pd_idx: page descriptor index
+ */
+#define IRDMA_INVALIDATE_PF_HMC_PD(dev, sd_idx, pd_idx)			\
+	wr32((dev)->hw, (dev)->hw_regs[IRDMA_PFHMC_PDINV],		\
+		(((sd_idx) << IRDMA_PFHMC_PDINV_PMSDIDX_S) |		\
+		(0x1 << IRDMA_PFHMC_PDINV_PMSDPARTSEL_S) |		\
+		((pd_idx) << IRDMA_PFHMC_PDINV_PMPDIDX_S)))
+
+/**
+ * IRDMA_INVALIDATE_VF_HMC_PD - Invalidates the pd cache in the hardware
+ * @hw: pointer to our hw struct
+ * @sd_idx: segment descriptor index
+ * @pd_idx: page descriptor index
+ * @hmc_fn_id: VF's function id
+ */
+#define IRDMA_INVALIDATE_VF_HMC_PD(dev, sd_idx, pd_idx, hmc_fn_id)	\
+	wr32((dev)->hw,							\
+	     (dev)->hw_regs[IRDMA_GLHMC_VFPDINV] +			\
+	     4 * ((hmc_fn_id) - (dev)->hw_attrs.first_hw_vf_fpm_id),	\
+	     (((sd_idx) << IRDMA_PFHMC_PDINV_PMSDIDX_S) |		\
+	      ((pd_idx) << IRDMA_PFHMC_PDINV_PMPDIDX_S)))
+
+enum irdma_hmc_rsrc_type {
+	IRDMA_HMC_IW_QP = 0,
+	IRDMA_HMC_IW_CQ = 1,
+	IRDMA_HMC_IW_RESERVED = 2,
+	IRDMA_HMC_IW_HTE = 3,
+	IRDMA_HMC_IW_ARP = 4,
+	IRDMA_HMC_IW_APBVT_ENTRY = 5,
+	IRDMA_HMC_IW_MR = 6,
+	IRDMA_HMC_IW_XF = 7,
+	IRDMA_HMC_IW_XFFL = 8,
+	IRDMA_HMC_IW_Q1 = 9,
+	IRDMA_HMC_IW_Q1FL = 10,
+	IRDMA_HMC_IW_TIMER = 11,
+	IRDMA_HMC_IW_FSIMC = 12,
+	IRDMA_HMC_IW_FSIAV = 13,
+	IRDMA_HMC_IW_PBLE = 14,
+	IRDMA_HMC_IW_RRF = 15,
+	IRDMA_HMC_IW_RRFFL = 16,
+	IRDMA_HMC_IW_HDR = 17,
+	IRDMA_HMC_IW_MD = 18,
+	IRDMA_HMC_IW_OOISC = 19,
+	IRDMA_HMC_IW_OOISCFFL = 20,
+	IRDMA_HMC_IW_MAX, /* Must be last entry */
+};
+
+enum irdma_sd_entry_type {
+	IRDMA_SD_TYPE_INVALID = 0,
+	IRDMA_SD_TYPE_PAGED = 1,
+	IRDMA_SD_TYPE_DIRECT = 2,
+};
+
+struct irdma_hmc_obj_info {
+	u64 base;
+	u32 max_cnt;
+	u32 cnt;
+	u64 size;
+};
+
+struct irdma_hmc_bp {
+	enum irdma_sd_entry_type entry_type;
+	struct irdma_dma_mem addr;
+	u32 sd_pd_index;
+	u32 ref_cnt;
+};
+
+struct irdma_hmc_pd_entry {
+	struct irdma_hmc_bp bp;
+	u32 sd_index;
+	bool rsrc_pg;
+	bool valid;
+};
+
+struct irdma_hmc_pd_table {
+	struct irdma_dma_mem pd_page_addr;
+	struct irdma_hmc_pd_entry *pd_entry;
+	struct irdma_virt_mem pd_entry_virt_mem;
+	u32 ref_cnt;
+	u32 sd_index;
+};
+
+struct irdma_hmc_sd_entry {
+	enum irdma_sd_entry_type entry_type;
+	bool valid;
+	union {
+		struct irdma_hmc_pd_table pd_table;
+		struct irdma_hmc_bp bp;
+	} u;
+};
+
+struct irdma_hmc_sd_table {
+	struct irdma_virt_mem addr;
+	u32 sd_cnt;
+	u32 ref_cnt;
+	struct irdma_hmc_sd_entry *sd_entry;
+};
+
+struct irdma_hmc_info {
+	u32 signature;
+	u8 hmc_fn_id;
+	u16 first_sd_index;
+	struct irdma_hmc_obj_info *hmc_obj;
+	struct irdma_virt_mem hmc_obj_virt_mem;
+	struct irdma_hmc_sd_table sd_table;
+	u16 sd_indexes[IRDMA_HMC_MAX_SD_COUNT];
+};
+
+struct irdma_update_sd_entry {
+	u64 cmd;
+	u64 data;
+};
+
+struct irdma_update_sds_info {
+	u32 cnt;
+	u8 hmc_fn_id;
+	struct irdma_update_sd_entry entry[IRDMA_MAX_SD_ENTRIES];
+};
+
+struct irdma_ccq_cqe_info;
+struct irdma_hmc_fcn_info {
+	void (*callback_fcn)(struct irdma_sc_dev *dev, void *cqp_callback_param,
+			     struct irdma_ccq_cqe_info *ccq_cqe_info);
+	void *cqp_callback_param;
+	u32 vf_id;
+	u16 iw_vf_idx;
+	bool free_fcn;
+};
+
+struct irdma_hmc_create_obj_info {
+	struct irdma_hmc_info *hmc_info;
+	struct irdma_virt_mem add_sd_virt_mem;
+	u32 rsrc_type;
+	u32 start_idx;
+	u32 count;
+	u32 add_sd_cnt;
+	enum irdma_sd_entry_type entry_type;
+	bool is_pf;
+};
+
+struct irdma_hmc_del_obj_info {
+	struct irdma_hmc_info *hmc_info;
+	struct irdma_virt_mem del_sd_virt_mem;
+	u32 rsrc_type;
+	u32 start_idx;
+	u32 count;
+	u32 del_sd_cnt;
+	bool is_pf;
+};
+
+enum irdma_status_code irdma_copy_dma_mem(struct irdma_hw *hw,
+					  void *dest_buf,
+					  struct irdma_dma_mem *src_mem,
+					  u64 src_offset,
+					  u64 size);
+enum irdma_status_code irdma_sc_create_hmc_obj(struct irdma_sc_dev *dev,
+					       struct irdma_hmc_create_obj_info *info);
+enum irdma_status_code irdma_sc_del_hmc_obj(struct irdma_sc_dev *dev,
+					    struct irdma_hmc_del_obj_info *info,
+					    bool reset);
+enum irdma_status_code irdma_hmc_sd_one(struct irdma_sc_dev *dev,
+					u8 hmc_fn_id,
+					u64 pa,
+					u32 sd_idx,
+					enum irdma_sd_entry_type type,
+					bool setsd);
+enum irdma_status_code irdma_update_sds_noccq(struct irdma_sc_dev *dev,
+					      struct irdma_update_sds_info *info);
+struct irdma_vfdev *irdma_vfdev_from_fpm(struct irdma_sc_dev *dev,
+					 u8 hmc_fn_id);
+struct irdma_hmc_info *irdma_vf_hmcinfo_from_fpm(struct irdma_sc_dev *dev,
+						 u8 hmc_fn_id);
+enum irdma_status_code irdma_add_sd_table_entry(struct irdma_hw *hw,
+						struct irdma_hmc_info *hmc_info,
+						u32 sd_index,
+						enum irdma_sd_entry_type type,
+						u64 direct_mode_sz);
+enum irdma_status_code irdma_add_pd_table_entry(struct irdma_sc_dev *dev,
+						struct irdma_hmc_info *hmc_info,
+						u32 pd_index,
+						struct irdma_dma_mem *rsrc_pg);
+enum irdma_status_code irdma_remove_pd_bp(struct irdma_sc_dev *dev,
+					  struct irdma_hmc_info *hmc_info,
+					  u32 idx);
+enum irdma_status_code irdma_prep_remove_sd_bp(struct irdma_hmc_info *hmc_info,
+					       u32 idx);
+enum irdma_status_code irdma_prep_remove_pd_page(struct irdma_hmc_info *hmc_info,
+						 u32 idx);
+#endif /* IRDMA_HMC_H */
-- 
1.8.3.1


^ permalink raw reply related

* [RFC 04/19] RDMA/irdma: Add driver framework definitions
From: Shiraz Saleem @ 2019-02-12 21:43 UTC (permalink / raw)
  To: dledford, jgg, davem
  Cc: linux-rdma, netdev, mustafa.ismail, jeffrey.t.kirsher,
	Shiraz Saleem
In-Reply-To: <20190212214402.23284-1-shiraz.saleem@intel.com>

From: Mustafa Ismail <mustafa.ismail@intel.com>

Add definitions to setup irdma as a module, establish the
interface with all recognized LAN peers, initialize HW,
and register to receive netdev notifications

Signed-off-by: Mustafa Ismail <mustafa.ismail@intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
---
 drivers/infiniband/hw/irdma/i40iw_if.c | 250 ++++++++++++
 drivers/infiniband/hw/irdma/irdma_if.c | 430 ++++++++++++++++++++
 drivers/infiniband/hw/irdma/main.c     | 496 +++++++++++++++++++++++
 drivers/infiniband/hw/irdma/main.h     | 709 +++++++++++++++++++++++++++++++++
 4 files changed, 1885 insertions(+)
 create mode 100644 drivers/infiniband/hw/irdma/i40iw_if.c
 create mode 100644 drivers/infiniband/hw/irdma/irdma_if.c
 create mode 100644 drivers/infiniband/hw/irdma/main.c
 create mode 100644 drivers/infiniband/hw/irdma/main.h

diff --git a/drivers/infiniband/hw/irdma/i40iw_if.c b/drivers/infiniband/hw/irdma/i40iw_if.c
new file mode 100644
index 0000000..ec3392c
--- /dev/null
+++ b/drivers/infiniband/hw/irdma/i40iw_if.c
@@ -0,0 +1,250 @@
+// SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB
+/* Copyright (c) 2019, Intel Corporation. */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <net/addrconf.h>
+#include "main.h"
+#include "i40iw_hw.h"
+#include "i40e_client.h"
+#define CLIENT_IW_INTERFACE_VERSION_MAJOR 0
+#define CLIENT_IW_INTERFACE_VERSION_MINOR 01
+#define CLIENT_IW_INTERFACE_VERSION_BUILD 00
+
+/**
+ * i40iw_request_reset - Request a reset
+ * @rf: RDMA PCI function
+ *
+ */
+void i40iw_request_reset(struct irdma_pci_f *rf)
+{
+	struct i40e_info *ldev = (struct i40e_info *)rf->ldev.if_ldev;
+
+	ldev->ops->request_reset(ldev, rf->ldev.if_client, 1);
+}
+
+/**
+ * i40iw_open - client interface operation open for iwarp/uda device
+ * @ldev: lan device information
+ * @client: iwarp client information, provided during registration
+ *
+ * Called by the lan driver during the processing of client register
+ * Create device resources, set up queues, pble and hmc objects and
+ * register the device with the ib verbs interface
+ * Return 0 if successful, otherwise return error
+ */
+static int i40iw_open(struct i40e_info *ldev, struct i40e_client *client)
+{
+	struct irdma_device *iwdev = NULL;
+	struct irdma_handler *hdl = NULL;
+	struct irdma_priv_ldev   *pldev;
+	struct irdma_sc_dev *dev;
+	struct irdma_pci_f *rf;
+	struct irdma_l2params l2params = {};
+	int err_code = -EIO;
+	int i;
+	u16 qset;
+	u16 last_qset = IRDMA_NO_QSET;
+
+	hdl = irdma_find_handler(ldev->pcidev);
+	if (hdl)
+		return 0;
+
+	hdl = kzalloc((sizeof(*hdl) + sizeof(*iwdev)), GFP_KERNEL);
+	if (!hdl)
+		return -ENOMEM;
+
+	iwdev = (struct irdma_device *)((u8 *)hdl + sizeof(*hdl));
+
+	iwdev->param_wq = alloc_ordered_workqueue("l2params", WQ_MEM_RECLAIM);
+	if (!iwdev->param_wq)
+		goto error;
+
+	rf = &hdl->rf;
+	rf->hdl = hdl;
+	dev = &rf->sc_dev;
+	dev->back_dev = rf;
+	rf->rdma_ver = IRDMA_GEN_1;
+	irdma_init_rf_params(rf);
+	rf->init_hw = i40iw_init_hw;
+	rf->hw.hw_addr = ldev->hw_addr;
+	rf->pdev = ldev->pcidev;
+	rf->netdev = ldev->netdev;
+
+	iwdev->rf = rf;
+	iwdev->hdl = hdl;
+	iwdev->ldev = &rf->ldev;
+	iwdev->init_state = INITIAL_STATE;
+	iwdev->rcv_wnd = IRDMA_CM_DEFAULT_RCV_WND_SCALED;
+	iwdev->rcv_wscale = IRDMA_CM_DEFAULT_RCV_WND_SCALE;
+	iwdev->netdev = ldev->netdev;
+	iwdev->create_ilq = true;
+	iwdev->vsi_num = 0;
+
+	pldev = &rf->ldev;
+	hdl->ldev = pldev;
+	pldev->if_client = client;
+	pldev->if_ldev = ldev;
+	pldev->fn_num = ldev->fid;
+	pldev->ftype = ldev->ftype;
+	pldev->pf_vsi_num = 0;
+	pldev->msix_count = ldev->msix_count;
+	pldev->msix_entries = ldev->msix_entries;
+
+	if (irdma_ctrl_init_hw(rf))
+		goto error;
+
+	l2params.mtu =
+		(ldev->params.mtu) ? ldev->params.mtu : IRDMA_DEFAULT_MTU;
+	for (i = 0; i < I40E_CLIENT_MAX_USER_PRIORITY; i++) {
+		qset = ldev->params.qos.prio_qos[i].qs_handle;
+		l2params.up2tc[i] = ldev->params.qos.prio_qos[i].tc;
+		l2params.qs_handle_list[i] = qset;
+		if (last_qset == IRDMA_NO_QSET)
+			last_qset = qset;
+		else if ((qset != last_qset) && (qset != IRDMA_NO_QSET))
+			iwdev->dcb = true;
+	}
+
+	if (irdma_rt_init_hw(rf, iwdev, &l2params)) {
+		irdma_deinit_ctrl_hw(rf);
+		goto error;
+	}
+
+	irdma_add_handler(hdl);
+	irdma_probe_inc_ref(ldev->netdev);
+	return 0;
+error:
+	kfree(hdl);
+	return err_code;
+}
+
+/**
+ * i40iw_l2params_worker - worker for l2 params change
+ * @work: work pointer for l2 params
+ */
+static void i40iw_l2params_worker(struct work_struct *work)
+{
+	struct l2params_work *dwork =
+	    container_of(work, struct l2params_work, work);
+	struct irdma_device *iwdev = dwork->iwdev;
+
+	irdma_change_l2params(&iwdev->vsi, &dwork->l2params);
+	atomic_dec(&iwdev->params_busy);
+	kfree(work);
+}
+
+/**
+ * i40iw_l2param_change - handle qs handles for qos and mss change
+ * @ldev: lan device information
+ * @client: client for parameter change
+ * @params: new parameters from L2
+ */
+static void i40iw_l2param_change(struct i40e_info *ldev, struct i40e_client *client,
+				 struct i40e_params *params)
+{
+	struct irdma_handler *hdl;
+	struct irdma_l2params *l2params;
+	struct l2params_work *work;
+	struct irdma_device *iwdev;
+	int i;
+
+	hdl = irdma_find_handler(ldev->pcidev);
+	if (!hdl)
+		return;
+
+	iwdev = (struct irdma_device *)((u8 *)hdl + sizeof(*hdl));
+
+	if (atomic_read(&iwdev->params_busy))
+		return;
+	work = kzalloc(sizeof(*work), GFP_KERNEL);
+	if (!work)
+		return;
+
+	atomic_inc(&iwdev->params_busy);
+	work->iwdev = iwdev;
+	l2params = &work->l2params;
+	for (i = 0; i < I40E_CLIENT_MAX_USER_PRIORITY; i++)
+		l2params->qs_handle_list[i] = params->qos.prio_qos[i].qs_handle;
+
+	l2params->mtu = (params->mtu) ? params->mtu : iwdev->vsi.mtu;
+
+	INIT_WORK(&work->work, i40iw_l2params_worker);
+	queue_work(iwdev->param_wq, &work->work);
+}
+
+/**
+ * i40iw_close - client interface operation close for iwarp/uda device
+ * @ldev: lan device information
+ * @client: client to close
+ *
+ * Called by the lan driver during the processing of client unregister
+ * Destroy and clean up the driver resources
+ */
+static void i40iw_close(struct i40e_info *ldev, struct i40e_client *client, bool reset)
+{
+	struct irdma_handler *hdl;
+	struct irdma_pci_f *rf;
+	struct irdma_device *iwdev;
+
+	hdl = irdma_find_handler(ldev->pcidev);
+	if (!hdl)
+		return;
+	rf = &hdl->rf;
+	iwdev = (struct irdma_device *)((u8 *)hdl + sizeof(*hdl));
+	iwdev->closing = true;
+
+	if (iwdev->param_wq)
+		destroy_workqueue(iwdev->param_wq);
+
+	if (reset)
+		iwdev->reset = true;
+
+	irdma_deinit_rt_device(iwdev);
+	irdma_deinit_ctrl_hw(rf);
+	irdma_del_handler(irdma_find_handler(ldev->pcidev));
+	kfree(hdl);
+	irdma_probe_dec_ref(ldev->netdev);
+	irdma_pr_info("IRDMA hardware deinitialization complete\n");
+}
+
+/* client interface functions */
+static const struct i40e_client_ops i40e_ops = {
+	.open = i40iw_open,
+	.close = i40iw_close,
+	.l2_param_change = i40iw_l2param_change,
+	.virtchnl_receive = NULL,
+	.vf_reset = NULL,
+	.vf_enable = NULL,
+	.vf_capable = NULL
+};
+
+static struct i40e_client i40iw_client = {
+	.version.major = CLIENT_IW_INTERFACE_VERSION_MAJOR,
+	.version.minor = CLIENT_IW_INTERFACE_VERSION_MINOR,
+	.version.build = CLIENT_IW_INTERFACE_VERSION_BUILD,
+	.ops = &i40e_ops,
+	.name = "i40iw",
+	.type = I40E_CLIENT_IWARP,
+};
+
+int i40iw_reg_peer_driver(struct irdma_peer *peer, struct net_device *netdev)
+{
+	struct idc_srv_provider *sp;
+
+	sp = (struct idc_srv_provider *)netdev_priv(netdev);
+	if (sp->signature != IDC_SIGNATURE)
+		return -EINVAL;
+
+	peer->reg_peer_driver = (int (*)(void *))sp->reg_peer_driver;
+	peer->unreg_peer_driver = (int (*)(void *))sp->unreg_peer_driver;
+
+	return peer->reg_peer_driver(&i40iw_client);
+}
+
+void i40iw_unreg_peer_driver(struct irdma_peer *peer)
+{
+	peer->unreg_peer_driver(&i40iw_client);
+}
diff --git a/drivers/infiniband/hw/irdma/irdma_if.c b/drivers/infiniband/hw/irdma/irdma_if.c
new file mode 100644
index 0000000..a3324cf
--- /dev/null
+++ b/drivers/infiniband/hw/irdma/irdma_if.c
@@ -0,0 +1,430 @@
+// SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB
+/* Copyright (c) 2019, Intel Corporation. */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <ice_idc.h>
+#include "main.h"
+#include "ws.h"
+#include "icrdma_hw.h"
+
+void irdma_add_dev_ref(struct irdma_sc_dev *dev)
+{
+	try_module_get(THIS_MODULE);
+}
+
+void irdma_put_dev_ref(struct irdma_sc_dev *dev)
+{
+	module_put(THIS_MODULE);
+}
+
+/**
+ * irdma_lan_register_qset - Register qset with LAN driver
+ * @vsi: vsi structure
+ * @tc_node: Traffic class node
+ */
+enum irdma_status_code irdma_lan_register_qset(struct irdma_sc_vsi *vsi,
+					       struct irdma_ws_node *tc_node)
+{
+	struct irdma_device *iwdev = vsi->back_vsi;
+	struct ice_peer_dev *ldev = (struct ice_peer_dev *)iwdev->ldev->if_ldev;
+	struct ice_res rdma_qset_res = {};
+	int ret;
+
+	if (ldev->ops->alloc_res) {
+		rdma_qset_res.cnt_req = 1;
+		rdma_qset_res.res_type = ICE_RDMA_QSETS_TXSCHED;
+		rdma_qset_res.res[0].res.qsets.qs_handle = tc_node->qs_handle;
+		rdma_qset_res.res[0].res.qsets.tc = tc_node->traffic_class;
+		rdma_qset_res.res[0].res.qsets.vsi_id = vsi->vsi_idx;
+		ret = ldev->ops->alloc_res(ldev, &rdma_qset_res, 0);
+		if (ret) {
+			irdma_debug(vsi->dev, IRDMA_DEBUG_WS,
+				    "LAN alloc_res for rdma qset failed.\n");
+			return IRDMA_ERR_NO_MEMORY;
+		}
+
+		tc_node->l2_sched_node_id = rdma_qset_res.res[0].res.qsets.teid;
+		vsi->qos[tc_node->user_pri].l2_sched_node_id =
+					rdma_qset_res.res[0].res.qsets.teid;
+	}
+
+	return 0;
+}
+
+/**
+ * irdma_lan_unregister_qset - Unregister qset with LAN driver
+ * @vsi: vsi structure
+ * @tc_node: Traffic class node
+ */
+void irdma_lan_unregister_qset(struct irdma_sc_vsi *vsi,
+			       struct irdma_ws_node *tc_node)
+{
+	struct irdma_device *iwdev = vsi->back_vsi;
+	struct ice_peer_dev *ldev = (struct ice_peer_dev *)iwdev->ldev->if_ldev;
+	struct ice_res rdma_qset_res = {};
+
+	if (ldev->ops->free_res) {
+		rdma_qset_res.res_allocated = 1;
+		rdma_qset_res.res_type = ICE_RDMA_QSETS_TXSCHED;
+		rdma_qset_res.res[0].res.qsets.vsi_id = vsi->vsi_idx;
+		rdma_qset_res.res[0].res.qsets.teid = tc_node->l2_sched_node_id;
+		rdma_qset_res.res[0].res.qsets.qs_handle = tc_node->qs_handle;
+
+		if (ldev->ops->free_res(ldev, &rdma_qset_res))
+			irdma_debug(vsi->dev, IRDMA_DEBUG_WS, "LAN free_res for rdma qset failed.\n");
+	}
+}
+
+/**
+ * irdma_log_invalid_mtu: log warning on invalid mtu
+ * @mtu: maximum tranmission unit
+ */
+static void irdma_log_invalid_mtu(u16 mtu)
+{
+	if (mtu < IRDMA_MIN_MTU_IPV4)
+		irdma_pr_warn("Current MTU setting of %d is too low for RDMA traffic. Minimum MTU is 576 for IPv4 and 1280 for IPv6\n",
+			      mtu);
+	else if (mtu < IRDMA_MIN_MTU_IPV6)
+		irdma_pr_warn("Current MTU setting of %d is too low for IPv6 RDMA traffic, the minimum is 1280\n",
+			      mtu);
+}
+
+/**
+ * irdma_event_handler - Called by LAN driver to notify events
+ * @peer_dev: Peer device structure
+ * @event: event from LAN driver
+ */
+static void irdma_event_handler(struct ice_peer_dev *ldev,
+				struct ice_event *event)
+{
+	struct irdma_l2params l2params = {};
+	struct irdma_device *iwdev;
+	u8 first_tc;
+	int i;
+
+	iwdev = irdma_find_netdev(ldev->netdev);
+	if (!iwdev)
+		return;
+
+	if (test_bit(ICE_EVENT_LINK_CHANGE, event->type)) {
+		irdma_debug(&iwdev->rf->sc_dev, IRDMA_DEBUG_CLNT, "LINK_CHANGE event\n");
+	} else if (test_bit(ICE_EVENT_MTU_CHANGE, event->type)) {
+		irdma_debug(&iwdev->rf->sc_dev, IRDMA_DEBUG_CLNT,
+			    "new MTU = %d\n", event->info.mtu);
+		if (iwdev->vsi.mtu != event->info.mtu) {
+			l2params.mtu = event->info.mtu;
+			l2params.mtu_changed = true;
+			irdma_log_invalid_mtu(l2params.mtu);
+			irdma_change_l2params(&iwdev->vsi, &l2params);
+		}
+	} else if (test_bit(ICE_EVENT_TC_CHANGE, event->type)) {
+		if (iwdev->vsi.tc_change_pending)
+			return;
+		iwdev->vsi.tc_change_pending = true;
+		irdma_suspend_qps(&iwdev->vsi);
+
+		/* Wait for all qp's to suspend */
+		wait_event_timeout(iwdev->suspend_wq,
+				   !atomic_read(&iwdev->vsi.qp_suspend_reqs),
+				   IRDMA_EVENT_TIMEOUT);
+		irdma_ws_reset(&iwdev->vsi);
+	} else if (test_bit(ICE_EVENT_TC_CHANGE, event->type)) {
+		if (!iwdev->vsi.tc_change_pending)
+			return;
+		l2params.tc_changed = true;
+		irdma_debug(&iwdev->rf->sc_dev, IRDMA_DEBUG_CLNT, "TC Change\n");
+		first_tc = event->info.port_qos.up2tc[0];
+		iwdev->dcb = false;
+		for (i = 0; i < ICE_IDC_MAX_USER_PRIORITY; ++i) {
+			l2params.up2tc[i] = event->info.port_qos.up2tc[i];
+			if (first_tc != l2params.up2tc[i])
+				iwdev->dcb = true;
+		}
+		irdma_change_l2params(&iwdev->vsi, &l2params);
+	} else if (test_bit(ICE_EVENT_API_CHANGE, event->type)) {
+		irdma_debug(&iwdev->rf->sc_dev, IRDMA_DEBUG_CLNT, "API_CHANGE\n");
+	}
+}
+
+static void irdma_enable_lb(struct irdma_device *iwdev)
+{
+	u32 val;
+
+	val = rd32(iwdev->rf->sc_dev.hw, VSI_RXSWCTRL(iwdev->vsi.vsi_idx));
+	val &= ~VSI_RXSWCTRL_SRCPRUNEENABLE_M;
+	wr32(iwdev->rf->sc_dev.hw, (0x00205000 + (iwdev->vsi.vsi_idx * 4)), val);
+}
+
+/**
+ * irdma_open - client interface operation open for RDMA device
+ * @ldev: lan device information
+ *
+ * Called by the lan driver during the processing of client
+ * register.
+ */
+static void irdma_open(struct ice_peer_dev *ldev)
+{
+	struct irdma_handler *hdl;
+	struct irdma_device *iwdev;
+	struct irdma_sc_dev *dev;
+	enum irdma_status_code status;
+	struct ice_event events = {};
+	struct irdma_pci_f *rf;
+	struct irdma_priv_ldev *pldev;
+	struct irdma_l2params l2params = {};
+	int i;
+
+	hdl = irdma_find_handler(ldev->pdev);
+	if (!hdl)
+		return;
+	rf = &hdl->rf;
+	if (rf->init_state != CEQ0_CREATED)
+		return;
+	iwdev = irdma_find_netdev(ldev->netdev);
+	if (iwdev)
+		return;
+	iwdev = kzalloc(sizeof(*iwdev), GFP_KERNEL);
+	if (!iwdev)
+		return;
+
+	iwdev->hdl = hdl;
+	iwdev->rf = rf;
+	iwdev->ldev = &rf->ldev;
+	pldev = &rf->ldev;
+	pldev->pf_vsi_num = ldev->pf_vsi_num;
+
+	/* Set configfs default values */
+	iwdev->push_mode = 0;
+	iwdev->rcv_wnd = IRDMA_CM_DEFAULT_RCV_WND_SCALED;
+	iwdev->rcv_wscale = IRDMA_CM_DEFAULT_RCV_WND_SCALE;
+
+	dev = &hdl->rf.sc_dev;
+	iwdev->netdev = ldev->netdev;
+	iwdev->create_ilq = true;
+	if (rf->roce_ena & (1 << ldev->index)) { /* A bit per port */
+		iwdev->roce_mode = true;
+		iwdev->create_ilq = false;
+	}
+	l2params.mtu = ldev->initial_mtu;
+
+	l2params.num_tc = ldev->initial_qos_info.num_tc;
+	l2params.num_apps = ldev->initial_qos_info.num_apps;
+	l2params.vsi_prio_type = ldev->initial_qos_info.vsi_priority_type;
+	l2params.vsi_rel_bw = ldev->initial_qos_info.vsi_relative_bw;
+	for (i = 0; i < l2params.num_tc; i++) {
+		l2params.tc_info[i].egress_virt_up =
+			ldev->initial_qos_info.tc_info[i].egress_virt_up;
+		l2params.tc_info[i].ingress_virt_up =
+			ldev->initial_qos_info.tc_info[i].ingress_virt_up;
+		l2params.tc_info[i].prio_type =
+			ldev->initial_qos_info.tc_info[i].prio_type;
+		l2params.tc_info[i].rel_bw =
+			ldev->initial_qos_info.tc_info[i].rel_bw;
+		l2params.tc_info[i].tc_ctx =
+			ldev->initial_qos_info.tc_info[i].tc_ctx;
+	}
+	for (i = 0; i < ICE_IDC_MAX_USER_PRIORITY; i++)
+		l2params.up2tc[i] = ldev->initial_qos_info.up2tc[i];
+
+	iwdev->vsi_num = ldev->pf_vsi_num;
+	ldev->ops->update_vsi_filter(ldev, ICE_RDMA_FILTER_BOTH, true);
+
+	status = irdma_rt_init_hw(rf, iwdev, &l2params);
+	if (status)
+		goto error;
+
+	events.reporter = ldev;
+	set_bit(ICE_EVENT_LINK_CHANGE, events.type);
+	set_bit(ICE_EVENT_MTU_CHANGE, events.type);
+	set_bit(ICE_EVENT_TC_CHANGE, events.type);
+	set_bit(ICE_EVENT_API_CHANGE, events.type);
+
+	if (ldev->ops->reg_for_notification)
+		ldev->ops->reg_for_notification(ldev, &events);
+	irdma_enable_lb(iwdev);
+	irdma_dev_info(dev, "IRDMA VSI Open Successful");
+	init_waitqueue_head(&iwdev->close_wq);
+	init_waitqueue_head(&iwdev->suspend_wq);
+
+	return;
+error:
+	list_del(&iwdev->list);
+	kfree(iwdev);
+}
+
+/**
+ * irdma_close - client interface operation close for iwarp/uda device
+ * @ldev: lan device information
+ * @client: client to close
+ *
+ * Called by the lan driver during the processing of client unregister
+ * Destroy and clean up the driver resources
+ */
+static void irdma_close(struct ice_peer_dev *ldev, enum ice_close_reason reason)
+{
+	struct irdma_device *iwdev;
+
+	iwdev = irdma_find_netdev(ldev->netdev);
+	if (!iwdev)
+		return;
+
+	iwdev->closing = true;
+	if (reason == ICE_REASON_HW_RESET_PENDING) {
+		iwdev->reset = true;
+		iwdev->rf->reset = true;
+	}
+
+	if (iwdev->init_state >= CEQ0_CREATED)
+		irdma_deinit_rt_device(iwdev);
+
+	kfree(iwdev);
+	ldev->ops->update_vsi_filter(ldev, ICE_RDMA_FILTER_BOTH, false);
+	irdma_pr_info("IRDMA VSI close complete\n");
+}
+
+/**
+ * irdma_remove - client interface remove operation for RDMA
+ * @ldev: lan device information
+ *
+ * Called on module unload.
+ */
+static int irdma_remove(struct ice_peer_dev *ldev)
+{
+	struct irdma_handler *hdl;
+	struct irdma_pci_f *rf;
+
+	hdl = irdma_find_handler(ldev->pdev);
+	if (!hdl)
+		return 0;
+
+	if (!list_empty(&hdl->rf.vsi_dev_list))
+		return -EBUSY;
+
+	rf = &hdl->rf;
+	if (rf->init_state != CEQ0_CREATED)
+		return -EBUSY;
+
+	destroy_workqueue(rf->free_qp_wq);
+	irdma_deinit_ctrl_hw(rf);
+	irdma_del_handler(hdl);
+	kfree(hdl);
+	irdma_probe_dec_ref(ldev->netdev);
+	irdma_pr_info("IRDMA hardware deinitialization complete\n");
+
+	return 0;
+}
+
+static const struct ice_peer_ops irdma_peer_ops = {
+	.event_handler = irdma_event_handler,
+	.close = irdma_close,
+	.open = irdma_open,
+};
+
+/**
+ * irdma_probe - client interface probe operation for RDMA dev
+ * @ldev: lan device information
+ *
+ * Called by the lan driver during the processing of client register
+ * Create device resources, set up queues, pble and hmc objects.
+ * Return 0 if successful, otherwise return error
+ */
+static int irdma_probe(struct ice_peer_dev *ldev)
+{
+	struct irdma_handler *hdl;
+	struct irdma_pci_f *rf;
+	struct irdma_sc_dev *dev;
+	struct irdma_priv_ldev *pldev;
+
+	irdma_pr_info("probe: ldev=%p, ldev->dev.pdev.bus->number=%d, ldev->netdev=%p\n",
+		      ldev, ldev->pdev->bus->number, ldev->netdev);
+	hdl = irdma_find_handler(ldev->pdev);
+	if (hdl)
+		return -EBUSY;
+
+	hdl = kzalloc(sizeof(*hdl), GFP_KERNEL);
+	if (!hdl)
+		return IRDMA_ERR_NO_MEMORY;
+
+	rf = &hdl->rf;
+	pldev = &rf->ldev;
+	hdl->ldev = pldev;
+	rf->hdl = hdl;
+	dev = &rf->sc_dev;
+	dev->back_dev = rf;
+	rf->init_hw = icrdma_init_hw;
+	pldev->if_ldev = ldev;
+	rf->rdma_ver = IRDMA_GEN_2;
+	irdma_init_rf_params(rf);
+	if (rf->roce_ena & (1 << ldev->index))
+		rf->protocol_used = IRDMA_ROCE_PROTOCOL_ONLY;
+	else
+		rf->protocol_used = IRDMA_IWARP_PROTOCOL_ONLY;
+	dev->pci_rev = ldev->pdev->revision;
+	rf->default_vsi.vsi_idx = ldev->pf_vsi_num;
+	/* save information from ldev to priv_ldev*/
+	pldev->fn_num = ldev->fn_num;
+	rf->hw.hw_addr = ldev->hw_addr;
+	rf->pdev = ldev->pdev;
+	rf->netdev = ldev->netdev;
+	pldev->ftype = ldev->ftype;
+	pldev->msix_count = ldev->msix_count;
+	pldev->msix_entries = ldev->msix_entries;
+	irdma_add_handler(hdl);
+	if (irdma_ctrl_init_hw(rf)) {
+		irdma_del_handler(hdl);
+		kfree(hdl);
+		return -EIO;
+	}
+	ldev->peer_ops = &irdma_peer_ops;
+
+	irdma_probe_inc_ref(ldev->netdev);
+
+	return 0;
+}
+
+static struct ice_peer_drv irdma_client = {
+	.name = KBUILD_MODNAME,
+	.driver_id = ICE_PEER_RDMA_DRIVER,
+	.ver.major = ICE_PEER_MAJOR_VER,
+	.ver.minor = ICE_PEER_MINOR_VER,
+	.dev_id.vendor = PCI_VENDOR_ID_INTEL,
+	.dev_id.device = ICE_PEER_RDMA_DEV,
+	.probe = irdma_probe,
+	.remove = irdma_remove,
+	.driver.name = "i40iw",
+	.driver.mod_name = "i40iw",
+	.driver.owner = THIS_MODULE
+};
+
+/**
+ * icrdma_request_reset - Request a reset
+ * @rf: RDMA PCI function
+ *
+ */
+void icrdma_request_reset(struct irdma_pci_f *rf)
+{
+	struct ice_peer_dev *ldev = (struct ice_peer_dev *)rf->ldev.if_ldev;
+
+	if (ldev && ldev->ops && ldev->ops->request_reset)
+		ldev->ops->request_reset(ldev, ICE_PEER_PFR);
+}
+
+int icrdma_reg_peer_driver(struct irdma_peer *peer, struct net_device *netdev)
+{
+	struct idc_srv_provider *sp;
+
+	sp = (struct idc_srv_provider *)netdev_priv(netdev);
+	if (sp->signature != IDC_SIGNATURE)
+		return -EINVAL;
+
+	peer->reg_peer_driver = (int (*)(void *))sp->reg_peer_driver;
+	peer->unreg_peer_driver = (int (*)(void *))sp->unreg_peer_driver;
+
+	return peer->reg_peer_driver(&irdma_client);
+}
+
+void icrdma_unreg_peer_driver(struct irdma_peer *peer)
+{
+	peer->unreg_peer_driver(&irdma_client);
+}
diff --git a/drivers/infiniband/hw/irdma/main.c b/drivers/infiniband/hw/irdma/main.c
new file mode 100644
index 0000000..7e76f63
--- /dev/null
+++ b/drivers/infiniband/hw/irdma/main.c
@@ -0,0 +1,496 @@
+// SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB
+/* Copyright (c) 2019, Intel Corporation. */
+
+#include "main.h"
+
+#define DRV_VER_MAJOR 0
+#define DRV_VER_MINOR 0
+#define DRV_VER_BUILD 67
+#define DRV_VER	__stringify(DRV_VER_MAJOR) "."		\
+	__stringify(DRV_VER_MINOR) "." __stringify(DRV_VER_BUILD)
+
+#ifndef CONFIG_DYNAMIC_DEBUG
+static int debug;
+module_param(debug, int, 0644);
+MODULE_PARM_DESC(debug, "debug flags: 0=disabled (default), 0x7fffffff=all");
+#endif
+
+static int resource_profile;
+module_param(resource_profile, int, 0644);
+MODULE_PARM_DESC(resource_profile, "Resource Profile: 0=PF only, 1=Weighted VF, 2=Even Distribution");
+
+static int max_rdma_vfs = 32;
+module_param(max_rdma_vfs, int, 0644);
+MODULE_PARM_DESC(max_rdma_vfs, "Maximum VF count: 0-32 32=default");
+static int roce_ena;
+module_param(roce_ena, int, 0644);
+MODULE_PARM_DESC(roce_ena, "RoCE enable bitmap: 1=port0,2=port1....0=disabled, not supported on X722");
+
+static int limits_sel;
+module_param(limits_sel, int, 0644);
+MODULE_PARM_DESC(limits_sel, "Resource limits selector, Range: 0-3");
+
+MODULE_AUTHOR("Intel Corporation, <e1000-rdma@lists.sourceforge.net>");
+MODULE_DESCRIPTION("Intel(R) Ethernet Connection RDMA Driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VER);
+/* Add an alias for i40iw once its deprecated from kernel.
+ * If required add push_mode and mpa_version as deprecated
+ * module params for i40iw compat.
+ */
+
+LIST_HEAD(irdma_handlers);
+DEFINE_SPINLOCK(irdma_handler_lock);
+
+static struct notifier_block irdma_inetaddr_notifier = {
+	.notifier_call = irdma_inetaddr_event
+};
+
+static struct notifier_block irdma_inetaddr6_notifier = {
+	.notifier_call = irdma_inet6addr_event
+};
+
+static struct notifier_block irdma_net_notifier = {
+	.notifier_call = irdma_net_event
+};
+
+static struct notifier_block irdma_netdevice_notifier = {
+	.notifier_call = irdma_netdevice_event
+};
+
+void irdma_init_rf_params(struct irdma_pci_f *rf)
+{
+	rf->limits_sel = limits_sel;
+	if (rf->rdma_ver != IRDMA_GEN_1)
+		rf->roce_ena = roce_ena;
+	rf->rsrc_profile = (resource_profile < IRDMA_HMC_PROFILE_EQUAL) ?
+			    (u8)resource_profile + IRDMA_HMC_PROFILE_DEFAULT :
+			    IRDMA_HMC_PROFILE_DEFAULT;
+	rf->max_rdma_vfs = (rf->rsrc_profile != IRDMA_HMC_PROFILE_DEFAULT) ?
+			    max_rdma_vfs : 0;
+	rf->max_ena_vfs = rf->max_rdma_vfs;
+#ifndef CONFIG_DYNAMIC_DEBUG
+	rf->debug = debug;
+#endif
+}
+
+/**
+ * irdma_find_netdev - find a vsi device given a netdev
+ * @netdev: pointer to net_device
+ */
+struct irdma_device *irdma_find_netdev(struct net_device *netdev)
+{
+	struct irdma_device *iwdev;
+	struct irdma_handler *hdl;
+	struct list_head *pos;
+	struct list_head *tmp;
+	unsigned long flags;
+
+	spin_lock_irqsave(&irdma_handler_lock, flags);
+	list_for_each_entry(hdl, &irdma_handlers, list) {
+		list_for_each_safe(pos, tmp, &hdl->rf.vsi_dev_list) {
+			iwdev = container_of(pos, struct irdma_device, list);
+			if (netdev == iwdev->netdev) {
+				spin_unlock_irqrestore(&irdma_handler_lock,
+						       flags);
+				return iwdev;
+			}
+		}
+	}
+	spin_unlock_irqrestore(&irdma_handler_lock, flags);
+
+	return NULL;
+}
+
+/**
+ * irdma_find_ice_handler - find a handler given a client info
+ * @ldev: pointer to a client info
+ */
+struct irdma_handler *irdma_find_handler(struct pci_dev *pdev)
+{
+	struct irdma_handler *hdl;
+	unsigned long flags;
+
+	spin_lock_irqsave(&irdma_handler_lock, flags);
+	list_for_each_entry(hdl, &irdma_handlers, list) {
+		if (hdl->rf.pdev->devfn == pdev->devfn &&
+		    hdl->rf.pdev->bus->number == pdev->bus->number) {
+			spin_unlock_irqrestore(&irdma_handler_lock, flags);
+			return hdl;
+		}
+	}
+	spin_unlock_irqrestore(&irdma_handler_lock, flags);
+
+	return NULL;
+}
+
+/**
+ * irdma_find_iwdev - find a vsi device given a name
+ * @name: name of iwdev
+ */
+struct irdma_device *irdma_find_iwdev(const char *name)
+{
+	struct irdma_handler *hdl;
+	struct list_head *pos;
+	struct list_head *tmp;
+	struct irdma_device *iwdev;
+	unsigned long flags;
+
+	spin_lock_irqsave(&irdma_handler_lock, flags);
+	list_for_each_entry(hdl, &irdma_handlers, list) {
+		list_for_each_safe(pos, tmp, &hdl->rf.vsi_dev_list) {
+			iwdev = container_of(pos, struct irdma_device, list);
+			if (!strcmp(name, iwdev->iwibdev->ibdev.name)) {
+				spin_unlock_irqrestore(&irdma_handler_lock,
+						       flags);
+				return iwdev;
+			}
+		}
+	}
+	spin_unlock_irqrestore(&irdma_handler_lock, flags);
+
+	return NULL;
+}
+
+/**
+ * irdma_add_handler - add a handler to the list
+ * @hdl: handler to be added to the handler list
+ */
+void irdma_add_handler(struct irdma_handler *hdl)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&irdma_handler_lock, flags);
+	list_add(&hdl->list, &irdma_handlers);
+	spin_unlock_irqrestore(&irdma_handler_lock, flags);
+}
+
+/**
+ * irdma_del_handler - delete a handler from the list
+ * @hdl: handler to be deleted from the handler list
+ */
+void irdma_del_handler(struct irdma_handler *hdl)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&irdma_handler_lock, flags);
+	list_del(&hdl->list);
+	spin_unlock_irqrestore(&irdma_handler_lock, flags);
+}
+
+/**
+ * irdma_register_notifiers - register tcp ip notifiers
+ */
+void irdma_register_notifiers(void)
+{
+	register_inetaddr_notifier(&irdma_inetaddr_notifier);
+	register_inet6addr_notifier(&irdma_inetaddr6_notifier);
+	register_netevent_notifier(&irdma_net_notifier);
+	register_netdevice_notifier(&irdma_netdevice_notifier);
+}
+
+void irdma_unregister_notifiers(void)
+{
+	unregister_netevent_notifier(&irdma_net_notifier);
+	unregister_inetaddr_notifier(&irdma_inetaddr_notifier);
+	unregister_inet6addr_notifier(&irdma_inetaddr6_notifier);
+	unregister_netdevice_notifier(&irdma_netdevice_notifier);
+}
+
+/**
+ * irdma_add_ipv6_addr - add ipv6 address to the hw arp table
+ * @iwdev: iwarp device
+ */
+static void irdma_add_ipv6_addr(struct irdma_device *iwdev)
+{
+	struct net_device *ip_dev;
+	struct inet6_dev *idev;
+	struct inet6_ifaddr *ifp, *tmp;
+	u32 local_ipaddr6[4];
+
+	rcu_read_lock();
+	for_each_netdev_rcu(&init_net, ip_dev) {
+		if (((rdma_vlan_dev_vlan_id(ip_dev) < 0xFFFF &&
+		      rdma_vlan_dev_real_dev(ip_dev) == iwdev->netdev) ||
+		      ip_dev == iwdev->netdev) && ip_dev->flags & IFF_UP) {
+			idev = __in6_dev_get(ip_dev);
+			if (!idev) {
+				irdma_dev_err(&iwdev->rf->sc_dev,
+					      "ipv6 inet device not found\n");
+				break;
+			}
+			list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) {
+				irdma_dev_info(&iwdev->rf->sc_dev,
+					       "IP=%pI6, vlan_id=%d, MAC=%pM\n",
+					       &ifp->addr,
+					       rdma_vlan_dev_vlan_id(ip_dev),
+					       ip_dev->dev_addr);
+
+				irdma_copy_ip_ntohl(local_ipaddr6,
+						    ifp->addr.in6_u.u6_addr32);
+				irdma_manage_arp_cache(iwdev->rf,
+						       ip_dev->dev_addr,
+						       local_ipaddr6,
+						       false,
+						       IRDMA_ARP_ADD);
+			}
+		}
+	}
+	rcu_read_unlock();
+}
+
+/**
+ * irdma_add_ipv4_addr - add ipv4 address to the hw arp table
+ * @iwdev: iwarp device
+ */
+static void irdma_add_ipv4_addr(struct irdma_device *iwdev)
+{
+	struct net_device *dev;
+	struct in_device *idev;
+	bool got_lock = true;
+	u32 ip_addr;
+
+	if (!rtnl_trylock())
+		got_lock = false;
+
+	for_each_netdev(&init_net, dev) {
+		if (((rdma_vlan_dev_vlan_id(dev) < 0xFFFF &&
+		      rdma_vlan_dev_real_dev(dev) == iwdev->netdev) ||
+		      dev == iwdev->netdev) && dev->flags & IFF_UP) {
+			idev = in_dev_get(dev);
+			for_ifa(idev) {
+				irdma_debug(&iwdev->rf->sc_dev, IRDMA_DEBUG_CM,
+					    "IP=%pI4, vlan_id=%d, MAC=%pM\n",
+					    &ifa->ifa_address,
+					    rdma_vlan_dev_vlan_id(dev),
+					    dev->dev_addr);
+
+				ip_addr = ntohl(ifa->ifa_address);
+				irdma_manage_arp_cache(iwdev->rf,
+						       dev->dev_addr,
+						       &ip_addr,
+						       true,
+						       IRDMA_ARP_ADD);
+			}
+			endfor_ifa(idev);
+			in_dev_put(idev);
+		}
+	}
+	if (got_lock)
+		rtnl_unlock();
+}
+
+/**
+ * irdma_add_ip - add ip addresses
+ * @iwdev: iwarp device
+ *
+ * Add ipv4/ipv6 addresses to the arp cache
+ */
+void irdma_add_ip(struct irdma_device *iwdev)
+{
+	irdma_add_ipv4_addr(iwdev);
+	irdma_add_ipv6_addr(iwdev);
+}
+
+/**
+ * irdma_request_reset - Request a reset
+ * @rf: RDMA PCI function
+ *
+ */
+void irdma_request_reset(struct irdma_pci_f *rf)
+{
+	irdma_dev_warn(&rf->sc_dev, "Requesting a a reset from LAN driver\n");
+	if (rf->rdma_ver == IRDMA_GEN_1)
+		i40iw_request_reset(rf);
+	else
+		icrdma_request_reset(rf);
+}
+
+static struct irdma_peer_drvs_list *irdma_peer_drvs;
+
+/**
+ * irdma_probe_inc_ref - Increment ref count for a probe
+ * @netdev: netdev pointer
+ */
+void irdma_probe_inc_ref(struct net_device *netdev)
+{
+	struct irdma_peer *peer;
+	u32 i;
+
+	for (i = 0; i < IRDMA_MAX_PEERS; i++) {
+		peer = &irdma_peer_drvs->peer[i];
+		if (!strncmp(netdev->dev.parent->driver->name, peer->name,
+			     sizeof(peer->name)))
+			break;
+	}
+
+	if (i != IRDMA_MAX_PEERS)
+		atomic_inc(&peer->ref_count);
+}
+
+/**
+ * irdma_probe_dec_ref - Decrement ref count for a probe
+ * @netdev: netdev pointer
+ */
+void irdma_probe_dec_ref(struct net_device *netdev)
+{
+	struct irdma_peer *peer;
+	u32 i;
+
+	for (i = 0; i < IRDMA_MAX_PEERS; i++) {
+		peer = &irdma_peer_drvs->peer[i];
+		if (!strcmp(netdev->dev.parent->driver->name, peer->name)) {
+			if (peer->state == IRDMA_STATE_VALID &&
+			    atomic_dec_and_test(&peer->ref_count)) {
+				peer->state = IRDMA_STATE_INVALID;
+				switch (i) {
+				case I40E_PEER_TYPE:
+					if (IS_ENABLED(CONFIG_INFINIBAND_I40IW))
+						return;
+
+					i40iw_unreg_peer_driver(peer);
+					break;
+				case ICE_PEER_TYPE:
+					icrdma_unreg_peer_driver(peer);
+					break;
+				default:
+					return;
+				}
+				module_put(peer->module);
+			}
+			break;
+		}
+	}
+}
+
+/**
+ * irdma_handle_netdev - Find peer driver and register with it
+ * @netdev: netdev of peer driver
+ */
+void irdma_handle_netdev(struct net_device *netdev)
+{
+	struct irdma_peer *peer;
+	int ret;
+	u32 i;
+
+	for (i = 0; i < IRDMA_MAX_PEERS; i++) {
+		peer = &irdma_peer_drvs->peer[i];
+		if (netdev->dev.parent && netdev->dev.parent->driver &&
+		    !strncmp(netdev->dev.parent->driver->name, peer->name,
+			     sizeof(peer->name)))
+			break;
+	}
+
+	if (i == IRDMA_MAX_PEERS || peer->state == IRDMA_STATE_VALID)
+		return;
+
+	/* Found the driver */
+	peer = &irdma_peer_drvs->peer[i];
+	peer->module = netdev->dev.parent->driver->owner;
+
+	switch (i) {
+	case I40E_PEER_TYPE:
+		if (IS_ENABLED(CONFIG_INFINIBAND_I40IW))
+			return;
+
+		ret = i40iw_reg_peer_driver(peer, netdev);
+		break;
+	case ICE_PEER_TYPE:
+		ret = icrdma_reg_peer_driver(peer, netdev);
+		break;
+	default:
+		return;
+	}
+
+	/* call the register routine */
+	if (!ret) {
+		peer->state = IRDMA_STATE_VALID;
+		try_module_get(peer->module);
+	} else {
+		peer->state = IRDMA_STATE_REG_FAILED;
+	}
+}
+
+/**
+ * irdma_find_peers - Search netdevs for a peer drivers
+ */
+static void irdma_find_peers(void)
+{
+	struct net_device *dev;
+
+	rtnl_lock();
+	for_each_netdev(&init_net, dev)
+		irdma_handle_netdev(dev);
+	rtnl_unlock();
+}
+
+/**
+ * irdma_unreg_peers - Unregister with all peers
+ */
+static void irdma_unreg_peers(void)
+{
+	struct irdma_peer *peer;
+	u32 i;
+
+	for (i = 0; i < IRDMA_MAX_PEERS; i++) {
+		peer = &irdma_peer_drvs->peer[i];
+		if (peer->state == IRDMA_STATE_VALID) {
+			peer->state = IRDMA_STATE_INVALID;
+			switch (i) {
+			case I40E_PEER_TYPE:
+				if (IS_ENABLED(CONFIG_INFINIBAND_I40IW))
+					return;
+
+				i40iw_unreg_peer_driver(peer);
+				break;
+			case ICE_PEER_TYPE:
+				icrdma_unreg_peer_driver(peer);
+				break;
+			default:
+				return;
+			}
+			module_put(peer->module);
+		}
+	}
+}
+
+/**
+ * irdma_init_module - driver initialization function
+ *
+ * First function to call when the driver is loaded
+ * Register the driver as ice client and port mapper client
+ */
+static int __init irdma_init_module(void)
+{
+	int ret = 0;
+	struct irdma_peer *peer;
+
+	irdma_peer_drvs = kzalloc(sizeof(*irdma_peer_drvs), GFP_KERNEL);
+	if (!irdma_peer_drvs)
+		return -ENOMEM;
+	peer = &irdma_peer_drvs->peer[I40E_PEER_TYPE];
+	strncpy(peer->name, "i40e", sizeof(peer->name));
+	peer = &irdma_peer_drvs->peer[ICE_PEER_TYPE];
+	strncpy(peer->name, "ice", sizeof(peer->name));
+	irdma_find_peers();
+
+	irdma_register_notifiers();
+
+	return ret;
+}
+
+/**
+ * irdma_exit_module - driver exit clean up function
+ *
+ * The function is called just before the driver is unloaded
+ * Unregister the driver as ice client and port mapper client
+ */
+static void __exit irdma_exit_module(void)
+{
+	irdma_unregister_notifiers();
+	irdma_unreg_peers();
+	kfree(irdma_peer_drvs);
+}
+
+module_init(irdma_init_module);
+module_exit(irdma_exit_module);
diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h
new file mode 100644
index 0000000..198c27f
--- /dev/null
+++ b/drivers/infiniband/hw/irdma/main.h
@@ -0,0 +1,709 @@
+/* SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB */
+/* Copyright (c) 2019, Intel Corporation. */
+
+#ifndef IRDMA_MAIN_H
+#define IRDMA_MAIN_H
+
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/if_vlan.h>
+#include <net/addrconf.h>
+#include <net/netevent.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/workqueue.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+#include <linux/crc32c.h>
+#include <linux/kthread.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_pack.h>
+#include <rdma/rdma_cm.h>
+#include <rdma/iw_cm.h>
+#include <crypto/hash.h>
+#include "status.h"
+#include "osdep.h"
+#include "defs.h"
+#include "hmc.h"
+#include "type.h"
+#include "protos.h"
+#include "pble.h"
+#include "verbs.h"
+#include "cm.h"
+#include "user.h"
+#include "puda.h"
+#include <rdma/irdma-abi.h>
+
+extern struct list_head irdma_handlers;
+extern spinlock_t irdma_handler_lock;
+
+#define IRDMA_FW_VER	2
+#define IRDMA_HW_VER	2
+
+#define IRDMA_ARP_ADD		1
+#define IRDMA_ARP_DELETE	2
+#define IRDMA_ARP_RESOLVE	3
+
+#define IRDMA_MACIP_ADD		1
+#define IRDMA_MACIP_DELETE	2
+
+#define IW_CCQ_SIZE	(IRDMA_CQP_SW_SQSIZE_2048 + 1)
+#define IW_CEQ_SIZE	2048
+#define IW_AEQ_SIZE	2048
+
+#define RX_BUF_SIZE	(1536 + 8)
+#define IW_REG0_SIZE	(4 * 1024)
+#define IW_TX_TIMEOUT	(6 * HZ)
+#define IW_FIRST_QPN	1
+
+#define IW_SW_CONTEXT_ALIGN	1024
+
+#define MAX_DPC_ITERATIONS	128
+
+#define IRDMA_EVENT_TIMEOUT		100000
+#define IRDMA_VCHNL_EVENT_TIMEOUT	100000
+
+#define	IRDMA_NO_VLAN	0xffff
+#define	IRDMA_NO_QSET	0xffff
+
+#define IW_CFG_FPM_QP_COUNT		32768
+#define IRDMA_MAX_PAGES_PER_FMR		512
+#define IRDMA_MIN_PAGES_PER_FMR		1
+#define IRDMA_CQP_COMPL_RQ_WQE_FLUSHED	2
+#define IRDMA_CQP_COMPL_SQ_WQE_FLUSHED	3
+
+#define IRDMA_Q_TYPE_PE_AEQ	0x80
+#define IRDMA_Q_INVALID_IDX	0xffff
+
+#define IRDMA_DRV_OPT_ENA_MPA_VER_0		0x00000001
+#define IRDMA_DRV_OPT_DISABLE_MPA_CRC		0x00000002
+#define IRDMA_DRV_OPT_DISABLE_FIRST_WRITE	0x00000004
+#define IRDMA_DRV_OPT_DISABLE_INTF		0x00000008
+#define IRDMA_DRV_OPT_ENA_MSI			0x00000010
+#define IRDMA_DRV_OPT_DUAL_LOGICAL_PORT		0x00000020
+#define IRDMA_DRV_OPT_NO_INLINE_DATA		0x00000080
+#define IRDMA_DRV_OPT_DISABLE_INT_MOD		0x00000100
+#define IRDMA_DRV_OPT_DISABLE_VIRT_WQ		0x00000200
+#define IRDMA_DRV_OPT_ENA_PAU			0x00000400
+#define IRDMA_DRV_OPT_MCAST_LOGPORT_MAP		0x00000800
+
+#define IW_HMC_OBJ_TYPE_NUM	ARRAY_SIZE(iw_hmc_obj_types)
+#define VSI_RXSWCTRL(_VSI)			(0x00205000 + ((_VSI) * 4))
+#define VSI_RXSWCTRL_MACVSIPRUNEENABLE_M	BIT(8)
+#define VSI_RXSWCTRL_SRCPRUNEENABLE_M		BIT(13)
+
+enum init_completion_state {
+	INVALID_STATE = 0,
+	INITIAL_STATE,
+	CQP_CREATED,
+	HMC_OBJS_CREATED,
+	CCQ_CREATED,
+	AEQ_CREATED,
+	CEQ0_CREATED,	/* Last state of probe */
+	CEQS_CREATED,
+	ILQ_CREATED,
+	IEQ_CREATED,
+	PBLE_CHUNK_MEM,
+	IP_ADDR_REGISTERED,
+	RDMA_DEV_REGISTERED, /* Last state of open */
+};
+
+enum IRDMA_IDC_STATE {
+	IRDMA_STATE_INVALID,
+	IRDMA_STATE_VALID,
+	IRDMA_STATE_REG_FAILED
+};
+
+enum irdma_peer_type {
+	I40E_PEER_TYPE,
+	ICE_PEER_TYPE,
+	IRDMA_MAX_PEERS,
+};
+
+struct irdma_rsrc_limits {
+	u32 qplimit;
+	u32 mrlimit;
+	u32 cqlimit;
+};
+
+struct irdma_cqp_compl_info {
+	u32 op_ret_val;
+	u16 maj_err_code;
+	u16 min_err_code;
+	bool error;
+	u8 op_code;
+};
+
+struct irdma_cqp_request {
+	struct cqp_cmds_info info;
+	wait_queue_head_t waitq;
+	struct list_head list;
+	atomic_t refcount;
+	void (*callback_fcn)(struct irdma_cqp_request *cqp_request, u32 num);
+	void *param;
+	struct irdma_cqp_compl_info compl_info;
+	bool waiting;
+	bool request_done;
+	bool dynamic;
+};
+
+struct irdma_cqp {
+	struct irdma_sc_cqp sc_cqp;
+	spinlock_t req_lock; /* protect CQP request list */
+	spinlock_t compl_lock; /* protect CQP completion processing */
+	wait_queue_head_t waitq;
+	wait_queue_head_t remove_wq;
+	struct irdma_dma_mem sq;
+	struct irdma_dma_mem host_ctx;
+	u64 *scratch_array;
+	struct irdma_cqp_request *cqp_requests;
+	struct list_head cqp_avail_reqs;
+	struct list_head cqp_pending_reqs;
+	struct task_struct *cqp_compl_thread;
+	struct semaphore cqp_compl_sem;
+};
+
+struct irdma_ccq {
+	struct irdma_sc_cq sc_cq;
+	struct irdma_dma_mem mem_cq;
+	struct irdma_dma_mem shadow_area;
+};
+
+struct irdma_ceq {
+	struct irdma_sc_ceq sc_ceq;
+	struct irdma_dma_mem mem;
+	u32 irq;
+	u32 msix_idx;
+	struct irdma_pci_f *rf;
+	struct tasklet_struct dpc_tasklet;
+};
+
+struct irdma_aeq {
+	struct irdma_sc_aeq sc_aeq;
+	struct irdma_dma_mem mem;
+};
+
+struct irdma_arp_entry {
+	u32 ip_addr[4];
+	u8 mac_addr[ETH_ALEN];
+};
+
+struct irdma_msix_vector {
+	u32 idx;
+	u32 irq;
+	u32 cpu_affinity;
+	u32 ceq_id;
+	cpumask_t mask;
+};
+
+struct l2params_work {
+	struct work_struct work;
+	struct irdma_device *iwdev;
+	struct irdma_l2params l2params;
+};
+
+struct virtchnl_work {
+	struct work_struct work;
+	union {
+		struct irdma_cqp_request *cqp_request;
+		struct irdma_virtchnl_work_info work_info;
+	};
+};
+
+struct irdma_mc_table_info {
+	bool ipv4_valid;
+	u32 mgn;
+	u32 dest_ip[4];
+	bool lan_fwd;
+};
+
+struct mc_table_list {
+	struct list_head list;
+	struct irdma_mc_table_info mc_info;
+	struct irdma_mcast_grp_info mc_grp_ctx;
+};
+
+struct irdma_qv_info {
+	u32 v_idx; /* msix_vector */
+	u16 ceq_idx;
+	u16 aeq_idx;
+	u8 itr_idx;
+};
+
+struct irdma_qvlist_info {
+	u32 num_vectors;
+	struct irdma_qv_info qv_info[1];
+};
+
+struct irdma_priv_ldev {
+	unsigned int fn_num;
+	bool ftype;
+	u16 pf_vsi_num;
+	u16 msix_count;
+	struct msix_entry *msix_entries;
+	void *if_client;
+	void *if_ldev;
+};
+
+struct irdma_pci_f {
+	bool ooo;
+	bool reset;
+	bool rsrc_created;
+	bool stop_cqp_thread;
+	bool msix_shared;
+	u8 rsrc_profile;
+	u8 max_rdma_vfs;
+	u8 max_ena_vfs;
+	u8 *hmc_info_mem;
+	u8 *mem_rsrc;
+	u8 rdma_ver;
+	enum irdma_protocol_used protocol_used;
+	u32 sd_type;
+	u32 msix_count;
+	u32 max_mr;
+	u32 max_qp;
+	u32 max_cq;
+	u32 max_ah;
+	u32 next_ah;
+	u32 max_mcg;
+	u32 next_mcg;
+	u32 roce_ena;
+	u32 max_pd;
+	u32 next_qp;
+	u32 next_cq;
+	u32 next_pd;
+	u32 max_mr_size;
+	u32 max_cqe;
+	u32 mr_stagmask;
+	u32 used_pds;
+	u32 used_cqs;
+	u32 used_mrs;
+	u32 used_qps;
+	u32 arp_table_size;
+	u32 next_arp_index;
+	u32 ceqs_count;
+	u32 next_ws_node_id;
+	u32 max_ws_node_id;
+	u32 limits_sel;
+#ifndef CONFIG_DYNAMIC_DEBUG
+	u32 debug;
+#endif
+	unsigned long *allocated_ws_nodes;
+	unsigned long *allocated_qps;
+	unsigned long *allocated_cqs;
+	unsigned long *allocated_mrs;
+	unsigned long *allocated_pds;
+	unsigned long *allocated_mcgs;
+	unsigned long *allocated_ahs;
+	unsigned long *allocated_arps;
+	enum init_completion_state init_state;
+	struct irdma_sc_dev sc_dev;
+	struct list_head vsi_dev_list;
+	struct irdma_priv_ldev ldev;
+	struct irdma_handler *hdl;
+	struct pci_dev *pdev;
+	struct net_device *netdev;
+	struct irdma_hw hw;
+	struct irdma_cqp cqp;
+	struct irdma_ccq ccq;
+	struct irdma_aeq aeq;
+	struct irdma_ceq *ceqlist;
+	struct irdma_hmc_pble_rsrc *pble_rsrc;
+	struct irdma_arp_entry *arp_table;
+	spinlock_t rsrc_lock; /* protect HW resource array access */
+	spinlock_t qptable_lock; /*protect QP table access*/
+	struct irdma_qp **qp_table;
+	spinlock_t qh_list_lock;	/* protect mc_qht_list */
+	struct mc_table_list mc_qht_list;
+	struct irdma_msix_vector *iw_msixtbl;
+	struct irdma_qvlist_info *iw_qvlist;
+	struct tasklet_struct dpc_tasklet;
+	struct irdma_dma_mem obj_mem;
+	struct irdma_dma_mem obj_next;
+	atomic_t vchnl_msgs;
+	wait_queue_head_t vchnl_waitq;
+	struct workqueue_struct *free_qp_wq;
+	struct virtchnl_work virtchnl_w[IRDMA_MAX_PE_ENA_VF_COUNT];
+	struct irdma_sc_vsi default_vsi;
+	void *back_fcn;
+	void (*init_hw)(struct irdma_sc_dev *dev);
+};
+
+struct irdma_device {
+	struct irdma_ib_device *iwibdev;
+	struct list_head list;
+	struct irdma_pci_f *rf;
+	struct irdma_priv_ldev *ldev;
+	struct net_device *netdev;
+	struct irdma_handler *hdl;
+	struct irdma_sc_vsi vsi;
+	struct irdma_cm_core cm_core;
+	bool roce_mode;
+	u32 vendor_id;
+	u32 vendor_part_id;
+	u32 device_cap_flags;
+	u32 push_mode;
+	u32 rcv_wnd;
+	u16 mac_ip_table_idx;
+	u8 rcv_wscale;
+	bool dctcp_en;
+	bool ecn_en;
+	u16  vsi_num;
+	bool create_ilq;
+	bool roce_timely_en;
+	bool roce_dcqcn_en;
+	u8 iw_status;
+	struct tasklet_struct dpc_tasklet;
+	enum init_completion_state init_state;
+	bool dcb;
+	bool closing;
+	bool reset;
+	wait_queue_head_t close_wq;
+	wait_queue_head_t suspend_wq;
+	atomic64_t use_count;
+	struct workqueue_struct *param_wq;
+	atomic_t params_busy;
+};
+
+struct irdma_ib_device {
+	struct ib_device ibdev;
+	struct irdma_device *iwdev;
+};
+
+struct irdma_handler {
+	struct list_head list;
+	struct irdma_pci_f rf;
+	struct irdma_priv_ldev *ldev;
+	bool shared_res_created;
+};
+
+struct irdma_peer {
+	struct module *module;
+#define IRDMA_MAX_PEER_NAME_SIZE	8
+	char name[IRDMA_MAX_PEER_NAME_SIZE];
+	enum IRDMA_IDC_STATE state;
+	atomic_t ref_count;
+	int (*reg_peer_driver)(void *peer_info);
+	int (*unreg_peer_driver)(void *peer_info);
+};
+
+struct irdma_peer_drvs_list {
+	struct irdma_peer peer[IRDMA_MAX_PEERS];
+};
+/***********************************************************/
+/**
+ * to_iwdev - get device
+ * @ibdev: ib device
+ **/
+static inline struct irdma_device *to_iwdev(struct ib_device *ibdev)
+{
+	return container_of(ibdev, struct irdma_ib_device, ibdev)->iwdev;
+}
+
+/**
+ * to_ucontext - get user context
+ * @ibucontext: ib user context
+ **/
+static inline struct irdma_ucontext *to_ucontext(struct ib_ucontext *ibucontext)
+{
+	return container_of(ibucontext, struct irdma_ucontext, ibucontext);
+}
+
+/**
+ * to_iwpd - get protection domain
+ * @ibpd: ib pd
+ **/
+static inline struct irdma_pd *to_iwpd(struct ib_pd *ibpd)
+{
+	return container_of(ibpd, struct irdma_pd, ibpd);
+}
+
+/**
+ * to_iwah - get device ah
+ * @ibdev: ib ah
+ **/
+static inline struct irdma_ah *to_iwah(struct ib_ah *ibah)
+{
+	return container_of(ibah, struct irdma_ah, ibah);
+}
+
+/**
+ * to_iwmr - get device memory region
+ * @ibdev: ib memory region
+ **/
+static inline struct irdma_mr *to_iwmr(struct ib_mr *ibmr)
+{
+	return container_of(ibmr, struct irdma_mr, ibmr);
+}
+
+/**
+ * to_iwmr_from_ibfmr - get device memory region
+ * @ibfmr: ib fmr
+ **/
+static inline struct irdma_mr *to_iwmr_from_ibfmr(struct ib_fmr *ibfmr)
+{
+	return container_of(ibfmr, struct irdma_mr, ibfmr);
+}
+
+/**
+ * to_iwmw - get device memory window
+ * @ibmw: ib memory window
+ **/
+static inline struct irdma_mr *to_iwmw(struct ib_mw *ibmw)
+{
+	return container_of(ibmw, struct irdma_mr, ibmw);
+}
+
+/**
+ * to_iwcq - get completion queue
+ * @ibcq: ib cqdevice
+ **/
+static inline struct irdma_cq *to_iwcq(struct ib_cq *ibcq)
+{
+	return container_of(ibcq, struct irdma_cq, ibcq);
+}
+
+/**
+ * to_iwqp - get device qp
+ * @ibqp: ib qp
+ **/
+static inline struct irdma_qp *to_iwqp(struct ib_qp *ibqp)
+{
+	return container_of(ibqp, struct irdma_qp, ibqp);
+}
+
+/**
+ * irdma_alloc_resource - allocate a resource
+ * @iwdev: device pointer
+ * @resource_array: resource bit array:
+ * @max_resources: maximum resource number
+ * @req_resources_num: Allocated resource number
+ * @next: next free id
+ **/
+static inline int irdma_alloc_rsrc(struct irdma_pci_f *rf,
+				   unsigned long *rsrc_array,
+				   u32 max_rsrc,
+				   u32 *req_rsrc_num,
+				   u32 *next)
+{
+	u32 rsrc_num;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rf->rsrc_lock, flags);
+	rsrc_num = find_next_zero_bit(rsrc_array, max_rsrc, *next);
+	if (rsrc_num >= max_rsrc) {
+		rsrc_num =
+			find_first_zero_bit(rsrc_array, max_rsrc);
+		if (rsrc_num >= max_rsrc) {
+			spin_unlock_irqrestore(&rf->rsrc_lock, flags);
+			irdma_debug(&rf->sc_dev, IRDMA_DEBUG_ERR,
+				    "resource [%d] allocation failed\n",
+				    rsrc_num);
+			return -EOVERFLOW;
+		}
+	}
+	set_bit(rsrc_num, rsrc_array);
+	*next = rsrc_num + 1;
+	if (*next == max_rsrc)
+		*next = 0;
+	*req_rsrc_num = rsrc_num;
+	spin_unlock_irqrestore(&rf->rsrc_lock, flags);
+
+	return 0;
+}
+
+/**
+ * irdma_is_resource_allocated - detrmine if resource is
+ * allocated
+ * @iwdev: device pointer
+ * @resource_array: resource array for the resource_num
+ * @resource_num: resource number to check
+ **/
+static inline bool irdma_is_rsrc_allocated(struct irdma_pci_f *rf,
+					   unsigned long *rsrc_array,
+					   u32 rsrc_num)
+{
+	bool bit_is_set;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rf->rsrc_lock, flags);
+
+	bit_is_set = test_bit(rsrc_num, rsrc_array);
+	spin_unlock_irqrestore(&rf->rsrc_lock, flags);
+
+	return bit_is_set;
+}
+
+/**
+ * irdma_free_resource - free a resource
+ * @iwdev: device pointer
+ * @resource_array: resource array for the resource_num
+ * @resource_num: resource number to free
+ **/
+static inline void irdma_free_rsrc(struct irdma_pci_f *rf,
+				   unsigned long *rsrc_array,
+				   u32 rsrc_num)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rf->rsrc_lock, flags);
+	clear_bit(rsrc_num, rsrc_array);
+	spin_unlock_irqrestore(&rf->rsrc_lock, flags);
+}
+
+void irdma_init_rf_params(struct irdma_pci_f *rf);
+enum irdma_status_code irdma_ctrl_init_hw(struct irdma_pci_f *rf);
+void irdma_deinit_ctrl_hw(struct irdma_pci_f *rf);
+enum irdma_status_code irdma_rt_init_hw(struct irdma_pci_f *rf, struct irdma_device *iwdev,
+					struct irdma_l2params *l2params);
+void irdma_deinit_rt_device(struct irdma_device *iwdev);
+void irdma_add_ref(struct ib_qp *ibqp);
+void irdma_rem_ref(struct ib_qp *ibqp);
+struct ib_qp *irdma_get_qp(struct ib_device *ibdev, int qpn);
+void irdma_flush_wqes(struct irdma_pci_f *rf,
+		      struct irdma_qp *qp);
+void irdma_manage_arp_cache(struct irdma_pci_f *rf,
+			    unsigned char *mac_addr,
+			    u32 *ip_addr,
+			    bool ipv4,
+			    u32 action);
+int irdma_manage_apbvt(struct irdma_device *iwdev,
+		       u16 accel_local_port,
+		       bool add_port);
+struct irdma_cqp_request *irdma_get_cqp_request(struct irdma_cqp *cqp,
+						bool wait);
+void irdma_free_cqp_request(struct irdma_cqp *cqp,
+			    struct irdma_cqp_request *cqp_request);
+void irdma_put_cqp_request(struct irdma_cqp *cqp,
+			   struct irdma_cqp_request *cqp_request);
+struct irdma_device *irdma_find_netdev(struct net_device *netdev);
+struct irdma_handler *irdma_find_handler(struct pci_dev *pdev);
+struct irdma_device *irdma_find_iwdev(const char *name);
+void irdma_add_handler(struct irdma_handler *hdl);
+void irdma_del_handler(struct irdma_handler *hdl);
+void irdma_add_ip(struct irdma_device *iwdev);
+int irdma_alloc_local_mac_entry(struct irdma_pci_f *rf,
+				u16 *mac_tbl_idx);
+int irdma_add_local_mac_entry(struct irdma_pci_f *rf,
+			      u8 *mac_addr,
+			      u16 idx);
+void irdma_del_local_mac_entry(struct irdma_pci_f *rf, u16 idx);
+
+u32 irdma_initialize_hw_rsrc(struct irdma_pci_f *rf);
+int irdma_register_rdma_device(struct irdma_device *iwdev);
+void irdma_port_ibevent(struct irdma_device *iwdev);
+void irdma_cm_disconn(struct irdma_qp *qp);
+
+enum irdma_status_code irdma_handle_cqp_op(struct irdma_pci_f *rf,
+					   struct irdma_cqp_request *cqp_request);
+
+int irdma_modify_qp(struct ib_qp *ibqp,
+		    struct ib_qp_attr *attr,
+		    int attr_mask,
+		    struct ib_udata *udata);
+int irdma_modify_qp_roce(struct ib_qp *ibqp,
+			 struct ib_qp_attr *attr,
+			 int attr_mask,
+			 struct ib_udata *udata);
+void irdma_cq_wq_destroy(struct irdma_pci_f *rf, struct irdma_sc_cq *cq);
+
+void irdma_cleanup_pending_cqp_op(struct irdma_pci_f *rf);
+void irdma_rem_pdusecount(struct irdma_pd *iwpd, struct irdma_device *iwdev);
+void irdma_add_pdusecount(struct irdma_pd *iwpd);
+void irdma_rem_devusecount(struct irdma_device *iwdev);
+void irdma_add_devusecount(struct irdma_device *iwdev);
+enum irdma_status_code  irdma_hw_modify_qp(struct irdma_device *iwdev,
+					   struct irdma_qp *iwqp,
+					   struct irdma_modify_qp_info *info,
+					   bool wait);
+enum irdma_status_code irdma_qp_suspend_resume(struct irdma_sc_qp *qp,
+					       bool suspend);
+enum irdma_status_code
+irdma_manage_qhash(struct irdma_device *iwdev,
+		   struct irdma_cm_info *cminfo,
+		   enum irdma_quad_entry_type etype,
+		   enum irdma_quad_hash_manage_type mtype,
+		   void *cmnode,
+		   bool wait);
+void irdma_receive_ilq(struct irdma_sc_vsi *vsi, struct irdma_puda_buf *rbuf);
+void irdma_free_sqbuf(struct irdma_sc_vsi *vsi, void *bufp);
+void irdma_free_qp_rsrc(struct irdma_device *iwdev,
+			struct irdma_qp *iwqp,
+			u32 qp_num);
+void irdma_request_reset(struct irdma_pci_f *rf);
+void irdma_destroy_rdma_device(struct irdma_ib_device *iwibdev);
+void irdma_setup_cm_core(struct irdma_device *iwdev, u8 ver);
+void irdma_cleanup_cm_core(struct irdma_cm_core *cm_core);
+void irdma_process_ceq(struct irdma_pci_f *rf, struct irdma_ceq *iwceq);
+void irdma_process_aeq(struct irdma_pci_f *rf);
+void irdma_next_iw_state(struct irdma_qp *iwqp,
+			 u8 state, u8 del_hash,
+			 u8 term, u8 term_len);
+int irdma_send_syn(struct irdma_cm_node *cm_node, u32 sendack);
+int irdma_send_reset(struct irdma_cm_node *cm_node);
+struct irdma_cm_node *irdma_find_node(struct irdma_cm_core *cm_core,
+				      u16 rem_port,
+				      u32 *rem_addr,
+				      u16 loc_port,
+				      u32 *loc_addr,
+				      bool add_refcnt,
+				      bool accelerated_list);
+enum irdma_status_code irdma_hw_flush_wqes(struct irdma_pci_f *rf,
+					   struct irdma_sc_qp *qp,
+					   struct irdma_qp_flush_info *info,
+					   bool wait);
+void irdma_gen_ae(struct irdma_pci_f *rf,
+		  struct irdma_sc_qp *qp,
+		  struct irdma_gen_ae_info *info,
+		  bool wait);
+void irdma_copy_ip_ntohl(u32 *dst, __be32 *src);
+void irdma_copy_ip_htonl(__be32 *dst, u32 *src);
+u16 irdma_get_vlan_ipv4(u32 *addr);
+struct net_device *irdma_netdev_vlan_ipv6(u32 *addr, u16 *vlan_id, u8 *mac);
+struct ib_mr *irdma_reg_phys_mr(struct ib_pd *ib_pd,
+				u64 addr,
+				u64 size,
+				int acc,
+				u64 *iova_start);
+int cqp_compl_thread(void *context);
+int irdma_inetaddr_event(struct notifier_block *notifier,
+			 unsigned long event,
+			 void *ptr);
+int irdma_inet6addr_event(struct notifier_block *notifier,
+			  unsigned long event,
+			  void *ptr);
+int irdma_net_event(struct notifier_block *notifier,
+		    unsigned long event,
+		    void *ptr);
+int irdma_netdevice_event(struct notifier_block *notifier,
+			  unsigned long event,
+			  void *ptr);
+int i40iw_reg_peer_driver(struct irdma_peer *peer, struct net_device *netdev);
+int icrdma_reg_peer_driver(struct irdma_peer *peer, struct net_device *netdev);
+void i40iw_unreg_peer_driver(struct irdma_peer *peer);
+void icrdma_unreg_peer_driver(struct irdma_peer *peer);
+void i40iw_request_reset(struct irdma_pci_f *rf);
+void icrdma_request_reset(struct irdma_pci_f *rf);
+void irdma_probe_inc_ref(struct net_device *netdev);
+void irdma_probe_dec_ref(struct net_device *netdev);
+void irdma_handle_netdev(struct net_device *netdev);
+void irdma_register_notifiers(void);
+void irdma_unregister_notifiers(void);
+void irdma_cqp_ce_handler(struct irdma_pci_f *rf, struct irdma_sc_cq *cq);
+int irdma_ah_cqp_op(struct irdma_pci_f *rf,
+		    struct irdma_sc_ah *sc_ah,
+		    u8 cmd,
+		    bool wait,
+		    void (*callback_fcn)(struct irdma_cqp_request *cqp_request, u32 num),
+		    void *cb_param);
+void irdma_gsi_ud_qp_ah_cb(struct irdma_cqp_request *cqp_request, u32 unused);
+void irdma_destroy_ah_cb(struct irdma_cqp_request *cqp_request, u32 unused);
+int irdma_configfs_init(void);
+void irdma_configfs_exit(void);
+#endif /* IRDMA_MAIN_H */
-- 
1.8.3.1


^ permalink raw reply related

* [RFC 03/19] net/ice: Add support for ice peer devices and drivers
From: Shiraz Saleem @ 2019-02-12 21:43 UTC (permalink / raw)
  To: dledford, jgg, davem
  Cc: linux-rdma, netdev, mustafa.ismail, jeffrey.t.kirsher,
	Anirudh Venkataramanan
In-Reply-To: <20190212214402.23284-1-shiraz.saleem@intel.com>

From: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>

The E800 series of Ethernet devices has multiple hardware blocks, of
which RDMA is one. The RDMA block isn't interfaced directly to PCI
or any other bus. The RDMA driver (irdma) thus depends on the ice
driver to provide access to the RDMA hardware block.

The ice driver first creates a pseudo bus and then creates and attaches
a new device to the pseudo bus using device_register(). This new device
is referred to as a "peer device" and the associated driver (i.e. irdma)
is a "peer driver" to ice. Once the peer driver loads, it can call
ice driver functions exposed to it via struct ice_ops. Similarly, ice can
call peer driver functions exposed to it via struct ice_peer_ops.

This whole mechanism of creating peer devices and registering peer
drivers, and subsequent interaction between the two is referred to as
Inter-Driver Communication (IDC).

For the purposes of this RFC, this patch is being submitted as a
monolothic patch. It will be broken up into multiple patches for
the final submission.

Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ice/Makefile          |    1 +
 drivers/net/ethernet/intel/ice/ice.h             |   14 +
 drivers/net/ethernet/intel/ice/ice_adminq_cmd.h  |   32 +
 drivers/net/ethernet/intel/ice/ice_common.c      |  189 +++
 drivers/net/ethernet/intel/ice/ice_common.h      |    9 +
 drivers/net/ethernet/intel/ice/ice_idc.c         | 1527 ++++++++++++++++++++++
 drivers/net/ethernet/intel/ice/ice_idc.h         |  402 ++++++
 drivers/net/ethernet/intel/ice/ice_idc_int.h     |   99 ++
 drivers/net/ethernet/intel/ice/ice_lib.c         |   24 +
 drivers/net/ethernet/intel/ice/ice_lib.h         |    2 +
 drivers/net/ethernet/intel/ice/ice_main.c        |  143 +-
 drivers/net/ethernet/intel/ice/ice_switch.c      |   23 +
 drivers/net/ethernet/intel/ice/ice_switch.h      |    2 +
 drivers/net/ethernet/intel/ice/ice_type.h        |    2 +
 drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c |   25 -
 15 files changed, 2465 insertions(+), 29 deletions(-)
 create mode 100644 drivers/net/ethernet/intel/ice/ice_idc.c
 create mode 100644 drivers/net/ethernet/intel/ice/ice_idc.h
 create mode 100644 drivers/net/ethernet/intel/ice/ice_idc_int.h

diff --git a/drivers/net/ethernet/intel/ice/Makefile b/drivers/net/ethernet/intel/ice/Makefile
index e5d6f68..62a8e91 100644
--- a/drivers/net/ethernet/intel/ice/Makefile
+++ b/drivers/net/ethernet/intel/ice/Makefile
@@ -15,5 +15,6 @@ ice-y := ice_main.o	\
 	 ice_sched.o	\
 	 ice_lib.o	\
 	 ice_txrx.o	\
+	 ice_idc.o	\
 	 ice_ethtool.o
 ice-$(CONFIG_PCI_IOV) += ice_virtchnl_pf.o ice_sriov.o
diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
index a385575..790881c4 100644
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -36,6 +36,7 @@
 #include "ice_switch.h"
 #include "ice_common.h"
 #include "ice_sched.h"
+#include "ice_idc_int.h"
 #include "ice_virtchnl_pf.h"
 #include "ice_sriov.h"
 
@@ -64,6 +65,7 @@
 #define ICE_MAX_SMALL_RSS_QS	8
 #define ICE_RES_VALID_BIT	0x8000
 #define ICE_RES_MISC_VEC_ID	(ICE_RES_VALID_BIT - 1)
+#define ICE_RES_RDMA_VEC_ID	(ICE_RES_MISC_VEC_ID - 1)
 #define ICE_INVAL_Q_INDEX	0xffff
 #define ICE_INVAL_VFID		256
 #define ICE_MAX_VF_COUNT	256
@@ -243,6 +245,7 @@ struct ice_vsi {
 	u16 alloc_rxq;			 /* Allocated Rx queues */
 	u16 num_rxq;			 /* Used Rx queues */
 	u16 num_desc;
+	u16 qset_handle[ICE_MAX_TRAFFIC_CLASS];
 	struct ice_tc_cfg tc_cfg;
 } ____cacheline_internodealigned_in_smp;
 
@@ -267,6 +270,7 @@ struct ice_q_vector {
 enum ice_pf_flags {
 	ICE_FLAG_MSIX_ENA,
 	ICE_FLAG_FLTR_SYNC,
+	ICE_FLAG_IWARP_ENA,
 	ICE_FLAG_RSS_ENA,
 	ICE_FLAG_SRIOV_ENA,
 	ICE_FLAG_SRIOV_CAPABLE,
@@ -302,6 +306,10 @@ struct ice_pf {
 	struct mutex avail_q_mutex;	/* protects access to avail_[rx|tx]qs */
 	struct mutex sw_mutex;		/* lock for protecting VSI alloc flow */
 	u32 msg_enable;
+	/* Total number of MSIX vectors reserved for base driver */
+	u32 num_rdma_msix;
+	u32 rdma_base_vector;
+	struct ice_peer_dev *rdma_peer;
 	u32 hw_csum_rx_error;
 	u32 sw_oicr_idx;	/* Other interrupt cause SW vector index */
 	u32 num_avail_sw_msix;	/* remaining MSIX SW vectors left unclaimed */
@@ -330,9 +338,13 @@ struct ice_pf {
 };
 
 struct ice_netdev_priv {
+	struct idc_srv_provider prov_callbacks;
 	struct ice_vsi *vsi;
 };
 
+extern struct bus_type ice_peer_bus;
+extern struct ida ice_peer_index_ida;
+
 /**
  * ice_irq_dynamic_ena - Enable default interrupt generation settings
  * @hw: pointer to hw struct
@@ -370,7 +382,9 @@ static inline void ice_vsi_set_tc_cfg(struct ice_vsi *vsi)
 int ice_set_rss(struct ice_vsi *vsi, u8 *seed, u8 *lut, u16 lut_size);
 int ice_get_rss(struct ice_vsi *vsi, u8 *seed, u8 *lut, u16 lut_size);
 void ice_fill_rss_lut(u8 *lut, u16 rss_table_size, u16 rss_size);
+int ice_schedule_reset(struct ice_pf *pf, enum ice_reset_req reset);
 void ice_print_link_msg(struct ice_vsi *vsi, bool isup);
+int ice_init_peer_devices(struct ice_pf *pf);
 void ice_napi_del(struct ice_vsi *vsi);
 
 #endif /* _ICE_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
index fcdcd80..aa01239 100644
--- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
@@ -1226,6 +1226,36 @@ struct ice_aqc_dis_txq {
 	struct ice_aqc_dis_txq_item qgrps[1];
 };
 
+/* Add Tx RDMA Queue Set (indirect 0x0C33) */
+struct ice_aqc_add_rdma_qset {
+	u8 num_qset_grps;
+	u8 reserved[7];
+	__le32 addr_high;
+	__le32 addr_low;
+};
+
+/* This is the descriptor of each qset entry for the Add Tx RDMA Queue Set
+ * command (0x0C33). Only used within struct ice_aqc_add_rdma_qset.
+ */
+struct ice_aqc_add_tx_rdma_qset_entry {
+	__le16 tx_qset_id;
+	u8 rsvd[2];
+	__le32 qset_teid;
+	struct ice_aqc_txsched_elem info;
+};
+
+/* The format of the command buffer for Add Tx RDMA Queue Set(0x0C33)
+ * is an array of the following structs. Please note that the length of
+ * each struct ice_aqc_add_rdma_qset is variable due to the variable
+ * number of queues in each group!
+ */
+struct ice_aqc_add_rdma_qset_data {
+	__le32 parent_teid;
+	__le16 num_qsets;
+	u8 rsvd[2];
+	struct ice_aqc_add_tx_rdma_qset_entry rdma_qsets[1];
+};
+
 /* Configure Firmware Logging Command (indirect 0xFF09)
  * Logging Information Read Response (indirect 0xFF10)
  * Note: The 0xFF10 command has no input parameters.
@@ -1353,6 +1383,7 @@ struct ice_aq_desc {
 		struct ice_aqc_get_set_rss_key get_set_rss_key;
 		struct ice_aqc_add_txqs add_txqs;
 		struct ice_aqc_dis_txqs dis_txqs;
+		struct ice_aqc_add_rdma_qset add_rdma_qset;
 		struct ice_aqc_add_get_update_free_vsi vsi_cmd;
 		struct ice_aqc_add_update_free_vsi_resp add_update_free_vsi_res;
 		struct ice_aqc_fw_logging fw_logging;
@@ -1459,6 +1490,7 @@ enum ice_adminq_opc {
 	/* TX queue handling commands/events */
 	ice_aqc_opc_add_txqs				= 0x0C30,
 	ice_aqc_opc_dis_txqs				= 0x0C31,
+	ice_aqc_opc_add_rdma_qset			= 0x0C33,
 
 	/* debug commands */
 	ice_aqc_opc_fw_logging				= 0xFF09,
diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c
index 4c1d35d..16a712c 100644
--- a/drivers/net/ethernet/intel/ice/ice_common.c
+++ b/drivers/net/ethernet/intel/ice/ice_common.c
@@ -2381,6 +2381,59 @@ enum ice_status
 	return ice_aq_send_cmd(hw, &desc, qg_list, buf_size, cd);
 }
 
+/**
+ * ice_aq_add_rdma_qsets
+ * @hw: pointer to the hardware structure
+ * @num_qset_grps: Number of RDMA Qset groups
+ * @qset_list: list of qset groups to be added
+ * @buf_size: size of buffer for indirect command
+ * @cd: pointer to command details structure or NULL
+ *
+ * Add Tx RDMA Qsets (0x0C33)
+ */
+static enum ice_status
+ice_aq_add_rdma_qsets(struct ice_hw *hw, u8 num_qset_grps,
+		      struct ice_aqc_add_rdma_qset_data *qset_list,
+		      u16 buf_size, struct ice_sq_cd *cd)
+{
+	struct ice_aqc_add_rdma_qset_data *list;
+	u16 i, sum_header_size, sum_q_size = 0;
+	struct ice_aqc_add_rdma_qset *cmd;
+	struct ice_aq_desc desc;
+
+	cmd = &desc.params.add_rdma_qset;
+
+	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_add_rdma_qset);
+
+	if (!qset_list)
+		return ICE_ERR_PARAM;
+
+	if (num_qset_grps > ICE_LAN_TXQ_MAX_QGRPS)
+		return ICE_ERR_PARAM;
+
+	sum_header_size = num_qset_grps *
+		(sizeof(*qset_list) - sizeof(*qset_list->rdma_qsets));
+
+	list = qset_list;
+	for (i = 0; i < num_qset_grps; i++) {
+		struct ice_aqc_add_tx_rdma_qset_entry *qset = list->rdma_qsets;
+		u16 num_qsets = le16_to_cpu(list->num_qsets);
+
+		sum_q_size += num_qsets * sizeof(*qset);
+		list = (struct ice_aqc_add_rdma_qset_data *)
+			(qset + num_qsets);
+	}
+
+	if (buf_size != (sum_header_size + sum_q_size))
+		return ICE_ERR_PARAM;
+
+	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+
+	cmd->num_qset_grps = num_qset_grps;
+
+	return ice_aq_send_cmd(hw, &desc, qset_list, buf_size, cd);
+}
+
 /* End of FW Admin Queue command wrappers */
 
 /**
@@ -2792,6 +2845,142 @@ enum ice_status
 }
 
 /**
+ * ice_cfg_vsi_rdma - configure the VSI RDMA queues
+ * @pi: port information structure
+ * @vsi_handle: software VSI handle
+ * @tc_bitmap: TC bitmap
+ * @max_rdmaqs: max RDMA queues array per TC
+ *
+ * This function adds/updates the VSI RDMA queues per TC.
+ */
+enum ice_status
+ice_cfg_vsi_rdma(struct ice_port_info *pi, u16 vsi_handle, u8 tc_bitmap,
+		 u16 *max_rdmaqs)
+{
+	return ice_cfg_vsi_qs(pi, vsi_handle, tc_bitmap, max_rdmaqs,
+			      ICE_SCHED_NODE_OWNER_RDMA);
+}
+
+/**
+ * ice_ena_vsi_rdma_qset
+ * @pi: port information structure
+ * @vsi_handle: software VSI handle
+ * @tc: TC number
+ * @rdma_qset: pointer to RDMA qset
+ * @num_qsets: number of RDMA qsets
+ * @qset_teid: pointer to qset node teids
+ *
+ * This function adds RDMA qset
+ */
+enum ice_status
+ice_ena_vsi_rdma_qset(struct ice_port_info *pi, u16 vsi_handle, u8 tc,
+		      u16 *rdma_qset, u16 num_qsets, u32 *qset_teid)
+{
+	struct ice_aqc_txsched_elem_data node = { 0 };
+	struct ice_aqc_add_rdma_qset_data *buf;
+	struct ice_sched_node *parent;
+	enum ice_status status;
+	struct ice_hw *hw;
+	u16 buf_size;
+	u8 i;
+
+	if (!pi || pi->port_state != ICE_SCHED_PORT_STATE_READY)
+		return ICE_ERR_CFG;
+	hw = pi->hw;
+
+	if (!ice_is_vsi_valid(hw, vsi_handle))
+		return ICE_ERR_PARAM;
+
+	buf_size = sizeof(*buf) + sizeof(*buf->rdma_qsets) * (num_qsets - 1);
+	buf = devm_kzalloc(ice_hw_to_dev(hw), buf_size, GFP_KERNEL);
+	if (!buf)
+		return ICE_ERR_NO_MEMORY;
+	mutex_lock(&pi->sched_lock);
+
+	parent = ice_sched_get_free_qparent(pi, vsi_handle, tc,
+					    ICE_SCHED_NODE_OWNER_RDMA);
+	if (!parent) {
+		status = ICE_ERR_PARAM;
+		goto rdma_error_exit;
+	}
+	buf->parent_teid = parent->info.node_teid;
+	node.parent_teid = parent->info.node_teid;
+
+	buf->num_qsets = cpu_to_le16(num_qsets);
+	for (i = 0; i < num_qsets; i++) {
+		buf->rdma_qsets[i].tx_qset_id = cpu_to_le16(rdma_qset[i]);
+		buf->rdma_qsets[i].info.valid_sections =
+						ICE_AQC_ELEM_VALID_GENERIC;
+	}
+	status = ice_aq_add_rdma_qsets(hw, 1, buf, buf_size, NULL);
+	if (status) {
+		ice_debug(hw, ICE_DBG_RDMA, "add RDMA qset failed\n");
+		goto rdma_error_exit;
+	}
+	node.data.elem_type = ICE_AQC_ELEM_TYPE_LEAF;
+	for (i = 0; i < num_qsets; i++) {
+		node.node_teid = buf->rdma_qsets[i].qset_teid;
+		status = ice_sched_add_node(pi, hw->num_tx_sched_layers - 1,
+					    &node);
+		if (status)
+			break;
+		qset_teid[i] = le32_to_cpu(node.node_teid);
+	}
+rdma_error_exit:
+	mutex_unlock(&pi->sched_lock);
+	devm_kfree(ice_hw_to_dev(hw), buf);
+	return status;
+}
+
+/**
+ * ice_dis_vsi_rdma_qset - free RMDA resources
+ * @pi: port_info struct
+ * @count: number of RDMA qsets to free
+ * @qset_teid: TEID of qset node
+ * @q_id: list of queue IDs being disabled
+ */
+enum ice_status
+ice_dis_vsi_rdma_qset(struct ice_port_info *pi, u16 count, u32 *qset_teid,
+		      u16 *q_id)
+{
+	struct ice_aqc_dis_txq_item qg_list;
+	enum ice_status status = 0;
+	u16 qg_size;
+	int i;
+
+	if (!pi || pi->port_state != ICE_SCHED_PORT_STATE_READY)
+		return ICE_ERR_CFG;
+
+	qg_size = sizeof(qg_list);
+
+	mutex_lock(&pi->sched_lock);
+
+	for (i = 0; i < count; i++) {
+		struct ice_sched_node *node;
+
+		node = ice_sched_find_node_by_teid(pi->root, qset_teid[i]);
+		if (!node)
+			continue;
+
+		qg_list.parent_teid = node->info.parent_teid;
+		qg_list.num_qs = 1;
+		qg_list.q_id[0] =
+			cpu_to_le16(q_id[i] |
+				    ICE_AQC_Q_DIS_BUF_ELEM_TYPE_RDMA_QSET);
+
+		status = ice_aq_dis_lan_txq(pi->hw, 1, &qg_list, qg_size,
+					    ICE_NO_RESET, 0, NULL);
+		if (status)
+			break;
+
+		ice_free_sched_node(pi, node);
+	}
+
+	mutex_unlock(&pi->sched_lock);
+	return status;
+}
+
+/**
  * ice_replay_pre_init - replay pre initialization
  * @hw: pointer to the hw struct
  *
diff --git a/drivers/net/ethernet/intel/ice/ice_common.h b/drivers/net/ethernet/intel/ice/ice_common.h
index cf760c2..c436376 100644
--- a/drivers/net/ethernet/intel/ice/ice_common.h
+++ b/drivers/net/ethernet/intel/ice/ice_common.h
@@ -86,6 +86,15 @@ enum ice_status
 ice_aq_set_link_restart_an(struct ice_port_info *pi, bool ena_link,
 			   struct ice_sq_cd *cd);
 enum ice_status
+ice_cfg_vsi_rdma(struct ice_port_info *pi, u16 vsi_handle, u8 tc_bitmap,
+		 u16 *max_rdmaqs);
+enum ice_status
+ice_ena_vsi_rdma_qset(struct ice_port_info *pi, u16 vsi_handle, u8 tc,
+		      u16 *rdma_qset, u16 num_qsets, u32 *qset_teid);
+enum ice_status
+ice_dis_vsi_rdma_qset(struct ice_port_info *pi, u16 count, u32 *qset_teid,
+		      u16 *q_id);
+enum ice_status
 ice_dis_vsi_txq(struct ice_port_info *pi, u8 num_queues, u16 *q_ids,
 		u32 *q_teids, enum ice_disq_rst_src rst_src, u16 vmvf_num,
 		struct ice_sq_cd *cmd_details);
diff --git a/drivers/net/ethernet/intel/ice/ice_idc.c b/drivers/net/ethernet/intel/ice/ice_idc.c
new file mode 100644
index 0000000..42f4c14
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/ice_idc.c
@@ -0,0 +1,1527 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Intel Corporation. */
+
+/* Inter-Driver Communication */
+#include "ice.h"
+#include "ice_lib.h"
+
+DEFINE_IDA(ice_peer_index_ida);
+DEFINE_MUTEX(ice_peer_drv_mutex); /* lock for accessing list of peer drivers */
+LIST_HEAD(ice_peer_drv_list);
+
+#define ICE_PEER_PCI_RES_LEN (BIT_ULL(18) - 1)
+#define ICE_PEER_SW_RES_START 0x02D00000
+#define ICE_PEER_AE_RES_START (ICE_PEER_SW_RES_START | BIT_ULL(18))
+#define ICE_PEER_INLINE_CRYPTO_RES_START (ICE_PEER_SW_RES_START | BIT_ULL(19))
+#define ICE_PEER_AE_NUM_MSIX 2
+#define ICE_PEER_SW_NUM_MSIX 2
+#define ICE_PEER_IPSEC_NUM_MSIX 2
+
+/**
+ * ice_verify_peer - verify peer device it is legit
+ * @dev: ptr to device
+ *
+ * This function verified 'dev' if it is legit (means is it one of the peer
+ * device whose bus matches with the bus exposed by this driver
+ */
+static bool ice_verify_peer(struct device *dev)
+{
+	return dev->bus == &ice_peer_bus;
+}
+
+/**
+ * ice_peer_state_change - manage state machine for peer
+ * @peer_dev: pointer to peer's configuration
+ * @new_state: the state requested to transition into
+ *
+ * This function handles all state transitions for peer devices.
+ * The state machine is as follows:
+ *
+ *     +<------------------------------------------------------+
+ *					 +<----------+	       +
+ *					 +	     +	       +
+ *    INIT  -->  PROBE  --> PROBED --> OPENED --> CLOSED --> REMOVED
+ *     +	  +			 +	     +
+ *     +----------+		      PREP_RST	     +
+ *					 +	     +
+ *				      PREPPED	     +
+ *					 +---------->+
+ */
+static void
+ice_peer_state_change(struct ice_peer_dev_int *peer_dev, long new_state)
+{
+	switch (new_state) {
+	case ICE_PEER_DEV_STATE_INIT:
+		if (test_and_clear_bit(ICE_PEER_DEV_STATE_PROBE,
+				       peer_dev->state)) {
+			set_bit(ICE_PEER_DEV_STATE_INIT, peer_dev->state);
+			pr_info("state transition from _PROBE to _INIT\n");
+		} else if (test_and_clear_bit(ICE_PEER_DEV_STATE_REMOVED,
+					      peer_dev->state)) {
+			set_bit(ICE_PEER_DEV_STATE_INIT, peer_dev->state);
+			pr_info("state transition from _REMOVED to _INIT\n");
+		} else {
+			set_bit(ICE_PEER_DEV_STATE_INIT, peer_dev->state);
+			pr_info("state set to _INIT\n");
+		}
+		break;
+	case ICE_PEER_DEV_STATE_PROBE:
+		if (test_and_clear_bit(ICE_PEER_DEV_STATE_INIT,
+				       peer_dev->state)) {
+			set_bit(ICE_PEER_DEV_STATE_PROBE, peer_dev->state);
+			pr_info("state transition from _INIT to _PROBE\n");
+		}
+		break;
+	case ICE_PEER_DEV_STATE_PROBED:
+		if (test_and_clear_bit(ICE_PEER_DEV_STATE_PROBE,
+				       peer_dev->state)) {
+			set_bit(ICE_PEER_DEV_STATE_PROBED, peer_dev->state);
+			pr_info("state transition from _PROBE to _PROBED\n");
+		}
+		break;
+	case ICE_PEER_DEV_STATE_OPENED:
+		if (test_and_clear_bit(ICE_PEER_DEV_STATE_PROBED,
+				       peer_dev->state)) {
+			set_bit(ICE_PEER_DEV_STATE_OPENED, peer_dev->state);
+			pr_info("state transition from _PROBED to _OPENED\n");
+		} else if (test_and_clear_bit(ICE_PEER_DEV_STATE_CLOSED,
+				       peer_dev->state)) {
+			set_bit(ICE_PEER_DEV_STATE_OPENED, peer_dev->state);
+			pr_info("state transition from _CLOSED to _OPENED\n");
+		}
+		break;
+	case ICE_PEER_DEV_STATE_PREP_RST:
+		if (test_and_clear_bit(ICE_PEER_DEV_STATE_OPENED,
+				       peer_dev->state)) {
+			set_bit(ICE_PEER_DEV_STATE_PREP_RST, peer_dev->state);
+			pr_info("state transition from _OPENED to _PREP_RST\n");
+		}
+		break;
+	case ICE_PEER_DEV_STATE_PREPPED:
+		if (test_and_clear_bit(ICE_PEER_DEV_STATE_PREP_RST,
+				       peer_dev->state)) {
+			set_bit(ICE_PEER_DEV_STATE_PREPPED, peer_dev->state);
+			pr_info("state transition _PREP_RST to _PREPPED\n");
+		}
+		break;
+	case ICE_PEER_DEV_STATE_CLOSED:
+		if (test_and_clear_bit(ICE_PEER_DEV_STATE_OPENED,
+				       peer_dev->state)) {
+			set_bit(ICE_PEER_DEV_STATE_CLOSED, peer_dev->state);
+			pr_info("state transition from _OPENED to _CLOSED\n");
+		}
+		if (test_and_clear_bit(ICE_PEER_DEV_STATE_PREPPED,
+				       peer_dev->state)) {
+			set_bit(ICE_PEER_DEV_STATE_CLOSED, peer_dev->state);
+			pr_info("state transition from _PREPPED to _CLOSED\n");
+		}
+		/* NOTE - up to peer to handle this situation correctly */
+		if (test_and_clear_bit(ICE_PEER_DEV_STATE_PREP_RST,
+				       peer_dev->state)) {
+			set_bit(ICE_PEER_DEV_STATE_CLOSED, peer_dev->state);
+			pr_warn("WARN: Peer state from PREP_RST to _CLOSED\n");
+		}
+		break;
+	case ICE_PEER_DEV_STATE_REMOVED:
+		if (test_and_clear_bit(ICE_PEER_DEV_STATE_OPENED,
+				       peer_dev->state) ||
+		    test_and_clear_bit(ICE_PEER_DEV_STATE_CLOSED,
+				       peer_dev->state)) {
+			set_bit(ICE_PEER_DEV_STATE_REMOVED, peer_dev->state);
+			pr_info("state from _OPENED/_CLOSED to _REMOVED\n");
+			/* Clear registration for events when peer removed */
+			bitmap_zero(peer_dev->events, ICE_PEER_DEV_STATE_NBITS);
+		}
+		break;
+	default:
+		break;
+	}
+}
+
+/**
+ * ice_peer_close - close a peer device
+ * @dev: device to close
+ * @data: pointer to opaque data
+ *
+ * This function will also set the state bit for the peer to CLOSED. This
+ * function is meant to be called from a bus_for_each_dev().
+ */
+int ice_peer_close(struct device *dev, void *data)
+{
+	enum ice_close_reason reason = *(enum ice_close_reason *)(data);
+	struct ice_peer_dev *peer_dev = dev_to_ice_peer(dev);
+	struct ice_peer_dev_int *peer_dev_int;
+	struct ice_pf *pf;
+	int i;
+
+	/* return 0 so bus_for_each_device will continue closing other peers */
+	if (!peer_dev)
+		return 0;
+	if (!peer_dev->pdev)
+		return 0;
+
+	peer_dev_int = peer_to_ice_dev_int(peer_dev);
+	if (!peer_dev_int)
+		return 0;
+
+	pf = pci_get_drvdata(peer_dev->pdev);
+	if (!pf)
+		return 0;
+
+	if (test_bit(__ICE_DOWN, pf->state) ||
+	    test_bit(__ICE_SUSPENDED, pf->state) ||
+	    test_bit(__ICE_NEEDS_RESTART, pf->state))
+		return 0;
+
+	/* no peer driver or it's already closed, nothing to do */
+	if (!dev->driver ||
+	    test_bit(ICE_PEER_DEV_STATE_CLOSED, peer_dev_int->state) ||
+	    test_bit(ICE_PEER_DEV_STATE_REMOVED, peer_dev_int->state))
+		return 0;
+
+	/* Set the peer state to CLOSED */
+	ice_peer_state_change(peer_dev_int, ICE_PEER_DEV_STATE_CLOSED);
+
+	for (i = 0; i < ICE_EVENT_NBITS; i++)
+		bitmap_zero(peer_dev_int->current_events[i].type,
+			    ICE_EVENT_NBITS);
+
+	if (peer_dev->peer_ops && peer_dev->peer_ops->close)
+		peer_dev->peer_ops->close(peer_dev, reason);
+
+	return 0;
+}
+
+/**
+ * ice_bus_match - check for peer match
+ * @dev: pointer to device struct for peer
+ * @drv: pointer to device driver struct for peer
+ *
+ * This function returns > zero in case it found a supported device,
+ * and zero for an unsupported device.
+ */
+static int ice_bus_match(struct device *dev, struct device_driver *drv)
+{
+	struct ice_peer_dev *peer_dev = dev_to_ice_peer(dev);
+	struct ice_peer_drv *peer_drv = drv_to_ice_peer(drv);
+
+	/* Make sure peer device and peer driver's vendor and device_id
+	 * matches. If matches, success, otherwise failure
+	 */
+	if (peer_dev->dev_id.vendor == peer_drv->dev_id.vendor &&
+	    peer_dev->dev_id.device == peer_drv->dev_id.device)
+		return 1;
+
+	return 0;
+}
+
+/**
+ * ice_bus_probe - bus probe function
+ * @dev: ptr to peer device
+ *
+ * This function is invoked by OS bus_infrastructure if bus_match function
+ * returns success (1). It performs basic initialization and delays remainder
+ * of initialization (including calling peer driver's probe), which is handled
+ * by service_task. It sets correct device STATE.
+ */
+static int ice_bus_probe(struct device *dev)
+{
+	struct ice_peer_dev_int *peer_dev_int;
+	struct ice_peer_drv *peer_drv;
+	struct ice_peer_dev *peer_dev;
+
+	if (!dev->driver) {
+		/* no peer driver registered */
+		return 0;
+	}
+
+	if (!ice_verify_peer(dev)) {
+		/* since it is not one of our peer device, cannot trust
+		 * 'data', hence prefer to not use dev_* for err.
+		 */
+		pr_err("%s: failed to verify peer dev %s\n", __func__,
+		       dev->driver->name ? dev->driver->name : "");
+		return 0;
+	}
+
+	peer_drv = drv_to_ice_peer(dev->driver);
+	if (!peer_drv)
+		return 0;
+
+	peer_dev = dev_to_ice_peer(dev);
+	if (!peer_dev)
+		return 0;
+
+	peer_dev_int = peer_to_ice_dev_int(peer_dev);
+	if (!peer_dev_int)
+		return -EINVAL;
+
+	switch (peer_drv->dev_id.device) {
+	case ICE_PEER_RDMA_DEV:
+		break;
+	default:
+		pr_err("unsupported device ID %u\n", peer_drv->dev_id.device);
+		return 0;
+	}
+
+	/* Clear state bitmap on (re)registering devices */
+	bitmap_zero(peer_dev_int->state, ICE_PEER_DEV_STATE_NBITS);
+
+	/* For now , just mark the state of peer device and handle rest of the
+	 * initialization in service_task and then call peer driver "probe"
+	 */
+	ice_peer_state_change(peer_dev_int, ICE_PEER_DEV_STATE_INIT);
+
+	return 0;
+}
+
+/**
+ * ice_bus_remove - bus remove function
+ * @dev: ptr to peer device
+ *
+ * This function is invoked as a result of driver_unregister being invoked from
+ * ice_unreg_peer_driver function. This function in turn calls
+ * peer_driver's "close" and then "remove" function.
+ */
+static int ice_bus_remove(struct device *dev)
+{
+	enum ice_close_reason reason = ICE_REASON_PEER_DRV_UNREG;
+	struct ice_peer_dev_int *peer_dev_int;
+	struct ice_peer_drv *peer_drv;
+	struct ice_peer_dev *peer_dev;
+	struct ice_pf *pf;
+	int i;
+
+	/* no peer driver registered */
+	if (!dev->driver)
+		return 0;
+
+	if (!ice_verify_peer(dev)) {
+		/* since it is not one of our peer device, cannot trust
+		 * 'data', hence prefer to not use dev_* for err.
+		 */
+		pr_err("%s: failed to verify peer dev %s\n", __func__,
+		       dev->driver->name ? dev->driver->name : "");
+		return 0;
+	}
+
+	peer_drv = drv_to_ice_peer(dev->driver);
+	if (!peer_drv)
+		return 0;
+
+	peer_dev = dev_to_ice_peer(dev);
+	if (!peer_dev)
+		return 0;
+
+	peer_dev_int = peer_to_ice_dev_int(peer_dev);
+	if (!peer_dev_int)
+		return 0;
+	/* What action we take here depends on where the peer is in the
+	 * state machine. The return value for ice_bus_remove is largely
+	 * ignored by the kernel, so we need to make the best choice based
+	 * only on what we know about the peer.
+	 */
+
+	/* peer already removed */
+	if (test_bit(ICE_PEER_DEV_STATE_REMOVED, peer_dev_int->state))
+		return 0;
+
+	/* check for reset in progress before proceeding */
+	pf = pci_get_drvdata(peer_dev->pdev);
+	for (i = 0; i < ICE_MAX_RESET_WAIT; i++) {
+		if (!ice_is_reset_in_progress(pf->state))
+			break;
+		msleep(100);
+	}
+
+	/* peer still in init - nothing done yet */
+	if (test_bit(ICE_PEER_DEV_STATE_INIT, peer_dev_int->state))
+		goto exit_setstate;
+
+	/* is there an active function call out to peer */
+	if (test_bit(ICE_PEER_DEV_STATE_PROBE, peer_dev_int->state) ||
+	    test_bit(ICE_PEER_DEV_STATE_PREP_RST, peer_dev_int->state))
+		for (i = 0; i < ICE_IDC_MAX_STATE_WAIT; i++) {
+			if (!test_bit(ICE_PEER_DEV_STATE_PROBE,
+				      peer_dev_int->state) &&
+			    !test_bit(ICE_PEER_DEV_STATE_PREP_RST,
+				      peer_dev_int->state))
+				break;
+			msleep(100);
+		}
+
+	/* probe finished but not open yet */
+	if (test_bit(ICE_PEER_DEV_STATE_PROBED, peer_dev_int->state))
+		goto exit_remove;
+
+	/* is peer stuck in probe or in any intermediate state
+	 * no sense in calling any other API entries
+	 */
+	if (test_bit(ICE_PEER_DEV_STATE_PROBE, peer_dev_int->state) ||
+	    test_bit(ICE_PEER_DEV_STATE_PREP_RST, peer_dev_int->state))
+		goto exit_setstate;
+
+	/* is peer prepped for reset or in nominal open state */
+	if (test_bit(ICE_PEER_DEV_STATE_PREPPED, peer_dev_int->state) ||
+	    test_bit(ICE_PEER_DEV_STATE_OPENED, peer_dev_int->state))
+		goto exit_close;
+
+	/* peer is closed */
+	if (test_bit(ICE_PEER_DEV_STATE_CLOSED, peer_dev_int->state))
+		goto exit_remove;
+
+	/* peer in unknown state */
+	goto exit_setstate;
+
+exit_close:
+	ice_peer_close(dev, &reason);
+exit_remove:
+	if (peer_drv->remove)
+		peer_drv->remove(peer_dev);
+exit_setstate:
+	pr_info("Setting peer state to _REMOVED for peer device %s\n",
+		dev->driver->name ? dev->driver->name : "");
+	bitmap_zero(peer_dev_int->state, ICE_PEER_DEV_STATE_NBITS);
+	set_bit(ICE_PEER_DEV_STATE_REMOVED, peer_dev_int->state);
+	peer_dev->peer_ops = NULL;
+
+	return 0;
+}
+
+struct bus_type ice_peer_bus = {
+	.name = "ice_pseudo_bus",
+	.match = ice_bus_match,
+	.probe = ice_bus_probe,
+	.remove = ice_bus_remove,
+};
+
+/**
+ * ice_validate_peer_dev - validate peer device state
+ * @peer: ptr to peer device
+ *
+ * This helper function checks if pf in a minimal state and if the peer device
+ * is valid. This should be called before engaging in peer operations.
+ */
+static int ice_validate_peer_dev(struct ice_peer_dev *peer)
+{
+	struct ice_peer_dev_int *peer_dev_int;
+	struct ice_pf *pf;
+
+	if (!peer)
+		return -EINVAL;
+
+	if (!peer->pdev)
+		return -EINVAL;
+
+	pf = pci_get_drvdata(peer->pdev);
+	if (!pf)
+		return -EINVAL;
+
+	peer_dev_int = peer_to_ice_dev_int(peer);
+	if (!peer_dev_int)
+		return -EINVAL;
+
+	if (test_bit(ICE_PEER_DEV_STATE_REMOVED, peer_dev_int->state))
+		return -EINVAL;
+
+	return 0;
+}
+
+/**
+ * ice_close_peer_for_reset - queue work to close peer for reset
+ * @dev: pointer peer dev struct
+ * @data: pointer to opaque data used for reset type
+ */
+int ice_close_peer_for_reset(struct device *dev, void *data)
+{
+	enum ice_reset_req reset = *(enum ice_reset_req *)data;
+	struct ice_peer_dev *peer_dev = dev_to_ice_peer(dev);
+	struct ice_peer_dev_int *peer_dev_int;
+
+	if (!peer_dev || !peer_dev->pdev)
+		return 0;
+
+	peer_dev_int = peer_to_ice_dev_int(peer_dev);
+	if (!peer_dev_int)
+		return 0;
+
+	switch (reset) {
+	case ICE_RESET_GLOBR:
+		peer_dev_int->rst_type = ICE_REASON_GLOBR_REQ;
+		break;
+	case ICE_RESET_CORER:
+		peer_dev_int->rst_type = ICE_REASON_CORER_REQ;
+		break;
+	case ICE_RESET_PFR:
+		peer_dev_int->rst_type = ICE_REASON_PFR_REQ;
+		break;
+	default:
+		/* reset type is invalid */
+		return 1;
+	}
+	queue_work(peer_dev_int->ice_peer_wq, &peer_dev_int->peer_close_task);
+	return 0;
+}
+
+/**
+ * ice_check_peer_drv_for_events - check peer_drv for events to report
+ * @peer_dev: peer device to report to
+ */
+static void ice_check_peer_drv_for_events(struct ice_peer_dev *peer_dev)
+{
+	struct ice_peer_drv *peer_drv = drv_to_ice_peer(peer_dev->dev.driver);
+	const struct ice_peer_ops *p_ops = peer_dev->peer_ops;
+	struct ice_peer_drv_int *peer_drv_int;
+	struct ice_peer_dev_int *peer_dev_int;
+	int i;
+
+	peer_drv_int = peer_to_ice_drv_int(peer_drv);
+	peer_dev_int = peer_to_ice_dev_int(peer_dev);
+	if (!peer_drv_int || !peer_dev_int)
+		return;
+
+	for_each_set_bit(i, peer_dev_int->events, ICE_EVENT_NBITS)
+		if (!bitmap_empty(peer_drv_int->current_events[i].type,
+				  ICE_EVENT_NBITS))
+			p_ops->event_handler(peer_dev,
+					     &peer_drv_int->current_events[i]);
+}
+
+/**
+ * ice_check_peer_for_events - check peer_devs for events new peer reg'd for
+ * @dev: peer to check for events
+ * @data: ptr to opaque data, to be used for the peer struct that opened
+ *
+ * This function is to be called when a peer device is opened.
+ *
+ * Since a new peer opening would have missed any events that would
+ * have happened before its opening, we need to walk the peers and see
+ * if any of them have events that the new peer cares about
+ *
+ * This function is meant to be called by a device_for_each_child.
+ */
+static int ice_check_peer_for_events(struct device *dev, void *data)
+{
+	struct ice_peer_dev *new_peer = (struct ice_peer_dev *)data;
+	struct ice_peer_dev *src_peer = dev_to_ice_peer(dev);
+	const struct ice_peer_ops *p_ops = new_peer->peer_ops;
+	struct ice_peer_dev_int *new_peer_int, *src_peer_int;
+	int i;
+
+	if (ice_validate_peer_dev(src_peer))
+		return 0;
+
+	new_peer_int = peer_to_ice_dev_int(new_peer);
+	src_peer_int = peer_to_ice_dev_int(src_peer);
+
+	if (!new_peer_int || !src_peer_int)
+		return 0;
+
+	for_each_set_bit(i, new_peer_int->events, ICE_EVENT_NBITS)
+		if (!bitmap_empty(src_peer_int->current_events[i].type,
+				  ICE_EVENT_NBITS) &&
+		    new_peer->index != src_peer->index)
+			p_ops->event_handler(new_peer,
+					     &src_peer_int->current_events[i]);
+
+	return 0;
+}
+
+/**
+ * ice_finish_init_peer_device - complete peer device initialization
+ * @dev: ptr to peer device
+ * @data: ptr to opaque data
+ *
+ * This function completes remaining initialization of peer_devices and
+ * triggers peer driver's probe (aka open)
+ */
+int ice_finish_init_peer_device(struct device *dev, void __always_unused *data)
+{
+	struct ice_port_info *port_info = NULL;
+	struct ice_peer_dev_int *peer_dev_int;
+	struct ice_peer_drv *peer_drv;
+	struct ice_peer_dev *peer_dev;
+	struct ice_vsi *vsi;
+	struct ice_pf *pf;
+	int ret;
+
+	/* unable to verify peer device or no peer driver registered */
+	if (!dev->driver)
+		return 0;
+
+	peer_drv = drv_to_ice_peer(dev->driver);
+	if (!peer_drv)
+		return 0;
+
+	peer_dev = dev_to_ice_peer(dev);
+	/* is it OK to proceed with peer_dev, state check? */
+	ret = ice_validate_peer_dev(peer_dev);
+	if (ret)
+		return ret;
+
+	peer_dev_int = peer_to_ice_dev_int(peer_dev);
+	if (!peer_dev_int)
+		return 0;
+
+	pf = pci_get_drvdata(peer_dev->pdev);
+	if (!pf->hw.port_info) {
+		dev_warn(&pf->pdev->dev, "pf specific port_info is NULL\n");
+		return 0;
+	}
+
+	peer_dev->hw_addr = (u8 __iomem *)pf->hw.hw_addr;
+	port_info = pf->hw.port_info;
+	vsi = pf->vsi[0];
+	peer_dev->pf_vsi_num = vsi->vsi_num;
+	peer_dev->netdev = vsi->netdev;
+	peer_dev->initial_mtu = vsi->netdev->mtu;
+	ether_addr_copy(peer_dev->lan_addr, port_info->mac.lan_addr);
+
+	/* Call the probe only if peer_dev is in _INIT  state */
+	if (test_bit(ICE_PEER_DEV_STATE_INIT, peer_dev_int->state)) {
+		/* Mark the state as _PROBE */
+		ice_peer_state_change(peer_dev_int, ICE_PEER_DEV_STATE_PROBE);
+
+		/* Initiate peer driver probe/open */
+		ret = peer_drv->probe(peer_dev);
+		if (ret) {
+			dev_err(&pf->pdev->dev,
+				"probe failed for peer device (%s), err %d\n",
+				dev->driver->name ? dev->driver->name : "",
+				ret);
+			ice_peer_state_change(peer_dev_int,
+					      ICE_PEER_DEV_STATE_INIT);
+			return ret;
+		}
+		ice_peer_state_change(peer_dev_int, ICE_PEER_DEV_STATE_PROBED);
+	}
+
+	if (!peer_dev->peer_ops) {
+		dev_err(&pf->pdev->dev,
+			"peer_ops not defined on peer dev (%s)\n",
+			dev->driver->name ? dev->driver->name : "");
+		return 0;
+	}
+
+	if (!peer_dev->peer_ops->open) {
+		dev_err(&pf->pdev->dev,
+			"peer_ops:open not defined on peer dev (%s)\n",
+			dev->driver->name ? dev->driver->name : "");
+		return 0;
+	}
+
+	if (!peer_dev->peer_ops->close) {
+		dev_err(&pf->pdev->dev,
+			"peer_ops:close not defined on peer dev (%s)\n",
+			dev->driver->name ? dev->driver->name : "");
+		return 0;
+	}
+
+	/* Peer driver expected to set driver_id during registration */
+	if (!peer_drv->driver_id) {
+		dev_err(&pf->pdev->dev,
+			"Peer driver (%s) did not set driver_id\n",
+			dev->driver->name);
+		return 0;
+	}
+
+	if ((test_bit(ICE_PEER_DEV_STATE_CLOSED, peer_dev_int->state) ||
+	     test_bit(ICE_PEER_DEV_STATE_PROBED, peer_dev_int->state)) &&
+	    ice_pf_state_is_nominal(pf)) {
+		if (!test_bit(ICE_PEER_DEV_STATE_OPENED, peer_dev_int->state)) {
+			peer_dev->peer_ops->open(peer_dev);
+			ice_peer_state_change(peer_dev_int,
+					      ICE_PEER_DEV_STATE_OPENED);
+			ret = bus_for_each_dev(&ice_peer_bus, NULL, peer_dev,
+					       ice_check_peer_for_events);
+			ice_check_peer_drv_for_events(peer_dev);
+		}
+	}
+
+	return ret;
+}
+
+/**
+ * ice_unreg_peer_device - unregister specified device
+ * @dev: ptr to peer device
+ * @data: ptr to opaque data
+ *
+ * This function invokes device unregistration, removes ID associated with
+ * the specified device.
+ */
+int ice_unreg_peer_device(struct device *dev, void __always_unused *data)
+{
+	struct ice_peer_dev_int *peer_dev_int;
+
+	/* This is the function invoked from ice_remove
+	 * code-path, it eventually comes from device_for_each_child
+	 * No reason to prohibit calling device_unregister because this is the
+	 * last chance to trigger cleanup of devices by unregistering them
+	 * form bus, Actual cleanup of resources such as memory for peer_dev
+	 * is cleaned up from "dev.release function".
+	 */
+	device_unregister(dev);
+
+	peer_dev_int = peer_to_ice_dev_int(dev_to_ice_peer(dev));
+	if (!peer_dev_int)
+		return 0;
+
+	if (peer_dev_int->ice_peer_wq) {
+		if (peer_dev_int->peer_prep_task.func)
+			cancel_work_sync(&peer_dev_int->peer_prep_task);
+
+		if (peer_dev_int->peer_close_task.func)
+			cancel_work_sync(&peer_dev_int->peer_close_task);
+		destroy_workqueue(peer_dev_int->ice_peer_wq);
+	}
+
+	/* Cleanup the allocated ID for this peer device */
+	ida_simple_remove(&ice_peer_index_ida, peer_dev_int->peer_dev.index);
+
+	return 0;
+}
+
+/**
+ * ice_unroll_peer - destroy peers and peer_wq in case of error
+ * @dev: ptr to peer device
+ * @data: ptr to opaque data
+ *
+ * This function releases resources in the event of a failure in creating
+ * peer devices or their individual work_queues. Meant to be called from
+ * a bus_for_each_device invocation
+ */
+int ice_unroll_peer(struct device *dev, void __always_unused *data)
+{
+	struct ice_peer_dev_int *peer_dev_int;
+
+	peer_dev_int = peer_to_ice_dev_int(dev_to_ice_peer(dev));
+
+	if (!peer_dev_int)
+		return 0;
+	if (peer_dev_int->ice_peer_wq)
+		destroy_workqueue(peer_dev_int->ice_peer_wq);
+	devm_kfree(dev->parent, peer_dev_int);
+
+	return 0;
+}
+
+/* static initialization of device IDs for different peer devices */
+static const struct ice_peer_device_id peer_device_ids[] = {
+	{.vendor = PCI_VENDOR_ID_INTEL,
+	 .device = ICE_PEER_RDMA_DEV},
+};
+
+/**
+ * ice_peer_dev_release - Release peer device object
+ * @dev: ptr to device object
+ *
+ * This function is invoked from device_unregister codepath. If peer
+ * device doesn't have 'release' function, WARN is trigger due to
+ * 'release' function being NULL. This function to release device
+ * specific resources and release peer device object memory.
+ */
+static void ice_peer_dev_release(struct device *dev)
+{
+	struct ice_peer_dev *peer_dev = dev_to_ice_peer(dev);
+	struct ice_peer_dev_int *peer_dev_int;
+
+	peer_dev_int = peer_to_ice_dev_int(peer_dev);
+	if (!peer_dev_int)
+		return;
+	devm_kfree(dev->parent, peer_dev_int);
+}
+
+/**
+ * ice_find_vsi - Find the VSI from VSI ID
+ * @pf: The PF pointer to search in
+ * @vsi_num: The VSI ID to search for
+ */
+static struct ice_vsi *ice_find_vsi(struct ice_pf *pf, u16 vsi_num)
+{
+	int i;
+
+	ice_for_each_vsi(pf, i)
+		if (pf->vsi[i] && pf->vsi[i]->vsi_num == vsi_num)
+			return  pf->vsi[i];
+	return NULL;
+}
+
+/**
+ * ice_peer_alloc_rdma_qsets - Allocate Leaf Nodes for RDMA Qset
+ * @peer_dev: peer that is requesting the Leaf Nodes
+ * @res: Resources to be allocated
+ * @partial_acceptable: If partial allocation is acceptable to the peer
+ *
+ * This function allocates Leaf Nodes for given RDMA Qset resources
+ * for the peer device.
+ */
+static int
+ice_peer_alloc_rdma_qsets(struct ice_peer_dev *peer_dev, struct ice_res *res,
+			  int __maybe_unused partial_acceptable)
+{
+	struct ice_pf *pf = pci_get_drvdata(peer_dev->pdev);
+	u16 max_rdmaqs[ICE_MAX_TRAFFIC_CLASS] = { 0 };
+	struct ice_rdma_qset_params *qset;
+	enum ice_status status;
+	struct ice_vsi *vsi;
+	u32 qset_teid;
+	int i;
+
+	if (res->cnt_req != 1)
+		return -EINVAL;
+
+	qset = &res->res[0].res.qsets;
+	if (qset->tc != 0 || qset->vsi_id != peer_dev->pf_vsi_num)
+		return -EINVAL;
+
+	/* Find the VSI struct */
+	vsi = ice_find_vsi(pf, qset->vsi_id);
+	if (!vsi)
+		return -EINVAL;
+
+	/* configure VSI nodes based on no. of RDMA qsets and TC's */
+	for (i = 0; i < ICE_MAX_TRAFFIC_CLASS; i++)
+		max_rdmaqs[i] = 1;
+
+	status = ice_cfg_vsi_rdma(vsi->port_info, vsi->idx, vsi->tc_cfg.ena_tc,
+				  max_rdmaqs);
+	if (status) {
+		dev_info(&pf->pdev->dev, "Failed VSI RDMA qset config\n");
+		return -EINVAL;
+	}
+
+	status = ice_ena_vsi_rdma_qset(vsi->port_info, vsi->idx, qset->tc,
+				       &qset->qs_handle, res->cnt_req,
+				       &qset_teid);
+	if (status)
+		return -EINVAL;
+
+	vsi->qset_handle[qset->tc] = qset->qs_handle;
+	qset->teid = qset_teid;
+
+	return 0;
+}
+
+/**
+ * ice_peer_free_rdma_qsets - Free leaf nodes for RDMA Qset
+ * @peer_dev: peer that requested qsets to be freed
+ * @res: Resource to be freed
+ */
+static int
+ice_peer_free_rdma_qsets(struct ice_peer_dev *peer_dev, struct ice_res *res)
+{
+	struct ice_pf *pf = pci_get_drvdata(peer_dev->pdev);
+	struct ice_rdma_qset_params *qset;
+	enum ice_status status;
+	struct ice_vsi *vsi;
+	int count;
+	u16 q_id;
+
+	qset = &res->res[0].res.qsets;
+
+	vsi = ice_find_vsi(pf, qset->vsi_id);
+	if (!vsi)
+		return -EINVAL;
+
+	count = res->res_allocated;
+	if (count > 1)
+		return -EINVAL;
+
+	q_id = qset->qs_handle;
+
+	status = ice_dis_vsi_rdma_qset(vsi->port_info, count, &qset->teid,
+				       &q_id);
+	if (status)
+		return -EINVAL;
+
+	vsi->qset_handle[qset->tc] = 0;
+
+	return 0;
+}
+
+/**
+ * ice_peer_alloc_res - Allocate requested resources for peer device
+ * @peer_dev: peer that is requesting resources
+ * @res: Resources to be allocated
+ * @partial_acceptable: If partial allocation is acceptable to the peer
+ *
+ * This function allocates requested resources for the peer device.
+ */
+static int
+ice_peer_alloc_res(struct ice_peer_dev *peer_dev, struct ice_res *res,
+		   int partial_acceptable)
+{
+	struct ice_pf *pf;
+	int ret;
+
+	ret = ice_validate_peer_dev(peer_dev);
+	if (ret)
+		return ret;
+
+	pf = pci_get_drvdata(peer_dev->pdev);
+	if (!ice_pf_state_is_nominal(pf))
+		return -EBUSY;
+
+	switch (res->res_type) {
+	case ICE_RDMA_QSETS_TXSCHED:
+		ret = ice_peer_alloc_rdma_qsets(peer_dev, res,
+						partial_acceptable);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+/**
+ * ice_peer_free_res - Free given resources
+ * @peer_dev: peer that is requesting freeing of resources
+ * @res: Resources to be freed
+ *
+ * Free/Release resources allocated to given peer device.
+ */
+static int
+ice_peer_free_res(struct ice_peer_dev *peer_dev, struct ice_res *res)
+{
+	int ret;
+
+	ret = ice_validate_peer_dev(peer_dev);
+	if (ret)
+		return ret;
+
+	switch (res->res_type) {
+	case ICE_RDMA_QSETS_TXSCHED:
+		ret = ice_peer_free_rdma_qsets(peer_dev, res);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+/**
+ * ice_peer_reg_for_notif - register a peer to receive specific notifications
+ * @peer_dev: peer that is registering for event notifications
+ * @events: mask of event types peer is registering for
+ */
+static void
+ice_peer_reg_for_notif(struct ice_peer_dev *peer_dev, struct ice_event *events)
+{
+	struct ice_peer_dev_int *peer_dev_int;
+
+	peer_dev_int = peer_to_ice_dev_int(peer_dev);
+	if (!peer_dev_int)
+		return;
+
+	bitmap_or(peer_dev_int->events, peer_dev_int->events, events->type,
+		  ICE_EVENT_NBITS);
+
+	/* Check to see if any events happened previous to peer registering */
+	bus_for_each_dev(&ice_peer_bus, NULL, peer_dev,
+			 ice_check_peer_for_events);
+	ice_check_peer_drv_for_events(peer_dev);
+}
+
+/**
+ * ice_peer_unreg_for_notif - unreg a peer from receiving certain notifications
+ * @peer_dev: peer that is unregistering from event notifications
+ * @events: mask of event types peer is unregistering for
+ */
+static void
+ice_peer_unreg_for_notif(struct ice_peer_dev *peer_dev,
+			 struct ice_event *events)
+{
+	struct ice_peer_dev_int *peer_dev_int;
+
+	peer_dev_int = peer_to_ice_dev_int(peer_dev);
+	if (!peer_dev_int)
+		return;
+
+	bitmap_andnot(peer_dev_int->events, peer_dev_int->events, events->type,
+		      ICE_EVENT_NBITS);
+}
+
+/**
+ * ice_peer_check_for_reg - check to see if any peers are reg'd for event
+ * @dev: ptr to peer device
+ * @data: ptr to opaque data, to be used for ice_event to report
+ *
+ * This function is to be called by device_for_each_child to handle an
+ * event reported by a peer or the ice driver.
+ */
+int ice_peer_check_for_reg(struct device *dev, void *data)
+{
+	struct ice_peer_dev *peer_dev = dev_to_ice_peer(dev);
+	struct ice_peer_dev_int *peer_dev_int;
+	struct ice_event *event = (struct ice_event *)data;
+	DECLARE_BITMAP(comp_events, ICE_EVENT_NBITS);
+	bool check = true;
+	int ret;
+
+	ret = ice_validate_peer_dev(peer_dev);
+	/* if returned error, in this case return 0 instead of 'ret'
+	 * because caller ignores this return value
+	 */
+	if (ret)
+		return 0;
+
+	peer_dev_int = peer_to_ice_dev_int(peer_dev);
+	if (!peer_dev_int)
+		return 0;
+
+	if (event->reporter)
+		check = event->reporter->index != peer_dev->index;
+
+	if (bitmap_and(comp_events, event->type, peer_dev_int->events,
+		       ICE_EVENT_NBITS) &&
+	    check && test_bit(ICE_PEER_DEV_STATE_OPENED, peer_dev_int->state))
+		peer_dev->peer_ops->event_handler(peer_dev, event);
+
+	return 0;
+}
+
+/**
+ * ice_peer_report_state_change - accept report of a peer state change
+ * @peer_dev: peer that is sending notification about state change
+ * @event: ice_event holding info on what the state change is
+ *
+ * We also need to parse the list of peers to see if anyone is registered
+ * for notifications about this state change event, and if so, notify them.
+ */
+static void
+ice_peer_report_state_change(struct ice_peer_dev *peer_dev,
+			     struct ice_event *event)
+{
+	struct ice_peer_dev_int *peer_dev_int;
+	struct ice_peer_drv_int *peer_drv_int;
+	struct ice_peer_drv *peer_drv;
+	int e_type, drv_event = 0;
+
+	if (ice_validate_peer_dev(peer_dev))
+		return;
+
+	peer_drv = drv_to_ice_peer(peer_dev->dev.driver);
+	peer_dev_int = peer_to_ice_dev_int(peer_dev);
+	peer_drv_int = peer_to_ice_drv_int(peer_drv);
+
+	if (!peer_dev_int || !peer_drv_int)
+		return;
+
+	e_type = find_first_bit(event->type, ICE_EVENT_NBITS);
+	if (!e_type)
+		return;
+
+	switch (e_type) {
+	/* Check for peer_drv events */
+	case ICE_EVENT_MBX_CHANGE:
+		drv_event = 1;
+		if (event->info.mbx_rdy)
+			set_bit(ICE_PEER_DRV_STATE_MBX_RDY,
+				peer_drv_int->state);
+		else
+			clear_bit(ICE_PEER_DRV_STATE_MBX_RDY,
+				  peer_drv_int->state);
+		break;
+
+	/* Check for peer_dev events */
+	case ICE_EVENT_API_CHANGE:
+		if (event->info.api_rdy)
+			set_bit(ICE_PEER_DEV_STATE_API_RDY,
+				peer_dev_int->state);
+		else
+			clear_bit(ICE_PEER_DEV_STATE_API_RDY,
+				  peer_dev_int->state);
+		break;
+
+	default:
+		return;
+	}
+
+	/* store the event and state to notify any new peers opening */
+	if (drv_event)
+		memcpy(&peer_drv_int->current_events[e_type], event,
+		       sizeof(*event));
+	else
+		memcpy(&peer_dev_int->current_events[e_type], event,
+		       sizeof(*event));
+
+	bus_for_each_dev(&ice_peer_bus, NULL, event, ice_peer_check_for_reg);
+}
+
+/**
+ * ice_peer_dev_uninit - request to uninitialize peer
+ * @peer_dev: peer device
+ *
+ * This function triggers close/remove on peer_dev allowing peer
+ * to uninitialize.
+ */
+static int ice_peer_dev_uninit(struct ice_peer_dev *peer_dev)
+{
+	enum ice_close_reason reason = ICE_REASON_PEER_DEV_UNINIT;
+	struct ice_peer_dev_int *peer_dev_int;
+	struct ice_peer_drv *peer_drv;
+	struct ice_pf *pf;
+	int ret;
+
+	ret = ice_validate_peer_dev(peer_dev);
+	if (ret)
+		return ret;
+
+	pf = pci_get_drvdata(peer_dev->pdev);
+	if (ice_is_reset_in_progress(pf->state))
+		return -EBUSY;
+
+	peer_drv = drv_to_ice_peer(peer_dev->dev.driver);
+
+	peer_dev_int = peer_to_ice_dev_int(peer_dev);
+	if (!peer_dev_int)
+		return -EINVAL;
+
+	ret = ice_peer_close(&peer_dev->dev, &reason);
+	if (ret)
+		return ret;
+
+	ret = peer_drv->remove(peer_dev);
+	if (!ret)
+		ice_peer_state_change(peer_dev_int, ICE_PEER_DEV_STATE_REMOVED);
+
+	return ret;
+}
+
+/**
+ * ice_peer_dev_reinit - request to reinitialize peer
+ * @peer_dev: peer device
+ *
+ * This function resets peer_dev state to 'INIT' that causes a
+ * re-probe/open on peer_dev from service task
+ */
+static int ice_peer_dev_reinit(struct ice_peer_dev *peer_dev)
+{
+	struct ice_peer_dev_int *peer_dev_int;
+	struct ice_pf *pf;
+	int ret;
+
+	ret = ice_validate_peer_dev(peer_dev);
+	if (ret)
+		return ret;
+
+	pf = pci_get_drvdata(peer_dev->pdev);
+	if (!ice_pf_state_is_nominal(pf))
+		return -EBUSY;
+
+	peer_dev_int = peer_to_ice_dev_int(peer_dev);
+	if (!peer_dev_int)
+		return -EINVAL;
+
+	if (test_bit(ICE_PEER_DEV_STATE_REMOVED, peer_dev_int->state))
+		ice_peer_state_change(peer_dev_int, ICE_PEER_DEV_STATE_INIT);
+	else
+		return -EINVAL;
+
+	return 0;
+}
+
+/**
+ * ice_peer_request_reset - accept request from peer to perform a reset
+ * @peer_dev: peer device that is request a reset
+ * @reset_type: type of reset the peer is requesting
+ */
+static int
+ice_peer_request_reset(struct ice_peer_dev *peer_dev,
+		       enum ice_peer_reset_type reset_type)
+{
+	enum ice_reset_req reset;
+	struct ice_pf *pf;
+
+	if (ice_validate_peer_dev(peer_dev))
+		return -EINVAL;
+
+	pf = pci_get_drvdata(peer_dev->pdev);
+
+	switch (reset_type) {
+	case ICE_PEER_PFR:
+		reset = ICE_RESET_PFR;
+		break;
+	case ICE_PEER_CORER:
+		reset = ICE_RESET_CORER;
+		break;
+	case ICE_PEER_GLOBR:
+		reset = ICE_RESET_GLOBR;
+		break;
+	default:
+		dev_err(&pf->pdev->dev, "incorrect reset request from peer\n");
+		return -EINVAL;
+	}
+
+	return ice_schedule_reset(pf, reset);
+}
+
+/**
+ * ice_peer_update_vsi_filter - update filters for RDMA VSI
+ * @peer_dev: pointer to RDMA peer device
+ * @filter: selection of filters to enable or disable
+ * @enable: bool whether to enable or disable filters
+ */
+static
+int ice_peer_update_vsi_filter(struct ice_peer_dev *peer_dev,
+			       enum ice_rdma_filter __maybe_unused filter,
+			       bool enable)
+{
+	struct ice_pf *pf;
+	int ret, v;
+	u16 idx;
+
+	pf = pci_get_drvdata(peer_dev->pdev);
+	if (!pf)
+		return -EINVAL;
+
+	ice_for_each_vsi(pf, v)
+		if (peer_dev->pf_vsi_num == pf->vsi[v]->vsi_num) {
+			idx = pf->vsi[v]->idx;
+			break;
+		}
+	if (v >= pf->num_alloc_vsi)
+		return -EINVAL;
+
+	ret = ice_cfg_iwarp_fltr(&pf->hw, idx, enable);
+
+	if (ret)
+		dev_err(&pf->pdev->dev, "Failed to  %sable iWARP filtering\n",
+			enable ? "en" : "dis");
+
+	return ret;
+}
+
+/**
+ * ice_peer_vc_send - send a virt channel message from RDMA peer
+ * @peer_dev: pointer to RDMA peer dev
+ * @vf_id: the absolute VF ID of recipient of message
+ * @msg: pointer to message contents
+ * @len: len of message
+ */
+static
+int ice_peer_vc_send(struct ice_peer_dev *peer_dev, u32 vf_id, u8 *msg, u16 len)
+{
+	struct ice_pf *pf;
+	int err;
+
+	if (ice_validate_peer_dev(peer_dev))
+		return -EINVAL;
+
+	pf = pci_get_drvdata(peer_dev->pdev);
+	/* VIRTCHNL_OP_IWARP is being used for RoCEv2 msg also */
+	err = ice_aq_send_msg_to_vf(&pf->hw, vf_id, VIRTCHNL_OP_IWARP, 0, msg,
+				    len, NULL);
+	if (err)
+		dev_err(&pf->pdev->dev,
+			"Unable to send RDMA msg to VF, error %d\n", err);
+
+	return err;
+}
+
+/* Initialize the ice_ops struct, which is used in 'ice_init_peer_devices' */
+static const struct ice_ops ops = {
+	.alloc_res			= ice_peer_alloc_res,
+	.free_res			= ice_peer_free_res,
+	.reg_for_notification		= ice_peer_reg_for_notif,
+	.unreg_for_notification		= ice_peer_unreg_for_notif,
+	.notify_state_change		= ice_peer_report_state_change,
+	.request_reset			= ice_peer_request_reset,
+	.request_uninit			= ice_peer_dev_uninit,
+	.request_reinit			= ice_peer_dev_reinit,
+	.update_vsi_filter		= ice_peer_update_vsi_filter,
+	.vc_send			= ice_peer_vc_send,
+
+};
+
+/**
+ * ice_reserve_peer_qvector - Reserve vector resources for peer drivers
+ * @pf: board private structure to initialize
+ */
+static int ice_reserve_peer_qvector(struct ice_pf *pf)
+{
+	if (test_bit(ICE_FLAG_IWARP_ENA, pf->flags)) {
+		int index;
+
+		index = ice_get_res(pf, pf->sw_irq_tracker, pf->num_rdma_msix,
+				    ICE_RES_RDMA_VEC_ID);
+		if (index < 0)
+			return index;
+		pf->num_avail_sw_msix -= pf->num_rdma_msix;
+		pf->rdma_base_vector = index;
+
+		index = ice_get_res(pf, pf->hw_irq_tracker, pf->num_rdma_msix,
+				    ICE_RES_RDMA_VEC_ID);
+		if (index < 0) {
+			ice_free_res(pf->sw_irq_tracker, pf->rdma_base_vector,
+				     ICE_RES_RDMA_VEC_ID);
+			pf->num_avail_sw_msix += pf->num_rdma_msix;
+			return index;
+		}
+		pf->num_avail_hw_msix -= pf->num_rdma_msix;
+	}
+	return 0;
+}
+
+/**
+ * ice_peer_close_task - call peer's close asynchronously
+ * @work: pointer to work_struct contained by the peer_dev_int struct
+ *
+ * This method (asynchronous) of calling a peer's close function is
+ * meant to be used in the reset path.
+ */
+static void ice_peer_close_task(struct work_struct *work)
+{
+	struct ice_peer_dev_int *peer_dev_int;
+	struct ice_peer_dev *peer_dev;
+
+	peer_dev_int = container_of(work, struct ice_peer_dev_int,
+				    peer_close_task);
+
+	peer_dev = &peer_dev_int->peer_dev;
+	if (!peer_dev || !peer_dev->peer_ops)
+		return;
+
+	if (peer_dev->peer_ops->close)
+		peer_dev->peer_ops->close(peer_dev, peer_dev_int->rst_type);
+
+	ice_peer_state_change(peer_dev_int, ICE_PEER_DEV_STATE_CLOSED);
+}
+
+/**
+ * ice_init_peer_devices - initializes peer devices
+ * @pf: ptr to ice_pf
+ *
+ * This function initializes peer devices and associates them with specified
+ * pci_dev as their parent.
+ */
+int ice_init_peer_devices(struct ice_pf *pf)
+{
+	struct pci_dev *pdev = pf->pdev;
+	struct msix_entry *entry = NULL;
+	int status = 0;
+	int i;
+
+	/* Reserve vector resources */
+	status = ice_reserve_peer_qvector(pf);
+	if (status < 0) {
+		dev_err(&pdev->dev,
+			"failed to reserve vectors for peer drivers\n");
+		return status;
+	}
+	for (i = 0; i < ARRAY_SIZE(peer_device_ids); i++) {
+		struct ice_peer_dev_int *peer_dev_int;
+		struct ice_qos_params *qos_info;
+		int j;
+		struct ice_peer_dev *peer_dev;
+
+		peer_dev_int = devm_kzalloc(&pdev->dev, sizeof(*peer_dev_int),
+					    GFP_KERNEL);
+		if (!peer_dev_int)
+			return -ENOMEM;
+
+		peer_dev = &peer_dev_int->peer_dev;
+		peer_dev->peer_ops = NULL;
+		peer_dev_int->ice_peer_wq =
+			alloc_ordered_workqueue("ice_peer_wq_%d", WQ_UNBOUND,
+						i);
+		if (!peer_dev_int->ice_peer_wq)
+			return -ENOMEM;
+		INIT_WORK(&peer_dev_int->peer_close_task, ice_peer_close_task);
+
+		/* Assign a unique index and hence name for peer device */
+		status = ida_simple_get(&ice_peer_index_ida, 0, 0, GFP_KERNEL);
+		if (status < 0) {
+			dev_err(&pdev->dev,
+				"failed to get unique index for device (ID: 0x%04x)\n",
+				peer_dev->dev_id.device);
+			devm_kfree(&pdev->dev, peer_dev);
+			return status;
+		}
+		peer_dev->index = status;
+		dev_set_name(&peer_dev->dev, "ice_peer_%u",
+			     peer_dev->index);
+		peer_dev->pdev = pdev;
+		peer_dev->ari_ena = pci_ari_enabled(pdev->bus);
+		peer_dev->bus_num = PCI_BUS_NUM(pdev->devfn);
+		if (!peer_dev->ari_ena) {
+			peer_dev->dev_num = PCI_SLOT(pdev->devfn);
+			peer_dev->fn_num = PCI_FUNC(pdev->devfn);
+		} else {
+			peer_dev->dev_num = 0;
+			peer_dev->fn_num = pdev->devfn & 0xff;
+		}
+
+		qos_info = &peer_dev->initial_qos_info;
+
+		/* setup qos_info fields with defaults */
+		qos_info->num_apps = 0;
+		qos_info->num_tc = 1;
+
+		for (j = 0; j < ICE_IDC_MAX_USER_PRIORITY; j++)
+			qos_info->up2tc[j] = 0;
+
+		qos_info->tc_info[0].rel_bw = 100;
+		for (j = 1; j < IEEE_8021QAZ_MAX_TCS; j++)
+			qos_info->tc_info[j].rel_bw = 0;
+
+		peer_dev->dev_id.vendor = peer_device_ids[i].vendor;
+		peer_dev->dev_id.device = peer_device_ids[i].device;
+		peer_dev->dev.release = ice_peer_dev_release;
+		peer_dev->dev.parent = &pdev->dev;
+		peer_dev->dev.bus = &ice_peer_bus;
+
+		/* Initialize ice_ops */
+		peer_dev->ops = &ops;
+
+		/* make sure peer specific resources such as msix_count and
+		 * msix_entries are initialized
+		 */
+		switch (peer_dev->dev_id.device) {
+		case ICE_PEER_RDMA_DEV:
+			if (test_bit(ICE_FLAG_IWARP_ENA, pf->flags)) {
+				peer_dev->msix_count = pf->num_rdma_msix;
+				entry = &pf->msix_entries[pf->rdma_base_vector];
+			}
+			break;
+		default:
+			break;
+		}
+
+		peer_dev->msix_entries = entry;
+
+		/* device_register() causes the bus infrastructure to look for
+		 * a matching driver
+		 */
+		status = device_register(&peer_dev->dev);
+		if (status) {
+			dev_err(&pdev->dev,
+				"failed to register device (ID: 0x%04x)\n",
+				peer_dev->dev_id.device);
+			ida_simple_remove(&ice_peer_index_ida,
+					  peer_dev->index);
+			put_device(&peer_dev->dev);
+			devm_kfree(&pdev->dev, peer_dev);
+			break;
+		}
+	}
+
+	return status;
+}
+
+/**
+ * ice_reg_peer_driver - register peer driver
+ * @drv: ptr to peer driver
+ *
+ * This is the registration function for peer drivers, which invokes
+ * OS specific driver registration to trigger bus infrastructure. This
+ * exported symbol to be invoked by peer drivers.
+ *
+ * registering peer is expected to populate the ice_peerdrv->name field
+ * before calling this function.
+ */
+int ice_reg_peer_driver(struct ice_peer_drv *drv)
+{
+	struct ice_peer_drv_int *peer_drv_int;
+	int ret, i;
+
+	if (!drv) {
+		pr_err("Failed to reg peer drv: drv ptr NULL\n");
+		return -EINVAL;
+	}
+
+	if (!drv->name) {
+		pr_err("Failed to reg peer drv: peer drv name NULL\n");
+		return -EINVAL;
+	}
+
+	if (!drv->driver.owner || !drv->driver.mod_name) {
+		pr_err("Fail reg peer drv: peer drv owner or mod_name NULL\n");
+		return -EINVAL;
+	}
+
+	if (drv->ver.major != ICE_PEER_MAJOR_VER ||
+	    drv->ver.minor != ICE_PEER_MINOR_VER) {
+		pr_err("failed to register due to version mismatch:\n");
+		pr_err("expected major ver %d, caller specified major ver %d\n",
+		       ICE_PEER_MAJOR_VER, drv->ver.major);
+		pr_err("expected minor ver %d, caller specified minor ver %d\n",
+		       ICE_PEER_MINOR_VER, drv->ver.minor);
+		return -EINVAL;
+	}
+
+	if (!drv->remove) {
+		pr_err("failed to register due to lack of remove API\n");
+		return -EINVAL;
+	}
+
+	if (!drv->probe) {
+		pr_err("failed to register due to lack of probe API\n");
+		return -EINVAL;
+	}
+
+	peer_drv_int = kzalloc(sizeof(*peer_drv_int), GFP_KERNEL);
+	if (!peer_drv_int)
+		return -ENOMEM;
+
+	peer_drv_int->peer_drv = drv;
+	INIT_LIST_HEAD(&peer_drv_int->drv_int_list);
+
+	mutex_lock(&ice_peer_drv_mutex);
+	list_add(&peer_drv_int->drv_int_list, &ice_peer_drv_list);
+	mutex_unlock(&ice_peer_drv_mutex);
+
+	/* Initialize driver values */
+	for (i = 0; i < ICE_EVENT_NBITS; i++)
+		bitmap_zero(peer_drv_int->current_events[i].type,
+			    ICE_EVENT_NBITS);
+
+	drv->driver.bus = &ice_peer_bus;
+
+	ret = driver_register(&drv->driver);
+	if (ret) {
+		pr_err("Failed to register peer driver %d\n", ret);
+		mutex_lock(&ice_peer_drv_mutex);
+		list_del(&peer_drv_int->drv_int_list);
+		mutex_unlock(&ice_peer_drv_mutex);
+		kfree(peer_drv_int);
+	}
+
+	return ret;
+}
+
+/**
+ * ice_unreg_peer_driver - unregister peer driver
+ * @drv: ptr to peer driver
+ *
+ * This is the unregistration function for peer drivers, which invokes
+ * OS specific driver unregistration to trigger bus infrastructure. This
+ * exported symbol to be invoked by peer drivers.
+ */
+int ice_unreg_peer_driver(struct ice_peer_drv *drv)
+{
+	struct ice_peer_drv_int *peer_drv_int;
+
+	if (!drv || !drv->driver.owner) {
+		pr_err("Fail unregister peer driver: driver or mod ptr NULL\n");
+		return -ENODEV;
+	}
+
+	peer_drv_int = peer_to_ice_drv_int(drv);
+	if (!peer_drv_int)
+		return -ENODEV;
+
+	mutex_lock(&ice_peer_drv_mutex);
+	list_del(&peer_drv_int->drv_int_list);
+	mutex_unlock(&ice_peer_drv_mutex);
+
+	kfree(peer_drv_int);
+
+	driver_unregister(&drv->driver);
+
+	return 0;
+}
diff --git a/drivers/net/ethernet/intel/ice/ice_idc.h b/drivers/net/ethernet/intel/ice/ice_idc.h
new file mode 100644
index 0000000..b998aa7
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/ice_idc.h
@@ -0,0 +1,402 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018, Intel Corporation. */
+
+#ifndef _ICE_IDC_H_
+#define _ICE_IDC_H_
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/if_ether.h>
+#include <linux/netdevice.h>
+#include <linux/dcbnl.h>
+
+/* This major and minor version represent IDC API version information.
+ * During peer driver registration, peer driver specifies major and minor
+ * version information (via. peer_driver:ver_info). It gets checked against
+ * following defines and if mismatch, then peer driver registration
+ * fails and appropriate message gets logged.
+ */
+#define ICE_PEER_MAJOR_VER		5
+#define ICE_PEER_MINOR_VER		1
+
+enum ice_event_type {
+	ICE_EVENT_LINK_CHANGE = 0x0,
+	ICE_EVENT_MTU_CHANGE,
+	ICE_EVENT_TC_CHANGE,
+	ICE_EVENT_API_CHANGE,
+	ICE_EVENT_MBX_CHANGE,
+	ICE_EVENT_NBITS		/* must be last */
+};
+
+enum ice_res_type {
+	ICE_INVAL_RES = 0x0,
+	ICE_VSI,
+	ICE_VEB,
+	ICE_EVENT_Q,
+	ICE_EGRESS_CMPL_Q,
+	ICE_CMPL_EVENT_Q,
+	ICE_ASYNC_EVENT_Q,
+	ICE_DOORBELL_Q,
+	ICE_RDMA_QSETS_TXSCHED,
+};
+
+enum ice_peer_reset_type {
+	ICE_PEER_PFR = 0,
+	ICE_PEER_CORER,
+	ICE_PEER_CORER_SW_CORE,
+	ICE_PEER_CORER_SW_FULL,
+	ICE_PEER_GLOBR,
+};
+
+/* reason notified to peer driver as part of event handling */
+enum ice_close_reason {
+	ICE_REASON_INVAL = 0x0,
+	ICE_REASON_HW_UNRESPONSIVE,
+	ICE_REASON_INTERFACE_DOWN, /* Administrative down */
+	ICE_REASON_PEER_DRV_UNREG, /* peer driver getting unregistered */
+	ICE_REASON_PEER_DEV_UNINIT,
+	ICE_REASON_GLOBR_REQ,
+	ICE_REASON_CORER_REQ,
+	ICE_REASON_EMPR_REQ,
+	ICE_REASON_PFR_REQ,
+	ICE_REASON_HW_RESET_PENDING,
+	ICE_REASON_PARAM_CHANGE,
+};
+
+enum ice_rdma_filter {
+	ICE_RDMA_FILTER_INVAL = 0x0,
+	ICE_RDMA_FILTER_IWARP,
+	ICE_RDMA_FILTER_ROCEV2,
+	ICE_RDMA_FILTER_BOTH,
+};
+
+/* This information is needed to handle peer driver registration,
+ * instead of adding more params to peer_drv_registration function,
+ * let's get it thru' peer_drv object.
+ */
+struct ice_ver_info {
+	u16 major;
+	u16 minor;
+	u16 support;
+};
+
+/* Struct to hold per DCB APP info */
+struct ice_dcb_app_info {
+	u8  priority;
+	u8  selector;
+	u16 prot_id;
+};
+
+struct ice_peer_dev;
+
+#define ICE_IDC_MAX_USER_PRIORITY        8
+#define ICE_IDC_MAX_APPS        8
+
+/* Struct to hold per RDMA Qset info */
+struct ice_rdma_qset_params {
+	u32 teid;	/* qset TEID */
+	u16 qs_handle; /* RDMA driver provides this */
+	u16 vsi_id; /* VSI index */
+	u8 tc; /* TC branch the QSet should belong to */
+	u8 reserved[3];
+};
+
+struct ice_res_base {
+	/* Union for future provision e.g. other res_type */
+	union {
+		struct ice_rdma_qset_params qsets;
+	} res;
+};
+
+struct ice_res {
+	/* Type of resource. Filled by peer driver */
+	enum ice_res_type res_type;
+	/* Count requested by peer driver */
+	u16 cnt_req;
+
+	/* Number of resources allocated. Filled in by callee.
+	 * Based on this value, caller to fill up "resources"
+	 */
+	u16 res_allocated;
+
+	/* Unique handle to resources allocated. Zero if call fails.
+	 * Allocated by callee and for now used by caller for internal
+	 * tracking purpose.
+	 */
+	u32 res_handle;
+
+	/* Peer driver has to allocate sufficient memory, to accommodate
+	 * cnt_requested before calling this function.
+	 * Memory has to be zero initialized. It is input/output param.
+	 * As a result of alloc_res API, this structures will be populated.
+	 */
+	struct ice_res_base res[1];
+};
+
+struct ice_vector_info {
+	u32 v_idx; /* MSIx vector */
+	u16 itr_idx;
+	/* This is the register address of GLINT_DYN_CTL[idx], not value */
+	u64 itr_dyn_ctl_reg;
+	/* This is the register address of GLINT_RATE[idx], not value */
+	u64 itr_rate_lmt_reg;
+};
+
+struct ice_vector_list {
+	u32 num_vectors;
+	struct ice_vector_info *vector;
+	/* Unique handle to resources allocated.
+	 * Zero if call fails
+	 */
+	u32 res_handle;
+};
+
+struct ice_itr_regs {
+	u16 cnt;
+	u64 *tmr_regs;
+	u32 res_handle;
+};
+
+struct ice_qos_info {
+	u64 tc_ctx;
+	u8 rel_bw;
+	u8 prio_type;
+	u8 egress_virt_up;
+	u8 ingress_virt_up;
+};
+
+/* Struct to hold QoS info */
+struct ice_qos_params {
+	struct ice_qos_info tc_info[IEEE_8021QAZ_MAX_TCS];
+	u8 up2tc[ICE_IDC_MAX_USER_PRIORITY];
+	u8 vsi_relative_bw;
+	u8 vsi_priority_type;
+	u32 num_apps;
+	struct ice_dcb_app_info apps[ICE_IDC_MAX_APPS];
+	u8 num_tc;
+};
+
+union ice_event_info {
+	/* ICE_EVENT_LINK_CHANGE */
+	struct {
+		struct net_device *lwr_nd;
+		u16 vsi_num; /* HW index of VSI corresponding to lwr ndev */
+		u8 new_link_state;
+		u8 lport;
+	} link_info;
+	/* ICE_EVENT_MTU_CHANGE */
+	u16 mtu;
+	/* ICE_EVENT_TC_CHANGE */
+	struct ice_qos_params port_qos;
+	/* ICE_EVENT_API_CHANGE */
+	u8 api_rdy;
+	/* ICE_EVENT_MBX_CHANGE */
+	u8 mbx_rdy;
+};
+
+/* ice_event elements are to be passed back and forth between the ice driver
+ * and the peer drivers. They are to be used to both register/unregister
+ * for event reporting and to report an event (events can be either ice
+ * generated or peer generated).
+ *
+ * For (un)registering for events, the structure needs to be populated with:
+ *   reporter - pointer to the ice_peer_dev struct of the peer (un)registering
+ *   type - bitmap with bits set for event types to (un)register for
+ *
+ * For reporting events, the structure needs to be populated with:
+ *   reporter - pointer to peer that generated the event (NULL for ice)
+ *   type - bitmap with single bit set for this event type
+ *   info - union containing data relevant to this event type
+ */
+struct ice_event {
+	struct ice_peer_dev *reporter;
+	DECLARE_BITMAP(type, ICE_EVENT_NBITS);
+	union ice_event_info info;
+};
+
+/* Following APIs are implemented by ICE driver and invoked by peer drivers */
+struct ice_ops {
+	/* APIs to allocate resources such as VEB, VSI, Doorbell queues,
+	 * completion queues, Tx/Rx queues, etc...
+	 */
+	int (*alloc_res)(struct ice_peer_dev *peer_dev,
+			 struct ice_res *res,
+			 int partial_acceptable);
+	int (*free_res)(struct ice_peer_dev *peer_dev,
+			struct ice_res *res);
+
+	/* Interrupt/Vector related APIs */
+	int (*alloc_msix_vector)(struct ice_peer_dev *peer_dev,
+				 int count, struct ice_vector_list *entries);
+	int (*free_msix_vector)(struct ice_peer_dev *peer_dev,
+				int count, struct ice_vector_list *entries);
+	int (*associate_vector_cause)(struct ice_peer_dev *peer_dev,
+				      struct ice_vector_info *qv_info,
+				      enum ice_res_type res_type,
+				      int res_idx);
+	int (*request_uninit)(struct ice_peer_dev *peer_dev);
+	int (*request_reinit)(struct ice_peer_dev *peer_dev);
+	int (*request_reset)(struct ice_peer_dev *dev,
+			     enum ice_peer_reset_type reset_type);
+
+	void (*notify_state_change)(struct ice_peer_dev *dev,
+				    struct ice_event *event);
+
+	/* Notification APIs */
+	void (*reg_for_notification)(struct ice_peer_dev *dev,
+				     struct ice_event *event);
+	void (*unreg_for_notification)(struct ice_peer_dev *dev,
+				       struct ice_event *event);
+	int (*update_vsi_filter)(struct ice_peer_dev *peer_dev,
+				 enum ice_rdma_filter filter, bool enable);
+	int (*vc_send)(struct ice_peer_dev *peer_dev, u32 vf_id, u8 *msg,
+		       u16 len);
+};
+
+/* Following APIs are implemented by peer drivers and invoked by ICE driver */
+struct ice_peer_ops {
+	void (*event_handler)(struct ice_peer_dev *peer_dev,
+			      struct ice_event *event);
+
+	/* Why we have 'open' and when it is expected to be called:
+	 * 1. symmetric set of API w.r.t close
+	 * 2. To be invoked form driver initialization path
+	 *     - call peer_driver:probe as soon as ice driver:probe is done
+	 *     - call peer_driver:open once ice driver is fully initialized
+	 * 3. To be invoked upon RESET complete
+	 *
+	 * Calls to open are performed from ice_finish_init_peer_device
+	 * which is invoked from the service task. This helps keep devices
+	 * from having their open called until the ice driver is ready and
+	 * has scheduled its service task.
+	 */
+	void (*open)(struct ice_peer_dev *peer_dev);
+
+	/* Peer's close function is to be called when the peer needs to be
+	 * quiesced. This can be for a variety of reasons (enumerated in the
+	 * ice_close_reason enum struct). A call to close will only be
+	 * followed by a call to either remove or open. No IDC calls from the
+	 * peer should be accepted until it is re-opened.
+	 *
+	 * The *reason* parameter is the reason for the call to close. This
+	 * can be for any reason enumerated in the ice_close_reason struct.
+	 * It's primary reason is for the peer's bookkeeping and in case the
+	 * peer want to perform any different tasks dictated by the reason.
+	 */
+	void (*close)(struct ice_peer_dev *peer_dev,
+		      enum ice_close_reason reason);
+
+	int (*vc_receive)(struct ice_peer_dev *peer_dev, u32 vf_id, u8 *msg,
+			  u16 len);
+	/* tell RDMA peer to prepare for TC change in a blocking call
+	 * that will directly precede the change event
+	 */
+	void (*prep_tc_change)(struct ice_peer_dev *peer_dev);
+};
+
+struct ice_peer_device_id {
+	u32 vendor;
+
+	u32 device;
+#define ICE_PEER_RDMA_DEV	0x00000010
+};
+
+#define ICE_MAX_NUM_LPORTS		21
+/* structure representing peer device */
+struct ice_peer_dev {
+	struct device dev;
+	struct pci_dev *pdev; /* PCI device of corresponding to main function */
+	struct ice_peer_device_id dev_id;
+	/* KVA / Linear address corresponding to BAR0 of underlying
+	 * pci_device.
+	 */
+	u8 __iomem *hw_addr;
+
+	unsigned int index;
+
+	u8 ftype;	/* PF(false) or VF (true) */
+
+	/* Data VSI created by driver */
+	u16 pf_vsi_num;
+
+	u8 lan_addr[ETH_ALEN]; /* default MAC address of main netdev */
+	u16 initial_mtu; /* Initial MTU of main netdev */
+	struct ice_qos_params initial_qos_info;
+	struct net_device *netdev;
+	/* PCI info */
+	u8 ari_ena;
+	u16 bus_num;
+	u16 dev_num;
+	u16 fn_num;
+
+	/* Based on peer driver type, this shall point to corresponding MSIx
+	 * entries in pf->msix_entries (which were allocated as part of driver
+	 * initialization) e.g. for RDMA driver, msix_entries reserved will be
+	 * num_online_cpus + 1.
+	 */
+	u16 msix_count; /* How many vectors are reserved for this device */
+	struct msix_entry *msix_entries;
+
+	/* Following struct contains function pointers to be initialized
+	 * by ICE driver and called by peer driver
+	 */
+	const struct ice_ops *ops;
+
+	/* Following struct contains function pointers to be initialized
+	 * by peer driver and called by ICE driver
+	 */
+	const struct ice_peer_ops *peer_ops;
+};
+
+static inline struct ice_peer_dev *dev_to_ice_peer(struct device *_dev)
+{
+	return container_of(_dev, struct ice_peer_dev, dev);
+}
+
+/* structure representing peer driver
+ * Peer driver to initialize those function ptrs and
+ * it will be invoked by ICE as part of driver_registration
+ * via bus infrastructure
+ */
+struct ice_peer_drv {
+	u16 driver_id;
+#define ICE_PEER_LAN_DRIVER		0
+#define ICE_PEER_RDMA_DRIVER		4
+#define ICE_PEER_ADK_DRIVER		5
+
+	struct ice_ver_info ver;
+	const char *name;
+
+	struct device_driver driver;
+	struct ice_peer_device_id dev_id;
+
+	/* As part of ice_peer_drv initialization, peer driver is expected
+	 * to initialize driver.probe and driver.remove callbacks to peer
+	 * driver's respective probe and remove.
+	 *
+	 * driver_registration invokes driver->probe and likewise
+	 * driver_unregistration invokes driver->remove
+	 */
+	int (*probe)(struct ice_peer_dev *dev);
+	int (*remove)(struct ice_peer_dev *dev);
+};
+
+#define IDC_SIGNATURE 0x494e54454c494443ULL
+struct idc_srv_provider {
+	u64 signature;
+	u16 maj_ver;
+	u16 min_ver;
+	u8 rsvd[4];
+	int (*reg_peer_driver)(struct ice_peer_drv *drv);
+	int (*unreg_peer_driver)(struct ice_peer_drv *drv);
+};
+
+static inline struct ice_peer_drv *drv_to_ice_peer(struct device_driver *drv)
+{
+	return container_of(drv, struct ice_peer_drv, driver);
+};
+
+/* Exported symbols for driver registration/unregistration */
+int ice_reg_peer_driver(struct ice_peer_drv *peer);
+int ice_unreg_peer_driver(struct ice_peer_drv *peer);
+#endif /* _ICE_IDC_H_*/
diff --git a/drivers/net/ethernet/intel/ice/ice_idc_int.h b/drivers/net/ethernet/intel/ice/ice_idc_int.h
new file mode 100644
index 0000000..bb82e2c
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/ice_idc_int.h
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018, Intel Corporation. */
+
+#ifndef _ICE_IDC_INT_H_
+#define _ICE_IDC_INT_H_
+
+#include "ice_idc.h"
+
+#define ICE_IDC_MAX_STATE_WAIT	12
+extern struct list_head ice_peer_drv_list;
+extern struct mutex ice_peer_drv_mutex; /* control access to list of peer_drv */
+int ice_prep_peer_for_reset(struct device *dev, void *data);
+int ice_close_peer_for_reset(struct device *dev, void *data);
+int ice_unroll_peer(struct device *dev, void *data);
+int ice_unreg_peer_device(struct device *dev, void *data);
+int ice_peer_close(struct device *dev, void *data);
+int ice_peer_check_for_reg(struct device *dev, void *data);
+int ice_finish_init_peer_device(struct device *dev, void *data);
+
+enum ice_peer_dev_state {
+	ICE_PEER_DEV_STATE_INIT,
+	ICE_PEER_DEV_STATE_PROBE,
+	ICE_PEER_DEV_STATE_PROBED,
+	ICE_PEER_DEV_STATE_OPENED,
+	ICE_PEER_DEV_STATE_PREP_RST,
+	ICE_PEER_DEV_STATE_PREPPED,
+	ICE_PEER_DEV_STATE_CLOSED,
+	ICE_PEER_DEV_STATE_REMOVED,
+	ICE_PEER_DEV_STATE_API_RDY,
+	ICE_PEER_DEV_STATE_NBITS,               /* must be last */
+};
+
+enum ice_peer_drv_state {
+	ICE_PEER_DRV_STATE_MBX_RDY,
+	ICE_PEER_DRV_STATE_NBITS,               /* must be last */
+};
+
+struct ice_peer_dev_int {
+	struct ice_peer_dev peer_dev; /* public structure */
+
+	/* if this peer_dev is the originator of an event, these are the
+	 * most recent events of each type
+	 */
+	struct ice_event current_events[ICE_EVENT_NBITS];
+	/* Events a peer has registered to be notified about */
+	DECLARE_BITMAP(events, ICE_EVENT_NBITS);
+
+	/* States associated with peer device */
+	DECLARE_BITMAP(state, ICE_PEER_DEV_STATE_NBITS);
+
+	/* per peer workqueue */
+	struct workqueue_struct *ice_peer_wq;
+
+	struct work_struct peer_prep_task;
+	struct work_struct peer_close_task;
+
+	enum ice_close_reason rst_type;
+};
+
+struct ice_peer_drv_int {
+	struct ice_peer_drv *peer_drv;
+
+	/* list of peer_drv_int */
+	struct list_head drv_int_list;
+
+	/* States associated with peer driver */
+	DECLARE_BITMAP(state, ICE_PEER_DRV_STATE_NBITS);
+
+	/* if this peer_dev is the originator of an event, these are the
+	 * most recent events of each type
+	 */
+	struct ice_event current_events[ICE_EVENT_NBITS];
+};
+
+static inline
+struct ice_peer_dev_int *peer_to_ice_dev_int(struct ice_peer_dev *peer_dev)
+{
+	return container_of(peer_dev, struct ice_peer_dev_int, peer_dev);
+}
+
+static inline
+struct ice_peer_drv_int *peer_to_ice_drv_int(struct ice_peer_drv *peer_drv)
+{
+	struct ice_peer_drv_int *drv_int;
+
+	mutex_lock(&ice_peer_drv_mutex);
+	list_for_each_entry(drv_int, &ice_peer_drv_list, drv_int_list) {
+		if (drv_int->peer_drv == peer_drv) {
+			mutex_unlock(&ice_peer_drv_mutex);
+			return drv_int;
+		}
+	}
+
+	mutex_unlock(&ice_peer_drv_mutex);
+
+	return NULL;
+}
+
+#endif /* !_ICE_IDC_INT_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c
index 29b1dcf..b0789a12 100644
--- a/drivers/net/ethernet/intel/ice/ice_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_lib.c
@@ -1430,6 +1430,30 @@ int ice_add_mac_to_list(struct ice_vsi *vsi, struct list_head *add_list,
 }
 
 /**
+ * ice_pf_state_is_nominal - checks the pf for nominal state
+ * @pf: pointer to pf to check
+ *
+ * Check the PF's state for a collection of bits that would indicate
+ * the PF is in a state that would inhibit normal operation for
+ * driver functionality.
+ *
+ * Returns true if PF is in a nominal state, false otherwise
+ */
+bool ice_pf_state_is_nominal(struct ice_pf *pf)
+{
+	DECLARE_BITMAP(check_bits, __ICE_STATE_NBITS) = { 0 };
+
+	if (!pf)
+		return false;
+
+	bitmap_set(check_bits, 0, __ICE_STATE_NOMINAL_CHECK_BITS);
+	if (bitmap_intersects(pf->state, check_bits, __ICE_STATE_NBITS))
+		return false;
+
+	return true;
+}
+
+/**
  * ice_update_eth_stats - Update VSI-specific ethernet statistics counters
  * @vsi: the VSI to be updated
  */
diff --git a/drivers/net/ethernet/intel/ice/ice_lib.h b/drivers/net/ethernet/intel/ice/ice_lib.h
index 3831b4f..29aaacc 100644
--- a/drivers/net/ethernet/intel/ice/ice_lib.h
+++ b/drivers/net/ethernet/intel/ice/ice_lib.h
@@ -11,6 +11,8 @@ int ice_add_mac_to_list(struct ice_vsi *vsi, struct list_head *add_list,
 
 void ice_free_fltr_list(struct device *dev, struct list_head *h);
 
+bool ice_pf_state_is_nominal(struct ice_pf *pf);
+
 void ice_update_eth_stats(struct ice_vsi *vsi);
 
 int ice_vsi_cfg_rxqs(struct ice_vsi *vsi);
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 8725569..7db9148 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -416,8 +416,16 @@ static void ice_reset_subtask(struct ice_pf *pf)
 	 * for the reset now), poll for reset done, rebuild and return.
 	 */
 	if (test_bit(__ICE_RESET_OICR_RECV, pf->state)) {
-		clear_bit(__ICE_GLOBR_RECV, pf->state);
-		clear_bit(__ICE_CORER_RECV, pf->state);
+		/* Perform the largest reset requested */
+		if (test_and_clear_bit(__ICE_CORER_RECV, pf->state))
+			reset_type = ICE_RESET_CORER;
+		if (test_and_clear_bit(__ICE_GLOBR_RECV, pf->state))
+			reset_type = ICE_RESET_GLOBR;
+		/* return if no valid reset type requested */
+		if (reset_type == ICE_RESET_INVAL)
+			return;
+		bus_for_each_dev(&ice_peer_bus, NULL, &reset_type,
+				 ice_close_peer_for_reset);
 		if (!test_bit(__ICE_PREPARED_FOR_RESET, pf->state))
 			ice_prepare_for_reset(pf);
 
@@ -1063,6 +1071,10 @@ static void ice_service_task(struct work_struct *work)
 		return;
 	}
 
+	/* Invoke remaining initialization of peer devices */
+	bus_for_each_dev(&ice_peer_bus, NULL, NULL,
+			 ice_finish_init_peer_device);
+
 	ice_check_for_hang_subtask(pf);
 	ice_sync_fltr_subtask(pf);
 	ice_handle_mdd_event(pf);
@@ -1103,6 +1115,42 @@ static void ice_set_ctrlq_len(struct ice_hw *hw)
 }
 
 /**
+ * ice_schedule_reset - schedule a reset
+ * @pf: board private structure
+ * @reset: reset being requested
+ */
+int ice_schedule_reset(struct ice_pf *pf, enum ice_reset_req reset)
+{
+	/* bail out if earlier reset has failed */
+	if (test_bit(__ICE_RESET_FAILED, pf->state)) {
+		dev_dbg(&pf->pdev->dev, "earlier reset has failed\n");
+		return -EIO;
+	}
+	/* bail if reset/recovery already in progress */
+	if (ice_is_reset_in_progress(pf->state)) {
+		dev_dbg(&pf->pdev->dev, "Reset already in progress\n");
+		return -EBUSY;
+	}
+
+	switch (reset) {
+	case ICE_RESET_PFR:
+		set_bit(__ICE_PFR_REQ, pf->state);
+		break;
+	case ICE_RESET_CORER:
+		set_bit(__ICE_CORER_REQ, pf->state);
+		break;
+	case ICE_RESET_GLOBR:
+		set_bit(__ICE_GLOBR_REQ, pf->state);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	ice_service_task_schedule(pf);
+	return 0;
+}
+
+/**
  * ice_irq_affinity_notify - Callback for affinity changes
  * @notify: context as to what irq was changed
  * @mask: the new affinity mask
@@ -1524,6 +1572,12 @@ static int ice_cfg_netdev(struct ice_vsi *vsi)
 	vsi->netdev = netdev;
 	np = netdev_priv(netdev);
 	np->vsi = vsi;
+	np->prov_callbacks.signature = IDC_SIGNATURE;
+	np->prov_callbacks.maj_ver = ICE_PEER_MAJOR_VER;
+	np->prov_callbacks.min_ver = ICE_PEER_MINOR_VER;
+	memset(np->prov_callbacks.rsvd, 0, sizeof(np->prov_callbacks.rsvd));
+	np->prov_callbacks.reg_peer_driver = ice_reg_peer_driver;
+	np->prov_callbacks.unreg_peer_driver = ice_unreg_peer_driver;
 
 	dflt_features = NETIF_F_SG	|
 			NETIF_F_HIGHDMA	|
@@ -1815,6 +1869,7 @@ static void ice_init_pf(struct ice_pf *pf)
 {
 	bitmap_zero(pf->flags, ICE_PF_FLAGS_NBITS);
 	set_bit(ICE_FLAG_MSIX_ENA, pf->flags);
+	set_bit(ICE_FLAG_IWARP_ENA, pf->flags);
 #ifdef CONFIG_PCI_IOV
 	if (pf->hw.func_caps.common_cap.sr_iov_1_1) {
 		struct ice_hw *hw = &pf->hw;
@@ -1860,6 +1915,8 @@ static int ice_ena_msix_range(struct ice_pf *pf)
 
 	/* reserve one vector for miscellaneous handler */
 	needed = 1;
+	if (v_left < needed)
+		goto no_vecs_left_err;
 	v_budget += needed;
 	v_left -= needed;
 
@@ -1868,6 +1925,21 @@ static int ice_ena_msix_range(struct ice_pf *pf)
 	v_budget += pf->num_lan_msix;
 	v_left -= pf->num_lan_msix;
 
+	if (test_bit(ICE_FLAG_IWARP_ENA, pf->flags)) {
+		needed = min_t(int, num_online_cpus(), v_left);
+
+		/* iWARP peer driver needs one extra interrupt, to be used for
+		 * other causes
+		 */
+		needed += 1;
+		/* no vectors left for RDMA */
+		if (v_left < needed)
+			goto no_vecs_left_err;
+		pf->num_rdma_msix = needed;
+		v_budget += needed;
+		v_left -= needed;
+	}
+
 	pf->msix_entries = devm_kcalloc(&pf->pdev->dev, v_budget,
 					sizeof(struct msix_entry), GFP_KERNEL);
 
@@ -1894,6 +1966,8 @@ static int ice_ena_msix_range(struct ice_pf *pf)
 			 "not enough vectors. requested = %d, obtained = %d\n",
 			 v_budget, v_actual);
 		if (v_actual >= (pf->num_lan_msix + 1)) {
+			clear_bit(ICE_FLAG_IWARP_ENA, pf->flags);
+			pf->num_rdma_msix = 0;
 			pf->num_avail_sw_msix = v_actual -
 						(pf->num_lan_msix + 1);
 		} else if (v_actual >= 2) {
@@ -1912,6 +1986,11 @@ static int ice_ena_msix_range(struct ice_pf *pf)
 	devm_kfree(&pf->pdev->dev, pf->msix_entries);
 	goto exit_err;
 
+no_vecs_left_err:
+	dev_err(&pf->pdev->dev,
+		"not enough vectors. requested = %d, available = %d\n",
+		needed, v_left);
+	err = -ERANGE;
 exit_err:
 	pf->num_lan_msix = 0;
 	clear_bit(ICE_FLAG_MSIX_ENA, pf->flags);
@@ -2162,10 +2241,20 @@ static int ice_probe(struct pci_dev *pdev,
 	/* since everything is good, start the service timer */
 	mod_timer(&pf->serv_tmr, round_jiffies(jiffies + pf->serv_tmr_period));
 
+	err = ice_init_peer_devices(pf);
+	if (err) {
+		dev_err(&pdev->dev,
+			"Failed to initialize peer devices: 0x%x\n", err);
+		err = -EIO;
+		goto err_init_peer_unroll;
+	}
+
 	ice_verify_cacheline_size(pf);
 
 	return 0;
 
+err_init_peer_unroll:
+	bus_for_each_dev(&ice_peer_bus, NULL, NULL, ice_unroll_peer);
 err_alloc_sw_unroll:
 	set_bit(__ICE_SERVICE_DIS, pf->state);
 	set_bit(__ICE_DOWN, pf->state);
@@ -2190,7 +2279,8 @@ static int ice_probe(struct pci_dev *pdev,
 static void ice_remove(struct pci_dev *pdev)
 {
 	struct ice_pf *pf = pci_get_drvdata(pdev);
-	int i;
+	enum ice_close_reason reason;
+	int err, i;
 
 	if (!pf)
 		return;
@@ -2201,12 +2291,21 @@ static void ice_remove(struct pci_dev *pdev)
 		msleep(100);
 	}
 
-	set_bit(__ICE_DOWN, pf->state);
 	ice_service_task_stop(pf);
+	reason = ICE_REASON_INTERFACE_DOWN;
+	bus_for_each_dev(&ice_peer_bus, NULL, &reason, ice_peer_close);
+	set_bit(__ICE_DOWN, pf->state);
 
 	if (test_bit(ICE_FLAG_SRIOV_ENA, pf->flags))
 		ice_free_vfs(pf);
 	ice_vsi_release_all(pf);
+	err = bus_for_each_dev(&ice_peer_bus, NULL, NULL,
+			       ice_unreg_peer_device);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to remove peer devices: 0x%x\n",
+			err);
+	}
+
 	ice_free_irq_msix_misc(pf);
 	ice_for_each_vsi(pf, i) {
 		if (!pf->vsi[i])
@@ -2257,9 +2356,16 @@ static int __init ice_module_init(void)
 	pr_info("%s - version %s\n", ice_driver_string, ice_drv_ver);
 	pr_info("%s\n", ice_copyright);
 
+	status = bus_register(&ice_peer_bus);
+	if (status) {
+		pr_err("failed to register pseudo bus\n");
+		return status;
+	}
+
 	ice_wq = alloc_workqueue("%s", WQ_MEM_RECLAIM, 0, KBUILD_MODNAME);
 	if (!ice_wq) {
 		pr_err("Failed to create workqueue\n");
+		bus_unregister(&ice_peer_bus);
 		return -ENOMEM;
 	}
 
@@ -2267,6 +2373,11 @@ static int __init ice_module_init(void)
 	if (status) {
 		pr_err("failed to register pci driver, err %d\n", status);
 		destroy_workqueue(ice_wq);
+		bus_unregister(&ice_peer_bus);
+		/* release all cached layer within ida tree, associated with
+		 * ice_peer_index_ida object
+		 */
+		ida_destroy(&ice_peer_index_ida);
 	}
 
 	return status;
@@ -2281,8 +2392,24 @@ static int __init ice_module_init(void)
  */
 static void __exit ice_module_exit(void)
 {
+	struct ice_peer_drv_int *peer_drv_int, *tmp;
+
 	pci_unregister_driver(&ice_driver);
 	destroy_workqueue(ice_wq);
+	mutex_lock(&ice_peer_drv_mutex);
+	list_for_each_entry_safe(peer_drv_int, tmp, &ice_peer_drv_list,
+				 drv_int_list) {
+		list_del(&peer_drv_int->drv_int_list);
+		kfree(peer_drv_int);
+	}
+	mutex_unlock(&ice_peer_drv_mutex);
+
+	bus_unregister(&ice_peer_bus);
+
+	/* release all cached layer within ida tree, associated with
+	 * ice_peer_index_ida object
+	 */
+	ida_destroy(&ice_peer_index_ida);
 	pr_info("module unloaded\n");
 }
 module_exit(ice_module_exit);
@@ -3423,6 +3550,7 @@ static int ice_change_mtu(struct net_device *netdev, int new_mtu)
 	struct ice_netdev_priv *np = netdev_priv(netdev);
 	struct ice_vsi *vsi = np->vsi;
 	struct ice_pf *pf = vsi->back;
+	struct ice_event *event;
 	u8 count = 0;
 
 	if (new_mtu == netdev->mtu) {
@@ -3474,6 +3602,13 @@ static int ice_change_mtu(struct net_device *netdev, int new_mtu)
 		}
 	}
 
+	event = devm_kzalloc(&pf->pdev->dev, sizeof(*event), GFP_KERNEL);
+	set_bit(ICE_EVENT_MTU_CHANGE, event->type);
+	event->reporter = NULL;
+	event->info.mtu = new_mtu;
+	bus_for_each_dev(&ice_peer_bus, NULL, event, ice_peer_check_for_reg);
+	devm_kfree(&pf->pdev->dev, event);
+
 	netdev_dbg(netdev, "changed mtu to %d\n", new_mtu);
 	return 0;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c
index 2e56931..360ea49 100644
--- a/drivers/net/ethernet/intel/ice/ice_switch.c
+++ b/drivers/net/ethernet/intel/ice/ice_switch.c
@@ -446,6 +446,29 @@ enum ice_status
 }
 
 /**
+ * ice_cfg_iwarp_fltr - enable/disable iwarp filtering on VSI
+ * @hw: pointer to HW struct
+ * @vsi_handle: VSI SW index
+ * @enable: boolean for enable/disable
+ */
+enum ice_status
+ice_cfg_iwarp_fltr(struct ice_hw *hw, u16 vsi_handle, bool enable)
+{
+	struct ice_vsi_ctx *ctx;
+
+	ctx = ice_get_vsi_ctx(hw, vsi_handle);
+	if (!ctx)
+		return ICE_ERR_DOES_NOT_EXIST;
+
+	if (enable)
+		ctx->info.q_opt_flags |= ICE_AQ_VSI_Q_OPT_PE_FLTR_EN;
+	else
+		ctx->info.q_opt_flags &= ~ICE_AQ_VSI_Q_OPT_PE_FLTR_EN;
+
+	return ice_update_vsi(hw, vsi_handle, ctx, NULL);
+}
+
+/**
  * ice_aq_alloc_free_vsi_list
  * @hw: pointer to the hw struct
  * @vsi_list_id: VSI list id returned or used for lookup
diff --git a/drivers/net/ethernet/intel/ice/ice_switch.h b/drivers/net/ethernet/intel/ice/ice_switch.h
index 78040bb..f297c86 100644
--- a/drivers/net/ethernet/intel/ice/ice_switch.h
+++ b/drivers/net/ethernet/intel/ice/ice_switch.h
@@ -207,6 +207,8 @@ enum ice_status
 enum ice_status ice_update_sw_rule_bridge_mode(struct ice_hw *hw);
 enum ice_status ice_add_mac(struct ice_hw *hw, struct list_head *m_lst);
 enum ice_status ice_remove_mac(struct ice_hw *hw, struct list_head *m_lst);
+enum ice_status
+ice_cfg_iwarp_fltr(struct ice_hw *hw, u16 vsi_handle, bool enable);
 void ice_remove_vsi_fltr(struct ice_hw *hw, u16 vsi_handle);
 enum ice_status ice_add_vlan(struct ice_hw *hw, struct list_head *m_list);
 enum ice_status ice_remove_vlan(struct ice_hw *hw, struct list_head *v_list);
diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h
index 6209edc..e5b7c90 100644
--- a/drivers/net/ethernet/intel/ice/ice_type.h
+++ b/drivers/net/ethernet/intel/ice/ice_type.h
@@ -29,6 +29,7 @@ static inline bool ice_is_tc_ena(u8 bitmap, u8 tc)
 #define ICE_DBG_LAN		BIT_ULL(8)
 #define ICE_DBG_SW		BIT_ULL(13)
 #define ICE_DBG_SCHED		BIT_ULL(14)
+#define ICE_DBG_RDMA		BIT_ULL(15)
 #define ICE_DBG_RES		BIT_ULL(17)
 #define ICE_DBG_AQ_MSG		BIT_ULL(24)
 #define ICE_DBG_AQ_CMD		BIT_ULL(27)
@@ -220,6 +221,7 @@ struct ice_sched_node {
 	u8 tc_num;
 	u8 owner;
 #define ICE_SCHED_NODE_OWNER_LAN	0
+#define ICE_SCHED_NODE_OWNER_RDMA	2
 };
 
 /* Access Macros for Tx Sched Elements data */
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
index 05ff4f9..4a1faf9 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
@@ -1007,31 +1007,6 @@ static int ice_alloc_vfs(struct ice_pf *pf, u16 num_alloc_vfs)
 }
 
 /**
- * ice_pf_state_is_nominal - checks the pf for nominal state
- * @pf: pointer to pf to check
- *
- * Check the PF's state for a collection of bits that would indicate
- * the PF is in a state that would inhibit normal operation for
- * driver functionality.
- *
- * Returns true if PF is in a nominal state.
- * Returns false otherwise
- */
-static bool ice_pf_state_is_nominal(struct ice_pf *pf)
-{
-	DECLARE_BITMAP(check_bits, __ICE_STATE_NBITS) = { 0 };
-
-	if (!pf)
-		return false;
-
-	bitmap_set(check_bits, 0, __ICE_STATE_NOMINAL_CHECK_BITS);
-	if (bitmap_intersects(pf->state, check_bits, __ICE_STATE_NBITS))
-		return false;
-
-	return true;
-}
-
-/**
  * ice_pci_sriov_ena - Enable or change number of VFs
  * @pf: pointer to the PF structure
  * @num_vfs: number of VFs to allocate
-- 
1.8.3.1


^ permalink raw reply related

* [RFC 01/19] net/i40e: Add peer register/unregister to struct i40e_netdev_priv
From: Shiraz Saleem @ 2019-02-12 21:43 UTC (permalink / raw)
  To: dledford, jgg, davem
  Cc: linux-rdma, netdev, mustafa.ismail, jeffrey.t.kirsher,
	Shiraz Saleem
In-Reply-To: <20190212214402.23284-1-shiraz.saleem@intel.com>

Expose the register/unregister function pointers in the struct
i40e_netdev_priv which is accesible via the netdev_priv() interface
in the RDMA driver. On a netdev notification in the RDMA driver,
the appropriate LAN driver register/unregister functions are invoked
from the struct i40e_netdev_priv structure,

This is done in order to support single RDMA driver to work with
multiple LAN drivers over multi-generation Intel HW supporting RDMA

Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h        |  1 +
 drivers/net/ethernet/intel/i40e/i40e_client.h | 10 ++++++++++
 drivers/net/ethernet/intel/i40e/i40e_main.c   |  7 +++++++
 3 files changed, 18 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index 8de9085..7bc6316 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -795,6 +795,7 @@ struct i40e_vsi {
 } ____cacheline_internodealigned_in_smp;
 
 struct i40e_netdev_priv {
+	struct idc_srv_provider prov_callbacks;
 	struct i40e_vsi *vsi;
 };
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_client.h b/drivers/net/ethernet/intel/i40e/i40e_client.h
index 72994ba..70ddb76 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_client.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_client.h
@@ -44,6 +44,16 @@ enum i40e_client_instance_state {
 #define I40E_QUEUE_TYPE_PE_AEQ  0x80
 #define I40E_QUEUE_INVALID_IDX	0xFFFF
 
+#define IDC_SIGNATURE 0x494e54454c494443ULL	/* INTELIDC */
+struct idc_srv_provider {
+	u64 signature;
+	u16 maj_ver;
+	u16 min_ver;
+	u8 rsvd[4];
+	int (*reg_peer_driver)(struct i40e_client *client);
+	int (*unreg_peer_driver)(struct i40e_client *client);
+};
+
 struct i40e_qv_info {
 	u32 v_idx; /* msix_vector */
 	u16 ceq_idx;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index f52e2c4..114ff0e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -12243,6 +12243,13 @@ static int i40e_config_netdev(struct i40e_vsi *vsi)
 	np = netdev_priv(netdev);
 	np->vsi = vsi;
 
+	np->prov_callbacks.signature = IDC_SIGNATURE;
+	np->prov_callbacks.maj_ver = I40E_CLIENT_VERSION_MAJOR;
+	np->prov_callbacks.min_ver = I40E_CLIENT_VERSION_MINOR;
+	memset(np->prov_callbacks.rsvd, 0, sizeof(np->prov_callbacks.rsvd));
+	np->prov_callbacks.reg_peer_driver = i40e_register_client;
+	np->prov_callbacks.unreg_peer_driver = i40e_unregister_client;
+
 	hw_enc_features = NETIF_F_SG			|
 			  NETIF_F_IP_CSUM		|
 			  NETIF_F_IPV6_CSUM		|
-- 
1.8.3.1


^ permalink raw reply related

* [RFC 00/19] Add unified Intel Ethernet RDMA driver (irdma)
From: Shiraz Saleem @ 2019-02-12 21:43 UTC (permalink / raw)
  To: dledford, jgg, davem
  Cc: linux-rdma, netdev, mustafa.ismail, jeffrey.t.kirsher,
	Saleem, Shiraz

From: "Saleem, Shiraz" <shiraz.saleem@intel.com>

This patchset adds a unified Intel Ethernet RDMA driver that
supports a new network device E810 (iWARP and RoCEv2 capable)
and the existing X722 iWARP device. The driver architecture
provides the extensibility for future generations of Intel HW
supporting RDMA.

The underlying LAN driver for X722 ("i40e") and E810 ("ice")
have their own peer/client interface that this unified RDMA
driver registers to. The netdev_priv structure of each LAN
driver accessed by the netdev_priv() interface exposes their
peer register/unregister function pointers for use by the RDMA
driver. This solution revives an earlier submission [1] intended for
a different purpose. It now serves to unify the RDMA driver and
we are requesting feedback.

The unified driver is currently being tested on X722 with rdma-core-v22.
X722 is supported by this driver, but only if CONFIG_INFINIBAND_I40IW
is disabled in the kernel. We desire to deprecate or phase out i40iw
from the kernel and can submit patches to do the same. We would like to
move forward with this unified driver model for current and future HW.

This series was built against rdma-next.
commit a2bfd708b17a ("RDMA/iwpm: move kdoc comments to functions")
It includes both net and rdma patches for the purposes
of review. It will be split into two separate patch series, one for
net-next and rdma-next once RFC is accepted.

[1] https://patchwork.kernel.org/patch/10419583/

Anirudh Venkataramanan (2):
  net/ice: Create framework for VSI queue context
  net/ice: Add support for ice peer devices and drivers

Michael J. Ruhl (1):
  RDMA/irdma: Add dynamic tracing for CM

Mustafa Ismail (13):
  RDMA/irdma: Add driver framework definitions
  RDMA/irdma: Implement device initialization definitions
  RDMA/irdma: Implement HW Admin Queue OPs
  RDMA/irdma: Add HMC backing store setup functions
  RDMA/irdma: Add privileged UDA queue implementation
  RDMA/irdma: Add QoS definitions
  RDMA/irdma: Add connection manager
  RDMA/irdma: Add PBLE resource manager
  RDMA/irdma: Implement device supported verb APIs
  RDMA/irdma: Add RoCEv2 UD OP support
  RDMA/irdma: Add user/kernel shared libraries
  RDMA/irdma: Add miscellaneous utility definitions
  RDMA/irdma: Add ABI definitions

Shiraz Saleem (3):
  net/i40e: Add peer register/unregister to struct i40e_netdev_priv
  RDMA/irdma: Add Kconfig and Makefile
  RDMA/irdma: Update MAINTAINERS file

 MAINTAINERS                                      |   10 +-
 drivers/infiniband/Kconfig                       |    1 +
 drivers/infiniband/hw/Makefile                   |    1 +
 drivers/infiniband/hw/irdma/Kconfig              |   11 +
 drivers/infiniband/hw/irdma/Makefile             |   31 +
 drivers/infiniband/hw/irdma/cm.c                 | 4621 ++++++++++++++++
 drivers/infiniband/hw/irdma/cm.h                 |  425 ++
 drivers/infiniband/hw/irdma/ctrl.c               | 6165 ++++++++++++++++++++++
 drivers/infiniband/hw/irdma/defs.h               | 2053 +++++++
 drivers/infiniband/hw/irdma/hmc.c                |  716 +++
 drivers/infiniband/hw/irdma/hmc.h                |  221 +
 drivers/infiniband/hw/irdma/hw.c                 | 2484 +++++++++
 drivers/infiniband/hw/irdma/i40iw_hw.c           |  207 +
 drivers/infiniband/hw/irdma/i40iw_hw.h           |  164 +
 drivers/infiniband/hw/irdma/i40iw_if.c           |  250 +
 drivers/infiniband/hw/irdma/icrdma_hw.c          |   74 +
 drivers/infiniband/hw/irdma/icrdma_hw.h          |   60 +
 drivers/infiniband/hw/irdma/irdma.h              |  187 +
 drivers/infiniband/hw/irdma/irdma_if.c           |  430 ++
 drivers/infiniband/hw/irdma/main.c               |  496 ++
 drivers/infiniband/hw/irdma/main.h               |  709 +++
 drivers/infiniband/hw/irdma/osdep.h              |  153 +
 drivers/infiniband/hw/irdma/pble.c               |  520 ++
 drivers/infiniband/hw/irdma/pble.h               |  135 +
 drivers/infiniband/hw/irdma/protos.h             |  118 +
 drivers/infiniband/hw/irdma/puda.c               | 1694 ++++++
 drivers/infiniband/hw/irdma/puda.h               |  186 +
 drivers/infiniband/hw/irdma/status.h             |   70 +
 drivers/infiniband/hw/irdma/trace.c              |  113 +
 drivers/infiniband/hw/irdma/trace.h              |    4 +
 drivers/infiniband/hw/irdma/trace_cm.h           |  459 ++
 drivers/infiniband/hw/irdma/type.h               | 1705 ++++++
 drivers/infiniband/hw/irdma/uda.c                |  416 ++
 drivers/infiniband/hw/irdma/uda.h                |   87 +
 drivers/infiniband/hw/irdma/uda_d.h              |  383 ++
 drivers/infiniband/hw/irdma/uk.c                 | 1680 ++++++
 drivers/infiniband/hw/irdma/user.h               |  463 ++
 drivers/infiniband/hw/irdma/utils.c              | 2566 +++++++++
 drivers/infiniband/hw/irdma/verbs.c              | 4267 +++++++++++++++
 drivers/infiniband/hw/irdma/verbs.h              |  191 +
 drivers/infiniband/hw/irdma/ws.c                 |  449 ++
 drivers/infiniband/hw/irdma/ws.h                 |   40 +
 drivers/net/ethernet/intel/i40e/i40e.h           |    1 +
 drivers/net/ethernet/intel/i40e/i40e_client.h    |   10 +
 drivers/net/ethernet/intel/i40e/i40e_main.c      |    7 +
 drivers/net/ethernet/intel/ice/Makefile          |    1 +
 drivers/net/ethernet/intel/ice/ice.h             |   14 +
 drivers/net/ethernet/intel/ice/ice_adminq_cmd.h  |   32 +
 drivers/net/ethernet/intel/ice/ice_common.c      |  189 +
 drivers/net/ethernet/intel/ice/ice_common.h      |    9 +
 drivers/net/ethernet/intel/ice/ice_idc.c         | 1527 ++++++
 drivers/net/ethernet/intel/ice/ice_idc.h         |  402 ++
 drivers/net/ethernet/intel/ice/ice_idc_int.h     |   99 +
 drivers/net/ethernet/intel/ice/ice_lib.c         |   24 +
 drivers/net/ethernet/intel/ice/ice_lib.h         |    2 +
 drivers/net/ethernet/intel/ice/ice_main.c        |  143 +-
 drivers/net/ethernet/intel/ice/ice_sched.c       |  177 +-
 drivers/net/ethernet/intel/ice/ice_switch.c      |   23 +
 drivers/net/ethernet/intel/ice/ice_switch.h      |   11 +
 drivers/net/ethernet/intel/ice/ice_type.h        |    3 +
 drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c |   25 -
 include/uapi/rdma/irdma-abi.h                    |  140 +
 include/uapi/rdma/rdma_user_ioctl_cmds.h         |    1 +
 63 files changed, 37762 insertions(+), 93 deletions(-)
 create mode 100644 drivers/infiniband/hw/irdma/Kconfig
 create mode 100644 drivers/infiniband/hw/irdma/Makefile
 create mode 100644 drivers/infiniband/hw/irdma/cm.c
 create mode 100644 drivers/infiniband/hw/irdma/cm.h
 create mode 100644 drivers/infiniband/hw/irdma/ctrl.c
 create mode 100644 drivers/infiniband/hw/irdma/defs.h
 create mode 100644 drivers/infiniband/hw/irdma/hmc.c
 create mode 100644 drivers/infiniband/hw/irdma/hmc.h
 create mode 100644 drivers/infiniband/hw/irdma/hw.c
 create mode 100644 drivers/infiniband/hw/irdma/i40iw_hw.c
 create mode 100644 drivers/infiniband/hw/irdma/i40iw_hw.h
 create mode 100644 drivers/infiniband/hw/irdma/i40iw_if.c
 create mode 100644 drivers/infiniband/hw/irdma/icrdma_hw.c
 create mode 100644 drivers/infiniband/hw/irdma/icrdma_hw.h
 create mode 100644 drivers/infiniband/hw/irdma/irdma.h
 create mode 100644 drivers/infiniband/hw/irdma/irdma_if.c
 create mode 100644 drivers/infiniband/hw/irdma/main.c
 create mode 100644 drivers/infiniband/hw/irdma/main.h
 create mode 100644 drivers/infiniband/hw/irdma/osdep.h
 create mode 100644 drivers/infiniband/hw/irdma/pble.c
 create mode 100644 drivers/infiniband/hw/irdma/pble.h
 create mode 100644 drivers/infiniband/hw/irdma/protos.h
 create mode 100644 drivers/infiniband/hw/irdma/puda.c
 create mode 100644 drivers/infiniband/hw/irdma/puda.h
 create mode 100644 drivers/infiniband/hw/irdma/status.h
 create mode 100644 drivers/infiniband/hw/irdma/trace.c
 create mode 100644 drivers/infiniband/hw/irdma/trace.h
 create mode 100644 drivers/infiniband/hw/irdma/trace_cm.h
 create mode 100644 drivers/infiniband/hw/irdma/type.h
 create mode 100644 drivers/infiniband/hw/irdma/uda.c
 create mode 100644 drivers/infiniband/hw/irdma/uda.h
 create mode 100644 drivers/infiniband/hw/irdma/uda_d.h
 create mode 100644 drivers/infiniband/hw/irdma/uk.c
 create mode 100644 drivers/infiniband/hw/irdma/user.h
 create mode 100644 drivers/infiniband/hw/irdma/utils.c
 create mode 100644 drivers/infiniband/hw/irdma/verbs.c
 create mode 100644 drivers/infiniband/hw/irdma/verbs.h
 create mode 100644 drivers/infiniband/hw/irdma/ws.c
 create mode 100644 drivers/infiniband/hw/irdma/ws.h
 create mode 100644 drivers/net/ethernet/intel/ice/ice_idc.c
 create mode 100644 drivers/net/ethernet/intel/ice/ice_idc.h
 create mode 100644 drivers/net/ethernet/intel/ice/ice_idc_int.h
 create mode 100644 include/uapi/rdma/irdma-abi.h

-- 
1.8.3.1


^ permalink raw reply

* [PATCH v4 00/14] various dynamic_debug patches
From: Rasmus Villemoes @ 2019-02-12 21:41 UTC (permalink / raw)
  To: Andrew Morton, Jason Baron
  Cc: linux-kernel, Rasmus Villemoes, David Sterba, Petr Mladek,
	Rafael J . Wysocki, linux-acpi, linux-btrfs, netdev,
	Steven Rostedt, x86, Greg Kroah-Hartman, Ingo Molnar
In-Reply-To: <20181109231021.11658-1-linux@rasmusvillemoes.dk>

This started as an experiment to see how hard it would be to change
the four pointers in struct _ddebug into relative offsets, a la
CONFIG_GENERIC_BUG_RELATIVE_POINTERS, thus saving 16 bytes per
pr_debug site (and thus exactly making up for the extra space used by
the introduction of jump labels in 9049fc74). I stumbled on a few
things that are probably worth fixing regardless of whether that goal
is deemed worthwhile.

Back at v3 (in November), I redid the implementation on top of the
fancy new asm-macros stuff. Luckily enough, v3 didn't get picked up,
since the asm-macros were backed out again. I still want to do the
relative-pointers thing eventually, but we're close to the merge
window opening, so here's just most of the "incidental" patches, some
of which also serve as preparation for the relative pointers.

I'm not sure how long an Ack/Reviewed-by is good for, but OTOH it also
feels rude to just drop them on the floor. I've kept those tags since
the rebasing to current master went completely smooth.

Andrew, please pick these up for soaking in -next.

v3 series: lkml.kernel.org/r/20181109231021.11658-1-linux@rasmusvillemoes.dk
v2 series: lkml.kernel.org/r/20181009112013.14238-1-linux@rasmusvillemoes.dk

Rasmus Villemoes (14):
  linux/device.h: use DYNAMIC_DEBUG_BRANCH in dev_dbg_ratelimited
  linux/net.h: use DYNAMIC_DEBUG_BRANCH in net_dbg_ratelimited
  linux/printk.h: use DYNAMIC_DEBUG_BRANCH in pr_debug_ratelimited
  dynamic_debug: consolidate DEFINE_DYNAMIC_DEBUG_METADATA definitions
  dynamic_debug: don't duplicate modname in ddebug_add_module
  dynamic_debug: use pointer comparison in ddebug_remove_module
  dynamic_debug: remove unused EXPORT_SYMBOLs
  dynamic_debug: move pr_err from module.c to ddebug_add_module
  dynamic_debug: add static inline stub for ddebug_add_module
  dynamic_debug: refactor dynamic_pr_debug and friends
  btrfs: implement btrfs_debug* in terms of helper macro
  ACPI: use proper DYNAMIC_DEBUG_BRANCH macro
  ACPI: remove unused __acpi_handle_debug macro
  ACPI: implement acpi_handle_debug in terms of _dynamic_func_call

 fs/btrfs/ctree.h              |  34 ++++--------
 include/linux/acpi.h          |  11 +---
 include/linux/device.h        |   2 +-
 include/linux/dynamic_debug.h | 102 +++++++++++++++++++---------------
 include/linux/net.h           |   2 +-
 include/linux/printk.h        |   2 +-
 kernel/module.c               |   6 +-
 lib/dynamic_debug.c           |  22 ++++----
 8 files changed, 84 insertions(+), 97 deletions(-)

-- 
2.20.1


^ permalink raw reply

* [PATCH v4 02/14] linux/net.h: use DYNAMIC_DEBUG_BRANCH in net_dbg_ratelimited
From: Rasmus Villemoes @ 2019-02-12 21:41 UTC (permalink / raw)
  To: Andrew Morton, Jason Baron; +Cc: linux-kernel, Rasmus Villemoes, netdev
In-Reply-To: <20190212214150.4807-1-linux@rasmusvillemoes.dk>

net_dbg_ratelimited tests the dynamic debug descriptor the
old-fashioned way, and doesn't utilize the static key/jump label
implementation when CONFIG_JUMP_LABEL is set. Use the
DYNAMIC_DEBUG_BRANCH which is defined appropriately.

Cc: netdev@vger.kernel.org
Acked-by: Jason Baron <jbaron@akamai.com>
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
---
 include/linux/net.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/net.h b/include/linux/net.h
index e0930678c8bf..651fca72286c 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -263,7 +263,7 @@ do {								\
 #define net_dbg_ratelimited(fmt, ...)					\
 do {									\
 	DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt);			\
-	if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT) &&	\
+	if (DYNAMIC_DEBUG_BRANCH(descriptor) &&				\
 	    net_ratelimit())						\
 		__dynamic_pr_debug(&descriptor, pr_fmt(fmt),		\
 		                   ##__VA_ARGS__);			\
-- 
2.20.1


^ permalink raw reply related

* [PATCH net-next] rocker: Remove port_attr_bridge_flags_get assignment
From: Florian Fainelli @ 2019-02-12 21:39 UTC (permalink / raw)
  To: netdev; +Cc: Florian Fainelli, Jiri Pirko, David S. Miller, open list

After 610d2b601bba ("rocker: Remove getting PORT_BRIDGE_FLAGS") we no
longer have a port_attr_bridge_flags_get member in the rocker_world_ops
structre, fix that.

Fixes: 610d2b601bba ("rocker: Remove getting PORT_BRIDGE_FLAGS")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/ethernet/rocker/rocker_ofdpa.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c
index 9c07f6040720..fa296a7c255d 100644
--- a/drivers/net/ethernet/rocker/rocker_ofdpa.c
+++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c
@@ -2813,7 +2813,6 @@ struct rocker_world_ops rocker_ofdpa_ops = {
 	.port_stop = ofdpa_port_stop,
 	.port_attr_stp_state_set = ofdpa_port_attr_stp_state_set,
 	.port_attr_bridge_flags_set = ofdpa_port_attr_bridge_flags_set,
-	.port_attr_bridge_flags_get = ofdpa_port_attr_bridge_flags_get,
 	.port_attr_bridge_flags_support_get = ofdpa_port_attr_bridge_flags_support_get,
 	.port_attr_bridge_ageing_time_set = ofdpa_port_attr_bridge_ageing_time_set,
 	.port_obj_vlan_add = ofdpa_port_obj_vlan_add,
-- 
2.17.1


^ permalink raw reply related

* [PATCH net-next] net: sched: flower: only return error from hw offload if skip_sw
From: Vlad Buslov @ 2019-02-12 21:39 UTC (permalink / raw)
  To: netdev; +Cc: jhs, xiyou.wangcong, jiri, davem, pablo, Vlad Buslov

Recently introduced tc_setup_flow_action() can fail when parsing tcf_exts
on some unsupported action commands. However, this should not affect the
case when user did not explicitly request hw offload by setting skip_sw
flag. Modify tc_setup_flow_action() callers to only propagate the error if
skip_sw flag is set for filter that is being offloaded, and set extack
error message in that case.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Fixes: 3a7b68617de7 ("cls_api: add translator to flow_action representation")
---
 net/sched/cls_flower.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 6a341287a527..f695048455a5 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -396,7 +396,11 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 	err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts);
 	if (err) {
 		kfree(cls_flower.rule);
-		return err;
+		if (skip_sw) {
+			NL_SET_ERR_MSG_MOD(extack, "Failed to setup flow action");
+			return err;
+		}
+		return 0;
 	}
 
 	err = tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, skip_sw);
@@ -1499,7 +1503,11 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
 						   &f->exts);
 			if (err) {
 				kfree(cls_flower.rule);
-				return err;
+				if (tc_skip_sw(f->flags)) {
+					NL_SET_ERR_MSG_MOD(extack, "Failed to setup flow action");
+					return err;
+				}
+				continue;
 			}
 
 			cls_flower.classid = f->res.classid;
-- 
2.13.6


^ permalink raw reply related

* Re: [PATCH net-next 0/3] Remove getting SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS
From: Florian Fainelli @ 2019-02-12 21:35 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, idosch, linux-kernel, devel, bridge, jiri, andrew,
	vivien.didelot
In-Reply-To: <6ff1b9c2-1206-f568-8556-28efe852dd08@gmail.com>

On 2/12/19 9:54 AM, Florian Fainelli wrote:
> On 2/12/19 9:50 AM, David Miller wrote:
>> From: Florian Fainelli <f.fainelli@gmail.com>
>> Date: Mon, 11 Feb 2019 13:17:46 -0800
>>
>>> AFAICT there is no code that attempts to get the value of the attribute
>>> SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS while it is used with
>>> switchdev_port_attr_set().
>>>
>>> This is effectively no doing anything and it can slow down future work
>>> that tries to make modifications in these areas so remove that.
>>
>> Series applied.
>>
>>> David, there should be no dependency with previous patch series, but
>>> again, feedback from Ido and Jiri would be welcome in case this was
>>> added for a reason.
>>
>> Ok, is there going to be another respin of that switchdev_ops removal
>> series?
> 
> Yes, I will be working on a v5 which addresses Ido's feedback in the
> next hours.

David, looks like I managed to introduce a build failure with
rocker_ofdpa.c, sorry about that, I will send a follow-up immediately.
-- 
Florian

^ permalink raw reply

* Re: [PATCH net-next 4/4] net: phy: Add generic support for 2.5GBaseT and 5GBaseT
From: Andrew Lunn @ 2019-02-12 21:13 UTC (permalink / raw)
  To: Maxime Chevallier
  Cc: davem, netdev, linux-kernel, Florian Fainelli, Heiner Kallweit,
	Russell King, linux-arm-kernel, Antoine Tenart, thomas.petazzoni,
	gregory.clement, miquel.raynal, nadavh, stefanc, mw
In-Reply-To: <20190211142529.22885-5-maxime.chevallier@bootlin.com>

On Mon, Feb 11, 2019 at 03:25:29PM +0100, Maxime Chevallier wrote:
> The 802.3bz specification, based on previous by the NBASET alliance,
> defines the 2.5GBaseT and 5GBaseT link modes for ethernet traffic on
> cat5e, cat6 and cat7 cables.
> 
> These mode integrate with the already defined C45 MDIO PMA/PMD registers
> set that added 10G support, by defining some previously reserved bits,
> and adding a new register (2.5G/5G Extended abilities).
> 
> This commit adds the required definitions in include/uapi/linux/mdio.h
> to support these modes, and detect when a link-partner advertises them.
> 
> It also adds support for these mode in the generic C45 PHY
> infrastructure.
> 
> Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* Re: [PATCH net-next 3/4] net: phy: Extract genphy_c45_pma_read_abilities from marvell10g
From: Andrew Lunn @ 2019-02-12 21:11 UTC (permalink / raw)
  To: Maxime Chevallier
  Cc: davem, netdev, linux-kernel, Florian Fainelli, Heiner Kallweit,
	Russell King, linux-arm-kernel, Antoine Tenart, thomas.petazzoni,
	gregory.clement, miquel.raynal, nadavh, stefanc, mw
In-Reply-To: <20190211142529.22885-4-maxime.chevallier@bootlin.com>

On Mon, Feb 11, 2019 at 03:25:28PM +0100, Maxime Chevallier wrote:
> Marvell 10G PHY driver has a generic way of initializing the supported
> link modes by reading the PHY's C45 PMA abilities. This can be made
> generic, since these registers are part of the 802.3 specifications.
> 
> This commit extracts the config_init link_mode initialization code from
> marvell10g and uses it to introduce the genphy_c45_pma_read_abilities
> function.
> 
> Only PMA modes are read, it's still up to the caller to set the Pause
> parameters.
> 
> Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

> -	__set_bit(ETHTOOL_LINK_MODE_Pause_BIT, supported);
> -	__set_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, supported);
> +	__set_bit(ETHTOOL_LINK_MODE_Pause_BIT, phydev->supported);
> +	__set_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, phydev->supported);

I think someone already pointed out, this is not ideal. The PHY driver
should only set pause bits, if it needs odd pause settings because of
HW limitations. If no bits are set, the core will set both bits.

But this is not a new problem introduced by this patch, so it can be
fixed later, rather than hold up this patchset.

      Andrew

^ permalink raw reply

* Re: [PATCH net] dsa: mv88e6xxx: Ensure all pending interrupts are handled prior to exit
From: Heiner Kallweit @ 2019-02-12 20:54 UTC (permalink / raw)
  To: Russell King - ARM Linux admin
  Cc: Andrew Lunn, John David Anglin, Vivien Didelot, Florian Fainelli,
	netdev
In-Reply-To: <20190212163017.lwstmgtyw76cwrd7@shell.armlinux.org.uk>

On 12.02.2019 17:30, Russell King - ARM Linux admin wrote:
> On Tue, Feb 12, 2019 at 07:51:05AM +0100, Heiner Kallweit wrote:
>> On 12.02.2019 04:58, Andrew Lunn wrote:
>>> That change means we don't check the PHY device if it caused an
>>> interrupt when its state is less than UP.
>>>
>>> What i'm seeing is that the PHY is interrupting pretty early on after
>>> a reboot when the previous boot had the interface up.
>>>
>> So this means that when going down for reboot the interrupts are not
>> properly masked / disabled? Because (at least for net-next) we enable
>> interrupts in phy_start() only.
> 
[..]
> In looking at this, I came across this chunk of code:
> 
> static inline bool __phy_is_started(struct phy_device *phydev)
> {
>         WARN_ON(!mutex_is_locked(&phydev->lock));
> 
>         return phydev->state >= PHY_UP;
> }
> 
> /**
>  * phy_is_started - Convenience function to check whether PHY is started
>  * @phydev: The phy_device struct
>  */
> static inline bool phy_is_started(struct phy_device *phydev)
> {
>         bool started;
> 
>         mutex_lock(&phydev->lock);
>         started = __phy_is_started(phydev);
>         mutex_unlock(&phydev->lock);
> 
>         return started;
> }
> 
> which looks to me like over-complication.  The mutex locking there is
> completely pointless - what are you trying to achieve with it?
> 
> Let's go through this.  The above is exactly equivalent to:
> 
> bool phy_is_started(phydev)
> {
> 	int state;
> 
> 	mutex_lock(&phydev->lock);
> 	state = phydev->state;
> 	mutex_unlock(&phydev->lock);
> 
> 	return state >= PHY_UP;
> }
> 
> since when we do the test is irrelevant.  Architectures that Linux
> runs on are single-copy atomic, which means that reading phydev->state
> itself is an atomic operation.  So, the mutex locking around that
> doesn't add to the atomicity of the entire operation.
> 
> How, depending on what you do with the rest of this function depends
> whether the entire operation is safe or not.  For example, let's take
> this code at the end of phy_state_machine():
> 
>         if (phy_polling_mode(phydev) && phy_is_started(phydev))
>                 phy_queue_state_machine(phydev, PHY_STATE_TIME);
> 
> state = PHY_UP
> 		thread 0			thread 1
> 						phy_disconnect()
> 						+-phy_is_started()
> 		phy_is_started()                |
> 						`-phy_stop()
> 						  +-phydev->state = PHY_HALTED
> 						  `-phy_stop_machine()
> 						    `-cancel_delayed_work_sync()
> 		phy_queue_state_machine()
> 		`-mod_delayed_work()
> 
> At this point, the phydev->state_queue() has been added back onto the
> system workqueue despite phy_stop_machine() having been called and
> cancel_delayed_work_sync() called on it.
> 
> The original code in 4.20 did not have this race condition.
> 
> Basically, the lock inside phy_is_started() does nothing useful, and
> I'd say is dangerously misleading.
> 
Then idea would be to first remove the locking from phy_is_started()
and in a second step do the following to prevent the described race
(phy_stop() takes phydev->lock too).

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index c1ed03800..69dc64a4d 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -957,8 +957,10 @@ void phy_state_machine(struct work_struct *work)
         * state machine would be pointless and possibly error prone when
         * called from phy_disconnect() synchronously.
         */
+       mutex_lock(&phydev->lock);
        if (phy_polling_mode(phydev) && phy_is_started(phydev))
                phy_queue_state_machine(phydev, PHY_STATE_TIME);
+       mutex_unlock(&phydev->lock);
 }

Heiner

^ permalink raw reply related

* Re: [PATCH] rpc: properly check debugfs dentry before using it
From: Schumaker, Anna @ 2019-02-12 20:52 UTC (permalink / raw)
  To: bfields@fieldses.org, trond.myklebust@hammerspace.com,
	gregkh@linuxfoundation.org, jlayton@kernel.org
  Cc: netdev@vger.kernel.org, linux-nfs@vger.kernel.org,
	dhowells@redhat.com
In-Reply-To: <20190212182734.GA4509@kroah.com>

On Tue, 2019-02-12 at 19:27 +0100, Greg Kroah-Hartman wrote:
> debugfs can now report an error code if something went wrong instead of
> just NULL.  So if the return value is to be used as a "real" dentry, it
> needs to be checked if it is an error before dereferencing it.
> 
> This is now happening because of ff9fb72bc077 ("debugfs: return error
> values, not NULL"), but why debugfs files are not being created properly
> is an older issue, probably one that has always been there and should
> probably be looked at...
> 
> Cc: "J. Bruce Fields" <bfields@fieldses.org>
> Cc: Jeff Layton <jlayton@kernel.org>
> Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
> Cc: Anna Schumaker <anna.schumaker@netapp.com>
> Cc: linux-nfs@vger.kernel.org
> Cc: netdev@vger.kernel.org
> Reported-by: David Howells <dhowells@redhat.com>
> Tested-by: David Howells <dhowells@redhat.com>
> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> 
> ---
>  net/sunrpc/debugfs.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> I can take this through my tree if people don't object, or it can go
> through the NFS tree.  It does need to get merged before 5.0-final
> though.

I'm planning another bugfixes pull for 5.0, so I can take this patch and send it
with the others this week.

Thanks!
Anna

> 
> I also have a "larger" debugfs cleanup patch for this file, but that's
> not really 5.0-final material and I will send it out later.
> 
> thanks,
> 
> greg k-h
> 
> diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c
> index 45a033329cd4..19bb356230ed 100644
> --- a/net/sunrpc/debugfs.c
> +++ b/net/sunrpc/debugfs.c
> @@ -146,7 +146,7 @@ rpc_clnt_debugfs_register(struct rpc_clnt *clnt)
>         rcu_read_lock();
>         xprt = rcu_dereference(clnt->cl_xprt);
>         /* no "debugfs" dentry? Don't bother with the symlink. */
> -       if (!xprt->debugfs) {
> +       if (IS_ERR_OR_NULL(xprt->debugfs)) {
>                 rcu_read_unlock();
>                 return;
>         }

^ permalink raw reply

* Re: [PATCH] net: stmmac: Add SMC support for EMAC System Manager register
From: Thor Thayer @ 2019-02-12 20:52 UTC (permalink / raw)
  To: Ooi, Joyce, Giuseppe Cavallaro, Alexandre Torgue, Jose Abreu,
	David S. Miller, Maxime Coquelin
  Cc: netdev, linux-stm32, linux-arm-kernel, linux-kernel,
	See Chin Liang
In-Reply-To: <1549988692-4124-1-git-send-email-joyce.ooi@intel.com>

Hi Joyce,

On 2/12/19 10:24 AM, Ooi, Joyce wrote:
> As there is restriction to access to EMAC System Manager registers in
> the kernel for Intel Stratix10, the use of SMC calls are required and
> added in dwmac-socfpga driver.
> 
> Signed-off-by: Ooi, Joyce <joyce.ooi@intel.com>

<snip>

I have a pending patchset[1] that addresses this but can be used by 
other drivers as well.

[1] https://patchwork.kernel.org/cover/10612891/


^ permalink raw reply

* Re: [PATCH bpf-next 4/4] selftests/bpf: Test static data relocation
From: Joe Stringer @ 2019-02-12 20:43 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: Joe Stringer, bpf, netdev, Daniel Borkmann, ast
In-Reply-To: <20190212050113.qkryxb34vpnst6w6@ast-mbp>

On Mon, Feb 11, 2019 at 9:01 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Feb 11, 2019 at 04:47:29PM -0800, Joe Stringer wrote:
> > Add tests for libbpf relocation of static variable references into the
> > .data and .bss sections of the ELF.
> >
> > Signed-off-by: Joe Stringer <joe@wand.net.nz>
> ...
> > +#define __fetch(x) (__u32)(&(x))
> > +
> > +static __u32 static_bss = 0; /* Reloc reference to .bss section */
> > +static __u32 static_data = 42;       /* Reloc reference to .data section */
> > +
> > +/**
> > + * Load a u32 value from a static variable into a map, for the userland test
> > + * program to validate.
> > + */
> > +SEC("static_data_load")
> > +int load_static_data(struct __sk_buff *skb)
> > +{
> > +     __u32 key, value;
> > +
> > +     key = 0;
> > +     value = __fetch(static_bss);
>
> If we proceed with this approach we will not be able to add support
> for normal 'value = static_bss;' C code in the future.

Hmm, completely agree that breaking future use of standard code is a
non-starter.

Digging around a bit more, I think I could drop the .bss patch/code
here and still end up with something that will work for my use case.
Just need to ensure that all template values are non-zero when run
through the compiler.

> Let's figure out the way to do it right from the start.
> Support for global and static variables is must have feature to add asap,
> but let's not cut the corner like this.
> We did such hacks in the past and every time it came back to bite us.

Do you see any value in having incremental support in libbpf that
could be used as a fallback for older kernels like in patch #2 of this
series? I could imagine libbpf probing kernel support for
global/static variables and attempting to handle references to .data
via some more comprehensive mechanism in-kernel, or falling back to
this approach if it is not available.

^ permalink raw reply

* [PATCH net] net: fix possible overflow in __sk_mem_raise_allocated()
From: Eric Dumazet @ 2019-02-12 20:26 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Eric Dumazet

With many active TCP sockets, fat TCP sockets could fool
__sk_mem_raise_allocated() thanks to an overflow.

They would increase their share of the memory, instead
of decreasing it.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/net/sock.h | 2 +-
 net/core/sock.c    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 2b229f7be8ebbc160706012f7ed03db85c5689d0..f43f935cb113b73c6fc0df35b5f43103ba131ab2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1277,7 +1277,7 @@ static inline void sk_sockets_allocated_inc(struct sock *sk)
 	percpu_counter_inc(sk->sk_prot->sockets_allocated);
 }
 
-static inline int
+static inline u64
 sk_sockets_allocated_read_positive(struct sock *sk)
 {
 	return percpu_counter_read_positive(sk->sk_prot->sockets_allocated);
diff --git a/net/core/sock.c b/net/core/sock.c
index 6aa2e7e0b4fbdbc29d43d6b61a53b8de2a7ba269..bc3512f230a304c97c519a82c69d1e86f115b651 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2380,7 +2380,7 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
 	}
 
 	if (sk_has_memory_pressure(sk)) {
-		int alloc;
+		u64 alloc;
 
 		if (!sk_under_memory_pressure(sk))
 			return 1;
-- 
2.20.1.791.gb4d0f1c61a-goog


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox