Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next] fib: cleanups
From: Eric Dumazet @ 2010-10-05  6:00 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

Code style cleanups before upcoming functional changes.
C99 initializer for fib_props array.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/ipv4/fib_frontend.c  |  121 +++++++++--------
 net/ipv4/fib_rules.c     |   10 -
 net/ipv4/fib_semantics.c |  257 +++++++++++++++++++------------------
 3 files changed, 206 insertions(+), 182 deletions(-)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 4a69a95..b05c23b 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -225,30 +225,33 @@ EXPORT_SYMBOL(inet_addr_type);
 unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
 				__be32 addr)
 {
-       return __inet_dev_addr_type(net, dev, addr);
+	return __inet_dev_addr_type(net, dev, addr);
 }
 EXPORT_SYMBOL(inet_dev_addr_type);
 
 /* Given (packet source, input interface) and optional (dst, oif, tos):
-   - (main) check, that source is valid i.e. not broadcast or our local
-     address.
-   - figure out what "logical" interface this packet arrived
-     and calculate "specific destination" address.
-   - check, that packet arrived from expected physical interface.
+ * - (main) check, that source is valid i.e. not broadcast or our local
+ *   address.
+ * - figure out what "logical" interface this packet arrived
+ *   and calculate "specific destination" address.
+ * - check, that packet arrived from expected physical interface.
  */
-
 int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 			struct net_device *dev, __be32 *spec_dst,
 			u32 *itag, u32 mark)
 {
 	struct in_device *in_dev;
-	struct flowi fl = { .nl_u = { .ip4_u =
-				      { .daddr = src,
-					.saddr = dst,
-					.tos = tos } },
-			    .mark = mark,
-			    .iif = oif };
-
+	struct flowi fl = {
+		.nl_u = {
+			.ip4_u = {
+				.daddr = src,
+				.saddr = dst,
+				.tos = tos
+			}
+		},
+		.mark = mark,
+		.iif = oif
+	};
 	struct fib_result res;
 	int no_addr, rpf, accept_local;
 	bool dev_match;
@@ -477,9 +480,9 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
 }
 
 /*
- *	Handle IP routing ioctl calls. These are used to manipulate the routing tables
+ * Handle IP routing ioctl calls.
+ * These are used to manipulate the routing tables
  */
-
 int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 {
 	struct fib_config cfg;
@@ -523,7 +526,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	return -EINVAL;
 }
 
-const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = {
+const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
 	[RTA_DST]		= { .type = NLA_U32 },
 	[RTA_SRC]		= { .type = NLA_U32 },
 	[RTA_IIF]		= { .type = NLA_U32 },
@@ -537,7 +540,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = {
 };
 
 static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
-			    struct nlmsghdr *nlh, struct fib_config *cfg)
+			     struct nlmsghdr *nlh, struct fib_config *cfg)
 {
 	struct nlattr *attr;
 	int err, remaining;
@@ -692,12 +695,11 @@ out:
 }
 
 /* Prepare and feed intra-kernel routing request.
-   Really, it should be netlink message, but :-( netlink
-   can be not configured, so that we feed it directly
-   to fib engine. It is legal, because all events occur
-   only when netlink is already locked.
+ * Really, it should be netlink message, but :-( netlink
+ * can be not configured, so that we feed it directly
+ * to fib engine. It is legal, because all events occur
+ * only when netlink is already locked.
  */
-
 static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
 {
 	struct net *net = dev_net(ifa->ifa_dev->dev);
@@ -743,9 +745,9 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
 	struct in_ifaddr *prim = ifa;
 	__be32 mask = ifa->ifa_mask;
 	__be32 addr = ifa->ifa_local;
-	__be32 prefix = ifa->ifa_address&mask;
+	__be32 prefix = ifa->ifa_address & mask;
 
-	if (ifa->ifa_flags&IFA_F_SECONDARY) {
+	if (ifa->ifa_flags & IFA_F_SECONDARY) {
 		prim = inet_ifa_byprefix(in_dev, prefix, mask);
 		if (prim == NULL) {
 			printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n");
@@ -755,22 +757,24 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
 
 	fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
 
-	if (!(dev->flags&IFF_UP))
+	if (!(dev->flags & IFF_UP))
 		return;
 
 	/* Add broadcast address, if it is explicitly assigned. */
 	if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
 		fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
 
-	if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) &&
+	if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
 	    (prefix != addr || ifa->ifa_prefixlen < 32)) {
-		fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
-			  RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim);
+		fib_magic(RTM_NEWROUTE,
+			  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+			  prefix, ifa->ifa_prefixlen, prim);
 
 		/* Add network specific broadcasts, when it takes a sense */
 		if (ifa->ifa_prefixlen < 31) {
 			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
-			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim);
+			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
+				  32, prim);
 		}
 	}
 }
@@ -781,17 +785,18 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
 	struct net_device *dev = in_dev->dev;
 	struct in_ifaddr *ifa1;
 	struct in_ifaddr *prim = ifa;
-	__be32 brd = ifa->ifa_address|~ifa->ifa_mask;
-	__be32 any = ifa->ifa_address&ifa->ifa_mask;
+	__be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
+	__be32 any = ifa->ifa_address & ifa->ifa_mask;
 #define LOCAL_OK	1
 #define BRD_OK		2
 #define BRD0_OK		4
 #define BRD1_OK		8
 	unsigned ok = 0;
 
-	if (!(ifa->ifa_flags&IFA_F_SECONDARY))
-		fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
-			  RTN_UNICAST, any, ifa->ifa_prefixlen, prim);
+	if (!(ifa->ifa_flags & IFA_F_SECONDARY))
+		fib_magic(RTM_DELROUTE,
+			  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+			  any, ifa->ifa_prefixlen, prim);
 	else {
 		prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
 		if (prim == NULL) {
@@ -801,9 +806,9 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
 	}
 
 	/* Deletion is more complicated than add.
-	   We should take care of not to delete too much :-)
-
-	   Scan address list to be sure that addresses are really gone.
+	 * We should take care of not to delete too much :-)
+	 *
+	 * Scan address list to be sure that addresses are really gone.
 	 */
 
 	for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
@@ -817,23 +822,23 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
 			ok |= BRD0_OK;
 	}
 
-	if (!(ok&BRD_OK))
+	if (!(ok & BRD_OK))
 		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
-	if (!(ok&BRD1_OK))
+	if (!(ok & BRD1_OK))
 		fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
-	if (!(ok&BRD0_OK))
+	if (!(ok & BRD0_OK))
 		fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
-	if (!(ok&LOCAL_OK)) {
+	if (!(ok & LOCAL_OK)) {
 		fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
 
 		/* Check, that this local address finally disappeared. */
 		if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
 			/* And the last, but not the least thing.
-			   We must flush stray FIB entries.
-
-			   First of all, we scan fib_info list searching
-			   for stray nexthop entries, then ignite fib_flush.
-			*/
+			 * We must flush stray FIB entries.
+			 *
+			 * First of all, we scan fib_info list searching
+			 * for stray nexthop entries, then ignite fib_flush.
+			 */
 			if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
 				fib_flush(dev_net(dev));
 		}
@@ -844,14 +849,20 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
 #undef BRD1_OK
 }
 
-static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
+static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
 {
 
 	struct fib_result       res;
-	struct flowi            fl = { .mark = frn->fl_mark,
-				       .nl_u = { .ip4_u = { .daddr = frn->fl_addr,
-							    .tos = frn->fl_tos,
-							    .scope = frn->fl_scope } } };
+	struct flowi            fl = {
+		.mark = frn->fl_mark,
+		.nl_u = {
+			.ip4_u = {
+				.daddr = frn->fl_addr,
+				.tos = frn->fl_tos,
+				.scope = frn->fl_scope
+			}
+		}
+	};
 
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 	res.r = NULL;
@@ -899,8 +910,8 @@ static void nl_fib_input(struct sk_buff *skb)
 
 	nl_fib_lookup(frn, tb);
 
-	pid = NETLINK_CB(skb).pid;       /* pid of sending process */
-	NETLINK_CB(skb).pid = 0;         /* from kernel */
+	pid = NETLINK_CB(skb).pid;      /* pid of sending process */
+	NETLINK_CB(skb).pid = 0;        /* from kernel */
 	NETLINK_CB(skb).dst_group = 0;  /* unicast */
 	netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
 }
@@ -947,7 +958,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
 		fib_del_ifaddr(ifa);
 		if (ifa->ifa_dev->ifa_list == NULL) {
 			/* Last address was deleted from this interface.
-			   Disable IP.
+			 * Disable IP.
 			 */
 			fib_disable_ip(dev, 1, 0);
 		} else {
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 76daeb5..3230052 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -6,7 +6,7 @@
  *		IPv4 Forwarding Information Base: policy rules.
  *
  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- * 		Thomas Graf <tgraf@suug.ch>
+ *		Thomas Graf <tgraf@suug.ch>
  *
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
@@ -14,7 +14,7 @@
  *		2 of the License, or (at your option) any later version.
  *
  * Fixes:
- * 		Rani Assaf	:	local_rule cannot be deleted
+ *		Rani Assaf	:	local_rule cannot be deleted
  *		Marc Boucher	:	routing by fwmark
  */
 
@@ -32,8 +32,7 @@
 #include <net/ip_fib.h>
 #include <net/fib_rules.h>
 
-struct fib4_rule
-{
+struct fib4_rule {
 	struct fib_rule		common;
 	u8			dst_len;
 	u8			src_len;
@@ -91,7 +90,8 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
 		goto errout;
 	}
 
-	if ((tbl = fib_get_table(rule->fr_net, rule->table)) == NULL)
+	tbl = fib_get_table(rule->fr_net, rule->table);
+	if (!tbl)
 		goto errout;
 
 	err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 20f09c5..ba52f39 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -60,21 +60,30 @@ static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
 
 static DEFINE_SPINLOCK(fib_multipath_lock);
 
-#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
-for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
-
-#define change_nexthops(fi) { int nhsel; struct fib_nh *nexthop_nh; \
-for (nhsel=0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nexthop_nh++, nhsel++)
+#define for_nexthops(fi) {						\
+	int nhsel; const struct fib_nh *nh;				\
+	for (nhsel = 0, nh = (fi)->fib_nh;				\
+	     nhsel < (fi)->fib_nhs;					\
+	     nh++, nhsel++)
+
+#define change_nexthops(fi) {						\
+	int nhsel; struct fib_nh *nexthop_nh;				\
+	for (nhsel = 0,	nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
+	     nhsel < (fi)->fib_nhs;					\
+	     nexthop_nh++, nhsel++)
 
 #else /* CONFIG_IP_ROUTE_MULTIPATH */
 
 /* Hope, that gcc will optimize it to get rid of dummy loop */
 
-#define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \
-for (nhsel=0; nhsel < 1; nhsel++)
+#define for_nexthops(fi) {						\
+	int nhsel; const struct fib_nh *nh = (fi)->fib_nh;		\
+	for (nhsel = 0; nhsel < 1; nhsel++)
 
-#define change_nexthops(fi) { int nhsel = 0; struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
-for (nhsel=0; nhsel < 1; nhsel++)
+#define change_nexthops(fi) {						\
+	int nhsel;							\
+	struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
+	for (nhsel = 0; nhsel < 1; nhsel++)
 
 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
 
@@ -86,54 +95,54 @@ static const struct
 	int	error;
 	u8	scope;
 } fib_props[RTN_MAX + 1] = {
-	{
+	[RTN_UNSPEC] = {
 		.error	= 0,
 		.scope	= RT_SCOPE_NOWHERE,
-	},	/* RTN_UNSPEC */
-	{
+	},
+	[RTN_UNICAST] = {
 		.error	= 0,
 		.scope	= RT_SCOPE_UNIVERSE,
-	},	/* RTN_UNICAST */
-	{
+	},
+	[RTN_LOCAL] = {
 		.error	= 0,
 		.scope	= RT_SCOPE_HOST,
-	},	/* RTN_LOCAL */
-	{
+	},
+	[RTN_BROADCAST] = {
 		.error	= 0,
 		.scope	= RT_SCOPE_LINK,
-	},	/* RTN_BROADCAST */
-	{
+	},
+	[RTN_ANYCAST] = {
 		.error	= 0,
 		.scope	= RT_SCOPE_LINK,
-	},	/* RTN_ANYCAST */
-	{
+	},
+	[RTN_MULTICAST] = {
 		.error	= 0,
 		.scope	= RT_SCOPE_UNIVERSE,
-	},	/* RTN_MULTICAST */
-	{
+	},
+	[RTN_BLACKHOLE] = {
 		.error	= -EINVAL,
 		.scope	= RT_SCOPE_UNIVERSE,
-	},	/* RTN_BLACKHOLE */
-	{
+	},
+	[RTN_UNREACHABLE] = {
 		.error	= -EHOSTUNREACH,
 		.scope	= RT_SCOPE_UNIVERSE,
-	},	/* RTN_UNREACHABLE */
-	{
+	},
+	[RTN_PROHIBIT] = {
 		.error	= -EACCES,
 		.scope	= RT_SCOPE_UNIVERSE,
-	},	/* RTN_PROHIBIT */
-	{
+	},
+	[RTN_THROW] = {
 		.error	= -EAGAIN,
 		.scope	= RT_SCOPE_UNIVERSE,
-	},	/* RTN_THROW */
-	{
+	},
+	[RTN_NAT] = {
 		.error	= -EINVAL,
 		.scope	= RT_SCOPE_NOWHERE,
-	},	/* RTN_NAT */
-	{
+	},
+	[RTN_XRESOLVE] = {
 		.error	= -EINVAL,
 		.scope	= RT_SCOPE_NOWHERE,
-	},	/* RTN_XRESOLVE */
+	},
 };
 
 
@@ -142,7 +151,7 @@ static const struct
 void free_fib_info(struct fib_info *fi)
 {
 	if (fi->fib_dead == 0) {
-		printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
+		pr_warning("Freeing alive fib_info %p\n", fi);
 		return;
 	}
 	change_nexthops(fi) {
@@ -173,7 +182,7 @@ void fib_release_info(struct fib_info *fi)
 	spin_unlock_bh(&fib_info_lock);
 }
 
-static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
+static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
 {
 	const struct fib_nh *onh = ofi->fib_nh;
 
@@ -187,7 +196,7 @@ static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *
 #ifdef CONFIG_NET_CLS_ROUTE
 		    nh->nh_tclassid != onh->nh_tclassid ||
 #endif
-		    ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
+		    ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
 			return -1;
 		onh++;
 	} endfor_nexthops(fi);
@@ -238,7 +247,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
 		    nfi->fib_priority == fi->fib_priority &&
 		    memcmp(nfi->fib_metrics, fi->fib_metrics,
 			   sizeof(fi->fib_metrics)) == 0 &&
-		    ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
+		    ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
 		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
 			return fi;
 	}
@@ -247,9 +256,8 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
 }
 
 /* Check, that the gateway is already configured.
-   Used only by redirect accept routine.
+ * Used only by redirect accept routine.
  */
-
 int ip_fib_check_default(__be32 gw, struct net_device *dev)
 {
 	struct hlist_head *head;
@@ -264,7 +272,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev)
 	hlist_for_each_entry(nh, node, head, nh_hash) {
 		if (nh->nh_dev == dev &&
 		    nh->nh_gw == gw &&
-		    !(nh->nh_flags&RTNH_F_DEAD)) {
+		    !(nh->nh_flags & RTNH_F_DEAD)) {
 			spin_unlock(&fib_info_lock);
 			return 0;
 		}
@@ -362,10 +370,10 @@ int fib_detect_death(struct fib_info *fi, int order,
 	}
 	if (state == NUD_REACHABLE)
 		return 0;
-	if ((state&NUD_VALID) && order != dflt)
+	if ((state & NUD_VALID) && order != dflt)
 		return 0;
-	if ((state&NUD_VALID) ||
-	    (*last_idx<0 && order > dflt)) {
+	if ((state & NUD_VALID) ||
+	    (*last_idx < 0 && order > dflt)) {
 		*last_resort = fi;
 		*last_idx = order;
 	}
@@ -476,69 +484,69 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
 
 
 /*
-   Picture
-   -------
-
-   Semantics of nexthop is very messy by historical reasons.
-   We have to take into account, that:
-   a) gateway can be actually local interface address,
-      so that gatewayed route is direct.
-   b) gateway must be on-link address, possibly
-      described not by an ifaddr, but also by a direct route.
-   c) If both gateway and interface are specified, they should not
-      contradict.
-   d) If we use tunnel routes, gateway could be not on-link.
-
-   Attempt to reconcile all of these (alas, self-contradictory) conditions
-   results in pretty ugly and hairy code with obscure logic.
-
-   I chose to generalized it instead, so that the size
-   of code does not increase practically, but it becomes
-   much more general.
-   Every prefix is assigned a "scope" value: "host" is local address,
-   "link" is direct route,
-   [ ... "site" ... "interior" ... ]
-   and "universe" is true gateway route with global meaning.
-
-   Every prefix refers to a set of "nexthop"s (gw, oif),
-   where gw must have narrower scope. This recursion stops
-   when gw has LOCAL scope or if "nexthop" is declared ONLINK,
-   which means that gw is forced to be on link.
-
-   Code is still hairy, but now it is apparently logically
-   consistent and very flexible. F.e. as by-product it allows
-   to co-exists in peace independent exterior and interior
-   routing processes.
-
-   Normally it looks as following.
-
-   {universe prefix}  -> (gw, oif) [scope link]
-			  |
-			  |-> {link prefix} -> (gw, oif) [scope local]
-						|
-						|-> {local prefix} (terminal node)
+ * Picture
+ * -------
+ *
+ * Semantics of nexthop is very messy by historical reasons.
+ * We have to take into account, that:
+ * a) gateway can be actually local interface address,
+ *    so that gatewayed route is direct.
+ * b) gateway must be on-link address, possibly
+ *    described not by an ifaddr, but also by a direct route.
+ * c) If both gateway and interface are specified, they should not
+ *    contradict.
+ * d) If we use tunnel routes, gateway could be not on-link.
+ *
+ * Attempt to reconcile all of these (alas, self-contradictory) conditions
+ * results in pretty ugly and hairy code with obscure logic.
+ *
+ * I chose to generalized it instead, so that the size
+ * of code does not increase practically, but it becomes
+ * much more general.
+ * Every prefix is assigned a "scope" value: "host" is local address,
+ * "link" is direct route,
+ * [ ... "site" ... "interior" ... ]
+ * and "universe" is true gateway route with global meaning.
+ *
+ * Every prefix refers to a set of "nexthop"s (gw, oif),
+ * where gw must have narrower scope. This recursion stops
+ * when gw has LOCAL scope or if "nexthop" is declared ONLINK,
+ * which means that gw is forced to be on link.
+ *
+ * Code is still hairy, but now it is apparently logically
+ * consistent and very flexible. F.e. as by-product it allows
+ * to co-exists in peace independent exterior and interior
+ * routing processes.
+ *
+ * Normally it looks as following.
+ *
+ * {universe prefix}  -> (gw, oif) [scope link]
+ *		  |
+ *		  |-> {link prefix} -> (gw, oif) [scope local]
+ *					|
+ *					|-> {local prefix} (terminal node)
  */
-
 static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
 			struct fib_nh *nh)
 {
 	int err;
 	struct net *net;
+	struct net_device *dev;
 
 	net = cfg->fc_nlinfo.nl_net;
 	if (nh->nh_gw) {
 		struct fib_result res;
 
-		if (nh->nh_flags&RTNH_F_ONLINK) {
-			struct net_device *dev;
+		if (nh->nh_flags & RTNH_F_ONLINK) {
 
 			if (cfg->fc_scope >= RT_SCOPE_LINK)
 				return -EINVAL;
 			if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
 				return -EINVAL;
-			if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
+			dev = __dev_get_by_index(net, nh->nh_oif);
+			if (!dev)
 				return -ENODEV;
-			if (!(dev->flags&IFF_UP))
+			if (!(dev->flags & IFF_UP))
 				return -ENETDOWN;
 			nh->nh_dev = dev;
 			dev_hold(dev);
@@ -559,7 +567,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
 			/* It is not necessary, but requires a bit of thinking */
 			if (fl.fl4_scope < RT_SCOPE_LINK)
 				fl.fl4_scope = RT_SCOPE_LINK;
-			if ((err = fib_lookup(net, &fl, &res)) != 0)
+			err = fib_lookup(net, &fl, &res);
+			if (err)
 				return err;
 		}
 		err = -EINVAL;
@@ -567,11 +576,12 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
 			goto out;
 		nh->nh_scope = res.scope;
 		nh->nh_oif = FIB_RES_OIF(res);
-		if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
+		nh->nh_dev = dev = FIB_RES_DEV(res);
+		if (!dev)
 			goto out;
-		dev_hold(nh->nh_dev);
+		dev_hold(dev);
 		err = -ENETDOWN;
-		if (!(nh->nh_dev->flags & IFF_UP))
+		if (!(dev->flags & IFF_UP))
 			goto out;
 		err = 0;
 out:
@@ -580,13 +590,13 @@ out:
 	} else {
 		struct in_device *in_dev;
 
-		if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
+		if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
 			return -EINVAL;
 
 		in_dev = inetdev_by_index(net, nh->nh_oif);
 		if (in_dev == NULL)
 			return -ENODEV;
-		if (!(in_dev->dev->flags&IFF_UP)) {
+		if (!(in_dev->dev->flags & IFF_UP)) {
 			in_dev_put(in_dev);
 			return -ENETDOWN;
 		}
@@ -602,7 +612,9 @@ static inline unsigned int fib_laddr_hashfn(__be32 val)
 {
 	unsigned int mask = (fib_hash_size - 1);
 
-	return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
+	return ((__force u32)val ^
+		((__force u32)val >> 7) ^
+		((__force u32)val >> 14)) & mask;
 }
 
 static struct hlist_head *fib_hash_alloc(int bytes)
@@ -611,7 +623,8 @@ static struct hlist_head *fib_hash_alloc(int bytes)
 		return kzalloc(bytes, GFP_KERNEL);
 	else
 		return (struct hlist_head *)
-			__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
+			__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+					 get_order(bytes));
 }
 
 static void fib_hash_free(struct hlist_head *hash, int bytes)
@@ -806,7 +819,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 			goto failure;
 	} else {
 		change_nexthops(fi) {
-			if ((err = fib_check_nh(cfg, fi, nexthop_nh)) != 0)
+			err = fib_check_nh(cfg, fi, nexthop_nh);
+			if (err != 0)
 				goto failure;
 		} endfor_nexthops(fi)
 	}
@@ -819,7 +833,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 	}
 
 link_it:
-	if ((ofi = fib_find_info(fi)) != NULL) {
+	ofi = fib_find_info(fi);
+	if (ofi) {
 		fi->fib_dead = 1;
 		free_fib_info(fi);
 		ofi->fib_treeref++;
@@ -895,7 +910,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
 			case RTN_ANYCAST:
 			case RTN_MULTICAST:
 				for_nexthops(fi) {
-					if (nh->nh_flags&RTNH_F_DEAD)
+					if (nh->nh_flags & RTNH_F_DEAD)
 						continue;
 					if (!flp->oif || flp->oif == nh->nh_oif)
 						break;
@@ -906,16 +921,15 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
 					goto out_fill_res;
 				}
 #else
-				if (nhsel < 1) {
+				if (nhsel < 1)
 					goto out_fill_res;
-				}
 #endif
 				endfor_nexthops(fi);
 				continue;
 
 			default:
-				printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
-					fa->fa_type);
+				pr_warning("fib_semantic_match bad type %#x\n",
+					   fa->fa_type);
 				return -EINVAL;
 			}
 		}
@@ -1028,10 +1042,10 @@ nla_put_failure:
 }
 
 /*
-   Update FIB if:
-   - local address disappeared -> we must delete all the entries
-     referring to it.
-   - device went down -> we must shutdown all nexthops going via it.
+ * Update FIB if:
+ * - local address disappeared -> we must delete all the entries
+ *   referring to it.
+ * - device went down -> we must shutdown all nexthops going via it.
  */
 int fib_sync_down_addr(struct net *net, __be32 local)
 {
@@ -1078,7 +1092,7 @@ int fib_sync_down_dev(struct net_device *dev, int force)
 		prev_fi = fi;
 		dead = 0;
 		change_nexthops(fi) {
-			if (nexthop_nh->nh_flags&RTNH_F_DEAD)
+			if (nexthop_nh->nh_flags & RTNH_F_DEAD)
 				dead++;
 			else if (nexthop_nh->nh_dev == dev &&
 				 nexthop_nh->nh_scope != scope) {
@@ -1110,10 +1124,9 @@ int fib_sync_down_dev(struct net_device *dev, int force)
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 
 /*
-   Dead device goes up. We wake up dead nexthops.
-   It takes sense only on multipath routes.
+ * Dead device goes up. We wake up dead nexthops.
+ * It takes sense only on multipath routes.
  */
-
 int fib_sync_up(struct net_device *dev)
 {
 	struct fib_info *prev_fi;
@@ -1123,7 +1136,7 @@ int fib_sync_up(struct net_device *dev)
 	struct fib_nh *nh;
 	int ret;
 
-	if (!(dev->flags&IFF_UP))
+	if (!(dev->flags & IFF_UP))
 		return 0;
 
 	prev_fi = NULL;
@@ -1142,12 +1155,12 @@ int fib_sync_up(struct net_device *dev)
 		prev_fi = fi;
 		alive = 0;
 		change_nexthops(fi) {
-			if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) {
+			if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
 				alive++;
 				continue;
 			}
 			if (nexthop_nh->nh_dev == NULL ||
-			    !(nexthop_nh->nh_dev->flags&IFF_UP))
+			    !(nexthop_nh->nh_dev->flags & IFF_UP))
 				continue;
 			if (nexthop_nh->nh_dev != dev ||
 			    !__in_dev_get_rtnl(dev))
@@ -1169,10 +1182,9 @@ int fib_sync_up(struct net_device *dev)
 }
 
 /*
-   The algorithm is suboptimal, but it provides really
-   fair weighted route distribution.
+ * The algorithm is suboptimal, but it provides really
+ * fair weighted route distribution.
  */
-
 void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
 {
 	struct fib_info *fi = res->fi;
@@ -1182,7 +1194,7 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
 	if (fi->fib_power <= 0) {
 		int power = 0;
 		change_nexthops(fi) {
-			if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) {
+			if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
 				power += nexthop_nh->nh_weight;
 				nexthop_nh->nh_power = nexthop_nh->nh_weight;
 			}
@@ -1198,15 +1210,16 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
 
 
 	/* w should be random number [0..fi->fib_power-1],
-	   it is pretty bad approximation.
+	 * it is pretty bad approximation.
 	 */
 
 	w = jiffies % fi->fib_power;
 
 	change_nexthops(fi) {
-		if (!(nexthop_nh->nh_flags&RTNH_F_DEAD) &&
+		if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
 		    nexthop_nh->nh_power) {
-			if ((w -= nexthop_nh->nh_power) <= 0) {
+			w -= nexthop_nh->nh_power;
+			if (w <= 0) {
 				nexthop_nh->nh_power--;
 				fi->fib_power--;
 				res->nh_sel = nhsel;



^ permalink raw reply related

* Re: checkentry function
From: Stephen Hemminger @ 2010-10-05  6:01 UTC (permalink / raw)
  To: Nicola Padovano; +Cc: netfilter-devel, netdev
In-Reply-To: <AANLkTikDi3ziyH9eJbEhjOBwjFVHrtsMzYEJes4=eiJF@mail.gmail.com>

On Tue, 5 Oct 2010 07:52:39 +0200
Nicola Padovano <nicola.padovano@gmail.com> wrote:

> > In current kernels, checkentry returns errno values.
> > 0 = okay
> > <0 is error (example -EINVAL).
> 0 = ok? and then you say 0 is error? which one?
> 

Negative (ie < 0) is used for error numbers. This is confusing
because in older kernels the checkentry returned a bool which
is defined as 1 okay and 0 for error. 

^ permalink raw reply

* Re: checkentry function
From: Nicola Padovano @ 2010-10-05  6:11 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netfilter-devel, netdev
In-Reply-To: <20101005150147.62226282@s6510>

>
> Negative (ie < 0) is used for error numbers. This is confusing
> because in older kernels the checkentry returned a bool which
> is defined as 1 okay and 0 for error.
>
ok i see.

and why i have this output?
DEBUG: the tablename (not FILTER) is: �%H �

I want block my target if the table name is NOT filter...so i write:

[CODE]
...
 if (strcmp(tablename, "filter"))   {
   printk(KERN_INFO "DEBUG: the tablename (not FILTER) is %s\n",tablename);
   return ERROR_VALUE; // < 0
 }
[/CODE]

but in the tablename variable i haven't the table's right value (but i
have: �%H � a wrong value)...what's the problem?

-- 
Nicola Padovano
e-mail: nicola.padovano@gmail.com
web: http://npadovano.altervista.org

"My only ambition is not be anything at all; it seems the most
sensible thing" (C. Bukowski)
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH net-next] fib: fib_rules_cleanup can be static
From: Stephen Hemminger @ 2010-10-05  6:14 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

fib_rules_cleanup_ups is only defined and used in one place.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

--- a/include/net/fib_rules.h	2010-10-02 14:31:52.528341728 +0900
+++ b/include/net/fib_rules.h	2010-10-02 14:31:59.890839969 +0900
@@ -106,7 +106,6 @@ static inline u32 frh_get_table(struct f
 
 extern struct fib_rules_ops *fib_rules_register(const struct fib_rules_ops *, struct net *);
 extern void fib_rules_unregister(struct fib_rules_ops *);
-extern void                     fib_rules_cleanup_ops(struct fib_rules_ops *);
 
 extern int			fib_rules_lookup(struct fib_rules_ops *,
 						 struct flowi *, int flags,
--- a/net/core/fib_rules.c	2010-10-02 14:31:52.578341727 +0900
+++ b/net/core/fib_rules.c	2010-10-02 14:32:09.850842358 +0900
@@ -144,7 +144,7 @@ fib_rules_register(const struct fib_rule
 }
 EXPORT_SYMBOL_GPL(fib_rules_register);
 
-void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
+static void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
 {
 	struct fib_rule *rule, *tmp;
 
@@ -153,7 +153,6 @@ void fib_rules_cleanup_ops(struct fib_ru
 		fib_rule_put(rule);
 	}
 }
-EXPORT_SYMBOL_GPL(fib_rules_cleanup_ops);
 
 static void fib_rules_put_rcu(struct rcu_head *head)
 {

^ permalink raw reply

* [PATCH net-next] ipv6: make __ipv6_isatap_ifid static
From: Stephen Hemminger @ 2010-10-05  6:17 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

Another exported symbol only used in one file

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
 include/net/addrconf.h |    2 --
 net/ipv6/addrconf.c    |    3 +--
 2 files changed, 1 insertion(+), 4 deletions(-)

--- a/include/net/addrconf.h	2010-10-02 09:04:22.700929104 +0900
+++ b/include/net/addrconf.h	2010-10-02 09:11:00.651004530 +0900
@@ -276,8 +276,6 @@ static inline int ipv6_addr_is_ll_all_ro
 		(addr->s6_addr32[3] ^ htonl(0x00000002))) == 0;
 }
 
-extern int __ipv6_isatap_ifid(u8 *eui, __be32 addr);
-
 static inline int ipv6_addr_is_isatap(const struct in6_addr *addr)
 {
 	return (addr->s6_addr32[2] | htonl(0x02000000)) == htonl(0x02005EFE);
--- a/net/ipv6/addrconf.c	2010-10-02 09:04:22.720901336 +0900
+++ b/net/ipv6/addrconf.c	2010-10-02 09:05:09.031021195 +0900
@@ -1544,7 +1544,7 @@ static int addrconf_ifid_infiniband(u8 *
 	return 0;
 }
 
-int __ipv6_isatap_ifid(u8 *eui, __be32 addr)
+static int __ipv6_isatap_ifid(u8 *eui, __be32 addr)
 {
 	if (addr == 0)
 		return -1;
@@ -1560,7 +1560,6 @@ int __ipv6_isatap_ifid(u8 *eui, __be32 a
 	memcpy(eui + 4, &addr, 4);
 	return 0;
 }
-EXPORT_SYMBOL(__ipv6_isatap_ifid);
 
 static int addrconf_ifid_sit(u8 *eui, struct net_device *dev)
 {

^ permalink raw reply

* Re: [PATCH net-next] qlcnic: remove dead code
From: Anirban Chakraborty @ 2010-10-05  6:18 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Amit Salecha, davem@davemloft.net, netdev@vger.kernel.org,
	Ameen Rahman, Sritej Velaga
In-Reply-To: <20101005104430.554c03e6@s6510>


On Oct 4, 2010, at 6:44 PM, Stephen Hemminger wrote:

> This driver has several pieces of dead code (found by running
> make namespacecheck). This patch removes them.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> ---
> Applies after Amit's earlier patches.
> 
> --- a/drivers/net/qlcnic/qlcnic.h	2010-10-05 10:37:07.442332958 +0900
> +++ b/drivers/net/qlcnic/qlcnic.h	2010-10-05 10:38:04.459818979 +0900
> @@ -1323,19 +1323,12 @@ netdev_tx_t qlcnic_xmit_frame(struct sk_
> void qlcnic_process_rcv_ring_diag(struct qlcnic_host_sds_ring *sds_ring);
> 
> /* Management functions */
> -int qlcnic_set_mac_address(struct qlcnic_adapter *, u8*);
> int qlcnic_get_mac_address(struct qlcnic_adapter *, u8*);
> int qlcnic_get_nic_info(struct qlcnic_adapter *, struct qlcnic_info *, u8);
> int qlcnic_set_nic_info(struct qlcnic_adapter *, struct qlcnic_info *);
> int qlcnic_get_pci_info(struct qlcnic_adapter *, struct qlcnic_pci_info*);
> -int qlcnic_reset_partition(struct qlcnic_adapter *, u8);
> 
> /*  eSwitch management functions */
> -int qlcnic_get_eswitch_capabilities(struct qlcnic_adapter *, u8,
> -				struct qlcnic_eswitch *);
> -int qlcnic_get_eswitch_status(struct qlcnic_adapter *, u8,
> -				struct qlcnic_eswitch *);
> -int qlcnic_toggle_eswitch(struct qlcnic_adapter *, u8, u8);
> int qlcnic_config_switch_port(struct qlcnic_adapter *,
> 				struct qlcnic_esw_func_cfg *);
> int qlcnic_get_eswitch_port_config(struct qlcnic_adapter *,
> --- a/drivers/net/qlcnic/qlcnic_ctx.c	2010-10-05 10:37:00.492317319 +0900
> +++ b/drivers/net/qlcnic/qlcnic_ctx.c	2010-10-05 10:38:04.459818979 +0900
> @@ -556,32 +556,6 @@ void qlcnic_free_hw_resources(struct qlc
> 	}
> }
> 
> -/* Set MAC address of a NIC partition */
> -int qlcnic_set_mac_address(struct qlcnic_adapter *adapter, u8* mac)
> -{
> -	int err = 0;
> -	u32 arg1, arg2, arg3;
> -
> -	arg1 = adapter->ahw.pci_func | BIT_9;
> -	arg2 = mac[0] | (mac[1] << 8) | (mac[2] << 16) | (mac[3] << 24);
> -	arg3 = mac[4] | (mac[5] << 16);
> -
> -	err = qlcnic_issue_cmd(adapter,
> -			adapter->ahw.pci_func,
> -			adapter->fw_hal_version,
> -			arg1,
> -			arg2,
> -			arg3,
> -			QLCNIC_CDRP_CMD_MAC_ADDRESS);
> -
> -	if (err != QLCNIC_RCODE_SUCCESS) {
> -		dev_err(&adapter->pdev->dev,
> -			"Failed to set mac address%d\n", err);
> -		err = -EIO;
> -	}
> -
> -	return err;
> -}
> 
> /* Get MAC address of a NIC partition */
> int qlcnic_get_mac_address(struct qlcnic_adapter *adapter, u8 *mac)
> @@ -764,149 +738,6 @@ int qlcnic_get_pci_info(struct qlcnic_ad
> 	return err;
> }
> 
> -/* Reset a NIC partition */
> -
> -int qlcnic_reset_partition(struct qlcnic_adapter *adapter, u8 func_no)
> -{
> -	int err = -EIO;
> -
> -	if (adapter->op_mode != QLCNIC_MGMT_FUNC)
> -		return err;
> -
> -	err = qlcnic_issue_cmd(adapter,
> -			adapter->ahw.pci_func,
> -			adapter->fw_hal_version,
> -			func_no,
> -			0,
> -			0,
> -			QLCNIC_CDRP_CMD_RESET_NPAR);
> -
> -	if (err != QLCNIC_RCODE_SUCCESS) {
> -		dev_err(&adapter->pdev->dev,
> -			"Failed to issue reset partition%d\n", err);
> -		err = -EIO;
> -	}
> -
> -	return err;
> -}
> -
> -/* Get eSwitch Capabilities */
> -int qlcnic_get_eswitch_capabilities(struct qlcnic_adapter *adapter, u8 port,
> -					struct qlcnic_eswitch *eswitch)
> -{
> -	int err = -EIO;
> -	u32 arg1, arg2;
> -
> -	if (adapter->op_mode == QLCNIC_NON_PRIV_FUNC)
> -		return err;
> -
> -	err = qlcnic_issue_cmd(adapter,
> -			adapter->ahw.pci_func,
> -			adapter->fw_hal_version,
> -			port,
> -			0,
> -			0,
> -			QLCNIC_CDRP_CMD_GET_ESWITCH_CAPABILITY);
> -
> -	if (err == QLCNIC_RCODE_SUCCESS) {
> -		arg1 = QLCRD32(adapter, QLCNIC_ARG1_CRB_OFFSET);
> -		arg2 = QLCRD32(adapter, QLCNIC_ARG2_CRB_OFFSET);
> -
> -		eswitch->port = arg1 & 0xf;
> -		eswitch->max_ucast_filters = LSW(arg2);
> -		eswitch->max_active_vlans = MSW(arg2) & 0xfff;
> -		if (arg1 & BIT_6)
> -			eswitch->flags |= QLCNIC_SWITCH_VLAN_FILTERING;
> -		if (arg1 & BIT_7)
> -			eswitch->flags |= QLCNIC_SWITCH_PROMISC_MODE;
> -		if (arg1 & BIT_8)
> -			eswitch->flags |= QLCNIC_SWITCH_PORT_MIRRORING;
> -	} else {
> -		dev_err(&adapter->pdev->dev,
> -			"Failed to get eswitch capabilities%d\n", err);
> -	}
> -
> -	return err;
> -}
> -
> -/* Get current status of eswitch */
> -int qlcnic_get_eswitch_status(struct qlcnic_adapter *adapter, u8 port,
> -				struct qlcnic_eswitch *eswitch)
> -{
> -	int err = -EIO;
> -	u32 arg1, arg2;
> -
> -	if (adapter->op_mode != QLCNIC_MGMT_FUNC)
> -		return err;
> -
> -	err = qlcnic_issue_cmd(adapter,
> -			adapter->ahw.pci_func,
> -			adapter->fw_hal_version,
> -			port,
> -			0,
> -			0,
> -			QLCNIC_CDRP_CMD_GET_ESWITCH_STATUS);
> -
> -	if (err == QLCNIC_RCODE_SUCCESS) {
> -		arg1 = QLCRD32(adapter, QLCNIC_ARG1_CRB_OFFSET);
> -		arg2 = QLCRD32(adapter, QLCNIC_ARG2_CRB_OFFSET);
> -
> -		eswitch->port = arg1 & 0xf;
> -		eswitch->active_vports = LSB(arg2);
> -		eswitch->active_ucast_filters = MSB(arg2);
> -		eswitch->active_vlans = LSB(MSW(arg2));
> -		if (arg1 & BIT_6)
> -			eswitch->flags |= QLCNIC_SWITCH_VLAN_FILTERING;
> -		if (arg1 & BIT_8)
> -			eswitch->flags |= QLCNIC_SWITCH_PORT_MIRRORING;
> -
> -	} else {
> -		dev_err(&adapter->pdev->dev,
> -			"Failed to get eswitch status%d\n", err);
> -	}
> -
> -	return err;
> -}
> -
> -/* Enable/Disable eSwitch */
> -int qlcnic_toggle_eswitch(struct qlcnic_adapter *adapter, u8 id, u8 enable)
> -{
> -	int err = -EIO;
> -	u32 arg1, arg2;
> -	struct qlcnic_eswitch *eswitch;
> -
> -	if (adapter->op_mode != QLCNIC_MGMT_FUNC)
> -		return err;
> -
> -	eswitch = &adapter->eswitch[id];
> -	if (!eswitch)
> -		return err;
> -
> -	arg1 = eswitch->port | (enable ? BIT_4 : 0);
> -	arg2 = eswitch->active_vports | (eswitch->max_ucast_filters << 8) |
> -		(eswitch->max_active_vlans << 16);
> -	err = qlcnic_issue_cmd(adapter,
> -			adapter->ahw.pci_func,
> -			adapter->fw_hal_version,
> -			arg1,
> -			arg2,
> -			0,
> -			QLCNIC_CDRP_CMD_TOGGLE_ESWITCH);
> -
> -	if (err != QLCNIC_RCODE_SUCCESS) {
> -		dev_err(&adapter->pdev->dev,
> -			"Failed to enable eswitch%d\n", eswitch->port);
> -		eswitch->flags &= ~QLCNIC_SWITCH_ENABLE;
> -		err = -EIO;
> -	} else {
> -		eswitch->flags |= QLCNIC_SWITCH_ENABLE;
> -		dev_info(&adapter->pdev->dev,
> -			"Enabled eSwitch for port %d\n", eswitch->port);
> -	}
> -
> -	return err;
> -}
> -
> /* Configure eSwitch for port mirroring */
> int qlcnic_config_port_mirroring(struct qlcnic_adapter *adapter, u8 id,
> 				u8 enable_mirroring, u8 pci_func)
> 

Thanks for doing this.

-Anirban




^ permalink raw reply

* Re: [PATCH net-next] fib: fib_rules_cleanup can be static
From: Eric Dumazet @ 2010-10-05  6:19 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev
In-Reply-To: <20101005151417.57eae0b0@s6510>

Le mardi 05 octobre 2010 à 15:14 +0900, Stephen Hemminger a écrit :
> fib_rules_cleanup_ups is only defined and used in one place.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

Acked-by: Eric Dumazet <eric.dumazet@gmail.com>



^ permalink raw reply

* Re: checkentry function
From: Eric Dumazet @ 2010-10-05  6:23 UTC (permalink / raw)
  To: Nicola Padovano; +Cc: Stephen Hemminger, netfilter-devel, netdev
In-Reply-To: <AANLkTinTd5FjJ-6WeAGwintAS9aHdFYnS7--TLMSDEkn@mail.gmail.com>

Le mardi 05 octobre 2010 à 08:11 +0200, Nicola Padovano a écrit :
> >
> > Negative (ie < 0) is used for error numbers. This is confusing
> > because in older kernels the checkentry returned a bool which
> > is defined as 1 okay and 0 for error.
> >
> ok i see.
> 
> and why i have this output?
> DEBUG: the tablename (not FILTER) is: �%H �
> 
> I want block my target if the table name is NOT filter...so i write:
> 
> [CODE]
> ...
>  if (strcmp(tablename, "filter"))   {
>    printk(KERN_INFO "DEBUG: the tablename (not FILTER) is %s\n",tablename);
>    return ERROR_VALUE; // < 0
>  }
> [/CODE]
> 
> but in the tablename variable i haven't the table's right value (but i
> have: �%H � a wrong value)...what's the problem?
> 
> 

Because xxx_check() signature is not the one you use.

Could you read source code of _current_ existing modules , and use
copy/paste ?

static int hashlimit_mt_check(const struct xt_mtchk_param *par)
{
...
}




^ permalink raw reply

* Re: [PATCH net-next] ipv6: make __ipv6_isatap_ifid static
From: Eric Dumazet @ 2010-10-05  6:25 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev
In-Reply-To: <20101005151753.363121b8@s6510>

Le mardi 05 octobre 2010 à 15:17 +0900, Stephen Hemminger a écrit :
> Another exported symbol only used in one file
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

Acked-by: Eric Dumazet <eric.dumazet@gmail.com>



^ permalink raw reply

* Re: [PATCHv4 net-next-2.6 4/5] XFRM,IPv6: Add IRO remapping hook in xfrm_input()
From: Herbert Xu @ 2010-10-05  6:27 UTC (permalink / raw)
  To: Arnaud Ebalard; +Cc: David S. Miller, Eric Dumazet, Hideaki YOSHIFUJI, netdev
In-Reply-To: <87vd5h7kbh.fsf@small.ssi.corp>

On Mon, Oct 04, 2010 at 10:51:46PM +0200, Arnaud Ebalard wrote:
> 
> Either I don't understand the sentence or this is not feasible: the
> thing is there is nothing in the packet to demultiplex like nh for
> RH2/HAO. Here, we only lookup for a remapping state when there is a
> mismatch in the source/destination addresses expected for the SA.
> 
> That's the reason IRO remapping states only apply to IPsec traffic.

I see.

The thing that bugs me is that you've added an indirect call for
all IPsec traffic when only MIPv6 users would ever need this.

With your remapping, would it be possible to add dummy xfrm_state
objects with the remapped destination address that could then call
xfrm6_input_addr?

That way normal IPsec users would not be affected at all while
preserving your new functionality.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* RE: [PATCH] caif: remove duplicated include
From: Sjur BRENDELAND @ 2010-10-05  6:34 UTC (permalink / raw)
  To: Nicolas Kaiser; +Cc: netdev@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <20101004163539.5e879211@absol.kitzblitz>

Nicolas Kaiser wrote: 
> Remove duplicated include.
> 
> Signed-off-by: Nicolas Kaiser <nikai@nikai.net>

Looks good, thanks. 
Acked-by: Sjur Braendeland <sjur.brandeland@stericsson.com>

^ permalink raw reply

* Fw: [PATCH] iwl3945: queue the right work if the scan needs to be aborted
From: Florian Mickler @ 2010-10-05  6:43 UTC (permalink / raw)
  To: stable-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-wireless-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	Guy, Wey-Yi, Chatre, Reinette, Intel Linux Wireless,
	John W. Linville, Berg, Johannes, Cahill, Ben M,
	netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

commit e7ee762cf074b0fd8eec483d0cef8fdbf0d04b81


Begin forwarded message:

Date: Mon, 27 Sep 2010 13:11:54 -0700
From: "Guy, Wey-Yi" <wey-yi.w.guy-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
To: Florian Mickler <florian-sVu6HhrpSfRAfugRpC6u6w@public.gmane.org>
Cc: "linux-wireless-u79uwXL29TY76Z2rM5mHXA@public.gmane.org" <linux-wireless-u79uwXL29TY76Z2rM5mHXA@public.gmane.org>,
"Chatre, Reinette" <reinette.chatre-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>, Intel Linux Wireless
<ilw-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>, "John W. Linville" <linville-2XuSBdqkA4R54TAoqtyWWQ@public.gmane.org>,
"Berg, Johannes" <johannes.berg-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>, Zhu Yi <yi.zhu-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>,
"Cahill, Ben M" <ben.m.cahill-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>,  "netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org"
<netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org>, "linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org"
<linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org> Subject: Re: [PATCH] iwl3945: queue the
right work if the scan needs to be aborted


On Fri, 2010-09-24 at 09:22 -0700, Florian Mickler wrote:
> iwl3945's scan_completed calls into the mac80211 stack which triggers a
> warn on if there is no scan outstanding.
> 
> This can be avoided by not calling scan_completed but abort_scan in
> iwl3945_request_scan  in the done: branch of the function which is used
> as an error out.
> 
> The done: branch seems to be an error-out branch, as, for example, if
> iwl_is_ready(priv) returns false  the done: branch is executed.
> 
> NOTE:
> I'm not familiar with the driver at all.
> I just quickly scanned as a reaction to
> 
> https://bugzilla.kernel.org/show_bug.cgi?id=17722
> 
> the users of scan_completed in the  iwl3945 driver and noted the odd
> discrepancy between the comment above this instance and the comment in
> mac80211 scan_completed function.
> Signed-off-by: Florian Mickler <florian-sVu6HhrpSfRAfugRpC6u6w@public.gmane.org>
Acked-by: Wey-Yi Guy <wey-yi.w.guy-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
> ---
go into wireless-2.6 and stable only, scan fix already in
wireless-next-2.6

Thanks
Wey

>  drivers/net/wireless/iwlwifi/iwl-agn-lib.c  |    2 +-
>  drivers/net/wireless/iwlwifi/iwl3945-base.c |    2 +-
>  2 files changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/wireless/iwlwifi/iwl-agn-lib.c b/drivers/net/wireless/iwlwifi/iwl-agn-lib.c
> index 9dd9e64..8fd00a6 100644
> --- a/drivers/net/wireless/iwlwifi/iwl-agn-lib.c
> +++ b/drivers/net/wireless/iwlwifi/iwl-agn-lib.c
> @@ -1411,7 +1411,7 @@ void iwlagn_request_scan(struct iwl_priv *priv, struct ieee80211_vif *vif)
>  	clear_bit(STATUS_SCAN_HW, &priv->status);
>  	clear_bit(STATUS_SCANNING, &priv->status);
>  	/* inform mac80211 scan aborted */
> -	queue_work(priv->workqueue, &priv->scan_completed);
> +	queue_work(priv->workqueue, &priv->abort_scan);
>  }
>  
>  int iwlagn_manage_ibss_station(struct iwl_priv *priv,
> diff --git a/drivers/net/wireless/iwlwifi/iwl3945-base.c b/drivers/net/wireless/iwlwifi/iwl3945-base.c
> index 59a308b..d31661c 100644
> --- a/drivers/net/wireless/iwlwifi/iwl3945-base.c
> +++ b/drivers/net/wireless/iwlwifi/iwl3945-base.c
> @@ -3018,7 +3018,7 @@ void iwl3945_request_scan(struct iwl_priv *priv, struct ieee80211_vif *vif)
>  	clear_bit(STATUS_SCANNING, &priv->status);
>  
>  	/* inform mac80211 scan aborted */
> -	queue_work(priv->workqueue, &priv->scan_completed);
> +	queue_work(priv->workqueue, &priv->abort_scan);
>  }
>  
>  static void iwl3945_bg_restart(struct work_struct *data)

--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH] SIW: Kconfig and Makefile
From: Bernard Metzler @ 2010-10-05  6:53 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Bernard Metzler

---
 drivers/infiniband/Kconfig         |    1 +
 drivers/infiniband/Makefile        |    1 +
 drivers/infiniband/hw/siw/Kconfig  |   14 ++++++++++++++
 drivers/infiniband/hw/siw/Makefile |    5 +++++
 4 files changed, 21 insertions(+), 0 deletions(-)
 create mode 100644 drivers/infiniband/hw/siw/Kconfig
 create mode 100644 drivers/infiniband/hw/siw/Makefile

diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index 89d70de..524a31c 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -50,6 +50,7 @@ source "drivers/infiniband/hw/cxgb3/Kconfig"
 source "drivers/infiniband/hw/cxgb4/Kconfig"
 source "drivers/infiniband/hw/mlx4/Kconfig"
 source "drivers/infiniband/hw/nes/Kconfig"
+source "drivers/infiniband/hw/siw/Kconfig"
 
 source "drivers/infiniband/ulp/ipoib/Kconfig"
 
diff --git a/drivers/infiniband/Makefile b/drivers/infiniband/Makefile
index 9cc7a47..c01a0d6 100644
--- a/drivers/infiniband/Makefile
+++ b/drivers/infiniband/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_INFINIBAND_CXGB3)		+= hw/cxgb3/
 obj-$(CONFIG_INFINIBAND_CXGB4)		+= hw/cxgb4/
 obj-$(CONFIG_MLX4_INFINIBAND)		+= hw/mlx4/
 obj-$(CONFIG_INFINIBAND_NES)		+= hw/nes/
+obj-$(CONFIG_INFINIBAND_SOFTIWARP)	+= hw/siw/
 obj-$(CONFIG_INFINIBAND_IPOIB)		+= ulp/ipoib/
 obj-$(CONFIG_INFINIBAND_SRP)		+= ulp/srp/
 obj-$(CONFIG_INFINIBAND_ISER)		+= ulp/iser/
diff --git a/drivers/infiniband/hw/siw/Kconfig b/drivers/infiniband/hw/siw/Kconfig
new file mode 100644
index 0000000..6beff23
--- /dev/null
+++ b/drivers/infiniband/hw/siw/Kconfig
@@ -0,0 +1,14 @@
+config INFINIBAND_SOFTIWARP
+	tristate "Software iWARP Stack (EXPERIMENTAL)"
+	depends on INET && EXPERIMENTAL
+	---help---
+	  Kernel Software Implementation of the iWARP protocol stack
+
+	  This driver implements the iWARP protocol stack in software
+	  and interfaces with in-kernel TCP/IP as well as the OFED
+	  verbs interfaces.
+
+	  Please send feedback to <bmt-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called siw.
diff --git a/drivers/infiniband/hw/siw/Makefile b/drivers/infiniband/hw/siw/Makefile
new file mode 100644
index 0000000..28344b7
--- /dev/null
+++ b/drivers/infiniband/hw/siw/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_INFINIBAND_SOFTIWARP) += siw.o
+
+siw-y := siw_main.o siw_cm.o siw_verbs.o siw_obj.o \
+	siw_qp.o siw_qp_tx.o siw_qp_rx.o siw_cq.o siw_cm.o \
+	siw_debug.o siw_ae.o
-- 
1.5.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH] SIW: iWARP Protocol headers
From: Bernard Metzler @ 2010-10-05  6:53 UTC (permalink / raw)
  To: netdev; +Cc: linux-rdma, Bernard Metzler

---
 drivers/infiniband/hw/siw/iwarp.h |  324 +++++++++++++++++++++++++++++++++++++
 1 files changed, 324 insertions(+), 0 deletions(-)
 create mode 100644 drivers/infiniband/hw/siw/iwarp.h

diff --git a/drivers/infiniband/hw/siw/iwarp.h b/drivers/infiniband/hw/siw/iwarp.h
new file mode 100644
index 0000000..762c1d3
--- /dev/null
+++ b/drivers/infiniband/hw/siw/iwarp.h
@@ -0,0 +1,324 @@
+/*
+ * Software iWARP device driver for Linux
+ *
+ * Authors: Bernard Metzler <bmt@zurich.ibm.com>
+ *          Fredy Neeser <nfd@zurich.ibm.com>
+ *
+ * Copyright (c) 2008-2010, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *   Redistribution and use in source and binary forms, with or
+ *   without modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   - Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *   - Neither the name of IBM nor the names of its contributors may be
+ *     used to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IWARP_H
+#define _IWARP_H
+
+#include <rdma/rdma_user_cm.h>	/* RDMA_MAX_PRIVATE_DATA */
+#include <linux/types.h>
+#include <asm/byteorder.h>
+
+
+#define RDMAP_VERSION		1
+#define DDP_VERSION		1
+#define MPA_REVISION_1		1
+#define MPA_MAX_PRIVDATA	RDMA_MAX_PRIVATE_DATA
+#define MPA_KEY_REQ		"MPA ID Req Frame"
+#define MPA_KEY_REP		"MPA ID Rep Frame"
+
+struct mpa_rr_params {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u16	res:5,
+		r:1,
+		c:1,
+		m:1,
+		rev:8;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	__u16	m:1,
+		c:1,
+		r:1,
+		res:5,
+		rev:8;
+#else
+#error "Adjust your <asm/byteorder.h> defines"
+#endif
+	__u16	pd_len;
+};
+
+/*
+ * MPA request/reply header
+ */
+struct mpa_rr {
+	__u8	key[16];
+	struct mpa_rr_params params;
+};
+
+/*
+ * Don't change the layout/size of this struct!
+ */
+struct mpa_marker {
+	__u16	rsvd;
+	__u16	fpdu_hmd; /* FPDU header-marker distance (= MPA's FPDUPTR) */
+};
+
+#define MPA_MARKER_SPACING	512
+#define MPA_HDR_SIZE		2
+
+/*
+ * MPA marker size:
+ * - Standards-compliant marker insertion: Use sizeof(struct mpa_marker)
+ * - "Invisible markers" for testing sender's marker insertion
+ *   without affecting receiver: Use 0
+ */
+#define MPA_MARKER_SIZE		sizeof(struct mpa_marker)
+
+
+/*
+ * maximum MPA trailer
+ */
+struct mpa_trailer {
+	char	pad[4];
+	__u32	crc;
+};
+
+#define MPA_CRC_SIZE	4
+
+
+/*
+ * Common portion of iWARP headers (MPA, DDP, RDMAP)
+ * for any FPDU
+ */
+struct iwarp_ctrl {
+	__u16	mpa_len;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u16	dv:2,		/* DDP Version */
+		rsvd:4,		/* DDP reserved, MBZ */
+		l:1,		/* DDP Last flag */
+		t:1,		/* DDP Tagged flag */
+		opcode:4,	/* RDMAP opcode */
+		rsv:2,		/* RDMAP reserved, MBZ */
+		rv:2;		/* RDMAP Version, 01 for IETF, 00 for RDMAC */
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	__u16	t:1,		/* DDP Tagged flag */
+		l:1,		/* DDP Last flag */
+		rsvd:4,		/* DDP reserved, MBZ */
+		dv:2,		/* DDP Version */
+		rv:2,		/* RDMAP Version, 01 for IETF, 00 for RDMAC */
+		rsv:2,		/* RDMAP reserved, MBZ */
+		opcode:4;	/* RDMAP opcode */
+#else
+#error "Adjust your <asm/byteorder.h> defines"
+#endif
+};
+
+
+struct rdmap_terminate_ctrl {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u32	etype:4,
+		layer:4,
+		ecode:8,
+		rsvd1:5,
+		r:1,
+		d:1,
+		m:1,
+		rsvd2:8;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	__u32	layer:4,
+		etype:4,
+		ecode:8,
+		m:1,
+		d:1,
+		r:1,
+		rsvd1:5,
+		rsvd2:8;
+#else
+#error "Adjust your <asm/byteorder.h> defines"
+#endif
+};
+
+
+struct iwarp_rdma_write {
+	struct iwarp_ctrl	ctrl;
+	__u32			sink_stag;
+	__u64			sink_to;
+} __attribute__((__packed__));
+
+struct iwarp_rdma_rreq {
+	struct iwarp_ctrl	ctrl;
+	__u32			rsvd;
+	__u32			ddp_qn;
+	__u32			ddp_msn;
+	__u32			ddp_mo;
+	__u32			sink_stag;
+	__u64			sink_to;
+	__u32			read_size;
+	__u32			source_stag;
+	__u64			source_to;
+} __attribute__((__packed__));
+
+struct iwarp_rdma_rresp {
+	struct iwarp_ctrl	ctrl;
+	__u32			sink_stag;
+	__u64			sink_to;
+} __attribute__((__packed__));
+
+struct iwarp_send {
+	struct iwarp_ctrl	ctrl;
+	__u32			rsvd;
+	__u32			ddp_qn;
+	__u32			ddp_msn;
+	__u32			ddp_mo;
+} __attribute__((__packed__));
+
+struct iwarp_send_inv {
+	struct iwarp_ctrl	ctrl;
+	__u32			inval_stag;
+	__u32			ddp_qn;
+	__u32			ddp_msn;
+	__u32			ddp_mo;
+} __attribute__((__packed__));
+
+struct iwarp_terminate {
+	struct iwarp_ctrl	ctrl;
+	__u32				rsvd;
+	__u32				ddp_qn;
+	__u32				ddp_msn;
+	__u32				ddp_mo;
+	struct rdmap_terminate_ctrl	term_ctrl;
+} __attribute__((__packed__));
+
+
+/*
+ * Common portion of iWARP headers (MPA, DDP, RDMAP)
+ * for an FPDU carrying an untagged DDP segment
+ */
+struct iwarp_ctrl_untagged {
+	struct iwarp_ctrl	ctrl;
+	__u32			rsvd;
+	__u32			ddp_qn;
+	__u32			ddp_msn;
+	__u32			ddp_mo;
+} __attribute__((__packed__));
+
+/*
+ * Common portion of iWARP headers (MPA, DDP, RDMAP)
+ * for an FPDU carrying a tagged DDP segment
+ */
+struct iwarp_ctrl_tagged {
+	struct iwarp_ctrl	ctrl;
+	__u32			ddp_stag;
+	__u64			ddp_to;
+} __attribute__((__packed__));
+
+union iwarp_hdrs {
+	struct iwarp_ctrl		ctrl;
+	struct iwarp_ctrl_untagged	c_untagged;
+	struct iwarp_ctrl_tagged	c_tagged;
+	struct iwarp_rdma_write		rwrite;
+	struct iwarp_rdma_rreq		rreq;
+	struct iwarp_rdma_rresp		rresp;
+	struct iwarp_terminate		terminate;
+	struct iwarp_send		send;
+	struct iwarp_send_inv		send_inv;
+};
+
+
+#define MPA_MIN_FRAG ((sizeof(union iwarp_hdrs) + MPA_CRC_SIZE))
+
+enum ddp_etype {
+	DDP_ETYPE_CATASTROPHIC	= 0x0,
+	DDP_ETYPE_TAGGED_BUF	= 0x1,
+	DDP_ETYPE_UNTAGGED_BUF	= 0x2,
+	DDP_ETYPE_RSVD		= 0x3
+};
+
+enum ddp_ecode {
+	DDP_ECODE_CATASTROPHIC		= 0x00,
+	/* Tagged Buffer Errors */
+	DDP_ECODE_T_INVALID_STAG	= 0x00,
+	DDP_ECODE_T_BASE_BOUNDS		= 0x01,
+	DDP_ECODE_T_STAG_NOT_ASSOC	= 0x02,
+	DDP_ECODE_T_TO_WRAP		= 0x03,
+	DDP_ECODE_T_DDP_VERSION		= 0x04,
+	/* Untagged Buffer Errors */
+	DDP_ECODE_UT_INVALID_QN		= 0x01,
+	DDP_ECODE_UT_INVALID_MSN_NOBUF	= 0x02,
+	DDP_ECODE_UT_INVALID_MSN_RANGE	= 0x03,
+	DDP_ECODE_UT_INVALID_MO		= 0x04,
+	DDP_ECODE_UT_MSG_TOOLONG	= 0x05,
+	DDP_ECODE_UT_DDP_VERSION	= 0x06
+};
+
+
+enum rdmap_untagged_qn {
+	RDMAP_UNTAGGED_QN_SEND		= 0,
+	RDMAP_UNTAGGED_QN_RDMA_READ	= 1,
+	RDMAP_UNTAGGED_QN_TERMINATE	= 2,
+	RDMAP_UNTAGGED_QN_COUNT		= 3
+};
+
+enum rdmap_etype {
+	RDMAP_ETYPE_CATASTROPHIC	= 0x0,
+	RDMAP_ETYPE_REMOTE_PROTECTION	= 0x1,
+	RDMAP_ETYPE_REMOTE_OPERATION	= 0x2
+};
+
+enum rdmap_ecode {
+	RDMAP_ECODE_INVALID_STAG	= 0x00,
+	RDMAP_ECODE_BASE_BOUNDS		= 0x01,
+	RDMAP_ECODE_ACCESS_RIGHTS	= 0x02,
+	RDMAP_ECODE_STAG_NOT_ASSOC	= 0x03,
+	RDMAP_ECODE_TO_WRAP		= 0x04,
+	RDMAP_ECODE_RDMAP_VERSION	= 0x05,
+	RDMAP_ECODE_UNEXPECTED_OPCODE	= 0x06,
+	RDMAP_ECODE_CATASTROPHIC_STREAM	= 0x07,
+	RDMAP_ECODE_CATASTROPHIC_GLOBAL	= 0x08,
+	RDMAP_ECODE_STAG_NOT_INVALIDATE	= 0x09,
+	RDMAP_ECODE_UNSPECIFIED		= 0xff
+};
+
+enum rdmap_elayer {
+	RDMAP_ERROR_LAYER_RDMA	= 0x00,
+	RDMAP_ERROR_LAYER_DDP	= 0x01,
+	RDMAP_ERROR_LAYER_LLP	= 0x02	/* eg., MPA */
+};
+
+enum rdma_opcode {
+	RDMAP_RDMA_WRITE	= 0x0,
+	RDMAP_RDMA_READ_REQ	= 0x1,
+	RDMAP_RDMA_READ_RESP	= 0x2,
+	RDMAP_SEND		= 0x3,
+	RDMAP_SEND_INVAL	= 0x4,
+	RDMAP_SEND_SE		= 0x5,
+	RDMAP_SEND_SE_INVAL	= 0x6,
+	RDMAP_TERMINATE		= 0x7,
+	RDMAP_NOT_SUPPORTED	= RDMAP_TERMINATE + 1
+};
+
+#endif
-- 
1.5.4.3


^ permalink raw reply related

* [PATCH] SIW: Main header file
From: Bernard Metzler @ 2010-10-05  6:53 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Bernard Metzler

---
 drivers/infiniband/hw/siw/siw.h |  816 +++++++++++++++++++++++++++++++++++++++
 1 files changed, 816 insertions(+), 0 deletions(-)
 create mode 100644 drivers/infiniband/hw/siw/siw.h

diff --git a/drivers/infiniband/hw/siw/siw.h b/drivers/infiniband/hw/siw/siw.h
new file mode 100644
index 0000000..051eea6
--- /dev/null
+++ b/drivers/infiniband/hw/siw/siw.h
@@ -0,0 +1,816 @@
+/*
+ * Software iWARP device driver for Linux
+ *
+ * Authors: Bernard Metzler <bmt-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>
+ *
+ * Copyright (c) 2008-2010, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *   Redistribution and use in source and binary forms, with or
+ *   without modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   - Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *   - Neither the name of IBM nor the names of its contributors may be
+ *     used to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _SIW_H
+#define _SIW_H
+
+#include <linux/idr.h>
+#include <rdma/ib_verbs.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/fs.h>
+#include <linux/netdevice.h>
+#include <linux/crypto.h>
+#include <linux/resource.h>	/* MLOCK_LIMIT */
+
+#include <rdma/ib_umem.h>	/* struct ib_umem_chunk */
+
+#include "siw_user.h"
+#include "iwarp.h"
+
+enum siw_if_type {
+	SIW_IF_OFED = 0,	/* only via standard ofed syscall if */
+	SIW_IF_MAPPED = 1	/* private qp and cq mapping */
+};
+
+#define DEVICE_ID_SOFTIWARP	0x0815
+#define VERSION_ID_SOFTIWARP	0x0001
+#define SIW_VENDOR_ID		0
+#define SIW_VENDORT_PART_ID	0
+#define SIW_SW_VERSION		1
+#define SIW_MAX_QP		(1024 * 100)
+#define SIW_MAX_QP_WR		(1024 * 32)
+#define SIW_MAX_ORD		128
+#define SIW_MAX_IRD		128
+#define SIW_MAX_SGE		10
+#define SIW_MAX_SGE_RD		1	/* iwarp limitation. we could relax */
+#define SIW_MAX_INLINE		PAGE_SIZE
+#define SIW_MAX_CQ		(1024 * 100)
+#define SIW_MAX_CQE		(SIW_MAX_QP_WR * 100)
+#define SIW_MAX_MR	\
+	(current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT)
+#define SIW_MAX_MR_SIZE		(1024 * 1024 * 1024)
+#define SIW_MAX_PD		SIW_MAX_QP
+#define SIW_MAX_MW		0	/* to be set if MW's are supported */
+#define SIW_MAX_FMR		0
+#define SIW_MAX_SRQ		SIW_MAX_QP
+#define SIW_MAX_SRQ_WR		(SIW_MAX_QP_WR * 10)
+
+#define SENDPAGE_THRESH		256	/* min bytes for using sendpage() */
+#define SOCKBUFSIZE		(PAGE_SIZE * 40)
+#define SQ_USER_MAXBURST	10
+
+#define	SIW_NODE_DESC		"Software iWARP stack"
+
+
+/*
+ * Softiwarp TX/RX configuration options
+ */
+
+#define CONFIG_RDMA_SIW_CRC_ENFORCED	0
+
+
+struct siw_devinfo {
+	unsigned		device;
+	unsigned		version;
+
+	/* close match to ib_device_attr where appropriate */
+	u32			vendor_id;
+	u32			vendor_part_id;
+	u32			sw_version;
+	int			max_qp;
+	int			max_qp_wr;
+	int			max_ord; /* max. outbound read queue depth */
+	int			max_ird; /* max. inbound read queue depth */
+
+	enum ib_device_cap_flags	cap_flags;
+	int			max_sge;
+	int			max_sge_rd;
+	int			max_cq;
+	int			max_cqe;
+	u64			max_mr_size;
+	int			max_mr;
+	int			max_pd;
+	int			max_mw;
+	int			max_fmr;
+	int			max_srq;
+	int			max_srq_wr;
+	int			max_srq_sge;
+	/* end ib_device_attr */
+
+	enum siw_if_type	iftype;
+};
+
+struct siw_dev {
+	struct ib_device	ofa_dev;
+	struct siw_dev		*next;
+	struct net_device	*l2dev;
+	struct siw_devinfo	attrs;
+	/* object management */
+	spinlock_t		idr_lock;
+	struct idr		qp_idr;
+	struct idr		cq_idr;
+	struct idr		pd_idr;
+	struct idr		mem_idr;	/* MRs & MWs */
+	/* active objects statistics */
+	atomic_t		num_qp;
+	atomic_t		num_cq;
+	atomic_t		num_pd;
+	atomic_t		num_mem;
+	atomic_t		num_srq;
+};
+
+struct siw_objhdr {
+	u32			id;	/* for idr based object lookup */
+	struct kref		ref;
+	struct siw_dev 		*dev;
+};
+
+
+struct siw_ucontext {
+	struct ib_ucontext	ib_ucontext;
+};
+
+struct siw_pd {
+	struct siw_objhdr	hdr;
+	struct ib_pd		ofa_pd;
+};
+
+enum siw_access_flags {
+	SR_MEM_LREAD	= (1<<0),
+	SR_MEM_LWRITE	= (1<<1),
+	SR_MEM_RREAD	= (1<<2),
+	SR_MEM_RWRITE	= (1<<3),
+
+	SR_MEM_FLAGS_LOCAL =
+		(SR_MEM_LREAD | SR_MEM_LWRITE),
+	SR_MEM_FLAGS_REMOTE =
+		(SR_MEM_RWRITE | SR_MEM_RREAD)
+};
+
+
+
+#define STAG_VALID 	1
+#define STAG_INVALID	0
+#define SIW_STAG_MAX	0xffffffff
+
+struct siw_mr;
+
+/*
+ * generic memory representation for registered siw memory.
+ * memory lookup always via higher 24 bit of stag (stag index).
+ * the stag is stored as part of the siw object header (id).
+ * object relates to memory window if embedded mr pointer is valid
+ */
+struct siw_mem {
+	struct siw_objhdr	hdr;
+
+	struct siw_mr	*mr;		/* assoc. MR if MW, NULL if MR */
+
+	__u32	stag_state:1,		/* VALID or INVALID */
+		is_zbva:1,		/* zero based virt. addr. */
+		mw_bind_enabled:1,	/* check only if MR */
+		remote_inval_enabled:1,	/* VALID or INVALID */
+		consumer_owns_key:1,	/* key/index split ? */
+		rsvd:27;
+
+	enum siw_access_flags	perms;	/* local/remote READ & WRITE */
+
+	u64	va;		/* VA of memory */
+	u32	len;		/* amount of memory bytes */
+	u32	fbo;		/* first byte offset */
+};
+
+#define SIW_MEM_IS_MW(m)	((m)->mr != NULL)
+#define SIW_INLINED_DATA(w)	((w)->wr.hdr.flags & IB_SEND_INLINE)
+
+/*
+ * MR and MW definition.
+ * Used OFA structs ib_mr/ib_mw holding:
+ * lkey, rkey, MW reference count on MR
+ */
+struct siw_mr {
+	struct ib_mr	ofa_mr;
+	struct siw_mem	mem;
+	struct ib_umem	*umem;
+	struct siw_pd	*pd;
+};
+
+struct siw_mw {
+	struct ib_mw	ofa_mw;
+	struct siw_mem	mem;
+};
+
+/********** WR definitions  ****************/
+
+enum siw_wr_opcode {
+	SIW_WR_RDMA_WRITE		= IB_WR_RDMA_WRITE,
+	SIW_WR_RDMA_WRITE_WITH_IMM	= IB_WR_RDMA_WRITE_WITH_IMM,
+	SIW_WR_SEND			= IB_WR_SEND,
+	SIW_WR_SEND_WITH_IMM		= IB_WR_SEND_WITH_IMM,
+	SIW_WR_RDMA_READ_REQ		= IB_WR_RDMA_READ,
+	SIW_WR_ATOMIC_CMP_AND_SWP	= IB_WR_ATOMIC_CMP_AND_SWP,
+	SIW_WR_ATOMIC_FETCH_AND_ADD	= IB_WR_ATOMIC_FETCH_AND_ADD,
+	SIW_WR_FASTREG			= IB_WR_FAST_REG_MR, /* unsupported */
+	SIW_WR_INVAL_STAG		= IB_WR_LOCAL_INV, /* unsupported */
+	SIW_WR_RECEIVE,
+	SIW_WR_BIND_MW, /* unsupported */
+	SIW_WR_RDMA_READ_RESP,		/* pseudo WQE */
+	SIW_WR_NUM			/* last entry! */
+};
+
+#define SIW_WQE_IS_TX(wqe)	1	/* add BIND/FASTREG/INVAL_STAG */
+
+struct siw_sge {
+	u64		addr;	/* HBO */
+	unsigned int	len;	/* HBO */
+	u32		lkey;	/* HBO */
+	union {
+		struct siw_mem	*obj; /* reference to registered memory */
+		char 		*buf; /* linear kernel buffer */
+	} mem;
+};
+
+struct siw_wr_common {
+	enum siw_wr_opcode	type;
+	enum ib_send_flags	flags;
+	u64			id;
+};
+
+/*
+ * All WRs below having an SGL (with 1 ore more SGEs) must start with
+ * the layout given by struct siw_wr_with_sgl!
+ */
+struct siw_wr_with_sgl {
+	struct siw_wr_common	hdr;
+	int                     num_sge;
+	struct siw_sge		sge[0]; /* Start of source or dest. SGL */
+};
+
+struct siw_wr_send {
+	struct siw_wr_common	hdr;
+	int			num_sge;
+	struct siw_sge		sge[SIW_MAX_SGE];
+};
+
+struct siw_wr_rmda_write {
+	struct	siw_wr_common	hdr;
+	int			num_sge;
+	struct siw_sge		sge[SIW_MAX_SGE];
+	u64			raddr;
+	u32			rtag;
+};
+
+struct siw_wr_rdma_rread {
+	struct	siw_wr_common	hdr;
+	int			num_sge;
+	struct siw_sge		sge[SIW_MAX_SGE];
+	u64			raddr;
+	u32			rtag;
+};
+
+struct siw_wr_rdma_rresp {
+	struct	siw_wr_common	hdr;
+	int			num_sge; /* must be 1 */
+	struct siw_sge		sge;
+	u64			raddr;
+	u32			rtag;  /* uninterpreted, NBO */
+};
+
+struct siw_wr_bind {
+	struct	siw_wr_common	hdr;
+	u32			rtag;
+	u32			ltag;
+	struct siw_mr		*mr;
+	u64			addr;
+	u32			len;
+	enum siw_access_flags	perms;
+};
+
+struct siw_wr_recv {
+	struct	siw_wr_common	hdr;
+	int			num_sge;
+	struct siw_sge		sge[SIW_MAX_SGE];
+};
+
+enum siw_wr_state {
+	SR_WR_QUEUED            = 0,	/* processing has not started yet */
+	SR_WR_INPROGRESS	= 1,	/* initiated processing of the WR */
+	SR_WR_DONE		= 2,
+};
+
+/* better name it siw_qe? */
+struct siw_wqe {
+	struct list_head	list;
+	union {
+		struct siw_wr_common		hdr;
+		struct siw_wr_with_sgl		sgl;
+		struct siw_wr_send		send;
+		struct siw_wr_rmda_write	write;
+		struct siw_wr_rdma_rread	rread;
+		struct siw_wr_rdma_rresp	rresp;
+		struct siw_wr_bind		bind;
+		struct siw_wr_recv		recv;
+	} wr;
+	struct siw_qp		*qp;
+	enum siw_wr_state	wr_status;
+	enum ib_wc_status	wc_status;
+	u32			bytes;		/* # bytes to processed */
+	u32			processed;	/* # bytes sucessfully proc'd */
+	int			error;
+};
+
+enum siw_cq_armed {
+	SIW_CQ_NOTIFY_NOT = 0,
+	SIW_CQ_NOTIFY_SOLICITED,
+	SIW_CQ_NOTIFY_ALL
+};
+
+struct siw_cq {
+	struct ib_cq		ofa_cq;
+	struct siw_objhdr	hdr;
+	enum siw_cq_armed	notify;
+	spinlock_t		lock;
+	struct list_head	queue;		/* simple list of cqe's */
+	atomic_t		qlen;		/* number of elements */
+};
+
+enum siw_qp_state {
+	SIW_QP_STATE_IDLE 	= 0,
+	SIW_QP_STATE_RTR 	= 1,
+	SIW_QP_STATE_RTS 	= 2,
+	SIW_QP_STATE_CLOSING 	= 3,
+	SIW_QP_STATE_TERMINATE 	= 4,
+	SIW_QP_STATE_ERROR 	= 5,
+	SIW_QP_STATE_MORIBUND 	= 6, /* destroy called but still referenced */
+	SIW_QP_STATE_UNDEF 	= 7,
+	SIW_QP_STATE_COUNT 	= 8
+};
+
+enum siw_qp_flags {
+	SIW_RDMA_BIND_ENABLED	= (1 << 0),
+	SIW_RDMA_WRITE_ENABLED	= (1 << 1),
+	SIW_RDMA_READ_ENABLED	= (1 << 2),
+	SIW_TERMINATE_LOCAL	= (1 << 3),
+	SIW_RECVQ_ARMED		= (1 << 4),
+	/*
+	 * QP currently being destroyed
+	 */
+	SIW_QP_IN_DESTROY	= (1 << 8)
+};
+
+enum siw_qp_attr_mask {
+	SIW_QP_ATTR_STATE		= (1 << 0),
+	SIW_QP_ATTR_ACCESS_FLAGS	= (1 << 1),
+	SIW_QP_ATTR_LLP_HANDLE		= (1 << 2),
+	SIW_QP_ATTR_ORD			= (1 << 3),
+	SIW_QP_ATTR_IRD			= (1 << 4),
+	SIW_QP_ATTR_SQ_SIZE		= (1 << 5),
+	SIW_QP_ATTR_RQ_SIZE		= (1 << 6),
+	SIW_QP_ATTR_MPA			= (1 << 7)
+};
+
+struct siw_mpa_attrs {
+	__u8	marker_rcv;
+	__u8	marker_snd;
+	__u8	crc;
+	__u8	version;
+};
+
+struct siw_sk_upcalls {
+	void    (*sk_state_change)(struct sock *sk);
+	void    (*sk_data_ready)(struct sock *sk, int bytes);
+	void    (*sk_write_space)(struct sock *sk);
+	void    (*sk_error_report)(struct sock *sk);
+};
+
+struct siw_sq_work {
+	struct work_struct	work;
+};
+
+struct siw_srq {
+	struct ib_srq		ofa_srq;
+	struct siw_pd		*pd;
+	struct list_head	rq;
+	spinlock_t		lock;
+	u32			max_sge;
+	atomic_t		space;	/* current space for posting wqe's */
+	u32			limit;	/* low watermark for async event */
+	u32			max_wr;	/* max # of wqe's allowed */
+	char			armed;	/* inform user if limit hit */
+};
+
+struct siw_qp_attrs {
+	enum siw_qp_state	state;
+	char                    terminate_buffer[52];
+	u32			terminate_msg_length;
+	u32			ddp_rdmap_version; /* 0 or 1 */
+	char                    *stream_msg_buf;
+	u32			stream_msg_buf_length;
+	u32			rq_hiwat;
+	u32			sq_size;
+	u32			rq_size;
+	u32			sq_max_sges;
+	u32			sq_max_sges_rdmaw;
+	u32			rq_max_sges;
+	u32			ord;
+	u32			ird;
+	struct siw_mpa_attrs	mpa;
+	enum siw_qp_flags	flags;
+
+	struct socket		*llp_stream_handle;
+};
+
+enum siw_tx_ctx {
+	SIW_SEND_HDR = 0,	/* start or continue sending HDR */
+	SIW_SEND_DATA = 1,	/* start or continue sending DDP payload */
+	SIW_SEND_TRAILER = 2,	/* start or continue sending TRAILER */
+	SIW_SEND_SHORT_FPDU = 3 /* send whole FPDU hdr|data|trailer at once */
+};
+
+enum siw_rx_state {
+	SIW_GET_HDR = 0,	/* await new hdr or within hdr */
+	SIW_GET_DATA_START = 1,	/* start of inbound DDP payload */
+	SIW_GET_DATA_MORE = 2,	/* continuation of (misaligned) DDP payload */
+	SIW_GET_TRAILER	= 3	/* await new trailer or within trailer (+pad) */
+};
+
+
+struct siw_iwarp_rx {
+	struct sk_buff		*skb;
+	union iwarp_hdrs	hdr;
+	struct mpa_trailer	trailer;
+	/*
+	 * local destination memory of inbound iwarp operation.
+	 * valid, if already resolved, NULL otherwise.
+	 */
+	union {
+		struct siw_wqe	*wqe; /* SEND, RRESP */
+		struct siw_mem	*mem; /* WRITE */
+	} dest;
+
+	struct hash_desc	mpa_crc_hd;
+	/*
+	 * Next expected DDP MSN for each QN +
+	 * expected steering tag +
+	 * expected DDP tagget offset (all HBO)
+	 */
+	u32			ddp_msn[RDMAP_UNTAGGED_QN_COUNT];
+	u32			ddp_stag;
+	u64			ddp_to;
+
+	/*
+	 * For each FPDU, main RX loop runs through 3 stages:
+	 * Receiving protocol headers, placing DDP payload and receiving
+	 * trailer information (CRC + eventual padding).
+	 * Next two variables keep state on receive status of the
+	 * current FPDU part (hdr, data, trailer).
+	 */
+	int			fpdu_part_rcvd;/* bytes in pkt part copied */
+	int			fpdu_part_rem; /* bytes in pkt part not seen */
+
+	int			skb_new;      /* pending unread bytes in skb */
+	int			skb_offset;   /* offset in skb */
+	int			skb_copied;   /* processed bytes in skb */
+
+	int			sge_idx;	/* current sge in rx */
+	unsigned int		sge_off; 	/* already rcvd in curr. sge */
+	struct ib_umem_chunk	*umem_chunk;	/* chunk used by sge and off */
+	int			pg_idx;		/* page used in chunk */
+	unsigned int		pg_off;		/* offset within that page */
+
+	enum siw_rx_state	state;
+
+	u8			crc_enabled:1,
+				first_ddp_seg:1,   /* receiving first DDP seg */
+				more_ddp_segs:1,   /* more DDP segs expected */
+				rx_suspend:1,	   /* stop rcv DDP segs. */
+				prev_ddp_opcode:4; /* opcode of prev DDP msg */
+	char			pad;		/* # of pad bytes expected */
+};
+
+#define siw_rx_data(qp, rctx)	\
+	(iwarp_pktinfo[rctx->hdr.ctrl.opcode].proc_data(qp, rctx))
+
+/*
+ * Shorthands for short packets w/o payload
+ * to be transmitted more efficient.
+ */
+struct siw_send_pkt {
+	struct iwarp_send	send;
+	__u32			crc;
+} __attribute__((__packed__));
+
+struct siw_write_pkt {
+	struct iwarp_rdma_write	write;
+	__u32			crc;
+} __attribute__((__packed__));
+
+struct siw_rreq_pkt {
+	struct iwarp_rdma_rreq	rreq;
+	__u32			crc;
+} __attribute__((__packed__));
+
+struct siw_rresp_pkt {
+	struct iwarp_rdma_rresp	rresp;
+	__u32			crc;
+} __attribute__((__packed__));
+
+struct siw_iwarp_tx {
+	union {
+		union iwarp_hdrs		hdr;
+
+		/* Generic part of FPDU header */
+		struct iwarp_ctrl		ctrl;
+		struct iwarp_ctrl_untagged	c_untagged;
+		struct iwarp_ctrl_tagged	c_tagged;
+
+		/* FPDU headers */
+		struct iwarp_rdma_write		rwrite;
+		struct iwarp_rdma_rreq		rreq;
+		struct iwarp_rdma_rresp		rresp;
+		struct iwarp_terminate		terminate;
+		struct iwarp_send		send;
+		struct iwarp_send_inv		send_inv;
+
+		/* complete short FPDUs */
+		struct siw_send_pkt		send_pkt;
+		struct siw_write_pkt		write_pkt;
+		struct siw_rreq_pkt		rreq_pkt;
+		struct siw_rresp_pkt		rresp_pkt;
+	} pkt;
+
+	struct mpa_trailer			trailer;
+	/* DDP MSN for untagged messages */
+	u32			ddp_msn[RDMAP_UNTAGGED_QN_COUNT];
+
+	enum siw_tx_ctx	state;
+	wait_queue_head_t	waitq;
+
+	u16			ctrl_len;	/* ddp+rdmap hdr */
+	u16			ctrl_sent;
+	int			bytes_unsent;	/* ddp payload bytes */
+
+	struct hash_desc	mpa_crc_hd;
+
+	atomic_t		in_use;		/* tx currently under way */
+
+	char			pad;		/* # pad in current fpdu */
+	u8			crc_enabled:1,	/* compute and ship crc */
+				do_crc:1,	/* do crc for segment */
+				use_sendpage:1,	/* send w/o copy */
+				new_tcpseg:1,	/* start new tcp segment */
+				wspace_update:1,/* new write space indicated */
+				tx_suspend:1,	/* stop sending DDP segs. */
+				rsvd:3;
+
+	u16			fpdu_len;	/* len of FPDU to tx */
+
+	int			tcp_seglen;	/* remaining tcp seg space */
+	struct siw_wqe		*wqe;
+
+	int			sge_idx;	/* current sge in tx */
+	u32			sge_off; 	/* already sent in curr. sge */
+	struct ib_umem_chunk	*umem_chunk;	/* chunk used by sge and off */
+	int			pg_idx;		/* page used in mem chunk */
+};
+
+struct siw_qp {
+	struct ib_qp		ofa_qp;
+	struct siw_objhdr	hdr;
+	int			cpu;
+	struct siw_iwarp_rx	rx_ctx;
+	struct siw_iwarp_tx	tx_ctx;
+
+	struct siw_cep		*cep;
+	struct rw_semaphore	state_lock;
+
+	struct siw_pd		*pd;
+	struct siw_cq		*scq;
+	struct siw_cq		*rcq;
+
+	struct siw_qp_attrs	attrs;
+
+	struct list_head	wqe_freelist;
+	spinlock_t		freelist_lock;
+	struct list_head	sq;
+	struct list_head	irq;
+	spinlock_t		sq_lock;
+	atomic_t		sq_space;
+	struct siw_srq		*srq;
+	struct list_head	rq;
+	spinlock_t		rq_lock;
+	atomic_t		rq_space;
+	struct list_head	orq;
+	atomic_t		orq_space;
+	spinlock_t		orq_lock;
+	/*
+	 * workqueue interface:
+	 *
+	 * we must allow for two works since during work
+	 * execution we may have to schedule another work item
+	 */
+	struct siw_sq_work	sq_work;
+};
+
+#define lock_sq(qp)	spin_lock(&qp->sq_lock)
+#define unlock_sq(qp)	spin_unlock(&qp->sq_lock)
+
+#define lock_sq_rxsave(qp, flags) spin_lock_irqsave(&qp->sq_lock, flags)
+#define unlock_sq_rxsave(qp, flags) spin_unlock_irqrestore(&qp->sq_lock, flags)
+
+#define lock_rq(qp)	spin_lock(&qp->rq_lock)
+#define unlock_rq(qp)	spin_unlock(&qp->rq_lock)
+
+#define lock_rq_rxsave(qp, flags) spin_lock_irqsave(&qp->rq_lock, flags)
+#define unlock_rq_rxsave(qp, flags) spin_unlock_irqrestore(&qp->rq_lock, flags)
+
+#define lock_srq(srq)	spin_lock(&srq->lock)
+#define unlock_srq(srq)	spin_unlock(&srq->lock)
+
+#define lock_srq_rxsave(srq, flags) spin_lock_irqsave(&srq->lock, flags)
+#define unlock_srq_rxsave(srq, flags) spin_unlock_irqrestore(&srq->lock, flags)
+
+#define lock_cq(cq)	spin_lock(&cq->lock)
+#define unlock_cq(cq)	spin_unlock(&cq->lock)
+
+#define lock_cq_rxsave(cq, flags)	spin_lock_irqsave(&cq->lock, flags)
+#define unlock_cq_rxsave(cq, flags)	spin_unlock_irqrestore(&cq->lock, flags)
+
+#define lock_orq(qp)	spin_lock(&qp->orq_lock)
+#define unlock_orq(qp)	spin_unlock(&qp->orq_lock)
+
+#define lock_orq_rxsave(qp, flags)	spin_lock_irqsave(&qp->orq_lock, flags)
+#define unlock_orq_rxsave(qp, flags)\
+	spin_unlock_irqrestore(&qp->orq_lock, flags)
+
+#define RX_QP(rx)		container_of(rx, struct siw_qp, rx_ctx)
+#define TX_QP(tx)		container_of(tx, struct siw_qp, tx_ctx)
+#define QP_ID(qp)		((qp)->hdr.id)
+#define OBJ_ID(obj)		((obj)->hdr.id)
+#define RX_QPID(rx)		QP_ID(RX_QP(rx))
+#define TX_QPID(tx)		QP_ID(TX_QP(tx))
+
+/* helper macros */
+#define tx_wqe(qp)		((qp)->tx_ctx.wqe)
+#define rx_wqe(qp)		((qp)->rx_ctx.dest.wqe)
+#define rx_mem(qp)		((qp)->rx_ctx.dest.mem)
+#define wr_id(wqe)		((wqe)->wr.hdr.id)
+#define wr_type(wqe)		((wqe)->wr.hdr.type)
+#define wr_flags(wqe)		((wqe)->wr.hdr.flags)
+#define list_entry_wqe(pos)	list_entry(pos, struct siw_wqe, list)
+#define list_first_wqe(pos)	list_first_entry(pos, struct siw_wqe, list)
+
+#define ORD_SUSPEND_SQ(qp) 	(!atomic_read(&(qp)->orq_space))
+#define TX_ACTIVE(qp)		(tx_wqe(qp) != NULL)
+#define SQ_EMPTY(qp)		list_empty(&((qp)->sq))
+#define ORQ_EMPTY(qp)		list_empty(&((qp)->orq))
+#define IRQ_EMPTY(qp)		list_empty(&((qp)->irq))
+#define TX_ACTIVE_RRESP(qp)	(TX_ACTIVE(qp) &&\
+			wr_type(tx_wqe(qp)) == SIW_WR_RDMA_READ_RESP)
+
+#define TX_IDLE(qp)		(!TX_ACTIVE(qp) && SQ_EMPTY(qp) && \
+				IRQ_EMPTY(qp) && ORQ_EMPTY(qp))
+
+#define TX_MORE_WQE(qp)		(!SQ_EMPTY(qp) || !IRQ_EMPTY(qp))
+
+struct iwarp_msg_info {
+	int			hdr_len;
+	struct iwarp_ctrl	ctrl;
+	int (*proc_data)	(struct siw_qp *, struct siw_iwarp_rx *);
+};
+
+extern struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1];
+
+
+extern struct siw_dev *siw;
+
+
+/* QP general functions */
+int siw_qp_modify(struct siw_qp *, struct siw_qp_attrs *,
+		  enum siw_qp_attr_mask);
+
+void siw_qp_llp_close(struct siw_qp *);
+void siw_qp_cm_drop(struct siw_qp *, int);
+
+
+struct ib_qp *siw_get_ofaqp(struct ib_device *, int);
+void siw_qp_get_ref(struct ib_qp *);
+void siw_qp_put_ref(struct ib_qp *);
+
+int siw_no_mad(struct ib_device *, int, u8, struct ib_wc *, struct ib_grh *,
+	       struct ib_mad *, struct ib_mad *);
+
+enum siw_qp_state siw_map_ibstate(enum ib_qp_state);
+
+int siw_check_mem(struct siw_pd *, struct siw_mem *, u64,
+		  enum siw_access_flags, int);
+int siw_check_sge(struct siw_pd *, struct siw_sge *,
+		  enum siw_access_flags, u32, int);
+int siw_check_sgl(struct siw_pd *, struct siw_sge *,
+		  enum siw_access_flags, u32, int);
+
+void siw_rq_complete(struct siw_wqe *, struct siw_qp *);
+void siw_sq_complete(struct list_head *, struct siw_qp *, int,
+		     enum ib_send_flags);
+
+
+/* QP TX path functions */
+int siw_qp_sq_process(struct siw_qp *, int);
+int siw_sq_worker_init(void);
+void siw_sq_worker_exit(void);
+int siw_sq_queue_work(struct siw_qp *qp);
+
+/* QP RX path functions */
+int siw_proc_send(struct siw_qp *, struct siw_iwarp_rx *);
+int siw_init_rresp(struct siw_qp *, struct siw_iwarp_rx *);
+int siw_proc_rreq(struct siw_qp *, struct siw_iwarp_rx *);
+int siw_proc_rresp(struct siw_qp *, struct siw_iwarp_rx *);
+int siw_proc_write(struct siw_qp *, struct siw_iwarp_rx *);
+int siw_proc_terminate(struct siw_qp*, struct siw_iwarp_rx *);
+int siw_proc_unsupp(struct siw_qp *, struct siw_iwarp_rx *);
+
+int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
+		    unsigned int off, size_t len);
+
+/* MPA utilities */
+int siw_crc_array(struct hash_desc *, u8 *, size_t);
+int siw_crc_sg(struct hash_desc *, struct scatterlist *, int, int);
+
+
+/* Varia */
+void siw_cq_flush(struct siw_cq *);
+void siw_sq_flush(struct siw_qp *);
+void siw_rq_flush(struct siw_qp *);
+void siw_qp_freeq_flush(struct siw_qp *);
+int siw_reap_cqe(struct siw_cq *, struct ib_wc *);
+
+void siw_async_ev(struct siw_qp *, struct siw_cq *, enum ib_event_type);
+void siw_async_srq_ev(struct siw_srq *, enum ib_event_type);
+
+static inline struct siw_wqe *
+siw_next_tx_wqe(struct siw_qp *qp) {
+	struct siw_wqe *wqe;
+
+	if (!list_empty(&qp->irq))
+		wqe = list_first_entry(&qp->irq, struct siw_wqe, list);
+	else if (!list_empty(&qp->sq))
+		wqe = list_first_entry(&qp->sq, struct siw_wqe, list);
+	else
+		wqe = NULL;
+	return wqe;
+}
+
+static inline void
+siw_rreq_queue(struct siw_wqe *wqe, struct siw_qp *qp)
+{
+	unsigned long	flags;
+
+	lock_orq_rxsave(qp, flags);
+	list_move_tail(&wqe->list, &qp->orq);
+	atomic_dec(&qp->orq_space);
+	unlock_orq_rxsave(qp, flags);
+}
+
+
+static inline struct ib_umem_chunk *
+mem_chunk_next(struct ib_umem_chunk *chunk)
+{
+	return list_entry(chunk->list.next, struct ib_umem_chunk, list);
+}
+
+
+static inline struct siw_mr *siw_mem2mr(struct siw_mem *m)
+{
+	if (!SIW_MEM_IS_MW(m))
+		return container_of(m, struct siw_mr, mem);
+	return m->mr;
+}
+
+#endif
-- 
1.5.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH] SIW: Module initialization
From: Bernard Metzler @ 2010-10-05  6:54 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Bernard Metzler

---
 drivers/infiniband/hw/siw/siw_main.c |  440 ++++++++++++++++++++++++++++++++++
 1 files changed, 440 insertions(+), 0 deletions(-)
 create mode 100644 drivers/infiniband/hw/siw/siw_main.c

diff --git a/drivers/infiniband/hw/siw/siw_main.c b/drivers/infiniband/hw/siw/siw_main.c
new file mode 100644
index 0000000..4eec70b
--- /dev/null
+++ b/drivers/infiniband/hw/siw/siw_main.c
@@ -0,0 +1,440 @@
+/*
+ * Software iWARP device driver for Linux
+ *
+ * Authors: Bernard Metzler <bmt-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>
+ *
+ * Copyright (c) 2008-2010, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *   Redistribution and use in source and binary forms, with or
+ *   without modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   - Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *   - Neither the name of IBM nor the names of its contributors may be
+ *     used to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <net/net_namespace.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_arp.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "siw.h"
+#include "siw_obj.h"
+#include "siw_cm.h"
+#include "siw_verbs.h"
+
+
+MODULE_DESCRIPTION("Software iWARP Driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION("0.1");
+
+static int loopback_enabled;
+module_param(loopback_enabled, int, 0644);
+MODULE_PARM_DESC(loopback_enabled, "enable_loopback");
+
+struct siw_dev *siw_device;
+
+static ssize_t show_sw_version(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	struct siw_dev *siw_dev = container_of(dev, struct siw_dev,
+						 ofa_dev.dev);
+
+	return sprintf(buf, "%x\n", siw_dev->attrs.version);
+}
+
+static ssize_t show_if_type(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	struct siw_dev *siw_dev = container_of(dev, struct siw_dev,
+					       ofa_dev.dev);
+
+	return sprintf(buf, "%d\n", siw_dev->attrs.iftype);
+}
+
+static DEVICE_ATTR(sw_version, S_IRUGO, show_sw_version, NULL);
+static DEVICE_ATTR(if_type, S_IRUGO, show_if_type, NULL);
+
+static struct device_attribute *siw_dev_attributes[] = {
+	&dev_attr_sw_version,
+	&dev_attr_if_type
+};
+
+int siw_register_device(struct siw_dev *dev)
+{
+	struct ib_device *ibdev = &dev->ofa_dev;
+	int rv, i;
+
+	if (dev->l2dev->type != ARPHRD_LOOPBACK)
+		strlcpy(ibdev->name, "siw%d", IB_DEVICE_NAME_MAX);
+	else
+		strlcpy(ibdev->name, "siw_lo%d", IB_DEVICE_NAME_MAX);
+	memset(&ibdev->node_guid, 0, sizeof(ibdev->node_guid));
+	memcpy(&ibdev->node_guid, dev->l2dev->dev_addr, 6);
+
+	ibdev->owner = THIS_MODULE;
+
+	ibdev->uverbs_cmd_mask =
+	    (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+	    (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+	    (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+	    (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+	    (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+	    (1ull << IB_USER_VERBS_CMD_REG_MR) |
+	    (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+	    (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+	    (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+	    (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
+	    (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
+	    (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+	    (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+	    (1ull << IB_USER_VERBS_CMD_QUERY_QP) |
+	    (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+	    (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+	    (1ull << IB_USER_VERBS_CMD_POST_SEND) |
+	    (1ull << IB_USER_VERBS_CMD_POST_RECV) |
+	    (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
+	    (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
+	    (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) |
+	    (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) |
+	    (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV);
+
+	ibdev->node_type = RDMA_NODE_RNIC;
+	memcpy(ibdev->node_desc, SIW_NODE_DESC, sizeof(SIW_NODE_DESC));
+
+	/*
+	 * Current model (one-to-one device association):
+	 * One Softiwarp device per net_device or, equivalently,
+	 * per physical port.
+	 */
+	ibdev->phys_port_cnt = 1;
+
+	ibdev->num_comp_vectors = 1;
+	/*
+	 * While DMA adresses are not used a device must be provided
+	 * as long as the code relies on OFA's ib_umem_get() function for
+	 * memory pinning. calling ib_umem_get() includes a
+	 * (for siw case useless) translation of memory to DMA
+	 * adresses for that device.
+	 */
+	ibdev->dma_device = dev->l2dev->dev.parent;
+	ibdev->query_device = siw_query_device;
+	ibdev->query_port = siw_query_port;
+	ibdev->query_qp = siw_query_qp;
+	ibdev->modify_port = NULL;
+	ibdev->query_pkey = siw_query_pkey;
+	ibdev->query_gid = siw_query_gid;
+	ibdev->alloc_ucontext = siw_alloc_ucontext;
+	ibdev->dealloc_ucontext = siw_dealloc_ucontext;
+	ibdev->mmap = siw_mmap;
+	ibdev->alloc_pd = siw_alloc_pd;
+	ibdev->dealloc_pd = siw_dealloc_pd;
+	ibdev->create_ah = siw_create_ah;
+	ibdev->destroy_ah = siw_destroy_ah;
+	ibdev->create_qp = siw_create_qp;
+	ibdev->modify_qp = siw_ofed_modify_qp;
+	ibdev->destroy_qp = siw_destroy_qp;
+	ibdev->create_cq = siw_create_cq;
+	ibdev->destroy_cq = siw_destroy_cq;
+	ibdev->resize_cq = NULL;
+	ibdev->poll_cq = siw_poll_cq;
+	ibdev->get_dma_mr = siw_get_dma_mr;
+	ibdev->reg_phys_mr = NULL;
+	ibdev->rereg_phys_mr = NULL;
+	ibdev->reg_user_mr = siw_reg_user_mr;
+	ibdev->dereg_mr = siw_dereg_mr;
+	ibdev->alloc_mw = NULL;
+	ibdev->bind_mw = NULL;
+	ibdev->dealloc_mw = NULL;
+
+	ibdev->create_srq = siw_create_srq;
+	ibdev->modify_srq = siw_modify_srq;
+	ibdev->query_srq = siw_query_srq;
+	ibdev->destroy_srq = siw_destroy_srq;
+	ibdev->post_srq_recv = siw_post_srq_recv;
+
+	ibdev->attach_mcast = NULL;
+	ibdev->detach_mcast = NULL;
+	ibdev->process_mad = siw_no_mad;
+
+	ibdev->req_notify_cq = siw_req_notify_cq;
+	ibdev->post_send = siw_post_send;
+	ibdev->post_recv = siw_post_receive;
+
+
+	ibdev->iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL);
+	if (!ibdev->iwcm)
+		return -ENOMEM;
+
+	ibdev->iwcm->connect = siw_connect;
+	ibdev->iwcm->accept = siw_accept;
+	ibdev->iwcm->reject = siw_reject;
+	ibdev->iwcm->create_listen = siw_create_listen;
+	ibdev->iwcm->destroy_listen = siw_destroy_listen;
+	ibdev->iwcm->add_ref = siw_qp_get_ref;
+	ibdev->iwcm->rem_ref = siw_qp_put_ref;
+	ibdev->iwcm->get_qp = siw_get_ofaqp;
+	rv = ib_register_device(ibdev, NULL);
+	if (rv) {
+		dprint(DBG_DM|DBG_ON, "(dev=%s): "
+			"ib_register_device failed: rv=%d\n", ibdev->name, rv);
+		return rv;
+	}
+
+	/*
+	 * set and register sw version + user if type
+	 */
+	dev->attrs.version = VERSION_ID_SOFTIWARP;
+	dev->attrs.iftype  = SIW_IF_OFED;
+
+	dev->attrs.vendor_id = SIW_VENDOR_ID;
+	dev->attrs.vendor_part_id = SIW_VENDORT_PART_ID;
+	dev->attrs.sw_version = SIW_SW_VERSION;
+	dev->attrs.max_qp = SIW_MAX_QP;
+	dev->attrs.max_qp_wr = SIW_MAX_QP_WR;
+	dev->attrs.max_ord = SIW_MAX_ORD;
+	dev->attrs.max_ird = SIW_MAX_IRD;
+	dev->attrs.cap_flags = 0;
+	dev->attrs.max_sge = SIW_MAX_SGE;
+	dev->attrs.max_sge_rd = SIW_MAX_SGE_RD;
+	dev->attrs.max_cq = SIW_MAX_CQ;
+	dev->attrs.max_cqe = SIW_MAX_CQE;
+	dev->attrs.max_mr = SIW_MAX_MR;
+	dev->attrs.max_mr_size = SIW_MAX_MR_SIZE;
+	dev->attrs.max_pd = SIW_MAX_PD;
+	dev->attrs.max_mw = SIW_MAX_MW;
+	dev->attrs.max_fmr = SIW_MAX_FMR;
+	dev->attrs.max_srq = SIW_MAX_SRQ;
+	dev->attrs.max_srq_wr = SIW_MAX_SRQ_WR;
+	dev->attrs.max_srq_sge = SIW_MAX_SGE;
+
+	siw_idr_init(dev);
+
+	atomic_set(&dev->num_srq, 0);
+	atomic_set(&dev->num_qp, 0);
+	atomic_set(&dev->num_cq, 0);
+	atomic_set(&dev->num_mem, 0);
+	atomic_set(&dev->num_pd, 0);
+
+	for (i = 0; i < ARRAY_SIZE(siw_dev_attributes); ++i) {
+		rv = device_create_file(&ibdev->dev, siw_dev_attributes[i]);
+		if (rv) {
+			dprint(DBG_DM|DBG_ON, "(dev=%s): "
+				"device_create_file failed: i=%d, rv=%d\n",
+				ibdev->name, i, rv);
+			ib_unregister_device(ibdev);
+			return rv;
+		}
+	}
+
+	dprint(DBG_DM, ": Registered '%s' for interface '%s', "
+		"HWaddr=%02x.%02x.%02x.%02x.%02x.%02x\n",
+		ibdev->name, dev->l2dev->name,
+		*(u8 *)dev->l2dev->dev_addr,
+		*((u8 *)dev->l2dev->dev_addr + 1),
+		*((u8 *)dev->l2dev->dev_addr + 2),
+		*((u8 *)dev->l2dev->dev_addr + 3),
+		*((u8 *)dev->l2dev->dev_addr + 4),
+		*((u8 *)dev->l2dev->dev_addr + 5));
+	return 0;
+}
+
+void siw_deregister_device(struct siw_dev *dev)
+{
+	int i;
+
+	siw_idr_release(dev);
+
+	WARN_ON(atomic_read(&dev->num_srq) || atomic_read(&dev->num_qp) ||
+		atomic_read(&dev->num_cq) || atomic_read(&dev->num_mem) ||
+		atomic_read(&dev->num_pd));
+
+	for (i = 0; i < ARRAY_SIZE(siw_dev_attributes); ++i)
+		device_remove_file(&dev->ofa_dev.dev, siw_dev_attributes[i]);
+
+	dprint(DBG_OBJ, ": Unregister '%s' for interface '%s'\n",
+		dev->ofa_dev.name, dev->l2dev->name);
+
+	ib_unregister_device(&dev->ofa_dev);
+}
+
+
+/*
+ * siw_init_module - Initialize Softiwarp module and create Softiwarp devices
+ *
+ * There are three design options for Softiwarp device management supporting
+ * - multiple physical Ethernet ports, i.e., multiple net_device instances
+ * - and multi-homing, i.e., multiple IP addresses associated with net_device,
+ * as follows:
+ *
+ *    Option 1: One Softiwarp device per net_device and
+ *              IP address associated with the net_device
+ *    Option 2: One Softiwarp device per net_device
+ *              (and all IP addresses associated with the net_device)
+ *    Option 3: Single Softiwarp device for all net_device instances
+ *              (and all IP addresses associated with these instances)
+ *
+ * We currently use Option 2, registering a separate siw_dev for
+ * each net_device.
+ *
+ * TODO: Dynamic device management (network device registration/removal).
+ *       IPv6 support.
+ */
+static __init int siw_init_module(void)
+{
+	struct net_device	*dev;
+	struct siw_dev		*siw_p;
+	int rv = 0;
+
+	/*
+	 * Identify all net_device instances and create a
+	 * Softiwarp device for each net_device supporting IPv4
+	 *
+	 * TODO:
+	 * - Do we have to generalize for IPv6?
+	 * - Exclude devices based on IPoIB - if any
+	 * - Consider excluding Ethernet devices with an
+	 *   associated iWARP hardware device
+	 */
+	rtnl_lock();
+	for_each_netdev(&init_net, dev) {
+		struct in_device *in_dev;
+
+		in_dev = in_dev_get(dev);
+		if (!in_dev) {
+			dprint(DBG_DM, ": Skipped %s (no in_dev)\n", dev->name);
+			continue;
+		}
+		if (!in_dev->ifa_list) {
+			dprint(DBG_DM, ": Skipped %s (no ifa)\n", dev->name);
+			in_dev_put(in_dev);
+			continue;
+		}
+		/*
+		 * This device has an in_device attached. Attach to it
+		 * if it is LOOPBACK or ETHER or IEEE801-TR device.
+		 *
+		 * Additional hardware support can be added here
+		 * (e.g. ARPHRD_FDDI, ARPHRD_ATM, ...) - see
+		 * <linux/if_arp.h> for type identifiers.
+		 *
+		 * NOTE: ARPHRD_TUNNEL/6 are excluded.
+		 */
+		if (dev->type == ARPHRD_ETHER ||
+		    dev->type == ARPHRD_IEEE802 ||
+		    (dev->type == ARPHRD_LOOPBACK && loopback_enabled)) {
+#ifdef CHECK_DMA_CAPABILITIES
+			if (!dev->dev.parent || !get_dma_ops(dev->dev.parent)) {
+				dprint(DBG_DM|DBG_ON,
+					": No DMA capabilities: %s (skipped)\n",
+					dev->name);
+				in_dev_put(in_dev);
+				continue;
+			}
+#endif
+			siw_p =
+			      (struct siw_dev *)ib_alloc_device(sizeof *siw_p);
+
+			if (!siw_p) {
+				in_dev_put(in_dev);
+				rv = -ENOMEM;
+				break;
+			}
+			if (!siw_device) {
+				siw_device = siw_p;
+				siw_p->next = NULL;
+			} else {
+				siw_p->next = siw_device->next;
+				siw_device->next = siw_p;
+			}
+			siw_p->l2dev = dev;
+
+			rv = siw_register_device(siw_p);
+			if (rv) {
+				if (siw_device != siw_p)
+					siw_device->next = siw_p->next;
+				else
+					siw_device = NULL;
+
+				in_dev_put(in_dev);
+				ib_dealloc_device(&siw_p->ofa_dev);
+
+				break;
+			}
+		} else {
+			dprint(DBG_DM, ": Skipped %s (type %d)\n",
+				dev->name, dev->type);
+			in_dev_put(in_dev);
+		}
+	}
+	rtnl_unlock();
+
+	if (!siw_device)
+		return -ENODEV;
+
+	if (rv)
+		return rv;
+	/*
+	 * FIXME: In case of error, we leave devices allocated.
+	 *        Is this correct?
+	 */
+	rv = siw_cm_init();
+	if (rv)
+		return rv;
+
+	rv = siw_sq_worker_init();
+
+	printk(KERN_INFO "SoftIWARP attached\n");
+	return rv;
+}
+
+static void __exit siw_exit_module(void)
+{
+	struct siw_dev	*siw_p;
+
+	siw_sq_worker_exit();
+	siw_cm_exit();
+
+	while (siw_device) {
+		siw_p = siw_device->next;
+		siw_deregister_device(siw_device);
+		in_dev_put(siw_device->l2dev->ip_ptr);
+		ib_dealloc_device(&siw_device->ofa_dev);
+		siw_device = siw_p;
+	}
+	printk(KERN_INFO "SoftIWARP detached\n");
+}
+
+module_init(siw_init_module);
+module_exit(siw_exit_module);
-- 
1.5.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH] SIW: User interface
From: Bernard Metzler @ 2010-10-05  6:54 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Bernard Metzler

---
 drivers/infiniband/hw/siw/siw_ae.c    |   96 ++
 drivers/infiniband/hw/siw/siw_user.h  |   66 ++
 drivers/infiniband/hw/siw/siw_verbs.c | 1564 +++++++++++++++++++++++++++++++++
 drivers/infiniband/hw/siw/siw_verbs.h |   96 ++
 4 files changed, 1822 insertions(+), 0 deletions(-)
 create mode 100644 drivers/infiniband/hw/siw/siw_ae.c
 create mode 100644 drivers/infiniband/hw/siw/siw_user.h
 create mode 100644 drivers/infiniband/hw/siw/siw_verbs.c
 create mode 100644 drivers/infiniband/hw/siw/siw_verbs.h

diff --git a/drivers/infiniband/hw/siw/siw_ae.c b/drivers/infiniband/hw/siw/siw_ae.c
new file mode 100644
index 0000000..7e9ab3f
--- /dev/null
+++ b/drivers/infiniband/hw/siw/siw_ae.c
@@ -0,0 +1,96 @@
+/*
+ * Software iWARP device driver for Linux
+ *
+ * Authors: Bernard Metzler <bmt-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>
+ *
+ * Copyright (c) 2008-2010, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *   Redistribution and use in source and binary forms, with or
+ *   without modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   - Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *   - Neither the name of IBM nor the names of its contributors may be
+ *     used to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/scatterlist.h>
+#include <linux/highmem.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/tcp.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_umem.h>
+
+#include "siw.h"
+#include "siw_obj.h"
+#include "siw_cm.h"
+
+/*
+ * siw_async_ev()
+ *
+ * Report Asynchonous event to user.
+ */
+void siw_async_ev(struct siw_qp *qp, struct siw_cq *cq,
+		  enum ib_event_type etype)
+{
+	static struct ib_event	event;
+
+	dprint(DBG_EH, "(QP%d): AE type %d\n", QP_ID(qp), etype);
+
+	event.event = etype;
+	event.device = qp->ofa_qp.device;
+	if (cq)
+		event.element.cq = &cq->ofa_cq;
+	else
+		event.element.qp = &qp->ofa_qp;
+
+	if (!(qp->attrs.flags & SIW_QP_IN_DESTROY) &&
+	    qp->ofa_qp.event_handler) {
+		dprint(DBG_EH, "(QP%d): Call AEH\n", QP_ID(qp));
+		(*qp->ofa_qp.event_handler)(&event, qp->ofa_qp.qp_context);
+	}
+}
+
+void siw_async_srq_ev(struct siw_srq *srq, enum ib_event_type etype)
+{
+	static struct ib_event 	event;
+
+	dprint(DBG_EH, "(SRQ%p): AE type %d\n", srq, etype);
+
+	event.event = etype;
+	event.device = srq->ofa_srq.device;
+	event.element.srq = &srq->ofa_srq;
+
+	if (srq->ofa_srq.event_handler)
+		(*srq->ofa_srq.event_handler)(&event, srq->ofa_srq.srq_context);
+}
diff --git a/drivers/infiniband/hw/siw/siw_user.h b/drivers/infiniband/hw/siw/siw_user.h
new file mode 100644
index 0000000..ce7857d
--- /dev/null
+++ b/drivers/infiniband/hw/siw/siw_user.h
@@ -0,0 +1,66 @@
+/*
+ * Software iWARP device driver for Linux
+ *
+ * Authors: Bernard Metzler <bmt-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>
+ *
+ * Copyright (c) 2008-2010, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *   Redistribution and use in source and binary forms, with or
+ *   without modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   - Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *   - Neither the name of IBM nor the names of its contributors may be
+ *     used to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _SIW_USER_H
+#define _SIW_USER_H
+
+/*
+ * user commands/command responses must correlate with the siw_abi
+ * in user land.
+ */
+
+struct siw_uresp_create_cq {
+	__u32	cq_id;
+};
+
+struct siw_uresp_create_qp {
+	__u32	qp_id;
+	__u32	sq_size;
+	__u32	rq_size;
+};
+
+struct siw_uresp_reg_mr {
+	__u32	stag;
+};
+
+struct siw_ureq_reg_mr {
+	__u8	stag_key;
+	__u8	reserved[3];
+};
+
+#endif
diff --git a/drivers/infiniband/hw/siw/siw_verbs.c b/drivers/infiniband/hw/siw/siw_verbs.c
new file mode 100644
index 0000000..8633b0f
--- /dev/null
+++ b/drivers/infiniband/hw/siw/siw_verbs.c
@@ -0,0 +1,1564 @@
+/*
+ * Software iWARP device driver for Linux
+ *
+ * Authors: Bernard Metzler <bmt-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>
+ *
+ * Copyright (c) 2008-2010, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *   Redistribution and use in source and binary forms, with or
+ *   without modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   - Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *   - Neither the name of IBM nor the names of its contributors may be
+ *     used to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_umem.h>
+
+#include "siw.h"
+#include "siw_obj.h"
+#include "siw_cm.h"
+
+static int ib_qp_state_to_siw_qp_state[IB_QPS_ERR+1] = {
+	[IB_QPS_RESET]	= SIW_QP_STATE_IDLE,
+	[IB_QPS_INIT]	= SIW_QP_STATE_IDLE,
+	[IB_QPS_RTR]	= SIW_QP_STATE_RTR,
+	[IB_QPS_RTS]	= SIW_QP_STATE_RTS,
+	[IB_QPS_SQD]	= SIW_QP_STATE_CLOSING,
+	[IB_QPS_SQE]	= SIW_QP_STATE_TERMINATE,
+	[IB_QPS_ERR]	= SIW_QP_STATE_ERROR
+};
+
+static inline struct siw_mr *siw_mr_ofa2siw(struct ib_mr *ofa_mr)
+{
+	return container_of(ofa_mr, struct siw_mr, ofa_mr);
+}
+
+static inline struct siw_pd *siw_pd_ofa2siw(struct ib_pd *ofa_pd)
+{
+	return container_of(ofa_pd, struct siw_pd, ofa_pd);
+}
+
+static inline struct siw_ucontext *siw_ctx_ofa2siw(
+	struct ib_ucontext *ofa_ctx)
+{
+	return container_of(ofa_ctx, struct siw_ucontext, ib_ucontext);
+}
+
+static inline struct siw_qp *siw_qp_ofa2siw(struct ib_qp *ofa_qp)
+{
+	return container_of(ofa_qp, struct siw_qp, ofa_qp);
+}
+
+static inline struct siw_cq *siw_cq_ofa2siw(struct ib_cq *ofa_cq)
+{
+	return container_of(ofa_cq, struct siw_cq, ofa_cq);
+}
+
+static inline struct siw_srq *siw_srq_ofa2siw(struct ib_srq *ofa_srq)
+{
+	return container_of(ofa_srq, struct siw_srq, ofa_srq);
+}
+
+struct ib_ucontext *siw_alloc_ucontext(struct ib_device *ofa_dev,
+				       struct ib_udata *udata)
+{
+	struct siw_ucontext *ctx;
+
+	dprint(DBG_CM, "(device=%s)\n", ofa_dev->name);
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx) {
+		dprint(DBG_ON, " kzalloc\n");
+		return ERR_PTR(-ENOMEM);
+	}
+	return &ctx->ib_ucontext;
+}
+
+int siw_dealloc_ucontext(struct ib_ucontext *ctx)
+{
+	struct siw_ucontext *ucontext;
+
+	ucontext = siw_ctx_ofa2siw(ctx);
+
+	kfree(ucontext);
+
+	return 0;
+}
+
+int siw_query_device(struct ib_device *ofa_dev, struct ib_device_attr *attr)
+{
+	struct siw_dev *dev = siw_dev_ofa2siw(ofa_dev);
+
+	memset(attr, 0, sizeof *attr);
+
+	attr->max_mr_size = dev->attrs.max_mr_size;
+	attr->vendor_id = dev->attrs.vendor_id;
+	attr->vendor_part_id = dev->attrs.vendor_part_id;
+	attr->max_qp = dev->attrs.max_qp;
+	attr->max_qp_wr = dev->attrs.max_qp_wr;
+
+	/*
+	 * RDMA Read parameters:
+	 * Max. ORD (Outbound Read queue Depth), a.k.a. max_initiator_depth
+	 * Max. IRD (Inbound Read queue Depth), a.k.a. max_responder_resources
+	 */
+	attr->max_qp_rd_atom = dev->attrs.max_ord;
+	attr->max_qp_init_rd_atom = dev->attrs.max_ird;
+	attr->max_res_rd_atom = dev->attrs.max_qp * dev->attrs.max_ird;
+	attr->device_cap_flags = dev->attrs.cap_flags;
+	attr->max_sge = dev->attrs.max_sge;
+	attr->max_sge_rd = dev->attrs.max_sge_rd;
+	attr->max_cq = dev->attrs.max_cq;
+	attr->max_cqe = dev->attrs.max_cqe;
+	attr->max_mr = dev->attrs.max_mr;
+	attr->max_pd = dev->attrs.max_pd;
+	attr->max_mw = dev->attrs.max_mw;
+	attr->max_fmr = dev->attrs.max_fmr;
+	attr->max_srq = dev->attrs.max_srq;
+	attr->max_srq_wr = dev->attrs.max_srq_wr;
+	attr->max_srq_sge = dev->attrs.max_srq_sge;
+
+	memcpy(&attr->sys_image_guid, dev->l2dev->dev_addr, 6);
+
+	/*
+	 * TODO: understand what of the following should
+	 * get useful information
+	 *
+	 * attr->fw_ver;
+	 * attr->max_ah
+	 * attr->max_map_per_fmr
+	 * attr->max_ee
+	 * attr->max_rdd
+	 * attr->max_ee_rd_atom;
+	 * attr->max_ee_init_rd_atom;
+	 * attr->max_raw_ipv6_qp
+	 * attr->max_raw_ethy_qp
+	 * attr->max_mcast_grp
+	 * attr->max_mcast_qp_attach
+	 * attr->max_total_mcast_qp_attach
+	 * attr->max_pkeys
+	 * attr->atomic_cap;
+	 * attr->page_size_cap;
+	 * attr->hw_ver;
+	 * attr->local_ca_ack_delay;
+	 */
+	return 0;
+}
+
+/*
+ * Approximate translation of real MTU for IB.
+ *
+ * TODO: is that needed for RNIC's? We may have a medium
+ *       which reports MTU of 64kb and have to degrade to 4k??
+ */
+static inline enum ib_mtu siw_mtu_net2ofa(unsigned short mtu)
+{
+	if (mtu >= 4096)
+		return IB_MTU_4096;
+	if (mtu >= 2048)
+		return IB_MTU_2048;
+	if (mtu >= 1024)
+		return IB_MTU_1024;
+	if (mtu >= 512)
+		return IB_MTU_512;
+	if (mtu >= 256)
+		return IB_MTU_256;
+	return -1;
+}
+
+int siw_query_port(struct ib_device *ofa_dev, u8 port,
+		     struct ib_port_attr *attr)
+{
+	struct siw_dev *dev = siw_dev_ofa2siw(ofa_dev);
+
+	memset(attr, 0, sizeof *attr);
+	/*
+	 * TODO: fully understand what to do here
+	 */
+	attr->state = IB_PORT_ACTIVE;	/* ?? */
+	attr->max_mtu = siw_mtu_net2ofa(dev->l2dev->mtu);
+	attr->active_mtu = attr->max_mtu;
+	attr->gid_tbl_len = 1;
+	attr->port_cap_flags = IB_PORT_CM_SUP;	/* ?? */
+	attr->port_cap_flags |= IB_PORT_DEVICE_MGMT_SUP;
+	attr->max_msg_sz = -1;
+	attr->pkey_tbl_len = 1;
+	attr->active_width = 2;
+	attr->active_speed = 2;
+	/*
+	 * All zero
+	 *
+	 * attr->lid = 0;
+	 * attr->bad_pkey_cntr = 0;
+	 * attr->qkey_viol_cntr = 0;
+	 * attr->sm_lid = 0;
+	 * attr->lmc = 0;
+	 * attr->max_vl_num = 0;
+	 * attr->sm_sl = 0;
+	 * attr->subnet_timeout = 0;
+	 * attr->init_type_repy = 0;
+	 * attr->phys_state = 0;
+	 */
+	return 0;
+}
+
+int siw_query_pkey(struct ib_device *ofa_dev, u8 port, u16 idx, u16 *pkey)
+{
+	*pkey = 0;
+	return 0;
+}
+
+int siw_query_gid(struct ib_device *ofa_dev, u8 port, int idx,
+		   union ib_gid *gid)
+{
+	struct siw_dev *dev = siw_dev_ofa2siw(ofa_dev);
+
+	/* subnet_prefix == interface_id == 0; */
+	memset(gid, 0, sizeof *gid);
+	memcpy(&gid->raw[0], dev->l2dev->dev_addr, 6);
+
+	return 0;
+}
+
+struct ib_pd *siw_alloc_pd(struct ib_device *ofa_dev,
+			   struct ib_ucontext *context, struct ib_udata *udata)
+{
+	struct siw_pd	*pd = NULL;
+	struct siw_dev	*dev   = siw_dev_ofa2siw(ofa_dev);
+	int rv;
+
+	if (atomic_inc_return(&dev->num_pd) > SIW_MAX_PD) {
+		dprint(DBG_ON, ": Out of PD's\n");
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	pd = kmalloc(sizeof *pd, GFP_KERNEL);
+	if (!pd) {
+		dprint(DBG_ON, ": malloc\n");
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	rv = siw_pd_add(dev, pd);
+	if (rv) {
+		dprint(DBG_ON, ": siw_pd_add\n");
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	if (context) {
+		if (ib_copy_to_udata(udata, &pd->hdr.id, sizeof pd->hdr.id)) {
+			rv = -EFAULT;
+			goto err_out_idr;
+		}
+	}
+	return &pd->ofa_pd;
+
+err_out_idr:
+	siw_remove_obj(&dev->idr_lock, &dev->pd_idr, &pd->hdr);
+err_out:
+	kfree(pd);
+	atomic_dec(&dev->num_pd);
+
+	return ERR_PTR(rv);
+}
+
+int siw_dealloc_pd(struct ib_pd *ofa_pd)
+{
+	struct siw_pd	*pd = siw_pd_ofa2siw(ofa_pd);
+	struct siw_dev	*dev = siw_dev_ofa2siw(ofa_pd->device);
+
+	siw_remove_obj(&dev->idr_lock, &dev->pd_idr, &pd->hdr);
+	siw_pd_put(pd);
+
+	atomic_dec(&dev->num_pd);
+	return 0;
+}
+
+struct ib_ah *siw_create_ah(struct ib_pd *pd, struct ib_ah_attr *attr)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+int siw_destroy_ah(struct ib_ah *ah)
+{
+	return -ENOSYS;
+}
+
+
+void siw_qp_get_ref(struct ib_qp *ofa_qp)
+{
+	struct siw_qp	*qp = siw_qp_ofa2siw(ofa_qp);
+
+	dprint(DBG_OBJ|DBG_CM, "(QP%d): Get Reference\n", QP_ID(qp));
+	siw_qp_get(qp);
+}
+
+
+void siw_qp_put_ref(struct ib_qp *ofa_qp)
+{
+	struct siw_qp	*qp = siw_qp_ofa2siw(ofa_qp);
+
+	dprint(DBG_OBJ|DBG_CM, "(QP%d): Put Reference\n", QP_ID(qp));
+	siw_qp_put(qp);
+}
+
+int siw_no_mad(struct ib_device *ofa_dev, int flags, u8 port,
+			    struct ib_wc *wc, struct ib_grh *grh,
+			    struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+	return -ENOSYS;
+}
+
+
+/*
+ * siw_create_qp()
+ *
+ * Create QP of requested size on given device.
+ *
+ * @ofa_pd:	OFA PD contained in siw PD
+ * @attrs:	Initial QP attributes.
+ * @udata:	used to provide QP ID, SQ and RQ size back to user.
+ */
+
+struct ib_qp *siw_create_qp(struct ib_pd *ofa_pd, struct ib_qp_init_attr *attrs,
+			    struct ib_udata *udata)
+{
+	struct siw_qp	 		*qp = NULL;
+	struct siw_pd	 		*pd = siw_pd_ofa2siw(ofa_pd);
+	struct ib_device	 	*ofa_dev = ofa_pd->device;
+	struct siw_dev 			*dev = siw_dev_ofa2siw(ofa_dev);
+	struct siw_cq  			*scq = NULL, *rcq = NULL;
+	struct siw_iwarp_tx		*c_tx;
+	struct siw_iwarp_rx		*c_rx;
+	struct siw_uresp_create_qp	uresp;
+
+	int rv = 0;
+
+	dprint(DBG_OBJ|DBG_CM, ": new QP on device %s\n",
+		ofa_dev->name);
+
+	if (atomic_inc_return(&dev->num_qp) > SIW_MAX_QP) {
+		dprint(DBG_ON, ": Out of QP's\n");
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	if (attrs->qp_type != IB_QPT_RC) {
+		dprint(DBG_ON, ": Only RC QP's supported\n");
+		rv = -EINVAL;
+		goto err_out;
+	}
+	if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) ||
+	    (attrs->cap.max_recv_wr > SIW_MAX_QP_WR) ||
+	    (attrs->cap.max_send_sge > SIW_MAX_SGE)  ||
+	    (attrs->cap.max_recv_sge > SIW_MAX_SGE)) {
+		dprint(DBG_ON, ": QP Size!\n");
+		rv = -EINVAL;
+		goto err_out;
+	}
+	/*
+	 * NOTE: we allow for zero element SQ and RQ WQE's SGL's
+	 * but not for a QP unable to hold any WQE (SQ + RQ)
+	 */
+	if (attrs->cap.max_send_wr + attrs->cap.max_recv_wr == 0) {
+		rv = -EINVAL;
+		goto err_out;
+	}
+
+	scq = siw_cq_id2obj(dev, ((struct siw_cq *)attrs->send_cq)->hdr.id);
+	rcq = siw_cq_id2obj(dev, ((struct siw_cq *)attrs->recv_cq)->hdr.id);
+
+	if (!scq || !rcq) {
+		dprint(DBG_OBJ, ": Fail: SCQ: 0x%p, RCQ: 0x%p\n",
+			scq, rcq);
+		rv = -EINVAL;
+		goto err_out;
+	}
+	qp = kzalloc(sizeof(*qp), GFP_KERNEL);
+	if (!qp) {
+		dprint(DBG_ON, ": kzalloc\n");
+		rv = -ENOMEM;
+		goto err_out;
+	}
+
+	rv = siw_qp_add(dev, qp);
+	if (rv)
+		goto err_out;
+
+	INIT_LIST_HEAD(&qp->wqe_freelist);
+	INIT_LIST_HEAD(&qp->sq);
+	INIT_LIST_HEAD(&qp->rq);
+	INIT_LIST_HEAD(&qp->orq);
+	INIT_LIST_HEAD(&qp->irq);
+
+	init_rwsem(&qp->state_lock);
+	spin_lock_init(&qp->freelist_lock);
+	spin_lock_init(&qp->sq_lock);
+	spin_lock_init(&qp->rq_lock);
+	spin_lock_init(&qp->orq_lock);
+
+	init_waitqueue_head(&qp->tx_ctx.waitq);
+
+	qp->pd  = pd;
+	qp->scq = scq;
+	qp->rcq = rcq;
+
+	if (attrs->srq) {
+		/*
+		 * SRQ support.
+		 * Verbs 6.3.7: ignore RQ size, if SRQ present
+		 * Verbs 6.3.5: do not check PD of SRQ against PD of QP
+		 */
+		qp->srq = siw_srq_ofa2siw(attrs->srq);
+		qp->attrs.rq_size = 0;
+		atomic_set(&qp->rq_space, 0);
+		dprint(DBG_OBJ, " QP(%d): SRQ(%p) attached\n",
+			QP_ID(qp), qp->srq);
+	} else {
+		qp->srq = NULL;
+		qp->attrs.rq_size = attrs->cap.max_recv_wr;
+		atomic_set(&qp->rq_space, qp->attrs.rq_size);
+	}
+	qp->attrs.sq_size = attrs->cap.max_send_wr;
+	atomic_set(&qp->sq_space, qp->attrs.sq_size);
+	qp->attrs.sq_max_sges = attrs->cap.max_send_sge;
+	/*
+	 * ofed has no max_send_sge_rdmawrite
+	 */
+	qp->attrs.sq_max_sges_rdmaw = attrs->cap.max_send_sge;
+	qp->attrs.rq_max_sges = attrs->cap.max_recv_sge;
+	/*
+	 * while not part of attrs we init ord/ird here
+	 */
+	qp->attrs.ord = dev->attrs.max_ord;
+	qp->attrs.ird = dev->attrs.max_ird;
+
+	qp->attrs.state = SIW_QP_STATE_IDLE;
+
+	if (udata) {
+		uresp.sq_size = qp->attrs.sq_size;
+		uresp.rq_size = qp->attrs.rq_size;
+		uresp.qp_id = QP_ID(qp);
+
+		rv = ib_copy_to_udata(udata, &uresp, sizeof uresp);
+		if (rv)
+			goto err_out_idr;
+	}
+	c_tx = &qp->tx_ctx;
+	c_rx = &qp->rx_ctx;
+
+	c_tx->crc_enabled = c_rx->crc_enabled = CONFIG_RDMA_SIW_CRC_ENFORCED;
+
+	if (c_tx->crc_enabled) {
+		c_tx->mpa_crc_hd.tfm =
+			crypto_alloc_hash("crc32c", 0, CRYPTO_ALG_ASYNC);
+		if (IS_ERR(c_tx->mpa_crc_hd.tfm)) {
+			rv = -PTR_ERR(c_tx->mpa_crc_hd.tfm);
+			dprint(DBG_ON, "(QP%d): Failed loading crc32c"
+				" with error %d. ", QP_ID(qp), rv);
+			goto err_out_idr;
+		}
+	}
+	if (c_rx->crc_enabled) {
+		c_rx->mpa_crc_hd.tfm =
+			crypto_alloc_hash("crc32c", 0, CRYPTO_ALG_ASYNC);
+		if (IS_ERR(c_rx->mpa_crc_hd.tfm)) {
+			rv = -PTR_ERR(c_rx->mpa_crc_hd.tfm);
+			crypto_free_hash(c_tx->mpa_crc_hd.tfm);
+			goto err_out_idr;
+		}
+	}
+	atomic_set(&qp->tx_ctx.in_use, 0);
+
+	qp->ofa_qp.qp_num = QP_ID(qp);
+
+	siw_pd_get(pd);
+
+	return &qp->ofa_qp;
+
+err_out_idr:
+	siw_remove_obj(&dev->idr_lock, &dev->qp_idr, &qp->hdr);
+err_out:
+	if (scq)
+		siw_cq_put(scq);
+	if (rcq)
+		siw_cq_put(rcq);
+
+	kfree(qp);
+	atomic_dec(&dev->num_qp);
+
+	return ERR_PTR(rv);
+}
+
+/*
+ * Minimum siw_query_qp() verb interface to allow for qperf application
+ * to run on siw.
+ *
+ * TODO: all.
+ */
+int siw_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
+		 int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
+{
+	qp_attr->cap.max_inline_data = SIW_MAX_INLINE;
+	qp_init_attr->cap.max_inline_data = 0;
+
+	return 0;
+}
+
+int siw_ofed_modify_qp(struct ib_qp *ofa_qp, struct ib_qp_attr *attr,
+			 int attr_mask, struct ib_udata *udata)
+{
+	struct siw_qp_attrs	new_attrs;
+	enum siw_qp_attr_mask	siw_attr_mask = 0;
+	struct siw_qp		*qp = siw_qp_ofa2siw(ofa_qp);
+	int			rv;
+
+	if (!attr_mask) {
+		dprint(DBG_CM, "(QP%d): attr_mask==0 ignored\n", QP_ID(qp));
+		return 0;
+	}
+	siw_dprint_qp_attr_mask(attr_mask);
+
+	memset(&new_attrs, 0, sizeof new_attrs);
+
+	if (attr_mask & IB_QP_ACCESS_FLAGS) {
+
+		siw_attr_mask |= SIW_QP_ATTR_ACCESS_FLAGS;
+
+		if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ)
+			new_attrs.flags |= SIW_RDMA_READ_ENABLED;
+		if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE)
+			new_attrs.flags |= SIW_RDMA_WRITE_ENABLED;
+		if (attr->qp_access_flags & IB_ACCESS_MW_BIND)
+			new_attrs.flags |= SIW_RDMA_BIND_ENABLED;
+	}
+	if (attr_mask & IB_QP_STATE) {
+		dprint(DBG_CM, "(QP%d): Desired IB QP state: %s\n",
+			   QP_ID(qp), ib_qp_state_to_string[attr->qp_state]);
+
+		new_attrs.state = ib_qp_state_to_siw_qp_state[attr->qp_state];
+
+		if (new_attrs.state > SIW_QP_STATE_RTS)
+			qp->tx_ctx.tx_suspend = 1;
+
+		/* TODO: SIW_QP_STATE_UNDEF is currently not possible ... */
+		if (new_attrs.state == SIW_QP_STATE_UNDEF)
+			return -EINVAL;
+
+		siw_attr_mask |= SIW_QP_ATTR_STATE;
+	}
+	if (!attr_mask)
+		return 0;
+
+	down_write(&qp->state_lock);
+
+	rv = siw_qp_modify(qp, &new_attrs, siw_attr_mask);
+
+	up_write(&qp->state_lock);
+	return rv;
+}
+
+int siw_destroy_qp(struct ib_qp *ofa_qp)
+{
+	struct ib_device	*ofa_dev = ofa_qp->device;
+	struct siw_dev		*dev = siw_dev_ofa2siw(ofa_dev);
+	struct siw_qp		*qp = siw_qp_ofa2siw(ofa_qp);
+	struct siw_cep		*cep;
+	struct siw_qp_attrs	qp_attrs;
+
+	dprint(DBG_CM, "(QP%d): SIW QP state=%d, cep=0x%p\n",
+		QP_ID(qp), qp->attrs.state, qp->cep);
+
+	/*
+	 * Mark QP as in process of destruction to prevent from eventual async
+	 * callbacks to OFA core
+	 */
+	qp->attrs.flags |= SIW_QP_IN_DESTROY;
+	qp->rx_ctx.rx_suspend = 1;
+
+	down_write(&qp->state_lock);
+
+	qp_attrs.state = SIW_QP_STATE_ERROR;
+	(void)siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE);
+
+	up_write(&qp->state_lock);
+
+	cep = qp->cep;
+	if (cep) {
+		/*
+		 * Wait if CM work is scheduled. calling siw_qp_modify()
+		 * already dropped the network connection.
+		 */
+		dprint(DBG_CM, " (QP%d) (CEP 0x%p): %s (%d)\n",
+			QP_ID(qp), cep, atomic_read(&cep->ref.refcount) > 1 ?
+			"Wait for CM" : "CM done",
+			atomic_read(&cep->ref.refcount));
+
+		wait_event(cep->waitq, atomic_read(&cep->ref.refcount) == 1);
+		dprint(DBG_CM, "(QP%d): CM done 2\n", QP_ID(qp));
+		qp->cep = 0;
+		siw_cep_put(cep);
+	}
+
+	if (qp->rx_ctx.crc_enabled)
+		crypto_free_hash(qp->rx_ctx.mpa_crc_hd.tfm);
+	if (qp->tx_ctx.crc_enabled)
+		crypto_free_hash(qp->tx_ctx.mpa_crc_hd.tfm);
+
+	siw_remove_obj(&dev->idr_lock, &dev->qp_idr, &qp->hdr);
+
+	/* Drop references */
+	siw_cq_put(qp->scq);
+	siw_cq_put(qp->rcq);
+	siw_pd_put(qp->pd);
+	qp->scq = qp->rcq = NULL;
+
+	siw_qp_freeq_flush(qp);
+
+	siw_qp_put(qp);
+
+	atomic_dec(&dev->num_qp);
+	return 0;
+}
+
+/*
+ * siw_copy_sgl()
+ *
+ * Copy SGL from user (OFA) representation to local
+ * representation.
+ * Memory lookup and base+bounds checks must
+ * be deferred until wqe gets executed
+ */
+static int siw_copy_sgl(struct ib_sge *ofa_sge, struct siw_sge *si_sge,
+			int num_sge)
+{
+	int bytes = 0;
+
+	while (num_sge--) {
+		si_sge->addr = ofa_sge->addr;
+		si_sge->len  = ofa_sge->length;
+		si_sge->lkey = ofa_sge->lkey;
+		/*
+		 * defer memory lookup to WQE processing
+		 */
+		si_sge->mem.obj = NULL;
+
+		bytes += si_sge->len;
+		si_sge++; ofa_sge++;
+	}
+	return bytes;
+}
+
+/*
+ * siw_copy_inline_sgl()
+ *
+ * Prepare sgl of inlined data for sending.
+ * User provided sgl with unregistered user buffers. The function checks
+ * if the given buffer addresses and len's are within process context
+ * bounds and copies data into one kernel buffer. This implies dual copy
+ * operation in the tx path since TCP will make another copy for
+ * retransmission. There is room for efficiency improvement.
+ */
+static int siw_copy_inline_sgl(struct ib_sge *ofa_sge, struct siw_sge *si_sge,
+			       int num_sge)
+{
+	char	*kbuf;
+	int 	i, bytes = 0;
+
+	if (unlikely(num_sge == 0))
+		return 0;
+
+	for (i = 0; i < num_sge; i++) {
+		struct ib_sge *sge = &ofa_sge[i];
+
+		if (unlikely(!access_ok(VERIFY_READ, sge->addr, sge->length)))
+			return -EFAULT;
+
+		bytes += sge->length;
+
+		if (bytes > SIW_MAX_INLINE)
+			return -EINVAL;
+	}
+	if (unlikely(!bytes))
+		return 0;
+
+	kbuf = kmalloc(bytes, GFP_KERNEL);
+	if (unlikely(!kbuf)) {
+		dprint(DBG_ON, " kmalloc\n");
+		return -ENOMEM;
+	}
+	si_sge->mem.buf = kbuf;
+
+	while (num_sge--) {
+		if (__copy_from_user(kbuf,
+				     (void *)(unsigned long)ofa_sge->addr,
+				     ofa_sge->length)) {
+			kfree(si_sge->mem.buf);
+			return -EFAULT;
+		}
+		kbuf += ofa_sge->length;
+		ofa_sge++;
+	}
+	si_sge->len = bytes;
+	si_sge->lkey = 0;
+	si_sge->addr = 0; /* don't need the user addr */
+	return bytes;
+}
+
+
+/*
+ * siw_post_send()
+ *
+ * Post a list of S-WR's to a SQ.
+ *
+ * @ofa_qp:	OFA QP contained in siw QP
+ * @wr:		Null terminated list of user WR's
+ * @bad_wr:	Points to failing WR in case of synchronous failure.
+ */
+int siw_post_send(struct ib_qp *ofa_qp, struct ib_send_wr *wr,
+		  struct ib_send_wr **bad_wr)
+{
+	struct siw_wqe	*wqe = NULL;
+	struct siw_qp	*qp = siw_qp_ofa2siw(ofa_qp);
+
+	unsigned long flags;
+	int rv = 0;
+
+	dprint(DBG_WR|DBG_TX, "(QP%d): state=%d\n",
+		QP_ID(qp), qp->attrs.state);
+
+	/*
+	 * Acquire QP state lock for reading. The idea is that a
+	 * user cannot move the QP out of RTS during TX/RX processing.
+	 */
+	down_read(&qp->state_lock);
+
+	if (qp->attrs.state != SIW_QP_STATE_RTS) {
+		dprint(DBG_WR|DBG_ON, "(QP%d): state=%d\n",
+			QP_ID(qp), qp->attrs.state);
+		up_read(&qp->state_lock);
+		*bad_wr = wr;
+		return -ENOTCONN;
+	}
+	dprint(DBG_WR|DBG_TX, "(QP%d): sq_space(#1)=%d\n",
+		QP_ID(qp), atomic_read(&qp->sq_space));
+
+	while (wr) {
+		if (!atomic_read(&qp->sq_space)) {
+			dprint(DBG_ON, " sq_space\n");
+			wqe = NULL;
+			rv = -ENOMEM;
+			break;
+		}
+		wqe = siw_wqe_get(qp, wr->opcode);
+		if (!wqe) {
+			dprint(DBG_ON, " siw_wqe_get\n");
+			rv = -ENOMEM;
+			break;
+		}
+		if (wr->num_sge > qp->attrs.sq_max_sges) {
+			/*
+			 * NOTE: we allow for zero length wr's here.
+			 */
+			dprint(DBG_WR, "(QP%d): Num SGE: %d\n",
+				QP_ID(qp), wr->num_sge);
+			rv = -EINVAL;
+			break;
+		}
+		wr_type(wqe) = wr->opcode;
+		wr_flags(wqe) = wr->send_flags;
+		wr_id(wqe) = wr->wr_id;
+
+		if (SIW_INLINED_DATA(wqe))
+			dprint(DBG_WR, "(QP%d): INLINE DATA\n", QP_ID(qp));
+
+		switch (wr->opcode) {
+
+		case IB_WR_SEND:
+			if (!SIW_INLINED_DATA(wqe)) {
+				rv = siw_copy_sgl(wr->sg_list, wqe->wr.send.sge,
+						  wr->num_sge);
+				wqe->wr.send.num_sge = wr->num_sge;
+			} else {
+				rv = siw_copy_inline_sgl(wr->sg_list,
+							 wqe->wr.send.sge,
+							 wr->num_sge);
+				wqe->wr.send.num_sge = 1;
+			}
+			if (rv <= 0) {
+				rv = -EINVAL;
+				break;
+			}
+			wqe->bytes = rv;
+			break;
+
+		case IB_WR_RDMA_READ:
+			/*
+			 * OFED WR restricts RREAD sink to SGL containing
+			 * 1 SGE only. we could relax to SGL with multiple
+			 * elements referring the SAME ltag or even sending
+			 * a private per-rreq tag referring to a checked
+			 * local sgl with MULTIPLE ltag's. would be easy
+			 * to do...
+			 */
+			if (wr->num_sge != 1) {
+				rv = -EINVAL;
+				break;
+			}
+			rv = siw_copy_sgl(wr->sg_list, wqe->wr.rread.sge, 1);
+			/*
+			 * NOTE: zero length RREAD is allowed!
+			 */
+			wqe->wr.rread.raddr = wr->wr.rdma.remote_addr;
+			wqe->wr.rread.rtag = wr->wr.rdma.rkey;
+			wqe->wr.rread.num_sge = 1;
+			wqe->bytes = rv;
+			break;
+
+		case IB_WR_RDMA_WRITE:
+			if (!SIW_INLINED_DATA(wqe)) {
+				rv = siw_copy_sgl(wr->sg_list, wqe->wr.send.sge,
+						  wr->num_sge);
+				wqe->wr.write.num_sge = wr->num_sge;
+			} else {
+				rv = siw_copy_inline_sgl(wr->sg_list,
+							 wqe->wr.send.sge,
+							 wr->num_sge);
+				wqe->wr.write.num_sge = min(1, wr->num_sge);
+			}
+			/*
+			 * NOTE: zero length WRITE is allowed!
+			 */
+			if (rv < 0) {
+				rv = -EINVAL;
+				break;
+			}
+			wqe->wr.write.raddr = wr->wr.rdma.remote_addr;
+			wqe->wr.write.rtag = wr->wr.rdma.rkey;
+			wqe->bytes = rv;
+			break;
+
+		default:
+			dprint(DBG_WR|DBG_TX,
+				"(QP%d): Opcode %d not yet implemented\n",
+				QP_ID(qp), wr->opcode);
+			rv = -EINVAL;
+			break;
+		}
+		dprint(DBG_WR|DBG_TX, "(QP%d): opcode %d, bytes %d, "
+				"flags 0x%x\n",
+				QP_ID(qp), wr_type(wqe), wqe->bytes,
+				wr_flags(wqe));
+		if (rv < 0)
+			break;
+
+		wqe->wr_status = SR_WR_QUEUED;
+
+		lock_sq_rxsave(qp, flags);
+		list_add_tail(&wqe->list, &qp->sq);
+		atomic_dec(&qp->sq_space);
+		unlock_sq_rxsave(qp, flags);
+
+		wr = wr->next;
+	}
+	/*
+	 * Send directly if SQ processing is not in progress.
+	 * Eventual immediate errors (rv < 0) do not affect the involved
+	 * RI resources (Verbs, 8.3.1) and thus do not prevent from SQ
+	 * processing, if new work is already pending. But rv must be passed
+	 * to caller.
+	 */
+	lock_sq_rxsave(qp, flags);
+
+	if (tx_wqe(qp) == NULL) {
+		struct siw_wqe	*next = siw_next_tx_wqe(qp);
+		if (next != NULL) {
+			if (wr_type(next) != SIW_WR_RDMA_READ_REQ ||
+			    !ORD_SUSPEND_SQ(qp)) {
+				tx_wqe(qp) = next;
+				if (wr_type(next) != SIW_WR_RDMA_READ_REQ)
+					list_del_init(&next->list);
+				else
+					siw_rreq_queue(next, qp);
+
+				unlock_sq_rxsave(qp, flags);
+
+				dprint(DBG_WR|DBG_TX,
+					"(QP%d): Direct sending...\n",
+					QP_ID(qp));
+
+				if (siw_qp_sq_process(qp, 1) != 0 &&
+				    !(qp->tx_ctx.tx_suspend))
+					siw_qp_cm_drop(qp, 0);
+			} else
+				unlock_sq_rxsave(qp, flags);
+		} else
+			unlock_sq_rxsave(qp, flags);
+	} else
+		unlock_sq_rxsave(qp, flags);
+
+	up_read(&qp->state_lock);
+
+	dprint(DBG_WR|DBG_TX, "(QP%d): sq_space(#2)=%d\n", QP_ID(qp),
+		atomic_read(&qp->sq_space));
+	if (rv >= 0)
+		return 0;
+	/*
+	 * Immediate error
+	 */
+	dprint(DBG_WR|DBG_ON, "(QP%d): error=%d\n", QP_ID(qp), rv);
+
+	if (wqe != NULL)
+		siw_wqe_put(wqe);
+	*bad_wr = wr;
+	return rv;
+}
+
+/*
+ * siw_post_receive()
+ *
+ * Post a list of R-WR's to a RQ.
+ *
+ * @ofa_qp:	OFA QP contained in siw QP
+ * @wr:		Null terminated list of user WR's
+ * @bad_wr:	Points to failing WR in case of synchronous failure.
+ */
+int siw_post_receive(struct ib_qp *ofa_qp, struct ib_recv_wr *wr,
+		     struct ib_recv_wr **bad_wr)
+{
+	struct siw_wqe	*wqe = NULL;
+	struct siw_qp	*qp = siw_qp_ofa2siw(ofa_qp);
+	unsigned long	flags;
+	int rv = 0;
+
+	dprint(DBG_WR|DBG_TX, "(QP%d): state=%d\n", QP_ID(qp),
+		qp->attrs.state);
+
+	if (qp->srq)
+		return -EOPNOTSUPP; /* what else from errno.h? */
+	/*
+	 * Acquire a QP state lock for reading. The idea is that a
+	 * user cannot move the QP out of RTS during TX/RX processing.
+	 */
+	down_read(&qp->state_lock);
+
+	if (qp->attrs.state > SIW_QP_STATE_RTS) {
+		up_read(&qp->state_lock);
+		dprint(DBG_ON, " (QP%d): state=%d\n", QP_ID(qp),
+			qp->attrs.state);
+		return -EINVAL;
+	}
+	while (wr) {
+		/*
+		 * NOTE: siw_wqe_get() calls kzalloc(), which may sleep.
+		 */
+		if (!atomic_read(&qp->rq_space) ||
+			!(wqe = siw_wqe_get(qp, SIW_WR_RECEIVE))) {
+			dprint(DBG_ON, " siw_wqe_get? (%d)\n",
+			       atomic_read(&qp->rq_space));
+			rv = -ENOMEM;
+			break;
+		}
+		if (wr->num_sge > qp->attrs.rq_max_sges) {
+			dprint(DBG_WR|DBG_ON, "(QP%d): Num SGE: %d\n",
+				QP_ID(qp), wr->num_sge);
+			rv = -EINVAL;
+			break;
+		}
+		wr_type(wqe) = SIW_WR_RECEIVE;
+		wr_id(wqe) = wr->wr_id;
+
+		rv = siw_copy_sgl(wr->sg_list, wqe->wr.recv.sge, wr->num_sge);
+		if (rv < 0) {
+			/*
+			 * XXX tentatively allow zero length receive
+			 */
+			rv = -EINVAL;
+			break;
+		}
+		wqe->wr.recv.num_sge = wr->num_sge;
+		wqe->bytes = rv;
+
+		lock_rq_rxsave(qp, flags);
+
+		list_add_tail(&wqe->list, &qp->rq);
+		wqe->wr_status = SR_WR_QUEUED;
+		atomic_dec(&qp->rq_space);
+
+		unlock_rq_rxsave(qp, flags);
+
+		wr = wr->next;
+	}
+	if (rv <= 0) {
+		dprint(DBG_WR|DBG_ON, "(QP%d): error=%d\n", QP_ID(qp), rv);
+		if (wqe != NULL)
+			siw_wqe_put(wqe);
+		*bad_wr = wr;
+	}
+	dprint(DBG_WR|DBG_RX, "(QP%d): rq_space=%d\n", QP_ID(qp),
+		atomic_read(&qp->rq_space));
+
+	up_read(&qp->state_lock);
+
+	return rv > 0 ? 0 : rv;
+}
+
+int siw_destroy_cq(struct ib_cq *ofa_cq)
+{
+	struct siw_cq	 	*cq  = siw_cq_ofa2siw(ofa_cq);
+	struct ib_device	*ofa_dev = ofa_cq->device;
+	struct siw_dev		*dev = siw_dev_ofa2siw(ofa_dev);
+
+	siw_cq_flush(cq);
+
+	siw_remove_obj(&dev->idr_lock, &dev->cq_idr, &cq->hdr);
+	siw_cq_put(cq);
+	atomic_dec(&dev->num_cq);
+	return 0;
+}
+
+/*
+ * siw_create_cq()
+ *
+ * Create CQ of requested size on given device.
+ *
+ * @ofa_dev:	OFA device contained in siw device
+ * @size:	maximum number of CQE's allowed.
+ * @ib_context: user context.
+ * @udata:	used to provide CQ ID back to user.
+ */
+
+struct ib_cq *siw_create_cq(struct ib_device *ofa_dev, int size,
+			    int vec /* unused */,
+			    struct ib_ucontext *ib_context,
+			    struct ib_udata *udata)
+{
+	struct siw_cq	 		*cq = NULL;
+	struct siw_dev 			*dev = siw_dev_ofa2siw(ofa_dev);
+	struct siw_uresp_create_cq	uresp;
+	int		 		rv;
+
+	if (atomic_inc_return(&dev->num_cq) > SIW_MAX_CQ) {
+		dprint(DBG_ON, ": Out of CQ's\n");
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	if (size < 1 || size > SIW_MAX_CQE) {
+		dprint(DBG_ON, ": CQE: %d\n", size);
+		rv = -EINVAL;
+		goto err_out;
+	}
+	cq = kmalloc(sizeof *cq, GFP_KERNEL);
+	if (!cq) {
+		dprint(DBG_ON, ":  kmalloc\n");
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	cq->ofa_cq.cqe = size - 1;
+
+	rv = siw_cq_add(dev, cq);
+	if (rv)
+		goto err_out_idr;
+
+	INIT_LIST_HEAD(&cq->queue);
+	spin_lock_init(&cq->lock);
+	atomic_set(&cq->qlen, 0);
+
+	if (ib_context) {
+		uresp.cq_id = OBJ_ID(cq);
+
+		rv = ib_copy_to_udata(udata, &uresp, sizeof uresp);
+		if (rv)
+			goto err_out_idr;
+	}
+	return &cq->ofa_cq;
+
+err_out_idr:
+	siw_remove_obj(&dev->idr_lock, &dev->cq_idr, &cq->hdr);
+err_out:
+	dprint(DBG_OBJ, ": CQ creation failed\n");
+
+	kfree(cq);
+	atomic_dec(&dev->num_cq);
+
+	return ERR_PTR(rv);
+}
+
+/*
+ * siw_poll_cq()
+ *
+ * Reap CQ entries if available and copy work completion status into
+ * array of WC's provided by caller. Returns number of reaped CQE's.
+ *
+ * @ofa_cq:	OFA CQ contained in siw CQ.
+ * @num_cqe:	Maximum number of CQE's to reap.
+ * @wc:		Array of work completions to be filled by siw.
+ */
+int siw_poll_cq(struct ib_cq *ofa_cq, int num_cqe, struct ib_wc *wc)
+{
+	struct siw_cq		*cq  = siw_cq_ofa2siw(ofa_cq);
+	int			i;
+
+	for (i = 0; i < num_cqe; i++) {
+		if (!(siw_reap_cqe(cq, wc)))
+			break;
+		wc++;
+	}
+	dprint(DBG_WR, " CQ%d: reap %d comletions (%d left)\n",
+		OBJ_ID(cq), i, atomic_read(&cq->qlen));
+
+	return i;
+}
+
+/*
+ * siw_req_notify_cq()
+ *
+ * Request notification for new CQE's added to that CQ.
+ * Defined flags:
+ * o SIW_CQ_NOTIFY_SOLICITED lets siw trigger a notification
+ *   event if a WQE with notification flag set enters the CQ
+ * o SIW_CQ_NOTIFY_NEXT_COMP lets siw trigger a notification
+ *   event if a WQE enters the CQ.
+ * o IB_CQ_REPORT_MISSED_EVENTS: return value will provide the
+ *   number of not reaped CQE's regardless of its notification
+ *   type and current or new CQ notification settings.
+ *
+ * @ofa_cq:	OFA CQ contained in siw CQ.
+ * @flags:	Requested notification flags.
+ */
+int siw_req_notify_cq(struct ib_cq *ofa_cq, enum ib_cq_notify_flags flags)
+{
+	struct siw_cq	 *cq  = siw_cq_ofa2siw(ofa_cq);
+
+	dprint(DBG_EH, "(CQ%d:) flags: 0x%8x\n", OBJ_ID(cq), flags);
+
+	if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
+		cq->notify = SIW_CQ_NOTIFY_SOLICITED;
+	else
+		cq->notify = SIW_CQ_NOTIFY_ALL;
+
+	if (flags & IB_CQ_REPORT_MISSED_EVENTS)
+		return atomic_read(&cq->qlen);
+
+	return 0;
+}
+
+/*
+ * siw_dereg_mr()
+ *
+ * Release Memory Region.
+ *
+ * TODO: Update function if Memory Windows are supported by siw:
+ *       Is OFED core checking for MW dependencies for current
+ *       MR before calling MR deregistration?.
+ *
+ * @ofa_mr:     OFA MR contained in siw MR.
+ */
+int siw_dereg_mr(struct ib_mr *ofa_mr)
+{
+	struct siw_mr	*mr;
+	struct siw_dev	*dev = siw_dev_ofa2siw(ofa_mr->device);
+
+	mr = siw_mr_ofa2siw(ofa_mr);
+
+	dprint(DBG_OBJ|DBG_MM, "(MEM%d): Release UMem %p, #ref's: %d\n",
+		mr->mem.hdr.id, mr->umem,
+		atomic_read(&mr->mem.hdr.ref.refcount));
+
+	mr->mem.stag_state = STAG_INVALID;
+
+	siw_pd_put(mr->pd);
+	siw_remove_obj(&dev->idr_lock, &dev->mem_idr, &mr->mem.hdr);
+	siw_mem_put(&mr->mem);
+
+	atomic_dec(&dev->num_mem);
+	return 0;
+}
+
+/*
+ * siw_reg_user_mr()
+ *
+ * Register Memory Region.
+ *
+ * @ofa_pd:	OFA PD contained in siw PD.
+ * @start:	starting address of MR (virtual address)
+ * @len:	len of MR
+ * @rnic_va:	not used by siw
+ * @rights:	MR access rights
+ * @udata:	user buffer to communicate STag and Key.
+ */
+struct ib_mr *siw_reg_user_mr(struct ib_pd *ofa_pd, u64 start, u64 len,
+			      u64 rnic_va, int rights, struct ib_udata *udata)
+{
+	struct siw_mr		*mr = NULL;
+	struct siw_pd		*pd = siw_pd_ofa2siw(ofa_pd);
+	struct ib_umem		*umem = NULL;
+	struct siw_ureq_reg_mr	ureq;
+	struct siw_uresp_reg_mr	uresp;
+	struct siw_dev		*dev = pd->hdr.dev;
+	int rv;
+
+	dprint(DBG_MM|DBG_OBJ, " start: 0x%016llx, "
+		"va: 0x%016llx, len: %llu, ctx: %p\n",
+		(unsigned long long)start,
+		(unsigned long long)rnic_va,
+		(unsigned long long)len,
+		ofa_pd->uobject->context);
+
+	if (atomic_inc_return(&dev->num_mem) > SIW_MAX_MR) {
+		dprint(DBG_ON, ": Out of MRs: %d\n",
+			atomic_read(&dev->num_mem));
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	if (!len) {
+		rv = -EINVAL;
+		goto err_out;
+	}
+
+	umem = ib_umem_get(ofa_pd->uobject->context, start, len, rights, 0);
+	if (IS_ERR(umem)) {
+		dprint(DBG_MM, " ib_umem_get:%ld LOCKED:%lu, LIMIT:%lu\n",
+			PTR_ERR(umem), current->mm->locked_vm,
+			current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >>
+			PAGE_SHIFT);
+		rv = -PTR_ERR(umem);
+		umem = NULL;
+		goto err_out;
+	}
+	mr = kmalloc(sizeof *mr, GFP_KERNEL);
+	if (!mr) {
+		dprint(DBG_ON, ": malloc\n");
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	mr->mem.stag_state = STAG_INVALID;
+
+	if (siw_mem_add(dev, &mr->mem) < 0) {
+		dprint(DBG_ON, ": siw_mem_add\n");
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	dprint(DBG_OBJ|DBG_MM, "(MEM%d): New Object, UMEM %p\n",
+		mr->mem.hdr.id, umem);
+
+	mr->ofa_mr.lkey = mr->ofa_mr.rkey = mr->mem.hdr.id << 8;
+
+	mr->mem.va  = start;
+	mr->mem.len = len;
+	mr->mem.fbo = 0 ;
+	mr->mem.mr  = NULL;
+	mr->mem.perms = SR_MEM_LREAD | /* not selectable in OFA */
+			(rights & IB_ACCESS_REMOTE_READ  ? SR_MEM_RREAD  : 0) |
+			(rights & IB_ACCESS_LOCAL_WRITE  ? SR_MEM_LWRITE : 0) |
+			(rights & IB_ACCESS_REMOTE_WRITE ? SR_MEM_RWRITE : 0);
+
+	mr->umem = umem;
+
+	if (udata) {
+		rv = ib_copy_from_udata(&ureq, udata, sizeof ureq);
+		if (rv)
+			goto err_out_idr;
+
+		mr->ofa_mr.lkey |= ureq.stag_key;
+		mr->ofa_mr.rkey |= ureq.stag_key; /* XXX ??? */
+		uresp.stag = mr->ofa_mr.lkey;
+
+		rv = ib_copy_to_udata(udata, &uresp, sizeof uresp);
+		if (rv)
+			goto err_out_idr;
+	}
+	mr->pd = pd;
+	siw_pd_get(pd);
+
+	mr->mem.stag_state = STAG_VALID;
+
+	return &mr->ofa_mr;
+
+err_out_idr:
+	siw_remove_obj(&dev->idr_lock, &dev->mem_idr, &mr->mem.hdr);
+err_out:
+	if (umem)
+		ib_umem_release(umem);
+
+	kfree(mr);
+
+	atomic_dec(&dev->num_mem);
+
+	return ERR_PTR(rv);
+}
+
+/*
+ * siw_create_srq()
+ *
+ * Create Shared Receive Queue of attributes @init_attrs
+ * within protection domain given by @ofa_pd.
+ *
+ * @ofa_pd:	OFA PD contained in siw PD.
+ * @init_attrs:	SRQ init attributes.
+ * @udata:	not used by siw.
+ */
+struct ib_srq *siw_create_srq(struct ib_pd *ofa_pd,
+			      struct ib_srq_init_attr *init_attrs,
+			      struct ib_udata *udata)
+{
+	struct siw_srq		*srq = NULL;
+	struct ib_srq_attr	*attrs = &init_attrs->attr;
+	struct siw_pd		*pd = siw_pd_ofa2siw(ofa_pd);
+	struct siw_dev		*dev = pd->hdr.dev;
+	int rv;
+
+	if (atomic_inc_return(&dev->num_srq) > SIW_MAX_SRQ) {
+		dprint(DBG_ON, " Out of SRQ's\n");
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	if (attrs->max_wr > SIW_MAX_SRQ_WR || attrs->max_sge > SIW_MAX_SGE ||
+	    attrs->srq_limit > attrs->max_wr) {
+		rv = -EINVAL;
+		goto err_out;
+	}
+
+	srq = kmalloc(sizeof *srq, GFP_KERNEL);
+	if (!srq) {
+		dprint(DBG_ON, " malloc\n");
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	INIT_LIST_HEAD(&srq->rq);
+	srq->max_sge = attrs->max_sge;
+	atomic_set(&srq->space, attrs->max_wr);
+	srq->limit = attrs->srq_limit;
+	if (srq->limit)
+		srq->armed = 1;
+
+	srq->pd	= pd;
+	siw_pd_get(pd);
+
+	spin_lock_init(&srq->lock);
+
+	return &srq->ofa_srq;
+
+err_out:
+	kfree(srq);
+	atomic_dec(&dev->num_srq);
+
+	return ERR_PTR(rv);
+}
+
+/*
+ * siw_modify_srq()
+ *
+ * Modify SRQ. The caller may resize SRQ and/or set/reset notification
+ * limit and (re)arm IB_EVENT_SRQ_LIMIT_REACHED notification.
+ *
+ * NOTE: it is unclear if OFA allows for changing the MAX_SGE
+ * parameter. siw_modify_srq() does not check the attrs->max_sge param.
+ */
+int siw_modify_srq(struct ib_srq *ofa_srq, struct ib_srq_attr *attrs,
+		   enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
+{
+	struct siw_srq 	*srq = siw_srq_ofa2siw(ofa_srq);
+	unsigned long	flags;
+	int rv = 0;
+
+	lock_srq_rxsave(srq, flags);
+
+	if (attr_mask & IB_SRQ_MAX_WR) {
+		/* resize request */
+		if (attrs->max_wr > SIW_MAX_SRQ_WR) {
+			rv =  -EINVAL;
+			goto out;
+		}
+		if (attrs->max_wr < srq->max_wr) { /* shrink */
+			if (attrs->max_wr <
+			    srq->max_wr - atomic_read(&srq->space)) {
+				rv = -EBUSY;
+				goto out;
+			}
+			atomic_sub(srq->max_wr - attrs->max_wr, &srq->space);
+		} else /* grow */
+			atomic_add(attrs->max_wr - srq->max_wr, &srq->space);
+		srq->max_wr = attrs->max_wr;
+	}
+	if (attr_mask & IB_SRQ_LIMIT) {
+		if (attrs->srq_limit) {
+			if (attrs->srq_limit > srq->max_wr) {
+				rv = -EINVAL;
+				/* FIXME: restore old space & max_wr?? */
+				goto out;
+			}
+			srq->armed = 1;
+		} else
+			srq->armed = 0;
+
+		srq->limit = attrs->srq_limit;
+	}
+out:
+	unlock_srq_rxsave(srq, flags);
+	return rv;
+}
+
+/*
+ * siw_query_srq()
+ *
+ * Query SRQ attributes.
+ */
+int siw_query_srq(struct ib_srq *ofa_srq, struct ib_srq_attr *attrs)
+{
+	struct siw_srq 	*srq = siw_srq_ofa2siw(ofa_srq);
+	unsigned long	flags;
+
+	lock_srq_rxsave(srq, flags);
+
+	attrs->max_wr = srq->max_wr;
+	attrs->max_sge = srq->max_sge;
+	attrs->srq_limit = srq->limit;
+
+	unlock_srq_rxsave(srq, flags);
+
+	return 0;
+}
+
+/*
+ * siw_destroy_srq()
+ *
+ * Destroy SRQ.
+ * SRQ WQE's are silently destroyed, since not belonging to any QP.
+ * Furthermore, it is assumed that the SRQ is not referenced by any
+ * QP anymore - the code trusts the OFA environment to keep track
+ * of QP references.
+ */
+int siw_destroy_srq(struct ib_srq *ofa_srq)
+{
+	struct list_head	*listp, *tmp;
+	struct siw_srq		*srq = siw_srq_ofa2siw(ofa_srq);
+	struct siw_dev		*dev = srq->pd->hdr.dev;
+	unsigned long flags;
+
+	lock_srq_rxsave(srq, flags); /* probably not necessary */
+	list_for_each_safe(listp, tmp, &srq->rq) {
+		list_del(listp);
+		siw_wqe_put(list_entry(listp, struct siw_wqe, list));
+	}
+	unlock_srq_rxsave(srq, flags);
+
+	siw_pd_put(srq->pd);
+	kfree(srq);
+	atomic_dec(&dev->num_srq);
+
+	return 0;
+}
+
+/*
+ * siw_post_srq_recv()
+ *
+ * Post a list of receive queue elements to SRQ.
+ * NOTE: The function does not check or lock a certain SRQ state
+ *       during the post operation. The code simply trusts the
+ *       OFA environment.
+ *
+ * @ofa_srq:	OFA SRQ contained in siw SRQ
+ * @wr:		List of R-WR's
+ * @bad_wr:	Updated to failing WR if posting fails.
+ */
+int siw_post_srq_recv(struct ib_srq *ofa_srq, struct ib_recv_wr *wr,
+		      struct ib_recv_wr **bad_wr)
+{
+	struct siw_srq	*srq = siw_srq_ofa2siw(ofa_srq);
+	struct siw_wqe	*wqe = NULL;
+	unsigned long flags;
+	int rv = 0;
+
+	while (wr) {
+		if (!atomic_read(&srq->space) ||
+		    !(wqe = siw_srq_wqe_get(srq))) {
+			dprint(DBG_ON, " siw_srq_wqe_get\n");
+			rv = -ENOMEM;
+			break;
+		}
+		if (!wr->num_sge || wr->num_sge > srq->max_sge) {
+			dprint(DBG_WR|DBG_ON,
+				"(SRQ%p): Num SGE: %d\n", srq, wr->num_sge);
+			rv = -EINVAL;
+			break;
+		}
+		wr_type(wqe) = SIW_WR_RECEIVE;
+		wr_id(wqe) = wr->wr_id;
+		wqe->wr_status = SR_WR_QUEUED;
+
+		rv = siw_copy_sgl(wr->sg_list, wqe->wr.recv.sge, wr->num_sge);
+		if (rv == 0) {
+			/*
+			 * do not allow zero length receive
+			 * XXX correct?
+			 */
+			rv = -EINVAL;
+			break;
+		}
+		wqe->wr.recv.num_sge = wr->num_sge;
+		wqe->bytes = rv;
+
+		lock_srq_rxsave(srq, flags);
+
+		list_add_tail(&wqe->list, &srq->rq);
+		atomic_dec(&srq->space);
+
+		unlock_srq_rxsave(srq, flags);
+
+		wr = wr->next;
+	}
+	if (rv <= 0) {
+		dprint(DBG_WR|DBG_ON, "(SRQ %p): error=%d\n",
+			srq, rv);
+
+		if (wqe != NULL)
+			siw_wqe_put(wqe);
+		*bad_wr = wr;
+	}
+	dprint(DBG_WR|DBG_RX, "(SRQ%p): space=%d\n",
+		srq, atomic_read(&srq->space));
+
+	return rv > 0 ? 0 : rv;
+}
+
+
+struct ib_mr *siw_get_dma_mr(struct ib_pd *pd, int rights)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma)
+{
+	return -ENOSYS;
+}
diff --git a/drivers/infiniband/hw/siw/siw_verbs.h b/drivers/infiniband/hw/siw/siw_verbs.h
new file mode 100644
index 0000000..53eac4f
--- /dev/null
+++ b/drivers/infiniband/hw/siw/siw_verbs.h
@@ -0,0 +1,96 @@
+/*
+ * Software iWARP device driver for Linux
+ *
+ * Authors: Bernard Metzler <bmt-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>
+ *
+ * Copyright (c) 2008-2010, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *   Redistribution and use in source and binary forms, with or
+ *   without modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   - Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *   - Neither the name of IBM nor the names of its contributors may be
+ *     used to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _SIW_VERBS_H
+#define _SIW_VERBS_H
+
+#include <linux/errno.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "siw.h"
+#include "siw_cm.h"
+
+extern int siw_query_device(struct ib_device *, struct ib_device_attr *);
+
+extern struct ib_ucontext *siw_alloc_ucontext(struct ib_device *,
+					      struct ib_udata *);
+extern int siw_dealloc_ucontext(struct ib_ucontext *);
+extern int siw_query_port(struct ib_device *, u8, struct ib_port_attr *);
+extern int siw_query_pkey(struct ib_device *, u8, u16, u16 *);
+extern int siw_query_gid(struct ib_device *, u8, int, union ib_gid *);
+
+extern struct ib_pd *siw_alloc_pd(struct ib_device *, struct ib_ucontext *,
+				  struct ib_udata *);
+extern int siw_dealloc_pd(struct ib_pd *);
+extern struct ib_ah *siw_create_ah(struct ib_pd *, struct ib_ah_attr *);
+extern int siw_destroy_ah(struct ib_ah *);
+extern struct ib_qp *siw_create_qp(struct ib_pd *, struct ib_qp_init_attr *,
+				   struct ib_udata *);
+extern int siw_query_qp(struct ib_qp *, struct ib_qp_attr *, int,
+			struct ib_qp_init_attr *);
+extern int siw_ofed_modify_qp(struct ib_qp *, struct ib_qp_attr *, int,
+			      struct ib_udata *);
+extern int siw_destroy_qp(struct ib_qp *);
+extern int siw_post_send(struct ib_qp *, struct ib_send_wr *,
+			 struct ib_send_wr **);
+extern int siw_post_receive(struct ib_qp *, struct ib_recv_wr *,
+			    struct ib_recv_wr **);
+extern struct ib_cq *siw_create_cq(struct ib_device *, int, int,
+				   struct ib_ucontext *, struct ib_udata *);
+extern int siw_destroy_cq(struct ib_cq *);
+extern int siw_poll_cq(struct ib_cq *, int num_entries, struct ib_wc *);
+extern int siw_req_notify_cq(struct ib_cq *, enum ib_cq_notify_flags);
+extern struct ib_mr *siw_reg_user_mr(struct ib_pd *, u64, u64, u64, int,
+				     struct ib_udata *);
+extern struct ib_mr *siw_get_dma_mr(struct ib_pd *, int);
+extern int siw_dereg_mr(struct ib_mr *);
+extern struct ib_srq *siw_create_srq(struct ib_pd *, struct ib_srq_init_attr *,
+				     struct ib_udata *);
+extern int siw_modify_srq(struct ib_srq *, struct ib_srq_attr *,
+			  enum ib_srq_attr_mask, struct ib_udata *);
+extern int siw_query_srq(struct ib_srq *, struct ib_srq_attr *);
+extern int siw_destroy_srq(struct ib_srq *);
+extern int siw_post_srq_recv(struct ib_srq *, struct ib_recv_wr *,
+			     struct ib_recv_wr **);
+extern int siw_mmap(struct ib_ucontext *, struct vm_area_struct *);
+
+#endif
-- 
1.5.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH] SIW: Connection management
From: Bernard Metzler @ 2010-10-05  6:54 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Bernard Metzler

---
 drivers/infiniband/hw/siw/siw_cm.c | 1939 ++++++++++++++++++++++++++++++++++++
 drivers/infiniband/hw/siw/siw_cm.h |  155 +++
 2 files changed, 2094 insertions(+), 0 deletions(-)
 create mode 100644 drivers/infiniband/hw/siw/siw_cm.c
 create mode 100644 drivers/infiniband/hw/siw/siw_cm.h

diff --git a/drivers/infiniband/hw/siw/siw_cm.c b/drivers/infiniband/hw/siw/siw_cm.c
new file mode 100644
index 0000000..628b3b1
--- /dev/null
+++ b/drivers/infiniband/hw/siw/siw_cm.c
@@ -0,0 +1,1939 @@
+/*
+ * Software iWARP device driver for Linux
+ *
+ * Authors: Bernard Metzler <bmt-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>
+ *          Fredy Neeser <nfd-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>
+ *
+ * Copyright (c) 2008-2010, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *   Redistribution and use in source and binary forms, with or
+ *   without modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   - Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *   - Neither the name of IBM nor the names of its contributors may be
+ *     used to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/inetdevice.h>
+#include <linux/workqueue.h>
+#include <net/sock.h>
+#include <linux/tcp.h>
+
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "siw.h"
+#include "siw_cm.h"
+#include "siw_obj.h"
+
+static int mpa_crc_enabled;
+module_param(mpa_crc_enabled, int, 0644);
+MODULE_PARM_DESC(mpa_crc_enabled, "MPA CRC enabled");
+
+static int mpa_revision = 1;
+
+
+/*
+ * siw_sock_nodelay() - Disable Nagle algorithm
+ *
+ * See also fs/ocfs2/cluster/tcp.c, o2net_set_nodelay()
+ */
+static int siw_sock_nodelay(struct socket *sock)
+{
+	int ret, val = 1;
+	mm_segment_t oldfs;
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+
+	/*
+	 * Don't use sock_setsockopt() for SOL_TCP. It doesn't check its level
+	 * argument and assumes SOL_SOCKET so, say, your TCP_NODELAY will
+	 * silently turn into SO_DEBUG.
+	 */
+	ret = sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY,
+				    (char __user *)&val, sizeof(val));
+	set_fs(oldfs);
+	return ret;
+}
+
+static void siw_cm_llp_state_change(struct sock *);
+static void siw_cm_llp_data_ready(struct sock *, int);
+static void siw_cm_llp_write_space(struct sock *);
+static void siw_cm_llp_error_report(struct sock *);
+static void siw_sk_assign_cm_upcalls(struct sock *sk)
+
+{
+	write_lock_bh(&sk->sk_callback_lock);
+	sk->sk_state_change = siw_cm_llp_state_change;
+	sk->sk_data_ready   = siw_cm_llp_data_ready;
+	sk->sk_write_space  = siw_cm_llp_write_space;
+	sk->sk_error_report = siw_cm_llp_error_report;
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void siw_sk_save_upcalls(struct sock *sk)
+{
+	struct siw_cep *cep = sk_to_cep(sk);
+	BUG_ON(!cep);
+
+	write_lock_bh(&sk->sk_callback_lock);
+	cep->sk_state_change = sk->sk_state_change;
+	cep->sk_data_ready   = sk->sk_data_ready;
+	cep->sk_write_space  = sk->sk_write_space;
+	cep->sk_error_report = sk->sk_error_report;
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void siw_sk_restore_upcalls(struct sock *sk, struct siw_cep *cep)
+{
+	sk->sk_state_change	= cep->sk_state_change;
+	sk->sk_data_ready	= cep->sk_data_ready;
+	sk->sk_write_space	= cep->sk_write_space;
+	sk->sk_error_report	= cep->sk_error_report;
+	sk->sk_user_data 	= NULL;
+	sk->sk_no_check 	= 0;
+}
+
+static void siw_socket_disassoc(struct socket *s)
+{
+	struct sock	*sk = s->sk;
+	struct siw_cep	*cep;
+
+	if (sk) {
+		write_lock_bh(&sk->sk_callback_lock);
+		cep = sk_to_cep(sk);
+		if (cep) {
+			siw_sk_restore_upcalls(sk, cep);
+			siw_cep_put(cep);
+		}
+		write_unlock_bh(&sk->sk_callback_lock);
+	}
+}
+
+
+static inline int kernel_peername(struct socket *s, struct sockaddr_in *addr)
+{
+	int unused;
+	return s->ops->getname(s, (struct sockaddr *)addr, &unused, 1);
+}
+
+static inline int kernel_localname(struct socket *s, struct sockaddr_in *addr)
+{
+	int unused;
+	return s->ops->getname(s, (struct sockaddr *)addr, &unused, 0);
+}
+
+static void siw_cep_socket_assoc(struct siw_cep *cep, struct socket *s)
+{
+	cep->llp.sock = s;
+	siw_cep_get(cep);
+	s->sk->sk_user_data = cep;
+
+	siw_sk_save_upcalls(s->sk);
+	siw_sk_assign_cm_upcalls(s->sk);
+}
+
+
+static struct siw_cep *siw_cep_alloc(void)
+{
+	struct siw_cep *cep = kzalloc(sizeof *cep, GFP_KERNEL);
+	if (cep) {
+		INIT_LIST_HEAD(&cep->list);
+		INIT_LIST_HEAD(&cep->work_freelist);
+
+		cep->mpa.hdr.params.c = mpa_crc_enabled ? 1 : 0;
+		cep->mpa.hdr.params.m = 0;
+		cep->mpa.hdr.params.rev = mpa_revision ? 1 : 0;
+		kref_init(&cep->ref);
+		cep->state = SIW_EPSTATE_IDLE;
+		init_waitqueue_head(&cep->waitq);
+		spin_lock_init(&cep->lock);
+		dprint(DBG_OBJ|DBG_CM, "(CEP 0x%p): New Object\n", cep);
+	}
+	return cep;
+}
+
+static void siw_cm_free_work(struct siw_cep *cep)
+{
+	struct list_head	*w, *tmp;
+	struct siw_cm_work	*work;
+
+	list_for_each_safe(w, tmp, &cep->work_freelist) {
+		work = list_entry(w, struct siw_cm_work, list);
+		list_del(&work->list);
+		kfree(work);
+	}
+}
+
+static void siw_put_work(struct siw_cm_work *work)
+{
+	INIT_LIST_HEAD(&work->list);
+	spin_lock_bh(&work->cep->lock);
+	list_add(&work->list, &work->cep->work_freelist);
+	spin_unlock_bh(&work->cep->lock);
+}
+
+
+static void __siw_cep_dealloc(struct kref *ref)
+{
+	struct siw_cep *cep = container_of(ref, struct siw_cep, ref);
+
+	dprint(DBG_OBJ|DBG_CM, "(CEP 0x%p): Free Object\n", cep);
+
+	if (cep->listen_cep)
+		siw_cep_put(cep->listen_cep);
+
+	/* kfree(NULL) is save */
+	kfree(cep->mpa.pdata);
+	spin_lock_bh(&cep->lock);
+	if (!list_empty(&cep->work_freelist))
+		siw_cm_free_work(cep);
+	spin_unlock_bh(&cep->lock);
+
+	kfree(cep);
+}
+
+static struct siw_cm_work *siw_get_work(struct siw_cep *cep)
+{
+	struct siw_cm_work	*work = NULL;
+
+	spin_lock_bh(&cep->lock);
+	if (!list_empty(&cep->work_freelist)) {
+		work = list_entry(cep->work_freelist.next, struct siw_cm_work,
+				  list);
+		list_del_init(&work->list);
+	}
+	spin_unlock_bh(&cep->lock);
+	return work;
+}
+
+static int siw_cm_alloc_work(struct siw_cep *cep, int num)
+{
+	struct siw_cm_work	*work;
+
+	BUG_ON(!list_empty(&cep->work_freelist));
+
+	while (num--) {
+		work = kmalloc(sizeof *work, GFP_KERNEL);
+		if (!work) {
+			if (!(list_empty(&cep->work_freelist)))
+				siw_cm_free_work(cep);
+			dprint(DBG_ON, " Failed\n");
+			return -ENOMEM;
+		}
+		work->cep = cep;
+		INIT_LIST_HEAD(&work->list);
+		list_add(&work->list, &cep->work_freelist);
+	}
+	return 0;
+}
+
+static void siw_cm_release(struct siw_cep *cep)
+{
+	if (cep->llp.sock) {
+		siw_socket_disassoc(cep->llp.sock);
+		sock_release(cep->llp.sock);
+		cep->llp.sock = NULL;
+	}
+	if (cep->qp) {
+		struct siw_qp *qp = cep->qp;
+		cep->qp = NULL;
+		siw_qp_put(qp);
+	}
+	if (cep->cm_id) {
+		cep->cm_id->rem_ref(cep->cm_id);
+		cep->cm_id = NULL;
+		siw_cep_put(cep);
+	}
+	cep->state = SIW_EPSTATE_CLOSED;
+}
+
+/*
+ * Test and set CEP into CLOSE pending. After calling
+ * this function, the CEP conn_close flag is set. Returns:
+ *
+ *  1, if CEP is currently in use,
+ *  0, if CEP is not in use and not already in CLOSE,
+ * -1, if CEP is not in use and already in CLOSE.
+ */
+int siw_cep_in_close(struct siw_cep *cep)
+{
+	int rv;
+
+	spin_lock_bh(&cep->lock);
+
+	dprint(DBG_CM, " (CEP 0x%p): close %d, use %d\n",
+		cep, cep->conn_close, cep->in_use);
+
+	rv = cep->in_use ? 1 : (cep->conn_close ? -1 : 0);
+	cep->conn_close = 1; /* may be redundant */
+
+	spin_unlock_bh(&cep->lock);
+
+	return rv;
+}
+
+/*
+ * siw_qp_cm_drop()
+ *
+ * Drops established LLP connection if present and not already
+ * scheduled for dropping. Called from user context, SQ workqueue
+ * or receive IRQ. Caller signals if socket can be immediately
+ * closed (basically, if not in IRQ) and if IWCM should get
+ * informed of LLP state change.
+ */
+void siw_qp_cm_drop(struct siw_qp *qp, int schedule)
+{
+	struct siw_cep *cep = qp->cep;
+
+	qp->rx_ctx.rx_suspend = 1;
+	qp->tx_ctx.tx_suspend = 1;
+
+	if (cep && !siw_cep_in_close(cep)) {
+		if (schedule) {
+			siw_cm_queue_work(cep, SIW_CM_WORK_CLOSE_LLP);
+			return;
+		}
+		/*
+		 * Immediately close socket
+		 */
+		dprint(DBG_CM, "(): immediate close, cep->state=%d\n",
+			cep->state);
+
+		if (cep->cm_id) {
+			switch (cep->state) {
+
+			case SIW_EPSTATE_AWAIT_MPAREP:
+				siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+					      IW_CM_EVENT_STATUS_EINVAL);
+				break;
+
+			case SIW_EPSTATE_RDMA_MODE:
+				siw_cm_upcall(cep, IW_CM_EVENT_CLOSE,
+					      IW_CM_EVENT_STATUS_OK);
+
+				break;
+
+			case SIW_EPSTATE_IDLE:
+			case SIW_EPSTATE_LISTENING:
+			case SIW_EPSTATE_CONNECTING:
+			case SIW_EPSTATE_AWAIT_MPAREQ:
+			case SIW_EPSTATE_RECVD_MPAREQ:
+			case SIW_EPSTATE_CLOSED:
+			default:
+
+				break;
+			}
+			cep->cm_id->rem_ref(cep->cm_id);
+			cep->cm_id = NULL;
+			siw_cep_put(cep);
+		}
+		cep->state = SIW_EPSTATE_CLOSED;
+
+		if (cep->llp.sock) {
+			siw_socket_disassoc(cep->llp.sock);
+			sock_release(cep->llp.sock);
+			cep->llp.sock = NULL;
+		}
+		cep->qp = NULL;
+		siw_qp_put(qp);
+	}
+}
+
+
+/*
+ * Set CEP in_use flag. Returns:
+ *
+ *  1, if CEP was not in use and not scheduled for closing,
+ *  0, if CEP was not in use but scheduled for closing,
+ * -1, if CEP is currently in use.
+ */
+static int siw_cep_set_inuse(struct siw_cep *cep)
+{
+	int rv;
+
+	spin_lock_bh(&cep->lock);
+
+	dprint(DBG_CM, " (CEP 0x%p): close %d, use %d\n",
+		cep, cep->conn_close, cep->in_use);
+
+	rv = cep->in_use ? -1 : (cep->conn_close ? 0 : 1);
+	cep->in_use = 1; /* may be redundant */
+
+	spin_unlock_bh(&cep->lock);
+
+	return rv;
+}
+
+/*
+ * Clear CEP in_use flag. Returns:
+ *
+ *  1, if CEP is not scheduled for closing,
+ *  0, else.
+ */
+static int siw_cep_set_free(struct siw_cep *cep)
+{
+	int rv;
+
+	spin_lock_bh(&cep->lock);
+
+	dprint(DBG_CM, " (CEP 0x%p): close %d, use %d\n",
+		cep, cep->conn_close, cep->in_use);
+
+	cep->in_use = 0;
+	rv = cep->conn_close ? 0 : 1;
+
+	spin_unlock_bh(&cep->lock);
+
+	wake_up(&cep->waitq);
+
+	return rv;
+}
+
+
+void siw_cep_put(struct siw_cep *cep)
+{
+	dprint(DBG_OBJ|DBG_CM, "(CEP 0x%p): New refcount: %d\n",
+		cep, atomic_read(&cep->ref.refcount) - 1);
+
+	if (!kref_put(&cep->ref, __siw_cep_dealloc))
+		wake_up(&cep->waitq);
+}
+
+void siw_cep_get(struct siw_cep *cep)
+{
+	kref_get(&cep->ref);
+	dprint(DBG_OBJ|DBG_CM, "(CEP 0x%p): New refcount: %d\n",
+		cep, atomic_read(&cep->ref.refcount));
+}
+
+
+
+static inline int ksock_recv(struct socket *sock, char *buf, size_t size,
+			     int flags)
+{
+	struct kvec iov = {buf, size};
+	struct msghdr msg = {.msg_name = NULL, .msg_flags = flags};
+
+	return kernel_recvmsg(sock, &msg, &iov, 1, size, flags);
+}
+
+/*
+ * Receive MPA Request/Reply heder.
+ *
+ * Returns 0 if complete MPA Request/Reply haeder including
+ * eventual private data was received. Returns -EAGAIN if
+ * header was partially received or negative error code otherwise.
+ *
+ * Context: May be called in process context only
+ */
+static int siw_recv_mpa_rr(struct siw_cep *cep)
+{
+	struct mpa_rr	*hdr = &cep->mpa.hdr;
+	struct socket	*s = cep->llp.sock;
+	int		rcvd, to_rcv;
+
+	if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) {
+
+		rcvd = ksock_recv(s, (char *)hdr + cep->mpa.bytes_rcvd,
+				  sizeof(struct mpa_rr) -
+				  cep->mpa.bytes_rcvd, 0);
+
+		if (rcvd <= 0)
+			return -ECONNABORTED;
+
+		cep->mpa.bytes_rcvd += rcvd;
+
+		if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr))
+			return -EAGAIN;
+
+		hdr->params.pd_len = ntohs(hdr->params.pd_len);
+
+		if (hdr->params.pd_len > MPA_MAX_PRIVDATA)
+			return -EPROTO;
+	}
+
+	/*
+	 * At least the MPA Request/Reply header (frame not including
+	 * private data) has been received.
+	 * Receive (or continue receiving) any private data.
+	 */
+	to_rcv = hdr->params.pd_len -
+		 (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr));
+
+	if (!to_rcv) {
+		/*
+		 * We must have hdr->params.pd_len == 0 and thus received a
+		 * complete MPA Request/Reply frame.
+		 * Check against peer protocol violation.
+		 */
+		__u32 word;
+
+		rcvd = ksock_recv(s, (char *)&word, sizeof word, MSG_DONTWAIT);
+		if (rcvd == -EAGAIN)
+			return 0;
+
+		if (rcvd == 0) {
+			dprint(DBG_CM, " peer EOF\n");
+			return -EPIPE;
+		}
+		if (rcvd < 0) {
+			dprint(DBG_CM, " ERROR: %d: \n", rcvd);
+			return rcvd;
+		}
+		dprint(DBG_CM, " peer sent extra data: %d\n", rcvd);
+		return -EPROTO;
+	}
+
+	/*
+	 * At this point, we must have hdr->params.pd_len != 0.
+	 * A private data buffer gets allocated iff hdr->params.pd_len != 0.
+	 * Ownership of this buffer will be transferred to the IWCM
+	 * when calling siw_cm_upcall().
+	 */
+	if (!cep->mpa.pdata &&
+	    !(cep->mpa.pdata = kmalloc(hdr->params.pd_len + 4, GFP_KERNEL)))
+		return -ENOMEM;
+
+	rcvd = ksock_recv(s, cep->mpa.pdata + cep->mpa.bytes_rcvd
+			  - sizeof(struct mpa_rr), to_rcv + 4, MSG_DONTWAIT);
+
+	if (rcvd < 0)
+		return rcvd;
+
+	if (rcvd > to_rcv)
+		return -EPROTO;
+
+	cep->mpa.bytes_rcvd += rcvd;
+
+	if (to_rcv == rcvd) {
+		dprint(DBG_CM, "%d bytes private_data received",
+			hdr->params.pd_len);
+		return 0;
+	}
+	return -EAGAIN;
+}
+
+
+static void siw_proc_mpareq(struct siw_cep *cep)
+{
+	int err = siw_recv_mpa_rr(cep);
+
+	if (err)
+		goto out;
+
+	if (cep->mpa.hdr.params.rev > MPA_REVISION_1) {
+		/* allow for 0 and 1 only */
+		err = -EPROTO;
+		goto out;
+	}
+
+	if (memcmp(cep->mpa.hdr.key, MPA_KEY_REQ, sizeof cep->mpa.hdr.key)) {
+		err = -EPROTO;
+		goto out;
+	}
+	cep->state = SIW_EPSTATE_RECVD_MPAREQ;
+
+	if (cep->listen_cep->state == SIW_EPSTATE_LISTENING) {
+		/*
+		 * Since siw_cm_upcall() called with success, iwcm must hold
+		 * a reference to the CEP until the IW_CM_EVENT_CONNECT_REQUEST
+		 * has been accepted or rejected.
+		 * NOTE: If the iwcm never calls back with accept/reject,
+		 * (e.g., the user types ^C instead), the CEP can never be
+		 * free'd. It results in a memory hole which should be
+		 * fixed by calling siw_reject() in case of application
+		 * termination..
+		 */
+		siw_cep_get(cep);
+
+		err = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REQUEST,
+				    IW_CM_EVENT_STATUS_OK);
+		if (err)
+			siw_cep_put(cep);
+	} else {
+		/*
+		 * listener lost: new connection cannot be signalled
+		 */
+		dprint(DBG_CM|DBG_ON, "(cep=0x%p): Listener lost:!\n", cep);
+		err = -EINVAL;
+	}
+out:
+	if (err) {
+		dprint(DBG_CM|DBG_ON, "(cep=0x%p): error %d\n", cep, err);
+
+		if (!siw_cep_in_close(cep)) {
+			/*
+			 * remove reference from listening cep and clear
+			 * information on related listener.
+			 */
+			siw_cep_put(cep->listen_cep);
+			cep->listen_cep = NULL;
+
+			siw_socket_disassoc(cep->llp.sock);
+			sock_release(cep->llp.sock);
+			cep->llp.sock = NULL;
+
+			cep->state = SIW_EPSTATE_CLOSED;
+			siw_cep_put(cep);
+		}
+	}
+}
+
+
+static void siw_proc_mpareply(struct siw_cep *cep)
+{
+	struct siw_qp_attrs	qp_attrs;
+	struct siw_qp		*qp = cep->qp;
+	int			rv;
+
+	rv = siw_recv_mpa_rr(cep);
+	if (rv == -EAGAIN)
+		/* incomplete mpa reply */
+		return;
+
+	if (rv)
+		goto error;
+
+	if (cep->mpa.hdr.params.rev > MPA_REVISION_1) {
+		/* allow for 0 and 1 only */
+		rv = -EPROTO;
+		goto error;
+	}
+	if (memcmp(cep->mpa.hdr.key, MPA_KEY_REP, sizeof cep->mpa.hdr.key)) {
+		rv = -EPROTO;
+		goto error;
+	}
+	/*
+	 * TODO: 1. handle eventual MPA reject (upcall with ECONNREFUSED)
+	 *       2. finish mpa parameter check/negotiation
+	 */
+	memset(&qp_attrs, 0, sizeof qp_attrs);
+	qp_attrs.mpa.marker_rcv = 0;
+	qp_attrs.mpa.marker_snd = 0;
+	qp_attrs.mpa.crc = CONFIG_RDMA_SIW_CRC_ENFORCED;
+	qp_attrs.mpa.version = 1;
+	qp_attrs.ird = cep->ird;
+	qp_attrs.ord = cep->ord;
+	qp_attrs.llp_stream_handle = cep->llp.sock;
+	qp_attrs.state = SIW_QP_STATE_RTS;
+
+	/* Move socket RX/TX under QP control */
+	down_write(&qp->state_lock);
+	if (qp->attrs.state > SIW_QP_STATE_RTR) {
+		rv = -EINVAL;
+		up_write(&qp->state_lock);
+		goto error;
+	}
+	rv = siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE|
+					       SIW_QP_ATTR_LLP_HANDLE|
+					       SIW_QP_ATTR_ORD|
+					       SIW_QP_ATTR_IRD|
+					       SIW_QP_ATTR_MPA);
+
+	if (!rv) {
+		cep->state = SIW_EPSTATE_RDMA_MODE;
+		siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+			      IW_CM_EVENT_STATUS_OK);
+
+		up_write(&qp->state_lock);
+		return;
+	}
+	up_write(&qp->state_lock);
+error:
+	/*
+	 * failed socket handover returns responsibility:
+	 * inform iwcm and drop connection
+	 * TODO: 1. send MPA reject for MPA rev==1
+	 *	    if rv != ECONNREFUSED
+	 */
+	siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, rv);
+
+	if (!siw_cep_in_close(cep)) {
+
+		cep->cm_id->rem_ref(cep->cm_id);
+		cep->cm_id = NULL;
+		siw_cep_put(cep);
+
+		siw_socket_disassoc(cep->llp.sock);
+		sock_release(cep->llp.sock);
+		cep->llp.sock = NULL;
+
+		cep->qp = NULL;
+		siw_qp_put(cep->qp);
+	}
+	cep->state = SIW_EPSTATE_CLOSED;
+}
+
+/*
+ * siw_accept_newconn - accept an incoming pending connection
+ *
+ */
+static void siw_accept_newconn(struct siw_cep *cep)
+{
+	struct socket		*s = cep->llp.sock;
+	struct socket		*new_s = NULL;
+	struct siw_cep		*new_cep = NULL;
+	int			rv = 0; /* debug only. should disappear */
+
+	new_cep = siw_cep_alloc();
+	if (!new_cep)
+		goto error;
+
+	if (siw_cm_alloc_work(new_cep, 4) != 0)
+		goto error;
+
+	/*
+	 * Copy saved socket callbacks from listening CEP
+	 * and assign new socket with new CEP
+	 */
+	new_cep->sk_state_change = cep->sk_state_change;
+	new_cep->sk_data_ready   = cep->sk_data_ready;
+	new_cep->sk_write_space  = cep->sk_write_space;
+	new_cep->sk_error_report = cep->sk_error_report;
+
+	rv = kernel_accept(s, &new_s, O_NONBLOCK);
+	if (rv != 0) {
+		/*
+		 * TODO: Already aborted by peer?
+		 * Is there anything we should do?
+		 */
+		dprint(DBG_CM|DBG_ON, "(cep=0x%p): ERROR: "
+			"kernel_accept(): rv=%d\n", cep, rv);
+		goto error;
+	}
+	new_cep->llp.sock = new_s;
+	siw_cep_get(new_cep);
+	new_s->sk->sk_user_data = new_cep;
+
+	dprint(DBG_CM, "(cep=0x%p, s=0x%p, new_s=0x%p): "
+		"New LLP connection accepted\n", cep, s, new_s);
+
+	rv = siw_sock_nodelay(new_s);
+	if (rv != 0) {
+		dprint(DBG_CM|DBG_ON, "(cep=0x%p): ERROR: "
+			"siw_sock_nodelay(): rv=%d\n", cep, rv);
+		goto error;
+	}
+
+	rv = kernel_peername(new_s, &new_cep->llp.raddr);
+	if (rv != 0) {
+		dprint(DBG_CM|DBG_ON, "(cep=0x%p): ERROR: "
+			"kernel_peername(): rv=%d\n", cep, rv);
+		goto error;
+	}
+	rv = kernel_localname(new_s, &new_cep->llp.laddr);
+	if (rv != 0) {
+		dprint(DBG_CM|DBG_ON, "(cep=0x%p): ERROR: "
+			"kernel_localname(): rv=%d\n", cep, rv);
+		goto error;
+	}
+
+	/*
+	 * See siw_proc_mpareq() etc. for the use of new_cep->listen_cep.
+	 */
+	new_cep->listen_cep = cep;
+	siw_cep_get(cep);
+
+	new_cep->state = SIW_EPSTATE_AWAIT_MPAREQ;
+
+	if (atomic_read(&new_s->sk->sk_rmem_alloc)) {
+		/*
+		 * MPA REQ already queued
+		 */
+		dprint(DBG_CM, "(cep=0x%p): Immediate MPA req.\n", cep);
+
+		siw_proc_mpareq(new_cep);
+	}
+	return;
+
+error:
+	if (new_cep)
+		siw_cep_put(new_cep);
+
+	if (new_s) {
+		siw_socket_disassoc(new_s);
+		sock_release(new_s);
+	}
+	dprint(DBG_CM|DBG_ON, "(cep=0x%p): ERROR: rv=%d\n", cep, rv);
+}
+
+/*
+ * Expects params->pd_len in host byte order
+ *
+ * TODO: We might want to combine the arguments params and pdata to a single
+ * pointer to a struct siw_mpa_info as defined in siw_cm.h.
+ * This way, all private data parameters would be in a common struct.
+ */
+static int siw_send_mpareqrep(struct socket *s, struct mpa_rr_params *params,
+				char *key, char *pdata)
+{
+	struct mpa_rr	hdr;
+	struct kvec	iov[2];
+	struct msghdr	msg;
+
+	int		rv;
+	unsigned short 	pd_len = params->pd_len;
+
+	memset(&msg, 0, sizeof(msg));
+	memset(&hdr, 0, sizeof hdr);
+	memcpy(hdr.key, key, 16);
+
+	/*
+	 * TODO: By adding a union to struct mpa_rr_params, it should be
+	 * possible to replace the next 4 statements by one
+	 */
+	hdr.params.r = params->r;
+	hdr.params.c = params->c;
+	hdr.params.m = params->m;
+	hdr.params.rev = params->rev;
+
+	if (pd_len > MPA_MAX_PRIVDATA)
+		return -EINVAL;
+
+	hdr.params.pd_len = htons(pd_len);
+
+	iov[0].iov_base = &hdr;
+	iov[0].iov_len = sizeof hdr;
+
+	if (pd_len) {
+		iov[1].iov_base = pdata;
+		iov[1].iov_len = pd_len;
+
+		rv =  kernel_sendmsg(s, &msg, iov, 2, pd_len + sizeof hdr);
+	} else
+		rv =  kernel_sendmsg(s, &msg, iov, 1, sizeof hdr);
+
+	return rv < 0 ? rv : 0;
+}
+
+/*
+ * siw_cm_upcall()
+ *
+ * Upcall to IWCM to inform about async connection events
+ */
+int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason,
+			    enum iw_cm_event_status status)
+{
+	struct iw_cm_event	event;
+	struct iw_cm_id 	*cm_id;
+
+	memset(&event, 0, sizeof event);
+	event.status = status;
+	event.event = reason;
+
+	if (cep->mpa.hdr.params.pd_len != 0) {
+		/*
+		 * hand over MPA private data
+		 */
+		event.private_data_len = cep->mpa.hdr.params.pd_len;
+		event.private_data = cep->mpa.pdata;
+		cep->mpa.hdr.params.pd_len = 0;
+	}
+	if (reason == IW_CM_EVENT_CONNECT_REQUEST ||
+	    reason == IW_CM_EVENT_CONNECT_REPLY) {
+		event.local_addr = cep->llp.laddr;
+		event.remote_addr = cep->llp.raddr;
+	}
+	if (reason == IW_CM_EVENT_CONNECT_REQUEST) {
+		event.provider_data = cep;
+		cm_id = cep->listen_cep->cm_id;
+	} else
+		cm_id = cep->cm_id;
+
+	dprint(DBG_CM, " (QP%d): cep=0x%p, id=0x%p, dev(id)=%s, "
+		"reason=%d, status=%d\n",
+		cep->qp ? QP_ID(cep->qp) : -1, cep, cm_id,
+		cm_id->device->name, reason, status);
+
+	return cm_id->event_handler(cm_id, &event);
+}
+
+static void siw_cm_work_handler(struct work_struct *w)
+{
+	struct siw_cm_work	*work;
+	struct siw_cep		*cep;
+	int rv;
+
+	work = container_of(w, struct siw_cm_work, work);
+	cep = work->cep;
+
+	dprint(DBG_CM, " (QP%d): WORK type: %d, CEP: 0x%p\n",
+		cep->qp ? QP_ID(cep->qp) : -1, work->type, cep);
+
+	switch (work->type) {
+
+	case SIW_CM_WORK_ACCEPT:
+
+		rv = siw_cep_set_inuse(cep);
+		if (rv > 0) {
+			if (cep->state == SIW_EPSTATE_LISTENING)
+				siw_accept_newconn(cep);
+
+			if (!siw_cep_set_free(cep)) {
+				siw_cm_release(cep);
+				siw_cep_put(cep);
+			}
+			break;
+		}
+		/*
+		 * CEP already scheduled for closing
+		 */
+		if (!rv) {
+			siw_cm_release(cep);
+			(void) siw_cep_set_free(cep);
+		}
+		break;
+
+	case SIW_CM_WORK_READ_MPAHDR:
+
+		rv = siw_cep_set_inuse(cep);
+		if (rv > 0) {
+			switch (cep->state) {
+
+			case SIW_EPSTATE_AWAIT_MPAREQ:
+
+				siw_proc_mpareq(cep);
+				break;
+
+			case SIW_EPSTATE_AWAIT_MPAREP:
+
+				siw_proc_mpareply(cep);
+				break;
+
+			default:
+				/*
+				 * CEP already moved out of MPA handshake.
+				 * any connection management already done.
+				 * silently ignore the mpa packet.
+				 */
+				dprint(DBG_CM, "(): CEP not in MPA "
+					"handshake state: %d\n", cep->state);
+			}
+			if (!siw_cep_set_free(cep))
+				siw_cm_release(cep);
+
+			break;
+		}
+		/*
+		 * CEP already scheduled for closing
+		 */
+		if (!rv) {
+			siw_cm_release(cep);
+			(void) siw_cep_set_free(cep);
+		}
+		break;
+
+	case SIW_CM_WORK_CLOSE_LLP:
+		/*
+		 * QP scheduled LLP close
+		 */
+		dprint(DBG_CM, "(): SIW_CM_WORK_CLOSE_LLP, cep->state=%d\n",
+			cep->state);
+
+		cep->state = SIW_EPSTATE_CLOSED;
+
+		if (cep->llp.sock) {
+			siw_socket_disassoc(cep->llp.sock);
+			sock_release(cep->llp.sock);
+			cep->llp.sock = NULL;
+		}
+		if (cep->qp) {
+			siw_qp_llp_close(cep->qp);
+			siw_qp_put(cep->qp);
+			cep->qp = NULL;
+		}
+		if (cep->cm_id) {
+			siw_cm_upcall(cep, IW_CM_EVENT_CLOSE,
+				      IW_CM_EVENT_STATUS_OK);
+
+			cep->cm_id->rem_ref(cep->cm_id);
+			cep->cm_id = NULL;
+			siw_cep_put(cep);
+		}
+		break;
+
+	case SIW_CM_WORK_PEER_CLOSE:
+
+		dprint(DBG_CM, "(): SIW_CM_WORK_PEER_CLOSE, "
+			"cep->state=%d\n", cep->state);
+
+		if (cep->cm_id) {
+			switch (cep->state) {
+
+			case SIW_EPSTATE_AWAIT_MPAREP:
+				/*
+				 * MPA reply not received, but connection drop
+				 */
+				siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+						-ECONNRESET);
+				break;
+
+			case SIW_EPSTATE_RDMA_MODE:
+				/*
+				 * NOTE: IW_CM_EVENT_DISCONNECT is given just
+				 *       to transition IWCM into CLOSING.
+				 *       FIXME: is that needed?
+				 */
+				siw_cm_upcall(cep, IW_CM_EVENT_DISCONNECT,
+					      IW_CM_EVENT_STATUS_OK);
+				siw_cm_upcall(cep, IW_CM_EVENT_CLOSE,
+					      IW_CM_EVENT_STATUS_OK);
+
+				break;
+
+			default:
+
+				break;
+				/*
+				 * for these states there is no connection
+				 * known to the IWCM. Even not for
+				 * SIW_EPSTATE_RECVD_MPAREQ.
+				 */
+			}
+			cep->cm_id->rem_ref(cep->cm_id);
+			cep->cm_id = NULL;
+			siw_cep_put(cep);
+		}
+		if (cep->qp) {
+			siw_qp_llp_close(cep->qp);
+			siw_qp_put(cep->qp);
+			cep->qp = NULL;
+		}
+		if (cep->state != SIW_EPSTATE_CLOSED) {
+			cep->state = SIW_EPSTATE_CLOSED;
+			siw_socket_disassoc(cep->llp.sock);
+			sock_release(cep->llp.sock);
+			cep->llp.sock = NULL;
+		}
+
+		break;
+
+	default:
+		BUG();
+	}
+	dprint(DBG_CM, " (Exit): WORK type: %d, CEP: 0x%p\n", work->type, cep);
+	siw_put_work(work);
+	siw_cep_put(cep);
+}
+
+static struct workqueue_struct *siw_cm_wq;
+
+int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type)
+{
+	struct siw_cm_work *work = siw_get_work(cep);
+
+	dprint(DBG_CM, " (QP%d): WORK type: %d, CEP: 0x%p\n",
+		cep->qp ? QP_ID(cep->qp) : -1, type, cep);
+
+	if (!work) {
+		dprint(DBG_ON, " Failed\n");
+		return -ENOMEM;
+	}
+	work->type = type;
+	work->cep = cep;
+
+	siw_cep_get(cep);
+
+	INIT_WORK(&work->work, siw_cm_work_handler);
+	queue_work(siw_cm_wq, &work->work);
+
+	return 0;
+}
+
+
+static void siw_cm_llp_data_ready(struct sock *sk, int flags)
+{
+	struct siw_cep	*cep;
+
+	read_lock(&sk->sk_callback_lock);
+
+	cep = sk_to_cep(sk);
+	if (!cep) {
+		WARN_ON(1);
+		goto out;
+	}
+
+	if (cep->conn_close)
+		goto out;
+
+	dprint(DBG_CM, "(): cep 0x%p, state: %d, flags %x\n", cep,
+		cep->state, flags);
+
+	switch (cep->state) {
+
+	case SIW_EPSTATE_RDMA_MODE:
+	case SIW_EPSTATE_LISTENING:
+
+		break;
+
+	case SIW_EPSTATE_AWAIT_MPAREQ:
+	case SIW_EPSTATE_AWAIT_MPAREP:
+
+		siw_cm_queue_work(cep, SIW_CM_WORK_READ_MPAHDR);
+		break;
+
+	default:
+		dprint(DBG_CM, "(): Unexpected DATA, state %d\n", cep->state);
+		break;
+	}
+out:
+	read_unlock(&sk->sk_callback_lock);
+}
+
+static void siw_cm_llp_write_space(struct sock *sk)
+{
+	struct siw_cep	*cep = sk_to_cep(sk);
+
+	if (cep)
+		dprint(DBG_CM, "(): cep: 0x%p, state: %d\n", cep, cep->state);
+}
+
+static void siw_cm_llp_error_report(struct sock *sk)
+{
+	struct siw_cep	*cep = sk_to_cep(sk);
+
+	dprint(DBG_CM, "(): error: %d, state: %d\n", sk->sk_err, sk->sk_state);
+
+	if (cep) {
+		cep->sk_error = sk->sk_err;
+		dprint(DBG_CM, "(): cep->state: %d\n", cep->state);
+		cep->sk_error_report(sk);
+	}
+}
+
+static void siw_cm_llp_state_change(struct sock *sk)
+{
+	struct siw_cep	*cep;
+	struct socket 	*s;
+	void (*orig_state_change)(struct sock *);
+
+
+	read_lock(&sk->sk_callback_lock);
+
+	cep = sk_to_cep(sk);
+	if (!cep) {
+		WARN_ON(1);
+		read_unlock(&sk->sk_callback_lock);
+		return;
+	}
+	orig_state_change = cep->sk_state_change;
+
+	s = sk->sk_socket;
+
+	dprint(DBG_CM, "(): cep: 0x%p, state: %d\n", cep, cep->state);
+
+	switch (sk->sk_state) {
+
+	case TCP_ESTABLISHED:
+		/*
+		 * handle accepting socket as special case where only
+		 * new connection is possible
+		 */
+		if (cep->conn_close)
+			break;
+
+		if (cep->state == SIW_EPSTATE_LISTENING &&
+			siw_cm_queue_work(cep, SIW_CM_WORK_ACCEPT) != 0) {
+				dprint(DBG_ON, "Cannot accept\n");
+		}
+		break;
+
+	case TCP_CLOSE:
+	case TCP_CLOSE_WAIT:
+		if (cep->state <= SIW_EPSTATE_LISTENING) {
+			dprint(DBG_CM, "() Close before accept()\n");
+			break;
+		}
+		if (cep->qp)
+			cep->qp->tx_ctx.tx_suspend = 1;
+
+		if (!siw_cep_in_close(cep))
+			siw_cm_queue_work(cep, SIW_CM_WORK_PEER_CLOSE);
+
+		break;
+
+	default:
+		dprint(DBG_CM, "Unexpected sock state %d\n", sk->sk_state);
+	}
+	read_unlock(&sk->sk_callback_lock);
+	orig_state_change(sk);
+}
+
+
+static int kernel_bindconnect(struct socket *s,
+			      struct sockaddr *laddr, int laddrlen,
+			      struct sockaddr *raddr, int raddrlen, int flags)
+{
+	int err, s_val = 1;
+	/*
+	 * XXX
+	 * Tentative fix. Should not be needed but sometimes iwcm
+	 * chooses ports in use
+	 */
+	err = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&s_val,
+				sizeof s_val);
+	if (err < 0)
+		goto done;
+
+	err = s->ops->bind(s, laddr, laddrlen);
+	if (err < 0)
+		goto done;
+
+	err = s->ops->connect(s, raddr, raddrlen, flags);
+	if (err < 0)
+		goto done;
+
+	err = s->ops->getname(s, laddr, &s_val, 0);
+
+done:
+	return err;
+}
+
+
+int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params)
+{
+	struct siw_dev	*dev = siw_dev_ofa2siw(id->device);
+	struct siw_qp	*qp;
+	struct siw_cep	*cep = NULL;
+	struct socket 	*s = NULL;
+	struct sockaddr	*laddr, *raddr;
+
+	u16		pd_len = params->private_data_len;
+	int 		rv, size;
+
+	if (pd_len > MPA_MAX_PRIVDATA)
+		return -EINVAL;
+
+	qp = siw_qp_id2obj(dev, params->qpn);
+	BUG_ON(!qp);
+
+	dprint(DBG_CM, "(id=0x%p, QP%d): dev(id)=%s, l2dev=%s\n",
+		id, QP_ID(qp), dev->ofa_dev.name, dev->l2dev->name);
+	dprint(DBG_CM, "(id=0x%p, QP%d): laddr=(0x%x,%d), raddr=(0x%x,%d)\n",
+		id, QP_ID(qp),
+		ntohl(id->local_addr.sin_addr.s_addr),
+		ntohs(id->local_addr.sin_port),
+		ntohl(id->remote_addr.sin_addr.s_addr),
+		ntohs(id->remote_addr.sin_port));
+
+	down_write(&qp->state_lock);
+	if (qp->attrs.state > SIW_QP_STATE_RTR) {
+		rv = -EINVAL;
+		goto error;
+	}
+
+	laddr = (struct sockaddr *)&id->local_addr;
+	raddr = (struct sockaddr *)&id->remote_addr;
+
+	rv = sock_create(AF_INET, SOCK_STREAM, IPPROTO_TCP, &s);
+	if (rv < 0)
+		goto error;
+
+	size = SOCKBUFSIZE;
+	rv = kernel_setsockopt(s, SOL_SOCKET, SO_SNDBUF, (char *)&size,
+			       sizeof size);
+	if (rv < 0)
+		goto error;
+
+	rv = kernel_setsockopt(s, SOL_SOCKET, SO_RCVBUF, (char *)&size,
+			       sizeof size);
+	if (rv < 0)
+		goto error;
+
+	/*
+	 * NOTE: For simplification, connect() is called in blocking
+	 * mode. Might be reconsidered for async connection setup at
+	 * TCP level.
+	 */
+	rv = kernel_bindconnect(s, laddr, sizeof *laddr, raddr,
+				sizeof *raddr, 0);
+	if (rv != 0) {
+		dprint(DBG_CM, "(id=0x%p, QP%d): kernel_bindconnect: rv=%d\n",
+			id, QP_ID(qp), rv);
+		goto error;
+	}
+	rv = siw_sock_nodelay(s);
+	if (rv != 0) {
+		dprint(DBG_CM, "(id=0x%p, QP%d): siw_sock_nodelay(): rv=%d\n",
+			id, QP_ID(qp), rv);
+		goto error;
+	}
+	cep = siw_cep_alloc();
+	if (!cep) {
+		rv =  -ENOMEM;
+		goto error;
+	}
+
+	/* Associate QP with CEP */
+	siw_cep_get(cep);
+	qp->cep = cep;
+
+	/* siw_qp_get(qp) already done by QP lookup */
+	cep->qp = qp;
+
+	id->add_ref(id);
+	cep->cm_id = id;
+
+	rv = siw_cm_alloc_work(cep, 4);
+	if (rv != 0) {
+		rv = -ENOMEM;
+		goto error;
+	}
+	cep->mpa.hdr.params.pd_len = pd_len;
+	cep->ird = params->ird;
+	cep->ord = params->ord;
+	cep->state = SIW_EPSTATE_CONNECTING;
+
+	rv = kernel_peername(s, &cep->llp.raddr);
+	if (rv)
+		goto error;
+
+	rv = kernel_localname(s, &cep->llp.laddr);
+	if (rv)
+		goto error;
+
+	dprint(DBG_CM, "(id=0x%p, QP%d): pd_len = %u\n", id, QP_ID(qp), pd_len);
+	if (pd_len)
+		dprint(DBG_CM, "%d bytes private_data\n", pd_len);
+	/*
+	 * Associate CEP with socket
+	 */
+	siw_cep_socket_assoc(cep, s);
+
+	cep->state = SIW_EPSTATE_AWAIT_MPAREP;
+
+	rv = siw_send_mpareqrep(cep->llp.sock, &cep->mpa.hdr.params,
+				MPA_KEY_REQ, (char *)params->private_data);
+
+	/*
+	 * Reset private data len: in case connection drops w/o peer
+	 * sending MPA reply we would report stale data pointer during
+	 * IW_CM_EVENT_CONNECT_REPLY.
+	 */
+	cep->mpa.hdr.params.pd_len = 0;
+
+	if (rv >= 0) {
+		dprint(DBG_CM, "(id=0x%p, QP%d): Exit\n", id, QP_ID(qp));
+		up_write(&qp->state_lock);
+		return 0;
+	}
+error:
+	up_write(&qp->state_lock);
+
+	dprint(DBG_ON, " Failed: %d\n", rv);
+
+	if (cep && !siw_cep_in_close(cep)) {
+
+		siw_socket_disassoc(s);
+		sock_release(s);
+		cep->llp.sock = NULL;
+
+		cep->qp = NULL;
+
+		cep->cm_id = NULL;
+		id->rem_ref(id);
+		siw_cep_put(cep);
+
+		qp->cep = NULL;
+		siw_cep_put(cep);
+
+		cep->state = SIW_EPSTATE_CLOSED;
+	} else if (!cep && s)
+		sock_release(s);
+
+	siw_qp_put(qp);
+
+	return rv;
+}
+
+/*
+ * siw_accept - Let SoftiWARP accept an RDMA connection request
+ *
+ * @id:		New connection management id to be used for accepted
+ *		connection request
+ * @params:	Connection parameters provided by ULP for accepting connection
+ *
+ * Transition QP to RTS state, associate new CM id @id with accepted CEP
+ * and get prepared for TCP input by installing socket callbacks.
+ * Then send MPA Reply and generate the "connection established" event.
+ * Socket callbacks must be installed before sending MPA Reply, because
+ * the latter may cause a first RDMA message to arrive from the RDMA Initiator
+ * side very quickly, at which time the socket callbacks must be ready.
+ */
+int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params)
+{
+	struct siw_dev		*dev = siw_dev_ofa2siw(id->device);
+	struct siw_cep		*cep = (struct siw_cep *)id->provider_data;
+	struct siw_qp		*qp;
+	struct siw_qp_attrs	qp_attrs;
+	char			*pdata = NULL;
+	int 			rv;
+
+retry:
+	rv = siw_cep_set_inuse(cep);
+	if (rv < 0) {
+		dprint(DBG_CM, "(id=0x%p, cep=0x%p): CEP in use\n", id, cep);
+		wait_event(cep->waitq, !cep->in_use);
+		goto retry;
+	}
+	if (!rv) {
+		dprint(DBG_CM, "(id=0x%p, cep=0x%p): CEP in close\n", id, cep);
+		(void) siw_cep_set_free(cep);
+		return -EINVAL;
+	}
+	if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) {
+		if (cep->state == SIW_EPSTATE_CLOSED) {
+
+			dprint(DBG_CM, "(id=0x%p): Out of State\n", id);
+			(void) siw_cep_set_free(cep);
+
+			siw_cep_put(cep);
+			return -ECONNRESET;
+		}
+		BUG();
+	}
+	/* clear iwcm reference to CEP from IW_CM_EVENT_CONNECT_REQUEST */
+	siw_cep_put(cep);
+
+	qp = siw_qp_id2obj(dev, params->qpn);
+	BUG_ON(!qp); /* The OFA core should prevent this */
+
+	down_write(&qp->state_lock);
+	if (qp->attrs.state > SIW_QP_STATE_RTR) {
+		rv = -EINVAL;
+		goto unlock;
+	}
+
+	dprint(DBG_CM, "(id=0x%p, QP%d): dev(id)=%s\n",
+		id, QP_ID(qp), dev->ofa_dev.name);
+
+	if (params->ord > qp->attrs.ord || params->ird > qp->attrs.ird) {
+		dprint(DBG_CM|DBG_ON, "(id=0x%p, QP%d): "
+			"ORD: %d (max: %d), IRD: %d (max: %d)\n",
+			id, QP_ID(qp),
+			params->ord, qp->attrs.ord,
+			params->ird, qp->attrs.ird);
+		rv = -EINVAL;
+		goto unlock;
+	}
+	if (params->private_data_len > MPA_MAX_PRIVDATA) {
+		dprint(DBG_CM|DBG_ON, "(id=0x%p, QP%d): "
+			"Private data too long: %d (max: %d)\n",
+			id, QP_ID(qp),
+			params->private_data_len, MPA_MAX_PRIVDATA);
+		rv =  -EINVAL;
+		goto unlock;
+	}
+	cep->cm_id = id;
+	id->add_ref(id);
+
+	memset(&qp_attrs, 0, sizeof qp_attrs);
+	qp_attrs.ord = params->ord;
+	qp_attrs.ird = params->ird;
+	qp_attrs.llp_stream_handle = cep->llp.sock;
+
+	/*
+	 * TODO: Add MPA negotiation
+	 */
+	qp_attrs.mpa.marker_rcv = 0;
+	qp_attrs.mpa.marker_snd = 0;
+	qp_attrs.mpa.crc = CONFIG_RDMA_SIW_CRC_ENFORCED;
+	qp_attrs.mpa.version = 0;
+	qp_attrs.state = SIW_QP_STATE_RTS;
+
+	dprint(DBG_CM, "(id=0x%p, QP%d): Moving to RTS\n", id, QP_ID(qp));
+
+	/* Associate QP with CEP */
+	siw_cep_get(cep);
+	qp->cep = cep;
+
+	/* siw_qp_get(qp) already done by QP lookup */
+	cep->qp = qp;
+
+	cep->state = SIW_EPSTATE_RDMA_MODE;
+
+	/* Move socket RX/TX under QP control */
+	rv = siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE|
+					  SIW_QP_ATTR_LLP_HANDLE|
+					  SIW_QP_ATTR_ORD|
+					  SIW_QP_ATTR_IRD|
+					  SIW_QP_ATTR_MPA);
+	up_write(&qp->state_lock);
+
+	if (rv)
+		goto error;
+
+
+	/*
+	 * TODO: It might be more elegant and concise to check the
+	 * private data length cep->mpa.hdr.params.pd_len
+	 * inside siw_send_mpareqrep().
+	 */
+	if (params->private_data_len) {
+		pdata = (char *)params->private_data;
+
+		dprint(DBG_CM, "(id=0x%p, QP%d): %d bytes private_data\n",
+				id, QP_ID(qp), params->private_data_len);
+	}
+	cep->mpa.hdr.params.pd_len = params->private_data_len;
+
+	dprint(DBG_CM, "(id=0x%p, QP%d): Sending MPA Reply\n", id, QP_ID(qp));
+
+	rv = siw_send_mpareqrep(cep->llp.sock, &cep->mpa.hdr.params,
+				MPA_KEY_REP, pdata);
+	if (!rv) {
+		/*
+		 * FIXME: In order to ensure that the first FPDU will be sent
+		 * from the RDMA Initiator side, the "connection established"
+		 * event should be delayed until Softiwarp has received the
+		 * first FPDU from the RDMA Initiator side.
+		 * Alternatively, Softiwarp could prevent this side to
+		 * send a first FPDU until a first FPDU has been received.
+		 *
+		 * The two alternatives above will work if
+		 * (1) the RDMA application is iWARP standards compliant
+		 *     by sending its first RDMA payload from the
+		 *     RDMA Initiator side, or
+		 * (2) the RDMA Initiator side RNIC inserts an under-cover
+		 *     zero-length RDMA operation (negotiated through an
+		 *     extended MPA Request/Reply handshake) such as a
+		 *     zero-length RDMA Write or Read.
+		 * Note that (2) would require an extension of the MPA RFC.
+		 *
+		 * A third alternative (which may be the easiest for now) is to
+		 * return an error to an RDMA application that attempts to send
+		 * the first RDMA payload from the RDMA Responder side.
+		 */
+		siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED,
+				IW_CM_EVENT_STATUS_OK);
+
+		if (!siw_cep_set_free(cep))
+			siw_cm_release(cep);
+
+		dprint(DBG_CM, "(id=0x%p, QP%d): Exit\n", id, QP_ID(qp));
+		return 0;
+	}
+
+error:
+	if (siw_cep_set_free(cep)) {
+
+		siw_socket_disassoc(cep->llp.sock);
+		sock_release(cep->llp.sock);
+		cep->llp.sock = NULL;
+
+		cep->state = SIW_EPSTATE_CLOSED;
+
+		cep->cm_id->rem_ref(id);
+		cep->cm_id = NULL;
+
+		if (qp->cep) {
+			siw_cep_put(cep);
+			qp->cep = NULL;
+		}
+		cep->qp = NULL;
+		siw_qp_put(qp);
+	}
+	return rv;
+unlock:
+	up_write(&qp->state_lock);
+	goto error;
+}
+
+/*
+ * siw_reject()
+ *
+ * Local connection reject case. Send private data back to peer,
+ * close connection and dereference connection id.
+ */
+int siw_reject(struct iw_cm_id *id, const void *pdata, u8 plen)
+{
+	struct siw_cep	*cep = (struct siw_cep *)id->provider_data;
+
+	dprint(DBG_CM, "(id=0x%p): cep->state=%d\n", id, cep->state);
+	dprint(DBG_CM, " Reject: %s\n", plen ? (char *)pdata:"(no data)");
+
+	if (!siw_cep_in_close(cep)) {
+
+		dprint(DBG_ON, " Sending REJECT not yet implemented\n");
+
+		siw_socket_disassoc(cep->llp.sock);
+		sock_release(cep->llp.sock);
+		cep->llp.sock = NULL;
+
+		siw_cep_put(cep);
+		cep->state = SIW_EPSTATE_CLOSED;
+	} else {
+		dprint(DBG_CM, " (id=0x%p): Connection lost\n", id);
+	}
+
+	/*
+	 * clear iwcm reference to CEP from
+	 * IW_CM_EVENT_CONNECT_REQUEST
+	 */
+	siw_cep_put(cep);
+
+	return 0;
+}
+
+int siw_listen_address(struct iw_cm_id *id, int backlog, struct sockaddr *laddr)
+{
+	struct socket 		*s;
+	struct siw_cep		*cep = NULL;
+	int 			rv = 0, s_val;
+
+	rv = sock_create(AF_INET, SOCK_STREAM, IPPROTO_TCP, &s);
+	if (rv < 0) {
+		dprint(DBG_CM|DBG_ON, "(id=0x%p): ERROR: "
+			"sock_create(): rv=%d\n", id, rv);
+		return rv;
+	}
+
+	s_val = SOCKBUFSIZE;
+	rv = kernel_setsockopt(s, SOL_SOCKET, SO_SNDBUF, (char *)&s_val,
+			       sizeof s_val);
+	if (rv)
+		goto error;
+
+	rv = kernel_setsockopt(s, SOL_SOCKET, SO_RCVBUF, (char *)&s_val,
+			       sizeof s_val);
+	if (rv)
+		goto error;
+
+	/*
+	 * Probably to be removed later. Allows binding
+	 * local port when still in TIME_WAIT from last close.
+	 */
+	s_val = 1;
+	rv = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&s_val,
+			       sizeof s_val);
+	if (rv != 0) {
+		dprint(DBG_CM|DBG_ON, "(id=0x%p): ERROR: "
+			"kernel_setsockopt(): rv=%d\n", id, rv);
+		goto error;
+	}
+
+	rv = s->ops->bind(s, laddr, sizeof *laddr);
+	if (rv != 0) {
+		dprint(DBG_CM|DBG_ON, "(id=0x%p): ERROR: bind(): rv=%d\n",
+			id, rv);
+		goto error;
+	}
+
+	cep = siw_cep_alloc();
+	if (!cep) {
+		rv = -ENOMEM;
+		goto error;
+	}
+	siw_cep_socket_assoc(cep, s);
+
+	rv = siw_cm_alloc_work(cep, backlog);
+	if (rv != 0) {
+		dprint(DBG_CM|DBG_ON, "(id=0x%p): ERROR: "
+			"siw_cm_alloc_work(backlog=%d): rv=%d\n",
+			id, backlog, rv);
+		goto error;
+	}
+
+	rv = s->ops->listen(s, backlog);
+	if (rv != 0) {
+		dprint(DBG_CM|DBG_ON, "(id=0x%p): ERROR: listen() rv=%d\n",
+			id, rv);
+		goto error;
+	}
+
+	/*
+	 * TODO: Do we really need the copies of local_addr and remote_addr
+	 *	 in CEP ???
+	 */
+	memcpy(&cep->llp.laddr, &id->local_addr, sizeof cep->llp.laddr);
+	memcpy(&cep->llp.raddr, &id->remote_addr, sizeof cep->llp.raddr);
+
+	cep->cm_id = id;
+	id->add_ref(id);
+
+	/*
+	 * In case of a wildcard rdma_listen on a multi-homed device,
+	 * a listener's IWCM id is associated with more than one listening CEP.
+	 *
+	 * We currently use id->provider_data in three different ways:
+	 *
+	 * o For a listener's IWCM id, id->provider_data points to
+	 *   the list_head of the list of listening CEPs.
+	 *   Uses: siw_create_listen(), siw_destroy_listen()
+	 *
+	 * o For a passive-side IWCM id, id->provider_data points to
+	 *   the CEP itself. This is a consequence of
+	 *   - siw_cm_upcall() setting event.provider_data = cep and
+	 *   - the IWCM's cm_conn_req_handler() setting provider_data of the
+	 *     new passive-side IWCM id equal to event.provider_data
+	 *   Uses: siw_accept(), siw_reject()
+	 *
+	 * o For an active-side IWCM id, id->provider_data is not used at all.
+	 *
+	 */
+	if (!id->provider_data) {
+		id->provider_data = kmalloc(sizeof(struct list_head),
+					    GFP_KERNEL);
+		if (!id->provider_data) {
+			rv = -ENOMEM;
+			goto error;
+		}
+		INIT_LIST_HEAD((struct list_head *)id->provider_data);
+	}
+
+	dprint(DBG_CM, "(id=0x%p): dev(id)=%s, l2dev=%s, "
+		"id->provider_data=0x%p, cep=0x%p\n",
+		id, id->device->name,
+		siw_dev_ofa2siw(id->device)->l2dev->name,
+		id->provider_data, cep);
+
+	list_add_tail(&cep->list, (struct list_head *)id->provider_data);
+	cep->state = SIW_EPSTATE_LISTENING;
+	return 0;
+
+error:
+	dprint(DBG_ON, " Failed: %d\n", rv);
+
+	if (cep) {
+		cep->llp.sock = NULL;
+		siw_socket_disassoc(s);
+		cep->state = SIW_EPSTATE_CLOSED;
+		siw_cep_put(cep);
+	}
+	sock_release(s);
+	return rv;
+}
+
+
+/*
+ * siw_create_listen - Create resources for a listener's IWCM ID @id
+ *
+ * Listens on the socket addresses id->local_addr and id->remote_addr.
+ * We support listening on multi-homed devices, i.e., Softiwarp devices
+ * whose underlying net_device is associated with multiple IP addresses.
+ * Wildcard listening (listening with zero IP address) is also supported.
+ *
+ * There are three design options for Softiwarp device management supporting
+ * - multiple physical Ethernet ports, i.e., multiple net_device instances, and
+ * - multiple IP addresses associated with net_device,
+ * as follows:
+ *
+ *    Option 1: One Softiwarp device per net_device and
+ *              IP address associated with the net_device
+ *    Option 2: One Softiwarp device per net_device
+ *              (and all IP addresses associated with the net_device)
+ *    Option 3: Single Softiwarp device for all net_device instances
+ *              (and all IP addresses associated with these instances)
+ *
+ * We currently use Option 2, registering a separate siw_dev for
+ * each net_device. Consequently, siw_create_listen() (called separately
+ * by the IWCM for each Softiwarp device) handles the associated IP address(es)
+ * as follows:
+ *
+ * - If the listener's @id provides a specific local IP address, at most one
+ *   listening socket is created and associated with @id.
+ *
+ * - If the listener's @id provides the wildcard (zero) local IP address,
+ *   a separate listen is performed for each local IP address of the device
+ *   by creating a listening socket and binding to that local IP address.
+ *   This avoids attempts to bind to the wildcard (zero) IP address
+ *   on multiple devices, which fails with -EADDRINUSE on the second and
+ *   all subsequent devices.
+ *
+ *   For the given IWCM and Option 2 above, the alternative approach of doing
+ *   a single wildcard listen by creating one listening socket and binding it
+ *   to the wildcard IP address is not a good idea if
+ *   - there is more than one Softiwarp device (e.g., for lo and eth0), or
+ *   - there are non-Softiwarp iWARP devices that cannot cooperate.
+ */
+int siw_create_listen(struct iw_cm_id *id, int backlog)
+{
+	struct ib_device	*ofa_dev = id->device;
+	struct siw_dev		*dev = siw_dev_ofa2siw(ofa_dev);
+	int			rv = 0;
+
+	dprint(DBG_CM, "(id=0x%p): dev(id)=%s, l2dev=%s backlog=%d\n",
+		id, ofa_dev->name, dev->l2dev->name, backlog);
+
+	/*
+	 * IPv4/v6 design differences regarding multi-homing
+	 * propagate up to iWARP:
+	 * o For IPv4, use dev->l2dev->ip_ptr
+	 * o For IPv6, use dev->l2dev->ipv6_ptr
+	 */
+	if (id->local_addr.sin_family == AF_INET) {
+		/* IPv4 */
+		struct sockaddr_in	laddr = id->local_addr;
+		u8			*l_ip, *r_ip;
+		struct in_device 	*in_dev;
+
+		l_ip = (u8 *) &id->local_addr.sin_addr.s_addr;
+		r_ip = (u8 *) &id->remote_addr.sin_addr.s_addr;
+		dprint(DBG_CM, "(id=0x%p): "
+			"laddr(id)  : ipv4=%d.%d.%d.%d, port=%d; "
+			"raddr(id)  : ipv4=%d.%d.%d.%d, port=%d\n",
+			id,
+			l_ip[0], l_ip[1], l_ip[2], l_ip[3],
+			ntohs(id->local_addr.sin_port),
+			r_ip[0], r_ip[1], r_ip[2], r_ip[3],
+			ntohs(id->remote_addr.sin_port));
+
+		in_dev = in_dev_get(dev->l2dev);
+		if (!in_dev) {
+			dprint(DBG_CM|DBG_ON, "(id=0x%p): "
+				"l2dev has no in_device\n", id);
+			return -ENODEV;
+		}
+
+		/*
+		 * If in_dev is not configured, in_dev->ifa_list may be empty
+		 */
+		for_ifa(in_dev) {
+			/*
+			 * Create a listening socket if id->local_addr
+			 * contains the wildcard IP address OR
+			 * the IP address of the interface.
+			 */
+			if (ipv4_is_zeronet(id->local_addr.sin_addr.s_addr) ||
+					id->local_addr.sin_addr.s_addr ==
+					ifa->ifa_address) {
+				laddr.sin_addr.s_addr = ifa->ifa_address;
+
+				l_ip = (u8 *) &laddr.sin_addr.s_addr;
+				dprint(DBG_CM, "(id=0x%p): "
+					"laddr(bind): ipv4=%d.%d.%d.%d,"
+					" port=%d\n", id,
+					l_ip[0], l_ip[1], l_ip[2],
+					l_ip[3], ntohs(laddr.sin_port));
+
+				rv = siw_listen_address(id, backlog,
+						(struct sockaddr *)&laddr);
+				if (rv)
+					break;
+			}
+		}
+		endfor_ifa(in_dev);
+		in_dev_put(in_dev);
+
+		if (rv) {
+			/*
+			 * TODO: Cleanup resources already associated with
+			 *	 id->provider_data
+			 */
+			dprint(DBG_CM|DBG_ON, "(id=0x%p): "
+				"TODO: Cleanup resources\n", id);
+		}
+
+	} else {
+		/* IPv6 */
+		dprint(DBG_CM|DBG_ON, "(id=0x%p): TODO: IPv6 support\n", id);
+	}
+	if (!rv)
+		dprint(DBG_CM, "(id=0x%p): Success\n", id);
+
+	return rv;
+}
+
+
+int siw_destroy_listen(struct iw_cm_id *id)
+{
+	struct list_head	*p, *tmp;
+	struct siw_cep		*cep;
+
+	dprint(DBG_CM, "(id=0x%p): dev(id)=%s, l2dev=%s\n",
+		id, id->device->name,
+		siw_dev_ofa2siw(id->device)->l2dev->name);
+
+	if (!id->provider_data) {
+		/*
+		 * TODO: See if there's a way to avoid getting any
+		 *       listener ids without a list of CEPs
+		 */
+		dprint(DBG_CM, "(id=0x%p): Listener id: no CEP(s)\n", id);
+		return 0;
+	}
+
+	/*
+	 * In case of a wildcard rdma_listen on a multi-homed device,
+	 * a listener's IWCM id is associated with more than one listening CEP.
+	 */
+	list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) {
+
+		cep = list_entry(p, struct siw_cep, list);
+		list_del(p);
+
+		if (siw_cep_set_inuse(cep) > 0) {
+
+			cep->conn_close = 1;
+
+			siw_socket_disassoc(cep->llp.sock);
+			sock_release(cep->llp.sock);
+			cep->llp.sock = NULL;
+			id->rem_ref(id);
+
+			cep->state = SIW_EPSTATE_CLOSED;
+			/*
+			 * Do not set the CEP free again. The CEP is dead.
+			 * (void) siw_cep_set_free(cep);
+			 */
+		} else
+			cep->state = SIW_EPSTATE_CLOSED;
+
+		siw_cep_put(cep);
+	}
+	kfree(id->provider_data);
+	id->provider_data = NULL;
+
+	return 0;
+}
+
+int __init siw_cm_init(void)
+{
+	/*
+	 * create_single_workqueue for strict ordering
+	 */
+	siw_cm_wq = create_singlethread_workqueue("siw_cm_wq");
+	if (!siw_cm_wq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void __exit siw_cm_exit(void)
+{
+	if (siw_cm_wq) {
+		flush_workqueue(siw_cm_wq);
+		destroy_workqueue(siw_cm_wq);
+	}
+}
diff --git a/drivers/infiniband/hw/siw/siw_cm.h b/drivers/infiniband/hw/siw/siw_cm.h
new file mode 100644
index 0000000..fcb4544
--- /dev/null
+++ b/drivers/infiniband/hw/siw/siw_cm.h
@@ -0,0 +1,155 @@
+/*
+ * Software iWARP device driver for Linux
+ *
+ * Authors: Bernard Metzler <bmt-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>
+ *
+ * Copyright (c) 2008-2010, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *   Redistribution and use in source and binary forms, with or
+ *   without modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   - Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *   - Neither the name of IBM nor the names of its contributors may be
+ *     used to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _SIW_CM_H
+#define _SIW_CM_H
+
+#include <net/sock.h>
+#include <linux/tcp.h>
+
+#include <rdma/iw_cm.h>
+
+
+enum siw_cep_state {
+	SIW_EPSTATE_IDLE = 1,
+	SIW_EPSTATE_LISTENING,
+	SIW_EPSTATE_CONNECTING,
+	SIW_EPSTATE_AWAIT_MPAREQ,
+	SIW_EPSTATE_RECVD_MPAREQ,
+	SIW_EPSTATE_AWAIT_MPAREP,
+	SIW_EPSTATE_RDMA_MODE,
+	SIW_EPSTATE_CLOSED
+};
+
+struct siw_mpa_info {
+	struct mpa_rr	hdr; 	/* peer mpa hdr in host byte order */
+	char		*pdata;
+	int		bytes_rcvd;
+};
+
+struct siw_llp_info {
+	struct socket		*sock;
+	struct sockaddr_in	laddr;	/* redundant with socket info above */
+	struct sockaddr_in	raddr;	/* dito, consider removal */
+	struct siw_sk_upcalls	sk_def_upcalls;
+};
+
+struct siw_cep {
+	struct iw_cm_id		*cm_id;
+
+	/*
+	 * The provider_data element of a listener IWCM ID
+	 * refers to a list of one or more listener CEPs
+	 */
+	struct list_head	list;
+
+	struct siw_cep		*listen_cep;
+	struct siw_qp		*qp;
+	spinlock_t		lock;
+	wait_queue_head_t	waitq;
+	struct kref		ref;
+	enum siw_cep_state	state;
+	short			conn_close; /* sched. for closing or closed */
+	short			in_use;
+	struct siw_cm_work	*mpa_timer;
+	struct list_head	work_freelist;
+	struct siw_llp_info	llp;
+	struct siw_mpa_info	mpa;
+	int			ord;
+	int			ird;
+	int			sk_error; /* not (yet) used XXX */
+
+	/* Saved upcalls of socket llp.sock */
+	void    (*sk_state_change)(struct sock *sk);
+	void    (*sk_data_ready)(struct sock *sk, int bytes);
+	void    (*sk_write_space)(struct sock *sk);
+	void    (*sk_error_report)(struct sock *sk);
+};
+
+enum siw_work_type {
+	SIW_CM_WORK_ACCEPT 	= 1,
+	SIW_CM_WORK_READ_MPAHDR,
+	SIW_CM_WORK_CLOSE_LLP,		/* close socket */
+	SIW_CM_WORK_PEER_CLOSE,		/* socket indicated peer close */
+	SIW_CM_WORK_MPATIMEOUT		/* to be done ! */
+};
+
+struct siw_cm_work {
+	struct work_struct	work;
+	struct list_head	list;
+	enum siw_work_type	type;
+	struct siw_cep	*cep;
+};
+
+extern int siw_connect(struct iw_cm_id *, struct iw_cm_conn_param *);
+extern int siw_accept(struct iw_cm_id *, struct iw_cm_conn_param *);
+extern int siw_reject(struct iw_cm_id *, const void *, u8);
+extern int siw_create_listen(struct iw_cm_id *, int);
+extern int siw_destroy_listen(struct iw_cm_id *);
+
+extern int siw_cm_upcall(struct siw_cep *, enum iw_cm_event_type,
+			    enum iw_cm_event_status);
+
+extern void siw_cep_upcall(struct siw_cep *, enum iw_cm_event_type);
+
+extern void siw_cep_put(struct siw_cep *);
+extern void siw_cep_get(struct siw_cep *);
+extern int siw_cep_in_close(struct siw_cep *);
+
+extern int siw_cm_queue_work(struct siw_cep *, enum siw_work_type);
+
+extern int siw_cm_init(void);
+extern void siw_cm_exit(void);
+
+/*
+ * TCP socket interface
+ */
+#define sk_to_qp(sk)	(((struct siw_cep *)((sk)->sk_user_data))->qp)
+#define sk_to_cep(sk)	((struct siw_cep *)((sk)->sk_user_data))
+
+/*
+ * Should we use tcp_current_mss()?
+ * But its not exported by kernel.
+ */
+static inline unsigned int get_tcp_mss(struct sock *sk)
+{
+	return ((struct tcp_sock *)sk)->xmit_size_goal_segs *
+			((struct tcp_sock *)sk)->mss_cache;
+}
+
+#endif
-- 
1.5.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH] SIW: Object management
From: Bernard Metzler @ 2010-10-05  6:54 UTC (permalink / raw)
  To: netdev; +Cc: linux-rdma, Bernard Metzler

---
 drivers/infiniband/hw/siw/siw_obj.c |  499 +++++++++++++++++++++++++++++++++++
 drivers/infiniband/hw/siw/siw_obj.h |  109 ++++++++
 2 files changed, 608 insertions(+), 0 deletions(-)
 create mode 100644 drivers/infiniband/hw/siw/siw_obj.c
 create mode 100644 drivers/infiniband/hw/siw/siw_obj.h

diff --git a/drivers/infiniband/hw/siw/siw_obj.c b/drivers/infiniband/hw/siw/siw_obj.c
new file mode 100644
index 0000000..b5a1a3d
--- /dev/null
+++ b/drivers/infiniband/hw/siw/siw_obj.c
@@ -0,0 +1,499 @@
+/*
+ * Software iWARP device driver for Linux
+ *
+ * Authors: Bernard Metzler <bmt@zurich.ibm.com>
+ *
+ * Copyright (c) 2008-2010, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *   Redistribution and use in source and binary forms, with or
+ *   without modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   - Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *   - Neither the name of IBM nor the names of its contributors may be
+ *     used to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/kref.h>
+
+#include "siw.h"
+#include "siw_obj.h"
+#include "siw_cm.h"
+
+
+void siw_objhdr_init(struct siw_objhdr *hdr)
+{
+	kref_init(&hdr->ref);
+}
+
+void siw_idr_init(struct siw_dev *dev)
+{
+	spin_lock_init(&dev->idr_lock);
+
+	idr_init(&dev->qp_idr);
+	idr_init(&dev->cq_idr);
+	idr_init(&dev->pd_idr);
+	idr_init(&dev->mem_idr);
+}
+
+void siw_idr_release(struct siw_dev *dev)
+{
+	idr_destroy(&dev->qp_idr);
+	idr_destroy(&dev->cq_idr);
+	idr_destroy(&dev->pd_idr);
+	idr_destroy(&dev->mem_idr);
+}
+
+static inline int siw_add_obj(spinlock_t *lock, struct idr *idr,
+			      struct siw_objhdr *obj)
+{
+	u32		pre_id, id;
+	unsigned long	flags;
+	int		rv;
+
+	get_random_bytes(&pre_id, sizeof pre_id);
+	pre_id &= 0xffff;
+again:
+	do {
+		if (!(idr_pre_get(idr, GFP_KERNEL)))
+			return -ENOMEM;
+
+		spin_lock_irqsave(lock, flags);
+		rv = idr_get_new_above(idr, obj, pre_id, &id);
+		spin_unlock_irqrestore(lock, flags);
+
+	} while  (rv == -EAGAIN);
+
+	if (rv == 0) {
+		siw_objhdr_init(obj);
+		obj->id = id;
+		dprint(DBG_OBJ, "(OBJ%d): IDR New Object\n", id);
+	} else if (rv == -ENOSPC && pre_id != 1) {
+		pre_id = 1;
+		goto again;
+	} else {
+		dprint(DBG_OBJ|DBG_ON, "(OBJ??): IDR New Object failed!\n");
+	}
+	return rv;
+}
+
+static inline struct siw_objhdr *siw_get_obj(struct idr *idr, int id)
+{
+	struct siw_objhdr *obj;
+
+	obj = idr_find(idr, id);
+	if (obj)
+		kref_get(&obj->ref);
+
+	return obj;
+}
+
+struct siw_cq *siw_cq_id2obj(struct siw_dev *dev, int id)
+{
+	struct siw_objhdr *obj = siw_get_obj(&dev->cq_idr, id);
+	if (obj)
+		return container_of(obj, struct siw_cq, hdr);
+
+	return NULL;
+}
+
+struct siw_qp *siw_qp_id2obj(struct siw_dev *dev, int id)
+{
+	struct siw_objhdr *obj = siw_get_obj(&dev->qp_idr, id);
+	if (obj)
+		return container_of(obj, struct siw_qp, hdr);
+
+	return NULL;
+}
+
+/*
+ * siw_mem_id2obj()
+ *
+ * resolves memory from stag given by id. might be called from:
+ * o process context before sending out of sgl
+ * o or in softirq when resolving target memory
+ */
+struct siw_mem *siw_mem_id2obj(struct siw_dev *dev, int id)
+{
+	struct siw_objhdr *obj;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->idr_lock, flags);
+	obj = siw_get_obj(&dev->mem_idr, id);
+	spin_unlock_irqrestore(&dev->idr_lock, flags);
+
+	if (obj) {
+		dprint(DBG_MM|DBG_OBJ, "(MEM%d): New refcount: %d\n",
+		       obj->id, obj->ref.refcount.counter);
+
+		return container_of(obj, struct siw_mem, hdr);
+	}
+	dprint(DBG_MM|DBG_OBJ|DBG_ON, "(MEM%d): not found!\n", id);
+
+	return NULL;
+}
+
+int siw_qp_add(struct siw_dev *dev, struct siw_qp *qp)
+{
+	int rv = siw_add_obj(&dev->idr_lock, &dev->qp_idr, &qp->hdr);
+	if (!rv) {
+		dprint(DBG_OBJ, "(QP%d): New Object\n", QP_ID(qp));
+		qp->hdr.dev = dev;
+	}
+	return rv;
+}
+
+int siw_cq_add(struct siw_dev *dev, struct siw_cq *cq)
+{
+	int rv = siw_add_obj(&dev->idr_lock, &dev->cq_idr, &cq->hdr);
+	if (!rv) {
+		dprint(DBG_OBJ, "(CQ%d): New Object\n", cq->hdr.id);
+		cq->hdr.dev = dev;
+	}
+	return rv;
+}
+
+int siw_pd_add(struct siw_dev *dev, struct siw_pd *pd)
+{
+	int rv = siw_add_obj(&dev->idr_lock, &dev->pd_idr, &pd->hdr);
+	if (!rv) {
+		dprint(DBG_OBJ, "(PD%d): New Object\n", pd->hdr.id);
+		pd->hdr.dev = dev;
+	}
+	return rv;
+}
+
+/*
+ * Stag lookup is based on its index part only (24 bits)
+ * It is assumed that the idr_get_new_above(,,1,) function will
+ * always return a new id within this range (0x1...0xffffff),
+ * if one is available.
+ * The code avoids special Stag of zero and tries to randomize
+ * STag values.
+ */
+int siw_mem_add(struct siw_dev *dev, struct siw_mem *m)
+{
+	u32		id, pre_id;
+	unsigned long	flags;
+	int		rv;
+
+	do {
+		get_random_bytes(&pre_id, sizeof pre_id);
+		pre_id &= 0xffff;
+	} while (pre_id == 0);
+again:
+	do {
+		if (!(idr_pre_get(&dev->mem_idr, GFP_KERNEL)))
+			return -ENOMEM;
+
+		spin_lock_irqsave(&dev->idr_lock, flags);
+		rv = idr_get_new_above(&dev->mem_idr, m, pre_id, &id);
+		spin_unlock_irqrestore(&dev->idr_lock, flags);
+
+	} while (rv == -EAGAIN);
+
+	if (rv == -ENOSPC || (rv == 0 && id > SIW_STAG_MAX)) {
+		if (rv == 0) {
+			spin_lock_irqsave(&dev->idr_lock, flags);
+			idr_remove(&dev->mem_idr, id);
+			spin_unlock_irqrestore(&dev->idr_lock, flags);
+		}
+		if (pre_id == 1) {
+			dprint(DBG_OBJ|DBG_MM|DBG_ON,
+				"(IDR): New Object failed: %d\n", pre_id);
+			return -ENOSPC;
+		}
+		pre_id = 1;
+		goto again;
+	} else if (rv) {
+		dprint(DBG_OBJ|DBG_MM|DBG_ON,
+			"(IDR%d): New Object failed: rv %d\n", id, rv);
+		return rv;
+	}
+	siw_objhdr_init(&m->hdr);
+	m->hdr.id = id;
+	m->hdr.dev = dev;
+	dprint(DBG_OBJ|DBG_MM, "(IDR%d): New Object\n", id);
+
+	return 0;
+}
+
+void siw_remove_obj(spinlock_t *lock, struct idr *idr,
+		      struct siw_objhdr *hdr)
+{
+	unsigned long	flags;
+
+	dprint(DBG_OBJ, "(OBJ%d): IDR Remove Object\n", hdr->id);
+
+	spin_lock_irqsave(lock, flags);
+	idr_remove(idr, hdr->id);
+	spin_unlock_irqrestore(lock, flags);
+}
+
+
+/********** routines to put objs back and free if no ref left *****/
+
+static void siw_free_cq(struct kref *ref)
+{
+	struct siw_cq *cq =
+		(container_of(container_of(ref, struct siw_objhdr, ref),
+			      struct siw_cq, hdr));
+
+	dprint(DBG_OBJ, "(CQ%d): Free Object\n", cq->hdr.id);
+
+	kfree(cq);
+}
+
+static void siw_free_qp(struct kref *ref)
+{
+	struct siw_qp	*qp =
+		container_of(container_of(ref, struct siw_objhdr, ref),
+			     struct siw_qp, hdr);
+
+	dprint(DBG_OBJ|DBG_CM, "(QP%d): Free Object\n", QP_ID(qp));
+
+	if (qp->cep)
+		siw_cep_put(qp->cep);
+
+	kfree(qp);
+}
+
+static void siw_free_pd(struct kref *ref)
+{
+	struct siw_pd	*pd =
+		container_of(container_of(ref, struct siw_objhdr, ref),
+			     struct siw_pd, hdr);
+
+	dprint(DBG_OBJ, "(PD%d): Free Object\n", pd->hdr.id);
+
+	kfree(pd);
+}
+
+static void siw_free_mem(struct kref *ref)
+{
+	struct siw_mem *m;
+
+	m = container_of(container_of(ref, struct siw_objhdr, ref),
+			 struct siw_mem, hdr);
+
+	dprint(DBG_MM|DBG_OBJ, "(MEM%d): Free Object\n", OBJ_ID(m));
+
+	if (SIW_MEM_IS_MW(m)) {
+		struct siw_mw *mw = container_of(m, struct siw_mw, mem);
+		kfree(mw);
+	} else {
+		struct siw_mr *mr = container_of(m, struct siw_mr, mem);
+		dprint(DBG_MM|DBG_OBJ, "(MEM%d): Release UMem\n", OBJ_ID(m));
+		ib_umem_release(mr->umem);
+		kfree(mr);
+	}
+}
+
+
+void siw_cq_put(struct siw_cq *cq)
+{
+	dprint(DBG_OBJ, "(CQ%d): Old refcount: %d\n",
+		OBJ_ID(cq), atomic_read(&cq->hdr.ref.refcount));
+	kref_put(&cq->hdr.ref, siw_free_cq);
+}
+
+void siw_qp_put(struct siw_qp *qp)
+{
+	dprint(DBG_OBJ, "(QP%d): Old refcount: %d\n",
+		QP_ID(qp), atomic_read(&qp->hdr.ref.refcount));
+	kref_put(&qp->hdr.ref, siw_free_qp);
+}
+
+void siw_pd_put(struct siw_pd *pd)
+{
+	dprint(DBG_OBJ, "(PD%d): Old refcount: %d\n",
+		OBJ_ID(pd), atomic_read(&pd->hdr.ref.refcount));
+	kref_put(&pd->hdr.ref, siw_free_pd);
+}
+
+void siw_mem_put(struct siw_mem *m)
+{
+	dprint(DBG_MM|DBG_OBJ, "(MEM%d): Old refcount: %d\n",
+		OBJ_ID(m), atomic_read(&m->hdr.ref.refcount));
+	kref_put(&m->hdr.ref, siw_free_mem);
+}
+
+
+/***** routines for WQE handling ***/
+
+/*
+ * siw_wqe_get()
+ *
+ * Get new WQE. For READ RESPONSE, take it from the free list which
+ * has a maximum size of maximum inbound READs. All other WQE are
+ * malloc'ed which creates some overhead. Consider change to
+ *
+ * 1. malloc WR only if it cannot be synchonously completed, or
+ * 2. operate own cache of reuseable WQE's.
+ *
+ * Current code trusts on malloc efficiency.
+ */
+inline struct siw_wqe *siw_wqe_get(struct siw_qp *qp, enum siw_wr_opcode op)
+{
+	struct siw_wqe *wqe;
+
+	if (op == SIW_WR_RDMA_READ_RESP) {
+		spin_lock(&qp->freelist_lock);
+		if (!(list_empty(&qp->wqe_freelist))) {
+			wqe = list_entry(qp->wqe_freelist.next,
+					 struct siw_wqe, list);
+			list_del(&wqe->list);
+			spin_unlock(&qp->freelist_lock);
+			wqe->processed = 0;
+			dprint(DBG_OBJ|DBG_WR,
+				"(QP%d): WQE from FreeList p: %p\n",
+				QP_ID(qp), wqe);
+		} else {
+			spin_unlock(&qp->freelist_lock);
+			wqe = NULL;
+			dprint(DBG_ON|DBG_OBJ|DBG_WR,
+				"(QP%d): FreeList empty!\n", QP_ID(qp));
+		}
+	} else {
+		wqe = kzalloc(sizeof(struct siw_wqe), GFP_KERNEL);
+		dprint(DBG_OBJ|DBG_WR, "(QP%d): New WQE p: %p\n",
+			QP_ID(qp), wqe);
+	}
+	if (wqe) {
+		INIT_LIST_HEAD(&wqe->list);
+		siw_qp_get(qp);
+		wqe->qp = qp;
+	}
+	return wqe;
+}
+
+inline struct siw_wqe *siw_srq_wqe_get(struct siw_srq *srq)
+{
+	struct siw_wqe *wqe = kzalloc(sizeof(struct siw_wqe), GFP_KERNEL);
+
+	dprint(DBG_OBJ|DBG_WR, "(SRQ%p): New WQE p: %p\n", srq, wqe);
+	if (wqe) {
+		/* implicite: wqe->qp = NULL; */
+		INIT_LIST_HEAD(&wqe->list);
+		wqe->qp = NULL;
+	}
+	return wqe;
+}
+
+/*
+ * siw_srq_fetch_wqe()
+ *
+ * fetch one RQ wqe from the SRQ and inform user
+ * if SRQ lower watermark reached
+ */
+inline struct siw_wqe *siw_srq_fetch_wqe(struct siw_qp *qp)
+{
+	struct siw_wqe *wqe;
+	struct siw_srq *srq = qp->srq;
+	int qlen;
+
+	lock_srq(srq);
+	if (!list_empty(&srq->rq)) {
+		wqe = list_first_wqe(&srq->rq);
+		list_del_init(&wqe->list);
+		qlen = srq->max_wr - atomic_inc_return(&srq->space);
+		unlock_srq(srq);
+		wqe->qp = qp;
+		if (srq->armed && qlen < srq->limit) {
+			srq->armed = 0;
+			siw_async_srq_ev(srq, IB_EVENT_SRQ_LIMIT_REACHED);
+		}
+		return wqe;
+	}
+	unlock_srq(srq);
+	return NULL;
+}
+
+inline void siw_free_inline_sgl(struct siw_sge *sge, int num_sge)
+{
+	while (num_sge--) {
+		kfree(sge->mem.buf); /* kfree handles NULL pointers */
+		sge++;
+	}
+}
+
+inline void siw_unref_mem_sgl(struct siw_sge *sge, int num_sge)
+{
+	while (num_sge--) {
+		if (sge->mem.obj != NULL)
+			siw_mem_put(sge->mem.obj);
+		sge++;
+	}
+}
+
+
+void siw_wqe_put(struct siw_wqe *wqe)
+{
+	struct siw_qp *qp = wqe->qp;
+	unsigned long flags;
+
+	dprint(DBG_OBJ|DBG_WR, " WQE: %llu:, type: %d, p: %p\n",
+		(unsigned long long)wr_id(wqe), wr_type(wqe), wqe);
+
+	switch (wr_type(wqe)) {
+
+	case SIW_WR_SEND:
+	case SIW_WR_RDMA_WRITE:
+		if (likely(!SIW_INLINED_DATA(wqe)))
+			siw_unref_mem_sgl(wqe->wr.sgl.sge,
+					  wqe->wr.sgl.num_sge);
+		else
+			siw_free_inline_sgl(wqe->wr.sgl.sge,
+					    wqe->wr.sgl.num_sge);
+	case SIW_WR_RDMA_WRITE_WITH_IMM:
+	case SIW_WR_SEND_WITH_IMM:
+		kfree(wqe);
+		break;
+
+	case SIW_WR_RECEIVE:
+	case SIW_WR_RDMA_READ_REQ:
+		siw_unref_mem_sgl(wqe->wr.sgl.sge, wqe->wr.sgl.num_sge);
+		kfree(wqe);
+		break;
+
+	case SIW_WR_RDMA_READ_RESP:
+		siw_unref_mem_sgl(wqe->wr.sgl.sge, 1);
+		wqe->wr.sgl.sge[0].mem.obj = NULL;
+		/*
+		 * freelist can be accessed by tx processing (rresp done)
+		 * and rx softirq (get new wqe for rresponse scheduling)
+		 */
+		INIT_LIST_HEAD(&wqe->list);
+		spin_lock_irqsave(&wqe->qp->freelist_lock, flags);
+		list_add_tail(&wqe->list, &wqe->qp->wqe_freelist);
+		spin_unlock_irqrestore(&wqe->qp->freelist_lock, flags);
+		break;
+
+	default:
+		WARN_ON(1);
+	}
+	siw_qp_put(qp);
+}
diff --git a/drivers/infiniband/hw/siw/siw_obj.h b/drivers/infiniband/hw/siw/siw_obj.h
new file mode 100644
index 0000000..7b8af6c
--- /dev/null
+++ b/drivers/infiniband/hw/siw/siw_obj.h
@@ -0,0 +1,109 @@
+/*
+ * Software iWARP device driver for Linux
+ *
+ * Authors: Bernard Metzler <bmt@zurich.ibm.com>
+ *
+ * Copyright (c) 2008-2010, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *   Redistribution and use in source and binary forms, with or
+ *   without modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   - Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *   - Neither the name of IBM nor the names of its contributors may be
+ *     used to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _SIW_OBJ_H
+#define _SIW_OBJ_H
+
+#include <linux/idr.h>
+#include <linux/rwsem.h>
+#include <linux/version.h>
+#include <linux/sched.h>
+#include <linux/semaphore.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "siw_debug.h"
+
+
+static inline struct siw_dev *siw_dev_ofa2siw(struct ib_device *ofa_dev)
+{
+	return container_of(ofa_dev, struct siw_dev, ofa_dev);
+}
+
+static inline void siw_cq_get(struct siw_cq *cq)
+{
+	kref_get(&cq->hdr.ref);
+	dprint(DBG_OBJ, "(CQ%d): New refcount: %d\n",
+		OBJ_ID(cq), atomic_read(&cq->hdr.ref.refcount));
+}
+static inline void siw_qp_get(struct siw_qp *qp)
+{
+	kref_get(&qp->hdr.ref);
+	dprint(DBG_OBJ, "(QP%d): New refcount: %d\n",
+		OBJ_ID(qp), atomic_read(&qp->hdr.ref.refcount));
+}
+static inline void siw_pd_get(struct siw_pd *pd)
+{
+	kref_get(&pd->hdr.ref);
+	dprint(DBG_OBJ, "(PD%d): New refcount: %d\n",
+		OBJ_ID(pd), atomic_read(&pd->hdr.ref.refcount));
+}
+static inline void siw_mem_get(struct siw_mem *mem)
+{
+	kref_get(&mem->hdr.ref);
+	dprint(DBG_OBJ|DBG_MM, "(MEM%d): New refcount: %d\n",
+		OBJ_ID(mem), atomic_read(&mem->hdr.ref.refcount));
+}
+
+extern void siw_remove_obj(spinlock_t *lock, struct idr *idr,
+				struct siw_objhdr *hdr);
+
+extern void siw_objhdr_init(struct siw_objhdr *);
+extern void siw_idr_init(struct siw_dev *);
+extern void siw_idr_release(struct siw_dev *);
+
+extern struct siw_cq *siw_cq_id2obj(struct siw_dev *, int);
+extern struct siw_qp *siw_qp_id2obj(struct siw_dev *, int);
+extern struct siw_mem *siw_mem_id2obj(struct siw_dev *, int);
+
+extern int siw_qp_add(struct siw_dev *, struct siw_qp *);
+extern int siw_cq_add(struct siw_dev *, struct siw_cq *);
+extern int siw_pd_add(struct siw_dev *, struct siw_pd *);
+extern int siw_mem_add(struct siw_dev *, struct siw_mem *m);
+
+extern struct siw_wqe *siw_wqe_get(struct siw_qp *, enum siw_wr_opcode);
+extern struct siw_wqe *siw_srq_wqe_get(struct siw_srq *);
+extern struct siw_wqe *siw_srq_fetch_wqe(struct siw_qp *);
+
+extern void siw_cq_put(struct siw_cq *);
+extern void siw_qp_put(struct siw_qp *);
+extern void siw_pd_put(struct siw_pd *);
+extern void siw_mem_put(struct siw_mem *);
+extern void siw_wqe_put(struct siw_wqe *);
+
+#endif
-- 
1.5.4.3


^ permalink raw reply related

* [PATCH] SIW: Queue pair
From: Bernard Metzler @ 2010-10-05  6:55 UTC (permalink / raw)
  To: netdev; +Cc: linux-rdma, Bernard Metzler

---
 drivers/infiniband/hw/siw/siw_qp.c |  989 ++++++++++++++++++++++++++++++++++++
 1 files changed, 989 insertions(+), 0 deletions(-)
 create mode 100644 drivers/infiniband/hw/siw/siw_qp.c

diff --git a/drivers/infiniband/hw/siw/siw_qp.c b/drivers/infiniband/hw/siw/siw_qp.c
new file mode 100644
index 0000000..42bc143
--- /dev/null
+++ b/drivers/infiniband/hw/siw/siw_qp.c
@@ -0,0 +1,989 @@
+/*
+ * Software iWARP device driver for Linux
+ *
+ * Authors: Bernard Metzler <bmt@zurich.ibm.com>
+ *          Fredy Neeser <nfd@zurich.ibm.com>
+ *
+ * Copyright (c) 2008-2010, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *   Redistribution and use in source and binary forms, with or
+ *   without modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   - Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *   - Neither the name of IBM nor the names of its contributors may be
+ *     used to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/file.h>
+#include <linux/scatterlist.h>
+#include <linux/highmem.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/tcp.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_umem.h>
+
+#include "siw.h"
+#include "siw_obj.h"
+#include "siw_cm.h"
+
+
+char siw_qp_state_to_string[SIW_QP_STATE_COUNT][sizeof "TERMINATE"] = {
+	[SIW_QP_STATE_IDLE]		= "IDLE",
+	[SIW_QP_STATE_RTR]		= "RTR",
+	[SIW_QP_STATE_RTS]		= "RTS",
+	[SIW_QP_STATE_CLOSING]		= "CLOSING",
+	[SIW_QP_STATE_TERMINATE]	= "TERMINATE",
+	[SIW_QP_STATE_ERROR]		= "ERROR",
+	[SIW_QP_STATE_MORIBUND]		= "MORIBUND",
+	[SIW_QP_STATE_UNDEF]		= "UNDEF"
+};
+
+
+/*
+ * iWARP (RDMAP, DDP and MPA) parameters as well as Softiwarp settings on a
+ * per-RDMAP message basis. Please keep order of initializer. All MPA len
+ * is initialized to minimum packet size.
+ */
+struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1] =
+{ {
+	.hdr_len = sizeof(struct iwarp_rdma_write),
+	.ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_write) - 2),
+	.ctrl.dv = DDP_VERSION,
+	.ctrl.opcode = RDMAP_RDMA_WRITE,
+	.ctrl.rv = RDMAP_VERSION,
+	.ctrl.t = 1,
+	.ctrl.l = 1,
+	.proc_data = siw_proc_write
+},
+{
+	.hdr_len = sizeof(struct iwarp_rdma_rreq),
+	.ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rreq) - 2),
+	.ctrl.dv = DDP_VERSION,
+	.ctrl.opcode = RDMAP_RDMA_READ_REQ,
+	.ctrl.rv = RDMAP_VERSION,
+	.ctrl.t = 0,
+	.ctrl.l = 1,
+	.proc_data = siw_proc_rreq
+},
+{
+	.hdr_len = sizeof(struct iwarp_rdma_rresp),
+	.ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rresp) - 2),
+	.ctrl.dv = DDP_VERSION,
+	.ctrl.opcode = RDMAP_RDMA_READ_RESP,
+	.ctrl.rv = RDMAP_VERSION,
+	.ctrl.t = 1,
+	.ctrl.l = 1,
+	.proc_data = siw_proc_rresp
+},
+{
+	.hdr_len = sizeof(struct iwarp_send),
+	.ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
+	.ctrl.dv = DDP_VERSION,
+	.ctrl.opcode = RDMAP_SEND,
+	.ctrl.rv = RDMAP_VERSION,
+	.ctrl.t = 0,
+	.ctrl.l = 1,
+	.proc_data = siw_proc_send
+},
+{
+	.hdr_len = sizeof(struct iwarp_send_inv),
+	.ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
+	.ctrl.dv = DDP_VERSION,
+	.ctrl.opcode = RDMAP_SEND_INVAL,
+	.ctrl.rv = RDMAP_VERSION,
+	.ctrl.t = 0,
+	.ctrl.l = 1,
+	.proc_data = siw_proc_unsupp
+},
+{
+	.hdr_len = sizeof(struct iwarp_send),
+	.ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
+	.ctrl.dv = DDP_VERSION,
+	.ctrl.opcode = RDMAP_SEND_SE,
+	.ctrl.rv = RDMAP_VERSION,
+	.ctrl.t = 0,
+	.ctrl.l = 1,
+	.proc_data = siw_proc_send
+},
+{
+	.hdr_len = sizeof(struct iwarp_send_inv),
+	.ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
+	.ctrl.dv = DDP_VERSION,
+	.ctrl.opcode = RDMAP_SEND_SE_INVAL,
+	.ctrl.rv = RDMAP_VERSION,
+	.ctrl.t = 0,
+	.ctrl.l = 1,
+	.proc_data = siw_proc_unsupp
+},
+{
+	.hdr_len = sizeof(struct iwarp_terminate),
+	.ctrl.mpa_len = htons(sizeof(struct iwarp_terminate) - 2),
+	.ctrl.dv = DDP_VERSION,
+	.ctrl.opcode = RDMAP_TERMINATE,
+	.ctrl.rv = RDMAP_VERSION,
+	.ctrl.t = 0,
+	.ctrl.l = 1,
+	.proc_data = siw_proc_terminate
+} };
+
+
+static void siw_qp_llp_data_ready(struct sock *sk, int flags)
+{
+	struct siw_qp		*qp;
+
+	read_lock(&sk->sk_callback_lock);
+
+	if (unlikely(!sk->sk_user_data || !sk_to_qp(sk))) {
+		dprint(DBG_ON, " No QP: %p\n", sk->sk_user_data);
+		goto done;
+	}
+	qp = sk_to_qp(sk);
+
+	if (down_read_trylock(&qp->state_lock)) {
+		read_descriptor_t	rd_desc = {.arg.data = qp, .count = 1};
+
+		dprint(DBG_SK|DBG_RX, "(QP%d): "
+			"state (before tcp_read_sock)=%d, flags=%x\n",
+			QP_ID(qp), qp->attrs.state, flags);
+
+		if (likely(qp->attrs.state == SIW_QP_STATE_RTS))
+			/*
+			 * Implements data receive operation during
+			 * socket callback. TCP gracefully catches
+			 * the case where there is nothing to receive
+			 * (not calling siw_tcp_rx_data() then).
+			 */
+			tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data);
+
+		dprint(DBG_SK|DBG_RX, "(QP%d): "
+			"state (after tcp_read_sock)=%d, flags=%x\n",
+			QP_ID(qp), qp->attrs.state, flags);
+
+		up_read(&qp->state_lock);
+	} else {
+		dprint(DBG_SK|DBG_RX, "(QP%d): "
+			"Unable to acquire state_lock\n", QP_ID(qp));
+	}
+done:
+	read_unlock(&sk->sk_callback_lock);
+}
+
+
+void siw_qp_llp_close(struct siw_qp *qp)
+{
+	dprint(DBG_CM, "(QP%d): Enter: SIW QP state = %s, cep=0x%p\n",
+		QP_ID(qp), siw_qp_state_to_string[qp->attrs.state],
+		qp->cep);
+
+	down_write(&qp->state_lock);
+
+	qp->rx_ctx.rx_suspend = 1;
+	qp->tx_ctx.tx_suspend = 1;
+	qp->attrs.llp_stream_handle = NULL;
+
+	switch (qp->attrs.state) {
+
+	case SIW_QP_STATE_RTS:
+	case SIW_QP_STATE_RTR:
+	case SIW_QP_STATE_IDLE:
+	case SIW_QP_STATE_TERMINATE:
+
+		qp->attrs.state = SIW_QP_STATE_ERROR;
+
+		break;
+	/*
+	 * SIW_QP_STATE_CLOSING:
+	 *
+	 * This is a forced close. shall the QP be moved to
+	 * ERROR or IDLE ?
+	 */
+	case SIW_QP_STATE_CLOSING:
+		if (!TX_IDLE(qp))
+			qp->attrs.state = SIW_QP_STATE_ERROR;
+		else
+			qp->attrs.state = SIW_QP_STATE_IDLE;
+
+		break;
+
+	default:
+		dprint(DBG_CM, " No state transition needed: %d\n",
+			qp->attrs.state);
+		break;
+	}
+	siw_sq_flush(qp);
+	siw_rq_flush(qp);
+
+	up_write(&qp->state_lock);
+
+	dprint(DBG_CM, "(QP%d): Exit: SIW QP state = %s\n",
+		QP_ID(qp), siw_qp_state_to_string[qp->attrs.state]);
+}
+
+
+/*
+ * socket callback routine informing about newly available send space.
+ * Function schedules SQ work for processing SQ items.
+ */
+static void siw_qp_llp_write_space(struct sock *sk)
+{
+	struct siw_qp	*qp = sk_to_qp(sk);
+
+	/*
+	 * TODO:
+	 * Resemble sk_stream_write_space() logic for iWARP constraints:
+	 * Clear SOCK_NOSPACE only if sendspace may hold some reasonable
+	 * sized FPDU.
+	 */
+#ifdef SIW_TX_FULLSEGS
+	struct socket *sock = sk->sk_socket;
+	if (sk_stream_wspace(sk) >= (int)qp->tx_ctx.fpdu_len && sock) {
+		clear_bit(SOCK_NOSPACE, &sock->flags);
+		siw_sq_queue_work(qp);
+	}
+#else
+	sk_stream_write_space(sk);
+
+	if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+		siw_sq_queue_work(qp);
+#endif
+}
+
+static void siw_qp_socket_assoc(struct socket *s, struct siw_qp *qp)
+{
+	struct sock *sk = s->sk;
+
+	write_lock_bh(&sk->sk_callback_lock);
+
+	qp->attrs.llp_stream_handle = s;
+	s->sk->sk_data_ready = siw_qp_llp_data_ready;
+	s->sk->sk_write_space = siw_qp_llp_write_space;
+
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+
+static int siw_qp_irq_init(struct siw_qp *qp, int i)
+{
+	struct siw_wqe *wqe;
+
+	dprint(DBG_CM|DBG_WR, "(QP%d): irq size: %d\n", QP_ID(qp), i);
+
+	INIT_LIST_HEAD(&qp->wqe_freelist);
+
+	/*
+	 * Give the IRD one extra entry since after sending
+	 * the RResponse it may trigger another peer RRequest
+	 * before the RResponse goes back to freelist.
+	 */
+	i++;
+
+	while (i--) {
+		wqe = kzalloc(sizeof(struct siw_wqe), GFP_KERNEL);
+		if (!wqe) {
+			siw_qp_freeq_flush(qp);
+			return -ENOMEM;
+		}
+		INIT_LIST_HEAD(&wqe->list);
+		wr_type(wqe) = SIW_WR_RDMA_READ_RESP;
+		list_add(&wqe->list, &qp->wqe_freelist);
+	}
+	return 0;
+}
+
+
+static void siw_send_terminate(struct siw_qp *qp)
+{
+	struct iwarp_terminate	pkt;
+
+	memset(&pkt, 0, sizeof pkt);
+	/*
+	 * TODO: send TERMINATE
+	 */
+	dprint(DBG_CM, "(QP%d): Todo\n", QP_ID(qp));
+}
+
+
+/*
+ * caller holds qp->state_lock
+ */
+int
+siw_qp_modify(struct siw_qp *qp, struct siw_qp_attrs *attrs,
+	      enum siw_qp_attr_mask mask)
+{
+	int	drop_conn, rv;
+
+	if (!mask)
+		return 0;
+
+	dprint(DBG_CM, "(QP%d)\n", QP_ID(qp));
+
+	if (mask != SIW_QP_ATTR_STATE) {
+		/*
+		 * changes of qp attributes (maybe state, too)
+		 */
+		if (mask & SIW_QP_ATTR_ACCESS_FLAGS) {
+
+			if (attrs->flags & SIW_RDMA_BIND_ENABLED)
+				qp->attrs.flags |= SIW_RDMA_BIND_ENABLED;
+			else
+				qp->attrs.flags &= ~SIW_RDMA_BIND_ENABLED;
+
+			if (attrs->flags & SIW_RDMA_WRITE_ENABLED)
+				qp->attrs.flags |= SIW_RDMA_WRITE_ENABLED;
+			else
+				qp->attrs.flags &= ~SIW_RDMA_WRITE_ENABLED;
+
+			if (attrs->flags & SIW_RDMA_READ_ENABLED)
+				qp->attrs.flags |= SIW_RDMA_READ_ENABLED;
+			else
+				qp->attrs.flags &= ~SIW_RDMA_WRITE_ENABLED;
+
+		}
+		/*
+		 * TODO: what else ??
+		 */
+	}
+	if (!(mask & SIW_QP_ATTR_STATE))
+		return 0;
+
+	dprint(DBG_CM, "(QP%d): SIW QP state: %s => %s\n", QP_ID(qp),
+		siw_qp_state_to_string[qp->attrs.state],
+		   siw_qp_state_to_string[attrs->state]);
+
+	drop_conn = 0;
+
+	switch (qp->attrs.state) {
+
+	case SIW_QP_STATE_IDLE:
+	case SIW_QP_STATE_RTR:
+
+		switch (attrs->state) {
+
+		case SIW_QP_STATE_RTS:
+
+			if (!(mask & SIW_QP_ATTR_LLP_HANDLE)) {
+				dprint(DBG_ON, "(QP%d): socket?\n", QP_ID(qp));
+				return -EINVAL;
+			}
+			if (!(mask & SIW_QP_ATTR_MPA)) {
+				dprint(DBG_ON, "(QP%d): MPA?\n", QP_ID(qp));
+				return -EINVAL;
+			}
+			dprint(DBG_CM, "(QP%d): Enter RTS: "
+				"peer 0x%08x, local 0x%08x\n", QP_ID(qp),
+				qp->cep->llp.raddr.sin_addr.s_addr,
+				qp->cep->llp.laddr.sin_addr.s_addr);
+			/*
+			 * Initialize global iWARP TX state
+			 */
+			qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 0;
+			qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 0;
+			qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 0;
+
+			/*
+			 * Initialize global iWARP RX state
+			 */
+			qp->rx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 1;
+			qp->rx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 1;
+			qp->rx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 1;
+
+			/*
+			 * init IRD freequeue, caller has already checked
+			 * limits
+			 */
+			rv = siw_qp_irq_init(qp, attrs->ird);
+			if (rv)
+				return rv;
+
+			atomic_set(&qp->orq_space, attrs->ord);
+
+			qp->attrs.ord = attrs->ord;
+			qp->attrs.ird = attrs->ird;
+			qp->attrs.mpa = attrs->mpa;
+			/*
+			 * move socket rx and tx under qp's control
+			 */
+			siw_qp_socket_assoc(attrs->llp_stream_handle, qp);
+
+			qp->attrs.state = SIW_QP_STATE_RTS;
+			/*
+			 * set initial mss
+			 */
+			qp->tx_ctx.tcp_seglen =
+				get_tcp_mss(attrs->llp_stream_handle->sk);
+
+			break;
+
+		case SIW_QP_STATE_ERROR:
+			siw_rq_flush(qp);
+			qp->attrs.state = SIW_QP_STATE_ERROR;
+			drop_conn = 1;
+			break;
+
+		case SIW_QP_STATE_RTR:
+			/* ignore */
+			break;
+
+		default:
+			dprint(DBG_CM,
+				" QP state transition undefined: %s => %s\n",
+				siw_qp_state_to_string[qp->attrs.state],
+				siw_qp_state_to_string[attrs->state]);
+			break;
+		}
+		break;
+
+	case SIW_QP_STATE_RTS:
+
+		switch (attrs->state) {
+
+		case SIW_QP_STATE_CLOSING:
+			/*
+			 * Verbs: move to IDLE if SQ and ORQ are empty.
+			 * Move to ERROR otherwise. But first of all we must
+			 * close the connection. So we keep CLOSING or ERROR
+			 * as a transient state, schedule connection drop work
+			 * and wait for the socket state change upcall to
+			 * come back closed.
+			 */
+			if (TX_IDLE(qp))
+				qp->attrs.state = SIW_QP_STATE_CLOSING;
+			else {
+				qp->attrs.state = SIW_QP_STATE_ERROR;
+				siw_sq_flush(qp);
+			}
+			siw_rq_flush(qp);
+
+			drop_conn = 1;
+			break;
+
+		case SIW_QP_STATE_TERMINATE:
+			qp->attrs.state = SIW_QP_STATE_TERMINATE;
+			siw_send_terminate(qp);
+			drop_conn = 1;
+
+			break;
+
+		case SIW_QP_STATE_ERROR:
+			/*
+			 * This is an emergency close.
+			 *
+			 * Any in progress transmit operation will get
+			 * cancelled.
+			 * This will likely result in a protocol failure,
+			 * if a TX operation is in transit. The caller
+			 * could unconditional wait to give the current
+			 * operation a chance to complete.
+			 * Esp., how to handle the non-empty IRQ case?
+			 * The peer was asking for data transfer at a valid
+			 * point in time.
+			 */
+			siw_sq_flush(qp);
+			siw_rq_flush(qp);
+			qp->attrs.state = SIW_QP_STATE_ERROR;
+			drop_conn = 1;
+
+			break;
+
+		default:
+			dprint(DBG_ON,
+				" QP state transition undefined: %s => %s\n",
+				siw_qp_state_to_string[qp->attrs.state],
+				siw_qp_state_to_string[attrs->state]);
+			break;
+		}
+		break;
+
+	case SIW_QP_STATE_TERMINATE:
+
+		switch (attrs->state) {
+
+		case SIW_QP_STATE_ERROR:
+			siw_rq_flush(qp);
+			qp->attrs.state = SIW_QP_STATE_ERROR;
+
+			if (!TX_IDLE(qp))
+				siw_sq_flush(qp);
+
+			break;
+
+		default:
+			dprint(DBG_ON,
+				" QP state transition undefined: %s => %s\n",
+				siw_qp_state_to_string[qp->attrs.state],
+				siw_qp_state_to_string[attrs->state]);
+		}
+		break;
+
+	case SIW_QP_STATE_CLOSING:
+
+		switch (attrs->state) {
+
+		case SIW_QP_STATE_IDLE:
+			BUG_ON(!TX_IDLE(qp));
+			qp->attrs.state = SIW_QP_STATE_IDLE;
+
+			break;
+
+		case SIW_QP_STATE_CLOSING:
+			/*
+			 * The LLP may already moved the QP to closing
+			 * due to graceful peer close init
+			 */
+			break;
+
+		case SIW_QP_STATE_ERROR:
+			/*
+			 * QP was moved to CLOSING by LLP event
+			 * not yet seen by user.
+			 */
+			qp->attrs.state = SIW_QP_STATE_ERROR;
+
+			if (!TX_IDLE(qp))
+				siw_sq_flush(qp);
+
+			siw_rq_flush(qp);
+
+			break;
+
+		default:
+			dprint(DBG_CM,
+				" QP state transition undefined: %s => %s\n",
+				siw_qp_state_to_string[qp->attrs.state],
+				siw_qp_state_to_string[attrs->state]);
+			return -ECONNABORTED;
+		}
+		break;
+
+	default:
+		dprint(DBG_CM, " NOP: State: %d\n", qp->attrs.state);
+		break;
+	}
+	if (drop_conn)
+		siw_qp_cm_drop(qp, 0);
+
+	return 0;
+}
+
+struct ib_qp *siw_get_ofaqp(struct ib_device *dev, int id)
+{
+	struct siw_qp *qp =  siw_qp_id2obj(siw_dev_ofa2siw(dev), id);
+
+	dprint(DBG_OBJ, ": dev_name: %s, OFA QPID: %d, QP: %p\n",
+		dev->name, id, qp);
+	if (qp) {
+		/*
+		 * siw_qp_id2obj() increments object reference count
+		 */
+		siw_qp_put(qp);
+		dprint(DBG_OBJ, " QPID: %d\n", QP_ID(qp));
+		return &qp->ofa_qp;
+	}
+	return (struct ib_qp *)NULL;
+}
+
+/*
+ * siw_check_mem()
+ *
+ * Check protection domain, STAG state, access permissions and
+ * address range for memory object.
+ *
+ * @pd:		Protection Domain memory should belong to
+ * @mem:	memory to be checked
+ * @addr:	starting addr of mem
+ * @perms:	requested access permissions
+ * @len:	len of memory interval to be checked
+ *
+ */
+int siw_check_mem(struct siw_pd *pd, struct siw_mem *mem, u64 addr,
+		  enum siw_access_flags perms, int len)
+{
+	if (siw_mem2mr(mem)->pd != pd) {
+		dprint(DBG_WR|DBG_ON, "(PD%d): PD mismatch %p : %p\n",
+			OBJ_ID(pd),
+			siw_mem2mr(mem)->pd, pd);
+
+		return -EINVAL;
+	}
+	if (mem->stag_state == STAG_INVALID) {
+		dprint(DBG_WR|DBG_ON, "(PD%d): STAG 0x%08x invalid\n",
+			OBJ_ID(pd), OBJ_ID(mem));
+		return -EPERM;
+	}
+	/*
+	 * check access permissions
+	 */
+	if ((mem->perms & perms) < perms) {
+		dprint(DBG_WR|DBG_ON, "(PD%d): "
+			"INSUFFICIENT permissions 0x%08x : 0x%08x\n",
+			OBJ_ID(pd), mem->perms, perms);
+		return -EPERM;
+	}
+	/*
+	 * Check address interval: we relax check to allow memory shrinked
+	 * from the start address _after_ placing or fetching len bytes.
+	 * TODO: this relaxation is probably overdone
+	 */
+	if (addr < mem->va || addr + len > mem->va + mem->len) {
+		dprint(DBG_WR|DBG_ON, "(PD%d): MEM interval len %d "
+			"[0x%016llx, 0x%016llx) out of bounds "
+			"[0x%016llx, 0x%016llx) for LKey=0x%08x\n",
+			OBJ_ID(pd), len, (unsigned long long)addr,
+			(unsigned long long)(addr + len),
+			(unsigned long long)mem->va,
+			(unsigned long long)(mem->va + mem->len),
+			OBJ_ID(mem));
+
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * siw_check_sge()
+ *
+ * Check SGE for access rights in given interval
+ *
+ * @pd:		Protection Domain memory should belong to
+ * @sge:	SGE to be checked
+ * @perms:	requested access permissions
+ * @off:	starting offset in SGE
+ * @len:	len of memory interval to be checked
+ *
+ * NOTE: Function references each SGE's memory object (sge->mem)
+ * if not yet done. New reference is kept if check went ok and
+ * released if check failed. If sge->mem is already valid, no new
+ * lookup is being done and mem is not released it check fails.
+ */
+int
+siw_check_sge(struct siw_pd *pd, struct siw_sge *sge,
+	      enum siw_access_flags perms, u32 off, int len)
+{
+	struct siw_dev	*dev = pd->hdr.dev;
+	struct siw_mem	*mem;
+	int		new_ref = 0, rv = 0;
+
+	if (len + off > sge->len) {
+		rv = -EPERM;
+		goto fail;
+	}
+	if (sge->mem.obj == NULL) {
+		mem = siw_mem_id2obj(dev, sge->lkey >> 8);
+		if (!mem) {
+			rv = -EINVAL;
+			goto fail;
+		}
+		sge->mem.obj = mem;
+		new_ref = 1;
+	} else {
+		mem = sge->mem.obj;
+		new_ref = 0;
+	}
+	rv = siw_check_mem(pd, mem, sge->addr + off, perms, len);
+	if (rv)
+		goto fail;
+
+	return 0;
+
+fail:
+	if (new_ref) {
+		siw_mem_put(mem);
+		sge->mem.obj = NULL;
+	}
+	return rv;
+}
+
+
+/*
+ * siw_check_sgl()
+ *
+ * Check permissions for a list of SGE's (SGL)
+ *
+ * @pd:		Protection Domain SGL should belong to
+ * @sge:	List of SGE to be checked
+ * @perms:	requested access permissions
+ * @off:	starting offset in SGL
+ * @len:	len of memory interval to be checked
+ *
+ * Function checks only subinterval of SGL described by bytelen @len,
+ * check starts with byte offset @off which must be within
+ * the length of the first SGE.
+ *
+ * The caller is responsible for keeping @len + @off within
+ * the total byte len of the SGL.
+ */
+
+int siw_check_sgl(struct siw_pd *pd, struct siw_sge *sge,
+		  enum siw_access_flags perms, u32 off, int len)
+{
+	int	rv = 0;
+
+	dprint(DBG_WR, "(PD%d): Enter\n", OBJ_ID(pd));
+
+	BUG_ON(off >= sge->len);
+
+	while (len > 0) {
+		dprint(DBG_WR, "(PD%d): sge=%p, perms=0x%x, "
+			"len=%d, off=%u, sge->len=%d\n",
+			OBJ_ID(pd), sge, perms, len, off, sge->len);
+		/*
+		 * rdma verbs: do not check stag for a zero length sge
+		 */
+		if (sge->len == 0) {
+			sge++;
+			continue;
+		}
+
+		rv = siw_check_sge(pd, sge, perms, off, sge->len - off);
+		if (rv)
+			break;
+
+		len -= sge->len - off;
+		off = 0;
+		sge++;
+	}
+	return rv;
+}
+
+int siw_crc_array(struct hash_desc *desc, u8 *start, size_t len)
+{
+	struct scatterlist sg;
+
+	sg_init_one(&sg, start, len);
+	return crypto_hash_update(desc, &sg, len);
+}
+
+int siw_crc_sg(struct hash_desc *desc, struct scatterlist *sg,
+	       int off, int len)
+{
+	int rv;
+
+	if (off == 0)
+		rv = crypto_hash_update(desc, sg, len);
+	else {
+		struct scatterlist t_sg;
+
+		sg_init_table(&t_sg, 1);
+		sg_set_page(&t_sg, sg_page(sg), len, off);
+		rv = crypto_hash_update(desc, &t_sg, len);
+	}
+	return rv;
+}
+
+/*
+ * siw_qp_freeq_flush()
+ *
+ * Flush any WQE on the QP's free list
+ */
+void siw_qp_freeq_flush(struct siw_qp *qp)
+{
+	struct list_head	*pos, *n;
+	struct siw_wqe		*wqe;
+
+	dprint(DBG_OBJ|DBG_CM|DBG_WR, "(QP%d): Enter\n", QP_ID(qp));
+
+	if (list_empty(&qp->wqe_freelist))
+		return;
+
+	list_for_each_safe(pos, n, &qp->wqe_freelist) {
+		wqe = list_entry_wqe(pos);
+		list_del(&wqe->list);
+		kfree(wqe);
+	}
+}
+
+
+/*
+ * siw_sq_flush()
+ *
+ * Flush SQ and ORRQ entries to CQ.
+ * IRRQ entries are silently dropped.
+ *
+ * TODO: Add termination code for in-progress WQE.
+ * TODO: an in-progress WQE may have been partially
+ *       processed. It should be enforced, that transmission
+ *       of a started DDP segment must be completed if possible
+ *       by any chance.
+ *
+ * Must be called with qp state write lock held.
+ * Therefore, SQ and ORQ lock must not be taken.
+ */
+void siw_sq_flush(struct siw_qp *qp)
+{
+	struct list_head	*pos, *n;
+	struct siw_wqe		*wqe = tx_wqe(qp);
+	struct siw_cq		*cq = qp->scq;
+	int			async_event = 0;
+
+	dprint(DBG_OBJ|DBG_CM|DBG_WR, "(QP%d): Enter\n", QP_ID(qp));
+
+	/*
+	 * flush the in-progress wqe, if there.
+	 */
+	if (wqe) {
+		/*
+		 * TODO: Add iWARP Termination code
+		 */
+		tx_wqe(qp) = NULL;
+
+		dprint(DBG_WR,
+			" (QP%d): Flush current WQE %p, type %d\n",
+			QP_ID(qp), wqe, wr_type(wqe));
+
+		if (wr_type(wqe) == SIW_WR_RDMA_READ_RESP) {
+			siw_wqe_put(wqe);
+			wqe = NULL;
+		} else if (wr_type(wqe) != SIW_WR_RDMA_READ_REQ)
+			/*
+			 *  A RREQUEST is already on the ORRQ
+			 */
+			list_add_tail(&wqe->list, &qp->orq);
+	}
+	if (!list_empty(&qp->irq))
+		list_for_each_safe(pos, n, &qp->irq) {
+			wqe = list_entry_wqe(pos);
+			dprint(DBG_WR,
+				" (QP%d): Flush IRQ WQE %p, status %d\n",
+				QP_ID(qp), wqe, wqe->wr_status);
+			list_del(&wqe->list);
+			siw_wqe_put(wqe);
+		}
+
+	if (!list_empty(&qp->orq))
+		list_for_each_safe(pos, n, &qp->orq) {
+			wqe = list_entry_wqe(pos);
+			dprint(DBG_WR,
+				" (QP%d): Flush ORQ WQE %p, type %d,"
+				" status %d\n", QP_ID(qp), wqe, wr_type(wqe),
+				wqe->wr_status);
+			if (wqe->wr_status != SR_WR_DONE) {
+				async_event = 1;
+				wqe->wc_status = IB_WC_WR_FLUSH_ERR;
+				wqe->wr_status = SR_WR_DONE;
+			}
+			if (cq) {
+				lock_cq(cq);
+				list_move_tail(&wqe->list, &cq->queue);
+				/* TODO: enforce CQ limits */
+				atomic_inc(&cq->qlen);
+				unlock_cq(cq);
+			} else {
+				list_del(&wqe->list);
+				siw_wqe_put(wqe);
+			}
+		}
+	if (!list_empty(&qp->sq))
+		async_event = 1;
+		list_for_each_safe(pos, n, &qp->sq) {
+			wqe = list_entry_wqe(pos);
+			dprint(DBG_WR,
+				" (QP%d): Flush SQ WQE %p, type %d\n",
+				QP_ID(qp), wqe, wr_type(wqe));
+			if (cq) {
+				wqe->wc_status = IB_WC_WR_FLUSH_ERR;
+				wqe->wr_status = SR_WR_DONE;
+				lock_cq(cq);
+				list_move_tail(&wqe->list, &cq->queue);
+				/* TODO: enforce CQ limits */
+				atomic_inc(&cq->qlen);
+				unlock_cq(cq);
+			} else  {
+				list_del(&wqe->list);
+				siw_wqe_put(wqe);
+			}
+		}
+	atomic_set(&qp->sq_space, qp->attrs.sq_size);
+
+	if (wqe != NULL && cq != NULL && cq->ofa_cq.comp_handler != NULL)
+		(*cq->ofa_cq.comp_handler)(&cq->ofa_cq, cq->ofa_cq.cq_context);
+
+	if (async_event)
+		siw_async_ev(qp, NULL, IB_EVENT_SQ_DRAINED);
+}
+
+/*
+ * siw_rq_flush()
+ *
+ * Flush recv queue entries to cq. An in-progress WQE may have some bytes
+ * processed (wqe->processed).
+ *
+ * Must be called with qp state write lock held.
+ * Therefore, RQ lock must not be taken.
+ */
+void siw_rq_flush(struct siw_qp *qp)
+{
+	struct list_head	*pos, *n;
+	struct siw_wqe		*wqe;
+	struct siw_cq		*cq;
+
+	dprint(DBG_OBJ|DBG_CM|DBG_WR, "(QP%d): Enter\n", QP_ID(qp));
+
+	/*
+	 * Flush an in-progess WQE if present
+	 */
+	if (rx_wqe(qp)) {
+		if (qp->rx_ctx.hdr.ctrl.opcode != RDMAP_RDMA_WRITE)
+			list_add(&rx_wqe(qp)->list, &qp->rq);
+		else
+			siw_mem_put(rx_mem(qp));
+
+		rx_wqe(qp) = NULL;
+	}
+	if (list_empty(&qp->rq))
+		return;
+
+	cq = qp->rcq;
+
+	list_for_each_safe(pos, n, &qp->rq) {
+		wqe = list_entry_wqe(pos);
+		list_del_init(&wqe->list);
+		if (cq) {
+			wqe->wc_status = IB_WC_WR_FLUSH_ERR;
+			lock_cq(cq);
+			list_add_tail(&wqe->list, &cq->queue);
+			/* TODO: enforce CQ limits */
+			atomic_inc(&cq->qlen);
+			unlock_cq(cq);
+		} else
+			siw_wqe_put(wqe);
+
+		if (!qp->srq)
+			atomic_inc(&qp->rq_space);
+		else
+			atomic_inc(&qp->srq->space);
+
+	}
+	if (cq != NULL && cq->ofa_cq.comp_handler != NULL)
+		(*cq->ofa_cq.comp_handler)(&cq->ofa_cq, cq->ofa_cq.cq_context);
+}
-- 
1.5.4.3


^ permalink raw reply related

* [PATCH] SIW: Completion queue
From: Bernard Metzler @ 2010-10-05  6:55 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Bernard Metzler

---
 drivers/infiniband/hw/siw/siw_cq.c |  243 ++++++++++++++++++++++++++++++++++++
 1 files changed, 243 insertions(+), 0 deletions(-)
 create mode 100644 drivers/infiniband/hw/siw/siw_cq.c

diff --git a/drivers/infiniband/hw/siw/siw_cq.c b/drivers/infiniband/hw/siw/siw_cq.c
new file mode 100644
index 0000000..441f128
--- /dev/null
+++ b/drivers/infiniband/hw/siw/siw_cq.c
@@ -0,0 +1,243 @@
+/*
+ * Software iWARP device driver for Linux
+ *
+ * Authors: Bernard Metzler <bmt-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>
+ *
+ * Copyright (c) 2008-2010, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *   Redistribution and use in source and binary forms, with or
+ *   without modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   - Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *   - Neither the name of IBM nor the names of its contributors may be
+ *     used to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/list.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "siw.h"
+#include "siw_obj.h"
+#include "siw_cm.h"
+
+static int siw_wc_op_siw2ofa[SIW_WR_NUM] = {
+	[SIW_WR_RDMA_WRITE]		= IB_WC_RDMA_WRITE,
+	[SIW_WR_RDMA_WRITE_WITH_IMM]	= IB_WC_RDMA_WRITE,
+	[SIW_WR_SEND]			= IB_WC_SEND,
+	[SIW_WR_SEND_WITH_IMM]		= IB_WC_SEND,
+	[SIW_WR_RDMA_READ_REQ]		= IB_WC_RDMA_READ,
+	[SIW_WR_ATOMIC_CMP_AND_SWP]	= IB_WC_COMP_SWAP,
+	[SIW_WR_ATOMIC_FETCH_AND_ADD]	= IB_WC_FETCH_ADD,
+	[SIW_WR_BIND_MW]		= IB_WC_BIND_MW,
+	[SIW_WR_FASTREG]		= IB_WC_FAST_REG_MR,
+	[SIW_WR_INVAL_STAG]		= IB_WC_LOCAL_INV,
+	[SIW_WR_RECEIVE]		= IB_WC_RECV,
+	[SIW_WR_RDMA_READ_RESP]		= 0 /* not used */
+};
+
+/*
+ * translate wc into ofa syntax
+ */
+static void siw_wc_siw2ofa(struct siw_wqe *siw_wc, struct ib_wc *ofa_wc)
+{
+	memset(ofa_wc, 0, sizeof *ofa_wc);
+
+	ofa_wc->wr_id = wr_id(siw_wc);
+	ofa_wc->status = siw_wc->wc_status;
+	ofa_wc->byte_len = siw_wc->processed;
+	ofa_wc->qp = &siw_wc->qp->ofa_qp;
+
+	BUG_ON(wr_type(siw_wc) >= SIW_WR_NUM);
+	ofa_wc->opcode = siw_wc_op_siw2ofa[wr_type(siw_wc)];
+	/*
+	 * ofa_wc->imm_data = 0;
+	 * ofa_wc->vendor_err = 0;
+	 * ofa_wc->src_qp = 0;
+	 * ofa_wc->wc_flags = 0; ADD immediate data support
+	 * ofa_wc->pkey_index = 0;
+	 * ofa_wc->slid = 0;
+	 * ofa_wc->sl = 0;
+	 * ofa_wc->dlid_path_bits = 0;
+	 * ofa_wc->port_num = 0;
+	 */
+}
+
+/*
+ * Reap one CQE from the CQ.
+ *
+ * Caller must hold qp read lock
+ *
+ * TODO: Provide routine which can read more than one CQE
+ */
+int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *ofa_wc)
+{
+	struct siw_wqe	*cqe = NULL;
+	unsigned long flags;
+
+	lock_cq_rxsave(cq, flags);
+
+	if (!list_empty(&cq->queue)) {
+		cqe = list_first_wqe(&cq->queue);
+		list_del(&cqe->list);
+		atomic_dec(&cq->qlen);
+	}
+	unlock_cq_rxsave(cq, flags);
+
+	if (cqe) {
+		siw_wc_siw2ofa(cqe, ofa_wc);
+
+		dprint(DBG_WR, " QP%d, CQ%d: Reap WQE type: %d, p: %p\n",
+			  QP_ID(cqe->qp), OBJ_ID(cq), wr_type(cqe), cqe);
+
+		siw_wqe_put(cqe);
+		return 1;
+	} else
+		return 0;
+}
+
+/*
+ * siw_cq_flush()
+ *
+ * Flush all CQ elements. No CQ lock is taken.
+ */
+void siw_cq_flush(struct siw_cq *cq)
+{
+	struct list_head	*pos, *n;
+	struct siw_wqe		*cqe;
+
+	dprint(DBG_CM|DBG_OBJ, "(CQ%d:) Enter\n", OBJ_ID(cq));
+
+	if (list_empty(&cq->queue))
+		return;
+
+	list_for_each_safe(pos, n, &cq->queue) {
+		cqe = list_entry_wqe(pos);
+		list_del(&cqe->list);
+
+		dprint(DBG_OBJ|DBG_WR, " WQE: 0x%llu:, type: %d, p: %p\n",
+			(unsigned long long)wr_id(cqe),
+			wr_type(cqe), cqe);
+
+		siw_wqe_put(cqe);
+	}
+	atomic_set(&cq->qlen, 0);
+}
+
+
+
+/*
+ * siw_rq_complete()
+ *
+ * Appends RQ/SRQ WQE to CQ, if assigned.
+ * Must be called with qp state read locked
+ */
+void siw_rq_complete(struct siw_wqe *wqe, struct siw_qp *qp)
+{
+	struct siw_cq	*cq = qp->rcq;
+	unsigned long flags;
+
+	dprint(DBG_OBJ|DBG_WR, " QP%d WQE: 0x%llu:, type: %d, p: %p\n",
+		QP_ID(qp),
+		(unsigned long long)wr_id(wqe), wr_type(wqe), wqe);
+
+	if (cq) {
+		lock_cq_rxsave(cq, flags);
+
+		list_add_tail(&wqe->list, &cq->queue);
+		atomic_inc(&cq->qlen); /* FIXME: test overflow */
+
+		unlock_cq_rxsave(cq, flags);
+
+		/*
+		 * SRQ space was already incremented when WQE was fetched
+		 * by some QP
+		 */
+		if (!qp->srq)	/* XXX to be deferred to reaping ? */
+			atomic_inc(&qp->rq_space);
+
+		if (cq->ofa_cq.comp_handler != NULL &&
+			((cq->notify & SIW_CQ_NOTIFY_ALL) ||
+			 (cq->notify == SIW_CQ_NOTIFY_SOLICITED &&
+			  wr_flags(wqe) & IB_SEND_SOLICITED))) {
+				cq->notify = SIW_CQ_NOTIFY_NOT;
+				(*cq->ofa_cq.comp_handler)
+					(&cq->ofa_cq, cq->ofa_cq.cq_context);
+		}
+	} else {
+		if (!qp->srq)
+			atomic_inc(&qp->rq_space);
+		siw_wqe_put(wqe);
+	}
+}
+
+/*
+ * siw_sq_complete()
+ * Appends list of former SQ WQE's to CQ, if assigned.
+ * Must be called with qp state read locked
+ */
+void siw_sq_complete(struct list_head *c_list, struct siw_qp *qp, int num,
+		     enum ib_send_flags send_flags)
+{
+	struct siw_cq		*cq = qp->scq;
+	unsigned long flags;
+
+	if (cq) {
+		lock_cq_rxsave(cq, flags);
+
+		list_splice_tail(c_list, &cq->queue);
+		atomic_add(num, &cq->qlen); /* FIXME: test overflow */
+
+
+		dprint(DBG_WR, " CQ%d: add %d from QP%d, CQ len %d\n",
+			OBJ_ID(cq), num, QP_ID(qp), atomic_read(&cq->qlen));
+
+		/* XXX to be deferred to reaping */
+		atomic_add(num, &qp->sq_space);
+
+		if (cq->ofa_cq.comp_handler != NULL &&
+			((cq->notify & SIW_CQ_NOTIFY_ALL) ||
+			 (cq->notify == SIW_CQ_NOTIFY_SOLICITED &&
+			  send_flags & IB_SEND_SOLICITED))) {
+				cq->notify = SIW_CQ_NOTIFY_NOT;
+				(*cq->ofa_cq.comp_handler)
+					(&cq->ofa_cq, cq->ofa_cq.cq_context);
+		}
+		unlock_cq_rxsave(cq, flags);
+	} else {
+		struct list_head *pos;
+
+		list_for_each(pos, c_list)
+			siw_wqe_put(list_entry_wqe(pos));
+
+		atomic_add(num, &qp->sq_space);
+	}
+}
-- 
1.5.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH] SIW: Transmit path
From: Bernard Metzler @ 2010-10-05  6:55 UTC (permalink / raw)
  To: netdev; +Cc: linux-rdma, Bernard Metzler

---
 drivers/infiniband/hw/siw/siw_qp_tx.c | 1309 +++++++++++++++++++++++++++++++++
 1 files changed, 1309 insertions(+), 0 deletions(-)
 create mode 100644 drivers/infiniband/hw/siw/siw_qp_tx.c

diff --git a/drivers/infiniband/hw/siw/siw_qp_tx.c b/drivers/infiniband/hw/siw/siw_qp_tx.c
new file mode 100644
index 0000000..ef774eb
--- /dev/null
+++ b/drivers/infiniband/hw/siw/siw_qp_tx.c
@@ -0,0 +1,1309 @@
+/*
+ * Software iWARP device driver for Linux
+ *
+ * Authors: Bernard Metzler <bmt@zurich.ibm.com>
+ *
+ * Copyright (c) 2008-2010, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *   Redistribution and use in source and binary forms, with or
+ *   without modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   - Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *   - Neither the name of IBM nor the names of its contributors may be
+ *     used to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/scatterlist.h>
+#include <linux/highmem.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/tcp.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_umem.h>
+
+#include "siw.h"
+#include "siw_obj.h"
+#include "siw_cm.h"
+
+static int zcopy_tx = 1;
+module_param(zcopy_tx, int, 0644);
+MODULE_PARM_DESC(zcopy_tx, "Zero copy user data transmit if possible");
+
+DEFINE_PER_CPU(atomic_t, siw_workq_len);
+
+static inline int siw_crc_txhdr(struct siw_iwarp_tx *ctx)
+{
+	crypto_hash_init(&ctx->mpa_crc_hd);
+	return siw_crc_array(&ctx->mpa_crc_hd, (u8 *)&ctx->pkt,
+			     ctx->ctrl_len);
+}
+
+#define PKT_FRAGMENTED 1
+#define PKT_COMPLETE 0
+
+/*
+ * siw_qp_prepare_tx()
+ *
+ * Prepare tx state for sending out one fpdu. Builds complete pkt
+ * if no user data or only immediate data are present.
+ *
+ * returns PKT_COMPLETE if complete pkt built, PKT_FRAGMENTED otherwise.
+ */
+static int siw_qp_prepare_tx(struct siw_iwarp_tx *c_tx)
+{
+	struct siw_wqe		*wqe = c_tx->wqe;
+	u32			*crc = NULL;
+
+	dprint(DBG_TX, "(QP%d):\n", TX_QPID(c_tx));
+
+	switch (wr_type(wqe)) {
+
+	case SIW_WR_RDMA_READ_REQ:
+		memcpy(&c_tx->pkt.ctrl,
+		       &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl,
+		       sizeof(struct iwarp_ctrl));
+
+		c_tx->pkt.rreq.rsvd = 0;
+		c_tx->pkt.rreq.ddp_qn = htonl(RDMAP_UNTAGGED_QN_RDMA_READ);
+		c_tx->pkt.rreq.ddp_msn =
+			htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]);
+		c_tx->pkt.rreq.ddp_mo = 0;
+		c_tx->pkt.rreq.sink_stag = htonl(wqe->wr.rread.sge[0].lkey);
+		c_tx->pkt.rreq.sink_to =
+			cpu_to_be64(wqe->wr.rread.sge[0].addr); /* abs addr! */
+		c_tx->pkt.rreq.source_stag = htonl(wqe->wr.rread.rtag);
+		c_tx->pkt.rreq.source_to = cpu_to_be64(wqe->wr.rread.raddr);
+		c_tx->pkt.rreq.read_size = htonl(wqe->bytes);
+
+		dprint(DBG_TX, ": RREQ: Sink: %x, 0x%016llx\n",
+			wqe->wr.rread.sge[0].lkey, wqe->wr.rread.sge[0].addr);
+
+		c_tx->ctrl_len = sizeof(struct iwarp_rdma_rreq);
+		crc = &c_tx->pkt.rreq_pkt.crc;
+		break;
+
+	case SIW_WR_SEND:
+		if (wr_flags(wqe) & IB_SEND_SOLICITED)
+			memcpy(&c_tx->pkt.ctrl,
+			       &iwarp_pktinfo[RDMAP_SEND_SE].ctrl,
+			       sizeof(struct iwarp_ctrl));
+		else
+			memcpy(&c_tx->pkt.ctrl,
+			       &iwarp_pktinfo[RDMAP_SEND].ctrl,
+			       sizeof(struct iwarp_ctrl));
+
+		c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND;
+		c_tx->pkt.send.ddp_msn =
+			htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
+		c_tx->pkt.send.ddp_mo = 0;
+		c_tx->pkt.send.rsvd = 0;
+
+		c_tx->ctrl_len = sizeof(struct iwarp_send);
+
+		if (!wqe->bytes)
+			crc = &c_tx->pkt.send_pkt.crc;
+		break;
+
+	case SIW_WR_RDMA_WRITE:
+		memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_RDMA_WRITE].ctrl,
+		       sizeof(struct iwarp_ctrl));
+
+		c_tx->pkt.rwrite.sink_stag = htonl(wqe->wr.write.rtag);
+		c_tx->pkt.rwrite.sink_to = cpu_to_be64(wqe->wr.write.raddr);
+		c_tx->ctrl_len = sizeof(struct iwarp_rdma_write);
+
+		if (!wqe->bytes)
+			crc = &c_tx->pkt.write_pkt.crc;
+		break;
+
+	case SIW_WR_RDMA_READ_RESP:
+		memcpy(&c_tx->pkt.ctrl,
+		       &iwarp_pktinfo[RDMAP_RDMA_READ_RESP].ctrl,
+		       sizeof(struct iwarp_ctrl));
+
+		/* NBO */
+		c_tx->pkt.rresp.sink_stag = wqe->wr.rresp.rtag;
+		c_tx->pkt.rresp.sink_to = cpu_to_be64(wqe->wr.rresp.raddr);
+
+		c_tx->ctrl_len = sizeof(struct iwarp_rdma_rresp);
+
+		dprint(DBG_TX, ": RRESP: Sink: %x, 0x%016llx\n",
+			wqe->wr.rresp.rtag, wqe->wr.rresp.raddr);
+
+		if (!wqe->bytes)
+			crc = &c_tx->pkt.rresp_pkt.crc;
+		break;
+
+	default:
+		dprint(DBG_ON, "Unsupported WQE type %d\n", wr_type(wqe));
+		BUG();
+		break;
+	}
+	c_tx->ctrl_sent = 0;
+	c_tx->sge_idx = 0;
+	c_tx->sge_off = 0;
+	c_tx->pg_idx = 0;
+	c_tx->umem_chunk = NULL;
+
+	/*
+	 * Do complete CRC if enabled and short packet
+	 */
+	if (crc) {
+		*crc = 0;
+		if (c_tx->crc_enabled) {
+			if (siw_crc_txhdr(c_tx) != 0)
+				return -EINVAL;
+			crypto_hash_final(&c_tx->mpa_crc_hd, (u8 *)crc);
+		}
+	}
+	c_tx->ctrl_len += MPA_CRC_SIZE;
+
+	/*
+	 * Allow direct sending out of user buffer if WR is non signalled
+	 * and payload is over threshold and no CRC is enabled.
+	 * Per RDMA verbs, the application should not change the send buffer
+	 * until the work completed. In iWarp, work completion is only
+	 * local delivery to TCP. TCP may reuse the buffer for
+	 * retransmission or may even did not yet sent the data. Changing
+	 * unsent data also breaks the CRC, if applied.
+	 */
+	if (zcopy_tx &&
+	     !(wr_flags(wqe) & IB_SEND_SIGNALED) &&
+	     wqe->bytes > SENDPAGE_THRESH &&
+	     wr_type(wqe) != SIW_WR_RDMA_READ_REQ)
+		c_tx->use_sendpage = 1;
+	else
+		c_tx->use_sendpage = 0;
+
+	return crc == NULL ? PKT_FRAGMENTED : PKT_COMPLETE;
+}
+
+/*
+ * Send out one complete FPDU. Used for fixed sized packets like
+ * Read Requests or zero length SENDs, WRITEs, READ.responses.
+ * Also used for pushing an FPDU hdr only.
+ */
+static inline int siw_tx_ctrl(struct siw_iwarp_tx *c_tx, struct socket *s,
+			      int flags)
+{
+	struct msghdr msg = {.msg_flags = flags};
+	struct kvec iov = {
+		.iov_base = (char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent,
+		.iov_len = c_tx->ctrl_len - c_tx->ctrl_sent};
+
+	int rv = kernel_sendmsg(s, &msg, &iov, 1,
+				c_tx->ctrl_len - c_tx->ctrl_sent);
+
+	dprint(DBG_TX, " (QP%d): op=%d, %d of %d sent (%d)\n",
+		TX_QPID(c_tx), c_tx->pkt.ctrl.opcode,
+		c_tx->ctrl_sent + rv, c_tx->ctrl_len, rv);
+
+	if (rv >= 0) {
+		c_tx->ctrl_sent += rv;
+
+		if (c_tx->ctrl_sent == c_tx->ctrl_len) {
+			siw_dprint_hdr(&c_tx->pkt.hdr, TX_QPID(c_tx),
+					"CTRL sent");
+			if (!(flags & MSG_MORE))
+				c_tx->new_tcpseg = 1;
+			rv = 0;
+		} else if (c_tx->ctrl_sent < c_tx->ctrl_len)
+			rv = -EAGAIN;
+		else
+			BUG();
+	}
+	return rv;
+}
+
+/*
+ * 0copy TCP transmit interface.
+ *
+ * Push page array page by page or in one shot.
+ * Pushing the whole page array requires the inner do_tcp_sendpages
+ * function to be exported by the kernel.
+ */
+static int siw_tcp_sendpages(struct socket *s, struct page **page,
+			     int offset, size_t size)
+{
+	int rv = 0;
+
+#ifdef SIW_SENDPAGES_EXPORT
+	struct sock *sk = s->sk;
+
+	if (!(sk->sk_route_caps & NETIF_F_SG) ||
+	    !(sk->sk_route_caps & NETIF_F_ALL_CSUM)) {
+		/* FIXME:
+		 * This should also be handled in a
+		 * loop
+		 */
+		return -EFAULT;
+	}
+
+	lock_sock(sk);
+	TCP_CHECK_TIMER(sk);
+
+	/*
+	 * just return what sendpages has return
+	 */
+	rv = do_tcp_sendpages(sk, page, offset, size, MSG_MORE|MSG_DONTWAIT);
+
+	TCP_CHECK_TIMER(sk);
+	release_sock(sk);
+	if (rv == -EAGAIN)
+		rv = 0;
+#else
+	/*
+	 * If do_tcp_sendpages() function is not exported
+	 * push page by page
+	 */
+	size_t todo = size;
+	int i;
+
+	for (i = 0; size > 0; i++) {
+		size_t bytes = min_t(size_t, PAGE_SIZE - offset, size);
+
+		rv = s->ops->sendpage(s, page[i], offset, bytes,
+				      MSG_MORE|MSG_DONTWAIT);
+		if (rv <= 0)
+			break;
+
+		size -= rv;
+
+		if (rv != bytes)
+			break;
+
+		offset = 0;
+	}
+	if (rv >= 0 || rv == -EAGAIN)
+		rv = todo - size;
+#endif
+	return rv;
+}
+
+/*
+ * siw_0copy_tx()
+ *
+ * Pushes list of pages to TCP socket. If pages from multiple
+ * SGE's, all referenced pages of each SGE are pushed in one
+ * shot.
+ */
+static int siw_0copy_tx(struct socket *s, struct page **page,
+			struct siw_sge *sge, unsigned int offset,
+			unsigned int size)
+{
+	int i = 0, sent = 0, rv;
+	int sge_bytes = min(sge->len - offset, size);
+
+	offset  = (sge->addr + offset) & ~PAGE_MASK;
+
+	while (sent != size) {
+
+		rv = siw_tcp_sendpages(s, &page[i], offset, sge_bytes);
+		if (rv >= 0) {
+			sent += rv;
+			if (size == sent || sge_bytes > rv)
+				break;
+
+			i += PAGE_ALIGN(sge_bytes + offset) >> PAGE_SHIFT;
+			sge++;
+			sge_bytes = min(sge->len, size - sent);
+			offset = sge->addr & ~PAGE_MASK;
+		} else {
+			sent = rv;
+			break;
+		}
+	}
+	return sent;
+}
+
+/*
+ * siw_tx_umem_init()
+ *
+ * Resolve memory chunk and update page index pointer
+ *
+ * @chunk:	Umem Chunk to be updated
+ * @p_idx	Page Index to be updated
+ * @mr:		Memory Region
+ * @va:		Virtual Address within MR
+ *
+ */
+static void siw_tx_umem_init(struct ib_umem_chunk **chunk, int *page_index,
+			     struct siw_mr *mr, u64 va)
+{
+	struct ib_umem_chunk *cp;
+	int p_ix;
+
+	BUG_ON(va < mr->mem.va);
+	va -= mr->mem.va & PAGE_MASK;
+	/*
+	 * equivalent to
+	 * va += mr->umem->offset;
+	 * va = va >> PAGE_SHIFT;
+	 */
+
+	p_ix = va >> PAGE_SHIFT;
+
+	list_for_each_entry(cp, &mr->umem->chunk_list, list) {
+		if (p_ix < cp->nents)
+			break;
+		p_ix -= cp->nents;
+	}
+	BUG_ON(p_ix >= cp->nents);
+
+	dprint(DBG_MM, "(): New chunk 0x%p: Page idx %d, nents %d\n",
+		cp, p_ix, cp->nents);
+
+	*chunk = cp;
+	*page_index = p_ix;
+
+	return;
+}
+
+/*
+ * update memory chunk and page index from given starting point
+ * before current transmit described by: c_tx->sge_off,
+ * sge->addr, c_tx->pg_idx, and c_tx->umem_chunk
+ */
+static inline void
+siw_umem_chunk_update(struct siw_iwarp_tx *c_tx, struct siw_mr *mr,
+		      struct siw_sge *sge, unsigned int off)
+{
+	struct ib_umem_chunk *chunk = c_tx->umem_chunk;
+	u64 va_start = sge->addr + c_tx->sge_off;
+
+	off += (unsigned int)(va_start & ~PAGE_MASK); /* + first page offset */
+	off >>= PAGE_SHIFT; 	/* bytes offset becomes pages offset */
+
+	list_for_each_entry_from(chunk, &mr->umem->chunk_list, list) {
+		if (c_tx->pg_idx + off < chunk->nents)
+			break;
+		off -= chunk->nents - c_tx->pg_idx;
+		c_tx->pg_idx = 0;
+	}
+	c_tx->pg_idx += off;
+
+	c_tx->umem_chunk = chunk;
+}
+
+#define MAX_TRAILER 8
+#define MAX_ARRAY 130	/* Max number of kernel_sendmsg elements */
+
+static inline void
+siw_save_txstate(struct siw_iwarp_tx *c_tx, struct ib_umem_chunk *chunk,
+		 unsigned int pg_idx, unsigned int sge_idx,
+		 unsigned int sge_off)
+{
+	c_tx->umem_chunk = chunk;
+	c_tx->pg_idx = pg_idx;
+	c_tx->sge_idx = sge_idx;
+	c_tx->sge_off = sge_off;
+}
+/*
+ * Write out iov referencing hdr, data and trailer of current FPDU.
+ * Update transmit state dependent on write return status
+ */
+static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s)
+{
+	struct siw_wqe		*wqe = c_tx->wqe;
+	struct siw_sge		*sge = &wqe->wr.sgl.sge[c_tx->sge_idx],
+				*first_sge = sge;
+	struct siw_mr		*mr = siw_mem2mr(sge->mem.obj);
+	struct ib_umem_chunk 	*chunk = c_tx->umem_chunk;
+
+	struct kvec		iov[MAX_ARRAY];
+	struct page 		*page_array[MAX_ARRAY];
+	struct msghdr		msg = {.msg_flags = MSG_DONTWAIT};
+
+	int			seg = 0, do_crc = c_tx->do_crc, kbuf = 0,
+				rv;
+	unsigned int		data_len = c_tx->bytes_unsent,
+				hdr_len = 0,
+				trl_len = 0,
+				sge_off = c_tx->sge_off,
+				sge_idx = c_tx->sge_idx,
+				pg_idx = c_tx->pg_idx;
+
+	if (SIW_INLINED_DATA(wqe)) {
+		kbuf = 1;
+		chunk = 0;
+	}
+
+	if (c_tx->state == SIW_SEND_HDR) {
+		if (c_tx->use_sendpage) {
+			rv = siw_tx_ctrl(c_tx, s, MSG_DONTWAIT|MSG_MORE);
+			if (rv)
+				goto done;
+
+			c_tx->state = SIW_SEND_DATA;
+		} else {
+			iov[0].iov_base =
+				(char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent;
+			iov[0].iov_len = hdr_len =
+				c_tx->ctrl_len - c_tx->ctrl_sent;
+			seg = 1;
+			siw_dprint_hdr(&c_tx->pkt.hdr, TX_QPID(c_tx),
+					"HDR to send: ");
+		}
+	}
+
+	wqe->processed += data_len;
+
+	while (data_len) { /* walk the list of SGE's */
+		unsigned int sge_len = min(sge->len - sge_off, data_len);
+		unsigned int fp_off = (sge->addr + sge_off) & ~PAGE_MASK;
+
+		BUG_ON(!sge_len);
+
+		if (kbuf) {
+			/*
+			 * In kernel buffers to be tx'ed.
+			 */
+			iov[seg].iov_base =
+				(void *)(unsigned long)(sge->addr + sge_off);
+			iov[seg].iov_len = sge_len;
+			if (do_crc)
+				siw_crc_array(&c_tx->mpa_crc_hd,
+					      iov[seg].iov_base, sge_len);
+			sge_off += sge_len;
+			data_len -= sge_len;
+			seg++;
+			goto sge_done;
+		}
+		while (sge_len) {
+			struct scatterlist *sl;
+			size_t plen;
+
+			if (!chunk) {
+				mr = siw_mem2mr(sge->mem.obj);
+				siw_tx_umem_init(&chunk, &pg_idx, mr,
+						 sge->addr + sge_off);
+
+				if (!c_tx->umem_chunk)
+					/* Starting first tx for this WQE */
+					siw_save_txstate(c_tx, chunk, pg_idx,
+							 sge_idx, sge_off);
+			}
+			sl = &chunk->page_list[pg_idx];
+			plen = min((int)PAGE_SIZE - fp_off, sge_len);
+
+			BUG_ON(plen <= 0);
+
+			page_array[seg] = sg_page(sl);
+
+			if (!c_tx->use_sendpage) {
+				iov[seg].iov_base = kmap(sg_page(sl)) + fp_off;
+				iov[seg].iov_len = plen;
+			}
+			if (do_crc)
+				siw_crc_sg(&c_tx->mpa_crc_hd, sl, fp_off, plen);
+
+			sge_len -= plen;
+			sge_off += plen;
+			data_len -= plen;
+
+			if (plen + fp_off == PAGE_SIZE &&
+			    sge_off < sge->len && ++pg_idx == chunk->nents) {
+				chunk = mem_chunk_next(chunk);
+				pg_idx = 0;
+			}
+			fp_off = 0;
+			if (++seg > MAX_ARRAY) {
+				dprint(DBG_ON, "(QP%d): Too many fragments\n",
+				       TX_QPID(c_tx));
+				if (!kbuf) {
+					int i = (hdr_len > 0) ? 1 : 0;
+					seg--;
+					while (i < seg)
+						kunmap(page_array[i++]);
+				}
+				wqe->processed = 0;
+				rv = -EINVAL;
+				goto done_crc;
+			}
+		}
+sge_done:
+		/* Update SGE variables at end of SGE */
+		if (sge_off == sge->len && wqe->processed < wqe->bytes) {
+			sge_idx++;
+			sge++;
+			sge_off = 0;
+			chunk = NULL;
+		}
+	}
+	/* trailer */
+	if (likely(c_tx->state != SIW_SEND_TRAILER)) {
+		iov[seg].iov_base = &c_tx->trailer.pad[4 - c_tx->pad];
+		iov[seg].iov_len = trl_len = MAX_TRAILER - (4 - c_tx->pad);
+	} else {
+		iov[seg].iov_base = &c_tx->trailer.pad[c_tx->ctrl_sent];
+		iov[seg].iov_len = trl_len = MAX_TRAILER - c_tx->ctrl_sent;
+	}
+
+	if (c_tx->pad) {
+		*(u32 *)c_tx->trailer.pad = 0;
+		if (do_crc)
+			siw_crc_array(&c_tx->mpa_crc_hd,
+				      (u8 *)&c_tx->trailer.crc - c_tx->pad,
+				      c_tx->pad);
+	}
+	if (!c_tx->crc_enabled)
+		c_tx->trailer.crc = 0;
+	else if (do_crc)
+		crypto_hash_final(&c_tx->mpa_crc_hd, (u8 *)&c_tx->trailer.crc);
+
+	data_len = c_tx->bytes_unsent;
+
+	if (c_tx->tcp_seglen >= (int)MPA_MIN_FRAG && TX_MORE_WQE(TX_QP(c_tx))) {
+		msg.msg_flags |= MSG_MORE;
+		c_tx->new_tcpseg = 0;
+	} else
+		c_tx->new_tcpseg = 1;
+
+	if (c_tx->use_sendpage) {
+		rv = siw_0copy_tx(s, page_array, first_sge, c_tx->sge_off,
+				  data_len);
+		if (rv == data_len) {
+			rv = kernel_sendmsg(s, &msg, &iov[seg], 1, trl_len);
+			if (rv > 0)
+				rv += data_len;
+			else
+				rv = data_len;
+		}
+	} else {
+		rv = kernel_sendmsg(s, &msg, iov, seg + 1,
+				    hdr_len + data_len + trl_len);
+		if (!kbuf) {
+			int i = (hdr_len > 0) ? 1 : 0;
+			while (i < seg)
+				kunmap(page_array[i++]);
+		}
+	}
+	if (rv < (int)hdr_len) {
+		/* Not even complete hdr pushed or negative rv */
+		wqe->processed -= data_len;
+		if (rv >= 0) {
+			c_tx->ctrl_sent += rv;
+			rv = -EAGAIN;
+		}
+		goto done_crc;
+	}
+
+	rv -= hdr_len;
+
+	if (rv >= (int)data_len) {
+		/* all user data pushed to TCP or no data to push */
+		if (data_len > 0 && wqe->processed < wqe->bytes)
+			/* Save the current state for next tx */
+			siw_save_txstate(c_tx, chunk, pg_idx, sge_idx, sge_off);
+
+		rv -= data_len;
+
+		if (rv == trl_len) /* all pushed */
+			rv = 0;
+		else {
+			c_tx->state = SIW_SEND_TRAILER;
+			c_tx->ctrl_len = MAX_TRAILER;
+			c_tx->ctrl_sent = rv + 4 - c_tx->pad;
+			c_tx->bytes_unsent = 0;
+			rv = -EAGAIN;
+		}
+
+	} else if (data_len > 0) {
+		/* Maybe some user data pushed to TCP */
+		c_tx->state = SIW_SEND_DATA;
+		wqe->processed -= data_len - rv;
+
+		if (rv) {
+			/*
+			 * Some bytes out. Recompute tx state based
+			 * on old state and bytes pushed
+			 */
+			c_tx->bytes_unsent -= rv;
+			sge = &wqe->wr.sgl.sge[c_tx->sge_idx];
+
+			if (c_tx->sge_idx == sge_idx && c_tx->umem_chunk)
+				/*
+				 * same SGE as starting SGE for this FPDU
+				 */
+				siw_umem_chunk_update(c_tx, mr, sge, rv);
+			else {
+				while (sge->len <= c_tx->sge_off + rv) {
+					rv -= sge->len - c_tx->sge_off;
+					sge = &wqe->wr.sgl.sge[++c_tx->sge_idx];
+					c_tx->sge_off = 0;
+				}
+				c_tx->umem_chunk = NULL;
+			}
+			c_tx->sge_off += rv;
+			BUG_ON(c_tx->sge_off >= sge->len);
+		}
+		rv = -EAGAIN;
+	}
+done_crc:
+	c_tx->do_crc = 0;
+done:
+	return rv;
+}
+
+static void siw_calculate_tcpseg(struct siw_iwarp_tx *c_tx, struct socket *s)
+{
+	/*
+	 * refresh TCP segement len if we start a new segment or
+	 * remaining segment len is less than MPA_MIN_FRAG or
+	 * the socket send buffer is empty.
+	 */
+	if (c_tx->new_tcpseg || c_tx->tcp_seglen < (int)MPA_MIN_FRAG ||
+	     !tcp_send_head(s->sk))
+
+		c_tx->tcp_seglen = get_tcp_mss(s->sk);
+}
+
+
+/*
+ * siw_unseg_txlen()
+ *
+ * Compute complete tcp payload len if packet would not
+ * get fragmented
+ */
+static inline int siw_unseg_txlen(struct siw_iwarp_tx *c_tx)
+{
+	int pad = c_tx->bytes_unsent ? -c_tx->bytes_unsent & 0x3 : 0;
+
+	return c_tx->bytes_unsent + c_tx->ctrl_len + pad + MPA_CRC_SIZE;
+}
+
+
+/*
+ * siw_prepare_fpdu()
+ *
+ * Prepares transmit context to send out one FPDU if FPDU will contain
+ * user data and user data are not immediate data.
+ * Checks and locks involved memory segments of data to be sent.
+ * Computes maximum FPDU length to fill up TCP MSS if possible.
+ *
+ * @qp:		QP from which to transmit
+ * @wqe:	Current WQE causing transmission
+ *
+ * TODO: Take into account real available sendspace on socket
+ *       to avoid header misalignment due to send pausing within
+ *       fpdu transmission
+ */
+int siw_prepare_fpdu(struct siw_qp *qp, struct siw_wqe *wqe)
+{
+	struct siw_iwarp_tx	*c_tx  = &qp->tx_ctx;
+	int			rv = 0;
+
+	/*
+	 * TODO: TCP Fragmentation dynamics needs for further investigation.
+	 * 	 Resuming SQ processing may start with full-sized packet
+	 *	 or short packet which resets MSG_MORE and thus helps
+	 *	 to synchronize.
+	 *	 This version resumes with short packet.
+	 */
+	c_tx->ctrl_len = iwarp_pktinfo[c_tx->pkt.ctrl.opcode].hdr_len;
+	c_tx->ctrl_sent = 0;
+
+	/*
+	 * Update target buffer offset if any
+	 */
+	if (!c_tx->pkt.ctrl.t) {
+		/* Untagged message */
+		c_tx->pkt.c_untagged.ddp_mo = cpu_to_be32(wqe->processed);
+	} else {
+		/* Tagged message */
+		if (wr_type(wqe) == SIW_WR_RDMA_READ_RESP) {
+			c_tx->pkt.c_tagged.ddp_to =
+			    cpu_to_be64(wqe->wr.rresp.raddr + wqe->processed);
+		} else {
+			c_tx->pkt.c_tagged.ddp_to =
+			    cpu_to_be64(wqe->wr.write.raddr + wqe->processed);
+		}
+	}
+
+	/* First guess: one big unsegmented DDP segment */
+	c_tx->bytes_unsent = wqe->bytes - wqe->processed;
+	c_tx->tcp_seglen -= siw_unseg_txlen(c_tx);
+
+	if (c_tx->tcp_seglen >= 0) {
+		/* Whole DDP segment fits into current TCP segment */
+		c_tx->pkt.ctrl.l = 1;
+		c_tx->pad = -c_tx->bytes_unsent & 0x3;
+	} else {
+		/* Trim DDP payload to fit into current TCP segment */
+		c_tx->bytes_unsent += c_tx->tcp_seglen;
+		c_tx->bytes_unsent &= ~0x3;
+		c_tx->pad = 0;
+		c_tx->pkt.ctrl.l = 0;
+	}
+	c_tx->pkt.ctrl.mpa_len =
+		htons(c_tx->ctrl_len + c_tx->bytes_unsent - MPA_HDR_SIZE);
+
+#ifdef SIW_TX_FULLSEGS
+	c_tx->fpdu_len =
+		c_tx->ctrl_len + c_tx->bytes_unsent + c_tx->pad + MPA_CRC_SIZE;
+#endif
+	/*
+	 * Init MPA CRC computation
+	 */
+	if (c_tx->crc_enabled) {
+		siw_crc_txhdr(c_tx);
+		c_tx->do_crc = 1;
+	}
+	if (c_tx->bytes_unsent && !SIW_INLINED_DATA(wqe)) {
+		struct siw_sge	*sge = &wqe->wr.sgl.sge[c_tx->sge_idx];
+		/*
+		 * Reference memory to be tx'd
+		 */
+		BUG_ON(c_tx->sge_idx > wqe->wr.sgl.num_sge - 1);
+
+		if (wr_type(wqe) != SIW_WR_RDMA_READ_RESP)
+			rv = siw_check_sgl(qp->pd, sge, SR_MEM_LREAD,
+					   c_tx->sge_off, c_tx->bytes_unsent);
+		else
+			rv = siw_check_sge(qp->pd, sge, SR_MEM_RREAD,
+					   c_tx->sge_off, c_tx->bytes_unsent);
+	}
+	return rv;
+}
+
+#ifdef SIW_TX_FULLSEGS
+static inline int siw_test_wspace(struct socket *s, struct siw_iwarp_tx *c_tx)
+{
+	struct sock *sk = s->sk;
+	int rv = 0;
+
+	lock_sock(sk);
+	if (sk_stream_wspace(sk) < (int)c_tx->fpdu_len) {
+		set_bit(SOCK_NOSPACE, &s->flags);
+		rv = -EAGAIN;
+	}
+	release_sock(sk);
+
+	return rv;
+}
+#endif
+/*
+ * siw_qp_sq_proc_tx()
+ *
+ * Process one WQE which needs transmission on the wire.
+ * Return with:
+ *	-EAGAIN, if handover to tcp remained incomplete
+ *	0,	 if handover to tcp complete
+ *	< 0,	 if other errors happend.
+ *
+ * @qp:		QP to send from
+ * @wqe:	WQE causing transmission
+ */
+static int siw_qp_sq_proc_tx(struct siw_qp *qp, struct siw_wqe *wqe)
+{
+	struct siw_iwarp_tx	*c_tx = &qp->tx_ctx;
+	struct socket	 	*s = qp->attrs.llp_stream_handle;
+	int			rv = 0;
+
+
+	if (wqe->wr_status == SR_WR_QUEUED) {
+		wqe->wr_status = SR_WR_INPROGRESS;
+
+		siw_calculate_tcpseg(c_tx, s);
+
+		rv = siw_qp_prepare_tx(c_tx);
+		if (rv == PKT_FRAGMENTED) {
+			c_tx->state = SIW_SEND_HDR;
+			rv = siw_prepare_fpdu(qp, wqe);
+			if (rv)
+				return rv;
+		} else if (rv == PKT_COMPLETE)
+			c_tx->state = SIW_SEND_SHORT_FPDU;
+		else
+			goto tx_done;
+	}
+next_segment:
+#ifdef SIW_TX_FULLSEGS
+	rv = siw_test_wspace(s, c_tx);
+	if (rv < 0)
+		goto tx_done;
+#endif
+
+	if (c_tx->state == SIW_SEND_SHORT_FPDU) {
+		enum siw_wr_opcode tx_type = wr_type(wqe);
+
+		/*
+		 * Always end current TCP segment (no MSG_MORE flag):
+		 * trying to fill segment would result in excessive delay.
+		 */
+		rv = siw_tx_ctrl(c_tx, s, MSG_DONTWAIT);
+
+		if (!rv && tx_type != SIW_WR_RDMA_READ_REQ)
+			wqe->processed = wqe->bytes;
+
+		goto tx_done;
+
+	} else
+		rv = siw_tx_hdt(c_tx, s);
+
+	if (!rv) {
+		/* Verbs, 6.4.: Try stopping sending after a full DDP segment
+		 * if the connection goes down (== peer halfclose)
+		 */
+		if (unlikely(c_tx->tx_suspend)) {
+			rv = -ECONNABORTED;
+			goto tx_done;
+		}
+		/*
+		 * One segment sent. Processing completed if last segment.
+		 * Do next segment otherwise. Stop if tx error.
+		 */
+		if (c_tx->pkt.ctrl.l == 1) {
+			dprint(DBG_TX, "(QP%d): WR completed\n", QP_ID(qp));
+			goto tx_done;
+		}
+		c_tx->state = SIW_SEND_HDR;
+
+		siw_calculate_tcpseg(c_tx, s);
+
+		rv = siw_prepare_fpdu(qp, wqe);
+		if (!rv)
+			goto next_segment;
+	}
+tx_done:
+	return rv;
+}
+
+
+/*
+ * siw_wqe_sq_processed()
+ *
+ * Called after WQE processing completed.
+ * If WQE is not of signalled typ, it can be released.
+ * If the ORQ is empty, a signalled WQE is attached to the CQ.
+ * Otherwise, it is appended to the end of the ORQ for later
+ * completion. To keep WQE ordering, the ORQ is always consumed FIFO.
+ */
+static void siw_wqe_sq_processed(struct siw_wqe *wqe, struct siw_qp *qp)
+{
+	unsigned long flags;
+	LIST_HEAD(c_list);
+
+	if (!(wr_flags(wqe) & IB_SEND_SIGNALED)) {
+		atomic_inc(&qp->sq_space);
+		siw_wqe_put(wqe);
+		return;
+	}
+	lock_orq_rxsave(qp, flags);
+
+	if (ORQ_EMPTY(qp)) {
+		unlock_orq_rxsave(qp, flags);
+		dprint(DBG_WR|DBG_TX,
+			"(QP%d): Immediate completion, wr_type %d\n",
+			QP_ID(qp), wr_type(wqe));
+		list_add_tail(&wqe->list, &c_list);
+		siw_sq_complete(&c_list, qp, 1, wr_flags(wqe));
+	} else {
+		list_add_tail(&wqe->list, &qp->orq);
+		dprint(DBG_WR|DBG_TX,
+			"(QP%d): Defer completion, wr_type %d\n",
+			QP_ID(qp), wr_type(wqe));
+	}
+}
+
+int siw_qp_sq_proc_local(struct siw_qp *qp, struct siw_wqe *wqe)
+{
+	printk(KERN_ERR "local WR's not yet implemented\n");
+	BUG();
+	return 0;
+}
+
+
+/*
+ * siw_qp_sq_process()
+ *
+ * Core TX path routine for RDMAP/DDP/MPA using a TCP kernel socket.
+ * Sends RDMAP payload for the current SQ WR @wqe of @qp in one or more
+ * MPA FPDUs, each containing a DDP segment.
+ *
+ * SQ processing may occur in user context as a result of posting
+ * new WQE's or from siw_sq_work_handler() context.
+ *
+ * SQ processing may get paused anytime, possibly in the middle of a WR
+ * or FPDU, if insufficient send space is available. SQ processing
+ * gets resumed from siw_sq_work_handler(), if send space becomes
+ * available again.
+ *
+ * Must be called with the QP state read-locked.
+ *
+ * TODO:
+ * To be solved more seriously: an outbound RREQ can be satisfied
+ * by the corresponding RRESP _before_ it gets assigned to the ORQ.
+ * This happens regularly in RDMA READ via loopback case. Since both
+ * outbound RREQ and inbound RRESP can be handled by the same CPU
+ * locking the ORQ is dead-lock prone and thus not an option.
+ * Tentatively, the RREQ gets assigned to the ORQ _before_ being
+ * sent (and pulled back in case of send failure).
+ */
+int siw_qp_sq_process(struct siw_qp *qp, int user_ctx)
+{
+	struct siw_wqe		*wqe;
+	enum siw_wr_opcode	tx_type;
+	unsigned long		flags;
+	int			rv = 0;
+	int			max_burst;
+
+	if (user_ctx)
+		max_burst = SQ_USER_MAXBURST;
+	else
+		max_burst = max(qp->attrs.sq_size, qp->attrs.ird);
+
+	atomic_inc(&qp->tx_ctx.in_use);
+
+	wait_event(qp->tx_ctx.waitq, atomic_read(&qp->tx_ctx.in_use) == 1);
+
+	wqe = tx_wqe(qp);
+	BUG_ON(wqe == NULL);
+
+next_wqe:
+	/*
+	 * Stop QP processing if SQ state changed
+	 */
+	if (unlikely(qp->tx_ctx.tx_suspend)) {
+		dprint(DBG_WR|DBG_TX, "(QP%d): tx suspend\n", QP_ID(qp));
+		goto done;
+	}
+	tx_type = wr_type(wqe);
+
+	dprint(DBG_WR|DBG_TX,
+		" QP(%d): WR type %d, state %d, data %u, sent %u, id %llu\n",
+		QP_ID(qp), wr_type(wqe), wqe->wr_status, wqe->bytes,
+		wqe->processed, (unsigned long long)wr_id(wqe));
+
+	if (SIW_WQE_IS_TX(wqe))
+		rv = siw_qp_sq_proc_tx(qp, wqe);
+	else
+		rv = siw_qp_sq_proc_local(qp, wqe);
+
+	if (!rv) {
+		/*
+		 * WQE processing done
+		 */
+		switch (tx_type) {
+
+		case SIW_WR_SEND:
+		case SIW_WR_RDMA_WRITE:
+
+			wqe->wc_status = IB_WC_SUCCESS;
+			wqe->wr_status = SR_WR_DONE;
+			siw_wqe_sq_processed(wqe, qp);
+			break;
+
+		case SIW_WR_RDMA_READ_REQ:
+			/*
+			 * already enqueued to ORQ queue
+			 */
+			break;
+
+		case SIW_WR_RDMA_READ_RESP:
+			/*
+			 * silently recyclye wqe
+			 */
+			/* XXX DEBUG AID, please remove */
+			wqe->wr_status = SR_WR_DONE;
+			siw_wqe_put(wqe);
+			break;
+		default:
+			BUG();
+		}
+
+		lock_sq_rxsave(qp, flags);
+
+		wqe = siw_next_tx_wqe(qp);
+		if (!wqe) {
+			tx_wqe(qp) = NULL;
+			unlock_sq_rxsave(qp, flags);
+			goto done;
+		}
+		if (wr_type(wqe) == SIW_WR_RDMA_READ_REQ) {
+			if (ORD_SUSPEND_SQ(qp)) {
+				tx_wqe(qp) = NULL;
+				unlock_sq_rxsave(qp, flags);
+				dprint(DBG_WR|DBG_TX,
+					" QP%d PAUSE SQ: ORD limit\n",
+					QP_ID(qp));
+				goto done;
+			} else {
+				tx_wqe(qp) = wqe;
+				siw_rreq_queue(wqe, qp);
+			}
+		} else  {
+			list_del_init(&wqe->list);
+			tx_wqe(qp) = wqe;
+		}
+		unlock_sq_rxsave(qp, flags);
+
+		if (--max_burst == 0) {
+			if (user_ctx) {
+				/*
+				 * Avoid to keep the user sending from its
+				 * context for too long (blocking user thread)
+				 */
+				siw_sq_queue_work(qp);
+				goto done;
+			} else {
+				/*
+				 * Avoid to starve other QP's tx if consumer
+				 * keeps posting new tx work for current cpu.
+				 */
+				int workq_len =
+				    atomic_read(&get_cpu_var(siw_workq_len));
+
+				put_cpu_var(siw_workq_len);
+
+				if (workq_len) {
+					/* Another QP's work on same WQ */
+					siw_sq_queue_work(qp);
+					goto done;
+				}
+			}
+			max_burst = max(qp->attrs.sq_size, qp->attrs.ird);
+		}
+		goto next_wqe;
+
+	} else if (rv == -EAGAIN) {
+		dprint(DBG_WR|DBG_TX,
+			"(QP%d): SQ paused: hd/tr %d of %d, data %d\n",
+			QP_ID(qp), qp->tx_ctx.ctrl_sent, qp->tx_ctx.ctrl_len,
+			qp->tx_ctx.bytes_unsent);
+		rv = 0;
+		goto done;
+	} else {
+		/*
+		 * WQE processing failed.
+		 * Verbs 8.3.2:
+		 * o It turns any WQE into a signalled WQE.
+		 * o Local catastrophic error must be surfaced
+		 * o QP must be moved into Terminate state: done by code
+		 *   doing socket state change processing
+		 *
+		 * o TODO: Termination message must be sent.
+		 * o TODO: Implement more precise work completion errors,
+		 *         see enum ib_wc_status in ib_verbs.h
+		 */
+		dprint(DBG_ON, " (QP%d): WQE type %d processing failed: %d\n",
+				QP_ID(qp), wr_type(wqe), rv);
+
+		lock_sq_rxsave(qp, flags);
+		/*
+		 * RREQ may have already been completed by inbound RRESP!
+		 */
+		if (tx_type == RDMAP_RDMA_READ_REQ) {
+			lock_orq(qp);
+			if (!ORQ_EMPTY(qp) &&
+			    wqe == list_entry_wqe(qp->orq.prev)) {
+				/*
+				 * wqe still on the ORQ
+				 * TODO: fix a potential race condition if the
+				 * rx path is currently referencing the wqe(!)
+				 */
+				dprint(DBG_ON, " (QP%d): Bad RREQ in ORQ\n",
+					QP_ID(qp));
+				list_del_init(&wqe->list);
+				unlock_orq(qp);
+			} else {
+				/*
+				 * already completed by inbound RRESP
+				 */
+				dprint(DBG_ON,
+					" (QP%d): Bad RREQ already Completed\n",
+					QP_ID(qp));
+				unlock_orq(qp);
+				tx_wqe(qp) = NULL;
+				unlock_sq_rxsave(qp, flags);
+
+				goto done;
+			}
+		}
+		tx_wqe(qp) = NULL;
+		unlock_sq_rxsave(qp, flags);
+		/*
+		 * immediately suspends further TX processing
+		 */
+		if (!qp->tx_ctx.tx_suspend)
+			siw_qp_cm_drop(qp, 0);
+
+		switch (tx_type) {
+
+		case SIW_WR_SEND:
+		case SIW_WR_RDMA_WRITE:
+		case SIW_WR_RDMA_READ_REQ:
+			wqe->wr_status = SR_WR_DONE;
+			wqe->wc_status = IB_WC_LOC_QP_OP_ERR;
+			wqe->error = rv;
+			wr_flags(wqe) |= IB_SEND_SIGNALED;
+			if (tx_type != SIW_WR_RDMA_READ_REQ)
+				/*
+				 * RREQ already enqueued to ORQ queue
+				 */
+				siw_wqe_sq_processed(wqe, qp);
+
+			siw_async_ev(qp, NULL, IB_EVENT_QP_FATAL);
+
+			break;
+
+		case SIW_WR_RDMA_READ_RESP:
+			/*
+			 * Recyclye wqe
+			 */
+			dprint(DBG_WR|DBG_TX|DBG_ON, "(QP%d): "
+				   "Processing RRESPONSE failed with %d\n",
+				    QP_ID(qp), rv);
+
+			siw_async_ev(qp, NULL, IB_EVENT_QP_REQ_ERR);
+
+			siw_wqe_put(wqe);
+			break;
+
+		default:
+			BUG();
+		}
+	}
+done:
+	atomic_dec(&qp->tx_ctx.in_use);
+	wake_up(&qp->tx_ctx.waitq);
+
+	return rv;
+}
+
+static struct workqueue_struct *siw_sq_wq;
+
+int __init siw_sq_worker_init(void)
+{
+	siw_sq_wq = create_workqueue("siw_sq_wq");
+	if (!siw_sq_wq)
+		return -ENOMEM;
+
+	dprint(DBG_TX|DBG_OBJ, " Init WQ\n");
+	return 0;
+}
+
+
+void __exit siw_sq_worker_exit(void)
+{
+	dprint(DBG_TX|DBG_OBJ, " Destroy WQ\n");
+	if (siw_sq_wq) {
+		flush_workqueue(siw_sq_wq);
+		destroy_workqueue(siw_sq_wq);
+	}
+}
+
+
+/*
+ * siw_sq_work_handler()
+ *
+ * Scheduled by siw_qp_llp_write_space() socket callback if socket
+ * send space became available again. This function resumes SQ
+ * processing.
+ */
+static void siw_sq_work_handler(struct work_struct *w)
+{
+	struct siw_sq_work	*this_work;
+	struct siw_qp		*qp;
+	int			rv;
+
+	atomic_dec(&get_cpu_var(siw_workq_len));
+	put_cpu_var(siw_workq_len);
+
+	this_work = container_of(w, struct siw_sq_work, work);
+	qp = container_of(this_work, struct siw_qp, sq_work);
+
+	dprint(DBG_TX|DBG_OBJ, "(QP%d)\n", QP_ID(qp));
+
+	if (down_read_trylock(&qp->state_lock)) {
+		if (likely(qp->attrs.state == SIW_QP_STATE_RTS &&
+			   !qp->tx_ctx.tx_suspend)) {
+
+			rv = siw_qp_sq_process(qp, 0);
+			up_read(&qp->state_lock);
+
+			if (rv < 0) {
+				dprint(DBG_TX, "(QP%d): failed: %d\n",
+					QP_ID(qp), rv);
+
+				if (!qp->tx_ctx.tx_suspend)
+					siw_qp_cm_drop(qp, 0);
+			}
+		} else {
+			dprint(DBG_ON|DBG_TX, "(QP%d): state: %d %d\n",
+				QP_ID(qp), qp->attrs.state,
+					qp->tx_ctx.tx_suspend);
+			up_read(&qp->state_lock);
+		}
+	} else {
+		dprint(DBG_ON|DBG_TX, "(QP%d): QP locked\n", QP_ID(qp));
+	}
+	siw_qp_put(qp);
+}
+
+
+int siw_sq_queue_work(struct siw_qp *qp)
+{
+	int cpu, rv;
+
+	dprint(DBG_TX|DBG_OBJ, "(QP%d)\n", QP_ID(qp));
+
+	siw_qp_get(qp);
+
+	INIT_WORK(&qp->sq_work.work, siw_sq_work_handler);
+
+	cpu = get_cpu();
+
+	if (in_softirq()) {
+		if (cpu == qp->cpu) {
+			/*
+			 * Try not to use the current CPU for tx traffic.
+			 */
+			for_each_online_cpu(cpu) {
+				if (cpu != qp->cpu)
+					break;
+			}
+		} else
+			cpu = qp->cpu;
+	}
+	atomic_inc(&per_cpu(siw_workq_len, cpu));
+	rv = queue_work_on(cpu, siw_sq_wq, &qp->sq_work.work);
+	/*
+	 * Remember CPU: Avoid spreading SQ work of QP over WQ's
+	 */
+	qp->cpu = cpu;
+
+	put_cpu();
+
+	return rv;
+}
-- 
1.5.4.3


^ permalink raw reply related

* [PATCH] SIW: Receive path
From: Bernard Metzler @ 2010-10-05  6:55 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Bernard Metzler

---
 drivers/infiniband/hw/siw/siw_qp_rx.c | 1493 +++++++++++++++++++++++++++++++++
 1 files changed, 1493 insertions(+), 0 deletions(-)
 create mode 100644 drivers/infiniband/hw/siw/siw_qp_rx.c

diff --git a/drivers/infiniband/hw/siw/siw_qp_rx.c b/drivers/infiniband/hw/siw/siw_qp_rx.c
new file mode 100644
index 0000000..dd9edd4
--- /dev/null
+++ b/drivers/infiniband/hw/siw/siw_qp_rx.c
@@ -0,0 +1,1493 @@
+/*
+ * Software iWARP device driver for Linux
+ *
+ * Authors: Bernard Metzler <bmt-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>
+ *          Fredy Neeser <nfd-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>
+ *
+ * Copyright (c) 2008-2010, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *   Redistribution and use in source and binary forms, with or
+ *   without modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   - Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *   - Neither the name of IBM nor the names of its contributors may be
+ *     used to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/scatterlist.h>
+#include <linux/highmem.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/tcp.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_umem.h>
+
+#include "siw.h"
+#include "siw_obj.h"
+#include "siw_cm.h"
+
+
+/*
+ * ----------------------------
+ * DDP reassembly for Softiwarp
+ * ----------------------------
+ * For the ordering of transmitted DDP segments, the relevant iWARP ordering
+ * rules are as follows:
+ *
+ * - RDMAP (RFC 5040): Section 7.5, Rule 17:
+ *   "RDMA Read Response Message processing at the Remote Peer (reading
+ *    the specified Tagged Buffer) MUST be started only after the RDMA
+ *    Read Request Message has been Delivered by the DDP layer (thus,
+ *    all previous RDMA Messages have been properly submitted for
+ *    ordered Placement)."
+ *
+ * - DDP (RFC 5041): Section 5.3:
+ *   "At the Data Source, DDP:
+ *    o MUST transmit DDP Messages in the order they were submitted to
+ *      the DDP layer,
+ *    o SHOULD transmit DDP Segments within a DDP Message in increasing
+ *      MO order for Untagged DDP Messages, and in increasing TO order
+ *      for Tagged DDP Messages."
+ *
+ * Combining these rules implies that, although RDMAP does not provide
+ * ordering between operations that are generated from the two ends of an
+ * RDMAP stream, DDP *must not* transmit an RDMA Read Response Message before
+ * it has finished transmitting SQ operations that were already submitted
+ * to the DDP layer. It follows that an iWARP transmitter must fully
+ * serialize RDMAP messages belonging to the same QP.
+ *
+ * Given that a TCP socket receives DDP segments in peer transmit order,
+ * we obtain the following ordering of received DDP segments:
+ *
+ * (i)  the received DDP segments of RDMAP messages for the same QP
+ *      cannot be interleaved
+ * (ii) the received DDP segments of a single RDMAP message *should*
+ *      arrive in order.
+ *
+ * The Softiwarp transmitter obeys rule #2 in DDP Section 5.3.
+ * With this property, the "should" becomes a "must" in (ii) above,
+ * which simplifies DDP reassembly considerably.
+ * The Softiwarp receiver currently relies on this property
+ * and reports an error if DDP segments of the same RDMAP message
+ * do not arrive in sequence.
+ */
+
+static inline int siw_crc_rxhdr(struct siw_iwarp_rx *ctx)
+{
+	crypto_hash_init(&ctx->mpa_crc_hd);
+
+	return siw_crc_array(&ctx->mpa_crc_hd, (u8 *)&ctx->hdr,
+			     ctx->fpdu_part_rcvd);
+}
+
+
+/*
+ * siw_rx_umem_init()
+ *
+ * Given memory region @mr and tagged offset @t_off within @mr,
+ * resolve corresponding ib_umem_chunk memory chunk pointer
+ * and update receive context variables to point at receive position.
+ * returns 0 on sucess and failure otherwise.
+ *
+ * NOTE: This function expects virtual addresses.
+ * TODO: Function needs generalization to support relative adressing
+ *       aka "ZBVA".
+ *
+ * @rctx:	Receive Context to be updated
+ * @mr:		Memory Region
+ * @t_off:	Offset within Memory Region
+ *
+ */
+static int siw_rx_umem_init(struct siw_iwarp_rx *rctx, struct siw_mr *mr,
+			    u64 t_off)
+{
+	struct ib_umem_chunk	*chunk;
+	u64			off_mr;   /* offset into MR */
+	int			psge_idx; /* Index of PSGE */
+
+	off_mr = t_off - (mr->mem.va & PAGE_MASK);
+	/*
+	 * Equivalent to
+	 * off_mr = t_off - mr->mem.va;
+	 * off_mr += mr->umem->offset;
+	 */
+
+	/* Skip pages not referenced by t_off */
+	psge_idx = off_mr >> PAGE_SHIFT;
+
+	list_for_each_entry(chunk, &mr->umem->chunk_list, list) {
+		if (psge_idx < chunk->nents)
+			break;
+		psge_idx -= chunk->nents;
+	}
+	if (psge_idx >= chunk->nents) {
+		dprint(DBG_MM|DBG_ON, "(QP%d): Short chunk list\n",
+			RX_QPID(rctx));
+		return -EINVAL;
+	}
+	rctx->pg_idx = psge_idx;
+	rctx->pg_off = off_mr & ~PAGE_MASK;
+	rctx->umem_chunk = chunk;
+
+	dprint(DBG_MM, "(QP%d): New chunk, idx %d\n", RX_QPID(rctx), psge_idx);
+	return 0;
+}
+
+
+/*
+ * siw_rx_umem()
+ *
+ * Receive data of @len into target referenced by @rctx.
+ * This function does not check if umem is within bounds requested by
+ * @len and @t_off. @umem_ends indicates if routine should
+ * not update chunk position pointers after the point it is
+ * currently receiving
+ *
+ * @rctx:	Receive Context
+ * @len:	Number of bytes to place
+ * @umen_ends:	1, if rctx chunk pointer should not be updated after len.
+ */
+static int siw_rx_umem(struct siw_iwarp_rx *rctx, int len, int umem_ends)
+{
+	struct scatterlist	*p_list;
+	void			*dest;
+	struct ib_umem_chunk    *chunk = rctx->umem_chunk;
+	int			pg_off = rctx->pg_off,
+				copied = 0,
+				bytes,
+				rv;
+
+	while (len) {
+		bytes  = min(len, (int)PAGE_SIZE - pg_off);
+		p_list = &chunk->page_list[rctx->pg_idx];
+
+		dest = kmap_atomic(sg_page(p_list), KM_SOFTIRQ0);
+
+		rv = skb_copy_bits(rctx->skb, rctx->skb_offset, dest + pg_off,
+				   bytes);
+
+		dprint(DBG_RX, "(QP%d): Page #%d, "
+			"bytes=%u, rv=%d returned by skb_copy_bits()\n",
+			RX_QPID(rctx), rctx->pg_idx, bytes, rv);
+
+		if (likely(!rv)) {
+			if (rctx->crc_enabled)
+				rv = siw_crc_sg(&rctx->mpa_crc_hd, p_list,
+						pg_off, bytes);
+
+			rctx->skb_offset += bytes;
+			copied += bytes;
+			len -= bytes;
+			pg_off += bytes;
+		}
+
+		kunmap_atomic(dest, KM_SOFTIRQ0);
+
+		if (unlikely(rv)) {
+			rctx->skb_copied += copied;
+			rctx->skb_new -= copied;
+			copied = -EFAULT;
+
+			dprint(DBG_RX|DBG_ON, "(QP%d): failed with %d\n",
+				RX_QPID(rctx), rv);
+
+			goto out;
+		}
+		if (pg_off == PAGE_SIZE) {
+			/*
+			 * end of page
+			 */
+			pg_off = 0;
+			/*
+			 * reference next page chunk if
+			 * - all pages in chunk used AND
+			 * - current loop fills more into this umem
+			 *   OR the next receive will go into this umem
+			 *   starting at the position where we are leaving
+			 *   the routine.
+			 */
+			if (++rctx->pg_idx == chunk->nents &&
+				(len > 0 || !umem_ends)) {
+
+				rctx->pg_idx = 0;
+				chunk = mem_chunk_next(chunk);
+			}
+		}
+	}
+	/*
+	 * store chunk position for resume
+	 */
+	rctx->umem_chunk = chunk;
+	rctx->pg_off = pg_off;
+
+	rctx->skb_copied += copied;
+	rctx->skb_new -= copied;
+out:
+	return copied;
+}
+
+
+/*
+ * siw_rresp_check_ntoh()
+ *
+ * Check incoming RRESP fragment header against expected
+ * header values and update expected values for potential next
+ * fragment.
+ *
+ * NOTE: This function must be called only if a RRESP DDP segment
+ *       starts but not for fragmented consecutive pieces of an
+ *       already started DDP segement.
+ */
+static inline int siw_rresp_check_ntoh(struct siw_iwarp_rx *rctx)
+{
+	struct iwarp_rdma_rresp	*rresp = &rctx->hdr.rresp;
+	struct siw_wqe		*wqe = rctx->dest.wqe;
+
+	rresp->sink_stag = be32_to_cpu(rresp->sink_stag);
+	rresp->sink_to   = be64_to_cpu(rresp->sink_to);
+
+	if (rctx->first_ddp_seg) {
+		rctx->ddp_stag = wqe->wr.rread.sge[0].lkey;
+		rctx->ddp_to   = wqe->wr.rread.sge[0].addr;
+	}
+	if (rctx->ddp_stag != rresp->sink_stag) {
+		dprint(DBG_RX|DBG_ON,
+			" received STAG=%08x, expected STAG=%08x\n",
+			rresp->sink_stag, rctx->ddp_stag);
+		/*
+		 * Verbs: RI_EVENT_QP_LLP_INTEGRITY_ERROR_BAD_FPDU
+		 */
+		return -EINVAL;
+	}
+	if (rctx->ddp_to != rresp->sink_to) {
+		dprint(DBG_RX|DBG_ON,
+			" received TO=%016llx, expected TO=%016llx\n",
+			(unsigned long long)rresp->sink_to,
+			(unsigned long long)rctx->ddp_to);
+		/*
+		 * Verbs: RI_EVENT_QP_LLP_INTEGRITY_ERROR_BAD_FPDU
+		 */
+		return -EINVAL;
+	}
+	if (rctx->more_ddp_segs)
+		rctx->ddp_to += rctx->fpdu_part_rem;
+
+	else if (wqe->processed + rctx->fpdu_part_rem != wqe->bytes) {
+		dprint(DBG_RX|DBG_ON,
+			" RRESP length does not match RREQ, "
+			"peer sent=%d, expected %d\n",
+			wqe->processed + rctx->fpdu_part_rem, wqe->bytes);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * siw_write_check_ntoh()
+ *
+ * Check incoming WRITE fragment header against expected
+ * header values and update expected values for potential next
+ * fragment
+ *
+ * NOTE: This function must be called only if a WRITE DDP segment
+ *       starts but not for fragmented consecutive pieces of an
+ *       already started DDP segement.
+ */
+static inline int siw_write_check_ntoh(struct siw_iwarp_rx *rctx)
+{
+	struct iwarp_rdma_write	*write = &rctx->hdr.rwrite;
+
+	write->sink_stag = be32_to_cpu(write->sink_stag);
+	write->sink_to   = be64_to_cpu(write->sink_to);
+
+	if (rctx->first_ddp_seg) {
+		rctx->ddp_stag = write->sink_stag;
+		rctx->ddp_to   = write->sink_to;
+	} else {
+		if (rctx->ddp_stag != write->sink_stag) {
+			dprint(DBG_RX|DBG_ON,
+				" received STAG=%08x, expected STAG=%08x\n",
+				write->sink_stag, rctx->ddp_stag);
+			/*
+			 * Verbs: RI_EVENT_QP_LLP_INTEGRITY_ERROR_BAD_FPDU
+			 */
+			return -EINVAL;
+		}
+		if (rctx->ddp_to !=  write->sink_to) {
+			dprint(DBG_RX|DBG_ON,
+				" received TO=%016llx, expected TO=%016llx\n",
+				(unsigned long long)write->sink_to,
+				(unsigned long long)rctx->ddp_to);
+			/*
+			 * Verbs: RI_EVENT_QP_LLP_INTEGRITY_ERROR_BAD_FPDU
+			 */
+			return -EINVAL;
+		}
+	}
+	/*
+	 * Update expected target offset for next incoming DDP segment
+	 */
+	if (rctx->more_ddp_segs != 0)
+		rctx->ddp_to += rctx->fpdu_part_rem;
+
+	return 0;
+}
+
+/*
+ * siw_send_check_ntoh()
+ *
+ * Check incoming SEND fragment header against expected
+ * header values and update expected MSN if no next
+ * fragment expected
+ *
+ * NOTE: This function must be called only if a SEND DDP segment
+ *       starts but not for fragmented consecutive pieces of an
+ *       already started DDP segement.
+ */
+static inline int siw_send_check_ntoh(struct siw_iwarp_rx *rctx)
+{
+	struct iwarp_send	*send = &rctx->hdr.send;
+	struct siw_wqe		*wqe = rctx->dest.wqe;
+
+	send->ddp_msn = be32_to_cpu(send->ddp_msn);
+	send->ddp_mo  = be32_to_cpu(send->ddp_mo);
+	send->ddp_qn  = be32_to_cpu(send->ddp_qn);
+
+	if (send->ddp_qn != RDMAP_UNTAGGED_QN_SEND) {
+		dprint(DBG_RX|DBG_ON, " Invalid DDP QN %d for SEND\n",
+			send->ddp_qn);
+		return -EINVAL;
+	}
+	if (send->ddp_msn != rctx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]) {
+		dprint(DBG_RX|DBG_ON, " received MSN=%d, expected MSN=%d\n",
+			rctx->ddp_msn[RDMAP_UNTAGGED_QN_SEND], send->ddp_msn);
+		/*
+		 * TODO: Error handling
+		 * async_event= RI_EVENT_QP_RQ_PROTECTION_ERROR_MSN_GAP;
+		 * cmpl_status= RI_WC_STATUS_LOCAL_QP_CATASTROPHIC;
+		 */
+		return -EINVAL;
+	}
+	if (send->ddp_mo != wqe->processed) {
+		dprint(DBG_RX|DBG_ON, " Received MO=%u, expected MO=%u\n",
+			send->ddp_mo, wqe->processed);
+		/*
+		 * Verbs: RI_EVENT_QP_LLP_INTEGRITY_ERROR_BAD_FPDU
+		 */
+		return -EINVAL;
+	}
+	if (rctx->first_ddp_seg) {
+		/* initialize user memory write position */
+		rctx->sge_idx = 0;
+		rctx->sge_off = 0;
+	}
+	if (wqe->bytes < wqe->processed + rctx->fpdu_part_rem) {
+		dprint(DBG_RX|DBG_ON, " Receive space short: %d < %d\n",
+			wqe->bytes - wqe->processed, rctx->fpdu_part_rem);
+		wqe->wc_status = IB_WC_LOC_LEN_ERR;
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static inline struct siw_wqe *siw_get_rqe(struct siw_qp *qp)
+{
+	struct siw_wqe	*wqe = NULL;
+
+	if (!qp->srq) {
+		lock_rq(qp);
+		if (!list_empty(&qp->rq)) {
+			wqe = list_first_wqe(&qp->rq);
+			list_del_init(&wqe->list);
+			unlock_rq(qp);
+		} else {
+			unlock_rq(qp);
+			dprint(DBG_RX, " QP(%d): RQ empty!\n", QP_ID(qp));
+		}
+	} else {
+		wqe = siw_srq_fetch_wqe(qp);
+		if (!wqe)
+			dprint(DBG_RX, " QP(%d): SRQ empty!\n", QP_ID(qp));
+	}
+	return wqe;
+}
+
+
+/*
+ * siw_proc_send:
+ *
+ * Process one incoming SEND and place data into memory referenced by
+ * receive wqe.
+ *
+ * Function supports partially received sends (suspending/resuming
+ * current receive wqe processing)
+ *
+ * return value:
+ *	0:       reached the end of a DDP segment
+ *	-EAGAIN: to be called again to finish the DDP segment
+ */
+int siw_proc_send(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
+{
+	struct siw_wqe	*wqe;
+	struct siw_sge	*sge;
+	struct siw_mr	*mr;
+	u32		data_bytes,	/* all data bytes available */
+			rcvd_bytes;	/* sum of data bytes rcvd */
+	int		rv = 0;
+
+	if (rctx->first_ddp_seg) {
+		WARN_ON(rx_wqe(qp) != NULL);
+
+		wqe = siw_get_rqe(qp);
+		if (!wqe)
+			return -ENOENT;
+
+		rx_wqe(qp) = wqe;
+		wqe->wr_status = SR_WR_INPROGRESS;
+	} else  {
+		wqe = rx_wqe(qp);
+		if (!wqe) {
+			/*
+			 * this is a siw bug!
+			 */
+			dprint(DBG_ON, "QP(%d): RQ failure\n", QP_ID(qp));
+			return -EPROTO;
+		}
+	}
+	if (rctx->state == SIW_GET_DATA_START) {
+		rv = siw_send_check_ntoh(rctx);
+		if (rv) {
+			siw_async_ev(qp, NULL, IB_EVENT_QP_FATAL);
+			return rv;
+		}
+		if (!rctx->fpdu_part_rem) /* zero length SEND */
+			return 0;
+	}
+	data_bytes = min(rctx->fpdu_part_rem, rctx->skb_new);
+	rcvd_bytes = 0;
+
+	while (data_bytes) {
+		struct siw_pd	*pd;
+		u32	sge_bytes;	/* data bytes avail for SGE */
+		int	umem_ends;	/* 1 if umem ends with current rcv */
+
+		sge = &wqe->wr.sgl.sge[rctx->sge_idx];
+
+		if (!sge->len) {
+			/* just skip empty sge's */
+			rctx->sge_idx++;
+			rctx->sge_off = 0;
+			continue;
+		}
+		sge_bytes = min(data_bytes, sge->len - rctx->sge_off);
+
+		/*
+		 * check with QP's PD if no SRQ present, SRQ's PD otherwise
+		 */
+		pd = qp->srq == NULL ? qp->pd : qp->srq->pd;
+
+		rv = siw_check_sge(pd, sge, SR_MEM_LWRITE, rctx->sge_off,
+				   sge_bytes);
+		if (rv) {
+			siw_async_ev(qp, NULL, IB_EVENT_QP_ACCESS_ERR);
+			break;
+		}
+		mr = siw_mem2mr(sge->mem.obj);
+
+		if (rctx->sge_off == 0) {
+			/*
+			 * started a new sge: update receive pointers
+			 */
+			rv = siw_rx_umem_init(rctx, mr, sge->addr);
+			if (rv)
+				break;
+		}
+		/*
+		 * Are we going to finish placing
+		 * - the last fragment of the current SGE or
+		 * - the last DDP segment (L=1) of the current RDMAP message?
+		 *
+		 * siw_rx_umem() must advance umem page_chunk position
+		 * after sucessful receive only, if receive into current
+		 * umem does not end. umem ends, if:
+		 * - current SGE gets completely filled, OR
+		 * - current MPA FPDU is last AND gets consumed now
+		 */
+		umem_ends = ((sge_bytes + rctx->sge_off == sge->len) ||
+			      (!rctx->more_ddp_segs &&
+			       rctx->fpdu_part_rcvd + sge_bytes ==
+					rctx->fpdu_part_rem)) ? 1 : 0;
+
+		rv = siw_rx_umem(rctx, sge_bytes, umem_ends);
+		if (rv != sge_bytes) {
+			/*
+			 * siw_rx_umem() must have updated
+			 * skb_new and skb_copied
+			 */
+			wqe->processed += rcvd_bytes;
+			return -EINVAL;
+		}
+		rctx->sge_off += rv;
+
+		if (rctx->sge_off == sge->len) {
+			rctx->sge_idx++;
+			rctx->sge_off = 0;
+		}
+		data_bytes -= rv;
+		rcvd_bytes += rv;
+
+		rctx->fpdu_part_rem -= rv;
+		rctx->fpdu_part_rcvd += rv;
+	}
+	wqe->processed += rcvd_bytes;
+
+	if (!rctx->fpdu_part_rem)
+		return 0;
+
+	return (rv < 0) ? rv : -EAGAIN;
+}
+
+/*
+ * siw_proc_write:
+ *
+ * Place incoming WRITE after referencing and checking target buffer
+
+ * Function supports partially received WRITEs (suspending/resuming
+ * current receive processing)
+ *
+ * return value:
+ *	0:       reached the end of a DDP segment
+ *	-EAGAIN: to be called again to finish the DDP segment
+ */
+
+int siw_proc_write(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
+{
+	struct siw_dev		*dev = qp->hdr.dev;
+	struct iwarp_rdma_write	*write = &rctx->hdr.rwrite;
+	struct siw_mem		*mem;
+	int			bytes,
+				last_write,
+				rv;
+
+	if (rctx->state == SIW_GET_DATA_START) {
+
+		if (!rctx->fpdu_part_rem) /* zero length WRITE */
+			return 0;
+
+		rv = siw_write_check_ntoh(rctx);
+		if (rv) {
+			siw_async_ev(qp, NULL, IB_EVENT_QP_FATAL);
+			return rv;
+		}
+	}
+	bytes = min(rctx->fpdu_part_rem, rctx->skb_new);
+
+	/*
+	 * NOTE: bytes > 0 is always true, since this routine
+	 * gets only called if so.
+	 */
+	if (rctx->first_ddp_seg) {
+		/* DEBUG Code, to be removed */
+		if (rx_mem(qp) != 0) {
+			dprint(DBG_RX|DBG_ON, "(QP%d): Stale rctx state!\n",
+				QP_ID(qp));
+			return -EFAULT;
+		}
+		rx_mem(qp) = siw_mem_id2obj(dev, rctx->ddp_stag >> 8);
+	}
+	if (rx_mem(qp) == NULL) {
+		dprint(DBG_RX|DBG_ON, "(QP%d): "
+			"Sink STag not found or invalid,  STag=0x%08x\n",
+			QP_ID(qp), rctx->ddp_stag);
+		return -EINVAL;
+	}
+	mem = rx_mem(qp);
+	/*
+	 * Rtag not checked against mem's tag again because
+	 * hdr check guarantees same tag as before if fragmented
+	 */
+	rv = siw_check_mem(qp->pd, mem, write->sink_to + rctx->fpdu_part_rcvd,
+			   SR_MEM_RWRITE, bytes);
+	if (rv) {
+		siw_async_ev(qp, NULL, IB_EVENT_QP_ACCESS_ERR);
+		return rv;
+	}
+	if (rctx->first_ddp_seg) {
+		rv = siw_rx_umem_init(rctx, siw_mem2mr(mem), write->sink_to);
+		if (rv)
+			return -EINVAL;
+
+	} else if (!rctx->umem_chunk) {
+		/*
+		 * This should never happen.
+		 *
+		 * TODO: Remove tentative debug aid.
+		 */
+		dprint(DBG_RX|DBG_ON, "(QP%d): "
+			"Umem chunk not resolved!\n", QP_ID(qp));
+		return -EINVAL;
+	}
+	/*
+	 * Are we going to place the last piece of the last
+	 * DDP segment of the current RDMAP message?
+	 *
+	 * It is last if:
+	 * - rctx->fpdu_part_rem <= rctx->skb_new AND
+	 * - payload_rem (of current DDP segment) <= rctx->skb_new
+	 */
+	last_write = ((rctx->fpdu_part_rem <= rctx->skb_new) &&
+		      !rctx->more_ddp_segs) ? 1 : 0;
+
+	rv = siw_rx_umem(rctx, bytes, last_write);
+	if (rv != bytes)
+		return -EINVAL;
+
+	rctx->fpdu_part_rem -= rv;
+	rctx->fpdu_part_rcvd += rv;
+
+	if (!rctx->fpdu_part_rem)
+		return 0;
+
+	return (rv < 0) ? rv : -EAGAIN;
+}
+
+/*
+ * inbound RREQ's cannot carry user data.
+ */
+int siw_proc_rreq(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
+{
+	if (!rctx->fpdu_part_rem)
+		return 0;
+
+	dprint(DBG_ON|DBG_RX, "(QP%d): RREQ with MPA len %d\n", QP_ID(qp),
+		rctx->hdr.ctrl.mpa_len);
+
+	return -EPROTO;
+}
+
+/*
+ * siw_init_rresp:
+ *
+ * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE.
+ * Put it at the tail of the IRQ, if there is another WQE currently in
+ * transmit processing. If not, make it the current WQE to be processed
+ * and schedule transmit processing.
+ *
+ * Can be called from softirq context and from process
+ * context (RREAD socket loopback case!)
+ *
+ * return value:
+ *	0:      success,
+ *		failure code otherwise
+ */
+
+int siw_init_rresp(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
+{
+	struct siw_wqe 	*rsp;
+
+	rsp = siw_wqe_get(qp, SIW_WR_RDMA_READ_RESP);
+	if (rsp) {
+		rsp->wr.rresp.sge.len = be32_to_cpu(rctx->hdr.rreq.read_size);
+		rsp->bytes = rsp->wr.rresp.sge.len;	/* redundant */
+		rsp->processed = 0;
+
+		rsp->wr.rresp.sge.addr = be64_to_cpu(rctx->hdr.rreq.source_to);
+		rsp->wr.rresp.num_sge = rsp->bytes ? 1 : 0;
+
+		rsp->wr.rresp.sge.mem.obj = NULL;	/* defer lookup */
+		rsp->wr.rresp.sge.lkey =
+			be32_to_cpu(rctx->hdr.rreq.source_stag);
+
+		rsp->wr.rresp.raddr = be64_to_cpu(rctx->hdr.rreq.sink_to);
+		rsp->wr.rresp.rtag = rctx->hdr.rreq.sink_stag; /* NBO */
+
+	} else {
+		dprint(DBG_RX|DBG_ON, "(QP%d): IRD exceeded!\n", QP_ID(qp));
+		return -EPROTO;
+	}
+	rsp->wr_status = SR_WR_QUEUED;
+
+	/*
+	 * Insert into IRQ
+	 *
+	 * TODO: Revisit ordering of genuine SQ WRs and Read Response
+	 * pseudo-WRs. RDMAP specifies that there is no ordering among
+	 * the two directions of transmission, so there is a degree of
+	 * freedom.
+	 *
+	 * The current logic favours Read Responses over SQ work requests
+	 * that are queued but not already in progress.
+	 */
+	lock_sq(qp);
+	if (!tx_wqe(qp)) {
+		tx_wqe(qp) = rsp;
+		unlock_sq(qp);
+		/*
+		 * schedule TX work, even if SQ was supended due to
+		 * ORD limit: it is always OK (and may even prevent peers
+		 * from appl lock) to send RRESPONSE's
+		 */
+		siw_sq_queue_work(qp);
+	} else {
+		list_add_tail(&rsp->list, &qp->irq);
+		unlock_sq(qp);
+	}
+	return 0;
+}
+
+/*
+ * siw_proc_rresp:
+ *
+ * Place incoming RRESP data into memory referenced by RREQ WQE.
+ *
+ * Function supports partially received RRESP's (suspending/resuming
+ * current receive processing)
+ */
+int siw_proc_rresp(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
+{
+	struct siw_wqe	*wqe;
+	struct siw_mr	*mr;
+	struct siw_sge	*sge;
+	int		bytes,
+			is_last,
+			rv;
+
+	if (rctx->first_ddp_seg) {
+		WARN_ON(rx_wqe(qp) != NULL);
+		/*
+		 * fetch pending RREQ from orq
+		 */
+		lock_orq(qp);
+		if (!list_empty(&qp->orq)) {
+			wqe = list_first_entry(&qp->orq, struct siw_wqe, list);
+			list_del_init(&wqe->list);
+		} else {
+			unlock_orq(qp);
+			dprint(DBG_RX|DBG_ON, "(QP%d): ORQ empty\n",
+				QP_ID(qp));
+			/*
+			 * TODO: Should generate an async error
+			 */
+			rv = -ENODATA; /* or -ENOENT ? */
+			goto done;
+		}
+		unlock_orq(qp);
+
+		rx_wqe(qp) = wqe;
+
+		if (wr_type(wqe) != SIW_WR_RDMA_READ_REQ || wqe->processed) {
+			WARN_ON(wqe->processed);
+			WARN_ON(wr_type(wqe) != SIW_WR_RDMA_READ_REQ);
+			rv = -EINVAL;
+			goto done;
+		}
+
+		wqe->wr_status = SR_WR_INPROGRESS;
+
+		rv = siw_rresp_check_ntoh(rctx);
+		if (rv) {
+			siw_async_ev(qp, NULL, IB_EVENT_QP_FATAL);
+			goto done;
+		}
+	} else {
+		wqe = rx_wqe(qp);
+		if (!wqe) {
+			WARN_ON(1);
+			rv = -ENODATA;
+			goto done;
+		}
+	}
+	if (!rctx->fpdu_part_rem) /* zero length RRESPONSE */
+		return 0;
+
+	bytes = min(rctx->fpdu_part_rem, rctx->skb_new);
+	sge = wqe->wr.rread.sge; /* there is only one */
+
+	/*
+	 * check target memory which resolves memory on first fragment
+	 */
+	rv = siw_check_sge(qp->pd, sge, SR_MEM_LWRITE, wqe->processed, bytes);
+	if (rv) {
+		dprint(DBG_RX|DBG_ON, "(QP%d): siw_check_sge failed: %d\n",
+			QP_ID(qp), rv);
+		wqe->wc_status = IB_WC_LOC_PROT_ERR;
+		siw_async_ev(qp, NULL, IB_EVENT_QP_ACCESS_ERR);
+		goto done;
+	}
+	mr = siw_mem2mr(sge->mem.obj);
+
+	if (rctx->first_ddp_seg) {
+		rv = siw_rx_umem_init(rctx, mr, sge->addr);
+		if (rv) {
+			wqe->wc_status = IB_WC_LOC_PROT_ERR;
+			goto done;
+		}
+	} else if (!rctx->umem_chunk) {
+		/*
+		 * This should never happen.
+		 *
+		 * TODO: Remove tentative debug aid.
+		 */
+		dprint(DBG_RX|DBG_ON, "(QP%d): No target mem!\n", QP_ID(qp));
+		wqe->wc_status = IB_WC_GENERAL_ERR;
+		rv = -EPROTO;
+		goto done;
+	}
+	/*
+	 * Are we going to finish placing the last DDP segment (L=1)
+	 * of the current RDMAP message?
+	 *
+	 * NOTE: siw_rresp_check_ntoh() guarantees that the
+	 * last inbound RDMAP Read Response message exactly matches
+	 * with the RREQ WR.
+	 */
+	is_last = (bytes + wqe->processed == wqe->bytes) ? 1 : 0;
+
+	rv = siw_rx_umem(rctx,  bytes, is_last);
+	if (rv != bytes) {
+		wqe->wc_status = IB_WC_GENERAL_ERR;
+		rv = -EINVAL;
+		goto done;
+	}
+	rctx->fpdu_part_rem -= rv;
+	rctx->fpdu_part_rcvd += rv;
+
+	wqe->processed += rv;
+
+	if (!rctx->fpdu_part_rem)
+		return 0;
+done:
+	return (rv < 0) ? rv : -EAGAIN;
+}
+
+static void siw_drain_pkt(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
+{
+	char	buf[4096];
+	int	len;
+
+	dprint(DBG_ON|DBG_RX, " (QP%d): drain %d bytes\n",
+		QP_ID(qp), rctx->fpdu_part_rem);
+
+	while (rctx->fpdu_part_rem) {
+		len = min(rctx->fpdu_part_rem, 4096);
+
+		skb_copy_bits(rctx->skb, rctx->skb_offset,
+				      buf, rctx->fpdu_part_rem);
+
+		rctx->skb_copied += len;
+		rctx->skb_offset += len;
+		rctx->skb_new -= len;
+		rctx->fpdu_part_rem -= len;
+	}
+}
+
+int siw_proc_unsupp(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
+{
+	WARN_ON(1);
+	siw_drain_pkt(qp, rctx);
+	return 0;
+}
+
+
+int siw_proc_terminate(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
+{
+	struct iwarp_terminate	*term = &rctx->hdr.terminate;
+
+	printk(KERN_INFO "(QP%d): RX Terminate: etype=%d, layer=%d, ecode=%d\n",
+		QP_ID(qp), term->term_ctrl.etype, term->term_ctrl.layer,
+		term->term_ctrl.ecode);
+
+	siw_drain_pkt(qp, rctx);
+	return 0;
+}
+
+
+static int siw_get_trailer(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
+{
+	struct sk_buff	*skb = rctx->skb;
+	u8		*tbuf = (u8 *)&rctx->trailer.crc - rctx->pad;
+	int		avail;
+
+	avail = min(rctx->skb_new, rctx->fpdu_part_rem);
+
+	skb_copy_bits(skb, rctx->skb_offset,
+		      tbuf + rctx->fpdu_part_rcvd, avail);
+
+	rctx->fpdu_part_rcvd += avail;
+	rctx->fpdu_part_rem -= avail;
+
+	rctx->skb_new -= avail;
+	rctx->skb_offset += avail;
+	rctx->skb_copied += avail;
+
+	dprint(DBG_RX, " (QP%d): %d remaining (%d)\n", QP_ID(qp),
+		rctx->fpdu_part_rem, avail);
+
+	if (!rctx->fpdu_part_rem) {
+		u32	crc_in, crc_own = 0;
+		/*
+		 * check crc if required
+		 */
+		if (!rctx->crc_enabled)
+			return 0;
+
+		if (rctx->pad && siw_crc_array(&rctx->mpa_crc_hd,
+					       tbuf, rctx->pad) != 0)
+			return -EINVAL;
+
+		crypto_hash_final(&rctx->mpa_crc_hd, (u8 *)&crc_own);
+
+		/*
+		 * CRC32 is computed, transmitted and received directly in NBO,
+		 * so there's never a reason to convert byte order.
+		 */
+		crc_in = rctx->trailer.crc;
+
+		if (crc_in != crc_own) {
+			dprint(DBG_RX|DBG_ON,
+				" (QP%d): CRC ERROR in:=%08x, own=%08x\n",
+				QP_ID(qp), crc_in, crc_own);
+			return -EINVAL;
+		}
+		return 0;
+	}
+	return -EAGAIN;
+}
+
+
+static int siw_get_hdr(struct siw_iwarp_rx *rctx)
+{
+	struct sk_buff		*skb = rctx->skb;
+	struct iwarp_ctrl	*c_hdr = &rctx->hdr.ctrl;
+
+	int bytes;
+
+	if (rctx->fpdu_part_rcvd < sizeof(struct iwarp_ctrl)) {
+		/*
+		 * copy first fix part of iwarp hdr
+		 */
+		bytes = min_t(int, rctx->skb_new,
+			      sizeof(struct iwarp_ctrl) - rctx->fpdu_part_rcvd);
+
+		skb_copy_bits(skb, rctx->skb_offset,
+			      (char *)c_hdr + rctx->fpdu_part_rcvd, bytes);
+
+		rctx->fpdu_part_rcvd += bytes;
+
+		rctx->skb_new -= bytes;
+		rctx->skb_offset += bytes;
+		rctx->skb_copied += bytes;
+
+		if (!rctx->skb_new ||
+			rctx->fpdu_part_rcvd < sizeof(struct iwarp_ctrl)) {
+			return -EAGAIN;
+		}
+
+		if (c_hdr->opcode > RDMAP_TERMINATE) {
+			dprint(DBG_RX|DBG_ON, " opcode %d\n", c_hdr->opcode);
+			return -EINVAL;
+		}
+		if (c_hdr->dv != DDP_VERSION) {
+			dprint(DBG_RX|DBG_ON, " dversion %d\n", c_hdr->dv);
+			return -EINVAL;
+		}
+		if (c_hdr->rv != RDMAP_VERSION) {
+			dprint(DBG_RX|DBG_ON, " rversion %d\n", c_hdr->rv);
+			return -EINVAL;
+		}
+		dprint(DBG_RX, "(QP%d): New Header, opcode:%d\n",
+			RX_QPID(rctx), c_hdr->opcode);
+	}
+	/*
+	 * figure out len of current hdr: variable length of
+	 * iwarp hdr forces us to copy hdr information
+	 */
+	bytes = min(rctx->skb_new,
+		  iwarp_pktinfo[c_hdr->opcode].hdr_len - rctx->fpdu_part_rcvd);
+
+	skb_copy_bits(skb, rctx->skb_offset,
+		      (char *)c_hdr + rctx->fpdu_part_rcvd, bytes);
+
+	rctx->fpdu_part_rcvd += bytes;
+
+	rctx->skb_new -= bytes;
+	rctx->skb_offset += bytes;
+	rctx->skb_copied += bytes;
+
+	if (rctx->fpdu_part_rcvd == iwarp_pktinfo[c_hdr->opcode].hdr_len) {
+		/*
+		 * HDR receive completed. Check if the current DDP segment
+		 * starts a new RDMAP message or continues a previously
+		 * started RDMAP message.
+		 *
+		 * Note well from the comments on DDP reassembly:
+		 * - Support for unordered reception of DDP segments
+		 *   (or FPDUs) from different RDMAP messages is not needed.
+		 * - Unordered reception of DDP segments of the same
+		 *   RDMAP message is not supported. It is probably not
+		 *   needed with most peers.
+		 */
+		siw_dprint_hdr(&rctx->hdr, RX_QPID(rctx), "HDR received");
+
+		if (rctx->more_ddp_segs != 0) {
+			rctx->first_ddp_seg = 0;
+			if (rctx->prev_ddp_opcode != c_hdr->opcode) {
+				dprint(DBG_ON,
+					"packet intersection: %d <> %d\n",
+					rctx->prev_ddp_opcode, c_hdr->opcode);
+				return -EPROTO;
+			}
+		} else {
+			rctx->prev_ddp_opcode = c_hdr->opcode;
+			rctx->first_ddp_seg = 1;
+		}
+		rctx->more_ddp_segs = (c_hdr->l == 0) ? 1 : 0;
+
+		return 0;
+	}
+	return -EAGAIN;
+}
+
+static inline int siw_fpdu_payload_len(struct siw_iwarp_rx *rctx)
+{
+	return ((int)(rctx->hdr.ctrl.mpa_len) - rctx->fpdu_part_rcvd)
+		+ MPA_HDR_SIZE;
+}
+
+static inline int siw_fpdu_trailer_len(struct siw_iwarp_rx *rctx)
+{
+	int mpa_len = (int)rctx->hdr.ctrl.mpa_len + MPA_HDR_SIZE;
+
+	return MPA_CRC_SIZE + (-mpa_len & 0x3);
+}
+
+/*
+ * siw_rreq_complete()
+ *
+ * Complete the current READ REQUEST after READ RESPONSE processing.
+ * It may complete consecutive WQE's which were already SQ
+ * processed before but are awaiting completion due to completion
+ * ordering (see verbs 8.2.2.2).
+ * The READ RESPONSE may also resume SQ processing if it was stalled
+ * due to ORD exhaustion (see verbs 8.2.2.18)
+ * Function stops completion when next READ REQUEST found or ORQ empty.
+ */
+static void siw_rreq_complete(struct siw_wqe *wqe, int error)
+{
+	struct siw_qp		*qp = wqe->qp;
+	int			num_wc = 1;
+	enum ib_send_flags	flags;
+	LIST_HEAD(c_list);
+
+	flags = wr_flags(wqe);
+
+	if (flags & IB_SEND_SIGNALED)
+		list_add(&wqe->list, &c_list);
+	else {
+		atomic_inc(&qp->sq_space);
+		siw_wqe_put(wqe);
+		num_wc = 0;
+	}
+
+	lock_orq(qp);
+
+	/* More WQE's to complete following this RREQ? */
+	if (!list_empty(&qp->orq)) {
+		struct list_head *pos, *n;
+		list_for_each_safe(pos, n, &qp->orq) {
+			wqe = list_entry_wqe(pos);
+			if (wr_type(wqe) == SIW_WR_RDMA_READ_REQ)
+				break;
+			flags |= wr_flags(wqe);
+			num_wc++;
+			dprint(DBG_WR|DBG_ON,
+				"(QP%d): Resume completion, wr_type %d\n",
+				QP_ID(qp), wr_type(wqe));
+			list_move_tail(pos, &c_list);
+		}
+	}
+	unlock_orq(qp);
+
+	if (num_wc)
+		siw_sq_complete(&c_list, qp, num_wc, flags);
+
+	/*
+	 * Check if SQ processing was stalled due to ORD limit
+	 */
+	if (ORD_SUSPEND_SQ(qp)) {
+		lock_sq(qp);
+
+		wqe = siw_next_tx_wqe(qp);
+
+		if (wqe && !tx_wqe(qp)) {
+			WARN_ON(wr_type(wqe) != SIW_WR_RDMA_READ_REQ);
+			list_del_init(&wqe->list);
+			tx_wqe(qp) = wqe;
+
+			list_add_tail(&wqe->list, &qp->orq);
+
+			unlock_sq(qp);
+
+			dprint(DBG_RX, "(QP%d): SQ resume (%d)\n",
+				QP_ID(qp), atomic_read(&qp->sq_space));
+
+			siw_sq_queue_work(qp);
+		} else {
+			/* only new ORQ space if not next RREQ queued */
+			atomic_inc(&qp->orq_space);
+			unlock_sq(qp);
+		}
+	} else
+		atomic_inc(&qp->orq_space);
+}
+
+/*
+ * siw_rdmap_complete()
+ *
+ * complete processing of an RDMA message after receiving all
+ * DDP segmens
+ *
+ *   o SENDs + RRESPs will need for completion,
+ *   o RREQs need for  READ RESPONSE initialization
+ *   o WRITEs need memory dereferencing
+ *
+ * TODO: Could siw_[s,r]_complete() fail? (CQ full)
+ */
+static inline int siw_rdmap_complete(struct siw_qp *qp,
+				     struct siw_iwarp_rx *rctx)
+{
+	struct siw_wqe	*wqe;
+	int rv = 0;
+
+	switch (rctx->hdr.ctrl.opcode) {
+
+	case RDMAP_SEND_SE:
+		wr_flags(rx_wqe(qp)) |= IB_SEND_SOLICITED;
+	case RDMAP_SEND:
+		rctx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++;
+
+		wqe = rx_wqe(qp);
+
+		wqe->wc_status = IB_WC_SUCCESS;
+		wqe->wr_status = SR_WR_DONE;
+
+		siw_rq_complete(wqe, qp);
+
+		break;
+
+	case RDMAP_RDMA_READ_RESP:
+		rctx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
+
+		wqe = rx_wqe(qp);
+
+		wqe->wc_status = IB_WC_SUCCESS;
+		wqe->wr_status = SR_WR_DONE;
+
+		siw_rreq_complete(wqe, 0);
+
+		break;
+
+	case RDMAP_RDMA_READ_REQ:
+		rv = siw_init_rresp(qp, rctx);
+
+		break;
+
+	case RDMAP_RDMA_WRITE:
+		/*
+		 * Free References from memory object if
+		 * attached to receive context (inbound WRITE)
+		 * While a zero-length WRITE is allowed, the
+		 * current implementation does not create
+		 * a memory reference (it is unclear if memory
+		 * rights should be checked in that case!).
+		 *
+		 * TODO: check zero length WRITE semantics
+		 */
+		if (rx_mem(qp))
+			siw_mem_put(rx_mem(qp));
+		break;
+
+	default:
+		break;
+
+	}
+	rctx->umem_chunk = NULL; /* DEBUG aid, tentatively */
+	rx_wqe(qp) = NULL;	/* also clears MEM object for WRITE */
+
+	return rv;
+}
+
+/*
+ * siw_rdmap_error()
+ *
+ * Abort processing of RDMAP message after failure.
+ * SENDs + RRESPs will need for receive completion, if
+ * already started.
+ *
+ * TODO: WRITE need local error to be surfaced.
+ *
+ */
+static inline void
+siw_rdmap_error(struct siw_qp *qp, struct siw_iwarp_rx *rctx, int status)
+{
+	struct siw_wqe	*wqe;
+
+	switch (rctx->hdr.ctrl.opcode) {
+
+	case RDMAP_SEND_SE:
+	case RDMAP_SEND:
+		rctx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++;
+
+		wqe = rx_wqe(qp);
+		if (!wqe)
+			return;
+
+		if (rctx->hdr.ctrl.opcode == RDMAP_SEND_SE)
+			wr_flags(wqe) |= IB_SEND_SOLICITED;
+
+		if (!wqe->wc_status)
+			wqe->wc_status = IB_WC_GENERAL_ERR;
+
+		wqe->wr_status = SR_WR_DONE;
+		siw_rq_complete(wqe, qp);
+
+		break;
+
+	case RDMAP_RDMA_READ_RESP:
+		/*
+		 * A READ RESPONSE may flush consecutive WQE's
+		 * which were SQ processed before
+		 */
+		rctx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
+
+		if (rctx->state == SIW_GET_HDR || status == -ENODATA)
+			/*  eventual RREQ left untouched */
+			break;
+
+		wqe = rx_wqe(qp);
+		if (wqe) {
+			if (status)
+				wqe->wc_status = status;
+			else
+				wqe->wc_status = IB_WC_GENERAL_ERR;
+
+			wqe->wr_status = SR_WR_DONE;
+			/*
+			 * All errors turn the wqe into signalled.
+			 */
+			wr_flags(wqe) |= IB_SEND_SIGNALED;
+			siw_rreq_complete(wqe, status);
+		}
+		break;
+
+	case RDMAP_RDMA_WRITE:
+		/*
+		 * Free References from memory object if
+		 * attached to receive context (inbound WRITE)
+		 * While a zero-length WRITE is allowed, the
+		 * current implementation does not create
+		 * a memory reference (it is unclear if memory
+		 * rights should be checked in that case!).
+		 *
+		 * TODO: check zero length WRITE semantics
+		 */
+		if (rx_mem(qp))
+			siw_mem_put(rx_mem(qp));
+		break;
+
+	default:
+		break;
+	}
+	rctx->umem_chunk = NULL; /* DEBUG aid, tentatively */
+	rx_wqe(qp) = NULL;	/* also clears MEM object for WRITE */
+}
+
+/*
+ * siw_tcp_rx_data()
+ *
+ * Main routine to consume inbound TCP payload
+ *
+ * @rd_desc:	read descriptor
+ * @skb:	socket buffer
+ * @off:	offset in skb
+ * @len:	skb->len - offset : payload in skb
+ */
+int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
+		    unsigned int off, size_t len)
+{
+	struct siw_qp		*qp = rd_desc->arg.data;
+	struct siw_iwarp_rx	*rctx = &qp->rx_ctx;
+	int			rv;
+
+	rctx->skb = skb;
+	rctx->skb_new = skb->len - off;
+	rctx->skb_offset = off;
+	rctx->skb_copied = 0;
+
+	dprint(DBG_RX, "(QP%d): new data %d, rx-state %d\n", QP_ID(qp),
+		rctx->skb_new, rctx->state);
+
+	if (unlikely(rctx->rx_suspend == 1 ||
+		     qp->attrs.state != SIW_QP_STATE_RTS)) {
+		dprint(DBG_RX|DBG_ON, "(QP%d): failed. state rx:%d, qp:%d\n",
+			QP_ID(qp), qp->rx_ctx.state, qp->attrs.state);
+		return 0;
+	}
+	while (rctx->skb_new) {
+
+		switch (rctx->state) {
+
+		case SIW_GET_HDR:
+			rv = siw_get_hdr(rctx);
+			if (!rv) {
+				if (rctx->crc_enabled &&
+				    siw_crc_rxhdr(rctx) != 0) {
+					rv = -EINVAL;
+					break;
+				}
+				rctx->hdr.ctrl.mpa_len =
+					ntohs(rctx->hdr.ctrl.mpa_len);
+
+				rctx->fpdu_part_rem =
+					siw_fpdu_payload_len(rctx);
+
+				if (rctx->fpdu_part_rem)
+					rctx->pad = -rctx->fpdu_part_rem & 0x3;
+				else
+					rctx->pad = 0;
+
+				rctx->state = SIW_GET_DATA_START;
+				rctx->fpdu_part_rcvd = 0;
+			}
+			break;
+
+		case SIW_GET_DATA_MORE:
+			/*
+			 * Another data fragment of the same DDP segment.
+			 * Headers will not be checked again by the
+			 * opcode-specific data receive function below.
+			 * Setting first_ddp_seg = 0 avoids repeating
+			 * initializations that may occur only once per
+			 * DDP segment.
+			 */
+			rctx->first_ddp_seg = 0;
+
+		case SIW_GET_DATA_START:
+			/*
+			 * Headers will be checked by the opcode-specific
+			 * data receive function below.
+			 */
+			rv = siw_rx_data(qp, rctx);
+			if (!rv) {
+				rctx->fpdu_part_rem =
+					siw_fpdu_trailer_len(rctx);
+				rctx->fpdu_part_rcvd = 0;
+				rctx->state = SIW_GET_TRAILER;
+			} else
+				rctx->state = SIW_GET_DATA_MORE;
+
+			break;
+
+		case SIW_GET_TRAILER:
+			/*
+			 * read CRC + any padding
+			 */
+			rv = siw_get_trailer(qp, rctx);
+			if (!rv) {
+				/*
+				 * FPDU completed.
+				 * complete RDMAP message if last fragment
+				 */
+				rctx->state = SIW_GET_HDR;
+				rctx->fpdu_part_rcvd = 0;
+
+				if (!rctx->hdr.ctrl.l)
+					/* more frags */
+					break;
+
+				rv = siw_rdmap_complete(qp, rctx);
+				if (rv)
+					break;
+			}
+			break;
+
+		default:
+			WARN_ON(1);
+			rv = -EAGAIN;
+		}
+
+		if (unlikely(rv != 0 && rv != -EAGAIN)) {
+			/*
+			 * TODO: implement graceful error handling including
+			 *       generation (and processing) of TERMINATE
+			 *       messages.
+			 *
+			 *	 for now we are left with a bogus rx status
+			 *	 unable to receive any further byte.
+			 *	 BUT: code must handle difference between
+			 *
+			 * 	 o protocol syntax (FATAL, framing lost)
+			 *	 o crc	(FATAL, framing lost since we do not
+			 *	        trust packet header (??))
+			 *	 o local resource (maybe non fatal, framing
+			 *	   not lost)
+			 *
+			 *	 errors.
+			 */
+			siw_rdmap_error(qp, rctx, rv);
+
+			dprint(DBG_RX|DBG_ON,
+				"(QP%d): RX ERROR %d at RX state %d\n",
+				QP_ID(qp), rv, rctx->state);
+
+			siw_dprint_rctx(rctx);
+			/*
+			 * Calling siw_cm_queue_work() is safe without
+			 * releasing qp->state_lock because the QP state
+			 * will be transitioned to SIW_QP_STATE_ERROR
+			 * by the siw_work_handler() workqueue handler
+			 * after we return from siw_qp_llp_data_ready().
+			 */
+			siw_qp_cm_drop(qp, 1);
+
+			break;
+		}
+		if (rv) {
+			dprint(DBG_RX, "(QP%d): "
+				"Misaligned FPDU: State: %d, missing: %d\n",
+				QP_ID(qp), rctx->state, rctx->fpdu_part_rem);
+			break;
+		}
+	}
+	return rctx->skb_copied;
+}
-- 
1.5.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH] SIW: Debugging and Tracing
From: Bernard Metzler @ 2010-10-05  6:55 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Bernard Metzler

---
 drivers/infiniband/hw/siw/siw_debug.c |  198 +++++++++++++++++++++++++++++++++
 drivers/infiniband/hw/siw/siw_debug.h |  159 ++++++++++++++++++++++++++
 2 files changed, 357 insertions(+), 0 deletions(-)
 create mode 100644 drivers/infiniband/hw/siw/siw_debug.c
 create mode 100644 drivers/infiniband/hw/siw/siw_debug.h

diff --git a/drivers/infiniband/hw/siw/siw_debug.c b/drivers/infiniband/hw/siw/siw_debug.c
new file mode 100644
index 0000000..6340272
--- /dev/null
+++ b/drivers/infiniband/hw/siw/siw_debug.c
@@ -0,0 +1,198 @@
+/*
+ * Software iWARP device driver for Linux
+ *
+ * Authors: Bernard Metzler <bmt-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>
+ *          Fredy Neeser <nfd-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>
+ *
+ * Copyright (c) 2008-2010, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *   Redistribution and use in source and binary forms, with or
+ *   without modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   - Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *   - Neither the name of IBM nor the names of its contributors may be
+ *     used to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <net/tcp.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "siw.h"
+#include "siw_obj.h"
+
+
+void siw_print_qp_attr_mask(enum ib_qp_attr_mask attr_mask, char *msg)
+{
+	printk(KERN_INFO "-------- %s -------\n", msg);
+	if (IB_QP_STATE & attr_mask)
+		printk(KERN_INFO "IB_QP_STATE\n");
+	if (IB_QP_CUR_STATE & attr_mask)
+		printk(KERN_INFO "IB_QP_CUR_STATE\n");
+	if (IB_QP_EN_SQD_ASYNC_NOTIFY & attr_mask)
+		printk(KERN_INFO "IB_QP_EN_SQD_ASYNC_NOTIFY\n");
+	if (IB_QP_ACCESS_FLAGS & attr_mask)
+		printk(KERN_INFO "IB_QP_ACCESS_FLAGS\n");
+	if (IB_QP_PKEY_INDEX & attr_mask)
+		printk(KERN_INFO "IB_QP_PKEY_INDEX\n");
+	if (IB_QP_PORT & attr_mask)
+		printk(KERN_INFO "IB_QP_PORT\n");
+	if (IB_QP_QKEY & attr_mask)
+		printk(KERN_INFO "IB_QP_QKEY\n");
+	if (IB_QP_AV & attr_mask)
+		printk(KERN_INFO "IB_QP_AV\n");
+	if (IB_QP_PATH_MTU & attr_mask)
+		printk(KERN_INFO "IB_QP_PATH_MTU\n");
+	if (IB_QP_TIMEOUT & attr_mask)
+		printk(KERN_INFO "IB_QP_TIMEOUT\n");
+	if (IB_QP_RETRY_CNT & attr_mask)
+		printk(KERN_INFO "IB_QP_RETRY_CNT\n");
+	if (IB_QP_RNR_RETRY & attr_mask)
+		printk(KERN_INFO "IB_QP_RNR_RETRY\n");
+	if (IB_QP_RQ_PSN & attr_mask)
+		printk(KERN_INFO "IB_QP_RQ_PSN\n");
+	if (IB_QP_MAX_QP_RD_ATOMIC & attr_mask)
+		printk(KERN_INFO "IB_QP_MAX_QP_RD_ATOMIC\n");
+	if (IB_QP_ALT_PATH & attr_mask)
+		printk(KERN_INFO "IB_QP_ALT_PATH\n");
+	if (IB_QP_MIN_RNR_TIMER & attr_mask)
+		printk(KERN_INFO "IB_QP_MIN_RNR_TIMER\n");
+	if (IB_QP_SQ_PSN & attr_mask)
+		printk(KERN_INFO "IB_QP_SQ_PSN\n");
+	if (IB_QP_MAX_DEST_RD_ATOMIC & attr_mask)
+		printk(KERN_INFO "IB_QP_MAX_DEST_RD_ATOMIC\n");
+	if (IB_QP_PATH_MIG_STATE & attr_mask)
+		printk(KERN_INFO "IB_QP_PATH_MIG_STATE\n");
+	if (IB_QP_CAP & attr_mask)
+		printk(KERN_INFO "IB_QP_CAP\n");
+	if (IB_QP_DEST_QPN & attr_mask)
+		printk(KERN_INFO "IB_QP_DEST_QPN\n");
+	printk(KERN_INFO "-------- %s -(end)-\n", msg);
+}
+
+
+void siw_print_hdr(union iwarp_hdrs *hdr, int qp_id, char *msg)
+{
+	switch (hdr->ctrl.opcode) {
+
+	case RDMAP_RDMA_WRITE:
+		printk(KERN_INFO "QP%04d %s(WRITE, MPA len %d): %08x %016llx\n",
+			qp_id, msg, ntohs(hdr->ctrl.mpa_len),
+			hdr->rwrite.sink_stag, hdr->rwrite.sink_to);
+		break;
+
+	case RDMAP_RDMA_READ_REQ:
+		printk(KERN_INFO "QP%04d %s(RREQ, MPA len %d): %08x %08x "
+			"%08x %08x %016llx %08x %08x %016llx\n", qp_id, msg,
+			ntohs(hdr->ctrl.mpa_len),
+			hdr->rreq.ddp_qn, hdr->rreq.ddp_msn,
+			hdr->rreq.ddp_mo, hdr->rreq.sink_stag,
+			hdr->rreq.sink_to, hdr->rreq.read_size,
+			hdr->rreq.source_stag, hdr->rreq.source_to);
+
+		break;
+	case RDMAP_RDMA_READ_RESP:
+		printk(KERN_INFO "QP%04d %s(RRESP, MPA len %d): %08x %016llx\n",
+			qp_id, msg, ntohs(hdr->ctrl.mpa_len),
+			hdr->rresp.sink_stag, hdr->rresp.sink_to);
+		break;
+
+	case RDMAP_SEND:
+		printk(KERN_INFO "QP%04d %s(SEND, MPA len %d): %08x %08x "
+			"%08x\n", qp_id, msg, ntohs(hdr->ctrl.mpa_len),
+			hdr->send.ddp_qn, hdr->send.ddp_msn, hdr->send.ddp_mo);
+		break;
+
+	case RDMAP_SEND_INVAL:
+		printk(KERN_INFO "QP%04d %s(S_INV, MPA len %d): %08x %08x "
+			"%08x\n", qp_id, msg, ntohs(hdr->ctrl.mpa_len),
+			hdr->send.ddp_qn, hdr->send.ddp_msn,
+			hdr->send.ddp_mo);
+		break;
+
+	case RDMAP_SEND_SE:
+		printk(KERN_INFO "QP%04d %s(S_SE, MPA len %d): %08x %08x "
+			"%08x\n", qp_id, msg, ntohs(hdr->ctrl.mpa_len),
+			hdr->send.ddp_qn, hdr->send.ddp_msn,
+			hdr->send.ddp_mo);
+		break;
+
+	case RDMAP_SEND_SE_INVAL:
+		printk(KERN_INFO "QP%04d %s(S_SE_INV, MPA len %d): %08x %08x "
+			"%08x\n", qp_id, msg, ntohs(hdr->ctrl.mpa_len),
+			hdr->send.ddp_qn, hdr->send.ddp_msn,
+			hdr->send.ddp_mo);
+		break;
+
+	case RDMAP_TERMINATE:
+		printk(KERN_INFO "QP%04d %s(TERM, MPA len %d):\n", qp_id, msg,
+			ntohs(hdr->ctrl.mpa_len));
+		break;
+
+	default:
+		printk(KERN_INFO "QP%04d %s ?????\n", qp_id, msg);
+		break;
+	}
+}
+
+void siw_print_rctx(struct siw_iwarp_rx *rctx)
+{
+	printk(KERN_INFO "---RX Context-->\n");
+	siw_print_hdr(&rctx->hdr, RX_QPID(rctx), "\nCurrent Pkt:\t");
+	printk(KERN_INFO "Skbuf State:\tp:0x%p, new:%d, off:%d, copied:%d\n",
+		rctx->skb, rctx->skb_new, rctx->skb_offset, rctx->skb_copied);
+	printk(KERN_INFO "FPDU State:\trx_state:%d,\n\t\trcvd:%d, rem:%d, "
+		"pad:%d\n", rctx->state, rctx->fpdu_part_rcvd,
+		rctx->fpdu_part_rem, rctx->pad);
+	printk(KERN_INFO "Rx Mem:\t\tp:0x%p, chunk:0x%p,\n\t\tp_ix:%d, "
+		"p_off:%d, stag:0x%08x, mem_id:%d\n",
+		rctx->dest.wqe, rctx->umem_chunk, rctx->pg_idx, rctx->pg_off,
+		rctx->ddp_stag, rctx->ddp_stag >> 8);
+	printk(KERN_INFO "DDP State:\tprev_op:%d, first_seg:%d, "
+		"more_segs:%d\n", rctx->prev_ddp_opcode, rctx->first_ddp_seg,
+		rctx->more_ddp_segs);
+	printk(KERN_INFO "MPA State:\tlen:%d, crc_enabled:%d, crc:0x%x\n",
+		rctx->hdr.ctrl.mpa_len, rctx->crc_enabled, rctx->trailer.crc);
+	printk(KERN_INFO "<---------------\n");
+}
+
+#if DPRINT_MASK > 0
+char ib_qp_state_to_string[IB_QPS_ERR+1][sizeof "RESET"] = {
+	[IB_QPS_RESET]	= "RESET",
+	[IB_QPS_INIT]	= "INIT",
+	[IB_QPS_RTR]	= "RTR",
+	[IB_QPS_RTS]	= "RTS",
+	[IB_QPS_SQD]	= "SQD",
+	[IB_QPS_SQE]	= "SQE",
+	[IB_QPS_ERR]	= "ERR"
+};
+#endif
diff --git a/drivers/infiniband/hw/siw/siw_debug.h b/drivers/infiniband/hw/siw/siw_debug.h
new file mode 100644
index 0000000..58615fd
--- /dev/null
+++ b/drivers/infiniband/hw/siw/siw_debug.h
@@ -0,0 +1,159 @@
+/*
+ * Software iWARP device driver for Linux
+ *
+ * Authors: Fredy Neeser <nfd-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>
+ *
+ * Copyright (c) 2008-2010, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *   Redistribution and use in source and binary forms, with or
+ *   without modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   - Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *   - Neither the name of IBM nor the names of its contributors may be
+ *     used to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _SIW_DEBUG_H
+#define _SIW_DEBUG_H
+
+#include <linux/uaccess.h>
+#include <linux/hardirq.h>	/* in_interrupt() */
+
+/*
+ * dprint: Selective debug printing
+ *
+ * Use an OR combination of DBG_* as dbgcat in dprint*(dbgcat,...)
+ * to assign debug messages to categories:
+ *
+ * dbgcat	Debug message belongs to category
+ * -----------------------------------------------------------------------------
+ * DBG_ON	Always on, for really important events or error conditions
+ * DBG_TMP	Temporarily on for fine-grained debugging
+ * DBQ_OBJ	Object management (object construction/destruction/refcounting)
+ * DBG_MM	Memory management
+ * DBG_EH	Event handling (completion events and asynchronous events)
+ * DBG_CM	Connection management, QP states
+ * DBG_WR	Work requests
+ * DBG_TX	iWARP TX path
+ * DBG_RX	iWARP RX path
+ * DBG_SK	Socket operations
+ * DBG_KT	Kernel threads
+ * DBG_IRQ	Interrupt context (SoftIRQ or HardIRQ)
+ * DBG_DM	Device management
+ * DBG_HDR	Packet HDRs
+ * DBG_ALL	All categories above
+ */
+#define DBG_ON		0x00000001
+#define DBG_TMP		0x00000002
+#define DBG_OBJ		0x00000004
+#define DBG_MM		0x00000008
+#define DBG_EH		0x00000010
+#define DBG_CM		0x00000020
+#define DBG_WR		0x00000040
+#define DBG_TX		0x00000080
+#define DBG_RX		0x00000100
+#define DBG_SK		0x00000200
+#define DBG_KT		0x00000400
+#define DBG_IRQ		0x00000800
+#define DBG_DM		0x00001000
+#define DBG_HDR		0x00002000
+#define DBG_ALL		(DBG_IRQ|DBG_KT|DBG_SK|DBG_RX|DBG_TX|DBG_WR|\
+DBG_CM|DBG_EH|DBG_MM|DBG_OBJ|DBG_TMP|DBG_DM|DBG_ON|DBG_HDR)
+#define DBG_ALL_NOHDR	(DBG_IRQ|DBG_KT|DBG_SK|DBG_RX|DBG_TX|DBG_WR|\
+DBG_CM|DBG_EH|DBG_MM|DBG_OBJ|DBG_TMP|DBG_DM|DBG_ON)
+#define DBG_CTRL	(DBG_ON|DBG_CM|DBG_DM)
+
+/*
+ * Set DPRINT_MASK to tailor your debugging needs:
+ *
+ * DPRINT_MASK value		Enables debug messages for
+ * ---------------------------------------------------------------------
+ * DBG_ON			Important events / error conditions only
+ *				(minimum number of debug messages)
+ * OR-ed combination of DBG_*	Selective debugging
+ * DBG_KT|DBG_ON		Kernel threads
+ * DBG_ALL			All categories
+ */
+#define DPRINT_MASK	0
+
+extern void siw_print_hdr(union iwarp_hdrs *, int, char *);
+extern void siw_print_rctx(struct siw_iwarp_rx *);
+extern void siw_print_qp_attr_mask(enum ib_qp_attr_mask, char *);
+
+#if DPRINT_MASK > 0
+
+/**
+ * dprint - Selective debug print for process, SoftIRQ or HardIRQ context
+ *
+ * Debug print with selectable debug categories,
+ * starting with header
+ * - "( pid /cpu) __func__" for process context
+ * - "( irq /cpu) __func__" for IRQ context
+ *
+ * @dbgcat	: Set of debug categories (OR-ed combination of DBG_* above),
+ *		  to which this debug message is assigned.
+ * @fmt		: printf compliant format string
+ * @args	: printf compliant argument list
+ */
+#define dprint(dbgcat, fmt, args...)					\
+	do {								\
+		if ((dbgcat) & DPRINT_MASK) {				\
+			if (!in_interrupt())				\
+				printk(KERN_INFO "(%5d/%1d) %s" fmt,	\
+					current->pid,			\
+					current_thread_info()->cpu,	\
+					__func__, ## args);		\
+			else						\
+				printk(KERN_INFO "( irq /%1d) %s" fmt,	\
+					current_thread_info()->cpu,	\
+					__func__, ## args);		\
+		}							\
+	} while (0)
+
+
+#define siw_dprint_rctx(r)	siw_print_rctx(r)
+extern char ib_qp_state_to_string[IB_QPS_ERR+1][sizeof "RESET"];
+
+#else
+#define dprint(dbgcat, fmt, args...)	do { } while (0)
+#define siw_dprint_rctx(r)	do { } while (0)
+#endif
+
+
+#if DPRINT_MASK & DBG_HDR
+#define siw_dprint_hdr(h, i, m)	siw_print_hdr(h, i, m)
+#else
+#define siw_dprint_hdr(h, i, m)	do { } while (0)
+#endif
+
+#if DPRINT_MASK & DBG_CM
+#define siw_dprint_qp_attr_mask(mask)\
+		siw_print_qp_attr_mask(mask, (char *)__func__)
+#else
+#define siw_dprint_qp_attr_mask(mask)	do { } while (0)
+#endif
+
+#endif
-- 
1.5.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH] SIW: Documentation (initial)
From: Bernard Metzler @ 2010-10-05  6:55 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Bernard Metzler

---
 Documentation/networking/siw.txt |   91 ++++++++++++++++++++++++++++++++++++++
 1 files changed, 91 insertions(+), 0 deletions(-)
 create mode 100644 Documentation/networking/siw.txt

diff --git a/Documentation/networking/siw.txt b/Documentation/networking/siw.txt
new file mode 100644
index 0000000..f051d8b
--- /dev/null
+++ b/Documentation/networking/siw.txt
@@ -0,0 +1,91 @@
+SoftiWARP: Software iWARP kernel driver module.
+
+General
+-------
+SoftiWARP (siw) implements the iWARP protocol suite (MPA/DDP/RDMAP,
+IETF-RFC 5044/5041/5040) completely in software as a Linux kernel module.
+siw runs on top of TCP kernel sockets and exports the Linux kernel ibvers
+RDMA interface. siw interfaces with the iwcm connection manager.
+
+
+Transmit Path
+-------------
+If a send queue (SQ) work queue element gets posted, siw tries to send
+it directly out of the application context. If the SQ was non-empty,
+SQ processing is done asynchronously by a kernel worker thread. This
+thread gets scheduled, if the TCP socket signals new write space to
+be available. If during send operation the socket send space get
+exhausted, SQ processing is abandoned until new socket write space
+becomes available.
+
+
+Receive Path
+------------
+All application data is placed into target buffers within softirq
+socket callback. Application notification is asynchronous.
+
+
+User Interface
+--------------
+All fast path operations such as posting of work requests and
+reaping of work completions currently involve a system call into
+the siw module. Kernel/user-mapped send and receive as well as 
+completion queues are not part of the current code. In
+particular, mapped completion queues may improve performance,
+since reaping completion queue entries as well as re-arming
+the completion queue could be done more efficiently.
+
+
+Memory Management
+-----------------
+siw currently uses kernels ib_umem_get() function to pin memory for later
+use in data transfer operations. Transmit and receive memory is checked
+against correct access permissions only in the moment of access by the
+network input path or before pushing it to the socket for transmission.
+ib_umem_get() provides DMA mappings for the requested address space which
+is not used by siw.
+
+
+Module Parameters
+-----------------
+The following siw module parameters are recognized.
+loopback_enabled:
+	If set, siw attaches also to the looback device. Checked only
+	during module insertion.
+
+mpa_crc_enabled:
+	If set, the MPA CRC gets generated and checked both in tx and rx
+	path. Without hardware support, setting this flag will severely
+	hurt throughput. 
+
+zcopy_tx:
+	If set, payload of non signalled work requests
+	(such as non signalled WRITE or SEND as well as all READ
+	responses) are transferred using the TCP sockets
+	sendpage interface. This parameter can be switched on and
+	off dynamically (echo 1 >> /sys/module/siw/parameters/zcopy_tx
+	for enablement, 0 for disabling). System load may benefits from
+	using 0copy data transmission. 0copy is not enabled if
+	mpa_crc_enabled is set.
+
+
+Compile Time Flags:
+-DCHECK_DMA_CAPABILITIES
+	Checks if the device siw wants to attach to provides
+	DMA capabilities. While DMA capabilities are currently not
+	needed (siw works on top of a kernel TCP socket), siw
+	uses ib_umem_get() which performs a (not used) DMA address
+	translation. Writing a siw private memory reservation and
+	pinning routine would solve the issue.
+
+-DSIW_TX_FULLSEGS
+	Experimental, not enabled by default. If set,
+	siw tries not to overrun the socket (not sending until
+	-EAGAIN retrun), but stops sending if the current segment
+	would not fit into the socket's estimated tx buffer. With that,
+	wire FPDUs may get truncated by the TCP stack far less often.
+	Since this feature manipulates the sock's SOCK_NOSPACE
+	bit, it violates strict layering and is therefore considered
+	proprietary.
+	Since TCP is a byte stream protocol, no guarantee can be given
+	if FPDU's are not fragmented.
-- 
1.5.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox