Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH] net: tulip: Remove private "strncmp"
From: Rasmus Villemoes @ 2014-12-04 10:30 UTC (permalink / raw)
  To: Grant Grundler; +Cc: Rasmus Villemoes, netdev, linux-kernel

The comment says that the built-in strncmp didn't work. That is not
surprising, as apparently "str" semantics are not really what is
wanted (hint: de4x5_strncmp only stops when two different bytes are
encountered or the end is reached; not if either byte happens to be
0). de4x5_strncmp is actually a memcmp (except for the signature and
that bytes are not necessarily treated as unsigned char); since only
the boolean value of the result is used we can just replace
de4x5_strncmp with memcmp.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
---

Notes:
    I don't know if the comment meant to say 3 bytes, or if the code
    compares meaningful chunks of memory (the first three bytes of
    &lp->srom span 1.5 fields, and the three bytes from (char*)&lp->srom +
    0x10 are &lp->srom.{id_block_crc,reserved2,version} - it seems odd
    that these chunks should ever be equal to each other and to the
    enet_det[i]). Whether or not the current code works, this patch
    shouldn't change the semantics, and I'd like to get rid of
    de4x5_strncmp since it is not, in fact, a strncmp.

 drivers/net/ethernet/dec/tulip/de4x5.c | 20 +++-----------------
 1 file changed, 3 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/dec/tulip/de4x5.c b/drivers/net/ethernet/dec/tulip/de4x5.c
index cf8b6ff..badff18 100644
--- a/drivers/net/ethernet/dec/tulip/de4x5.c
+++ b/drivers/net/ethernet/dec/tulip/de4x5.c
@@ -995,7 +995,6 @@ static void    de4x5_dbg_mii(struct net_device *dev, int k);
 static void    de4x5_dbg_media(struct net_device *dev);
 static void    de4x5_dbg_srom(struct de4x5_srom *p);
 static void    de4x5_dbg_rx(struct sk_buff *skb, int len);
-static int     de4x5_strncmp(char *a, char *b, int n);
 static int     dc21041_infoleaf(struct net_device *dev);
 static int     dc21140_infoleaf(struct net_device *dev);
 static int     dc21142_infoleaf(struct net_device *dev);
@@ -4102,8 +4101,7 @@ get_hw_addr(struct net_device *dev)
 }
 
 /*
-** Test for enet addresses in the first 32 bytes. The built-in strncmp
-** didn't seem to work here...?
+** Test for enet addresses in the first 32 bytes.
 */
 static int
 de4x5_bad_srom(struct de4x5_private *lp)
@@ -4111,8 +4109,8 @@ de4x5_bad_srom(struct de4x5_private *lp)
     int i, status = 0;
 
     for (i = 0; i < ARRAY_SIZE(enet_det); i++) {
-	if (!de4x5_strncmp((char *)&lp->srom, (char *)&enet_det[i], 3) &&
-	    !de4x5_strncmp((char *)&lp->srom+0x10, (char *)&enet_det[i], 3)) {
+	if (!memcmp(&lp->srom, &enet_det[i], 3) &&
+	    !memcmp((char *)&lp->srom+0x10, &enet_det[i], 3)) {
 	    if (i == 0) {
 		status = SMC;
 	    } else if (i == 1) {
@@ -4125,18 +4123,6 @@ de4x5_bad_srom(struct de4x5_private *lp)
     return status;
 }
 
-static int
-de4x5_strncmp(char *a, char *b, int n)
-{
-    int ret=0;
-
-    for (;n && !ret; n--) {
-	ret = *a++ - *b++;
-    }
-
-    return ret;
-}
-
 static void
 srom_repair(struct net_device *dev, int card)
 {
-- 
2.0.4

^ permalink raw reply related

* Re: Is this 32-bit NCM?y
From: Enrico Mioso @ 2014-12-04 10:33 UTC (permalink / raw)
  To: Midge Shaojun Tan
  Cc: Bjørn Mork, Kevin Zhu, Eli Britstein, Alex Strizhevsky,
	youtux@gmail.com, linux-usb@vger.kernel.org,
	netdev@vger.kernel.org
In-Reply-To: <AMSPR06MB6011E001029C251790CB923EE780@AMSPR06MB601.eurprd06.prod.outlook.com>

... it works by applying only cdc_ncm_modify.c; nothing else changed. ARP works. Modeswitch message not changed.

^ permalink raw reply

* [PATCH iproute2 v2] ss: Use rtnl_dump_filter in handle_netlink_request
From: Vadim Kochan @ 2014-12-04 10:32 UTC (permalink / raw)
  To: netdev; +Cc: Vadim Kochan

Replaced handling netlink messages by rtnl_dump_filter
from lib/libnetlink.c, also:

    - removed unused dump_fp arg;
    - added MAGIC_SEQ #define for 123456 seq id;
    - silently exit if ENOENT errno is caused for NETLINK_SOCK_DIAG proto
        in lib/libnetlink.c: rtnl_dump_filter_l(...) function. This fix
        was added in a3fd8e58c1787af186f5c4b234ff974544f840b6 by Eric
        for misc/ss.c

Signed-off-by: Vadim Kochan <vadim4j@gmail.com>
---
 include/libnetlink.h |   1 +
 lib/libnetlink.c     |   5 ++
 misc/ss.c            | 128 +++++++++++++++------------------------------------
 3 files changed, 42 insertions(+), 92 deletions(-)

diff --git a/include/libnetlink.h b/include/libnetlink.h
index fe7d5d3..3794ef1 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -18,6 +18,7 @@ struct rtnl_handle
 	struct sockaddr_nl	peer;
 	__u32			seq;
 	__u32			dump;
+	int			proto;
 };
 
 extern int rcvbuf;
diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index 8d504a9..e3b7862 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -43,6 +43,7 @@ int rtnl_open_byproto(struct rtnl_handle *rth, unsigned subscriptions,
 
 	memset(rth, 0, sizeof(*rth));
 
+	rth->proto = protocol;
 	rth->fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol);
 	if (rth->fd < 0) {
 		perror("Cannot open netlink socket");
@@ -245,6 +246,10 @@ int rtnl_dump_filter_l(struct rtnl_handle *rth,
 							"ERROR truncated\n");
 					} else {
 						errno = -err->error;
+						if (rth->proto == NETLINK_SOCK_DIAG &&
+						    errno == ENOENT)
+							return -1;
+
 						perror("RTNETLINK answers");
 					}
 					return -1;
diff --git a/misc/ss.c b/misc/ss.c
index a99294d..68df122 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -41,6 +41,8 @@
 #include <linux/packet_diag.h>
 #include <linux/netlink_diag.h>
 
+#define MAGIC_SEQ 123456
+
 #define DIAG_REQUEST(_req, _r)						    \
 	struct {							    \
 		struct nlmsghdr nlh;					    \
@@ -49,7 +51,7 @@
 		.nlh = {						    \
 			.nlmsg_type = SOCK_DIAG_BY_FAMILY,		    \
 			.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST,\
-			.nlmsg_seq = 123456,				    \
+			.nlmsg_seq = MAGIC_SEQ,				    \
 			.nlmsg_len = sizeof(_req),			    \
 		},							    \
 	}
@@ -1777,7 +1779,7 @@ static int tcpdiag_send(int fd, int protocol, struct filter *f)
 		req.nlh.nlmsg_type = DCCPDIAG_GETSOCK;
 	req.nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST;
 	req.nlh.nlmsg_pid = 0;
-	req.nlh.nlmsg_seq = 123456;
+	req.nlh.nlmsg_seq = MAGIC_SEQ;
 	memset(&req.r, 0, sizeof(req.r));
 	req.r.idiag_family = AF_INET;
 	req.r.idiag_states = f->states;
@@ -1937,7 +1939,7 @@ again:
 			struct inet_diag_msg *r = NLMSG_DATA(h);
 
 			if (/*h->nlmsg_pid != rth->local.nl_pid ||*/
-			    h->nlmsg_seq != 123456)
+			    h->nlmsg_seq != MAGIC_SEQ)
 				goto skip_it;
 
 			if (h->nlmsg_type == NLMSG_DONE)
@@ -2422,8 +2424,10 @@ static void unix_list_print(struct unixstat *list, struct filter *f)
 	}
 }
 
-static int unix_show_sock(struct nlmsghdr *nlh, struct filter *f)
+static int unix_show_sock(const struct sockaddr_nl *addr, struct nlmsghdr *nlh,
+		void *arg)
 {
+	struct filter *f = (struct filter *)arg;
 	struct unix_diag_msg *r = NLMSG_DATA(nlh);
 	struct rtattr *tb[UNIX_DIAG_MAX+1];
 	char name[128];
@@ -2512,90 +2516,30 @@ static int unix_show_sock(struct nlmsghdr *nlh, struct filter *f)
 	return 0;
 }
 
-static int handle_netlink_request(struct filter *f, FILE *dump_fp,
-				  struct nlmsghdr *req, size_t size,
-				  int (* show_one_sock)(struct nlmsghdr *nlh, struct filter *f))
+static int handle_netlink_request(struct filter *f, struct nlmsghdr *req,
+		size_t size, rtnl_filter_t show_one_sock)
 {
-	int fd;
-	char	buf[16384];
+	int ret = -1;
+	struct rtnl_handle rth;
 
-	if ((fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_INET_DIAG)) < 0)
+	if (rtnl_open_byproto(&rth, 0, NETLINK_SOCK_DIAG))
 		return -1;
 
-	if (send(fd, req, size, 0) < 0) {
-		close(fd);
-		return -1;
-	}
+	rth.dump = MAGIC_SEQ;
 
-	while (1) {
-		ssize_t status;
-		struct nlmsghdr *h;
-		struct sockaddr_nl nladdr;
-		socklen_t slen = sizeof(nladdr);
+	if (rtnl_send(&rth, req, size) < 0)
+		goto Exit;
 
-		status = recvfrom(fd, buf, sizeof(buf), 0,
-				  (struct sockaddr *) &nladdr, &slen);
-		if (status < 0) {
-			if (errno == EINTR)
-				continue;
-			perror("OVERRUN");
-			continue;
-		}
-		if (status == 0) {
-			fprintf(stderr, "EOF on netlink\n");
-			goto close_it;
-		}
-
-		if (dump_fp)
-			fwrite(buf, 1, NLMSG_ALIGN(status), dump_fp);
-
-		h = (struct nlmsghdr*)buf;
-		while (NLMSG_OK(h, status)) {
-			int err;
+	if (rtnl_dump_filter(&rth, show_one_sock, f))
+		goto Exit;
 
-			if (/*h->nlmsg_pid != rth->local.nl_pid ||*/
-			    h->nlmsg_seq != 123456)
-				goto skip_it;
-
-			if (h->nlmsg_type == NLMSG_DONE)
-				goto close_it;
-
-			if (h->nlmsg_type == NLMSG_ERROR) {
-				struct nlmsgerr *err = (struct nlmsgerr*)NLMSG_DATA(h);
-				if (h->nlmsg_len < NLMSG_LENGTH(sizeof(struct nlmsgerr))) {
-					fprintf(stderr, "ERROR truncated\n");
-				} else {
-					errno = -err->error;
-					if (errno != ENOENT)
-						fprintf(stderr, "DIAG answers %d\n", errno);
-				}
-				close(fd);
-				return -1;
-			}
-			if (!dump_fp) {
-				err = show_one_sock(h, f);
-				if (err < 0) {
-					close(fd);
-					return err;
-				}
-			}
-
-skip_it:
-			h = NLMSG_NEXT(h, status);
-		}
-
-		if (status) {
-			fprintf(stderr, "!!!Remnant of size %zd\n", status);
-			exit(1);
-		}
-	}
-
-close_it:
-	close(fd);
-	return 0;
+	ret = 0;
+Exit:
+	rtnl_close(&rth);
+	return ret;
 }
 
-static int unix_show_netlink(struct filter *f, FILE *dump_fp)
+static int unix_show_netlink(struct filter *f)
 {
 	DIAG_REQUEST(req, struct unix_diag_req r);
 
@@ -2605,8 +2549,7 @@ static int unix_show_netlink(struct filter *f, FILE *dump_fp)
 	if (show_mem)
 		req.r.udiag_show |= UDIAG_SHOW_MEMINFO;
 
-	return handle_netlink_request(f, dump_fp, &req.nlh,
-					sizeof(req), unix_show_sock);
+	return handle_netlink_request(f, &req.nlh, sizeof(req), unix_show_sock);
 }
 
 static int unix_show(struct filter *f)
@@ -2619,7 +2562,7 @@ static int unix_show(struct filter *f)
 	struct unixstat *list = NULL;
 
 	if (!getenv("PROC_NET_UNIX") && !getenv("PROC_ROOT")
-	    && unix_show_netlink(f, NULL) == 0)
+	    && unix_show_netlink(f) == 0)
 		return 0;
 
 	if ((fp = net_unix_open()) == NULL)
@@ -2693,7 +2636,8 @@ static int unix_show(struct filter *f)
 	return 0;
 }
 
-static int packet_show_sock(struct nlmsghdr *nlh, struct filter *f)
+static int packet_show_sock(const struct sockaddr_nl *addr,
+		struct nlmsghdr *nlh, void *arg)
 {
 	struct packet_diag_msg *r = NLMSG_DATA(nlh);
 	struct rtattr *tb[PACKET_DIAG_MAX+1];
@@ -2786,15 +2730,14 @@ static int packet_show_sock(struct nlmsghdr *nlh, struct filter *f)
 	return 0;
 }
 
-static int packet_show_netlink(struct filter *f, FILE *dump_fp)
+static int packet_show_netlink(struct filter *f)
 {
 	DIAG_REQUEST(req, struct packet_diag_req r);
 
 	req.r.sdiag_family = AF_PACKET;
 	req.r.pdiag_show = PACKET_SHOW_INFO | PACKET_SHOW_MEMINFO | PACKET_SHOW_FILTER;
 
-	return handle_netlink_request(f, dump_fp, &req.nlh, sizeof(req),
-			packet_show_sock);
+	return handle_netlink_request(f, &req.nlh, sizeof(req), packet_show_sock);
 }
 
 
@@ -2811,7 +2754,7 @@ static int packet_show(struct filter *f)
 	int ino;
 	unsigned long long sk;
 
-	if (packet_show_netlink(f, NULL) == 0)
+	if (packet_show_netlink(f) == 0)
 		return 0;
 
 	if ((fp = net_packet_open()) == NULL)
@@ -2982,8 +2925,10 @@ static void netlink_show_one(struct filter *f,
 	return;
 }
 
-static int netlink_show_sock(struct nlmsghdr *nlh, struct filter *f)
+static int netlink_show_sock(const struct sockaddr_nl *addr,
+		struct nlmsghdr *nlh, void *arg)
 {
+	struct filter *f = (struct filter *)arg;
 	struct netlink_diag_msg *r = NLMSG_DATA(nlh);
 	struct rtattr *tb[NETLINK_DIAG_MAX+1];
 	int rq = 0, wq = 0;
@@ -3016,7 +2961,7 @@ static int netlink_show_sock(struct nlmsghdr *nlh, struct filter *f)
 	return 0;
 }
 
-static int netlink_show_netlink(struct filter *f, FILE *dump_fp)
+static int netlink_show_netlink(struct filter *f)
 {
 	DIAG_REQUEST(req, struct netlink_diag_req r);
 
@@ -3024,8 +2969,7 @@ static int netlink_show_netlink(struct filter *f, FILE *dump_fp)
 	req.r.sdiag_protocol = NDIAG_PROTO_ALL;
 	req.r.ndiag_show = NDIAG_SHOW_GROUPS | NDIAG_SHOW_MEMINFO;
 
-	return handle_netlink_request(f, dump_fp, &req.nlh,
-					sizeof(req), netlink_show_sock);
+	return handle_netlink_request(f, &req.nlh, sizeof(req), netlink_show_sock);
 }
 
 static int netlink_show(struct filter *f)
@@ -3038,7 +2982,7 @@ static int netlink_show(struct filter *f)
 	unsigned long long sk, cb;
 
 	if (!getenv("PROC_NET_NETLINK") && !getenv("PROC_ROOT") &&
-		netlink_show_netlink(f, NULL) == 0)
+		netlink_show_netlink(f) == 0)
 		return 0;
 
 	if ((fp = net_netlink_open()) == NULL)
-- 
2.1.3

^ permalink raw reply related

* Re: [PATCH v3 net-next 2/2] tuntap: Increase the number of queues in tun.
From: Pankaj Gupta @ 2014-12-04 10:42 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Jason Wang, linux-kernel, netdev, davem, dgibson, vfalico,
	edumazet, vyasevic, hkchu, wuzhy, xemul, therbert, bhutchings,
	xii, stephen, jiri, sergei shtylyov
In-Reply-To: <20141204102013.GC17122@redhat.com>


> 
> On Thu, Dec 04, 2014 at 03:03:34AM +0008, Jason Wang wrote:
> > 
> > 
> > On Wed, Dec 3, 2014 at 5:52 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > >On Wed, Dec 03, 2014 at 12:49:37PM +0530, Pankaj Gupta wrote:
> > >> Networking under kvm works best if we allocate a per-vCPU RX and TX
> > >> queue in a virtual NIC. This requires a per-vCPU queue on the host
> > >>side.
> > >> It is now safe to increase the maximum number of queues.
> > >> Preceding patche: 'net: allow large number of rx queues'
> > >
> > >s/patche/patch/
> > >
> > >> made sure this won't cause failures due to high order memory
> > >> allocations. Increase it to 256: this is the max number of vCPUs
> > >> KVM supports.
> > >> Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
> > >> Reviewed-by: David Gibson <dgibson@redhat.com>
> > >
> > >Hmm it's kind of nasty that each tun device is now using x16 memory.
> > >Maybe we should look at using a flex array instead, and removing the
> > >limitation altogether (e.g. make it INT_MAX)?
> > 
> > But this only happens when IFF_MULTIQUEUE were used.
> 
> I refer to this field:
>         struct tun_file __rcu   *tfiles[MAX_TAP_QUEUES];
> if we make MAX_TAP_QUEUES 256, this will use 4K bytes,
> apparently unconditionally.

Are you suggesting we use a flex array for tfiles in place of the array of tun_file
pointers, and grow it dynamically when/if needed?

If yes, I agree it will all be order-1 allocations, but it will add some level
of indirection, as pointed out by DaveM for flow, this time for tfiles. But yes, dynamically
allocating a flex array as per usage will help to minimise memory pressure, which in this
case is high, 256.

> 
> 
> > And core has vmalloc() fallback.
> > So probably not a big issue?
> > >
> > >
> > >
> > >> ---
> > >>  drivers/net/tun.c | 9 +++++----
> > >>  1 file changed, 5 insertions(+), 4 deletions(-)
> > >> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> > >> index e3fa65a..a19dc5f8 100644
> > >> --- a/drivers/net/tun.c
> > >> +++ b/drivers/net/tun.c
> > >> @@ -113,10 +113,11 @@ struct tap_filter {
> > >>  	unsigned char	addr[FLT_EXACT_COUNT][ETH_ALEN];
> > >>  };
> > >>    -/* DEFAULT_MAX_NUM_RSS_QUEUES were chosen to let the rx/tx queues
> > >>allocated for
> > >> - * the netdevice to be fit in one page. So we can make sure the
> > >>success of
> > >> - * memory allocation. TODO: increase the limit. */
> > >> -#define MAX_TAP_QUEUES DEFAULT_MAX_NUM_RSS_QUEUES
> > >> +/* MAX_TAP_QUEUES 256 is chosen to allow rx/tx queues to be equal
> > >> + * to max number of vCPUS in guest. Also, we are making sure here
> > >> + * queue memory allocation do not fail.
> > >
> > >It's not queue memory allocation anymore, is it?
> > >I would say "
> > >This also helps the tfiles field fit in 4K, so the whole tun
> > >device only needs an order-1 allocation.
> > >"
> > >
> > >> + */
> > >> +#define MAX_TAP_QUEUES 256
> > >>  #define MAX_TAP_FLOWS  4096
> > >>  #define TUN_FLOW_EXPIRE (3 * HZ)
> > >> --  1.8.3.1
> > >> --
> > >> To unsubscribe from this list: send the line "unsubscribe netdev" in
> > >> the body of a message to majordomo@vger.kernel.org
> > >> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply

* Re: [PATCH v3 net-next 1/2] net: allow large number of rx queues
From: Pankaj Gupta @ 2014-12-04 10:45 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: linux-kernel, netdev, davem, jasowang, dgibson, vfalico, edumazet,
	vyasevic, hkchu, wuzhy, xemul, therbert, bhutchings, xii, stephen,
	jiri, sergei shtylyov
In-Reply-To: <20141203094224.GB9487@redhat.com>


> 
> On Wed, Dec 03, 2014 at 12:49:36PM +0530, Pankaj Gupta wrote:
> > netif_alloc_rx_queues() uses kcalloc() to allocate memory
> > for "struct netdev_queue *_rx" array.
> > If we are doing large rx queue allocation kcalloc() might
> > fail, so this patch does a fallback to vzalloc().
> > Similar implementation is done for tx queue allocation in
> > netif_alloc_netdev_queues().
> > 
> > We avoid failure of high order memory allocation
> > with the help of vzalloc(), this allows us to do large
> > rx and tx queue allocation which in turn helps us to
> > increase the number of queues in tun.
> > 
> > As vmalloc() adds overhead on a critical network path,
> > __GFP_REPEAT flag is used with kzalloc() to do this fallback
> > only when really needed.
> > 
> > Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
> > Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
> > Reviewed-by: David Gibson <dgibson@redhat.com>
> > ---
> >  net/core/dev.c | 19 +++++++++++++------
> >  1 file changed, 13 insertions(+), 6 deletions(-)
> > 
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index e916ba8..abe9560 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -6059,17 +6059,25 @@ void netif_stacked_transfer_operstate(const struct
> > net_device *rootdev,
> >  EXPORT_SYMBOL(netif_stacked_transfer_operstate);
> >  
> >  #ifdef CONFIG_SYSFS
> > +static void netif_free_rx_queues(struct net_device *dev)
> > +{
> > +	kvfree(dev->_rx);
> > +}
> > +
> 
> I would just open-code this.

I will make the changes with the next version.
Thanks,
Pankaj
> 
> >  static int netif_alloc_rx_queues(struct net_device *dev)
> >  {
> >  	unsigned int i, count = dev->num_rx_queues;
> >  	struct netdev_rx_queue *rx;
> > +	size_t sz = count * sizeof(*rx);
> >  
> >  	BUG_ON(count < 1);
> >  
> > -	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
> > -	if (!rx)
> > -		return -ENOMEM;
> > -
> > +	rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
> > +	if (!rx) {
> > +		rx = vzalloc(sz);
> > +		if (!rx)
> > +			return -ENOMEM;
> > +	}
> >  	dev->_rx = rx;
> >  
> >  	for (i = 0; i < count; i++)
> > @@ -6698,9 +6706,8 @@ void free_netdev(struct net_device *dev)
> >  
> >  	netif_free_tx_queues(dev);
> >  #ifdef CONFIG_SYSFS
> > -	kfree(dev->_rx);
> > +	netif_free_rx_queues(dev);
> >  #endif
> > -
> 
> and I think it's nicer with the empty line.
> 
> >  	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
> >  
> >  	/* Flush device addresses */
> > --
> > 1.8.3.1
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe netdev" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply

* [PATCH net-next] bnx2x: Use correct fastpath version for VFs.
From: Yuval Mintz @ 2014-12-04 10:52 UTC (permalink / raw)
  To: davem, netdev; +Cc: Ariel.Elior, Yuval Mintz

Our FW can support several fastpath HSI [for backward compatibility] but up
until now VFs were always configured to use latest fastpath HSI [although VF
driver might be older and use an older fastpath HSI].

For linux drivers, the differences are insignificant since driver never
utilized features that were overridden by the HSI change. But for VMs running
other operating systems this might be a problem.
In addition, eventually FW might change fastpath HSI in such a manner that
backward compatibility WILL break unless configured with proper version.

This patch fixes the issue for other operating system VMs, as well as lays
the ground work for forward compatibility in regard to the fastpath HSI.

Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: Ariel Elior <Ariel.Elior@qlogic.com>
---
Hi Dave,

Please consider applying this patch to `net-next'.

[Notice there's a single too-long line in it, but there was no 'good'
way of preventing it]

Thanks,
Yuval Mintz

---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c  |  2 +
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c    |  2 +-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h    |  2 +
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c |  1 +
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h |  2 +
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c  | 75 ++++++++++++++++++++++-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.h  |  9 ++-
 7 files changed, 88 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 336ef3c..07c6368 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -3163,6 +3163,8 @@ static void bnx2x_pf_q_prep_general(struct bnx2x *bp,
 		gen_init->mtu = bp->dev->mtu;
 
 	gen_init->cos = cos;
+
+	gen_init->fp_hsi = ETH_FP_HSI_VERSION;
 }
 
 static void bnx2x_pf_rx_q_prep(struct bnx2x *bp,
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c
index 7bc2924..07cdf9b 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c
@@ -4336,7 +4336,7 @@ static void bnx2x_q_fill_init_general_data(struct bnx2x *bp,
 		test_bit(BNX2X_Q_FLG_FCOE, flags) ?
 		LLFC_TRAFFIC_TYPE_FCOE : LLFC_TRAFFIC_TYPE_NW;
 
-	gen_data->fp_hsi_ver = ETH_FP_HSI_VERSION;
+	gen_data->fp_hsi_ver = params->fp_hsi;
 
 	DP(BNX2X_MSG_SP, "flags: active %d, cos %d, stats en %d\n",
 	   gen_data->activate_flg, gen_data->cos, gen_data->statistics_en_flg);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h
index e97275f..86baecb 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h
@@ -937,6 +937,8 @@ struct bnx2x_general_setup_params {
 	u8		spcl_id;
 	u16		mtu;
 	u8		cos;
+
+	u8		fp_hsi;
 };
 
 struct bnx2x_rxq_setup_params {
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
index c88b20a..e5aca2d 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
@@ -193,6 +193,7 @@ void bnx2x_vfop_qctor_prep(struct bnx2x *bp,
 	/* Setup-op general parameters */
 	setup_p->gen_params.spcl_id = vf->sp_cl_id;
 	setup_p->gen_params.stat_id = vfq_stat_id(vf, q);
+	setup_p->gen_params.fp_hsi = vf->fp_hsi;
 
 	/* Setup-op pause params:
 	 * Nothing to do, the pause thresholds are set by default to 0 which
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h
index 01bafa4..66ee62a 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h
@@ -205,6 +205,8 @@ struct bnx2x_virtf {
 	/* slow-path operations */
 	struct mutex			op_mutex; /* one vfop at a time mutex */
 	enum channel_tlvs		op_current;
+
+	u8 fp_hsi;
 };
 
 #define BNX2X_NR_VIRTFN(bp)	((bp)->vfdb->sriov.nr_virtfn)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
index b1d9c44..be40eab 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
@@ -224,6 +224,7 @@ int bnx2x_vfpf_acquire(struct bnx2x *bp, u8 tx_count, u8 rx_count)
 	struct vfpf_acquire_tlv *req = &bp->vf2pf_mbox->req.acquire;
 	struct pfvf_acquire_resp_tlv *resp = &bp->vf2pf_mbox->resp.acquire_resp;
 	struct vfpf_port_phys_id_resp_tlv *phys_port_resp;
+	struct vfpf_fp_hsi_resp_tlv *fp_hsi_resp;
 	u32 vf_id;
 	bool resources_acquired = false;
 
@@ -237,6 +238,7 @@ int bnx2x_vfpf_acquire(struct bnx2x *bp, u8 tx_count, u8 rx_count)
 
 	req->vfdev_info.vf_id = vf_id;
 	req->vfdev_info.vf_os = 0;
+	req->vfdev_info.fp_hsi_ver = ETH_FP_HSI_VERSION;
 
 	req->resc_request.num_rxqs = rx_count;
 	req->resc_request.num_txqs = tx_count;
@@ -316,9 +318,14 @@ int bnx2x_vfpf_acquire(struct bnx2x *bp, u8 tx_count, u8 rx_count)
 			memset(&bp->vf2pf_mbox->resp, 0,
 			       sizeof(union pfvf_tlvs));
 		} else {
-			/* PF reports error */
-			BNX2X_ERR("Failed to get the requested amount of resources: %d. Breaking...\n",
-				  bp->acquire_resp.hdr.status);
+			/* Determine reason of PF failure of acquire process */
+			fp_hsi_resp = bnx2x_search_tlv_list(bp, resp,
+							    CHANNEL_TLV_FP_HSI_SUPPORT);
+			if (fp_hsi_resp && !fp_hsi_resp->is_supported)
+				BNX2X_ERR("Old hypervisor - doesn't support current fastpath HSI version; Need to downgrade VF driver [or upgrade hypervisor]\n");
+			else
+				BNX2X_ERR("Failed to get the requested amount of resources: %d. Breaking...\n",
+					  bp->acquire_resp.hdr.status);
 			rc = -EAGAIN;
 			goto out;
 		}
@@ -333,6 +340,25 @@ int bnx2x_vfpf_acquire(struct bnx2x *bp, u8 tx_count, u8 rx_count)
 		bp->flags |= HAS_PHYS_PORT_ID;
 	}
 
+	/* Old Hypervisors might not even support the FP_HSI_SUPPORT TLV.
+	 * If that's the case, we need to make certain required FW was
+	 * supported by such a hypervisor [i.e., v0-v2].
+	 */
+	fp_hsi_resp = bnx2x_search_tlv_list(bp, resp,
+					    CHANNEL_TLV_FP_HSI_SUPPORT);
+	if (!fp_hsi_resp && (ETH_FP_HSI_VERSION > ETH_FP_HSI_VER_2)) {
+		BNX2X_ERR("Old hypervisor - need to downgrade VF's driver\n");
+
+		/* Since acquire succeeded on the PF side, we need to send a
+		 * release message in order to allow future probes.
+		 */
+		bnx2x_vfpf_finalize(bp, &req->first_tlv);
+		bnx2x_vfpf_release(bp);
+
+		rc = -EINVAL;
+		goto out;
+	}
+
 	/* get HW info */
 	bp->common.chip_id |= (bp->acquire_resp.pfdev_info.chip_num & 0xffff);
 	bp->link_params.chip_id = bp->common.chip_id;
@@ -1125,6 +1151,26 @@ static void bnx2x_vf_mbx_resp_phys_port(struct bnx2x *bp,
 	*offset += sizeof(struct vfpf_port_phys_id_resp_tlv);
 }
 
+static void bnx2x_vf_mbx_resp_fp_hsi_ver(struct bnx2x *bp,
+					 struct bnx2x_virtf *vf,
+					 void *buffer,
+					 u16 *offset)
+{
+	struct vfpf_fp_hsi_resp_tlv *fp_hsi;
+
+	bnx2x_add_tlv(bp, buffer, *offset, CHANNEL_TLV_FP_HSI_SUPPORT,
+		      sizeof(struct vfpf_fp_hsi_resp_tlv));
+
+	fp_hsi = (struct vfpf_fp_hsi_resp_tlv *)
+		 (((u8 *)buffer) + *offset);
+	fp_hsi->is_supported = (vf->fp_hsi > ETH_FP_HSI_VERSION) ? 0 : 1;
+
+	/* Offset should continue representing the offset to the tail
+	 * of TLV data (outside this function scope)
+	 */
+	*offset += sizeof(struct vfpf_fp_hsi_resp_tlv);
+}
+
 static void bnx2x_vf_mbx_acquire_resp(struct bnx2x *bp, struct bnx2x_virtf *vf,
 				      struct bnx2x_vf_mbx *mbx, int vfop_status)
 {
@@ -1219,6 +1265,12 @@ static void bnx2x_vf_mbx_acquire_resp(struct bnx2x *bp, struct bnx2x_virtf *vf,
 				  CHANNEL_TLV_PHYS_PORT_ID))
 		bnx2x_vf_mbx_resp_phys_port(bp, vf, &mbx->msg->resp, &length);
 
+	/* `New' vfs will want to know if fastpath HSI is supported, since
+	 * if that's not the case they could print into system log the fact
+	 * the driver version must be updated.
+	 */
+	bnx2x_vf_mbx_resp_fp_hsi_ver(bp, vf, &mbx->msg->resp, &length);
+
 	bnx2x_add_tlv(bp, &mbx->msg->resp, length, CHANNEL_TLV_LIST_END,
 		      sizeof(struct channel_list_end_tlv));
 
@@ -1288,6 +1340,23 @@ static void bnx2x_vf_mbx_acquire(struct bnx2x *bp, struct bnx2x_virtf *vf,
 		goto out;
 	}
 
+	/* Verify the VF fastpath HSI can be supported by the loaded FW.
+	 * Linux vfs should be oblivious to changes between v0 and v2.
+	 */
+	if (bnx2x_vf_mbx_is_windows_vm(bp, &mbx->msg->req.acquire))
+		vf->fp_hsi = acquire->vfdev_info.fp_hsi_ver;
+	else
+		vf->fp_hsi = max_t(u8, acquire->vfdev_info.fp_hsi_ver,
+				   ETH_FP_HSI_VER_2);
+	if (vf->fp_hsi > ETH_FP_HSI_VERSION) {
+		DP(BNX2X_MSG_IOV,
+		   "VF [%d] - Can't support acquire request since VF requests a FW version which is too new [%02x > %02x]\n",
+		   vf->abs_vfid, acquire->vfdev_info.fp_hsi_ver,
+		   ETH_FP_HSI_VERSION);
+		rc = -EINVAL;
+		goto out;
+	}
+
 	/* acquire the resources */
 	rc = bnx2x_vf_acquire(bp, vf, &acquire->resc_request);
 
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.h
index 15670c4..b86479f 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.h
@@ -124,7 +124,7 @@ struct vfpf_acquire_tlv {
 #define VF_OS_UNDEFINED		(0 << VF_OS_SHIFT)
 #define VF_OS_WINDOWS		(1 << VF_OS_SHIFT)
 
-		u8 padding;
+		u8 fp_hsi_ver;
 		u8 caps;
 #define VF_CAP_SUPPORT_EXT_BULLETIN	(1 << 0)
 	} vfdev_info;
@@ -204,6 +204,12 @@ struct vfpf_port_phys_id_resp_tlv {
 	u8 padding[2];
 };
 
+struct vfpf_fp_hsi_resp_tlv {
+	struct channel_tlv tl;
+	u8 is_supported;
+	u8 padding[3];
+};
+
 #define VFPF_INIT_FLG_STATS_COALESCE	(1 << 0) /* when set the VFs queues
 						  * stats will be coalesced on
 						  * the leading RSS queue
@@ -448,6 +454,7 @@ enum channel_tlvs {
 	CHANNEL_TLV_UPDATE_RSS,
 	CHANNEL_TLV_PHYS_PORT_ID,
 	CHANNEL_TLV_UPDATE_TPA,
+	CHANNEL_TLV_FP_HSI_SUPPORT,
 	CHANNEL_TLV_MAX
 };
 
-- 
1.9.3

^ permalink raw reply related

* Re: [PATCH] SSB / B44: fix WOL for BCM4401
From: Andrey Skvortsov @ 2014-12-04 11:11 UTC (permalink / raw)
  To: Michael Büsch
  Cc: John W. Linville, Larry Finger, Rafael J. Wysocki, Gary.Zambrano,
	netdev, linux-kernel, b43-dev, Rafał Miłecki
In-Reply-To: <20141203172315.120040c2@wiggum>

[-- Attachment #1: Type: text/plain, Size: 820 bytes --]

On Wed, Dec 03, 2014 at 05:23:15PM +0100, Michael Büsch wrote:

> > > > > That sounds good, indeed.
> > > > > I'd still prefer, if someone with b43 (wireless) would test it, too.
> > > > 
> > > > I did a partial test with my PowerBook G4. With the patch installed, it would 
> > > > both suspend and hibernate, but WOL would be impossible. This computer uses a 
> > > > PCMCIA version of the BCM4318, and power is turned off to the PCMCIA card when 
> > > > suspended or hibernating.
> > > 
> > > Thanks for testing.
> > > 
> > > John, can you take this one? Or do we need to split the b44 part out?
> > > I added my Signed-off.
> > 
> > Um, sure...3.19 is OK I presume?
> 
> I think we could even wait for 3.20.
>

Thanks, guys.


-- 
Best regards,
Andrey Skvortsov

PGP Key ID: 0x57A3AEAD

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* Re: Is this 32-bit NCM?y
From: Bjørn Mork @ 2014-12-04 11:44 UTC (permalink / raw)
  To: Midge Shaojun Tan
  Cc: Enrico Mioso, Kevin Zhu, Eli Britstein, Alex Strizhevsky,
	youtux@gmail.com, linux-usb@vger.kernel.org,
	netdev@vger.kernel.org
In-Reply-To: <AMSPR06MB6011E001029C251790CB923EE780@AMSPR06MB601.eurprd06.prod.outlook.com>

"Midge Shaojun  Tan" <ShaojunMidge.Tan@audiocodes.com> writes:

> Hi all,
>
> I test OK with kervel 3.16.4
> Need disable other Ethernet network, just like eth1. (Then the DNS and route is OK)
> And also need disable arp, (ifconfig wwan0 -arp up), because China UNICOM don't respond the ARP message.

The ARP functionality is independent of operator.  It is handled
internally by the modem firmware.  There are no MAC addresses or
ethernet headers transmitted over the radio link.  That's all faked by
the modem.  All MAC addresses and ethernet headers are local to the
modem<->host USB link.

> With new mode switch string: /etc/usb_modeswitch.d/12d1:14fe
> Please see the patch and check whether it is correct?

I see that you have two changes there:

1) the ETH_HLEN adjustment of ctx->tx_remainder is dropped
2) the NDP is placed after the first frame.

I haven't verified the effect of the tx_remainder change, but I assume
it fixes an alignment problem for this device.  I'd like to look more at
the effect of this for different values of wNdpOutPayloadRemainder and
wNdpOutDivisor.

We can choose to put the NDP at the end of the NTB if we find that this
fixes some problem, but doing so by default for every NCM and MBIM
device is a bit risky. If we accept that some devices are so buggy that
the NDP cannot be placed anywhere (as required by the spec), then we
have to assume that this goes both ways.  Which means that moving the
NDP to the end of the NTB might break some other device.  We just don't
know that since we haven't ever tried it.

And your fix doesn't really move it to the end either.  It just places
the NDP after the first ethernet packet.  Which happens to be the end if
there is only one packet in the NTB. But if we aggregate more packets
into this NTB then the result will look like this:

 NTH
 eth packet 1
 NDP
 eth packet 2
 ..
 eth packet N

I'm not convinced this modem will handle that if it cannot handle the
NDP being before the first packet...  This needs to be tested.  Try
increasing /sys/class/net/wwan0/cdc_ncm/tx_timer_usecs to force the
driver to aggregate packets and see if everything still works.
Preferably while looking at the resulting NTB to verify that it does
contain more than one ethernet packet.

I realize I sound a bit negative now.  This is absolutely not my
intention. This is great work, providing some real progress wrt figuring
out what goes on here.  Thanks a lot!  I am sure we can sort out the
remaining issues, which are really minor compared to what you have found
so far.

Bjørn

^ permalink raw reply

* Possible regression: "gre: Use inner mac length when computing tunnel length"
From: Timo Teras @ 2014-12-04 12:16 UTC (permalink / raw)
  To: Tom Herbert, Alexander Duyck, netdev

Hi,

After upgrading to latest 3.14.24 or newer, I noticed a weird TSO bug
in the "dmvpn" setup I use. And seems 3.14.23 works just fine. So the
commit 14051f0452a2c26a "gre: Use inner mac length when computing
tunnel length" would appear to be the related commit (but have not yet
tested this).

In practice what happens is that forwarding path between ethX (or vlanX)
and gre1 gets broken.

There's probably two differences to the "regular" gre tunnel case:
- it's nbma mode, meaning the gre header is inserted via slightly
  different code path
- the gre1 packets are IPsec encrypted in transport mode

As additional detail, doing "ethtool -K gre1 tso off" will workaround
the issue, so it is clearly tso issue pointing even further to the
commit in question.

Is this something the suspected patch could cause? Any suggestions
what to test more?

Thanks,
Timo

^ permalink raw reply

* Re: Is this 32-bit NCM?y
From: Enrico Mioso @ 2014-12-04 12:17 UTC (permalink / raw)
  To: Bjørn Mork
  Cc: Midge Shaojun Tan, Kevin Zhu, Eli Britstein, Alex Strizhevsky,
	youtux@gmail.com, linux-usb@vger.kernel.org,
	netdev@vger.kernel.org
In-Reply-To: <877fy7myfb.fsf@nemi.mork.no>

[-- Attachment #1: Type: TEXT/PLAIN, Size: 3995 bytes --]




On Thu, 4 Dec 2014, Bjørn Mork wrote:

> "Midge Shaojun  Tan" <ShaojunMidge.Tan@audiocodes.com> writes:
>
>> Hi all,
>>
>> I test OK with kervel 3.16.4
>> Need disable other Ethernet network, just like eth1. (Then the DNS and route is OK)
>> And also need disable arp, (ifconfig wwan0 -arp up), because China UNICOM don't respond the ARP message.
>
> The ARP functionality is independent of operator.  It is handled
> internally by the modem firmware.  There are no MAC addresses or
> ethernet headers transmitted over the radio link.  That's all faked by
> the modem.  All MAC addresses and ethernet headers are local to the
> modem<->host USB link.
>
>> With new mode switch string: /etc/usb_modeswitch.d/12d1:14fe
>> Please see the patch and check whether it is correct?
>
> I see that you have two changes there:
>
> 1) the ETH_HLEN adjustment of ctx->tx_remainder is dropped
> 2) the NDP is placed after the first frame.
>
> I haven't verified the effect of the tx_remainder change, but I assume
> it fixes an alignment problem for this device.  I'd like to look more at
> the effect of this for different values of wNdpOutPayloadRemainder and
> wNdpOutDivisor.
>
> We can choose to put the NDP at the end of the NTB if we find that this
> fixes some problem, but doing so by default for every NCM and MBIM
> device is a bit risky. If we accept that some devices are so buggy that
> the NDP cannot be placed anywhere (as required by the spec), then we
> have to assume that this goes both ways.  Which means that moving the
> NDP to the end of the NTB might break some other device.  We just don't
> know that since we haven't ever tried it.

Guys - this is a little hard moment for my life, so sorry if I sound negative 
too, really. No humor here.

Bjorn, I am starting to think that we might need to (sadly) differentiate how
we treat "standard" NCM devices in respect to Huawei ones, a little bit more.
As you said - changing the handling in any way would be too risky - and
definitely, I am starting to think that different firmware versions have
different bugs in this regard (might be not in NCM handling itself, but in
respect to other areas, like ARP handling / faking ).
At the end we might come out with a good enough solution - but we would start 
to complicate things more and more, and definitely we might end up complicating
more and more the driver itself. At some point, we might also end up finding 
out that we need different workarounds for different firmwares, not even 
compatible with one another.
so - my idea, that is hypothetical:
1 - Factorize out cdc_ncm more
2 - Creating new {tx,rx}_fixup routines applying different workarounds.

then, when the need to apply different workarounds for different devices (or 
firmware versions) arises, we might also think of a way to communicate to the 
driver the firmware version, allowing it to select some quirks to apply.
What do you all think about it?

> And your fix doesn't really move it to the end either.  It just places
> the NDP after the first ethernet packet.  Which happens to be the end if
> there is only one packet in the NTB. But if we aggregate more packets
> into this NTB then the result will look like this:
>
> NTH
> eth packet 1
> NDP
> eth packet 2
> ..
> eth packet N
>
> I'm not convinced this modem will handle that if it cannot handle the
> NDP being before the first packet...  This needs to be tested.  Try
> increasing /sys/class/net/wwan0/cdc_ncm/tx_timer_usecs to force the
> driver to aggregate packets and see if everything still works.
> Preferably while looking at the resulting NTB to verify that it does
> contain more than one ethernet packet.
>
> I realize I sound a bit negative now.  This is absolutely not my
> intention. This is great work, providing some real progress wrt figuring
> out what goes on here.  Thanks a lot!  I am sure we can sort out the
> remaining issues, which are really minor compared to what you have found
> so far.
>
>
>
> Bjørn
>
>
I'll test this now.

^ permalink raw reply

* Re: Is this 32-bit NCM?y
From: Enrico Mioso @ 2014-12-04 12:28 UTC (permalink / raw)
  To: Bjørn Mork
  Cc: Midge Shaojun Tan, Kevin Zhu, Eli Britstein, Alex Strizhevsky,
	youtux-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org,
	linux-usb-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <877fy7myfb.fsf-lbf33ChDnrE/G1V5fR+Y7Q@public.gmane.org>

[-- Attachment #1: Type: TEXT/PLAIN, Size: 3917 bytes --]

... DHCP will work with some DHCPNACKS in the meanwhile, but ping stops working at all.
Otherwise, it works with the standard value:

--- 8.8.8.8 ping statistics ---
48 packets transmitted, 48 received, 0% packet loss, time 47004ms
rtt min/avg/max/mdev = 362.084/392.878/523.132/33.636 ms

And I was expecting effectively to see some lost packets, but instead... no.


On Thu, 4 Dec 2014, Bjørn Mork wrote:

> Date: Thu, 4 Dec 2014 12:44:56
> From: Bjørn Mork <bjorn-yOkvZcmFvRU@public.gmane.org>
> To: Midge Shaojun Tan <ShaojunMidge.Tan-6C2+4RG2qWF0ubjbjo6WXg@public.gmane.org>
> Cc: Enrico Mioso <mrkiko.rs-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>,
>     Kevin Zhu <Mingying.Zhu-6C2+4RG2qWF0ubjbjo6WXg@public.gmane.org>,
>     Eli Britstein <Eli.Britstein-6C2+4RG2qWF0ubjbjo6WXg@public.gmane.org>,
>     Alex Strizhevsky <alexxst-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>,
>     "youtux-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org" <youtux-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>,
>     "linux-usb-u79uwXL29TY76Z2rM5mHXA@public.gmane.org" <linux-usb-u79uwXL29TY76Z2rM5mHXA@public.gmane.org>,
>     "netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org" <netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org>
> Subject: Re: Is this 32-bit NCM?y
> 
> "Midge Shaojun  Tan" <ShaojunMidge.Tan-6C2+4RG2qWF0ubjbjo6WXg@public.gmane.org> writes:
>
>> Hi all,
>>
>> I test OK with kervel 3.16.4
>> Need disable other Ethernet network, just like eth1. (Then the DNS and route is OK)
>> And also need disable arp, (ifconfig wwan0 -arp up), because China UNICOM don't respond the ARP message.
>
> The ARP functionality is independent of operator.  It is handled
> internally by the modem firmware.  There are no MAC addresses or
> ethernet headers transmitted over the radio link.  That's all faked by
> the modem.  All MAC addresses and ethernet headers are local to the
> modem<->host USB link.
>
>> With new mode switch string: /etc/usb_modeswitch.d/12d1:14fe
>> Please see the patch and check whether it is correct?
>
> I see that you have two changes there:
>
> 1) the ETH_HLEN adjustment of ctx->tx_remainder is dropped
> 2) the NDP is placed after the first frame.
>
> I haven't verified the effect of the tx_remainder change, but I assume
> it fixes an alignment problem for this device.  I'd like to look more at
> the effect of this for different values of wNdpOutPayloadRemainder and
> wNdpOutDivisor.
>
> We can choose to put the NDP at the end of the NTB if we find that this
> fixes some problem, but doing so by default for every NCM and MBIM
> device is a bit risky. If we accept that some devices are so buggy that
> the NDP cannot be placed anywhere (as required by the spec), then we
> have to assume that this goes both ways.  Which means that moving the
> NDP to the end of the NTB might break some other device.  We just don't
> know that since we haven't ever tried it.
>
> And your fix doesn't really move it to the end either.  It just places
> the NDP after the first ethernet packet.  Which happens to be the end if
> there is only one packet in the NTB. But if we aggregate more packets
> into this NTB then the result will look like this:
>
> NTH
> eth packet 1
> NDP
> eth packet 2
> ..
> eth packet N
>
> I'm not convinced this modem will handle that if it cannot handle the
> NDP being before the first packet...  This needs to be tested.  Try
> increasing /sys/class/net/wwan0/cdc_ncm/tx_timer_usecs to force the
> driver to aggregate packets and see if everything still works.
> Preferably while looking at the resulting NTB to verify that it does
> contain more than one ethernet packet.
>
> I realize I sound a bit negative now.  This is absolutely not my
> intention. This is great work, providing some real progress wrt figuring
> out what goes on here.  Thanks a lot!  I am sure we can sort out the
> remaining issues, which are really minor compared to what you have found
> so far.
>
>
>
> Bjørn
>
>

^ permalink raw reply

* Re: Where exactly will arch_fast_hash be used
From: Hannes Frederic Sowa @ 2014-12-04 12:34 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Thomas Graf, Daniel Borkmann, David S. Miller, Theodore Ts'o,
	netdev, Linux Kernel Mailing List
In-Reply-To: <20141204081147.GA19030@gondor.apana.org.au>

Hi Herbert,

On Do, 2014-12-04 at 16:11 +0800, Herbert Xu wrote:
> While working on rhashtable it came to me that this whole concept
> of arch_fast_hash is flawed.  CRCs are linear functions so it's
> fairly easy for an attacker to identify collisions or at least
> eliminate a large amount of search space (e.g., controlling the
> last bit of the hash result is almost trivial, even when you add
> a random seed).
> 
> So what exactly are we going to use arch_fast_hash for? Presumably
> it's places where security is never goint to be an issue, right?
> 
> Even if security wasn't an issue, straight CRC32 has really poor
> lower-order bit distribution, which makes it a terrible choice for
> a hash table that simply uses the lower-order bits.

I wondered the same while trying to use arch_fast_hash in a lot more
places (I did a new implementation in assembler I'll send later on, it
is mostly optimized to deal with ovs flow keys).

While the uniformity of crc32 does actually look good and IMHO this even
holds for the lower bits of the hash, I totally agree on the linearity
matters.

The easiest way to make arch_fast_hash non-linear would be to build up
on the crc32 instruction like e.g. the cityhash function family does and
it seems not too hard to do that by combining two crc32c outputs of the
original and cyclic shifted input data. I have doubts if this is faster
than jhash in the end. There are proposals from Intel to do so, but they
are patent encumbered. :/

For most consumers in the networking stack, security and DoS resistance
is an issue. OVS, for which this was designed at first does do rehashing
from time to time, but still there is a possible DoS attack vector with
this hashing algorithm.

Bye,
Hannes

^ permalink raw reply

* [PATCH net-next] arch_fast_hash: avoid indirect function calls and implement hash in asm
From: Hannes Frederic Sowa @ 2014-12-04 13:08 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Jay Vosburgh, Thomas Graf, Daniel Borkmann,
	Eric Dumazet

By default the arch_fast_hash hashing function pointers are initialized
to jhash(2). If during boot-up a CPU with SSE4.2 is detected they get
updated to the CRC32 ones. This dispatching scheme incurs a function
pointer lookup and indirect call for every hashing operation.

To keep the number of clobbered registers short the hashing primitives
are implemented in assembler. This makes it easier to do the dispatch
by alternative_call.

Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Jay Vosburgh <jay.vosburgh@canonical.com>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Daniel Borkmann <dborkman@redhat.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
---
 arch/x86/include/asm/hash.h      |  53 ++++++++++-
 arch/x86/kernel/i386_ksyms_32.c  |   6 ++
 arch/x86/kernel/x8664_ksyms_64.c |   6 ++
 arch/x86/lib/Makefile            |   2 +-
 arch/x86/lib/arch_hash.S         | 192 +++++++++++++++++++++++++++++++++++++++
 arch/x86/lib/hash.c              |  92 -------------------
 arch/x86/lib/jhash.c             |   6 ++
 include/asm-generic/hash.h       |  18 +++-
 include/linux/hash.h             |  34 -------
 lib/Makefile                     |   2 +-
 lib/hash.c                       |  39 --------
 net/openvswitch/flow_table.c     |   2 +-
 12 files changed, 280 insertions(+), 172 deletions(-)
 create mode 100644 arch/x86/lib/arch_hash.S
 delete mode 100644 arch/x86/lib/hash.c
 create mode 100644 arch/x86/lib/jhash.c
 delete mode 100644 lib/hash.c

diff --git a/arch/x86/include/asm/hash.h b/arch/x86/include/asm/hash.h
index e8c58f8..620081b 100644
--- a/arch/x86/include/asm/hash.h
+++ b/arch/x86/include/asm/hash.h
@@ -1,7 +1,56 @@
 #ifndef _ASM_X86_HASH_H
 #define _ASM_X86_HASH_H
 
-struct fast_hash_ops;
-extern void setup_arch_fast_hash(struct fast_hash_ops *ops);
+#include <linux/cpufeature.h>
+#include <asm/alternative.h>
+
+#include <linux/jhash.h>
+
+#ifdef CONFIG_AS_CRC32
+
+u32 __jhash_trampoline(const void *data, u32 len, u32 seed);
+u32 __sse42_crc32(const void *data, u32 len, u32 seed);
+
+#ifdef CONFIG_X86_64
+
+static inline u32 arch_fast_hash(const void *data, u32 len, u32 seed)
+{
+	u32 hash;
+
+	alternative_call(__jhash_trampoline, __sse42_crc32, X86_FEATURE_XMM4_2,
+			 ASM_OUTPUT2("=a" (hash), "=D" (data), "=S" (len),
+				     "=d" (seed)),
+			 "1" (data), "2" (len), "3" (seed)
+			 : "memory", "cc");
+
+	return hash;
+}
+
+#else /* CONFIG_X86_64 */
+
+static inline u32 arch_fast_hash(const void *data, u32 len, u32 seed)
+{
+	u32 hash;
+
+	alternative_call(__jhash_trampoline, __sse42_crc32, X86_FEATURE_XMM4_2,
+			 ASM_OUTPUT2("=a" (hash), "=d" (len), "=c" (seed)),
+			 "0" (data), "1" (len), "2" (seed)
+			 : "memory", "cc");
+
+	return hash;
+}
+
+#endif /* CONFIG_X86_64 */
+
+#else /* CONFIG_AS_CRC32 */
+
+u32 __jhash(const void *data, u32 len, u32 seed);
+
+static inline u32 arch_fast_hash(const void *data, u32 len, u32 seed)
+{
+	return __jhash(data, len, seed);
+}
+
+#endif
 
 #endif /* _ASM_X86_HASH_H */
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 05fd74f..afb98da 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -1,4 +1,5 @@
 #include <linux/module.h>
+#include <linux/hash.h>
 
 #include <asm/checksum.h>
 #include <asm/pgtable.h>
@@ -38,6 +39,11 @@ EXPORT_SYMBOL(strstr);
 EXPORT_SYMBOL(csum_partial);
 EXPORT_SYMBOL(empty_zero_page);
 
+#ifdef CONFIG_AS_CRC32
+EXPORT_SYMBOL(__sse42_crc32);
+EXPORT_SYMBOL(__jhash_trampoline);
+#endif
+
 #ifdef CONFIG_PREEMPT
 EXPORT_SYMBOL(___preempt_schedule);
 #ifdef CONFIG_CONTEXT_TRACKING
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 0406819..1094c13 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -3,6 +3,7 @@
 
 #include <linux/module.h>
 #include <linux/smp.h>
+#include <linux/hash.h>
 
 #include <net/checksum.h>
 
@@ -42,6 +43,11 @@ EXPORT_SYMBOL(clear_page);
 
 EXPORT_SYMBOL(csum_partial);
 
+#ifdef CONFIG_AS_CRC32
+EXPORT_SYMBOL(__sse42_crc32);
+EXPORT_SYMBOL(__jhash_trampoline);
+#endif
+
 /*
  * Export string functions. We normally rely on gcc builtin for most of these,
  * but gcc sometimes decides not to inline them.
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index db92793..168bbef 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -23,7 +23,7 @@ lib-y += memcpy_$(BITS).o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
 
-obj-y += msr.o msr-reg.o msr-reg-export.o hash.o
+obj-y += msr.o msr-reg.o msr-reg-export.o jhash.o arch_hash.o
 
 ifeq ($(CONFIG_X86_32),y)
         obj-y += atomic64_32.o
diff --git a/arch/x86/lib/arch_hash.S b/arch/x86/lib/arch_hash.S
new file mode 100644
index 0000000..ff526a4
--- /dev/null
+++ b/arch/x86/lib/arch_hash.S
@@ -0,0 +1,192 @@
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/calling.h>
+
+#ifdef CONFIG_AS_CRC32
+
+#ifdef CONFIG_X86_64
+
+ENTRY(__jhash_trampoline)
+	CFI_STARTPROC
+
+	pushq_cfi %rcx
+	pushq_cfi %r8
+	pushq_cfi %r9
+	pushq_cfi %r10
+	pushq_cfi %r11
+
+	call __jhash
+
+	popq_cfi %r11
+	popq_cfi %r10
+	popq_cfi %r9
+	popq_cfi %r8
+	popq_cfi %rcx
+
+	retq
+
+	CFI_ENDPROC
+ENDPROC(__jhash_trampoline)
+
+ENTRY(__sse42_crc32)
+	CFI_STARTPROC
+
+	movq %rdx, %rax
+	cmpq $0x40, %rsi
+	jb .Lcrc_32bytes
+	subq $0x40, %rsi
+
+.Lcrc_64bytes:
+	subq $0x40, %rsi
+	crc32q 0*8(%rdi), %rax
+	crc32q 1*8(%rdi), %rax
+	crc32q 2*8(%rdi), %rax
+	crc32q 3*8(%rdi), %rax
+	crc32q 4*8(%rdi), %rax
+	crc32q 5*8(%rdi), %rax
+	crc32q 6*8(%rdi), %rax
+	crc32q 7*8(%rdi), %rax
+	leaq   8*8(%rdi), %rdi
+	jae .Lcrc_64bytes
+	addq $0x40, %rsi
+
+.Lcrc_32bytes:
+	cmpq $0x20, %rsi
+	jb .Lcrc_16bytes
+
+	subq $0x20, %rsi
+	crc32q 0*8(%rdi), %rax
+	crc32q 1*8(%rdi), %rax
+	crc32q 2*8(%rdi), %rax
+	crc32q 3*8(%rdi), %rax
+	leaq   4*8(%rdi), %rdi
+
+.Lcrc_16bytes:
+	cmpq $0x10, %rsi
+	jb .Lcrc_8bytes
+
+	subq $0x10, %rsi
+	crc32q 0*8(%rdi), %rax
+	crc32q 1*8(%rdi), %rax
+	leaq   2*8(%rdi), %rdi
+
+.Lcrc_8bytes:
+	cmpq $0x8, %rsi
+	jb .Lcrc_4bytes
+
+	subq $0x8, %rsi
+	crc32q (%rdi), %rax
+	leaq 1*8(%rdi), %rdi
+
+.Lcrc_4bytes:
+	cmpq $0x4, %rsi
+	jb .Lcrc_2bytes
+
+	subq $0x4, %rsi
+	crc32l (%rdi), %eax
+	leaq   1*4(%rdi), %rdi
+
+.Lcrc_2bytes:
+	cmpq $0x2, %rsi
+	jb .Lcrc_1bytes
+
+	subq $0x2, %rsi
+	crc32w (%rdi), %eax
+	leaq 1*2(%rdi), %rdi
+
+.Lcrc_1bytes:
+	cmpq $0x1, %rsi
+	jb .Lend
+
+	crc32b (%rdi), %eax
+.Lend:
+	retq
+	CFI_ENDPROC
+ENDPROC(__sse42_crc32)
+
+#else /* CONFIG_X86_32 */
+
+ENTRY(__jhash_trampoline)
+	CFI_STARTPROC
+
+	call __jhash
+
+	retl
+	CFI_ENDPROC
+ENDPROC(__jhash_trampoline)
+
+ENTRY(__sse42_crc32)
+	CFI_STARTPROC
+
+	xchgl %eax,%ecx
+	xchgl %edx,%ecx
+
+	cmpl $0x20, %ecx
+	jb .Lcrc_16bytes
+	subl $0x20, %ecx
+
+.Lcrc_32bytes:
+	subl $0x20, %ecx
+	crc32l 0*4(%edx), %eax
+	crc32l 1*4(%edx), %eax
+	crc32l 2*4(%edx), %eax
+	crc32l 3*4(%edx), %eax
+	crc32l 4*4(%edx), %eax
+	crc32l 5*4(%edx), %eax
+	crc32l 6*4(%edx), %eax
+	crc32l 7*4(%edx), %eax
+	leal   8*4(%edx), %edx
+	jae .Lcrc_32bytes
+	addl $0x20, %ecx
+
+.Lcrc_16bytes:
+	cmpl $0x10, %ecx
+	jb .Lcrc_8bytes
+
+	subl $0x10, %ecx
+	crc32l 0*4(%edx), %eax
+	crc32l 1*4(%edx), %eax
+	crc32l 2*4(%edx), %eax
+	crc32l 3*4(%edx), %eax
+	leal   4*4(%edx), %edx
+
+.Lcrc_8bytes:
+	cmpl $0x8, %ecx
+	jb .Lcrc_4bytes
+
+	subl $0x8, %ecx
+	crc32l 0*4(%edx), %eax
+	crc32l 1*4(%edx), %eax
+	leal   2*4(%edx), %edx
+
+.Lcrc_4bytes:
+	cmpl $0x4, %ecx
+	jb .Lcrc_2bytes
+
+	subl $0x4, %ecx
+	crc32l 0*4(%edx), %eax
+	leal   1*4(%edx), %edx
+
+.Lcrc_2bytes:
+	cmpl $0x2, %ecx
+	jb .Lcrc_1bytes
+
+	subl $0x2, %ecx
+	crc32w (%edx), %eax
+	leal   1*2(%edx), %edx
+
+.Lcrc_1bytes:
+	cmpl $0x1, %ecx
+	jb .Lend
+
+	crc32b (%edx), %eax
+
+.Lend:
+	retl
+
+	CFI_ENDPROC
+ENDPROC(__sse42_crc32)
+
+#endif
+
+#endif /* CONFIG_AS_CRC32 */
diff --git a/arch/x86/lib/hash.c b/arch/x86/lib/hash.c
deleted file mode 100644
index ff4fa51..0000000
--- a/arch/x86/lib/hash.c
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Some portions derived from code covered by the following notice:
- *
- * Copyright (c) 2010-2013 Intel Corporation. All rights reserved.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *   * Redistributions of source code must retain the above copyright
- *     notice, this list of conditions and the following disclaimer.
- *   * Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in
- *     the documentation and/or other materials provided with the
- *     distribution.
- *   * Neither the name of Intel Corporation nor the names of its
- *     contributors may be used to endorse or promote products derived
- *     from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/hash.h>
-#include <linux/init.h>
-
-#include <asm/processor.h>
-#include <asm/cpufeature.h>
-#include <asm/hash.h>
-
-static inline u32 crc32_u32(u32 crc, u32 val)
-{
-#ifdef CONFIG_AS_CRC32
-	asm ("crc32l %1,%0\n" : "+r" (crc) : "rm" (val));
-#else
-	asm (".byte 0xf2, 0x0f, 0x38, 0xf1, 0xc1" : "+a" (crc) : "c" (val));
-#endif
-	return crc;
-}
-
-static u32 intel_crc4_2_hash(const void *data, u32 len, u32 seed)
-{
-	const u32 *p32 = (const u32 *) data;
-	u32 i, tmp = 0;
-
-	for (i = 0; i < len / 4; i++)
-		seed = crc32_u32(seed, *p32++);
-
-	switch (len & 3) {
-	case 3:
-		tmp |= *((const u8 *) p32 + 2) << 16;
-		/* fallthrough */
-	case 2:
-		tmp |= *((const u8 *) p32 + 1) << 8;
-		/* fallthrough */
-	case 1:
-		tmp |= *((const u8 *) p32);
-		seed = crc32_u32(seed, tmp);
-		break;
-	}
-
-	return seed;
-}
-
-static u32 intel_crc4_2_hash2(const u32 *data, u32 len, u32 seed)
-{
-	const u32 *p32 = (const u32 *) data;
-	u32 i;
-
-	for (i = 0; i < len; i++)
-		seed = crc32_u32(seed, *p32++);
-
-	return seed;
-}
-
-void __init setup_arch_fast_hash(struct fast_hash_ops *ops)
-{
-	if (cpu_has_xmm4_2) {
-		ops->hash  = intel_crc4_2_hash;
-		ops->hash2 = intel_crc4_2_hash2;
-	}
-}
diff --git a/arch/x86/lib/jhash.c b/arch/x86/lib/jhash.c
new file mode 100644
index 0000000..ab4b408
--- /dev/null
+++ b/arch/x86/lib/jhash.c
@@ -0,0 +1,6 @@
+#include <linux/jhash.h>
+
+u32 __jhash(const void *data, u32 len, u32 seed)
+{
+	return jhash(data, len, seed);
+}
diff --git a/include/asm-generic/hash.h b/include/asm-generic/hash.h
index b631284..07b8892 100644
--- a/include/asm-generic/hash.h
+++ b/include/asm-generic/hash.h
@@ -1,9 +1,23 @@
 #ifndef __ASM_GENERIC_HASH_H
 #define __ASM_GENERIC_HASH_H
 
-struct fast_hash_ops;
-static inline void setup_arch_fast_hash(struct fast_hash_ops *ops)
+#include <linux/jhash.h>
+
+/**
+ *	arch_fast_hash - Calculates a hash over a given buffer that can have
+ *			 arbitrary size. This function will eventually use an
+ *			 architecture-optimized hashing implementation if
+ *			 available, and trades off distribution for speed.
+ *
+ *	@data: buffer to hash
+ *	@len: length of buffer in bytes
+ *	@seed: start seed
+ *
+ *	Returns 32bit hash.
+ */
+static inline u32 arch_fast_hash(const void *data, u32 len, u32 seed)
 {
+	return jhash(data, len, seed);
 }
 
 #endif /* __ASM_GENERIC_HASH_H */
diff --git a/include/linux/hash.h b/include/linux/hash.h
index d0494c3..6e8fb02 100644
--- a/include/linux/hash.h
+++ b/include/linux/hash.h
@@ -84,38 +84,4 @@ static inline u32 hash32_ptr(const void *ptr)
 	return (u32)val;
 }
 
-struct fast_hash_ops {
-	u32 (*hash)(const void *data, u32 len, u32 seed);
-	u32 (*hash2)(const u32 *data, u32 len, u32 seed);
-};
-
-/**
- *	arch_fast_hash - Caclulates a hash over a given buffer that can have
- *			 arbitrary size. This function will eventually use an
- *			 architecture-optimized hashing implementation if
- *			 available, and trades off distribution for speed.
- *
- *	@data: buffer to hash
- *	@len: length of buffer in bytes
- *	@seed: start seed
- *
- *	Returns 32bit hash.
- */
-extern u32 arch_fast_hash(const void *data, u32 len, u32 seed);
-
-/**
- *	arch_fast_hash2 - Caclulates a hash over a given buffer that has a
- *			  size that is of a multiple of 32bit words. This
- *			  function will eventually use an architecture-
- *			  optimized hashing implementation if available,
- *			  and trades off distribution for speed.
- *
- *	@data: buffer to hash (must be 32bit padded)
- *	@len: number of 32bit words
- *	@seed: start seed
- *
- *	Returns 32bit hash.
- */
-extern u32 arch_fast_hash2(const u32 *data, u32 len, u32 seed);
-
 #endif /* _LINUX_HASH_H */
diff --git a/lib/Makefile b/lib/Makefile
index 0211d2b..4b9baa4 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -26,7 +26,7 @@ obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
 	 bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
 	 gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o clz_ctz.o \
 	 bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \
-	 percpu-refcount.o percpu_ida.o hash.o rhashtable.o reciprocal_div.o
+	 percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o
 obj-y += string_helpers.o
 obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
 obj-y += kstrtox.o
diff --git a/lib/hash.c b/lib/hash.c
deleted file mode 100644
index fea973f..0000000
--- a/lib/hash.c
+++ /dev/null
@@ -1,39 +0,0 @@
-/* General purpose hashing library
- *
- * That's a start of a kernel hashing library, which can be extended
- * with further algorithms in future. arch_fast_hash{2,}() will
- * eventually resolve to an architecture optimized implementation.
- *
- * Copyright 2013 Francesco Fusco <ffusco@redhat.com>
- * Copyright 2013 Daniel Borkmann <dborkman@redhat.com>
- * Copyright 2013 Thomas Graf <tgraf@redhat.com>
- * Licensed under the GNU General Public License, version 2.0 (GPLv2)
- */
-
-#include <linux/jhash.h>
-#include <linux/hash.h>
-#include <linux/cache.h>
-
-static struct fast_hash_ops arch_hash_ops __read_mostly = {
-	.hash  = jhash,
-	.hash2 = jhash2,
-};
-
-u32 arch_fast_hash(const void *data, u32 len, u32 seed)
-{
-	return arch_hash_ops.hash(data, len, seed);
-}
-EXPORT_SYMBOL_GPL(arch_fast_hash);
-
-u32 arch_fast_hash2(const u32 *data, u32 len, u32 seed)
-{
-	return arch_hash_ops.hash2(data, len, seed);
-}
-EXPORT_SYMBOL_GPL(arch_fast_hash2);
-
-static int __init hashlib_init(void)
-{
-	setup_arch_fast_hash(&arch_hash_ops);
-	return 0;
-}
-early_initcall(hashlib_init);
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index e0a7fef..79bc65d 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -366,7 +366,7 @@ static u32 flow_hash(const struct sw_flow_key *key, int key_start,
 	/* Make sure number of hash bytes are multiple of u32. */
 	BUILD_BUG_ON(sizeof(long) % sizeof(u32));
 
-	return arch_fast_hash2(hash_key, hash_u32s, 0);
+	return arch_fast_hash(hash_key, hash_u32s, 0);
 }
 
 static int flow_key_start(const struct sw_flow_key *key)
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next 01/10] net/mlx4_en: Set csum level for encapsulated packets
From: Or Gerlitz @ 2014-12-04 13:13 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Matan Barak, Amir Vadai, Tal Alon, Jack Morgenstein,
	Or Gerlitz
In-Reply-To: <1417698835-11050-1-git-send-email-ogerlitz@mellanox.com>

Setting the csum level was dropped by mistake for the napi_gro_frags flow; fix that.

Fixes: dd65beac48a5 ('net/mlx4_en: Extend usage of napi_gro_frags')
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 946d352..3a9f9bf 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -886,6 +886,8 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 			gro_skb->len = length;
 			gro_skb->data_len = length;
 			gro_skb->ip_summed = ip_summed;
+			if (l2_tunnel && ip_summed == CHECKSUM_UNNECESSARY)
+				gro_skb->csum_level = 1;
 
 			if (l2_tunnel && ip_summed == CHECKSUM_UNNECESSARY)
 				gro_skb->encapsulation = 1;
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next 02/10] net/mlx4_core: Mask out host side virtualization features for guests
From: Or Gerlitz @ 2014-12-04 13:13 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Matan Barak, Amir Vadai, Tal Alon, Jack Morgenstein,
	Or Gerlitz
In-Reply-To: <1417698835-11050-1-git-send-email-ogerlitz@mellanox.com>

When VFs (guests in this context) issue the QUERY_DEV_CAP command, they
need not be told that host side virtualization features such as VST, FSM
(MAC anti-spoofing) and running > 80 VFs are supported by the device.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/fw.c |    7 ++++++-
 1 files changed, 6 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index 4251f81..8c9ea70 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -982,7 +982,7 @@ int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave,
 	u64	flags;
 	int	err = 0;
 	u8	field;
-	u32	bmme_flags;
+	u32	bmme_flags, field32;
 	int	real_port;
 	int	slave_port;
 	int	first_port;
@@ -1053,6 +1053,11 @@ int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave,
 	field &= ~0x80;
 	MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_FLOW_STEERING_IPOIB_OFFSET);
 
+	/* turn off host side virt features (VST, FSM, etc) for guests */
+	MLX4_GET(field32, outbox->buf, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET);
+	field32 &= ~((1 << 26) | (1 << 21) | (1 << 20));
+	MLX4_PUT(outbox->buf, field32, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET);
+
 	return 0;
 }
 
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next 00/10] mlx4 driver update
From: Or Gerlitz @ 2014-12-04 13:13 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Matan Barak, Amir Vadai, Tal Alon, Jack Morgenstein,
	Or Gerlitz

Hi Dave, 

This series from Matan, Jenny, Dotan and myself is mostly about adding
support for a new performance-optimized flow steering mode (patches 4-10).

The first two patches are small fixes (one for VXLAN and one for SRIOV),
and the third patch is a fix to avoid a hard-lockup situation when many
(hundreds of) processes holding user-space QPs/CQs get events.

Matan and Or. 

Dotan Barak (1):
  net/mlx4: Add a check if there are too many reserved QPs

Eugenia Emantayev (1):
  net/mlx4: Change QP allocation scheme

Matan Barak (6):
  net/mlx4_core: Use tasklet for user-space CQ completion events
  net/mlx4: Add mlx4_bitmap zone allocator
  net/mlx4: Add A0 hybrid steering
  net/mlx4_core: Add explicit error message when rule doesn't meet configuration
  net/mlx4: Refactor QUERY_PORT
  net/mlx4: Add support for A0 steering

Or Gerlitz (2):
  net/mlx4_en: Set csum level for encapsulated packets
  net/mlx4_core: Mask out host side virtualization features for guests

 drivers/infiniband/hw/mlx4/cq.c                    |    5 +-
 drivers/infiniband/hw/mlx4/main.c                  |    2 +-
 drivers/infiniband/hw/mlx4/qp.c                    |   13 +-
 drivers/net/ethernet/mellanox/mlx4/alloc.c         |  435 +++++++++++++++++++-
 drivers/net/ethernet/mellanox/mlx4/cq.c            |   49 +++
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c     |   13 +-
 drivers/net/ethernet/mellanox/mlx4/en_rx.c         |    7 +-
 drivers/net/ethernet/mellanox/mlx4/en_tx.c         |   14 +-
 drivers/net/ethernet/mellanox/mlx4/eq.c            |   16 +-
 drivers/net/ethernet/mellanox/mlx4/fw.c            |  220 +++++++---
 drivers/net/ethernet/mellanox/mlx4/fw.h            |   42 ++-
 drivers/net/ethernet/mellanox/mlx4/main.c          |  217 +++++++++--
 drivers/net/ethernet/mellanox/mlx4/mcg.c           |   21 +-
 drivers/net/ethernet/mellanox/mlx4/mlx4.h          |   97 +++++-
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h       |    2 +-
 drivers/net/ethernet/mellanox/mlx4/qp.c            |  303 +++++++++++++-
 .../net/ethernet/mellanox/mlx4/resource_tracker.c  |    7 +-
 include/linux/mlx4/device.h                        |   51 ++-
 18 files changed, 1345 insertions(+), 169 deletions(-)

^ permalink raw reply

* [PATCH net-next 07/10] net/mlx4: Add A0 hybrid steering
From: Or Gerlitz @ 2014-12-04 13:13 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Matan Barak, Amir Vadai, Tal Alon, Jack Morgenstein,
	Or Gerlitz
In-Reply-To: <1417698835-11050-1-git-send-email-ogerlitz@mellanox.com>

From: Matan Barak <matanb@mellanox.com>

A0 hybrid steering is a form of high performance flow steering.
By using this mode, mlx4 cards use a fast limited table based steering,
in order to enable fast steering of unicast packets to a QP.

In order to implement A0 hybrid steering we allocate resources
from different zones:
(1) General range
(2) Special MAC-assigned QPs [RSS, Raw-Ethernet] each has its own region.

When we create an RSS QP or a raw ethernet (A0 steerable and BF ready) QP,
we try hard to allocate the QP from range (2). Otherwise, we try hard not
to allocate from this range. However, when the system is pushed to its
limits and one needs every resource, the allocator uses every region it can.

Meaning, when we run out of raw-eth qps, the allocator allocates from the
general range (and the special-A0 area is no longer active). If we run out
of RSS qps, the mechanism tries to allocate from the raw-eth QP zone. If that
is also exhausted, the allocator will allocate from the general range
(and the A0 region is no longer active).

Note that if a raw-eth qp is allocated from the general range, it attempts
to allocate the range such that bits 6 and 7 (blueflame bits) in the
QP number are not set.

When the feature is used in SRIOV, the VF has to notify the PF what
kind of QP attributes it needs. In order to do that, along with the
"Eth QP blueflame" bit, we reserve a new "A0 steerable QP" bit. According
to the combination of these bits, the PF tries to allocate a suitable QP.

In order to maintain backward compatibility (with older PFs), the PF
notifies which QP attributes it supports via QUERY_FUNC_CAP command.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/infiniband/hw/mlx4/qp.c                |    6 +-
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c |    2 +-
 drivers/net/ethernet/mellanox/mlx4/en_rx.c     |    3 +-
 drivers/net/ethernet/mellanox/mlx4/fw.c        |    6 +-
 drivers/net/ethernet/mellanox/mlx4/main.c      |    8 +-
 drivers/net/ethernet/mellanox/mlx4/mlx4.h      |   13 +-
 drivers/net/ethernet/mellanox/mlx4/qp.c        |  277 ++++++++++++++++++++++--
 include/linux/mlx4/device.h                    |   10 +-
 8 files changed, 300 insertions(+), 25 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 506d1bd..cf000b7 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -807,8 +807,10 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 		 * VLAN insertion. */
 		if (init_attr->qp_type == IB_QPT_RAW_PACKET)
 			err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn,
-						    init_attr->cap.max_send_wr ?
-						    MLX4_RESERVE_ETH_BF_QP : 0);
+						    (init_attr->cap.max_send_wr ?
+						     MLX4_RESERVE_ETH_BF_QP : 0) |
+						    (init_attr->cap.max_recv_wr ?
+						     MLX4_RESERVE_A0_QP : 0));
 		else
 			if (qp->flags & MLX4_IB_QP_NETIF)
 				err = mlx4_ib_steer_qp_alloc(dev, 1, &qpn);
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 6537631..5701115 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -595,7 +595,7 @@ static int mlx4_en_get_qp(struct mlx4_en_priv *priv)
 		return 0;
 	}
 
-	err = mlx4_qp_reserve_range(dev, 1, 1, qpn, 0);
+	err = mlx4_qp_reserve_range(dev, 1, 1, qpn, MLX4_RESERVE_A0_QP);
 	en_dbg(DRV, priv, "Reserved qp %d\n", *qpn);
 	if (err) {
 		en_err(priv, "Failed to reserve qp for mac registration\n");
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 4862552..86fe6d9 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -1132,7 +1132,8 @@ int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv)
 	int err;
 	u32 qpn;
 
-	err = mlx4_qp_reserve_range(priv->mdev->dev, 1, 1, &qpn, 0);
+	err = mlx4_qp_reserve_range(priv->mdev->dev, 1, 1, &qpn,
+				    MLX4_RESERVE_A0_QP);
 	if (err) {
 		en_err(priv, "Failed reserving drop qpn\n");
 		return err;
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index 745deb7..4da5aed 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -275,6 +275,7 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
 #define QUERY_FUNC_CAP_FLAG_VALID_MAILBOX	0x04
 
 #define QUERY_FUNC_CAP_EXTRA_FLAGS_BF_QP_ALLOC_FLAG	(1UL << 31)
+#define QUERY_FUNC_CAP_EXTRA_FLAGS_A0_QP_ALLOC_FLAG	(1UL << 30)
 
 /* when opcode modifier = 1 */
 #define QUERY_FUNC_CAP_PHYS_PORT_OFFSET		0x3
@@ -406,7 +407,8 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
 		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET);
 		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET_DEP);
 
-		size = QUERY_FUNC_CAP_EXTRA_FLAGS_BF_QP_ALLOC_FLAG;
+		size = QUERY_FUNC_CAP_EXTRA_FLAGS_BF_QP_ALLOC_FLAG |
+			QUERY_FUNC_CAP_EXTRA_FLAGS_A0_QP_ALLOC_FLAG;
 		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_EXTRA_FLAGS_OFFSET);
 	} else
 		err = -EINVAL;
@@ -509,6 +511,8 @@ int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u8 gen_or_port,
 			MLX4_GET(size, outbox, QUERY_FUNC_CAP_EXTRA_FLAGS_OFFSET);
 			if (size & QUERY_FUNC_CAP_EXTRA_FLAGS_BF_QP_ALLOC_FLAG)
 				func_cap->extra_flags |= MLX4_QUERY_FUNC_FLAGS_BF_RES_QP;
+			if (size & QUERY_FUNC_CAP_EXTRA_FLAGS_A0_QP_ALLOC_FLAG)
+				func_cap->extra_flags |= MLX4_QUERY_FUNC_FLAGS_A0_RES_QP;
 		}
 
 		goto out;
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 6a9a941..3bfe90b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -436,6 +436,8 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 		(1 << dev->caps.log_num_vlans) *
 		dev->caps.num_ports;
 	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH] = MLX4_NUM_FEXCH;
+	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_RSS_RAW_ETH] =
+		MLX4_A0_STEERING_TABLE_SIZE;
 
 	dev->caps.reserved_qps = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] +
 		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] +
@@ -469,7 +471,8 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	if (!mlx4_is_slave(dev)) {
 		mlx4_enable_cqe_eqe_stride(dev);
 		dev->caps.alloc_res_qp_mask =
-			(dev->caps.bf_reg_size ? MLX4_RESERVE_ETH_BF_QP : 0);
+			(dev->caps.bf_reg_size ? MLX4_RESERVE_ETH_BF_QP : 0) |
+			MLX4_RESERVE_A0_QP;
 	} else {
 		dev->caps.alloc_res_qp_mask = 0;
 	}
@@ -826,6 +829,9 @@ static int mlx4_slave_cap(struct mlx4_dev *dev)
 	    dev->caps.bf_reg_size)
 		dev->caps.alloc_res_qp_mask |= MLX4_RESERVE_ETH_BF_QP;
 
+	if (func_cap.extra_flags & MLX4_QUERY_FUNC_FLAGS_A0_RES_QP)
+		dev->caps.alloc_res_qp_mask |= MLX4_RESERVE_A0_QP;
+
 	return 0;
 
 err_mem:
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index bc1505e..cebd118 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -682,8 +682,19 @@ struct mlx4_srq_table {
 	struct mlx4_icm_table	cmpt_table;
 };
 
+enum mlx4_qp_table_zones {
+	MLX4_QP_TABLE_ZONE_GENERAL,
+	MLX4_QP_TABLE_ZONE_RSS,
+	MLX4_QP_TABLE_ZONE_RAW_ETH,
+	MLX4_QP_TABLE_ZONE_NUM
+};
+
+#define MLX4_A0_STEERING_TABLE_SIZE    256
+
 struct mlx4_qp_table {
-	struct mlx4_bitmap	bitmap;
+	struct mlx4_bitmap	*bitmap_gen;
+	struct mlx4_zone_allocator *zones;
+	u32			zones_uids[MLX4_QP_TABLE_ZONE_NUM];
 	u32			rdmarc_base;
 	int			rdmarc_shift;
 	spinlock_t		lock;
diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c
index 8720428..d8d040c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx4/qp.c
@@ -213,6 +213,7 @@ EXPORT_SYMBOL_GPL(mlx4_qp_modify);
 int __mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align,
 			    int *base, u8 flags)
 {
+	u32 uid;
 	int bf_qp = !!(flags & (u8)MLX4_RESERVE_ETH_BF_QP);
 
 	struct mlx4_priv *priv = mlx4_priv(dev);
@@ -221,8 +222,16 @@ int __mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align,
 	if (cnt > MLX4_MAX_BF_QP_RANGE && bf_qp)
 		return -ENOMEM;
 
-	*base = mlx4_bitmap_alloc_range(&qp_table->bitmap, cnt, align,
-					bf_qp ? MLX4_BF_QP_SKIP_MASK : 0);
+	uid = MLX4_QP_TABLE_ZONE_GENERAL;
+	if (flags & (u8)MLX4_RESERVE_A0_QP) {
+		if (bf_qp)
+			uid = MLX4_QP_TABLE_ZONE_RAW_ETH;
+		else
+			uid = MLX4_QP_TABLE_ZONE_RSS;
+	}
+
+	*base = mlx4_zone_alloc_entries(qp_table->zones, uid, cnt, align,
+					bf_qp ? MLX4_BF_QP_SKIP_MASK : 0, NULL);
 	if (*base == -1)
 		return -ENOMEM;
 
@@ -263,7 +272,7 @@ void __mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt)
 
 	if (mlx4_is_qp_reserved(dev, (u32) base_qpn))
 		return;
-	mlx4_bitmap_free_range(&qp_table->bitmap, base_qpn, cnt, MLX4_USE_RR);
+	mlx4_zone_free_entries_unique(qp_table->zones, base_qpn, cnt);
 }
 
 void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt)
@@ -473,6 +482,227 @@ static int mlx4_CONF_SPECIAL_QP(struct mlx4_dev *dev, u32 base_qpn)
 			MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
 }
 
+#define MLX4_QP_TABLE_RSS_ETH_PRIORITY 2
+#define MLX4_QP_TABLE_RAW_ETH_PRIORITY 1
+#define MLX4_QP_TABLE_RAW_ETH_SIZE     256
+
+static int mlx4_create_zones(struct mlx4_dev *dev,
+			     u32 reserved_bottom_general,
+			     u32 reserved_top_general,
+			     u32 reserved_bottom_rss,
+			     u32 start_offset_rss,
+			     u32 max_table_offset)
+{
+	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
+	struct mlx4_bitmap (*bitmap)[MLX4_QP_TABLE_ZONE_NUM] = NULL;
+	int bitmap_initialized = 0;
+	u32 last_offset;
+	int k;
+	int err;
+
+	qp_table->zones = mlx4_zone_allocator_create(MLX4_ZONE_ALLOC_FLAGS_NO_OVERLAP);
+
+	if (NULL == qp_table->zones)
+		return -ENOMEM;
+
+	bitmap = kmalloc(sizeof(*bitmap), GFP_KERNEL);
+
+	if (NULL == bitmap) {
+		err = -ENOMEM;
+		goto free_zone;
+	}
+
+	err = mlx4_bitmap_init(*bitmap + MLX4_QP_TABLE_ZONE_GENERAL, dev->caps.num_qps,
+			       (1 << 23) - 1, reserved_bottom_general,
+			       reserved_top_general);
+
+	if (err)
+		goto free_bitmap;
+
+	++bitmap_initialized;
+
+	err = mlx4_zone_add_one(qp_table->zones, *bitmap + MLX4_QP_TABLE_ZONE_GENERAL,
+				MLX4_ZONE_FALLBACK_TO_HIGHER_PRIO |
+				MLX4_ZONE_USE_RR, 0,
+				0, qp_table->zones_uids + MLX4_QP_TABLE_ZONE_GENERAL);
+
+	if (err)
+		goto free_bitmap;
+
+	err = mlx4_bitmap_init(*bitmap + MLX4_QP_TABLE_ZONE_RSS,
+			       reserved_bottom_rss,
+			       reserved_bottom_rss - 1,
+			       dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+			       reserved_bottom_rss - start_offset_rss);
+
+	if (err)
+		goto free_bitmap;
+
+	++bitmap_initialized;
+
+	err = mlx4_zone_add_one(qp_table->zones, *bitmap + MLX4_QP_TABLE_ZONE_RSS,
+				MLX4_ZONE_ALLOW_ALLOC_FROM_LOWER_PRIO |
+				MLX4_ZONE_ALLOW_ALLOC_FROM_EQ_PRIO |
+				MLX4_ZONE_USE_RR, MLX4_QP_TABLE_RSS_ETH_PRIORITY,
+				0, qp_table->zones_uids + MLX4_QP_TABLE_ZONE_RSS);
+
+	if (err)
+		goto free_bitmap;
+
+	last_offset = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW];
+	/*  We have a single zone for the A0 steering QPs area of the FW. This area
+	 *  needs to be split into subareas. One set of subareas is for RSS QPs
+	 *  (in which qp number bits 6 and/or 7 are set); the other set of subareas
+	 *  is for RAW_ETH QPs, which require that both bits 6 and 7 are zero.
+	 *  Currently, the values returned by the FW (A0 steering area starting qp number
+	 *  and A0 steering area size) are such that there are only two subareas -- one
+	 *  for RSS and one for RAW_ETH.
+	 */
+	for (k = MLX4_QP_TABLE_ZONE_RSS + 1; k < sizeof(*bitmap)/sizeof((*bitmap)[0]);
+	     k++) {
+		int size;
+		u32 offset = start_offset_rss;
+		u32 bf_mask;
+		u32 requested_size;
+
+		/* Assuming MLX4_BF_QP_SKIP_MASK is consecutive ones, this calculates
+		 * a mask of all LSB bits set until (and not including) the first
+		 * set bit of  MLX4_BF_QP_SKIP_MASK. For example, if MLX4_BF_QP_SKIP_MASK
+		 * is 0xc0, bf_mask will be 0x3f.
+		 */
+		bf_mask = (MLX4_BF_QP_SKIP_MASK & ~(MLX4_BF_QP_SKIP_MASK - 1)) - 1;
+		requested_size = min((u32)MLX4_QP_TABLE_RAW_ETH_SIZE, bf_mask + 1);
+
+		if (((last_offset & MLX4_BF_QP_SKIP_MASK) &&
+		     ((int)(max_table_offset - last_offset)) >=
+		     roundup_pow_of_two(MLX4_BF_QP_SKIP_MASK)) ||
+		    (!(last_offset & MLX4_BF_QP_SKIP_MASK) &&
+		     !((last_offset + requested_size - 1) &
+		       MLX4_BF_QP_SKIP_MASK)))
+			size = requested_size;
+		else {
+			u32 candidate_offset =
+				(last_offset | MLX4_BF_QP_SKIP_MASK | bf_mask) + 1;
+
+			if (last_offset & MLX4_BF_QP_SKIP_MASK)
+				last_offset = candidate_offset;
+
+			/* From this point, the BF bits are 0 */
+
+			if (last_offset > max_table_offset) {
+				/* need to skip */
+				size = -1;
+			} else {
+				size = min3(max_table_offset - last_offset,
+					    bf_mask - (last_offset & bf_mask),
+					    requested_size);
+				if (size < requested_size) {
+					int candidate_size;
+
+					candidate_size = min3(
+						max_table_offset - candidate_offset,
+						bf_mask - (last_offset & bf_mask),
+						requested_size);
+
+					/*  We will not take this path if last_offset was
+					 *  already set above to candidate_offset
+					 */
+					if (candidate_size > size) {
+						last_offset = candidate_offset;
+						size = candidate_size;
+					}
+				}
+			}
+		}
+
+		if (size > 0) {
+			/* mlx4_bitmap_alloc_range will find a contiguous range of "size"
+			 * QPs in which both bits 6 and 7 are zero, because we pass it the
+			 * MLX4_BF_SKIP_MASK).
+			 */
+			offset = mlx4_bitmap_alloc_range(
+					*bitmap + MLX4_QP_TABLE_ZONE_RSS,
+					size, 1,
+					MLX4_BF_QP_SKIP_MASK);
+
+			if (offset == (u32)-1) {
+				err = -ENOMEM;
+				break;
+			}
+
+			last_offset = offset + size;
+
+			err = mlx4_bitmap_init(*bitmap + k, roundup_pow_of_two(size),
+					       roundup_pow_of_two(size) - 1, 0,
+					       roundup_pow_of_two(size) - size);
+		} else {
+			/* Add an empty bitmap, we'll allocate from different zones (since
+			 * at least one is reserved)
+			 */
+			err = mlx4_bitmap_init(*bitmap + k, 1,
+					       MLX4_QP_TABLE_RAW_ETH_SIZE - 1, 0,
+					       0);
+			mlx4_bitmap_alloc_range(*bitmap + k, 1, 1, 0);
+		}
+
+		if (err)
+			break;
+
+		++bitmap_initialized;
+
+		err = mlx4_zone_add_one(qp_table->zones, *bitmap + k,
+					MLX4_ZONE_ALLOW_ALLOC_FROM_LOWER_PRIO |
+					MLX4_ZONE_ALLOW_ALLOC_FROM_EQ_PRIO |
+					MLX4_ZONE_USE_RR, MLX4_QP_TABLE_RAW_ETH_PRIORITY,
+					offset, qp_table->zones_uids + k);
+
+		if (err)
+			break;
+	}
+
+	if (err)
+		goto free_bitmap;
+
+	qp_table->bitmap_gen = *bitmap;
+
+	return err;
+
+free_bitmap:
+	for (k = 0; k < bitmap_initialized; k++)
+		mlx4_bitmap_cleanup(*bitmap + k);
+	kfree(bitmap);
+free_zone:
+	mlx4_zone_allocator_destroy(qp_table->zones);
+	return err;
+}
+
+static void mlx4_cleanup_qp_zones(struct mlx4_dev *dev)
+{
+	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
+
+	if (qp_table->zones) {
+		int i;
+
+		for (i = 0;
+		     i < sizeof(qp_table->zones_uids)/sizeof(qp_table->zones_uids[0]);
+		     i++) {
+			struct mlx4_bitmap *bitmap =
+				mlx4_zone_get_bitmap(qp_table->zones,
+						     qp_table->zones_uids[i]);
+
+			mlx4_zone_remove_one(qp_table->zones, qp_table->zones_uids[i]);
+			if (NULL == bitmap)
+				continue;
+
+			mlx4_bitmap_cleanup(bitmap);
+		}
+		mlx4_zone_allocator_destroy(qp_table->zones);
+		kfree(qp_table->bitmap_gen);
+		qp_table->bitmap_gen = NULL;
+		qp_table->zones = NULL;
+	}
+}
+
 int mlx4_init_qp_table(struct mlx4_dev *dev)
 {
 	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
@@ -480,22 +710,33 @@ int mlx4_init_qp_table(struct mlx4_dev *dev)
 	int reserved_from_top = 0;
 	int reserved_from_bot;
 	int k;
+	int fixed_reserved_from_bot_rv = 0;
+	int bottom_reserved_for_rss_bitmap;
+	u32 max_table_offset = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] +
+		MLX4_A0_STEERING_TABLE_SIZE;
 
 	spin_lock_init(&qp_table->lock);
 	INIT_RADIX_TREE(&dev->qp_table_tree, GFP_ATOMIC);
 	if (mlx4_is_slave(dev))
 		return 0;
 
-	/*
-	 * We reserve 2 extra QPs per port for the special QPs.  The
+	/* We reserve 2 extra QPs per port for the special QPs.  The
 	 * block of special QPs must be aligned to a multiple of 8, so
 	 * round up.
 	 *
 	 * We also reserve the MSB of the 24-bit QP number to indicate
 	 * that a QP is an XRC QP.
 	 */
-	dev->phys_caps.base_sqpn =
-		ALIGN(dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], 8);
+	for (k = 0; k <= MLX4_QP_REGION_BOTTOM; k++)
+		fixed_reserved_from_bot_rv += dev->caps.reserved_qps_cnt[k];
+
+	if (fixed_reserved_from_bot_rv < max_table_offset)
+		fixed_reserved_from_bot_rv = max_table_offset;
+
+	/* We reserve at least 1 extra for bitmaps that we don't have enough space for*/
+	bottom_reserved_for_rss_bitmap =
+		roundup_pow_of_two(fixed_reserved_from_bot_rv + 1);
+	dev->phys_caps.base_sqpn = ALIGN(bottom_reserved_for_rss_bitmap, 8);
 
 	{
 		int sort[MLX4_NUM_QP_REGION];
@@ -505,8 +746,8 @@ int mlx4_init_qp_table(struct mlx4_dev *dev)
 		for (i = 1; i < MLX4_NUM_QP_REGION; ++i)
 			sort[i] = i;
 
-		for (i = MLX4_NUM_QP_REGION; i > 0; --i) {
-			for (j = 2; j < i; ++j) {
+		for (i = MLX4_NUM_QP_REGION; i > MLX4_QP_REGION_BOTTOM; --i) {
+			for (j = MLX4_QP_REGION_BOTTOM + 2; j < i; ++j) {
 				if (dev->caps.reserved_qps_cnt[sort[j]] >
 				    dev->caps.reserved_qps_cnt[sort[j - 1]]) {
 					tmp             = sort[j];
@@ -516,13 +757,12 @@ int mlx4_init_qp_table(struct mlx4_dev *dev)
 			}
 		}
 
-		for (i = 1; i < MLX4_NUM_QP_REGION; ++i) {
+		for (i = MLX4_QP_REGION_BOTTOM + 1; i < MLX4_NUM_QP_REGION; ++i) {
 			last_base -= dev->caps.reserved_qps_cnt[sort[i]];
 			dev->caps.reserved_qps_base[sort[i]] = last_base;
 			reserved_from_top +=
 				dev->caps.reserved_qps_cnt[sort[i]];
 		}
-
 	}
 
        /* Reserve 8 real SQPs in both native and SRIOV modes.
@@ -541,9 +781,11 @@ int mlx4_init_qp_table(struct mlx4_dev *dev)
 		return -EINVAL;
 	}
 
-	err = mlx4_bitmap_init(&qp_table->bitmap, dev->caps.num_qps,
-			       (1 << 23) - 1, reserved_from_bot,
-			       reserved_from_top);
+	err = mlx4_create_zones(dev, reserved_from_bot, reserved_from_bot,
+				bottom_reserved_for_rss_bitmap,
+				fixed_reserved_from_bot_rv,
+				max_table_offset);
+
 	if (err)
 		return err;
 
@@ -579,7 +821,8 @@ int mlx4_init_qp_table(struct mlx4_dev *dev)
 	err = mlx4_CONF_SPECIAL_QP(dev, dev->phys_caps.base_sqpn);
 	if (err)
 		goto err_mem;
-	return 0;
+
+	return err;
 
 err_mem:
 	kfree(dev->caps.qp0_tunnel);
@@ -588,6 +831,7 @@ err_mem:
 	kfree(dev->caps.qp1_proxy);
 	dev->caps.qp0_tunnel = dev->caps.qp0_proxy =
 		dev->caps.qp1_tunnel = dev->caps.qp1_proxy = NULL;
+	mlx4_cleanup_qp_zones(dev);
 	return err;
 }
 
@@ -597,7 +841,8 @@ void mlx4_cleanup_qp_table(struct mlx4_dev *dev)
 		return;
 
 	mlx4_CONF_SPECIAL_QP(dev, 0);
-	mlx4_bitmap_cleanup(&mlx4_priv(dev)->qp_table.bitmap);
+
+	mlx4_cleanup_qp_zones(dev);
 }
 
 int mlx4_qp_query(struct mlx4_dev *dev, struct mlx4_qp *qp,
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 272aa25..39890cd 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -195,7 +195,8 @@ enum {
 };
 
 enum {
-	MLX4_QUERY_FUNC_FLAGS_BF_RES_QP		= 1LL << 0
+	MLX4_QUERY_FUNC_FLAGS_BF_RES_QP		= 1LL << 0,
+	MLX4_QUERY_FUNC_FLAGS_A0_RES_QP		= 1LL << 1
 };
 
 /* bit enums for an 8-bit flags field indicating special use
@@ -207,6 +208,7 @@ enum {
  * This enum may use only bits 0..7.
  */
 enum {
+	MLX4_RESERVE_A0_QP	= 1 << 6,
 	MLX4_RESERVE_ETH_BF_QP	= 1 << 7,
 };
 
@@ -349,6 +351,8 @@ enum {
 
 enum mlx4_qp_region {
 	MLX4_QP_REGION_FW = 0,
+	MLX4_QP_REGION_RSS_RAW_ETH,
+	MLX4_QP_REGION_BOTTOM = MLX4_QP_REGION_RSS_RAW_ETH,
 	MLX4_QP_REGION_ETH_ADDR,
 	MLX4_QP_REGION_FC_ADDR,
 	MLX4_QP_REGION_FC_EXCH,
@@ -891,7 +895,9 @@ static inline int mlx4_num_reserved_sqps(struct mlx4_dev *dev)
 static inline int mlx4_is_qp_reserved(struct mlx4_dev *dev, u32 qpn)
 {
 	return (qpn < dev->phys_caps.base_sqpn + 8 +
-		16 * MLX4_MFUNC_MAX * !!mlx4_is_master(dev));
+		16 * MLX4_MFUNC_MAX * !!mlx4_is_master(dev) &&
+		qpn >= dev->phys_caps.base_sqpn) ||
+	       (qpn < dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW]);
 }
 
 static inline int mlx4_is_guest_proxy(struct mlx4_dev *dev, int slave, u32 qpn)
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next 09/10] net/mlx4: Refactor QUERY_PORT
From: Or Gerlitz @ 2014-12-04 13:13 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Matan Barak, Amir Vadai, Tal Alon, Jack Morgenstein,
	Or Gerlitz
In-Reply-To: <1417698835-11050-1-git-send-email-ogerlitz@mellanox.com>

From: Matan Barak <matanb@mellanox.com>

Currently QUERY_PORT is done as part of the QUERY_DEV_CAP firmware command.

Since we would like to use it without querying all device capabilities,
extract this part to be a function of its own.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/fw.c   |  141 +++++++++++++++++------------
 drivers/net/ethernet/mellanox/mlx4/fw.h   |   37 +++++----
 drivers/net/ethernet/mellanox/mlx4/main.c |   71 ++++++++++-----
 3 files changed, 154 insertions(+), 95 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index 4da5aed..e07cb9b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -886,61 +886,10 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	if (field32 & (1 << 21))
 		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_80_VFS;
 
-	if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
-		for (i = 1; i <= dev_cap->num_ports; ++i) {
-			MLX4_GET(field, outbox, QUERY_DEV_CAP_VL_PORT_OFFSET);
-			dev_cap->max_vl[i]	   = field >> 4;
-			MLX4_GET(field, outbox, QUERY_DEV_CAP_MTU_WIDTH_OFFSET);
-			dev_cap->ib_mtu[i]	   = field >> 4;
-			dev_cap->max_port_width[i] = field & 0xf;
-			MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_GID_OFFSET);
-			dev_cap->max_gids[i]	   = 1 << (field & 0xf);
-			MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PKEY_OFFSET);
-			dev_cap->max_pkeys[i]	   = 1 << (field & 0xf);
-		}
-	} else {
-#define QUERY_PORT_SUPPORTED_TYPE_OFFSET	0x00
-#define QUERY_PORT_MTU_OFFSET			0x01
-#define QUERY_PORT_ETH_MTU_OFFSET		0x02
-#define QUERY_PORT_WIDTH_OFFSET			0x06
-#define QUERY_PORT_MAX_GID_PKEY_OFFSET		0x07
-#define QUERY_PORT_MAX_MACVLAN_OFFSET		0x0a
-#define QUERY_PORT_MAX_VL_OFFSET		0x0b
-#define QUERY_PORT_MAC_OFFSET			0x10
-#define QUERY_PORT_TRANS_VENDOR_OFFSET		0x18
-#define QUERY_PORT_WAVELENGTH_OFFSET		0x1c
-#define QUERY_PORT_TRANS_CODE_OFFSET		0x20
-
-		for (i = 1; i <= dev_cap->num_ports; ++i) {
-			err = mlx4_cmd_box(dev, 0, mailbox->dma, i, 0, MLX4_CMD_QUERY_PORT,
-					   MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
-			if (err)
-				goto out;
-
-			MLX4_GET(field, outbox, QUERY_PORT_SUPPORTED_TYPE_OFFSET);
-			dev_cap->supported_port_types[i] = field & 3;
-			dev_cap->suggested_type[i] = (field >> 3) & 1;
-			dev_cap->default_sense[i] = (field >> 4) & 1;
-			MLX4_GET(field, outbox, QUERY_PORT_MTU_OFFSET);
-			dev_cap->ib_mtu[i]	   = field & 0xf;
-			MLX4_GET(field, outbox, QUERY_PORT_WIDTH_OFFSET);
-			dev_cap->max_port_width[i] = field & 0xf;
-			MLX4_GET(field, outbox, QUERY_PORT_MAX_GID_PKEY_OFFSET);
-			dev_cap->max_gids[i]	   = 1 << (field >> 4);
-			dev_cap->max_pkeys[i]	   = 1 << (field & 0xf);
-			MLX4_GET(field, outbox, QUERY_PORT_MAX_VL_OFFSET);
-			dev_cap->max_vl[i]	   = field & 0xf;
-			MLX4_GET(field, outbox, QUERY_PORT_MAX_MACVLAN_OFFSET);
-			dev_cap->log_max_macs[i]  = field & 0xf;
-			dev_cap->log_max_vlans[i] = field >> 4;
-			MLX4_GET(dev_cap->eth_mtu[i], outbox, QUERY_PORT_ETH_MTU_OFFSET);
-			MLX4_GET(dev_cap->def_mac[i], outbox, QUERY_PORT_MAC_OFFSET);
-			MLX4_GET(field32, outbox, QUERY_PORT_TRANS_VENDOR_OFFSET);
-			dev_cap->trans_type[i] = field32 >> 24;
-			dev_cap->vendor_oui[i] = field32 & 0xffffff;
-			MLX4_GET(dev_cap->wavelength[i], outbox, QUERY_PORT_WAVELENGTH_OFFSET);
-			MLX4_GET(dev_cap->trans_code[i], outbox, QUERY_PORT_TRANS_CODE_OFFSET);
-		}
+	for (i = 1; i <= dev_cap->num_ports; i++) {
+		err = mlx4_QUERY_PORT(dev, i, dev_cap->port_cap + i);
+		if (err)
+			goto out;
 	}
 
 	mlx4_dbg(dev, "Base MM extensions: flags %08x, rsvd L_Key %08x\n",
@@ -977,8 +926,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	mlx4_dbg(dev, "Max CQEs: %d, max WQEs: %d, max SRQ WQEs: %d\n",
 		 dev_cap->max_cq_sz, dev_cap->max_qp_sz, dev_cap->max_srq_sz);
 	mlx4_dbg(dev, "Local CA ACK delay: %d, max MTU: %d, port width cap: %d\n",
-		 dev_cap->local_ca_ack_delay, 128 << dev_cap->ib_mtu[1],
-		 dev_cap->max_port_width[1]);
+		 dev_cap->local_ca_ack_delay, 128 << dev_cap->port_cap[1].ib_mtu,
+		 dev_cap->port_cap[1].max_port_width);
 	mlx4_dbg(dev, "Max SQ desc size: %d, max SQ S/G: %d\n",
 		 dev_cap->max_sq_desc_sz, dev_cap->max_sq_sg);
 	mlx4_dbg(dev, "Max RQ desc size: %d, max RQ S/G: %d\n",
@@ -995,6 +944,84 @@ out:
 	return err;
 }
 
+int mlx4_QUERY_PORT(struct mlx4_dev *dev, int port, struct mlx4_port_cap *port_cap)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 *outbox;
+	u8 field;
+	u32 field32;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
+		err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_DEV_CAP,
+				   MLX4_CMD_TIME_CLASS_A,
+				   MLX4_CMD_NATIVE);
+
+		if (err)
+			goto out;
+
+		MLX4_GET(field, outbox, QUERY_DEV_CAP_VL_PORT_OFFSET);
+		port_cap->max_vl	   = field >> 4;
+		MLX4_GET(field, outbox, QUERY_DEV_CAP_MTU_WIDTH_OFFSET);
+		port_cap->ib_mtu	   = field >> 4;
+		port_cap->max_port_width = field & 0xf;
+		MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_GID_OFFSET);
+		port_cap->max_gids	   = 1 << (field & 0xf);
+		MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PKEY_OFFSET);
+		port_cap->max_pkeys	   = 1 << (field & 0xf);
+	} else {
+#define QUERY_PORT_SUPPORTED_TYPE_OFFSET	0x00
+#define QUERY_PORT_MTU_OFFSET			0x01
+#define QUERY_PORT_ETH_MTU_OFFSET		0x02
+#define QUERY_PORT_WIDTH_OFFSET			0x06
+#define QUERY_PORT_MAX_GID_PKEY_OFFSET		0x07
+#define QUERY_PORT_MAX_MACVLAN_OFFSET		0x0a
+#define QUERY_PORT_MAX_VL_OFFSET		0x0b
+#define QUERY_PORT_MAC_OFFSET			0x10
+#define QUERY_PORT_TRANS_VENDOR_OFFSET		0x18
+#define QUERY_PORT_WAVELENGTH_OFFSET		0x1c
+#define QUERY_PORT_TRANS_CODE_OFFSET		0x20
+
+		err = mlx4_cmd_box(dev, 0, mailbox->dma, port, 0, MLX4_CMD_QUERY_PORT,
+				   MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+		if (err)
+			goto out;
+
+		MLX4_GET(field, outbox, QUERY_PORT_SUPPORTED_TYPE_OFFSET);
+		port_cap->supported_port_types = field & 3;
+		port_cap->suggested_type = (field >> 3) & 1;
+		port_cap->default_sense = (field >> 4) & 1;
+		MLX4_GET(field, outbox, QUERY_PORT_MTU_OFFSET);
+		port_cap->ib_mtu	   = field & 0xf;
+		MLX4_GET(field, outbox, QUERY_PORT_WIDTH_OFFSET);
+		port_cap->max_port_width = field & 0xf;
+		MLX4_GET(field, outbox, QUERY_PORT_MAX_GID_PKEY_OFFSET);
+		port_cap->max_gids	   = 1 << (field >> 4);
+		port_cap->max_pkeys	   = 1 << (field & 0xf);
+		MLX4_GET(field, outbox, QUERY_PORT_MAX_VL_OFFSET);
+		port_cap->max_vl	   = field & 0xf;
+		MLX4_GET(field, outbox, QUERY_PORT_MAX_MACVLAN_OFFSET);
+		port_cap->log_max_macs  = field & 0xf;
+		port_cap->log_max_vlans = field >> 4;
+		MLX4_GET(port_cap->eth_mtu, outbox, QUERY_PORT_ETH_MTU_OFFSET);
+		MLX4_GET(port_cap->def_mac, outbox, QUERY_PORT_MAC_OFFSET);
+		MLX4_GET(field32, outbox, QUERY_PORT_TRANS_VENDOR_OFFSET);
+		port_cap->trans_type = field32 >> 24;
+		port_cap->vendor_oui = field32 & 0xffffff;
+		MLX4_GET(port_cap->wavelength, outbox, QUERY_PORT_WAVELENGTH_OFFSET);
+		MLX4_GET(port_cap->trans_code, outbox, QUERY_PORT_TRANS_CODE_OFFSET);
+	}
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
 int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave,
 			       struct mlx4_vhcr *vhcr,
 			       struct mlx4_cmd_mailbox *inbox,
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h
index 0e910a4..744398b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.h
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.h
@@ -43,6 +43,25 @@ struct mlx4_mod_stat_cfg {
 	u8 log_pg_sz_m;
 };
 
+struct mlx4_port_cap {
+	u8  supported_port_types;
+	u8  suggested_type;
+	u8  default_sense;
+	u8  log_max_macs;
+	u8  log_max_vlans;
+	int ib_mtu;
+	int max_port_width;
+	int max_vl;
+	int max_gids;
+	int max_pkeys;
+	u64 def_mac;
+	u16 eth_mtu;
+	int trans_type;
+	int vendor_oui;
+	u16 wavelength;
+	u64 trans_code;
+};
+
 struct mlx4_dev_cap {
 	int max_srq_sz;
 	int max_qp_sz;
@@ -67,17 +86,6 @@ struct mlx4_dev_cap {
 	int local_ca_ack_delay;
 	int num_ports;
 	u32 max_msg_sz;
-	int ib_mtu[MLX4_MAX_PORTS + 1];
-	int max_port_width[MLX4_MAX_PORTS + 1];
-	int max_vl[MLX4_MAX_PORTS + 1];
-	int max_gids[MLX4_MAX_PORTS + 1];
-	int max_pkeys[MLX4_MAX_PORTS + 1];
-	u64 def_mac[MLX4_MAX_PORTS + 1];
-	u16 eth_mtu[MLX4_MAX_PORTS + 1];
-	int trans_type[MLX4_MAX_PORTS + 1];
-	int vendor_oui[MLX4_MAX_PORTS + 1];
-	u16 wavelength[MLX4_MAX_PORTS + 1];
-	u64 trans_code[MLX4_MAX_PORTS + 1];
 	u16 stat_rate_support;
 	int fs_log_max_ucast_qp_range_size;
 	int fs_max_num_qp_per_entry;
@@ -115,12 +123,8 @@ struct mlx4_dev_cap {
 	u64 max_icm_sz;
 	int max_gso_sz;
 	int max_rss_tbl_sz;
-	u8  supported_port_types[MLX4_MAX_PORTS + 1];
-	u8  suggested_type[MLX4_MAX_PORTS + 1];
-	u8  default_sense[MLX4_MAX_PORTS + 1];
-	u8  log_max_macs[MLX4_MAX_PORTS + 1];
-	u8  log_max_vlans[MLX4_MAX_PORTS + 1];
 	u32 max_counters;
+	struct mlx4_port_cap port_cap[MLX4_MAX_PORTS + 1];
 };
 
 struct mlx4_func_cap {
@@ -217,6 +221,7 @@ struct mlx4_set_ib_param {
 };
 
 int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap);
+int mlx4_QUERY_PORT(struct mlx4_dev *dev, int port, struct mlx4_port_cap *port_cap);
 int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u8 gen_or_port,
 			struct mlx4_func_cap *func_cap);
 int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 3bfe90b..6173b80 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -254,6 +254,46 @@ static void mlx4_enable_cqe_eqe_stride(struct mlx4_dev *dev)
 	}
 }
 
+static int _mlx4_dev_port(struct mlx4_dev *dev, int port,
+			  struct mlx4_port_cap *port_cap)
+{
+	dev->caps.vl_cap[port]	    = port_cap->max_vl;
+	dev->caps.ib_mtu_cap[port]	    = port_cap->ib_mtu;
+	dev->phys_caps.gid_phys_table_len[port]  = port_cap->max_gids;
+	dev->phys_caps.pkey_phys_table_len[port] = port_cap->max_pkeys;
+	/* set gid and pkey table operating lengths by default
+	 * to non-sriov values
+	 */
+	dev->caps.gid_table_len[port]  = port_cap->max_gids;
+	dev->caps.pkey_table_len[port] = port_cap->max_pkeys;
+	dev->caps.port_width_cap[port] = port_cap->max_port_width;
+	dev->caps.eth_mtu_cap[port]    = port_cap->eth_mtu;
+	dev->caps.def_mac[port]        = port_cap->def_mac;
+	dev->caps.supported_type[port] = port_cap->supported_port_types;
+	dev->caps.suggested_type[port] = port_cap->suggested_type;
+	dev->caps.default_sense[port] = port_cap->default_sense;
+	dev->caps.trans_type[port]	    = port_cap->trans_type;
+	dev->caps.vendor_oui[port]     = port_cap->vendor_oui;
+	dev->caps.wavelength[port]     = port_cap->wavelength;
+	dev->caps.trans_code[port]     = port_cap->trans_code;
+
+	return 0;
+}
+
+static int mlx4_dev_port(struct mlx4_dev *dev, int port,
+			 struct mlx4_port_cap *port_cap)
+{
+	int err = 0;
+
+	err = mlx4_QUERY_PORT(dev, port, port_cap);
+
+	if (err)
+		mlx4_err(dev, "QUERY_PORT command failed.\n");
+
+	return err;
+}
+
+#define MLX4_A0_STEERING_TABLE_SIZE	256
 static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 {
 	int err;
@@ -289,24 +329,11 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 				      dev->caps.num_sys_eqs :
 				      MLX4_MAX_EQ_NUM;
 	for (i = 1; i <= dev->caps.num_ports; ++i) {
-		dev->caps.vl_cap[i]	    = dev_cap->max_vl[i];
-		dev->caps.ib_mtu_cap[i]	    = dev_cap->ib_mtu[i];
-		dev->phys_caps.gid_phys_table_len[i]  = dev_cap->max_gids[i];
-		dev->phys_caps.pkey_phys_table_len[i] = dev_cap->max_pkeys[i];
-		/* set gid and pkey table operating lengths by default
-		 * to non-sriov values */
-		dev->caps.gid_table_len[i]  = dev_cap->max_gids[i];
-		dev->caps.pkey_table_len[i] = dev_cap->max_pkeys[i];
-		dev->caps.port_width_cap[i] = dev_cap->max_port_width[i];
-		dev->caps.eth_mtu_cap[i]    = dev_cap->eth_mtu[i];
-		dev->caps.def_mac[i]        = dev_cap->def_mac[i];
-		dev->caps.supported_type[i] = dev_cap->supported_port_types[i];
-		dev->caps.suggested_type[i] = dev_cap->suggested_type[i];
-		dev->caps.default_sense[i] = dev_cap->default_sense[i];
-		dev->caps.trans_type[i]	    = dev_cap->trans_type[i];
-		dev->caps.vendor_oui[i]     = dev_cap->vendor_oui[i];
-		dev->caps.wavelength[i]     = dev_cap->wavelength[i];
-		dev->caps.trans_code[i]     = dev_cap->trans_code[i];
+		err = _mlx4_dev_port(dev, i, dev_cap->port_cap + i);
+		if (err) {
+			mlx4_err(dev, "QUERY_PORT command failed, aborting\n");
+			return err;
+		}
 	}
 
 	dev->caps.uar_page_size	     = PAGE_SIZE;
@@ -415,13 +442,13 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 			dev->caps.possible_type[i] = dev->caps.port_type[i];
 		}
 
-		if (dev->caps.log_num_macs > dev_cap->log_max_macs[i]) {
-			dev->caps.log_num_macs = dev_cap->log_max_macs[i];
+		if (dev->caps.log_num_macs > dev_cap->port_cap[i].log_max_macs) {
+			dev->caps.log_num_macs = dev_cap->port_cap[i].log_max_macs;
 			mlx4_warn(dev, "Requested number of MACs is too much for port %d, reducing to %d\n",
 				  i, 1 << dev->caps.log_num_macs);
 		}
-		if (dev->caps.log_num_vlans > dev_cap->log_max_vlans[i]) {
-			dev->caps.log_num_vlans = dev_cap->log_max_vlans[i];
+		if (dev->caps.log_num_vlans > dev_cap->port_cap[i].log_max_vlans) {
+			dev->caps.log_num_vlans = dev_cap->port_cap[i].log_max_vlans;
 			mlx4_warn(dev, "Requested number of VLANs is too much for port %d, reducing to %d\n",
 				  i, 1 << dev->caps.log_num_vlans);
 		}
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next 03/10] net/mlx4_core: Use tasklet for user-space CQ completion events
From: Or Gerlitz @ 2014-12-04 13:13 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Matan Barak, Amir Vadai, Tal Alon, Jack Morgenstein,
	Or Gerlitz
In-Reply-To: <1417698835-11050-1-git-send-email-ogerlitz@mellanox.com>

From: Matan Barak <matanb@mellanox.com>

Previously, we've fired all our completion callbacks straight from our ISR.

Some of those callbacks were lightweight (for example, mlx4_en's and
IPoIB napi callbacks), but some of them did more work (for example,
the user-space RDMA stack uverbs' completion handler). Besides that,
doing more than the minimal work in an ISR is generally considered wrong
and could even lead to a hard lockup of the system: when a lot of
completion events are generated by the hardware, the loop over those
events could be so long that the system watchdog detects a hard
lockup.

In order to avoid that, add a new way of invoking completion event
callbacks. In the interrupt itself, we add the CQs which received a
completion event to a per-EQ list and schedule a tasklet. In the tasklet
context we loop over all the CQs in the list and invoke the user callback.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/infiniband/hw/mlx4/cq.c           |    5 ++-
 drivers/net/ethernet/mellanox/mlx4/cq.c   |   49 +++++++++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx4/eq.c   |   16 +++++++++-
 drivers/net/ethernet/mellanox/mlx4/mlx4.h |   12 +++++++
 include/linux/mlx4/device.h               |    5 +++
 5 files changed, 85 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index 1066eec..a3b70f6 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -233,7 +233,10 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector
 	if (err)
 		goto err_dbmap;
 
-	cq->mcq.comp  = mlx4_ib_cq_comp;
+	if (context)
+		cq->mcq.tasklet_ctx.comp = mlx4_ib_cq_comp;
+	else
+		cq->mcq.comp = mlx4_ib_cq_comp;
 	cq->mcq.event = mlx4_ib_cq_event;
 
 	if (context)
diff --git a/drivers/net/ethernet/mellanox/mlx4/cq.c b/drivers/net/ethernet/mellanox/mlx4/cq.c
index 56022d6..060ea63 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cq.c
@@ -52,6 +52,50 @@
 #define MLX4_CQ_STATE_ARMED_SOL		( 6 <<  8)
 #define MLX4_EQ_STATE_FIRED		(10 <<  8)
 
+#define TASKLET_THRESHOLD 1000
+
+void mlx4_cq_tasklet_cb(unsigned long data)
+{
+	unsigned long flags;
+	unsigned int i = 0;
+	struct mlx4_eq_tasklet *ctx = (struct mlx4_eq_tasklet *)data;
+	struct mlx4_cq *mcq, *temp;
+
+	spin_lock_irqsave(&ctx->lock, flags);
+	list_splice_tail_init(&ctx->list, &ctx->process_list);
+	spin_unlock_irqrestore(&ctx->lock, flags);
+
+	list_for_each_entry_safe(mcq, temp, &ctx->process_list, tasklet_ctx.list) {
+		list_del_init(&mcq->tasklet_ctx.list);
+		mcq->tasklet_ctx.comp(mcq);
+		if (atomic_dec_and_test(&mcq->refcount))
+			complete(&mcq->free);
+		if (++i == TASKLET_THRESHOLD)
+			break;
+	}
+
+	if (i == TASKLET_THRESHOLD)
+		tasklet_schedule(&ctx->task);
+}
+
+static void mlx4_add_cq_to_tasklet(struct mlx4_cq *cq)
+{
+	unsigned long flags;
+	struct mlx4_eq_tasklet *tasklet_ctx = cq->tasklet_ctx.priv;
+
+	spin_lock_irqsave(&tasklet_ctx->lock, flags);
+	/* When migrating CQs between EQs is implemented, please note
+	 * that you need to sync this point. It is possible that
+	 * while migrating a CQ, completions on the old EQs could
+	 * still arrive.
+	 */
+	if (list_empty_careful(&cq->tasklet_ctx.list)) {
+		atomic_inc(&cq->refcount);
+		list_add_tail(&cq->tasklet_ctx.list, &tasklet_ctx->list);
+	}
+	spin_unlock_irqrestore(&tasklet_ctx->lock, flags);
+}
+
 void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn)
 {
 	struct mlx4_cq *cq;
@@ -292,6 +336,11 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent,
 	cq->uar        = uar;
 	atomic_set(&cq->refcount, 1);
 	init_completion(&cq->free);
+	cq->comp = mlx4_add_cq_to_tasklet;
+	cq->tasklet_ctx.priv =
+		&priv->eq_table.eq[cq->vector].tasklet_ctx;
+	INIT_LIST_HEAD(&cq->tasklet_ctx.list);
+
 
 	cq->irq = priv->eq_table.eq[cq->vector].irq;
 	return 0;
diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c
index d68b264..3d275fb 100644
--- a/drivers/net/ethernet/mellanox/mlx4/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/eq.c
@@ -450,7 +450,7 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct mlx4_eqe *eqe;
-	int cqn;
+	int cqn = -1;
 	int eqes_found = 0;
 	int set_ci = 0;
 	int port;
@@ -758,6 +758,13 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq)
 
 	eq_set_ci(eq, 1);
 
+	/* cqn is 24bit wide but is initialized such that its higher bits
+	 * are ones too. Thus, if we got any event, cqn's high bits should be off
+	 * and we need to schedule the tasklet.
+	 */
+	if (!(cqn & ~0xffffff))
+		tasklet_schedule(&eq->tasklet_ctx.task);
+
 	return eqes_found;
 }
 
@@ -971,6 +978,12 @@ static int mlx4_create_eq(struct mlx4_dev *dev, int nent,
 
 	eq->cons_index = 0;
 
+	INIT_LIST_HEAD(&eq->tasklet_ctx.list);
+	INIT_LIST_HEAD(&eq->tasklet_ctx.process_list);
+	spin_lock_init(&eq->tasklet_ctx.lock);
+	tasklet_init(&eq->tasklet_ctx.task, mlx4_cq_tasklet_cb,
+		     (unsigned long)&eq->tasklet_ctx);
+
 	return err;
 
 err_out_free_mtt:
@@ -1027,6 +1040,7 @@ static void mlx4_free_eq(struct mlx4_dev *dev,
 		}
 	}
 	synchronize_irq(eq->irq);
+	tasklet_disable(&eq->tasklet_ctx.task);
 
 	mlx4_mtt_cleanup(dev, &eq->mtt);
 	for (i = 0; i < npages; ++i)
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index f48e7c3..b67ef48 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -43,6 +43,8 @@
 #include <linux/timer.h>
 #include <linux/semaphore.h>
 #include <linux/workqueue.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
 
 #include <linux/mlx4/device.h>
 #include <linux/mlx4/driver.h>
@@ -373,6 +375,14 @@ struct mlx4_srq_context {
 	__be64			db_rec_addr;
 };
 
+struct mlx4_eq_tasklet {
+	struct list_head list;
+	struct list_head process_list;
+	struct tasklet_struct task;
+	/* lock on completion tasklet list */
+	spinlock_t lock;
+};
+
 struct mlx4_eq {
 	struct mlx4_dev	       *dev;
 	void __iomem	       *doorbell;
@@ -383,6 +393,7 @@ struct mlx4_eq {
 	int			nent;
 	struct mlx4_buf_list   *page_list;
 	struct mlx4_mtt		mtt;
+	struct mlx4_eq_tasklet	tasklet_ctx;
 };
 
 struct mlx4_slave_eqe {
@@ -1146,6 +1157,7 @@ void mlx4_cmd_use_polling(struct mlx4_dev *dev);
 int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param,
 		  unsigned long timeout);
 
+void mlx4_cq_tasklet_cb(unsigned long data);
 void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn);
 void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type);
 
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index cf09e65..3951b53 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -621,6 +621,11 @@ struct mlx4_cq {
 
 	atomic_t		refcount;
 	struct completion	free;
+	struct {
+		struct list_head list;
+		void (*comp)(struct mlx4_cq *);
+		void		*priv;
+	} tasklet_ctx;
 };
 
 struct mlx4_qp {
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next 05/10] net/mlx4: Add a check if there are too many reserved QPs
From: Or Gerlitz @ 2014-12-04 13:13 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Matan Barak, Amir Vadai, Tal Alon, Jack Morgenstein,
	Dotan Barak, Or Gerlitz
In-Reply-To: <1417698835-11050-1-git-send-email-ogerlitz@mellanox.com>

From: Dotan Barak <dotanb@dev.mellanox.co.il>

The number of reserved QPs is affected by both the firmware's and
the driver's requirements. This patch adds a check that
validates that this number is indeed feasible.

Signed-off-by: Dotan Barak <dotanb@dev.mellanox.co.il>
Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/qp.c |    8 +++++++-
 1 files changed, 7 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c
index 40e82ed..8720428 100644
--- a/drivers/net/ethernet/mellanox/mlx4/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx4/qp.c
@@ -478,6 +478,7 @@ int mlx4_init_qp_table(struct mlx4_dev *dev)
 	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
 	int err;
 	int reserved_from_top = 0;
+	int reserved_from_bot;
 	int k;
 
 	spin_lock_init(&qp_table->lock);
@@ -534,9 +535,14 @@ int mlx4_init_qp_table(struct mlx4_dev *dev)
 	* b. All the proxy SQPs (8 per function)
 	* c. All the tunnel QPs (8 per function)
 	*/
+	reserved_from_bot = mlx4_num_reserved_sqps(dev);
+	if (reserved_from_bot + reserved_from_top > dev->caps.num_qps) {
+		mlx4_err(dev, "Number of reserved QPs is higher than number of QPs\n");
+		return -EINVAL;
+	}
 
 	err = mlx4_bitmap_init(&qp_table->bitmap, dev->caps.num_qps,
-			       (1 << 23) - 1, mlx4_num_reserved_sqps(dev),
+			       (1 << 23) - 1, reserved_from_bot,
 			       reserved_from_top);
 	if (err)
 		return err;
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next 06/10] net/mlx4: Add mlx4_bitmap zone allocator
From: Or Gerlitz @ 2014-12-04 13:13 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Matan Barak, Amir Vadai, Tal Alon, Jack Morgenstein,
	Or Gerlitz
In-Reply-To: <1417698835-11050-1-git-send-email-ogerlitz@mellanox.com>

From: Matan Barak <matanb@mellanox.com>

The zone allocator is a mechanism which manages a few mlx4_bitmaps.

When allocating a resource, the user indicates the desired zone from
which this resource will be allocated. If possible, the resource
will be allocated from this zone. Otherwise, the resource will be
allocated from a lower, equal, or higher priority zone (tried in that
order), according to the desired zone's properties.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/alloc.c |  392 ++++++++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx4/mlx4.h  |   69 +++++
 2 files changed, 461 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/alloc.c b/drivers/net/ethernet/mellanox/mlx4/alloc.c
index 91a8acc..665e9da 100644
--- a/drivers/net/ethernet/mellanox/mlx4/alloc.c
+++ b/drivers/net/ethernet/mellanox/mlx4/alloc.c
@@ -149,6 +149,21 @@ u32 mlx4_bitmap_avail(struct mlx4_bitmap *bitmap)
 	return bitmap->avail;
 }
 
+static u32 mlx4_bitmap_max(struct mlx4_bitmap *bitmap)
+{
+	return bitmap->max;
+}
+
+static u32 mlx4_bitmap_effective_len(struct mlx4_bitmap *bitmap)
+{
+	return bitmap->effective_len;
+}
+
+static u32 mlx4_bitmap_masked_value(struct mlx4_bitmap *bitmap, u32 obj)
+{
+	return obj & (bitmap->max + bitmap->reserved_top - 1);
+}
+
 void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt,
 			    int use_rr)
 {
@@ -178,6 +193,7 @@ int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask,
 	bitmap->mask = mask;
 	bitmap->reserved_top = reserved_top;
 	bitmap->avail = num - reserved_top - reserved_bot;
+	bitmap->effective_len = bitmap->avail;
 	spin_lock_init(&bitmap->lock);
 	bitmap->table = kzalloc(BITS_TO_LONGS(bitmap->max) *
 				sizeof (long), GFP_KERNEL);
@@ -194,6 +210,382 @@ void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap)
 	kfree(bitmap->table);
 }
 
+struct mlx4_zone_allocator {
+	struct list_head		entries;
+	struct list_head		prios;
+	u32				last_uid;
+	u32				mask;
+	/* protect the zone_allocator from concurrent accesses */
+	spinlock_t			lock;
+	enum mlx4_zone_alloc_flags	flags;
+};
+
+struct mlx4_zone_entry {
+	struct list_head		list;
+	struct list_head		prio_list;
+	u32				uid;
+	struct mlx4_zone_allocator	*allocator;
+	struct mlx4_bitmap		*bitmap;
+	int				use_rr;
+	int				priority;
+	int				offset;
+	enum mlx4_zone_flags		flags;
+};
+
+struct mlx4_zone_allocator *mlx4_zone_allocator_create(enum mlx4_zone_alloc_flags flags)
+{
+	struct mlx4_zone_allocator *zones = kmalloc(sizeof(*zones), GFP_KERNEL);
+
+	if (NULL == zones)
+		return NULL;
+
+	INIT_LIST_HEAD(&zones->entries);
+	INIT_LIST_HEAD(&zones->prios);
+	spin_lock_init(&zones->lock);
+	zones->last_uid = 0;
+	zones->mask = 0;
+	zones->flags = flags;
+
+	return zones;
+}
+
+int mlx4_zone_add_one(struct mlx4_zone_allocator *zone_alloc,
+		      struct mlx4_bitmap *bitmap,
+		      u32 flags,
+		      int priority,
+		      int offset,
+		      u32 *puid)
+{
+	u32 mask = mlx4_bitmap_masked_value(bitmap, (u32)-1);
+	struct mlx4_zone_entry *it;
+	struct mlx4_zone_entry *zone = kmalloc(sizeof(*zone), GFP_KERNEL);
+
+	if (NULL == zone)
+		return -ENOMEM;
+
+	zone->flags = flags;
+	zone->bitmap = bitmap;
+	zone->use_rr = (flags & MLX4_ZONE_USE_RR) ? MLX4_USE_RR : 0;
+	zone->priority = priority;
+	zone->offset = offset;
+
+	spin_lock(&zone_alloc->lock);
+
+	zone->uid = zone_alloc->last_uid++;
+	zone->allocator = zone_alloc;
+
+	if (zone_alloc->mask < mask)
+		zone_alloc->mask = mask;
+
+	list_for_each_entry(it, &zone_alloc->prios, prio_list)
+		if (it->priority >= priority)
+			break;
+
+	if (&it->prio_list == &zone_alloc->prios || it->priority > priority)
+		list_add_tail(&zone->prio_list, &it->prio_list);
+	list_add_tail(&zone->list, &it->list);
+
+	spin_unlock(&zone_alloc->lock);
+
+	*puid = zone->uid;
+
+	return 0;
+}
+
+/* Should be called under a lock */
+static int __mlx4_zone_remove_one_entry(struct mlx4_zone_entry *entry)
+{
+	struct mlx4_zone_allocator *zone_alloc = entry->allocator;
+
+	if (!list_empty(&entry->prio_list)) {
+		/* Check if we need to add an alternative node to the prio list */
+		if (!list_is_last(&entry->list, &zone_alloc->entries)) {
+			struct mlx4_zone_entry *next = list_first_entry(&entry->list,
+									typeof(*next),
+									list);
+
+			if (next->priority == entry->priority)
+				list_add_tail(&next->prio_list, &entry->prio_list);
+		}
+
+		list_del(&entry->prio_list);
+	}
+
+	list_del(&entry->list);
+
+	if (zone_alloc->flags & MLX4_ZONE_ALLOC_FLAGS_NO_OVERLAP) {
+		u32 mask = 0;
+		struct mlx4_zone_entry *it;
+
+		list_for_each_entry(it, &zone_alloc->prios, prio_list) {
+			u32 cur_mask = mlx4_bitmap_masked_value(it->bitmap, (u32)-1);
+
+			if (mask < cur_mask)
+				mask = cur_mask;
+		}
+		zone_alloc->mask = mask;
+	}
+
+	return 0;
+}
+
+void mlx4_zone_allocator_destroy(struct mlx4_zone_allocator *zone_alloc)
+{
+	struct mlx4_zone_entry *zone, *tmp;
+
+	spin_lock(&zone_alloc->lock);
+
+	list_for_each_entry_safe(zone, tmp, &zone_alloc->entries, list) {
+		list_del(&zone->list);
+		list_del(&zone->prio_list);
+		kfree(zone);
+	}
+
+	spin_unlock(&zone_alloc->lock);
+	kfree(zone_alloc);
+}
+
+/* Should be called under a lock */
+static u32 __mlx4_alloc_from_zone(struct mlx4_zone_entry *zone, int count,
+				  int align, u32 skip_mask, u32 *puid)
+{
+	u32 uid;
+	u32 res;
+	struct mlx4_zone_allocator *zone_alloc = zone->allocator;
+	struct mlx4_zone_entry *curr_node;
+
+	res = mlx4_bitmap_alloc_range(zone->bitmap, count,
+				      align, skip_mask);
+
+	if (res != (u32)-1) {
+		res += zone->offset;
+		uid = zone->uid;
+		goto out;
+	}
+
+	list_for_each_entry(curr_node, &zone_alloc->prios, prio_list) {
+		if (unlikely(curr_node->priority == zone->priority))
+			break;
+	}
+
+	if (zone->flags & MLX4_ZONE_ALLOW_ALLOC_FROM_LOWER_PRIO) {
+		struct mlx4_zone_entry *it = curr_node;
+
+		list_for_each_entry_continue_reverse(it, &zone_alloc->entries, list) {
+			res = mlx4_bitmap_alloc_range(it->bitmap, count,
+						      align, skip_mask);
+			if (res != (u32)-1) {
+				res += it->offset;
+				uid = it->uid;
+				goto out;
+			}
+		}
+	}
+
+	if (zone->flags & MLX4_ZONE_ALLOW_ALLOC_FROM_EQ_PRIO) {
+		struct mlx4_zone_entry *it = curr_node;
+
+		list_for_each_entry_from(it, &zone_alloc->entries, list) {
+			if (unlikely(it == zone))
+				continue;
+
+			if (unlikely(it->priority != curr_node->priority))
+				break;
+
+			res = mlx4_bitmap_alloc_range(it->bitmap, count,
+						      align, skip_mask);
+			if (res != (u32)-1) {
+				res += it->offset;
+				uid = it->uid;
+				goto out;
+			}
+		}
+	}
+
+	if (zone->flags & MLX4_ZONE_FALLBACK_TO_HIGHER_PRIO) {
+		if (list_is_last(&curr_node->prio_list, &zone_alloc->prios))
+			goto out;
+
+		curr_node = list_first_entry(&curr_node->prio_list,
+					     typeof(*curr_node),
+					     prio_list);
+
+		list_for_each_entry_from(curr_node, &zone_alloc->entries, list) {
+			res = mlx4_bitmap_alloc_range(curr_node->bitmap, count,
+						      align, skip_mask);
+			if (res != (u32)-1) {
+				res += curr_node->offset;
+				uid = curr_node->uid;
+				goto out;
+			}
+		}
+	}
+
+out:
+	if (NULL != puid && res != (u32)-1)
+		*puid = uid;
+	return res;
+}
+
+/* Should be called under a lock */
+static void __mlx4_free_from_zone(struct mlx4_zone_entry *zone, u32 obj,
+				  u32 count)
+{
+	mlx4_bitmap_free_range(zone->bitmap, obj - zone->offset, count, zone->use_rr);
+}
+
+/* Should be called under a lock */
+static struct mlx4_zone_entry *__mlx4_find_zone_by_uid(
+		struct mlx4_zone_allocator *zones, u32 uid)
+{
+	struct mlx4_zone_entry *zone;
+
+	list_for_each_entry(zone, &zones->entries, list) {
+		if (zone->uid == uid)
+			return zone;
+	}
+
+	return NULL;
+}
+
+struct mlx4_bitmap *mlx4_zone_get_bitmap(struct mlx4_zone_allocator *zones, u32 uid)
+{
+	struct mlx4_zone_entry *zone;
+	struct mlx4_bitmap *bitmap;
+
+	spin_lock(&zones->lock);
+
+	zone = __mlx4_find_zone_by_uid(zones, uid);
+
+	bitmap = zone == NULL ? NULL : zone->bitmap;
+
+	spin_unlock(&zones->lock);
+
+	return bitmap;
+}
+
+int mlx4_zone_remove_one(struct mlx4_zone_allocator *zones, u32 uid)
+{
+	struct mlx4_zone_entry *zone;
+	int res;
+
+	spin_lock(&zones->lock);
+
+	zone = __mlx4_find_zone_by_uid(zones, uid);
+
+	if (NULL == zone) {
+		res = -1;
+		goto out;
+	}
+
+	res = __mlx4_zone_remove_one_entry(zone);
+
+out:
+	spin_unlock(&zones->lock);
+	kfree(zone);
+
+	return res;
+}
+
+/* Should be called under a lock */
+static struct mlx4_zone_entry *__mlx4_find_zone_by_uid_unique(
+		struct mlx4_zone_allocator *zones, u32 obj)
+{
+	struct mlx4_zone_entry *zone, *zone_candidate = NULL;
+	u32 dist = (u32)-1;
+
+	/* Search for the smallest zone that this obj could be
+	 * allocated from. This is done in order to handle
+	 * situations when small bitmaps are allocated from bigger
+	 * bitmaps (and the allocated space is marked as reserved in
+	 * the bigger bitmap).
+	 */
+	list_for_each_entry(zone, &zones->entries, list) {
+		if (obj >= zone->offset) {
+			u32 mobj = (obj - zone->offset) & zones->mask;
+
+			if (mobj < mlx4_bitmap_max(zone->bitmap)) {
+				u32 curr_dist = mlx4_bitmap_effective_len(zone->bitmap);
+
+				if (curr_dist < dist) {
+					dist = curr_dist;
+					zone_candidate = zone;
+				}
+			}
+		}
+	}
+
+	return zone_candidate;
+}
+
+u32 mlx4_zone_alloc_entries(struct mlx4_zone_allocator *zones, u32 uid, int count,
+			    int align, u32 skip_mask, u32 *puid)
+{
+	struct mlx4_zone_entry *zone;
+	int res = -1;
+
+	spin_lock(&zones->lock);
+
+	zone = __mlx4_find_zone_by_uid(zones, uid);
+
+	if (NULL == zone)
+		goto out;
+
+	res = __mlx4_alloc_from_zone(zone, count, align, skip_mask, puid);
+
+out:
+	spin_unlock(&zones->lock);
+
+	return res;
+}
+
+u32 mlx4_zone_free_entries(struct mlx4_zone_allocator *zones, u32 uid, u32 obj, u32 count)
+{
+	struct mlx4_zone_entry *zone;
+	int res = 0;
+
+	spin_lock(&zones->lock);
+
+	zone = __mlx4_find_zone_by_uid(zones, uid);
+
+	if (NULL == zone) {
+		res = -1;
+		goto out;
+	}
+
+	__mlx4_free_from_zone(zone, obj, count);
+
+out:
+	spin_unlock(&zones->lock);
+
+	return res;
+}
+
+u32 mlx4_zone_free_entries_unique(struct mlx4_zone_allocator *zones, u32 obj, u32 count)
+{
+	struct mlx4_zone_entry *zone;
+	int res;
+
+	if (!(zones->flags & MLX4_ZONE_ALLOC_FLAGS_NO_OVERLAP))
+		return -EFAULT;
+
+	spin_lock(&zones->lock);
+
+	zone = __mlx4_find_zone_by_uid_unique(zones, obj);
+
+	if (NULL == zone) {
+		res = -1;
+		goto out;
+	}
+
+	__mlx4_free_from_zone(zone, obj, count);
+	res = 0;
+
+out:
+	spin_unlock(&zones->lock);
+
+	return res;
+}
 /*
  * Handling for queue buffers -- we allocate a bunch of memory and
  * register it in a memory region at HCA virtual address 0.  If the
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index 6834da6..bc1505e 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -245,6 +245,7 @@ struct mlx4_bitmap {
 	u32                     reserved_top;
 	u32			mask;
 	u32			avail;
+	u32			effective_len;
 	spinlock_t		lock;
 	unsigned long	       *table;
 };
@@ -1345,4 +1346,72 @@ int mlx4_get_slave_num_gids(struct mlx4_dev *dev, int slave, int port);
 int mlx4_get_vf_indx(struct mlx4_dev *dev, int slave);
 int mlx4_config_mad_demux(struct mlx4_dev *dev);
 
+enum mlx4_zone_flags {
+	MLX4_ZONE_ALLOW_ALLOC_FROM_LOWER_PRIO	= 1UL << 0,
+	MLX4_ZONE_ALLOW_ALLOC_FROM_EQ_PRIO	= 1UL << 1,
+	MLX4_ZONE_FALLBACK_TO_HIGHER_PRIO	= 1UL << 2,
+	MLX4_ZONE_USE_RR			= 1UL << 3,
+};
+
+enum mlx4_zone_alloc_flags {
+	/* No two objects could overlap between zones. UID
+	 * could be left unused. If this flag is given and
+	 * two overlapped zones are used, an object will be free'd
+	 * from the smallest possible matching zone.
+	 */
+	MLX4_ZONE_ALLOC_FLAGS_NO_OVERLAP	= 1UL << 0,
+};
+
+struct mlx4_zone_allocator;
+
+/* Create a new zone allocator */
+struct mlx4_zone_allocator *mlx4_zone_allocator_create(enum mlx4_zone_alloc_flags flags);
+
+/* Attach a mlx4_bitmap <bitmap> of priority <priority> to the zone allocator
+ * <zone_alloc>. Allocating an object from this zone adds an offset <offset>.
+ * Similarly, when searching for an object to free, this offset is taken into
+ * account. The use_rr mlx4_ib parameter for allocating objects from this <bitmap>
+ * is given through the MLX4_ZONE_USE_RR flag in <flags>.
+ * When an allocation fails, <zone_alloc> tries to allocate from other zones
+ * according to the policy set by <flags>. <puid> is the unique identifier
+ * received to this zone.
+ */
+int mlx4_zone_add_one(struct mlx4_zone_allocator *zone_alloc,
+		      struct mlx4_bitmap *bitmap,
+		      u32 flags,
+		      int priority,
+		      int offset,
+		      u32 *puid);
+
+/* Remove bitmap indicated by <uid> from <zone_alloc> */
+int mlx4_zone_remove_one(struct mlx4_zone_allocator *zone_alloc, u32 uid);
+
+/* Delete the zone allocator <zone_alloc>. This function doesn't destroy
+ * the attached bitmaps.
+ */
+void mlx4_zone_allocator_destroy(struct mlx4_zone_allocator *zone_alloc);
+
+/* Allocate <count> objects with align <align> and skip_mask <skip_mask>
+ * from the mlx4_bitmap whose uid is <uid>. The bitmap which we actually
+ * allocated from is returned in <puid>. If the allocation fails, a negative
+ * number is returned. Otherwise, the offset of the first object is returned.
+ */
+u32 mlx4_zone_alloc_entries(struct mlx4_zone_allocator *zones, u32 uid, int count,
+			    int align, u32 skip_mask, u32 *puid);
+
+/* Free <count> objects, start from <obj> of the uid <uid> from zone_allocator
+ * <zones>.
+ */
+u32 mlx4_zone_free_entries(struct mlx4_zone_allocator *zones,
+			   u32 uid, u32 obj, u32 count);
+
+/* If <zones> was allocated with MLX4_ZONE_ALLOC_FLAGS_NO_OVERLAP, instead of
+ * specifying the uid when freeing an object, zone allocator could figure it by
+ * itself. Other parameters are similar to mlx4_zone_free_entries.
+ */
+u32 mlx4_zone_free_entries_unique(struct mlx4_zone_allocator *zones, u32 obj, u32 count);
+
+/* Returns a pointer to mlx4_bitmap that was attached to <zones> with <uid> */
+struct mlx4_bitmap *mlx4_zone_get_bitmap(struct mlx4_zone_allocator *zones, u32 uid);
+
 #endif /* MLX4_H */
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next 10/10] net/mlx4: Add support for A0 steering
From: Or Gerlitz @ 2014-12-04 13:13 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Matan Barak, Amir Vadai, Tal Alon, Jack Morgenstein,
	Or Gerlitz
In-Reply-To: <1417698835-11050-1-git-send-email-ogerlitz@mellanox.com>

From: Matan Barak <matanb@mellanox.com>

Add the required firmware commands for A0 steering and a way to enable
that. The firmware support focuses on INIT_HCA, QUERY_HCA, QUERY_PORT,
QUERY_DEV_CAP and QUERY_FUNC_CAP commands. Those commands are used
to configure and query the device.

The different A0 DMFS (steering) modes are:

Static - optimized performance, but flow steering rules are
limited. This mode should be chosen explicitly by the user
in order to be used.

Dynamic - this mode should be explicitly chosen by the user.
In this mode, the FW works in optimized steering mode as long as
it can and afterwards automatically drops to classic (full) DMFS.

Disable - this mode should be explicitly chosen by the user.
The user instructs the system not to use optimized steering, even if
the FW supports Dynamic A0 DMFS (and thus will be able to use optimized
steering in Default A0 DMFS mode).

Default - this mode is implicitly chosen. In this mode, if the FW
supports Dynamic A0 DMFS, it'll work in that mode. Otherwise, it'll
work in Disable A0 DMFS mode.

In order to enable A0 steering, we use log_num_mgm_entry_size param.
If the value of the parameter is not positive, we treat the absolute
value of log_num_mgm_entry_size as a bit field. Setting bit 2 of this
bit field enables static A0 steering.

issue: 387689
Change-Id: I52f21ab5eeaa108f555fe157c07daa65dfe8c322
Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c |    3 +-
 drivers/net/ethernet/mellanox/mlx4/fw.c        |   48 ++++++++-
 drivers/net/ethernet/mellanox/mlx4/fw.h        |    4 +
 drivers/net/ethernet/mellanox/mlx4/main.c      |  131 ++++++++++++++++++++++--
 drivers/net/ethernet/mellanox/mlx4/mlx4.h      |    2 -
 drivers/net/ethernet/mellanox/mlx4/qp.c        |    4 +-
 include/linux/mlx4/device.h                    |   17 +++-
 7 files changed, 190 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 5701115..08aa6f8 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2594,7 +2594,8 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
 			NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX;
 
 	if (mdev->dev->caps.steering_mode ==
-	    MLX4_STEERING_MODE_DEVICE_MANAGED)
+	    MLX4_STEERING_MODE_DEVICE_MANAGED &&
+	    mdev->dev->caps.dmfs_high_steer_mode != MLX4_STEERING_DMFS_A0_STATIC)
 		dev->hw_features |= NETIF_F_NTUPLE;
 
 	if (mdev->dev->caps.steering_mode != MLX4_STEERING_MODE_A0)
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index e07cb9b..b303b34 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -144,7 +144,8 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags)
 		[15] = "Ethernet Backplane autoneg support",
 		[16] = "CONFIG DEV support",
 		[17] = "Asymmetric EQs support",
-		[18] = "More than 80 VFs support"
+		[18] = "More than 80 VFs support",
+		[19] = "Performance optimized for limited rule configuration flow steering support"
 	};
 	int i;
 
@@ -680,6 +681,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 #define QUERY_DEV_CAP_FW_REASSIGN_MAC		0x9d
 #define QUERY_DEV_CAP_VXLAN			0x9e
 #define QUERY_DEV_CAP_MAD_DEMUX_OFFSET		0xb0
+#define QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_BASE_OFFSET	0xa8
+#define QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_RANGE_OFFSET	0xac
 
 	dev_cap->flags2 = 0;
 	mailbox = mlx4_alloc_cmd_mailbox(dev);
@@ -876,6 +879,13 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	if (field32 & (1 << 0))
 		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_MAD_DEMUX;
 
+	MLX4_GET(dev_cap->dmfs_high_rate_qpn_base, outbox,
+		 QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_BASE_OFFSET);
+	dev_cap->dmfs_high_rate_qpn_base &= MGM_QPN_MASK;
+	MLX4_GET(dev_cap->dmfs_high_rate_qpn_range, outbox,
+		 QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_RANGE_OFFSET);
+	dev_cap->dmfs_high_rate_qpn_range &= MGM_QPN_MASK;
+
 	MLX4_GET(field32, outbox, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET);
 	if (field32 & (1 << 16))
 		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_UPDATE_QP;
@@ -935,6 +945,10 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	mlx4_dbg(dev, "Max GSO size: %d\n", dev_cap->max_gso_sz);
 	mlx4_dbg(dev, "Max counters: %d\n", dev_cap->max_counters);
 	mlx4_dbg(dev, "Max RSS Table size: %d\n", dev_cap->max_rss_tbl_sz);
+	mlx4_dbg(dev, "DMFS high rate steer QPn base: %d\n",
+		 dev_cap->dmfs_high_rate_qpn_base);
+	mlx4_dbg(dev, "DMFS high rate steer QPn range: %d\n",
+		 dev_cap->dmfs_high_rate_qpn_range);
 
 	dump_dev_cap_flags(dev, dev_cap->flags);
 	dump_dev_cap_flags2(dev, dev_cap->flags2);
@@ -996,6 +1010,7 @@ int mlx4_QUERY_PORT(struct mlx4_dev *dev, int port, struct mlx4_port_cap *port_c
 		port_cap->supported_port_types = field & 3;
 		port_cap->suggested_type = (field >> 3) & 1;
 		port_cap->default_sense = (field >> 4) & 1;
+		port_cap->dmfs_optimized_state = (field >> 5) & 1;
 		MLX4_GET(field, outbox, QUERY_PORT_MTU_OFFSET);
 		port_cap->ib_mtu	   = field & 0xf;
 		MLX4_GET(field, outbox, QUERY_PORT_WIDTH_OFFSET);
@@ -1525,6 +1540,12 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
 	struct mlx4_cmd_mailbox *mailbox;
 	__be32 *inbox;
 	int err;
+	static const u8 a0_dmfs_hw_steering[] =  {
+		[MLX4_STEERING_DMFS_A0_DEFAULT]		= 0,
+		[MLX4_STEERING_DMFS_A0_DYNAMIC]		= 1,
+		[MLX4_STEERING_DMFS_A0_STATIC]		= 2,
+		[MLX4_STEERING_DMFS_A0_DISABLE]		= 3
+	};
 
 #define INIT_HCA_IN_SIZE		 0x200
 #define INIT_HCA_VERSION_OFFSET		 0x000
@@ -1558,6 +1579,7 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
 #define  INIT_HCA_FS_PARAM_OFFSET         0x1d0
 #define  INIT_HCA_FS_BASE_OFFSET          (INIT_HCA_FS_PARAM_OFFSET + 0x00)
 #define  INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET  (INIT_HCA_FS_PARAM_OFFSET + 0x12)
+#define  INIT_HCA_FS_A0_OFFSET		  (INIT_HCA_FS_PARAM_OFFSET + 0x18)
 #define  INIT_HCA_FS_LOG_TABLE_SZ_OFFSET  (INIT_HCA_FS_PARAM_OFFSET + 0x1b)
 #define  INIT_HCA_FS_ETH_BITS_OFFSET      (INIT_HCA_FS_PARAM_OFFSET + 0x21)
 #define  INIT_HCA_FS_ETH_NUM_ADDRS_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x22)
@@ -1668,8 +1690,11 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
 		/* Enable Ethernet flow steering
 		 * with udp unicast and tcp unicast
 		 */
-		MLX4_PUT(inbox, (u8) (MLX4_FS_UDP_UC_EN | MLX4_FS_TCP_UC_EN),
-			 INIT_HCA_FS_ETH_BITS_OFFSET);
+		if (dev->caps.dmfs_high_steer_mode !=
+		    MLX4_STEERING_DMFS_A0_STATIC)
+			MLX4_PUT(inbox,
+				 (u8)(MLX4_FS_UDP_UC_EN | MLX4_FS_TCP_UC_EN),
+				 INIT_HCA_FS_ETH_BITS_OFFSET);
 		MLX4_PUT(inbox, (u16) MLX4_FS_NUM_OF_L2_ADDR,
 			 INIT_HCA_FS_ETH_NUM_ADDRS_OFFSET);
 		/* Enable IPoIB flow steering
@@ -1679,6 +1704,13 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
 			 INIT_HCA_FS_IB_BITS_OFFSET);
 		MLX4_PUT(inbox, (u16) MLX4_FS_NUM_OF_L2_ADDR,
 			 INIT_HCA_FS_IB_NUM_ADDRS_OFFSET);
+
+		if (dev->caps.dmfs_high_steer_mode !=
+		    MLX4_STEERING_DMFS_A0_NOT_SUPPORTED)
+			MLX4_PUT(inbox,
+				 ((u8)(a0_dmfs_hw_steering[dev->caps.dmfs_high_steer_mode]
+				       << 6)),
+				 INIT_HCA_FS_A0_OFFSET);
 	} else {
 		MLX4_PUT(inbox, param->mc_base,	INIT_HCA_MC_BASE_OFFSET);
 		MLX4_PUT(inbox, param->log_mc_entry_sz,
@@ -1729,6 +1761,12 @@ int mlx4_QUERY_HCA(struct mlx4_dev *dev,
 	u32 dword_field;
 	int err;
 	u8 byte_field;
+	static const u8 a0_dmfs_query_hw_steering[] =  {
+		[0] = MLX4_STEERING_DMFS_A0_DEFAULT,
+		[1] = MLX4_STEERING_DMFS_A0_DYNAMIC,
+		[2] = MLX4_STEERING_DMFS_A0_STATIC,
+		[3] = MLX4_STEERING_DMFS_A0_DISABLE
+	};
 
 #define QUERY_HCA_GLOBAL_CAPS_OFFSET	0x04
 #define QUERY_HCA_CORE_CLOCK_OFFSET	0x0c
@@ -1781,6 +1819,10 @@ int mlx4_QUERY_HCA(struct mlx4_dev *dev,
 			 INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET);
 		MLX4_GET(param->log_mc_table_sz, outbox,
 			 INIT_HCA_FS_LOG_TABLE_SZ_OFFSET);
+		MLX4_GET(byte_field, outbox,
+			 INIT_HCA_FS_A0_OFFSET);
+		param->dmfs_high_steer_mode =
+			a0_dmfs_query_hw_steering[(byte_field >> 6) & 3];
 	} else {
 		MLX4_GET(param->mc_base, outbox, INIT_HCA_MC_BASE_OFFSET);
 		MLX4_GET(param->log_mc_entry_sz, outbox,
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h
index 744398b..794e282 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.h
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.h
@@ -60,6 +60,7 @@ struct mlx4_port_cap {
 	int vendor_oui;
 	u16 wavelength;
 	u64 trans_code;
+	u8 dmfs_optimized_state;
 };
 
 struct mlx4_dev_cap {
@@ -124,6 +125,8 @@ struct mlx4_dev_cap {
 	int max_gso_sz;
 	int max_rss_tbl_sz;
 	u32 max_counters;
+	u32 dmfs_high_rate_qpn_base;
+	u32 dmfs_high_rate_qpn_range;
 	struct mlx4_port_cap port_cap[MLX4_MAX_PORTS + 1];
 };
 
@@ -194,6 +197,7 @@ struct mlx4_init_hca_param {
 	u8  mw_enabled;  /* Enable memory windows */
 	u8  uar_page_sz; /* log pg sz in 4k chunks */
 	u8  steering_mode; /* for QUERY_HCA */
+	u8  dmfs_high_steer_mode; /* for QUERY_HCA */
 	u64 dev_cap_enabled;
 	u16 cqe_size; /* For use only when CQE stride feature enabled */
 	u16 eqe_size; /* For use only when EQE stride feature enabled */
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 6173b80..3845fe8 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -105,7 +105,8 @@ MODULE_PARM_DESC(enable_64b_cqe_eqe,
 		 "Enable 64 byte CQEs/EQEs when the FW supports this (default: True)");
 
 #define PF_CONTEXT_BEHAVIOUR_MASK	(MLX4_FUNC_CAP_64B_EQE_CQE | \
-					 MLX4_FUNC_CAP_EQE_CQE_STRIDE)
+					 MLX4_FUNC_CAP_EQE_CQE_STRIDE | \
+					 MLX4_FUNC_CAP_DMFS_A0_STATIC)
 
 static char mlx4_version[] =
 	DRV_NAME ": Mellanox ConnectX core driver v"
@@ -463,8 +464,28 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 		(1 << dev->caps.log_num_vlans) *
 		dev->caps.num_ports;
 	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH] = MLX4_NUM_FEXCH;
+
+	if (dev_cap->dmfs_high_rate_qpn_base > 0 &&
+	    dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN)
+		dev->caps.dmfs_high_rate_qpn_base = dev_cap->dmfs_high_rate_qpn_base;
+	else
+		dev->caps.dmfs_high_rate_qpn_base =
+			dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW];
+
+	if (dev_cap->dmfs_high_rate_qpn_range > 0 &&
+	    dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN) {
+		dev->caps.dmfs_high_rate_qpn_range = dev_cap->dmfs_high_rate_qpn_range;
+		dev->caps.dmfs_high_steer_mode = MLX4_STEERING_DMFS_A0_DEFAULT;
+		dev->caps.flags2 |= MLX4_DEV_CAP_FLAG2_FS_A0;
+	} else {
+		dev->caps.dmfs_high_steer_mode = MLX4_STEERING_DMFS_A0_NOT_SUPPORTED;
+		dev->caps.dmfs_high_rate_qpn_base =
+			dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW];
+		dev->caps.dmfs_high_rate_qpn_range = MLX4_A0_STEERING_TABLE_SIZE;
+	}
+
 	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_RSS_RAW_ETH] =
-		MLX4_A0_STEERING_TABLE_SIZE;
+		dev->caps.dmfs_high_rate_qpn_range;
 
 	dev->caps.reserved_qps = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] +
 		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] +
@@ -753,7 +774,7 @@ static int mlx4_slave_cap(struct mlx4_dev *dev)
 
 	if ((func_cap.pf_context_behaviour | PF_CONTEXT_BEHAVIOUR_MASK) !=
 	    PF_CONTEXT_BEHAVIOUR_MASK) {
-		mlx4_err(dev, "Unknown pf context behaviour\n");
+		mlx4_err(dev, "Unknown pf context behaviour %x\n", func_cap.pf_context_behaviour);
 		return -ENOSYS;
 	}
 
@@ -1640,10 +1661,46 @@ static int choose_log_fs_mgm_entry_size(int qp_per_entry)
 	return (i <= MLX4_MAX_MGM_LOG_ENTRY_SIZE) ? i : -1;
 }
 
+static const char *dmfs_high_rate_steering_mode_str(int dmfs_high_steer_mode)
+{
+	switch (dmfs_high_steer_mode) {
+	case MLX4_STEERING_DMFS_A0_DEFAULT:
+		return "default performance";
+
+	case MLX4_STEERING_DMFS_A0_DYNAMIC:
+		return "dynamic hybrid mode";
+
+	case MLX4_STEERING_DMFS_A0_STATIC:
+		return "performance optimized for limited rule configuration (static)";
+
+	case MLX4_STEERING_DMFS_A0_DISABLE:
+		return "disabled performance optimized steering";
+
+	case MLX4_STEERING_DMFS_A0_NOT_SUPPORTED:
+		return "performance optimized steering not supported";
+
+	default:
+		return "Unrecognized mode";
+	}
+}
+
+#define MLX4_DMFS_A0_STEERING			(1UL << 2)
+
 static void choose_steering_mode(struct mlx4_dev *dev,
 				 struct mlx4_dev_cap *dev_cap)
 {
-	if (mlx4_log_num_mgm_entry_size == -1 &&
+	if (mlx4_log_num_mgm_entry_size <= 0) {
+		if ((-mlx4_log_num_mgm_entry_size) & MLX4_DMFS_A0_STEERING) {
+			if (dev->caps.dmfs_high_steer_mode ==
+			    MLX4_STEERING_DMFS_A0_NOT_SUPPORTED)
+				mlx4_err(dev, "DMFS high rate mode not supported\n");
+			else
+				dev->caps.dmfs_high_steer_mode =
+					MLX4_STEERING_DMFS_A0_STATIC;
+		}
+	}
+
+	if (mlx4_log_num_mgm_entry_size <= 0 &&
 	    dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_FS_EN &&
 	    (!mlx4_is_mfunc(dev) ||
 	     (dev_cap->fs_max_num_qp_per_entry >= (dev->num_vfs + 1))) &&
@@ -1656,6 +1713,9 @@ static void choose_steering_mode(struct mlx4_dev *dev,
 		dev->caps.fs_log_max_ucast_qp_range_size =
 			dev_cap->fs_log_max_ucast_qp_range_size;
 	} else {
+		if (dev->caps.dmfs_high_steer_mode !=
+		    MLX4_STEERING_DMFS_A0_NOT_SUPPORTED)
+			dev->caps.dmfs_high_steer_mode = MLX4_STEERING_DMFS_A0_DISABLE;
 		if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER &&
 		    dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER)
 			dev->caps.steering_mode = MLX4_STEERING_MODE_B0;
@@ -1682,7 +1742,8 @@ static void choose_tunnel_offload_mode(struct mlx4_dev *dev,
 				       struct mlx4_dev_cap *dev_cap)
 {
 	if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED &&
-	    dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS)
+	    dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS &&
+	    dev->caps.dmfs_high_steer_mode != MLX4_STEERING_DMFS_A0_STATIC)
 		dev->caps.tunnel_offload_mode = MLX4_TUNNEL_OFFLOAD_MODE_VXLAN;
 	else
 		dev->caps.tunnel_offload_mode = MLX4_TUNNEL_OFFLOAD_MODE_NONE;
@@ -1691,6 +1752,35 @@ static void choose_tunnel_offload_mode(struct mlx4_dev *dev,
 		 == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) ? "vxlan" : "none");
 }
 
+static int mlx4_validate_optimized_steering(struct mlx4_dev *dev)
+{
+	int i;
+	struct mlx4_port_cap port_cap;
+
+	if (dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_NOT_SUPPORTED)
+		return -EINVAL;
+
+	for (i = 1; i <= dev->caps.num_ports; i++) {
+		if (mlx4_dev_port(dev, i, &port_cap)) {
+			mlx4_err(dev,
+				 "QUERY_DEV_CAP command failed, can't veify DMFS high rate steering.\n");
+		} else if ((dev->caps.dmfs_high_steer_mode !=
+			    MLX4_STEERING_DMFS_A0_DEFAULT) &&
+			   (port_cap.dmfs_optimized_state ==
+			    !!(dev->caps.dmfs_high_steer_mode ==
+			    MLX4_STEERING_DMFS_A0_DISABLE))) {
+			mlx4_err(dev,
+				 "DMFS high rate steer mode differ, driver requested %s but %s in FW.\n",
+				 dmfs_high_rate_steering_mode_str(
+					dev->caps.dmfs_high_steer_mode),
+				 (port_cap.dmfs_optimized_state ?
+					"enabled" : "disabled"));
+		}
+	}
+
+	return 0;
+}
+
 static int mlx4_init_fw(struct mlx4_dev *dev)
 {
 	struct mlx4_mod_stat_cfg   mlx4_cfg;
@@ -1743,6 +1833,10 @@ static int mlx4_init_hca(struct mlx4_dev *dev)
 		choose_steering_mode(dev, &dev_cap);
 		choose_tunnel_offload_mode(dev, &dev_cap);
 
+		if (dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_STATIC &&
+		    mlx4_is_master(dev))
+			dev->caps.function_caps |= MLX4_FUNC_CAP_DMFS_A0_STATIC;
+
 		err = mlx4_get_phys_port_id(dev);
 		if (err)
 			mlx4_err(dev, "Fail to get physical port id\n");
@@ -1829,6 +1923,24 @@ static int mlx4_init_hca(struct mlx4_dev *dev)
 				mlx4_err(dev, "Failed to map internal clock. Timestamping is not supported\n");
 			}
 		}
+
+		if (dev->caps.dmfs_high_steer_mode !=
+		    MLX4_STEERING_DMFS_A0_NOT_SUPPORTED) {
+			if (mlx4_validate_optimized_steering(dev))
+				mlx4_warn(dev, "Optimized steering validation failed\n");
+
+			if (dev->caps.dmfs_high_steer_mode ==
+			    MLX4_STEERING_DMFS_A0_DISABLE) {
+				dev->caps.dmfs_high_rate_qpn_base =
+					dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW];
+				dev->caps.dmfs_high_rate_qpn_range =
+					MLX4_A0_STEERING_TABLE_SIZE;
+			}
+
+			mlx4_dbg(dev, "DMFS high rate steer mode is: %s\n",
+				 dmfs_high_rate_steering_mode_str(
+					dev->caps.dmfs_high_steer_mode));
+		}
 	} else {
 		err = mlx4_init_slave(dev);
 		if (err) {
@@ -3201,10 +3313,11 @@ static int __init mlx4_verify_params(void)
 		port_type_array[0] = true;
 	}
 
-	if (mlx4_log_num_mgm_entry_size != -1 &&
-	    (mlx4_log_num_mgm_entry_size < MLX4_MIN_MGM_LOG_ENTRY_SIZE ||
-	     mlx4_log_num_mgm_entry_size > MLX4_MAX_MGM_LOG_ENTRY_SIZE)) {
-		pr_warn("mlx4_core: mlx4_log_num_mgm_entry_size (%d) not in legal range (-1 or %d..%d)\n",
+	if (mlx4_log_num_mgm_entry_size < -7 ||
+	    (mlx4_log_num_mgm_entry_size > 0 &&
+	     (mlx4_log_num_mgm_entry_size < MLX4_MIN_MGM_LOG_ENTRY_SIZE ||
+	      mlx4_log_num_mgm_entry_size > MLX4_MAX_MGM_LOG_ENTRY_SIZE))) {
+		pr_warn("mlx4_core: mlx4_log_num_mgm_entry_size (%d) not in legal range (-7..0 or %d..%d)\n",
 			mlx4_log_num_mgm_entry_size,
 			MLX4_MIN_MGM_LOG_ENTRY_SIZE,
 			MLX4_MAX_MGM_LOG_ENTRY_SIZE);
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index cebd118..bdd4eea 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -689,8 +689,6 @@ enum mlx4_qp_table_zones {
 	MLX4_QP_TABLE_ZONE_NUM
 };
 
-#define MLX4_A0_STEERING_TABLE_SIZE    256
-
 struct mlx4_qp_table {
 	struct mlx4_bitmap	*bitmap_gen;
 	struct mlx4_zone_allocator *zones;
diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c
index d8d040c..1586ecc 100644
--- a/drivers/net/ethernet/mellanox/mlx4/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx4/qp.c
@@ -712,8 +712,8 @@ int mlx4_init_qp_table(struct mlx4_dev *dev)
 	int k;
 	int fixed_reserved_from_bot_rv = 0;
 	int bottom_reserved_for_rss_bitmap;
-	u32 max_table_offset = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] +
-		MLX4_A0_STEERING_TABLE_SIZE;
+	u32 max_table_offset = dev->caps.dmfs_high_rate_qpn_base +
+			dev->caps.dmfs_high_rate_qpn_range;
 
 	spin_lock_init(&qp_table->lock);
 	INIT_RADIX_TREE(&dev->qp_table_tree, GFP_ATOMIC);
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 39890cd..25c791e 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -117,6 +117,14 @@ enum {
 	MLX4_STEERING_MODE_DEVICE_MANAGED
 };
 
+enum {
+	MLX4_STEERING_DMFS_A0_DEFAULT,
+	MLX4_STEERING_DMFS_A0_DYNAMIC,
+	MLX4_STEERING_DMFS_A0_STATIC,
+	MLX4_STEERING_DMFS_A0_DISABLE,
+	MLX4_STEERING_DMFS_A0_NOT_SUPPORTED
+};
+
 static inline const char *mlx4_steering_mode_str(int steering_mode)
 {
 	switch (steering_mode) {
@@ -191,7 +199,8 @@ enum {
 	MLX4_DEV_CAP_FLAG2_ETH_BACKPL_AN_REP	= 1LL <<  15,
 	MLX4_DEV_CAP_FLAG2_CONFIG_DEV		= 1LL <<  16,
 	MLX4_DEV_CAP_FLAG2_SYS_EQS		= 1LL <<  17,
-	MLX4_DEV_CAP_FLAG2_80_VFS		= 1LL <<  18
+	MLX4_DEV_CAP_FLAG2_80_VFS		= 1LL <<  18,
+	MLX4_DEV_CAP_FLAG2_FS_A0		= 1LL <<  19
 };
 
 enum {
@@ -225,7 +234,8 @@ enum {
 
 enum {
 	MLX4_FUNC_CAP_64B_EQE_CQE	= 1L << 0,
-	MLX4_FUNC_CAP_EQE_CQE_STRIDE	= 1L << 1
+	MLX4_FUNC_CAP_EQE_CQE_STRIDE	= 1L << 1,
+	MLX4_FUNC_CAP_DMFS_A0_STATIC	= 1L << 2
 };
 
 
@@ -482,6 +492,7 @@ struct mlx4_caps {
 	int			reserved_mcgs;
 	int			num_qp_per_mgm;
 	int			steering_mode;
+	int			dmfs_high_steer_mode;
 	int			fs_log_max_ucast_qp_range_size;
 	int			num_pds;
 	int			reserved_pds;
@@ -522,6 +533,8 @@ struct mlx4_caps {
 	int			tunnel_offload_mode;
 	u8			rx_checksum_flags_port[MLX4_MAX_PORTS + 1];
 	u8			alloc_res_qp_mask;
+	u32			dmfs_high_rate_qpn_base;
+	u32			dmfs_high_rate_qpn_range;
 };
 
 struct mlx4_buf_list {
-- 
1.7.1

^ permalink raw reply related

* Re: Where exactly will arch_fast_hash be used
From: Daniel Borkmann @ 2014-12-04 13:14 UTC (permalink / raw)
  To: Hannes Frederic Sowa
  Cc: Herbert Xu, Thomas Graf, David S. Miller, Theodore Ts'o,
	netdev, Linux Kernel Mailing List, fusco
In-Reply-To: <1417696468.5386.23.camel@localhost>

On 12/04/2014 01:34 PM, Hannes Frederic Sowa wrote:
> On Do, 2014-12-04 at 16:11 +0800, Herbert Xu wrote:
>> While working on rhashtable it came to me that this whole concept
>> of arch_fast_hash is flawed.  CRCs are linear functions so it's
>> fairly easy for an attacker to identify collisions or at least
>> eliminate a large amount of search space (e.g., controlling the
>> last bit of the hash result is almost trivial, even when you add
>> a random seed).
>>
>> So what exactly are we going to use arch_fast_hash for? Presumably
>> it's places where security is never going to be an issue, right?

The original proposal [1] targeted ovs-only as a closed-door user in
order to speed up the worst case of calculating a hash over the extracted
flow key, that is, struct sw_flow_key (which nowadays consumes up to
7 cachelines on x86_64 ...).

   [1] http://thread.gmane.org/gmane.linux.network/293981/

>> Even if security wasn't an issue, straight CRC32 has really poor
>> lower-order bit distribution, which makes it a terrible choice for
>> a hash table that simply uses the lower-order bits.
>
> I wondered the same while trying to use arch_fast_hash in a lot more
> places (I did a new implementation in assembler I'll send later on, it
> is mostly optimized to deal with ovs flow keys).
>
> While the uniformity of crc32 does actually look good and IMHO this even
> holds for the lower bits of the hash, I totally agree on the linearity
> matters.
>
> The easiest way to make arch_fast_hash non-linear would be to build up
> on the crc32 instruction like e.g. the cityhash function family does and
> it seems not too hard to do that by combining two crc32c outputs of the
> original and cyclic shifted input data. I have doubts if this is faster
> than jhash in the end. There are proposals from Intel to do so, but they
> are patent encumbered. :/
>
> For most consumers in the networking stack, security and DoS resistance
> is an issue. OVS, for which this was designed at first does do rehashing
> from time to time, but still there is a possible DoS attack vector with
> this hashing algorithm.

^ permalink raw reply

* [PATCH net-next 08/10] net/mlx4_core: Add explicit error message when rule doesn't meet configuration
From: Or Gerlitz @ 2014-12-04 13:13 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Matan Barak, Amir Vadai, Tal Alon, Jack Morgenstein,
	Or Gerlitz
In-Reply-To: <1417698835-11050-1-git-send-email-ogerlitz@mellanox.com>

From: Matan Barak <matanb@mellanox.com>

When a given flow steering rule is invalid with respect to the current
steering configuration, print the correct error message to the system log.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/mcg.c |   21 ++++++++++++++++++---
 1 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/mcg.c b/drivers/net/ethernet/mellanox/mlx4/mcg.c
index 8728431..a3867e7 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mcg.c
+++ b/drivers/net/ethernet/mellanox/mlx4/mcg.c
@@ -999,12 +999,27 @@ int mlx4_flow_attach(struct mlx4_dev *dev,
 	}
 
 	ret = mlx4_QP_FLOW_STEERING_ATTACH(dev, mailbox, size >> 2, reg_id);
-	if (ret == -ENOMEM)
+	if (ret == -ENOMEM) {
 		mlx4_err_rule(dev,
 			      "mcg table is full. Fail to register network rule\n",
 			      rule);
-	else if (ret)
-		mlx4_err_rule(dev, "Fail to register network rule\n", rule);
+	} else if (ret) {
+		if (ret == -ENXIO) {
+			if (dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED)
+				mlx4_err_rule(dev,
+					      "DMFS is not enabled, "
+					      "failed to register network rule.\n",
+					      rule);
+			else
+				mlx4_err_rule(dev,
+					      "Rule exceeds the dmfs_high_rate_mode limitations, "
+					      "failed to register network rule.\n",
+					      rule);
+
+		} else {
+			mlx4_err_rule(dev, "Fail to register network rule.\n", rule);
+		}
+	}
 
 	mlx4_free_cmd_mailbox(dev, mailbox);
 
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next 04/10] net/mlx4: Change QP allocation scheme
From: Or Gerlitz @ 2014-12-04 13:13 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Matan Barak, Amir Vadai, Tal Alon, Jack Morgenstein,
	Eugenia Emantayev, Or Gerlitz
In-Reply-To: <1417698835-11050-1-git-send-email-ogerlitz@mellanox.com>

From: Eugenia Emantayev <eugenia@mellanox.co.il>

When using BF (Blue-Flame), the QPN overrides the VLAN, CV, and SV fields
in the WQE. Thus, BF may only be used for QPNs with bits 6,7 unset.

The current Ethernet driver code reserves a Tx QP range with 256b alignment.

This is wrong because if there are more than 64 Tx QPs in use,
QPNs >= base + 65 will have bits 6/7 set.

This problem is not specific to the Ethernet driver, any entity that
tries to reserve more than 64 BF-enabled QPs should fail. Also, using
ranges is not necessary here and is wasteful.

The new mechanism introduced here will support reservation for
"Eth QPs eligible for BF" for all drivers: bare-metal, multi-PF, and VFs
(when hypervisors support WC in VMs). The flow we use is:

1. In mlx4_en, allocate Tx QPs one by one instead of a range allocation,
   and request "BF enabled QPs" if BF is supported for the function

2. In the ALLOC_RES FW command, change param1 to:
a. param1[23:0]  - number of QPs
b. param1[31-24] - flags controlling QPs reservation

Bit 31 refers to Eth blueflame supported QPs. Those QPs must have
bits 6 and 7 unset in order to be used in Ethernet.

Bits 24-30 of the flags are currently reserved.

When a function tries to allocate a QP, it states the required attributes
for this QP. Those attributes are considered "best-effort". If an attribute,
such as Ethernet BF enabled QP, is a must-have attribute, the function has
to check that attribute is supported before trying to do the allocation.

In a lower layer of the code, mlx4_qp_reserve_range masks out the bits
which are unsupported. If SRIOV is used, the PF validates those attributes
and masks out unsupported attributes as well. In order to notify VFs which
attributes are supported, the VF uses QUERY_FUNC_CAP command. This command's
mailbox is filled by the PF, which notifies which QP allocation attributes
it supports.


Signed-off-by: Eugenia Emantayev <eugenia@mellanox.co.il>
Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/infiniband/hw/mlx4/main.c                  |    2 +-
 drivers/infiniband/hw/mlx4/qp.c                    |   11 +++--
 drivers/net/ethernet/mellanox/mlx4/alloc.c         |   43 +++++++++++++++++---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c     |   10 +----
 drivers/net/ethernet/mellanox/mlx4/en_rx.c         |    4 +-
 drivers/net/ethernet/mellanox/mlx4/en_tx.c         |   14 +++++-
 drivers/net/ethernet/mellanox/mlx4/fw.c            |   20 +++++++++-
 drivers/net/ethernet/mellanox/mlx4/fw.h            |    1 +
 drivers/net/ethernet/mellanox/mlx4/main.c          |   11 +++++-
 drivers/net/ethernet/mellanox/mlx4/mlx4.h          |    5 +-
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h       |    2 +-
 drivers/net/ethernet/mellanox/mlx4/qp.c            |   24 +++++++++--
 .../net/ethernet/mellanox/mlx4/resource_tracker.c  |    7 +++-
 include/linux/mlx4/device.h                        |   21 +++++++++-
 14 files changed, 137 insertions(+), 38 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 0c33755..57ecc5b 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -2227,7 +2227,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 		ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS;
 		err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count,
 					    MLX4_IB_UC_STEER_QPN_ALIGN,
-					    &ibdev->steer_qpn_base);
+					    &ibdev->steer_qpn_base, 0);
 		if (err)
 			goto err_counter;
 
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 9c5150c..506d1bd 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -802,16 +802,19 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 			}
 		}
 	} else {
-		/* Raw packet QPNs must be aligned to 8 bits. If not, the WQE
-		 * BlueFlame setup flow wrongly causes VLAN insertion. */
+		/* Raw packet QPNs may not have bits 6,7 set in their qp_num;
+		 * otherwise, the WQE BlueFlame setup flow wrongly causes
+		 * VLAN insertion. */
 		if (init_attr->qp_type == IB_QPT_RAW_PACKET)
-			err = mlx4_qp_reserve_range(dev->dev, 1, 1 << 8, &qpn);
+			err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn,
+						    init_attr->cap.max_send_wr ?
+						    MLX4_RESERVE_ETH_BF_QP : 0);
 		else
 			if (qp->flags & MLX4_IB_QP_NETIF)
 				err = mlx4_ib_steer_qp_alloc(dev, 1, &qpn);
 			else
 				err = mlx4_qp_reserve_range(dev->dev, 1, 1,
-							    &qpn);
+							    &qpn, 0);
 		if (err)
 			goto err_proxy;
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx4/alloc.c b/drivers/net/ethernet/mellanox/mlx4/alloc.c
index b0297da..91a8acc 100644
--- a/drivers/net/ethernet/mellanox/mlx4/alloc.c
+++ b/drivers/net/ethernet/mellanox/mlx4/alloc.c
@@ -76,22 +76,53 @@ void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj, int use_rr)
 	mlx4_bitmap_free_range(bitmap, obj, 1, use_rr);
 }
 
-u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align)
+static unsigned long find_aligned_range(unsigned long *bitmap,
+					u32 start, u32 nbits,
+					int len, int align, u32 skip_mask)
+{
+	unsigned long end, i;
+
+again:
+	start = ALIGN(start, align);
+
+	while ((start < nbits) && (test_bit(start, bitmap) ||
+				   (start & skip_mask)))
+		start += align;
+
+	if (start >= nbits)
+		return -1;
+
+	end = start+len;
+	if (end > nbits)
+		return -1;
+
+	for (i = start + 1; i < end; i++) {
+		if (test_bit(i, bitmap) || ((u32)i & skip_mask)) {
+			start = i + 1;
+			goto again;
+		}
+	}
+
+	return start;
+}
+
+u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt,
+			    int align, u32 skip_mask)
 {
 	u32 obj;
 
-	if (likely(cnt == 1 && align == 1))
+	if (likely(cnt == 1 && align == 1 && !skip_mask))
 		return mlx4_bitmap_alloc(bitmap);
 
 	spin_lock(&bitmap->lock);
 
-	obj = bitmap_find_next_zero_area(bitmap->table, bitmap->max,
-				bitmap->last, cnt, align - 1);
+	obj = find_aligned_range(bitmap->table, bitmap->last,
+				 bitmap->max, cnt, align, skip_mask);
 	if (obj >= bitmap->max) {
 		bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top)
 				& bitmap->mask;
-		obj = bitmap_find_next_zero_area(bitmap->table, bitmap->max,
-						0, cnt, align - 1);
+		obj = find_aligned_range(bitmap->table, 0, bitmap->max,
+					 cnt, align, skip_mask);
 	}
 
 	if (obj < bitmap->max) {
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index b7c9978..6537631 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -595,7 +595,7 @@ static int mlx4_en_get_qp(struct mlx4_en_priv *priv)
 		return 0;
 	}
 
-	err = mlx4_qp_reserve_range(dev, 1, 1, qpn);
+	err = mlx4_qp_reserve_range(dev, 1, 1, qpn, 0);
 	en_dbg(DRV, priv, "Reserved qp %d\n", *qpn);
 	if (err) {
 		en_err(priv, "Failed to reserve qp for mac registration\n");
@@ -1974,15 +1974,8 @@ int mlx4_en_alloc_resources(struct mlx4_en_priv *priv)
 {
 	struct mlx4_en_port_profile *prof = priv->prof;
 	int i;
-	int err;
 	int node;
 
-	err = mlx4_qp_reserve_range(priv->mdev->dev, priv->tx_ring_num, 256, &priv->base_tx_qpn);
-	if (err) {
-		en_err(priv, "failed reserving range for TX rings\n");
-		return err;
-	}
-
 	/* Create tx Rings */
 	for (i = 0; i < priv->tx_ring_num; i++) {
 		node = cpu_to_node(i % num_online_cpus());
@@ -1991,7 +1984,6 @@ int mlx4_en_alloc_resources(struct mlx4_en_priv *priv)
 			goto err;
 
 		if (mlx4_en_create_tx_ring(priv, &priv->tx_ring[i],
-					   priv->base_tx_qpn + i,
 					   prof->tx_ring_size, TXBB_SIZE,
 					   node, i))
 			goto err;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 3a9f9bf..4862552 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -1132,7 +1132,7 @@ int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv)
 	int err;
 	u32 qpn;
 
-	err = mlx4_qp_reserve_range(priv->mdev->dev, 1, 1, &qpn);
+	err = mlx4_qp_reserve_range(priv->mdev->dev, 1, 1, &qpn, 0);
 	if (err) {
 		en_err(priv, "Failed reserving drop qpn\n");
 		return err;
@@ -1175,7 +1175,7 @@ int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv)
 	en_dbg(DRV, priv, "Configuring rss steering\n");
 	err = mlx4_qp_reserve_range(mdev->dev, priv->rx_ring_num,
 				    priv->rx_ring_num,
-				    &rss_map->base_qpn);
+				    &rss_map->base_qpn, 0);
 	if (err) {
 		en_err(priv, "Failed reserving %d qps\n", priv->rx_ring_num);
 		return err;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index d0cecbd..a308d41 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -46,7 +46,7 @@
 #include "mlx4_en.h"
 
 int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
-			   struct mlx4_en_tx_ring **pring, int qpn, u32 size,
+			   struct mlx4_en_tx_ring **pring, u32 size,
 			   u16 stride, int node, int queue_index)
 {
 	struct mlx4_en_dev *mdev = priv->mdev;
@@ -112,11 +112,17 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
 	       ring, ring->buf, ring->size, ring->buf_size,
 	       (unsigned long long) ring->wqres.buf.direct.map);
 
-	ring->qpn = qpn;
+	err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &ring->qpn,
+				    MLX4_RESERVE_ETH_BF_QP);
+	if (err) {
+		en_err(priv, "failed reserving qp for TX ring\n");
+		goto err_map;
+	}
+
 	err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp, GFP_KERNEL);
 	if (err) {
 		en_err(priv, "Failed allocating qp %d\n", ring->qpn);
-		goto err_map;
+		goto err_reserve;
 	}
 	ring->qp.event = mlx4_en_sqp_event;
 
@@ -143,6 +149,8 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
 	*pring = ring;
 	return 0;
 
+err_reserve:
+	mlx4_qp_release_range(mdev->dev, ring->qpn, 1);
 err_map:
 	mlx4_en_unmap_buffer(&ring->wqres.buf);
 err_hwq_res:
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index 8c9ea70..745deb7 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -266,10 +266,15 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
 #define QUERY_FUNC_CAP_MTT_QUOTA_OFFSET		0x64
 #define QUERY_FUNC_CAP_MCG_QUOTA_OFFSET		0x68
 
+#define QUERY_FUNC_CAP_EXTRA_FLAGS_OFFSET	0x6c
+
 #define QUERY_FUNC_CAP_FMR_FLAG			0x80
 #define QUERY_FUNC_CAP_FLAG_RDMA		0x40
 #define QUERY_FUNC_CAP_FLAG_ETH			0x80
 #define QUERY_FUNC_CAP_FLAG_QUOTAS		0x10
+#define QUERY_FUNC_CAP_FLAG_VALID_MAILBOX	0x04
+
+#define QUERY_FUNC_CAP_EXTRA_FLAGS_BF_QP_ALLOC_FLAG	(1UL << 31)
 
 /* when opcode modifier = 1 */
 #define QUERY_FUNC_CAP_PHYS_PORT_OFFSET		0x3
@@ -339,7 +344,7 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
 			mlx4_get_active_ports(dev, slave);
 		/* enable rdma and ethernet interfaces, and new quota locations */
 		field = (QUERY_FUNC_CAP_FLAG_ETH | QUERY_FUNC_CAP_FLAG_RDMA |
-			 QUERY_FUNC_CAP_FLAG_QUOTAS);
+			 QUERY_FUNC_CAP_FLAG_QUOTAS | QUERY_FUNC_CAP_FLAG_VALID_MAILBOX);
 		MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS_OFFSET);
 
 		field = min(
@@ -401,6 +406,8 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
 		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET);
 		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET_DEP);
 
+		size = QUERY_FUNC_CAP_EXTRA_FLAGS_BF_QP_ALLOC_FLAG;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_EXTRA_FLAGS_OFFSET);
 	} else
 		err = -EINVAL;
 
@@ -493,6 +500,17 @@ int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u8 gen_or_port,
 		MLX4_GET(size, outbox, QUERY_FUNC_CAP_RESERVED_EQ_OFFSET);
 		func_cap->reserved_eq = size & 0xFFFFFF;
 
+		func_cap->extra_flags = 0;
+
+		/* Mailbox data from 0x6c and onward should only be treated if
+		 * QUERY_FUNC_CAP_FLAG_VALID_MAILBOX is set in func_cap->flags
+		 */
+		if (func_cap->flags & QUERY_FUNC_CAP_FLAG_VALID_MAILBOX) {
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_EXTRA_FLAGS_OFFSET);
+			if (size & QUERY_FUNC_CAP_EXTRA_FLAGS_BF_QP_ALLOC_FLAG)
+				func_cap->extra_flags |= MLX4_QUERY_FUNC_FLAGS_BF_RES_QP;
+		}
+
 		goto out;
 	}
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h
index 475215e..0e910a4 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.h
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.h
@@ -144,6 +144,7 @@ struct mlx4_func_cap {
 	u8	port_flags;
 	u8	flags1;
 	u64	phys_port_id;
+	u32	extra_flags;
 };
 
 struct mlx4_func {
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 3044f9e..6a9a941 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -466,8 +466,13 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	    mlx4_is_master(dev))
 		dev->caps.function_caps |= MLX4_FUNC_CAP_64B_EQE_CQE;
 
-	if (!mlx4_is_slave(dev))
+	if (!mlx4_is_slave(dev)) {
 		mlx4_enable_cqe_eqe_stride(dev);
+		dev->caps.alloc_res_qp_mask =
+			(dev->caps.bf_reg_size ? MLX4_RESERVE_ETH_BF_QP : 0);
+	} else {
+		dev->caps.alloc_res_qp_mask = 0;
+	}
 
 	return 0;
 }
@@ -817,6 +822,10 @@ static int mlx4_slave_cap(struct mlx4_dev *dev)
 
 	slave_adjust_steering_mode(dev, &dev_cap, &hca_param);
 
+	if (func_cap.extra_flags & MLX4_QUERY_FUNC_FLAGS_BF_RES_QP &&
+	    dev->caps.bf_reg_size)
+		dev->caps.alloc_res_qp_mask |= MLX4_RESERVE_ETH_BF_QP;
+
 	return 0;
 
 err_mem:
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index b67ef48..6834da6 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -884,7 +884,8 @@ extern struct workqueue_struct *mlx4_wq;
 
 u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap);
 void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj, int use_rr);
-u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align);
+u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt,
+			    int align, u32 skip_mask);
 void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt,
 			    int use_rr);
 u32 mlx4_bitmap_avail(struct mlx4_bitmap *bitmap);
@@ -970,7 +971,7 @@ int mlx4_DMA_wrapper(struct mlx4_dev *dev, int slave,
 		     struct mlx4_cmd_mailbox *outbox,
 		     struct mlx4_cmd_info *cmd);
 int __mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align,
-			    int *base);
+			    int *base, u8 flags);
 void __mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt);
 int __mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac);
 void __mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac);
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index aaa7efb..576dd07 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -778,7 +778,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev);
 
 int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
 			   struct mlx4_en_tx_ring **pring,
-			   int qpn, u32 size, u16 stride,
+			   u32 size, u16 stride,
 			   int node, int queue_index);
 void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv,
 			     struct mlx4_en_tx_ring **pring);
diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c
index 2301365..40e82ed 100644
--- a/drivers/net/ethernet/mellanox/mlx4/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx4/qp.c
@@ -42,6 +42,10 @@
 #include "mlx4.h"
 #include "icm.h"
 
+/* QP to support BF should have bits 6,7 cleared */
+#define MLX4_BF_QP_SKIP_MASK	0xc0
+#define MLX4_MAX_BF_QP_RANGE	0x40
+
 void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type)
 {
 	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
@@ -207,26 +211,36 @@ int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
 EXPORT_SYMBOL_GPL(mlx4_qp_modify);
 
 int __mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align,
-				   int *base)
+			    int *base, u8 flags)
 {
+	int bf_qp = !!(flags & (u8)MLX4_RESERVE_ETH_BF_QP);
+
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct mlx4_qp_table *qp_table = &priv->qp_table;
 
-	*base = mlx4_bitmap_alloc_range(&qp_table->bitmap, cnt, align);
+	if (cnt > MLX4_MAX_BF_QP_RANGE && bf_qp)
+		return -ENOMEM;
+
+	*base = mlx4_bitmap_alloc_range(&qp_table->bitmap, cnt, align,
+					bf_qp ? MLX4_BF_QP_SKIP_MASK : 0);
 	if (*base == -1)
 		return -ENOMEM;
 
 	return 0;
 }
 
-int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base)
+int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align,
+			  int *base, u8 flags)
 {
 	u64 in_param = 0;
 	u64 out_param;
 	int err;
 
+	/* Turn off all unsupported QP allocation flags */
+	flags &= dev->caps.alloc_res_qp_mask;
+
 	if (mlx4_is_mfunc(dev)) {
-		set_param_l(&in_param, cnt);
+		set_param_l(&in_param, (((u32)flags) << 24) | (u32)cnt);
 		set_param_h(&in_param, align);
 		err = mlx4_cmd_imm(dev, in_param, &out_param,
 				   RES_QP, RES_OP_RESERVE,
@@ -238,7 +252,7 @@ int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base)
 		*base = get_param_l(&out_param);
 		return 0;
 	}
-	return __mlx4_qp_reserve_range(dev, cnt, align, base);
+	return __mlx4_qp_reserve_range(dev, cnt, align, base, flags);
 }
 EXPORT_SYMBOL_GPL(mlx4_qp_reserve_range);
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
index 16f617b..4efbd1e 100644
--- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
+++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
@@ -1543,16 +1543,21 @@ static int qp_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
 	int align;
 	int base;
 	int qpn;
+	u8 flags;
 
 	switch (op) {
 	case RES_OP_RESERVE:
 		count = get_param_l(&in_param) & 0xffffff;
+		/* Turn off all unsupported QP allocation flags that the
+		 * slave tries to set.
+		 */
+		flags = (get_param_l(&in_param) >> 24) & dev->caps.alloc_res_qp_mask;
 		align = get_param_h(&in_param);
 		err = mlx4_grant_resource(dev, slave, RES_QP, count, 0);
 		if (err)
 			return err;
 
-		err = __mlx4_qp_reserve_range(dev, count, align, &base);
+		err = __mlx4_qp_reserve_range(dev, count, align, &base, flags);
 		if (err) {
 			mlx4_release_resource(dev, slave, RES_QP, count, 0);
 			return err;
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 3951b53..272aa25 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -195,6 +195,22 @@ enum {
 };
 
 enum {
+	MLX4_QUERY_FUNC_FLAGS_BF_RES_QP		= 1LL << 0
+};
+
+/* bit enums for an 8-bit flags field indicating special use
+ * QPs which require special handling in qp_reserve_range.
+ * Currently, this only includes QPs used by the ETH interface,
+ * where we expect to use blueflame.  These QPs must not have
+ * bits 6 and 7 set in their qp number.
+ *
+ * This enum may use only bits 0..7.
+ */
+enum {
+	MLX4_RESERVE_ETH_BF_QP	= 1 << 7,
+};
+
+enum {
 	MLX4_DEV_CAP_64B_EQE_ENABLED	= 1LL << 0,
 	MLX4_DEV_CAP_64B_CQE_ENABLED	= 1LL << 1,
 	MLX4_DEV_CAP_CQE_STRIDE_ENABLED	= 1LL << 2,
@@ -501,6 +517,7 @@ struct mlx4_caps {
 	u64			phys_port_id[MLX4_MAX_PORTS + 1];
 	int			tunnel_offload_mode;
 	u8			rx_checksum_flags_port[MLX4_MAX_PORTS + 1];
+	u8			alloc_res_qp_mask;
 };
 
 struct mlx4_buf_list {
@@ -950,8 +967,8 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt,
 		  struct mlx4_uar *uar, u64 db_rec, struct mlx4_cq *cq,
 		  unsigned vector, int collapsed, int timestamp_en);
 void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq);
-
-int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base);
+int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align,
+			  int *base, u8 flags);
 void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt);
 
 int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp,
-- 
1.7.1

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox