Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next] sctp: refactor sctp_datamsg_from_user
From: Marcelo Ricardo Leitner @ 2016-12-29 17:53 UTC (permalink / raw)
  To: netdev; +Cc: linux-sctp, Neil Horman, Vlad Yasevich

This patch refactors sctp_datamsg_from_user() in an attempt to make it
better to read and avoid code duplication for handling the last
fragment.

It also avoids doing division and remaining operations. Even though, it
should still operate similarly as before this patch.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
---
 net/sctp/chunk.c | 107 +++++++++++++++++--------------------------------------
 1 file changed, 32 insertions(+), 75 deletions(-)

diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 615f0ddd41dfb1ff46a9d4e564716de8e7b60ea6..e3621cb4827fadb5f5cb41ebe8455dfa3300a765 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -165,14 +165,12 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 					    struct sctp_sndrcvinfo *sinfo,
 					    struct iov_iter *from)
 {
-	int max, whole, i, offset, over, err;
-	int len, first_len;
-	int max_data;
+	size_t len, first_len, max_data, remaining;
+	size_t msg_len = iov_iter_count(from);
+	struct list_head *pos, *temp;
 	struct sctp_chunk *chunk;
 	struct sctp_datamsg *msg;
-	struct list_head *pos, *temp;
-	size_t msg_len = iov_iter_count(from);
-	__u8 frag;
+	int err;
 
 	msg = sctp_datamsg_new(GFP_KERNEL);
 	if (!msg)
@@ -185,7 +183,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 	    (SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags) ||
 	     !SCTP_PR_POLICY(sinfo->sinfo_flags)))
 		msg->expires_at = jiffies +
-				    msecs_to_jiffies(sinfo->sinfo_timetolive);
+				  msecs_to_jiffies(sinfo->sinfo_timetolive);
 
 	/* This is the biggest possible DATA chunk that can fit into
 	 * the packet
@@ -195,7 +193,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 		   sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk);
 	max_data = SCTP_TRUNC4(max_data);
 
-	max = asoc->frag_point;
 	/* If the the peer requested that we authenticate DATA chunks
 	 * we need to account for bundling of the AUTH chunks along with
 	 * DATA.
@@ -208,12 +205,11 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 					      hmac_desc->hmac_len);
 	}
 
-	/* Now, check if we need to reduce our max */
-	if (max > max_data)
-		max = max_data;
+	/* Check what's our max considering the above */
+	max_data = min_t(size_t, max_data, asoc->frag_point);
 
-	whole = 0;
-	first_len = max;
+	/* Set first_len and then account for possible bundles on first frag */
+	first_len = max_data;
 
 	/* Check to see if we have a pending SACK and try to let it be bundled
 	 * with this message.  Do this if we don't have any data queued already.
@@ -224,40 +220,38 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 	if (timer_pending(&asoc->timers[SCTP_EVENT_TIMEOUT_SACK]) &&
 	    asoc->outqueue.out_qlen == 0 &&
 	    list_empty(&asoc->outqueue.retransmit) &&
-	    msg_len > max)
-		max_data -= SCTP_PAD4(sizeof(sctp_sack_chunk_t));
+	    msg_len > max_data)
+		first_len -= SCTP_PAD4(sizeof(sctp_sack_chunk_t));
 
 	/* Encourage Cookie-ECHO bundling. */
 	if (asoc->state < SCTP_STATE_COOKIE_ECHOED)
-		max_data -= SCTP_ARBITRARY_COOKIE_ECHO_LEN;
-
-	/* Now that we adjusted completely, reset first_len */
-	if (first_len > max_data)
-		first_len = max_data;
+		first_len -= SCTP_ARBITRARY_COOKIE_ECHO_LEN;
 
 	/* Account for a different sized first fragment */
 	if (msg_len >= first_len) {
-		msg_len -= first_len;
-		whole = 1;
 		msg->can_delay = 0;
-	}
-
-	/* How many full sized?  How many bytes leftover? */
-	whole += msg_len / max;
-	over = msg_len % max;
-	offset = 0;
-
-	if ((whole > 1) || (whole && over))
 		SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_FRAGUSRMSGS);
+	} else {
+		/* Which may be the only one... */
+		first_len = msg_len;
+	}
 
-	/* Create chunks for all the full sized DATA chunks. */
-	for (i = 0, len = first_len; i < whole; i++) {
-		frag = SCTP_DATA_MIDDLE_FRAG;
+	/* Create chunks for all DATA chunks. */
+	for (remaining = msg_len; remaining; remaining -= len) {
+		u8 frag = SCTP_DATA_MIDDLE_FRAG;
 
-		if (0 == i)
+		if (remaining == msg_len) {
+			/* First frag, which may also be the last */
 			frag |= SCTP_DATA_FIRST_FRAG;
+			len = first_len;
+		} else {
+			/* Middle frags */
+			len = max_data;
+		}
 
-		if ((i == (whole - 1)) && !over) {
+		if (len >= remaining) {
+			/* Last frag, which may also be the first */
+			len = remaining;
 			frag |= SCTP_DATA_LAST_FRAG;
 
 			/* The application requests to set the I-bit of the
@@ -271,7 +265,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 
 		chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag,
 						 0, GFP_KERNEL);
-
 		if (!chunk) {
 			err = -ENOMEM;
 			goto errout;
@@ -282,45 +275,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 			goto errout_chunk_free;
 
 		/* Put the chunk->skb back into the form expected by send.  */
-		__skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr
-			   - (__u8 *)chunk->skb->data);
-
-		sctp_datamsg_assign(msg, chunk);
-		list_add_tail(&chunk->frag_list, &msg->chunks);
-
-		/* The first chunk, the first chunk was likely short
-		 * to allow bundling, so reset to full size.
-		 */
-		if (0 == i)
-			len = max;
-	}
-
-	/* .. now the leftover bytes. */
-	if (over) {
-		if (!whole)
-			frag = SCTP_DATA_NOT_FRAG;
-		else
-			frag = SCTP_DATA_LAST_FRAG;
-
-		if ((sinfo->sinfo_flags & SCTP_EOF) ||
-		    (sinfo->sinfo_flags & SCTP_SACK_IMMEDIATELY))
-			frag |= SCTP_DATA_SACK_IMM;
-
-		chunk = sctp_make_datafrag_empty(asoc, sinfo, over, frag,
-						 0, GFP_KERNEL);
-
-		if (!chunk) {
-			err = -ENOMEM;
-			goto errout;
-		}
-
-		err = sctp_user_addto_chunk(chunk, over, from);
-
-		/* Put the chunk->skb back into the form expected by send.  */
-		__skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr
-			   - (__u8 *)chunk->skb->data);
-		if (err < 0)
-			goto errout_chunk_free;
+		__skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr -
+				       chunk->skb->data);
 
 		sctp_datamsg_assign(msg, chunk);
 		list_add_tail(&chunk->frag_list, &msg->chunks);
@@ -338,6 +294,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 		sctp_chunk_free(chunk);
 	}
 	sctp_datamsg_put(msg);
+
 	return ERR_PTR(err);
 }
 
-- 
2.9.3

^ permalink raw reply related

* RE: [Open-FCoE] [PATCH RFC net-next 1/5] qed: Add support for hardware offloaded FCoE.
From: Mintz, Yuval @ 2016-12-29 17:28 UTC (permalink / raw)
  To: Hannes Reinecke, Dupuis, Chad, martin.petersen@oracle.com
  Cc: fcoe-devel@open-fcoe.org, netdev@vger.kernel.org,
	Dept-Eng QLogic Storage Upstream, linux-scsi@vger.kernel.org
In-Reply-To: <371bc2ac-c630-0e95-29aa-3fa16fe0c764@suse.de>

> > +struct fcoe_tstorm_fcoe_task_st_ctx_read_write {
> > +	union fcoe_cleanup_addr_exp_ro_union
> cleanup_addr_exp_ro_union;
> > +	__le16 flags;
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RX_SGL_MODE_MASK
> 0x7
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RX_SGL_MODE_SHIFT      0
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_EXP_FIRST_FRAME_MASK
> 0x1
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_EXP_FIRST_FRAME_SHIFT
> 3
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_ACTIVE_MASK
> 0x1
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_ACTIVE_SHIFT       4
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_TIMEOUT_MASK
> 0x1
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_TIMEOUT_SHIFT      5
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SINGLE_PKT_IN_EX_MASK
> 0x1
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SINGLE_PKT_IN_EX_SHIFT
> 6
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_OOO_RX_SEQ_STAT_MAS
> K   0x1
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_OOO_RX_SEQ_STAT_SHIFT
> 7
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_CQ_ADD_ADV_MASK
> 0x3
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_CQ_ADD_ADV_SHIFT       8
> > +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RSRV1_MASK
> 0x3F
> > +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RSRV1_SHIFT
> 10
> 
> A very odd way of defining a bitfield ...
> Why not use a 'normal' bitfield here?

This is the format of our generated firmware HSI; We already have
Thousands of definitions using this same format.

^ permalink raw reply

* [PATCH v1 1/2] bpf: add a longest prefix match trie map implementation
From: Daniel Mack @ 2016-12-29 17:28 UTC (permalink / raw)
  To: ast; +Cc: dh.herrmann, daniel, netdev, davem, Daniel Mack
In-Reply-To: <20161229172855.14910-1-daniel@zonque.org>

This trie implements a longest prefix match algorithm that can be used
to match IP addresses to a stored set of ranges.

Internally, data is stored in an unbalanced trie of nodes that has a
maximum height of n, where n is the prefixlen the trie was created
with.

Tries may be created with prefix lengths that are multiples of 8, in
the range from 8 to 2048. The key used for lookup and update operations
is a struct bpf_lpm_trie_key, and the value is a uint64_t.

The code carries more information about the internal implementation.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Reviewed-by: David Herrmann <dh.herrmann@gmail.com>
---
 include/uapi/linux/bpf.h |   7 +
 kernel/bpf/Makefile      |   2 +-
 kernel/bpf/lpm_trie.c    | 468 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 476 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/lpm_trie.c

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0eb0e87..d564277 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -63,6 +63,12 @@ struct bpf_insn {
 	__s32	imm;		/* signed immediate constant */
 };
 
+/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */
+struct bpf_lpm_trie_key {
+	__u32	prefixlen;	/* up to 32 for AF_INET, 128 for AF_INET6 */
+	__u8	data[0];	/* Arbitrary size */
+};
+
 /* BPF syscall commands, see bpf(2) man-page for details. */
 enum bpf_cmd {
 	BPF_MAP_CREATE,
@@ -89,6 +95,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_CGROUP_ARRAY,
 	BPF_MAP_TYPE_LRU_HASH,
 	BPF_MAP_TYPE_LRU_PERCPU_HASH,
+	BPF_MAP_TYPE_LPM_TRIE,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 1276474..e1ce4f4 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,7 +1,7 @@
 obj-y := core.o
 
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o
 ifeq ($(CONFIG_PERF_EVENTS),y)
 obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
 endif
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
new file mode 100644
index 0000000..8b6a61d
--- /dev/null
+++ b/kernel/bpf/lpm_trie.c
@@ -0,0 +1,468 @@
+/*
+ * Longest prefix match list implementation
+ *
+ * Copyright (c) 2016 Daniel Mack
+ * Copyright (c) 2016 David Herrmann
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License.  See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <net/ipv6.h>
+
+/* Intermediate node */
+#define LPM_TREE_NODE_FLAG_IM BIT(0)
+
+struct lpm_trie_node;
+
+struct lpm_trie_node {
+	struct rcu_head rcu;
+	struct lpm_trie_node __rcu	*child[2];
+	u32				prefixlen;
+	u32				flags;
+	u64				value;
+	u8				data[0];
+};
+
+struct lpm_trie {
+	struct bpf_map			map;
+	struct lpm_trie_node __rcu	*root;
+	size_t				n_entries;
+	size_t				max_prefixlen;
+	size_t				data_size;
+	spinlock_t			lock;
+};
+
+/*
+ * This trie implements a longest prefix match algorithm that can be used to
+ * match IP addresses to a stored set of ranges.
+ *
+ * Data stored in @data of struct bpf_lpm_key and struct lpm_trie_node is
+ * interpreted as big endian, so data[0] stores the most significant byte.
+ *
+ * Match ranges are internally stored in instances of struct lpm_trie_node
+ * which each contain their prefix length as well as two pointers that may
+ * lead to more nodes containing more specific matches. Each node also stores
+ * a value that is defined by and returned to userspace via the update_elem
+ * and lookup functions.
+ *
+ * For instance, let's start with a trie that was created with a prefix length
+ * of 32, so it can be used for IPv4 addresses, and one single element that
+ * matches 192.168.0.0/16. The data array would hence contain
+ * [0xc0, 0xa8, 0x00, 0x00] in big-endian notation. This documentation will
+ * stick to IP-address notation for readability though.
+ *
+ * As the trie is empty initially, the new node (1) will be places as root
+ * node, denoted as (R) in the example below. As there are no other node, both
+ * child pointers are %NULL.
+ *
+ *              +----------------+
+ *              |       (1)  (R) |
+ *              | 192.168.0.0/16 |
+ *              |    value: 1    |
+ *              |   [0]    [1]   |
+ *              +----------------+
+ *
+ * Next, let's add a new node (2) matching 192.168.0.0/24. As there is already
+ * a node with the same data and a smaller prefix (ie, a less specific one),
+ * node (2) will become a child of (1). In child index depends on the next bit
+ * that is outside of that (1) matches, and that bit is 0, so (2) will be
+ * child[0] of (1):
+ *
+ *              +----------------+
+ *              |       (1)  (R) |
+ *              | 192.168.0.0/16 |
+ *              |    value: 1    |
+ *              |   [0]    [1]   |
+ *              +----------------+
+ *                   |
+ *    +----------------+
+ *    |       (2)      |
+ *    | 192.168.0.0/24 |
+ *    |    value: 2    |
+ *    |   [0]    [1]   |
+ *    +----------------+
+ *
+ * The child[1] slot of (1) could be filled with another node which has bit #17
+ * (the next bit after the ones that (1) matches on) set to 1. For instance,
+ * 192.168.128.0/24:
+ *
+ *              +----------------+
+ *              |       (1)  (R) |
+ *              | 192.168.0.0/16 |
+ *              |    value: 1    |
+ *              |   [0]    [1]   |
+ *              +----------------+
+ *                   |      |
+ *    +----------------+  +------------------+
+ *    |       (2)      |  |        (3)       |
+ *    | 192.168.0.0/24 |  | 192.168.128.0/24 |
+ *    |    value: 2    |  |     value: 3     |
+ *    |   [0]    [1]   |  |    [0]    [1]    |
+ *    +----------------+  +------------------+
+ *
+ * Let's add another node (4) to the game for 192.168.1.0/24. In order to place
+ * it, node (1) is looked at first, and because (4) of the semantics laid out
+ * above (bit #17 is 0), it would normally be attached to (1) as child[0].
+ * However, that slot is already allocated, so a new node is needed in between.
+ * That node is does not have a value attached to it and it will never be
+ * returned to users as result of a lookup. It is only there to differenciate
+ * the traversal further. It will get a prefix as wide as necessary to
+ * distinguish its two children:
+ *
+ *                      +----------------+
+ *                      |       (1)  (R) |
+ *                      | 192.168.0.0/16 |
+ *                      |    value: 1    |
+ *                      |   [0]    [1]   |
+ *                      +----------------+
+ *                           |      |
+ *            +----------------+  +------------------+
+ *            |       (4)  (I) |  |        (3)       |
+ *            | 192.168.0.0/23 |  | 192.168.128.0/24 |
+ *            |    value: ---  |  |     value: 3     |
+ *            |   [0]    [1]   |  |    [0]    [1]    |
+ *            +----------------+  +------------------+
+ *                 |      |
+ *  +----------------+  +----------------+
+ *  |       (2)      |  |       (5)      |
+ *  | 192.168.0.0/24 |  | 192.168.1.0/24 |
+ *  |    value: 2    |  |     value: 5   |
+ *  |   [0]    [1]   |  |   [0]    [1]   |
+ *  +----------------+  +----------------+
+ *
+ * 192.168.1.1/32 would be a child of (5) etc.
+ *
+ * An intermediate node will be turned into a 'real' node on demand. In the
+ * example above, (4) would be re-used if 192.168.0.0/23 is added to the trie.
+ *
+ * A fully populated trie would have a height of 32 nodes, as the trie was
+ * created with a prefix length of 32.
+ *
+ * The lookup starts at the root node. If the current node matches and if there
+ * is a child that can be used to become more specific, the trie is traversed
+ * downwards. The last node in the traversal that is a non-intermediate one is
+ * returned.
+ */
+
+static inline int extract_bit(const u8 *data, size_t index)
+{
+	return !!(data[index / 8] & (1 << (7 - (index % 8))));
+}
+
+/**
+ * longest_prefix_match() - determine the longest prefix
+ * @trie:	The trie to get internal sizes from
+ * @node:	The node to operate on
+ * @key:	The key to compare to @node
+ *
+ * Determine the longest prefix of @node that matches the bits in @key.
+ */
+static size_t longest_prefix_match(const struct lpm_trie *trie,
+				   const struct lpm_trie_node *node,
+				   const struct bpf_lpm_trie_key *key)
+{
+	size_t prefixlen = 0;
+	int i;
+
+	for (i = 0; i < trie->data_size; i++) {
+		size_t b;
+
+		b = 8 - fls(node->data[i] ^ key->data[i]);
+		prefixlen += b;
+
+		if (prefixlen >= node->prefixlen || prefixlen >= key->prefixlen)
+			return min(node->prefixlen, key->prefixlen);
+
+		if (b < 8)
+			break;
+	}
+
+	return prefixlen;
+}
+
+/* Called from syscall or from eBPF program */
+static void *trie_lookup_elem(struct bpf_map *map, void *_key)
+{
+	struct lpm_trie_node *node, *found = NULL;
+	struct bpf_lpm_trie_key *key = _key;
+	struct lpm_trie *trie =
+		container_of(map, struct lpm_trie, map);
+
+	/* Start walking the trie from the root node ... */
+
+	for (node = rcu_dereference(trie->root); node;) {
+		unsigned int next_bit;
+		size_t matchlen;
+
+		/*
+		 * Determine the longest prefix of @node that matches @key.
+		 * If it's the maximum possible prefix for this trie, we have
+		 * an exact match and can return it directly.
+		 */
+		matchlen = longest_prefix_match(trie, node, key);
+		if (matchlen == trie->max_prefixlen)
+			return &node->value;
+
+		/*
+		 * If the number of bits that match is smaller than the prefix
+		 * length of @node, bail out and return the node we have seen
+		 * last in the traversal (ie, the parent).
+		 */
+		if (matchlen < node->prefixlen)
+			break;
+
+		/*
+		 * Consider this node as return candidate unless it is an
+		 * artificially added intermediate one
+		 */
+		if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
+			found = node;
+
+		/*
+		 * If the node match is fully satisfied, let's see if we can
+		 * become more specific. Determine the next bit in the key and
+		 * traverse down.
+		 */
+		next_bit = extract_bit(key->data, node->prefixlen);
+		node = rcu_dereference(node->child[next_bit]);
+	}
+
+	return found ? &found->value : NULL;
+}
+
+static struct lpm_trie_node *lpm_trie_node_alloc(size_t data_size)
+{
+	return kmalloc(sizeof(struct lpm_trie_node) + data_size,
+		       GFP_ATOMIC | __GFP_NOWARN);
+}
+
+/* Called from syscall or from eBPF program */
+static int trie_update_elem(struct bpf_map *map,
+			    void *_key, void *value, u64 flags)
+{
+	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+	struct lpm_trie_node *node, *im_node, *new_node = NULL;
+	struct lpm_trie_node __rcu **slot;
+	struct bpf_lpm_trie_key *key = _key;
+	unsigned int next_bit;
+	size_t matchlen = 0;
+	int ret = 0;
+
+	if (key->prefixlen > trie->max_prefixlen)
+		return -EINVAL;
+
+	spin_lock(&trie->lock);
+
+	/* Allocate and fill a new node */
+
+	if (trie->n_entries == trie->map.max_entries) {
+		ret = -ENOSPC;
+		goto out;
+	}
+
+	new_node = lpm_trie_node_alloc(trie->data_size);
+	if (!new_node) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	trie->n_entries++;
+	new_node->value = *(u64 *) value;
+	new_node->prefixlen = key->prefixlen;
+	new_node->flags = 0;
+	new_node->child[0] = NULL;
+	new_node->child[1] = NULL;
+	memcpy(new_node->data, key->data, trie->data_size);
+
+	/*
+	 * Now find a slot to attach the new node. To do that, walk the tree
+	 * from the root match as many bits as possible for each node until we
+	 * either find an empty slot or a slot that needs to be replaced by an
+	 * intermediate node.
+	 */
+	slot = &trie->root;
+
+	while ((node = rcu_dereference_protected(*slot,
+					lockdep_is_held(&trie->lock)))) {
+		matchlen = longest_prefix_match(trie, node, key);
+
+		if (node->prefixlen != matchlen ||
+		    node->prefixlen == key->prefixlen ||
+		    node->prefixlen == trie->max_prefixlen)
+			break;
+
+		next_bit = extract_bit(key->data, node->prefixlen);
+		slot = &node->child[next_bit];
+	}
+
+	/*
+	 * If the slot is empty (a free child pointer or an empty root),
+	 * simply assign the @new_node to that slot and be done.
+	 */
+	if (!node) {
+		rcu_assign_pointer(*slot, new_node);
+		goto out;
+	}
+
+	/*
+	 * If the slot we picked already exists, replace it with @new_node
+	 * which already has the correct data array and value set.
+	 */
+	if (node->prefixlen == matchlen) {
+		new_node->child[0] = node->child[0];
+		new_node->child[1] = node->child[1];
+
+		if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
+			trie->n_entries--;
+
+		rcu_assign_pointer(*slot, new_node);
+		kfree_rcu(node, rcu);
+
+		goto out;
+	}
+
+	/*
+	 * If the new node matches the prefix completely, it must be an
+	 * inserted as an ancestor. Simply insert it between @node and @*slot.
+	 */
+	if (matchlen == key->prefixlen) {
+		next_bit = extract_bit(node->data, matchlen);
+		rcu_assign_pointer(new_node->child[next_bit], node);
+		rcu_assign_pointer(*slot, new_node);
+		goto out;
+	}
+
+	im_node = lpm_trie_node_alloc(trie->data_size);
+	if (!im_node) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	im_node->prefixlen = matchlen;
+	im_node->flags |= LPM_TREE_NODE_FLAG_IM;
+	memcpy(im_node->data, node->data, trie->data_size);
+
+	/* Now determine which child to install in which slot */
+	if (extract_bit(key->data, matchlen)) {
+		rcu_assign_pointer(im_node->child[0], node);
+		rcu_assign_pointer(im_node->child[1], new_node);
+	} else {
+		rcu_assign_pointer(im_node->child[0], new_node);
+		rcu_assign_pointer(im_node->child[1], node);
+	}
+
+	/* Finally, assign the intermediate node to the determined spot */
+	rcu_assign_pointer(*slot, im_node);
+
+out:
+	if (ret) {
+		if (new_node)
+			trie->n_entries--;
+
+		kfree(new_node);
+		kfree(im_node);
+	}
+
+	spin_unlock(&trie->lock);
+
+	return ret;
+}
+
+static struct bpf_map *trie_alloc(union bpf_attr *attr)
+{
+	struct lpm_trie *trie;
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 || attr->map_flags ||
+	    attr->key_size < sizeof(struct bpf_lpm_trie_key) + 1   ||
+	    attr->key_size > sizeof(struct bpf_lpm_trie_key) + 256 ||
+	    attr->value_size != sizeof(u64))
+		return ERR_PTR(-EINVAL);
+
+	trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN);
+	if (!trie)
+		return NULL;
+
+	/* copy mandatory map attributes */
+	trie->map.map_type = attr->map_type;
+	trie->map.key_size = attr->key_size;
+	trie->map.value_size = attr->value_size;
+	trie->map.max_entries = attr->max_entries;
+	trie->data_size = attr->key_size -
+				offsetof(struct bpf_lpm_trie_key, data);
+	trie->max_prefixlen = trie->data_size * 8;
+
+	spin_lock_init(&trie->lock);
+
+	return &trie->map;
+}
+
+static void trie_free(struct bpf_map *map)
+{
+	struct lpm_trie_node __rcu **slot;
+	struct lpm_trie_node *node;
+	struct lpm_trie *trie =
+		container_of(map, struct lpm_trie, map);
+
+	spin_lock(&trie->lock);
+
+	/*
+	 * Always start at the root and walk down to a node that has no
+	 * children. Then free that node, nullify its pointer in the parent,
+	 * then start over.
+	 */
+
+	for (;;) {
+		slot = &trie->root;
+
+		for (;;) {
+			node = rcu_dereference_protected(*slot,
+					lockdep_is_held(&trie->lock));
+			if (!node)
+				goto out;
+
+			if (node->child[0]) {
+				slot = &node->child[0];
+				continue;
+			}
+
+			if (node->child[1]) {
+				slot = &node->child[1];
+				continue;
+			}
+
+			kfree(node);
+			rcu_assign_pointer(*slot, NULL);
+			break;
+		}
+	}
+
+out:
+	spin_unlock(&trie->lock);
+}
+
+static const struct bpf_map_ops trie_ops = {
+	.map_alloc = trie_alloc,
+	.map_free = trie_free,
+	.map_lookup_elem = trie_lookup_elem,
+	.map_update_elem = trie_update_elem,
+};
+
+static struct bpf_map_type_list trie_type __read_mostly = {
+	.ops = &trie_ops,
+	.type = BPF_MAP_TYPE_LPM_TRIE,
+};
+
+static int __init register_trie_map(void)
+{
+	bpf_register_map_type(&trie_type);
+	return 0;
+}
+late_initcall(register_trie_map);
-- 
2.9.3

^ permalink raw reply related

* [PATCH v1 2/2] bpf: Add tests for the lpm trie map
From: Daniel Mack @ 2016-12-29 17:28 UTC (permalink / raw)
  To: ast; +Cc: dh.herrmann, daniel, netdev, davem, Daniel Mack
In-Reply-To: <20161229172855.14910-1-daniel@zonque.org>

From: David Herrmann <dh.herrmann@gmail.com>

The first part of this program runs randomized tests against the
lpm-bpf-map. It implements a "Trivial Longest Prefix Match" (tlpm)
based on simple, linear, single linked lists. The implementation
should be pretty straightforward.

Based on tlpm, this inserts randomized data into bpf-lpm-maps and
verifies the trie-based bpf-map implementation behaves the same way
as tlpm.

The second part uses 'real world' IPv4 and IPv6 addresses and tests
the trie with those.

Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
Signed-off-by: Daniel Mack <daniel@zonque.org>
---
 tools/testing/selftests/bpf/.gitignore     |   1 +
 tools/testing/selftests/bpf/Makefile       |   4 +-
 tools/testing/selftests/bpf/test_lpm_map.c | 348 +++++++++++++++++++++++++++++
 3 files changed, 351 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_lpm_map.c

diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 071431b..d3b1c9b 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -1,3 +1,4 @@
 test_verifier
 test_maps
 test_lru_map
+test_lpm_map
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 7a5f245..064a3e5 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -1,8 +1,8 @@
 CFLAGS += -Wall -O2 -I../../../../usr/include
 
-test_objs = test_verifier test_maps test_lru_map
+test_objs = test_verifier test_maps test_lru_map test_lpm_map
 
-TEST_PROGS := test_verifier test_maps test_lru_map test_kmod.sh
+TEST_PROGS := test_verifier test_maps test_lru_map test_lpm_map test_kmod.sh
 TEST_FILES := $(test_objs)
 
 all: $(test_objs)
diff --git a/tools/testing/selftests/bpf/test_lpm_map.c b/tools/testing/selftests/bpf/test_lpm_map.c
new file mode 100644
index 0000000..08db750
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lpm_map.c
@@ -0,0 +1,348 @@
+/*
+ * Randomized tests for eBPF longest-prefix-match maps
+ *
+ * This program runs randomized tests against the lpm-bpf-map. It implements a
+ * "Trivial Longest Prefix Match" (tlpm) based on simple, linear, singly linked
+ * lists. The implementation should be pretty straightforward.
+ *
+ * Based on tlpm, this inserts randomized data into bpf-lpm-maps and verifies
+ * the trie-based bpf-map implementation behaves the same way as tlpm.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <linux/bpf.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+#include <arpa/inet.h>
+
+#include "bpf_sys.h"
+#include "bpf_util.h"
+
+struct tlpm_node {
+	struct tlpm_node *next;
+	size_t n_bits;
+	uint8_t key[];
+};
+
+static struct tlpm_node *tlpm_add(struct tlpm_node *list,
+				  const uint8_t *key,
+				  size_t n_bits)
+{
+	struct tlpm_node *node;
+	size_t n;
+
+	/* add new entry with @key/@n_bits to @list and return new head */
+
+	n = (n_bits + 7) / 8;
+	node = malloc(sizeof(*node) + n);
+	assert(node);
+
+	node->next = list;
+	node->n_bits = n_bits;
+	memcpy(node->key, key, n);
+
+	return node;
+}
+
+static void tlpm_clear(struct tlpm_node *list)
+{
+	struct tlpm_node *node;
+
+	/* free all entries in @list */
+
+	while ((node = list)) {
+		list = list->next;
+		free(node);
+	}
+}
+
+static struct tlpm_node *tlpm_match(struct tlpm_node *list,
+				    const uint8_t *key,
+				    size_t n_bits)
+{
+	struct tlpm_node *best = NULL;
+	size_t i;
+
+	/*
+	 * Perform longest prefix-match on @key/@n_bits. That is, iterate all
+	 * entries and match each prefix against @key. Remember the "best"
+	 * entry we find (i.e., the longest prefix that matches) and return it
+	 * to the caller when done.
+	 */
+
+	for ( ; list; list = list->next) {
+		for (i = 0; i < n_bits && i < list->n_bits; ++i) {
+			if ((key[i / 8] & (1 << (7 - i % 8))) !=
+			    (list->key[i / 8] & (1 << (7 - i % 8))))
+				break;
+		}
+
+		if (i >= list->n_bits) {
+			if (!best || i > best->n_bits)
+				best = list;
+		}
+	}
+
+	return best;
+}
+
+static void test_lpm_basic(void)
+{
+	struct tlpm_node *list = NULL, *t1, *t2;
+
+	/* very basic, static tests to verify tlpm works as expected */
+
+	assert(!tlpm_match(list, (uint8_t[]){ 0xff }, 8));
+
+	t1 = list = tlpm_add(list, (uint8_t[]){ 0xff }, 8);
+	assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff }, 8));
+	assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff, 0xff }, 16));
+	assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff, 0x00 }, 16));
+	assert(!tlpm_match(list, (uint8_t[]){ 0x7f }, 8));
+	assert(!tlpm_match(list, (uint8_t[]){ 0xfe }, 8));
+	assert(!tlpm_match(list, (uint8_t[]){ 0xff }, 7));
+
+	t2 = list = tlpm_add(list, (uint8_t[]){ 0xff, 0xff }, 16);
+	assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff }, 8));
+	assert(t2 == tlpm_match(list, (uint8_t[]){ 0xff, 0xff }, 16));
+	assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff, 0xff }, 15));
+	assert(!tlpm_match(list, (uint8_t[]){ 0x7f, 0xff }, 16));
+
+	tlpm_clear(list);
+}
+
+static void test_lpm_order(void)
+{
+	struct tlpm_node *t1, *t2, *l1 = NULL, *l2 = NULL;
+	size_t i, j;
+
+	/*
+	 * Verify the tlpm implementation works correctly regardless of the
+	 * order of entries. Insert a random set of entries into @l1, and copy
+	 * the same data in reverse order into @l2. Then verify a lookup of
+	 * random keys will yield the same result in both sets.
+	 */
+
+	for (i = 0; i < (1 << 12); ++i)
+		l1 = tlpm_add(l1, (uint8_t[]){
+					rand() % 0xff,
+					rand() % 0xff,
+				}, rand() % 16 + 1);
+
+	for (t1 = l1; t1; t1 = t1->next)
+		l2 = tlpm_add(l2, t1->key, t1->n_bits);
+
+	for (i = 0; i < (1 << 8); ++i) {
+		uint8_t key[] = { rand() % 0xff, rand() % 0xff };
+
+		t1 = tlpm_match(l1, key, 16);
+		t2 = tlpm_match(l2, key, 16);
+
+		assert(!t1 == !t2);
+		if (t1) {
+			assert(t1->n_bits == t2->n_bits);
+			for (j = 0; j < t1->n_bits; ++j)
+				assert((t1->key[j / 8] & (1 << (7 - j % 8))) ==
+				       (t2->key[j / 8] & (1 << (7 - j % 8))));
+		}
+	}
+
+	tlpm_clear(l1);
+	tlpm_clear(l2);
+}
+
+static void test_lpm_map(void)
+{
+	size_t i, j, n_matches, n_nodes, n_lookups;
+	struct tlpm_node *t, *list = NULL;
+	struct bpf_lpm_trie_key *key;
+	uint8_t value[8] = {};
+	int r, map;
+
+	/*
+	 * Compare behavior of tlpm vs. bpf-lpm. Create a randomized set of
+	 * prefixes and insert it into both tlpm and bpf-lpm. Then run some
+	 * randomized lookups and verify both maps return the same result.
+	 */
+
+	n_matches = 0;
+	n_nodes = 1 << 8;
+	n_lookups = 1 << 16;
+
+	key = alloca(sizeof(*key) + 4);
+	memset(key, 0, sizeof(*key) + 4);
+
+	map = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE,
+			     sizeof(*key) + 4,
+			     sizeof(value),
+			     4096,
+			     0);
+	assert(map >= 0);
+
+	for (i = 0; i < n_nodes; ++i) {
+		value[0] = rand() & 0xff;
+		value[1] = rand() & 0xff;
+		value[2] = rand() & 0xff;
+		value[3] = rand() & 0xff;
+		value[4] = rand() % 33;
+
+		list = tlpm_add(list, value, value[4]);
+
+		key->prefixlen = value[4];
+		memcpy(key->data, value, 4);
+		r = bpf_map_update(map, key, value, 0);
+		assert(!r);
+	}
+
+	for (i = 0; i < n_lookups; ++i) {
+		uint8_t data[] = {
+			rand() % 0xff,
+			rand() % 0xff,
+			rand() % 0xff,
+			rand() % 0xff
+		};
+
+		t = tlpm_match(list, data, 32);
+
+		key->prefixlen = 32;
+		memcpy(key->data, data, 4);
+		r = bpf_map_lookup(map, key, value);
+		assert(!r || errno == ENOENT);
+		assert(!t == !!r);
+
+		if (t) {
+			++n_matches;
+			assert(t->n_bits == value[4]);
+			for (j = 0; j < t->n_bits; ++j)
+				assert((t->key[j / 8] & (1 << (7 - j % 8))) ==
+				       (value[j / 8] & (1 << (7 - j % 8))));
+		}
+	}
+
+	close(map);
+	tlpm_clear(list);
+
+	/*
+	 * With 255 random nodes in the map, we are pretty likely to match
+	 * something on every lookup. For statistics, use this:
+	 *
+	 *     printf("  nodes: %zu\n"
+	 *            "lookups: %zu\n"
+	 *            "matches: %zu\n", n_nodes, n_lookups, n_matches);
+	 */
+}
+
+/* Test the implementation with some 'real world' examples */
+
+static void test_lpm_ipaddr(void)
+{
+	struct bpf_lpm_trie_key *key_ipv4;
+	struct bpf_lpm_trie_key *key_ipv6;
+	size_t key_size_ipv4;
+	size_t key_size_ipv6;
+	int map_fd_ipv4;
+	int map_fd_ipv6;
+	__u64 value;
+
+	key_size_ipv4 = sizeof(*key_ipv4) + sizeof(__u32);
+	key_size_ipv6 = sizeof(*key_ipv6) + sizeof(__u32) * 4;
+	key_ipv4 = alloca(key_size_ipv4);
+	key_ipv6 = alloca(key_size_ipv6);
+
+	map_fd_ipv4 = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE,
+				     key_size_ipv4, sizeof(value),
+				     100, 0);
+	assert(map_fd_ipv4 >= 0);
+
+	map_fd_ipv6 = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE,
+				     key_size_ipv6, sizeof(value),
+				     100, 0);
+	assert(map_fd_ipv6 >= 0);
+
+	/* Fill data some IPv4 and IPv6 address ranges */
+	value = 1;
+	key_ipv4->prefixlen = 16;
+	inet_pton(AF_INET, "192.168.0.0", key_ipv4->data);
+	assert(bpf_map_update(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+
+	value = 2;
+	key_ipv4->prefixlen = 24;
+	inet_pton(AF_INET, "192.168.0.0", key_ipv4->data);
+	assert(bpf_map_update(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+
+	value = 3;
+	key_ipv4->prefixlen = 24;
+	inet_pton(AF_INET, "192.168.128.0", key_ipv4->data);
+	assert(bpf_map_update(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+
+	value = 5;
+	key_ipv4->prefixlen = 24;
+	inet_pton(AF_INET, "192.168.1.0", key_ipv4->data);
+	assert(bpf_map_update(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+
+	value = 4;
+	key_ipv4->prefixlen = 23;
+	inet_pton(AF_INET, "192.168.0.0", key_ipv4->data);
+	assert(bpf_map_update(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+
+	value = 0xdeadbeef;
+	key_ipv6->prefixlen = 64;
+	inet_pton(AF_INET6, "2a00:1450:4001:814::200e", key_ipv6->data);
+	assert(bpf_map_update(map_fd_ipv6, key_ipv6, &value, 0) == 0);
+
+	/* Set tprefixlen to maximum for lookups */
+	key_ipv4->prefixlen = 32;
+	key_ipv6->prefixlen = 128;
+
+	/* Test some lookups that should come back with a value */
+	inet_pton(AF_INET, "192.168.128.23", key_ipv4->data);
+	assert(bpf_map_lookup(map_fd_ipv4, key_ipv4, &value) == 0);
+	assert(value == 3);
+
+	inet_pton(AF_INET, "192.168.0.1", key_ipv4->data);
+	assert(bpf_map_lookup(map_fd_ipv4, key_ipv4, &value) == 0);
+	assert(value == 2);
+
+	inet_pton(AF_INET6, "2a00:1450:4001:814::", key_ipv6->data);
+	assert(bpf_map_lookup(map_fd_ipv6, key_ipv6, &value) == 0);
+	assert(value == 0xdeadbeef);
+
+	inet_pton(AF_INET6, "2a00:1450:4001:814::1", key_ipv6->data);
+	assert(bpf_map_lookup(map_fd_ipv6, key_ipv6, &value) == 0);
+	assert(value == 0xdeadbeef);
+
+	/* Test some lookups that should not match any entry */
+	inet_pton(AF_INET, "10.0.0.1", key_ipv4->data);
+	assert(bpf_map_lookup(map_fd_ipv4, key_ipv4, &value) == -1 &&
+	       errno == ENOENT);
+
+	inet_pton(AF_INET, "11.11.11.11", key_ipv4->data);
+	assert(bpf_map_lookup(map_fd_ipv4, key_ipv4, &value) == -1 &&
+	       errno == ENOENT);
+
+	inet_pton(AF_INET6, "2a00:ffff::", key_ipv6->data);
+	assert(bpf_map_lookup(map_fd_ipv6, key_ipv6, &value) == -1 &&
+	       errno == ENOENT);
+
+	close(map_fd_ipv4);
+	close(map_fd_ipv6);
+}
+
+int main(void)
+{
+	/* we want predictable, pseudo random tests */
+	srand(0xf00ba1);
+
+	test_lpm_basic();
+	test_lpm_order();
+	test_lpm_map();
+	test_lpm_ipaddr();
+
+	printf("test_lpm: OK\n");
+	return 0;
+}
-- 
2.9.3

^ permalink raw reply related

* [PATCH v1 0/2] bpf: add longest prefix match map
From: Daniel Mack @ 2016-12-29 17:28 UTC (permalink / raw)
  To: ast; +Cc: dh.herrmann, daniel, netdev, davem, Daniel Mack

This patch set adds a longest prefix match algorithm that can be used
to match IP addresses to a stored set of ranges. It is exposed as a
bpf map type.

Internally, data is stored in an unbalanced tree of nodes that has a
maximum height of n, where n is the prefixlen the trie was created
with.

Not that this has nothing to do with fib or fib6 and is in no way meant
to replace or share code with it. It's rather a much simpler
implementation that is specifically written with bpf maps in mind.

Patch 1/2 adds the implementation, and 2/2 an extensive test suite.

Feedback is much appreciated.

Thanks,
Daniel

Changelog:

rfc -> v1:
	* Add __rcu pointer annotations to make sparse happy
	* Fold _lpm_trie_find_target_node() into its only caller
	* Fix some minor documentation issues

Daniel Mack (1):
  bpf: add a longest prefix match trie map implementation

David Herrmann (1):
  bpf: Add tests for the lpm trie map

 include/uapi/linux/bpf.h                   |   7 +
 kernel/bpf/Makefile                        |   2 +-
 kernel/bpf/lpm_trie.c                      | 468 +++++++++++++++++++++++++++++
 tools/testing/selftests/bpf/.gitignore     |   1 +
 tools/testing/selftests/bpf/Makefile       |   4 +-
 tools/testing/selftests/bpf/test_lpm_map.c | 348 +++++++++++++++++++++
 6 files changed, 827 insertions(+), 3 deletions(-)
 create mode 100644 kernel/bpf/lpm_trie.c
 create mode 100644 tools/testing/selftests/bpf/test_lpm_map.c

-- 
2.9.3

^ permalink raw reply

* [PATCH net-next 14/14] MAINTAINERS: Add bnxt_en maintainer info.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 MAINTAINERS | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index cfff2c9..11904a9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2605,6 +2605,12 @@ L:	netdev@vger.kernel.org
 S:	Supported
 F:	drivers/net/ethernet/broadcom/bnx2x/
 
+BROADCOM BNXT_EN 50 GIGABIT ETHERNET DRIVER
+M:	Michael Chan <michael.chan@broadcom.com>
+L:	netdev@vger.kernel.org
+S:	Supported
+F:	drivers/net/ethernet/broadcom/bnxt/
+
 BROADCOM BCM281XX/BCM11XXX/BCM216XX ARM ARCHITECTURE
 M:	Florian Fainelli <f.fainelli@gmail.com>
 M:	Ray Jui <rjui@broadcom.com>
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next 09/14] bnxt_en: Assign additional vnics to VFs.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>

Assign additional vnics to VFs whenever possible so that NTUPLE can be
supported on the VFs.

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
index c696025..0c9f6c1 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
@@ -429,6 +429,8 @@ static int bnxt_hwrm_func_cfg(struct bnxt *bp, int num_vfs)
 		vf_rx_rings = (pf->max_rx_rings - bp->rx_nr_rings) / num_vfs;
 	vf_ring_grps = (bp->pf.max_hw_ring_grps - bp->rx_nr_rings) / num_vfs;
 	vf_tx_rings = (pf->max_tx_rings - bp->tx_nr_rings) / num_vfs;
+	vf_vnics = (pf->max_vnics - bp->nr_vnics) / num_vfs;
+	vf_vnics = min_t(u16, vf_vnics, vf_rx_rings);
 
 	req.enables = cpu_to_le32(FUNC_CFG_REQ_ENABLES_MTU |
 				  FUNC_CFG_REQ_ENABLES_MRU |
@@ -451,7 +453,6 @@ static int bnxt_hwrm_func_cfg(struct bnxt *bp, int num_vfs)
 	req.num_rx_rings = cpu_to_le16(vf_rx_rings);
 	req.num_hw_ring_grps = cpu_to_le16(vf_ring_grps);
 	req.num_l2_ctxs = cpu_to_le16(4);
-	vf_vnics = 1;
 
 	req.num_vnics = cpu_to_le16(vf_vnics);
 	/* FIXME spec currently uses 1 bit for stats ctx */
@@ -506,6 +507,8 @@ static int bnxt_sriov_enable(struct bnxt *bp, int *num_vfs)
 			    min_rx_rings)
 				rx_ok = 1;
 		}
+		if (bp->pf.max_vnics - bp->nr_vnics < min_rx_rings)
+			rx_ok = 0;
 
 		if (bp->pf.max_tx_rings - bp->tx_nr_rings >= min_tx_rings)
 			tx_ok = 1;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next 13/14] bnxt_en: Handle no aggregation ring gracefully.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>

The current code assumes that we will always have at least 2 rx rings, 1
will be used as an aggregation ring for TPA and jumbo page placements.
However, it is possible, especially on a VF, that there is only 1 rx
ring available.  In this scenario, the current code will fail to initialize.
To handle it, we need to properly set up only 1 ring without aggregation.
Set a new flag BNXT_FLAG_NO_AGG_RINGS for this condition and add logic to
set up the chip to place RX data linearly into a single buffer per packet.

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 25 +++++++++++++++++++++----
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  1 +
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 1f54a7a..98e9484 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -2496,7 +2496,7 @@ void bnxt_set_ring_params(struct bnxt *bp)
 		agg_factor = min_t(u32, 4, 65536 / BNXT_RX_PAGE_SIZE);
 
 	bp->flags &= ~BNXT_FLAG_JUMBO;
-	if (rx_space > PAGE_SIZE) {
+	if (rx_space > PAGE_SIZE && !(bp->flags & BNXT_FLAG_NO_AGG_RINGS)) {
 		u32 jumbo_factor;
 
 		bp->flags |= BNXT_FLAG_JUMBO;
@@ -6174,6 +6174,9 @@ static int bnxt_set_features(struct net_device *dev, netdev_features_t features)
 	if (features & NETIF_F_LRO)
 		flags |= BNXT_FLAG_LRO;
 
+	if (bp->flags & BNXT_FLAG_NO_AGG_RINGS)
+		flags &= ~BNXT_FLAG_TPA;
+
 	if (features & NETIF_F_HW_VLAN_CTAG_RX)
 		flags |= BNXT_FLAG_STRIP_VLAN;
 
@@ -7040,8 +7043,17 @@ static int bnxt_get_dflt_rings(struct bnxt *bp, int *max_rx, int *max_tx,
 	int rc;
 
 	rc = bnxt_get_max_rings(bp, max_rx, max_tx, shared);
-	if (rc)
-		return rc;
+	if (rc && (bp->flags & BNXT_FLAG_AGG_RINGS)) {
+		/* Not enough rings, try disabling agg rings. */
+		bp->flags &= ~BNXT_FLAG_AGG_RINGS;
+		rc = bnxt_get_max_rings(bp, max_rx, max_tx, shared);
+		if (rc)
+			return rc;
+		bp->flags |= BNXT_FLAG_NO_AGG_RINGS;
+		bp->dev->hw_features &= ~NETIF_F_LRO;
+		bp->dev->features &= ~NETIF_F_LRO;
+		bnxt_set_ring_params(bp);
+	}
 
 	if (bp->flags & BNXT_FLAG_ROCE_CAP) {
 		int max_cp, max_stat, max_irq;
@@ -7236,7 +7248,12 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	bnxt_set_tpa_flags(bp);
 	bnxt_set_ring_params(bp);
 	bnxt_set_max_func_irqs(bp, max_irqs);
-	bnxt_set_dflt_rings(bp);
+	rc = bnxt_set_dflt_rings(bp);
+	if (rc) {
+		netdev_err(bp->dev, "Not enough rings available.\n");
+		rc = -ENOMEM;
+		goto init_err;
+	}
 
 	/* Default RSS hash cfg. */
 	bp->rss_hash_cfg = VNIC_RSS_CFG_REQ_HASH_TYPE_IPV4 |
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index d174729d..f6b9b1c 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -950,6 +950,7 @@ struct bnxt {
 	#define BNXT_FLAG_ROCEV2_CAP	0x10000
 	#define BNXT_FLAG_ROCE_CAP	(BNXT_FLAG_ROCEV1_CAP |	\
 					 BNXT_FLAG_ROCEV2_CAP)
+	#define BNXT_FLAG_NO_AGG_RINGS	0x20000
 	#define BNXT_FLAG_CHIP_NITRO_A0	0x1000000
 
 	#define BNXT_FLAG_ALL_CONFIG_FEATS (BNXT_FLAG_TPA |		\
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next 12/14] bnxt_en: Set default completion ring for async events.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>

With the added support for the bnxt_re RDMA driver, both drivers can be
allocating completion rings in any order.  The firmware does not know
which completion ring should be receiving async events.  Add an
extra step to tell firmware the completion ring number for receiving
async events after bnxt_en allocates the completion rings.

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 338dbd0..1f54a7a 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -3858,6 +3858,30 @@ static int hwrm_ring_alloc_send_msg(struct bnxt *bp,
 	return rc;
 }
 
+static int bnxt_hwrm_set_async_event_cr(struct bnxt *bp, int idx)
+{
+	int rc;
+
+	if (BNXT_PF(bp)) {
+		struct hwrm_func_cfg_input req = {0};
+
+		bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1);
+		req.fid = cpu_to_le16(0xffff);
+		req.enables = cpu_to_le32(FUNC_CFG_REQ_ENABLES_ASYNC_EVENT_CR);
+		req.async_event_cr = cpu_to_le16(idx);
+		rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+	} else {
+		struct hwrm_func_vf_cfg_input req = {0};
+
+		bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_VF_CFG, -1, -1);
+		req.enables =
+			cpu_to_le32(FUNC_VF_CFG_REQ_ENABLES_ASYNC_EVENT_CR);
+		req.async_event_cr = cpu_to_le16(idx);
+		rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+	}
+	return rc;
+}
+
 static int bnxt_hwrm_ring_alloc(struct bnxt *bp)
 {
 	int i, rc = 0;
@@ -3874,6 +3898,12 @@ static int bnxt_hwrm_ring_alloc(struct bnxt *bp)
 			goto err_out;
 		BNXT_CP_DB(cpr->cp_doorbell, cpr->cp_raw_cons);
 		bp->grp_info[i].cp_fw_ring_id = ring->fw_ring_id;
+
+		if (!i) {
+			rc = bnxt_hwrm_set_async_event_cr(bp, ring->fw_ring_id);
+			if (rc)
+				netdev_warn(bp->dev, "Failed to set async event completion ring.\n");
+		}
 	}
 
 	for (i = 0; i < bp->tx_nr_rings; i++) {
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next 11/14] bnxt_en: Implement new scheme to reserve tx rings.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>

In order to properly support TX rate limiting in SRIOV VF functions or
NPAR functions, firmware needs better control over tx ring allocations.
The new scheme requires the driver to reserve the number of tx rings
and to query to see if the requested number of tx rings is reserved.
The driver will use the new scheme when the firmware interface spec is
1.6.1 or newer.

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c         | 59 ++++++++++++++++++++++-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h         |  2 +
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 15 ++++++
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c   | 10 +++-
 4 files changed, 83 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 4d478e7..338dbd0 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -4045,6 +4045,50 @@ static void bnxt_hwrm_ring_free(struct bnxt *bp, bool close_path)
 	}
 }
 
+/* Caller must hold bp->hwrm_cmd_lock */
+int __bnxt_hwrm_get_tx_rings(struct bnxt *bp, u16 fid, int *tx_rings)
+{
+	struct hwrm_func_qcfg_output *resp = bp->hwrm_cmd_resp_addr;
+	struct hwrm_func_qcfg_input req = {0};
+	int rc;
+
+	if (bp->hwrm_spec_code < 0x10601)
+		return 0;
+
+	bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_QCFG, -1, -1);
+	req.fid = cpu_to_le16(fid);
+	rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+	if (!rc)
+		*tx_rings = le16_to_cpu(resp->alloc_tx_rings);
+
+	return rc;
+}
+
+int bnxt_hwrm_reserve_tx_rings(struct bnxt *bp, int *tx_rings)
+{
+	struct hwrm_func_cfg_input req = {0};
+	int rc;
+
+	if (bp->hwrm_spec_code < 0x10601)
+		return 0;
+
+	if (BNXT_VF(bp))
+		return 0;
+
+	bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1);
+	req.fid = cpu_to_le16(0xffff);
+	req.enables = cpu_to_le32(FUNC_CFG_REQ_ENABLES_NUM_TX_RINGS);
+	req.num_tx_rings = cpu_to_le16(*tx_rings);
+	rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+	if (rc)
+		return rc;
+
+	mutex_lock(&bp->hwrm_cmd_lock);
+	rc = __bnxt_hwrm_get_tx_rings(bp, 0xffff, tx_rings);
+	mutex_unlock(&bp->hwrm_cmd_lock);
+	return rc;
+}
+
 static void bnxt_hwrm_set_coal_params(struct bnxt *bp, u32 max_bufs,
 	u32 buf_tmrs, u16 flags,
 	struct hwrm_ring_cmpl_ring_cfg_aggint_params_input *req)
@@ -6509,10 +6553,16 @@ int bnxt_setup_mq_tc(struct net_device *dev, u8 tc)
 		sh = true;
 
 	if (tc) {
-		int max_rx_rings, max_tx_rings, rc;
+		int max_rx_rings, max_tx_rings, req_tx_rings, rsv_tx_rings, rc;
 
+		req_tx_rings = bp->tx_nr_rings_per_tc * tc;
 		rc = bnxt_get_max_rings(bp, &max_rx_rings, &max_tx_rings, sh);
-		if (rc || bp->tx_nr_rings_per_tc * tc > max_tx_rings)
+		if (rc || req_tx_rings > max_tx_rings)
+			return -ENOMEM;
+
+		rsv_tx_rings = req_tx_rings;
+		if (bnxt_hwrm_reserve_tx_rings(bp, &rsv_tx_rings) ||
+		    rsv_tx_rings < req_tx_rings)
 			return -ENOMEM;
 	}
 
@@ -7000,6 +7050,11 @@ static int bnxt_set_dflt_rings(struct bnxt *bp)
 		return rc;
 	bp->rx_nr_rings = min_t(int, dflt_rings, max_rx_rings);
 	bp->tx_nr_rings_per_tc = min_t(int, dflt_rings, max_tx_rings);
+
+	rc = bnxt_hwrm_reserve_tx_rings(bp, &bp->tx_nr_rings_per_tc);
+	if (rc)
+		netdev_warn(bp->dev, "Unable to reserve tx rings\n");
+
 	bp->tx_nr_rings = bp->tx_nr_rings_per_tc;
 	bp->cp_nr_rings = sh ? max_t(int, bp->tx_nr_rings, bp->rx_nr_rings) :
 			       bp->tx_nr_rings + bp->rx_nr_rings;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 75803e5..d174729d 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1149,6 +1149,8 @@ struct bnxt {
 int bnxt_hwrm_func_rgtr_async_events(struct bnxt *bp, unsigned long *bmap,
 				     int bmap_size);
 int bnxt_hwrm_vnic_cfg(struct bnxt *bp, u16 vnic_id);
+int __bnxt_hwrm_get_tx_rings(struct bnxt *bp, u16 fid, int *tx_rings);
+int bnxt_hwrm_reserve_tx_rings(struct bnxt *bp, int *tx_rings);
 int bnxt_hwrm_set_coal(struct bnxt *);
 unsigned int bnxt_get_max_func_stat_ctxs(struct bnxt *bp);
 void bnxt_set_max_func_stat_ctxs(struct bnxt *bp, unsigned int max);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index e6b1196..dd21be4 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -388,6 +388,7 @@ static int bnxt_set_channels(struct net_device *dev,
 {
 	struct bnxt *bp = netdev_priv(dev);
 	int max_rx_rings, max_tx_rings, tcs;
+	int req_tx_rings, rsv_tx_rings;
 	u32 rc = 0;
 	bool sh = false;
 
@@ -423,6 +424,20 @@ static int bnxt_set_channels(struct net_device *dev,
 		    channel->tx_count > max_tx_rings))
 		return -ENOMEM;
 
+	req_tx_rings = sh ? channel->combined_count : channel->tx_count;
+	req_tx_rings = min_t(int, req_tx_rings, max_tx_rings);
+	if (tcs > 1)
+		req_tx_rings *= tcs;
+
+	rsv_tx_rings = req_tx_rings;
+	if (bnxt_hwrm_reserve_tx_rings(bp, &rsv_tx_rings))
+		return -ENOMEM;
+
+	if (rsv_tx_rings < req_tx_rings) {
+		netdev_warn(dev, "Unable to allocate the requested tx rings\n");
+		return -ENOMEM;
+	}
+
 	if (netif_running(dev)) {
 		if (BNXT_PF(bp)) {
 			/* TODO CHIMP_FW: Send message to all VF's
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
index 0c9f6c1..64ef0e5 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
@@ -416,6 +416,7 @@ static int bnxt_hwrm_func_cfg(struct bnxt *bp, int num_vfs)
 	u16 vf_ring_grps;
 	struct hwrm_func_cfg_input req = {0};
 	struct bnxt_pf_info *pf = &bp->pf;
+	int total_vf_tx_rings = 0;
 
 	bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1);
 
@@ -460,6 +461,8 @@ static int bnxt_hwrm_func_cfg(struct bnxt *bp, int num_vfs)
 
 	mutex_lock(&bp->hwrm_cmd_lock);
 	for (i = 0; i < num_vfs; i++) {
+		int vf_tx_rsvd = vf_tx_rings;
+
 		req.fid = cpu_to_le16(pf->first_vf_id + i);
 		rc = _hwrm_send_message(bp, &req, sizeof(req),
 					HWRM_CMD_TIMEOUT);
@@ -467,10 +470,15 @@ static int bnxt_hwrm_func_cfg(struct bnxt *bp, int num_vfs)
 			break;
 		pf->active_vfs = i + 1;
 		pf->vf[i].fw_fid = le16_to_cpu(req.fid);
+		rc = __bnxt_hwrm_get_tx_rings(bp, pf->vf[i].fw_fid,
+					      &vf_tx_rsvd);
+		if (rc)
+			break;
+		total_vf_tx_rings += vf_tx_rsvd;
 	}
 	mutex_unlock(&bp->hwrm_cmd_lock);
 	if (!rc) {
-		pf->max_tx_rings -= vf_tx_rings * num_vfs;
+		pf->max_tx_rings -= total_vf_tx_rings;
 		pf->max_rx_rings -= vf_rx_rings * num_vfs;
 		pf->max_hw_ring_grps -= vf_ring_grps * num_vfs;
 		pf->max_cp_rings -= vf_cp_rings * num_vfs;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next 10/14] bnxt_en: Add IPV6 hardware RFS support.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>

Accept ipv6 flows in .ndo_rx_flow_steer() and support ETHTOOL_GRXCLSRULE
ipv6 flows.

Signed-off-by: Michael Chan <michael.chan@broadocm.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c         | 32 +++++++++++---
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 53 +++++++++++++++++------
 2 files changed, 66 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 0ca530e..4d478e7 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -3316,10 +3316,26 @@ static int bnxt_hwrm_cfa_ntuple_filter_alloc(struct bnxt *bp,
 	req.ip_addr_type = CFA_NTUPLE_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV4;
 	req.ip_protocol = keys->basic.ip_proto;
 
-	req.src_ipaddr[0] = keys->addrs.v4addrs.src;
-	req.src_ipaddr_mask[0] = cpu_to_be32(0xffffffff);
-	req.dst_ipaddr[0] = keys->addrs.v4addrs.dst;
-	req.dst_ipaddr_mask[0] = cpu_to_be32(0xffffffff);
+	if (keys->basic.n_proto == htons(ETH_P_IPV6)) {
+		int i;
+
+		req.ethertype = htons(ETH_P_IPV6);
+		req.ip_addr_type =
+			CFA_NTUPLE_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV6;
+		*(struct in6_addr *)&req.src_ipaddr[0] =
+			keys->addrs.v6addrs.src;
+		*(struct in6_addr *)&req.dst_ipaddr[0] =
+			keys->addrs.v6addrs.dst;
+		for (i = 0; i < 4; i++) {
+			req.src_ipaddr_mask[i] = cpu_to_be32(0xffffffff);
+			req.dst_ipaddr_mask[i] = cpu_to_be32(0xffffffff);
+		}
+	} else {
+		req.src_ipaddr[0] = keys->addrs.v4addrs.src;
+		req.src_ipaddr_mask[0] = cpu_to_be32(0xffffffff);
+		req.dst_ipaddr[0] = keys->addrs.v4addrs.dst;
+		req.dst_ipaddr_mask[0] = cpu_to_be32(0xffffffff);
+	}
 
 	req.src_port = keys->ports.src;
 	req.src_port_mask = cpu_to_be16(0xffff);
@@ -6588,12 +6604,18 @@ static int bnxt_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
 		goto err_free;
 	}
 
-	if ((fkeys->basic.n_proto != htons(ETH_P_IP)) ||
+	if ((fkeys->basic.n_proto != htons(ETH_P_IP) &&
+	     fkeys->basic.n_proto != htons(ETH_P_IPV6)) ||
 	    ((fkeys->basic.ip_proto != IPPROTO_TCP) &&
 	     (fkeys->basic.ip_proto != IPPROTO_UDP))) {
 		rc = -EPROTONOSUPPORT;
 		goto err_free;
 	}
+	if (fkeys->basic.n_proto == htons(ETH_P_IPV6) &&
+	    bp->hwrm_spec_code < 0x10601) {
+		rc = -EPROTONOSUPPORT;
+		goto err_free;
+	}
 
 	memcpy(new_fltr->dst_mac_addr, eth->h_dest, ETH_ALEN);
 	memcpy(new_fltr->src_mac_addr, eth->h_source, ETH_ALEN);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index 1cfa7a6..e6b1196 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -524,24 +524,49 @@ static int bnxt_grxclsrule(struct bnxt *bp, struct ethtool_rxnfc *cmd)
 
 fltr_found:
 	fkeys = &fltr->fkeys;
-	if (fkeys->basic.ip_proto == IPPROTO_TCP)
-		fs->flow_type = TCP_V4_FLOW;
-	else if (fkeys->basic.ip_proto == IPPROTO_UDP)
-		fs->flow_type = UDP_V4_FLOW;
-	else
-		goto fltr_err;
+	if (fkeys->basic.n_proto == htons(ETH_P_IP)) {
+		if (fkeys->basic.ip_proto == IPPROTO_TCP)
+			fs->flow_type = TCP_V4_FLOW;
+		else if (fkeys->basic.ip_proto == IPPROTO_UDP)
+			fs->flow_type = UDP_V4_FLOW;
+		else
+			goto fltr_err;
+
+		fs->h_u.tcp_ip4_spec.ip4src = fkeys->addrs.v4addrs.src;
+		fs->m_u.tcp_ip4_spec.ip4src = cpu_to_be32(~0);
+
+		fs->h_u.tcp_ip4_spec.ip4dst = fkeys->addrs.v4addrs.dst;
+		fs->m_u.tcp_ip4_spec.ip4dst = cpu_to_be32(~0);
 
-	fs->h_u.tcp_ip4_spec.ip4src = fkeys->addrs.v4addrs.src;
-	fs->m_u.tcp_ip4_spec.ip4src = cpu_to_be32(~0);
+		fs->h_u.tcp_ip4_spec.psrc = fkeys->ports.src;
+		fs->m_u.tcp_ip4_spec.psrc = cpu_to_be16(~0);
 
-	fs->h_u.tcp_ip4_spec.ip4dst = fkeys->addrs.v4addrs.dst;
-	fs->m_u.tcp_ip4_spec.ip4dst = cpu_to_be32(~0);
+		fs->h_u.tcp_ip4_spec.pdst = fkeys->ports.dst;
+		fs->m_u.tcp_ip4_spec.pdst = cpu_to_be16(~0);
+	} else {
+		int i;
 
-	fs->h_u.tcp_ip4_spec.psrc = fkeys->ports.src;
-	fs->m_u.tcp_ip4_spec.psrc = cpu_to_be16(~0);
+		if (fkeys->basic.ip_proto == IPPROTO_TCP)
+			fs->flow_type = TCP_V6_FLOW;
+		else if (fkeys->basic.ip_proto == IPPROTO_UDP)
+			fs->flow_type = UDP_V6_FLOW;
+		else
+			goto fltr_err;
+
+		*(struct in6_addr *)&fs->h_u.tcp_ip6_spec.ip6src[0] =
+			fkeys->addrs.v6addrs.src;
+		*(struct in6_addr *)&fs->h_u.tcp_ip6_spec.ip6dst[0] =
+			fkeys->addrs.v6addrs.dst;
+		for (i = 0; i < 4; i++) {
+			fs->m_u.tcp_ip6_spec.ip6src[i] = cpu_to_be32(~0);
+			fs->m_u.tcp_ip6_spec.ip6dst[i] = cpu_to_be32(~0);
+		}
+		fs->h_u.tcp_ip6_spec.psrc = fkeys->ports.src;
+		fs->m_u.tcp_ip6_spec.psrc = cpu_to_be16(~0);
 
-	fs->h_u.tcp_ip4_spec.pdst = fkeys->ports.dst;
-	fs->m_u.tcp_ip4_spec.pdst = cpu_to_be16(~0);
+		fs->h_u.tcp_ip6_spec.pdst = fkeys->ports.dst;
+		fs->m_u.tcp_ip6_spec.pdst = cpu_to_be16(~0);
+	}
 
 	fs->ring_cookie = fltr->rxq;
 	rc = 0;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next 08/14] bnxt_en: Add new hardware RFS mode.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>

The existing hardware RFS mode uses one hardware RSS context block
per ring just to calculate the RSS hash.  This is very wasteful and
prevents VF functions from using it.  The new hardware mode shares
the same hardware RSS context for RSS placement and RFS steering.
This allows VFs to enable RFS.

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 27 ++++++++++++++++++++++++++-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  1 +
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index f7ea99f..0ca530e 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -2630,6 +2630,10 @@ static int bnxt_alloc_vnic_attributes(struct bnxt *bp)
 			goto out;
 		}
 
+		if ((bp->flags & BNXT_FLAG_NEW_RSS_CAP) &&
+		    !(vnic->flags & BNXT_VNIC_RSS_FLAG))
+			continue;
+
 		/* Allocate rss table and hash key */
 		vnic->rss_table = dma_alloc_coherent(&pdev->dev, PAGE_SIZE,
 						     &vnic->rss_table_dma_addr,
@@ -3562,6 +3566,12 @@ int bnxt_hwrm_vnic_cfg(struct bnxt *bp, u16 vnic_id)
 		req.rss_rule = cpu_to_le16(vnic->fw_rss_cos_lb_ctx[0]);
 		req.enables |= cpu_to_le32(VNIC_CFG_REQ_ENABLES_RSS_RULE |
 					   VNIC_CFG_REQ_ENABLES_MRU);
+	} else if (vnic->flags & BNXT_VNIC_RFS_NEW_RSS_FLAG) {
+		req.rss_rule =
+			cpu_to_le16(bp->vnic_info[0].fw_rss_cos_lb_ctx[0]);
+		req.enables |= cpu_to_le32(VNIC_CFG_REQ_ENABLES_RSS_RULE |
+					   VNIC_CFG_REQ_ENABLES_MRU);
+		req.flags |= cpu_to_le32(VNIC_CFG_REQ_FLAGS_RSS_DFLT_CR_MODE);
 	} else {
 		req.rss_rule = cpu_to_le16(0xffff);
 	}
@@ -4490,8 +4500,12 @@ static void bnxt_hwrm_resource_free(struct bnxt *bp, bool close_path,
 
 static int bnxt_setup_vnic(struct bnxt *bp, u16 vnic_id)
 {
+	struct bnxt_vnic_info *vnic = &bp->vnic_info[vnic_id];
 	int rc;
 
+	if (vnic->flags & BNXT_VNIC_RFS_NEW_RSS_FLAG)
+		goto skip_rss_ctx;
+
 	/* allocate context for vnic */
 	rc = bnxt_hwrm_vnic_ctx_alloc(bp, vnic_id, 0);
 	if (rc) {
@@ -4511,6 +4525,7 @@ static int bnxt_setup_vnic(struct bnxt *bp, u16 vnic_id)
 		bp->rsscos_nr_ctxs++;
 	}
 
+skip_rss_ctx:
 	/* configure default vnic, ring grp */
 	rc = bnxt_hwrm_vnic_cfg(bp, vnic_id);
 	if (rc) {
@@ -4545,13 +4560,17 @@ static int bnxt_alloc_rfs_vnics(struct bnxt *bp)
 	int i, rc = 0;
 
 	for (i = 0; i < bp->rx_nr_rings; i++) {
+		struct bnxt_vnic_info *vnic;
 		u16 vnic_id = i + 1;
 		u16 ring_id = i;
 
 		if (vnic_id >= bp->nr_vnics)
 			break;
 
-		bp->vnic_info[vnic_id].flags |= BNXT_VNIC_RFS_FLAG;
+		vnic = &bp->vnic_info[vnic_id];
+		vnic->flags |= BNXT_VNIC_RFS_FLAG;
+		if (bp->flags & BNXT_FLAG_NEW_RSS_CAP)
+			vnic->flags |= BNXT_VNIC_RFS_NEW_RSS_FLAG;
 		rc = bnxt_hwrm_vnic_alloc(bp, vnic_id, ring_id, 1);
 		if (rc) {
 			netdev_err(bp->dev, "hwrm vnic %d alloc failure rc: %x\n",
@@ -5985,6 +6004,8 @@ static bool bnxt_rfs_supported(struct bnxt *bp)
 {
 	if (BNXT_PF(bp) && !BNXT_CHIP_TYPE_NITRO_A0(bp))
 		return true;
+	if (bp->flags & BNXT_FLAG_NEW_RSS_CAP)
+		return true;
 	return false;
 }
 
@@ -6000,6 +6021,10 @@ static bool bnxt_rfs_capable(struct bnxt *bp)
 	vnics = 1 + bp->rx_nr_rings;
 	max_vnics = bnxt_get_max_func_vnics(bp);
 	max_rss_ctxs = bnxt_get_max_func_rss_ctxs(bp);
+
+	/* RSS contexts not a limiting factor */
+	if (bp->flags & BNXT_FLAG_NEW_RSS_CAP)
+		max_rss_ctxs = max_vnics;
 	if (vnics > max_vnics || vnics > max_rss_ctxs) {
 		netdev_warn(bp->dev,
 			    "Not enough resources to support NTUPLE filters, enough resources for up to %d rx rings\n",
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 80bf1ab..75803e5 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -708,6 +708,7 @@ struct bnxt_vnic_info {
 #define BNXT_VNIC_RFS_FLAG	2
 #define BNXT_VNIC_MCAST_FLAG	4
 #define BNXT_VNIC_UCAST_FLAG	8
+#define BNXT_VNIC_RFS_NEW_RSS_FLAG	0x10
 };
 
 #if defined(CONFIG_BNXT_SRIOV)
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next 07/14] bnxt_en: Refactor code that determines RFS capability.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>

Add function bnxt_rfs_supported() that determines if the chip supports
RFS.  Refactor the existing function bnxt_rfs_capable() that determines
if run-time conditions support RFS.

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 38 +++++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 9168326..f7ea99f 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -4835,6 +4835,24 @@ static int bnxt_setup_int_mode(struct bnxt *bp)
 	return rc;
 }
 
+static unsigned int bnxt_get_max_func_rss_ctxs(struct bnxt *bp)
+{
+#if defined(CONFIG_BNXT_SRIOV)
+	if (BNXT_VF(bp))
+		return bp->vf.max_rsscos_ctxs;
+#endif
+	return bp->pf.max_rsscos_ctxs;
+}
+
+static unsigned int bnxt_get_max_func_vnics(struct bnxt *bp)
+{
+#if defined(CONFIG_BNXT_SRIOV)
+	if (BNXT_VF(bp))
+		return bp->vf.max_vnics;
+#endif
+	return bp->pf.max_vnics;
+}
+
 unsigned int bnxt_get_max_func_stat_ctxs(struct bnxt *bp)
 {
 #if defined(CONFIG_BNXT_SRIOV)
@@ -5962,20 +5980,30 @@ static int bnxt_cfg_rx_mode(struct bnxt *bp)
 	return rc;
 }
 
+/* If the chip and firmware supports RFS */
+static bool bnxt_rfs_supported(struct bnxt *bp)
+{
+	if (BNXT_PF(bp) && !BNXT_CHIP_TYPE_NITRO_A0(bp))
+		return true;
+	return false;
+}
+
+/* If runtime conditions support RFS */
 static bool bnxt_rfs_capable(struct bnxt *bp)
 {
 #ifdef CONFIG_RFS_ACCEL
-	struct bnxt_pf_info *pf = &bp->pf;
-	int vnics;
+	int vnics, max_vnics, max_rss_ctxs;
 
 	if (BNXT_VF(bp) || !(bp->flags & BNXT_FLAG_MSIX_CAP))
 		return false;
 
 	vnics = 1 + bp->rx_nr_rings;
-	if (vnics > pf->max_rsscos_ctxs || vnics > pf->max_vnics) {
+	max_vnics = bnxt_get_max_func_vnics(bp);
+	max_rss_ctxs = bnxt_get_max_func_rss_ctxs(bp);
+	if (vnics > max_vnics || vnics > max_rss_ctxs) {
 		netdev_warn(bp->dev,
 			    "Not enough resources to support NTUPLE filters, enough resources for up to %d rx rings\n",
-			    min(pf->max_rsscos_ctxs - 1, pf->max_vnics - 1));
+			    min(max_rss_ctxs - 1, max_vnics - 1));
 		return false;
 	}
 
@@ -7092,7 +7120,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	}
 
 	bnxt_hwrm_vnic_qcaps(bp);
-	if (BNXT_PF(bp) && !BNXT_CHIP_TYPE_NITRO_A0(bp)) {
+	if (bnxt_rfs_supported(bp)) {
 		dev->hw_features |= NETIF_F_NTUPLE;
 		if (bnxt_rfs_capable(bp)) {
 			bp->flags |= BNXT_FLAG_RFS;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next 06/14] bnxt_en: Add function to get vnic capability.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>

The new vnic RSS capability will enhance NTUPLE support, to be added
in subsequent patches.

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c     | 22 +++++++++++++++++
 drivers/net/ethernet/broadcom/bnxt/bnxt.h     |  1 +
 drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h | 34 +++++++++++++++++++++++++++
 3 files changed, 57 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 0654c3f..9168326 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -3665,6 +3665,27 @@ static int bnxt_hwrm_vnic_alloc(struct bnxt *bp, u16 vnic_id,
 	return rc;
 }
 
+static int bnxt_hwrm_vnic_qcaps(struct bnxt *bp)
+{
+	struct hwrm_vnic_qcaps_output *resp = bp->hwrm_cmd_resp_addr;
+	struct hwrm_vnic_qcaps_input req = {0};
+	int rc;
+
+	if (bp->hwrm_spec_code < 0x10600)
+		return 0;
+
+	bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_VNIC_QCAPS, -1, -1);
+	mutex_lock(&bp->hwrm_cmd_lock);
+	rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+	if (!rc) {
+		if (resp->flags &
+		    cpu_to_le32(VNIC_QCAPS_RESP_FLAGS_RSS_DFLT_CR_CAP))
+			bp->flags |= BNXT_FLAG_NEW_RSS_CAP;
+	}
+	mutex_unlock(&bp->hwrm_cmd_lock);
+	return rc;
+}
+
 static int bnxt_hwrm_ring_grp_alloc(struct bnxt *bp)
 {
 	u16 i;
@@ -7070,6 +7091,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 				    VNIC_RSS_CFG_REQ_HASH_TYPE_UDP_IPV6;
 	}
 
+	bnxt_hwrm_vnic_qcaps(bp);
 	if (BNXT_PF(bp) && !BNXT_CHIP_TYPE_NITRO_A0(bp)) {
 		dev->hw_features |= NETIF_F_NTUPLE;
 		if (bnxt_rfs_capable(bp)) {
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 0eb6401..80bf1ab 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -944,6 +944,7 @@ struct bnxt {
 	#define BNXT_FLAG_PORT_STATS	0x400
 	#define BNXT_FLAG_UDP_RSS_CAP	0x800
 	#define BNXT_FLAG_EEE_CAP	0x1000
+	#define BNXT_FLAG_NEW_RSS_CAP	0x2000
 	#define BNXT_FLAG_ROCEV1_CAP	0x8000
 	#define BNXT_FLAG_ROCEV2_CAP	0x10000
 	#define BNXT_FLAG_ROCE_CAP	(BNXT_FLAG_ROCEV1_CAP |	\
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h
index 2ddfa51..d0d49ed 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h
@@ -2797,6 +2797,40 @@ struct hwrm_vnic_cfg_output {
 	u8 valid;
 };
 
+/* hwrm_vnic_qcaps */
+/* Input (24 bytes) */
+struct hwrm_vnic_qcaps_input {
+	__le16 req_type;
+	__le16 cmpl_ring;
+	__le16 seq_id;
+	__le16 target_id;
+	__le64 resp_addr;
+	__le32 enables;
+	__le32 unused_0;
+};
+
+/* Output (24 bytes) */
+struct hwrm_vnic_qcaps_output {
+	__le16 error_code;
+	__le16 req_type;
+	__le16 seq_id;
+	__le16 resp_len;
+	__le16 mru;
+	u8 unused_0;
+	u8 unused_1;
+	__le32 flags;
+	#define VNIC_QCAPS_RESP_FLAGS_VLAN_STRIP_CAP		    0x2UL
+	#define VNIC_QCAPS_RESP_FLAGS_BD_STALL_CAP		    0x4UL
+	#define VNIC_QCAPS_RESP_FLAGS_ROCE_DUAL_VNIC_CAP	    0x8UL
+	#define VNIC_QCAPS_RESP_FLAGS_ROCE_ONLY_VNIC_CAP	    0x10UL
+	#define VNIC_QCAPS_RESP_FLAGS_RSS_DFLT_CR_CAP		    0x20UL
+	__le32 unused_2;
+	u8 unused_3;
+	u8 unused_4;
+	u8 unused_5;
+	u8 valid;
+};
+
 /* hwrm_vnic_tpa_cfg */
 /* Input (40 bytes) */
 struct hwrm_vnic_tpa_cfg_input {
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next 05/14] bnxt_en: Refactor TPA code path.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>

Call tcp_gro_complete() in the common code path instead of the chip-
specific method.  The newer 5731x method is missing the call.

Signed-off-by: Michael Chan <michael.chan@broadcmo.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 4a8059f..0654c3f 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -1127,7 +1127,6 @@ static struct sk_buff *bnxt_gro_func_5730x(struct bnxt_tpa_info *tpa_info,
 		dev_kfree_skb_any(skb);
 		return NULL;
 	}
-	tcp_gro_complete(skb);
 
 	if (nw_off) { /* tunnel */
 		struct udphdr *uh = NULL;
@@ -1177,6 +1176,8 @@ static inline struct sk_buff *bnxt_gro_skb(struct bnxt *bp,
 		       RX_TPA_END_CMP_PAYLOAD_OFFSET) >>
 		      RX_TPA_END_CMP_PAYLOAD_OFFSET_SHIFT;
 	skb = bp->gro_func(tpa_info, payload_off, TPA_END_GRO_TS(tpa_end), skb);
+	if (likely(skb))
+		tcp_gro_complete(skb);
 #endif
 	return skb;
 }
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next 04/14] bnxt_en: Fix and clarify link_info->advertising.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>

The advertising field is closely related to the auto_link_speeds field.
The former is the user setting while the latter is the firmware setting.
Both should be u16.  We should use the advertising field in
bnxt_get_link_ksettings because the auto_link_speeds field may not
be updated with the latest from the firmware yet.

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c         | 2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h         | 4 ++--
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 5 +++--
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 277573b..4a8059f 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -5363,7 +5363,7 @@ static void bnxt_hwrm_set_link_common(struct bnxt *bp,
 {
 	u8 autoneg = bp->link_info.autoneg;
 	u16 fw_link_speed = bp->link_info.req_link_speed;
-	u32 advertising = bp->link_info.advertising;
+	u16 advertising = bp->link_info.advertising;
 
 	if (autoneg & BNXT_AUTONEG_SPEED) {
 		req->auto_mode |=
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index fddc316..0eb6401 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -828,7 +828,7 @@ struct bnxt_link_info {
 #define BNXT_LINK_SPEED_40GB	PORT_PHY_QCFG_RESP_LINK_SPEED_40GB
 #define BNXT_LINK_SPEED_50GB	PORT_PHY_QCFG_RESP_LINK_SPEED_50GB
 	u16			support_speeds;
-	u16			auto_link_speeds;
+	u16			auto_link_speeds;	/* fw adv setting */
 #define BNXT_LINK_SPEED_MSK_100MB PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_100MB
 #define BNXT_LINK_SPEED_MSK_1GB PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_1GB
 #define BNXT_LINK_SPEED_MSK_2GB PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_2GB
@@ -851,7 +851,7 @@ struct bnxt_link_info {
 	u8			req_duplex;
 	u8			req_flow_ctrl;
 	u16			req_link_speed;
-	u32			advertising;
+	u16			advertising;	/* user adv setting */
 	bool			force_link_chng;
 
 	/* a copy of phy_qcfg output used to report link
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index 784aa77..1cfa7a6 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -893,7 +893,7 @@ u32 _bnxt_fw_to_ethtool_adv_spds(u16 fw_speeds, u8 fw_pause)
 static void bnxt_fw_to_ethtool_advertised_spds(struct bnxt_link_info *link_info,
 				struct ethtool_link_ksettings *lk_ksettings)
 {
-	u16 fw_speeds = link_info->auto_link_speeds;
+	u16 fw_speeds = link_info->advertising;
 	u8 fw_pause = 0;
 
 	if (link_info->autoneg & BNXT_AUTONEG_FLOW_CTRL)
@@ -1090,8 +1090,9 @@ static int bnxt_set_link_ksettings(struct net_device *dev,
 	struct bnxt *bp = netdev_priv(dev);
 	struct bnxt_link_info *link_info = &bp->link_info;
 	const struct ethtool_link_settings *base = &lk_ksettings->base;
-	u32 speed, fw_advertising = 0;
 	bool set_pause = false;
+	u16 fw_advertising = 0;
+	u32 speed;
 	int rc = 0;
 
 	if (!BNXT_SINGLE_PF(bp))
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next 03/14] bnxt_en: Improve the IRQ disable sequence during shutdown.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>

The IRQ is disabled by writing to the completion ring doorbell.  This
should be done before the hardware completion ring is freed for correctness.
The current code disables IRQs after all the completion rings are freed.

Fix it by calling bnxt_disable_int_sync() before freeing the completion
rings.  Rearrange the code to avoid forward declaration.

Signed-off-by: Michael Chan <michael.chan@broadocm.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 89 ++++++++++++++++---------------
 1 file changed, 46 insertions(+), 43 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 3fbc842..277573b 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -2953,6 +2953,45 @@ static int bnxt_alloc_mem(struct bnxt *bp, bool irq_re_init)
 	return rc;
 }
 
+static void bnxt_disable_int(struct bnxt *bp)
+{
+	int i;
+
+	if (!bp->bnapi)
+		return;
+
+	for (i = 0; i < bp->cp_nr_rings; i++) {
+		struct bnxt_napi *bnapi = bp->bnapi[i];
+		struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring;
+
+		BNXT_CP_DB(cpr->cp_doorbell, cpr->cp_raw_cons);
+	}
+}
+
+static void bnxt_disable_int_sync(struct bnxt *bp)
+{
+	int i;
+
+	atomic_inc(&bp->intr_sem);
+
+	bnxt_disable_int(bp);
+	for (i = 0; i < bp->cp_nr_rings; i++)
+		synchronize_irq(bp->irq_tbl[i].vector);
+}
+
+static void bnxt_enable_int(struct bnxt *bp)
+{
+	int i;
+
+	atomic_set(&bp->intr_sem, 0);
+	for (i = 0; i < bp->cp_nr_rings; i++) {
+		struct bnxt_napi *bnapi = bp->bnapi[i];
+		struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring;
+
+		BNXT_CP_DB_REARM(cpr->cp_doorbell, cpr->cp_raw_cons);
+	}
+}
+
 void bnxt_hwrm_cmd_hdr_init(struct bnxt *bp, void *request, u16 req_type,
 			    u16 cmpl_ring, u16 target_id)
 {
@@ -3937,6 +3976,12 @@ static void bnxt_hwrm_ring_free(struct bnxt *bp, bool close_path)
 		}
 	}
 
+	/* The completion rings are about to be freed.  After that the
+	 * IRQ doorbell will not work anymore.  So we need to disable
+	 * IRQ here.
+	 */
+	bnxt_disable_int_sync(bp);
+
 	for (i = 0; i < bp->cp_nr_rings; i++) {
 		struct bnxt_napi *bnapi = bp->bnapi[i];
 		struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring;
@@ -4658,34 +4703,6 @@ static int bnxt_init_nic(struct bnxt *bp, bool irq_re_init)
 	return bnxt_init_chip(bp, irq_re_init);
 }
 
-static void bnxt_disable_int(struct bnxt *bp)
-{
-	int i;
-
-	if (!bp->bnapi)
-		return;
-
-	for (i = 0; i < bp->cp_nr_rings; i++) {
-		struct bnxt_napi *bnapi = bp->bnapi[i];
-		struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring;
-
-		BNXT_CP_DB(cpr->cp_doorbell, cpr->cp_raw_cons);
-	}
-}
-
-static void bnxt_enable_int(struct bnxt *bp)
-{
-	int i;
-
-	atomic_set(&bp->intr_sem, 0);
-	for (i = 0; i < bp->cp_nr_rings; i++) {
-		struct bnxt_napi *bnapi = bp->bnapi[i];
-		struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring;
-
-		BNXT_CP_DB_REARM(cpr->cp_doorbell, cpr->cp_raw_cons);
-	}
-}
-
 static int bnxt_set_real_num_queues(struct bnxt *bp)
 {
 	int rc;
@@ -5640,19 +5657,6 @@ static int bnxt_open(struct net_device *dev)
 	return __bnxt_open_nic(bp, true, true);
 }
 
-static void bnxt_disable_int_sync(struct bnxt *bp)
-{
-	int i;
-
-	atomic_inc(&bp->intr_sem);
-	if (!netif_running(bp->dev))
-		return;
-
-	bnxt_disable_int(bp);
-	for (i = 0; i < bp->cp_nr_rings; i++)
-		synchronize_irq(bp->irq_tbl[i].vector);
-}
-
 int bnxt_close_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init)
 {
 	int rc = 0;
@@ -5674,13 +5678,12 @@ int bnxt_close_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init)
 	while (test_bit(BNXT_STATE_IN_SP_TASK, &bp->state))
 		msleep(20);
 
-	/* Flush rings before disabling interrupts */
+	/* Flush rings and and disable interrupts */
 	bnxt_shutdown_nic(bp, irq_re_init);
 
 	/* TODO CHIMP_FW: Link/PHY related cleanup if (link_re_init) */
 
 	bnxt_disable_napi(bp);
-	bnxt_disable_int_sync(bp);
 	del_timer_sync(&bp->timer);
 	bnxt_free_skbs(bp);
 
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next 01/14] bnxt_en: Remove busy poll logic in the driver.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>

Use native NAPI polling instead.  The next patch will complete the work
by switching to use napi_complete_done()

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 53 +----------------
 drivers/net/ethernet/broadcom/bnxt/bnxt.h | 99 -------------------------------
 2 files changed, 3 insertions(+), 149 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 9608cb4..b53f958 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -39,9 +39,6 @@
 #include <net/checksum.h>
 #include <net/ip6_checksum.h>
 #include <net/udp_tunnel.h>
-#ifdef CONFIG_NET_RX_BUSY_POLL
-#include <net/busy_poll.h>
-#endif
 #include <linux/workqueue.h>
 #include <linux/prefetch.h>
 #include <linux/cache.h>
@@ -1356,11 +1353,7 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_napi *bnapi, u32 *raw_cons,
 		rc = -ENOMEM;
 		if (likely(skb)) {
 			skb_record_rx_queue(skb, bnapi->index);
-			skb_mark_napi_id(skb, &bnapi->napi);
-			if (bnxt_busy_polling(bnapi))
-				netif_receive_skb(skb);
-			else
-				napi_gro_receive(&bnapi->napi, skb);
+			napi_gro_receive(&bnapi->napi, skb);
 			rc = 1;
 		}
 		goto next_rx_no_prod;
@@ -1460,11 +1453,7 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_napi *bnapi, u32 *raw_cons,
 	}
 
 	skb_record_rx_queue(skb, bnapi->index);
-	skb_mark_napi_id(skb, &bnapi->napi);
-	if (bnxt_busy_polling(bnapi))
-		netif_receive_skb(skb);
-	else
-		napi_gro_receive(&bnapi->napi, skb);
+	napi_gro_receive(&bnapi->napi, skb);
 	rc = 1;
 
 next_rx:
@@ -1782,9 +1771,6 @@ static int bnxt_poll(struct napi_struct *napi, int budget)
 	struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring;
 	int work_done = 0;
 
-	if (!bnxt_lock_napi(bnapi))
-		return budget;
-
 	while (1) {
 		work_done += bnxt_poll_work(bp, bnapi, budget - work_done);
 
@@ -1798,36 +1784,9 @@ static int bnxt_poll(struct napi_struct *napi, int budget)
 		}
 	}
 	mmiowb();
-	bnxt_unlock_napi(bnapi);
 	return work_done;
 }
 
-#ifdef CONFIG_NET_RX_BUSY_POLL
-static int bnxt_busy_poll(struct napi_struct *napi)
-{
-	struct bnxt_napi *bnapi = container_of(napi, struct bnxt_napi, napi);
-	struct bnxt *bp = bnapi->bp;
-	struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring;
-	int rx_work, budget = 4;
-
-	if (atomic_read(&bp->intr_sem) != 0)
-		return LL_FLUSH_FAILED;
-
-	if (!bp->link_info.link_up)
-		return LL_FLUSH_FAILED;
-
-	if (!bnxt_lock_poll(bnapi))
-		return LL_FLUSH_BUSY;
-
-	rx_work = bnxt_poll_work(bp, bnapi, budget);
-
-	BNXT_CP_DB_REARM(cpr->cp_doorbell, cpr->cp_raw_cons);
-
-	bnxt_unlock_poll(bnapi);
-	return rx_work;
-}
-#endif
-
 static void bnxt_free_tx_skbs(struct bnxt *bp)
 {
 	int i, max_idx;
@@ -5094,10 +5053,8 @@ static void bnxt_disable_napi(struct bnxt *bp)
 	if (!bp->bnapi)
 		return;
 
-	for (i = 0; i < bp->cp_nr_rings; i++) {
+	for (i = 0; i < bp->cp_nr_rings; i++)
 		napi_disable(&bp->bnapi[i]->napi);
-		bnxt_disable_poll(bp->bnapi[i]);
-	}
 }
 
 static void bnxt_enable_napi(struct bnxt *bp)
@@ -5106,7 +5063,6 @@ static void bnxt_enable_napi(struct bnxt *bp)
 
 	for (i = 0; i < bp->cp_nr_rings; i++) {
 		bp->bnapi[i]->in_reset = false;
-		bnxt_enable_poll(bp->bnapi[i]);
 		napi_enable(&bp->bnapi[i]->napi);
 	}
 }
@@ -6765,9 +6721,6 @@ static void bnxt_udp_tunnel_del(struct net_device *dev,
 #endif
 	.ndo_udp_tunnel_add	= bnxt_udp_tunnel_add,
 	.ndo_udp_tunnel_del	= bnxt_udp_tunnel_del,
-#ifdef CONFIG_NET_RX_BUSY_POLL
-	.ndo_busy_poll		= bnxt_busy_poll,
-#endif
 };
 
 static void bnxt_remove_one(struct pci_dev *pdev)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 16defe9..fddc316 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -654,21 +654,9 @@ struct bnxt_napi {
 	struct bnxt_rx_ring_info	*rx_ring;
 	struct bnxt_tx_ring_info	*tx_ring;
 
-#ifdef CONFIG_NET_RX_BUSY_POLL
-	atomic_t		poll_state;
-#endif
 	bool			in_reset;
 };
 
-#ifdef CONFIG_NET_RX_BUSY_POLL
-enum bnxt_poll_state_t {
-	BNXT_STATE_IDLE = 0,
-	BNXT_STATE_NAPI,
-	BNXT_STATE_POLL,
-	BNXT_STATE_DISABLE,
-};
-#endif
-
 struct bnxt_irq {
 	irq_handler_t	handler;
 	unsigned int	vector;
@@ -1141,93 +1129,6 @@ struct bnxt {
 	((offsetof(struct tx_port_stats, counter) +	\
 	  sizeof(struct rx_port_stats) + 512) / 8)
 
-#ifdef CONFIG_NET_RX_BUSY_POLL
-static inline void bnxt_enable_poll(struct bnxt_napi *bnapi)
-{
-	atomic_set(&bnapi->poll_state, BNXT_STATE_IDLE);
-}
-
-/* called from the NAPI poll routine to get ownership of a bnapi */
-static inline bool bnxt_lock_napi(struct bnxt_napi *bnapi)
-{
-	int rc = atomic_cmpxchg(&bnapi->poll_state, BNXT_STATE_IDLE,
-				BNXT_STATE_NAPI);
-
-	return rc == BNXT_STATE_IDLE;
-}
-
-static inline void bnxt_unlock_napi(struct bnxt_napi *bnapi)
-{
-	atomic_set(&bnapi->poll_state, BNXT_STATE_IDLE);
-}
-
-/* called from the busy poll routine to get ownership of a bnapi */
-static inline bool bnxt_lock_poll(struct bnxt_napi *bnapi)
-{
-	int rc = atomic_cmpxchg(&bnapi->poll_state, BNXT_STATE_IDLE,
-				BNXT_STATE_POLL);
-
-	return rc == BNXT_STATE_IDLE;
-}
-
-static inline void bnxt_unlock_poll(struct bnxt_napi *bnapi)
-{
-	atomic_set(&bnapi->poll_state, BNXT_STATE_IDLE);
-}
-
-static inline bool bnxt_busy_polling(struct bnxt_napi *bnapi)
-{
-	return atomic_read(&bnapi->poll_state) == BNXT_STATE_POLL;
-}
-
-static inline void bnxt_disable_poll(struct bnxt_napi *bnapi)
-{
-	int old;
-
-	while (1) {
-		old = atomic_cmpxchg(&bnapi->poll_state, BNXT_STATE_IDLE,
-				     BNXT_STATE_DISABLE);
-		if (old == BNXT_STATE_IDLE)
-			break;
-		usleep_range(500, 5000);
-	}
-}
-
-#else
-
-static inline void bnxt_enable_poll(struct bnxt_napi *bnapi)
-{
-}
-
-static inline bool bnxt_lock_napi(struct bnxt_napi *bnapi)
-{
-	return true;
-}
-
-static inline void bnxt_unlock_napi(struct bnxt_napi *bnapi)
-{
-}
-
-static inline bool bnxt_lock_poll(struct bnxt_napi *bnapi)
-{
-	return false;
-}
-
-static inline void bnxt_unlock_poll(struct bnxt_napi *bnapi)
-{
-}
-
-static inline bool bnxt_busy_polling(struct bnxt_napi *bnapi)
-{
-	return false;
-}
-
-static inline void bnxt_disable_poll(struct bnxt_napi *bnapi)
-{
-}
-
-#endif
-
 #define I2C_DEV_ADDR_A0				0xa0
 #define I2C_DEV_ADDR_A2				0xa2
 #define SFP_EEPROM_SFF_8472_COMP_ADDR		0x5e
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next 02/14] bnxt_en: Use napi_complete_done()
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>

For better busy polling and GRO support.  Do not re-arm IRQ if
napi_complete_done() returns false.

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index b53f958..3fbc842 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -1778,8 +1778,9 @@ static int bnxt_poll(struct napi_struct *napi, int budget)
 			break;
 
 		if (!bnxt_has_work(bp, cpr)) {
-			napi_complete(napi);
-			BNXT_CP_DB_REARM(cpr->cp_doorbell, cpr->cp_raw_cons);
+			if (napi_complete_done(napi, work_done))
+				BNXT_CP_DB_REARM(cpr->cp_doorbell,
+						 cpr->cp_raw_cons);
 			break;
 		}
 	}
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next 00/14] bnxt_en: updates for net-next.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
  To: davem; +Cc: netdev

This patch series for net-next contains cleanups, new features and minor
fixes.  The driver specific busy polling code is removed to use busy
polling support in core networking.  Hardware RFS support is enhanced with
added ipv6 flows support and VF support.  A new scheme to allocate TX
rings from the firmware is implemented for newer chips and firmware.  Plus
some misc. cleanups, minor fixes, and to add the maintainer entry.  Please
review.

Michael Chan (14):
  bnxt_en: Remove busy poll logic in the driver.
  bnxt_en: Use napi_complete_done()
  bnxt_en: Improve the IRQ disable sequence during shutdown.
  bnxt_en: Fix and clarify link_info->advertising.
  bnxt_en: Refactor TPA code path.
  bnxt_en: Add function to get vnic capability.
  bnxt_en: Refactor code that determines RFS capability.
  bnxt_en: Add new hardware RFS mode.
  bnxt_en: Assign additional vnics to VFs.
  bnxt_en: Add IPV6 hardware RFS support.
  bnxt_en: Implement new scheme to reserve tx rings.
  bnxt_en: Set default completion ring for async events.
  bnxt_en: Handle no aggregation ring gracefully.
  MAINTAINERS: Add bnxt_en maintainer info.

 MAINTAINERS                                       |   6 +
 drivers/net/ethernet/broadcom/bnxt/bnxt.c         | 385 +++++++++++++++-------
 drivers/net/ethernet/broadcom/bnxt/bnxt.h         | 108 +-----
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c |  73 +++-
 drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h     |  34 ++
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c   |  15 +-
 6 files changed, 388 insertions(+), 233 deletions(-)

-- 
1.8.3.1

^ permalink raw reply

* [PATCH] stmmac: adding EEE to GMAC4
From: Joao Pinto @ 2016-12-29 17:10 UTC (permalink / raw)
  To: davem
  Cc: peppe.cavallaro, alexandre.torgue, rayagond, linux-kernel, netdev,
	Joao Pinto

This patch adds Energy Efficiency Ethernet to GMAC4.

Signed-off-by: Joao Pinto <jpinto@synopsys.com>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac4.h      | 12 +++++
 drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 59 +++++++++++++++++++++++
 2 files changed, 71 insertions(+)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
index b524598..73d1dab 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
@@ -90,6 +90,18 @@ enum power_event {
 	power_down = 0x00000001,
 };
 
+/* Energy Efficient Ethernet (EEE) for GMAC4
+ *
+ * LPI status, timer and control register offset
+ */
+#define GMAC4_LPI_CTRL_STATUS	0xd0
+#define GMAC4_LPI_TIMER_CTRL	0xd4
+
+/* LPI control and status defines */
+#define GMAC4_LPI_CTRL_STATUS_LPITXA	BIT(19)	/* Enable LPI TX Automate */
+#define GMAC4_LPI_CTRL_STATUS_PLS	BIT(17) /* PHY Link Status */
+#define GMAC4_LPI_CTRL_STATUS_LPIEN	BIT(16)	/* LPI Enable */
+
 /* MAC Debug bitmap */
 #define GMAC_DEBUG_TFCSTS_MASK		GENMASK(18, 17)
 #define GMAC_DEBUG_TFCSTS_SHIFT		17
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
index ecfbf57..02eab79 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
@@ -137,6 +137,61 @@ static void dwmac4_get_umac_addr(struct mac_device_info *hw,
 				   GMAC_ADDR_LOW(reg_n));
 }
 
+static void dwmac4_set_eee_mode(struct mac_device_info *hw)
+{
+	void __iomem *ioaddr = hw->pcsr;
+	u32 value;
+
+	/* Enable the link status receive on RGMII, SGMII ore SMII
+	 * receive path and instruct the transmit to enter in LPI
+	 * state.
+	 */
+	value = readl(ioaddr + GMAC4_LPI_CTRL_STATUS);
+	value |= GMAC4_LPI_CTRL_STATUS_LPIEN | GMAC4_LPI_CTRL_STATUS_LPITXA;
+
+	writel(value, ioaddr + GMAC4_LPI_CTRL_STATUS);
+}
+
+static void dwmac4_reset_eee_mode(struct mac_device_info *hw)
+{
+	void __iomem *ioaddr = hw->pcsr;
+	u32 value;
+
+	value = readl(ioaddr + GMAC4_LPI_CTRL_STATUS);
+	value &= ~(GMAC4_LPI_CTRL_STATUS_LPIEN | GMAC4_LPI_CTRL_STATUS_LPITXA);
+	writel(value, ioaddr + GMAC4_LPI_CTRL_STATUS);
+}
+
+static void dwmac4_set_eee_pls(struct mac_device_info *hw, int link)
+{
+	void __iomem *ioaddr = hw->pcsr;
+	u32 value;
+
+	value = readl(ioaddr + GMAC4_LPI_CTRL_STATUS);
+
+	if (link)
+		value |= GMAC4_LPI_CTRL_STATUS_PLS;
+	else
+		value &= ~GMAC4_LPI_CTRL_STATUS_PLS;
+
+	writel(value, ioaddr + GMAC4_LPI_CTRL_STATUS);
+}
+
+static void dwmac4_set_eee_timer(struct mac_device_info *hw, int ls, int tw)
+{
+	void __iomem *ioaddr = hw->pcsr;
+	int value = ((tw & 0xffff)) | ((ls & 0x7ff) << 16);
+
+	/* Program the timers in the LPI timer control register:
+	 * LS: minimum time (ms) for which the link
+	 *  status from PHY should be ok before transmitting
+	 *  the LPI pattern.
+	 * TW: minimum time (us) for which the core waits
+	 *  after it has stopped transmitting the LPI pattern.
+	 */
+	writel(value, ioaddr + GMAC4_LPI_TIMER_CTRL);
+}
+
 static void dwmac4_set_filter(struct mac_device_info *hw,
 			      struct net_device *dev)
 {
@@ -410,6 +465,10 @@ static const struct stmmac_ops dwmac4_ops = {
 	.pmt = dwmac4_pmt,
 	.set_umac_addr = dwmac4_set_umac_addr,
 	.get_umac_addr = dwmac4_get_umac_addr,
+	.set_eee_mode = dwmac4_set_eee_mode,
+	.reset_eee_mode = dwmac4_reset_eee_mode,
+	.set_eee_timer = dwmac4_set_eee_timer,
+	.set_eee_pls = dwmac4_set_eee_pls,
 	.pcs_ctrl_ane = dwmac4_ctrl_ane,
 	.pcs_rane = dwmac4_rane,
 	.pcs_get_adv_lp = dwmac4_get_adv_lp,
-- 
2.9.3

^ permalink raw reply related

* Re: [PATCH net] net: stmmac: Fix error path after register_netdev move
From: David Miller @ 2016-12-29 17:08 UTC (permalink / raw)
  To: f.fainelli
  Cc: netdev, pavel, Joao.Pinto, seraphin.bonnaffe, alexandre.torgue,
	manabian, niklas.cassel, johan, boon.leong.ong, weifeng.voon,
	lars.persson, linux-kernel, peppe.cavallaro, alexandre.torgue
In-Reply-To: <20161228234441.5026-1-f.fainelli@gmail.com>

From: Florian Fainelli <f.fainelli@gmail.com>
Date: Wed, 28 Dec 2016 15:44:41 -0800

> Commit 5701659004d6 ("net: stmmac: Fix race between stmmac_drv_probe and
> stmmac_open") re-ordered how the MDIO bus registration and the network
> device are registered, but missed to unwind the MDIO bus registration in
> case we fail to register the network device.
> 
> Fixes: 5701659004d6 ("net: stmmac: Fix race between stmmac_drv_probe and stmmac_open")
> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>

Applied, thanks Florian.

^ permalink raw reply

* Re: [PATCH] drivers: atm: eni: rename macro DAUGTHER_ID to fix spelling mistake
From: David Miller @ 2016-12-29 17:07 UTC (permalink / raw)
  To: colin.king; +Cc: 3chas3, linux-atm-general, netdev, linux-kernel
In-Reply-To: <20161228173120.20207-1-colin.king@canonical.com>

From: Colin King <colin.king@canonical.com>
Date: Wed, 28 Dec 2016 17:31:20 +0000

> From: Colin Ian King <colin.king@canonical.com>
> 
> Rename DAUGTHER_ID to DAUGHTER_ID to fix spelling mistake
> 
> Signed-off-by: Colin Ian King <colin.king@canonical.com>

Applied.

^ permalink raw reply

* Re: ipv6: remove unnecessary inet6_sk check
From: David Miller @ 2016-12-29 17:06 UTC (permalink / raw)
  To: davej; +Cc: netdev
In-Reply-To: <20161228165318.5afrtlbulsgymc4c@codemonkey.org.uk>

From: Dave Jones <davej@codemonkey.org.uk>
Date: Wed, 28 Dec 2016 11:53:18 -0500

> np is already assigned in the variable declaration of ping_v6_sendmsg.
> At this point, we have already dereferenced np several times, so the
> NULL check is also redundant.
> 
> Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
> Signed-off-by: Dave Jones <davej@codemonkey.org.uk>

Applied, thanks Dave.

^ permalink raw reply

* Re: [PATCHv2 net-next 00/11] net: mvpp2: misc improvements and preparation patches
From: David Miller @ 2016-12-29 17:03 UTC (permalink / raw)
  To: thomas.petazzoni
  Cc: netdev, jason, andrew, sebastian.hesselbarth, gregory.clement,
	nadavh, hannah, yehuday, linux-arm-kernel, stefanc, mw
In-Reply-To: <1482943567-12483-1-git-send-email-thomas.petazzoni@free-electrons.com>

From: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Date: Wed, 28 Dec 2016 17:45:56 +0100

> This series contains a number of misc improvements and preparation
> patches for an upcoming series that adds support for the new PPv2.2
> network controller to the mvpp2 driver.
> 
> The most significant improvements are:
> 
>  - Switching to using build_skb(), which is necessary for the upcoming
>    PPv2.2 support, but anyway a good improvement to the current mvpp2
>    driver (supporting PPv2.1).
> 
>  - Making the driver build on 64-bit platforms.
> 
> Changes since v1:
> 
>  - This series is split as a separate series from the larger patch set
>    adding support for PPv2.2 in the mvpp2 driver, as requested by
>    David Miller.
> 
>  - Rebased on top of v4.10-rc1.

You still have warnings to fix for the 64-bit build:

drivers/net/ethernet/marvell/mvpp2.c: In function ‘mvpp2_rx’:
drivers/net/ethernet/marvell/mvpp2.c:5125:10: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
   data = (void *)rx_desc->buf_cookie;
          ^

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox