Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next v2] net: core: rework basic flow dissection helper
From: Paolo Abeni @ 2018-05-04  9:32 UTC (permalink / raw)
  To: netdev; +Cc: David S. Miller, Eric Dumazet, Jason Wang

When the core networking needs to detect the transport offset in a given
packet and parse it explicitly, a full-blown flow_keys struct is used for
storage.
This patch introduces a smaller keys store, reworks the basic flow dissect
helper to use it, and applies this new helper where possible - namely in
skb_probe_transport_header(). The flow dissector data structures used
are renamed to match their new role more closely.

The above gives a ~50% performance improvement in micro benchmarking around
skb_probe_transport_header() and ~30% around eth_get_headlen(), mostly due
to the smaller memset. A small but measurable improvement is also seen
in macro benchmarking.

v1 -> v2: use the new helper in eth_get_headlen() and skb_get_poff(),
  as per DaveM suggestion

Suggested-by: David Miller <davem@davemloft.net>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/skbuff.h       | 18 ++++++++++--------
 include/net/flow_dissector.h |  7 ++++++-
 net/core/flow_dissector.c    | 17 +++++++++--------
 net/ethernet/eth.c           |  6 +++---
 4 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 908d66e55b14..693564a9a979 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1171,7 +1171,7 @@ void __skb_get_hash(struct sk_buff *skb);
 u32 __skb_get_hash_symmetric(const struct sk_buff *skb);
 u32 skb_get_poff(const struct sk_buff *skb);
 u32 __skb_get_poff(const struct sk_buff *skb, void *data,
-		   const struct flow_keys *keys, int hlen);
+		   const struct flow_keys_basic *keys, int hlen);
 __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
 			    void *data, int hlen_proto);
 
@@ -1208,13 +1208,14 @@ static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb,
 				  NULL, 0, 0, 0, flags);
 }
 
-static inline bool skb_flow_dissect_flow_keys_buf(struct flow_keys *flow,
-						  void *data, __be16 proto,
-						  int nhoff, int hlen,
-						  unsigned int flags)
+static inline bool
+skb_flow_dissect_flow_keys_basic(const struct sk_buff *skb,
+				 struct flow_keys_basic *flow, void *data,
+				 __be16 proto, int nhoff, int hlen,
+				 unsigned int flags)
 {
 	memset(flow, 0, sizeof(*flow));
-	return __skb_flow_dissect(NULL, &flow_keys_buf_dissector, flow,
+	return __skb_flow_dissect(skb, &flow_keys_basic_dissector, flow,
 				  data, proto, nhoff, hlen, flags);
 }
 
@@ -2350,11 +2351,12 @@ static inline void skb_pop_mac_header(struct sk_buff *skb)
 static inline void skb_probe_transport_header(struct sk_buff *skb,
 					      const int offset_hint)
 {
-	struct flow_keys keys;
+	struct flow_keys_basic keys;
 
 	if (skb_transport_header_was_set(skb))
 		return;
-	else if (skb_flow_dissect_flow_keys(skb, &keys, 0))
+
+	if (skb_flow_dissect_flow_keys_basic(skb, &keys, 0, 0, 0, 0, 0))
 		skb_set_transport_header(skb, keys.control.thoff);
 	else
 		skb_set_transport_header(skb, offset_hint);
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 9a074776f70b..e2f6e5c928bb 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -226,6 +226,11 @@ struct flow_dissector {
 	unsigned short int offset[FLOW_DISSECTOR_KEY_MAX];
 };
 
+struct flow_keys_basic {
+	struct flow_dissector_key_control control;
+	struct flow_dissector_key_basic basic;
+};
+
 struct flow_keys {
 	struct flow_dissector_key_control control;
 #define FLOW_KEYS_HASH_START_FIELD basic
@@ -244,7 +249,7 @@ __be32 flow_get_u32_src(const struct flow_keys *flow);
 __be32 flow_get_u32_dst(const struct flow_keys *flow);
 
 extern struct flow_dissector flow_keys_dissector;
-extern struct flow_dissector flow_keys_buf_dissector;
+extern struct flow_dissector flow_keys_basic_dissector;
 
 /* struct flow_keys_digest:
  *
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index d29f09bc5ff9..030d4ca177fb 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1253,7 +1253,7 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb)
 EXPORT_SYMBOL(skb_get_hash_perturb);
 
 u32 __skb_get_poff(const struct sk_buff *skb, void *data,
-		   const struct flow_keys *keys, int hlen)
+		   const struct flow_keys_basic *keys, int hlen)
 {
 	u32 poff = keys->control.thoff;
 
@@ -1314,9 +1314,9 @@ u32 __skb_get_poff(const struct sk_buff *skb, void *data,
  */
 u32 skb_get_poff(const struct sk_buff *skb)
 {
-	struct flow_keys keys;
+	struct flow_keys_basic keys;
 
-	if (!skb_flow_dissect_flow_keys(skb, &keys, 0))
+	if (!skb_flow_dissect_flow_keys_basic(skb, &keys, 0, 0, 0, 0, 0))
 		return 0;
 
 	return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
@@ -1403,7 +1403,7 @@ static const struct flow_dissector_key flow_keys_dissector_symmetric_keys[] = {
 	},
 };
 
-static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = {
+static const struct flow_dissector_key flow_keys_basic_dissector_keys[] = {
 	{
 		.key_id = FLOW_DISSECTOR_KEY_CONTROL,
 		.offset = offsetof(struct flow_keys, control),
@@ -1417,7 +1417,8 @@ static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = {
 struct flow_dissector flow_keys_dissector __read_mostly;
 EXPORT_SYMBOL(flow_keys_dissector);
 
-struct flow_dissector flow_keys_buf_dissector __read_mostly;
+struct flow_dissector flow_keys_basic_dissector __read_mostly;
+EXPORT_SYMBOL(flow_keys_basic_dissector);
 
 static int __init init_default_flow_dissectors(void)
 {
@@ -1427,9 +1428,9 @@ static int __init init_default_flow_dissectors(void)
 	skb_flow_dissector_init(&flow_keys_dissector_symmetric,
 				flow_keys_dissector_symmetric_keys,
 				ARRAY_SIZE(flow_keys_dissector_symmetric_keys));
-	skb_flow_dissector_init(&flow_keys_buf_dissector,
-				flow_keys_buf_dissector_keys,
-				ARRAY_SIZE(flow_keys_buf_dissector_keys));
+	skb_flow_dissector_init(&flow_keys_basic_dissector,
+				flow_keys_basic_dissector_keys,
+				ARRAY_SIZE(flow_keys_basic_dissector_keys));
 	return 0;
 }
 
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index eaeba9b99a73..ee28440f57c5 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -128,15 +128,15 @@ u32 eth_get_headlen(void *data, unsigned int len)
 {
 	const unsigned int flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG;
 	const struct ethhdr *eth = (const struct ethhdr *)data;
-	struct flow_keys keys;
+	struct flow_keys_basic keys;
 
 	/* this should never happen, but better safe than sorry */
 	if (unlikely(len < sizeof(*eth)))
 		return len;
 
 	/* parse any remaining L2/L3 headers, check for L4 */
-	if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto,
-					    sizeof(*eth), len, flags))
+	if (!skb_flow_dissect_flow_keys_basic(NULL, &keys, data, eth->h_proto,
+					      sizeof(*eth), len, flags))
 		return max_t(u32, keys.control.thoff, sizeof(*eth));
 
 	/* parse for any L4 headers */
-- 
2.14.3

^ permalink raw reply related

* Re: [PATCH ipsec-next] xfrm: use a dedicated slab cache for struct xfrm_state
From: Steffen Klassert @ 2018-05-04  9:31 UTC (permalink / raw)
  To: Mathias Krause; +Cc: Herbert Xu, David S. Miller, netdev
In-Reply-To: <1525337707-5281-1-git-send-email-minipli@googlemail.com>

On Thu, May 03, 2018 at 10:55:07AM +0200, Mathias Krause wrote:
> struct xfrm_state is rather large (768 bytes here) and therefore wastes
> quite a lot of memory as it falls into the kmalloc-1024 slab cache,
> leaving 256 bytes of unused memory per XFRM state object -- a net waste
> of 25%.
> 
> Using a dedicated slab cache for struct xfrm_state reduces the level of
> internal fragmentation to a minimum.
> 
> On my configuration SLUB chooses to create a slab cache covering 4
> pages holding 21 objects, resulting in an average memory waste of ~13
> bytes per object -- a net waste of only 1.6%.
> 
> In my tests this led to memory savings of roughly 2.3MB for 10k XFRM
> states.
> 
> Signed-off-by: Mathias Krause <minipli@googlemail.com>

Applied, thanks Mathias!

^ permalink raw reply

* [PATCH] selftests: net: add udpgso* to TEST_GEN_FILES
From: Anders Roxell @ 2018-05-04  9:17 UTC (permalink / raw)
  To: davem, shuah; +Cc: netdev, linux-kselftest, linux-kernel, Anders Roxell

The generated files udpgso* shouldn't be part of TEST_PROGS, they are
used by udpgso.sh and udpgso_bench.sh. They should be added to
TEST_GEN_FILES to get installed without being added to the main
run_kselftest.sh script.

Fixes: 3a687bef148d ("selftests: udp gso benchmark")
Signed-off-by: Anders Roxell <anders.roxell@linaro.org>
---
 tools/testing/selftests/net/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 44895de1a0c4..f0363387ef2f 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -11,9 +11,9 @@ TEST_GEN_PROGS_EXTENDED := in_netns.sh
 TEST_GEN_FILES =  socket
 TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy
 TEST_GEN_FILES += tcp_mmap tcp_inq
+TEST_GEN_FILES += udpgso udpgso_bench_tx udpgso_bench_rx
 TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
 TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict
-TEST_GEN_PROGS += udpgso udpgso_bench_tx udpgso_bench_rx
 
 include ../lib.mk
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH net] sctp: delay the authentication for the duplicated cookie-echo chunk
From: Xin Long @ 2018-05-04  9:05 UTC (permalink / raw)
  To: network dev, linux-sctp; +Cc: davem, Marcelo Ricardo Leitner, Neil Horman

Now sctp only delays the authentication for the normal cookie-echo
chunk by setting chunk->auth_chunk in sctp_endpoint_bh_rcv(). But
for the duplicated one with auth, in sctp_assoc_bh_rcv(), it does
authentication first based on the old asoc, which will definitely
fail due to the different auth info in the old asoc.

The duplicated cookie-echo chunk will create a new asoc with the
auth info from this chunk, and the authentication should also be
done with the new asoc's auth info for all of the collision 'A',
'B' and 'D'. Otherwise, the duplicated cookie-echo chunk with auth
will never pass the authentication and create the new connection.

This issue has existed since the very beginning, and this fix makes
sctp_assoc_bh_rcv() follow what sctp_endpoint_bh_rcv() does for
the normal cookie-echo chunk to delay the authentication.

While at it, remove the unused params from sctp_sf_authenticate()
and define sctp_auth_chunk_verify() used for all the places that
do the delayed authentication.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 net/sctp/associola.c    | 30 ++++++++++++++++-
 net/sctp/sm_statefuns.c | 86 ++++++++++++++++++++++++++-----------------------
 2 files changed, 75 insertions(+), 41 deletions(-)

diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 837806d..a47179d 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1024,8 +1024,9 @@ static void sctp_assoc_bh_rcv(struct work_struct *work)
 	struct sctp_endpoint *ep;
 	struct sctp_chunk *chunk;
 	struct sctp_inq *inqueue;
-	int state;
+	int first_time = 1;	/* is this the first time through the loop */
 	int error = 0;
+	int state;
 
 	/* The association should be held so we should be safe. */
 	ep = asoc->ep;
@@ -1036,6 +1037,30 @@ static void sctp_assoc_bh_rcv(struct work_struct *work)
 		state = asoc->state;
 		subtype = SCTP_ST_CHUNK(chunk->chunk_hdr->type);
 
+		/* If the first chunk in the packet is AUTH, do special
+		 * processing specified in Section 6.3 of SCTP-AUTH spec
+		 */
+		if (first_time && subtype.chunk == SCTP_CID_AUTH) {
+			struct sctp_chunkhdr *next_hdr;
+
+			next_hdr = sctp_inq_peek(inqueue);
+			if (!next_hdr)
+				goto normal;
+
+			/* If the next chunk is COOKIE-ECHO, skip the AUTH
+			 * chunk while saving a pointer to it so we can do
+			 * Authentication later (during cookie-echo
+			 * processing).
+			 */
+			if (next_hdr->type == SCTP_CID_COOKIE_ECHO) {
+				chunk->auth_chunk = skb_clone(chunk->skb,
+							      GFP_ATOMIC);
+				chunk->auth = 1;
+				continue;
+			}
+		}
+
+normal:
 		/* SCTP-AUTH, Section 6.3:
 		 *    The receiver has a list of chunk types which it expects
 		 *    to be received only after an AUTH-chunk.  This list has
@@ -1074,6 +1099,9 @@ static void sctp_assoc_bh_rcv(struct work_struct *work)
 		/* If there is an error on chunk, discard this packet. */
 		if (error && chunk)
 			chunk->pdiscard = 1;
+
+		if (first_time)
+			first_time = 0;
 	}
 	sctp_association_put(asoc);
 }
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 28c070e..c9ae340 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -153,10 +153,7 @@ static enum sctp_disposition sctp_sf_violation_chunk(
 					struct sctp_cmd_seq *commands);
 
 static enum sctp_ierror sctp_sf_authenticate(
-					struct net *net,
-					const struct sctp_endpoint *ep,
 					const struct sctp_association *asoc,
-					const union sctp_subtype type,
 					struct sctp_chunk *chunk);
 
 static enum sctp_disposition __sctp_sf_do_9_1_abort(
@@ -626,6 +623,38 @@ enum sctp_disposition sctp_sf_do_5_1C_ack(struct net *net,
 	return SCTP_DISPOSITION_CONSUME;
 }
 
+static bool sctp_auth_chunk_verify(struct net *net, struct sctp_chunk *chunk,
+				   const struct sctp_association *asoc)
+{
+	struct sctp_chunk auth;
+
+	if (!chunk->auth_chunk)
+		return true;
+
+	/* SCTP-AUTH:  auth_chunk pointer is only set when the cookie-echo
+	 * is supposed to be authenticated and we have to do delayed
+	 * authentication.  We've just recreated the association using
+	 * the information in the cookie and now it's much easier to
+	 * do the authentication.
+	 */
+
+	/* Make sure that we and the peer are AUTH capable */
+	if (!net->sctp.auth_enable || !asoc->peer.auth_capable)
+		return false;
+
+	/* set-up our fake chunk so that we can process it */
+	auth.skb = chunk->auth_chunk;
+	auth.asoc = chunk->asoc;
+	auth.sctp_hdr = chunk->sctp_hdr;
+	auth.chunk_hdr = (struct sctp_chunkhdr *)
+				skb_push(chunk->auth_chunk,
+					 sizeof(struct sctp_chunkhdr));
+	skb_pull(chunk->auth_chunk, sizeof(struct sctp_chunkhdr));
+	auth.transport = chunk->transport;
+
+	return sctp_sf_authenticate(asoc, &auth) == SCTP_IERROR_NO_ERROR;
+}
+
 /*
  * Respond to a normal COOKIE ECHO chunk.
  * We are the side that is being asked for an association.
@@ -763,37 +792,9 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net,
 	if (error)
 		goto nomem_init;
 
-	/* SCTP-AUTH:  auth_chunk pointer is only set when the cookie-echo
-	 * is supposed to be authenticated and we have to do delayed
-	 * authentication.  We've just recreated the association using
-	 * the information in the cookie and now it's much easier to
-	 * do the authentication.
-	 */
-	if (chunk->auth_chunk) {
-		struct sctp_chunk auth;
-		enum sctp_ierror ret;
-
-		/* Make sure that we and the peer are AUTH capable */
-		if (!net->sctp.auth_enable || !new_asoc->peer.auth_capable) {
-			sctp_association_free(new_asoc);
-			return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
-		}
-
-		/* set-up our fake chunk so that we can process it */
-		auth.skb = chunk->auth_chunk;
-		auth.asoc = chunk->asoc;
-		auth.sctp_hdr = chunk->sctp_hdr;
-		auth.chunk_hdr = (struct sctp_chunkhdr *)
-					skb_push(chunk->auth_chunk,
-						 sizeof(struct sctp_chunkhdr));
-		skb_pull(chunk->auth_chunk, sizeof(struct sctp_chunkhdr));
-		auth.transport = chunk->transport;
-
-		ret = sctp_sf_authenticate(net, ep, new_asoc, type, &auth);
-		if (ret != SCTP_IERROR_NO_ERROR) {
-			sctp_association_free(new_asoc);
-			return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
-		}
+	if (!sctp_auth_chunk_verify(net, chunk, new_asoc)) {
+		sctp_association_free(new_asoc);
+		return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
 	}
 
 	repl = sctp_make_cookie_ack(new_asoc, chunk);
@@ -1797,13 +1798,15 @@ static enum sctp_disposition sctp_sf_do_dupcook_a(
 	if (sctp_auth_asoc_init_active_key(new_asoc, GFP_ATOMIC))
 		goto nomem;
 
+	if (!sctp_auth_chunk_verify(net, chunk, new_asoc))
+		return SCTP_DISPOSITION_DISCARD;
+
 	/* Make sure no new addresses are being added during the
 	 * restart.  Though this is a pretty complicated attack
 	 * since you'd have to get inside the cookie.
 	 */
-	if (!sctp_sf_check_restart_addrs(new_asoc, asoc, chunk, commands)) {
+	if (!sctp_sf_check_restart_addrs(new_asoc, asoc, chunk, commands))
 		return SCTP_DISPOSITION_CONSUME;
-	}
 
 	/* If the endpoint is in the SHUTDOWN-ACK-SENT state and recognizes
 	 * the peer has restarted (Action A), it MUST NOT setup a new
@@ -1912,6 +1915,9 @@ static enum sctp_disposition sctp_sf_do_dupcook_b(
 	if (sctp_auth_asoc_init_active_key(new_asoc, GFP_ATOMIC))
 		goto nomem;
 
+	if (!sctp_auth_chunk_verify(net, chunk, new_asoc))
+		return SCTP_DISPOSITION_DISCARD;
+
 	/* Update the content of current association.  */
 	sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_ASSOC, SCTP_ASOC(new_asoc));
 	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
@@ -2009,6 +2015,9 @@ static enum sctp_disposition sctp_sf_do_dupcook_d(
 	 * a COOKIE ACK.
 	 */
 
+	if (!sctp_auth_chunk_verify(net, chunk, asoc))
+		return SCTP_DISPOSITION_DISCARD;
+
 	/* Don't accidentally move back into established state. */
 	if (asoc->state < SCTP_STATE_ESTABLISHED) {
 		sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
@@ -4171,10 +4180,7 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn_fast(
  * The return value is the disposition of the chunk.
  */
 static enum sctp_ierror sctp_sf_authenticate(
-					struct net *net,
-					const struct sctp_endpoint *ep,
 					const struct sctp_association *asoc,
-					const union sctp_subtype type,
 					struct sctp_chunk *chunk)
 {
 	struct sctp_shared_key *sh_key = NULL;
@@ -4275,7 +4281,7 @@ enum sctp_disposition sctp_sf_eat_auth(struct net *net,
 						  commands);
 
 	auth_hdr = (struct sctp_authhdr *)chunk->skb->data;
-	error = sctp_sf_authenticate(net, ep, asoc, type, chunk);
+	error = sctp_sf_authenticate(asoc, chunk);
 	switch (error) {
 	case SCTP_IERROR_AUTH_BAD_HMAC:
 		/* Generate the ERROR chunk and discard the rest
-- 
2.1.0

^ permalink raw reply related

* [PATCH v2 net-next] net: stmmac: Add support for U32 TC filter using Flexible RX Parser
From: Jose Abreu @ 2018-05-04  9:01 UTC (permalink / raw)
  To: netdev
  Cc: Jose Abreu, David S. Miller, Joao Pinto, Vitor Soares,
	Giuseppe Cavallaro, Alexandre Torgue, Jakub Kicinski

This adds support for the U32 filter by using a HW-only feature called
Flexible RX Parser. This allows us to match any given packet field with a
pattern and accept/reject or even route the packet to a specific DMA
channel.

Right now we only support acceptance or rejection of frames and we only
support simple rules. However, the Parser has the flexibility of jumping to
specific rules as an if condition, so complex rules can be established.

This is only supported in GMAC5.10+.

The following commands can be used to test this code:

	1) Set up an ingress qdisc:
	# tc qdisc add dev eth0 handle ffff: ingress

	2) Setup a filter (e.g. filter by IP):
	# tc filter add dev eth0 parent ffff: protocol ip u32 match ip \
		src 192.168.0.3 skip_sw action drop

In every test performed we always used the "skip_sw" flag to make sure
only the RX Parser was involved.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Vitor Soares <soares@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Cc: Jakub Kicinski <kubakici@wp.pl>
---
Changes from v1:
	- Follow Linux network coding style (David)
	- Use tc_cls_can_offload_and_chain0() (Jakub)
---
 drivers/net/ethernet/stmicro/stmmac/Makefile      |    2 +-
 drivers/net/ethernet/stmicro/stmmac/common.h      |    5 +
 drivers/net/ethernet/stmicro/stmmac/dwmac4.h      |    4 +
 drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c |    1 +
 drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c  |    3 +
 drivers/net/ethernet/stmicro/stmmac/dwmac5.c      |  195 ++++++++++++++
 drivers/net/ethernet/stmicro/stmmac/dwmac5.h      |   13 +
 drivers/net/ethernet/stmicro/stmmac/hwif.c        |    8 +
 drivers/net/ethernet/stmicro/stmmac/hwif.h        |   25 ++-
 drivers/net/ethernet/stmicro/stmmac/stmmac.h      |   29 ++
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c |   59 ++++
 drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c   |  295 +++++++++++++++++++++
 12 files changed, 636 insertions(+), 3 deletions(-)
 create mode 100644 drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c

diff --git a/drivers/net/ethernet/stmicro/stmmac/Makefile b/drivers/net/ethernet/stmicro/stmmac/Makefile
index e3b578b..68e9e26 100644
--- a/drivers/net/ethernet/stmicro/stmmac/Makefile
+++ b/drivers/net/ethernet/stmicro/stmmac/Makefile
@@ -5,7 +5,7 @@ stmmac-objs:= stmmac_main.o stmmac_ethtool.o stmmac_mdio.o ring_mode.o	\
 	      dwmac100_core.o dwmac100_dma.o enh_desc.o norm_desc.o	\
 	      mmc_core.o stmmac_hwtstamp.o stmmac_ptp.o dwmac4_descs.o	\
 	      dwmac4_dma.o dwmac4_lib.o dwmac4_core.o dwmac5.o hwif.o \
-	      $(stmmac-y)
+	      stmmac_tc.o $(stmmac-y)
 
 # Ordering matters. Generic driver must be last.
 obj-$(CONFIG_STMMAC_PLATFORM)	+= stmmac-platform.o
diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index 627e905..a679cb7 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -353,6 +353,10 @@ struct dma_features {
 	unsigned int rx_fifo_size;
 	/* Automotive Safety Package */
 	unsigned int asp;
+	/* RX Parser */
+	unsigned int frpsel;
+	unsigned int frpbs;
+	unsigned int frpes;
 };
 
 /* GMAC TX FIFO is 8K, Rx FIFO is 16K */
@@ -412,6 +416,7 @@ struct mac_device_info {
 	const struct stmmac_dma_ops *dma;
 	const struct stmmac_mode_ops *mode;
 	const struct stmmac_hwtimestamp *ptp;
+	const struct stmmac_tc_ops *tc;
 	struct mii_regs mii;	/* MII register Addresses */
 	struct mac_link link;
 	void __iomem *pcsr;     /* vpointer to device CSRs */
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
index 03eab90..6330a55 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
@@ -194,6 +194,9 @@ enum power_event {
 
 /* MAC HW features3 bitmap */
 #define GMAC_HW_FEAT_ASP		GENMASK(29, 28)
+#define GMAC_HW_FEAT_FRPES		GENMASK(14, 13)
+#define GMAC_HW_FEAT_FRPBS		GENMASK(12, 11)
+#define GMAC_HW_FEAT_FRPSEL		BIT(10)
 
 /* MAC HW ADDR regs */
 #define GMAC_HI_DCS			GENMASK(18, 16)
@@ -202,6 +205,7 @@ enum power_event {
 
 /*  MTL registers */
 #define MTL_OPERATION_MODE		0x00000c00
+#define MTL_FRPE			BIT(15)
 #define MTL_OPERATION_SCHALG_MASK	GENMASK(6, 5)
 #define MTL_OPERATION_SCHALG_WRR	(0x0 << 5)
 #define MTL_OPERATION_SCHALG_WFQ	(0x1 << 5)
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
index 7289b3b..a7121a7 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
@@ -795,6 +795,7 @@ static void dwmac4_debug(void __iomem *ioaddr, struct stmmac_extra_stats *x,
 	.safety_feat_config = dwmac5_safety_feat_config,
 	.safety_feat_irq_status = dwmac5_safety_feat_irq_status,
 	.safety_feat_dump = dwmac5_safety_feat_dump,
+	.rxp_config = dwmac5_rxp_config,
 };
 
 int dwmac4_setup(struct stmmac_priv *priv)
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
index d37d457..117c3a5 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
@@ -379,6 +379,9 @@ static void dwmac4_get_hw_feature(void __iomem *ioaddr,
 
 	/* 5.10 Features */
 	dma_cap->asp = (hw_cap & GMAC_HW_FEAT_ASP) >> 28;
+	dma_cap->frpes = (hw_cap & GMAC_HW_FEAT_FRPES) >> 13;
+	dma_cap->frpbs = (hw_cap & GMAC_HW_FEAT_FRPBS) >> 11;
+	dma_cap->frpsel = (hw_cap & GMAC_HW_FEAT_FRPSEL) >> 10;
 }
 
 /* Enable/disable TSO feature and set MSS */
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c
index 2978550..b2becb8 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c
@@ -7,6 +7,7 @@
 #include "common.h"
 #include "dwmac4.h"
 #include "dwmac5.h"
+#include "stmmac.h"
 
 struct dwmac5_error_desc {
 	bool valid;
@@ -299,3 +300,197 @@ int dwmac5_safety_feat_dump(struct stmmac_safety_stats *stats,
 		*desc = dwmac5_all_errors[module].desc[offset].desc;
 	return 0;
 }
+
+static int dwmac5_rxp_disable(void __iomem *ioaddr)
+{
+	u32 val;
+	int ret;
+
+	val = readl(ioaddr + MTL_OPERATION_MODE);
+	val &= ~MTL_FRPE;
+	writel(val, ioaddr + MTL_OPERATION_MODE);
+
+	ret = readl_poll_timeout(ioaddr + MTL_RXP_CONTROL_STATUS, val,
+			val & RXPI, 1, 10000);
+	if (ret)
+		return ret;
+	return 0;
+}
+
+static void dwmac5_rxp_enable(void __iomem *ioaddr)
+{
+	u32 val;
+
+	val = readl(ioaddr + MTL_OPERATION_MODE);
+	val |= MTL_FRPE;
+	writel(val, ioaddr + MTL_OPERATION_MODE);
+}
+
+static int dwmac5_rxp_update_single_entry(void __iomem *ioaddr,
+					  struct stmmac_tc_entry *entry,
+					  int pos)
+{
+	int ret, i;
+
+	for (i = 0; i < (sizeof(entry->val) / sizeof(u32)); i++) {
+		int real_pos = pos * (sizeof(entry->val) / sizeof(u32)) + i;
+		u32 val;
+
+		/* Wait for ready */
+		ret = readl_poll_timeout(ioaddr + MTL_RXP_IACC_CTRL_STATUS,
+				val, !(val & STARTBUSY), 1, 10000);
+		if (ret)
+			return ret;
+
+		/* Write data */
+		val = *((u32 *)&entry->val + i);
+		writel(val, ioaddr + MTL_RXP_IACC_DATA);
+
+		/* Write pos */
+		val = real_pos & ADDR;
+		writel(val, ioaddr + MTL_RXP_IACC_CTRL_STATUS);
+
+		/* Write OP */
+		val |= WRRDN;
+		writel(val, ioaddr + MTL_RXP_IACC_CTRL_STATUS);
+
+		/* Start Write */
+		val |= STARTBUSY;
+		writel(val, ioaddr + MTL_RXP_IACC_CTRL_STATUS);
+
+		/* Wait for done */
+		ret = readl_poll_timeout(ioaddr + MTL_RXP_IACC_CTRL_STATUS,
+				val, !(val & STARTBUSY), 1, 10000);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static struct stmmac_tc_entry *
+dwmac5_rxp_get_next_entry(struct stmmac_tc_entry *entries, unsigned int count,
+			  u32 curr_prio)
+{
+	struct stmmac_tc_entry *entry;
+	u32 min_prio = ~0x0;
+	int i, min_prio_idx;
+	bool found = false;
+
+	for (i = count - 1; i >= 0; i--) {
+		entry = &entries[i];
+
+		/* Do not update unused entries */
+		if (!entry->in_use)
+			continue;
+		/* Do not update already updated entries (i.e. fragments) */
+		if (entry->in_hw)
+			continue;
+		/* Let last entry be updated last */
+		if (entry->is_last)
+			continue;
+		/* Do not return fragments */
+		if (entry->is_frag)
+			continue;
+		/* Check if we already checked this prio */
+		if (entry->prio < curr_prio)
+			continue;
+		/* Check if this is the minimum prio */
+		if (entry->prio < min_prio) {
+			min_prio = entry->prio;
+			min_prio_idx = i;
+			found = true;
+		}
+	}
+
+	if (found)
+		return &entries[min_prio_idx];
+	return NULL;
+}
+
+int dwmac5_rxp_config(void __iomem *ioaddr, struct stmmac_tc_entry *entries,
+		      unsigned int count)
+{
+	struct stmmac_tc_entry *entry, *frag;
+	int i, ret, nve = 0;
+	u32 curr_prio = 0;
+	u32 old_val, val;
+
+	/* Force disable RX */
+	old_val = readl(ioaddr + GMAC_CONFIG);
+	val = old_val & ~GMAC_CONFIG_RE;
+	writel(val, ioaddr + GMAC_CONFIG);
+
+	/* Disable RX Parser */
+	ret = dwmac5_rxp_disable(ioaddr);
+	if (ret)
+		goto re_enable;
+
+	/* Set all entries as NOT in HW */
+	for (i = 0; i < count; i++) {
+		entry = &entries[i];
+		entry->in_hw = false;
+	}
+
+	/* Update entries by reverse order */
+	while (1) {
+		entry = dwmac5_rxp_get_next_entry(entries, count, curr_prio);
+		if (!entry)
+			break;
+
+		curr_prio = entry->prio;
+		frag = entry->frag_ptr;
+
+		/* Set special fragment requirements */
+		if (frag) {
+			entry->val.af = 0;
+			entry->val.rf = 0;
+			entry->val.nc = 1;
+			entry->val.ok_index = nve + 2;
+		}
+
+		ret = dwmac5_rxp_update_single_entry(ioaddr, entry, nve);
+		if (ret)
+			goto re_enable;
+
+		entry->table_pos = nve++;
+		entry->in_hw = true;
+
+		if (frag && !frag->in_hw) {
+			ret = dwmac5_rxp_update_single_entry(ioaddr, frag, nve);
+			if (ret)
+				goto re_enable;
+			frag->table_pos = nve++;
+			frag->in_hw = true;
+		}
+	}
+
+	if (!nve)
+		goto re_enable;
+
+	/* Update all pass entry */
+	for (i = 0; i < count; i++) {
+		entry = &entries[i];
+		if (!entry->is_last)
+			continue;
+
+		ret = dwmac5_rxp_update_single_entry(ioaddr, entry, nve);
+		if (ret)
+			goto re_enable;
+
+		entry->table_pos = nve++;
+	}
+
+	/* Assume n. of parsable entries == n. of valid entries */
+	val = (nve << 16) & NPE;
+	val |= nve & NVE;
+	writel(val, ioaddr + MTL_RXP_CONTROL_STATUS);
+
+	/* Enable RX Parser */
+	dwmac5_rxp_enable(ioaddr);
+
+re_enable:
+	/* Re-enable RX */
+	writel(old_val, ioaddr + GMAC_CONFIG);
+	return ret;
+}
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac5.h b/drivers/net/ethernet/stmicro/stmmac/dwmac5.h
index bd4c466..cc810af 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac5.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac5.h
@@ -11,6 +11,17 @@
 #define PRTYEN				BIT(1)
 #define TMOUTEN				BIT(0)
 
+#define MTL_RXP_CONTROL_STATUS		0x00000ca0
+#define RXPI				BIT(31)
+#define NPE				GENMASK(23, 16)
+#define NVE				GENMASK(7, 0)
+#define MTL_RXP_IACC_CTRL_STATUS	0x00000cb0
+#define STARTBUSY			BIT(31)
+#define RXPEIEC				GENMASK(22, 21)
+#define RXPEIEE				BIT(20)
+#define WRRDN				BIT(16)
+#define ADDR				GENMASK(15, 0)
+#define MTL_RXP_IACC_DATA		0x00000cb4
 #define MTL_ECC_CONTROL			0x00000cc0
 #define TSOEE				BIT(4)
 #define MRXPEE				BIT(3)
@@ -48,5 +59,7 @@ int dwmac5_safety_feat_irq_status(struct net_device *ndev,
 		struct stmmac_safety_stats *stats);
 int dwmac5_safety_feat_dump(struct stmmac_safety_stats *stats,
 			int index, unsigned long *count, const char **desc);
+int dwmac5_rxp_config(void __iomem *ioaddr, struct stmmac_tc_entry *entries,
+		      unsigned int count);
 
 #endif /* __DWMAC5_H__ */
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.c b/drivers/net/ethernet/stmicro/stmmac/hwif.c
index 2b0a7e7..9acc8d2 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.c
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.c
@@ -77,6 +77,7 @@ static int stmmac_dwmac4_quirks(struct stmmac_priv *priv)
 	const void *mac;
 	const void *hwtimestamp;
 	const void *mode;
+	const void *tc;
 	int (*setup)(struct stmmac_priv *priv);
 	int (*quirks)(struct stmmac_priv *priv);
 } stmmac_hw[] = {
@@ -90,6 +91,7 @@ static int stmmac_dwmac4_quirks(struct stmmac_priv *priv)
 		.mac = &dwmac100_ops,
 		.hwtimestamp = &stmmac_ptp,
 		.mode = NULL,
+		.tc = NULL,
 		.setup = dwmac100_setup,
 		.quirks = stmmac_dwmac1_quirks,
 	}, {
@@ -101,6 +103,7 @@ static int stmmac_dwmac4_quirks(struct stmmac_priv *priv)
 		.mac = &dwmac1000_ops,
 		.hwtimestamp = &stmmac_ptp,
 		.mode = NULL,
+		.tc = NULL,
 		.setup = dwmac1000_setup,
 		.quirks = stmmac_dwmac1_quirks,
 	}, {
@@ -112,6 +115,7 @@ static int stmmac_dwmac4_quirks(struct stmmac_priv *priv)
 		.mac = &dwmac4_ops,
 		.hwtimestamp = &stmmac_ptp,
 		.mode = NULL,
+		.tc = NULL,
 		.setup = dwmac4_setup,
 		.quirks = stmmac_dwmac4_quirks,
 	}, {
@@ -123,6 +127,7 @@ static int stmmac_dwmac4_quirks(struct stmmac_priv *priv)
 		.mac = &dwmac410_ops,
 		.hwtimestamp = &stmmac_ptp,
 		.mode = &dwmac4_ring_mode_ops,
+		.tc = NULL,
 		.setup = dwmac4_setup,
 		.quirks = NULL,
 	}, {
@@ -134,6 +139,7 @@ static int stmmac_dwmac4_quirks(struct stmmac_priv *priv)
 		.mac = &dwmac410_ops,
 		.hwtimestamp = &stmmac_ptp,
 		.mode = &dwmac4_ring_mode_ops,
+		.tc = NULL,
 		.setup = dwmac4_setup,
 		.quirks = NULL,
 	}, {
@@ -145,6 +151,7 @@ static int stmmac_dwmac4_quirks(struct stmmac_priv *priv)
 		.mac = &dwmac510_ops,
 		.hwtimestamp = &stmmac_ptp,
 		.mode = &dwmac4_ring_mode_ops,
+		.tc = &dwmac510_tc_ops,
 		.setup = dwmac4_setup,
 		.quirks = NULL,
 	}
@@ -196,6 +203,7 @@ int stmmac_hwif_init(struct stmmac_priv *priv)
 		mac->mac = entry->mac;
 		mac->ptp = entry->hwtimestamp;
 		mac->mode = entry->mode;
+		mac->tc = entry->tc;
 
 		priv->hw = mac;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index bfad616..b7539a1 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -5,10 +5,12 @@
 #ifndef __STMMAC_HWIF_H__
 #define __STMMAC_HWIF_H__
 
+#include <linux/netdevice.h>
+
 #define stmmac_do_void_callback(__priv, __module, __cname,  __arg0, __args...) \
 ({ \
 	int __result = -EINVAL; \
-	if ((__priv)->hw->__module->__cname) { \
+	if ((__priv)->hw->__module && (__priv)->hw->__module->__cname) { \
 		(__priv)->hw->__module->__cname((__arg0), ##__args); \
 		__result = 0; \
 	} \
@@ -17,7 +19,7 @@
 #define stmmac_do_callback(__priv, __module, __cname,  __arg0, __args...) \
 ({ \
 	int __result = -EINVAL; \
-	if ((__priv)->hw->__module->__cname) \
+	if ((__priv)->hw->__module && (__priv)->hw->__module->__cname) \
 		__result = (__priv)->hw->__module->__cname((__arg0), ##__args); \
 	__result; \
 })
@@ -232,6 +234,7 @@ struct stmmac_dma_ops {
 struct net_device;
 struct rgmii_adv;
 struct stmmac_safety_stats;
+struct stmmac_tc_entry;
 
 /* Helpers to program the MAC core */
 struct stmmac_ops {
@@ -301,6 +304,9 @@ struct stmmac_ops {
 			struct stmmac_safety_stats *stats);
 	int (*safety_feat_dump)(struct stmmac_safety_stats *stats,
 			int index, unsigned long *count, const char **desc);
+	/* Flexible RX Parser */
+	int (*rxp_config)(void __iomem *ioaddr, struct stmmac_tc_entry *entries,
+			  unsigned int count);
 };
 
 #define stmmac_core_init(__priv, __args...) \
@@ -365,6 +371,8 @@ struct stmmac_ops {
 	stmmac_do_callback(__priv, mac, safety_feat_irq_status, __args)
 #define stmmac_safety_feat_dump(__priv, __args...) \
 	stmmac_do_callback(__priv, mac, safety_feat_dump, __args)
+#define stmmac_rxp_config(__priv, __args...) \
+	stmmac_do_callback(__priv, mac, rxp_config, __args)
 
 /* PTP and HW Timer helpers */
 struct stmmac_hwtimestamp {
@@ -419,6 +427,18 @@ struct stmmac_mode_ops {
 	stmmac_do_void_callback(__priv, mode, clean_desc3, __args)
 
 struct stmmac_priv;
+struct tc_cls_u32_offload;
+
+struct stmmac_tc_ops {
+	int (*init)(struct stmmac_priv *priv);
+	int (*setup_cls_u32)(struct stmmac_priv *priv,
+			     struct tc_cls_u32_offload *cls);
+};
+
+#define stmmac_tc_init(__priv, __args...) \
+	stmmac_do_callback(__priv, tc, init, __args)
+#define stmmac_tc_setup_cls_u32(__priv, __args...) \
+	stmmac_do_callback(__priv, tc, setup_cls_u32, __args)
 
 extern const struct stmmac_ops dwmac100_ops;
 extern const struct stmmac_dma_ops dwmac100_dma_ops;
@@ -429,6 +449,7 @@ struct stmmac_mode_ops {
 extern const struct stmmac_ops dwmac410_ops;
 extern const struct stmmac_dma_ops dwmac410_dma_ops;
 extern const struct stmmac_ops dwmac510_ops;
+extern const struct stmmac_tc_ops dwmac510_tc_ops;
 
 #define GMAC_VERSION		0x00000020	/* GMAC CORE Version */
 #define GMAC4_VERSION		0x00000110	/* GMAC4+ CORE Version */
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
index 2443f20..42fc76e 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
@@ -76,6 +76,30 @@ struct stmmac_rx_queue {
 	struct napi_struct napi ____cacheline_aligned_in_smp;
 };
 
+struct stmmac_tc_entry {
+	bool in_use;
+	bool in_hw;
+	bool is_last;
+	bool is_frag;
+	void *frag_ptr;
+	unsigned int table_pos;
+	u32 handle;
+	u32 prio;
+	struct {
+		u32 match_data;
+		u32 match_en;
+		u8 af:1;
+		u8 rf:1;
+		u8 im:1;
+		u8 nc:1;
+		u8 res1:4;
+		u8 frame_offset;
+		u8 ok_index;
+		u8 dma_ch_no;
+		u32 res2;
+	} __packed val;
+};
+
 struct stmmac_priv {
 	/* Frequently used values are kept adjacent for cache effect */
 	u32 tx_count_frames;
@@ -151,6 +175,11 @@ struct stmmac_priv {
 	unsigned long state;
 	struct workqueue_struct *wq;
 	struct work_struct service_task;
+
+	/* TC Handling */
+	unsigned int tc_entries_max;
+	unsigned int tc_off_max;
+	struct stmmac_tc_entry *tc_entries;
 };
 
 enum stmmac_state {
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 0135fd3..84b29ef 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -45,6 +45,7 @@
 #include <linux/seq_file.h>
 #endif /* CONFIG_DEBUG_FS */
 #include <linux/net_tstamp.h>
+#include <net/pkt_cls.h>
 #include "stmmac_ptp.h"
 #include "stmmac.h"
 #include <linux/reset.h>
@@ -3786,6 +3787,58 @@ static int stmmac_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 	return ret;
 }
 
+static int stmmac_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
+				    void *cb_priv)
+{
+	struct stmmac_priv *priv = cb_priv;
+	int ret = -EOPNOTSUPP;
+
+	stmmac_disable_all_queues(priv);
+
+	switch (type) {
+	case TC_SETUP_CLSU32:
+		if (tc_cls_can_offload_and_chain0(priv->dev, type_data))
+			ret = stmmac_tc_setup_cls_u32(priv, priv, type_data);
+		break;
+	default:
+		break;
+	}
+
+	stmmac_enable_all_queues(priv);
+	return ret;
+}
+
+static int stmmac_setup_tc_block(struct stmmac_priv *priv,
+				 struct tc_block_offload *f)
+{
+	if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
+		return -EOPNOTSUPP;
+
+	switch (f->command) {
+	case TC_BLOCK_BIND:
+		return tcf_block_cb_register(f->block, stmmac_setup_tc_block_cb,
+				priv, priv);
+	case TC_BLOCK_UNBIND:
+		tcf_block_cb_unregister(f->block, stmmac_setup_tc_block_cb, priv);
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int stmmac_setup_tc(struct net_device *ndev, enum tc_setup_type type,
+			   void *type_data)
+{
+	struct stmmac_priv *priv = netdev_priv(ndev);
+
+	switch (type) {
+	case TC_SETUP_BLOCK:
+		return stmmac_setup_tc_block(priv, type_data);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
 static int stmmac_set_mac_address(struct net_device *ndev, void *addr)
 {
 	struct stmmac_priv *priv = netdev_priv(ndev);
@@ -4024,6 +4077,7 @@ static void stmmac_exit_fs(struct net_device *dev)
 	.ndo_set_rx_mode = stmmac_set_rx_mode,
 	.ndo_tx_timeout = stmmac_tx_timeout,
 	.ndo_do_ioctl = stmmac_ioctl,
+	.ndo_setup_tc = stmmac_setup_tc,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller = stmmac_poll_controller,
 #endif
@@ -4223,6 +4277,11 @@ int stmmac_dvr_probe(struct device *device,
 	ndev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
 			    NETIF_F_RXCSUM;
 
+	ret = stmmac_tc_init(priv, priv);
+	if (!ret) {
+		ndev->hw_features |= NETIF_F_HW_TC;
+	}
+
 	if ((priv->plat->tso_en) && (priv->dma_cap.tsoen)) {
 		ndev->hw_features |= NETIF_F_TSO | NETIF_F_TSO6;
 		priv->tso = true;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
new file mode 100644
index 0000000..881c94b
--- /dev/null
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
@@ -0,0 +1,295 @@
+// SPDX-License-Identifier: (GPL-2.0 OR MIT)
+/*
+ * Copyright (c) 2018 Synopsys, Inc. and/or its affiliates.
+ * stmmac TC Handling (HW only)
+ */
+
+#include <net/pkt_cls.h>
+#include <net/tc_act/tc_gact.h>
+#include "common.h"
+#include "dwmac4.h"
+#include "dwmac5.h"
+#include "stmmac.h"
+
+static void tc_fill_all_pass_entry(struct stmmac_tc_entry *entry)
+{
+	memset(entry, 0, sizeof(*entry));
+	entry->in_use = true;
+	entry->is_last = true;
+	entry->is_frag = false;
+	entry->prio = ~0x0;
+	entry->handle = 0;
+	entry->val.match_data = 0x0;
+	entry->val.match_en = 0x0;
+	entry->val.af = 1;
+	entry->val.dma_ch_no = 0x0;
+}
+
+static struct stmmac_tc_entry *tc_find_entry(struct stmmac_priv *priv,
+					     struct tc_cls_u32_offload *cls,
+					     bool free)
+{
+	struct stmmac_tc_entry *entry, *first = NULL, *dup = NULL;
+	u32 loc = cls->knode.handle;
+	int i;
+
+	for (i = 0; i < priv->tc_entries_max; i++) {
+		entry = &priv->tc_entries[i];
+		if (!entry->in_use && !first && free)
+			first = entry;
+		if (entry->handle == loc && !free)
+			dup = entry;
+	}
+
+	if (dup)
+		return dup;
+	if (first) {
+		first->handle = loc;
+		first->in_use = true;
+
+		/* Reset HW values */
+		memset(&first->val, 0, sizeof(first->val));
+	}
+
+	return first;
+}
+
+static int tc_fill_actions(struct stmmac_tc_entry *entry,
+			   struct stmmac_tc_entry *frag,
+			   struct tc_cls_u32_offload *cls)
+{
+	struct stmmac_tc_entry *action_entry = entry;
+	const struct tc_action *act;
+	struct tcf_exts *exts;
+	LIST_HEAD(actions);
+
+	exts = cls->knode.exts;
+	if (!tcf_exts_has_actions(exts))
+		return -EINVAL;
+	if (frag)
+		action_entry = frag;
+
+	tcf_exts_to_list(exts, &actions);
+	list_for_each_entry(act, &actions, list) {
+		/* Accept */
+		if (is_tcf_gact_ok(act)) {
+			action_entry->val.af = 1;
+			break;
+		}
+		/* Drop */
+		if (is_tcf_gact_shot(act)) {
+			action_entry->val.rf = 1;
+			break;
+		}
+
+		/* Unsupported */
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int tc_fill_entry(struct stmmac_priv *priv,
+			 struct tc_cls_u32_offload *cls)
+{
+	struct stmmac_tc_entry *entry, *frag = NULL;
+	struct tc_u32_sel *sel = cls->knode.sel;
+	u32 off, data, mask, real_off, rem;
+	u32 prio = cls->common.prio;
+	int ret;
+
+	/* Only 1 match per entry */
+	if (sel->nkeys <= 0 || sel->nkeys > 1)
+		return -EINVAL;
+
+	off = sel->keys[0].off << sel->offshift;
+	data = sel->keys[0].val;
+	mask = sel->keys[0].mask;
+
+	switch (ntohs(cls->common.protocol)) {
+	case ETH_P_ALL:
+		break;
+	case ETH_P_IP:
+		off += ETH_HLEN;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (off > priv->tc_off_max)
+		return -EINVAL;
+
+	real_off = off / 4;
+	rem = off % 4;
+
+	entry = tc_find_entry(priv, cls, true);
+	if (!entry)
+		return -EINVAL;
+
+	if (rem) {
+		frag = tc_find_entry(priv, cls, true);
+		if (!frag) {
+			ret = -EINVAL;
+			goto err_unuse;
+		}
+
+		entry->frag_ptr = frag;
+		entry->val.match_en = (mask << (rem * 8)) &
+			GENMASK(31, rem * 8);
+		entry->val.match_data = (data << (rem * 8)) &
+			GENMASK(31, rem * 8);
+		entry->val.frame_offset = real_off;
+		entry->prio = prio;
+
+		frag->val.match_en = (mask >> (rem * 8)) &
+			GENMASK(rem * 8 - 1, 0);
+		frag->val.match_data = (data >> (rem * 8)) &
+			GENMASK(rem * 8 - 1, 0);
+		frag->val.frame_offset = real_off + 1;
+		frag->prio = prio;
+		frag->is_frag = true;
+	} else {
+		entry->frag_ptr = NULL;
+		entry->val.match_en = mask;
+		entry->val.match_data = data;
+		entry->val.frame_offset = real_off;
+		entry->prio = prio;
+	}
+
+	ret = tc_fill_actions(entry, frag, cls);
+	if (ret)
+		goto err_unuse;
+
+	return 0;
+
+err_unuse:
+	if (frag)
+		frag->in_use = false;
+	entry->in_use = false;
+	return ret;
+}
+
+static void tc_unfill_entry(struct stmmac_priv *priv,
+			    struct tc_cls_u32_offload *cls)
+{
+	struct stmmac_tc_entry *entry;
+
+	entry = tc_find_entry(priv, cls, false);
+	if (!entry)
+		return;
+
+	entry->in_use = false;
+	if (entry->frag_ptr) {
+		entry = entry->frag_ptr;
+		entry->is_frag = false;
+		entry->in_use = false;
+	}
+}
+
+static int tc_config_knode(struct stmmac_priv *priv,
+			   struct tc_cls_u32_offload *cls)
+{
+	int ret;
+
+	ret = tc_fill_entry(priv, cls);
+	if (ret)
+		return ret;
+
+	ret = stmmac_rxp_config(priv, priv->hw->pcsr, priv->tc_entries,
+			priv->tc_entries_max);
+	if (ret)
+		goto err_unfill;
+
+	return 0;
+
+err_unfill:
+	tc_unfill_entry(priv, cls);
+	return ret;
+}
+
+static int tc_delete_knode(struct stmmac_priv *priv,
+			   struct tc_cls_u32_offload *cls)
+{
+	int ret;
+
+	/* Set entry and fragments as not used */
+	tc_unfill_entry(priv, cls);
+
+	ret = stmmac_rxp_config(priv, priv->hw->pcsr, priv->tc_entries,
+			priv->tc_entries_max);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int tc_setup_cls_u32(struct stmmac_priv *priv,
+			    struct tc_cls_u32_offload *cls)
+{
+	switch (cls->command) {
+	case TC_CLSU32_REPLACE_KNODE:
+		tc_unfill_entry(priv, cls);
+		/* Fall through */
+	case TC_CLSU32_NEW_KNODE:
+		return tc_config_knode(priv, cls);
+	case TC_CLSU32_DELETE_KNODE:
+		return tc_delete_knode(priv, cls);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int tc_init(struct stmmac_priv *priv)
+{
+	struct dma_features *dma_cap = &priv->dma_cap;
+	unsigned int count;
+
+	if (!dma_cap->frpsel)
+		return -EINVAL;
+
+	switch (dma_cap->frpbs) {
+	case 0x0:
+		priv->tc_off_max = 64;
+		break;
+	case 0x1:
+		priv->tc_off_max = 128;
+		break;
+	case 0x2:
+		priv->tc_off_max = 256;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	switch (dma_cap->frpes) {
+	case 0x0:
+		count = 64;
+		break;
+	case 0x1:
+		count = 128;
+		break;
+	case 0x2:
+		count = 256;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* Reserve one last filter which lets all pass */
+	priv->tc_entries_max = count;
+	priv->tc_entries = devm_kzalloc(priv->device,
+			sizeof(*priv->tc_entries) * count, GFP_KERNEL);
+	if (!priv->tc_entries)
+		return -ENOMEM;
+
+	tc_fill_all_pass_entry(&priv->tc_entries[count - 1]);
+
+	dev_info(priv->device, "Enabling HW TC (entries=%d, max_off=%d)\n",
+			priv->tc_entries_max, priv->tc_off_max);
+	return 0;
+}
+
+const struct stmmac_tc_ops dwmac510_tc_ops = {
+	.init = tc_init,
+	.setup_cls_u32 = tc_setup_cls_u32,
+};
-- 
1.7.1

^ permalink raw reply related

* Re: i.MX6S/DL and QCA8334 switch using DSA driver - CPU port not working
From: Michal Vokáč @ 2018-05-04  8:45 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: netdev, Vivien Didelot, Florian Fainelli
In-Reply-To: <20180430132025.GF10066@lunn.ch>

On 30.4.2018 15:20, Andrew Lunn wrote:
>> Using rgmii-id for the port is not valid as the qca8k driver does not support
>> that mode. It only supports rgmii and sgmii. I think this is actually not
>> correct. When phy-mode is set to rgmii for port the qca8k driver configures
>> internal delays in the switch. So it behaves like rgmii-id I think.
>>
>> Should not it be:
>>
>> --- a/drivers/net/dsa/qca8k.c
>> +++ b/drivers/net/dsa/qca8k.c
>> @@ -474,7 +474,7 @@ qca8k_set_pad_ctrl(struct qca8k_priv *priv, int port, int mode)
>>   	 * PHY or MAC.
>>   	 */
>>   	switch (mode) {
>> -	case PHY_INTERFACE_MODE_RGMII:
>> +	case PHY_INTERFACE_MODE_RGMII_ID:
>>   	qca8k_write(priv, reg,
>> 		    QCA8K_PORT_PAD_RGMII_EN |
>> 		    QCA8K_PORT_PAD_RGMII_TX_DELAY(3) |
> 
> We have to be careful cleaning this up. It has the potential to break
> existing boards when using an old device tree blob.

Oh, I see. Thanks for pointing this out.

Some news on the problem with the non-working CPU port.
Andrew, thank you very much for the ideas on how to debug the issue.
I tried what you suggested but had no luck. FYI, I am now doing all my tests
with linux-stable.

First of all I tried to make my old phy driver for the switch work with the
latest kernel. It works on v4.1.46 but did not on v4.17-rc2 - no IP address on
eth0. So the very same issue as I have with the DSA. Bisecting the kernel picked:

d5c3d84 ("net: phy: Avoid polling PHY with PHY_IGNORE_INTERRUPTS")

Fixed that by using PHY_POLL in my driver. I was hoping that I may have similar
issue when using DSA but it looks OK. This is with the DSA enabled:

  # dmesg | grep PHY
  [    3.452536] Generic PHY 2188000.ethernet-1:01: attached PHY driver [Generic PHY] (mii_bus:phy_addr=2188000.ethernet-1:01, irq=POLL)
  [    3.453437] Generic PHY 2188000.ethernet-1:02: attached PHY driver [Generic PHY] (mii_bus:phy_addr=2188000.ethernet-1:02, irq=POLL)
  [   20.769281] Generic PHY fixed-0:00: attached PHY driver [Generic PHY] (mii_bus:phy_addr=fixed-0:00, irq=POLL)

Anyway, now I am sure that I can use RGMII interface with mainline when I am
not using DSA and phy-mode is set to rgmii and I use QCA8K_PORT_PAD_RGMII_TX_DELAY(2)
and QCA8K_PORT_PAD_RGMII_RX_DELAY(2).

To debug the non-working CPU port with DSA I tried these kernel versions:

  - v4.8-rc6-1085-g6b93fb4 - NOT OK
    - Can not go lower than this version. qca8k driver was introduced here.
  - 4.9.84 - NOT OK
  - 4.17-rc2 - NOT OK

Some RGMII delay tuning attempts with v4.17-rc2:

phy-mode (fec)	Rx/Tx delay	result
--------------------------------------
rgmii		0/0		NOT OK
rgmii		1/1		NOT OK
rgmii		2/2		NOT OK
rgmii		3/3		NOT OK
rgmii-id	0/0		NOT OK
rgmii-id	1/1		NOT OK
rgmii-id	2/2		NOT OK
rgmii-id	3/3		NOT OK

I am out of ideas on how to debug this further.
Any additional advice will be much appreciated.

Thanks, Michal.

^ permalink raw reply

* Re: [PATCH net] macmace: Set platform device coherent_dma_mask
From: Michael Schmitz @ 2018-05-04  8:16 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: Christoph Hellwig, Finn Thain, David S. Miller, linux-m68k,
	netdev, Linux Kernel Mailing List
In-Reply-To: <CAMuHMdW+waQSQxLvBXEZkhCAj_kh=4SR1hZ4FAeC8YWXtRKg1A@mail.gmail.com>

Hi Geert,

Am 04.05.2018 um 19:24 schrieb Geert Uytterhoeven:
> Hi Michael,
> 
>>> Yes, that would be useful.  The other assumption could be that
>>> platform devices always allow an all-0xff dma mask.
>>
>> That's not always true (Atari NCR5380 SCSI and floppy would use a 24
>> bit DMA mask). We use bounce buffers allocated from a dedicated lowmem
>> pool there currently, and for all I know don't use the DMA API yet.
>>
>> I bet that is a rare exception though. Setting the default DMA mask
>> for platform devices to all-0xff and letting the few odd drivers force
>> a different setting seems the best way forward.
> 
> I'd say that's usually a property of the platform, not of the device?

Right - I was thinking 'm68k' as platform, not a particular machine like
Mac or Falcon (the 24 bit mask only applies to that particular model
anyway).

> So IMHO it belongs in the platform code, not in the device driver code.

OK - let's have a default mask of 64 bit, and allow machine specific
platform_init() to override using a new helper function.

Cheers,

	Michael

> Gr{oetje,eeting}s,
> 
>                         Geert
> 

^ permalink raw reply

* Re: Repeating "unregister_netdevice: waiting for lo to become free" caused by upstream 76da0704507bb ("ipv6: only call ip6_route_dev_notify() once for NETDEV_UNREGISTER")
From: Rafał Miłecki @ 2018-05-04  7:54 UTC (permalink / raw)
  To: Konstantin Khlebnikov, WANG Cong, David S. Miller,
	Alexey Kuznetsov, Hideaki YOSHIFUJI, Network Development, jeffy,
	David Ahern
  Cc: Greg Kroah-Hartman, Stable, Dan Streetman, Dan Streetman,
	Mathias Tillman
In-Reply-To: <07b74ef0-5ce6-b391-7b0f-59685350e802@gmail.com>

On 25 April 2018 at 16:44, Rafał Miłecki <zajec5@gmail.com> wrote:
> On 25.04.2018 16:30, Konstantin Khlebnikov wrote:
>>
>> On 25.04.2018 17:16, Rafał Miłecki wrote:
>>>
>>> On 23.04.2018 15:08, Rafał Miłecki wrote:
>>>>
>>>> I've just updated my kernel 4.4.x and noticed a regression. Bisecting
>>>> pointed me to the commit 2417da3f4d6bc ("ipv6: only call
>>>> ip6_route_dev_notify() once for NETDEV_UNREGISTER") [0] which is
>>>> backport of upstream 76da0704507bb. That backported commit has
>>>> appeared in a 4.4.103.
>>>>
>>>> I use OpenWrt/LEDE [1] distribution and LXC [2] 1.1.5. After stopping
>>>> a container I start getting these messages:
>>>> [  229.419188] unregister_netdevice: waiting for lo to become free.
>>>> Usage count = 1
>>>> [  239.660408] unregister_netdevice: waiting for lo to become free.
>>>> Usage count = 1
>>>> [  249.839189] unregister_netdevice: waiting for lo to become free.
>>>> Usage count = 1
>>>> (...)
>>>>
>>>> Trying to start LXC nevertheless results in lxc-start command hang
>>>> around network configuration. Trying to query LXC state afterwards
>>>> results in a lxc-info command hang too.
>>>>
>>>> I tried Googling for this issue and found similar reports:
>>>> https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1729637
>>>> https://github.com/fnproject/fn/issues/686
>>>>
>>>> https://lime-technology.com/forums/topic/66863-kernelunregister_netdevice-waiting-for-lo-to-become-free-usage-count-1/
>>>> all of them related to the Docker, which is probably a similar use
>>>> case to the LXC.
>>>>
>>>> I couldn't find any reference to commit 76da0704507bb that could
>>>> suggest fixing the problem I'm seeing.
>>>>
>>>> Does anyone have an idea what is the issue I'm seeing about? Or even
>>>> better, how to fix it? Can I provide any additional info that would
>>>> help?
>>>>
>>>>
>>>> [0]
>>>> https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/commit/?h=linux-4.4.y&id=2417da3f4d6bc4fc6c77f613f0e2264090892aa5
>>>> [1] https://openwrt.org/
>>>> [2] https://linuxcontainers.org/
>>>
>>>
>>> Today I tried 4.14.34 to see if that helps. Unfortunately it doesn't. I
>>> still experience the same problem.
>>>
>>>  From reading various reports regarding that "unregister_netdevice:
>>> waiting for lo to become free" message it appears the problem is caused
>>> by a leaking dst refcnt somewhere in the kernel code.
>>>
>>> I found links to few commit fixing leaks at various places:
>>> 4a31a6b19f9dd ("sctp: fix dst refcnt leak in sctp_v4_get_dst")
>>> 957d761cf91cd ("sctp: fix dst refcnt leak in sctp_v6_get_dst()")
>>> 4ee806d51176b ("net: tcp: close sock if net namespace is exiting")
>>> d747a7a51b009 ("tcp: reset sk_rx_dst in tcp_disconnect()")
>>> 751eb6b6042a5 ("ipv6: addrconf: fix dev refcont leak when DAD failed")
>>>
>>> All above patches are present in the linux-v4.4.y and are part of kernel
>>> 4.4.124 I use. So it seems I'm facing yet another dst refcnt leak.
>>>
>>> Could commit 2417da3f4d6bc ("ipv6: only call ip6_route_dev_notify() once
>>> for NETDEV_UNREGISTER") introduce a new dst refcnt leak? Or does it only
>>> expose an existing one?
>>
>>
>> Mathias Tillman reported this as "4.4.103 linux kernel regression".
>> Last message in that thread (which I couldn't find in mailing list
>> archives) had:
>> | As it turns out, it's due to a patch in the Turris Omnia/OpenWRT code
>> that adds a in6_dev_get call without calling in6_dev_put.
>
>
> Wow, this is very helpful, thank you!
>
> Somehow I didn't even think about OpenWrt downstream patches. Too bad
> this wasn't reported to the OpenWrt community, I spent 2 days on this.
> There is indeed:
> target/linux/generic/patches-4.4/670-ipv6-allow-rejecting-with-source-address-failed-policy.patch
> [PATCH 1/2] ipv6: allow rejecting with "source address failed policy"
>
> I'll move this issue discussion to the OpenWrt/LEDE now, I hope we can
> sort it out.

For reference, it has been fixed in OpenWrt/LEDE by Felix in:

1) master branch:
https://git.openwrt.org/?p=openwrt/openwrt.git;a=commitdiff;h=58f7b5b96c301176d639540df4723c798af2a999

2) lede-17.01 branch
https://git.openwrt.org/?p=openwrt/openwrt.git;a=commitdiff;h=999bb66b20b03c753801ecebf1ec2a03c6a63c96

-- 
Rafał

^ permalink raw reply

* Double-lock bug in drivers/isdn/hardware/mISDN/hfcmulti.c
From: Iago Abal @ 2018-05-04  7:27 UTC (permalink / raw)
  To: Karsten Keil, David S. Miller, Stephen Hemminger, Johannes Berg,
	Arvind Yadav, Kees Cook, netdev

Hi,

There is a potential double-lock sequence starting from hfc_remove_pci().

Forward trace:

  1. hfc_remove_pci()  LOCKS spin_lock_irqsave(&HFClock, flags) at 5284
  2. hfc_remove_pci()  calls release_card(card)                 at 5285
  3. release_card()    calls release_port(hc, hc->chan[ch].dch) at 4674
  4. release_port()    calls plxsd_checksync(hc, 1)             at 4595
  5. plxsd_checksync() calls hfcmulti_resync(hc, ..., rm)       at 1036 or 1044
  6. hfcmulti_resync() LOCKS spin_lock_irqsave(&HFClock, flags) at 933

NB: Bug found by static analysis thanks to EBA
(https://github.com/IagoAbal/eba).

Hope it helps,

-- iago

^ permalink raw reply

* Re: DSA switch
From: Ran Shalit @ 2018-05-04  7:26 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: netdev
In-Reply-To: <CAJ2oMh+0pKsX9w2=upu8SQud8Jyn1WhoAQiREwwu9+ehzFPJVA@mail.gmail.com>

On Fri, May 4, 2018 at 9:59 AM, Ran Shalit <ranshalit@gmail.com> wrote:
> On Fri, May 4, 2018 at 12:05 AM, Andrew Lunn <andrew@lunn.ch> wrote:
>>> I am using kernel 2.6.37, but I think it is not kernel issue, but more
>>> bad patches done on kernel.
>>> It is based on TI's kernel, but with some custom modifications on
>>> driver's switch, to make it work with TI's cpsw switch.
>>> Seems like someone made some bad patch, I'll continue investigating it.
>>> You can ignore the question...
>>>
>>> Many thanks a lot for the help,
>>> Ran
>>
>> There is no DSA driver for the cpsw. Are you just using the cpsw to
>> pass frames to a switch which is supported by DSA?
>>
>> In theory, mainline CPSW should just work for passing frames to an
>> external switch. So why not just use mainline?
>>
>
> It seems that the bridge functions OK,
> so I would rather keep working with it, instead of making too many
> dramatic changes in TI's custom kernel, which works with our
> chip (dm8148).
>
> Yet, I would like to ask about the bridge:
> Can a bridge also be used with dsa switch when ports are connected to
> different subnets ?
>
> Regards,
> Ran



I also see that there is no bridge function in /drivers/net/dsa files
in our kernel (2.6.37)
I can't find any reference to *bridge* or function used in bridge patch:
https://patchwork.ozlabs.org/patch/16578/

So, how is it that bridge worked in my system ?
Does it mean that it actually does the bridging not in the switch but
in the kernel ip stack ?

Thank you,
ran

^ permalink raw reply

* Re: [PATCH net] nsh: fix infinite loop
From: Jiri Benc @ 2018-05-04  7:23 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S . Miller, netdev, Eric Dumazet
In-Reply-To: <20180503203754.60611-1-edumazet@google.com>

On Thu,  3 May 2018 13:37:54 -0700, Eric Dumazet wrote:
> diff --git a/net/nsh/nsh.c b/net/nsh/nsh.c
> index d7da99a0b0b852d7459eed9ac6d3cdf3d49a1a1c..9696ef96b719bf24625adea2a959deac1d2a975f 100644
> --- a/net/nsh/nsh.c
> +++ b/net/nsh/nsh.c
> @@ -57,6 +57,8 @@ int nsh_pop(struct sk_buff *skb)
>  		return -ENOMEM;
>  	nh = (struct nshhdr *)(skb->data);
>  	length = nsh_hdr_len(nh);
> +	if (length < NSH_BASE_HDR_LEN)
> +		return -EINVAL;
>  	inner_proto = tun_p_to_eth_p(nh->np);
>  	if (!pskb_may_pull(skb, length))
>  		return -ENOMEM;
> @@ -90,6 +92,8 @@ static struct sk_buff *nsh_gso_segment(struct sk_buff *skb,
>  	if (unlikely(!pskb_may_pull(skb, NSH_BASE_HDR_LEN)))
>  		goto out;
>  	nsh_len = nsh_hdr_len(nsh_hdr(skb));
> +	if (nsh_len < NSH_BASE_HDR_LEN)
> +		goto out;
>  	if (unlikely(!pskb_may_pull(skb, nsh_len)))
>  		goto out;
>  

Acked-by: Jiri Benc <jbenc@redhat.com>

Thanks, Eric, and shame on me!

 Jiri

^ permalink raw reply

* Re: [PATCH net] macmace: Set platform device coherent_dma_mask
From: Geert Uytterhoeven @ 2018-05-04  7:24 UTC (permalink / raw)
  To: Michael Schmitz
  Cc: Christoph Hellwig, Finn Thain, David S. Miller, linux-m68k,
	netdev, Linux Kernel Mailing List
In-Reply-To: <CAOmrzk+y5pmm2anaw15pch--y_gqoUO4NQemSbSanAXpEnttkg@mail.gmail.com>

Hi Michael,

On Thu, May 3, 2018 at 10:24 PM, Michael Schmitz <schmitzmic@gmail.com> wrote:
> On Thu, May 3, 2018 at 8:51 PM, Christoph Hellwig <hch@lst.de> wrote:
>> On Thu, May 03, 2018 at 10:46:56AM +0200, Geert Uytterhoeven wrote:
>>> Perhaps you can add a new helper (platform_device_register_simple_dma()?)
>>> that takes the DMA mask, too?
>>> With people setting the mask to kill the WARNING splat, this may become
>>> more common.
>>>
>>> struct platform_device_info already has a dma_mask field, but
>>> platform_device_register_resndata() explicitly sets it to zero.
>>
>> Yes, that would be useful.  The other assumption could be that
>> platform devices always allow an all-0xff dma mask.
>
> That's not always true (Atari NCR5380 SCSI and floppy would use a 24
> bit DMA mask). We use bounce buffers allocated from a dedicated lowmem
> pool there currently, and for all I know don't use the DMA API yet.
>
> I bet that is a rare exception though. Setting the default DMA mask
> for platform devices to all-0xff and letting the few odd drivers force
> a different setting seems the best way forward.

I'd say that's usually a property of the platform, not of the device?
So IMHO it belongs in the platform code, not in the device driver code.

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* [PATCH] net: sched: cls: fix a potential missing-check bug
From: Wenwen Wang @ 2018-05-04  7:05 UTC (permalink / raw)
  To: Wenwen Wang
  Cc: Kangjie Lu, Jamal Hadi Salim, Cong Wang, Jiri Pirko,
	David S. Miller, open list:TC subsystem, open list

In rsvp_change(), the value of f->res.classid is checked to be no more
than 255. Otherwise, execution jumps to errout. This is enforced by an
if-statement check. However, in the following execution, f->res.classid is
assigned with a new value returned from gen_tunnel(), and the new value
is only checked against 0. Given that gen_tunnel() may return a value
larger than 255 based on data, the new value of f->res.classid should
be re-checked.

This patch adds a re-check to ensure the new value of f->res.classid is not
greater than 255; otherwise, an error code will be returned.

Signed-off-by: Wenwen Wang <wang6495@umn.edu>
---
 net/sched/cls_rsvp.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index 4f12976..7ced8fc 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -590,6 +590,9 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb,
 		if (f->res.classid == 0 &&
 		    (f->res.classid = gen_tunnel(data)) == 0)
 			goto errout;
+
+		if (f->res.classid > 255)
+			goto errout;
 	}

 	for (sp = &data->ht[h1];
-- 
2.7.4

^ permalink raw reply related

* Re: DSA switch
From: Ran Shalit @ 2018-05-04  6:59 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: netdev
In-Reply-To: <20180503210545.GJ17027@lunn.ch>

On Fri, May 4, 2018 at 12:05 AM, Andrew Lunn <andrew@lunn.ch> wrote:
>> I am using kernel 2.6.37, but I think it is not kernel issue, but more
>> bad patches done on kernel.
>> It is based on TI's kernel, but with some custom modifications on
>> driver's switch, to make it work with TI's cpsw switch.
>> Seems like someone made some bad patch, I'll continue investigating it.
>> You can ignore the question...
>>
>> Many thanks a lot for the help,
>> Ran
>
> There is no DSA driver for the cpsw. Are you just using the cpsw to
> pass frames to a switch which is supported by DSA?
>
> In theory, mainline CPSW should just work for passing frames to an
> external switch. So why not just use mainline?
>

It seems that the bridge functions OK,
so I would rather keep working with it, instead of making too many
dramatic changes in TI's custom kernel, which works with our
chip (dm8148).

Yet, I would like to ask about the bridge:
Can a bridge also be used with dsa switch when ports are connected to
different subnets ?

Regards,
Ran

^ permalink raw reply

* Re: [PATCH v2 bpf-next 2/2] bpf: add selftest for stackmap with build_id in NMI context
From: Song Liu @ 2018-05-04  6:41 UTC (permalink / raw)
  To: Tobin C. Harding; +Cc: netdev@vger.kernel.org, Kernel Team, Teng Qin
In-Reply-To: <20180503071902.GP3791@eros>

Thanks Tobin. I will fold these changes in. 

> On May 3, 2018, at 12:19 AM, Tobin C. Harding <tobin@apporbit.com> wrote:
> 
> On Wed, May 02, 2018 at 04:20:30PM -0700, Song Liu wrote:
>> This new test captures stackmap with build_id with hardware event
>> PERF_COUNT_HW_CPU_CYCLES.
>> 
>> Because we only support one ips-to-build_id lookup per cpu in NMI
>> context, stack_amap will not be able to do the lookup in this test.
> 
>         stack_map ?

This one is stack_amap. There are two maps in the test. 

Song

> 
>> Therefore, we didn't do compare_stack_ips(), as it will always fail.
>> 
>> urandom_read.c is extended to run configurable cycles so that it can be
>> caught by the perf event.
>> 
>> Signed-off-by: Song Liu <songliubraving@fb.com>
>> ---
>> tools/testing/selftests/bpf/test_progs.c   | 137 +++++++++++++++++++++++++++++
>> tools/testing/selftests/bpf/urandom_read.c |  10 ++-
>> 2 files changed, 145 insertions(+), 2 deletions(-)
>> 
>> diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
>> index aa336f0..00bb08c 100644
>> --- a/tools/testing/selftests/bpf/test_progs.c
>> +++ b/tools/testing/selftests/bpf/test_progs.c
>> @@ -1272,6 +1272,142 @@ static void test_stacktrace_build_id(void)
>> 	return;
>> }
>> 
>> +static void test_stacktrace_build_id_nmi(void)
>> +{
>> +	int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd;
>> +	const char *file = "./test_stacktrace_build_id.o";
>> +	int err, pmu_fd, prog_fd;
>> +	struct perf_event_attr attr = {
>> +		.sample_freq = 5000,
>> +		.freq = 1,
>> +		.type = PERF_TYPE_HARDWARE,
>> +		.config = PERF_COUNT_HW_CPU_CYCLES,
>> +	};
>> +	__u32 key, previous_key, val, duration = 0;
>> +	struct bpf_object *obj;
>> +	char buf[256];
>> +	int i, j;
>> +	struct bpf_stack_build_id id_offs[PERF_MAX_STACK_DEPTH];
>> +	int build_id_matches = 0;
>> +
>> +	err = bpf_prog_load(file, BPF_PROG_TYPE_PERF_EVENT, &obj, &prog_fd);
>> +	if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno))
>> +		goto out;
> 		    
> perhaps:
> 		return;
> 
>> +	pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
>> +			 0 /* cpu 0 */, -1 /* group id */,
>> +			 0 /* flags */);
>> +	if (CHECK(pmu_fd < 0, "perf_event_open",
>> +		  "err %d errno %d. Does the test host support PERF_COUNT_HW_CPU_CYCLES?\n",
>> +		  pmu_fd, errno))
>> +		goto close_prog;
>> +
>> +	err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
>> +	if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n",
>> +		  err, errno))
>> +		goto close_pmu;
>> +
>> +	err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
>> +	if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n",
>> +		  err, errno))
>> +		goto disable_pmu;
>> +
>> +	/* find map fds */
>> +	control_map_fd = bpf_find_map(__func__, obj, "control_map");
>> +	if (CHECK(control_map_fd < 0, "bpf_find_map control_map",
>> +		  "err %d errno %d\n", err, errno))
>> +		goto disable_pmu;
>> +
>> +	stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap");
>> +	if (CHECK(stackid_hmap_fd < 0, "bpf_find_map stackid_hmap",
>> +		  "err %d errno %d\n", err, errno))
>> +		goto disable_pmu;
>> +
>> +	stackmap_fd = bpf_find_map(__func__, obj, "stackmap");
>> +	if (CHECK(stackmap_fd < 0, "bpf_find_map stackmap", "err %d errno %d\n",
>> +		  err, errno))
>> +		goto disable_pmu;
>> +
>> +	stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap");
>> +	if (CHECK(stack_amap_fd < 0, "bpf_find_map stack_amap",
>> +		  "err %d errno %d\n", err, errno))
>> +		goto disable_pmu;
>> +
>> +	assert(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null")
>> +	       == 0);
>> +	assert(system("taskset 0x1 ./urandom_read 100000") == 0);
>> +	/* disable stack trace collection */
>> +	key = 0;
>> +	val = 1;
>> +	bpf_map_update_elem(control_map_fd, &key, &val, 0);
>> +
>> +	/* for every element in stackid_hmap, we can find a corresponding one
>> +	 * in stackmap, and vice versa.
>> +	 */
>> +	err = compare_map_keys(stackid_hmap_fd, stackmap_fd);
>> +	if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap",
>> +		  "err %d errno %d\n", err, errno))
>> +		goto disable_pmu;
>> +
>> +	err = compare_map_keys(stackmap_fd, stackid_hmap_fd);
>> +	if (CHECK(err, "compare_map_keys stackmap vs. stackid_hmap",
>> +		  "err %d errno %d\n", err, errno))
>> +		goto disable_pmu;
>> +
>> +	err = extract_build_id(buf, 256);
>> +
>> +	if (CHECK(err, "get build_id with readelf",
>> +		  "err %d errno %d\n", err, errno))
>> +		goto disable_pmu;
>> +
>> +	err = bpf_map_get_next_key(stackmap_fd, NULL, &key);
>> +	if (CHECK(err, "get_next_key from stackmap",
>> +		  "err %d, errno %d\n", err, errno))
>> +		goto disable_pmu;
>> +
>> +	do {
>> +		char build_id[64];
>> +
>> +		err = bpf_map_lookup_elem(stackmap_fd, &key, id_offs);
>> +		if (CHECK(err, "lookup_elem from stackmap",
>> +			  "err %d, errno %d\n", err, errno))
>> +			goto disable_pmu;
>> +		for (i = 0; i < PERF_MAX_STACK_DEPTH; ++i)
>> +			if (id_offs[i].status == BPF_STACK_BUILD_ID_VALID &&
>> +			    id_offs[i].offset != 0) {
>> +				for (j = 0; j < 20; ++j)
>> +					sprintf(build_id + 2 * j, "%02x",
>> +						id_offs[i].build_id[j] & 0xff);
>> +				if (strstr(buf, build_id) != NULL)
>> +					build_id_matches = 1;
>> +			}
>> +		previous_key = key;
>> +	} while (bpf_map_get_next_key(stackmap_fd, &previous_key, &key) == 0);
>> +
>> +	if (CHECK(build_id_matches < 1, "build id match",
>> +		  "Didn't find expected build ID from the map\n"))
>> +		goto disable_pmu;
>> +
>> +	/*
>> +	 * We intentionally skip compare_stack_ips(). This is because we
>> +	 * only support one in_nmi() ips-to-build_id translation per cpu
>> +	 * at any time, thus stack_amap here will always fallback to
>> +	 * BPF_STACK_BUILD_ID_IP;
>> +	 */
>> +
>> +disable_pmu:
>> +	ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE);
>> +
>> +close_pmu:
>> +	close(pmu_fd);
>> +
>> +close_prog:
>> +	bpf_object__close(obj);
>> +
>> +out:
>> +	return;
>> +}
> 
> No real need for label 'out' right?  We can just return directly and
> remove the last three lines of this function.
> 
> Hope this helps,
> Tobin.

^ permalink raw reply

* [PATCH 8/8] rhashtable: don't hold lock on first table throughout insertion.
From: NeilBrown @ 2018-05-04  3:54 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152540595840.18473.11298241115621799037.stgit@noble>

rhashtable_try_insert() currently holds a lock on the bucket in
the first table, while also locking buckets in subsequent tables.
This is unnecessary and looks like a hold-over from some earlier
version of the implementation.

As insert and remove always lock a bucket in each table in turn, and
as insert only inserts in the final table, there cannot be any races
that are not covered by simply locking a bucket in each table in turn.

When an insert call reaches the last table it can be sure that there
is no matching entry in any other table as it has searched them all, and
insertion never happens anywhere but in the last table.  The fact that
code tests for the existence of future_tbl while holding a lock on
the relevant bucket ensures that two threads inserting the same key
will make compatible decisions about which is the "last" table.

This simplifies the code and allows the ->rehash field to be
discarded.
We still need a way to ensure that a dead bucket_table is never
re-linked by rhashtable_walk_stop().  This can be achieved by
setting the ->size to 1.  This still allows lookup code to work (and
correctly not find anything) but can never happen on an active bucket
table (as the minimum size is 4).

Signed-off-by: NeilBrown <neilb@suse.com>
---
 include/linux/rhashtable.h |   13 -------------
 lib/rhashtable.c           |   42 ++++++++++--------------------------------
 2 files changed, 10 insertions(+), 45 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 82d061ff96d6..0529925af41d 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -73,7 +73,6 @@ struct rhlist_head {
 struct bucket_table {
 	unsigned int		size;
 	unsigned int		nest;
-	unsigned int		rehash;
 	u32			hash_rnd;
 	unsigned int		locks_mask;
 	spinlock_t		*locks;
@@ -885,12 +884,6 @@ static inline int rhltable_insert(
  * @obj:	pointer to hash head inside object
  * @params:	hash table parameters
  *
- * Locks down the bucket chain in both the old and new table if a resize
- * is in progress to ensure that writers can't remove from the old table
- * and can't insert to the new table during the atomic operation of search
- * and insertion. Searches for duplicates in both the old and new table if
- * a resize is in progress.
- *
  * This lookup function may only be used for fixed key hash table (key_len
  * parameter set). It will BUG() if used inappropriately.
  *
@@ -946,12 +939,6 @@ static inline void *rhashtable_lookup_get_insert_fast(
  * @obj:	pointer to hash head inside object
  * @params:	hash table parameters
  *
- * Locks down the bucket chain in both the old and new table if a resize
- * is in progress to ensure that writers can't remove from the old table
- * and can't insert to the new table during the atomic operation of search
- * and insertion. Searches for duplicates in both the old and new table if
- * a resize is in progress.
- *
  * Lookups may occur in parallel with hashtable mutations and resizing.
  *
  * Will trigger an automatic deferred table resizing if residency in the
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index d0267e37e7e1..4f7a7423a675 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -293,10 +293,9 @@ static int rhashtable_rehash_chain(struct rhashtable *ht,
 	while (!(err = rhashtable_rehash_one(ht, old_hash)))
 		;
 
-	if (err == -ENOENT) {
-		old_tbl->rehash++;
+	if (err == -ENOENT)
 		err = 0;
-	}
+
 	spin_unlock_bh(old_bucket_lock);
 
 	return err;
@@ -345,6 +344,9 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
 	spin_lock(&ht->lock);
 	list_for_each_entry(walker, &old_tbl->walkers, list)
 		walker->tbl = NULL;
+
+	/* Ensure rhashtable_walk_stop() doesn't relink this table */
+	old_tbl->size = 1;
 	spin_unlock(&ht->lock);
 
 	/* Wait for readers. All new readers will see the new
@@ -597,36 +599,14 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
 	struct bucket_table *new_tbl;
 	struct bucket_table *tbl;
 	unsigned int hash;
-	spinlock_t *lock;
 	void *data;
 
-	tbl = rcu_dereference(ht->tbl);
-
-	/* All insertions must grab the oldest table containing
-	 * the hashed bucket that is yet to be rehashed.
-	 */
-	for (;;) {
-		hash = rht_head_hashfn(ht, tbl, obj, ht->p);
-		lock = rht_bucket_lock(tbl, hash);
-		spin_lock_bh(lock);
-
-		if (tbl->rehash <= hash)
-			break;
-
-		spin_unlock_bh(lock);
-		tbl = rcu_dereference(tbl->future_tbl);
-	}
-
-	data = rhashtable_lookup_one(ht, tbl, hash, key, obj);
-	new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data);
-	if (PTR_ERR(new_tbl) != -EEXIST)
-		data = ERR_CAST(new_tbl);
+	new_tbl = rcu_dereference(ht->tbl);
 
-	while (!IS_ERR_OR_NULL(new_tbl)) {
+	do {
 		tbl = new_tbl;
 		hash = rht_head_hashfn(ht, tbl, obj, ht->p);
-		spin_lock_nested(rht_bucket_lock(tbl, hash),
-				 SINGLE_DEPTH_NESTING);
+		spin_lock(rht_bucket_lock(tbl, hash));
 
 		data = rhashtable_lookup_one(ht, tbl, hash, key, obj);
 		new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data);
@@ -634,9 +614,7 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
 			data = ERR_CAST(new_tbl);
 
 		spin_unlock(rht_bucket_lock(tbl, hash));
-	}
-
-	spin_unlock_bh(lock);
+	} while (!IS_ERR_OR_NULL(new_tbl));
 
 	if (PTR_ERR(data) == -EAGAIN)
 		data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?:
@@ -971,7 +949,7 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter)
 	ht = iter->ht;
 
 	spin_lock(&ht->lock);
-	if (tbl->rehash < tbl->size)
+	if (tbl->size > 1)
 		list_add(&iter->walker.list, &tbl->walkers);
 	else
 		iter->walker.tbl = NULL;

^ permalink raw reply related

* [PATCH 7/8] rhashtable: add rhashtable_walk_prev()
From: NeilBrown @ 2018-05-04  3:54 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152540595840.18473.11298241115621799037.stgit@noble>

rhashtable_walk_prev() returns the object returned by
the previous rhashtable_walk_next(), providing it is still in the
table (or was during this grace period).
This works even if rhashtable_walk_stop() and rhashtable_walk_start()
have been called since the last rhashtable_walk_next().

If there have been no calls to rhashtable_walk_next(), or if the
object is gone from the table, then NULL is returned.

This can usefully be used in a seq_file ->start() function.
If the pos is the same as was returned by the last ->next() call,
then rhashtable_walk_prev() can be used to re-establish the
current location in the table.  If it returns NULL, then
rhashtable_walk_next() should be used.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 include/linux/rhashtable.h |    1 +
 lib/rhashtable.c           |   31 +++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 20684a451cb0..82d061ff96d6 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -367,6 +367,7 @@ static inline void rhashtable_walk_start(struct rhashtable_iter *iter)
 }
 
 void *rhashtable_walk_next(struct rhashtable_iter *iter);
+void *rhashtable_walk_prev(struct rhashtable_iter *iter);
 void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU);
 
 void rhashtable_free_and_destroy(struct rhashtable *ht,
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 038c4156b66a..d0267e37e7e1 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -921,6 +921,37 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter)
 }
 EXPORT_SYMBOL_GPL(rhashtable_walk_next);
 
+/**
+ * rhashtable_walk_prev - Return the previously returned object, if available
+ * @iter:	Hash table iterator
+ *
+ * If rhashtable_walk_next() has previously been called and the object
+ * it returned is still in the hash table, that object is returned again,
+ * otherwise %NULL is returned.
+ *
+ * If the recent rhashtable_walk_next() call was since the most recent
+ * rhashtable_walk_start() call then the returned object may not, strictly
+ * speaking, still be in the table.  It will be safe to dereference.
+ *
+ * Note that the iterator is not changed and in particular it does not
+ * step backwards.
+ */
+void *rhashtable_walk_prev(struct rhashtable_iter *iter)
+{
+	struct rhashtable *ht = iter->ht;
+	struct rhash_head *p = iter->p;
+
+	if (!p)
+		return NULL;
+	if (!iter->p_is_unsafe || ht->rhlist)
+		return p;
+	rht_for_each_rcu(p, iter->walker.tbl, iter->slot)
+		if (p == iter->p)
+			return p;
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_prev);
+
 /**
  * rhashtable_walk_stop - Finish a hash table walk
  * @iter:	Hash table iterator

^ permalink raw reply related

* [PATCH 6/8] rhashtable: further improve stability of rhashtable_walk
From: NeilBrown @ 2018-05-04  3:54 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152540595840.18473.11298241115621799037.stgit@noble>

If the sequence:
   obj = rhashtable_walk_next(iter);
   rhashtable_walk_stop(iter);
   rhashtable_remove_fast(ht, &obj->head, params);
   rhashtable_walk_start(iter);

 races with another thread inserting or removing
 an object on the same hash chain, a subsequent
 rhashtable_walk_next() is not guaranteed to get the "next"
 object. It is possible that an object could be
 repeated, or missed.

 This can be made more reliable by keeping the objects in a hash chain
 sorted by memory address.  A subsequent rhashtable_walk_next()
 call can reliably find the correct position in the list, and thus
 find the 'next' object.

 It is not possible (certainly not so easy) to achieve this with an
 rhltable as keeping the hash chain in order is not so easy.  When the
 first object with a given key is removed, it is replaced in the chain
 with the next object with the same key, and the address of that
 object may not be correctly ordered.
 No current user of rhltable_walk_enter() calls
 rhashtable_walk_start() more than once, so no current code
 could benefit from a more reliable walk of rhltables.

 This patch only attempts to improve walks for rhashtables.
 - a new object is always inserted after the last object with a
   smaller address, or at the start
 - when rhashtable_walk_start() is called, it records that 'p' is not
   'safe', meaning that it cannot be dereferenced.  The revalidation
   that was previously done here is moved to rhashtable_walk_next()
 - when rhashtable_walk_next() is called while p is not NULL and not
   safe, it walks the chain looking for the first object with an
   address greater than p and returns that.  If there is none, it moves
   to the next hash chain.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 include/linux/rhashtable.h |   11 +++++-
 lib/rhashtable.c           |   82 ++++++++++++++++++++++++++++----------------
 2 files changed, 62 insertions(+), 31 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 5091abf975a1..20684a451cb0 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -188,6 +188,7 @@ struct rhashtable_iter {
 	struct rhashtable_walker walker;
 	unsigned int slot;
 	unsigned int skip;
+	bool p_is_unsafe;
 	bool end_of_table;
 };
 
@@ -737,7 +738,12 @@ static inline void *__rhashtable_insert_fast(
 		    (params.obj_cmpfn ?
 		     params.obj_cmpfn(&arg, rht_obj(ht, head)) :
 		     rhashtable_compare(&arg, rht_obj(ht, head)))) {
-			pprev = &head->next;
+			if (rhlist) {
+				pprev = &head->next;
+			} else {
+				if (head < obj)
+					pprev = &head->next;
+			}
 			continue;
 		}
 
@@ -1233,7 +1239,8 @@ static inline int rhashtable_walk_init(struct rhashtable *ht,
  * Note that if you restart a walk after rhashtable_walk_stop you
  * may see the same object twice.  Also, you may miss objects if
  * there are removals in between rhashtable_walk_stop and the next
- * call to rhashtable_walk_start.
+ * call to rhashtable_walk_start.  Note that this is different to
+ * rhashtable_walk_enter() which misses objects.
  *
  * For a completely stable walk you should construct your own data
  * structure outside the hash table.
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 83f5d1ebf452..038c4156b66a 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -234,6 +234,7 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
 	struct bucket_table *new_tbl = rhashtable_last_table(ht,
 		rht_dereference_rcu(old_tbl->future_tbl, ht));
 	struct rhash_head __rcu **pprev = rht_bucket_var(old_tbl, old_hash);
+	struct rhash_head __rcu **inspos;
 	int err = -EAGAIN;
 	struct rhash_head *head, *next, *entry;
 	spinlock_t *new_bucket_lock;
@@ -262,12 +263,15 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
 	new_bucket_lock = rht_bucket_lock(new_tbl, new_hash);
 
 	spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING);
-	head = rht_dereference_bucket(new_tbl->buckets[new_hash],
-				      new_tbl, new_hash);
-
+	inspos = &new_tbl->buckets[new_hash];
+	head = rht_dereference_bucket(*inspos, new_tbl, new_hash);
+	while (!rht_is_a_nulls(head) && head < entry) {
+		inspos = &head->next;
+		head = rht_dereference_bucket(*inspos, new_tbl, new_hash);
+	}
 	RCU_INIT_POINTER(entry->next, head);
 
-	rcu_assign_pointer(new_tbl->buckets[new_hash], entry);
+	rcu_assign_pointer(*inspos, entry);
 	spin_unlock(new_bucket_lock);
 
 	rcu_assign_pointer(*pprev, next);
@@ -565,6 +569,10 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
 		return ERR_PTR(-ENOMEM);
 
 	head = rht_dereference_bucket(*pprev, tbl, hash);
+	while (!rht_is_a_nulls(head) && head < obj) {
+		pprev = &head->next;
+		head = rht_dereference_bucket(*pprev, tbl, hash);
+	}
 
 	RCU_INIT_POINTER(obj->next, head);
 	if (ht->rhlist) {
@@ -659,10 +667,10 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
  *
  * This function prepares a hash table walk.
  *
- * Note that if you restart a walk after rhashtable_walk_stop you
- * may see the same object twice.  Also, you may miss objects if
- * there are removals in between rhashtable_walk_stop and the next
- * call to rhashtable_walk_start.
+ * A walk is guaranteed to return every object that was in
+ * the table before this call, and is still in the table when
+ * rhashtable_walk_next() returns NULL.  Duplicates can be
+ * seen, but only if there is a rehash event during the walk.
  *
  * For a completely stable walk you should construct your own data
  * structure outside the hash table.
@@ -746,19 +754,10 @@ int rhashtable_walk_start_check(struct rhashtable_iter *iter)
 
 	if (iter->p && !rhlist) {
 		/*
-		 * We need to validate that 'p' is still in the table, and
-		 * if so, update 'skip'
+		 * 'p' will be revalidated when rhashtable_walk_next()
+		 * is called.
 		 */
-		struct rhash_head *p;
-		int skip = 0;
-		rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
-			skip++;
-			if (p == iter->p) {
-				iter->skip = skip;
-				goto found;
-			}
-		}
-		iter->p = NULL;
+		iter->p_is_unsafe = true;
 	} else if (iter->p && rhlist) {
 		/* Need to validate that 'list' is still in the table, and
 		 * if so, update 'skip' and 'p'.
@@ -875,15 +874,39 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter)
 	bool rhlist = ht->rhlist;
 
 	if (p) {
-		if (!rhlist || !(list = rcu_dereference(list->next))) {
-			p = rcu_dereference(p->next);
-			list = container_of(p, struct rhlist_head, rhead);
-		}
-		if (!rht_is_a_nulls(p)) {
-			iter->skip++;
-			iter->p = p;
-			iter->list = list;
-			return rht_obj(ht, rhlist ? &list->rhead : p);
+		if (!rhlist && iter->p_is_unsafe) {
+			/*
+			 * First time next() was called after start().
+			 * Need to find location of 'p' in the list.
+			 */
+			struct rhash_head *p;
+
+			iter->skip = 0;
+			rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
+				iter->skip++;
+				if (p <= iter->p)
+					continue;
+
+				/* p is the next object after iter->p */
+				iter->p = p;
+				iter->p_is_unsafe = false;
+				return rht_obj(ht, p);
+			}
+			/* There is no "next" object in the list, move
+			 * to next hash chain.
+			 */
+		} else {
+			if (!rhlist || !(list = rcu_dereference(list->next))) {
+				p = rcu_dereference(p->next);
+				list = container_of(p, struct rhlist_head,
+						    rhead);
+			}
+			if (!rht_is_a_nulls(p)) {
+				iter->skip++;
+				iter->p = p;
+				iter->list = list;
+				return rht_obj(ht, rhlist ? &list->rhead : p);
+			}
 		}
 
 		/* At the end of this slot, switch to next one and then find
@@ -893,6 +916,7 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter)
 		iter->slot++;
 	}
 
+	iter->p_is_unsafe = false;
 	return __rhashtable_walk_find_next(iter);
 }
 EXPORT_SYMBOL_GPL(rhashtable_walk_next);

^ permalink raw reply related

* [PATCH 5/8] rhashtable: remove rhashtable_walk_peek()
From: NeilBrown @ 2018-05-04  3:54 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152540595840.18473.11298241115621799037.stgit@noble>

This function has a somewhat confused behavior that is not properly
described by the documentation.
Sometimes it returns the previous object, sometimes it returns the
next one.
Sometimes it changes the iterator, sometimes it doesn't.

This function is not currently used and is not worth keeping, so
remove it.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 include/linux/rhashtable.h |    1 -
 lib/rhashtable.c           |   34 ----------------------------------
 2 files changed, 35 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 8822924dd05a..5091abf975a1 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -366,7 +366,6 @@ static inline void rhashtable_walk_start(struct rhashtable_iter *iter)
 }
 
 void *rhashtable_walk_next(struct rhashtable_iter *iter);
-void *rhashtable_walk_peek(struct rhashtable_iter *iter);
 void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU);
 
 void rhashtable_free_and_destroy(struct rhashtable *ht,
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 114e6090228a..83f5d1ebf452 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -897,40 +897,6 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter)
 }
 EXPORT_SYMBOL_GPL(rhashtable_walk_next);
 
-/**
- * rhashtable_walk_peek - Return the next object but don't advance the iterator
- * @iter:	Hash table iterator
- *
- * Returns the next object or NULL when the end of the table is reached.
- *
- * Returns -EAGAIN if resize event occurred.  Note that the iterator
- * will rewind back to the beginning and you may continue to use it.
- */
-void *rhashtable_walk_peek(struct rhashtable_iter *iter)
-{
-	struct rhlist_head *list = iter->list;
-	struct rhashtable *ht = iter->ht;
-	struct rhash_head *p = iter->p;
-
-	if (p)
-		return rht_obj(ht, ht->rhlist ? &list->rhead : p);
-
-	/* No object found in current iter, find next one in the table. */
-
-	if (iter->skip) {
-		/* A nonzero skip value points to the next entry in the table
-		 * beyond that last one that was found. Decrement skip so
-		 * we find the current value. __rhashtable_walk_find_next
-		 * will restore the original value of skip assuming that
-		 * the table hasn't changed.
-		 */
-		iter->skip--;
-	}
-
-	return __rhashtable_walk_find_next(iter);
-}
-EXPORT_SYMBOL_GPL(rhashtable_walk_peek);
-
 /**
  * rhashtable_walk_stop - Finish a hash table walk
  * @iter:	Hash table iterator

^ permalink raw reply related

* [PATCH 4/8] rhashtable: fix race in nested_table_alloc()
From: NeilBrown @ 2018-05-04  3:54 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152540595840.18473.11298241115621799037.stgit@noble>

If two threads run nested_table_alloc() at the same time
they could both allocate a new table.
Best case is that one of them will never be freed, leaking memory.
Worst case is that entries get stored there before it leaks,
and they are lost from the table.

So use cmpxchg to detect the race and free the unused table.

Fixes: da20420f83ea ("rhashtable: Add nested tables")
Cc: stable@vger.kernel.org # 4.11+
Signed-off-by: NeilBrown <neilb@suse.com>
---
 lib/rhashtable.c |   10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index b73afe1dec7e..114e6090228a 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -119,6 +119,7 @@ static union nested_table *nested_table_alloc(struct rhashtable *ht,
 					      unsigned int nhash)
 {
 	union nested_table *ntbl;
+	union nested_table *tmp;
 	int i;
 
 	ntbl = rcu_dereference(*prev);
@@ -133,9 +134,12 @@ static union nested_table *nested_table_alloc(struct rhashtable *ht,
 					    (i << shifted) | nhash);
 	}
 
-	rcu_assign_pointer(*prev, ntbl);
-
-	return ntbl;
+	rcu_assign_pointer(tmp, ntbl);
+	if (cmpxchg(prev, NULL, tmp) == NULL)
+		return tmp;
+	/* Raced with another thread. */
+	kfree(ntbl);
+	return rcu_dereference(*prev);
 }
 
 static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht,

^ permalink raw reply related

* [PATCH 3/8] rhashtable: use cmpxchg() to protect ->future_tbl.
From: NeilBrown @ 2018-05-04  3:54 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152540595840.18473.11298241115621799037.stgit@noble>

Rather than borrowing one of the bucket locks to
protect ->future_tbl updates, use cmpxchg().
This gives more freedom to change how bucket locking
is implemented.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 lib/rhashtable.c |   17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 4a3f94e8e8a6..b73afe1dec7e 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -298,21 +298,16 @@ static int rhashtable_rehash_attach(struct rhashtable *ht,
 				    struct bucket_table *old_tbl,
 				    struct bucket_table *new_tbl)
 {
-	/* Protect future_tbl using the first bucket lock. */
-	spin_lock_bh(old_tbl->locks);
-
-	/* Did somebody beat us to it? */
-	if (rcu_access_pointer(old_tbl->future_tbl)) {
-		spin_unlock_bh(old_tbl->locks);
-		return -EEXIST;
-	}
-
 	/* Make insertions go into the new, empty table right away. Deletions
 	 * and lookups will be attempted in both tables until we synchronize.
+	 * The use of 'tmp' is simply to ensure we get the required memory
+	 * barriers before the cmpxchg().
 	 */
-	rcu_assign_pointer(old_tbl->future_tbl, new_tbl);
+	struct bucket_table *tmp;
 
-	spin_unlock_bh(old_tbl->locks);
+	rcu_assign_pointer(tmp, new_tbl);
+	if (cmpxchg(&old_tbl->future_tbl, NULL, tmp) != NULL)
+		return -EEXIST;
 
 	return 0;
 }

^ permalink raw reply related

* [PATCH 2/8] rhashtable: remove nulls_base and related code.
From: NeilBrown @ 2018-05-04  3:54 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152540595840.18473.11298241115621799037.stgit@noble>

This "feature" is unused, undocumented, and untested and so
doesn't really belong.  If a use for the nulls marker
is found, all this code would need to be reviewed to
ensure it works as required.  It would be just as easy to
just add the code if/when it is needed instead.

This patch actually fixes a bug too.  The table resizing allows a
table to grow to 2^31 buckets, but the hash is truncated to 27 bits -
any growth beyond 2^27 is wasteful and ineffective.

This patch results in NULLS_MARKER(0) being used for all chains,
and leaves rht_is_a_nulls() as the way to test for it.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 include/linux/rhashtable.h |   35 +++--------------------------------
 lib/rhashtable.c           |    8 --------
 lib/test_rhashtable.c      |    5 +----
 3 files changed, 4 insertions(+), 44 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 4e1f535c2034..8822924dd05a 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -29,25 +29,8 @@
 
 /*
  * The end of the chain is marked with a special nulls marks which has
- * the following format:
- *
- * +-------+-----------------------------------------------------+-+
- * | Base  |                      Hash                           |1|
- * +-------+-----------------------------------------------------+-+
- *
- * Base (4 bits) : Reserved to distinguish between multiple tables.
- *                 Specified via &struct rhashtable_params.nulls_base.
- * Hash (27 bits): Full hash (unmasked) of first element added to bucket
- * 1 (1 bit)     : Nulls marker (always set)
- *
- * The remaining bits of the next pointer remain unused for now.
+ * the least significant bit set.
  */
-#define RHT_BASE_BITS		4
-#define RHT_HASH_BITS		27
-#define RHT_BASE_SHIFT		RHT_HASH_BITS
-
-/* Base bits plus 1 bit for nulls marker */
-#define RHT_HASH_RESERVED_SPACE	(RHT_BASE_BITS + 1)
 
 /* Maximum chain length before rehash
  *
@@ -129,7 +112,6 @@ struct rhashtable;
  * @min_size: Minimum size while shrinking
  * @locks_mul: Number of bucket locks to allocate per cpu (default: 32)
  * @automatic_shrinking: Enable automatic shrinking of tables
- * @nulls_base: Base value to generate nulls marker
  * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash)
  * @obj_hashfn: Function to hash object
  * @obj_cmpfn: Function to compare key with object
@@ -143,7 +125,6 @@ struct rhashtable_params {
 	u16			min_size;
 	bool			automatic_shrinking;
 	u8			locks_mul;
-	u32			nulls_base;
 	rht_hashfn_t		hashfn;
 	rht_obj_hashfn_t	obj_hashfn;
 	rht_obj_cmpfn_t		obj_cmpfn;
@@ -210,24 +191,14 @@ struct rhashtable_iter {
 	bool end_of_table;
 };
 
-static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash)
-{
-	return NULLS_MARKER(ht->p.nulls_base + hash);
-}
-
 #define INIT_RHT_NULLS_HEAD(ptr, ht, hash) \
-	((ptr) = (typeof(ptr)) rht_marker(ht, hash))
+	((ptr) = (typeof(ptr)) NULLS_MARKER(0))
 
 static inline bool rht_is_a_nulls(const struct rhash_head *ptr)
 {
 	return ((unsigned long) ptr & 1);
 }
 
-static inline unsigned long rht_get_nulls_value(const struct rhash_head *ptr)
-{
-	return ((unsigned long) ptr) >> 1;
-}
-
 static inline void *rht_obj(const struct rhashtable *ht,
 			    const struct rhash_head *he)
 {
@@ -237,7 +208,7 @@ static inline void *rht_obj(const struct rhashtable *ht,
 static inline unsigned int rht_bucket_index(const struct bucket_table *tbl,
 					    unsigned int hash)
 {
-	return (hash >> RHT_HASH_RESERVED_SPACE) & (tbl->size - 1);
+	return hash & (tbl->size - 1);
 }
 
 static inline unsigned int rht_key_get_hash(struct rhashtable *ht,
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 9427b5766134..4a3f94e8e8a6 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -994,7 +994,6 @@ static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
  *	.key_offset = offsetof(struct test_obj, key),
  *	.key_len = sizeof(int),
  *	.hashfn = jhash,
- *	.nulls_base = (1U << RHT_BASE_SHIFT),
  * };
  *
  * Configuration Example 2: Variable length keys
@@ -1028,9 +1027,6 @@ int rhashtable_init(struct rhashtable *ht,
 	    (params->obj_hashfn && !params->obj_cmpfn))
 		return -EINVAL;
 
-	if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT))
-		return -EINVAL;
-
 	memset(ht, 0, sizeof(*ht));
 	mutex_init(&ht->mutex);
 	spin_lock_init(&ht->lock);
@@ -1095,10 +1091,6 @@ int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params)
 {
 	int err;
 
-	/* No rhlist NULLs marking for now. */
-	if (params->nulls_base)
-		return -EINVAL;
-
 	err = rhashtable_init(&hlt->ht, params);
 	hlt->ht.rhlist = true;
 	return err;
diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
index bf92b7aa2a49..b428a9c7522a 100644
--- a/lib/test_rhashtable.c
+++ b/lib/test_rhashtable.c
@@ -83,7 +83,7 @@ static u32 my_hashfn(const void *data, u32 len, u32 seed)
 {
 	const struct test_obj_rhl *obj = data;
 
-	return (obj->value.id % 10) << RHT_HASH_RESERVED_SPACE;
+	return (obj->value.id % 10);
 }
 
 static int my_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
@@ -99,7 +99,6 @@ static struct rhashtable_params test_rht_params = {
 	.key_offset = offsetof(struct test_obj, value),
 	.key_len = sizeof(struct test_obj_val),
 	.hashfn = jhash,
-	.nulls_base = (3U << RHT_BASE_SHIFT),
 };
 
 static struct rhashtable_params test_rht_params_dup = {
@@ -294,8 +293,6 @@ static int __init test_rhltable(unsigned int entries)
 	if (!obj_in_table)
 		goto out_free;
 
-	/* nulls_base not supported in rhlist interface */
-	test_rht_params.nulls_base = 0;
 	err = rhltable_init(&rhlt, &test_rht_params);
 	if (WARN_ON(err))
 		goto out_free;

^ permalink raw reply related

* [PATCH 1/8] rhashtable: silence RCU warning in rhashtable_test.
From: NeilBrown @ 2018-05-04  3:54 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152540595840.18473.11298241115621799037.stgit@noble>

print_ht in rhashtable_test calls rht_dereference() with neither
RCU protection nor the mutex.  This triggers an RCU warning.
So take the mutex to silence the warning.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 lib/test_rhashtable.c |    3 +++
 1 file changed, 3 insertions(+)

diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
index f4000c137dbe..bf92b7aa2a49 100644
--- a/lib/test_rhashtable.c
+++ b/lib/test_rhashtable.c
@@ -499,6 +499,8 @@ static unsigned int __init print_ht(struct rhltable *rhlt)
 	unsigned int i, cnt = 0;

 	ht = &rhlt->ht;
+	/* Take the mutex to avoid RCU warning */
+	mutex_lock(&ht->mutex);
 	tbl = rht_dereference(ht->tbl, ht);
 	for (i = 0; i < tbl->size; i++) {
 		struct rhash_head *pos, *next;
@@ -532,6 +534,7 @@ static unsigned int __init print_ht(struct rhltable *rhlt)
 		}
 	}
 	printk(KERN_ERR "\n---- ht: ----%s\n-------------\n", buff);
+	mutex_unlock(&ht->mutex);

 	return cnt;
 }

^ permalink raw reply related

* [PATCH 0/8] Assorted rhashtable fixes and cleanups
From: NeilBrown @ 2018-05-04  3:54 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel

This series contains some bugfixes, mostly minor though one
is worthy of a stable backport I think - tagged with Fixes and Cc: stable.

Then there are improvements to walking, which have been discussed
to some degree already.
Finally a code simplification which I think is correct...

Thanks,
NeilBrown

---

NeilBrown (8):
      rhashtable: silence RCU warning in rhashtable_test.
      rhashtable: remove nulls_base and related code.
      rhashtable: use cmpxchg() to protect ->future_tbl.
      rhashtable: fix race in nested_table_alloc()
      rhashtable: remove rhashtable_walk_peek()
      rhashtable: further improve stability of rhashtable_walk
      rhashtable: add rhashtable_walk_prev()
      rhashtable: don't hold lock on first table throughout insertion.


 include/linux/rhashtable.h |   61 +++----------
 lib/rhashtable.c           |  202 +++++++++++++++++++++-----------------------
 lib/test_rhashtable.c      |    8 +-
 3 files changed, 113 insertions(+), 158 deletions(-)

--
Signature

^ permalink raw reply

* Re: [net-next PATCH 3/5] udp: Add support for software checksum and GSO_PARTIAL with GSO offload
From: Eric Dumazet @ 2018-05-04  3:50 UTC (permalink / raw)
  To: Alexander Duyck, netdev, willemb, davem, Steffen Klassert
In-Reply-To: <20180504003333.4496.7705.stgit@localhost.localdomain>



On 05/03/2018 05:33 PM, Alexander Duyck wrote:
> From: Alexander Duyck <alexander.h.duyck@intel.com>
> 
> This patch adds support for a software provided checksum and GSO_PARTIAL
> segmentation support. With this we can offload UDP segmentation on devices
> that only have partial support for tunnels.
> 
> Since we are no longer needing the hardware checksum we can drop the checks
> in the segmentation code that were verifying if it was present.
> 
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> ---
>  net/ipv4/udp_offload.c |   28 ++++++++++++++++++----------
>  net/ipv6/udp_offload.c |   11 +----------
>  2 files changed, 19 insertions(+), 20 deletions(-)
> 
> diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
> index 946d06d2aa0c..fd94bbb369b2 100644
> --- a/net/ipv4/udp_offload.c
> +++ b/net/ipv4/udp_offload.c
> @@ -217,6 +217,13 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
>  		return segs;
>  	}
>  
> +	/* GSO partial and frag_list segmentation only requires splitting
> +	 * the frame into an MSS multiple and possibly a remainder, both
> +	 * cases return a GSO skb. So update the mss now.
> +	 */
> +	if (skb_is_gso(segs))
> +		mss *= skb_shinfo(segs)->gso_segs;
> +
> 

I do not understand this code.

I am also seeing it in tcp, after commit 07b26c9454a2a ("gso: Support partial splitting at the frag_list pointer")

Presumably this broke tcp_gso_tstamp(), right?

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox