Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next 5/7] nfp: add set ipv4 header action flower offload
From: Simon Horman @ 2017-10-06  8:21 UTC (permalink / raw)
  To: David Miller, Jakub Kicinski
  Cc: netdev, oss-drivers, Pieter Jansen van Vuuren, Simon Horman
In-Reply-To: <1507278086-3102-1-git-send-email-simon.horman@netronome.com>

From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>

Previously we did not have offloading support for set IPv4 actions. This
patch enables TC flower offload of set IPv4 src and dst address actions.

Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Simon Horman <simon.horman@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/flower/action.c | 46 ++++++++++++++++++++++
 drivers/net/ethernet/netronome/nfp/flower/cmsg.h   | 10 +++++
 2 files changed, 56 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c
index 631ea4b7d08e..2f886a529ee4 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/action.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/action.c
@@ -266,14 +266,53 @@ nfp_fl_set_eth(const struct tc_action *action, int idx, u32 off,
 }
 
 static int
+nfp_fl_set_ip4(const struct tc_action *action, int idx, u32 off,
+	       struct nfp_fl_set_ip4_addrs *set_ip_addr)
+{
+	u16 tmp_set_ipv4_op;
+	__be32 exact, mask;
+
+	/* We are expecting tcf_pedit to return a big endian value */
+	mask = (__force __be32)~tcf_pedit_mask(action, idx);
+	exact = (__force __be32)tcf_pedit_val(action, idx);
+
+	if (exact & ~mask)
+		return -EOPNOTSUPP;
+
+	switch (off) {
+	case offsetof(struct iphdr, daddr):
+		set_ip_addr->ipv4_dst_mask = mask;
+		set_ip_addr->ipv4_dst = exact;
+		break;
+	case offsetof(struct iphdr, saddr):
+		set_ip_addr->ipv4_src_mask = mask;
+		set_ip_addr->ipv4_src = exact;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	set_ip_addr->reserved = cpu_to_be16(0);
+	tmp_set_ipv4_op = FIELD_PREP(NFP_FL_ACT_LEN_LW,
+				     sizeof(*set_ip_addr) >> NFP_FL_LW_SIZ) |
+			  FIELD_PREP(NFP_FL_ACT_JMP_ID,
+				     NFP_FL_ACTION_OPCODE_SET_IPV4_ADDRS);
+	set_ip_addr->a_op = cpu_to_be16(tmp_set_ipv4_op);
+
+	return 0;
+}
+
+static int
 nfp_fl_pedit(const struct tc_action *action, char *nfp_action, int *a_len)
 {
+	struct nfp_fl_set_ip4_addrs set_ip_addr;
 	struct nfp_fl_set_eth set_eth;
 	enum pedit_header_type htype;
 	int idx, nkeys, err;
 	size_t act_size;
 	u32 offset, cmd;
 
+	memset(&set_ip_addr, 0, sizeof(set_ip_addr));
 	memset(&set_eth, 0, sizeof(set_eth));
 	nkeys = tcf_pedit_nkeys(action);
 
@@ -289,6 +328,9 @@ nfp_fl_pedit(const struct tc_action *action, char *nfp_action, int *a_len)
 		case TCA_PEDIT_KEY_EX_HDR_TYPE_ETH:
 			err = nfp_fl_set_eth(action, idx, offset, &set_eth);
 			break;
+		case TCA_PEDIT_KEY_EX_HDR_TYPE_IP4:
+			err = nfp_fl_set_ip4(action, idx, offset, &set_ip_addr);
+			break;
 		default:
 			return -EOPNOTSUPP;
 		}
@@ -300,6 +342,10 @@ nfp_fl_pedit(const struct tc_action *action, char *nfp_action, int *a_len)
 		act_size = sizeof(set_eth);
 		memcpy(nfp_action, &set_eth, act_size);
 		*a_len += act_size;
+	} else if (set_ip_addr.a_op) {
+		act_size = sizeof(set_ip_addr);
+		memcpy(nfp_action, &set_ip_addr, act_size);
+		*a_len += act_size;
 	}
 
 	return 0;
diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
index ffeaf85aa420..7ace557fdf84 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
@@ -78,6 +78,7 @@
 #define NFP_FL_ACTION_OPCODE_POP_VLAN		2
 #define NFP_FL_ACTION_OPCODE_SET_IPV4_TUNNEL	6
 #define NFP_FL_ACTION_OPCODE_SET_ETHERNET	7
+#define NFP_FL_ACTION_OPCODE_SET_IPV4_ADDRS	9
 #define NFP_FL_ACTION_OPCODE_PRE_TUNNEL		17
 #define NFP_FL_ACTION_OPCODE_NUM		32
 
@@ -115,6 +116,15 @@ struct nfp_fl_set_eth {
 	u8 eth_addr_val[ETH_ALEN * 2];
 };
 
+struct nfp_fl_set_ip4_addrs {
+	__be16 a_op;
+	__be16 reserved;
+	__be32 ipv4_src_mask;
+	__be32 ipv4_src;
+	__be32 ipv4_dst_mask;
+	__be32 ipv4_dst;
+};
+
 struct nfp_fl_output {
 	__be16 a_op;
 	__be16 flags;
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next 6/7] nfp: add set ipv6 source and destination address
From: Simon Horman @ 2017-10-06  8:21 UTC (permalink / raw)
  To: David Miller, Jakub Kicinski
  Cc: netdev, oss-drivers, Pieter Jansen van Vuuren, Simon Horman
In-Reply-To: <1507278086-3102-1-git-send-email-simon.horman@netronome.com>

From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>

Previously we did not have offloading support for set IPv6 actions. This
patch enables TC flower offload of set IPv6 src and dst address actions.

Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Simon Horman <simon.horman@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/flower/action.c | 72 ++++++++++++++++++++++
 drivers/net/ethernet/netronome/nfp/flower/cmsg.h   | 11 ++++
 2 files changed, 83 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c
index 2f886a529ee4..4394e4f15fdb 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/action.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/action.c
@@ -302,9 +302,55 @@ nfp_fl_set_ip4(const struct tc_action *action, int idx, u32 off,
 	return 0;
 }
 
+static void
+nfp_fl_set_ip6_helper(int opcode_tag, int idx, __be32 exact, __be32 mask,
+		      struct nfp_fl_set_ipv6_addr *ip6)
+{
+	u16 tmp_set_op;
+
+	ip6->ipv6[idx % 4].mask = mask;
+	ip6->ipv6[idx % 4].exact = exact;
+
+	ip6->reserved = cpu_to_be16(0);
+	tmp_set_op = FIELD_PREP(NFP_FL_ACT_LEN_LW, sizeof(*ip6) >>
+				NFP_FL_LW_SIZ) |
+		     FIELD_PREP(NFP_FL_ACT_JMP_ID, opcode_tag);
+	ip6->a_op = cpu_to_be16(tmp_set_op);
+}
+
+static int
+nfp_fl_set_ip6(const struct tc_action *action, int idx, u32 off,
+	       struct nfp_fl_set_ipv6_addr *ip_dst,
+	       struct nfp_fl_set_ipv6_addr *ip_src)
+{
+	__be32 exact, mask;
+
+	/* We are expecting tcf_pedit to return a big endian value */
+	mask = (__force __be32)~tcf_pedit_mask(action, idx);
+	exact = (__force __be32)tcf_pedit_val(action, idx);
+
+	if (exact & ~mask)
+		return -EOPNOTSUPP;
+
+	if (off < offsetof(struct ipv6hdr, saddr))
+		return -EOPNOTSUPP;
+	else if (off < offsetof(struct ipv6hdr, daddr))
+		nfp_fl_set_ip6_helper(NFP_FL_ACTION_OPCODE_SET_IPV6_SRC, idx,
+				      exact, mask, ip_src);
+	else if (off < offsetof(struct ipv6hdr, daddr) +
+		       sizeof(struct in6_addr))
+		nfp_fl_set_ip6_helper(NFP_FL_ACTION_OPCODE_SET_IPV6_DST, idx,
+				      exact, mask, ip_dst);
+	else
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
 static int
 nfp_fl_pedit(const struct tc_action *action, char *nfp_action, int *a_len)
 {
+	struct nfp_fl_set_ipv6_addr set_ip6_dst, set_ip6_src;
 	struct nfp_fl_set_ip4_addrs set_ip_addr;
 	struct nfp_fl_set_eth set_eth;
 	enum pedit_header_type htype;
@@ -312,6 +358,8 @@ nfp_fl_pedit(const struct tc_action *action, char *nfp_action, int *a_len)
 	size_t act_size;
 	u32 offset, cmd;
 
+	memset(&set_ip6_dst, 0, sizeof(set_ip6_dst));
+	memset(&set_ip6_src, 0, sizeof(set_ip6_src));
 	memset(&set_ip_addr, 0, sizeof(set_ip_addr));
 	memset(&set_eth, 0, sizeof(set_eth));
 	nkeys = tcf_pedit_nkeys(action);
@@ -331,6 +379,10 @@ nfp_fl_pedit(const struct tc_action *action, char *nfp_action, int *a_len)
 		case TCA_PEDIT_KEY_EX_HDR_TYPE_IP4:
 			err = nfp_fl_set_ip4(action, idx, offset, &set_ip_addr);
 			break;
+		case TCA_PEDIT_KEY_EX_HDR_TYPE_IP6:
+			err = nfp_fl_set_ip6(action, idx, offset, &set_ip6_dst,
+					     &set_ip6_src);
+			break;
 		default:
 			return -EOPNOTSUPP;
 		}
@@ -346,6 +398,26 @@ nfp_fl_pedit(const struct tc_action *action, char *nfp_action, int *a_len)
 		act_size = sizeof(set_ip_addr);
 		memcpy(nfp_action, &set_ip_addr, act_size);
 		*a_len += act_size;
+	} else if (set_ip6_dst.a_op && set_ip6_src.a_op) {
+		/* TC compiles set src and dst IPv6 address as a single action,
+		 * the hardware requires this to be 2 separate actions.
+		 */
+		act_size = sizeof(set_ip6_src);
+		memcpy(nfp_action, &set_ip6_src, act_size);
+		*a_len += act_size;
+
+		act_size = sizeof(set_ip6_dst);
+		memcpy(&nfp_action[sizeof(set_ip6_src)], &set_ip6_dst,
+		       act_size);
+		*a_len += act_size;
+	} else if (set_ip6_dst.a_op) {
+		act_size = sizeof(set_ip6_dst);
+		memcpy(nfp_action, &set_ip6_dst, act_size);
+		*a_len += act_size;
+	} else if (set_ip6_src.a_op) {
+		act_size = sizeof(set_ip6_src);
+		memcpy(nfp_action, &set_ip6_src, act_size);
+		*a_len += act_size;
 	}
 
 	return 0;
diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
index 7ace557fdf84..527914e294d7 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
@@ -79,6 +79,8 @@
 #define NFP_FL_ACTION_OPCODE_SET_IPV4_TUNNEL	6
 #define NFP_FL_ACTION_OPCODE_SET_ETHERNET	7
 #define NFP_FL_ACTION_OPCODE_SET_IPV4_ADDRS	9
+#define NFP_FL_ACTION_OPCODE_SET_IPV6_SRC	11
+#define NFP_FL_ACTION_OPCODE_SET_IPV6_DST	12
 #define NFP_FL_ACTION_OPCODE_PRE_TUNNEL		17
 #define NFP_FL_ACTION_OPCODE_NUM		32
 
@@ -125,6 +127,15 @@ struct nfp_fl_set_ip4_addrs {
 	__be32 ipv4_dst;
 };
 
+struct nfp_fl_set_ipv6_addr {
+	__be16 a_op;
+	__be16 reserved;
+	struct {
+		__be32 mask;
+		__be32 exact;
+	} ipv6[4];
+};
+
 struct nfp_fl_output {
 	__be16 a_op;
 	__be16 flags;
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next 7/7] nfp: add set tcp and udp header action flower offload
From: Simon Horman @ 2017-10-06  8:21 UTC (permalink / raw)
  To: David Miller, Jakub Kicinski
  Cc: netdev, oss-drivers, Pieter Jansen van Vuuren, Simon Horman
In-Reply-To: <1507278086-3102-1-git-send-email-simon.horman@netronome.com>

From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>

Previously we did not have offloading support for set TCP/UDP actions. This
patch enables TC flower offload of set TCP/UDP sport and dport actions.

Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Simon Horman <simon.horman@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/flower/action.c | 42 ++++++++++++++++++++++
 drivers/net/ethernet/netronome/nfp/flower/cmsg.h   |  9 +++++
 2 files changed, 51 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c
index 4394e4f15fdb..1194c47ef827 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/action.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/action.c
@@ -348,10 +348,39 @@ nfp_fl_set_ip6(const struct tc_action *action, int idx, u32 off,
 }
 
 static int
+nfp_fl_set_tport(const struct tc_action *action, int idx, u32 off,
+		 struct nfp_fl_set_tport *set_tport, int opcode)
+{
+	u32 exact, mask;
+	u16 tmp_set_op;
+
+	if (off)
+		return -EOPNOTSUPP;
+
+	mask = ~tcf_pedit_mask(action, idx);
+	exact = tcf_pedit_val(action, idx);
+
+	if (exact & ~mask)
+		return -EOPNOTSUPP;
+
+	nfp_fl_set_helper32(exact, mask, set_tport->tp_port_val,
+			    set_tport->tp_port_mask);
+
+	set_tport->reserved = cpu_to_be16(0);
+	tmp_set_op = FIELD_PREP(NFP_FL_ACT_LEN_LW,
+				sizeof(*set_tport) >> NFP_FL_LW_SIZ);
+	tmp_set_op |= FIELD_PREP(NFP_FL_ACT_JMP_ID, opcode);
+	set_tport->a_op = cpu_to_be16(tmp_set_op);
+
+	return 0;
+}
+
+static int
 nfp_fl_pedit(const struct tc_action *action, char *nfp_action, int *a_len)
 {
 	struct nfp_fl_set_ipv6_addr set_ip6_dst, set_ip6_src;
 	struct nfp_fl_set_ip4_addrs set_ip_addr;
+	struct nfp_fl_set_tport set_tport;
 	struct nfp_fl_set_eth set_eth;
 	enum pedit_header_type htype;
 	int idx, nkeys, err;
@@ -361,6 +390,7 @@ nfp_fl_pedit(const struct tc_action *action, char *nfp_action, int *a_len)
 	memset(&set_ip6_dst, 0, sizeof(set_ip6_dst));
 	memset(&set_ip6_src, 0, sizeof(set_ip6_src));
 	memset(&set_ip_addr, 0, sizeof(set_ip_addr));
+	memset(&set_tport, 0, sizeof(set_tport));
 	memset(&set_eth, 0, sizeof(set_eth));
 	nkeys = tcf_pedit_nkeys(action);
 
@@ -383,6 +413,14 @@ nfp_fl_pedit(const struct tc_action *action, char *nfp_action, int *a_len)
 			err = nfp_fl_set_ip6(action, idx, offset, &set_ip6_dst,
 					     &set_ip6_src);
 			break;
+		case TCA_PEDIT_KEY_EX_HDR_TYPE_TCP:
+			err = nfp_fl_set_tport(action, idx, offset, &set_tport,
+					       NFP_FL_ACTION_OPCODE_SET_TCP);
+			break;
+		case TCA_PEDIT_KEY_EX_HDR_TYPE_UDP:
+			err = nfp_fl_set_tport(action, idx, offset, &set_tport,
+					       NFP_FL_ACTION_OPCODE_SET_UDP);
+			break;
 		default:
 			return -EOPNOTSUPP;
 		}
@@ -418,6 +456,10 @@ nfp_fl_pedit(const struct tc_action *action, char *nfp_action, int *a_len)
 		act_size = sizeof(set_ip6_src);
 		memcpy(nfp_action, &set_ip6_src, act_size);
 		*a_len += act_size;
+	} else if (set_tport.a_op) {
+		act_size = sizeof(set_tport);
+		memcpy(nfp_action, &set_tport, act_size);
+		*a_len += act_size;
 	}
 
 	return 0;
diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
index 527914e294d7..f7b7242a22bc 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
@@ -81,6 +81,8 @@
 #define NFP_FL_ACTION_OPCODE_SET_IPV4_ADDRS	9
 #define NFP_FL_ACTION_OPCODE_SET_IPV6_SRC	11
 #define NFP_FL_ACTION_OPCODE_SET_IPV6_DST	12
+#define NFP_FL_ACTION_OPCODE_SET_UDP		14
+#define NFP_FL_ACTION_OPCODE_SET_TCP		15
 #define NFP_FL_ACTION_OPCODE_PRE_TUNNEL		17
 #define NFP_FL_ACTION_OPCODE_NUM		32
 
@@ -136,6 +138,13 @@ struct nfp_fl_set_ipv6_addr {
 	} ipv6[4];
 };
 
+struct nfp_fl_set_tport {
+	__be16 a_op;
+	__be16 reserved;
+	u8 tp_port_mask[4];
+	u8 tp_port_val[4];
+};
+
 struct nfp_fl_output {
 	__be16 a_op;
 	__be16 flags;
-- 
2.1.4

^ permalink raw reply related

* Re: [net-next V4 PATCH 2/5] bpf: XDP_REDIRECT enable use of cpumap
From: kbuild test robot @ 2017-10-06  8:30 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: kbuild-all, netdev, jakub.kicinski, Michael S. Tsirkin,
	pavel.odintsov, Jason Wang, mchan, John Fastabend,
	peter.waskiewicz.jr, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, Andy Gospodarek
In-Reply-To: <150711863012.9499.383645968070658124.stgit@firesoul>

[-- Attachment #1: Type: text/plain, Size: 15313 bytes --]

Hi Jesper,

[auto build test WARNING on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Jesper-Dangaard-Brouer/New-bpf-cpumap-type-for-XDP_REDIRECT/20171006-024959
config: um-allyesconfig (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
        # save the attached .config to linux build tree
        make ARCH=um 

Note: it may well be a FALSE warning. FWIW you are at least aware of it now.
http://gcc.gnu.org/wiki/Better_Uninitialized_Warnings

All warnings (new ones prefixed by >>):

   Cyclomatic Complexity 4 net/core/filter.c:____bpf_skb_set_tunnel_opt
   Cyclomatic Complexity 1 net/core/filter.c:bpf_skb_set_tunnel_opt
   Cyclomatic Complexity 6 net/core/filter.c:____bpf_skb_under_cgroup
   Cyclomatic Complexity 1 net/core/filter.c:bpf_skb_under_cgroup
   Cyclomatic Complexity 3 net/core/filter.c:____bpf_xdp_event_output
   Cyclomatic Complexity 1 net/core/filter.c:bpf_xdp_event_output
   Cyclomatic Complexity 2 net/core/filter.c:____bpf_get_socket_cookie
   Cyclomatic Complexity 1 net/core/filter.c:bpf_get_socket_cookie
   Cyclomatic Complexity 3 net/core/filter.c:____bpf_get_socket_uid
   Cyclomatic Complexity 1 net/core/filter.c:bpf_get_socket_uid
   Cyclomatic Complexity 21 net/core/filter.c:____bpf_setsockopt
   Cyclomatic Complexity 1 net/core/filter.c:bpf_setsockopt
   Cyclomatic Complexity 10 net/core/filter.c:bpf_skb_is_valid_access
   Cyclomatic Complexity 4 net/core/filter.c:sk_filter_is_valid_access
   Cyclomatic Complexity 6 net/core/filter.c:lwt_is_valid_access
   Cyclomatic Complexity 7 net/core/filter.c:sock_filter_is_valid_access
   Cyclomatic Complexity 2 net/core/filter.c:bpf_unclone_prologue
   Cyclomatic Complexity 1 net/core/filter.c:tc_cls_act_prologue
   Cyclomatic Complexity 7 net/core/filter.c:tc_cls_act_is_valid_access
   Cyclomatic Complexity 4 net/core/filter.c:__is_valid_xdp_access
   Cyclomatic Complexity 5 net/core/filter.c:xdp_is_valid_access
   Cyclomatic Complexity 4 net/core/filter.c:__is_valid_sock_ops_access
   Cyclomatic Complexity 3 net/core/filter.c:sock_ops_is_valid_access
   Cyclomatic Complexity 1 net/core/filter.c:sk_skb_prologue
   Cyclomatic Complexity 6 net/core/filter.c:sk_skb_is_valid_access
   Cyclomatic Complexity 3 net/core/filter.c:__xdp_generic_ok_fwd_dev
   Cyclomatic Complexity 11 net/core/filter.c:sk_filter_trim_cap
   Cyclomatic Complexity 3 net/core/filter.c:bpf_skb_copy
   Cyclomatic Complexity 1 net/core/filter.c:bpf_xdp_copy
   Cyclomatic Complexity 2 net/core/filter.c:bpf_skb_grow_rcsum
   Cyclomatic Complexity 2 net/core/filter.c:sk_filter_release
   Cyclomatic Complexity 2 net/core/filter.c:bpf_release_orig_filter
   Cyclomatic Complexity 2 net/core/filter.c:__bpf_prog_release
   Cyclomatic Complexity 1 net/core/filter.c:__sk_filter_release
   Cyclomatic Complexity 2 net/core/filter.c:sk_filter_release_rcu
   Cyclomatic Complexity 1 net/core/filter.c:bpf_prog_destroy
   Cyclomatic Complexity 9 net/core/filter.c:check_load_and_stores
   Cyclomatic Complexity 19 net/core/filter.c:bpf_check_classic
   Cyclomatic Complexity 3 net/core/filter.c:bpf_prog_store_orig_filter
   Cyclomatic Complexity 10 net/core/filter.c:convert_skb_access
   Cyclomatic Complexity 24 net/core/filter.c:convert_bpf_extensions
   Cyclomatic Complexity 52 net/core/filter.c:bpf_convert_filter
   Cyclomatic Complexity 6 net/core/filter.c:bpf_migrate_filter
   Cyclomatic Complexity 5 net/core/filter.c:bpf_prepare_filter
   Cyclomatic Complexity 4 net/core/filter.c:bpf_prog_create
   Cyclomatic Complexity 7 net/core/filter.c:bpf_prog_create_from_user
   Cyclomatic Complexity 6 net/core/filter.c:__get_filter
   Cyclomatic Complexity 5 net/core/filter.c:bpf_warn_invalid_xdp_action
   Cyclomatic Complexity 7 net/core/filter.c:__reuseport_attach_prog
   Cyclomatic Complexity 2 net/core/filter.c:__get_bpf
   Cyclomatic Complexity 3 net/core/filter.c:__bpf_redirect_common
   Cyclomatic Complexity 2 net/core/filter.c:__bpf_redirect
   Cyclomatic Complexity 1 net/core/filter.c:bpf_skb_generic_push
   Cyclomatic Complexity 3 net/core/filter.c:bpf_skb_net_hdr_push
   Cyclomatic Complexity 5 net/core/filter.c:bpf_skb_proto_4_to_6
   Cyclomatic Complexity 4 net/core/filter.c:bpf_skb_net_grow
   Cyclomatic Complexity 2 net/core/filter.c:bpf_skb_generic_pop
   Cyclomatic Complexity 3 net/core/filter.c:bpf_skb_net_hdr_pop
   Cyclomatic Complexity 5 net/core/filter.c:bpf_skb_proto_6_to_4
   Cyclomatic Complexity 3 net/core/filter.c:bpf_skb_proto_xlat
   Cyclomatic Complexity 4 net/core/filter.c:bpf_skb_net_shrink
   Cyclomatic Complexity 12 net/core/filter.c:bpf_skb_adjust_net
   Cyclomatic Complexity 4 net/core/filter.c:xdp_do_flush_map
   Cyclomatic Complexity 3 net/core/filter.c:__xdp_map_lookup_elem
   Cyclomatic Complexity 6 net/core/filter.c:__bpf_tx_xdp_map
   Cyclomatic Complexity 12 net/core/filter.c:xdp_do_redirect_map
   Cyclomatic Complexity 4 net/core/filter.c:xdp_do_redirect
   Cyclomatic Complexity 11 net/core/filter.c:bpf_base_func_proto
   Cyclomatic Complexity 4 net/core/filter.c:sk_filter_func_proto
   Cyclomatic Complexity 7 net/core/filter.c:xdp_func_proto
   Cyclomatic Complexity 10 net/core/filter.c:lwt_inout_func_proto
   Cyclomatic Complexity 2 net/core/filter.c:sock_filter_func_proto
   Cyclomatic Complexity 3 net/core/filter.c:sock_ops_func_proto
   Cyclomatic Complexity 9 net/core/filter.c:sk_skb_func_proto
   Cyclomatic Complexity 74 net/core/filter.c:bpf_convert_ctx_access
   Cyclomatic Complexity 5 net/core/filter.c:bpf_get_skb_set_tunnel_proto
   Cyclomatic Complexity 30 net/core/filter.c:tc_cls_act_func_proto
   Cyclomatic Complexity 14 net/core/filter.c:lwt_xmit_func_proto
   Cyclomatic Complexity 4 net/core/filter.c:tc_cls_act_convert_ctx_access
   Cyclomatic Complexity 7 net/core/filter.c:xdp_convert_ctx_access
   Cyclomatic Complexity 14 net/core/filter.c:sock_filter_convert_ctx_access
   Cyclomatic Complexity 27 net/core/filter.c:sock_ops_convert_ctx_access
   Cyclomatic Complexity 1 net/core/filter.c:sk_filter_uncharge
   Cyclomatic Complexity 8 net/core/filter.c:__sk_attach_prog
   Cyclomatic Complexity 3 net/core/filter.c:sk_attach_filter
   Cyclomatic Complexity 6 net/core/filter.c:sk_detach_filter
   Cyclomatic Complexity 2 net/core/filter.c:sk_filter_charge
   Cyclomatic Complexity 3 net/core/filter.c:sk_reuseport_attach_filter
   Cyclomatic Complexity 3 net/core/filter.c:sk_attach_bpf
   Cyclomatic Complexity 3 net/core/filter.c:sk_reuseport_attach_bpf
   Cyclomatic Complexity 2 net/core/filter.c:skb_do_redirect
   Cyclomatic Complexity 2 net/core/filter.c:do_sk_redirect_map
   Cyclomatic Complexity 11 net/core/filter.c:xdp_do_generic_redirect_map
   Cyclomatic Complexity 4 net/core/filter.c:xdp_do_generic_redirect
   Cyclomatic Complexity 14 net/core/filter.c:bpf_helper_changes_pkt_data
   Cyclomatic Complexity 9 net/core/filter.c:sk_get_filter
   Cyclomatic Complexity 1 net/core/filter.c:_GLOBAL__sub_I_65535_0_sk_filter_trim_cap
   In file included from include/linux/bpf_trace.h:5:0,
                    from net/core/filter.c:58:
   net/core/filter.c: In function 'xdp_do_generic_redirect_map':
>> include/trace/events/xdp.h:150:3: warning: 'fwd' may be used uninitialized in this function [-Wmaybe-uninitialized]
      trace_xdp_redirect_map_err(dev, xdp, devmap_ifindex(fwd, map), \
      ^~~~~~~~~~~~~~~~~~~~~~~~~~
   net/core/filter.c:2687:21: note: 'fwd' was declared here
     struct net_device *fwd;
                        ^~~
--
   Cyclomatic Complexity 4 net//core/filter.c:____bpf_skb_set_tunnel_opt
   Cyclomatic Complexity 1 net//core/filter.c:bpf_skb_set_tunnel_opt
   Cyclomatic Complexity 6 net//core/filter.c:____bpf_skb_under_cgroup
   Cyclomatic Complexity 1 net//core/filter.c:bpf_skb_under_cgroup
   Cyclomatic Complexity 3 net//core/filter.c:____bpf_xdp_event_output
   Cyclomatic Complexity 1 net//core/filter.c:bpf_xdp_event_output
   Cyclomatic Complexity 2 net//core/filter.c:____bpf_get_socket_cookie
   Cyclomatic Complexity 1 net//core/filter.c:bpf_get_socket_cookie
   Cyclomatic Complexity 3 net//core/filter.c:____bpf_get_socket_uid
   Cyclomatic Complexity 1 net//core/filter.c:bpf_get_socket_uid
   Cyclomatic Complexity 21 net//core/filter.c:____bpf_setsockopt
   Cyclomatic Complexity 1 net//core/filter.c:bpf_setsockopt
   Cyclomatic Complexity 10 net//core/filter.c:bpf_skb_is_valid_access
   Cyclomatic Complexity 4 net//core/filter.c:sk_filter_is_valid_access
   Cyclomatic Complexity 6 net//core/filter.c:lwt_is_valid_access
   Cyclomatic Complexity 7 net//core/filter.c:sock_filter_is_valid_access
   Cyclomatic Complexity 2 net//core/filter.c:bpf_unclone_prologue
   Cyclomatic Complexity 1 net//core/filter.c:tc_cls_act_prologue
   Cyclomatic Complexity 7 net//core/filter.c:tc_cls_act_is_valid_access
   Cyclomatic Complexity 4 net//core/filter.c:__is_valid_xdp_access
   Cyclomatic Complexity 5 net//core/filter.c:xdp_is_valid_access
   Cyclomatic Complexity 4 net//core/filter.c:__is_valid_sock_ops_access
   Cyclomatic Complexity 3 net//core/filter.c:sock_ops_is_valid_access
   Cyclomatic Complexity 1 net//core/filter.c:sk_skb_prologue
   Cyclomatic Complexity 6 net//core/filter.c:sk_skb_is_valid_access
   Cyclomatic Complexity 3 net//core/filter.c:__xdp_generic_ok_fwd_dev
   Cyclomatic Complexity 11 net//core/filter.c:sk_filter_trim_cap
   Cyclomatic Complexity 3 net//core/filter.c:bpf_skb_copy
   Cyclomatic Complexity 1 net//core/filter.c:bpf_xdp_copy
   Cyclomatic Complexity 2 net//core/filter.c:bpf_skb_grow_rcsum
   Cyclomatic Complexity 2 net//core/filter.c:sk_filter_release
   Cyclomatic Complexity 2 net//core/filter.c:bpf_release_orig_filter
   Cyclomatic Complexity 2 net//core/filter.c:__bpf_prog_release
   Cyclomatic Complexity 1 net//core/filter.c:__sk_filter_release
   Cyclomatic Complexity 2 net//core/filter.c:sk_filter_release_rcu
   Cyclomatic Complexity 1 net//core/filter.c:bpf_prog_destroy
   Cyclomatic Complexity 9 net//core/filter.c:check_load_and_stores
   Cyclomatic Complexity 19 net//core/filter.c:bpf_check_classic
   Cyclomatic Complexity 3 net//core/filter.c:bpf_prog_store_orig_filter
   Cyclomatic Complexity 10 net//core/filter.c:convert_skb_access
   Cyclomatic Complexity 24 net//core/filter.c:convert_bpf_extensions
   Cyclomatic Complexity 52 net//core/filter.c:bpf_convert_filter
   Cyclomatic Complexity 6 net//core/filter.c:bpf_migrate_filter
   Cyclomatic Complexity 5 net//core/filter.c:bpf_prepare_filter
   Cyclomatic Complexity 4 net//core/filter.c:bpf_prog_create
   Cyclomatic Complexity 7 net//core/filter.c:bpf_prog_create_from_user
   Cyclomatic Complexity 6 net//core/filter.c:__get_filter
   Cyclomatic Complexity 5 net//core/filter.c:bpf_warn_invalid_xdp_action
   Cyclomatic Complexity 7 net//core/filter.c:__reuseport_attach_prog
   Cyclomatic Complexity 2 net//core/filter.c:__get_bpf
   Cyclomatic Complexity 3 net//core/filter.c:__bpf_redirect_common
   Cyclomatic Complexity 2 net//core/filter.c:__bpf_redirect
   Cyclomatic Complexity 1 net//core/filter.c:bpf_skb_generic_push
   Cyclomatic Complexity 3 net//core/filter.c:bpf_skb_net_hdr_push
   Cyclomatic Complexity 5 net//core/filter.c:bpf_skb_proto_4_to_6
   Cyclomatic Complexity 4 net//core/filter.c:bpf_skb_net_grow
   Cyclomatic Complexity 2 net//core/filter.c:bpf_skb_generic_pop
   Cyclomatic Complexity 3 net//core/filter.c:bpf_skb_net_hdr_pop
   Cyclomatic Complexity 5 net//core/filter.c:bpf_skb_proto_6_to_4
   Cyclomatic Complexity 3 net//core/filter.c:bpf_skb_proto_xlat
   Cyclomatic Complexity 4 net//core/filter.c:bpf_skb_net_shrink
   Cyclomatic Complexity 12 net//core/filter.c:bpf_skb_adjust_net
   Cyclomatic Complexity 4 net//core/filter.c:xdp_do_flush_map
   Cyclomatic Complexity 3 net//core/filter.c:__xdp_map_lookup_elem
   Cyclomatic Complexity 6 net//core/filter.c:__bpf_tx_xdp_map
   Cyclomatic Complexity 12 net//core/filter.c:xdp_do_redirect_map
   Cyclomatic Complexity 4 net//core/filter.c:xdp_do_redirect
   Cyclomatic Complexity 11 net//core/filter.c:bpf_base_func_proto
   Cyclomatic Complexity 4 net//core/filter.c:sk_filter_func_proto
   Cyclomatic Complexity 7 net//core/filter.c:xdp_func_proto
   Cyclomatic Complexity 10 net//core/filter.c:lwt_inout_func_proto
   Cyclomatic Complexity 2 net//core/filter.c:sock_filter_func_proto
   Cyclomatic Complexity 3 net//core/filter.c:sock_ops_func_proto
   Cyclomatic Complexity 9 net//core/filter.c:sk_skb_func_proto
   Cyclomatic Complexity 74 net//core/filter.c:bpf_convert_ctx_access
   Cyclomatic Complexity 5 net//core/filter.c:bpf_get_skb_set_tunnel_proto
   Cyclomatic Complexity 30 net//core/filter.c:tc_cls_act_func_proto
   Cyclomatic Complexity 14 net//core/filter.c:lwt_xmit_func_proto
   Cyclomatic Complexity 4 net//core/filter.c:tc_cls_act_convert_ctx_access
   Cyclomatic Complexity 7 net//core/filter.c:xdp_convert_ctx_access
   Cyclomatic Complexity 14 net//core/filter.c:sock_filter_convert_ctx_access
   Cyclomatic Complexity 27 net//core/filter.c:sock_ops_convert_ctx_access
   Cyclomatic Complexity 1 net//core/filter.c:sk_filter_uncharge
   Cyclomatic Complexity 8 net//core/filter.c:__sk_attach_prog
   Cyclomatic Complexity 3 net//core/filter.c:sk_attach_filter
   Cyclomatic Complexity 6 net//core/filter.c:sk_detach_filter
   Cyclomatic Complexity 2 net//core/filter.c:sk_filter_charge
   Cyclomatic Complexity 3 net//core/filter.c:sk_reuseport_attach_filter
   Cyclomatic Complexity 3 net//core/filter.c:sk_attach_bpf
   Cyclomatic Complexity 3 net//core/filter.c:sk_reuseport_attach_bpf
   Cyclomatic Complexity 2 net//core/filter.c:skb_do_redirect
   Cyclomatic Complexity 2 net//core/filter.c:do_sk_redirect_map
   Cyclomatic Complexity 11 net//core/filter.c:xdp_do_generic_redirect_map
   Cyclomatic Complexity 4 net//core/filter.c:xdp_do_generic_redirect
   Cyclomatic Complexity 14 net//core/filter.c:bpf_helper_changes_pkt_data
   Cyclomatic Complexity 9 net//core/filter.c:sk_get_filter
   Cyclomatic Complexity 1 net//core/filter.c:_GLOBAL__sub_I_65535_0_sk_filter_trim_cap
   In file included from include/linux/bpf_trace.h:5:0,
                    from net//core/filter.c:58:
   net//core/filter.c: In function 'xdp_do_generic_redirect_map':
>> include/trace/events/xdp.h:150:3: warning: 'fwd' may be used uninitialized in this function [-Wmaybe-uninitialized]
      trace_xdp_redirect_map_err(dev, xdp, devmap_ifindex(fwd, map), \
      ^~~~~~~~~~~~~~~~~~~~~~~~~~
   net//core/filter.c:2687:21: note: 'fwd' was declared here
     struct net_device *fwd;
                        ^~~

vim +/fwd +150 include/trace/events/xdp.h

   138	
   139	#define devmap_ifindex(fwd, map)				\
   140		(!fwd ? 0 :						\
   141		 (!map ? 0 :						\
   142		  ((map->map_type == BPF_MAP_TYPE_DEVMAP) ?		\
   143		   ((struct net_device *)fwd)->ifindex : 0)))
   144	
   145	#define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx)		\
   146		 trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map),	\
   147					0, map, idx)
   148	
   149	#define _trace_xdp_redirect_map_err(dev, xdp, fwd, map, idx, err)	\
 > 150		 trace_xdp_redirect_map_err(dev, xdp, devmap_ifindex(fwd, map),	\
   151					    err, map, idx)
   152	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 19628 bytes --]

^ permalink raw reply

* [net-next PATCH 0/3] Improve xdp_monitor samples/bpf
From: Jesper Dangaard Brouer @ 2017-10-06  8:41 UTC (permalink / raw)
  To: netdev, Andy Gospodarek
  Cc: Daniel Borkmann, Alexei Starovoitov, Jesper Dangaard Brouer

Here are some improvements to the xdp_monitor tool currently located
under samples/bpf/.  Once the tools library libbpf becomes more feature
complete, xdp_monitor should be converted to use it, and be moved into
tools/bpf/xdp/ or tools/xdp/.

---

Jesper Dangaard Brouer (3):
      samples/bpf: xdp_monitor first 8 bytes are not accessible by bpf
      samples/bpf: xdp_monitor also record xdp_exception tracepoint
      samples/bpf: xdp_monitor increase memory rlimit


 samples/bpf/xdp_monitor_kern.c |   60 +++++++++++++++-----
 samples/bpf/xdp_monitor_user.c |  119 +++++++++++++++++++++++++++++++---------
 2 files changed, 139 insertions(+), 40 deletions(-)

^ permalink raw reply

* [net-next PATCH 1/3] samples/bpf: xdp_monitor first 8 bytes are not accessible by bpf
From: Jesper Dangaard Brouer @ 2017-10-06  8:41 UTC (permalink / raw)
  To: netdev, Andy Gospodarek
  Cc: Daniel Borkmann, Alexei Starovoitov, Jesper Dangaard Brouer
In-Reply-To: <150727927390.4460.3200093291677318710.stgit@firesoul>

The first 8 bytes of the tracepoint context struct are not accessible
by the bpf code.  This is a choice that dates back to the original
inclusion of this code.

See explanation in:
 commit 98b5c2c65c29 ("perf, bpf: allow bpf programs attach to tracepoints")

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 samples/bpf/xdp_monitor_kern.c |   22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c
index 74f3fd8ed729..cc7e19d2ad76 100644
--- a/samples/bpf/xdp_monitor_kern.c
+++ b/samples/bpf/xdp_monitor_kern.c
@@ -17,19 +17,15 @@ struct bpf_map_def SEC("maps") redirect_err_cnt = {
  * Code in:                kernel/include/trace/events/xdp.h
  */
 struct xdp_redirect_ctx {
-	unsigned short common_type;	//	offset:0;  size:2; signed:0;
-	unsigned char common_flags;	//	offset:2;  size:1; signed:0;
-	unsigned char common_preempt_count;//	offset:3;  size:1; signed:0;
-	int common_pid;			//	offset:4;  size:4; signed:1;
-
-	int prog_id;			//	offset:8;  size:4; signed:1;
-	u32 act;			//	offset:12  size:4; signed:0;
-	int ifindex;			//	offset:16  size:4; signed:1;
-	int err;			//	offset:20  size:4; signed:1;
-	int to_ifindex;			//	offset:24  size:4; signed:1;
-	u32 map_id;			//	offset:28  size:4; signed:0;
-	int map_index;			//	offset:32  size:4; signed:1;
-};					//	offset:36
+	u64 __pad;		// First 8 bytes are not accessible by bpf code
+	int prog_id;		//	offset:8;  size:4; signed:1;
+	u32 act;		//	offset:12  size:4; signed:0;
+	int ifindex;		//	offset:16  size:4; signed:1;
+	int err;		//	offset:20  size:4; signed:1;
+	int to_ifindex;		//	offset:24  size:4; signed:1;
+	u32 map_id;		//	offset:28  size:4; signed:0;
+	int map_index;		//	offset:32  size:4; signed:1;
+};				//	offset:36
 
 enum {
 	XDP_REDIRECT_SUCCESS = 0,

^ permalink raw reply related

* [net-next PATCH 2/3] samples/bpf: xdp_monitor also record xdp_exception tracepoint
From: Jesper Dangaard Brouer @ 2017-10-06  8:41 UTC (permalink / raw)
  To: netdev, Andy Gospodarek
  Cc: Daniel Borkmann, Alexei Starovoitov, Jesper Dangaard Brouer
In-Reply-To: <150727927390.4460.3200093291677318710.stgit@firesoul>

Also monitor the tracepoint xdp_exception.  This tracepoint is usually
invoked by the drivers.  Programs themselves can activate this by
returning XDP_ABORTED, which will drop the packet but also trigger the
tracepoint.  This is useful for distinguishing intentional (XDP_DROP)
vs. ebpf-program error cases that caused a drop (XDP_ABORTED).

Drivers also use this tracepoint for reporting on XDP actions that are
unknown to the specific driver.  This can help the user to detect if a
driver e.g. doesn't implement XDP_REDIRECT yet.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 samples/bpf/xdp_monitor_kern.c |   38 ++++++++++++++
 samples/bpf/xdp_monitor_user.c |  108 +++++++++++++++++++++++++++++++---------
 2 files changed, 121 insertions(+), 25 deletions(-)

diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c
index cc7e19d2ad76..2fe2f761a0d0 100644
--- a/samples/bpf/xdp_monitor_kern.c
+++ b/samples/bpf/xdp_monitor_kern.c
@@ -13,6 +13,14 @@ struct bpf_map_def SEC("maps") redirect_err_cnt = {
 	/* TODO: have entries for all possible errno's */
 };
 
+#define XDP_UNKNOWN	XDP_REDIRECT + 1
+struct bpf_map_def SEC("maps") exception_cnt = {
+	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size	= sizeof(u32),
+	.value_size	= sizeof(u64),
+	.max_entries	= XDP_UNKNOWN + 1,
+};
+
 /* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
  * Code in:                kernel/include/trace/events/xdp.h
  */
@@ -44,7 +52,7 @@ int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx)
 
 	cnt  = bpf_map_lookup_elem(&redirect_err_cnt, &key);
 	if (!cnt)
-		return 0;
+		return 1;
 	*cnt += 1;
 
 	return 0; /* Indicate event was filtered (no further processing)*/
@@ -82,3 +90,31 @@ int trace_xdp_redirect_map(struct xdp_redirect_ctx *ctx)
 {
 	return xdp_redirect_collect_stat(ctx);
 }
+
+/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format
+ * Code in:                kernel/include/trace/events/xdp.h
+ */
+struct xdp_exception_ctx {
+	u64 __pad;	// First 8 bytes are not accessible by bpf code
+	int prog_id;	//	offset:8;  size:4; signed:1;
+	u32 act;	//	offset:12; size:4; signed:0;
+	int ifindex;	//	offset:16; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_exception")
+int trace_xdp_exception(struct xdp_exception_ctx *ctx)
+{
+	u64 *cnt;;
+	u32 key;
+
+	key = ctx->act;
+	if (key > XDP_REDIRECT)
+		key = XDP_UNKNOWN;
+
+	cnt = bpf_map_lookup_elem(&exception_cnt, &key);
+	if (!cnt)
+		return 1;
+	*cnt += 1;
+
+	return 0;
+}
diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c
index c5ab8b776973..97c3456c11b2 100644
--- a/samples/bpf/xdp_monitor_user.c
+++ b/samples/bpf/xdp_monitor_user.c
@@ -89,6 +89,23 @@ static const char *err2str(int err)
 		return redir_names[err];
 	return NULL;
 }
+/* enum xdp_action */
+#define XDP_UNKNOWN	XDP_REDIRECT + 1
+#define XDP_ACTION_MAX (XDP_UNKNOWN + 1)
+static const char *xdp_action_names[XDP_ACTION_MAX] = {
+	[XDP_ABORTED]	= "XDP_ABORTED",
+	[XDP_DROP]	= "XDP_DROP",
+	[XDP_PASS]	= "XDP_PASS",
+	[XDP_TX]	= "XDP_TX",
+	[XDP_REDIRECT]	= "XDP_REDIRECT",
+	[XDP_UNKNOWN]	= "XDP_UNKNOWN",
+};
+static const char *action2str(int action)
+{
+	if (action < XDP_ACTION_MAX)
+		return xdp_action_names[action];
+	return NULL;
+}
 
 struct record {
 	__u64 counter;
@@ -97,6 +114,7 @@ struct record {
 
 struct stats_record {
 	struct record xdp_redir[REDIR_RES_MAX];
+	struct record xdp_exception[XDP_ACTION_MAX];
 };
 
 static void stats_print_headers(bool err_only)
@@ -104,39 +122,72 @@ static void stats_print_headers(bool err_only)
 	if (err_only)
 		printf("\n%s\n", __doc_err_only__);
 
-	printf("%-14s %-10s %-18s %-9s\n",
-	       "XDP_REDIRECT", "pps ", "pps-human-readable", "measure-period");
+	printf("%-14s %-11s %-10s %-18s %-9s\n",
+	       "ACTION", "result", "pps ", "pps-human-readable", "measure-period");
+}
+
+static double calc_period(struct record *r, struct record *p)
+{
+	double period_ = 0;
+	__u64 period = 0;
+
+	period = r->timestamp - p->timestamp;
+	if (period > 0)
+		period_ = ((double) period / NANOSEC_PER_SEC);
+
+	return period_;
+}
+
+static double calc_pps(struct record *r, struct record *p, double period)
+{
+	__u64 packets = 0;
+	double pps = 0;
+
+	if (period > 0) {
+		packets = r->counter - p->counter;
+		pps = packets / period;
+	}
+	return pps;
 }
 
 static void stats_print(struct stats_record *rec,
 			struct stats_record *prev,
 			bool err_only)
 {
+	double period = 0, pps = 0;
+	struct record *r, *p;
 	int i = 0;
 
+	char *fmt = "%-14s %-11s %-10.0f %'-18.0f %f\n";
+
+	/* tracepoint: xdp:xdp_redirect_* */
 	if (err_only)
 		i = REDIR_ERROR;
 
 	for (; i < REDIR_RES_MAX; i++) {
-		struct record *r = &rec->xdp_redir[i];
-		struct record *p = &prev->xdp_redir[i];
-		__u64 period  = 0;
-		__u64 packets = 0;
-		double pps = 0;
-		double period_ = 0;
+		r = &rec->xdp_redir[i];
+		p = &prev->xdp_redir[i];
 
 		if (p->timestamp) {
-			packets = r->counter - p->counter;
-			period  = r->timestamp - p->timestamp;
-			if (period > 0) {
-				period_ = ((double) period / NANOSEC_PER_SEC);
-				pps = packets / period_;
-			}
+			period = calc_period(r, p);
+			pps = calc_pps(r, p, period);
 		}
+		printf(fmt, "XDP_REDIRECT", err2str(i), pps, pps, period);
+	}
 
-		printf("%-14s %-10.0f %'-18.0f %f\n",
-		       err2str(i), pps, pps, period_);
+	/* tracepoint: xdp:xdp_exception */
+	for (i = 0; i < XDP_ACTION_MAX; i++) {
+		r = &rec->xdp_exception[i];
+		p = &prev->xdp_exception[i];
+		if (p->timestamp) {
+			period = calc_period(r, p);
+			pps = calc_pps(r, p, period);
+		}
+		if (pps > 0)
+			printf(fmt, action2str(i), "Exception",
+			       pps, pps, period);
 	}
+	printf("\n");
 }
 
 static __u64 get_key32_value64_percpu(int fd, __u32 key)
@@ -160,25 +211,33 @@ static __u64 get_key32_value64_percpu(int fd, __u32 key)
 	return sum;
 }
 
-static bool stats_collect(int fd, struct stats_record *rec)
+static bool stats_collect(struct stats_record *rec)
 {
+	int fd;
 	int i;
 
 	/* TODO: Detect if someone unloaded the perf event_fd's, as
 	 * this can happen by someone running perf-record -e
 	 */
 
+	fd = map_data[0].fd; /* map0: redirect_err_cnt */
 	for (i = 0; i < REDIR_RES_MAX; i++) {
 		rec->xdp_redir[i].timestamp = gettime();
 		rec->xdp_redir[i].counter = get_key32_value64_percpu(fd, i);
 	}
+
+	fd = map_data[1].fd; /* map1: exception_cnt */
+	for (i = 0; i < XDP_ACTION_MAX; i++) {
+		rec->xdp_exception[i].timestamp = gettime();
+		rec->xdp_exception[i].counter = get_key32_value64_percpu(fd, i);
+	}
+
 	return true;
 }
 
 static void stats_poll(int interval, bool err_only)
 {
 	struct stats_record rec, prev;
-	int map_fd;
 
 	memset(&rec, 0, sizeof(rec));
 
@@ -190,16 +249,17 @@ static void stats_poll(int interval, bool err_only)
 		printf("\n%s", __doc__);
 
 	/* TODO Need more advanced stats on error types */
-	if (verbose)
-		printf(" - Stats map: %s\n", map_data[0].name);
-	map_fd = map_data[0].fd;
-
-	stats_print_headers(err_only);
+	if (verbose) {
+		printf(" - Stats map0: %s\n", map_data[0].name);
+		printf(" - Stats map1: %s\n", map_data[1].name);
+		printf("\n");
+	}
 	fflush(stdout);
 
 	while (1) {
 		memcpy(&prev, &rec, sizeof(rec));
-		stats_collect(map_fd, &rec);
+		stats_collect(&rec);
+		stats_print_headers(err_only);
 		stats_print(&rec, &prev, err_only);
 		fflush(stdout);
 		sleep(interval);

^ permalink raw reply related

* [net-next PATCH 3/3] samples/bpf: xdp_monitor increase memory rlimit
From: Jesper Dangaard Brouer @ 2017-10-06  8:41 UTC (permalink / raw)
  To: netdev, Andy Gospodarek
  Cc: Daniel Borkmann, Alexei Starovoitov, Jesper Dangaard Brouer
In-Reply-To: <150727927390.4460.3200093291677318710.stgit@firesoul>

Other concurrently running programs, like perf or the XDP program that
needs to be monitored, might take up part of the max locked memory
limit.  Thus, the xdp_monitor tool has to set the RLIMIT_MEMLOCK to
RLIM_INFINITY, as it cannot determine a more sane limit.

Use the EXIT_FAILURE exit code specified in man exit(3), and
correct other users too.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 samples/bpf/xdp_monitor_user.c |   11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c
index 97c3456c11b2..eaba165b3549 100644
--- a/samples/bpf/xdp_monitor_user.c
+++ b/samples/bpf/xdp_monitor_user.c
@@ -20,6 +20,7 @@ static const char *__doc_err_only__=
 #include <unistd.h>
 #include <locale.h>
 
+#include <sys/resource.h>
 #include <getopt.h>
 #include <net/if.h>
 #include <time.h>
@@ -295,6 +296,7 @@ static void print_bpf_prog_info(void)
 
 int main(int argc, char **argv)
 {
+	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
 	int longindex = 0, opt;
 	int ret = EXIT_SUCCESS;
 	char bpf_obj_file[256];
@@ -325,13 +327,18 @@ int main(int argc, char **argv)
 		}
 	}
 
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		perror("setrlimit(RLIMIT_MEMLOCK)");
+		return EXIT_FAILURE;
+	}
+
 	if (load_bpf_file(bpf_obj_file)) {
 		printf("ERROR - bpf_log_buf: %s", bpf_log_buf);
-		return 1;
+		return EXIT_FAILURE;
 	}
 	if (!prog_fd[0]) {
 		printf("ERROR - load_bpf_file: %s\n", strerror(errno));
-		return 1;
+		return EXIT_FAILURE;
 	}
 
 	if (debug) {

^ permalink raw reply related

* Re: BUG in free_netdev() on ppp link deletion
From: Guillaume Nault @ 2017-10-06  8:57 UTC (permalink / raw)
  To: Beniamino Galvani
  Cc: linux-ppp, netdev, Paul Mackerras, David Ahern, Gao Feng
In-Reply-To: <20171006080902.GA11223@tp>

On Fri, Oct 06, 2017 at 10:09:03AM +0200, Beniamino Galvani wrote:
> On Thu, Oct 05, 2017 at 04:55:03PM +0200, Guillaume Nault wrote:
> > Sorry for the delay, I've followed a few complicated dead ends before
> > getting to this simple and rather obvious fix.
> > 
> > Can you try this patch?
> > 
> > [..]
> 
> The patch solves the issue, thanks.
> 
Thanks, I'm going to do some more tests and submit it formally.

^ permalink raw reply

* Re: [net-next V4 PATCH 1/5] bpf: introduce new bpf cpu map type BPF_MAP_TYPE_CPUMAP
From: Jesper Dangaard Brouer @ 2017-10-06  9:03 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: netdev, jakub.kicinski, Michael S. Tsirkin, pavel.odintsov,
	Jason Wang, mchan, John Fastabend, peter.waskiewicz.jr,
	Daniel Borkmann, Andy Gospodarek, brouer
In-Reply-To: <20171004190201.5no5mrmkko43cvv2@ast-mbp>

On Wed, 4 Oct 2017 12:02:02 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> On Wed, Oct 04, 2017 at 02:03:45PM +0200, Jesper Dangaard Brouer wrote:
> > The 'cpumap' is primary used as a backend map for XDP BPF helper
> > call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'.
> > 
> > This patch implement the main part of the map.  It is not connected to
> > the XDP redirect system yet, and no SKB allocation are done yet.
> > 
> > The main concern in this patch is to ensure the datapath can run
> > without any locking.  This adds complexity to the setup and tear-down
> > procedure, which assumptions are extra carefully documented in the
> > code comments.
> > 
> > V2:
> >  - make sure array isn't larger than NR_CPUS
> >  - make sure CPUs added is a valid possible CPU
> > 
> > V3: fix nitpicks from Jakub Kicinski <kubakici@wp.pl>
> > 
> > Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>  
> ...
> > +static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
> > +{
> > +	struct bpf_cpu_map *cmap;
> > +	u64 cost;
> > +	int err;
> > +
> > +	/* check sanity of attributes */
> > +	if (attr->max_entries == 0 || attr->key_size != 4 ||
> > +	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
> > +		return ERR_PTR(-EINVAL);
> > +
> > +	cmap = kzalloc(sizeof(*cmap), GFP_USER);
> > +	if (!cmap)
> > +		return ERR_PTR(-ENOMEM);  
> 
> just noticed that there is nothing here nor in DEVMAP/SOCKMAP
> that prevents unpriv user to create them.
> I'm not sure it was intentional for DEVMAP/SOCKMAP.
> For CPUMAP I'd suggest to restrict it to root, since it
> suppose to operate with XDP only which is root anyway.
> Note, lpm and lru maps are cap_sys_admin only already.

I agree.  Have restricted this in V5

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* RE: [PATCH net-next] ip_gre: check packet length and mtu correctly in erspan_fb_xmit
From: David Laight @ 2017-10-06  9:38 UTC (permalink / raw)
  To: 'William Tu'; +Cc: netdev@vger.kernel.org, Xin Long
In-Reply-To: <CALDO+SZPmF_UG9BsZKD2FzJUbx0PcUYKrOpp4Xta4Wufezhh+g@mail.gmail.com>

From: William Tu
> Sent: 05 October 2017 22:21
...
> >> -     if (skb->len > dev->mtu) {
> >> +     if (skb->len - dev->hard_header_len > dev->mtu) {
> >
> > Can you guarantee that skb->len > dev_hard_header_len?
> > It is probably safer to check skb->len > dev->hard_header_len + dev->mtu
> > since that addition isn't going to overflow.
> Sure, I will fix it.
> 
> >
> >>               pskb_trim(skb, dev->mtu);
> >>               truncate = true;
> >
> > Is that pskb_trim() now truncating to the correct size?
> 
> You're right, now I should truncate to (dev->mtu + dev_hard_header_len)

It might be worth caching that length in the dev structure
to avoid the arithmetic on every packet.

	David


^ permalink raw reply

* Re: [PATCH v2] net/mac80211/mesh_plink: Convert timers to use timer_setup()
From: Johannes Berg @ 2017-10-06  9:49 UTC (permalink / raw)
  To: Kees Cook
  Cc: David S. Miller, linux-wireless, netdev, Thomas Gleixner,
	linux-kernel
In-Reply-To: <20171005173910.GA72335@beast>

On Thu, 2017-10-05 at 10:39 -0700, Kees Cook wrote:
> In preparation for unconditionally passing the struct timer_list
> pointer to
> all timer callbacks, switch to using the new timer_setup() and
> from_timer()
> to pass the timer pointer explicitly. This requires adding a pointer
> back
> to the sta_info since container_of() can't resolve the sta_info.
> 
Applied, thanks. (The change did land in net-next, and I merged that in
to merge this)

johannes

^ permalink raw reply

* Re: Fw: [Bug 197099] New: Kernel panic in interrupt [l2tp_ppp]
From: James Chapman @ 2017-10-06  9:52 UTC (permalink / raw)
  To: SviMik; +Cc: netdev, Guillaume Nault
In-Reply-To: <CA++DawbbfQq1ZGB_x5t4a9mhkkptnDfcDxtCXgHYfGpY=8q83g@mail.gmail.com>

On 6 October 2017 at 05:45, SviMik <svimik@gmail.com> wrote:
> 2017-10-04 10:49 GMT+03:00 James Chapman <jchapman@katalix.com>:
>> On 3 October 2017 at 08:27, James Chapman <jchapman@katalix.com> wrote:
>>> For capturing complete oops messages, have you tried setting up
>>> netconsole? You might also find the full text in the syslog on reboot.
>
> Why, thank you! You've just told me that Santa Claus exists :)

You're welcome. Heh, my wife says I have a few more grey hairs and I
don't shave as often as I should. :)

> I've set up netconsole on 93 of my servers, and hope starting from
> tomorrow I'll have more pretty kernel panic reports, and get them even
> from servers where I had never had a chance to capture the console
> before.
>
>>> It's interesting that you are seeing l2tp issues since switching to
>>> 4.x kernels. Are you able to try earlier kernels to find the latest
>>> version that works? I'm curious whether things broke at v3.15.
>
> I'll try, but it will take some time to grab enough statistics. The
> bug is relatively rare, only few panics per day on the whole bunch of
> 93 servers.
>
>> It's possible that this may be fixed by a patch that is already
>> upstream and merged for v4.14. The fix is from Guillaume Nault:
>>
>> f3c66d4 l2tp: prevent creation of sessions on terminated tunnels
>>
>> If it's possible that the L2TP server may try to create a session in a
>> tunnel that is being closed, this bug would be exposed.
>>
>> Guillaume's fix isn't yet pushed to stable releases. Are you able to
>> try a v4.14-rc build?
>
> Sorry, I'm not skilled enough to build a kernel for CentOS on my own.
> Will wait till it appears in elrepo. The latest version there is
> currently 4.13.5. Meanwhile I'll try to switch to 3.10 and see how it
> works.

No problem. Please keep us updated. If Guillaume's fix in v4.14
prevents the l2tp crashes in your systems, I'd like to push it out to
stable releases. I have been trying to reproduce the problem here but
have had no luck so far. My guess is that your l2tp servers have a
large ppp population and are handling a lot of traffic. Until we have
evidence that Guillaume's patch resolves this problem, it's harder to
justify pushing it out to stable.

> I have also captured few more kernel panics in the last few days.
> Please see if they are related to this bug:
> http://svimik.com/hdmmsk1kp2.png
> http://svimik.com/hdmmsk1kp3.png
> http://svimik.com/hdmmsk1kp4.png
> http://svimik.com/hdmmsk2kp6.png

Thanks. None of these are related to this bug but it looks like p3, p4
and p6 are all in the networking code. It might be worth opening
separate threads for these. A full oops capture with netconsole would
likely get more attention though.

To check whether the oops is related to this bug yourself, please
check for text that contains "l2tp_xmit_skb" before posting it to this
thread.

^ permalink raw reply

* [PATCH 1/2] fq: support filtering a given tin
From: Johannes Berg @ 2017-10-06  9:53 UTC (permalink / raw)
  To: linux-wireless-u79uwXL29TY76Z2rM5mHXA
  Cc: Toke Høiland-Jørgensen, netdev-u79uwXL29TY76Z2rM5mHXA,
	Michał Kazior, Johannes Berg

From: Johannes Berg <johannes.berg-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>

Add to the FQ API a way to filter a given tin, in order to
remove frames that fulfil certain criteria according to a
filter function.

This will be used by mac80211 to remove frames belonging to
an AP VLAN interface that's being removed.

Signed-off-by: Johannes Berg <johannes.berg-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
---
 include/net/fq.h      |  7 +++++
 include/net/fq_impl.h | 72 ++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 69 insertions(+), 10 deletions(-)

diff --git a/include/net/fq.h b/include/net/fq.h
index 6d8521a30c5c..ac944a686840 100644
--- a/include/net/fq.h
+++ b/include/net/fq.h
@@ -90,6 +90,13 @@ typedef void fq_skb_free_t(struct fq *,
 			   struct fq_flow *,
 			   struct sk_buff *);
 
+/* Return %true to filter (drop) the frame. */
+typedef bool fq_skb_filter_t(struct fq *,
+			     struct fq_tin *,
+			     struct fq_flow *,
+			     struct sk_buff *,
+			     void *);
+
 typedef struct fq_flow *fq_flow_get_default_t(struct fq *,
 					      struct fq_tin *,
 					      int idx,
diff --git a/include/net/fq_impl.h b/include/net/fq_impl.h
index 4e6131cd3f43..8b237e4afee6 100644
--- a/include/net/fq_impl.h
+++ b/include/net/fq_impl.h
@@ -12,24 +12,22 @@
 
 /* functions that are embedded into includer */
 
-static struct sk_buff *fq_flow_dequeue(struct fq *fq,
-				       struct fq_flow *flow)
+static void fq_adjust_removal(struct fq *fq,
+			      struct fq_flow *flow,
+			      struct sk_buff *skb)
 {
 	struct fq_tin *tin = flow->tin;
-	struct fq_flow *i;
-	struct sk_buff *skb;
-
-	lockdep_assert_held(&fq->lock);
-
-	skb = __skb_dequeue(&flow->queue);
-	if (!skb)
-		return NULL;
 
 	tin->backlog_bytes -= skb->len;
 	tin->backlog_packets--;
 	flow->backlog -= skb->len;
 	fq->backlog--;
 	fq->memory_usage -= skb->truesize;
+}
+
+static void fq_rejigger_backlog(struct fq *fq, struct fq_flow *flow)
+{
+	struct fq_flow *i;
 
 	if (flow->backlog == 0) {
 		list_del_init(&flow->backlogchain);
@@ -43,6 +41,21 @@ static struct sk_buff *fq_flow_dequeue(struct fq *fq,
 		list_move_tail(&flow->backlogchain,
 			       &i->backlogchain);
 	}
+}
+
+static struct sk_buff *fq_flow_dequeue(struct fq *fq,
+				       struct fq_flow *flow)
+{
+	struct sk_buff *skb;
+
+	lockdep_assert_held(&fq->lock);
+
+	skb = __skb_dequeue(&flow->queue);
+	if (!skb)
+		return NULL;
+
+	fq_adjust_removal(fq, flow, skb);
+	fq_rejigger_backlog(fq, flow);
 
 	return skb;
 }
@@ -188,6 +201,45 @@ static void fq_tin_enqueue(struct fq *fq,
 	}
 }
 
+static void fq_flow_filter(struct fq *fq,
+			   struct fq_flow *flow,
+			   fq_skb_filter_t filter_func,
+			   void *filter_data,
+			   fq_skb_free_t free_func)
+{
+	struct fq_tin *tin = flow->tin;
+	struct sk_buff *skb, *tmp;
+
+	lockdep_assert_held(&fq->lock);
+
+	skb_queue_walk_safe(&flow->queue, skb, tmp) {
+		if (!filter_func(fq, tin, flow, skb, filter_data))
+			continue;
+
+		__skb_unlink(skb, &flow->queue);
+		fq_adjust_removal(fq, flow, skb);
+		free_func(fq, tin, flow, skb);
+	}
+
+	fq_rejigger_backlog(fq, flow);
+}
+
+static void fq_tin_filter(struct fq *fq,
+			  struct fq_tin *tin,
+			  fq_skb_filter_t filter_func,
+			  void *filter_data,
+			  fq_skb_free_t free_func)
+{
+	struct fq_flow *flow;
+
+	lockdep_assert_held(&fq->lock);
+
+	list_for_each_entry(flow, &tin->new_flows, flowchain)
+		fq_flow_filter(fq, flow, filter_func, filter_data, free_func);
+	list_for_each_entry(flow, &tin->old_flows, flowchain)
+		fq_flow_filter(fq, flow, filter_func, filter_data, free_func);
+}
+
 static void fq_flow_reset(struct fq *fq,
 			  struct fq_flow *flow,
 			  fq_skb_free_t free_func)
-- 
2.14.2

^ permalink raw reply related

* [PATCH 2/2] mac80211: only remove AP VLAN frames from TXQ
From: Johannes Berg @ 2017-10-06  9:53 UTC (permalink / raw)
  To: linux-wireless
  Cc: Toke Høiland-Jørgensen, netdev, Michał Kazior,
	Johannes Berg
In-Reply-To: <20171006095333.26335-1-johannes@sipsolutions.net>

From: Johannes Berg <johannes.berg@intel.com>

When removing an AP VLAN interface, mac80211 currently purges
the entire TXQ for the AP interface. Fix this by using the FQ
API introduced in the previous patch to filter frames.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/ieee80211_i.h |  2 ++
 net/mac80211/iface.c       | 25 +++----------------------
 net/mac80211/tx.c          | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 39 insertions(+), 22 deletions(-)

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 9675814f64db..68f874e73561 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -2009,6 +2009,8 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
 			struct txq_info *txq, int tid);
 void ieee80211_txq_purge(struct ieee80211_local *local,
 			 struct txq_info *txqi);
+void ieee80211_txq_remove_vlan(struct ieee80211_local *local,
+			       struct ieee80211_sub_if_data *sdata);
 void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
 			 u16 transaction, u16 auth_alg, u16 status,
 			 const u8 *extra, size_t extra_len, const u8 *bssid,
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 2619daa29961..13b16f90e1cf 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -793,9 +793,7 @@ static int ieee80211_open(struct net_device *dev)
 static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
 			      bool going_down)
 {
-	struct ieee80211_sub_if_data *txq_sdata = sdata;
 	struct ieee80211_local *local = sdata->local;
-	struct fq *fq = &local->fq;
 	unsigned long flags;
 	struct sk_buff *skb, *tmp;
 	u32 hw_reconf_flags = 0;
@@ -939,9 +937,6 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
 
 	switch (sdata->vif.type) {
 	case NL80211_IFTYPE_AP_VLAN:
-		txq_sdata = container_of(sdata->bss,
-					 struct ieee80211_sub_if_data, u.ap);
-
 		mutex_lock(&local->mtx);
 		list_del(&sdata->u.vlan.list);
 		mutex_unlock(&local->mtx);
@@ -998,8 +993,6 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
 		skb_queue_purge(&sdata->skb_queue);
 	}
 
-	sdata->bss = NULL;
-
 	spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
 	for (i = 0; i < IEEE80211_MAX_QUEUES; i++) {
 		skb_queue_walk_safe(&local->pending[i], skb, tmp) {
@@ -1012,22 +1005,10 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
 	}
 	spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
 
-	if (txq_sdata->vif.txq) {
-		struct txq_info *txqi = to_txq_info(txq_sdata->vif.txq);
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		ieee80211_txq_remove_vlan(local, sdata);
 
-		/*
-		 * FIXME FIXME
-		 *
-		 * We really shouldn't purge the *entire* txqi since that
-		 * contains frames for the other AP_VLANs (and possibly
-		 * the AP itself) as well, but there's no API in FQ now
-		 * to be able to filter.
-		 */
-
-		spin_lock_bh(&fq->lock);
-		ieee80211_txq_purge(local, txqi);
-		spin_unlock_bh(&fq->lock);
-	}
+	sdata->bss = NULL;
 
 	if (local->open_count == 0)
 		ieee80211_clear_tx_pending(local);
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 94826680cf2b..7b8154474b9e 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1396,6 +1396,40 @@ static void ieee80211_txq_enqueue(struct ieee80211_local *local,
 		       fq_flow_get_default_func);
 }
 
+static bool fq_vlan_filter_func(struct fq *fq, struct fq_tin *tin,
+				struct fq_flow *flow, struct sk_buff *skb,
+				void *data)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+
+	return info->control.vif == data;
+}
+
+void ieee80211_txq_remove_vlan(struct ieee80211_local *local,
+			       struct ieee80211_sub_if_data *sdata)
+{
+	struct fq *fq = &local->fq;
+	struct txq_info *txqi;
+	struct fq_tin *tin;
+	struct ieee80211_sub_if_data *ap;
+
+	if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_AP_VLAN))
+		return;
+
+	ap = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap);
+
+	if (!ap->vif.txq)
+		return;
+
+	txqi = to_txq_info(ap->vif.txq);
+	tin = &txqi->tin;
+
+	spin_lock_bh(&fq->lock);
+	fq_tin_filter(fq, tin, fq_vlan_filter_func, &sdata->vif,
+		      fq_skb_free_func);
+	spin_unlock_bh(&fq->lock);
+}
+
 void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
 			struct sta_info *sta,
 			struct txq_info *txqi, int tid)
-- 
2.14.2

^ permalink raw reply related

* Re: [PATCH 1/2] fq: support filtering a given tin
From: Toke Høiland-Jørgensen @ 2017-10-06 10:30 UTC (permalink / raw)
  To: Johannes Berg, linux-wireless-u79uwXL29TY76Z2rM5mHXA
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, Michał Kazior, Johannes Berg
In-Reply-To: <20171006095333.26335-1-johannes-cdvu00un1VgdHxzADdlk8Q@public.gmane.org>

Johannes Berg <johannes-cdvu00un1VgdHxzADdlk8Q@public.gmane.org> writes:

> From: Johannes Berg <johannes.berg-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
>
> Add to the FQ API a way to filter a given tin, in order to
> remove frames that fulfil certain criteria according to a
> filter function.
>
> This will be used by mac80211 to remove frames belonging to
> an AP VLAN interface that's being removed.
>
> Signed-off-by: Johannes Berg <johannes.berg-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>

Acked-by: Toke Høiland-Jørgensen <toke-LJ9M9ZcSy1A@public.gmane.org>

^ permalink raw reply

* Re: [PATCH 2/2] mac80211: only remove AP VLAN frames from TXQ
From: Toke Høiland-Jørgensen @ 2017-10-06 10:30 UTC (permalink / raw)
  To: Johannes Berg, linux-wireless-u79uwXL29TY76Z2rM5mHXA
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, Michał Kazior, Johannes Berg
In-Reply-To: <20171006095333.26335-2-johannes-cdvu00un1VgdHxzADdlk8Q@public.gmane.org>

Johannes Berg <johannes-cdvu00un1VgdHxzADdlk8Q@public.gmane.org> writes:

> From: Johannes Berg <johannes.berg-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
>
> When removing an AP VLAN interface, mac80211 currently purges
> the entire TXQ for the AP interface. Fix this by using the FQ
> API introduced in the previous patch to filter frames.
>
> Signed-off-by: Johannes Berg <johannes.berg-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>

Acked-by: Toke Høiland-Jørgensen <toke-LJ9M9ZcSy1A@public.gmane.org>

^ permalink raw reply

* Re: [net-next V4 PATCH 1/5] bpf: introduce new bpf cpu map type BPF_MAP_TYPE_CPUMAP
From: Jesper Dangaard Brouer @ 2017-10-06 10:50 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: netdev, jakub.kicinski, Michael S. Tsirkin, pavel.odintsov,
	Jason Wang, mchan, John Fastabend, peter.waskiewicz.jr,
	Daniel Borkmann, Alexei Starovoitov, Andy Gospodarek, brouer,
	Tobias Klauser
In-Reply-To: <59D5FDFF.5040002@iogearbox.net>

On Thu, 05 Oct 2017 11:40:15 +0200
Daniel Borkmann <daniel@iogearbox.net> wrote:

> On 10/04/2017 02:03 PM, Jesper Dangaard Brouer wrote:
> [...]
> > +#define CPU_MAP_BULK_SIZE 8  /* 8 == one cacheline on 64-bit archs */
> > +struct xdp_bulk_queue {
> > +	void *q[CPU_MAP_BULK_SIZE];
> > +	unsigned int count;
> > +};
> > +
> > +/* Struct for every remote "destination" CPU in map */
> > +struct bpf_cpu_map_entry {
> > +	u32 cpu;    /* kthread CPU and map index */
> > +	int map_id; /* Back reference to map */  
> 
> map_id is not used here if I read it correctly? We should
> then remove it.

It is actually used in a later patch. Notice, there are no unused
members in the final patch.  I did consider adding it back in the later
patch, but that was annoying during the devel and split-up patch
phase, as it creates conflicts when I move between the different
patches that need to modify this struct.  Thus, I chose to keep the
end-struct in this cpumap-base-patch.  If you insist, I can go through
the patch-stack and carefully introduce changes to the struct in steps?


> > +	u32 qsize;  /* Redundant queue size for map lookup */
> > +
> > +	/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
> > +	struct xdp_bulk_queue __percpu *bulkq;
> > +
> > +	/* Queue with potential multi-producers, and single-consumer kthread */
> > +	struct ptr_ring *queue;
> > +	struct task_struct *kthread;
> > +	struct work_struct kthread_stop_wq;
> > +
> > +	atomic_t refcnt; /* Control when this struct can be free'ed */
> > +	struct rcu_head rcu;
> > +};
> > +
> > +struct bpf_cpu_map {
> > +	struct bpf_map map;
> > +	/* Below members specific for map type */
> > +	struct bpf_cpu_map_entry **cpu_map;
> > +	unsigned long __percpu *flush_needed;
> > +};
> > +
> > +static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
> > +			     struct xdp_bulk_queue *bq);  
> 
> Could we avoid forward declaration?

This forward declaration helps me avoid mixing the enqueue code with
the bpf-map code.  I like that all the enqueue code is located after
the struct bpf_map_ops cpu_map_ops declaration.  If you insist, I can
reorder the code?

 
> > +static u64 cpu_map_bitmap_size(const union bpf_attr *attr)
> > +{
> > +	return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
> > +}
> > +
> > +static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
> > +{
> > +	struct bpf_cpu_map *cmap;
> > +	u64 cost;
> > +	int err;
> > +
> > +	/* check sanity of attributes */
> > +	if (attr->max_entries == 0 || attr->key_size != 4 ||
> > +	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
> > +		return ERR_PTR(-EINVAL);
> > +
> > +	cmap = kzalloc(sizeof(*cmap), GFP_USER);
> > +	if (!cmap)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	/* mandatory map attributes */
> > +	cmap->map.map_type = attr->map_type;
> > +	cmap->map.key_size = attr->key_size;
> > +	cmap->map.value_size = attr->value_size;
> > +	cmap->map.max_entries = attr->max_entries;
> > +	cmap->map.map_flags = attr->map_flags;
> > +	cmap->map.numa_node = bpf_map_attr_numa_node(attr);
> > +
> > +	/* Pre-limit array size based on NR_CPUS, not final CPU check */
> > +	if (cmap->map.max_entries > NR_CPUS)
> > +		return ERR_PTR(-E2BIG);
> > +
> > +	/* make sure page count doesn't overflow */
> > +	cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
> > +	cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
> > +	if (cost >= U32_MAX - PAGE_SIZE)
> > +		goto free_cmap;
> > +	cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
> > +
> > +	/* if map size is larger than memlock limit, reject it early */
> > +	err = bpf_map_precharge_memlock(cmap->map.pages);
> > +	if (err)
> > +		goto free_cmap;  
> 
> Given this is almost the same as devmap and touches user land reporting,
> we should probably go and do the same as in 582db7e0c4c2 ("bpf: devmap:
> pass on return value of bpf_map_precharge_memlock").

I guess we have to do this... even though I absolutely HATE that this
will return -EPERM which user land will see as "Operation not permitted".

Even though I know this, I got confused and spent several hours hunting
the wrong kind of error:
 https://github.com/netoptimizer/prototype-kernel/commit/cf4694792bf9807f48dc174a149ab0805133184a

Even the iovisor/BCC tool has a work-around for this ambiguous ABI
that we provide, and explicitly calls its work-around a "hack":

https://github.com/iovisor/bcc/blob/2e20494f63a/src/cc/libbpf.c#L366-L373

 if (ret < 0 && errno == EPERM) {
    // When EPERM is returned, two reasons are possible:
    //  1. user has no permissions for bpf()
    //  2. user has insufficent rlimit for locked memory
    // Unfortunately, there is no api to inspect the current usage of locked
    // mem for the user, so an accurate calculation of how much memory to lock
    // for this new program is difficult to calculate. As a hack, bump the limit
    // to unlimited. If program load fails again, return the error.

 
> > +	/* A per cpu bitfield with a bit per possible CPU in map  */
> > +	cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr),
> > +					    __alignof__(unsigned long));
> > +	if (!cmap->flush_needed)
> > +		goto free_cmap;
> > +
> > +	/* Alloc array for possible remote "destination" CPUs */
> > +	cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
> > +					   sizeof(struct bpf_cpu_map_entry *),
> > +					   cmap->map.numa_node);
> > +	if (!cmap->cpu_map)
> > +		goto free_cmap;
> > +
> > +	return &cmap->map;
> > +free_cmap:
> > +	free_percpu(cmap->flush_needed);
> > +	kfree(cmap);
> > +	return ERR_PTR(-ENOMEM);
> > +}
> > +
> > +void __cpu_map_queue_destructor(void *ptr)
> > +{
> > +	/* For now, just catch this as an error */
> > +	if (!ptr)
> > +		return;
> > +	pr_err("ERROR: %s() cpu_map queue was not empty\n", __func__);  
> 
> Can you elaborate on this "for now" condition? Is this a race
> when kthread doesn't consume queue on thread exit, or should it
> be impossible to trigger (should it then be replaced with a
> 'if (WARN_ON_ONCE(ptr)) page_frag_free(ptr)' and a more elaborate
> comment)?

The "for now" is an old comment from while developing and testing this.
In this final state of the patchset it _should_ not be possible to
trigger this situation.  I like your idea of replacing it with a
WARN_ON_ONCE (it might be good to keep in some form, as it would
catch someone changing the code in a way that breaks the RCU+WQ+kthread
tear-down procedure).


> > +	page_frag_free(ptr);
> > +}
> > +
> > +static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
> > +{
> > +	if (atomic_dec_and_test(&rcpu->refcnt)) {
> > +		/* The queue should be empty at this point */
> > +		ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor);
> > +		kfree(rcpu->queue);
> > +		kfree(rcpu);
> > +	}
> > +}
> > +
> > +static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
> > +{
> > +	atomic_inc(&rcpu->refcnt);
> > +}
> > +
> > +/* called from workqueue, to workaround syscall using preempt_disable */
> > +static void cpu_map_kthread_stop(struct work_struct *work)
> > +{
> > +	struct bpf_cpu_map_entry *rcpu;
> > +
> > +	rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
> > +	synchronize_rcu(); /* wait for flush in __cpu_map_entry_free() */
> > +	kthread_stop(rcpu->kthread); /* calls put_cpu_map_entry */
> > +}
> > +
> > +static int cpu_map_kthread_run(void *data)
> > +{
> > +	struct bpf_cpu_map_entry *rcpu = data;
> > +
> > +	set_current_state(TASK_INTERRUPTIBLE);
> > +	while (!kthread_should_stop()) {
> > +		struct xdp_pkt *xdp_pkt;
> > +
> > +		schedule();
> > +		/* Do work */
> > +		while ((xdp_pkt = ptr_ring_consume(rcpu->queue))) {
> > +			/* For now just "refcnt-free" */
> > +			page_frag_free(xdp_pkt);
> > +		}
> > +		__set_current_state(TASK_INTERRUPTIBLE);
> > +	}
> > +	put_cpu_map_entry(rcpu);
> > +
> > +	__set_current_state(TASK_RUNNING);
> > +	return 0;
> > +}
> > +
> > +struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id)
> > +{
> > +	gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN;
> > +	struct bpf_cpu_map_entry *rcpu;
> > +	int numa, err;
> > +
> > +	/* Have map->numa_node, but choose node of redirect target CPU */
> > +	numa = cpu_to_node(cpu);
> > +
> > +	rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa);
> > +	if (!rcpu)
> > +		return NULL;
> > +
> > +	/* Alloc percpu bulkq */
> > +	rcpu->bulkq = __alloc_percpu_gfp(sizeof(*rcpu->bulkq),
> > +					 sizeof(void *), gfp);
> > +	if (!rcpu->bulkq)
> > +		goto fail;
> > +
> > +	/* Alloc queue */
> > +	rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
> > +	if (!rcpu->queue)
> > +		goto fail;
> > +
> > +	err = ptr_ring_init(rcpu->queue, qsize, gfp);
> > +	if (err)
> > +		goto fail;
> > +	rcpu->qsize = qsize;
> > +
> > +	/* Setup kthread */
> > +	rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
> > +					       "cpumap/%d/map:%d", cpu, map_id);
> > +	if (IS_ERR(rcpu->kthread))
> > +		goto fail;  
> 
> What about ptr_ring_cleanup() when we fail here?

Thanks for catching this ... added.

> > +
> > +	/* Make sure kthread runs on a single CPU */
> > +	kthread_bind(rcpu->kthread, cpu);
> > +	wake_up_process(rcpu->kthread);
> > +
> > +	get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */
> > +	get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */
> > +
> > +	return rcpu;
> > +
> > +fail:   /* Hint: free API detect NULL values */
> > +	free_percpu(rcpu->bulkq);
> > +	kfree(rcpu->queue);
> > +	kfree(rcpu);
> > +	return NULL;
> > +}
> > +
> > +void __cpu_map_entry_free(struct rcu_head *rcu)
> > +{
> > +	struct bpf_cpu_map_entry *rcpu;
> > +	int cpu;
> > +
> > +	/* This cpu_map_entry have been disconnected from map and one
> > +	 * RCU graze-period have elapsed.  Thus, XDP cannot queue any
> > +	 * new packets and cannot change/set flush_needed that can
> > +	 * find this entry.
> > +	 */
> > +	rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);
> > +
> > +	/* Flush remaining packets in percpu bulkq */
> > +	for_each_online_cpu(cpu) {
> > +		struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);
> > +
> > +		/* No concurrent bq_enqueue can run at this point */
> > +		bq_flush_to_queue(rcpu, bq);
> > +	}
> > +	free_percpu(rcpu->bulkq);
> > +	/* Cannot kthread_stop() here, last put free rcpu resources */
> > +	put_cpu_map_entry(rcpu);
> > +}
> > +
> > +/* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to
> > + * ensure any driver rcu critical sections have completed, but this
> > + * does not guarantee a flush has happened yet. Because driver side
> > + * rcu_read_lock/unlock only protects the running XDP program.  The
> > + * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a
> > + * pending flush op doesn't fail.
> > + *
> > + * The bpf_cpu_map_entry is still used by the kthread, and there can
> > + * still be pending packets (in queue and percpu bulkq).  A refcnt
> > + * makes sure to last user (kthread_stop vs. call_rcu) free memory
> > + * resources.
> > + *
> > + * The rcu callback __cpu_map_entry_free flush remaining packets in
> > + * percpu bulkq to queue.  Due to caller map_delete_elem() disable
> > + * preemption, cannot call kthread_stop() to make sure queue is empty.
> > + * Instead a work_queue is started for stopping kthread,
> > + * cpu_map_kthread_stop, which waits for an RCU graze period before
> > + * stopping kthread, emptying the queue.
> > + */
> > +void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
> > +			     u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
> > +{
> > +	struct bpf_cpu_map_entry *old_rcpu;
> > +
> > +	old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu);
> > +	if (old_rcpu) {
> > +		call_rcu(&old_rcpu->rcu, __cpu_map_entry_free);
> > +		INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop);
> > +		schedule_work(&old_rcpu->kthread_stop_wq);
> > +	}
> > +}
> > +
> > +int cpu_map_delete_elem(struct bpf_map *map, void *key)
> > +{
> > +	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
> > +	u32 key_cpu = *(u32 *)key;
> > +
> > +	if (key_cpu >= map->max_entries)
> > +		return -EINVAL;
> > +
> > +	/* notice caller map_delete_elem() use preempt_disable() */
> > +	__cpu_map_entry_replace(cmap, key_cpu, NULL);
> > +	return 0;
> > +}
> > +
> > +int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
> > +				u64 map_flags)
> > +{
> > +	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
> > +	struct bpf_cpu_map_entry *rcpu;
> > +
> > +	/* Array index key correspond to CPU number */
> > +	u32 key_cpu = *(u32 *)key;
> > +	/* Value is the queue size */
> > +	u32 qsize = *(u32 *)value;
> > +
> > +	/* Make sure CPU is a valid possible cpu */
> > +	if (!cpu_possible(key_cpu))
> > +		return -ENODEV;
> > +
> > +	if (unlikely(map_flags > BPF_EXIST))
> > +		return -EINVAL;
> > +	if (unlikely(key_cpu >= cmap->map.max_entries))
> > +		return -E2BIG;
> > +	if (unlikely(map_flags == BPF_NOEXIST))
> > +		return -EEXIST;
> > +	if (unlikely(qsize > 16384)) /* sanity limit on qsize */
> > +		return -EOVERFLOW;
> > +
> > +	if (qsize == 0) {
> > +		rcpu = NULL; /* Same as deleting */
> > +	} else {
> > +		/* Updating qsize cause re-allocation of bpf_cpu_map_entry */
> > +		rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
> > +		if (!rcpu)
> > +			return -ENOMEM;
> > +	}
> > +	rcu_read_lock();
> > +	__cpu_map_entry_replace(cmap, key_cpu, rcpu);
> > +	rcu_read_unlock();
> > +	return 0;  
> 
> You need to update verifier such that this function cannot be called
> out of an BPF program,

In the example BPF program, I do a lookup into the map, but only to
verify that an entry exists (I don't look at the value).  I would like
to support such usage.


> otherwise it would be possible under full RCU
> read context, which is explicitly avoided here and also it would otherwise
> be allowed for other maps of different type as well, which needs to
> be avoided.

Sorry, I don't follow this.

 
> > +}
> > +
> > +void cpu_map_free(struct bpf_map *map)
> > +{
> > +	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
> > +	int cpu;
> > +	u32 i;
> > +
> > +	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
> > +	 * so the bpf programs (can be more than one that used this map) were
> > +	 * disconnected from events. Wait for outstanding critical sections in
> > +	 * these programs to complete. The rcu critical section only guarantees
> > +	 * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map.
> > +	 * It does __not__ ensure pending flush operations (if any) are
> > +	 * complete.
> > +	 */
> > +	synchronize_rcu();
> > +
> > +	/* To ensure all pending flush operations have completed wait for flush
> > +	 * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
> > +	 * Because the above synchronize_rcu() ensures the map is disconnected
> > +	 * from the program we can assume no new bits will be set.
> > +	 */
> > +	for_each_online_cpu(cpu) {
> > +		unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu);
> > +
> > +		while (!bitmap_empty(bitmap, cmap->map.max_entries))
> > +			cond_resched();
> > +	}
> > +
> > +	/* For cpu_map the remote CPUs can still be using the entries
> > +	 * (struct bpf_cpu_map_entry).
> > +	 */
> > +	for (i = 0; i < cmap->map.max_entries; i++) {
> > +		struct bpf_cpu_map_entry *rcpu;
> > +
> > +		rcpu = READ_ONCE(cmap->cpu_map[i]);
> > +		if (!rcpu)
> > +			continue;
> > +
> > +		/* bq flush and cleanup happens after RCU graze-period */
> > +		__cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
> > +	}
> > +	free_percpu(cmap->flush_needed);
> > +	bpf_map_area_free(cmap->cpu_map);
> > +	kfree(cmap);
> > +}
> > +
> > +struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
> > +{
> > +	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
> > +	struct bpf_cpu_map_entry *rcpu;
> > +
> > +	if (key >= map->max_entries)
> > +		return NULL;
> > +
> > +	rcpu = READ_ONCE(cmap->cpu_map[key]);
> > +	return rcpu;
> > +}
> > +
> > +static void *cpu_map_lookup_elem(struct bpf_map *map, void *key)
> > +{
> > +	struct bpf_cpu_map_entry *rcpu =
> > +		__cpu_map_lookup_elem(map, *(u32 *)key);
> > +
> > +	return rcpu ? &rcpu->qsize : NULL;  
> 
> The qsize doesn't seem used anywhere else besides here, but you
> probably should update verifier such that this cannot be called
> out of the BPF program, which could then mangle qsize value.

It is true that the BPF prog can modify this qsize value, but it's not
the authoritative value, so it doesn't really matter.

As I said above, I do want to do a lookup from a BPF program.  This
allows the BPF program to know whether an entry is valid; otherwise it
will blindly send to a cpu destination.  Maybe bpf_prog's just have to
use a map-on-the-side to coordinate this(?), but then a sysadm modifying
the real cpumap will be invisible to the program.


Maybe we should just disable BPF-progs from reading this in the first
iteration?  It would allow for more advanced usage schemes later..

One crazy idea is to have the cpu_map_lookup_elem() return if any
packets are in-flight on this cpu-queue. (Making it easier to avoid OoO
packets, when switching target CPU, but it can also be implemented by
the BPF-programmer herself via maps, although via some extra atomic
cost).


> > +}
> > +
> > +static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
> > +{
> > +	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
> > +	u32 index = key ? *(u32 *)key : U32_MAX;
> > +	u32 *next = next_key;
> > +
> > +	if (index >= cmap->map.max_entries) {
> > +		*next = 0;
> > +		return 0;
> > +	}
> > +
> > +	if (index == cmap->map.max_entries - 1)
> > +		return -ENOENT;
> > +	*next = index + 1;
> > +	return 0;
> > +}


I would have liked to have implemented next_key so it only returned the
next valid cpu_entry, and used it as a simple round-robin scheduler.
But AFAIK the next_key function is not allowed from bpf_prog's, right?


> > +
> > +const struct bpf_map_ops cpu_map_ops = {
> > +	.map_alloc		= cpu_map_alloc,
> > +	.map_free		= cpu_map_free,
> > +	.map_delete_elem	= cpu_map_delete_elem,
> > +	.map_update_elem	= cpu_map_update_elem,
> > +	.map_lookup_elem	= cpu_map_lookup_elem,
> > +	.map_get_next_key	= cpu_map_get_next_key,
> > +};
> > +
> > +static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
> > +			     struct xdp_bulk_queue *bq)
> > +{
> > +	struct ptr_ring *q;
> > +	int i;
> > +
> > +	if (unlikely(!bq->count))
> > +		return 0;
> > +
> > +	q = rcpu->queue;
> > +	spin_lock(&q->producer_lock);
> > +
> > +	for (i = 0; i < bq->count; i++) {
> > +		void *xdp_pkt = bq->q[i];
> > +		int err;
> > +
> > +		err = __ptr_ring_produce(q, xdp_pkt);
> > +		if (err) {
> > +			/* Free xdp_pkt */
> > +			page_frag_free(xdp_pkt);
> > +		}
> > +	}
> > +	bq->count = 0;
> > +	spin_unlock(&q->producer_lock);
> > +
> > +	return 0;
> > +}
> > +
[...]


-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* [PATCH] bnx2x: Use pci_ari_enabled() instead of local copy
From: Bjorn Helgaas @ 2017-10-06 11:00 UTC (permalink / raw)
  To: Ariel Elior; +Cc: netdev, everest-linux-l2, linux-kernel

From: Bjorn Helgaas <bhelgaas@google.com>

Use pci_ari_enabled() from the PCI core instead of the identical local copy
bnx2x_ari_enabled().  No functional change intended.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c |    7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
index 9ca994d0bab6..3591077a5f6b 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
@@ -1074,11 +1074,6 @@ static void bnx2x_vf_set_bars(struct bnx2x *bp, struct bnx2x_virtf *vf)
 	}
 }
 
-static int bnx2x_ari_enabled(struct pci_dev *dev)
-{
-	return dev->bus->self && dev->bus->self->ari_enabled;
-}
-
 static int
 bnx2x_get_vf_igu_cam_info(struct bnx2x *bp)
 {
@@ -1212,7 +1207,7 @@ int bnx2x_iov_init_one(struct bnx2x *bp, int int_mode_param,
 
 	err = -EIO;
 	/* verify ari is enabled */
-	if (!bnx2x_ari_enabled(bp->pdev)) {
+	if (!pci_ari_enabled(bp->pdev->bus)) {
 		BNX2X_ERR("ARI not supported (check pci bridge ARI forwarding), SRIOV can not be enabled\n");
 		return 0;
 	}

^ permalink raw reply related

* Re: [net-next,v2] ip_gre: check packet length and mtu correctly in erspan tx
From: Xin Long @ 2017-10-06 11:03 UTC (permalink / raw)
  To: William Tu; +Cc: network dev, David Laight
In-Reply-To: <1507230432-56495-1-git-send-email-u9012063@gmail.com>

On Fri, Oct 6, 2017 at 3:07 AM, William Tu <u9012063@gmail.com> wrote:
> Similarly to early patch for erspan_xmit(), the ARPHDR_ETHER device
> is the length of the whole ether packet.  So skb->len should subtract
> the dev->hard_header_len.
>
> Fixes: 1a66a836da63 ("gre: add collect_md mode to ERSPAN tunnel")
> Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN")
> Signed-off-by: William Tu <u9012063@gmail.com>
> Cc: Xin Long <lucien.xin@gmail.com>
> Cc: David Laight <David.Laight@aculab.com>
> ---
> v1->v2:
> use addition to avoid overflow
> fix pskb_trim size
> ---
>  net/ipv4/ip_gre.c | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
> index b279c325c7f6..fb95f68d6e53 100644
> --- a/net/ipv4/ip_gre.c
> +++ b/net/ipv4/ip_gre.c
> @@ -579,8 +579,8 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
>         if (gre_handle_offloads(skb, false))
>                 goto err_free_rt;
>
> -       if (skb->len > dev->mtu) {
> -               pskb_trim(skb, dev->mtu);
> +       if (skb->len > dev->mtu + dev->hard_header_len) {
> +               pskb_trim(skb, dev->mtu + dev->hard_header_len);
>                 truncate = true;
>         }
>
> @@ -731,8 +731,8 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb,
>         if (skb_cow_head(skb, dev->needed_headroom))
>                 goto free_skb;
>
> -       if (skb->len - dev->hard_header_len > dev->mtu) {
> -               pskb_trim(skb, dev->mtu);
> +       if (skb->len > dev->mtu + dev->hard_header_len) {
> +               pskb_trim(skb, dev->mtu + dev->hard_header_len);
>                 truncate = true;
>         }
>
> --
> 2.7.4
>
Reviewed-by: Xin Long <lucien.xin@gmail.com>

^ permalink raw reply

* Re: [PATCH net v2 3/9] net/mac89x0: Fix and modernize log messages
From: Finn Thain @ 2017-10-06 11:06 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-kernel
In-Reply-To: <20171005.210842.1348457661309929796.davem@davemloft.net>

On Thu, 5 Oct 2017, David Miller wrote:

> From: Finn Thain <fthain@telegraphics.com.au>
> Date: Thu, 5 Oct 2017 21:11:05 -0400 (EDT)
> 
> > Fix misplaced newlines in conditional log messages.
> 
> Please don't do this, the way the author formatted the strings was 
> intentional, they intended to print out:
> 
> 	NAME: cs89%c0%s rev %c found at %#8lx IRQ %d ADDR %pM
> 
> But now you are splitting it into multiple lines.

Right.

> Also, you're printing the IRQ information after register_netdev() which 
> is bad.  As soon as register_netdev() is called, the driver's
> ->open() routine can be invoked, and during which time some
> log messages could be emitted during that operation.
> 
> And that would cut the probe messages up.
> 

Yes and no. The thing is, "IRQ %d" isn't really a "probe message" and 
doesn't need to be logged at all: the IRQ is entirely fixed. Actually the 
same is true for the macmace driver. There is value in this information 
but it can be found in /proc/interrupts so I'd happily drop the "IRQ %d" 
portion from these log messages.

> I know how you got to this state, you saw a reference to dev->name 
> before it had a real value.  You just removed the "eth%d" string 
> entirely.  And since you removed the dev->name reference, you had no 
> reason to move log messages after register_netdev() at all.
> 

Not quite. I used the "MAC %pM, IRQ %d" style for consistency with other 
NIC drivers. Though consistency in itself may be insufficient 
justification. More importantly, I wanted the MAC address logged together 
with the actual interface name. That's how I arrived at this code.

> Anyways, you can also see the intention of the author here becuase they 
> have _explicit_ leading newlines in the error path messages that come 
> after the inital probe printk.
> 

Of course. I do understand the existing code. And the code actually 
reflects the intentions of the author of the ISA driver. Having the IRQ 
logged could be really valuable to the typical ISA card user but this 
platform is not ISA.

> The real way to fix the early dev->name reference is to replace it with 
> a dev_info() call and have it use the struct device name rather than the 
> netdev device one.
> 

This driver only runs on machines with one expansion slot (called a "Comm 
Slot"). So I figured that either pr_info() or printk(KERN_INFO ...) would 
do just fine here (always has done). I did consider dev_info() but I don't 
see the benefit. I'm probably missing something, so would you elaborate 
please?

BTW I've also used pr_info() elsewhere in this series in platform drivers. 
It's not yet clear to me whether the mac89x0 driver should ultimately bind 
to a platform device or a nubus device: comm slot cards are a bit of 
each.

> Again, I think you really shouldn't be making these small weird changes 
> to these old drivers.
> 

These are weird changes befitting a weird platform.

I understand your reluctance to touch legacy drivers, but my intention is 
not change for the sake of change. There is bitrot here. Sometimes that's 
due to the rest of the kernel having changed and sometimes it's due to 
actual damage of the kind you seem to fear. I'm trying to address both.

-- 

^ permalink raw reply

* Re: [net-next V4 PATCH 2/5] bpf: XDP_REDIRECT enable use of cpumap
From: Jesper Dangaard Brouer @ 2017-10-06 11:17 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: netdev, jakub.kicinski, Michael S. Tsirkin, pavel.odintsov,
	Jason Wang, mchan, John Fastabend, peter.waskiewicz.jr,
	Daniel Borkmann, Alexei Starovoitov, Andy Gospodarek, brouer
In-Reply-To: <59D60505.2040004@iogearbox.net>


On Thu, 05 Oct 2017 12:10:13 +0200 Daniel Borkmann <daniel@iogearbox.net> wrote:

> On 10/04/2017 02:03 PM, Jesper Dangaard Brouer wrote:
> > This patch connects cpumap to the xdp_do_redirect_map infrastructure.
> >
> > Still no SKB allocation are done yet.  The XDP frames are transferred
> > to the other CPU, but they are simply refcnt decremented on the remote
> > CPU.  This served as a good benchmark for measuring the overhead of
> > remote refcnt decrement.  If driver page recycle cache is not
> > efficient then this, exposes a bottleneck in the page allocator.
> >
> > A shout-out to MST's ptr_ring, which is the secret behind is being so
> > efficient to transfer memory pointers between CPUs, without constantly
> > bouncing cache-lines between CPUs.
> >
> > V3: Handle !CONFIG_BPF_SYSCALL pointed out by kbuild test robot.
> >
> > V4: Make Generic-XDP aware of cpumap type, but don't allow redirect yet,
> >   as implementation require a separate upstream discussion.
> >
> > Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>  
> [...]
> > diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
> > index ae8e29352261..4926a9971f90 100644
> > --- a/kernel/bpf/cpumap.c
> > +++ b/kernel/bpf/cpumap.c
> > @@ -493,7 +493,8 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
> >   	return 0;
> >   }
> >
> > -int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp)
> > +int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
> > +		    struct net_device *dev_rx)
> >   {
> >   	struct xdp_pkt *xdp_pkt;
> >   	int headroom;
> > @@ -505,7 +506,7 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp)
> >   	xdp_pkt = xdp->data_hard_start;
> >   	xdp_pkt->data = xdp->data;
> >   	xdp_pkt->len  = xdp->data_end - xdp->data;
> > -	xdp_pkt->headroom = headroom;
> > +	xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);  
> 
> (Just a note, bit confusing that first two patches add and extend
>   this, and only in the third you add the xdp->data_meta handling,
>   makes it harder to review at least.)

Sorry.  This is a left-over from rebasing and from measuring the cost of
transferring only the pointer to the page, and the remote put_page().
And your xdp->data_meta change happened basically while my patches were in-flight.

I'll move this one-liner back to patch 2, to avoid spreading it over too many
patches.

> [...]
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index 9b6e7e84aafd..dbf2ae071108 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -2521,10 +2521,36 @@ static int __bpf_tx_xdp(struct net_device *dev,
> >   	err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
> >   	if (err)
> >   		return err;
> > -	if (map)
> > +	dev->netdev_ops->ndo_xdp_flush(dev);
> > +	return 0;
> > +}
> > +
> > +static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
> > +			    struct bpf_map *map,
> > +			    struct xdp_buff *xdp,
> > +			    u32 index)
> > +{
> > +	int err;
> > +
> > +	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
> > +		struct net_device *dev = fwd;
> > +
> > +		if (!dev->netdev_ops->ndo_xdp_xmit)
> > +			return -EOPNOTSUPP;
> > +
> > +		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
> > +		if (err)
> > +			return err;
> >   		__dev_map_insert_ctx(map, index);
> > -	else
> > -		dev->netdev_ops->ndo_xdp_flush(dev);
> > +
> > +	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
> > +		struct bpf_cpu_map_entry *rcpu = fwd;
> > +
> > +		err = cpu_map_enqueue(rcpu, xdp, dev_rx);
> > +		if (err)
> > +			return err;
> > +		__cpu_map_insert_ctx(map, index);
> > +	}
> >   	return 0;
> >   }
> >
> > @@ -2534,11 +2560,33 @@ void xdp_do_flush_map(void)
> >   	struct bpf_map *map = ri->map_to_flush;
> >
> >   	ri->map_to_flush = NULL;
> > -	if (map)
> > -		__dev_map_flush(map);
> > +	if (map) {
> > +		switch (map->map_type) {
> > +		case BPF_MAP_TYPE_DEVMAP:
> > +			__dev_map_flush(map);
> > +			break;
> > +		case BPF_MAP_TYPE_CPUMAP:
> > +			__cpu_map_flush(map);
> > +			break;
> > +		default:
> > +			break;
> > +		}
> > +	}
> >   }
> >   EXPORT_SYMBOL_GPL(xdp_do_flush_map);
> >
> > +static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
> > +{
> > +	switch (map->map_type) {
> > +	case BPF_MAP_TYPE_DEVMAP:
> > +		return __dev_map_lookup_elem(map, index);
> > +	case BPF_MAP_TYPE_CPUMAP:
> > +		return __cpu_map_lookup_elem(map, index);
> > +	default:
> > +		return NULL;
> > +	}  
> 
> Should we just have a callback and instead of the above use
> map->ptr_lookup_elem() (or however we name it) ... lot of it
> is pretty much the same logic as with devmap.

We could extend struct bpf_map *map with such a callback, I was just
afraid that this would be too invasive.

Performance-wise, I don't think it will hurt too much.
http://www.cipht.net/2017/10/03/are-jump-tables-always-fastest.html


> > +}
> > +
> >   static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog,
> >   				   unsigned long aux)
> >   {
> > @@ -2551,8 +2599,8 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
> >   	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
> >   	unsigned long map_owner = ri->map_owner;
> >   	struct bpf_map *map = ri->map;
> > -	struct net_device *fwd = NULL;
> >   	u32 index = ri->ifindex;
> > +	void *fwd = NULL;
> >   	int err;
> >
> >   	ri->ifindex = 0;  
> 



-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [PATCH 2/3 v2] net: phy: DP83822 initial driver submission
From: Dan Murphy @ 2017-10-06 12:00 UTC (permalink / raw)
  To: Andrew Lunn, Woojung.Huh; +Cc: f.fainelli, netdev, afd
In-Reply-To: <20171004235307.GD16612@lunn.ch>

Andrew

On 10/04/2017 06:53 PM, Andrew Lunn wrote:
> On Wed, Oct 04, 2017 at 10:44:36PM +0000, Woojung.Huh@microchip.com wrote:
>>> +static int dp83822_suspend(struct phy_device *phydev)
>>> +{
>>> +	int value;
>>> +
>>> +	mutex_lock(&phydev->lock);
>>> +	value = phy_read_mmd(phydev, DP83822_DEVADDR,
>>> MII_DP83822_WOL_CFG);
>>> +	mutex_unlock(&phydev->lock);
> 
>> Would we need mutex to access phy_read_mmd()?
>> phy_read_mmd() has mdio_lock for indirect access.
> 
> Hi Woojung
> 
> The mdio lock is not sufficient. It protects against two mdio
> accesses. But here we need to protect against two phy operations.
> There is a danger something else tries to access the phy during
> suspend.
> 
>>> +	if (!(value & DP83822_WOL_EN))
>>> +		genphy_suspend(phydev);
> 
> Releasing the lock before calling genphy_suspend() is not so nice.
> Maybe add a version which assumes the lock has already been taken?
> 

The marvell driver does not take a lock and calls genphy_suspend/resume
so I am wondering if this driver needs to take a lock.

The at803x needs to take the lock because it does not call into the genphy
functions.

I will submit a new version with the lock removed.

Dan

>       Andrew
> 


-- 
------------------
Dan Murphy

^ permalink raw reply

* Re: [net-next V4 PATCH 2/5] bpf: XDP_REDIRECT enable use of cpumap
From: Jesper Dangaard Brouer @ 2017-10-06 12:01 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: netdev, jakub.kicinski, Michael S. Tsirkin, pavel.odintsov,
	Jason Wang, mchan, John Fastabend, peter.waskiewicz.jr,
	Daniel Borkmann, Alexei Starovoitov, Andy Gospodarek, brouer
In-Reply-To: <20171006131748.75185f65@redhat.com>

On Fri, 6 Oct 2017 13:17:48 +0200
Jesper Dangaard Brouer <brouer@redhat.com> wrote:

> > > -int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp)
> > > +int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
> > > +		    struct net_device *dev_rx)
> > >   {
> > >   	struct xdp_pkt *xdp_pkt;
> > >   	int headroom;
> > > @@ -505,7 +506,7 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp)
> > >   	xdp_pkt = xdp->data_hard_start;
> > >   	xdp_pkt->data = xdp->data;
> > >   	xdp_pkt->len  = xdp->data_end - xdp->data;
> > > -	xdp_pkt->headroom = headroom;
> > > +	xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);    
> > 
> > (Just a note, bit confusing that first two patches add and extend
> >   this, and only in the third you add the xdp->data_meta handling,
> >   makes it harder to review at least.)  
> 
> Sorry.  This is a left-overs from rebasing and measuring the cost of
> transferring only the pointer to the page, and remote put_page().
> And your xdp->data_meta, happen basically while my patches was in-flight.
> 
> I'll move this one-line back to patch 2, to spreading over too many
> patches.

I instead chose to move the creation of cpu_map_enqueue() into this
patch, but in a simpler version stating explicitly that this is only
seen as a void pointer enqueue.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [PATCH] ipv6: gso: fix payload length when gso_size is zero
From: Alexey Kodanev @ 2017-10-06 12:16 UTC (permalink / raw)
  To: Duyck, Alexander H, netdev@vger.kernel.org
  Cc: davem@davemloft.net, steffen.klassert@secunet.com
In-Reply-To: <1507229878.2098.56.camel@intel.com>

On 10/05/2017 09:58 PM, Duyck, Alexander H wrote:
> On Thu, 2017-10-05 at 20:06 +0300, Alexey Kodanev wrote:
>> When gso_size reset to zero for the tail segment in skb_segment(), later
>> in ipv6_gso_segment(), we will get incorrect payload_len for that segment.
>> inet_gso_segment() already has a check for gso_size before calculating
>> payload so fixing only IPv6 part.
>>
>> The issue was found with LTP vxlan & gre tests over ixgbe NIC.
>>
>> Fixes: 07b26c9454a2 ("gso: Support partial splitting at the frag_list pointer")
>> Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
>> ---
>>  net/ipv6/ip6_offload.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
>> index cdb3728..4a87f94 100644
>> --- a/net/ipv6/ip6_offload.c
>> +++ b/net/ipv6/ip6_offload.c
>> @@ -105,7 +105,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
>>  
>>  	for (skb = segs; skb; skb = skb->next) {
>>  		ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff);
>> -		if (gso_partial)
>> +		if (gso_partial && skb_is_gso(skb))
>>  			payload_len = skb_shinfo(skb)->gso_size +
>>  				      SKB_GSO_CB(skb)->data_offset +
>>  				      skb->head - (unsigned char *)(ipv6h + 1);
> So looking over this change it looks good to me. I'm just wondering if
> you have looked at the code in __skb_udp_tunnel_segment or
> gre_gso_segment? It seems like if you needed this change here you
> should need to make similar changes to those functions as well. I'm
> wondering if we just aren't seeing issues due to the segments already
> being MSS sized before being handed to us for segmentation.

Right, it can happen in __skb_udp_tunnel_segment as well. I wasn't able
to reproduce it with gre; it looks like it doesn't reach that part of the code,
possibly skipping it on the gso_type & SKB_GSO_GRE_CSUM check, though the
NIC has tx-gre-csum-segmentation enabled...

I'll send a new version after testing is completed.

Thanks,
Alexey

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox