Netdev List
 help / color / mirror / Atom feed
* [bpf-next 3/3] selftests/bpf: test bpf flow dissection
From: Petar Penkov @ 2018-08-30 18:23 UTC (permalink / raw)
  To: netdev
  Cc: davem, ast, daniel, simon.horman, ecree, songliubraving, tom,
	Petar Penkov, Willem de Bruijn
In-Reply-To: <20180830182301.89435-1-peterpenkov96@gmail.com>

From: Petar Penkov <ppenkov@google.com>

Adds a test that sends different types of packets over multiple
tunnels and verifies that valid packets are dissected correctly.  To do
so, a tc-flower rule is added to drop packets on UDP src port 9, and
packets are sent from ports 8, 9, and 10. Only the packets on port 9
should be dropped. Because tc-flower relies on the flow dissector to
match flows, correct classification demonstrates correct dissection.

Also add support logic to load the BPF program and to inject the test
packets.

Signed-off-by: Petar Penkov <ppenkov@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 tools/testing/selftests/bpf/.gitignore        |   2 +
 tools/testing/selftests/bpf/Makefile          |   6 +-
 tools/testing/selftests/bpf/config            |   1 +
 .../selftests/bpf/flow_dissector_load.c       | 140 ++++
 .../selftests/bpf/test_flow_dissector.c       | 782 ++++++++++++++++++
 .../selftests/bpf/test_flow_dissector.sh      | 115 +++
 tools/testing/selftests/bpf/with_addr.sh      |  54 ++
 tools/testing/selftests/bpf/with_tunnels.sh   |  36 +
 8 files changed, 1134 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/flow_dissector_load.c
 create mode 100644 tools/testing/selftests/bpf/test_flow_dissector.c
 create mode 100755 tools/testing/selftests/bpf/test_flow_dissector.sh
 create mode 100755 tools/testing/selftests/bpf/with_addr.sh
 create mode 100755 tools/testing/selftests/bpf/with_tunnels.sh

diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 49938d72cf63..e61a85ac4b79 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -19,3 +19,5 @@ test_btf
 test_sockmap
 test_lirc_mode2_user
 get_cgroup_id_user
+test_flow_dissector
+flow_dissector_load
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index e65f50f9185e..fd3851d5c079 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -47,10 +47,12 @@ TEST_PROGS := test_kmod.sh \
 	test_tunnel.sh \
 	test_lwt_seg6local.sh \
 	test_lirc_mode2.sh \
-	test_skb_cgroup_id.sh
+	test_skb_cgroup_id.sh \
+	test_flow_dissector.sh
 
 # Compile but not part of 'make run_tests'
-TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr test_skb_cgroup_id_user
+TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr test_skb_cgroup_id_user \
+	flow_dissector_load test_flow_dissector
 
 include ../lib.mk
 
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index b4994a94968b..3655508f95fd 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -18,3 +18,4 @@ CONFIG_CRYPTO_HMAC=m
 CONFIG_CRYPTO_SHA256=m
 CONFIG_VXLAN=y
 CONFIG_GENEVE=y
+CONFIG_NET_CLS_FLOWER=m
diff --git a/tools/testing/selftests/bpf/flow_dissector_load.c b/tools/testing/selftests/bpf/flow_dissector_load.c
new file mode 100644
index 000000000000..d3273b5b3173
--- /dev/null
+++ b/tools/testing/selftests/bpf/flow_dissector_load.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <error.h>
+#include <errno.h>
+#include <getopt.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+/* Path passed to bpf_object__pin() on attach and removed again on detach. */
+const char *cfg_pin_path = "/sys/fs/bpf/flow_dissector";
+/* Name of the map that is filled with the fds of the non-main programs. */
+const char *cfg_map_name = "jmp_table";
+/* True to load and attach (the default); cleared when -d is given. */
+bool cfg_attach = true;
+/* Section title of the main program, taken from -s. */
+char *cfg_section_name;
+/* Path of the BPF object file, taken from -p. */
+char *cfg_path_name;
+
+/*
+ * Load the object at cfg_path_name, store the fd of every program other than
+ * the one in section cfg_section_name into the cfg_map_name map (at indices
+ * 0, 1, 2, ... in iteration order), attach the main program as the flow
+ * dissector and pin the object at cfg_pin_path.
+ *
+ * Any failure terminates the process through error().
+ */
+static void load_and_attach_program(void)
+{
+	struct bpf_program *prog, *main_prog;
+	struct bpf_map *prog_array;
+	int i, fd, prog_fd, ret;
+	struct bpf_object *obj;
+	int prog_array_fd;
+
+	ret = bpf_prog_load(cfg_path_name, BPF_PROG_TYPE_FLOW_DISSECTOR, &obj,
+			    &prog_fd);
+	if (ret)
+		error(1, 0, "bpf_prog_load %s", cfg_path_name);
+
+	/* the fd returned above is replaced by that of the requested section */
+	main_prog = bpf_object__find_program_by_title(obj, cfg_section_name);
+	if (!main_prog)
+		error(1, 0, "bpf_object__find_program_by_title %s",
+		      cfg_section_name);
+
+	prog_fd = bpf_program__fd(main_prog);
+	if (prog_fd < 0)
+		error(1, 0, "bpf_program__fd");
+
+	prog_array = bpf_object__find_map_by_name(obj, cfg_map_name);
+	if (!prog_array)
+		error(1, 0, "bpf_object__find_map_by_name %s", cfg_map_name);
+
+	prog_array_fd = bpf_map__fd(prog_array);
+	if (prog_array_fd < 0)
+		error(1, 0, "bpf_map__fd %s", cfg_map_name);
+
+	i = 0;
+	bpf_object__for_each_program(prog, obj) {
+		fd = bpf_program__fd(prog);
+		if (fd < 0)
+			error(1, 0, "bpf_program__fd");
+
+		/* every program except the main one goes into the map */
+		if (fd != prog_fd) {
+			printf("%d: %s\n", i, bpf_program__title(prog, false));
+			/* a silently missing entry would be hard to diagnose */
+			if (bpf_map_update_elem(prog_array_fd, &i, &fd,
+						BPF_ANY))
+				error(1, errno, "bpf_map_update_elem %d", i);
+			++i;
+		}
+	}
+
+	ret = bpf_prog_attach(prog_fd, 0 /* Ignore */, BPF_FLOW_DISSECTOR, 0);
+	if (ret)
+		error(1, 0, "bpf_prog_attach %s", cfg_path_name);
+
+	ret = bpf_object__pin(obj, cfg_pin_path);
+	if (ret)
+		error(1, 0, "bpf_object__pin %s", cfg_pin_path);
+}
+
+/*
+ * Detach the flow dissector program and remove the pin directory
+ * cfg_pin_path. Any failure terminates the process through error().
+ */
+static void detach_program(void)
+{
+	char command[64];
+	int ret;
+
+	ret = bpf_prog_detach(0, BPF_FLOW_DISSECTOR);
+	if (ret)
+		error(1, 0, "bpf_prog_detach");
+
+	/* To unpin, it is necessary and sufficient to just remove this dir */
+	ret = snprintf(command, sizeof(command), "rm -r %s", cfg_pin_path);
+	if (ret < 0 || (size_t)ret >= sizeof(command))
+		error(1, 0, "pin path too long: %s", cfg_pin_path);
+
+	ret = system(command);
+	/*
+	 * errno is only meaningful when system() itself failed (-1); never
+	 * pass the command as the format string.
+	 */
+	if (ret)
+		error(1, ret == -1 ? errno : 0, "%s", command);
+}
+
+/*
+ * Parse the command line into the cfg_* globals.
+ *
+ *   -a         attach (default), mutually exclusive with -d
+ *   -d         detach, mutually exclusive with -a
+ *   -p <path>  BPF object file, may be given once
+ *   -s <name>  section of the main program, may be given once
+ *
+ * Attaching requires both -p and -s. Invalid usage terminates the process
+ * through error().
+ */
+static void parse_opts(int argc, char **argv)
+{
+	bool attach = false;
+	bool detach = false;
+	int c;
+
+	while ((c = getopt(argc, argv, "adp:s:")) != -1) {
+		switch (c) {
+		case 'a':
+			if (detach)
+				error(1, 0, "attach/detach are exclusive");
+			attach = true;
+			break;
+		case 'd':
+			if (attach)
+				error(1, 0, "attach/detach are exclusive");
+			detach = true;
+			break;
+		case 'p':
+			if (cfg_path_name)
+				error(1, 0, "only one prog name can be given");
+
+			cfg_path_name = optarg;
+			break;
+		case 's':
+			if (cfg_section_name)
+				error(1, 0, "only one section can be given");
+
+			cfg_section_name = optarg;
+			break;
+		default:
+			/* getopt() has already printed a diagnostic */
+			error(1, 0, "usage: %s [-a|-d] [-p path] [-s section]",
+			      argv[0]);
+		}
+	}
+
+	if (detach)
+		cfg_attach = false;
+
+	if (cfg_attach && !cfg_path_name)
+		error(1, 0, "must provide a path to the BPF program");
+
+	if (cfg_attach && !cfg_section_name)
+		error(1, 0, "must provide a section name");
+}
+
+/*
+ * Entry point: parse options, then either load+attach or detach the flow
+ * dissector depending on cfg_attach. Returns 0; failures exit via error().
+ */
+int main(int argc, char **argv)
+{
+	parse_opts(argc, argv);
+	if (cfg_attach)
+		load_and_attach_program();
+	else
+		detach_program();
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/test_flow_dissector.c b/tools/testing/selftests/bpf/test_flow_dissector.c
new file mode 100644
index 000000000000..12b784afba31
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_flow_dissector.c
@@ -0,0 +1,782 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Inject packets with all sorts of encapsulation into the kernel.
+ *
+ * IPv4/IPv6	outer layer 3
+ * GRE/GUE/BARE outer layer 4, where bare is IPIP/SIT/IPv4-in-IPv6/..
+ * IPv4/IPv6    inner layer 3
+ */
+
+#define _GNU_SOURCE
+
+#include <stddef.h>
+#include <arpa/inet.h>
+#include <asm/byteorder.h>
+#include <error.h>
+#include <errno.h>
+#include <linux/if_packet.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ipv6.h>
+#include <netinet/ip.h>
+#include <netinet/in.h>
+#include <netinet/udp.h>
+#include <poll.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#define CFG_PORT_INNER	8000
+
+/* Add some protocol definitions that do not exist in userspace */
+
+/* Minimal 4-byte GRE header; only the protocol field is ever written. */
+struct grehdr {
+	uint16_t unused;
+	uint16_t protocol;	/* stored in network byte order */
+} __attribute__((packed));
+
+/*
+ * 4-byte GUE header, accessible either field by field or as one 32-bit word.
+ * The bitfield order of the first byte depends on the bitfield endianness
+ * reported by <asm/byteorder.h>.
+ */
+struct guehdr {
+	union {
+		struct {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+			__u8	hlen:5,
+				control:1,
+				version:2;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+			__u8	version:2,
+				control:1,
+				hlen:5;
+#else
+#error  "Please fix <asm/byteorder.h>"
+#endif
+			__u8	proto_ctype;
+			__be16	flags;
+		};
+		__be32	word;
+	};
+};
+
+/*
+ * Test configuration, filled in by parse_opts(). The value shown is the
+ * default used when the matching option is absent.
+ */
+static uint8_t	cfg_dsfield_inner;
+static uint8_t	cfg_dsfield_outer;
+static uint8_t	cfg_encap_proto;
+static bool	cfg_expect_failure = false;
+static int	cfg_l3_extra = AF_UNSPEC;	/* optional SIT prefix */
+static int	cfg_l3_inner = AF_UNSPEC;
+static int	cfg_l3_outer = AF_UNSPEC;
+static int	cfg_num_pkt = 10;
+static int	cfg_num_secs = 0;
+static char	cfg_payload_char = 'a';
+static int	cfg_payload_len = 100;
+static int	cfg_port_gue = 6080;
+static bool	cfg_only_rx;
+static bool	cfg_only_tx;
+static int	cfg_src_port = 9;
+
+/* The single packet built by build_packet() and sent repeatedly. */
+static char	buf[ETH_DATA_LEN];
+
+/*
+ * Define a static sockaddr_in called 'name'; addr4 and port are given in
+ * host byte order and converted at compile time.
+ */
+#define INIT_ADDR4(name, addr4, port)				\
+	static struct sockaddr_in name = {			\
+		.sin_family = AF_INET,				\
+		.sin_port = __constant_htons(port),		\
+		.sin_addr.s_addr = __constant_htonl(addr4),	\
+	};
+
+/*
+ * Define a static sockaddr_in6 called 'name'; addr6 is an in6_addr
+ * initializer, port is given in host byte order.
+ */
+#define INIT_ADDR6(name, addr6, port)				\
+	static struct sockaddr_in6 name = {			\
+		.sin6_family = AF_INET6,			\
+		.sin6_port = __constant_htons(port),		\
+		.sin6_addr = addr6,				\
+	};
+
+/* Default IPv4 addresses for the inner, outer and extra headers. */
+INIT_ADDR4(in_daddr4, INADDR_LOOPBACK, CFG_PORT_INNER)
+INIT_ADDR4(in_saddr4, INADDR_LOOPBACK + 2, 0)
+INIT_ADDR4(out_daddr4, INADDR_LOOPBACK, 0)
+INIT_ADDR4(out_saddr4, INADDR_LOOPBACK + 1, 0)
+INIT_ADDR4(extra_daddr4, INADDR_LOOPBACK, 0)
+INIT_ADDR4(extra_saddr4, INADDR_LOOPBACK + 1, 0)
+
+/* Default IPv6 addresses: every one of them is the loopback address. */
+INIT_ADDR6(in_daddr6, IN6ADDR_LOOPBACK_INIT, CFG_PORT_INNER)
+INIT_ADDR6(in_saddr6, IN6ADDR_LOOPBACK_INIT, 0)
+INIT_ADDR6(out_daddr6, IN6ADDR_LOOPBACK_INIT, 0)
+INIT_ADDR6(out_saddr6, IN6ADDR_LOOPBACK_INIT, 0)
+INIT_ADDR6(extra_daddr6, IN6ADDR_LOOPBACK_INIT, 0)
+INIT_ADDR6(extra_saddr6, IN6ADDR_LOOPBACK_INIT, 0)
+
+/* Return the current gettimeofday() time in milliseconds. */
+static unsigned long util_gettime(void)
+{
+	struct timeval tv;
+
+	gettimeofday(&tv, NULL);
+	return (tv.tv_sec * 1000) + (tv.tv_usec / 1000);
+}
+
+/*
+ * Print "<msg>: <address>" to stderr for an IPv4 or IPv6 socket address.
+ * Exits through error() on any other family or if inet_ntop() fails.
+ */
+static void util_printaddr(const char *msg, struct sockaddr *addr)
+{
+	unsigned long off = 0;
+	char nbuf[INET6_ADDRSTRLEN];
+
+	/* locate the raw address bytes inside the family-specific sockaddr */
+	switch (addr->sa_family) {
+	case PF_INET:
+		off = __builtin_offsetof(struct sockaddr_in, sin_addr);
+		break;
+	case PF_INET6:
+		off = __builtin_offsetof(struct sockaddr_in6, sin6_addr);
+		break;
+	default:
+		error(1, 0, "printaddr: unsupported family %u\n",
+		      addr->sa_family);
+	}
+
+	if (!inet_ntop(addr->sa_family, ((void *) addr) + off, nbuf,
+		       sizeof(nbuf)))
+		error(1, errno, "inet_ntop");
+
+	fprintf(stderr, "%s: %s\n", msg, nbuf);
+}
+
+/*
+ * Return the plain (unfolded) sum of num_u16 16-bit words starting at start.
+ * The words are added as stored in memory, without byte swapping.
+ */
+static unsigned long add_csum_hword(const uint16_t *start, int num_u16)
+{
+	unsigned long sum = 0;
+	int i;
+
+	for (i = 0; i < num_u16; i++)
+		sum += start[i];
+
+	return sum;
+}
+
+/*
+ * Add num_u16 16-bit words starting at start to the running total sum, fold
+ * the carries back into the low 16 bits and return the one's complement.
+ */
+static uint16_t build_ip_csum(const uint16_t *start, int num_u16,
+			      unsigned long sum)
+{
+	sum += add_csum_hword(start, num_u16);
+
+	/* fold until nothing is left above bit 15 */
+	while (sum >> 16)
+		sum = (sum & 0xffff) + (sum >> 16);
+
+	return ~sum;
+}
+
+/*
+ * Fill in a 20-byte IPv4 header (no options) at header.
+ *
+ * src and dst are stored as passed, so they must already be in network byte
+ * order. payload_len is the number of bytes following the header.
+ *
+ * Fields that are not assigned here (frag_off, and check before the checksum
+ * is computed) are left as found; the callers in this file pass memory from
+ * the zero-initialized static buf.
+ */
+static void build_ipv4_header(void *header, uint8_t proto,
+			      uint32_t src, uint32_t dst,
+			      int payload_len, uint8_t tos)
+{
+	struct iphdr *iph = header;
+
+	iph->ihl = 5;
+	iph->version = 4;
+	iph->tos = tos;
+	iph->ttl = 8;
+	iph->tot_len = htons(sizeof(*iph) + payload_len);
+	iph->id = htons(1337);
+	iph->protocol = proto;
+	iph->saddr = src;
+	iph->daddr = dst;
+	/* ihl counts 32-bit words, so ihl << 1 is the header size in u16 */
+	iph->check = build_ip_csum((void *) iph, iph->ihl << 1, 0);
+}
+
+/*
+ * Store dsfield in bits 4..11 of the first 16 bits of the IPv6 header
+ * (taken in host order after ntohs), keeping the top and bottom nibbles.
+ */
+static void ipv6_set_dsfield(struct ipv6hdr *ip6h, uint8_t dsfield)
+{
+	uint16_t val, *ptr = (uint16_t *)ip6h;
+
+	val = ntohs(*ptr);
+	val &= 0xF00F;
+	val |= ((uint16_t) dsfield) << 4;
+	*ptr = htons(val);
+}
+
+/*
+ * Fill in an IPv6 header at header: version 6, hop limit 8, the given next
+ * header, payload length and dsfield, and the addresses from src and dst.
+ * Only the sin6_addr member of src and dst is used.
+ */
+static void build_ipv6_header(void *header, uint8_t proto,
+			      struct sockaddr_in6 *src,
+			      struct sockaddr_in6 *dst,
+			      int payload_len, uint8_t dsfield)
+{
+	struct ipv6hdr *ip6h = header;
+
+	ip6h->version = 6;
+	ip6h->payload_len = htons(payload_len);
+	ip6h->nexthdr = proto;
+	ip6h->hop_limit = 8;
+	ipv6_set_dsfield(ip6h, dsfield);
+
+	memcpy(&ip6h->saddr, &src->sin6_addr, sizeof(ip6h->saddr));
+	memcpy(&ip6h->daddr, &dst->sin6_addr, sizeof(ip6h->daddr));
+}
+
+/*
+ * Compute the UDP checksum over the IPv4 pseudo header (addresses, protocol,
+ * UDP length) followed by num_words 16-bit words starting at udph.
+ * udph->check must be zero on entry.
+ */
+static uint16_t build_udp_v4_csum(const struct iphdr *iph,
+				  const struct udphdr *udph,
+				  int num_words)
+{
+	unsigned long pseudo_sum;
+	/*
+	 * Both addresses are summed starting at saddr, i.e. 2 * sizeof(saddr)
+	 * bytes, which is sizeof(saddr) halfwords.
+	 */
+	int num_u16 = sizeof(iph->saddr);	/* halfwords: twice byte len */
+
+	pseudo_sum = add_csum_hword((void *) &iph->saddr, num_u16);
+	pseudo_sum += htons(IPPROTO_UDP);
+	pseudo_sum += udph->len;
+	return build_ip_csum((void *) udph, num_words, pseudo_sum);
+}
+
+/*
+ * Compute the UDP checksum over the IPv6 pseudo header (addresses, next
+ * header, payload length) followed by num_words 16-bit words starting at
+ * udph. udph->check must be zero on entry.
+ */
+static uint16_t build_udp_v6_csum(const struct ipv6hdr *ip6h,
+				  const struct udphdr *udph,
+				  int num_words)
+{
+	unsigned long pseudo_sum;
+	/*
+	 * Both addresses are summed starting at saddr, i.e. 2 * sizeof(saddr)
+	 * bytes, which is sizeof(saddr) halfwords.
+	 */
+	int num_u16 = sizeof(ip6h->saddr);	/* halfwords: twice byte len */
+
+	pseudo_sum = add_csum_hword((void *) &ip6h->saddr, num_u16);
+	pseudo_sum += htons(ip6h->nexthdr);
+	pseudo_sum += ip6h->payload_len;
+	return build_ip_csum((void *) udph, num_words, pseudo_sum);
+}
+
+/*
+ * Fill in a UDP header at header, with source port cfg_src_port and
+ * destination port dport, and compute its checksum.
+ *
+ * payload_len is the number of bytes that follow the UDP header. The IP
+ * header of the given family must already be built directly in front of
+ * header, because the checksum covers its pseudo header. If the datagram
+ * length is odd, the byte right after the payload is summed as padding and
+ * must be zero; build_packet() writes that zero byte.
+ */
+static void build_udp_header(void *header, int payload_len,
+			     uint16_t dport, int family)
+{
+	struct udphdr *udph = header;
+	int len = sizeof(*udph) + payload_len;
+	/* round up so that an odd trailing byte is not left out of the sum */
+	int num_u16 = (len + 1) >> 1;
+
+	udph->source = htons(cfg_src_port);
+	udph->dest = htons(dport);
+	udph->len = htons(len);
+	udph->check = 0;
+	if (family == AF_INET)
+		udph->check = build_udp_v4_csum(header - sizeof(struct iphdr),
+						udph, num_u16);
+	else
+		udph->check = build_udp_v6_csum(header - sizeof(struct ipv6hdr),
+						udph, num_u16);
+}
+
+/*
+ * Set the proto_ctype field of the GUE header at header to proto. All other
+ * fields are left as found.
+ */
+static void build_gue_header(void *header, uint8_t proto)
+{
+	struct guehdr *gueh = header;
+
+	gueh->proto_ctype = proto;
+}
+
+/*
+ * Set the protocol field of the GRE header at header to proto (given in host
+ * byte order). The other 16 bits are left as found.
+ */
+static void build_gre_header(void *header, uint16_t proto)
+{
+	struct grehdr *greh = header;
+
+	greh->protocol = htons(proto);
+}
+
+/*
+ * Return the size of the fixed IP header for family: struct iphdr for
+ * AF_INET, struct ipv6hdr for anything else.
+ */
+static int l3_length(int family)
+{
+	if (family == AF_INET)
+		return sizeof(struct iphdr);
+	else
+		return sizeof(struct ipv6hdr);
+}
+
+/*
+ * Build the test packet in buf according to the cfg_* settings and return
+ * its length in bytes.
+ *
+ * Layout, front to back (optional parts in brackets):
+ *   [extra IP] [outer IP] [GRE | UDP + GUE] inner IP, inner UDP, payload
+ *
+ * Exits through error() if the packet plus one pad byte does not fit in buf.
+ */
+static int build_packet(void)
+{
+	int ol3_len = 0, ol4_len = 0, il3_len = 0, il4_len = 0;
+	int el3_len = 0;
+
+	if (cfg_l3_extra)
+		el3_len = l3_length(cfg_l3_extra);
+
+	/* calculate header offsets */
+	if (cfg_encap_proto) {
+		ol3_len = l3_length(cfg_l3_outer);
+
+		/* any other encap value has no outer layer 4 header */
+		if (cfg_encap_proto == IPPROTO_GRE)
+			ol4_len = sizeof(struct grehdr);
+		else if (cfg_encap_proto == IPPROTO_UDP)
+			ol4_len = sizeof(struct udphdr) + sizeof(struct guehdr);
+	}
+
+	il3_len = l3_length(cfg_l3_inner);
+	il4_len = sizeof(struct udphdr);
+
+	/* '>=' keeps one byte free for the checksum pad byte written below */
+	if (el3_len + ol3_len + ol4_len + il3_len + il4_len + cfg_payload_len >=
+	    sizeof(buf))
+		error(1, 0, "packet too large\n");
+
+	/*
+	 * Fill packet from inside out, to calculate correct checksums.
+	 * But create ip before udp headers, as udp uses ip for pseudo-sum.
+	 */
+	memset(buf + el3_len + ol3_len + ol4_len + il3_len + il4_len,
+	       cfg_payload_char, cfg_payload_len);
+
+	/* add zero byte for udp csum padding */
+	buf[el3_len + ol3_len + ol4_len + il3_len + il4_len + cfg_payload_len] = 0;
+
+	/* inner IP header */
+	switch (cfg_l3_inner) {
+	case PF_INET:
+		build_ipv4_header(buf + el3_len + ol3_len + ol4_len,
+				  IPPROTO_UDP,
+				  in_saddr4.sin_addr.s_addr,
+				  in_daddr4.sin_addr.s_addr,
+				  il4_len + cfg_payload_len,
+				  cfg_dsfield_inner);
+		break;
+	case PF_INET6:
+		build_ipv6_header(buf + el3_len + ol3_len + ol4_len,
+				  IPPROTO_UDP,
+				  &in_saddr6, &in_daddr6,
+				  il4_len + cfg_payload_len,
+				  cfg_dsfield_inner);
+		break;
+	}
+
+	/* inner UDP header */
+	build_udp_header(buf + el3_len + ol3_len + ol4_len + il3_len,
+			 cfg_payload_len, CFG_PORT_INNER, cfg_l3_inner);
+
+	/* without encapsulation the packet is complete at this point */
+	if (!cfg_encap_proto)
+		return il3_len + il4_len + cfg_payload_len;
+
+	/* outer IP header */
+	switch (cfg_l3_outer) {
+	case PF_INET:
+		build_ipv4_header(buf + el3_len, cfg_encap_proto,
+				  out_saddr4.sin_addr.s_addr,
+				  out_daddr4.sin_addr.s_addr,
+				  ol4_len + il3_len + il4_len + cfg_payload_len,
+				  cfg_dsfield_outer);
+		break;
+	case PF_INET6:
+		build_ipv6_header(buf + el3_len, cfg_encap_proto,
+				  &out_saddr6, &out_daddr6,
+				  ol4_len + il3_len + il4_len + cfg_payload_len,
+				  cfg_dsfield_outer);
+		break;
+	}
+
+	/* outer layer 4 header, if the encapsulation has one */
+	switch (cfg_encap_proto) {
+	case IPPROTO_UDP:
+		/* GUE sits at the end of the outer L4 area, after UDP */
+		build_gue_header(buf + el3_len + ol3_len + ol4_len -
+				 sizeof(struct guehdr),
+				 cfg_l3_inner == PF_INET ? IPPROTO_IPIP
+							 : IPPROTO_IPV6);
+		build_udp_header(buf + el3_len + ol3_len,
+				 sizeof(struct guehdr) + il3_len + il4_len +
+				 cfg_payload_len,
+				 cfg_port_gue, cfg_l3_outer);
+		break;
+	case IPPROTO_GRE:
+		build_gre_header(buf + el3_len + ol3_len,
+				 cfg_l3_inner == PF_INET ? ETH_P_IP
+							 : ETH_P_IPV6);
+		break;
+	}
+
+	/* optional extra IP header at the very front of the packet */
+	switch (cfg_l3_extra) {
+	case PF_INET:
+		build_ipv4_header(buf,
+				  cfg_l3_outer == PF_INET ? IPPROTO_IPIP
+							  : IPPROTO_IPV6,
+				  extra_saddr4.sin_addr.s_addr,
+				  extra_daddr4.sin_addr.s_addr,
+				  ol3_len + ol4_len + il3_len + il4_len +
+				  cfg_payload_len, 0);
+		break;
+	case PF_INET6:
+		build_ipv6_header(buf,
+				  cfg_l3_outer == PF_INET ? IPPROTO_IPIP
+							  : IPPROTO_IPV6,
+				  &extra_saddr6, &extra_daddr6,
+				  ol3_len + ol4_len + il3_len + il4_len +
+				  cfg_payload_len, 0);
+		break;
+	}
+
+	return el3_len + ol3_len + ol4_len + il3_len + il4_len +
+	       cfg_payload_len;
+}
+
+/*
+ * Create the transmit socket: a raw socket (IPPROTO_RAW) in the family of
+ * the outermost header that will be built, connected to the destination
+ * address of that header. Returns the fd; exits through error() on failure.
+ */
+static int setup_tx(void)
+{
+	int family, fd, ret;
+
+	/* outermost header present: extra, else outer, else inner */
+	if (cfg_l3_extra)
+		family = cfg_l3_extra;
+	else if (cfg_l3_outer)
+		family = cfg_l3_outer;
+	else
+		family = cfg_l3_inner;
+
+	fd = socket(family, SOCK_RAW, IPPROTO_RAW);
+	if (fd == -1)
+		error(1, errno, "socket tx");
+
+	if (cfg_l3_extra) {
+		/* extra header in front: connect to its destination */
+		if (cfg_l3_extra == PF_INET)
+			ret = connect(fd, (void *) &extra_daddr4,
+				      sizeof(extra_daddr4));
+		else
+			ret = connect(fd, (void *) &extra_daddr6,
+				      sizeof(extra_daddr6));
+		if (ret)
+			error(1, errno, "connect tx");
+	} else if (cfg_l3_outer) {
+		/* encapsulated: connect to the outer destination */
+		if (cfg_l3_outer == PF_INET)
+			ret = connect(fd, (void *) &out_daddr4,
+				      sizeof(out_daddr4));
+		else
+			ret = connect(fd, (void *) &out_daddr6,
+				      sizeof(out_daddr6));
+		if (ret)
+			error(1, errno, "connect tx");
+	} else {
+		/* not encapsulated: connect to the inner destination */
+		if (cfg_l3_inner == PF_INET)
+			ret = connect(fd, (void *) &in_daddr4,
+				      sizeof(in_daddr4));
+		else
+			ret = connect(fd, (void *) &in_daddr6,
+				      sizeof(in_daddr6));
+		if (ret)
+			error(1, errno, "connect tx");
+	}
+
+	return fd;
+}
+
+/*
+ * receiver reads unencapsulated UDP
+ *
+ * Create a datagram socket in the inner family and bind it to the inner
+ * destination address. Returns the fd; exits through error() on failure.
+ */
+static int setup_rx(void)
+{
+	int fd, ret;
+
+	fd = socket(cfg_l3_inner, SOCK_DGRAM, 0);
+	if (fd == -1)
+		error(1, errno, "socket rx");
+
+	if (cfg_l3_inner == PF_INET)
+		ret = bind(fd, (void *) &in_daddr4, sizeof(in_daddr4));
+	else
+		ret = bind(fd, (void *) &in_daddr6, sizeof(in_daddr6));
+	if (ret)
+		error(1, errno, "bind rx");
+
+	return fd;
+}
+
+/*
+ * Write one packet of len bytes to fd. Returns 1, the number of packets
+ * sent; exits through error() on a failed or short write.
+ */
+static int do_tx(int fd, const char *pkt, int len)
+{
+	int ret;
+
+	ret = write(fd, pkt, len);
+	if (ret == -1)
+		error(1, errno, "send");
+	/* a short write does not set errno, so do not report a stale one */
+	if (ret != len)
+		error(1, 0, "send: len (%d < %d)\n", ret, len);
+
+	return 1;
+}
+
+/*
+ * Wait up to timeout milliseconds for events on fd. Returns the poll()
+ * result (0 on timeout); exits through error() if poll() fails or reports
+ * an event without POLLIN.
+ */
+static int do_poll(int fd, short events, int timeout)
+{
+	struct pollfd pfd;
+	int ret;
+
+	pfd.fd = fd;
+	pfd.events = events;
+	pfd.revents = 0;
+
+	ret = poll(&pfd, 1, timeout);
+	if (ret == -1)
+		error(1, errno, "poll");
+	/* poll() succeeded here, so errno carries no information */
+	if (ret && !(pfd.revents & POLLIN))
+		error(1, 0, "poll: unexpected event 0x%x\n", pfd.revents);
+
+	return ret;
+}
+
+/*
+ * Drain fd without blocking and return the number of datagrams read. The
+ * first byte of every non-empty datagram must equal cfg_payload_char.
+ * Exits through error() on a receive error or a payload mismatch.
+ */
+static int do_rx(int fd)
+{
+	char rbuf = 0;
+	int ret, num = 0;
+
+	while (1) {
+		ret = recv(fd, &rbuf, 1, MSG_DONTWAIT);
+		if (ret == -1 && errno == EAGAIN)
+			break;
+		if (ret == -1)
+			error(1, errno, "recv");
+		/* an empty datagram writes nothing to rbuf: nothing to check */
+		if (ret > 0 && rbuf != cfg_payload_char)
+			error(1, 0, "recv: payload mismatch");
+		num++;
+	}
+
+	return num;
+}
+
+/*
+ * Run the test: build the packet once, then send it and/or read replies
+ * until cfg_num_pkt packets were sent or, if cfg_num_secs is set, until
+ * that many seconds have passed (with a progress line every second).
+ *
+ * Returns 0 on success: all sent packets were received, or none were
+ * received if cfg_expect_failure is set. In tx-only mode there is nothing
+ * to compare, so the result is 0 once all packets were sent.
+ *
+ * NOTE(review): in rx-only mode without -t the main loop has no exit
+ * condition, because tx never reaches cfg_num_pkt.
+ */
+static int do_main(void)
+{
+	unsigned long tstop, treport, tcur;
+	int fdt = -1, fdr = -1, len, tx = 0, rx = 0;
+
+	if (!cfg_only_tx)
+		fdr = setup_rx();
+	if (!cfg_only_rx)
+		fdt = setup_tx();
+
+	len = build_packet();
+
+	tcur = util_gettime();
+	treport = tcur + 1000;
+	tstop = tcur + (cfg_num_secs * 1000);
+
+	while (1) {
+		if (!cfg_only_rx)
+			tx += do_tx(fdt, buf, len);
+
+		if (!cfg_only_tx)
+			rx += do_rx(fdr);
+
+		if (cfg_num_secs) {
+			tcur = util_gettime();
+			if (tcur >= tstop)
+				break;
+			if (tcur >= treport) {
+				fprintf(stderr, "pkts: tx=%d rx=%d\n", tx, rx);
+				tx = 0;
+				rx = 0;
+				treport = tcur + 1000;
+			}
+		} else {
+			if (tx == cfg_num_pkt)
+				break;
+		}
+	}
+
+	/*
+	 * read straggler packets, if any; without an rx socket (tx-only)
+	 * there is nothing to read and recv() on fd -1 would abort the test
+	 */
+	if (!cfg_only_tx && rx < tx) {
+		tstop = util_gettime() + 100;
+		while (rx < tx) {
+			tcur = util_gettime();
+			if (tcur >= tstop)
+				break;
+
+			do_poll(fdr, POLLIN, tstop - tcur);
+			rx += do_rx(fdr);
+		}
+	}
+
+	fprintf(stderr, "pkts: tx=%d rx=%d\n", tx, rx);
+
+	if (fdr != -1 && close(fdr))
+		error(1, errno, "close rx");
+	if (fdt != -1 && close(fdt))
+		error(1, errno, "close tx");
+
+	/* nothing was meant to be received, so there is nothing to compare */
+	if (cfg_only_tx)
+		return 0;
+
+	/*
+	 * success (== 0) only if received all packets
+	 * unless failure is expected, in which case none must arrive.
+	 */
+	if (cfg_expect_failure)
+		return rx != 0;
+	else
+		return rx != tx;
+}
+
+
+/* Print the command line synopsis to stderr and exit with status 1. */
+static void __attribute__((noreturn)) usage(const char *filepath)
+{
+	fprintf(stderr, "Usage: %s [-e gre|gue|bare|none] [-i 4|6] [-l len] "
+			"[-O 4|6] [-o 4|6] [-n num] [-t secs] [-R] [-T] "
+			"[-s <osrc>] [-d <odst>] [-S <isrc>] [-D <idst>] "
+			"[-x <otos>] [-X <itos>] [-f <isport>] [-F] [-h]\n",
+		filepath);
+	exit(1);
+}
+
+/*
+ * Convert the textual address optarg of the given family into binary form
+ * at addr using inet_pton(). Exits through error() if the conversion fails
+ * or the string is not a valid address.
+ */
+static void parse_addr(int family, void *addr, const char *optarg)
+{
+	int ret;
+
+	ret = inet_pton(family, optarg, addr);
+	if (ret == -1)
+		error(1, errno, "inet_pton");
+	if (ret == 0)
+		error(1, 0, "inet_pton: bad string");
+}
+
+/* Parse optarg as an IPv4 address into addr->sin_addr. */
+static void parse_addr4(struct sockaddr_in *addr, const char *optarg)
+{
+	parse_addr(AF_INET, &addr->sin_addr, optarg);
+}
+
+/* Parse optarg as an IPv6 address into addr->sin6_addr. */
+static void parse_addr6(struct sockaddr_in6 *addr, const char *optarg)
+{
+	parse_addr(AF_INET6, &addr->sin6_addr, optarg);
+}
+
+/*
+ * Map the option argument "4" or "6" to PF_INET or PF_INET6. Any other
+ * string prints the usage text and exits; usage() does not return, which is
+ * why no return statement follows it.
+ */
+static int parse_protocol_family(const char *filepath, const char *optarg)
+{
+	if (!strcmp(optarg, "4"))
+		return PF_INET;
+	if (!strcmp(optarg, "6"))
+		return PF_INET6;
+
+	usage(filepath);
+}
+
+/*
+ * Parse the command line into the cfg_* globals and the address structures,
+ * then validate the combination.
+ *
+ * Address options depend on the family chosen earlier on the command line:
+ * -s and -d (outer source/destination) must follow -o, -S and -D (inner
+ * source/destination) must follow -i. Unknown options print the usage text.
+ *
+ * After parsing:
+ *  - rx-only and tx-only are mutually exclusive
+ *  - an encapsulation requires -o; -o and -O require an encapsulation
+ *  - the inner family defaults to IPv6
+ *  - "bare" encapsulation of an IPv6 inner packet uses IPPROTO_IPV6
+ *  - failure is expected when the outer dsfield carries CE and the inner
+ *    one cannot encode ECN
+ */
+static void parse_opts(int argc, char **argv)
+{
+	int c;
+
+	while ((c = getopt(argc, argv, "d:D:e:f:Fhi:l:n:o:O:Rs:S:t:Tx:X:")) != -1) {
+		switch (c) {
+		case 'd':
+			if (cfg_l3_outer == AF_UNSPEC)
+				error(1, 0, "-d must be preceded by -o");
+			if (cfg_l3_outer == AF_INET)
+				parse_addr4(&out_daddr4, optarg);
+			else
+				parse_addr6(&out_daddr6, optarg);
+			break;
+		case 'D':
+			if (cfg_l3_inner == AF_UNSPEC)
+				error(1, 0, "-D must be preceded by -i");
+			if (cfg_l3_inner == AF_INET)
+				parse_addr4(&in_daddr4, optarg);
+			else
+				parse_addr6(&in_daddr6, optarg);
+			break;
+		case 'e':
+			if (!strcmp(optarg, "gre"))
+				cfg_encap_proto = IPPROTO_GRE;
+			else if (!strcmp(optarg, "gue"))
+				cfg_encap_proto = IPPROTO_UDP;
+			else if (!strcmp(optarg, "bare"))
+				cfg_encap_proto = IPPROTO_IPIP;
+			else if (!strcmp(optarg, "none"))
+				cfg_encap_proto = IPPROTO_IP;	/* == 0 */
+			else
+				usage(argv[0]);
+			break;
+		case 'f':
+			cfg_src_port = strtol(optarg, NULL, 0);
+			break;
+		case 'F':
+			cfg_expect_failure = true;
+			break;
+		case 'h':
+			usage(argv[0]);
+			break;
+		case 'i':
+			if (!strcmp(optarg, "4"))
+				cfg_l3_inner = PF_INET;
+			else if (!strcmp(optarg, "6"))
+				cfg_l3_inner = PF_INET6;
+			else
+				usage(argv[0]);
+			break;
+		case 'l':
+			cfg_payload_len = strtol(optarg, NULL, 0);
+			break;
+		case 'n':
+			cfg_num_pkt = strtol(optarg, NULL, 0);
+			break;
+		case 'o':
+			cfg_l3_outer = parse_protocol_family(argv[0], optarg);
+			break;
+		case 'O':
+			cfg_l3_extra = parse_protocol_family(argv[0], optarg);
+			break;
+		case 'R':
+			cfg_only_rx = true;
+			break;
+		case 's':
+			/* same rule as -d: the family must be known first */
+			if (cfg_l3_outer == AF_UNSPEC)
+				error(1, 0, "-s must be preceded by -o");
+			if (cfg_l3_outer == AF_INET)
+				parse_addr4(&out_saddr4, optarg);
+			else
+				parse_addr6(&out_saddr6, optarg);
+			break;
+		case 'S':
+			/* same rule as -D: the family must be known first */
+			if (cfg_l3_inner == AF_UNSPEC)
+				error(1, 0, "-S must be preceded by -i");
+			if (cfg_l3_inner == AF_INET)
+				parse_addr4(&in_saddr4, optarg);
+			else
+				parse_addr6(&in_saddr6, optarg);
+			break;
+		case 't':
+			cfg_num_secs = strtol(optarg, NULL, 0);
+			break;
+		case 'T':
+			cfg_only_tx = true;
+			break;
+		case 'x':
+			cfg_dsfield_outer = strtol(optarg, NULL, 0);
+			break;
+		case 'X':
+			cfg_dsfield_inner = strtol(optarg, NULL, 0);
+			break;
+		default:
+			/* unknown option or missing argument */
+			usage(argv[0]);
+		}
+	}
+
+	if (cfg_only_rx && cfg_only_tx)
+		error(1, 0, "options: cannot combine rx-only and tx-only");
+
+	if (cfg_encap_proto && cfg_l3_outer == AF_UNSPEC)
+		error(1, 0, "options: must specify outer with encap");
+	else if ((!cfg_encap_proto) && cfg_l3_outer != AF_UNSPEC)
+		error(1, 0, "options: cannot combine no-encap and outer");
+	else if ((!cfg_encap_proto) && cfg_l3_extra != AF_UNSPEC)
+		error(1, 0, "options: cannot combine no-encap and extra");
+
+	if (cfg_l3_inner == AF_UNSPEC)
+		cfg_l3_inner = AF_INET6;
+	if (cfg_l3_inner == AF_INET6 && cfg_encap_proto == IPPROTO_IPIP)
+		cfg_encap_proto = IPPROTO_IPV6;
+
+	/* RFC 6040 4.2:
+	 *   on decap, if outer encountered congestion (CE == 0x3),
+	 *   but inner cannot encode ECN (NoECT == 0x0), then drop packet.
+	 */
+	if (((cfg_dsfield_outer & 0x3) == 0x3) &&
+	    ((cfg_dsfield_inner & 0x3) == 0x0))
+		cfg_expect_failure = true;
+}
+
+/*
+ * Print the configured addresses to stderr: always the inner pair, then the
+ * encapsulation protocol and the outer pair if an outer header is used,
+ * then the extra pair if an extra header is used. Each pair is printed in
+ * the family of its own header.
+ */
+static void print_opts(void)
+{
+	if (cfg_l3_inner == PF_INET6) {
+		util_printaddr("inner.dest6", (void *) &in_daddr6);
+		util_printaddr("inner.source6", (void *) &in_saddr6);
+	} else {
+		util_printaddr("inner.dest4", (void *) &in_daddr4);
+		util_printaddr("inner.source4", (void *) &in_saddr4);
+	}
+
+	if (!cfg_l3_outer)
+		return;
+
+	fprintf(stderr, "encap proto:   %u\n", cfg_encap_proto);
+
+	if (cfg_l3_outer == PF_INET6) {
+		util_printaddr("outer.dest6", (void *) &out_daddr6);
+		util_printaddr("outer.source6", (void *) &out_saddr6);
+	} else {
+		util_printaddr("outer.dest4", (void *) &out_daddr4);
+		util_printaddr("outer.source4", (void *) &out_saddr4);
+	}
+
+	if (!cfg_l3_extra)
+		return;
+
+	/* select by the extra header's own family, not the outer one */
+	if (cfg_l3_extra == PF_INET6) {
+		util_printaddr("extra.dest6", (void *) &extra_daddr6);
+		util_printaddr("extra.source6", (void *) &extra_saddr6);
+	} else {
+		util_printaddr("extra.dest4", (void *) &extra_daddr4);
+		util_printaddr("extra.source4", (void *) &extra_saddr4);
+	}
+}
+
+/*
+ * Entry point: parse and print the options, then run the test. The exit
+ * status is the result of do_main() (0 on success).
+ */
+int main(int argc, char **argv)
+{
+	parse_opts(argc, argv);
+	print_opts();
+	return do_main();
+}
diff --git a/tools/testing/selftests/bpf/test_flow_dissector.sh b/tools/testing/selftests/bpf/test_flow_dissector.sh
new file mode 100755
index 000000000000..c0fb073b5eab
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_flow_dissector.sh
@@ -0,0 +1,115 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Load BPF flow dissector and verify it correctly dissects traffic
+export TESTNAME=test_flow_dissector
+unmount=0
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+msg="skip all tests:"
+if [ $UID != 0 ]; then
+	echo $msg please run this as root >&2
+	exit $ksft_skip
+fi
+
+# This test needs to be run in a network namespace with in_netns.sh. Check if
+# this is the case and run it with in_netns.sh if it is being run in the root
+# namespace.
+if [[ -z $(ip netns identify $$) ]]; then
+	../net/in_netns.sh "$0" "$@"
+	exit $?
+fi
+
+# Determine selftest success via shell exit code
+# Report PASS or FAILED based on the exit status seen on entry, then undo the
+# test setup: remove the tc filter and ingress qdisc from lo, detach the flow
+# dissector and unmount bpffs if this script mounted it.
+exit_handler()
+{
+	# $? must be read first, before any other command overwrites it
+	if (( $? == 0 )); then
+		echo "selftests: $TESTNAME [PASS]";
+	else
+		echo "selftests: $TESTNAME [FAILED]";
+	fi
+
+	# cleanup is best effort: do not abort on the first failing command
+	set +e
+
+	# Cleanup
+	tc filter del dev lo ingress pref 1337 2> /dev/null
+	tc qdisc del dev lo ingress 2> /dev/null
+	./flow_dissector_load -d 2> /dev/null
+	if [ $unmount -ne 0 ]; then
+		umount bpffs 2> /dev/null
+	fi
+}
+
+# Exit script immediately (the exit is caught by the trap handler) if any
+# program/thing exits with a non-zero status.
+set -e
+
+# (Use 'trap -l' to list meaning of numbers)
+trap exit_handler 0 2 3 6 9
+
+# Mount BPF file system
+if /bin/mount | grep /sys/fs/bpf > /dev/null; then
+	echo "bpffs already mounted"
+else
+	echo "bpffs not mounted. Mounting..."
+	unmount=1
+	/bin/mount bpffs /sys/fs/bpf -t bpf
+fi
+
+# Attach BPF program
+./flow_dissector_load -p bpf_flow.o -s dissect
+
+# Setup
+tc qdisc add dev lo ingress
+
+echo "Testing IPv4..."
+# Drops all IP/UDP packets coming from port 9
+tc filter add dev lo parent ffff: protocol ip pref 1337 flower ip_proto \
+	udp src_port 9 action drop
+
+# Send 10 IPv4/UDP packets from port 8. Filter should not drop any.
+./test_flow_dissector -i 4 -f 8
+# Send 10 IPv4/UDP packets from port 9. Filter should drop all.
+./test_flow_dissector -i 4 -f 9 -F
+# Send 10 IPv4/UDP packets from port 10. Filter should not drop any.
+./test_flow_dissector -i 4 -f 10
+
+echo "Testing IPIP..."
+# Send 10 IPv4/IPv4/UDP packets from port 8. Filter should not drop any.
+./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e bare -i 4 \
+	-D 192.168.0.1 -S 1.1.1.1 -f 8
+# Send 10 IPv4/IPv4/UDP packets from port 9. Filter should drop all.
+./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e bare -i 4 \
+	-D 192.168.0.1 -S 1.1.1.1 -f 9 -F
+# Send 10 IPv4/IPv4/UDP packets from port 10. Filter should not drop any.
+./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e bare -i 4 \
+	-D 192.168.0.1 -S 1.1.1.1 -f 10
+
+echo "Testing IPv4 + GRE..."
+# Send 10 IPv4/GRE/IPv4/UDP packets from port 8. Filter should not drop any.
+./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e gre -i 4 \
+	-D 192.168.0.1 -S 1.1.1.1 -f 8
+# Send 10 IPv4/GRE/IPv4/UDP packets from port 9. Filter should drop all.
+./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e gre -i 4 \
+	-D 192.168.0.1 -S 1.1.1.1 -f 9 -F
+# Send 10 IPv4/GRE/IPv4/UDP packets from port 10. Filter should not drop any.
+./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e gre -i 4 \
+	-D 192.168.0.1 -S 1.1.1.1 -f 10
+
+tc filter del dev lo ingress pref 1337
+
+echo "Testing IPv6..."
+# Drops all IPv6/UDP packets coming from port 9
+tc filter add dev lo parent ffff: protocol ipv6 pref 1337 flower ip_proto \
+	udp src_port 9 action drop
+
+# Send 10 IPv6/UDP packets from port 8. Filter should not drop any.
+./test_flow_dissector -i 6 -f 8
+# Send 10 IPv6/UDP packets from port 9. Filter should drop all.
+./test_flow_dissector -i 6 -f 9 -F
+# Send 10 IPv6/UDP packets from port 10. Filter should not drop any.
+./test_flow_dissector -i 6 -f 10
+
+exit 0
diff --git a/tools/testing/selftests/bpf/with_addr.sh b/tools/testing/selftests/bpf/with_addr.sh
new file mode 100755
index 000000000000..ffcd3953f94c
--- /dev/null
+++ b/tools/testing/selftests/bpf/with_addr.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# add private ipv4 and ipv6 addresses to loopback
+
+readonly V6_INNER='100::a/128'
+readonly V4_INNER='192.168.0.1/32'
+
+if getopts ":s" opt; then
+  readonly SIT_DEV_NAME='sixtofourtest0'
+  readonly V6_SIT='2::/64'
+  readonly V4_SIT='172.17.0.1/32'
+  shift
+fi
+
+# Print "error: <arguments>" to stderr and exit with status 1.
+fail() {
+  echo "error: $*" 1>&2
+  exit 1
+}
+
+# Add the inner test addresses to lo. If V6_SIT is set (only when the script
+# was started with -s), also create the sit device, bring it up and give it
+# its v6 and v4 addresses. Any failing step aborts through fail().
+setup() {
+  ip -6 addr add "${V6_INNER}" dev lo || fail 'failed to setup v6 address'
+  ip -4 addr add "${V4_INNER}" dev lo || fail 'failed to setup v4 address'
+
+  if [[ -n "${V6_SIT}" ]]; then
+    ip link add "${SIT_DEV_NAME}" type sit remote any local any \
+	    || fail 'failed to add sit'
+    ip link set dev "${SIT_DEV_NAME}" up \
+	    || fail 'failed to bring sit device up'
+    ip -6 addr add "${V6_SIT}" dev "${SIT_DEV_NAME}" \
+	    || fail 'failed to setup v6 SIT address'
+    ip -4 addr add "${V4_SIT}" dev "${SIT_DEV_NAME}" \
+	    || fail 'failed to setup v4 SIT address'
+  fi
+
+  sleep 2	# avoid race causing bind to fail
+}
+
+# Undo setup(): remove the sit addresses and device if they were configured,
+# then remove the inner addresses from lo. Failures are not checked.
+cleanup() {
+  if [[ -n "${V6_SIT}" ]]; then
+    ip -4 addr del "${V4_SIT}" dev "${SIT_DEV_NAME}"
+    ip -6 addr del "${V6_SIT}" dev "${SIT_DEV_NAME}"
+    ip link del "${SIT_DEV_NAME}"
+  fi
+
+  ip -4 addr del "${V4_INNER}" dev lo
+  ip -6 addr del "${V6_INNER}" dev lo
+}
+
+trap cleanup EXIT
+
+setup
+"$@"
+exit "$?"
diff --git a/tools/testing/selftests/bpf/with_tunnels.sh b/tools/testing/selftests/bpf/with_tunnels.sh
new file mode 100755
index 000000000000..e24949ed3a20
--- /dev/null
+++ b/tools/testing/selftests/bpf/with_tunnels.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# setup tunnels for flow dissection test
+
+readonly SUFFIX="test_$(mktemp -u XXXX)"
+CONFIG="remote 127.0.0.2 local 127.0.0.1 dev lo"
+
+setup() {
+  ip link add "ipip_${SUFFIX}" type ipip ${CONFIG}
+  ip link add "gre_${SUFFIX}" type gre ${CONFIG}
+  ip link add "sit_${SUFFIX}" type sit ${CONFIG}
+
+  echo "tunnels before test:"
+  ip tunnel show
+
+  ip link set "ipip_${SUFFIX}" up
+  ip link set "gre_${SUFFIX}" up
+  ip link set "sit_${SUFFIX}" up
+}
+
+
+cleanup() {
+  ip tunnel del "ipip_${SUFFIX}"
+  ip tunnel del "gre_${SUFFIX}"
+  ip tunnel del "sit_${SUFFIX}"
+
+  echo "tunnels after test:"
+  ip tunnel show
+}
+
+trap cleanup EXIT
+
+setup
+"$@"
+exit "$?"
-- 
2.19.0.rc0.228.g281dcd1b4d0-goog

^ permalink raw reply related

* [bpf-next 2/3] flow_dissector: implements eBPF parser
From: Petar Penkov @ 2018-08-30 18:23 UTC (permalink / raw)
  To: netdev
  Cc: davem, ast, daniel, simon.horman, ecree, songliubraving, tom,
	Petar Penkov, Willem de Bruijn
In-Reply-To: <20180830182301.89435-1-peterpenkov96@gmail.com>

From: Petar Penkov <ppenkov@google.com>

This eBPF program extracts basic/control/ip address/ports keys from
incoming packets. It supports recursive parsing for IP encapsulation,
and VLAN, along with IPv4/IPv6 and extension headers.  This program is
meant to show how flow dissection and key extraction can be done in
eBPF.

Link: http://vger.kernel.org/netconf2017_files/rx_hardening_and_udp_gso.pdf
Signed-off-by: Petar Penkov <ppenkov@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 tools/testing/selftests/bpf/Makefile   |   2 +-
 tools/testing/selftests/bpf/bpf_flow.c | 390 +++++++++++++++++++++++++
 2 files changed, 391 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/bpf_flow.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index fff7fb1285fc..e65f50f9185e 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -35,7 +35,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \
 	test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \
 	get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \
-	test_skb_cgroup_id_kern.o
+	test_skb_cgroup_id_kern.o bpf_flow.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
diff --git a/tools/testing/selftests/bpf/bpf_flow.c b/tools/testing/selftests/bpf/bpf_flow.c
new file mode 100644
index 000000000000..6d8388735d1d
--- /dev/null
+++ b/tools/testing/selftests/bpf/bpf_flow.c
@@ -0,0 +1,390 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <limits.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/pkt_cls.h>
+#include <linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/icmp.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_packet.h>
+#include <sys/socket.h>
+#include <linux/if_tunnel.h>
+#include <linux/mpls.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+int _version SEC("version") = 1;
+/* PROG(F) opens the definition of a tail-called stage: it places the function
+ * in an ELF section named after F and names the function bpf_func_F.
+ */
+#define PROG(F) SEC(#F) int bpf_func_##F
+
+/* These are the identifiers of the BPF programs that will be used in tail
+ * calls; each value is the index passed to bpf_tail_call() on jmp_table.
+ * Name is limited to 16 characters, with the terminating character and
+ * bpf_func_ above, we have only 6 to work with, anything after will be cropped.
+ */
+enum {
+	IP,
+	IPV6,
+	IPV6OP,	/* Destination/Hop-by-Hop Options IPv6 Extension header */
+	IPV6FR,	/* Fragmentation IPv6 Extension Header */
+	MPLS,
+	VLAN,
+};
+
+/* Host-order masks for the fragment fields; they are converted with
+ * bpf_htons() before being applied to the on-wire frag_off values.
+ * IP_MF/IP_OFFSET apply to the IPv4 header, IP6_MF/IP6_OFFSET to the IPv6
+ * fragment extension header.
+ */
+#define IP_MF		0x2000
+#define IP_OFFSET	0x1FFF
+#define IP6_MF		0x0001
+#define IP6_OFFSET	0xFFF8
+
+/* VLAN tag as it follows the ethertype that announced it */
+struct vlan_hdr {
+	__be16 h_vlan_TCI;
+	__be16 h_vlan_encapsulated_proto;
+};
+
+/* Fixed first four bytes of a GRE header; optional checksum, key and
+ * sequence fields follow depending on the flags.
+ */
+struct gre_hdr {
+	__be16 flags;
+	__be16 proto;
+};
+
+/* IPv6 fragment extension header */
+struct frag_hdr {
+	__u8 nexthdr;
+	__u8 reserved;
+	__be16 frag_off;
+	__be32 identification;
+};
+
+/* Program array used for the tail calls between dissection stages; the keys
+ * are the values of the stage enum above.
+ * NOTE(review): presumably filled in by the user-space loader - confirm.
+ */
+struct bpf_map_def SEC("maps") jmp_table = {
+	.type = BPF_MAP_TYPE_PROG_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(__u32),
+	.max_entries = 8
+};
+
+/* Layout this program imposes on skb->cb: nhoff is the offset, from
+ * skb->data, of the next header to parse. It is carried across tail calls
+ * and advanced by each stage.
+ */
+struct bpf_dissect_cb {
+	__u16 nhoff;
+};
+
+/* Returns a pointer to the header located at the current dissection offset
+ * (nhoff in skb->cb), or NULL when hdr_size bytes starting at that offset do
+ * not fit before skb->data_end. Does not advance nhoff.
+ */
+static __always_inline void *bpf_flow_dissect_get_header(struct __sk_buff *skb,
+							 __u16 hdr_size)
+{
+	struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
+	void *data_end = (__u8 *)(long)skb->data_end;
+	void *data = (__u8 *)(long)skb->data;
+	__u8 *hdr;
+
+	/* Verifies this variable offset does not overflow, i.e. that
+	 * nhoff + hdr_size still fits in 16 bits
+	 */
+	if (cb->nhoff > (USHRT_MAX - hdr_size))
+		return NULL;
+
+	hdr = data + cb->nhoff;
+	if (hdr + hdr_size > data_end)
+		return NULL;
+
+	return hdr;
+}
+
+/* Dispatches on ETHERTYPE: records proto (network byte order) in
+ * keys->n_proto and tail-calls the stage that parses that protocol.
+ * A successful bpf_tail_call() does not return, so every path that reaches a
+ * return statement here (unsupported protocol, or a failed tail call) yields
+ * BPF_DROP.
+ */
+static __always_inline int parse_eth_proto(struct __sk_buff *skb, __be16 proto)
+{
+	struct bpf_flow_keys *keys = (struct bpf_flow_keys *)(long)(skb->flow_keys);
+	keys->n_proto = proto;
+
+	switch (proto) {
+	case bpf_htons(ETH_P_IP):
+		bpf_tail_call(skb, &jmp_table, IP);
+		break;
+	case bpf_htons(ETH_P_IPV6):
+		bpf_tail_call(skb, &jmp_table, IPV6);
+		break;
+	case bpf_htons(ETH_P_MPLS_MC):
+	case bpf_htons(ETH_P_MPLS_UC):
+		bpf_tail_call(skb, &jmp_table, MPLS);
+		break;
+	case bpf_htons(ETH_P_8021Q):
+	case bpf_htons(ETH_P_8021AD):
+		bpf_tail_call(skb, &jmp_table, VLAN);
+		break;
+	default:
+		/* Protocol not supported */
+		return BPF_DROP;
+	}
+
+	return BPF_DROP;
+}
+
+/* Copies the two 16-bit ports found at the current dissection offset into
+ * keys->sport and keys->dport, unchanged (network byte order).
+ * No bounds check is done here: both callers first obtain the TCP/UDP header
+ * through bpf_flow_dissect_get_header().
+ */
+static __always_inline void write_ports(struct __sk_buff *skb)
+{
+	struct bpf_flow_keys *keys = (struct bpf_flow_keys *)(long)(skb->flow_keys);
+	struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
+	void *data = (__u8 *)(long)skb->data;
+	__be16 *ports = data + cb->nhoff; /* TCP/UDP start with the ports */
+
+	keys->sport = ports[0];
+	keys->dport = ports[1];
+}
+
+/* Entry point: start dissection from the VLAN protocol when the skb carries
+ * a VLAN tag in its metadata, otherwise from the skb's own protocol.
+ */
+SEC("dissect")
+int dissect(struct __sk_buff *skb)
+{
+	if (skb->vlan_present)
+		return parse_eth_proto(skb, skb->vlan_proto);
+
+	return parse_eth_proto(skb, skb->protocol);
+}
+
+/* Parses on IPPROTO_*: records proto in keys->ip_proto and handles the
+ * header found at the current dissection offset.
+ * Returns BPF_OK once dissection is complete (ICMP, TCP, UDP/UDP-Lite,
+ * non-version-0 GRE), continues with the inner packet for IPIP, IPv6-in-IP
+ * and GRE, and returns BPF_DROP for truncated headers and for protocols it
+ * does not know.
+ */
+static __always_inline int parse_ip_proto(struct __sk_buff *skb, __u8 proto)
+{
+	struct bpf_flow_keys *keys = (struct bpf_flow_keys *)(long)(skb->flow_keys);
+	struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
+	void *data_end = (void *)(long)skb->data_end;
+	struct icmphdr *icmp;
+	struct gre_hdr *gre;
+	struct ethhdr *eth;
+	struct tcphdr *tcp;
+	struct udphdr *udp;
+
+	keys->ip_proto = proto;
+	switch (proto) {
+	case IPPROTO_ICMP:
+		/* Only checks that a full ICMP header is present */
+		icmp = bpf_flow_dissect_get_header(skb, sizeof(*icmp));
+		if (!icmp)
+			return BPF_DROP;
+		return BPF_OK;
+	case IPPROTO_IPIP:
+		keys->is_encap = true;
+		return parse_eth_proto(skb, bpf_htons(ETH_P_IP));
+	case IPPROTO_IPV6:
+		keys->is_encap = true;
+		return parse_eth_proto(skb, bpf_htons(ETH_P_IPV6));
+	case IPPROTO_GRE:
+		gre = bpf_flow_dissect_get_header(skb, sizeof(*gre));
+		if (!gre)
+			return BPF_DROP;
+
+		/* The masked value is only tested for being non-zero, so the
+		 * byte swap does not change the outcome.
+		 */
+		if (bpf_htons(gre->flags & GRE_VERSION))
+			/* Only inspect standard GRE packets with version 0 */
+			return BPF_OK;
+
+		cb->nhoff += sizeof(*gre); /* Step over GRE Flags and Proto */
+		if (GRE_IS_CSUM(gre->flags))
+			cb->nhoff += 4; /* Step over chksum and Padding */
+		if (GRE_IS_KEY(gre->flags))
+			cb->nhoff += 4; /* Step over key */
+		if (GRE_IS_SEQ(gre->flags))
+			cb->nhoff += 4; /* Step over sequence number */
+
+		keys->is_encap = true;
+
+		if (gre->proto == bpf_htons(ETH_P_TEB)) {
+			/* The payload is an Ethernet frame: step over its
+			 * header and dispatch on the inner ethertype.
+			 */
+			eth = bpf_flow_dissect_get_header(skb, sizeof(*eth));
+			if (!eth)
+				return BPF_DROP;
+
+			cb->nhoff += sizeof(*eth);
+
+			return parse_eth_proto(skb, eth->h_proto);
+		} else {
+			return parse_eth_proto(skb, gre->proto);
+		}
+	case IPPROTO_TCP:
+		tcp = bpf_flow_dissect_get_header(skb, sizeof(*tcp));
+		if (!tcp)
+			return BPF_DROP;
+
+		/* doff is in 32-bit words; fewer than 5 would be shorter than
+		 * the fixed 20-byte TCP header.
+		 */
+		if (tcp->doff < 5)
+			return BPF_DROP;
+
+		/* The whole header, options included, must be in the packet */
+		if ((__u8 *)tcp + (tcp->doff << 2) > data_end)
+			return BPF_DROP;
+
+		keys->thoff = cb->nhoff;
+		write_ports(skb);
+		return BPF_OK;
+	case IPPROTO_UDP:
+	case IPPROTO_UDPLITE:
+		udp = bpf_flow_dissect_get_header(skb, sizeof(*udp));
+		if (!udp)
+			return BPF_DROP;
+
+		keys->thoff = cb->nhoff;
+		write_ports(skb);
+		return BPF_OK;
+	default:
+		return BPF_DROP;
+	}
+
+	return BPF_DROP;
+}
+
+/* Dispatches on an IPv6 next-header value: records it in keys->ip_proto, then
+ * tail-calls the IPV6OP stage for hop-by-hop/destination options and the
+ * IPV6FR stage for the fragment header; any other value is handed to
+ * parse_ip_proto(). A successful bpf_tail_call() does not return, so reaching
+ * the final return means the tail call failed and the packet is dropped.
+ */
+static __always_inline int parse_ipv6_proto(struct __sk_buff *skb, __u8 nexthdr)
+{
+	struct bpf_flow_keys *keys = (struct bpf_flow_keys *)(long)(skb->flow_keys);
+
+	keys->ip_proto = nexthdr;
+	switch (nexthdr) {
+	case IPPROTO_HOPOPTS:
+	case IPPROTO_DSTOPTS:
+		bpf_tail_call(skb, &jmp_table, IPV6OP);
+		break;
+	case IPPROTO_FRAGMENT:
+		bpf_tail_call(skb, &jmp_table, IPV6FR);
+		break;
+	default:
+		return parse_ip_proto(skb, nexthdr);
+	}
+
+	return BPF_DROP;
+}
+
+/* IPv4 stage: validates the IPv4 header at the current offset, stores the
+ * source/destination addresses and the fragment flags in the flow keys,
+ * advances nhoff past the header (options included) and continues with the
+ * encapsulated protocol. Returns BPF_OK without looking further for
+ * non-first fragments, BPF_DROP for truncated or malformed headers.
+ */
+PROG(IP)(struct __sk_buff *skb)
+{
+	struct bpf_flow_keys *keys = (struct bpf_flow_keys *)(long)(skb->flow_keys);
+	struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
+	void *data_end = (__u8 *)(long)skb->data_end;
+	void *data = (__u8 *)(long)skb->data;
+	bool done = false;
+	struct iphdr *iph;
+
+	iph = bpf_flow_dissect_get_header(skb, sizeof(*iph));
+	if (!iph)
+		return BPF_DROP;
+
+	/* IP header cannot be smaller than 20 bytes */
+	if (iph->ihl < 5)
+		return BPF_DROP;
+
+	/* addr_proto is stored in host byte order, unlike n_proto */
+	keys->addr_proto = ETH_P_IP;
+	keys->ipv4_src = iph->saddr;
+	keys->ipv4_dst = iph->daddr;
+
+	/* ihl counts 32-bit words; the full header must lie in the packet */
+	cb->nhoff += iph->ihl << 2;
+	if (data + cb->nhoff > data_end)
+		return BPF_DROP;
+
+	if (iph->frag_off & bpf_htons(IP_MF | IP_OFFSET)) {
+		keys->is_frag = true;
+		if (iph->frag_off & bpf_htons(IP_OFFSET))
+			/* From second fragment on, packets do not have headers
+			 * we can parse.
+			 */
+			done = true;
+		else
+			keys->is_first_frag = true;
+	}
+
+	if (done)
+		return BPF_OK;
+
+	return parse_ip_proto(skb, iph->protocol);
+}
+
+/* IPv6 stage: stores the addresses from the fixed IPv6 header in the flow
+ * keys, advances nhoff past that header and dispatches on its next-header
+ * field. Returns BPF_DROP when the header is truncated.
+ */
+PROG(IPV6)(struct __sk_buff *skb)
+{
+	struct bpf_flow_keys *keys = (struct bpf_flow_keys *)(long)(skb->flow_keys);
+	struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
+	struct ipv6hdr *ip6h;
+
+	ip6h = bpf_flow_dissect_get_header(skb, sizeof(*ip6h));
+	if (!ip6h)
+		return BPF_DROP;
+
+	/* addr_proto is stored in host byte order, unlike n_proto */
+	keys->addr_proto = ETH_P_IPV6;
+	/* One copy of twice the address size fills both ipv6_src and ipv6_dst.
+	 * NOTE(review): relies on daddr directly following saddr in struct
+	 * ipv6hdr, as ipv6_dst follows ipv6_src in the keys - confirm.
+	 */
+	memcpy(&keys->ipv6_src, &ip6h->saddr, 2*sizeof(ip6h->saddr));
+
+	cb->nhoff += sizeof(struct ipv6hdr);
+
+	return parse_ipv6_proto(skb, ip6h->nexthdr);
+}
+
+/* IPv6 hop-by-hop/destination options stage: skips the options header at the
+ * current offset and dispatches on its next-header field. Returns BPF_DROP
+ * when the two-byte fixed part of the header is truncated.
+ */
+PROG(IPV6OP)(struct __sk_buff *skb)
+{
+	struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
+	struct ipv6_opt_hdr *ip6h;
+
+	ip6h = bpf_flow_dissect_get_header(skb, sizeof(*ip6h));
+	if (!ip6h)
+		return BPF_DROP;
+
+	/* hdrlen is in units of 8 octets and does not include the first
+	 * 8 bytes of the header
+	 */
+	cb->nhoff += (1 + ip6h->hdrlen) << 3;
+
+	return parse_ipv6_proto(skb, ip6h->nexthdr);
+}
+
+/* IPv6 fragment header stage: marks the flow as fragmented, flags the first
+ * fragment (fragment offset of zero), steps over the fragment header and
+ * dispatches on its next-header field. Returns BPF_DROP when the header is
+ * truncated.
+ */
+PROG(IPV6FR)(struct __sk_buff *skb)
+{
+	struct bpf_flow_keys *keys = (struct bpf_flow_keys *)(long)(skb->flow_keys);
+	struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
+	struct frag_hdr *fragh;
+
+	fragh = bpf_flow_dissect_get_header(skb, sizeof(*fragh));
+	if (!fragh)
+		return BPF_DROP;
+
+	cb->nhoff += sizeof(*fragh);
+	keys->is_frag = true;
+	if (!(fragh->frag_off & bpf_htons(IP6_OFFSET)))
+		keys->is_first_frag = true;
+
+	/* Unlike the IPv4 stage, parsing continues for every fragment */
+	return parse_ipv6_proto(skb, fragh->nexthdr);
+}
+
+/* MPLS stage: only checks that one MPLS label entry is present at the current
+ * offset. Nothing below the label is parsed and no further keys are written.
+ * Returns BPF_OK when the label fits in the packet, BPF_DROP otherwise.
+ */
+PROG(MPLS)(struct __sk_buff *skb)
+{
+	struct mpls_label *mpls;
+
+	mpls = bpf_flow_dissect_get_header(skb, sizeof(*mpls));
+	if (!mpls)
+		return BPF_DROP;
+
+	return BPF_OK;
+}
+
+/* VLAN stage: steps over one 802.1Q tag, or over an 802.1AD tag followed by
+ * an 802.1Q tag, and dispatches on the encapsulated ethertype. Any other
+ * stacking of VLAN tags, and truncated tags, yield BPF_DROP.
+ */
+PROG(VLAN)(struct __sk_buff *skb)
+{
+	struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
+	struct vlan_hdr *vlan;
+	__be16 proto;
+
+	/* Peek back to see if single or double-tagging: the two bytes just
+	 * before the current offset hold the ethertype that led here
+	 */
+	if (bpf_skb_load_bytes(skb, cb->nhoff - sizeof(proto), &proto,
+			       sizeof(proto)))
+		return BPF_DROP;
+
+	/* Account for double-tagging */
+	if (proto == bpf_htons(ETH_P_8021AD)) {
+		vlan = bpf_flow_dissect_get_header(skb, sizeof(*vlan));
+		if (!vlan)
+			return BPF_DROP;
+
+		/* The outer 802.1AD tag must be followed by an 802.1Q tag */
+		if (vlan->h_vlan_encapsulated_proto != bpf_htons(ETH_P_8021Q))
+			return BPF_DROP;
+
+		cb->nhoff += sizeof(*vlan);
+	}
+
+	vlan = bpf_flow_dissect_get_header(skb, sizeof(*vlan));
+	if (!vlan)
+		return BPF_DROP;
+
+	cb->nhoff += sizeof(*vlan);
+	/* Only allow 8021AD + 8021Q double tagging and no triple tagging.*/
+	if (vlan->h_vlan_encapsulated_proto == bpf_htons(ETH_P_8021AD) ||
+	    vlan->h_vlan_encapsulated_proto == bpf_htons(ETH_P_8021Q))
+		return BPF_DROP;
+
+	return parse_eth_proto(skb, vlan->h_vlan_encapsulated_proto);
+}
+
+char __license[] SEC("license") = "GPL";
-- 
2.19.0.rc0.228.g281dcd1b4d0-goog

^ permalink raw reply related

* [bpf-next 1/3] flow_dissector: implements flow dissector BPF hook
From: Petar Penkov @ 2018-08-30 18:22 UTC (permalink / raw)
  To: netdev
  Cc: davem, ast, daniel, simon.horman, ecree, songliubraving, tom,
	Petar Penkov, Willem de Bruijn
In-Reply-To: <20180830182301.89435-1-peterpenkov96@gmail.com>

From: Petar Penkov <ppenkov@google.com>

Adds a hook for programs of type BPF_PROG_TYPE_FLOW_DISSECTOR and
attach type BPF_FLOW_DISSECTOR that is executed in the flow dissector
path. The BPF program is per-network namespace.

Signed-off-by: Petar Penkov <ppenkov@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 include/linux/bpf.h            |   1 +
 include/linux/bpf_types.h      |   1 +
 include/linux/skbuff.h         |   7 ++
 include/net/net_namespace.h    |   3 +
 include/net/sch_generic.h      |  12 ++-
 include/uapi/linux/bpf.h       |  25 ++++++
 kernel/bpf/syscall.c           |   8 ++
 kernel/bpf/verifier.c          |  33 ++++++++
 net/core/filter.c              |  67 ++++++++++++++++
 net/core/flow_dissector.c      | 136 +++++++++++++++++++++++++++++++++
 tools/bpf/bpftool/prog.c       |   1 +
 tools/include/uapi/linux/bpf.h |  25 ++++++
 tools/lib/bpf/libbpf.c         |   2 +
 13 files changed, 318 insertions(+), 3 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 523481a3471b..988a00797bcd 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -212,6 +212,7 @@ enum bpf_reg_type {
 	PTR_TO_PACKET_META,	 /* skb->data - meta_len */
 	PTR_TO_PACKET,		 /* reg points to skb->data */
 	PTR_TO_PACKET_END,	 /* skb->data + headlen */
+	PTR_TO_FLOW_KEYS,	 /* reg points to bpf_flow_keys */
 };
 
 /* The information passed from prog-specific *_is_valid_access
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index cd26c090e7c0..22083712dd18 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -32,6 +32,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
 #ifdef CONFIG_INET
 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport)
 #endif
+BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector)
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 17a13e4785fc..ce0e863f02a2 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -243,6 +243,8 @@ struct scatterlist;
 struct pipe_inode_info;
 struct iov_iter;
 struct napi_struct;
+struct bpf_prog;
+union bpf_attr;
 
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 struct nf_conntrack {
@@ -1192,6 +1194,11 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
 			     const struct flow_dissector_key *key,
 			     unsigned int key_count);
 
+int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
+				       struct bpf_prog *prog);
+
+int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr);
+
 bool __skb_flow_dissect(const struct sk_buff *skb,
 			struct flow_dissector *flow_dissector,
 			void *target_container,
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 9b5fdc50519a..99d4148e0f90 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -43,6 +43,7 @@ struct ctl_table_header;
 struct net_generic;
 struct uevent_sock;
 struct netns_ipvs;
+struct bpf_prog;
 
 
 #define NETDEV_HASHBITS    8
@@ -145,6 +146,8 @@ struct net {
 #endif
 	struct net_generic __rcu	*gen;
 
+	struct bpf_prog __rcu	*flow_dissector_prog;
+
 	/* Note : following structs are cache line aligned */
 #ifdef CONFIG_XFRM
 	struct netns_xfrm	xfrm;
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index a6d00093f35e..1b81ba85fd2d 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -19,6 +19,7 @@ struct Qdisc_ops;
 struct qdisc_walker;
 struct tcf_walker;
 struct module;
+struct bpf_flow_keys;
 
 typedef int tc_setup_cb_t(enum tc_setup_type type,
 			  void *type_data, void *cb_priv);
@@ -307,9 +308,14 @@ struct tcf_proto {
 };
 
 struct qdisc_skb_cb {
-	unsigned int		pkt_len;
-	u16			slave_dev_queue_mapping;
-	u16			tc_classid;
+	union {
+		struct {
+			unsigned int		pkt_len;
+			u16			slave_dev_queue_mapping;
+			u16			tc_classid;
+		};
+		struct bpf_flow_keys *flow_keys;
+	};
 #define QDISC_CB_PRIV_LEN 20
 	unsigned char		data[QDISC_CB_PRIV_LEN];
 };
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 66917a4eba27..3064706fcaaa 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -152,6 +152,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_LWT_SEG6LOCAL,
 	BPF_PROG_TYPE_LIRC_MODE2,
 	BPF_PROG_TYPE_SK_REUSEPORT,
+	BPF_PROG_TYPE_FLOW_DISSECTOR,
 };
 
 enum bpf_attach_type {
@@ -172,6 +173,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_UDP4_SENDMSG,
 	BPF_CGROUP_UDP6_SENDMSG,
 	BPF_LIRC_MODE2,
+	BPF_FLOW_DISSECTOR,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -2333,6 +2335,7 @@ struct __sk_buff {
 	/* ... here. */
 
 	__u32 data_meta;
+	__u32 flow_keys;
 };
 
 struct bpf_tunnel_key {
@@ -2778,4 +2781,26 @@ enum bpf_task_fd_type {
 	BPF_FD_TYPE_URETPROBE,		/* filename + offset */
 };
 
+/* Keys filled in by a BPF_PROG_TYPE_FLOW_DISSECTOR program through the
+ * flow_keys pointer of struct __sk_buff. After the program has run, the
+ * kernel copies them into the flow dissector's control, basic, address and
+ * port keys.
+ */
+struct bpf_flow_keys {
+	__u16	thoff;				/* transport header offset */
+	__u16	addr_proto;			/* ETH_P_* of valid addrs */
+	__u8	is_frag;			/* packet is a fragment */
+	__u8	is_first_frag;			/* fragment with offset zero */
+	__u8	is_encap;			/* encapsulation was parsed */
+	__be16	n_proto;			/* ethertype, network order */
+	__u8	ip_proto;			/* IPPROTO_* of the payload */
+	union {
+		struct {
+			__be32	ipv4_src;
+			__be32	ipv4_dst;
+		};
+		struct {
+			__u32	ipv6_src[4];	/* in6_addr; network order */
+			__u32	ipv6_dst[4];	/* in6_addr; network order */
+		};
+	};
+	__be16	sport;
+	__be16	dport;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8339d81cba1d..043c1a72e382 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1616,6 +1616,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_LIRC_MODE2:
 		ptype = BPF_PROG_TYPE_LIRC_MODE2;
 		break;
+	case BPF_FLOW_DISSECTOR:
+		ptype = BPF_PROG_TYPE_FLOW_DISSECTOR;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -1637,6 +1640,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_PROG_TYPE_LIRC_MODE2:
 		ret = lirc_prog_attach(attr, prog);
 		break;
+	case BPF_PROG_TYPE_FLOW_DISSECTOR:
+		ret = skb_flow_dissector_bpf_prog_attach(attr, prog);
+		break;
 	default:
 		ret = cgroup_bpf_prog_attach(attr, ptype, prog);
 	}
@@ -1689,6 +1695,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL);
 	case BPF_LIRC_MODE2:
 		return lirc_prog_detach(attr);
+	case BPF_FLOW_DISSECTOR:
+		return skb_flow_dissector_bpf_prog_detach(attr);
 	default:
 		return -EINVAL;
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 92246117d2b0..ac9aeed325b5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -261,6 +261,7 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_PACKET]		= "pkt",
 	[PTR_TO_PACKET_META]	= "pkt_meta",
 	[PTR_TO_PACKET_END]	= "pkt_end",
+	[PTR_TO_FLOW_KEYS]	= "flow_keys",
 };
 
 static void print_liveness(struct bpf_verifier_env *env,
@@ -993,6 +994,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case PTR_TO_PACKET:
 	case PTR_TO_PACKET_META:
 	case PTR_TO_PACKET_END:
+	case PTR_TO_FLOW_KEYS:
 	case CONST_PTR_TO_MAP:
 		return true;
 	default:
@@ -1321,6 +1323,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
 	case BPF_PROG_TYPE_LWT_XMIT:
 	case BPF_PROG_TYPE_SK_SKB:
 	case BPF_PROG_TYPE_SK_MSG:
+	case BPF_PROG_TYPE_FLOW_DISSECTOR:
 		if (meta)
 			return meta->pkt_access;
 
@@ -1404,6 +1407,18 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
 	return -EACCES;
 }
 
+/* Checks that an access of @size bytes at offset @off stays within
+ * struct bpf_flow_keys. Returns 0 when it does; otherwise logs the access to
+ * the verifier log and returns -EACCES.
+ */
+static int check_flow_keys_access(struct bpf_verifier_env *env, int off,
+				  int size)
+{
+	bool in_bounds = size >= 0 && off >= 0 &&
+			 (u64)off + size <= sizeof(struct bpf_flow_keys);
+
+	if (in_bounds)
+		return 0;
+
+	verbose(env, "invalid access to flow keys off=%d size=%d\n",
+		off, size);
+	return -EACCES;
+}
+
 static bool __is_pointer_value(bool allow_ptr_leaks,
 			       const struct bpf_reg_state *reg)
 {
@@ -1505,6 +1520,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 		 * right in front, treat it the very same way.
 		 */
 		return check_pkt_ptr_alignment(env, reg, off, size, strict);
+	case PTR_TO_FLOW_KEYS:
+		pointer_desc = "flow keys ";
+		break;
 	case PTR_TO_MAP_VALUE:
 		pointer_desc = "value ";
 		break;
@@ -1778,6 +1796,17 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		err = check_packet_access(env, regno, off, size, false);
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown(env, regs, value_regno);
+	} else if (reg->type == PTR_TO_FLOW_KEYS) {
+		if (t == BPF_WRITE && value_regno >= 0 &&
+		    is_pointer_value(env, value_regno)) {
+			verbose(env, "R%d leaks addr into flow keys\n",
+				value_regno);
+			return -EACCES;
+		}
+
+		err = check_flow_keys_access(env, off, size);
+		if (!err && t == BPF_READ && value_regno >= 0)
+			mark_reg_unknown(env, regs, value_regno);
 	} else {
 		verbose(env, "R%d invalid mem access '%s'\n", regno,
 			reg_type_str[reg->type]);
@@ -1925,6 +1954,8 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 	case PTR_TO_PACKET_META:
 		return check_packet_access(env, regno, reg->off, access_size,
 					   zero_size_allowed);
+	case PTR_TO_FLOW_KEYS:
+		return check_flow_keys_access(env, reg->off, access_size);
 	case PTR_TO_MAP_VALUE:
 		return check_map_access(env, regno, reg->off, access_size,
 					zero_size_allowed);
@@ -3976,6 +4007,7 @@ static bool may_access_skb(enum bpf_prog_type type)
 	case BPF_PROG_TYPE_SOCKET_FILTER:
 	case BPF_PROG_TYPE_SCHED_CLS:
 	case BPF_PROG_TYPE_SCHED_ACT:
+	case BPF_PROG_TYPE_FLOW_DISSECTOR:
 		return true;
 	default:
 		return false;
@@ -4451,6 +4483,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 	case PTR_TO_CTX:
 	case CONST_PTR_TO_MAP:
 	case PTR_TO_PACKET_END:
+	case PTR_TO_FLOW_KEYS:
 		/* Only valid matches are exact, which memcmp() above
 		 * would have accepted
 		 */
diff --git a/net/core/filter.c b/net/core/filter.c
index c25eb36f1320..0143b9c0c67e 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5092,6 +5092,17 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	}
 }
 
+/* Helper lookup for flow dissector programs: bpf_skb_load_bytes() is the only
+ * skb helper offered; everything else is resolved by bpf_base_func_proto().
+ */
+static const struct bpf_func_proto *
+flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	if (func_id == BPF_FUNC_skb_load_bytes)
+		return &bpf_skb_load_bytes_proto;
+
+	return bpf_base_func_proto(func_id);
+}
+
 static const struct bpf_func_proto *
 lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -5207,6 +5218,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
 	case bpf_ctx_range(struct __sk_buff, data):
 	case bpf_ctx_range(struct __sk_buff, data_meta):
 	case bpf_ctx_range(struct __sk_buff, data_end):
+	case bpf_ctx_range(struct __sk_buff, flow_keys):
 		if (size != size_default)
 			return false;
 		break;
@@ -5235,6 +5247,7 @@ static bool sk_filter_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct __sk_buff, data):
 	case bpf_ctx_range(struct __sk_buff, data_meta):
 	case bpf_ctx_range(struct __sk_buff, data_end):
+	case bpf_ctx_range(struct __sk_buff, flow_keys):
 	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
 		return false;
 	}
@@ -5260,6 +5273,7 @@ static bool lwt_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct __sk_buff, tc_classid):
 	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
 	case bpf_ctx_range(struct __sk_buff, data_meta):
+	case bpf_ctx_range(struct __sk_buff, flow_keys):
 		return false;
 	}
 
@@ -5470,6 +5484,7 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct __sk_buff, data_end):
 		info->reg_type = PTR_TO_PACKET_END;
 		break;
+	case bpf_ctx_range(struct __sk_buff, flow_keys):
 	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
 		return false;
 	}
@@ -5671,6 +5686,7 @@ static bool sk_skb_is_valid_access(int off, int size,
 	switch (off) {
 	case bpf_ctx_range(struct __sk_buff, tc_classid):
 	case bpf_ctx_range(struct __sk_buff, data_meta):
+	case bpf_ctx_range(struct __sk_buff, flow_keys):
 		return false;
 	}
 
@@ -5730,6 +5746,38 @@ static bool sk_msg_is_valid_access(int off, int size,
 	return true;
 }
 
+/* Validates accesses to struct __sk_buff made by flow dissector programs.
+ *
+ * Writes are limited to the cb[] scratch area. The flow_keys field is
+ * read-only: bpf_convert_ctx_access() rewrites every access to it into a
+ * load of the kernel pointer, so a store must never be accepted; the keys
+ * themselves are written through the PTR_TO_FLOW_KEYS pointer obtained by
+ * reading the field.
+ *
+ * Reads of data, data_end and flow_keys yield typed pointers. tc_classid is
+ * rejected because it shares storage with the flow_keys pointer in
+ * struct qdisc_skb_cb, data_meta because no packet-meta pointer is set up
+ * for this program type, and the socket fields (family..local_port) as in
+ * the other skb-based validators.
+ */
+static bool flow_dissector_is_valid_access(int off, int size,
+					   enum bpf_access_type type,
+					   const struct bpf_prog *prog,
+					   struct bpf_insn_access_aux *info)
+{
+	if (type == BPF_WRITE) {
+		switch (off) {
+		case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
+			break;
+		default:
+			return false;
+		}
+	}
+
+	switch (off) {
+	case bpf_ctx_range(struct __sk_buff, data):
+		info->reg_type = PTR_TO_PACKET;
+		break;
+	case bpf_ctx_range(struct __sk_buff, data_end):
+		info->reg_type = PTR_TO_PACKET_END;
+		break;
+	case bpf_ctx_range(struct __sk_buff, flow_keys):
+		info->reg_type = PTR_TO_FLOW_KEYS;
+		break;
+	case bpf_ctx_range(struct __sk_buff, tc_classid):
+	case bpf_ctx_range(struct __sk_buff, data_meta):
+	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
+		return false;
+	}
+
+	return bpf_skb_is_valid_access(off, size, type, prog, info);
+}
+
 static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 				  const struct bpf_insn *si,
 				  struct bpf_insn *insn_buf,
@@ -6024,6 +6072,15 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 				      bpf_target_off(struct sock_common,
 						     skc_num, 2, target_size));
 		break;
+
+	case offsetof(struct __sk_buff, flow_keys):
+		off  = si->off;
+		off -= offsetof(struct __sk_buff, flow_keys);
+		off += offsetof(struct sk_buff, cb);
+		off += offsetof(struct qdisc_skb_cb, flow_keys);
+		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
+				      si->src_reg, off);
+		break;
 	}
 
 	return insn - insn_buf;
@@ -6987,6 +7044,16 @@ const struct bpf_verifier_ops sk_msg_verifier_ops = {
 const struct bpf_prog_ops sk_msg_prog_ops = {
 };
 
+/* Verifier callbacks for BPF_PROG_TYPE_FLOW_DISSECTOR: own helper lookup and
+ * context access check; context rewriting and LD_ABS generation are shared
+ * with the other skb-based program types.
+ */
+const struct bpf_verifier_ops flow_dissector_verifier_ops = {
+	.get_func_proto		= flow_dissector_func_proto,
+	.is_valid_access	= flow_dissector_is_valid_access,
+	.convert_ctx_access	= bpf_convert_ctx_access,
+	.gen_ld_abs		= bpf_gen_ld_abs,
+};
+
+/* No program-type specific operations are defined */
+const struct bpf_prog_ops flow_dissector_prog_ops = {
+};
+
 int sk_detach_filter(struct sock *sk)
 {
 	int ret = -ENOENT;
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index ce9eeeb7c024..a5f7da69911a 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -25,6 +25,9 @@
 #include <net/flow_dissector.h>
 #include <scsi/fc/fc_fcoe.h>
 #include <uapi/linux/batadv_packet.h>
+#include <linux/bpf.h>
+
+static DEFINE_MUTEX(flow_dissector_mutex);
 
 static void dissector_set_key(struct flow_dissector *flow_dissector,
 			      enum flow_dissector_key_id key_id)
@@ -62,6 +65,44 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
 }
 EXPORT_SYMBOL(skb_flow_dissector_init);
 
+/* Attaches @prog as the flow dissector program of the calling task's network
+ * namespace. Returns 0 on success, or -EEXIST when a program is already
+ * attached, since only one program can be attached at a time. @attr is not
+ * used.
+ */
+int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
+				       struct bpf_prog *prog)
+{
+	struct net *net = current->nsproxy->net_ns;
+	struct bpf_prog *cur;
+	int err = 0;
+
+	mutex_lock(&flow_dissector_mutex);
+	cur = rcu_dereference_protected(net->flow_dissector_prog,
+					lockdep_is_held(&flow_dissector_mutex));
+	if (cur)
+		err = -EEXIST;
+	else
+		rcu_assign_pointer(net->flow_dissector_prog, prog);
+	mutex_unlock(&flow_dissector_mutex);
+
+	return err;
+}
+
+/* Detaches the flow dissector program of the calling task's network
+ * namespace and drops the reference held on it. Returns 0 on success, or
+ * -ENOENT when no program is attached. @attr is not used.
+ */
+int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
+{
+	struct bpf_prog *attached;
+	struct net *net;
+
+	net = current->nsproxy->net_ns;
+	mutex_lock(&flow_dissector_mutex);
+	attached = rcu_dereference_protected(net->flow_dissector_prog,
+					     lockdep_is_held(&flow_dissector_mutex));
+	if (!attached) {
+		mutex_unlock(&flow_dissector_mutex);
+		return -ENOENT;
+	}
+	/* Unpublish the program before dropping the reference, so that no RCU
+	 * reader starting after the put can still find it through the
+	 * namespace pointer.
+	 */
+	RCU_INIT_POINTER(net->flow_dissector_prog, NULL);
+	bpf_prog_put(attached);
+	mutex_unlock(&flow_dissector_mutex);
+	return 0;
+}
 /**
  * skb_flow_get_be16 - extract be16 entity
  * @skb: sk_buff to extract from
@@ -588,6 +629,60 @@ static bool skb_flow_dissect_allowed(int *num_hdrs)
 	return (*num_hdrs <= MAX_FLOW_DISSECT_HDRS);
 }
 
+/* Copies the keys written by a BPF flow dissector program into the caller's
+ * @target_container. The control and basic keys are always filled in; the
+ * address keys only when addr_proto names IPv4 or IPv6 and @flow_dissector
+ * uses the matching address key; the ports only when the ports key is used.
+ */
+static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
+				     struct flow_dissector *flow_dissector,
+				     void *target_container)
+{
+	struct flow_dissector_key_control *ctrl;
+	struct flow_dissector_key_basic *basic;
+	struct flow_dissector_key_addrs *addrs;
+	struct flow_dissector_key_ports *ports;
+
+	/* Control key: transport offset and fragment/encapsulation flags */
+	ctrl = skb_flow_dissector_target(flow_dissector,
+					 FLOW_DISSECTOR_KEY_CONTROL,
+					 target_container);
+	ctrl->thoff = flow_keys->thoff;
+	if (flow_keys->is_frag)
+		ctrl->flags |= FLOW_DIS_IS_FRAGMENT;
+	if (flow_keys->is_first_frag)
+		ctrl->flags |= FLOW_DIS_FIRST_FRAG;
+	if (flow_keys->is_encap)
+		ctrl->flags |= FLOW_DIS_ENCAPSULATION;
+
+	/* Basic key: L3 and L4 protocol numbers */
+	basic = skb_flow_dissector_target(flow_dissector,
+					  FLOW_DISSECTOR_KEY_BASIC,
+					  target_container);
+	basic->n_proto = flow_keys->n_proto;
+	basic->ip_proto = flow_keys->ip_proto;
+
+	/* Address key, selected by the address family the program reported */
+	if (flow_keys->addr_proto == ETH_P_IP &&
+	    dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) {
+		addrs = skb_flow_dissector_target(flow_dissector,
+						  FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+						  target_container);
+		addrs->v4addrs.src = flow_keys->ipv4_src;
+		addrs->v4addrs.dst = flow_keys->ipv4_dst;
+		ctrl->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+	} else if (flow_keys->addr_proto == ETH_P_IPV6 &&
+		   dissector_uses_key(flow_dissector,
+				      FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
+		addrs = skb_flow_dissector_target(flow_dissector,
+						  FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+						  target_container);
+		memcpy(&addrs->v6addrs, &flow_keys->ipv6_src,
+		       sizeof(addrs->v6addrs));
+		ctrl->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+	}
+
+	if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS))
+		return;
+
+	ports = skb_flow_dissector_target(flow_dissector,
+					  FLOW_DISSECTOR_KEY_PORTS,
+					  target_container);
+	ports->src = flow_keys->sport;
+	ports->dst = flow_keys->dport;
+}
+
 /**
  * __skb_flow_dissect - extract the flow_keys struct and return it
  * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
@@ -619,6 +714,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 	struct flow_dissector_key_vlan *key_vlan;
 	enum flow_dissect_ret fdret;
 	enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX;
+	struct bpf_prog *attached;
 	int num_hdrs = 0;
 	u8 ip_proto = 0;
 	bool ret;
@@ -658,6 +754,46 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 					      FLOW_DISSECTOR_KEY_BASIC,
 					      target_container);
 
+	rcu_read_lock();
+	attached = rcu_dereference(dev_net(skb->dev)->flow_dissector_prog);
+	if (attached) {
+		/* Note that even though the const qualifier is discarded
+		 * throughout the execution of the BPF program, all changes (the
+		 * control block) are reverted after the BPF program returns.
+		 * Therefore, __skb_flow_dissect does not alter the skb.
+		 */
+		struct bpf_flow_keys flow_keys = {};
+		struct qdisc_skb_cb cb_saved;
+		struct qdisc_skb_cb *cb;
+		u16 *pseudo_cb;
+		u32 result;
+
+		cb = qdisc_skb_cb(skb);
+		pseudo_cb = (u16 *)bpf_skb_cb((struct sk_buff *)skb);
+
+		/* Save Control Block */
+		memcpy(&cb_saved, cb, sizeof(cb_saved));
+		memset(cb, 0, sizeof(cb_saved));
+
+		/* Pass parameters to the BPF program */
+		cb->flow_keys = &flow_keys;
+		*pseudo_cb = nhoff;
+
+		bpf_compute_data_pointers((struct sk_buff *)skb);
+		result = BPF_PROG_RUN(attached, skb);
+
+		/* Restore state */
+		memcpy(cb, &cb_saved, sizeof(cb_saved));
+
+		__skb_flow_bpf_to_target(&flow_keys, flow_dissector,
+					 target_container);
+		key_control->thoff = min_t(u16, key_control->thoff,
+					   skb ? skb->len : hlen);
+		rcu_read_unlock();
+		return result == BPF_OK;
+	}
+	rcu_read_unlock();
+
 	if (dissector_uses_key(flow_dissector,
 			       FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
 		struct ethhdr *eth = eth_hdr(skb);
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index dce960d22106..b1cd3bc8db70 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -74,6 +74,7 @@ static const char * const prog_type_name[] = {
 	[BPF_PROG_TYPE_RAW_TRACEPOINT]	= "raw_tracepoint",
 	[BPF_PROG_TYPE_CGROUP_SOCK_ADDR] = "cgroup_sock_addr",
 	[BPF_PROG_TYPE_LIRC_MODE2]	= "lirc_mode2",
+	[BPF_PROG_TYPE_FLOW_DISSECTOR]	= "flow_dissector",
 };
 
 static void print_boot_time(__u64 nsecs, char *buf, unsigned int size)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 66917a4eba27..3064706fcaaa 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -152,6 +152,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_LWT_SEG6LOCAL,
 	BPF_PROG_TYPE_LIRC_MODE2,
 	BPF_PROG_TYPE_SK_REUSEPORT,
+	BPF_PROG_TYPE_FLOW_DISSECTOR,
 };
 
 enum bpf_attach_type {
@@ -172,6 +173,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_UDP4_SENDMSG,
 	BPF_CGROUP_UDP6_SENDMSG,
 	BPF_LIRC_MODE2,
+	BPF_FLOW_DISSECTOR,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -2333,6 +2335,7 @@ struct __sk_buff {
 	/* ... here. */
 
 	__u32 data_meta;
+	__u32 flow_keys;
 };
 
 struct bpf_tunnel_key {
@@ -2778,4 +2781,26 @@ enum bpf_task_fd_type {
 	BPF_FD_TYPE_URETPROBE,		/* filename + offset */
 };
 
+struct bpf_flow_keys {
+	__u16	thoff;
+	__u16	addr_proto;			/* ETH_P_* of valid addrs */
+	__u8	is_frag;
+	__u8	is_first_frag;
+	__u8	is_encap;
+	__be16	n_proto;
+	__u8	ip_proto;
+	union {
+		struct {
+			__be32	ipv4_src;
+			__be32	ipv4_dst;
+		};
+		struct {
+			__u32	ipv6_src[4];	/* in6_addr; network order */
+			__u32	ipv6_dst[4];	/* in6_addr; network order */
+		};
+	};
+	__be16	sport;
+	__be16	dport;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 2abd0f112627..0c749ce1b717 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1502,6 +1502,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type)
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
 	case BPF_PROG_TYPE_LIRC_MODE2:
 	case BPF_PROG_TYPE_SK_REUSEPORT:
+	case BPF_PROG_TYPE_FLOW_DISSECTOR:
 		return false;
 	case BPF_PROG_TYPE_UNSPEC:
 	case BPF_PROG_TYPE_KPROBE:
@@ -2121,6 +2122,7 @@ static const struct {
 	BPF_PROG_SEC("sk_skb",		BPF_PROG_TYPE_SK_SKB),
 	BPF_PROG_SEC("sk_msg",		BPF_PROG_TYPE_SK_MSG),
 	BPF_PROG_SEC("lirc_mode2",	BPF_PROG_TYPE_LIRC_MODE2),
+	BPF_PROG_SEC("flow_dissector",	BPF_PROG_TYPE_FLOW_DISSECTOR),
 	BPF_SA_PROG_SEC("cgroup/bind4",	BPF_CGROUP_INET4_BIND),
 	BPF_SA_PROG_SEC("cgroup/bind6",	BPF_CGROUP_INET6_BIND),
 	BPF_SA_PROG_SEC("cgroup/connect4", BPF_CGROUP_INET4_CONNECT),
-- 
2.19.0.rc0.228.g281dcd1b4d0-goog

^ permalink raw reply related

* [bpf-next 0/3] Introduce eBPF flow dissector
From: Petar Penkov @ 2018-08-30 18:22 UTC (permalink / raw)
  To: netdev
  Cc: davem, ast, daniel, simon.horman, ecree, songliubraving, tom,
	Petar Penkov

From: Petar Penkov <ppenkov@google.com>

This patch series hardens the RX stack by allowing flow dissection in BPF,
as previously discussed [1]. Because of the rigorous checks of the BPF
verifier, this provides significant security guarantees. In particular, the
BPF flow dissector cannot get inside of an infinite loop, as with
CVE-2013-4348, because BPF programs are guaranteed to terminate. It cannot
read outside of packet bounds, because all memory accesses are checked.
Also, with BPF the administrator can decide which protocols to support,
reducing potential attack surface. Rarely encountered protocols can be
excluded from dissection and the program can be updated without kernel
recompile or reboot if a bug is discovered.

Patch 1 adds infrastructure to execute a BPF program in __skb_flow_dissect.
This includes a new BPF program and attach type.

Patch 2 adds a flow dissector program in BPF. It parses most of the
protocols handled by __skb_flow_dissect, for a subset of flow keys (basic,
control, ports, and address types).

Patch 3 adds a selftest that attaches the BPF program to the flow dissector
and sends traffic with different levels of encapsulation.

Performance Evaluation:
The in-kernel implementation was compared against the demo program from
patch 2 using the test in patch 3 with IPv4/UDP traffic over 10 seconds.
	$perf record -a -C 4 taskset -c 4 ./test_flow_dissector -i 4 -f 8 \
		-t 10

In-kernel Dissector:
	__skb_flow_dissect overhead: 2.12%
	Total Packets: 3,272,597 (from output of ./test_flow_dissector)

BPF Dissector:
	__skb_flow_dissect overhead: 1.63% 
	Total Packets: 3,232,356 (from output of ./test_flow_dissector)

No-op Dissector:
	__skb_flow_dissect overhead: 1.52% 
	Total Packets: 3,330,635 (from output of ./test_flow_dissector)

Changes since RFC:

1/ (Patch 1) Flow dissector hook is no longer global. Instead, it is
per-netns

2/ (Patch 1) Defined struct bpf_flow_keys to be used in BPF flow dissector
programs instead of exposing the internal flow keys layout. Added a
function to translate from bpf_flow_keys to the internal layout after BPF
dissection is complete. The pointer to this struct is stored in
qdisc_skb_cb rather than inside of the 20 byte control block which
simplifies verification and allows access to all 20 bytes of the cb.

3/ (Patch 2) Removed GUE parsing as it relied on a hardcoded port

4/ (Patch 2) MPLS parsing now stops at the first label which is consistent
with the in-kernel flow dissector

5/ (Patch 2) Refactored to use direct packet access and to write out to
struct bpf_flow_keys

[1] http://vger.kernel.org/netconf2017_files/rx_hardening_and_udp_gso.pdf

Petar Penkov (3):
  flow_dissector: implements flow dissector BPF hook
  flow_dissector: implements eBPF parser
  selftests/bpf: test bpf flow dissection

 include/linux/bpf.h                           |   1 +
 include/linux/bpf_types.h                     |   1 +
 include/linux/skbuff.h                        |   7 +
 include/net/net_namespace.h                   |   3 +
 include/net/sch_generic.h                     |  12 +-
 include/uapi/linux/bpf.h                      |  25 +
 kernel/bpf/syscall.c                          |   8 +
 kernel/bpf/verifier.c                         |  33 +
 net/core/filter.c                             |  67 ++
 net/core/flow_dissector.c                     | 136 +++
 tools/bpf/bpftool/prog.c                      |   1 +
 tools/include/uapi/linux/bpf.h                |  25 +
 tools/lib/bpf/libbpf.c                        |   2 +
 tools/testing/selftests/bpf/.gitignore        |   2 +
 tools/testing/selftests/bpf/Makefile          |   8 +-
 tools/testing/selftests/bpf/bpf_flow.c        | 390 +++++++++
 tools/testing/selftests/bpf/config            |   1 +
 .../selftests/bpf/flow_dissector_load.c       | 140 ++++
 .../selftests/bpf/test_flow_dissector.c       | 782 ++++++++++++++++++
 .../selftests/bpf/test_flow_dissector.sh      | 115 +++
 tools/testing/selftests/bpf/with_addr.sh      |  54 ++
 tools/testing/selftests/bpf/with_tunnels.sh   |  36 +
 22 files changed, 1843 insertions(+), 6 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/bpf_flow.c
 create mode 100644 tools/testing/selftests/bpf/flow_dissector_load.c
 create mode 100644 tools/testing/selftests/bpf/test_flow_dissector.c
 create mode 100755 tools/testing/selftests/bpf/test_flow_dissector.sh
 create mode 100755 tools/testing/selftests/bpf/with_addr.sh
 create mode 100755 tools/testing/selftests/bpf/with_tunnels.sh

-- 
2.19.0.rc0.228.g281dcd1b4d0-goog

^ permalink raw reply

* [PATCH v2 2/2] Documentation/ABI: document /sys/class/net/*/dev_port
From: Arseny Maslennikov @ 2018-08-30 18:22 UTC (permalink / raw)
  To: linux-rdma; +Cc: Arseny Maslennikov, Doug Ledford, Jason Gunthorpe, netdev
In-Reply-To: <20180830182238.16361-1-ar@cs.msu.ru>

The sysfs field was introduced 4 years ago along with fixes to various
drivers that erroneously used `dev_id' for that purpose, but it was not
properly documented anywhere.
See commit v3.14-rc3-739-g3f85944fe207.

Signed-off-by: Arseny Maslennikov <ar@cs.msu.ru>
---
 Documentation/ABI/testing/sysfs-class-net | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net
index 2f1788111cd9..1593d8997ade 100644
--- a/Documentation/ABI/testing/sysfs-class-net
+++ b/Documentation/ABI/testing/sysfs-class-net
@@ -91,6 +91,16 @@ Description:
 		stacked (e.g: VLAN interfaces) but still have the same MAC
 		address as their parent device.
 
+What:		/sys/class/net/<iface>/dev_port
+Date:		February 2014
+KernelVersion:	3.15
+Contact:	netdev@vger.kernel.org
+Description:
+		Indicates the port number of this network device, formatted
+		as a decimal value. Some NICs have multiple independent ports
+		on the same PCI bus, device and function. This field allows
+		userspace to distinguish the respective interfaces.
+
 What:		/sys/class/net/<iface>/dormant
 Date:		March 2006
 KernelVersion:	2.6.17
-- 
2.18.0

^ permalink raw reply related

* [PATCH v2 0/2] IB/ipoib: Use dev_port to disambiguate port numbers
From: Arseny Maslennikov @ 2018-08-30 18:22 UTC (permalink / raw)
  To: linux-rdma; +Cc: Arseny Maslennikov, Doug Ledford, Jason Gunthorpe, netdev

Pre-3.15 userspace had trouble distinguishing different ports
of a NIC on a single PCI bus/device/function. To solve this,
a sysfs field `dev_port' was introduced quite a while ago
(commit v3.14-rc3-739-g3f85944fe207), and some relevant device
drivers were fixed to use it, but not in case of IPoIB.

The convention for some reason never got documented in the kernel, but
was immediately adopted by userspace (notably udev[1][2], biosdevname[3]).

Patch 2/2 documents the sysfs field — that's why I'm CC-ing netdev.

This series was tested on and applies to 4.19-rc1.

[1] https://lists.freedesktop.org/archives/systemd-devel/2014-June/020788.html
[2] https://lists.freedesktop.org/archives/systemd-devel/2014-July/020804.html
[3] https://github.com/CloudAutomationNTools/biosdevname/blob/c795d51dd93a5309652f0d635f12a3ecfabfaa72/src/eths.c#L38

v1->v2: replace a line instead of inserting and then removing.

Arseny Maslennikov (2):
  IB/ipoib: Use dev_port to expose network interface port numbers
  Documentation/ABI: document /sys/class/net/*/dev_port

 Documentation/ABI/testing/sysfs-class-net | 10 ++++++++++
 drivers/infiniband/ulp/ipoib/ipoib_main.c |  2 +-
 2 files changed, 11 insertions(+), 1 deletion(-)

-- 
2.18.0

^ permalink raw reply

* [PATCH v2 1/2] IB/ipoib: Use dev_port to expose network interface port numbers
From: Arseny Maslennikov @ 2018-08-30 18:22 UTC (permalink / raw)
  To: linux-rdma; +Cc: Arseny Maslennikov, Doug Ledford, Jason Gunthorpe, netdev
In-Reply-To: <20180830182238.16361-1-ar@cs.msu.ru>

Some InfiniBand network devices have multiple ports on the same PCI
function. This initializes the `dev_port' sysfs field of those
network interfaces with their port number.

Prior to this the kernel erroneously used the `dev_id' sysfs
field of those network interfaces to convey the port number to userspace.

The use of `dev_id' was considered correct until Linux 3.15,
when another field, `dev_port', was defined for this particular
purpose and `dev_id' was reserved for distinguishing stacked ifaces
(e.g: VLANs) with the same hardware address as their parent device.

Similar fixes to net/mlx4_en and many other drivers, which started
exporting this information through `dev_id' before 3.15, were accepted
into the kernel 4 years ago.
See 76a066f2a2a0 (`net/mlx4_en: Expose port number through sysfs').

Signed-off-by: Arseny Maslennikov <ar@cs.msu.ru>
---
 drivers/infiniband/ulp/ipoib/ipoib_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index e3d28f9ad9c0..ba16a63ee303 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1880,7 +1880,7 @@ static int ipoib_parent_init(struct net_device *ndev)
 	       sizeof(union ib_gid));
 
 	SET_NETDEV_DEV(priv->dev, priv->ca->dev.parent);
-	priv->dev->dev_id = priv->port - 1;
+	priv->dev->dev_port = priv->port - 1;
 
 	return 0;
 }
-- 
2.18.0

^ permalink raw reply related

* [PATCH net] ibmvnic: Include missing return code checks in reset function
From: Thomas Falcon @ 2018-08-30 18:19 UTC (permalink / raw)
  To: netdev; +Cc: Thomas Falcon

Check the return codes of these functions and halt reset
in case of failure. The driver will remain in a dormant state
until the next reset event, when device initialization will be
re-attempted.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index ffe7acb..d834308 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1841,11 +1841,17 @@ static int do_reset(struct ibmvnic_adapter *adapter,
 			adapter->map_id = 1;
 			release_rx_pools(adapter);
 			release_tx_pools(adapter);
-			init_rx_pools(netdev);
-			init_tx_pools(netdev);
+			rc = init_rx_pools(netdev);
+			if (rc)
+				return rc;
+			rc = init_tx_pools(netdev);
+			if (rc)
+				return rc;
 
 			release_napi(adapter);
-			init_napi(adapter);
+			rc = init_napi(adapter);
+			if (rc)
+				return rc;
 		} else {
 			rc = reset_tx_pools(adapter);
 			if (rc)
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH v4] 9p: Add refcount to p9_req_t
From: Tomas Bortoli @ 2018-08-30 22:20 UTC (permalink / raw)
  To: Dominique Martinet, Eric Van Hensbergen, Latchesar Ionkov
  Cc: v9fs-developer, netdev, linux-kernel, syzkaller,
	Dominique Martinet
In-Reply-To: <1535626341-20693-1-git-send-email-asmadeus@codewreck.org>

On 08/30/2018 12:52 PM, Dominique Martinet wrote:
> From: Tomas Bortoli <tomasbortoli@gmail.com>
> 
> To avoid use-after-free(s), use a refcount to keep track of the
> usable references to any instantiated struct p9_req_t.
> 
> This commit adds p9_req_put(), p9_req_get() and p9_req_try_get() as
> wrappers to kref_put(), kref_get() and kref_get_unless_zero().
> These are used by the client and the transports to keep track of
> valid requests' references.
> 
> p9_free_req() is added back and used as callback by kref_put().
> 
> Add SLAB_TYPESAFE_BY_RCU as it ensures that the memory freed by
> kmem_cache_free() will not be reused for another type until the rcu
> synchronisation period is over, so an address gotten under rcu read
> lock is safe to inc_ref() without corrupting random memory while
> the lock is held.
> 
> Co-developed-by: Dominique Martinet <dominique.martinet@cea.fr>
> Signed-off-by: Tomas Bortoli <tomasbortoli@gmail.com>
> Reported-by: syzbot+467050c1ce275af2a5b8@syzkaller.appspotmail.com
> Signed-off-by: Dominique Martinet <dominique.martinet@cea.fr>
> ---
> v3:
>  - add req put if virtio zc request fails
>  - add req put if cancelled callback is not defined for virtio
>  - (incorrectly) add req put in rdma cancelled callback
> 
> v4:
>  - removed rdma's cancelled callback put again
>  - changed the else if no cancelled callback into actually giving virtio
> a callback, xen does not need to call put in that case either because
> both functions rely on tag_lookup to find the request. trans_fd only
> needs to put in cancelled because it also keeps the req in a list around
> for cancel.
>  - add req put for trans xen's request(), I'm not sure why that one was
> missing either..
> 
> And with that I believe I am done testing all four transports.
> I'll do a second round of tests next week just to make sure, but it
> should be good enough™
> Sorry for the multiple iterations.
> 
>  include/net/9p/client.h | 14 ++++++++++
>  net/9p/client.c         | 57 ++++++++++++++++++++++++++++++++++++-----
>  net/9p/trans_fd.c       | 11 +++++++-
>  net/9p/trans_rdma.c     |  1 +
>  net/9p/trans_virtio.c   | 26 ++++++++++++++++---
>  net/9p/trans_xen.c      |  1 +
>  6 files changed, 98 insertions(+), 12 deletions(-)
> 
> diff --git a/include/net/9p/client.h b/include/net/9p/client.h
> index 735f3979d559..947a570307a6 100644
> --- a/include/net/9p/client.h
> +++ b/include/net/9p/client.h
> @@ -94,6 +94,7 @@ enum p9_req_status_t {
>  struct p9_req_t {
>  	int status;
>  	int t_err;
> +	struct kref refcount;
>  	wait_queue_head_t wq;
>  	struct p9_fcall tc;
>  	struct p9_fcall rc;
> @@ -233,6 +234,19 @@ int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status);
>  int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *fl);
>  void p9_fcall_fini(struct p9_fcall *fc);
>  struct p9_req_t *p9_tag_lookup(struct p9_client *, u16);
> +
> +static inline void p9_req_get(struct p9_req_t *r)
> +{
> +	kref_get(&r->refcount);
> +}
> +
> +static inline int p9_req_try_get(struct p9_req_t *r)
> +{
> +	return kref_get_unless_zero(&r->refcount);
> +}
> +
> +int p9_req_put(struct p9_req_t *r);
> +
>  void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status);
>  
>  int p9_parse_header(struct p9_fcall *, int32_t *, int8_t *, int16_t *, int);
> diff --git a/net/9p/client.c b/net/9p/client.c
> index 7942c0bfcc5b..aeeb6d8515d4 100644
> --- a/net/9p/client.c
> +++ b/net/9p/client.c
> @@ -310,6 +310,18 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size)
>  	if (tag < 0)
>  		goto free;
>  
> +	/* Init ref to two because in the general case there is one ref
> +	 * that is put asynchronously by a writer thread, one ref
> +	 * temporarily given by p9_tag_lookup and put by p9_client_cb
> +	 * in the recv thread, and one ref put by p9_tag_remove in the
> +	 * main thread. The only exception is virtio that does not use
> +	 * p9_tag_lookup but does not have a writer thread either
> +	 * (the write happens synchronously in the request/zc_request
> +	 * callback), so p9_client_cb eats the second ref there
> +	 * as the pointer is duplicated directly by virtqueue_add_sgs()
> +	 */
> +	refcount_set(&req->refcount.refcount, 2);
> +
>  	return req;
>  
>  free:
> @@ -333,10 +345,21 @@ struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag)
>  	struct p9_req_t *req;
>  
>  	rcu_read_lock();
> +again:
>  	req = idr_find(&c->reqs, tag);
> -	/* There's no refcount on the req; a malicious server could cause
> -	 * us to dereference a NULL pointer
> -	 */
> +	if (req) {
> +		/* We have to be careful with the req found under rcu_read_lock
> +		 * Thanks to SLAB_TYPESAFE_BY_RCU we can safely try to get the
> +		 * ref again without corrupting other data, then check again
> +		 * that the tag matches once we have the ref
> +		 */
> +		if (!p9_req_try_get(req))
> +			goto again;
> +		if (req->tc.tag != tag) {
> +			p9_req_put(req);
> +			goto again;
> +		}
> +	}
>  	rcu_read_unlock();
>  
>  	return req;
> @@ -350,7 +373,7 @@ EXPORT_SYMBOL(p9_tag_lookup);
>   *
>   * Context: Any context.
>   */
> -static void p9_tag_remove(struct p9_client *c, struct p9_req_t *r)
> +static int p9_tag_remove(struct p9_client *c, struct p9_req_t *r)
>  {
>  	unsigned long flags;
>  	u16 tag = r->tc.tag;
> @@ -359,11 +382,23 @@ static void p9_tag_remove(struct p9_client *c, struct p9_req_t *r)
>  	spin_lock_irqsave(&c->lock, flags);
>  	idr_remove(&c->reqs, tag);
>  	spin_unlock_irqrestore(&c->lock, flags);
> +	return p9_req_put(r);
> +}
> +
> +static void p9_req_free(struct kref *ref)
> +{
> +	struct p9_req_t *r = container_of(ref, struct p9_req_t, refcount);
>  	p9_fcall_fini(&r->tc);
>  	p9_fcall_fini(&r->rc);
>  	kmem_cache_free(p9_req_cache, r);
>  }
>  
> +int p9_req_put(struct p9_req_t *r)
> +{
> +	return kref_put(&r->refcount, p9_req_free);
> +}
> +EXPORT_SYMBOL(p9_req_put);
> +
>  /**
>   * p9_tag_cleanup - cleans up tags structure and reclaims resources
>   * @c:  v9fs client struct
> @@ -379,7 +414,9 @@ static void p9_tag_cleanup(struct p9_client *c)
>  	rcu_read_lock();
>  	idr_for_each_entry(&c->reqs, req, id) {
>  		pr_info("Tag %d still in use\n", id);
> -		p9_tag_remove(c, req);
> +		if (p9_tag_remove(c, req) == 0)
> +			pr_warn("Packet with tag %d has still references",
> +				req->tc.tag);
>  	}
>  	rcu_read_unlock();
>  }
> @@ -403,6 +440,7 @@ void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status)
>  
>  	wake_up(&req->wq);
>  	p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc.tag);
> +	p9_req_put(req);
>  }
>  EXPORT_SYMBOL(p9_client_cb);
>  
> @@ -643,9 +681,10 @@ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq)
>  	 * if we haven't received a response for oldreq,
>  	 * remove it from the list
>  	 */
> -	if (oldreq->status == REQ_STATUS_SENT)
> +	if (oldreq->status == REQ_STATUS_SENT) {
>  		if (c->trans_mod->cancelled)
>  			c->trans_mod->cancelled(c, oldreq);
> +	}
>  
>  	p9_tag_remove(c, req);
>  	return 0;
> @@ -682,6 +721,8 @@ static struct p9_req_t *p9_client_prepare_req(struct p9_client *c,
>  	return req;
>  reterr:
>  	p9_tag_remove(c, req);
> +	/* We have to put also the 2nd reference as it won't be used */
> +	p9_req_put(req);
>  	return ERR_PTR(err);
>  }
>  
> @@ -716,6 +757,8 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
>  
>  	err = c->trans_mod->request(c, req);
>  	if (err < 0) {
> +		/* write won't happen */
> +		p9_req_put(req);
>  		if (err != -ERESTARTSYS && err != -EFAULT)
>  			c->status = Disconnected;
>  		goto recalc_sigpending;
> @@ -2241,7 +2284,7 @@ EXPORT_SYMBOL(p9_client_readlink);
>  
>  int __init p9_client_init(void)
>  {
> -	p9_req_cache = KMEM_CACHE(p9_req_t, 0);
> +	p9_req_cache = KMEM_CACHE(p9_req_t, SLAB_TYPESAFE_BY_RCU);
>  	return p9_req_cache ? 0 : -ENOMEM;
>  }
>  
> diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
> index 20f46f13fe83..686e24e355d0 100644
> --- a/net/9p/trans_fd.c
> +++ b/net/9p/trans_fd.c
> @@ -132,6 +132,7 @@ struct p9_conn {
>  	struct list_head req_list;
>  	struct list_head unsent_req_list;
>  	struct p9_req_t *req;
> +	struct p9_req_t *wreq;
>  	char tmp_buf[7];
>  	struct p9_fcall rc;
>  	int wpos;
> @@ -383,6 +384,7 @@ static void p9_read_work(struct work_struct *work)
>  		m->rc.sdata = NULL;
>  		m->rc.offset = 0;
>  		m->rc.capacity = 0;
> +		p9_req_put(m->req);
>  		m->req = NULL;
>  	}
>  
> @@ -472,6 +474,8 @@ static void p9_write_work(struct work_struct *work)
>  		m->wbuf = req->tc.sdata;
>  		m->wsize = req->tc.size;
>  		m->wpos = 0;
> +		p9_req_get(req);
> +		m->wreq = req;
>  		spin_unlock(&m->client->lock);
>  	}
>  
> @@ -492,8 +496,11 @@ static void p9_write_work(struct work_struct *work)
>  	}
>  
>  	m->wpos += err;
> -	if (m->wpos == m->wsize)
> +	if (m->wpos == m->wsize) {
>  		m->wpos = m->wsize = 0;
> +		p9_req_put(m->wreq);
> +		m->wreq = NULL;
> +	}
>  
>  end_clear:
>  	clear_bit(Wworksched, &m->wsched);
> @@ -694,6 +701,7 @@ static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req)
>  	if (req->status == REQ_STATUS_UNSENT) {
>  		list_del(&req->req_list);
>  		req->status = REQ_STATUS_FLSHD;
> +		p9_req_put(req);
>  		ret = 0;
>  	}
>  	spin_unlock(&client->lock);
> @@ -711,6 +719,7 @@ static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req)
>  	spin_lock(&client->lock);
>  	list_del(&req->req_list);
>  	spin_unlock(&client->lock);
> +	p9_req_put(req);
>  
>  	return 0;
>  }
> diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
> index 5b0cda1aaa7a..9cc9b3a19ee7 100644
> --- a/net/9p/trans_rdma.c
> +++ b/net/9p/trans_rdma.c
> @@ -365,6 +365,7 @@ send_done(struct ib_cq *cq, struct ib_wc *wc)
>  			    c->busa, c->req->tc.size,
>  			    DMA_TO_DEVICE);
>  	up(&rdma->sq_sem);
> +	p9_req_put(c->req);
>  	kfree(c);
>  }
>  
> diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
> index 3dd6ce1c0f2d..eb596c2ed546 100644
> --- a/net/9p/trans_virtio.c
> +++ b/net/9p/trans_virtio.c
> @@ -207,6 +207,13 @@ static int p9_virtio_cancel(struct p9_client *client, struct p9_req_t *req)
>  	return 1;
>  }
>  
> +/* Reply won't come, so drop req ref */
> +static int p9_virtio_cancelled(struct p9_client *client, struct p9_req_t *req)
> +{
> +	p9_req_put(req);
> +	return 0;
> +}
> +
>  /**
>   * pack_sg_list_p - Just like pack_sg_list. Instead of taking a buffer,
>   * this takes a list of pages.
> @@ -404,6 +411,7 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
>  	struct scatterlist *sgs[4];
>  	size_t offs;
>  	int need_drop = 0;
> +	int kicked = 0;
>  
>  	p9_debug(P9_DEBUG_TRANS, "virtio request\n");
>  
> @@ -411,8 +419,10 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
>  		__le32 sz;
>  		int n = p9_get_mapped_pages(chan, &out_pages, uodata,
>  					    outlen, &offs, &need_drop);
> -		if (n < 0)
> -			return n;
> +		if (n < 0) {
> +			err = n;
> +			goto err_out;
> +		}
>  		out_nr_pages = DIV_ROUND_UP(n + offs, PAGE_SIZE);
>  		if (n != outlen) {
>  			__le32 v = cpu_to_le32(n);
> @@ -428,8 +438,10 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
>  	} else if (uidata) {
>  		int n = p9_get_mapped_pages(chan, &in_pages, uidata,
>  					    inlen, &offs, &need_drop);
> -		if (n < 0)
> -			return n;
> +		if (n < 0) {
> +			err = n;
> +			goto err_out;
> +		}
>  		in_nr_pages = DIV_ROUND_UP(n + offs, PAGE_SIZE);
>  		if (n != inlen) {
>  			__le32 v = cpu_to_le32(n);
> @@ -498,6 +510,7 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
>  	}
>  	virtqueue_kick(chan->vq);
>  	spin_unlock_irqrestore(&chan->lock, flags);
> +	kicked = 1;
>  	p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n");
>  	err = wait_event_killable(req->wq, req->status >= REQ_STATUS_RCVD);
>  	/*
> @@ -518,6 +531,10 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
>  	}
>  	kvfree(in_pages);
>  	kvfree(out_pages);
> +	if (!kicked) {
> +		/* reply won't come */
> +		p9_req_put(req);
> +	}
>  	return err;
>  }
>  
> @@ -750,6 +767,7 @@ static struct p9_trans_module p9_virtio_trans = {
>  	.request = p9_virtio_request,
>  	.zc_request = p9_virtio_zc_request,
>  	.cancel = p9_virtio_cancel,
> +	.cancelled = p9_virtio_cancelled,
>  	/*
>  	 * We leave one entry for input and one entry for response
>  	 * headers. We also skip one more entry to accomodate, address
> diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c
> index 782a07f2ad0c..e2fbf3677b9b 100644
> --- a/net/9p/trans_xen.c
> +++ b/net/9p/trans_xen.c
> @@ -185,6 +185,7 @@ static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req)
>  	ring->intf->out_prod = prod;
>  	spin_unlock_irqrestore(&ring->lock, flags);
>  	notify_remote_via_irq(ring->irq);
> +	p9_req_put(p9_req);
>  
>  	return 0;
>  }
> 

LGTM, thanks Dominique!

Tomas

^ permalink raw reply

* Re: [PATCH v2 iproute2-next 0/3] support delivering packets in
From: David Ahern @ 2018-08-30 18:10 UTC (permalink / raw)
  To: Yousuk Seung, netdev; +Cc: Stephen Hemminger, Michael McLennan, Priyaranjan Jha
In-Reply-To: <20180827024230.246445-1-ysseung@google.com>

On 8/26/18 8:42 PM, Yousuk Seung wrote:
> This series adds support for the new "slot" netem parameter for
> slotting. Slotting is an approximation of shared media that gather up
> packets within a varying delay window before delivering them nearly at
> once.
> 
> Dave Taht (2):
>   tc: support conversions to or from 64 bit nanosecond-based time
>   q_netem: support delivering packets in delayed time slots
> 
> Yousuk Seung (1):
>   q_netem: slotting with non-uniform distribution
> 
>  include/utils.h     |  12 +++++
>  lib/utils.c         | 104 +++++++++++++++++++++++++++++++++++++++
>  man/man8/tc-netem.8 |  40 ++++++++++++++-
>  tc/q_netem.c        | 115 +++++++++++++++++++++++++++++++++++++++++++-
>  tc/tc_cbq.c         |   1 +
>  tc/tc_core.c        |   1 +
>  tc/tc_core.h        |   2 -
>  tc/tc_estimator.c   |   1 +
>  tc/tc_util.c        |  46 ------------------
>  tc/tc_util.h        |   3 --
>  10 files changed, 272 insertions(+), 53 deletions(-)
> 

applied to iproute2-next after fixing up a whitespace issue and 2
checkpatch errors in patch 2.

^ permalink raw reply

* Re: [PATCH net-next 1/3] net: nixge: Add support for fixed-link subnodes
From: Moritz Fischer @ 2018-08-30 17:21 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: David S. Miller, Kees Cook, Florian Fainelli,
	Linux Kernel Mailing List, netdev, Alex Williams
In-Reply-To: <20180830030420.GB16896@lunn.ch>

Hi Andrew,

On Wed, Aug 29, 2018 at 8:04 PM, Andrew Lunn <andrew@lunn.ch> wrote:
> On Wed, Aug 29, 2018 at 05:40:44PM -0700, Moritz Fischer wrote:
>> Add support for fixed-link cases where no MDIO is
>> actually required to run the device.
>> In that case no MDIO bus is instantiated since the
>> actual registers are not available in hardware.
>
> Hi Moritz
>
> There are a few different use cases here:
>
> The hardware is missing MDIO - You need fixed-link.

Agreed.
>
> The hardware has MDIO, but you don't have a PHY connected on it, and
> use fixed link.

Since it's an FPGA design in that case we'd probably build the hardware without
MDIO to save resources.

> The hardware has MDIO, and it is used e.g. for an Ethernet switch, or
> a PHY for another Ethernet interface. Plus you need fixed link.
We haven't had that yet but I can see that happen.
>
> The binding typically looks like:
>
> &fec1 {
>         phy-mode = "rmii";
>         pinctrl-names = "default";
>         pinctrl-0 = <&pinctrl_fec1>;
>         status = "okay";
>
>         fixed-link {
>                 speed = <100>;
>                 full-duplex;
>         };
>
>         mdio1: mdio {
>                 #address-cells = <1>;
>                 #size-cells = <0>;
>                 status = "okay";
>
>                 switch0: switch0@0 {
>                         compatible = "marvell,mv88e6085";
>                         pinctrl-names = "default";
>                         pinctrl-0 = <&pinctrl_switch>;
>                         reg = <0>;
>                         eeprom-length = <512>;
>                         interrupt-parent = <&gpio3>;
>
> It is important you have the mdio subnode, with PHYs and switches as
> children. The driver currently gets this wrong, it uses
> pdev->dev.of_node.

Oh, whoops. Yeah I should look into that. Any good examples of drivers doing
it right? Is the one going with the DT snippet above a good example?
>
> So the first patch should be to extend this behaviour. Look for a
> child node called mdio. If it exists, call nixge_mdio_setup() passing
> that child. Otherwise continue using pdev->dev.of_node, so you don't
> break backwards compatibility.

Ok will do.
>
> Then a patch adding support for fixed-link. If the mdio child node
> exists, you still need to register the MDIO bus. If there is no child
> node, but there is a fixed-link, skip registering the mdio bus with
> pdev->dev.of_node.
>
>         Andrew

Thanks for your feedback, much appreciated!

Moritz

^ permalink raw reply

* RE: [PATCH] i40e: mark expected switch fall-through
From: Kirsher, Jeffrey T @ 2018-08-30 21:09 UTC (permalink / raw)
  To: Gustavo A. R. Silva, David S. Miller
  Cc: intel-wired-lan@lists.osuosl.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <20180830185019.GA30328@embeddedor.com>

> -----Original Message-----
> From: Gustavo A. R. Silva [mailto:gustavo@embeddedor.com]
> Sent: Thursday, August 30, 2018 11:50
> To: Kirsher, Jeffrey T <jeffrey.t.kirsher@intel.com>; David S. Miller
> <davem@davemloft.net>
> Cc: intel-wired-lan@lists.osuosl.org; netdev@vger.kernel.org; linux-
> kernel@vger.kernel.org; Gustavo A. R. Silva <gustavo@embeddedor.com>
> Subject: [PATCH] i40e: mark expected switch fall-through
> 
> In preparation to enabling -Wimplicit-fallthrough, mark switch cases where
> we are expecting to fall through.
> 
> Addresses-Coverity-ID: 1473099 ("Missing break in switch")
> Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
> ---
>  drivers/net/ethernet/intel/i40e/i40e_xsk.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)

I have picked this up Dave.

^ permalink raw reply

* Re: [PATCH 1/2] dt-bindings: net: cpsw: Document cpsw-phy-sel usage but prefer phandle
From: Grygorii Strashko @ 2018-08-30 17:04 UTC (permalink / raw)
  To: Tony Lindgren
  Cc: David Miller, netdev, linux-omap, devicetree, Andrew Lunn,
	Ivan Khoronzhuk, Mark Rutland, Murali Karicheri, Rob Herring
In-Reply-To: <20180830004745.GU7523@atomide.com>



On 08/29/2018 07:47 PM, Tony Lindgren wrote:
> * Grygorii Strashko <grygorii.strashko@ti.com> [180830 00:12]:
>> Hi Tony,
>>
>> On 08/29/2018 10:00 AM, Tony Lindgren wrote:
>>> The current cpsw usage for cpsw-phy-sel is undocumented but is used for
>>> all the boards using cpsw. And cpsw-phy-sel is not really a child of
>>> the cpsw device, it lives in the system control module instead.
>>>
>>> Let's document the existing usage, and improve it a bit where we prefer
>>> to use a phandle instead of a child device for it. That way we can
>>> properly describe the hardware in dts files for things like genpd.
>>
>> I'm ok with this series, but I really don't like cpsw-phy-sel in general.
> 
> Yeah this binding predates any standards. This series
> only fixes the nasty issue of cpsw claiming a module as a
> child that's outside its IO range.
> 
>> It was introduced long time back and now I'm thinking about possibility to replace it with
>> one of current generic interfaces - for example mux-controller.
>> Each port will control up to 3 muxes (port mode, idmode and rmii_ext_clk) and
>> transform phy-mode => mux states.
>> What do you think?
> 
> Sure a mux-controller here makes sense.
> 
>> Another option is to use phy, but it'd be complicated.
> 
> For the port muxes, how about a phy driver just using
> a pinctrl driver?
> 
> In general, it seems cpsw is just an interconnect instance
> (L4_FAST) with a control module (CPSW_WR) and a pile of
> independent other modules. That's described nicely in
> am437x TRM chapter "2.1.4 L4 Fast Peripheral Memory Map".
> So from that point of view the binding reg entries right
> now are all wrong :)

TRM not consistent - for am5 it's one MMIO region.

> 
> In the long run cpsw should be really treated as an
> interconnect instance with its control module providing
> standard Linux framework services such as clock /
> regulator / phy / pinctrl / iio whatever for the other
> modules.
> 
> Just my 2c based on looking at the interconnect, I'm
> not too familiar with cpsw otherwise.

It's not separate modules. This is a composite module which has only one
fck/ick, and most of the blocks can't even function without each other.
Above might be the case for Keystone 2, but not omap CPSW.
Keystone 2 - has packet processor, security accelerator, queue manager 
in addition to its basic switch block.

-- 
regards,
-grygorii

^ permalink raw reply

* Re: KMSAN: uninit-value in rds_bind
From: Santosh Shilimkar @ 2018-08-30 20:56 UTC (permalink / raw)
  To: syzbot, linux-rdma, netdev, syzkaller-bugs; +Cc: davem, linux-kernel
In-Reply-To: <000000000000010a4d0574ab4743@google.com>

On 8/30/2018 11:31 AM, syzbot wrote:
> Hello,
> 
> syzbot found the following crash on:
> 
> HEAD commit:    2dca2cbde67a kmsan: fix build warnings with CONFIG_KMSAN=n
> git tree:       https://github.com/google/kmsan.git/master

BTW, can you please fix your git url since this one doesn't work.
This tree is not a vanilla kernel.org tree (4.19.0-rc1+ #36), so it would be
good to get the line numbers correct for sources.

Regards,
Santosh

^ permalink raw reply

* Re: [PATCH 2/4] r8169: Get and enable optional ether_clk clock
From: Stephen Boyd @ 2018-08-30 16:48 UTC (permalink / raw)
  To: David S . Miller, Andy Shevchenko, Hans de Goede, Heiner Kallweit,
	Irina Tirdea, Michael Turquette
  Cc: netdev, Johannes Stezenbach, Carlo Caione, linux-clk
In-Reply-To: <8a424470-9c57-9b95-9d41-3ea51d3f2629@redhat.com>

Quoting Hans de Goede (2018-08-29 10:09:57)
> Hi,
> 
> On 27-08-18 21:14, Stephen Boyd wrote:
> > Quoting Hans de Goede (2018-08-27 11:53:19)
> >> On 27-08-18 20:47, Stephen Boyd wrote:
> >>> How would you know that a clk device driver hasn't probed yet and isn't
> >>> the driver that's actually providing the clk to this device on x86
> >>> systems? With DT systems we can figure that out by looking at the DT and
> >>> seeing if the device driver requesting the clk has the clocks property.
> >>> On x86 systems it's all clkdev which doesn't really lend itself to
> >>> solving this problem.
> >>
> >> Right on x86 the assumption is that the clk driver will be builtin and
> >> will probe before the consumer. In this case that is true as the
> >> pmc-atom-clk driver can only be builtin and its platform device is
> >> instantiated from the acpi_lpss code and acpi init happens before
> >> the PCI bus is scanned.
> > 
> > If we can go with this assumption then we can make the optional clk API
> > work even on clkdev based systems. Maybe if x86 had some way of
> > indicating that all builtin clks are registered?
> 
> Unfortunately there is no such thing I'm afraid.

Ugh!

> 
> > That might work but
> > it's not very clean. Or if we could check to see if we're running on an
> > ACPI based system in clkdev we could use that to assume that clk_get()
> > will only be called after all providers have registered their lookups.
> 
> Yes some check for x86 + ACPI (ARM also uses ACPI, but there we
> should not do this AFAICT) is probably best. That or not use the
> new optional clk API on x86, but that means that any cross platform
> driver cannot use it, which would be a pain.

Right. The optional clk API will be not so great until we can get ACPI
to move away from clkdev.

> 
> BTW does your Acked-by indicate you are ok with merging this series
> through the netdev tree as I suggested in the cover-letter? If so
> can I also add your Acked-by to the 3rd patch?
> 

Yep, I thought I did that but now I've really done it.

^ permalink raw reply

* Re: [PATCH net-next 1/2] netlink: ipv4 IGMP join notifications
From: Patrick Ruddy @ 2018-08-30 16:44 UTC (permalink / raw)
  To: netdev; +Cc: roopa, jiri, stephen
In-Reply-To: <20180830093545.29465-2-pruddy@vyatta.att-mail.com>

Don't know what happened to the 0/2 cover for this series so here it
is:

This patch is an update to https://patchwork.ozlabs.org/patch/571127/.
The previous patch was based on sending multicast MAC addresses in the
netlink messages to allow the programming of hardware. It was agreed to
rework this to use RTM_NEW/DELLINK messages which were more appropriate
for layer 2 addresses.
In the interim period it has become apparent that the applications
actually need to see the L3 multicast addresses which are joined for
FORUS processing so this patch has been reworked to send the L3
multicast addresses using RTM_NEW/DELADDR.
These new multicast L3 netlink notifications should use the
IFA_MULTICAST address type but this has been dropped in favour of
IFA_ADDRESS as during testing it was noticed that some applications -
notably getaddrinfo in libc6 - assume that there is an IFA_ADDRESS in
a RTM_NEW/DELADDR and blindly dereference it.
Finally the RTM_GETADDR for both address families has been modified to
include the multicast L3 addresses.

Patrick Ruddy (2):
  netlink: ipv4 IGMP join notifications
  netlink: ipv6 MLD join notifications

 include/linux/igmp.h |  2 +
 net/ipv4/devinet.c   | 39 +++++++++++++------
 net/ipv4/igmp.c      | 90 ++++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/addrconf.c  | 44 ++++++++++++++++------
 net/ipv6/mcast.c     | 66 ++++++++++++++++++++++++++++++++
 5 files changed, 218 insertions(+), 23 deletions(-)

-- 
2.17.1

On Thu, 2018-08-30 at 10:35 +0100, Patrick Ruddy wrote:
> Some userspace applications need to know about IGMP joins from the kernel
> for 2 reasons
> 1. To allow the programming of multicast MAC filters in hardware
> 2. To form a multicast FORUS list for non link-local multicast
>    groups to be sent to the kernel and from there to the interested
>    party.
> (1) can be fulfilled by simply sending the hardware multicast MAC
> address to be programmed but (2) requires the L3 address to be sent
> since this cannot be constructed from the MAC address whereas the
> reverse translation is a standard library function.
> 
> This commit provides addition and deletion of multicast addresses
> using the RTM_NEWADDR and RTM_DELADDR messages. It also provides
> the RTM_GETADDR extension to allow multicast join state to be read
> from the kernel.
> 
> Signed-off-by: Patrick Ruddy <pruddy@vyatta.att-mail.com>
> ---
>  include/linux/igmp.h |  2 +
>  net/ipv4/devinet.c   | 39 +++++++++++++------
>  net/ipv4/igmp.c      | 90 ++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 120 insertions(+), 11 deletions(-)
> 
> diff --git a/include/linux/igmp.h b/include/linux/igmp.h
> index 119f53941c12..1fb417865e7d 100644
> --- a/include/linux/igmp.h
> +++ b/include/linux/igmp.h
> @@ -130,6 +130,8 @@ extern void ip_mc_unmap(struct in_device *);
>  extern void ip_mc_remap(struct in_device *);
>  extern void ip_mc_dec_group(struct in_device *in_dev, __be32 addr);
>  extern void ip_mc_inc_group(struct in_device *in_dev, __be32 addr);
> +extern int ip_mc_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb,
> +			     struct net_device *dev);
>  int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed);
>  
>  #endif
> diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
> index ea4bd8a52422..42f7dcc4fb5e 100644
> --- a/net/ipv4/devinet.c
> +++ b/net/ipv4/devinet.c
> @@ -57,6 +57,7 @@
>  #endif
>  #include <linux/kmod.h>
>  #include <linux/netconf.h>
> +#include <linux/igmp.h>
>  
>  #include <net/arp.h>
>  #include <net/ip.h>
> @@ -1651,6 +1652,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
>  	int h, s_h;
>  	int idx, s_idx;
>  	int ip_idx, s_ip_idx;
> +	int multicast, mcast_idx;
>  	struct net_device *dev;
>  	struct in_device *in_dev;
>  	struct in_ifaddr *ifa;
> @@ -1659,6 +1661,8 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
>  	s_h = cb->args[0];
>  	s_idx = idx = cb->args[1];
>  	s_ip_idx = ip_idx = cb->args[2];
> +	multicast = cb->args[3];
> +	mcast_idx = cb->args[4];
>  
>  	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
>  		idx = 0;
> @@ -1675,18 +1679,29 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
>  			if (!in_dev)
>  				goto cont;
>  
> -			for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
> -			     ifa = ifa->ifa_next, ip_idx++) {
> -				if (ip_idx < s_ip_idx)
> -					continue;
> -				if (inet_fill_ifaddr(skb, ifa,
> -					     NETLINK_CB(cb->skb).portid,
> -					     cb->nlh->nlmsg_seq,
> -					     RTM_NEWADDR, NLM_F_MULTI) < 0) {
> -					rcu_read_unlock();
> -					goto done;
> +			if (!multicast) {
> +				for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
> +				     ifa = ifa->ifa_next, ip_idx++) {
> +					if (ip_idx < s_ip_idx)
> +						continue;
> +					if (inet_fill_ifaddr(skb, ifa,
> +							     NETLINK_CB(cb->skb).portid,
> +							     cb->nlh->nlmsg_seq,
> +							     RTM_NEWADDR,
> +							     NLM_F_MULTI) < 0) {
> +						rcu_read_unlock();
> +						goto done;
> +					}
> +					nl_dump_check_consistent(cb,
> +								 nlmsg_hdr(skb));
>  				}
> -				nl_dump_check_consistent(cb, nlmsg_hdr(skb));
> +				/* set for multicast loop */
> +				multicast++;
> +			}
> +			/* loop over multicast addresses */
> +			if (ip_mc_dump_ifaddr(skb, cb, dev) < 0) {
> +				rcu_read_unlock();
> +				goto done;
>  			}
>  cont:
>  			idx++;
> @@ -1698,6 +1713,8 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
>  	cb->args[0] = h;
>  	cb->args[1] = idx;
>  	cb->args[2] = ip_idx;
> +	cb->args[3] = multicast;
> +	cb->args[4] = mcast_idx;
>  
>  	return skb->len;
>  }
> diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
> index cf75f8944b05..c9bbd1d27124 100644
> --- a/net/ipv4/igmp.c
> +++ b/net/ipv4/igmp.c
> @@ -86,6 +86,7 @@
>  #include <linux/inetdevice.h>
>  #include <linux/igmp.h>
>  #include <linux/if_arp.h>
> +#include <net/netlink.h>
>  #include <linux/rtnetlink.h>
>  #include <linux/times.h>
>  #include <linux/pkt_sched.h>
> @@ -1384,6 +1385,91 @@ static void ip_mc_hash_remove(struct in_device *in_dev,
>  }
>  
>  
> +static int fill_addr(struct sk_buff *skb, struct net_device *dev, __be32 addr,
> +		     int type, unsigned int flags)
> +{
> +	struct nlmsghdr *nlh;
> +	struct ifaddrmsg *ifm;
> +
> +	nlh = nlmsg_put(skb, 0, 0, type, sizeof(*ifm), flags);
> +	if (!nlh)
> +		return -EMSGSIZE;
> +
> +	ifm = nlmsg_data(nlh);
> +	ifm->ifa_family = AF_INET;
> +	ifm->ifa_prefixlen = 32;
> +	ifm->ifa_flags = IFA_F_PERMANENT;
> +	ifm->ifa_scope = RT_SCOPE_LINK;
> +	ifm->ifa_index = dev->ifindex;
> +
> +	if (nla_put_in_addr(skb, IFA_ADDRESS, addr))
> +		goto nla_put_failure;
> +	nlmsg_end(skb, nlh);
> +	return 0;
> +
> +nla_put_failure:
> +	nlmsg_cancel(skb, nlh);
> +	return -EMSGSIZE;
> +}
> +
> +static inline size_t addr_nlmsg_size(void)
> +{
> +	return NLMSG_ALIGN(sizeof(struct ifaddrmsg))
> +		+ nla_total_size(sizeof(__be32));
> +}
> +
> +static void ip_mc_addr_notify(struct net_device *dev, __be32 addr, int type)
> +{
> +	struct net *net = dev_net(dev);
> +	struct sk_buff *skb;
> +	int err = -ENOBUFS;
> +
> +	skb = nlmsg_new(addr_nlmsg_size(), GFP_ATOMIC);
> +	if (!skb)
> +		goto errout;
> +
> +	err = fill_addr(skb, dev, addr, type, 0);
> +	if (err < 0) {
> +		WARN_ON(err == -EMSGSIZE);
> +		kfree_skb(skb);
> +		goto errout;
> +	}
> +	rtnl_notify(skb, net, 0, RTNLGRP_IPV4_IFADDR, NULL, GFP_ATOMIC);
> +	return;
> +errout:
> +	if (err < 0)
> +		rtnl_set_sk_err(net, RTNLGRP_LINK, err);
> +}
> +
> +int ip_mc_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb,
> +		      struct net_device *dev)
> +{
> +	int s_idx;
> +	int idx = 0;
> +	struct ip_mc_list *im;
> +	struct in_device *in_dev;
> +
> +	ASSERT_RTNL();
> +
> +	s_idx = cb->args[4];
> +	in_dev = __in_dev_get_rtnl(dev);
> +
> +	for_each_pmc_rtnl(in_dev, im) {
> +		if (idx < s_idx)
> +			continue;
> +		if (fill_addr(skb, dev, im->multiaddr, RTM_NEWADDR,
> +			      NLM_F_MULTI) < 0)
> +			goto done;
> +		nl_dump_check_consistent(cb, nlmsg_hdr(skb));
> +		idx++;
> +	}
> +
> + done:
> +	cb->args[4] = idx;
> +
> +	return skb->len;
> +}
> +
>  /*
>   *	A socket has joined a multicast group on device dev.
>   */
> @@ -1433,6 +1519,8 @@ static void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr,
>  	igmpv3_del_delrec(in_dev, im);
>  #endif
>  	igmp_group_added(im);
> +
> +	ip_mc_addr_notify(in_dev->dev, addr, RTM_NEWADDR);
>  	if (!in_dev->dead)
>  		ip_rt_multicast_event(in_dev);
>  out:
> @@ -1664,6 +1752,8 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
>  				in_dev->mc_count--;
>  				igmp_group_dropped(i);
>  				ip_mc_clear_src(i);
> +				ip_mc_addr_notify(in_dev->dev, addr,
> +						  RTM_DELADDR);
>  
>  				if (!in_dev->dead)
>  					ip_rt_multicast_event(in_dev);

^ permalink raw reply

* Re: [PATCH net-next 2/3] net: nixge: Add support for having nixge as subdevice
From: Moritz Fischer @ 2018-08-30 16:39 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: David S. Miller, Kees Cook, Florian Fainelli,
	Linux Kernel Mailing List, netdev, Alex Williams
In-Reply-To: <20180830031110.GC16896@lunn.ch>

Hi Andrew,

On Wed, Aug 29, 2018 at 8:11 PM, Andrew Lunn <andrew@lunn.ch> wrote:

> Could you tell us more about the parent device. I'm guessing PCIe.  Is
> it x86 so no device tree? Are there cases where it does not have a PHY
> connected? What is connected instead? SFP? A switch? Can there be
> multiple PHYs on the MDIO bus?

The device is part of a larger FPGA design. One use case that I was trying
to support with this patch is PCIe with x86 (hopefully on its own PF...)
Since the whole design isn't completely done, these are the use cases I
see upcoming and current:

ARM(64):
a) DT: PHY over MDIO (current use case), fixed-link with GPIO (coming)
b) DT: SFP (potentially coming)

x86:
a) no PHY (coming)-> fixed-link with GPIO
b) SFP (potentially), PHY over MDIO (potentially)

Thanks for your help,

Moritz

^ permalink raw reply

* Re: [PATCH net] ipv6: don't get lwtstate twice in ip6_rt_copy_init()
From: David Ahern @ 2018-08-30 16:10 UTC (permalink / raw)
  To: Alexey Kodanev, netdev; +Cc: David Miller
In-Reply-To: <1535645484-30629-1-git-send-email-alexey.kodanev@oracle.com>

On 8/30/18 10:11 AM, Alexey Kodanev wrote:
> Commit 80f1a0f4e0cd ("net/ipv6: Put lwtstate when destroying fib6_info")
> partially fixed the kmemleak [1], lwtstate can be copied from fib6_info,
> with ip6_rt_copy_init(), and it should be done only once there.
> 
> rt->dst.lwtstate is set by ip6_rt_init_dst(), at the start of the function
> ip6_rt_copy_init(), so there is no need to get it again at the end.
> 
> With this patch, lwtstate also isn't copied from RTF_REJECT routes.

Those should not have lwtstate set.

> 
> [1]:
> unreferenced object 0xffff880b6aaa14e0 (size 64):
>   comm "ip", pid 10577, jiffies 4295149341 (age 1273.903s)
>   hex dump (first 32 bytes):
>     01 00 04 00 04 00 00 00 10 00 00 00 00 00 00 00  ................
>     00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
>   backtrace:
>     [<0000000018664623>] lwtunnel_build_state+0x1bc/0x420
>     [<00000000b73aa29a>] ip6_route_info_create+0x9f7/0x1fd0
>     [<00000000ee2c5d1f>] ip6_route_add+0x14/0x70
>     [<000000008537b55c>] inet6_rtm_newroute+0xd9/0xe0
>     [<000000002acc50f5>] rtnetlink_rcv_msg+0x66f/0x8e0
>     [<000000008d9cd381>] netlink_rcv_skb+0x268/0x3b0
>     [<000000004c893c76>] netlink_unicast+0x417/0x5a0
>     [<00000000f2ab1afb>] netlink_sendmsg+0x70b/0xc30
>     [<00000000890ff0aa>] sock_sendmsg+0xb1/0xf0
>     [<00000000a2e7b66f>] ___sys_sendmsg+0x659/0x950
>     [<000000001e7426c8>] __sys_sendmsg+0xde/0x170
>     [<00000000fe411443>] do_syscall_64+0x9f/0x4a0
>     [<000000001be7b28b>] entry_SYSCALL_64_after_hwframe+0x49/0xbe
>     [<000000006d21f353>] 0xffffffffffffffff

What test did you run to uncover this? Curious as to why my testing that
found the need for 80f1a0f4e0cd did not hit this.

> 
> Fixes: 6edb3c96a5f0 ("net/ipv6: Defer initialization of dst to data path")
> Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
> ---
>  net/ipv6/route.c | 1 -
>  1 file changed, 1 deletion(-)
> 
> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index 8e08a91..9f27ada 100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
> @@ -996,7 +996,6 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
>  	rt->rt6i_src = ort->fib6_src;
>  #endif
>  	rt->rt6i_prefsrc = ort->fib6_prefsrc;
> -	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
>  }
>  
>  static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
> 

Thanks for the patch.

Reviewed-by: David Ahern <dsahern@gmail.com>

^ permalink raw reply

* Re: [PATCH net-next 1/3] net: rework SIOCGSTAMP ioctl handling
From: Willem de Bruijn @ 2018-08-30 20:09 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Network Development, David Miller, linux-arch, y2038 Mailman List,
	Eric Dumazet, Willem de Bruijn, LKML, linux-hams, linux-bluetooth,
	linux-can, dccp, linux-wpan, linux-sctp, linux-x25
In-Reply-To: <20180829130308.3504560-1-arnd@arndb.de>

On Wed, Aug 29, 2018 at 9:05 AM Arnd Bergmann <arnd@arndb.de> wrote:
>
> The SIOCGSTAMP/SIOCGSTAMPNS ioctl commands are implemented by many
> socket protocol handlers, and all of those end up calling the same
> sock_get_timestamp()/sock_get_timestampns() helper functions, which
> results in a lot of duplicate code.
>
> With the introduction of 64-bit time_t on 32-bit architectures, this
> gets worse, as we then need four different ioctl commands in each
> socket protocol implementation.
>
> To simplify that, let's add a new .gettstamp() operation in
> struct proto_ops, and move ioctl implementation into the common
> sock_ioctl()/compat_sock_ioctl_trans() functions that these all go
> through.
>
> We can reuse the sock_get_timestamp() implementation, but generalize
> it so it can deal with both native and compat mode, as well as
> timeval and timespec structures.
>
> Signed-off-by: Arnd Bergmann <arnd@arndb.de>

This also will simplify fixing a recently reported race condition with
sock_get_timestamp [1]. That calls sock_enable_timestamp, which
modifies sk->sk_flags, without taking the socket lock. Currently some
callers of sock_get_timestamp hold the lock (ax25, netrom, qrtr), many
don't. See also how this patch removes the lock_sock in the netrom
case. Moving the call to sock_gettstamp outside the protocol handlers
will allow taking the lock inside the function.

If this is the only valid implementation of .gettstamp, the indirect
call could be avoided in favor of a simple branch.

Thanks,

Acked-by: Willem de Bruijn <willemb@google.com>

[1] http://lkml.kernel.org/r/20180518080308.GA28587@dragonet.kaist.ac.kr

^ permalink raw reply

* [PATCH net] ipv6: don't get lwtstate twice in ip6_rt_copy_init()
From: Alexey Kodanev @ 2018-08-30 16:11 UTC (permalink / raw)
  To: netdev; +Cc: David Ahern, David Miller, Alexey Kodanev

Commit 80f1a0f4e0cd ("net/ipv6: Put lwtstate when destroying fib6_info")
partially fixed the kmemleak [1], lwtstate can be copied from fib6_info,
with ip6_rt_copy_init(), and it should be done only once there.

rt->dst.lwtstate is set by ip6_rt_init_dst(), at the start of the function
ip6_rt_copy_init(), so there is no need to get it again at the end.

With this patch, lwtstate also isn't copied from RTF_REJECT routes.

[1]:
unreferenced object 0xffff880b6aaa14e0 (size 64):
  comm "ip", pid 10577, jiffies 4295149341 (age 1273.903s)
  hex dump (first 32 bytes):
    01 00 04 00 04 00 00 00 10 00 00 00 00 00 00 00  ................
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  backtrace:
    [<0000000018664623>] lwtunnel_build_state+0x1bc/0x420
    [<00000000b73aa29a>] ip6_route_info_create+0x9f7/0x1fd0
    [<00000000ee2c5d1f>] ip6_route_add+0x14/0x70
    [<000000008537b55c>] inet6_rtm_newroute+0xd9/0xe0
    [<000000002acc50f5>] rtnetlink_rcv_msg+0x66f/0x8e0
    [<000000008d9cd381>] netlink_rcv_skb+0x268/0x3b0
    [<000000004c893c76>] netlink_unicast+0x417/0x5a0
    [<00000000f2ab1afb>] netlink_sendmsg+0x70b/0xc30
    [<00000000890ff0aa>] sock_sendmsg+0xb1/0xf0
    [<00000000a2e7b66f>] ___sys_sendmsg+0x659/0x950
    [<000000001e7426c8>] __sys_sendmsg+0xde/0x170
    [<00000000fe411443>] do_syscall_64+0x9f/0x4a0
    [<000000001be7b28b>] entry_SYSCALL_64_after_hwframe+0x49/0xbe
    [<000000006d21f353>] 0xffffffffffffffff

Fixes: 6edb3c96a5f0 ("net/ipv6: Defer initialization of dst to data path")
Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
---
 net/ipv6/route.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 8e08a91..9f27ada 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -996,7 +996,6 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
 	rt->rt6i_src = ort->fib6_src;
 #endif
 	rt->rt6i_prefsrc = ort->fib6_prefsrc;
-	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
 }
 
 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH net-next 0/5] rtnetlink: add IFA_IF_NETNSID for RTM_GETADDR
From: Nicolas Dichtel @ 2018-08-30 15:49 UTC (permalink / raw)
  To: Christian Brauner, Kirill Tkhai
  Cc: netdev, linux-kernel, davem, kuznet, yoshfuji, pombredanne,
	kstewart, gregkh, dsahern, fw, lucien.xin, jakub.kicinski, jbenc
In-Reply-To: <20180830144544.tpross4jd6awou4u@gmail.com>

Le 30/08/2018 à 16:45, Christian Brauner a écrit :
[snip]
> Introducing the IFA_IF_NETNSID property will not make the netlink
> interface less modular. It is a clean, RTM_*ADDR-request specific
> property using network namespace identifiers which we discussed in prior
> patches are the way to go forward.
> 
> You can already get interfaces via GETLINK from another network
> namespaces than the one you reside in (Which we enabled just a few
> months back.) but you can't do the same for GETADDR. Those two are
> almost always used together. When you want to get the links you usually
> also want to get the addresses associated with it right after.
> In a prior discussion we agreed that network namespace identifiers are
> the way to go forward but that any other property, i.e. PIDs and fds
> should never be ported into other parts of the codebase and that is
> indeed something I agree with.
Yes, I agree with this and I think this series goes in the right direction.

Maybe I would choose a more generic name for the attribute, something that can
be used in other netlink families (xfrm, netfilter, ...) also.
What about IFA_TARGET_NSID?

^ permalink raw reply

* Re: [PATCH net 1/2] selftests: pmtu: maximum MTU for vti4 is 2^16-1-20
From: Nicolas Dichtel @ 2018-08-30 15:41 UTC (permalink / raw)
  To: Sabrina Dubroca, netdev; +Cc: Stefano Brivio
In-Reply-To: <1e62875c4c72b38b17f6c73f9654696b14fb3166.1535636302.git.sd@queasysnail.net>

Le 30/08/2018 à 16:01, Sabrina Dubroca a écrit :
> Since commit 82612de1c98e ("ip_tunnel: restore binding to ifaces with a
> large mtu"), the maximum MTU for vti4 is based on IP_MAX_MTU instead of
> the mysterious constant 0xFFF8.  This makes this selftest fail.
> 
> Fixes: 82612de1c98e ("ip_tunnel: restore binding to ifaces with a large mtu")
> Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
> Acked-by: Stefano Brivio <sbrivio@redhat.com>

Thanks for fixing this.

Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>

^ permalink raw reply

* Re: [pull request][net-next 00/10] Mellanox, mlx5 and devlink updates 2018-07-31
From: Alexander Duyck @ 2018-08-30 15:39 UTC (permalink / raw)
  To: valex
  Cc: Erez Shitrit, Saeed Mahameed, Saeed Mahameed, David Miller,
	Netdev, Jiri Pirko, Jakub Kicinski, Bjorn Helgaas, linux-pci
In-Reply-To: <5206dd74-432d-3342-2a48-3cdd1be8b5cb@mellanox.com>

I'm dropping all the old comments since the conversation was flattened
and only has one level of marks for everything.

On Thu, Aug 30, 2018 at 7:43 AM Alex Vesker <valex@mellanox.com> wrote:

<snip>

> To which devlink interfaces are you referring?

All of them. Not just the ones in this patch. If you are exposing an
interface to the user you should have documentation for it somewhere.
You should probably look at adding a patch to make certain you have
all the existing devlink interfaces in the driver documented.

I would like to see something added to the documentation folder that
explains what all the DEVLINK_PARAM_GENERIC interfaces are expected to
do, and maybe why I would use them. Then in addition I would like to
see per-driver documentation added for the DEVLINK_PARAM_DRIVER calls.
So for example I can't find any documentation in the kernel on what
enable_64b_cqe_eqe or enable_4k_uar do in mlx4 or why I would need
them, but you have them exposed as interfaces to userspace.

> There are 3 patches here that provide the crdump capability,
> these are the patches I would like to resubmit.
>
> net/mlx5: Add Vendor Specific Capability access gateway:
>     This is needed to read from the VSC by only the driver to collect a dump

You should probably work with the linux-pci mailing list on this bit
since you are exposing a new capability and they can probably point
you in the direction of how they want to deal with any potential races
in terms of access to the device versus your capability which you are
adding support for dumping via devlink.

> net/mlx5: Add Crdump FW snapshot support
>     This is code that collects the dump and registers a region called crdump
> net/mlx5: Use devlink region_snapshot parameter
>     Here I use an already implemented global param that specifies whether
>     snapshots are supported.
>
> The devlink region feature is well documented.

Where?

> Can it be that you are referring to the devlink region called "crdump" which mlx5 exposes?

I don't care about the internals. I care about user available
documentation for the interface that is exposed. How do you expect the
user to use this functionality? That is what I want documented.

<snip>

> Will it be sufficient to prevent setpci access using "pci_cfg_access_lock -
> any userspace reads or writes to config space and concurrent lock requests will sleep"
> otherwise do you have a different solution?

That sounds like a step in the right direction, but that is something
you should work with the linux-pci list on. My main concern is that I
don't want us being able to come at this interface from multiple
directions and screw things up.

^ permalink raw reply

* [PATCH] Optimize lookup of /0 xfrm policies
From: Yannick Brosseau @ 2018-08-30 19:34 UTC (permalink / raw)
  To: steffen.klassert, herbert, davem, netdev
  Cc: linux-kernel, kernel-team, Yannick Brosseau

Currently, all the xfrm policies that are not /32 end up in
the inexact policies linked list, which takes a long time to look up.

We can optimize the case where we have a /0 prefix in the policy, which
means we can match any address to that part.
We do this by putting those policies in the direct hash table after
zeroing the address part.
At lookup time, we do an additional lookup with the packet address
and either the destination or source address zeroed out.
We still call xfrm_policy_match to validate that the packet matches the
selector.

In our tests, with this optimization we reduce softirq cpu utilisation
from about 40% to 7% with 3k policies.

Signed-off-by: Yannick Brosseau <scientist@fb.com>
---
 net/xfrm/xfrm_hash.h   | 10 +++++
 net/xfrm/xfrm_policy.c | 88 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 96 insertions(+), 2 deletions(-)

diff --git a/net/xfrm/xfrm_hash.h b/net/xfrm/xfrm_hash.h
index 61be810389d8..40997fb5336d 100644
--- a/net/xfrm/xfrm_hash.h
+++ b/net/xfrm/xfrm_hash.h
@@ -145,6 +145,16 @@ static inline unsigned int __sel_hash(const struct xfrm_selector *sel,
 	const xfrm_address_t *saddr = &sel->saddr;
 	unsigned int h = 0;
 
+	/* A selector with a prefixlen of zero can basically be ignored in
+	 * the matching. To speed up the lookup, let's hash it without those
+	 * component. In the lookup, we'll do an additional check for a zero
+	 * daddr and a zero saddr.
+	 */
+	if (sel->prefixlen_d == 0)
+		dbits = 0;
+	if (sel->prefixlen_s == 0)
+		sbits = 0;
+
 	switch (family) {
 	case AF_INET:
 		if (sel->prefixlen_d < dbits ||
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 3110c3fbee20..7c2259f140d5 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1096,8 +1096,10 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
 	int err;
 	struct xfrm_policy *pol, *ret;
 	const xfrm_address_t *daddr, *saddr;
+	static const xfrm_address_t zero_addr = {0};
+
 	struct hlist_head *chain;
-	unsigned int sequence;
+	unsigned int sequence, first_sequence;
 	u32 priority;
 
 	daddr = xfrm_flowi_daddr(fl, family);
@@ -1112,6 +1114,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
 		chain = policy_hash_direct(net, daddr, saddr, family, dir);
 	} while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence));
 
+	first_sequence = sequence;
 	priority = ~0U;
 	ret = NULL;
 	hlist_for_each_entry_rcu(pol, chain, bydst) {
@@ -1129,6 +1132,87 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
 			break;
 		}
 	}
+
+	/* XXX FB NOT UPSTREAM YET T12762593 */
+	/* Do an additional lookup for saddr == 0, since we stored source
+	 * selector with a prefix len of 0 that way in the bydst hash
+	 */
+	do {
+		sequence = read_seqcount_begin(&xfrm_policy_hash_generation);
+		chain = policy_hash_direct(net, daddr, &zero_addr, family, dir);
+	} while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence));
+
+	hlist_for_each_entry_rcu(pol, chain, bydst) {
+		if ((pol->priority >= priority) && ret)
+			break;
+
+		err = xfrm_policy_match(pol, fl, type, family, dir);
+		if (err) {
+			if (err == -ESRCH)
+				continue;
+			else {
+				ret = ERR_PTR(err);
+				goto fail;
+			}
+		} else {
+			ret = pol;
+			priority = ret->priority;
+			break;
+		}
+	}
+
+	/* Do an additional lookup for daddr == 0, since we stored dest
+	 * selector with a prefix len of 0 that way in the bydst hash
+	 */
+	do {
+		sequence = read_seqcount_begin(&xfrm_policy_hash_generation);
+		chain = policy_hash_direct(net, &zero_addr, saddr, family, dir);
+	} while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence));
+
+	hlist_for_each_entry_rcu(pol, chain, bydst) {
+		if ((pol->priority >= priority) && ret)
+			break;
+
+		err = xfrm_policy_match(pol, fl, type, family, dir);
+		if (err) {
+			if (err == -ESRCH)
+				continue;
+			else {
+				ret = ERR_PTR(err);
+				goto fail;
+			}
+		} else {
+			ret = pol;
+			priority = ret->priority;
+			break;
+		}
+	}
+
+	/* Do an additional lookup for both saddr and daddr == 0 */
+	do {
+		sequence = read_seqcount_begin(&xfrm_policy_hash_generation);
+		chain = policy_hash_direct(net, &zero_addr, &zero_addr, family, dir);
+	} while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence));
+
+	hlist_for_each_entry_rcu(pol, chain, bydst) {
+		if ((pol->priority >= priority) && ret)
+			break;
+
+		err = xfrm_policy_match(pol, fl, type, family, dir);
+		if (err) {
+			if (err == -ESRCH)
+				continue;
+			else {
+				ret = ERR_PTR(err);
+				goto fail;
+			}
+		} else {
+			ret = pol;
+			priority = ret->priority;
+			break;
+		}
+	}
+
 	chain = &net->xfrm.policy_inexact[dir];
 	hlist_for_each_entry_rcu(pol, chain, bydst) {
 		if ((pol->priority >= priority) && ret)
@@ -1148,7 +1232,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
 		}
 	}
 
-	if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence))
+	if (read_seqcount_retry(&xfrm_policy_hash_generation, first_sequence))
 		goto retry;
 
 	if (ret && !xfrm_pol_hold_rcu(ret))
-- 
2.18.0

^ permalink raw reply related

* Re: KMSAN: uninit-value in rds_bind
From: Santosh Shilimkar @ 2018-08-30 19:30 UTC (permalink / raw)
  To: syzbot, davem, linux-kernel, linux-rdma, netdev, rds-devel,
	syzkaller-bugs
In-Reply-To: <000000000000010a4d0574ab4743@google.com>

On 8/30/2018 11:31 AM, syzbot wrote:
> Hello,
> 
> syzbot found the following crash on:
> 
> HEAD commit:    2dca2cbde67a kmsan: fix build warnings with CONFIG_KMSAN=n
> git tree:       https://github.com/google/kmsan.git/master
> console output: https://syzkaller.appspot.com/x/log.txt?x=16db895a400000
> kernel config:  https://syzkaller.appspot.com/x/.config?x=820d6393634b55e3
> dashboard link: 
> https://syzkaller.appspot.com/bug?extid=915c9f99f3dbc4bd6cd1
> compiler:       clang version 8.0.0 (trunk 339414)
> syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=1137bffe400000
> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=1521a7fe400000
> 
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+915c9f99f3dbc4bd6cd1@syzkaller.appspotmail.com
> 
OK. Will send the fix to address this.

Regards,
Santosh

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox