Netdev List
 help / color / mirror / Atom feed
* [PATCH bpf-next v4 1/2] libbpf: add support for using AF_XDP sockets
From: Magnus Karlsson @ 2019-02-08 13:05 UTC (permalink / raw)
  To: magnus.karlsson, bjorn.topel, ast, daniel, netdev, jakub.kicinski,
	bjorn.topel, qi.z.zhang
  Cc: brouer, xiaolong.ye
In-Reply-To: <1549631126-29067-1-git-send-email-magnus.karlsson@intel.com>

This commit adds AF_XDP support to libbpf. The main reason for this is
to facilitate writing applications that use AF_XDP by offering
higher-level APIs that hide many of the details of the AF_XDP
uapi. This is in the same vein as libbpf facilitates XDP adoption by
offering easy-to-use higher level interfaces of XDP
functionality. Hopefully this will facilitate adoption of AF_XDP, make
applications using it simpler and smaller, and finally also make it
possible for applications to benefit from optimizations in the AF_XDP
user space access code. Previously, people just copied and pasted the
code from the sample application into their application, which is not
desirable.

The interface is composed of two parts:

* Low-level access interface to the four rings and the packet
* High-level control plane interface for creating and setting
  up umems and af_xdp sockets as well as a simple XDP program.

Tested-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
---
 tools/include/uapi/linux/if_xdp.h |  78 ++++
 tools/lib/bpf/Build               |   2 +-
 tools/lib/bpf/Makefile            |   5 +-
 tools/lib/bpf/README.rst          |  11 +-
 tools/lib/bpf/libbpf.map          |   7 +
 tools/lib/bpf/xsk.c               | 742 ++++++++++++++++++++++++++++++++++++++
 tools/lib/bpf/xsk.h               | 205 +++++++++++
 7 files changed, 1047 insertions(+), 3 deletions(-)
 create mode 100644 tools/include/uapi/linux/if_xdp.h
 create mode 100644 tools/lib/bpf/xsk.c
 create mode 100644 tools/lib/bpf/xsk.h

diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h
new file mode 100644
index 0000000..caed8b1
--- /dev/null
+++ b/tools/include/uapi/linux/if_xdp.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * if_xdp: XDP socket user-space interface
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * Author(s): Björn Töpel <bjorn.topel@intel.com>
+ *	      Magnus Karlsson <magnus.karlsson@intel.com>
+ */
+
+#ifndef _LINUX_IF_XDP_H
+#define _LINUX_IF_XDP_H
+
+#include <linux/types.h>
+
+/* Options for the sxdp_flags field */
+#define XDP_SHARED_UMEM	(1 << 0)
+#define XDP_COPY	(1 << 1) /* Force copy-mode */
+#define XDP_ZEROCOPY	(1 << 2) /* Force zero-copy mode */
+
+struct sockaddr_xdp {
+	__u16 sxdp_family;
+	__u16 sxdp_flags;
+	__u32 sxdp_ifindex;
+	__u32 sxdp_queue_id;
+	__u32 sxdp_shared_umem_fd;
+};
+
+struct xdp_ring_offset {
+	__u64 producer;
+	__u64 consumer;
+	__u64 desc;
+};
+
+struct xdp_mmap_offsets {
+	struct xdp_ring_offset rx;
+	struct xdp_ring_offset tx;
+	struct xdp_ring_offset fr; /* Fill */
+	struct xdp_ring_offset cr; /* Completion */
+};
+
+/* XDP socket options */
+#define XDP_MMAP_OFFSETS		1
+#define XDP_RX_RING			2
+#define XDP_TX_RING			3
+#define XDP_UMEM_REG			4
+#define XDP_UMEM_FILL_RING		5
+#define XDP_UMEM_COMPLETION_RING	6
+#define XDP_STATISTICS			7
+
+struct xdp_umem_reg {
+	__u64 addr; /* Start of packet data area */
+	__u64 len; /* Length of packet data area */
+	__u32 chunk_size;
+	__u32 headroom;
+};
+
+struct xdp_statistics {
+	__u64 rx_dropped; /* Dropped for reasons other than invalid desc */
+	__u64 rx_invalid_descs; /* Dropped due to invalid descriptor */
+	__u64 tx_invalid_descs; /* Dropped due to invalid descriptor */
+};
+
+/* Pgoff for mmaping the rings */
+#define XDP_PGOFF_RX_RING			  0
+#define XDP_PGOFF_TX_RING		 0x80000000
+#define XDP_UMEM_PGOFF_FILL_RING	0x100000000ULL
+#define XDP_UMEM_PGOFF_COMPLETION_RING	0x180000000ULL
+
+/* Rx/Tx descriptor */
+struct xdp_desc {
+	__u64 addr;
+	__u32 len;
+	__u32 options;
+};
+
+/* UMEM descriptor is __u64 */
+
+#endif /* _LINUX_IF_XDP_H */
diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build
index bfd9bfc..ee9d536 100644
--- a/tools/lib/bpf/Build
+++ b/tools/lib/bpf/Build
@@ -1 +1 @@
-libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o netlink.o bpf_prog_linfo.o libbpf_probes.o
+libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o netlink.o bpf_prog_linfo.o libbpf_probes.o xsk.o
diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
index 8479162..761691b 100644
--- a/tools/lib/bpf/Makefile
+++ b/tools/lib/bpf/Makefile
@@ -164,6 +164,9 @@ $(BPF_IN): force elfdep bpfdep
 	@(test -f ../../include/uapi/linux/if_link.h -a -f ../../../include/uapi/linux/if_link.h && ( \
 	(diff -B ../../include/uapi/linux/if_link.h ../../../include/uapi/linux/if_link.h >/dev/null) || \
 	echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/if_link.h' differs from latest version at 'include/uapi/linux/if_link.h'" >&2 )) || true
+	@(test -f ../../include/uapi/linux/if_xdp.h -a -f ../../../include/uapi/linux/if_xdp.h && ( \
+	(diff -B ../../include/uapi/linux/if_xdp.h ../../../include/uapi/linux/if_xdp.h >/dev/null) || \
+	echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/if_xdp.h' differs from latest version at 'include/uapi/linux/if_xdp.h'" >&2 )) || true
 	$(Q)$(MAKE) $(build)=libbpf
 
 $(OUTPUT)libbpf.so: $(BPF_IN)
@@ -174,7 +177,7 @@ $(OUTPUT)libbpf.a: $(BPF_IN)
 	$(QUIET_LINK)$(RM) $@; $(AR) rcs $@ $^
 
 $(OUTPUT)test_libbpf: test_libbpf.cpp $(OUTPUT)libbpf.a
-	$(QUIET_LINK)$(CXX) $^ -lelf -o $@
+	$(QUIET_LINK)$(CXX) $(INCLUDES) $^ -lelf -o $@
 
 check: check_abi
 
diff --git a/tools/lib/bpf/README.rst b/tools/lib/bpf/README.rst
index 607aae4..45e3788 100644
--- a/tools/lib/bpf/README.rst
+++ b/tools/lib/bpf/README.rst
@@ -9,7 +9,7 @@ described here. It's recommended to follow these conventions whenever a
 new function or type is added to keep libbpf API clean and consistent.
 
 All types and functions provided by libbpf API should have one of the
-following prefixes: ``bpf_``, ``btf_``, ``libbpf_``.
+following prefixes: ``bpf_``, ``btf_``, ``libbpf_``, ``xsk_``.
 
 System call wrappers
 --------------------
@@ -62,6 +62,15 @@ Auxiliary functions and types that don't fit well in any of categories
 described above should have ``libbpf_`` prefix, e.g.
 ``libbpf_get_error`` or ``libbpf_prog_type_by_name``.
 
+AF_XDP functions
+-------------------
+
+AF_XDP functions should have an ``xsk_`` prefix, e.g.
+``xsk_umem__get_data`` or ``xsk_umem__create``. The interface consists
+of both low-level ring access functions and high-level configuration
+functions. These can be mixed and matched. Note that these functions
+are not reentrant for performance reasons.
+
 libbpf ABI
 ==========
 
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 89c1149..1cc1bb8 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -143,4 +143,11 @@ LIBBPF_0.0.2 {
 		btf_ext__new;
 		btf_ext__reloc_func_info;
 		btf_ext__reloc_line_info;
+		xsk_umem__create;
+		xsk_socket__create;
+		xsk_umem__delete;
+		xsk_socket__delete;
+		xsk_umem__get_data;
+		xsk_umem__fd;
+		xsk_socket__fd;
 } LIBBPF_0.0.1;
diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
new file mode 100644
index 0000000..a982a76
--- /dev/null
+++ b/tools/lib/bpf/xsk.c
@@ -0,0 +1,742 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+
+/*
+ * AF_XDP user-space access library.
+ *
+ * Copyright(c) 2018 - 2019 Intel Corporation.
+ *
+ * Author(s): Magnus Karlsson <magnus.karlsson@intel.com>
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <arpa/inet.h>
+#include <asm/barrier.h>
+#include <linux/compiler.h>
+#include <linux/filter.h>
+#include <linux/if_ether.h>
+#include <linux/if_link.h>
+#include <linux/if_packet.h>
+#include <linux/if_xdp.h>
+#include <linux/rtnetlink.h>
+#include <net/if.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "bpf.h"
+#include "libbpf.h"
+#include "libbpf_util.h"
+#include "nlattr.h"
+#include "xsk.h"
+
+#ifndef SOL_XDP
+ #define SOL_XDP 283
+#endif
+
+#ifndef AF_XDP
+ #define AF_XDP 44
+#endif
+
+#ifndef PF_XDP
+ #define PF_XDP AF_XDP
+#endif
+
+struct xsk_umem {
+	struct xsk_ring_prod *fill;
+	struct xsk_ring_cons *comp;
+	char *umem_area;
+	struct xsk_umem_config config;
+	int fd;
+	int refcount;
+};
+
+struct xsk_socket {
+	struct xsk_ring_cons *rx;
+	struct xsk_ring_prod *tx;
+	__u64 outstanding_tx;
+	struct xsk_umem *umem;
+	struct xsk_socket_config config;
+	int fd;
+	int xsks_map;
+	int ifindex;
+	int prog_fd;
+	int qidconf_map_fd;
+	int xsks_map_fd;
+	__u32 queue_id;
+};
+
+struct xsk_nl_info {
+	bool xdp_prog_attached;
+	int ifindex;
+	int fd;
+};
+
+#define MAX_QUEUES 128
+
+/* For 32-bit systems, we need to use mmap2 as the offsets are 64-bit.
+ * Unfortunately, it is not part of glibc.
+ */
+static inline void *xsk_mmap(void *addr, size_t length, int prot, int flags,
+			     int fd, __u64 offset)
+{
+#ifdef __NR_mmap2
+	unsigned int page_shift = __builtin_ffs(getpagesize()) - 1;
+	long ret = syscall(__NR_mmap2, addr, length, prot, flags, fd,
+			   (off_t)(offset >> page_shift));
+
+	return (void *)ret;
+#else
+	return mmap(addr, length, prot, flags, fd, offset);
+#endif
+}
+
+void *xsk_umem__get_data(struct xsk_umem *umem, __u64 addr)
+{
+	return &((char *)(umem->umem_area))[addr];
+}
+
+int xsk_umem__fd(const struct xsk_umem *umem)
+{
+	return umem ? umem->fd : -EINVAL;
+}
+
+int xsk_socket__fd(const struct xsk_socket *xsk)
+{
+	return xsk ? xsk->fd : -EINVAL;
+}
+
+static bool xsk_page_aligned(void *buffer)
+{
+	unsigned long addr = (unsigned long)buffer;
+
+	return !(addr & (getpagesize() - 1));
+}
+
+static void xsk_set_umem_config(struct xsk_umem_config *cfg,
+				const struct xsk_umem_config *usr_cfg)
+{
+	if (!usr_cfg) {
+		cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
+		cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
+		cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
+		cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
+		return;
+	}
+
+	cfg->fill_size = usr_cfg->fill_size;
+	cfg->comp_size = usr_cfg->comp_size;
+	cfg->frame_size = usr_cfg->frame_size;
+	cfg->frame_headroom = usr_cfg->frame_headroom;
+}
+
+static void xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
+				      const struct xsk_socket_config *usr_cfg)
+{
+	if (!usr_cfg) {
+		cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
+		cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
+		cfg->libbpf_flags = 0;
+		cfg->xdp_flags = 0;
+		cfg->bind_flags = 0;
+		return;
+	}
+
+	cfg->rx_size = usr_cfg->rx_size;
+	cfg->tx_size = usr_cfg->tx_size;
+	cfg->libbpf_flags = usr_cfg->libbpf_flags;
+	cfg->xdp_flags = usr_cfg->xdp_flags;
+	cfg->bind_flags = usr_cfg->bind_flags;
+}
+
+int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
+		     struct xsk_ring_prod *fill, struct xsk_ring_cons *comp,
+		     const struct xsk_umem_config *usr_config)
+{
+	struct xdp_mmap_offsets off;
+	struct xdp_umem_reg mr;
+	struct xsk_umem *umem;
+	socklen_t optlen;
+	void *map;
+	int err;
+
+	if (!umem_area || !umem_ptr || !fill || !comp)
+		return -EFAULT;
+	if (!size && !xsk_page_aligned(umem_area))
+		return -EINVAL;
+
+	umem = calloc(1, sizeof(*umem));
+	if (!umem)
+		return -ENOMEM;
+
+	umem->fd = socket(AF_XDP, SOCK_RAW, 0);
+	if (umem->fd < 0) {
+		err = -errno;
+		goto out_umem_alloc;
+	}
+
+	umem->umem_area = umem_area;
+	xsk_set_umem_config(&umem->config, usr_config);
+
+	mr.addr = (uintptr_t)umem_area;
+	mr.len = size;
+	mr.chunk_size = umem->config.frame_size;
+	mr.headroom = umem->config.frame_headroom;
+
+	err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
+	if (err) {
+		err = -errno;
+		goto out_socket;
+	}
+	err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_FILL_RING,
+			 &umem->config.fill_size,
+			 sizeof(umem->config.fill_size));
+	if (err) {
+		err = -errno;
+		goto out_socket;
+	}
+	err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
+			 &umem->config.comp_size,
+			 sizeof(umem->config.comp_size));
+	if (err) {
+		err = -errno;
+		goto out_socket;
+	}
+
+	optlen = sizeof(off);
+	err = getsockopt(umem->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
+	if (err) {
+		err = -errno;
+		goto out_socket;
+	}
+
+	map = xsk_mmap(NULL, off.fr.desc +
+		       umem->config.fill_size * sizeof(__u64),
+		       PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
+		       umem->fd, XDP_UMEM_PGOFF_FILL_RING);
+	if (map == MAP_FAILED) {
+		err = -errno;
+		goto out_socket;
+	}
+
+	umem->fill = fill;
+	fill->mask = umem->config.fill_size - 1;
+	fill->size = umem->config.fill_size;
+	fill->producer = map + off.fr.producer;
+	fill->consumer = map + off.fr.consumer;
+	fill->ring = map + off.fr.desc;
+	fill->cached_cons = umem->config.fill_size;
+
+	map = xsk_mmap(NULL,
+		       off.cr.desc + umem->config.comp_size * sizeof(__u64),
+		       PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
+		       umem->fd, XDP_UMEM_PGOFF_COMPLETION_RING);
+	if (map == MAP_FAILED) {
+		err = -errno;
+		goto out_mmap;
+	}
+
+	umem->comp = comp;
+	comp->mask = umem->config.comp_size - 1;
+	comp->size = umem->config.comp_size;
+	comp->producer = map + off.cr.producer;
+	comp->consumer = map + off.cr.consumer;
+	comp->ring = map + off.cr.desc;
+
+	*umem_ptr = umem;
+	return 0;
+
+out_mmap:
+	munmap(umem->fill,
+	       off.fr.desc + umem->config.fill_size * sizeof(__u64));
+out_socket:
+	close(umem->fd);
+out_umem_alloc:
+	free(umem);
+	return err;
+}
+
+static int xsk_parse_nl(void *cookie, void *msg, struct nlattr **tb)
+{
+	struct nlattr *tb_parsed[IFLA_XDP_MAX + 1];
+	struct xsk_nl_info *nl_info = cookie;
+	struct ifinfomsg *ifinfo = msg;
+	unsigned char mode;
+	int err;
+
+	if (nl_info->ifindex && nl_info->ifindex != ifinfo->ifi_index)
+		return 0;
+
+	if (!tb[IFLA_XDP])
+		return 0;
+
+	err = libbpf_nla_parse_nested(tb_parsed, IFLA_XDP_MAX, tb[IFLA_XDP],
+				      NULL);
+	if (err)
+		return err;
+
+	if (!tb_parsed[IFLA_XDP_ATTACHED] || !tb_parsed[IFLA_XDP_FD])
+		return 0;
+
+	mode = libbpf_nla_getattr_u8(tb_parsed[IFLA_XDP_ATTACHED]);
+	if (mode == XDP_ATTACHED_NONE)
+		return 0;
+
+	nl_info->xdp_prog_attached = true;
+	nl_info->fd = libbpf_nla_getattr_u32(tb_parsed[IFLA_XDP_FD]);
+	return 0;
+}
+
+static bool xsk_xdp_prog_attached(struct xsk_socket *xsk)
+{
+	struct xsk_nl_info nl_info;
+	unsigned int nl_pid;
+	char err_buf[256];
+	int sock, err;
+
+	sock = libbpf_netlink_open(&nl_pid);
+	if (sock < 0)
+		return false;
+
+	nl_info.xdp_prog_attached = false;
+	nl_info.ifindex = xsk->ifindex;
+	nl_info.fd = -1;
+
+	err = libbpf_nl_get_link(sock, nl_pid, xsk_parse_nl, &nl_info);
+	if (err) {
+		libbpf_strerror(err, err_buf, sizeof(err_buf));
+		pr_warning("Error:\n%s\n", err_buf);
+		close(sock);
+		return false;
+	}
+
+	close(sock);
+	xsk->prog_fd = nl_info.fd;
+	return nl_info.xdp_prog_attached;
+}
+
+static int xsk_load_xdp_prog(struct xsk_socket *xsk)
+{
+	char bpf_log_buf[BPF_LOG_BUF_SIZE];
+	int err, prog_fd;
+
+	/* This is the C-program:
+	 * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
+	 * {
+	 *     int *qidconf, index = ctx->rx_queue_index;
+	 *
+	 *     // A set entry here means that the correspnding queue_id
+	 *     // has an active AF_XDP socket bound to it.
+	 *     qidconf = bpf_map_lookup_elem(&qidconf_map, &index);
+	 *     if (!qidconf)
+	 *         return XDP_ABORTED;
+	 *
+	 *     if (*qidconf)
+	 *         return bpf_redirect_map(&xsks_map, index, 0);
+	 *
+	 *     return XDP_PASS;
+	 * }
+	 */
+	struct bpf_insn prog[] = {
+		/* r1 = *(u32 *)(r1 + 16) */
+		BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 16),
+		/* *(u32 *)(r10 - 4) = r1 */
+		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_1, -4),
+		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+		BPF_LD_MAP_FD(BPF_REG_1, xsk->qidconf_map_fd),
+		BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+		BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+		BPF_MOV32_IMM(BPF_REG_0, 0),
+		/* if r1 == 0 goto +8 */
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 8),
+		BPF_MOV32_IMM(BPF_REG_0, 2),
+		/* r1 = *(u32 *)(r1 + 0) */
+		BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0),
+		/* if r1 == 0 goto +5 */
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5),
+		/* r2 = *(u32 *)(r10 - 4) */
+		BPF_LD_MAP_FD(BPF_REG_1, xsk->xsks_map_fd),
+		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_10, -4),
+		BPF_MOV32_IMM(BPF_REG_3, 0),
+		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
+		/* The jumps are to this instruction */
+		BPF_EXIT_INSN(),
+	};
+	size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);
+
+	prog_fd = bpf_load_program(BPF_PROG_TYPE_XDP, prog, insns_cnt,
+				   "LGPL-2.1 or BSD-2-Clause", 0, bpf_log_buf,
+				   BPF_LOG_BUF_SIZE);
+	if (prog_fd < 0) {
+		pr_warning("BPF log buffer:\n%s", bpf_log_buf);
+		return prog_fd;
+	}
+
+	err = bpf_set_link_xdp_fd(xsk->ifindex, prog_fd, xsk->config.xdp_flags);
+	if (err) {
+		close(prog_fd);
+		return err;
+	}
+
+	xsk->prog_fd = prog_fd;
+	return 0;
+}
+
+static int xsk_create_bpf_maps(struct xsk_socket *xsk)
+{
+	int fd;
+
+	fd = bpf_create_map_name(BPF_MAP_TYPE_ARRAY, "qidconf_map",
+				 sizeof(int), sizeof(int), MAX_QUEUES, 0);
+	if (fd < 0)
+		return fd;
+	xsk->qidconf_map_fd = fd;
+
+	fd = bpf_create_map_name(BPF_MAP_TYPE_XSKMAP, "xsks_map",
+				 sizeof(int), sizeof(int), MAX_QUEUES, 0);
+	if (fd < 0) {
+		close(xsk->qidconf_map_fd);
+		return fd;
+	}
+	xsk->xsks_map_fd = fd;
+
+	return 0;
+}
+
+static void xsk_delete_bpf_maps(struct xsk_socket *xsk)
+{
+	close(xsk->qidconf_map_fd);
+	close(xsk->xsks_map_fd);
+}
+
+static int xsk_update_bpf_maps(struct xsk_socket *xsk, int qidconf_value,
+			       int xsks_value)
+{
+	bool qidconf_map_updated = false, xsks_map_updated = false;
+	struct bpf_prog_info prog_info = {};
+	__u32 prog_len = sizeof(prog_info);
+	struct bpf_map_info map_info;
+	__u32 map_len = sizeof(map_info);
+	__u32 *map_ids;
+	int reset_value = 0;
+	__u32 num_maps;
+	unsigned int i;
+	int err;
+
+	err = bpf_obj_get_info_by_fd(xsk->prog_fd, &prog_info, &prog_len);
+	if (err)
+		return err;
+
+	num_maps = prog_info.nr_map_ids;
+
+	map_ids = calloc(prog_info.nr_map_ids, sizeof(*map_ids));
+	if (!map_ids)
+		return -ENOMEM;
+
+	memset(&prog_info, 0, prog_len);
+	prog_info.nr_map_ids = num_maps;
+	prog_info.map_ids = (__u64)(unsigned long)map_ids;
+
+	err = bpf_obj_get_info_by_fd(xsk->prog_fd, &prog_info, &prog_len);
+	if (err)
+		goto out_map_ids;
+
+	for (i = 0; i < prog_info.nr_map_ids; i++) {
+		int fd;
+
+		fd = bpf_map_get_fd_by_id(map_ids[i]);
+		if (fd < 0) {
+			err = -errno;
+			goto out_maps;
+		}
+
+		err = bpf_obj_get_info_by_fd(fd, &map_info, &map_len);
+		if (err)
+			goto out_maps;
+
+		if (!strcmp(map_info.name, "qidconf_map")) {
+			err = bpf_map_update_elem(fd, &xsk->queue_id,
+						  &qidconf_value, 0);
+			if (err)
+				goto out_maps;
+			qidconf_map_updated = true;
+			xsk->qidconf_map_fd = fd;
+		} else if (!strcmp(map_info.name, "xsks_map")) {
+			err = bpf_map_update_elem(fd, &xsk->queue_id,
+						  &xsks_value, 0);
+			if (err)
+				goto out_maps;
+			xsks_map_updated = true;
+			xsk->xsks_map_fd = fd;
+		}
+
+		if (qidconf_map_updated && xsks_map_updated)
+			break;
+	}
+
+	if (!(qidconf_map_updated && xsks_map_updated)) {
+		err = -ENOENT;
+		goto out_maps;
+	}
+
+	err = 0;
+	goto out_success;
+
+out_maps:
+	if (qidconf_map_updated)
+		(void)bpf_map_update_elem(xsk->qidconf_map_fd, &xsk->queue_id,
+					  &reset_value, 0);
+	if (xsks_map_updated)
+		(void)bpf_map_update_elem(xsk->xsks_map_fd, &xsk->queue_id,
+					  &reset_value, 0);
+out_success:
+	if (qidconf_map_updated)
+		close(xsk->qidconf_map_fd);
+	if (xsks_map_updated)
+		close(xsk->xsks_map_fd);
+out_map_ids:
+	free(map_ids);
+	return err;
+}
+
+static int xsk_setup_xdp_prog(struct xsk_socket *xsk)
+{
+	bool prog_attached = false;
+	int err;
+
+	if (!xsk_xdp_prog_attached(xsk)) {
+		prog_attached = true;
+		err = xsk_create_bpf_maps(xsk);
+		if (err)
+			return err;
+
+		err = xsk_load_xdp_prog(xsk);
+		if (err)
+			goto out_maps;
+	}
+
+	err = xsk_update_bpf_maps(xsk, true, xsk->fd);
+	if (err)
+		goto out_load;
+
+	return 0;
+
+out_load:
+	if (prog_attached)
+		close(xsk->prog_fd);
+out_maps:
+	if (prog_attached)
+		xsk_delete_bpf_maps(xsk);
+	return err;
+}
+
+int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
+		       __u32 queue_id, struct xsk_umem *umem,
+		       struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
+		       const struct xsk_socket_config *usr_config)
+{
+	struct sockaddr_xdp sxdp = {};
+	struct xdp_mmap_offsets off;
+	struct xsk_socket *xsk;
+	socklen_t optlen;
+	void *map;
+	int err;
+
+	if (!umem || !xsk_ptr || !rx || !tx)
+		return -EFAULT;
+
+	if (umem->refcount) {
+		pr_warning("Error: shared umems not supported by libbpf.\n");
+		return -EBUSY;
+	}
+
+	xsk = calloc(1, sizeof(*xsk));
+	if (!xsk)
+		return -ENOMEM;
+
+	if (umem->refcount++ > 0) {
+		xsk->fd = socket(AF_XDP, SOCK_RAW, 0);
+		if (xsk->fd < 0) {
+			err = -errno;
+			goto out_xsk_alloc;
+		}
+	} else {
+		xsk->fd = umem->fd;
+	}
+
+	xsk->outstanding_tx = 0;
+	xsk->queue_id = queue_id;
+	xsk->umem = umem;
+	xsk->ifindex = if_nametoindex(ifname);
+	if (!xsk->ifindex) {
+		err = -errno;
+		goto out_socket;
+	}
+
+	xsk_set_xdp_socket_config(&xsk->config, usr_config);
+
+	if (rx) {
+		err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING,
+				 &xsk->config.rx_size,
+				 sizeof(xsk->config.rx_size));
+		if (err) {
+			err = -errno;
+			goto out_socket;
+		}
+	}
+	if (tx) {
+		err = setsockopt(xsk->fd, SOL_XDP, XDP_TX_RING,
+				 &xsk->config.tx_size,
+				 sizeof(xsk->config.tx_size));
+		if (err) {
+			err = -errno;
+			goto out_socket;
+		}
+	}
+
+	optlen = sizeof(off);
+	err = getsockopt(xsk->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
+	if (err) {
+		err = -errno;
+		goto out_socket;
+	}
+
+	if (rx) {
+		map = xsk_mmap(NULL, off.rx.desc +
+			       xsk->config.rx_size * sizeof(struct xdp_desc),
+			       PROT_READ | PROT_WRITE,
+			       MAP_SHARED | MAP_POPULATE,
+			       xsk->fd, XDP_PGOFF_RX_RING);
+		if (map == MAP_FAILED) {
+			err = -errno;
+			goto out_socket;
+		}
+
+		rx->mask = xsk->config.rx_size - 1;
+		rx->size = xsk->config.rx_size;
+		rx->producer = map + off.rx.producer;
+		rx->consumer = map + off.rx.consumer;
+		rx->ring = map + off.rx.desc;
+	}
+	xsk->rx = rx;
+
+	if (tx) {
+		map = xsk_mmap(NULL, off.tx.desc +
+			       xsk->config.tx_size * sizeof(struct xdp_desc),
+			       PROT_READ | PROT_WRITE,
+			       MAP_SHARED | MAP_POPULATE,
+			       xsk->fd, XDP_PGOFF_TX_RING);
+		if (map == MAP_FAILED) {
+			err = -errno;
+			goto out_mmap_rx;
+		}
+
+		tx->mask = xsk->config.tx_size - 1;
+		tx->size = xsk->config.tx_size;
+		tx->producer = map + off.tx.producer;
+		tx->consumer = map + off.tx.consumer;
+		tx->ring = map + off.tx.desc;
+		tx->cached_cons = xsk->config.tx_size;
+	}
+	xsk->tx = tx;
+
+	sxdp.sxdp_family = PF_XDP;
+	sxdp.sxdp_ifindex = xsk->ifindex;
+	sxdp.sxdp_queue_id = xsk->queue_id;
+	sxdp.sxdp_flags = xsk->config.bind_flags;
+
+	err = bind(xsk->fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
+	if (err) {
+		err = -errno;
+		goto out_mmap_tx;
+	}
+
+	if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) {
+		err = xsk_setup_xdp_prog(xsk);
+		if (err)
+			goto out_mmap_tx;
+	}
+
+	*xsk_ptr = xsk;
+	return 0;
+
+out_mmap_tx:
+	if (tx)
+		munmap(xsk->tx,
+		       off.tx.desc +
+		       xsk->config.tx_size * sizeof(struct xdp_desc));
+out_mmap_rx:
+	if (rx)
+		munmap(xsk->rx,
+		       off.rx.desc +
+		       xsk->config.rx_size * sizeof(struct xdp_desc));
+out_socket:
+	if (--umem->refcount)
+		close(xsk->fd);
+out_xsk_alloc:
+	free(xsk);
+	return err;
+}
+
+int xsk_umem__delete(struct xsk_umem *umem)
+{
+	struct xdp_mmap_offsets off;
+	socklen_t optlen;
+	int err;
+
+	if (!umem)
+		return 0;
+
+	if (umem->refcount)
+		return -EBUSY;
+
+	optlen = sizeof(off);
+	err = getsockopt(umem->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
+	if (!err) {
+		munmap(umem->fill->ring,
+		       off.fr.desc + umem->config.fill_size * sizeof(__u64));
+		munmap(umem->comp->ring,
+		       off.cr.desc + umem->config.comp_size * sizeof(__u64));
+	}
+
+	close(umem->fd);
+	free(umem);
+
+	return 0;
+}
+
+void xsk_socket__delete(struct xsk_socket *xsk)
+{
+	struct xdp_mmap_offsets off;
+	socklen_t optlen;
+	int err;
+
+	if (!xsk)
+		return;
+
+	(void)xsk_update_bpf_maps(xsk, 0, 0);
+
+	optlen = sizeof(off);
+	err = getsockopt(xsk->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
+	if (!err) {
+		if (xsk->rx)
+			munmap(xsk->rx->ring,
+			       off.rx.desc +
+			       xsk->config.rx_size * sizeof(struct xdp_desc));
+		if (xsk->tx)
+			munmap(xsk->tx->ring,
+			       off.tx.desc +
+			       xsk->config.tx_size * sizeof(struct xdp_desc));
+	}
+
+	xsk->umem->refcount--;
+	/* Do not close an fd that also has an associated umem connected
+	 * to it.
+	 */
+	if (xsk->fd != xsk->umem->fd)
+		close(xsk->fd);
+	free(xsk);
+}
diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h
new file mode 100644
index 0000000..a1ab8c9
--- /dev/null
+++ b/tools/lib/bpf/xsk.h
@@ -0,0 +1,205 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+
+/*
+ * AF_XDP user-space access library.
+ *
+ * Copyright(c) 2018 - 2019 Intel Corporation.
+ *
+ * Author(s): Magnus Karlsson <magnus.karlsson@intel.com>
+ */
+
+#ifndef __LIBBPF_XSK_H
+#define __LIBBPF_XSK_H
+
+#include <stdio.h>
+#include <stdint.h>
+#include <linux/if_xdp.h>
+
+#include "libbpf.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Do not access these members directly. Use the functions below. */
+#define DEFINE_XSK_RING(name) \
+struct name { \
+	__u32 cached_prod; \
+	__u32 cached_cons; \
+	__u32 mask; \
+	__u32 size; \
+	__u32 *producer; \
+	__u32 *consumer; \
+	void *ring; \
+}
+
+DEFINE_XSK_RING(xsk_ring_prod);
+DEFINE_XSK_RING(xsk_ring_cons);
+
+struct xsk_umem;
+struct xsk_socket;
+
+static inline __u64 *xsk_ring_prod__fill_addr(struct xsk_ring_prod *fill,
+					      __u32 idx)
+{
+	__u64 *addrs = (__u64 *)fill->ring;
+
+	return &addrs[idx & fill->mask];
+}
+
+static inline const __u64 *
+xsk_ring_cons__comp_addr(const struct xsk_ring_cons *comp, __u32 idx)
+{
+	const __u64 *addrs = (const __u64 *)comp->ring;
+
+	return &addrs[idx & comp->mask];
+}
+
+static inline struct xdp_desc *xsk_ring_prod__tx_desc(struct xsk_ring_prod *tx,
+						      __u32 idx)
+{
+	struct xdp_desc *descs = (struct xdp_desc *)tx->ring;
+
+	return &descs[idx & tx->mask];
+}
+
+static inline const struct xdp_desc *
+xsk_ring_cons__rx_desc(const struct xsk_ring_cons *rx, __u32 idx)
+{
+	const struct xdp_desc *descs = (const struct xdp_desc *)rx->ring;
+
+	return &descs[idx & rx->mask];
+}
+
+static inline __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb)
+{
+	__u32 free_entries = r->cached_cons - r->cached_prod;
+
+	if (free_entries >= nb)
+		return free_entries;
+
+	/* Refresh the local tail pointer.
+	 * cached_cons is r->size bigger than the real consumer pointer so
+	 * that this addition can be avoided in the more frequently
+	 * executed code that computs free_entries in the beginning of
+	 * this function. Without this optimization it whould have been
+	 * free_entries = r->cached_prod - r->cached_cons + r->size.
+	 */
+	r->cached_cons = *r->consumer + r->size;
+
+	return r->cached_cons - r->cached_prod;
+}
+
+static inline __u32 xsk_cons_nb_avail(struct xsk_ring_cons *r, __u32 nb)
+{
+	__u32 entries = r->cached_prod - r->cached_cons;
+
+	if (entries == 0) {
+		r->cached_prod = *r->producer;
+		entries = r->cached_prod - r->cached_cons;
+	}
+
+	return (entries > nb) ? nb : entries;
+}
+
+static inline size_t xsk_ring_prod__reserve(struct xsk_ring_prod *prod,
+					    size_t nb, __u32 *idx)
+{
+	if (unlikely(xsk_prod_nb_free(prod, nb) < nb))
+		return 0;
+
+	*idx = prod->cached_prod;
+	prod->cached_prod += nb;
+
+	return nb;
+}
+
+static inline void xsk_ring_prod__submit(struct xsk_ring_prod *prod, size_t nb)
+{
+	/* Make sure everything has been written to the ring before signalling
+	 * this to the kernel.
+	 */
+	smp_wmb();
+
+	*prod->producer += nb;
+}
+
+static inline size_t xsk_ring_cons__peek(struct xsk_ring_cons *cons,
+					 size_t nb, __u32 *idx)
+{
+	size_t entries = xsk_cons_nb_avail(cons, nb);
+
+	if (likely(entries > 0)) {
+		/* Make sure we do not speculatively read the data before
+		 * we have received the packet buffers from the ring.
+		 */
+		smp_rmb();
+
+		*idx = cons->cached_cons;
+		cons->cached_cons += entries;
+	}
+
+	return entries;
+}
+
+static inline void xsk_ring_cons__release(struct xsk_ring_cons *cons, size_t nb)
+{
+	*cons->consumer += nb;
+}
+
+static inline void *xsk_umem__get_data_raw(void *umem_area, __u64 addr)
+{
+	return &((char *)umem_area)[addr];
+}
+
+LIBBPF_API void *xsk_umem__get_data(struct xsk_umem *umem, __u64 addr);
+
+LIBBPF_API int xsk_umem__fd(const struct xsk_umem *umem);
+LIBBPF_API int xsk_socket__fd(const struct xsk_socket *xsk);
+
+#define XSK_RING_CONS__DEFAULT_NUM_DESCS      2048
+#define XSK_RING_PROD__DEFAULT_NUM_DESCS      2048
+#define XSK_UMEM__DEFAULT_FRAME_SHIFT    11 /* 2048 bytes */
+#define XSK_UMEM__DEFAULT_FRAME_SIZE     (1 << XSK_UMEM__DEFAULT_FRAME_SHIFT)
+#define XSK_UMEM__DEFAULT_FRAME_HEADROOM 0
+
+struct xsk_umem_config {
+	__u32 fill_size;
+	__u32 comp_size;
+	__u32 frame_size;
+	__u32 frame_headroom;
+};
+
+/* Flags for the libbpf_flags field. */
+#define XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD (1 << 0)
+
+struct xsk_socket_config {
+	__u32 rx_size;
+	__u32 tx_size;
+	__u32 libbpf_flags;
+	__u32 xdp_flags;
+	__u16 bind_flags;
+};
+
+/* Set config to NULL to get the default configuration. */
+LIBBPF_API int xsk_umem__create(struct xsk_umem **umem,
+				void *umem_area, __u64 size,
+				struct xsk_ring_prod *fill,
+				struct xsk_ring_cons *comp,
+				const struct xsk_umem_config *config);
+LIBBPF_API int xsk_socket__create(struct xsk_socket **xsk,
+				  const char *ifname, __u32 queue_id,
+				  struct xsk_umem *umem,
+				  struct xsk_ring_cons *rx,
+				  struct xsk_ring_prod *tx,
+				  const struct xsk_socket_config *config);
+
+/* Returns 0 for success and -EBUSY if the umem is still in use. */
+LIBBPF_API int xsk_umem__delete(struct xsk_umem *umem);
+LIBBPF_API void xsk_socket__delete(struct xsk_socket *xsk);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* __LIBBPF_XSK_H */
-- 
2.7.4


^ permalink raw reply related

* [PATCH bpf-next v4 2/2] samples/bpf: convert xdpsock to use libbpf for AF_XDP access
From: Magnus Karlsson @ 2019-02-08 13:05 UTC (permalink / raw)
  To: magnus.karlsson, bjorn.topel, ast, daniel, netdev, jakub.kicinski,
	bjorn.topel, qi.z.zhang
  Cc: brouer, xiaolong.ye
In-Reply-To: <1549631126-29067-1-git-send-email-magnus.karlsson@intel.com>

This commit converts the xdpsock sample application to use the AF_XDP
functions present in libbpf. This cuts down the size of it by nearly
300 lines of code.

The default ring sizes plus the batch size has been increased and the
size of the umem area has decreased. This so that the sample application
will provide higher throughput. Note also that the shared umem code
has been removed from the sample as this is not supported by libbpf
at this point in time.

Tested-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
---
 samples/bpf/Makefile       |   1 -
 samples/bpf/xdpsock.h      |  11 -
 samples/bpf/xdpsock_kern.c |  56 ---
 samples/bpf/xdpsock_user.c | 839 ++++++++++++++-------------------------------
 4 files changed, 259 insertions(+), 648 deletions(-)
 delete mode 100644 samples/bpf/xdpsock.h
 delete mode 100644 samples/bpf/xdpsock_kern.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index a0ef7ed..a333e25 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -163,7 +163,6 @@ always += xdp2skb_meta_kern.o
 always += syscall_tp_kern.o
 always += cpustat_kern.o
 always += xdp_adjust_tail_kern.o
-always += xdpsock_kern.o
 always += xdp_fwd_kern.o
 always += task_fd_query_kern.o
 always += xdp_sample_pkts_kern.o
diff --git a/samples/bpf/xdpsock.h b/samples/bpf/xdpsock.h
deleted file mode 100644
index 533ab81..0000000
--- a/samples/bpf/xdpsock.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef XDPSOCK_H_
-#define XDPSOCK_H_
-
-/* Power-of-2 number of sockets */
-#define MAX_SOCKS 4
-
-/* Round-robin receive */
-#define RR_LB 0
-
-#endif /* XDPSOCK_H_ */
diff --git a/samples/bpf/xdpsock_kern.c b/samples/bpf/xdpsock_kern.c
deleted file mode 100644
index b8ccd08..0000000
--- a/samples/bpf/xdpsock_kern.c
+++ /dev/null
@@ -1,56 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#define KBUILD_MODNAME "foo"
-#include <uapi/linux/bpf.h>
-#include "bpf_helpers.h"
-
-#include "xdpsock.h"
-
-struct bpf_map_def SEC("maps") qidconf_map = {
-	.type		= BPF_MAP_TYPE_ARRAY,
-	.key_size	= sizeof(int),
-	.value_size	= sizeof(int),
-	.max_entries	= 1,
-};
-
-struct bpf_map_def SEC("maps") xsks_map = {
-	.type = BPF_MAP_TYPE_XSKMAP,
-	.key_size = sizeof(int),
-	.value_size = sizeof(int),
-	.max_entries = MAX_SOCKS,
-};
-
-struct bpf_map_def SEC("maps") rr_map = {
-	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
-	.key_size = sizeof(int),
-	.value_size = sizeof(unsigned int),
-	.max_entries = 1,
-};
-
-SEC("xdp_sock")
-int xdp_sock_prog(struct xdp_md *ctx)
-{
-	int *qidconf, key = 0, idx;
-	unsigned int *rr;
-
-	qidconf = bpf_map_lookup_elem(&qidconf_map, &key);
-	if (!qidconf)
-		return XDP_ABORTED;
-
-	if (*qidconf != ctx->rx_queue_index)
-		return XDP_PASS;
-
-#if RR_LB /* NB! RR_LB is configured in xdpsock.h */
-	rr = bpf_map_lookup_elem(&rr_map, &key);
-	if (!rr)
-		return XDP_ABORTED;
-
-	*rr = (*rr + 1) & (MAX_SOCKS - 1);
-	idx = *rr;
-#else
-	idx = 0;
-#endif
-
-	return bpf_redirect_map(&xsks_map, idx, 0);
-}
-
-char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
index f73055e..159f64b 100644
--- a/samples/bpf/xdpsock_user.c
+++ b/samples/bpf/xdpsock_user.c
@@ -1,37 +1,36 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright(c) 2017 - 2018 Intel Corporation. */
 
-#include <assert.h>
+#include <asm/barrier.h>
 #include <errno.h>
 #include <getopt.h>
 #include <libgen.h>
 #include <linux/bpf.h>
+#include <linux/compiler.h>
 #include <linux/if_link.h>
 #include <linux/if_xdp.h>
 #include <linux/if_ether.h>
+#include <locale.h>
+#include <net/ethernet.h>
 #include <net/if.h>
+#include <poll.h>
+#include <pthread.h>
 #include <signal.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <net/ethernet.h>
+#include <sys/mman.h>
 #include <sys/resource.h>
 #include <sys/socket.h>
-#include <sys/mman.h>
+#include <sys/types.h>
 #include <time.h>
 #include <unistd.h>
-#include <pthread.h>
-#include <locale.h>
-#include <sys/types.h>
-#include <poll.h>
 
 #include "bpf/libbpf.h"
-#include "bpf_util.h"
+#include "bpf/xsk.h"
 #include <bpf/bpf.h>
 
-#include "xdpsock.h"
-
 #ifndef SOL_XDP
 #define SOL_XDP 283
 #endif
@@ -44,17 +43,11 @@
 #define PF_XDP AF_XDP
 #endif
 
-#define NUM_FRAMES 131072
-#define FRAME_HEADROOM 0
-#define FRAME_SHIFT 11
-#define FRAME_SIZE 2048
-#define NUM_DESCS 1024
-#define BATCH_SIZE 16
-
-#define FQ_NUM_DESCS 1024
-#define CQ_NUM_DESCS 1024
+#define NUM_FRAMES (4 * 1024)
+#define BATCH_SIZE 64
 
 #define DEBUG_HEXDUMP 0
+#define MAX_SOCKS 8
 
 typedef __u64 u64;
 typedef __u32 u32;
@@ -73,54 +66,30 @@ static const char *opt_if = "";
 static int opt_ifindex;
 static int opt_queue;
 static int opt_poll;
-static int opt_shared_packet_buffer;
 static int opt_interval = 1;
 static u32 opt_xdp_bind_flags;
 static __u32 prog_id;
 
-struct xdp_umem_uqueue {
-	u32 cached_prod;
-	u32 cached_cons;
-	u32 mask;
-	u32 size;
-	u32 *producer;
-	u32 *consumer;
-	u64 *ring;
-	void *map;
+struct xsk_umem_info {
+	struct xsk_ring_prod fq;
+	struct xsk_ring_cons cq;
+	struct xsk_umem *umem;
 };
 
-struct xdp_umem {
-	char *frames;
-	struct xdp_umem_uqueue fq;
-	struct xdp_umem_uqueue cq;
-	int fd;
-};
-
-struct xdp_uqueue {
-	u32 cached_prod;
-	u32 cached_cons;
-	u32 mask;
-	u32 size;
-	u32 *producer;
-	u32 *consumer;
-	struct xdp_desc *ring;
-	void *map;
-};
-
-struct xdpsock {
-	struct xdp_uqueue rx;
-	struct xdp_uqueue tx;
-	int sfd;
-	struct xdp_umem *umem;
-	u32 outstanding_tx;
+struct xsk_socket_info {
+	struct xsk_ring_cons rx;
+	struct xsk_ring_prod tx;
+	struct xsk_umem_info *umem;
+	struct xsk_socket *xsk;
 	unsigned long rx_npkts;
 	unsigned long tx_npkts;
 	unsigned long prev_rx_npkts;
 	unsigned long prev_tx_npkts;
+	u32 outstanding_tx;
 };
 
 static int num_socks;
-struct xdpsock *xsks[MAX_SOCKS];
+struct xsk_socket_info *xsks[MAX_SOCKS];
 
 static unsigned long get_nsecs(void)
 {
@@ -130,225 +99,124 @@ static unsigned long get_nsecs(void)
 	return ts.tv_sec * 1000000000UL + ts.tv_nsec;
 }
 
-static void dump_stats(void);
-
-#define lassert(expr)							\
-	do {								\
-		if (!(expr)) {						\
-			fprintf(stderr, "%s:%s:%i: Assertion failed: "	\
-				#expr ": errno: %d/\"%s\"\n",		\
-				__FILE__, __func__, __LINE__,		\
-				errno, strerror(errno));		\
-			dump_stats();					\
-			exit(EXIT_FAILURE);				\
-		}							\
-	} while (0)
-
-#define barrier() __asm__ __volatile__("": : :"memory")
-#ifdef __aarch64__
-#define u_smp_rmb() __asm__ __volatile__("dmb ishld": : :"memory")
-#define u_smp_wmb() __asm__ __volatile__("dmb ishst": : :"memory")
-#else
-#define u_smp_rmb() barrier()
-#define u_smp_wmb() barrier()
-#endif
-#define likely(x) __builtin_expect(!!(x), 1)
-#define unlikely(x) __builtin_expect(!!(x), 0)
-
-static const char pkt_data[] =
-	"\x3c\xfd\xfe\x9e\x7f\x71\xec\xb1\xd7\x98\x3a\xc0\x08\x00\x45\x00"
-	"\x00\x2e\x00\x00\x00\x00\x40\x11\x88\x97\x05\x08\x07\x08\xc8\x14"
-	"\x1e\x04\x10\x92\x10\x92\x00\x1a\x6d\xa3\x34\x33\x1f\x69\x40\x6b"
-	"\x54\x59\xb6\x14\x2d\x11\x44\xbf\xaf\xd9\xbe\xaa";
-
-static inline u32 umem_nb_free(struct xdp_umem_uqueue *q, u32 nb)
-{
-	u32 free_entries = q->cached_cons - q->cached_prod;
-
-	if (free_entries >= nb)
-		return free_entries;
-
-	/* Refresh the local tail pointer */
-	q->cached_cons = *q->consumer + q->size;
-
-	return q->cached_cons - q->cached_prod;
-}
-
-static inline u32 xq_nb_free(struct xdp_uqueue *q, u32 ndescs)
+static void print_benchmark(bool running)
 {
-	u32 free_entries = q->cached_cons - q->cached_prod;
+	const char *bench_str = "INVALID";
 
-	if (free_entries >= ndescs)
-		return free_entries;
+	if (opt_bench == BENCH_RXDROP)
+		bench_str = "rxdrop";
+	else if (opt_bench == BENCH_TXONLY)
+		bench_str = "txonly";
+	else if (opt_bench == BENCH_L2FWD)
+		bench_str = "l2fwd";
 
-	/* Refresh the local tail pointer */
-	q->cached_cons = *q->consumer + q->size;
-	return q->cached_cons - q->cached_prod;
-}
+	printf("%s:%d %s ", opt_if, opt_queue, bench_str);
+	if (opt_xdp_flags & XDP_FLAGS_SKB_MODE)
+		printf("xdp-skb ");
+	else if (opt_xdp_flags & XDP_FLAGS_DRV_MODE)
+		printf("xdp-drv ");
+	else
+		printf("	");
 
-static inline u32 umem_nb_avail(struct xdp_umem_uqueue *q, u32 nb)
-{
-	u32 entries = q->cached_prod - q->cached_cons;
+	if (opt_poll)
+		printf("poll() ");
 
-	if (entries == 0) {
-		q->cached_prod = *q->producer;
-		entries = q->cached_prod - q->cached_cons;
+	if (running) {
+		printf("running...");
+		fflush(stdout);
 	}
-
-	return (entries > nb) ? nb : entries;
 }
 
-static inline u32 xq_nb_avail(struct xdp_uqueue *q, u32 ndescs)
+static void dump_stats(void)
 {
-	u32 entries = q->cached_prod - q->cached_cons;
+	unsigned long now = get_nsecs();
+	long dt = now - prev_time;
+	int i;
 
-	if (entries == 0) {
-		q->cached_prod = *q->producer;
-		entries = q->cached_prod - q->cached_cons;
-	}
+	prev_time = now;
 
-	return (entries > ndescs) ? ndescs : entries;
-}
+	for (i = 0; i < num_socks && xsks[i]; i++) {
+		char *fmt = "%-15s %'-11.0f %'-11lu\n";
+		double rx_pps, tx_pps;
 
-static inline int umem_fill_to_kernel_ex(struct xdp_umem_uqueue *fq,
-					 struct xdp_desc *d,
-					 size_t nb)
-{
-	u32 i;
+		rx_pps = (xsks[i]->rx_npkts - xsks[i]->prev_rx_npkts) *
+			 1000000000. / dt;
+		tx_pps = (xsks[i]->tx_npkts - xsks[i]->prev_tx_npkts) *
+			 1000000000. / dt;
 
-	if (umem_nb_free(fq, nb) < nb)
-		return -ENOSPC;
+		printf("\n sock%d@", i);
+		print_benchmark(false);
+		printf("\n");
 
-	for (i = 0; i < nb; i++) {
-		u32 idx = fq->cached_prod++ & fq->mask;
+		printf("%-15s %-11s %-11s %-11.2f\n", "", "pps", "pkts",
+		       dt / 1000000000.);
+		printf(fmt, "rx", rx_pps, xsks[i]->rx_npkts);
+		printf(fmt, "tx", tx_pps, xsks[i]->tx_npkts);
 
-		fq->ring[idx] = d[i].addr;
+		xsks[i]->prev_rx_npkts = xsks[i]->rx_npkts;
+		xsks[i]->prev_tx_npkts = xsks[i]->tx_npkts;
 	}
-
-	u_smp_wmb();
-
-	*fq->producer = fq->cached_prod;
-
-	return 0;
 }
 
-static inline int umem_fill_to_kernel(struct xdp_umem_uqueue *fq, u64 *d,
-				      size_t nb)
+static void *poller(void *arg)
 {
-	u32 i;
-
-	if (umem_nb_free(fq, nb) < nb)
-		return -ENOSPC;
-
-	for (i = 0; i < nb; i++) {
-		u32 idx = fq->cached_prod++ & fq->mask;
-
-		fq->ring[idx] = d[i];
+	(void)arg;
+	for (;;) {
+		sleep(opt_interval);
+		dump_stats();
 	}
 
-	u_smp_wmb();
-
-	*fq->producer = fq->cached_prod;
-
-	return 0;
+	return NULL;
 }
 
-static inline size_t umem_complete_from_kernel(struct xdp_umem_uqueue *cq,
-					       u64 *d, size_t nb)
+static void remove_xdp_program(void)
 {
-	u32 idx, i, entries = umem_nb_avail(cq, nb);
-
-	u_smp_rmb();
-
-	for (i = 0; i < entries; i++) {
-		idx = cq->cached_cons++ & cq->mask;
-		d[i] = cq->ring[idx];
-	}
-
-	if (entries > 0) {
-		u_smp_wmb();
+	__u32 curr_prog_id = 0;
 
-		*cq->consumer = cq->cached_cons;
+	if (bpf_get_link_xdp_id(opt_ifindex, &curr_prog_id, opt_xdp_flags)) {
+		printf("bpf_get_link_xdp_id failed\n");
+		exit(EXIT_FAILURE);
 	}
-
-	return entries;
-}
-
-static inline void *xq_get_data(struct xdpsock *xsk, u64 addr)
-{
-	return &xsk->umem->frames[addr];
+	if (prog_id == curr_prog_id)
+		bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags);
+	else if (!curr_prog_id)
+		printf("couldn't find a prog id on a given interface\n");
+	else
+		printf("program on interface changed, not removing\n");
 }
 
-static inline int xq_enq(struct xdp_uqueue *uq,
-			 const struct xdp_desc *descs,
-			 unsigned int ndescs)
+static void int_exit(int sig)
 {
-	struct xdp_desc *r = uq->ring;
-	unsigned int i;
+	struct xsk_umem *umem = xsks[0]->umem->umem;
 
-	if (xq_nb_free(uq, ndescs) < ndescs)
-		return -ENOSPC;
-
-	for (i = 0; i < ndescs; i++) {
-		u32 idx = uq->cached_prod++ & uq->mask;
-
-		r[idx].addr = descs[i].addr;
-		r[idx].len = descs[i].len;
-	}
+	(void)sig;
 
-	u_smp_wmb();
+	dump_stats();
+	xsk_socket__delete(xsks[0]->xsk);
+	(void)xsk_umem__delete(umem);
+	remove_xdp_program();
 
-	*uq->producer = uq->cached_prod;
-	return 0;
+	exit(EXIT_SUCCESS);
 }
 
-static inline int xq_enq_tx_only(struct xdp_uqueue *uq,
-				 unsigned int id, unsigned int ndescs)
+static void __exit_with_error(int error, const char *file, const char *func,
+			      int line)
 {
-	struct xdp_desc *r = uq->ring;
-	unsigned int i;
-
-	if (xq_nb_free(uq, ndescs) < ndescs)
-		return -ENOSPC;
-
-	for (i = 0; i < ndescs; i++) {
-		u32 idx = uq->cached_prod++ & uq->mask;
-
-		r[idx].addr	= (id + i) << FRAME_SHIFT;
-		r[idx].len	= sizeof(pkt_data) - 1;
-	}
-
-	u_smp_wmb();
-
-	*uq->producer = uq->cached_prod;
-	return 0;
+	fprintf(stderr, "%s:%s:%i: errno: %d/\"%s\"\n", file, func,
+		line, error, strerror(error));
+	dump_stats();
+	remove_xdp_program();
+	exit(EXIT_FAILURE);
 }
 
-static inline int xq_deq(struct xdp_uqueue *uq,
-			 struct xdp_desc *descs,
-			 int ndescs)
-{
-	struct xdp_desc *r = uq->ring;
-	unsigned int idx;
-	int i, entries;
-
-	entries = xq_nb_avail(uq, ndescs);
-
-	u_smp_rmb();
-
-	for (i = 0; i < entries; i++) {
-		idx = uq->cached_cons++ & uq->mask;
-		descs[i] = r[idx];
-	}
-
-	if (entries > 0) {
-		u_smp_wmb();
+#define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, \
+						 __LINE__)
 
-		*uq->consumer = uq->cached_cons;
-	}
-
-	return entries;
-}
+static const char pkt_data[] =
+	"\x3c\xfd\xfe\x9e\x7f\x71\xec\xb1\xd7\x98\x3a\xc0\x08\x00\x45\x00"
+	"\x00\x2e\x00\x00\x00\x00\x40\x11\x88\x97\x05\x08\x07\x08\xc8\x14"
+	"\x1e\x04\x10\x92\x10\x92\x00\x1a\x6d\xa3\x34\x33\x1f\x69\x40\x6b"
+	"\x54\x59\xb6\x14\x2d\x11\x44\xbf\xaf\xd9\xbe\xaa";
 
 static void swap_mac_addresses(void *data)
 {
@@ -397,258 +265,73 @@ static void hex_dump(void *pkt, size_t length, u64 addr)
 	printf("\n");
 }
 
-static size_t gen_eth_frame(char *frame)
+static size_t gen_eth_frame(struct xsk_umem_info *umem, u64 addr)
 {
-	memcpy(frame, pkt_data, sizeof(pkt_data) - 1);
+	memcpy(xsk_umem__get_data(umem->umem, addr), pkt_data,
+	       sizeof(pkt_data) - 1);
 	return sizeof(pkt_data) - 1;
 }
 
-static struct xdp_umem *xdp_umem_configure(int sfd)
+static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size)
 {
-	int fq_size = FQ_NUM_DESCS, cq_size = CQ_NUM_DESCS;
-	struct xdp_mmap_offsets off;
-	struct xdp_umem_reg mr;
-	struct xdp_umem *umem;
-	socklen_t optlen;
-	void *bufs;
+	struct xsk_umem_info *umem;
+	int ret;
 
 	umem = calloc(1, sizeof(*umem));
-	lassert(umem);
-
-	lassert(posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */
-			       NUM_FRAMES * FRAME_SIZE) == 0);
-
-	mr.addr = (__u64)bufs;
-	mr.len = NUM_FRAMES * FRAME_SIZE;
-	mr.chunk_size = FRAME_SIZE;
-	mr.headroom = FRAME_HEADROOM;
-
-	lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)) == 0);
-	lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_FILL_RING, &fq_size,
-			   sizeof(int)) == 0);
-	lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &cq_size,
-			   sizeof(int)) == 0);
-
-	optlen = sizeof(off);
-	lassert(getsockopt(sfd, SOL_XDP, XDP_MMAP_OFFSETS, &off,
-			   &optlen) == 0);
-
-	umem->fq.map = mmap(0, off.fr.desc +
-			    FQ_NUM_DESCS * sizeof(u64),
-			    PROT_READ | PROT_WRITE,
-			    MAP_SHARED | MAP_POPULATE, sfd,
-			    XDP_UMEM_PGOFF_FILL_RING);
-	lassert(umem->fq.map != MAP_FAILED);
-
-	umem->fq.mask = FQ_NUM_DESCS - 1;
-	umem->fq.size = FQ_NUM_DESCS;
-	umem->fq.producer = umem->fq.map + off.fr.producer;
-	umem->fq.consumer = umem->fq.map + off.fr.consumer;
-	umem->fq.ring = umem->fq.map + off.fr.desc;
-	umem->fq.cached_cons = FQ_NUM_DESCS;
-
-	umem->cq.map = mmap(0, off.cr.desc +
-			     CQ_NUM_DESCS * sizeof(u64),
-			     PROT_READ | PROT_WRITE,
-			     MAP_SHARED | MAP_POPULATE, sfd,
-			     XDP_UMEM_PGOFF_COMPLETION_RING);
-	lassert(umem->cq.map != MAP_FAILED);
-
-	umem->cq.mask = CQ_NUM_DESCS - 1;
-	umem->cq.size = CQ_NUM_DESCS;
-	umem->cq.producer = umem->cq.map + off.cr.producer;
-	umem->cq.consumer = umem->cq.map + off.cr.consumer;
-	umem->cq.ring = umem->cq.map + off.cr.desc;
-
-	umem->frames = bufs;
-	umem->fd = sfd;
+	if (!umem)
+		exit_with_error(errno);
 
-	if (opt_bench == BENCH_TXONLY) {
-		int i;
-
-		for (i = 0; i < NUM_FRAMES * FRAME_SIZE; i += FRAME_SIZE)
-			(void)gen_eth_frame(&umem->frames[i]);
-	}
+	ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq,
+			       NULL);
+	if (ret)
+		exit_with_error(-ret);
 
 	return umem;
 }
 
-static struct xdpsock *xsk_configure(struct xdp_umem *umem)
+static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem)
 {
-	struct sockaddr_xdp sxdp = {};
-	struct xdp_mmap_offsets off;
-	int sfd, ndescs = NUM_DESCS;
-	struct xdpsock *xsk;
-	bool shared = true;
-	socklen_t optlen;
-	u64 i;
-
-	sfd = socket(PF_XDP, SOCK_RAW, 0);
-	lassert(sfd >= 0);
+	struct xsk_socket_config cfg;
+	struct xsk_socket_info *xsk;
+	int ret;
+	u32 idx;
+	int i;
 
 	xsk = calloc(1, sizeof(*xsk));
-	lassert(xsk);
-
-	xsk->sfd = sfd;
-	xsk->outstanding_tx = 0;
-
-	if (!umem) {
-		shared = false;
-		xsk->umem = xdp_umem_configure(sfd);
-	} else {
-		xsk->umem = umem;
-	}
-
-	lassert(setsockopt(sfd, SOL_XDP, XDP_RX_RING,
-			   &ndescs, sizeof(int)) == 0);
-	lassert(setsockopt(sfd, SOL_XDP, XDP_TX_RING,
-			   &ndescs, sizeof(int)) == 0);
-	optlen = sizeof(off);
-	lassert(getsockopt(sfd, SOL_XDP, XDP_MMAP_OFFSETS, &off,
-			   &optlen) == 0);
-
-	/* Rx */
-	xsk->rx.map = mmap(NULL,
-			   off.rx.desc +
-			   NUM_DESCS * sizeof(struct xdp_desc),
-			   PROT_READ | PROT_WRITE,
-			   MAP_SHARED | MAP_POPULATE, sfd,
-			   XDP_PGOFF_RX_RING);
-	lassert(xsk->rx.map != MAP_FAILED);
-
-	if (!shared) {
-		for (i = 0; i < NUM_DESCS * FRAME_SIZE; i += FRAME_SIZE)
-			lassert(umem_fill_to_kernel(&xsk->umem->fq, &i, 1)
-				== 0);
-	}
-
-	/* Tx */
-	xsk->tx.map = mmap(NULL,
-			   off.tx.desc +
-			   NUM_DESCS * sizeof(struct xdp_desc),
-			   PROT_READ | PROT_WRITE,
-			   MAP_SHARED | MAP_POPULATE, sfd,
-			   XDP_PGOFF_TX_RING);
-	lassert(xsk->tx.map != MAP_FAILED);
-
-	xsk->rx.mask = NUM_DESCS - 1;
-	xsk->rx.size = NUM_DESCS;
-	xsk->rx.producer = xsk->rx.map + off.rx.producer;
-	xsk->rx.consumer = xsk->rx.map + off.rx.consumer;
-	xsk->rx.ring = xsk->rx.map + off.rx.desc;
-
-	xsk->tx.mask = NUM_DESCS - 1;
-	xsk->tx.size = NUM_DESCS;
-	xsk->tx.producer = xsk->tx.map + off.tx.producer;
-	xsk->tx.consumer = xsk->tx.map + off.tx.consumer;
-	xsk->tx.ring = xsk->tx.map + off.tx.desc;
-	xsk->tx.cached_cons = NUM_DESCS;
-
-	sxdp.sxdp_family = PF_XDP;
-	sxdp.sxdp_ifindex = opt_ifindex;
-	sxdp.sxdp_queue_id = opt_queue;
-
-	if (shared) {
-		sxdp.sxdp_flags = XDP_SHARED_UMEM;
-		sxdp.sxdp_shared_umem_fd = umem->fd;
-	} else {
-		sxdp.sxdp_flags = opt_xdp_bind_flags;
-	}
-
-	lassert(bind(sfd, (struct sockaddr *)&sxdp, sizeof(sxdp)) == 0);
+	if (!xsk)
+		exit_with_error(errno);
+
+	xsk->umem = umem;
+	cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
+	cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
+	cfg.libbpf_flags = 0;
+	cfg.xdp_flags = opt_xdp_flags;
+	cfg.bind_flags = opt_xdp_bind_flags;
+	ret = xsk_socket__create(&xsk->xsk, opt_if, opt_queue, umem->umem,
+				 &xsk->rx, &xsk->tx, &cfg);
+	if (ret)
+		exit_with_error(-ret);
+
+	ret = bpf_get_link_xdp_id(opt_ifindex, &prog_id, opt_xdp_flags);
+	if (ret)
+		exit_with_error(-ret);
+
+	ret = xsk_ring_prod__reserve(&xsk->umem->fq,
+				     XSK_RING_PROD__DEFAULT_NUM_DESCS,
+				     &idx);
+	if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS)
+		exit_with_error(-ret);
+	for (i = 0;
+	     i < XSK_RING_PROD__DEFAULT_NUM_DESCS *
+		     XSK_UMEM__DEFAULT_FRAME_SIZE;
+	     i += XSK_UMEM__DEFAULT_FRAME_SIZE)
+		*xsk_ring_prod__fill_addr(&xsk->umem->fq, idx++) = i;
+	xsk_ring_prod__submit(&xsk->umem->fq,
+			      XSK_RING_PROD__DEFAULT_NUM_DESCS);
 
 	return xsk;
 }
 
-static void print_benchmark(bool running)
-{
-	const char *bench_str = "INVALID";
-
-	if (opt_bench == BENCH_RXDROP)
-		bench_str = "rxdrop";
-	else if (opt_bench == BENCH_TXONLY)
-		bench_str = "txonly";
-	else if (opt_bench == BENCH_L2FWD)
-		bench_str = "l2fwd";
-
-	printf("%s:%d %s ", opt_if, opt_queue, bench_str);
-	if (opt_xdp_flags & XDP_FLAGS_SKB_MODE)
-		printf("xdp-skb ");
-	else if (opt_xdp_flags & XDP_FLAGS_DRV_MODE)
-		printf("xdp-drv ");
-	else
-		printf("	");
-
-	if (opt_poll)
-		printf("poll() ");
-
-	if (running) {
-		printf("running...");
-		fflush(stdout);
-	}
-}
-
-static void dump_stats(void)
-{
-	unsigned long now = get_nsecs();
-	long dt = now - prev_time;
-	int i;
-
-	prev_time = now;
-
-	for (i = 0; i < num_socks && xsks[i]; i++) {
-		char *fmt = "%-15s %'-11.0f %'-11lu\n";
-		double rx_pps, tx_pps;
-
-		rx_pps = (xsks[i]->rx_npkts - xsks[i]->prev_rx_npkts) *
-			 1000000000. / dt;
-		tx_pps = (xsks[i]->tx_npkts - xsks[i]->prev_tx_npkts) *
-			 1000000000. / dt;
-
-		printf("\n sock%d@", i);
-		print_benchmark(false);
-		printf("\n");
-
-		printf("%-15s %-11s %-11s %-11.2f\n", "", "pps", "pkts",
-		       dt / 1000000000.);
-		printf(fmt, "rx", rx_pps, xsks[i]->rx_npkts);
-		printf(fmt, "tx", tx_pps, xsks[i]->tx_npkts);
-
-		xsks[i]->prev_rx_npkts = xsks[i]->rx_npkts;
-		xsks[i]->prev_tx_npkts = xsks[i]->tx_npkts;
-	}
-}
-
-static void *poller(void *arg)
-{
-	(void)arg;
-	for (;;) {
-		sleep(opt_interval);
-		dump_stats();
-	}
-
-	return NULL;
-}
-
-static void int_exit(int sig)
-{
-	__u32 curr_prog_id = 0;
-
-	(void)sig;
-	dump_stats();
-	if (bpf_get_link_xdp_id(opt_ifindex, &curr_prog_id, opt_xdp_flags)) {
-		printf("bpf_get_link_xdp_id failed\n");
-		exit(EXIT_FAILURE);
-	}
-	if (prog_id == curr_prog_id)
-		bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags);
-	else if (!curr_prog_id)
-		printf("couldn't find a prog id on a given interface\n");
-	else
-		printf("program on interface changed, not removing\n");
-	exit(EXIT_SUCCESS);
-}
-
 static struct option long_options[] = {
 	{"rxdrop", no_argument, 0, 'r'},
 	{"txonly", no_argument, 0, 't'},
@@ -656,7 +339,6 @@ static struct option long_options[] = {
 	{"interface", required_argument, 0, 'i'},
 	{"queue", required_argument, 0, 'q'},
 	{"poll", no_argument, 0, 'p'},
-	{"shared-buffer", no_argument, 0, 's'},
 	{"xdp-skb", no_argument, 0, 'S'},
 	{"xdp-native", no_argument, 0, 'N'},
 	{"interval", required_argument, 0, 'n'},
@@ -676,7 +358,6 @@ static void usage(const char *prog)
 		"  -i, --interface=n	Run on interface n\n"
 		"  -q, --queue=n	Use queue n (default 0)\n"
 		"  -p, --poll		Use poll syscall\n"
-		"  -s, --shared-buffer	Use shared packet buffer\n"
 		"  -S, --xdp-skb=n	Use XDP skb-mod\n"
 		"  -N, --xdp-native=n	Enfore XDP native mode\n"
 		"  -n, --interval=n	Specify statistics update interval (default 1 sec).\n"
@@ -715,9 +396,6 @@ static void parse_command_line(int argc, char **argv)
 		case 'q':
 			opt_queue = atoi(optarg);
 			break;
-		case 's':
-			opt_shared_packet_buffer = 1;
-			break;
 		case 'p':
 			opt_poll = 1;
 			break;
@@ -751,75 +429,104 @@ static void parse_command_line(int argc, char **argv)
 			opt_if);
 		usage(basename(argv[0]));
 	}
+
 }
 
-static void kick_tx(int fd)
+static void kick_tx(struct xsk_socket_info *xsk)
 {
 	int ret;
 
-	ret = sendto(fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
+	ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0);
 	if (ret >= 0 || errno == ENOBUFS || errno == EAGAIN || errno == EBUSY)
 		return;
-	lassert(0);
+	exit_with_error(errno);
 }
 
-static inline void complete_tx_l2fwd(struct xdpsock *xsk)
+static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk)
 {
-	u64 descs[BATCH_SIZE];
+	u32 idx_cq, idx_fq;
 	unsigned int rcvd;
 	size_t ndescs;
 
 	if (!xsk->outstanding_tx)
 		return;
 
-	kick_tx(xsk->sfd);
+	kick_tx(xsk);
 	ndescs = (xsk->outstanding_tx > BATCH_SIZE) ? BATCH_SIZE :
-		 xsk->outstanding_tx;
+		xsk->outstanding_tx;
 
 	/* re-add completed Tx buffers */
-	rcvd = umem_complete_from_kernel(&xsk->umem->cq, descs, ndescs);
+	rcvd = xsk_ring_cons__peek(&xsk->umem->cq, ndescs, &idx_cq);
 	if (rcvd > 0) {
-		umem_fill_to_kernel(&xsk->umem->fq, descs, rcvd);
+		unsigned int i;
+		int ret;
+
+		ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
+		while (ret != rcvd) {
+			if (ret < 0)
+				exit_with_error(-ret);
+			ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd,
+						     &idx_fq);
+		}
+		for (i = 0; i < rcvd; i++)
+			*xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) =
+				*xsk_ring_cons__comp_addr(&xsk->umem->cq,
+							  idx_cq++);
+
+		xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
+		xsk_ring_cons__release(&xsk->umem->cq, rcvd);
 		xsk->outstanding_tx -= rcvd;
 		xsk->tx_npkts += rcvd;
 	}
 }
 
-static inline void complete_tx_only(struct xdpsock *xsk)
+static inline void complete_tx_only(struct xsk_socket_info *xsk)
 {
-	u64 descs[BATCH_SIZE];
 	unsigned int rcvd;
+	u32 idx;
 
 	if (!xsk->outstanding_tx)
 		return;
 
-	kick_tx(xsk->sfd);
+	kick_tx(xsk);
 
-	rcvd = umem_complete_from_kernel(&xsk->umem->cq, descs, BATCH_SIZE);
+	rcvd = xsk_ring_cons__peek(&xsk->umem->cq, BATCH_SIZE, &idx);
 	if (rcvd > 0) {
+		xsk_ring_cons__release(&xsk->umem->cq, rcvd);
 		xsk->outstanding_tx -= rcvd;
 		xsk->tx_npkts += rcvd;
 	}
 }
 
-static void rx_drop(struct xdpsock *xsk)
+static void rx_drop(struct xsk_socket_info *xsk)
 {
-	struct xdp_desc descs[BATCH_SIZE];
 	unsigned int rcvd, i;
+	u32 idx_rx, idx_fq = 0;
+	int ret;
 
-	rcvd = xq_deq(&xsk->rx, descs, BATCH_SIZE);
+	rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx);
 	if (!rcvd)
 		return;
 
+	ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
+	while (ret != rcvd) {
+		if (ret < 0)
+			exit_with_error(-ret);
+		ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
+	}
+
 	for (i = 0; i < rcvd; i++) {
-		char *pkt = xq_get_data(xsk, descs[i].addr);
+		u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr;
+		u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len;
+		char *pkt = xsk_umem__get_data(xsk->umem->umem, addr);
 
-		hex_dump(pkt, descs[i].len, descs[i].addr);
+		hex_dump(pkt, len, addr);
+		*xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = addr;
 	}
 
+	xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
+	xsk_ring_cons__release(&xsk->rx, rcvd);
 	xsk->rx_npkts += rcvd;
-
-	umem_fill_to_kernel_ex(&xsk->umem->fq, descs, rcvd);
 }
 
 static void rx_drop_all(void)
@@ -830,7 +537,7 @@ static void rx_drop_all(void)
 	memset(fds, 0, sizeof(fds));
 
 	for (i = 0; i < num_socks; i++) {
-		fds[i].fd = xsks[i]->sfd;
+		fds[i].fd = xsk_socket__fd(xsks[i]->xsk);
 		fds[i].events = POLLIN;
 		timeout = 1000; /* 1sn */
 	}
@@ -847,14 +554,14 @@ static void rx_drop_all(void)
 	}
 }
 
-static void tx_only(struct xdpsock *xsk)
+static void tx_only(struct xsk_socket_info *xsk)
 {
 	int timeout, ret, nfds = 1;
 	struct pollfd fds[nfds + 1];
-	unsigned int idx = 0;
+	u32 idx, frame_nb = 0;
 
 	memset(fds, 0, sizeof(fds));
-	fds[0].fd = xsk->sfd;
+	fds[0].fd = xsk_socket__fd(xsk->xsk);
 	fds[0].events = POLLOUT;
 	timeout = 1000; /* 1sn */
 
@@ -864,50 +571,73 @@ static void tx_only(struct xdpsock *xsk)
 			if (ret <= 0)
 				continue;
 
-			if (fds[0].fd != xsk->sfd ||
-			    !(fds[0].revents & POLLOUT))
+			if (!(fds[0].revents & POLLOUT))
 				continue;
 		}
 
-		if (xq_nb_free(&xsk->tx, BATCH_SIZE) >= BATCH_SIZE) {
-			lassert(xq_enq_tx_only(&xsk->tx, idx, BATCH_SIZE) == 0);
+		if (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) ==
+		    BATCH_SIZE) {
+			unsigned int i;
 
+			for (i = 0; i < BATCH_SIZE; i++) {
+				xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr
+					= (frame_nb + i) <<
+					XSK_UMEM__DEFAULT_FRAME_SHIFT;
+				xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len =
+					sizeof(pkt_data) - 1;
+			}
+
+			xsk_ring_prod__submit(&xsk->tx, BATCH_SIZE);
 			xsk->outstanding_tx += BATCH_SIZE;
-			idx += BATCH_SIZE;
-			idx %= NUM_FRAMES;
+			frame_nb += BATCH_SIZE;
+			frame_nb %= NUM_FRAMES;
 		}
 
 		complete_tx_only(xsk);
 	}
 }
 
-static void l2fwd(struct xdpsock *xsk)
+static void l2fwd(struct xsk_socket_info *xsk)
 {
 	for (;;) {
-		struct xdp_desc descs[BATCH_SIZE];
 		unsigned int rcvd, i;
+		u32 idx_rx, idx_tx = 0;
 		int ret;
 
 		for (;;) {
 			complete_tx_l2fwd(xsk);
 
-			rcvd = xq_deq(&xsk->rx, descs, BATCH_SIZE);
+			rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE,
+						   &idx_rx);
 			if (rcvd > 0)
 				break;
 		}
 
+		ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
+		while (ret != rcvd) {
+			if (ret < 0)
+				exit_with_error(-ret);
+			ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
+		}
+
 		for (i = 0; i < rcvd; i++) {
-			char *pkt = xq_get_data(xsk, descs[i].addr);
+			u64 addr = xsk_ring_cons__rx_desc(&xsk->rx,
+							  idx_rx)->addr;
+			u32 len = xsk_ring_cons__rx_desc(&xsk->rx,
+							 idx_rx++)->len;
+			char *pkt = xsk_umem__get_data(xsk->umem->umem, addr);
 
 			swap_mac_addresses(pkt);
 
-			hex_dump(pkt, descs[i].len, descs[i].addr);
+			hex_dump(pkt, len, addr);
+			xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = addr;
+			xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len;
 		}
 
-		xsk->rx_npkts += rcvd;
+		xsk_ring_prod__submit(&xsk->tx, rcvd);
+		xsk_ring_cons__release(&xsk->rx, rcvd);
 
-		ret = xq_enq(&xsk->tx, descs, rcvd);
-		lassert(ret == 0);
+		xsk->rx_npkts += rcvd;
 		xsk->outstanding_tx += rcvd;
 	}
 }
@@ -915,17 +645,10 @@ static void l2fwd(struct xdpsock *xsk)
 int main(int argc, char **argv)
 {
 	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
-	struct bpf_prog_load_attr prog_load_attr = {
-		.prog_type	= BPF_PROG_TYPE_XDP,
-	};
-	int prog_fd, qidconf_map, xsks_map;
-	struct bpf_prog_info info = {};
-	__u32 info_len = sizeof(info);
-	struct bpf_object *obj;
-	char xdp_filename[256];
-	struct bpf_map *map;
-	int i, ret, key = 0;
+	struct xsk_umem_info *umem;
 	pthread_t pt;
+	void *bufs;
+	int ret;
 
 	parse_command_line(argc, argv);
 
@@ -935,67 +658,22 @@ int main(int argc, char **argv)
 		exit(EXIT_FAILURE);
 	}
 
-	snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv[0]);
-	prog_load_attr.file = xdp_filename;
-
-	if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
-		exit(EXIT_FAILURE);
-	if (prog_fd < 0) {
-		fprintf(stderr, "ERROR: no program found: %s\n",
-			strerror(prog_fd));
-		exit(EXIT_FAILURE);
-	}
-
-	map = bpf_object__find_map_by_name(obj, "qidconf_map");
-	qidconf_map = bpf_map__fd(map);
-	if (qidconf_map < 0) {
-		fprintf(stderr, "ERROR: no qidconf map found: %s\n",
-			strerror(qidconf_map));
-		exit(EXIT_FAILURE);
-	}
-
-	map = bpf_object__find_map_by_name(obj, "xsks_map");
-	xsks_map = bpf_map__fd(map);
-	if (xsks_map < 0) {
-		fprintf(stderr, "ERROR: no xsks map found: %s\n",
-			strerror(xsks_map));
-		exit(EXIT_FAILURE);
-	}
-
-	if (bpf_set_link_xdp_fd(opt_ifindex, prog_fd, opt_xdp_flags) < 0) {
-		fprintf(stderr, "ERROR: link set xdp fd failed\n");
-		exit(EXIT_FAILURE);
-	}
-
-	ret = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
-	if (ret) {
-		printf("can't get prog info - %s\n", strerror(errno));
-		return 1;
-	}
-	prog_id = info.id;
+	ret = posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */
+			     NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE);
+	if (ret)
+		exit_with_error(ret);
 
-	ret = bpf_map_update_elem(qidconf_map, &key, &opt_queue, 0);
-	if (ret) {
-		fprintf(stderr, "ERROR: bpf_map_update_elem qidconf\n");
-		exit(EXIT_FAILURE);
-	}
+       /* Create sockets... */
+	umem = xsk_configure_umem(bufs,
+				  NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE);
+	xsks[num_socks++] = xsk_configure_socket(umem);
 
-	/* Create sockets... */
-	xsks[num_socks++] = xsk_configure(NULL);
-
-#if RR_LB
-	for (i = 0; i < MAX_SOCKS - 1; i++)
-		xsks[num_socks++] = xsk_configure(xsks[0]->umem);
-#endif
+	if (opt_bench == BENCH_TXONLY) {
+		int i;
 
-	/* ...and insert them into the map. */
-	for (i = 0; i < num_socks; i++) {
-		key = i;
-		ret = bpf_map_update_elem(xsks_map, &key, &xsks[i]->sfd, 0);
-		if (ret) {
-			fprintf(stderr, "ERROR: bpf_map_update_elem %d\n", i);
-			exit(EXIT_FAILURE);
-		}
+		for (i = 0; i < NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE;
+		     i += XSK_UMEM__DEFAULT_FRAME_SIZE)
+			(void)gen_eth_frame(umem, i);
 	}
 
 	signal(SIGINT, int_exit);
@@ -1005,7 +683,8 @@ int main(int argc, char **argv)
 	setlocale(LC_ALL, "");
 
 	ret = pthread_create(&pt, NULL, poller, NULL);
-	lassert(ret == 0);
+	if (ret)
+		exit_with_error(ret);
 
 	prev_time = get_nsecs();
 
-- 
2.7.4


^ permalink raw reply related

* [PATCH bpf] xsk: add missing smp_rmb() in xsk_mmap
From: Magnus Karlsson @ 2019-02-08 13:13 UTC (permalink / raw)
  To: magnus.karlsson, bjorn.topel, ast, daniel, netdev

All the setup code in AF_XDP is protected by a mutex with the
exception of the mmap code that cannot use it. To make sure that a
process banging on the mmap call at the same time as another process
is setting up the socket, smp_wmb() calls were added in the umem
registration code and the queue creation code, so that the published
structures that xsk_mmap needs would be consistent. However, the
corresponding smp_rmb() calls were not added to the xsk_mmap
code. This patch adds these calls.

Fixes: 37b076933a8e3 ("xsk: add missing write- and data-dependency barrier")
Fixes: c0c77d8fb787c ("xsk: add user memory registration support sockopt")
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
---
 net/xdp/xsk.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index a032684..45f3b52 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -669,6 +669,8 @@ static int xsk_mmap(struct file *file, struct socket *sock,
 		if (!umem)
 			return -EINVAL;
 
+		/* Matches the smp_wmb() in XDP_UMEM_REG */
+		smp_rmb();
 		if (offset == XDP_UMEM_PGOFF_FILL_RING)
 			q = READ_ONCE(umem->fq);
 		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
@@ -678,6 +680,8 @@ static int xsk_mmap(struct file *file, struct socket *sock,
 	if (!q)
 		return -EINVAL;
 
+	/* Matches the smp_wmb() in xsk_init_queue */
+	smp_rmb();
 	qpg = virt_to_head_page(q->ring);
 	if (size > (PAGE_SIZE << compound_order(qpg)))
 		return -EINVAL;
-- 
2.7.4


^ permalink raw reply related

* [PATCH 1/2] can: c_can: support 64 message objects for D_CAN
From: Andrejs Cainikovs @ 2019-02-08 13:17 UTC (permalink / raw)
  To: Wolfgang Grandegger, Marc Kleine-Budde, David S. Miller,
	linux-can@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
  Cc: Patrick Zysset
In-Reply-To: <20190208131738.27668-1-andrejs.cainikovs@netmodule.com>

D_CAN supports up to 128 message objects, comparing to 32 on C_CAN.
However, some CPUs with D_CAN controller have their own limits:
TI AM335x Sitara CPU, for example, supports max of 64 message objects.

This patch extends max D_CAN message objects up to 64.

Signed-off-by: Andrejs Cainikovs <andrejs.cainikovs@netmodule.com>
---
 drivers/net/can/c_can/Kconfig | 12 ++++++++++++
 drivers/net/can/c_can/c_can.c | 42 ++++++++++++++++++++++--------------------
 drivers/net/can/c_can/c_can.h | 20 ++++++++++++++++----
 3 files changed, 50 insertions(+), 24 deletions(-)

diff --git a/drivers/net/can/c_can/Kconfig b/drivers/net/can/c_can/Kconfig
index 61ffc12d8fd8..6c1ada7291df 100644
--- a/drivers/net/can/c_can/Kconfig
+++ b/drivers/net/can/c_can/Kconfig
@@ -20,4 +20,16 @@ config CAN_C_CAN_PCI
 	---help---
 	  This driver adds support for the C_CAN/D_CAN chips connected
 	  to the PCI bus.
+
+config CAN_C_CAN_DCAN_64_MSG_OBJECTS
+	bool "Use 64 message objects for D_CAN"
+	default n
+	---help---
+	  D_CAN supports up to 128 message objects, comparing to 32 on
+	  C_CAN. However, some CPUs with D_CAN controller have their
+	  own limits: TI AM335x Sitara CPU, for example, supports max
+	  of 64 message objects.
+	  Enabling this option extends max D_CAN message objects up to
+	  64.
+
 endif
diff --git a/drivers/net/can/c_can/c_can.c b/drivers/net/can/c_can/c_can.c
index 606b7d8ffe13..5d695b89b459 100644
--- a/drivers/net/can/c_can/c_can.c
+++ b/drivers/net/can/c_can/c_can.c
@@ -352,15 +352,6 @@ static void c_can_setup_tx_object(struct net_device *dev, int iface,
 	}
 }
 
-static inline void c_can_activate_all_lower_rx_msg_obj(struct net_device *dev,
-						       int iface)
-{
-	int i;
-
-	for (i = C_CAN_MSG_OBJ_RX_FIRST; i <= C_CAN_MSG_RX_LOW_LAST; i++)
-		c_can_object_get(dev, iface, i, IF_COMM_CLR_NEWDAT);
-}
-
 static int c_can_handle_lost_msg_obj(struct net_device *dev,
 				     int iface, int objno, u32 ctrl)
 {
@@ -706,7 +697,16 @@ static void c_can_do_tx(struct net_device *dev)
 	struct net_device_stats *stats = &dev->stats;
 	u32 idx, obj, pkts = 0, bytes = 0, pend, clr;
 
-	clr = pend = priv->read_reg(priv, C_CAN_INTPND2_REG);
+#ifdef CONFIG_CAN_C_CAN_DCAN_64_MSG_OBJECTS
+	if (priv->type == BOSCH_D_CAN) {
+		pend = priv->read_reg32(priv, C_CAN_INTPND3_REG);
+	} else {
+#endif
+		pend = priv->read_reg(priv, C_CAN_INTPND2_REG);
+#ifdef CONFIG_CAN_C_CAN_DCAN_64_MSG_OBJECTS
+	}
+#endif
+	clr = pend;
 
 	while ((idx = ffs(pend))) {
 		idx--;
@@ -817,7 +817,17 @@ static int c_can_read_objects(struct net_device *dev, struct c_can_priv *priv,
 
 static inline u32 c_can_get_pending(struct c_can_priv *priv)
 {
-	u32 pend = priv->read_reg(priv, C_CAN_NEWDAT1_REG);
+	u32 pend;
+
+#ifdef CONFIG_CAN_C_CAN_DCAN_64_MSG_OBJECTS
+	if (priv->type == BOSCH_D_CAN) {
+		pend = priv->read_reg32(priv, C_CAN_NEWDAT1_REG);
+	} else {
+#endif
+		pend = priv->read_reg(priv, C_CAN_NEWDAT1_REG);
+#ifdef CONFIG_CAN_C_CAN_DCAN_64_MSG_OBJECTS
+	}
+#endif
 
 	return pend;
 }
@@ -828,8 +838,7 @@ static inline u32 c_can_get_pending(struct c_can_priv *priv)
  * c_can core saves a received CAN message into the first free message
  * object it finds free (starting with the lowest). Bits NEWDAT and
  * INTPND are set for this message object indicating that a new message
- * has arrived. To work-around this issue, we keep two groups of message
- * objects whose partitioning is defined by C_CAN_MSG_OBJ_RX_SPLIT.
+ * has arrived.
  *
  * We clear the newdat bit right away.
  *
@@ -840,13 +849,6 @@ static int c_can_do_rx_poll(struct net_device *dev, int quota)
 	struct c_can_priv *priv = netdev_priv(dev);
 	u32 pkts = 0, pend = 0, toread, n;
 
-	/*
-	 * It is faster to read only one 16bit register. This is only possible
-	 * for a maximum number of 16 objects.
-	 */
-	BUILD_BUG_ON_MSG(C_CAN_MSG_OBJ_RX_LAST > 16,
-			"Implementation does not support more message objects than 16");
-
 	while (quota > 0) {
 		if (!pend) {
 			pend = c_can_get_pending(priv);
diff --git a/drivers/net/can/c_can/c_can.h b/drivers/net/can/c_can/c_can.h
index 8acdc7fa4792..e44b686a70a2 100644
--- a/drivers/net/can/c_can/c_can.h
+++ b/drivers/net/can/c_can/c_can.h
@@ -23,9 +23,15 @@
 #define C_CAN_H
 
 /* message object split */
+
+#ifdef CONFIG_CAN_C_CAN_DCAN_64_MSG_OBJECTS
+#define C_CAN_NO_OF_OBJECTS	64
+#else
 #define C_CAN_NO_OF_OBJECTS	32
-#define C_CAN_MSG_OBJ_RX_NUM	16
-#define C_CAN_MSG_OBJ_TX_NUM	16
+#endif
+
+#define C_CAN_MSG_OBJ_TX_NUM	(C_CAN_NO_OF_OBJECTS >> 1)
+#define C_CAN_MSG_OBJ_RX_NUM	(C_CAN_NO_OF_OBJECTS - C_CAN_MSG_OBJ_TX_NUM)
 
 #define C_CAN_MSG_OBJ_RX_FIRST	1
 #define C_CAN_MSG_OBJ_RX_LAST	(C_CAN_MSG_OBJ_RX_FIRST + \
@@ -35,9 +41,11 @@
 #define C_CAN_MSG_OBJ_TX_LAST	(C_CAN_MSG_OBJ_TX_FIRST + \
 				C_CAN_MSG_OBJ_TX_NUM - 1)
 
-#define C_CAN_MSG_OBJ_RX_SPLIT	9
-#define C_CAN_MSG_RX_LOW_LAST	(C_CAN_MSG_OBJ_RX_SPLIT - 1)
+#ifdef CONFIG_CAN_C_CAN_DCAN_64_MSG_OBJECTS
+#define RECEIVE_OBJECT_BITS	0xffffffff
+#else
 #define RECEIVE_OBJECT_BITS	0x0000ffff
+#endif
 
 enum reg {
 	C_CAN_CTRL_REG = 0,
@@ -76,6 +84,8 @@ enum reg {
 	C_CAN_NEWDAT2_REG,
 	C_CAN_INTPND1_REG,
 	C_CAN_INTPND2_REG,
+	C_CAN_INTPND3_REG,
+	C_CAN_INTPND4_REG,
 	C_CAN_MSGVAL1_REG,
 	C_CAN_MSGVAL2_REG,
 	C_CAN_FUNCTION_REG,
@@ -137,6 +147,8 @@ static const u16 reg_map_d_can[] = {
 	[C_CAN_NEWDAT2_REG]	= 0x9E,
 	[C_CAN_INTPND1_REG]	= 0xB0,
 	[C_CAN_INTPND2_REG]	= 0xB2,
+	[C_CAN_INTPND3_REG]	= 0xB4,
+	[C_CAN_INTPND4_REG]	= 0xB6,
 	[C_CAN_MSGVAL1_REG]	= 0xC4,
 	[C_CAN_MSGVAL2_REG]	= 0xC6,
 	[C_CAN_IF1_COMREQ_REG]	= 0x100,
-- 
2.11.0


^ permalink raw reply related

* [PATCH 2/2] can: c_can: configurable amount of D_CAN RX objects
From: Andrejs Cainikovs @ 2019-02-08 13:17 UTC (permalink / raw)
  To: Wolfgang Grandegger, Marc Kleine-Budde, David S. Miller,
	linux-can@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
  Cc: Patrick Zysset
In-Reply-To: <20190208131738.27668-1-andrejs.cainikovs@netmodule.com>

Make number of D_CAN RX message objects configurable. This will allow
having bigger (or smaller) RX buffer instead of 50/50 split for RX/TX.

Signed-off-by: Andrejs Cainikovs <andrejs.cainikovs@netmodule.com>
---
 drivers/net/can/c_can/Kconfig |  8 ++++++
 drivers/net/can/c_can/c_can.c | 64 +++++++++++++++++++++++++++++--------------
 drivers/net/can/c_can/c_can.h | 16 +++++------
 3 files changed, 58 insertions(+), 30 deletions(-)

diff --git a/drivers/net/can/c_can/Kconfig b/drivers/net/can/c_can/Kconfig
index 6c1ada7291df..949d2d12d71e 100644
--- a/drivers/net/can/c_can/Kconfig
+++ b/drivers/net/can/c_can/Kconfig
@@ -32,4 +32,12 @@ config CAN_C_CAN_DCAN_64_MSG_OBJECTS
 	  Enabling this option extends max D_CAN message objects up to
 	  64.
 
+config CAN_C_CAN_DCAN_RX_MSG_OBJECTS
+	int "Specify amount of D_CAN RX message objects"
+	depends on CAN_C_CAN_DCAN_64_MSG_OBJECTS
+	default 32
+	---help---
+	  Use specific number of message objects for RX, instead of
+	  50/50 split between RX/TX.
+
 endif
diff --git a/drivers/net/can/c_can/c_can.c b/drivers/net/can/c_can/c_can.c
index 5d695b89b459..675bc223e222 100644
--- a/drivers/net/can/c_can/c_can.c
+++ b/drivers/net/can/c_can/c_can.c
@@ -208,6 +208,26 @@ static const struct can_bittiming_const c_can_bittiming_const = {
 	.brp_inc = 1,
 };
 
+static inline u64 c_can_get_mask(int bits)
+{
+	return ((u64)1 << bits) - 1;
+}
+
+static inline int c_can_ffs64(u64 x)
+{
+	int b;
+
+	b = ffs(x);
+
+	if (!b) {
+		b = ffs(x >> 32);
+		if (b)
+			b += 32;
+	}
+
+	return b;
+}
+
 static inline void c_can_pm_runtime_enable(const struct c_can_priv *priv)
 {
 	if (priv->device)
@@ -695,24 +715,23 @@ static void c_can_do_tx(struct net_device *dev)
 {
 	struct c_can_priv *priv = netdev_priv(dev);
 	struct net_device_stats *stats = &dev->stats;
-	u32 idx, obj, pkts = 0, bytes = 0, pend, clr;
+	u32 idx, obj, pkts = 0, bytes = 0;
+	u64 pend, clr;
 
+	/* Mask interrupt pending bits */
+	pend = priv->read_reg32(priv, C_CAN_INTPND1_REG);
 #ifdef CONFIG_CAN_C_CAN_DCAN_64_MSG_OBJECTS
 	if (priv->type == BOSCH_D_CAN) {
-		pend = priv->read_reg32(priv, C_CAN_INTPND3_REG);
-	} else {
-#endif
-		pend = priv->read_reg(priv, C_CAN_INTPND2_REG);
-#ifdef CONFIG_CAN_C_CAN_DCAN_64_MSG_OBJECTS
+		pend |= (u64)priv->read_reg32(priv, C_CAN_INTPND3_REG) << 32;
 	}
 #endif
-	clr = pend;
+	pend &= ~c_can_get_mask(C_CAN_MSG_OBJ_RX_NUM);
+	clr = pend >> C_CAN_MSG_OBJ_RX_NUM;
 
-	while ((idx = ffs(pend))) {
-		idx--;
-		pend &= ~(1 << idx);
-		obj = idx + C_CAN_MSG_OBJ_TX_FIRST;
+	while ((obj = c_can_ffs64(pend))) {
+		pend &= ~((u64)1 << (obj - 1));
 		c_can_inval_tx_object(dev, IF_RX, obj);
+		idx = obj - C_CAN_MSG_OBJ_TX_FIRST;
 		can_get_echo_skb(dev, idx);
 		bytes += priv->dlc[idx];
 		pkts++;
@@ -736,19 +755,19 @@ static void c_can_do_tx(struct net_device *dev)
  * raced with the hardware or failed to readout all upper
  * objects in the last run due to quota limit.
  */
-static u32 c_can_adjust_pending(u32 pend)
+static u64 c_can_adjust_pending(u64 pend)
 {
-	u32 weight, lasts;
+	u64 weight, lasts;
 
-	if (pend == RECEIVE_OBJECT_BITS)
+	if (pend == c_can_get_mask(C_CAN_MSG_OBJ_RX_NUM))
 		return pend;
 
 	/*
 	 * If the last set bit is larger than the number of pending
 	 * bits we have a gap.
 	 */
-	weight = hweight32(pend);
-	lasts = fls(pend);
+	weight = hweight64(pend);
+	lasts = fls64(pend);
 
 	/* If the bits are linear, nothing to do */
 	if (lasts == weight)
@@ -777,11 +796,11 @@ static inline void c_can_rx_finalize(struct net_device *dev,
 }
 
 static int c_can_read_objects(struct net_device *dev, struct c_can_priv *priv,
-			      u32 pend, int quota)
+			      u64 pend, int quota)
 {
 	u32 pkts = 0, ctrl, obj;
 
-	while ((obj = ffs(pend)) && quota > 0) {
+	while ((obj = c_can_ffs64(pend)) && quota > 0) {
 		pend &= ~BIT(obj - 1);
 
 		c_can_rx_object_get(dev, priv, obj);
@@ -815,13 +834,15 @@ static int c_can_read_objects(struct net_device *dev, struct c_can_priv *priv,
 	return pkts;
 }
 
-static inline u32 c_can_get_pending(struct c_can_priv *priv)
+static inline u64 c_can_get_pending(struct c_can_priv *priv)
 {
-	u32 pend;
+	u64 pend;
 
 #ifdef CONFIG_CAN_C_CAN_DCAN_64_MSG_OBJECTS
 	if (priv->type == BOSCH_D_CAN) {
 		pend = priv->read_reg32(priv, C_CAN_NEWDAT1_REG);
+		pend |= (u64)priv->read_reg32(priv, C_CAN_NEWDAT3_REG) << 32;
+		pend &= c_can_get_mask(C_CAN_MSG_OBJ_RX_NUM);
 	} else {
 #endif
 		pend = priv->read_reg(priv, C_CAN_NEWDAT1_REG);
@@ -847,7 +868,8 @@ static inline u32 c_can_get_pending(struct c_can_priv *priv)
 static int c_can_do_rx_poll(struct net_device *dev, int quota)
 {
 	struct c_can_priv *priv = netdev_priv(dev);
-	u32 pkts = 0, pend = 0, toread, n;
+	u32 pkts = 0, n;
+	u64 pend = 0, toread;
 
 	while (quota > 0) {
 		if (!pend) {
diff --git a/drivers/net/can/c_can/c_can.h b/drivers/net/can/c_can/c_can.h
index e44b686a70a2..4a0759ee249d 100644
--- a/drivers/net/can/c_can/c_can.h
+++ b/drivers/net/can/c_can/c_can.h
@@ -26,12 +26,12 @@
 
 #ifdef CONFIG_CAN_C_CAN_DCAN_64_MSG_OBJECTS
 #define C_CAN_NO_OF_OBJECTS	64
+#define C_CAN_MSG_OBJ_RX_NUM	CONFIG_CAN_C_CAN_DCAN_RX_MSG_OBJECTS
 #else
 #define C_CAN_NO_OF_OBJECTS	32
+#define C_CAN_MSG_OBJ_RX_NUM	16
 #endif
-
-#define C_CAN_MSG_OBJ_TX_NUM	(C_CAN_NO_OF_OBJECTS >> 1)
-#define C_CAN_MSG_OBJ_RX_NUM	(C_CAN_NO_OF_OBJECTS - C_CAN_MSG_OBJ_TX_NUM)
+#define C_CAN_MSG_OBJ_TX_NUM	(C_CAN_NO_OF_OBJECTS - C_CAN_MSG_OBJ_RX_NUM)
 
 #define C_CAN_MSG_OBJ_RX_FIRST	1
 #define C_CAN_MSG_OBJ_RX_LAST	(C_CAN_MSG_OBJ_RX_FIRST + \
@@ -41,12 +41,6 @@
 #define C_CAN_MSG_OBJ_TX_LAST	(C_CAN_MSG_OBJ_TX_FIRST + \
 				C_CAN_MSG_OBJ_TX_NUM - 1)
 
-#ifdef CONFIG_CAN_C_CAN_DCAN_64_MSG_OBJECTS
-#define RECEIVE_OBJECT_BITS	0xffffffff
-#else
-#define RECEIVE_OBJECT_BITS	0x0000ffff
-#endif
-
 enum reg {
 	C_CAN_CTRL_REG = 0,
 	C_CAN_CTRL_EX_REG,
@@ -82,6 +76,8 @@ enum reg {
 	C_CAN_TXRQST2_REG,
 	C_CAN_NEWDAT1_REG,
 	C_CAN_NEWDAT2_REG,
+	C_CAN_NEWDAT3_REG,
+	C_CAN_NEWDAT4_REG,
 	C_CAN_INTPND1_REG,
 	C_CAN_INTPND2_REG,
 	C_CAN_INTPND3_REG,
@@ -145,6 +141,8 @@ static const u16 reg_map_d_can[] = {
 	[C_CAN_TXRQST2_REG]	= 0x8A,
 	[C_CAN_NEWDAT1_REG]	= 0x9C,
 	[C_CAN_NEWDAT2_REG]	= 0x9E,
+	[C_CAN_NEWDAT3_REG]	= 0xA0,
+	[C_CAN_NEWDAT4_REG]	= 0xA2,
 	[C_CAN_INTPND1_REG]	= 0xB0,
 	[C_CAN_INTPND2_REG]	= 0xB2,
 	[C_CAN_INTPND3_REG]	= 0xB4,
-- 
2.11.0


^ permalink raw reply related

* Re: [PATCH net-next] ipvlan: decouple l3s mode dependencies from other modes
From: Florian Westphal @ 2019-02-08 13:29 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: davem, netdev, Mahesh Bandewar, Florian Westphal,
	Martynas Pumputis
In-Reply-To: <20190208125531.28970-1-daniel@iogearbox.net>

Daniel Borkmann <daniel@iogearbox.net> wrote:
> Right now ipvlan has a hard dependency on CONFIG_NETFILTER and
> otherwise it cannot be built. However, the only ipvlan operation
> mode that actually depends on netfilter is l3s, everything else
> is independent of it. Break this hard dependency such that users
> are able to use ipvlan l3 mode on systems where netfilter is not
> compiled in.
> 
> Therefore, this adds a hidden CONFIG_IPVLAN_L3S bool which is
> defaulting to y when CONFIG_NETFILTER is set in order to retain
> existing behavior for l3s. All l3s related code is refactored
> into ipvlan_l3s.c that is compiled in when enabled.

IIRC L3S is only meaningful with netfilter anyway, so this
looks like a good thing to do.

Acked-by: Florian Westphal <fw@strlen.de>


^ permalink raw reply

* [PATCH 1/2] can: c_can: support 64 message objects for D_CAN
From: Andrejs Cainikovs @ 2019-02-08 13:31 UTC (permalink / raw)
  To: Wolfgang Grandegger, Marc Kleine-Budde, David S . Miller,
	linux-can@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
  Cc: Patrick Zysset
In-Reply-To: <20190208132954.28166-1-andrejs.cainikovs@netmodule.com>

D_CAN supports up to 128 message objects, comparing to 32 on C_CAN.
However, some CPUs with D_CAN controller have their own limits:
TI AM335x Sitara CPU, for example, supports max of 64 message objects.

This patch extends max D_CAN message objects up to 64.

Signed-off-by: Andrejs Cainikovs <andrejs.cainikovs@netmodule.com>
---
 drivers/net/can/c_can/Kconfig | 12 ++++++++++++
 drivers/net/can/c_can/c_can.c | 42 ++++++++++++++++++++++--------------------
 drivers/net/can/c_can/c_can.h | 20 ++++++++++++++++----
 3 files changed, 50 insertions(+), 24 deletions(-)

diff --git a/drivers/net/can/c_can/Kconfig b/drivers/net/can/c_can/Kconfig
index 61ffc12d8fd8..6c1ada7291df 100644
--- a/drivers/net/can/c_can/Kconfig
+++ b/drivers/net/can/c_can/Kconfig
@@ -20,4 +20,16 @@ config CAN_C_CAN_PCI
 	---help---
 	  This driver adds support for the C_CAN/D_CAN chips connected
 	  to the PCI bus.
+
+config CAN_C_CAN_DCAN_64_MSG_OBJECTS
+	bool "Use 64 message objects for D_CAN"
+	default n
+	---help---
+	  D_CAN supports up to 128 message objects, comparing to 32 on
+	  C_CAN. However, some CPUs with D_CAN controller have their
+	  own limits: TI AM335x Sitara CPU, for example, supports max
+	  of 64 message objects.
+	  Enabling this option extends max D_CAN message objects up to
+	  64.
+
 endif
diff --git a/drivers/net/can/c_can/c_can.c b/drivers/net/can/c_can/c_can.c
index 606b7d8ffe13..5d695b89b459 100644
--- a/drivers/net/can/c_can/c_can.c
+++ b/drivers/net/can/c_can/c_can.c
@@ -352,15 +352,6 @@ static void c_can_setup_tx_object(struct net_device *dev, int iface,
 	}
 }
 
-static inline void c_can_activate_all_lower_rx_msg_obj(struct net_device *dev,
-						       int iface)
-{
-	int i;
-
-	for (i = C_CAN_MSG_OBJ_RX_FIRST; i <= C_CAN_MSG_RX_LOW_LAST; i++)
-		c_can_object_get(dev, iface, i, IF_COMM_CLR_NEWDAT);
-}
-
 static int c_can_handle_lost_msg_obj(struct net_device *dev,
 				     int iface, int objno, u32 ctrl)
 {
@@ -706,7 +697,16 @@ static void c_can_do_tx(struct net_device *dev)
 	struct net_device_stats *stats = &dev->stats;
 	u32 idx, obj, pkts = 0, bytes = 0, pend, clr;
 
-	clr = pend = priv->read_reg(priv, C_CAN_INTPND2_REG);
+#ifdef CONFIG_CAN_C_CAN_DCAN_64_MSG_OBJECTS
+	if (priv->type == BOSCH_D_CAN) {
+		pend = priv->read_reg32(priv, C_CAN_INTPND3_REG);
+	} else {
+#endif
+		pend = priv->read_reg(priv, C_CAN_INTPND2_REG);
+#ifdef CONFIG_CAN_C_CAN_DCAN_64_MSG_OBJECTS
+	}
+#endif
+	clr = pend;
 
 	while ((idx = ffs(pend))) {
 		idx--;
@@ -817,7 +817,17 @@ static int c_can_read_objects(struct net_device *dev, struct c_can_priv *priv,
 
 static inline u32 c_can_get_pending(struct c_can_priv *priv)
 {
-	u32 pend = priv->read_reg(priv, C_CAN_NEWDAT1_REG);
+	u32 pend;
+
+#ifdef CONFIG_CAN_C_CAN_DCAN_64_MSG_OBJECTS
+	if (priv->type == BOSCH_D_CAN) {
+		pend = priv->read_reg32(priv, C_CAN_NEWDAT1_REG);
+	} else {
+#endif
+		pend = priv->read_reg(priv, C_CAN_NEWDAT1_REG);
+#ifdef CONFIG_CAN_C_CAN_DCAN_64_MSG_OBJECTS
+	}
+#endif
 
 	return pend;
 }
@@ -828,8 +838,7 @@ static inline u32 c_can_get_pending(struct c_can_priv *priv)
  * c_can core saves a received CAN message into the first free message
  * object it finds free (starting with the lowest). Bits NEWDAT and
  * INTPND are set for this message object indicating that a new message
- * has arrived. To work-around this issue, we keep two groups of message
- * objects whose partitioning is defined by C_CAN_MSG_OBJ_RX_SPLIT.
+ * has arrived.
  *
  * We clear the newdat bit right away.
  *
@@ -840,13 +849,6 @@ static int c_can_do_rx_poll(struct net_device *dev, int quota)
 	struct c_can_priv *priv = netdev_priv(dev);
 	u32 pkts = 0, pend = 0, toread, n;
 
-	/*
-	 * It is faster to read only one 16bit register. This is only possible
-	 * for a maximum number of 16 objects.
-	 */
-	BUILD_BUG_ON_MSG(C_CAN_MSG_OBJ_RX_LAST > 16,
-			"Implementation does not support more message objects than 16");
-
 	while (quota > 0) {
 		if (!pend) {
 			pend = c_can_get_pending(priv);
diff --git a/drivers/net/can/c_can/c_can.h b/drivers/net/can/c_can/c_can.h
index 8acdc7fa4792..e44b686a70a2 100644
--- a/drivers/net/can/c_can/c_can.h
+++ b/drivers/net/can/c_can/c_can.h
@@ -23,9 +23,15 @@
 #define C_CAN_H
 
 /* message object split */
+
+#ifdef CONFIG_CAN_C_CAN_DCAN_64_MSG_OBJECTS
+#define C_CAN_NO_OF_OBJECTS	64
+#else
 #define C_CAN_NO_OF_OBJECTS	32
-#define C_CAN_MSG_OBJ_RX_NUM	16
-#define C_CAN_MSG_OBJ_TX_NUM	16
+#endif
+
+#define C_CAN_MSG_OBJ_TX_NUM	(C_CAN_NO_OF_OBJECTS >> 1)
+#define C_CAN_MSG_OBJ_RX_NUM	(C_CAN_NO_OF_OBJECTS - C_CAN_MSG_OBJ_TX_NUM)
 
 #define C_CAN_MSG_OBJ_RX_FIRST	1
 #define C_CAN_MSG_OBJ_RX_LAST	(C_CAN_MSG_OBJ_RX_FIRST + \
@@ -35,9 +41,11 @@
 #define C_CAN_MSG_OBJ_TX_LAST	(C_CAN_MSG_OBJ_TX_FIRST + \
 				C_CAN_MSG_OBJ_TX_NUM - 1)
 
-#define C_CAN_MSG_OBJ_RX_SPLIT	9
-#define C_CAN_MSG_RX_LOW_LAST	(C_CAN_MSG_OBJ_RX_SPLIT - 1)
+#ifdef CONFIG_CAN_C_CAN_DCAN_64_MSG_OBJECTS
+#define RECEIVE_OBJECT_BITS	0xffffffff
+#else
 #define RECEIVE_OBJECT_BITS	0x0000ffff
+#endif
 
 enum reg {
 	C_CAN_CTRL_REG = 0,
@@ -76,6 +84,8 @@ enum reg {
 	C_CAN_NEWDAT2_REG,
 	C_CAN_INTPND1_REG,
 	C_CAN_INTPND2_REG,
+	C_CAN_INTPND3_REG,
+	C_CAN_INTPND4_REG,
 	C_CAN_MSGVAL1_REG,
 	C_CAN_MSGVAL2_REG,
 	C_CAN_FUNCTION_REG,
@@ -137,6 +147,8 @@ static const u16 reg_map_d_can[] = {
 	[C_CAN_NEWDAT2_REG]	= 0x9E,
 	[C_CAN_INTPND1_REG]	= 0xB0,
 	[C_CAN_INTPND2_REG]	= 0xB2,
+	[C_CAN_INTPND3_REG]	= 0xB4,
+	[C_CAN_INTPND4_REG]	= 0xB6,
 	[C_CAN_MSGVAL1_REG]	= 0xC4,
 	[C_CAN_MSGVAL2_REG]	= 0xC6,
 	[C_CAN_IF1_COMREQ_REG]	= 0x100,
-- 
2.11.0


^ permalink raw reply related

* [PATCH 2/2] can: c_can: configurable amount of D_CAN RX objects
From: Andrejs Cainikovs @ 2019-02-08 13:31 UTC (permalink / raw)
  To: Wolfgang Grandegger, Marc Kleine-Budde, David S . Miller,
	linux-can@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
  Cc: Patrick Zysset
In-Reply-To: <20190208132954.28166-1-andrejs.cainikovs@netmodule.com>

Make number of D_CAN RX message objects configurable. This will allow
having bigger (or smaller) RX buffer instead of 50/50 split for RX/TX.

Signed-off-by: Andrejs Cainikovs <andrejs.cainikovs@netmodule.com>
---
 drivers/net/can/c_can/Kconfig |  8 ++++++
 drivers/net/can/c_can/c_can.c | 64 +++++++++++++++++++++++++++++--------------
 drivers/net/can/c_can/c_can.h | 16 +++++------
 3 files changed, 58 insertions(+), 30 deletions(-)

diff --git a/drivers/net/can/c_can/Kconfig b/drivers/net/can/c_can/Kconfig
index 6c1ada7291df..949d2d12d71e 100644
--- a/drivers/net/can/c_can/Kconfig
+++ b/drivers/net/can/c_can/Kconfig
@@ -32,4 +32,12 @@ config CAN_C_CAN_DCAN_64_MSG_OBJECTS
 	  Enabling this option extends max D_CAN message objects up to
 	  64.
 
+config CAN_C_CAN_DCAN_RX_MSG_OBJECTS
+	int "Specify amount of D_CAN RX message objects"
+	depends on CAN_C_CAN_DCAN_64_MSG_OBJECTS
+	default 32
+	---help---
+	  Use specific number of message objects for RX, instead of
+	  50/50 split between RX/TX.
+
 endif
diff --git a/drivers/net/can/c_can/c_can.c b/drivers/net/can/c_can/c_can.c
index 5d695b89b459..675bc223e222 100644
--- a/drivers/net/can/c_can/c_can.c
+++ b/drivers/net/can/c_can/c_can.c
@@ -208,6 +208,26 @@ static const struct can_bittiming_const c_can_bittiming_const = {
 	.brp_inc = 1,
 };
 
+static inline u64 c_can_get_mask(int bits)
+{
+	return ((u64)1 << bits) - 1;
+}
+
+static inline int c_can_ffs64(u64 x)
+{
+	int b;
+
+	b = ffs(x);
+
+	if (!b) {
+		b = ffs(x >> 32);
+		if (b)
+			b += 32;
+	}
+
+	return b;
+}
+
 static inline void c_can_pm_runtime_enable(const struct c_can_priv *priv)
 {
 	if (priv->device)
@@ -695,24 +715,23 @@ static void c_can_do_tx(struct net_device *dev)
 {
 	struct c_can_priv *priv = netdev_priv(dev);
 	struct net_device_stats *stats = &dev->stats;
-	u32 idx, obj, pkts = 0, bytes = 0, pend, clr;
+	u32 idx, obj, pkts = 0, bytes = 0;
+	u64 pend, clr;
 
+	/* Mask interrupt pending bits */
+	pend = priv->read_reg32(priv, C_CAN_INTPND1_REG);
 #ifdef CONFIG_CAN_C_CAN_DCAN_64_MSG_OBJECTS
 	if (priv->type == BOSCH_D_CAN) {
-		pend = priv->read_reg32(priv, C_CAN_INTPND3_REG);
-	} else {
-#endif
-		pend = priv->read_reg(priv, C_CAN_INTPND2_REG);
-#ifdef CONFIG_CAN_C_CAN_DCAN_64_MSG_OBJECTS
+		pend |= (u64)priv->read_reg32(priv, C_CAN_INTPND3_REG) << 32;
 	}
 #endif
-	clr = pend;
+	pend &= ~c_can_get_mask(C_CAN_MSG_OBJ_RX_NUM);
+	clr = pend >> C_CAN_MSG_OBJ_RX_NUM;
 
-	while ((idx = ffs(pend))) {
-		idx--;
-		pend &= ~(1 << idx);
-		obj = idx + C_CAN_MSG_OBJ_TX_FIRST;
+	while ((obj = c_can_ffs64(pend))) {
+		pend &= ~((u64)1 << (obj - 1));
 		c_can_inval_tx_object(dev, IF_RX, obj);
+		idx = obj - C_CAN_MSG_OBJ_TX_FIRST;
 		can_get_echo_skb(dev, idx);
 		bytes += priv->dlc[idx];
 		pkts++;
@@ -736,19 +755,19 @@ static void c_can_do_tx(struct net_device *dev)
  * raced with the hardware or failed to readout all upper
  * objects in the last run due to quota limit.
  */
-static u32 c_can_adjust_pending(u32 pend)
+static u64 c_can_adjust_pending(u64 pend)
 {
-	u32 weight, lasts;
+	u64 weight, lasts;
 
-	if (pend == RECEIVE_OBJECT_BITS)
+	if (pend == c_can_get_mask(C_CAN_MSG_OBJ_RX_NUM))
 		return pend;
 
 	/*
 	 * If the last set bit is larger than the number of pending
 	 * bits we have a gap.
 	 */
-	weight = hweight32(pend);
-	lasts = fls(pend);
+	weight = hweight64(pend);
+	lasts = fls64(pend);
 
 	/* If the bits are linear, nothing to do */
 	if (lasts == weight)
@@ -777,11 +796,11 @@ static inline void c_can_rx_finalize(struct net_device *dev,
 }
 
 static int c_can_read_objects(struct net_device *dev, struct c_can_priv *priv,
-			      u32 pend, int quota)
+			      u64 pend, int quota)
 {
 	u32 pkts = 0, ctrl, obj;
 
-	while ((obj = ffs(pend)) && quota > 0) {
+	while ((obj = c_can_ffs64(pend)) && quota > 0) {
 		pend &= ~BIT(obj - 1);
 
 		c_can_rx_object_get(dev, priv, obj);
@@ -815,13 +834,15 @@ static int c_can_read_objects(struct net_device *dev, struct c_can_priv *priv,
 	return pkts;
 }
 
-static inline u32 c_can_get_pending(struct c_can_priv *priv)
+static inline u64 c_can_get_pending(struct c_can_priv *priv)
 {
-	u32 pend;
+	u64 pend;
 
 #ifdef CONFIG_CAN_C_CAN_DCAN_64_MSG_OBJECTS
 	if (priv->type == BOSCH_D_CAN) {
 		pend = priv->read_reg32(priv, C_CAN_NEWDAT1_REG);
+		pend |= (u64)priv->read_reg32(priv, C_CAN_NEWDAT3_REG) << 32;
+		pend &= c_can_get_mask(C_CAN_MSG_OBJ_RX_NUM);
 	} else {
 #endif
 		pend = priv->read_reg(priv, C_CAN_NEWDAT1_REG);
@@ -847,7 +868,8 @@ static inline u32 c_can_get_pending(struct c_can_priv *priv)
 static int c_can_do_rx_poll(struct net_device *dev, int quota)
 {
 	struct c_can_priv *priv = netdev_priv(dev);
-	u32 pkts = 0, pend = 0, toread, n;
+	u32 pkts = 0, n;
+	u64 pend = 0, toread;
 
 	while (quota > 0) {
 		if (!pend) {
diff --git a/drivers/net/can/c_can/c_can.h b/drivers/net/can/c_can/c_can.h
index e44b686a70a2..4a0759ee249d 100644
--- a/drivers/net/can/c_can/c_can.h
+++ b/drivers/net/can/c_can/c_can.h
@@ -26,12 +26,12 @@
 
 #ifdef CONFIG_CAN_C_CAN_DCAN_64_MSG_OBJECTS
 #define C_CAN_NO_OF_OBJECTS	64
+#define C_CAN_MSG_OBJ_RX_NUM	CONFIG_CAN_C_CAN_DCAN_RX_MSG_OBJECTS
 #else
 #define C_CAN_NO_OF_OBJECTS	32
+#define C_CAN_MSG_OBJ_RX_NUM	16
 #endif
-
-#define C_CAN_MSG_OBJ_TX_NUM	(C_CAN_NO_OF_OBJECTS >> 1)
-#define C_CAN_MSG_OBJ_RX_NUM	(C_CAN_NO_OF_OBJECTS - C_CAN_MSG_OBJ_TX_NUM)
+#define C_CAN_MSG_OBJ_TX_NUM	(C_CAN_NO_OF_OBJECTS - C_CAN_MSG_OBJ_RX_NUM)
 
 #define C_CAN_MSG_OBJ_RX_FIRST	1
 #define C_CAN_MSG_OBJ_RX_LAST	(C_CAN_MSG_OBJ_RX_FIRST + \
@@ -41,12 +41,6 @@
 #define C_CAN_MSG_OBJ_TX_LAST	(C_CAN_MSG_OBJ_TX_FIRST + \
 				C_CAN_MSG_OBJ_TX_NUM - 1)
 
-#ifdef CONFIG_CAN_C_CAN_DCAN_64_MSG_OBJECTS
-#define RECEIVE_OBJECT_BITS	0xffffffff
-#else
-#define RECEIVE_OBJECT_BITS	0x0000ffff
-#endif
-
 enum reg {
 	C_CAN_CTRL_REG = 0,
 	C_CAN_CTRL_EX_REG,
@@ -82,6 +76,8 @@ enum reg {
 	C_CAN_TXRQST2_REG,
 	C_CAN_NEWDAT1_REG,
 	C_CAN_NEWDAT2_REG,
+	C_CAN_NEWDAT3_REG,
+	C_CAN_NEWDAT4_REG,
 	C_CAN_INTPND1_REG,
 	C_CAN_INTPND2_REG,
 	C_CAN_INTPND3_REG,
@@ -145,6 +141,8 @@ static const u16 reg_map_d_can[] = {
 	[C_CAN_TXRQST2_REG]	= 0x8A,
 	[C_CAN_NEWDAT1_REG]	= 0x9C,
 	[C_CAN_NEWDAT2_REG]	= 0x9E,
+	[C_CAN_NEWDAT3_REG]	= 0xA0,
+	[C_CAN_NEWDAT4_REG]	= 0xA2,
 	[C_CAN_INTPND1_REG]	= 0xB0,
 	[C_CAN_INTPND2_REG]	= 0xB2,
 	[C_CAN_INTPND3_REG]	= 0xB4,
-- 
2.11.0


^ permalink raw reply related

* [PATCH 0/2] D_CAN RX buffer size improvements
From: Andrejs Cainikovs @ 2019-02-08 13:31 UTC (permalink / raw)
  To: Wolfgang Grandegger, Marc Kleine-Budde, David S . Miller,
	linux-can@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
  Cc: Patrick Zysset

Re-sending entire patchset due to missed cover letter, sorry.

This patchset introduces support for 64 D_CAN message objects with an option of
unequal split between RX/TX.

The rationale behind this is that there are lots of frame loss on higher bus
speeds. Below are test results from my custom Sitara AM3352 based board:

  Sender: timeout 15m cangen can0 -g 0 -i x
  Target: candump can0,0~0,#FFFFFFFF -td -c -d -e

  * Without patches:
    - 15 minute RX test, 500kbps
    - 16 RX / 16 TX message objects
    - 77 received frames lost out of 4649415

  * With patches applied:
    - 15 hours RX test, 500kbps
    - 56 RX / 8 TX message objects
    - 41 received frames lost out of 279303376

Please note, I do not have ability to test pure C_CAN, so it is left untested.

---

Andrejs Cainikovs (2):
  can: c_can: support 64 message objects for D_CAN
  can: c_can: configurable amount of D_CAN RX objects

 drivers/net/can/c_can/Kconfig | 20 ++++++++++
 drivers/net/can/c_can/c_can.c | 93 +++++++++++++++++++++++++++----------------
 drivers/net/can/c_can/c_can.h | 20 +++++++---
 3 files changed, 94 insertions(+), 39 deletions(-)

---
2.11.0


^ permalink raw reply

* Re: [PATCH net-next] ipvlan: decouple l3s mode dependencies from other modes
From: Daniel Borkmann @ 2019-02-08 13:39 UTC (permalink / raw)
  To: Florian Westphal; +Cc: davem, netdev, Mahesh Bandewar, Martynas Pumputis
In-Reply-To: <20190208132941.bmbvuqckocuyhujr@breakpoint.cc>

On 02/08/2019 02:29 PM, Florian Westphal wrote:
> Daniel Borkmann <daniel@iogearbox.net> wrote:
>> Right now ipvlan has a hard dependency on CONFIG_NETFILTER and
>> otherwise it cannot be built. However, the only ipvlan operation
>> mode that actually depends on netfilter is l3s, everything else
>> is independent of it. Break this hard dependency such that users
>> are able to use ipvlan l3 mode on systems where netfilter is not
>> compiled in.
>>
>> Therefore, this adds a hidden CONFIG_IPVLAN_L3S bool which is
>> defaulting to y when CONFIG_NETFILTER is set in order to retain
>> existing behavior for l3s. All l3s related code is refactored
>> into ipvlan_l3s.c that is compiled in when enabled.
> 
> IIRC L3S is only meaningful with netfilter anyway, so this
> looks like a good thing to do.

Yep, agree, it doesn't work without it since inside ipvlan's nfhook
there is the final switch to the target ipvlan slave device. :)

> Acked-by: Florian Westphal <fw@strlen.de>

Thanks!

^ permalink raw reply

* Re: [PATCH v2 3/6] ethtool: introduce new ioctl for per-queue settings
From: Willem de Bruijn @ 2019-02-08 13:56 UTC (permalink / raw)
  To: Michal Kubecek
  Cc: Network Development, Nunley, Nicholas D, Kirsher, Jeffrey T,
	linville@tuxdriver.com, nhorman@redhat.com, sassmann@redhat.com
In-Reply-To: <20190208070200.GA7035@unicorn.suse.cz>

On Fri, Feb 8, 2019 at 2:04 AM Michal Kubecek <mkubecek@suse.cz> wrote:
>
> On Thu, Feb 07, 2019 at 11:37:19PM +0000, Nunley, Nicholas D wrote:
> > From: Michal Kubecek [mailto:mkubecek@suse.cz]
> > > On Tue, Feb 05, 2019 at 04:01:03PM -0800, Jeff Kirsher wrote:
> > > > Introduce a new ioctl for setting per-queue parameters.
> > > > Users can apply commands to specific queues by setting SUB_COMMAND
> > > and
> > > > queue_mask with the following ethtool command:
> > > >
> > > >  ethtool --set-perqueue-command DEVNAME [queue_mask %x]
> > > SUB_COMMAND
> > >
> > > The "set" part is IMHO a bit confusing in combination with "show" type
> > > subcommands.
> >
> > I'm not a fan of the "set" either. This patch set had already gone
> > through some review before it was passed on to me, and the command
> > naming wasn't a previous point of contention and I didn't want to
> > disturb the parts that weren't "broken", but since you've brought it
> > up I agree that this may not be the best name. I think
> > "--perqueue-command" is more appropriate. Does this seem reasonable to
> > you?
>
> That sounds good, I would say either that or "--per-queue".

+1 --per-queue is short and easy to remember.

And perhaps reserve short-hand '-Q'? I expect this to become a popular
option. Especially if perhaps eventually it can dump per queue stats (-S).

^ permalink raw reply

* Re: [PATCH net-next] net: phy: disregard "Clause 22 registers present" bit in get_phy_c45_devs_in_pkg
From: Andrew Lunn @ 2019-02-08 14:02 UTC (permalink / raw)
  To: Heiner Kallweit; +Cc: Florian Fainelli, David Miller, netdev@vger.kernel.org
In-Reply-To: <47ec59c0-27c0-32e3-c75e-7bdfa99551c9@gmail.com>

On Fri, Feb 08, 2019 at 08:12:47AM +0100, Heiner Kallweit wrote:
> Bit 0 in register 1.5 doesn't represent a device but is a flag that
> Clause 22 registers are present. Therefore disregard this bit when
> populating the device list. If code needs this information it
> should read register 1.5 directly instead of accessing the device
> list. Because this bit doesn't represent a device I didn't add a
> MDIO_MMD_C22PRESENT constant or similar.
> 
> Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
> ---
>  drivers/net/phy/phy_device.c | 3 ++-
>  include/uapi/linux/mdio.h    | 1 +
>  2 files changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
> index 9369e1323..c2316a117 100644
> --- a/drivers/net/phy/phy_device.c
> +++ b/drivers/net/phy/phy_device.c
> @@ -682,7 +682,8 @@ static int get_phy_c45_devs_in_pkg(struct mii_bus *bus, int addr, int dev_addr,
>  	phy_reg = mdiobus_read(bus, addr, reg_addr);
>  	if (phy_reg < 0)
>  		return -EIO;
> -	*devices_in_package |= (phy_reg & 0xffff);
> +	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
> +	*devices_in_package |= (phy_reg & 0xfffe);

Hi Heiner

Just for readability, can we use BIT(0) in there somehow?

>  
>  	return 0;
>  }
> diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h
> index 2e6e309f0..0e012b168 100644
> --- a/include/uapi/linux/mdio.h
> +++ b/include/uapi/linux/mdio.h
> @@ -115,6 +115,7 @@
>  
>  /* Device present registers. */
>  #define MDIO_DEVS_PRESENT(devad)	(1 << (devad))
> +#define MDIO_DEVS_C22PRESENT		MDIO_DEVS_PRESENT(0)

Err. The commit message says you did not add this...

     Andrew

^ permalink raw reply

* Re: [PATCH net-next] ipmr: ip6mr: Create new sockopt to clear mfc cache or vifs
From: Nicolas Dichtel @ 2019-02-08 14:18 UTC (permalink / raw)
  To: Callum Sinclair, davem, kuznet, yoshfuji, nikolay, netdev,
	linux-kernel
In-Reply-To: <20190208041103.31299-2-callum.sinclair@alliedtelesis.co.nz>

Le 08/02/2019 à 05:11, Callum Sinclair a écrit :
> Currently the only way to clear the mfc cache was to delete the entries
mfc stands for 'multicast forwarding cache', so 'mfc cache' is a bit strange.

> one by one using the MRT_DEL_MFC socket option or to destroy and
> recreate the socket.
Note that if entries were added with MRT_ADD_MFC_PROXY, they will survive to the
socket destruction. This is not the case with your new cmd. Is it intended?
Maybe a third option (something like MRT_FLUSH_MFC_PROXY) would be useful to
avoid confusion?

> 
> Create a new socket option which will clear the multicast forwarding
> cache on the socket without destroying the socket. The new socket option
> MRT_FLUSH_ENTRIES will clear all multicast entries on the sockets table
> and the MRT_FLUSH_VIFS will delete all multicast vifs on the socket
> table.
> 
> Signed-off-by: Callum Sinclair <callum.sinclair@alliedtelesis.co.nz>
> ---
>  include/uapi/linux/mroute.h  |  7 +++-
>  include/uapi/linux/mroute6.h |  7 +++-
>  net/ipv4/ipmr.c              | 69 ++++++++++++++++++++-------------
>  net/ipv6/ip6mr.c             | 74 ++++++++++++++++++++++--------------
>  4 files changed, 100 insertions(+), 57 deletions(-)
> 
> diff --git a/include/uapi/linux/mroute.h b/include/uapi/linux/mroute.h
> index 5d37a9ccce63..673495ca3495 100644
> --- a/include/uapi/linux/mroute.h
> +++ b/include/uapi/linux/mroute.h
> @@ -28,12 +28,17 @@
>  #define MRT_TABLE	(MRT_BASE+9)	/* Specify mroute table ID		*/
>  #define MRT_ADD_MFC_PROXY	(MRT_BASE+10)	/* Add a (*,*|G) mfc entry	*/
>  #define MRT_DEL_MFC_PROXY	(MRT_BASE+11)	/* Del a (*,*|G) mfc entry	*/
> -#define MRT_MAX		(MRT_BASE+11)
> +#define MRT_FLUSH	(MRT_BASE+12)	/* Flush all multicast entries and vifs	*/
nit: "Flush all mfc entries and/or vifs" ?

> +#define MRT_MAX		(MRT_BASE+12)
>  
>  #define SIOCGETVIFCNT	SIOCPROTOPRIVATE	/* IP protocol privates */
>  #define SIOCGETSGCNT	(SIOCPROTOPRIVATE+1)
>  #define SIOCGETRPF	(SIOCPROTOPRIVATE+2)
>  
> +/* MRT_FLUSH optional flags */
> +#define MRT_FLUSH_ENTRIES	1	/* For flushing all multicast entries */
Maybe MRT_FLUSH_MFC is more consistent with the previous naming (MRT_ADD_MFC, etc.)


Regards,
Nicolas

^ permalink raw reply

* Re: Clang warning in drivers/net/ethernet/intel/igc/igc_ethtool.c
From: Michal Kubecek @ 2019-02-08 14:34 UTC (permalink / raw)
  To: netdev
  Cc: Nathan Chancellor, Sasha Neftin, Jeff Kirsher, Aaron Brown,
	intel-wired-lan, linux-kernel, Nick Desaulniers
In-Reply-To: <20190208050921.GA8758@archlinux-ryzen>

On Thu, Feb 07, 2019 at 10:09:21PM -0700, Nathan Chancellor wrote:
> Hi all,
> 
> After commit 8c5ad0dae93c ("igc: Add ethtool support"), Clang warns:
> 
> drivers/net/ethernet/intel/igc/igc_ethtool.c:9:19: warning: variable 'igc_priv_flags_strings' is not needed and will not be emitted [-Wunneeded-internal-declaration]
> static const char igc_priv_flags_strings[][ETH_GSTRING_LEN] = {
>                   ^
> 1 warning generated.
> 
> igc_priv_flags_strings is only used in an ARRAY_SIZE macro, which is a
> compile time evaluation, so no reference to it is being emitted in the
> final assembly. Is it actually needed and was forgotten to be used
> somewhere or could it be eliminated so that Clang no longer warns?

That's because the driver provides get_priv_flags() and set_priv_flags()
callbacks in its ethtool_ops to allow querying and setting legacy-rx
private flag but it does not provide get_sset_count() and get_strings()
to provide list of private flags to userspace ethtool.

Michal Kubecek

^ permalink raw reply

* Re: [PATCH net-next] ipmr: ip6mr: Create new sockopt to clear mfc cache or vifs
From: Nikolay Aleksandrov @ 2019-02-08 14:43 UTC (permalink / raw)
  To: nicolas.dichtel, Callum Sinclair, davem, kuznet, yoshfuji, netdev,
	linux-kernel
In-Reply-To: <be9e655b-6911-c644-de63-9aabcd131763@6wind.com>

On 08/02/2019 16:18, Nicolas Dichtel wrote:
> Le 08/02/2019 à 05:11, Callum Sinclair a écrit :
>> Currently the only way to clear the mfc cache was to delete the entries
> mfc stands for 'multicast forwarding cache', so 'mfc cache' is a bit strange.
> 
>> one by one using the MRT_DEL_MFC socket option or to destroy and
>> recreate the socket.
> Note that if entries were added with MRT_ADD_MFC_PROXY, they will survive to the
> socket destruction. This is not the case with your new cmd. Is it intended?

I think you're referring to MFC_STATIC entries (sk != mroute_sk). It
doesn't matter how you add an entry - they all get cleaned up if added
through the mroute socket.

> Maybe a third option (something like MRT_FLUSH_MFC_PROXY) would be useful to
> avoid confusion?
> 
>>
>> Create a new socket option which will clear the multicast forwarding
>> cache on the socket without destroying the socket. The new socket option
>> MRT_FLUSH_ENTRIES will clear all multicast entries on the sockets table
>> and the MRT_FLUSH_VIFS will delete all multicast vifs on the socket
>> table.
>>
>> Signed-off-by: Callum Sinclair <callum.sinclair@alliedtelesis.co.nz>
>> ---
>>  include/uapi/linux/mroute.h  |  7 +++-
>>  include/uapi/linux/mroute6.h |  7 +++-
>>  net/ipv4/ipmr.c              | 69 ++++++++++++++++++++-------------
>>  net/ipv6/ip6mr.c             | 74 ++++++++++++++++++++++--------------
>>  4 files changed, 100 insertions(+), 57 deletions(-)
>>
>> diff --git a/include/uapi/linux/mroute.h b/include/uapi/linux/mroute.h
>> index 5d37a9ccce63..673495ca3495 100644
>> --- a/include/uapi/linux/mroute.h
>> +++ b/include/uapi/linux/mroute.h
>> @@ -28,12 +28,17 @@
>>  #define MRT_TABLE	(MRT_BASE+9)	/* Specify mroute table ID		*/
>>  #define MRT_ADD_MFC_PROXY	(MRT_BASE+10)	/* Add a (*,*|G) mfc entry	*/
>>  #define MRT_DEL_MFC_PROXY	(MRT_BASE+11)	/* Del a (*,*|G) mfc entry	*/
>> -#define MRT_MAX		(MRT_BASE+11)
>> +#define MRT_FLUSH	(MRT_BASE+12)	/* Flush all multicast entries and vifs	*/
> nit: "Flush all mfc entries and/or vifs" ?
> 
>> +#define MRT_MAX		(MRT_BASE+12)
>>  
>>  #define SIOCGETVIFCNT	SIOCPROTOPRIVATE	/* IP protocol privates */
>>  #define SIOCGETSGCNT	(SIOCPROTOPRIVATE+1)
>>  #define SIOCGETRPF	(SIOCPROTOPRIVATE+2)
>>  
>> +/* MRT_FLUSH optional flags */
>> +#define MRT_FLUSH_ENTRIES	1	/* For flushing all multicast entries */
> Maybe MRT_FLUSH_MFC is more consistent with the previous naming (MRT_ADD_MFC, etc.)
> 
> 
> Regards,
> Nicolas
> 


^ permalink raw reply

* Re: [PATCH net-next] ipmr: ip6mr: Create new sockopt to clear mfc cache or vifs
From: Nicolas Dichtel @ 2019-02-08 15:08 UTC (permalink / raw)
  To: Nikolay Aleksandrov, Callum Sinclair, davem, kuznet, yoshfuji,
	netdev, linux-kernel
In-Reply-To: <5597e8bc-c23e-1f59-0442-260a7b4ca83d@cumulusnetworks.com>

Le 08/02/2019 à 15:43, Nikolay Aleksandrov a écrit :
> On 08/02/2019 16:18, Nicolas Dichtel wrote:
>> Le 08/02/2019 à 05:11, Callum Sinclair a écrit :
>>> Currently the only way to clear the mfc cache was to delete the entries
>> mfc stands for 'multicast forwarding cache', so 'mfc cache' is a bit strange.
>>
>>> one by one using the MRT_DEL_MFC socket option or to destroy and
>>> recreate the socket.
>> Note that if entries were added with MRT_ADD_MFC_PROXY, they will survive to the
>> socket destruction. This is not the case with your new cmd. Is it intended?
> 
> I think you're referring to MFC_STATIC entries (sk != mroute_sk). It
> doesn't matter how you add an entry - they all get cleaned up if added
> through the mroute socket.
Yes, right.
MRT_FLUSH_MFC_STATIC ?

^ permalink raw reply

* Re: [PATCH] mwifiex: don't print error message on coex event
From: Kalle Valo @ 2019-02-08 15:28 UTC (permalink / raw)
  To: Stefan Agner
  Cc: amitkarwar, nishants, gbhat, huxinming820, davem, linux-wireless,
	netdev, linux-kernel, Stefan Agner
In-Reply-To: <20190128154310.17322-1-stefan@agner.ch>

Stefan Agner <stefan@agner.ch> wrote:

> The BT coex event is not an error condition. Don't print an error
> message in this case. The same even in sta_event.c prints a
> message using the debug level already.
> 
> Signed-off-by: Stefan Agner <stefan@agner.ch>
> Reviewed-by: Brian Norris <briannorris@chromium.org>

Patch applied to wireless-drivers-next.git, thanks.

5208fea64e4f mwifiex: don't print error message on coex event

-- 
https://patchwork.kernel.org/patch/10783935/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches


^ permalink raw reply

* Re: [PATCH][RESEND] rtlwifi: remove set but not used variable 'cmd_seq'
From: Kalle Valo @ 2019-02-08 15:29 UTC (permalink / raw)
  To: YueHaibing
  Cc: davem, pkshih, linux-kernel, netdev, linux-wireless, YueHaibing
In-Reply-To: <20190130031526.20308-1-yuehaibing@huawei.com>

YueHaibing <yuehaibing@huawei.com> wrote:

> Fixes gcc '-Wunused-but-set-variable' warning:
> 
> drivers/net/wireless/realtek/rtlwifi/base.c: In function 'rtl_c2h_content_parsing':
> drivers/net/wireless/realtek/rtlwifi/base.c:2313:13: warning:
>  variable 'cmd_seq' set but not used [-Wunused-but-set-variable]
> 
> Signed-off-by: YueHaibing <yuehaibing@huawei.com>
> Acked-by: Ping-Ke Shih <pkshih@realtek.com>

Patch applied to wireless-drivers-next.git, thanks.

78f2ef18e185 rtlwifi: remove set but not used variable 'cmd_seq'

-- 
https://patchwork.kernel.org/patch/10787619/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches


^ permalink raw reply

* Re: [PATCH] rsi:  fix indentation issue with a code block
From: Kalle Valo @ 2019-02-08 15:31 UTC (permalink / raw)
  To: Colin King
  Cc: Amitkumar Karwar, Siva Rebbagondla, David S . Miller,
	linux-wireless, netdev, kernel-janitors, linux-kernel
In-Reply-To: <20190207121141.28336-1-colin.king@canonical.com>

Colin King <colin.king@canonical.com> wrote:

> From: Colin Ian King <colin.king@canonical.com>
> 
> There is a block of code that is indented at the wrong level. Fix this
> with extra tabbing.
> 
> Signed-off-by: Colin Ian King <colin.king@canonical.com>

Patch applied to wireless-drivers-next.git, thanks.

34025a1056a3 rsi: fix indentation issue with a code block

-- 
https://patchwork.kernel.org/patch/10801013/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches


^ permalink raw reply

* Re: [PATCH net-next] net: dsa: use struct_size() in devm_kzalloc()
From: Vivien Didelot @ 2019-02-08 15:32 UTC (permalink / raw)
  To: Gustavo A. R. Silva
  Cc: Andrew Lunn, Florian Fainelli, David S. Miller, netdev,
	linux-kernel, Gustavo A. R. Silva
In-Reply-To: <20190208011603.GA927@embeddedor>

On Thu, 7 Feb 2019 19:16:03 -0600, "Gustavo A. R. Silva" <gustavo@embeddedor.com> wrote:
> One of the more common cases of allocation size calculations is finding
> the size of a structure that has a zero-sized array at the end, along
> with memory for some number of elements for that array. For example:
> 
> struct foo {
>     int stuff;
>     struct boo entry[];
> };
> 
> size = sizeof(struct foo) + count * sizeof(struct boo);
> instance = alloc(size, GFP_KERNEL)
> 
> Instead of leaving these open-coded and prone to type mistakes, we can
> now use the new struct_size() helper:
> 
> instance = alloc(struct_size(instance, entry, count), GFP_KERNEL)
> 
> Notice that, in this case, variable size is not necessary, hence it is
> removed.
> 
> This code was detected with the help of Coccinelle.
> 
> Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>

Reviewed-by: Vivien Didelot <vivien.didelot@gmail.com>

^ permalink raw reply

* Re: Kernel 5.0-rc5 regression with NAT, bisected to: netfilter: nat: remove l4proto->manip_pkt
From: Sander Eikelenboom @ 2019-02-08 15:34 UTC (permalink / raw)
  To: Florian Westphal; +Cc: Pablo Neira Ayuso, David S. Miller, netdev, linux-kernel
In-Reply-To: <20190208115447.ojyfhenf44kqs3w4@breakpoint.cc>

On 08/02/2019 12:54, Florian Westphal wrote:
> Florian Westphal <fw@strlen.de> wrote:
>> Sander Eikelenboom <linux@eikelenboom.it> wrote:
>>> L.S.,
>>>
>>> While trying out a 5.0-RC5 kernel I seem to have stumbled over a regression with NAT.
>>> (using an nftables firewall with NAT and connection tracking).
>>>
>>> Unfortunately it isn't too obvious since no errors are logged, but on clients it
>>> causes symptoms like firefox intermittently not being able to load pages with:
>>>     Network Protocol Error
>>>     An error occurred during a connection to www.example.com
>>>     The page you are trying to view cannot be shown because an error in the network protocol was detected.
>>>     Please contact the website owners to inform them of this problem.
>>>
>>> But it's only intermittently, so i can still visit some webpages with clients, 
>>> could be that packet size and or fragments are at play ?
>>>
>>> So I tried testing with git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git with 
>>> e8c32c32b48c2e889704d8ca0872f92eb027838e as last commit, to be sure to have the latest netdev has to offer,
>>> but to no avail. 
>>>
>>> After that I tried to git bisect and ended up with:
>>>
>>> faec18dbb0405c7d4dda025054511dc3a6696918 is the first bad commit
>>> commit faec18dbb0405c7d4dda025054511dc3a6696918
>>> Author: Florian Westphal <fw@strlen.de>
>>> Date:   Thu Dec 13 16:01:33 2018 +0100
>>>
>>>     netfilter: nat: remove l4proto->manip_pkt
>>
>> Thanks, this is immensely helpful.
>>
>> I think I see the bug, we can't use target->dst.protonum in
>> nf_nat_l4proto_manip_pkt(), it will be TCP in case we're dealing
>> with a related icmp packet.
>>
>> I will send a patch in a few hours when I get back.
> 
> Sander, does this patch fix things for you?

Hi Florian,

You may stick on a reported/tested-by if you like.
Thanks for the swift fix !

--
Sander

> 
> Thanks!
> 
> diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
> --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
> +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
> @@ -215,6 +215,7 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb,
>  
>  	/* Change outer to look like the reply to an incoming packet */
>  	nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
> +	target.dst.protonum = IPPROTO_ICMP;
>  	if (!nf_nat_ipv4_manip_pkt(skb, 0, &target, manip))
>  		return 0;
>  
> diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
> --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
> +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
> @@ -226,6 +226,7 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb,
>  	}
>  
>  	nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
> +	target.dst.protonum = IPPROTO_ICMPV6;
>  	if (!nf_nat_ipv6_manip_pkt(skb, 0, &target, manip))
>  		return 0;
>  
> 


^ permalink raw reply

* [PATCH net-next 0/5] mvpp2 phylink fixes
From: Russell King - ARM Linux admin @ 2019-02-08 15:34 UTC (permalink / raw)
  To: Antoine Tenart, Maxime Chevallier
  Cc: Baruch Siach, David S. Miller, netdev, Sven Auhagen

Hi,

Having spent a while debugging issues with Sven Auhagen, it appears
that the mvpp2 network driver's phylink support isn't quite correct.

This series fixes that up, but, despite being tested locally, by
Sven, and by Antoine, I would prefer it to be applied to net-next
so that there is time for more people to test before it hits -rc or
stable backports.

The symptoms were that although PHYs would come up, the GMAC never
reported that the link was up, or in some cases it did report link
up but packets would not flow.  Various approaches were tried to
work around that, such as switching to in-band negotiation from
PHY mode, but ultimately the problem was in the way mvpp2 was being
programmed.

This series addresses that by, essentially, making mvpp2 follow the
same implementation pattern as mvneta: we configure the GMAC in three
stages:

1) the PHY interface mode
2) the negotiation advert
3) the negotiation style

Another issue is that mvpp2 was always taking the link down each time
its mac_config method was called: this is disruptive when the link is
already up, and we're just updating settings such as flow control.
There are some circumstances where we make the call despite there
being no changes (eg, when phylink is polling a GPIO or using a custom
link state function.)

This series depends on two previous patches already sent for net-next:
  net: marvell: mvpp2: fix lack of link interrupts
  net: marvell: mvpp2: use phy_interface_mode_is_8023z() helper

There is one last patch which deals with link status interrupts, which
I'll send separately because I think there's other considerations, but
that should not hold up this series of patches.

 drivers/net/ethernet/marvell/mvpp2/mvpp2.h      |   4 +-
 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 175 ++++++++++++++----------
 2 files changed, 108 insertions(+), 71 deletions(-)

-- 
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line in suburbia: sync at 12.1Mbps down 622kbps up
According to speedtest.net: 11.9Mbps down 500kbps up

^ permalink raw reply

* [PATCH 1/5] net: marvell: mvpp2: phylink compliance updates
From: Russell King @ 2019-02-08 15:35 UTC (permalink / raw)
  To: Antoine Tenart, Maxime Chevallier
  Cc: Baruch Siach, Sven Auhagen, David S. Miller, netdev
In-Reply-To: <20190208153432.igh26ubphiljsswa@shell.armlinux.org.uk>

Sven Auhagen reported issues with negotiation on a couple of his
platforms using a mixture of SFP and PHYs in various different
modes.  Debugging to root cause proved difficult, but essentially
the problem comes down to the mvpp2 phylink implementation being
slightly at odds with what is expected.

phylink operates in three modes: phy, fixed-link, and in-band mode.

In the first two modes, the expected behaviour from a MAC driver is
that phylink resolves the operating mode and passes the mode to the
MAC driver for it to program, including when the link should be
brought up or taken down.  This is basically the same as the libphy
approach.  This does not negate the requirement to advertise a correct
control word for interface modes that have control words where that
can be reasonably controlled.

The second mode is in-band mode, where the MAC is expected to use the
in-band control word to determine the operating mode.

The mvneta driver implements the correct pattern required to support
this: configure the port interface type separately from the in-band
mode(s).  This is now specified in the phylink documentation patches.

mvpp2 was programming in-band mode for SGMII and the 802.3z modes no
what, and avoided forcing the link up in fixed/phy modes.  This caused
a problem with some boards where the PHY is by default programmed to
enter AN bypass mode, the PHY would report that the link was up, but
the mvpp2 never completed the exchange of control word.

Another issue that mvpp2 has is it sets SGMII AN format control word
for both SGMII and 802.3z modes. The format of the control word is
defined by MVPP2_GMAC_INBAND_AN_MASK, which should be set for SGMII
and clear for 802.3z. Available Marvell documentation for earlier
GMAC implementations does not make this clear, but this has been
ascertained via extensive testing on earlier GMAC implementations,
and then confirmed with a Macchiatobin Single Shot connected to a
Clearfog: when MVPP2_GMAC_INBAND_AN_MASK is set, the clearfog does
not receive the advertised pause mode settings.

Lastly, there is no flow control in the in-band control word in Cisco
SGMII, setting the flow control autonegotiation bit even with a PHY
that has the Marvell extension to send this information does not result
in the flow control being enabled at the MAC.  We need to do this
manually using the information provided via phylink.

Re-code mvpp2's mac_config() and mac_link_up() to follow this pattern.
This allows Sven Auhagen's board and Macchiatobin to reliably bring
the link up with the 88e1512 PHY with phylink operating in PHY mode
with COMPHY built as a module but the rest of the networking built-in,
and u-boot having brought up the interface.  in-band mode requires an
additional patch to resolve another problem.

Tested-by: Sven Auhagen <sven.auhagen@voleatech.de>
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
---
 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 97 ++++++++++++++++---------
 1 file changed, 61 insertions(+), 36 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index d8974c446f8e..c590dc7ba70a 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -4572,58 +4572,84 @@ static void mvpp2_gmac_config(struct mvpp2_port *port, unsigned int mode,
 	an &= ~(MVPP2_GMAC_CONFIG_MII_SPEED | MVPP2_GMAC_CONFIG_GMII_SPEED |
 		MVPP2_GMAC_AN_SPEED_EN | MVPP2_GMAC_FC_ADV_EN |
 		MVPP2_GMAC_FC_ADV_ASM_EN | MVPP2_GMAC_FLOW_CTRL_AUTONEG |
-		MVPP2_GMAC_CONFIG_FULL_DUPLEX | MVPP2_GMAC_AN_DUPLEX_EN |
-		MVPP2_GMAC_FORCE_LINK_DOWN);
+		MVPP2_GMAC_CONFIG_FULL_DUPLEX | MVPP2_GMAC_AN_DUPLEX_EN);
 	ctrl0 &= ~MVPP2_GMAC_PORT_TYPE_MASK;
-	ctrl2 &= ~(MVPP2_GMAC_PORT_RESET_MASK | MVPP2_GMAC_PCS_ENABLE_MASK);
+	ctrl2 &= ~(MVPP2_GMAC_INBAND_AN_MASK | MVPP2_GMAC_PORT_RESET_MASK |
+		   MVPP2_GMAC_PCS_ENABLE_MASK);
+	ctrl4 &= ~(MVPP22_CTRL4_RX_FC_EN | MVPP22_CTRL4_TX_FC_EN);
 
+	/* Configure port type */
 	if (phy_interface_mode_is_8023z(state->interface)) {
-		/* 1000BaseX and 2500BaseX ports cannot negotiate speed nor can
-		 * they negotiate duplex: they are always operating with a fixed
-		 * speed of 1000/2500Mbps in full duplex, so force 1000/2500
-		 * speed and full duplex here.
-		 */
-		ctrl0 |= MVPP2_GMAC_PORT_TYPE_MASK;
-		an |= MVPP2_GMAC_CONFIG_GMII_SPEED |
-		      MVPP2_GMAC_CONFIG_FULL_DUPLEX;
-	} else if (!phy_interface_mode_is_rgmii(state->interface)) {
-		an |= MVPP2_GMAC_AN_SPEED_EN | MVPP2_GMAC_FLOW_CTRL_AUTONEG;
+		ctrl2 |= MVPP2_GMAC_PCS_ENABLE_MASK;
+		ctrl4 &= ~MVPP22_CTRL4_EXT_PIN_GMII_SEL;
+		ctrl4 |= MVPP22_CTRL4_SYNC_BYPASS_DIS |
+			 MVPP22_CTRL4_DP_CLK_SEL |
+			 MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE;
+	} else if (state->interface == PHY_INTERFACE_MODE_SGMII) {
+		ctrl2 |= MVPP2_GMAC_PCS_ENABLE_MASK | MVPP2_GMAC_INBAND_AN_MASK;
+		ctrl4 &= ~MVPP22_CTRL4_EXT_PIN_GMII_SEL;
+		ctrl4 |= MVPP22_CTRL4_SYNC_BYPASS_DIS |
+			 MVPP22_CTRL4_DP_CLK_SEL |
+			 MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE;
+	} else if (phy_interface_mode_is_rgmii(state->interface)) {
+		ctrl4 &= ~MVPP22_CTRL4_DP_CLK_SEL;
+		ctrl4 |= MVPP22_CTRL4_EXT_PIN_GMII_SEL |
+			 MVPP22_CTRL4_SYNC_BYPASS_DIS |
+			 MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE;
 	}
 
-	if (state->duplex)
-		an |= MVPP2_GMAC_CONFIG_FULL_DUPLEX;
+	/* Configure advertisement bits */
 	if (phylink_test(state->advertising, Pause))
 		an |= MVPP2_GMAC_FC_ADV_EN;
 	if (phylink_test(state->advertising, Asym_Pause))
 		an |= MVPP2_GMAC_FC_ADV_ASM_EN;
 
-	if (phy_interface_mode_is_8023z(state->interface) ||
-	    state->interface == PHY_INTERFACE_MODE_SGMII) {
-		an |= MVPP2_GMAC_IN_BAND_AUTONEG;
-		ctrl2 |= MVPP2_GMAC_INBAND_AN_MASK | MVPP2_GMAC_PCS_ENABLE_MASK;
+	/* Configure negotiation style */
+	if (!phylink_autoneg_inband(mode)) {
+		/* Phy or fixed speed - no in-band AN */
+		if (state->duplex)
+			an |= MVPP2_GMAC_CONFIG_FULL_DUPLEX;
 
-		ctrl4 &= ~(MVPP22_CTRL4_EXT_PIN_GMII_SEL |
-			   MVPP22_CTRL4_RX_FC_EN | MVPP22_CTRL4_TX_FC_EN);
-		ctrl4 |= MVPP22_CTRL4_SYNC_BYPASS_DIS |
-			 MVPP22_CTRL4_DP_CLK_SEL |
-			 MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE;
+		if (state->speed == SPEED_1000 || state->speed == SPEED_2500)
+			an |= MVPP2_GMAC_CONFIG_GMII_SPEED;
+		else if (state->speed == SPEED_100)
+			an |= MVPP2_GMAC_CONFIG_MII_SPEED;
 
 		if (state->pause & MLO_PAUSE_TX)
 			ctrl4 |= MVPP22_CTRL4_TX_FC_EN;
 		if (state->pause & MLO_PAUSE_RX)
 			ctrl4 |= MVPP22_CTRL4_RX_FC_EN;
-	} else if (phy_interface_mode_is_rgmii(state->interface)) {
-		an |= MVPP2_GMAC_IN_BAND_AUTONEG_BYPASS;
+	} else if (state->interface == PHY_INTERFACE_MODE_SGMII) {
+		/* SGMII in-band mode receives the speed and duplex from
+		 * the PHY. Flow control information is not received. */
+		an &= ~(MVPP2_GMAC_FORCE_LINK_DOWN | MVPP2_GMAC_FORCE_LINK_PASS);
+		an |= MVPP2_GMAC_IN_BAND_AUTONEG |
+		      MVPP2_GMAC_AN_SPEED_EN |
+		      MVPP2_GMAC_AN_DUPLEX_EN;
 
-		if (state->speed == SPEED_1000)
-			an |= MVPP2_GMAC_CONFIG_GMII_SPEED;
-		else if (state->speed == SPEED_100)
-			an |= MVPP2_GMAC_CONFIG_MII_SPEED;
+		if (state->pause & MLO_PAUSE_TX)
+			ctrl4 |= MVPP22_CTRL4_TX_FC_EN;
+		if (state->pause & MLO_PAUSE_RX)
+			ctrl4 |= MVPP22_CTRL4_RX_FC_EN;
+	} else if (phy_interface_mode_is_8023z(state->interface)) {
+		/* 1000BaseX and 2500BaseX ports cannot negotiate speed nor can
+		 * they negotiate duplex: they are always operating with a fixed
+		 * speed of 1000/2500Mbps in full duplex, so force 1000/2500
+		 * speed and full duplex here.
+		 */
+		ctrl0 |= MVPP2_GMAC_PORT_TYPE_MASK;
+		an &= ~(MVPP2_GMAC_FORCE_LINK_DOWN | MVPP2_GMAC_FORCE_LINK_PASS);
+		an |= MVPP2_GMAC_CONFIG_GMII_SPEED |
+		      MVPP2_GMAC_CONFIG_FULL_DUPLEX;
 
-		ctrl4 &= ~MVPP22_CTRL4_DP_CLK_SEL;
-		ctrl4 |= MVPP22_CTRL4_EXT_PIN_GMII_SEL |
-			 MVPP22_CTRL4_SYNC_BYPASS_DIS |
-			 MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE;
+		if (state->pause & MLO_PAUSE_AN && state->an_enabled) {
+			an |= MVPP2_GMAC_FLOW_CTRL_AUTONEG;
+		} else {
+			if (state->pause & MLO_PAUSE_TX)
+				ctrl4 |= MVPP22_CTRL4_TX_FC_EN;
+			if (state->pause & MLO_PAUSE_RX)
+				ctrl4 |= MVPP22_CTRL4_RX_FC_EN;
+		}
 	}
 
 	writel(ctrl0, port->base + MVPP2_GMAC_CTRL_0_REG);
@@ -4685,8 +4711,7 @@ static void mvpp2_mac_link_up(struct net_device *dev, unsigned int mode,
 	    interface != PHY_INTERFACE_MODE_10GKR) {
 		val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
 		val &= ~MVPP2_GMAC_FORCE_LINK_DOWN;
-		if (phy_interface_mode_is_rgmii(interface))
-			val |= MVPP2_GMAC_FORCE_LINK_PASS;
+		val |= MVPP2_GMAC_FORCE_LINK_PASS;
 		writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
 	}
 
-- 
2.7.4


^ permalink raw reply related

* [PATCH 2/5] net: marvell: mvpp2: fix stuck in-band SGMII negotiation
From: Russell King @ 2019-02-08 15:35 UTC (permalink / raw)
  To: Antoine Tenart, Maxime Chevallier
  Cc: Baruch Siach, Sven Auhagen, David S. Miller, netdev
In-Reply-To: <20190208153432.igh26ubphiljsswa@shell.armlinux.org.uk>

It appears that the mvpp22 can get stuck with SGMII negotiation.  The
symptoms are that in-band negotiation never completes and the partner
(eg, PHY) never reports SGMII link up, or if it supports negotiation
bypass, goes into negotiation bypass mode (which will happen when the
PHY sees that the MAC is alive but gets no response.)

Triggering the PHY end of the link to re-negotiate results in the
bypass bit clearing on the PHY, and then re-setting - indicating that
the problem is at the mvpp22 GMAC end.

Asserting the GMAC reset and de-asserting it resolves the issue.
Arrange to assert the GMAC reset at probe time, and deassert it only
after we have configured the GMAC for the appropriate mode.  This
resolves the issue.

Tested-by: Sven Auhagen <sven.auhagen@voleatech.de>
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
---
 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index c590dc7ba70a..1cb5023b2575 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -1392,13 +1392,9 @@ static void mvpp2_port_reset(struct mvpp2_port *port)
 	for (i = 0; i < ARRAY_SIZE(mvpp2_ethtool_regs); i++)
 		mvpp2_read_count(port, &mvpp2_ethtool_regs[i]);
 
-	val = readl(port->base + MVPP2_GMAC_CTRL_2_REG) &
-		    ~MVPP2_GMAC_PORT_RESET_MASK;
+	val = readl(port->base + MVPP2_GMAC_CTRL_2_REG) |
+	      MVPP2_GMAC_PORT_RESET_MASK;
 	writel(val, port->base + MVPP2_GMAC_CTRL_2_REG);
-
-	while (readl(port->base + MVPP2_GMAC_CTRL_2_REG) &
-	       MVPP2_GMAC_PORT_RESET_MASK)
-		continue;
 }
 
 /* Change maximum receive size of the port */
@@ -4554,12 +4550,15 @@ static void mvpp2_gmac_config(struct mvpp2_port *port, unsigned int mode,
 			      const struct phylink_link_state *state)
 {
 	u32 an, ctrl0, ctrl2, ctrl4;
+	u32 old_ctrl2;
 
 	an = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
 	ctrl0 = readl(port->base + MVPP2_GMAC_CTRL_0_REG);
 	ctrl2 = readl(port->base + MVPP2_GMAC_CTRL_2_REG);
 	ctrl4 = readl(port->base + MVPP22_GMAC_CTRL_4_REG);
 
+	old_ctrl2 = ctrl2;
+
 	/* Force link down */
 	an &= ~MVPP2_GMAC_FORCE_LINK_PASS;
 	an |= MVPP2_GMAC_FORCE_LINK_DOWN;
@@ -4656,6 +4655,12 @@ static void mvpp2_gmac_config(struct mvpp2_port *port, unsigned int mode,
 	writel(ctrl2, port->base + MVPP2_GMAC_CTRL_2_REG);
 	writel(ctrl4, port->base + MVPP22_GMAC_CTRL_4_REG);
 	writel(an, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+
+	if (old_ctrl2 & MVPP2_GMAC_PORT_RESET_MASK) {
+		while (readl(port->base + MVPP2_GMAC_CTRL_2_REG) &
+		       MVPP2_GMAC_PORT_RESET_MASK)
+			continue;
+	}
 }
 
 static void mvpp2_mac_config(struct net_device *dev, unsigned int mode,
-- 
2.7.4


^ permalink raw reply related

* [PATCH 3/5] net: marvell: mvpp2: only reprogram what is necessary on mac_config
From: Russell King @ 2019-02-08 15:35 UTC (permalink / raw)
  To: Antoine Tenart, Maxime Chevallier
  Cc: Baruch Siach, Sven Auhagen, David S. Miller, netdev
In-Reply-To: <20190208153432.igh26ubphiljsswa@shell.armlinux.org.uk>

mac_config() can be called at any point, and the expected behaviour
from MAC drivers is to only reprogram when necessary - and certainly
avoid taking the link down on every call.

Unfortunately, mvpp2 does exactly that - it takes the link down, and
reprograms everything, and then releases the forced-link down.

This is bad, it can cause the link to bounce:

- SFP detects signal, disables LOS indication.
- SFP code calls into phylink, calling phylink_sfp_link_up() which
  triggers a resolve.
- phylink_resolve() calls phylink_get_mac_state() and finds the MAC
  reporting link up.
- phylink wants to configure the pause mode on the MAC, so calls
  phylink_mac_config()
- mvpp2 takes the link down temporarily, generating a MAC link down
  event followed by another MAC link event.
- phylink calls mac_link_up() and then processes the MAC link down
  event.
- phylink_resolve() gets called again, registers the link down, and
  calls mach_link_down() before re-running itself.
- phylink_resolve() starts again at step 3 above.  This sequence
  repeats.

GMAC versions prior to mvpp2 do not require the link to be taken down
except when certain link properties (eg, switching between SGMII and
1000base-X mode, or enabling/disabling in-band negotiation) are
changed.  Implement this for mvpp2.

Tested-by: Sven Auhagen <sven.auhagen@voleatech.de>
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
---
 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 58 +++++++++++++++----------
 1 file changed, 35 insertions(+), 23 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index 1cb5023b2575..624514bc1681 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -4549,29 +4549,21 @@ static void mvpp2_xlg_config(struct mvpp2_port *port, unsigned int mode,
 static void mvpp2_gmac_config(struct mvpp2_port *port, unsigned int mode,
 			      const struct phylink_link_state *state)
 {
-	u32 an, ctrl0, ctrl2, ctrl4;
-	u32 old_ctrl2;
+	u32 old_an, an;
+	u32 old_ctrl0, ctrl0;
+	u32 old_ctrl2, ctrl2;
+	u32 old_ctrl4, ctrl4;
 
-	an = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
-	ctrl0 = readl(port->base + MVPP2_GMAC_CTRL_0_REG);
-	ctrl2 = readl(port->base + MVPP2_GMAC_CTRL_2_REG);
-	ctrl4 = readl(port->base + MVPP22_GMAC_CTRL_4_REG);
-
-	old_ctrl2 = ctrl2;
-
-	/* Force link down */
-	an &= ~MVPP2_GMAC_FORCE_LINK_PASS;
-	an |= MVPP2_GMAC_FORCE_LINK_DOWN;
-	writel(an, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
-
-	/* Set the GMAC in a reset state */
-	ctrl2 |= MVPP2_GMAC_PORT_RESET_MASK;
-	writel(ctrl2, port->base + MVPP2_GMAC_CTRL_2_REG);
+	old_an = an = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+	old_ctrl0 = ctrl0 = readl(port->base + MVPP2_GMAC_CTRL_0_REG);
+	old_ctrl2 = ctrl2 = readl(port->base + MVPP2_GMAC_CTRL_2_REG);
+	old_ctrl4 = ctrl4 = readl(port->base + MVPP22_GMAC_CTRL_4_REG);
 
 	an &= ~(MVPP2_GMAC_CONFIG_MII_SPEED | MVPP2_GMAC_CONFIG_GMII_SPEED |
 		MVPP2_GMAC_AN_SPEED_EN | MVPP2_GMAC_FC_ADV_EN |
 		MVPP2_GMAC_FC_ADV_ASM_EN | MVPP2_GMAC_FLOW_CTRL_AUTONEG |
-		MVPP2_GMAC_CONFIG_FULL_DUPLEX | MVPP2_GMAC_AN_DUPLEX_EN);
+		MVPP2_GMAC_CONFIG_FULL_DUPLEX | MVPP2_GMAC_AN_DUPLEX_EN |
+		MVPP2_GMAC_IN_BAND_AUTONEG | MVPP2_GMAC_IN_BAND_AUTONEG_BYPASS);
 	ctrl0 &= ~MVPP2_GMAC_PORT_TYPE_MASK;
 	ctrl2 &= ~(MVPP2_GMAC_INBAND_AN_MASK | MVPP2_GMAC_PORT_RESET_MASK |
 		   MVPP2_GMAC_PCS_ENABLE_MASK);
@@ -4638,7 +4630,8 @@ static void mvpp2_gmac_config(struct mvpp2_port *port, unsigned int mode,
 		 */
 		ctrl0 |= MVPP2_GMAC_PORT_TYPE_MASK;
 		an &= ~(MVPP2_GMAC_FORCE_LINK_DOWN | MVPP2_GMAC_FORCE_LINK_PASS);
-		an |= MVPP2_GMAC_CONFIG_GMII_SPEED |
+		an |= MVPP2_GMAC_IN_BAND_AUTONEG |
+		      MVPP2_GMAC_CONFIG_GMII_SPEED |
 		      MVPP2_GMAC_CONFIG_FULL_DUPLEX;
 
 		if (state->pause & MLO_PAUSE_AN && state->an_enabled) {
@@ -4651,10 +4644,29 @@ static void mvpp2_gmac_config(struct mvpp2_port *port, unsigned int mode,
 		}
 	}
 
-	writel(ctrl0, port->base + MVPP2_GMAC_CTRL_0_REG);
-	writel(ctrl2, port->base + MVPP2_GMAC_CTRL_2_REG);
-	writel(ctrl4, port->base + MVPP22_GMAC_CTRL_4_REG);
-	writel(an, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+	if ((old_ctrl0 ^ ctrl0) & MVPP2_GMAC_PORT_TYPE_MASK ||
+	    (old_ctrl2 ^ ctrl2) & MVPP2_GMAC_INBAND_AN_MASK ||
+	    (old_an ^ an) & MVPP2_GMAC_IN_BAND_AUTONEG) {
+		/* Force link down */
+		old_an &= ~MVPP2_GMAC_FORCE_LINK_PASS;
+		old_an |= MVPP2_GMAC_FORCE_LINK_DOWN;
+		writel(old_an, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+
+		/* Set the GMAC in a reset state - do this in a way that
+		 * ensures we clear it below.
+		 */
+		old_ctrl2 |= MVPP2_GMAC_PORT_RESET_MASK;
+		writel(old_ctrl2, port->base + MVPP2_GMAC_CTRL_2_REG);
+	}
+
+	if (old_ctrl0 != ctrl0)
+		writel(ctrl0, port->base + MVPP2_GMAC_CTRL_0_REG);
+	if (old_ctrl2 != ctrl2)
+		writel(ctrl2, port->base + MVPP2_GMAC_CTRL_2_REG);
+	if (old_ctrl4 != ctrl4)
+		writel(ctrl4, port->base + MVPP22_GMAC_CTRL_4_REG);
+	if (old_an != an)
+		writel(an, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
 
 	if (old_ctrl2 & MVPP2_GMAC_PORT_RESET_MASK) {
 		while (readl(port->base + MVPP2_GMAC_CTRL_2_REG) &
-- 
2.7.4


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox