Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH bpf-next v5 07/11] libbpf: add flags to umem config
From: Kevin Laatz @ 2019-08-22  1:44 UTC (permalink / raw)
  To: netdev, ast, daniel, bjorn.topel, magnus.karlsson, jakub.kicinski,
	jonathan.lemon, saeedm, maximmi, stephen
  Cc: bruce.richardson, ciara.loftus, bpf, intel-wired-lan, Kevin Laatz
In-Reply-To: <20190822014427.49800-1-kevin.laatz@intel.com>

This patch adds a 'flags' field to the umem_config and umem_reg structs.
This will allow for more options to be added for configuring umems.

The first use for the flags field is to add a flag for unaligned chunks
mode. These flags can either be user-provided or filled with a default.

Since we change the size of the xsk_umem_config struct, we need to version
the ABI. This patch includes the ABI versioning for xsk_umem__create. The
Makefile was also updated to handle multiple function versions in
check-abi.

Signed-off-by: Kevin Laatz <kevin.laatz@intel.com>
Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>

---
v2:
  - Removed the headroom check from this patch. It has moved to the
    previous patch.

v4:
  - Modified chunk flag define.

v5:
  - Added ABI versioning for xsk_umem__create().
  - Updated Makefile to handle multiple function versions.
  - Removed bitfields from xdp_umem_reg
  - Fix conflicts after 'bpf-af-xdp-wakeup' was merged
---
 tools/include/uapi/linux/if_xdp.h |  9 +++++++++
 tools/lib/bpf/Makefile            |  5 ++++-
 tools/lib/bpf/libbpf.map          |  1 +
 tools/lib/bpf/xsk.c               | 33 ++++++++++++++++++++++++++++---
 tools/lib/bpf/xsk.h               | 27 +++++++++++++++++++++++++
 5 files changed, 71 insertions(+), 4 deletions(-)

diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h
index 62b80d57b72a..be328c59389d 100644
--- a/tools/include/uapi/linux/if_xdp.h
+++ b/tools/include/uapi/linux/if_xdp.h
@@ -26,6 +26,9 @@
  */
 #define XDP_USE_NEED_WAKEUP (1 << 3)
 
+/* Flags for xsk_umem_config flags */
+#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
+
 struct sockaddr_xdp {
 	__u16 sxdp_family;
 	__u16 sxdp_flags;
@@ -66,6 +69,7 @@ struct xdp_umem_reg {
 	__u64 len; /* Length of packet data area */
 	__u32 chunk_size;
 	__u32 headroom;
+	__u32 flags;
 };
 
 struct xdp_statistics {
@@ -87,6 +91,11 @@ struct xdp_options {
 #define XDP_UMEM_PGOFF_FILL_RING	0x100000000ULL
 #define XDP_UMEM_PGOFF_COMPLETION_RING	0x180000000ULL
 
+/* Masks for unaligned chunks mode */
+#define XSK_UNALIGNED_BUF_OFFSET_SHIFT 48
+#define XSK_UNALIGNED_BUF_ADDR_MASK \
+	((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1)
+
 /* Rx/Tx descriptor */
 struct xdp_desc {
 	__u64 addr;
diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
index 613acb93b144..c6f94cffe06e 100644
--- a/tools/lib/bpf/Makefile
+++ b/tools/lib/bpf/Makefile
@@ -134,7 +134,9 @@ LIB_FILE	:= $(addprefix $(OUTPUT),$(LIB_FILE))
 PC_FILE		:= $(addprefix $(OUTPUT),$(PC_FILE))
 
 GLOBAL_SYM_COUNT = $(shell readelf -s --wide $(BPF_IN) | \
-			   awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {s++} END{print s}')
+			   cut -d "@" -f1 | sed 's/_v[0-9]_[0-9]_[0-9].*//' | \
+			   awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$8}' | \
+			   sort -u | wc -l)
 VERSIONED_SYM_COUNT = $(shell readelf -s --wide $(OUTPUT)libbpf.so | \
 			      grep -Eo '[^ ]+@LIBBPF_' | cut -d@ -f1 | sort -u | wc -l)
 
@@ -201,6 +203,7 @@ check_abi: $(OUTPUT)libbpf.so
 		     "Please make sure all LIBBPF_API symbols are"	 \
 		     "versioned in $(VERSION_SCRIPT)." >&2;		 \
 		readelf -s --wide $(OUTPUT)libbpf-in.o |		 \
+		    cut -d "@" -f1 | sed 's/_v[0-9]_[0-9]_[0-9].*//' |	 \
 		    awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$8}'|   \
 		    sort -u > $(OUTPUT)libbpf_global_syms.tmp;		 \
 		readelf -s --wide $(OUTPUT)libbpf.so |			 \
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 664ce8e7a60e..d04c7cb623ed 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -183,6 +183,7 @@ LIBBPF_0.0.4 {
 		perf_buffer__new;
 		perf_buffer__new_raw;
 		perf_buffer__poll;
+		xsk_umem__create;
 } LIBBPF_0.0.3;
 
 LIBBPF_0.0.5 {
diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
index 12ad78510147..842c4fd55859 100644
--- a/tools/lib/bpf/xsk.c
+++ b/tools/lib/bpf/xsk.c
@@ -99,6 +99,7 @@ static void xsk_set_umem_config(struct xsk_umem_config *cfg,
 		cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
 		cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
 		cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
+		cfg->flags = XSK_UMEM__DEFAULT_FLAGS;
 		return;
 	}
 
@@ -106,6 +107,7 @@ static void xsk_set_umem_config(struct xsk_umem_config *cfg,
 	cfg->comp_size = usr_cfg->comp_size;
 	cfg->frame_size = usr_cfg->frame_size;
 	cfg->frame_headroom = usr_cfg->frame_headroom;
+	cfg->flags = usr_cfg->flags;
 }
 
 static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
@@ -132,9 +134,10 @@ static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
 	return 0;
 }
 
-int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
-		     struct xsk_ring_prod *fill, struct xsk_ring_cons *comp,
-		     const struct xsk_umem_config *usr_config)
+int xsk_umem__create_v0_0_4(struct xsk_umem **umem_ptr, void *umem_area,
+			    __u64 size, struct xsk_ring_prod *fill,
+			    struct xsk_ring_cons *comp,
+			    const struct xsk_umem_config *usr_config)
 {
 	struct xdp_mmap_offsets off;
 	struct xdp_umem_reg mr;
@@ -165,6 +168,7 @@ int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
 	mr.len = size;
 	mr.chunk_size = umem->config.frame_size;
 	mr.headroom = umem->config.frame_headroom;
+	mr.flags = umem->config.flags;
 
 	err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
 	if (err) {
@@ -238,6 +242,29 @@ int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
 	return err;
 }
 
+struct xsk_umem_config_v1 {
+	__u32 fill_size;
+	__u32 comp_size;
+	__u32 frame_size;
+	__u32 frame_headroom;
+};
+
+int xsk_umem__create_v0_0_2(struct xsk_umem **umem_ptr, void *umem_area,
+			    __u64 size, struct xsk_ring_prod *fill,
+			    struct xsk_ring_cons *comp,
+			    const struct xsk_umem_config *usr_config)
+{
+	struct xsk_umem_config config;
+
+	memcpy(&config, usr_config, sizeof(struct xsk_umem_config_v1));
+	config.flags = 0;
+
+	return xsk_umem__create_v0_0_4(umem_ptr, umem_area, size, fill, comp,
+					&config);
+}
+asm(".symver xsk_umem__create_v0_0_2, xsk_umem__create@LIBBPF_0.0.2");
+asm(".symver xsk_umem__create_v0_0_4, xsk_umem__create@@LIBBPF_0.0.4");
+
 static int xsk_load_xdp_prog(struct xsk_socket *xsk)
 {
 	static const int log_buf_size = 16 * 1024;
diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h
index aa1d6122b7db..584f6820a639 100644
--- a/tools/lib/bpf/xsk.h
+++ b/tools/lib/bpf/xsk.h
@@ -168,6 +168,21 @@ static inline void *xsk_umem__get_data(void *umem_area, __u64 addr)
 	return &((char *)umem_area)[addr];
 }
 
+static inline __u64 xsk_umem__extract_addr(__u64 addr)
+{
+	return addr & XSK_UNALIGNED_BUF_ADDR_MASK;
+}
+
+static inline __u64 xsk_umem__extract_offset(__u64 addr)
+{
+	return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT;
+}
+
+static inline __u64 xsk_umem__add_offset_to_addr(__u64 addr)
+{
+	return xsk_umem__extract_addr(addr) + xsk_umem__extract_offset(addr);
+}
+
 LIBBPF_API int xsk_umem__fd(const struct xsk_umem *umem);
 LIBBPF_API int xsk_socket__fd(const struct xsk_socket *xsk);
 
@@ -176,12 +191,14 @@ LIBBPF_API int xsk_socket__fd(const struct xsk_socket *xsk);
 #define XSK_UMEM__DEFAULT_FRAME_SHIFT    12 /* 4096 bytes */
 #define XSK_UMEM__DEFAULT_FRAME_SIZE     (1 << XSK_UMEM__DEFAULT_FRAME_SHIFT)
 #define XSK_UMEM__DEFAULT_FRAME_HEADROOM 0
+#define XSK_UMEM__DEFAULT_FLAGS 0
 
 struct xsk_umem_config {
 	__u32 fill_size;
 	__u32 comp_size;
 	__u32 frame_size;
 	__u32 frame_headroom;
+	__u32 flags;
 };
 
 /* Flags for the libbpf_flags field. */
@@ -201,6 +218,16 @@ LIBBPF_API int xsk_umem__create(struct xsk_umem **umem,
 				struct xsk_ring_prod *fill,
 				struct xsk_ring_cons *comp,
 				const struct xsk_umem_config *config);
+LIBBPF_API int xsk_umem__create_v0_0_2(struct xsk_umem **umem,
+				       void *umem_area, __u64 size,
+				       struct xsk_ring_prod *fill,
+				       struct xsk_ring_cons *comp,
+				       const struct xsk_umem_config *config);
+LIBBPF_API int xsk_umem__create_v0_0_4(struct xsk_umem **umem,
+				       void *umem_area, __u64 size,
+				       struct xsk_ring_prod *fill,
+				       struct xsk_ring_cons *comp,
+				       const struct xsk_umem_config *config);
 LIBBPF_API int xsk_socket__create(struct xsk_socket **xsk,
 				  const char *ifname, __u32 queue_id,
 				  struct xsk_umem *umem,
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next v5 06/11] mlx5e: modify driver for handling offsets
From: Kevin Laatz @ 2019-08-22  1:44 UTC (permalink / raw)
  To: netdev, ast, daniel, bjorn.topel, magnus.karlsson, jakub.kicinski,
	jonathan.lemon, saeedm, maximmi, stephen
  Cc: bruce.richardson, ciara.loftus, bpf, intel-wired-lan, Kevin Laatz
In-Reply-To: <20190822014427.49800-1-kevin.laatz@intel.com>

With the addition of the unaligned chunks option, we need to make sure we
handle the offsets accordingly based on the mode we are currently running
in. This patch modifies the driver to appropriately mask the address for
each case.

Note: This patch only adds support for the new offset handling. An
additional patch that enables the unaligned chunks feature (makes mlx5e
accept arbitrary frame sizes) will be added later and will apply on top
of this patch.

Signed-off-by: Kevin Laatz <kevin.laatz@intel.com>

---
v3:
  - Use new helper function to handle offset

v4:
  - fixed headroom addition to handle. Using xsk_umem_adjust_headroom()
    now.

v5:
  - Fixed typo: handle_offset -> adjust_offset
---
 drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c    | 8 ++++++--
 drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c | 3 ++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
index 1ed5c33e022f..f049e0ac308a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
@@ -122,6 +122,7 @@ bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
 		      void *va, u16 *rx_headroom, u32 *len, bool xsk)
 {
 	struct bpf_prog *prog = READ_ONCE(rq->xdp_prog);
+	struct xdp_umem *umem = rq->umem;
 	struct xdp_buff xdp;
 	u32 act;
 	int err;
@@ -138,8 +139,11 @@ bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
 	xdp.rxq = &rq->xdp_rxq;
 
 	act = bpf_prog_run_xdp(prog, &xdp);
-	if (xsk)
-		xdp.handle += xdp.data - xdp.data_hard_start;
+	if (xsk) {
+		u64 off = xdp.data - xdp.data_hard_start;
+
+		xdp.handle = xsk_umem_adjust_offset(umem, xdp.handle, off);
+	}
 	switch (act) {
 	case XDP_PASS:
 		*rx_headroom = xdp.data - xdp.data_hard_start;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
index 6a55573ec8f2..7c49a66d28c9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
@@ -24,7 +24,8 @@ int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq,
 	if (!xsk_umem_peek_addr_rq(umem, &handle))
 		return -ENOMEM;
 
-	dma_info->xsk.handle = handle + rq->buff.umem_headroom;
+	dma_info->xsk.handle = xsk_umem_adjust_offset(umem, handle,
+						      rq->buff.umem_headroom);
 	dma_info->xsk.data = xdp_umem_get_data(umem, dma_info->xsk.handle);
 
 	/* No need to add headroom to the DMA address. In striding RQ case, we
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next v5 05/11] ixgbe: modify driver for handling offsets
From: Kevin Laatz @ 2019-08-22  1:44 UTC (permalink / raw)
  To: netdev, ast, daniel, bjorn.topel, magnus.karlsson, jakub.kicinski,
	jonathan.lemon, saeedm, maximmi, stephen
  Cc: bruce.richardson, ciara.loftus, bpf, intel-wired-lan, Kevin Laatz
In-Reply-To: <20190822014427.49800-1-kevin.laatz@intel.com>

With the addition of the unaligned chunks option, we need to make sure we
handle the offsets accordingly based on the mode we are currently running
in. This patch modifies the driver to appropriately mask the address for
each case.

Signed-off-by: Kevin Laatz <kevin.laatz@intel.com>

---
v3:
  - Use new helper function to handle offset
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
index e1f743470ae9..17061c799f72 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
@@ -143,7 +143,9 @@ static int ixgbe_run_xdp_zc(struct ixgbe_adapter *adapter,
 			    struct ixgbe_ring *rx_ring,
 			    struct xdp_buff *xdp)
 {
+	struct xdp_umem *umem = rx_ring->xsk_umem;
 	int err, result = IXGBE_XDP_PASS;
+	u64 offset = umem->headroom;
 	struct bpf_prog *xdp_prog;
 	struct xdp_frame *xdpf;
 	u32 act;
@@ -151,7 +153,10 @@ static int ixgbe_run_xdp_zc(struct ixgbe_adapter *adapter,
 	rcu_read_lock();
 	xdp_prog = READ_ONCE(rx_ring->xdp_prog);
 	act = bpf_prog_run_xdp(xdp_prog, xdp);
-	xdp->handle += xdp->data - xdp->data_hard_start;
+	offset += xdp->data - xdp->data_hard_start;
+
+	xdp->handle = xsk_umem_adjust_offset(umem, xdp->handle, offset);
+
 	switch (act) {
 	case XDP_PASS:
 		break;
@@ -243,7 +248,7 @@ void ixgbe_zca_free(struct zero_copy_allocator *alloc, unsigned long handle)
 	bi->addr = xdp_umem_get_data(rx_ring->xsk_umem, handle);
 	bi->addr += hr;
 
-	bi->handle = (u64)handle + rx_ring->xsk_umem->headroom;
+	bi->handle = (u64)handle;
 }
 
 static bool ixgbe_alloc_buffer_zc(struct ixgbe_ring *rx_ring,
@@ -269,7 +274,7 @@ static bool ixgbe_alloc_buffer_zc(struct ixgbe_ring *rx_ring,
 	bi->addr = xdp_umem_get_data(umem, handle);
 	bi->addr += hr;
 
-	bi->handle = handle + umem->headroom;
+	bi->handle = handle;
 
 	xsk_umem_discard_addr(umem);
 	return true;
@@ -296,7 +301,7 @@ static bool ixgbe_alloc_buffer_slow_zc(struct ixgbe_ring *rx_ring,
 	bi->addr = xdp_umem_get_data(umem, handle);
 	bi->addr += hr;
 
-	bi->handle = handle + umem->headroom;
+	bi->handle = handle;
 
 	xsk_umem_discard_addr_rq(umem);
 	return true;
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next v5 04/11] i40e: modify driver for handling offsets
From: Kevin Laatz @ 2019-08-22  1:44 UTC (permalink / raw)
  To: netdev, ast, daniel, bjorn.topel, magnus.karlsson, jakub.kicinski,
	jonathan.lemon, saeedm, maximmi, stephen
  Cc: bruce.richardson, ciara.loftus, bpf, intel-wired-lan, Kevin Laatz
In-Reply-To: <20190822014427.49800-1-kevin.laatz@intel.com>

With the addition of the unaligned chunks option, we need to make sure we
handle the offsets accordingly based on the mode we are currently running
in. This patch modifies the driver to appropriately mask the address for
each case.

Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
Signed-off-by: Kevin Laatz <kevin.laatz@intel.com>

---
v3:
  - Use new helper function for handling the offset
---
 drivers/net/ethernet/intel/i40e/i40e_xsk.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index 2d6e82f72f48..eaca6162a6e6 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -190,7 +190,9 @@ int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem,
  **/
 static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
 {
+	struct xdp_umem *umem = rx_ring->xsk_umem;
 	int err, result = I40E_XDP_PASS;
+	u64 offset = umem->headroom;
 	struct i40e_ring *xdp_ring;
 	struct bpf_prog *xdp_prog;
 	u32 act;
@@ -201,7 +203,10 @@ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
 	 */
 	xdp_prog = READ_ONCE(rx_ring->xdp_prog);
 	act = bpf_prog_run_xdp(xdp_prog, xdp);
-	xdp->handle += xdp->data - xdp->data_hard_start;
+	offset += xdp->data - xdp->data_hard_start;
+
+	xdp->handle = xsk_umem_adjust_offset(umem, xdp->handle, offset);
+
 	switch (act) {
 	case XDP_PASS:
 		break;
@@ -262,7 +267,7 @@ static bool i40e_alloc_buffer_zc(struct i40e_ring *rx_ring,
 	bi->addr = xdp_umem_get_data(umem, handle);
 	bi->addr += hr;
 
-	bi->handle = handle + umem->headroom;
+	bi->handle = handle;
 
 	xsk_umem_discard_addr(umem);
 	return true;
@@ -299,7 +304,7 @@ static bool i40e_alloc_buffer_slow_zc(struct i40e_ring *rx_ring,
 	bi->addr = xdp_umem_get_data(umem, handle);
 	bi->addr += hr;
 
-	bi->handle = handle + umem->headroom;
+	bi->handle = handle;
 
 	xsk_umem_discard_addr_rq(umem);
 	return true;
@@ -464,7 +469,7 @@ void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle)
 	bi->addr = xdp_umem_get_data(rx_ring->xsk_umem, handle);
 	bi->addr += hr;
 
-	bi->handle = (u64)handle + rx_ring->xsk_umem->headroom;
+	bi->handle = (u64)handle;
 }
 
 /**
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next v5 03/11] xsk: add support to allow unaligned chunk placement
From: Kevin Laatz @ 2019-08-22  1:44 UTC (permalink / raw)
  To: netdev, ast, daniel, bjorn.topel, magnus.karlsson, jakub.kicinski,
	jonathan.lemon, saeedm, maximmi, stephen
  Cc: bruce.richardson, ciara.loftus, bpf, intel-wired-lan, Kevin Laatz
In-Reply-To: <20190822014427.49800-1-kevin.laatz@intel.com>

Currently, addresses are chunk size aligned. This means, we are very
restricted in terms of where we can place chunk within the umem. For
example, if we have a chunk size of 2k, then our chunks can only be placed
at 0,2k,4k,6k,8k... and so on (ie. every 2k starting from 0).

This patch introduces the ability to use unaligned chunks. With these
changes, we are no longer bound to having to place chunks at a 2k (or
whatever your chunk size is) interval. Since we are no longer dealing with
aligned chunks, they can now cross page boundaries. Checks for page
contiguity have been added in order to keep track of which pages are
followed by a physically contiguous page.

Signed-off-by: Kevin Laatz <kevin.laatz@intel.com>
Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>
Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>

---
v2:
  - Add checks for the flags coming from userspace
  - Fix how we get chunk_size in xsk_diag.c
  - Add defines for masking the new descriptor format
  - Modified the rx functions to use new descriptor format
  - Modified the tx functions to use new descriptor format

v3:
  - Add helper function to do address/offset masking/addition

v4:
  - fixed page_start calculation in __xsk_rcv_memcpy().
  - move offset handling to the xdp_umem_get_* functions
  - modified the len field in xdp_umem_reg struct. We now use 16 bits from
    this for the flags field.
  - removed next_pg_contig field from xdp_umem_page struct. Using low 12
    bits of addr to store flags instead.
  - other minor changes based on review comments

v5:
  - Added accessors for getting addr and offset
  - Added helper function to add offset to addr
  - Fixed offset handling in xsk_rcv
  - Removed bitfields from xdp_umem_reg
  - Added struct size checking for xdp_umem_reg in xsk_setsockopt to handle
    different versions of the struct.
  - fix conflicts after 'bpf-af-xdp-wakeup' was merged.
---
 include/net/xdp_sock.h      | 75 +++++++++++++++++++++++++++--
 include/uapi/linux/if_xdp.h |  9 ++++
 net/xdp/xdp_umem.c          | 19 ++++++--
 net/xdp/xsk.c               | 96 +++++++++++++++++++++++++++++--------
 net/xdp/xsk_diag.c          |  2 +-
 net/xdp/xsk_queue.h         | 68 ++++++++++++++++++++++----
 6 files changed, 232 insertions(+), 37 deletions(-)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index f023b9940d64..c9398ce7960f 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -16,6 +16,13 @@
 struct net_device;
 struct xsk_queue;
 
+/* Masks for xdp_umem_page flags.
+ * The low 12-bits of the addr will be 0 since this is the page address, so we
+ * can use them for flags.
+ */
+#define XSK_NEXT_PG_CONTIG_SHIFT 0
+#define XSK_NEXT_PG_CONTIG_MASK (1ULL << XSK_NEXT_PG_CONTIG_SHIFT)
+
 struct xdp_umem_page {
 	void *addr;
 	dma_addr_t dma;
@@ -27,8 +34,12 @@ struct xdp_umem_fq_reuse {
 	u64 handles[];
 };
 
-/* Flags for the umem flags field. */
-#define XDP_UMEM_USES_NEED_WAKEUP (1 << 0)
+/* Flags for the umem flags field.
+ *
+ * The NEED_WAKEUP flag is 1 due to the reuse of the flags field for public
+ * flags. See inlude/uapi/include/linux/if_xdp.h.
+ */
+#define XDP_UMEM_USES_NEED_WAKEUP (1 << 1)
 
 struct xdp_umem {
 	struct xsk_queue *fq;
@@ -124,14 +135,36 @@ void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
 int xsk_map_inc(struct xsk_map *map);
 void xsk_map_put(struct xsk_map *map);
 
+static inline u64 xsk_umem_extract_addr(u64 addr)
+{
+	return addr & XSK_UNALIGNED_BUF_ADDR_MASK;
+}
+
+static inline u64 xsk_umem_extract_offset(u64 addr)
+{
+	return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT;
+}
+
+static inline u64 xsk_umem_add_offset_to_addr(u64 addr)
+{
+	return xsk_umem_extract_addr(addr) + xsk_umem_extract_offset(addr);
+}
+
 static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 {
-	return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1));
+	unsigned long page_addr;
+
+	addr = xsk_umem_add_offset_to_addr(addr);
+	page_addr = (unsigned long)umem->pages[addr >> PAGE_SHIFT].addr;
+
+	return (char *)(page_addr & PAGE_MASK) + (addr & ~PAGE_MASK);
 }
 
 static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
 {
-	return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1));
+	addr = xsk_umem_add_offset_to_addr(addr);
+
+	return umem->pages[addr >> PAGE_SHIFT].dma + (addr & ~PAGE_MASK);
 }
 
 /* Reuse-queue aware version of FILL queue helpers */
@@ -172,6 +205,19 @@ static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr)
 
 	rq->handles[rq->length++] = addr;
 }
+
+/* Handle the offset appropriately depending on aligned or unaligned mode.
+ * For unaligned mode, we store the offset in the upper 16-bits of the address.
+ * For aligned mode, we simply add the offset to the address.
+ */
+static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 address,
+					 u64 offset)
+{
+	if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG)
+		return address + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
+	else
+		return address + offset;
+}
 #else
 static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
@@ -241,6 +287,21 @@ static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
 	return NULL;
 }
 
+static inline u64 xsk_umem_extract_addr(u64 addr)
+{
+	return 0;
+}
+
+static inline u64 xsk_umem_extract_offset(u64 addr)
+{
+	return 0;
+}
+
+static inline u64 xsk_umem_add_offset_to_addr(u64 addr)
+{
+	return 0;
+}
+
 static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 {
 	return NULL;
@@ -290,6 +351,12 @@ static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
 	return false;
 }
 
+static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle,
+					 u64 offset)
+{
+	return 0;
+}
+
 #endif /* CONFIG_XDP_SOCKETS */
 
 #endif /* _LINUX_XDP_SOCK_H */
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 62b80d57b72a..be328c59389d 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -26,6 +26,9 @@
  */
 #define XDP_USE_NEED_WAKEUP (1 << 3)
 
+/* Flags for xsk_umem_config flags */
+#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
+
 struct sockaddr_xdp {
 	__u16 sxdp_family;
 	__u16 sxdp_flags;
@@ -66,6 +69,7 @@ struct xdp_umem_reg {
 	__u64 len; /* Length of packet data area */
 	__u32 chunk_size;
 	__u32 headroom;
+	__u32 flags;
 };
 
 struct xdp_statistics {
@@ -87,6 +91,11 @@ struct xdp_options {
 #define XDP_UMEM_PGOFF_FILL_RING	0x100000000ULL
 #define XDP_UMEM_PGOFF_COMPLETION_RING	0x180000000ULL
 
+/* Masks for unaligned chunks mode */
+#define XSK_UNALIGNED_BUF_OFFSET_SHIFT 48
+#define XSK_UNALIGNED_BUF_ADDR_MASK \
+	((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1)
+
 /* Rx/Tx descriptor */
 struct xdp_desc {
 	__u64 addr;
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 2d65779282a1..e997b263a0dd 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -340,6 +340,7 @@ static int xdp_umem_account_pages(struct xdp_umem *umem)
 
 static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 {
+	bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
 	u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
 	unsigned int chunks, chunks_per_page;
 	u64 addr = mr->addr, size = mr->len;
@@ -355,7 +356,11 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 		return -EINVAL;
 	}
 
-	if (!is_power_of_2(chunk_size))
+	if (mr->flags & ~(XDP_UMEM_UNALIGNED_CHUNK_FLAG |
+			XDP_UMEM_USES_NEED_WAKEUP))
+		return -EINVAL;
+
+	if (!unaligned_chunks && !is_power_of_2(chunk_size))
 		return -EINVAL;
 
 	if (!PAGE_ALIGNED(addr)) {
@@ -372,9 +377,11 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 	if (chunks == 0)
 		return -EINVAL;
 
-	chunks_per_page = PAGE_SIZE / chunk_size;
-	if (chunks < chunks_per_page || chunks % chunks_per_page)
-		return -EINVAL;
+	if (!unaligned_chunks) {
+		chunks_per_page = PAGE_SIZE / chunk_size;
+		if (chunks < chunks_per_page || chunks % chunks_per_page)
+			return -EINVAL;
+	}
 
 	headroom = ALIGN(headroom, 64);
 
@@ -383,13 +390,15 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 		return -EINVAL;
 
 	umem->address = (unsigned long)addr;
-	umem->chunk_mask = ~((u64)chunk_size - 1);
+	umem->chunk_mask = unaligned_chunks ? XSK_UNALIGNED_BUF_ADDR_MASK
+					    : ~((u64)chunk_size - 1);
 	umem->size = size;
 	umem->headroom = headroom;
 	umem->chunk_size_nohr = chunk_size - headroom;
 	umem->npgs = size / PAGE_SIZE;
 	umem->pgs = NULL;
 	umem->user = NULL;
+	umem->flags = mr->flags;
 	INIT_LIST_HEAD(&umem->xsk_list);
 	spin_lock_init(&umem->xsk_list_lock);
 
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index ee4428a892fa..907e5f12338f 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -45,7 +45,7 @@ EXPORT_SYMBOL(xsk_umem_has_addrs);
 
 u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
 {
-	return xskq_peek_addr(umem->fq, addr);
+	return xskq_peek_addr(umem->fq, addr, umem);
 }
 EXPORT_SYMBOL(xsk_umem_peek_addr);
 
@@ -115,21 +115,43 @@ bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
 }
 EXPORT_SYMBOL(xsk_umem_uses_need_wakeup);
 
+/* If a buffer crosses a page boundary, we need to do 2 memcpy's, one for
+ * each page. This is only required in copy mode.
+ */
+static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf,
+			     u32 len, u32 metalen)
+{
+	void *to_buf = xdp_umem_get_data(umem, addr);
+
+	addr = xsk_umem_add_offset_to_addr(addr);
+	if (xskq_crosses_non_contig_pg(umem, addr, len + metalen)) {
+		void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr;
+		u64 page_start = addr & ~(PAGE_SIZE - 1);
+		u64 first_len = PAGE_SIZE - (addr - page_start);
+
+		memcpy(to_buf, from_buf, first_len + metalen);
+		memcpy(next_pg_addr, from_buf + first_len, len - first_len);
+
+		return;
+	}
+
+	memcpy(to_buf, from_buf, len + metalen);
+}
+
 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
 {
-	void *to_buf, *from_buf;
+	u64 offset = xs->umem->headroom;
+	u64 addr, memcpy_addr;
+	void *from_buf;
 	u32 metalen;
-	u64 addr;
 	int err;
 
-	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
+	if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
 	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
 		xs->rx_dropped++;
 		return -ENOSPC;
 	}
 
-	addr += xs->umem->headroom;
-
 	if (unlikely(xdp_data_meta_unsupported(xdp))) {
 		from_buf = xdp->data;
 		metalen = 0;
@@ -138,9 +160,11 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
 		metalen = xdp->data - xdp->data_meta;
 	}
 
-	to_buf = xdp_umem_get_data(xs->umem, addr);
-	memcpy(to_buf, from_buf, len + metalen);
-	addr += metalen;
+	memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
+	__xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen);
+
+	offset += metalen;
+	addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
 	err = xskq_produce_batch_desc(xs->rx, addr, len);
 	if (!err) {
 		xskq_discard_addr(xs->umem->fq);
@@ -185,6 +209,7 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
 	u32 metalen = xdp->data - xdp->data_meta;
 	u32 len = xdp->data_end - xdp->data;
+	u64 offset = xs->umem->headroom;
 	void *buffer;
 	u64 addr;
 	int err;
@@ -196,17 +221,17 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 		goto out_unlock;
 	}
 
-	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
+	if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
 	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
 		err = -ENOSPC;
 		goto out_drop;
 	}
 
-	addr += xs->umem->headroom;
-
-	buffer = xdp_umem_get_data(xs->umem, addr);
+	buffer = xdp_umem_get_data(xs->umem, addr + offset);
 	memcpy(buffer, xdp->data_meta, len + metalen);
-	addr += metalen;
+	offset += metalen;
+
+	addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
 	err = xskq_produce_batch_desc(xs->rx, addr, len);
 	if (err)
 		goto out_drop;
@@ -250,7 +275,7 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
-		if (!xskq_peek_desc(xs->tx, desc))
+		if (!xskq_peek_desc(xs->tx, desc, umem))
 			continue;
 
 		if (xskq_produce_addr_lazy(umem->cq, desc->addr))
@@ -304,7 +329,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
 	if (xs->queue_id >= xs->dev->real_num_tx_queues)
 		goto out;
 
-	while (xskq_peek_desc(xs->tx, &desc)) {
+	while (xskq_peek_desc(xs->tx, &desc, xs->umem)) {
 		char *buffer;
 		u64 addr;
 		u32 len;
@@ -333,7 +358,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
 		skb->dev = xs->dev;
 		skb->priority = sk->sk_priority;
 		skb->mark = sk->sk_mark;
-		skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
+		skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
 		skb->destructor = xsk_destruct_skb;
 
 		err = dev_direct_xmit(skb, xs->queue_id);
@@ -526,6 +551,24 @@ static struct socket *xsk_lookup_xsk_from_fd(int fd)
 	return sock;
 }
 
+/* Check if umem pages are contiguous.
+ * If zero-copy mode, use the DMA address to do the page contiguity check
+ * For all other modes we use addr (kernel virtual address)
+ * Store the result in the low bits of addr.
+ */
+static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags)
+{
+	struct xdp_umem_page *pgs = umem->pages;
+	int i, is_contig;
+
+	for (i = 0; i < umem->npgs - 1; i++) {
+		is_contig = (flags & XDP_ZEROCOPY) ?
+			(pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma) :
+			(pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr);
+		pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT;
+	}
+}
+
 static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 {
 	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
@@ -616,6 +659,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 		err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
 		if (err)
 			goto out_unlock;
+
+		xsk_check_page_contiguity(xs->umem, flags);
 	}
 
 	xs->dev = dev;
@@ -636,6 +681,13 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 	return err;
 }
 
+struct xdp_umem_reg_v1 {
+	__u64 addr; /* Start of packet data area */
+	__u64 len; /* Length of packet data area */
+	__u32 chunk_size;
+	__u32 headroom;
+};
+
 static int xsk_setsockopt(struct socket *sock, int level, int optname,
 			  char __user *optval, unsigned int optlen)
 {
@@ -673,10 +725,16 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 	}
 	case XDP_UMEM_REG:
 	{
-		struct xdp_umem_reg mr;
+		size_t mr_size = sizeof(struct xdp_umem_reg);
+		struct xdp_umem_reg mr = {};
 		struct xdp_umem *umem;
 
-		if (copy_from_user(&mr, optval, sizeof(mr)))
+		if (optlen < sizeof(struct xdp_umem_reg_v1))
+			return -EINVAL;
+		else if (optlen < sizeof(mr))
+			mr_size = sizeof(struct xdp_umem_reg_v1);
+
+		if (copy_from_user(&mr, optval, mr_size))
 			return -EFAULT;
 
 		mutex_lock(&xs->mutex);
diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c
index d5e06c8e0cbf..9986a759fe06 100644
--- a/net/xdp/xsk_diag.c
+++ b/net/xdp/xsk_diag.c
@@ -56,7 +56,7 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb)
 	du.id = umem->id;
 	du.size = umem->size;
 	du.num_pages = umem->npgs;
-	du.chunk_size = (__u32)(~umem->chunk_mask + 1);
+	du.chunk_size = umem->chunk_size_nohr + umem->headroom;
 	du.headroom = umem->headroom;
 	du.ifindex = umem->dev ? umem->dev->ifindex : 0;
 	du.queue_id = umem->queue_id;
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index dd9e985c2461..6c67c9d0294f 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -134,6 +134,17 @@ static inline bool xskq_has_addrs(struct xsk_queue *q, u32 cnt)
 
 /* UMEM queue */
 
+static inline bool xskq_crosses_non_contig_pg(struct xdp_umem *umem, u64 addr,
+					      u64 length)
+{
+	bool cross_pg = (addr & (PAGE_SIZE - 1)) + length > PAGE_SIZE;
+	bool next_pg_contig =
+		(unsigned long)umem->pages[(addr >> PAGE_SHIFT)].addr &
+			XSK_NEXT_PG_CONTIG_MASK;
+
+	return cross_pg && !next_pg_contig;
+}
+
 static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
 {
 	if (addr >= q->size) {
@@ -144,23 +155,49 @@ static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
 	return true;
 }
 
-static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr)
+static inline bool xskq_is_valid_addr_unaligned(struct xsk_queue *q, u64 addr,
+						u64 length,
+						struct xdp_umem *umem)
+{
+	addr = xsk_umem_add_offset_to_addr(addr);
+	if (addr >= q->size ||
+	    xskq_crosses_non_contig_pg(umem, addr, length)) {
+		q->invalid_descs++;
+		return false;
+	}
+
+	return true;
+}
+
+static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr,
+				      struct xdp_umem *umem)
 {
 	while (q->cons_tail != q->cons_head) {
 		struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
 		unsigned int idx = q->cons_tail & q->ring_mask;
 
 		*addr = READ_ONCE(ring->desc[idx]) & q->chunk_mask;
+
+		if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) {
+			if (xskq_is_valid_addr_unaligned(q, *addr,
+							 umem->chunk_size_nohr,
+							 umem))
+				return addr;
+			goto out;
+		}
+
 		if (xskq_is_valid_addr(q, *addr))
 			return addr;
 
+out:
 		q->cons_tail++;
 	}
 
 	return NULL;
 }
 
-static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr)
+static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr,
+				  struct xdp_umem *umem)
 {
 	if (q->cons_tail == q->cons_head) {
 		smp_mb(); /* D, matches A */
@@ -171,7 +208,7 @@ static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr)
 		smp_rmb();
 	}
 
-	return xskq_validate_addr(q, addr);
+	return xskq_validate_addr(q, addr, umem);
 }
 
 static inline void xskq_discard_addr(struct xsk_queue *q)
@@ -230,8 +267,21 @@ static inline int xskq_reserve_addr(struct xsk_queue *q)
 
 /* Rx/Tx queue */
 
-static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d)
+static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d,
+				      struct xdp_umem *umem)
 {
+	if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) {
+		if (!xskq_is_valid_addr_unaligned(q, d->addr, d->len, umem))
+			return false;
+
+		if (d->len > umem->chunk_size_nohr || d->options) {
+			q->invalid_descs++;
+			return false;
+		}
+
+		return true;
+	}
+
 	if (!xskq_is_valid_addr(q, d->addr))
 		return false;
 
@@ -245,14 +295,15 @@ static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d)
 }
 
 static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q,
-						  struct xdp_desc *desc)
+						  struct xdp_desc *desc,
+						  struct xdp_umem *umem)
 {
 	while (q->cons_tail != q->cons_head) {
 		struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
 		unsigned int idx = q->cons_tail & q->ring_mask;
 
 		*desc = READ_ONCE(ring->desc[idx]);
-		if (xskq_is_valid_desc(q, desc))
+		if (xskq_is_valid_desc(q, desc, umem))
 			return desc;
 
 		q->cons_tail++;
@@ -262,7 +313,8 @@ static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q,
 }
 
 static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q,
-					      struct xdp_desc *desc)
+					      struct xdp_desc *desc,
+					      struct xdp_umem *umem)
 {
 	if (q->cons_tail == q->cons_head) {
 		smp_mb(); /* D, matches A */
@@ -273,7 +325,7 @@ static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q,
 		smp_rmb(); /* C, matches B */
 	}
 
-	return xskq_validate_desc(q, desc);
+	return xskq_validate_desc(q, desc, umem);
 }
 
 static inline void xskq_discard_desc(struct xsk_queue *q)
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next v5 02/11] ixgbe: simplify Rx buffer recycle
From: Kevin Laatz @ 2019-08-22  1:44 UTC (permalink / raw)
  To: netdev, ast, daniel, bjorn.topel, magnus.karlsson, jakub.kicinski,
	jonathan.lemon, saeedm, maximmi, stephen
  Cc: bruce.richardson, ciara.loftus, bpf, intel-wired-lan, Kevin Laatz
In-Reply-To: <20190822014427.49800-1-kevin.laatz@intel.com>

Currently, the dma, addr and handle are modified when we reuse Rx buffers
in zero-copy mode. However, this is not required as the inputs to the
function are copies, not the original values themselves. As we use the
copies within the function, we can use the original 'obi' values
directly without having to mask and add the headroom.

Signed-off-by: Kevin Laatz <kevin.laatz@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
index 9a28d98a1484..e1f743470ae9 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
@@ -201,8 +201,6 @@ ixgbe_rx_buffer *ixgbe_get_rx_buffer_zc(struct ixgbe_ring *rx_ring,
 static void ixgbe_reuse_rx_buffer_zc(struct ixgbe_ring *rx_ring,
 				     struct ixgbe_rx_buffer *obi)
 {
-	unsigned long mask = (unsigned long)rx_ring->xsk_umem->chunk_mask;
-	u64 hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM;
 	u16 nta = rx_ring->next_to_alloc;
 	struct ixgbe_rx_buffer *nbi;
 
@@ -212,14 +210,9 @@ static void ixgbe_reuse_rx_buffer_zc(struct ixgbe_ring *rx_ring,
 	rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
 
 	/* transfer page from old buffer to new buffer */
-	nbi->dma = obi->dma & mask;
-	nbi->dma += hr;
-
-	nbi->addr = (void *)((unsigned long)obi->addr & mask);
-	nbi->addr += hr;
-
-	nbi->handle = obi->handle & mask;
-	nbi->handle += rx_ring->xsk_umem->headroom;
+	nbi->dma = obi->dma;
+	nbi->addr = obi->addr;
+	nbi->handle = obi->handle;
 
 	obi->addr = NULL;
 	obi->skb = NULL;
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next v5 01/11] i40e: simplify Rx buffer recycle
From: Kevin Laatz @ 2019-08-22  1:44 UTC (permalink / raw)
  To: netdev, ast, daniel, bjorn.topel, magnus.karlsson, jakub.kicinski,
	jonathan.lemon, saeedm, maximmi, stephen
  Cc: bruce.richardson, ciara.loftus, bpf, intel-wired-lan, Kevin Laatz
In-Reply-To: <20190822014427.49800-1-kevin.laatz@intel.com>

Currently, the dma, addr and handle are modified when we reuse Rx buffers
in zero-copy mode. However, this is not required as the inputs to the
function are copies, not the original values themselves. As we use the
copies within the function, we can use the original 'old_bi' values
directly without having to mask and add the headroom.

Signed-off-by: Kevin Laatz <kevin.laatz@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_xsk.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index 42c90126b6c4..2d6e82f72f48 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -420,8 +420,6 @@ static void i40e_reuse_rx_buffer_zc(struct i40e_ring *rx_ring,
 				    struct i40e_rx_buffer *old_bi)
 {
 	struct i40e_rx_buffer *new_bi = &rx_ring->rx_bi[rx_ring->next_to_alloc];
-	unsigned long mask = (unsigned long)rx_ring->xsk_umem->chunk_mask;
-	u64 hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM;
 	u16 nta = rx_ring->next_to_alloc;
 
 	/* update, and store next to alloc */
@@ -429,14 +427,9 @@ static void i40e_reuse_rx_buffer_zc(struct i40e_ring *rx_ring,
 	rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
 
 	/* transfer page from old buffer to new buffer */
-	new_bi->dma = old_bi->dma & mask;
-	new_bi->dma += hr;
-
-	new_bi->addr = (void *)((unsigned long)old_bi->addr & mask);
-	new_bi->addr += hr;
-
-	new_bi->handle = old_bi->handle & mask;
-	new_bi->handle += rx_ring->xsk_umem->headroom;
+	new_bi->dma = old_bi->dma;
+	new_bi->addr = old_bi->addr;
+	new_bi->handle = old_bi->handle;
 
 	old_bi->addr = NULL;
 }
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next v5 00/11] XDP unaligned chunk placement support
From: Kevin Laatz @ 2019-08-22  1:44 UTC (permalink / raw)
  To: netdev, ast, daniel, bjorn.topel, magnus.karlsson, jakub.kicinski,
	jonathan.lemon, saeedm, maximmi, stephen
  Cc: bruce.richardson, ciara.loftus, bpf, intel-wired-lan, Kevin Laatz
In-Reply-To: <20190730085400.10376-1-kevin.laatz@intel.com>

This patch set adds the ability to use unaligned chunks in the XDP umem.

Currently, all chunk addresses passed to the umem are masked to be chunk
size aligned (max is PAGE_SIZE). This limits where we can place chunks
within the umem as well as limiting the packet sizes that are supported.

The changes in this patch set removes these restrictions, allowing XDP to
be more flexible in where it can place a chunk within a umem. By relaxing
where the chunks can be placed, it allows us to use an arbitrary buffer
size and place that wherever we have a free address in the umem. These
changes add the ability to support arbitrary frame sizes up to 4k
(PAGE_SIZE) and make it easy to integrate with other existing frameworks
that have their own memory management systems, such as DPDK.
In DPDK, for example, there is already support for AF_XDP with zero-copy.
However, with this patch set the integration will be much more seamless.
You can find the DPDK AF_XDP driver at:
https://git.dpdk.org/dpdk/tree/drivers/net/af_xdp

Since we are now dealing with arbitrary frame sizes, we need also need to
update how we pass around addresses. Currently, the addresses can simply be
masked to 2k to get back to the original address. This becomes less trivial
when using frame sizes that are not a 'power of 2' size. This patch set
modifies the Rx/Tx descriptor format to use the upper 16-bits of the addr
field for an offset value, leaving the lower 48-bits for the address (this
leaves us with 256 Terabytes, which should be enough!). We only need to use
the upper 16-bits to store the offset when running in unaligned mode.
Rather than adding the offset (headroom etc) to the address, we will store
it in the upper 16-bits of the address field. This way, we can easily add
the offset to the address where we need it, using some bit manipulation and
addition, and we can also easily get the original address wherever we need
it (for example in i40e_zca_free) by simply masking to get the lower
48-bits of the address field.

Note: The mlx5e patch only adds support for the new offset handling. Maxim
Mikityanski <maximmi@mellanox.com> is working on an additional patch that
enables the unaligned chunks feature (makes mlx5e accept arbitrary frame
sizes) which will be added later and applies on top of this patch.

The numbers below were recorded with the following set up:
  - Intel(R) Xeon(R) Gold 6140 CPU @ 2.30GHz
  - Intel Corporation Ethernet Controller XXV710 for 25GbE SFP28 (rev 02)
  - Driver: i40e
  - Application: xdpsock with l2fwd (single interface)
  - Turbo disabled in BIOS

These are solely for comparing performance with and without the patches.
The largest drop was ~1.5% (in zero-copy mode).

+-------------------------+-------------+-----------------+-------------+
| Buffer size: 2048       | SKB mode    | Zero-copy       | Copy        |
+-------------------------+-------------+-----------------+-------------+
| Aligned (baseline)      | 1.25 Mpps   | 10.0 Mpps       | 1.66 Mpps   |
+-------------------------+-------------+-----------------+-------------+
| Aligned (with patches)  | 1.25 Mpps   | 9.85 Mpps       | 1.66 Mpps   |
+-------------------------+-------------+-----------------+-------------+
| Unaligned               | 1.25 Mpps   | 9.65 Mpps       | 1.66 Mpps   |
+-------------------------+-------------+-----------------+-------------+

This patch set has been applied against
commit 0bb52b0dfc88 ("tools: bpftool: add 'bpftool map freeze' subcommand")

Structure of the patch set:
Patch 1:
  - Remove unnecessary masking and headroom addition during zero-copy Rx
    buffer recycling in i40e. This change is required in order for the
    buffer recycling to work in the unaligned chunk mode.

Patch 2:
  - Remove unnecessary masking and headroom addition during
    zero-copy Rx buffer recycling in ixgbe. This change is required in
    order for the  buffer recycling to work in the unaligned chunk mode.

Patch 3:
  - Add infrastructure for unaligned chunks. Since we are dealing with
    unaligned chunks that could potentially cross a physical page boundary,
    we add checks to keep track of that information. We can later use this
    information to correctly handle buffers that are placed at an address
    where they cross a page boundary.  This patch also modifies the
    existing Rx and Tx functions to use the new descriptor format. To
    handle addresses correctly, we need to mask appropriately based on
    whether we are in aligned or unaligned mode.

Patch 4:
  - This patch updates the i40e driver to make use of the new descriptor
    format.

Patch 5:
  - This patch updates the ixgbe driver to make use of the new descriptor
    format.

Patch 6:
  - This patch updates the mlx5e driver to make use of the new descriptor
    format. These changes are required to handle the new descriptor format
    and for unaligned chunks support.

Patch 7:
  - Add flags for umem configuration to libbpf. Since we increase the size
    of the struct by adding flags, we also need to add the ABI versioning
    in this patch.

Patch 8:
  - Modify xdpsock application to add a command line option for
    unaligned chunks

Patch 9:
  - Since we can now run the application in unaligned chunk mode, we need
    to make sure we recycle the buffers appropriately.

Patch 10:
  - Adds hugepage support to the xdpsock application

Patch 11:
  - Documentation update to include the unaligned chunk scenario. We need
    to explicitly state that the incoming addresses are only masked in the
    aligned chunk mode and not the unaligned chunk mode.

---
v2:
  - fixed checkpatch issues
  - fixed Rx buffer recycling for unaligned chunks in xdpsock
  - removed unused defines
  - fixed how chunk_size is calculated in xsk_diag.c
  - added some performance numbers to cover letter
  - modified descriptor format to make it easier to retrieve original
    address
  - removed patch adding off_t off to the zero copy allocator. This is no
    longer needed with the new descriptor format.

v3:
  - added patch for mlx5 driver changes needed for unaligned chunks
  - moved offset handling to new helper function
  - changed value used for the umem chunk_mask. Now using the new
    descriptor format to save us doing the calculations in a number of
    places meaning more of the code is left unchanged while adding
    unaligned chunk support.

v4:
  - reworked the next_pg_contig field in the xdp_umem_page struct. We now
    use the low 12 bits of the addr for flags rather than adding an extra
    field in the struct.
  - modified unaligned chunks flag define
  - fixed page_start calculation in __xsk_rcv_memcpy().
  - move offset handling to the xdp_umem_get_* functions
  - modified the len field in xdp_umem_reg struct. We now use 16 bits from
    this for the flags field.
  - fixed headroom addition to handle in the mlx5e driver
  - other minor changes based on review comments

v5:
  - Added ABI versioning in the libbpf patch
  - Removed bitfields in the xdp_umem_reg struct. Adding new flags field.
  - Added accessors for getting addr and offset.
  - Added helper function for adding the offset to the addr.
  - Fixed conflicts with 'bpf-af-xdp-wakeup' which was merged recently.
  - Fixed typo in mlx driver patch.
  - Moved libbpf patch to later in the set (7/11, just before the sample
    app changes)

Kevin Laatz (11):
  i40e: simplify Rx buffer recycle
  ixgbe: simplify Rx buffer recycle
  xsk: add support to allow unaligned chunk placement
  i40e: modify driver for handling offsets
  ixgbe: modify driver for handling offsets
  mlx5e: modify driver for handling offsets
  libbpf: add flags to umem config
  samples/bpf: add unaligned chunks mode support to xdpsock
  samples/bpf: add buffer recycling for unaligned chunks to xdpsock
  samples/bpf: use hugepages in xdpsock app
  doc/af_xdp: include unaligned chunk case

 Documentation/networking/af_xdp.rst           | 10 +-
 drivers/net/ethernet/intel/i40e/i40e_xsk.c    | 26 +++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c  | 26 +++--
 .../net/ethernet/mellanox/mlx5/core/en/xdp.c  |  8 +-
 .../ethernet/mellanox/mlx5/core/en/xsk/rx.c   |  3 +-
 include/net/xdp_sock.h                        | 75 ++++++++++++++-
 include/uapi/linux/if_xdp.h                   |  9 ++
 net/xdp/xdp_umem.c                            | 19 +++-
 net/xdp/xsk.c                                 | 96 +++++++++++++++----
 net/xdp/xsk_diag.c                            |  2 +-
 net/xdp/xsk_queue.h                           | 68 +++++++++++--
 samples/bpf/xdpsock_user.c                    | 61 ++++++++----
 tools/include/uapi/linux/if_xdp.h             |  9 ++
 tools/lib/bpf/Makefile                        |  5 +-
 tools/lib/bpf/libbpf.map                      |  1 +
 tools/lib/bpf/xsk.c                           | 33 ++++++-
 tools/lib/bpf/xsk.h                           | 27 ++++++
 17 files changed, 384 insertions(+), 94 deletions(-)

-- 
2.17.1


^ permalink raw reply

* Re: [PATCH v2 0/2] Simplify mtty driver and mdev core
From: Jiri Pirko @ 2019-08-22  9:58 UTC (permalink / raw)
  To: Parav Pandit
  Cc: Alex Williamson, Jiri Pirko, David S . Miller, Kirti Wankhede,
	Cornelia Huck, kvm@vger.kernel.org, linux-kernel@vger.kernel.org,
	cjia, netdev@vger.kernel.org
In-Reply-To: <AM0PR05MB4866A20F831A5D42E6C79EFED1A50@AM0PR05MB4866.eurprd05.prod.outlook.com>

Thu, Aug 22, 2019 at 11:42:13AM CEST, parav@mellanox.com wrote:
>
>
>> -----Original Message-----
>> From: Jiri Pirko <jiri@resnulli.us>
>> Sent: Thursday, August 22, 2019 2:59 PM
>> To: Parav Pandit <parav@mellanox.com>
>> Cc: Alex Williamson <alex.williamson@redhat.com>; Jiri Pirko
>> <jiri@mellanox.com>; David S . Miller <davem@davemloft.net>; Kirti
>> Wankhede <kwankhede@nvidia.com>; Cornelia Huck <cohuck@redhat.com>;
>> kvm@vger.kernel.org; linux-kernel@vger.kernel.org; cjia <cjia@nvidia.com>;
>> netdev@vger.kernel.org
>> Subject: Re: [PATCH v2 0/2] Simplify mtty driver and mdev core
>> 
>> Wed, Aug 21, 2019 at 08:23:17AM CEST, parav@mellanox.com wrote:
>> >
>> >
>> >> -----Original Message-----
>> >> From: Alex Williamson <alex.williamson@redhat.com>
>> >> Sent: Wednesday, August 21, 2019 10:56 AM
>> >> To: Parav Pandit <parav@mellanox.com>
>> >> Cc: Jiri Pirko <jiri@mellanox.com>; David S . Miller
>> >> <davem@davemloft.net>; Kirti Wankhede <kwankhede@nvidia.com>;
>> >> Cornelia Huck <cohuck@redhat.com>; kvm@vger.kernel.org;
>> >> linux-kernel@vger.kernel.org; cjia <cjia@nvidia.com>;
>> >> netdev@vger.kernel.org
>> >> Subject: Re: [PATCH v2 0/2] Simplify mtty driver and mdev core
>> >>
>> >> > > > > Just an example of the alias, not proposing how it's set.  In
>> >> > > > > fact, proposing that the user does not set it, mdev-core
>> >> > > > > provides one
>> >> > > automatically.
>> >> > > > >
>> >> > > > > > > Since there seems to be some prefix overhead, as I ask
>> >> > > > > > > about above in how many characters we actually have to
>> >> > > > > > > work with in IFNAMESZ, maybe we start with 8 characters
>> >> > > > > > > (matching your "index" namespace) and expand as necessary for
>> disambiguation.
>> >> > > > > > > If we can eliminate overhead in IFNAMESZ, let's start with 12.
>> >> > > > > > > Thanks,
>> >> > > > > > >
>> >> > > > > > If user is going to choose the alias, why does it have to
>> >> > > > > > be limited to
>> >> sha1?
>> >> > > > > > Or you just told it as an example?
>> >> > > > > >
>> >> > > > > > It can be an alpha-numeric string.
>> >> > > > >
>> >> > > > > No, I'm proposing a different solution where mdev-core
>> >> > > > > creates an alias based on an abbreviated sha1.  The user does
>> >> > > > > not provide the
>> >> alias.
>> >> > > > >
>> >> > > > > > Instead of mdev imposing number of characters on the alias,
>> >> > > > > > it should be best
>> >> > > > > left to the user.
>> >> > > > > > Because in future if netdev improves on the naming scheme,
>> >> > > > > > mdev will be
>> >> > > > > limiting it, which is not right.
>> >> > > > > > So not restricting alias size seems right to me.
>> >> > > > > > User configuring mdev for networking devices in a given
>> >> > > > > > kernel knows what
>> >> > > > > user is doing.
>> >> > > > > > So user can choose alias name size as it finds suitable.
>> >> > > > >
>> >> > > > > That's not what I'm proposing, please read again.  Thanks,
>> >> > > >
>> >> > > > I understood your point. But mdev doesn't know how user is
>> >> > > > going to use
>> >> > > udev/systemd to name the netdev.
>> >> > > > So even if mdev chose to pick 12 characters, it could result in collision.
>> >> > > > Hence the proposal to provide the alias by the user, as user
>> >> > > > know the best
>> >> > > policy for its use case in the environment its using.
>> >> > > > So 12 character sha1 method will still work by user.
>> >> > >
>> >> > > Haven't you already provided examples where certain drivers or
>> >> > > subsystems have unique netdev prefixes?  If mdev provides a
>> >> > > unique alias within the subsystem, couldn't we simply define a
>> >> > > netdev prefix for the mdev subsystem and avoid all other
>> >> > > collisions?  I'm not in favor of the user providing both a uuid
>> >> > > and an alias/instance.  Thanks,
>> >> > >
>> >> > For a given prefix, say ens2f0, can two UUID->sha1 first 9
>> >> > characters have
>> >> collision?
>> >>
>> >> I think it would be a mistake to waste so many chars on a prefix, but
>> >> 9 characters of sha1 likely wouldn't have a collision before we have
>> >> 10s of thousands of devices.  Thanks,
>> >>
>> >> Alex
>> >
>> >Jiri, Dave,
>> >Are you ok with it for devlink/netdev part?
>> >Mdev core will create an alias from a UUID.
>> >
>> >This will be supplied during devlink port attr set such as,
>> >
>> >devlink_port_attrs_mdev_set(struct devlink_port *port, const char
>> >*mdev_alias);
>> >
>> >This alias is used to generate representor netdev's phys_port_name.
>> >This alias from the mdev device's sysfs will be used by the udev/systemd to
>> generate predicable netdev's name.
>> >Example: enm<mdev_alias_first_12_chars>
>> 
>> What happens in unlikely case of 2 UUIDs collide?
>> 
>Since users sees two devices with same phys_port_name, user should destroy recently created mdev and recreate mdev with different UUID?

Driver should make sure phys port name wont collide, in this case that
it does not provide 2 same attrs for 2 different ports.
Hmm, so the order of creation matters. That is not good.

>> 
>> >I took Ethernet mdev as an example.
>> >New prefix 'm' stands for mediated device.
>> >Remaining 12 characters are first 12 chars of the mdev alias.
>> 
>> Does this resolve the identification of devlink port representor? 
>Not sure if I understood your question correctly, attemping to answer below.
>phys_port_name of devlink port is defined by the first 12 characters of mdev alias.
>> I assume you want to use the same 12(or so) chars, don't you?
>Mdev's netdev will also use the same mdev alias from the sysfs to rename netdev name from ethX to enm<mdev_alias>, where en=Etherenet, m=mdev.
>
>So yes, same 12 characters are use for mdev's netdev and mdev devlink port's phys_port_name.
>
>Is that what are you asking?

Yes. Then you have 3 chars to handle the rest of the name (pci, pf)...

^ permalink raw reply

* [PATCH bpf-next v5] libbpf: add xsk_ring_prod__nb_free() function
From: Eelco Chaudron @ 2019-08-22  9:54 UTC (permalink / raw)
  To: netdev
  Cc: ast, daniel, kafai, songliubraving, yhs, andrii.nakryiko,
	magnus.karlsson

When an AF_XDP application received X packets, it does not mean X
frames can be stuffed into the producer ring. To make it easier for
AF_XDP applications this API allows them to check how many frames can
be added into the ring.

The patch below looks like a name change only, but the xsk_prod__
prefix denotes that this API is exposed to be used by applications.

Besides, if you set the nb value to the size of the ring, you will
get the exact amount of slots available, at the cost of performance
(you touch shared state for sure). nb is there to limit the
touching of the shared state.

Also the example xdpsock application has been modified to use this
new API, so it's also able to process flows at a 1pps rate on veth
interfaces.

Signed-off-by: Eelco Chaudron <echaudro@redhat.com>

---
v4 -> v5
  - Rebase on latest bpf-next

v3 -> v4
  - Cleanedup commit message
  - Updated AF_XDP sample application to use this new API

v2 -> v3
  - Removed cache by pass option

v1 -> v2
  - Renamed xsk_ring_prod__free() to xsk_ring_prod__nb_free()
  - Add caching so it will only touch global state when needed

 samples/bpf/xdpsock_user.c | 119 ++++++++++++++++++++++++++++---------
 tools/lib/bpf/xsk.h        |   4 +-
 2 files changed, 93 insertions(+), 30 deletions(-)

diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
index da84c760c094..bec0ee463184 100644
--- a/samples/bpf/xdpsock_user.c
+++ b/samples/bpf/xdpsock_user.c
@@ -470,9 +470,13 @@ static void kick_tx(struct xsk_socket_info *xsk)
 static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk,
 				     struct pollfd *fds)
 {
-	u32 idx_cq = 0, idx_fq = 0;
-	unsigned int rcvd;
+	static u64 free_frames[NUM_FRAMES];
+	static size_t nr_free_frames;
+
+	u32 idx_cq = 0, idx_fq = 0, free_slots;
+	unsigned int rcvd, i;
 	size_t ndescs;
+	int ret;
 
 	if (!xsk->outstanding_tx)
 		return;
@@ -485,29 +489,56 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk,
 
 	/* re-add completed Tx buffers */
 	rcvd = xsk_ring_cons__peek(&xsk->umem->cq, ndescs, &idx_cq);
-	if (rcvd > 0) {
-		unsigned int i;
-		int ret;
-
-		ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
-		while (ret != rcvd) {
-			if (ret < 0)
-				exit_with_error(-ret);
-			if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
-				ret = poll(fds, num_socks, opt_timeout);
-			ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd,
-						     &idx_fq);
-		}
-		for (i = 0; i < rcvd; i++)
+	if (!rcvd)
+		return;
+
+	/* When xsk_ring_cons__peek() for example returns that 5 packets
+	 * have been received, it does not automatically mean that
+	 * xsk_ring_prod__reserve() will have 5 slots available. You will
+	 * see this, for example, when using a veth interface due to the
+	 * RX_BATCH_SIZE used by the generic driver.
+	 *
+	 * In this example we store unused buffers and try to re-stock
+	 * them the next iteration.
+	 */
+
+	free_slots = xsk_prod__nb_free(&xsk->umem->fq, rcvd + nr_free_frames);
+	if (free_slots > rcvd + nr_free_frames)
+		free_slots = rcvd + nr_free_frames;
+
+	ret = xsk_ring_prod__reserve(&xsk->umem->fq, free_slots, &idx_fq);
+	while (ret != free_slots) {
+		if (ret < 0)
+			exit_with_error(-ret);
+
+		if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
+			ret = poll(fds, num_socks, opt_timeout);
+
+		ret = xsk_ring_prod__reserve(&xsk->umem->fq, free_slots,
+					     &idx_fq);
+	}
+	for (i = 0; i < rcvd; i++) {
+		u64 addr = *xsk_ring_cons__comp_addr(&xsk->umem->cq, idx_cq++);
+
+		if (i < free_slots)
 			*xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) =
-				*xsk_ring_cons__comp_addr(&xsk->umem->cq,
-							  idx_cq++);
+				addr;
+		else
+			free_frames[nr_free_frames++] = addr;
+	}
 
-		xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
-		xsk_ring_cons__release(&xsk->umem->cq, rcvd);
-		xsk->outstanding_tx -= rcvd;
-		xsk->tx_npkts += rcvd;
+	if (free_slots > rcvd) {
+		for (i = 0; i < (free_slots - rcvd); i++) {
+			u64 addr = free_frames[--nr_free_frames];
+			*xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) =
+				addr;
+		}
 	}
+
+	xsk_ring_prod__submit(&xsk->umem->fq, free_slots);
+	xsk_ring_cons__release(&xsk->umem->cq, rcvd);
+	xsk->outstanding_tx -= rcvd;
+	xsk->tx_npkts += rcvd;
 }
 
 static inline void complete_tx_only(struct xsk_socket_info *xsk)
@@ -531,8 +562,11 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk)
 
 static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds)
 {
+	static u64 free_frames[NUM_FRAMES];
+	static size_t nr_free_frames;
+
 	unsigned int rcvd, i;
-	u32 idx_rx = 0, idx_fq = 0;
+	u32 idx_rx = 0, idx_fq = 0, free_slots;
 	int ret;
 
 	rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx);
@@ -542,13 +576,30 @@ static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds)
 		return;
 	}
 
-	ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
-	while (ret != rcvd) {
+	/* When xsk_ring_cons__peek() for example returns that 5 packets
+	 * have been received, it does not automatically mean that
+	 * xsk_ring_prod__reserve() will have 5 slots available. You will
+	 * see this, for example, when using a veth interface due to the
+	 * RX_BATCH_SIZE used by the generic driver.
+	 *
+	 * In this example we store unused buffers and try to re-stock
+	 * them the next iteration.
+	 */
+
+	free_slots = xsk_prod__nb_free(&xsk->umem->fq, rcvd + nr_free_frames);
+	if (free_slots > rcvd + nr_free_frames)
+		free_slots = rcvd + nr_free_frames;
+
+	ret = xsk_ring_prod__reserve(&xsk->umem->fq, free_slots, &idx_fq);
+	while (ret != free_slots) {
 		if (ret < 0)
 			exit_with_error(-ret);
+
 		if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
 			ret = poll(fds, num_socks, opt_timeout);
-		ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
+
+		ret = xsk_ring_prod__reserve(&xsk->umem->fq, free_slots,
+					     &idx_fq);
 	}
 
 	for (i = 0; i < rcvd; i++) {
@@ -557,10 +608,22 @@ static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds)
 		char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);
 
 		hex_dump(pkt, len, addr);
-		*xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = addr;
+		if (i < free_slots)
+			*xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) =
+				addr;
+		else
+			free_frames[nr_free_frames++] = addr;
+	}
+
+	if (free_slots > rcvd) {
+		for (i = 0; i < (free_slots - rcvd); i++) {
+			u64 addr = free_frames[--nr_free_frames];
+			*xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) =
+				addr;
+		}
 	}
 
-	xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
+	xsk_ring_prod__submit(&xsk->umem->fq, free_slots);
 	xsk_ring_cons__release(&xsk->rx, rcvd);
 	xsk->rx_npkts += rcvd;
 }
diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h
index aa1d6122b7db..520a772c882c 100644
--- a/tools/lib/bpf/xsk.h
+++ b/tools/lib/bpf/xsk.h
@@ -82,7 +82,7 @@ static inline int xsk_ring_prod__needs_wakeup(const struct xsk_ring_prod *r)
 	return *r->flags & XDP_RING_NEED_WAKEUP;
 }
 
-static inline __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb)
+static inline __u32 xsk_prod__nb_free(struct xsk_ring_prod *r, __u32 nb)
 {
 	__u32 free_entries = r->cached_cons - r->cached_prod;
 
@@ -116,7 +116,7 @@ static inline __u32 xsk_cons_nb_avail(struct xsk_ring_cons *r, __u32 nb)
 static inline size_t xsk_ring_prod__reserve(struct xsk_ring_prod *prod,
 					    size_t nb, __u32 *idx)
 {
-	if (xsk_prod_nb_free(prod, nb) < nb)
+	if (xsk_prod__nb_free(prod, nb) < nb)
 		return 0;
 
 	*idx = prod->cached_prod;
-- 
2.18.1


^ permalink raw reply related

* RE: [PATCH v2 0/2] Simplify mtty driver and mdev core
From: Parav Pandit @ 2019-08-22  9:42 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Alex Williamson, Jiri Pirko, David S . Miller, Kirti Wankhede,
	Cornelia Huck, kvm@vger.kernel.org, linux-kernel@vger.kernel.org,
	cjia, netdev@vger.kernel.org
In-Reply-To: <20190822092903.GA2276@nanopsycho.orion>



> -----Original Message-----
> From: Jiri Pirko <jiri@resnulli.us>
> Sent: Thursday, August 22, 2019 2:59 PM
> To: Parav Pandit <parav@mellanox.com>
> Cc: Alex Williamson <alex.williamson@redhat.com>; Jiri Pirko
> <jiri@mellanox.com>; David S . Miller <davem@davemloft.net>; Kirti
> Wankhede <kwankhede@nvidia.com>; Cornelia Huck <cohuck@redhat.com>;
> kvm@vger.kernel.org; linux-kernel@vger.kernel.org; cjia <cjia@nvidia.com>;
> netdev@vger.kernel.org
> Subject: Re: [PATCH v2 0/2] Simplify mtty driver and mdev core
> 
> Wed, Aug 21, 2019 at 08:23:17AM CEST, parav@mellanox.com wrote:
> >
> >
> >> -----Original Message-----
> >> From: Alex Williamson <alex.williamson@redhat.com>
> >> Sent: Wednesday, August 21, 2019 10:56 AM
> >> To: Parav Pandit <parav@mellanox.com>
> >> Cc: Jiri Pirko <jiri@mellanox.com>; David S . Miller
> >> <davem@davemloft.net>; Kirti Wankhede <kwankhede@nvidia.com>;
> >> Cornelia Huck <cohuck@redhat.com>; kvm@vger.kernel.org;
> >> linux-kernel@vger.kernel.org; cjia <cjia@nvidia.com>;
> >> netdev@vger.kernel.org
> >> Subject: Re: [PATCH v2 0/2] Simplify mtty driver and mdev core
> >>
> >> > > > > Just an example of the alias, not proposing how it's set.  In
> >> > > > > fact, proposing that the user does not set it, mdev-core
> >> > > > > provides one
> >> > > automatically.
> >> > > > >
> >> > > > > > > Since there seems to be some prefix overhead, as I ask
> >> > > > > > > about above in how many characters we actually have to
> >> > > > > > > work with in IFNAMESZ, maybe we start with 8 characters
> >> > > > > > > (matching your "index" namespace) and expand as necessary for
> disambiguation.
> >> > > > > > > If we can eliminate overhead in IFNAMESZ, let's start with 12.
> >> > > > > > > Thanks,
> >> > > > > > >
> >> > > > > > If user is going to choose the alias, why does it have to
> >> > > > > > be limited to
> >> sha1?
> >> > > > > > Or you just told it as an example?
> >> > > > > >
> >> > > > > > It can be an alpha-numeric string.
> >> > > > >
> >> > > > > No, I'm proposing a different solution where mdev-core
> >> > > > > creates an alias based on an abbreviated sha1.  The user does
> >> > > > > not provide the
> >> alias.
> >> > > > >
> >> > > > > > Instead of mdev imposing number of characters on the alias,
> >> > > > > > it should be best
> >> > > > > left to the user.
> >> > > > > > Because in future if netdev improves on the naming scheme,
> >> > > > > > mdev will be
> >> > > > > limiting it, which is not right.
> >> > > > > > So not restricting alias size seems right to me.
> >> > > > > > User configuring mdev for networking devices in a given
> >> > > > > > kernel knows what
> >> > > > > user is doing.
> >> > > > > > So user can choose alias name size as it finds suitable.
> >> > > > >
> >> > > > > That's not what I'm proposing, please read again.  Thanks,
> >> > > >
> >> > > > I understood your point. But mdev doesn't know how user is
> >> > > > going to use
> >> > > udev/systemd to name the netdev.
> >> > > > So even if mdev chose to pick 12 characters, it could result in collision.
> >> > > > Hence the proposal to provide the alias by the user, as user
> >> > > > know the best
> >> > > policy for its use case in the environment its using.
> >> > > > So 12 character sha1 method will still work by user.
> >> > >
> >> > > Haven't you already provided examples where certain drivers or
> >> > > subsystems have unique netdev prefixes?  If mdev provides a
> >> > > unique alias within the subsystem, couldn't we simply define a
> >> > > netdev prefix for the mdev subsystem and avoid all other
> >> > > collisions?  I'm not in favor of the user providing both a uuid
> >> > > and an alias/instance.  Thanks,
> >> > >
> >> > For a given prefix, say ens2f0, can two UUID->sha1 first 9
> >> > characters have
> >> collision?
> >>
> >> I think it would be a mistake to waste so many chars on a prefix, but
> >> 9 characters of sha1 likely wouldn't have a collision before we have
> >> 10s of thousands of devices.  Thanks,
> >>
> >> Alex
> >
> >Jiri, Dave,
> >Are you ok with it for devlink/netdev part?
> >Mdev core will create an alias from a UUID.
> >
> >This will be supplied during devlink port attr set such as,
> >
> >devlink_port_attrs_mdev_set(struct devlink_port *port, const char
> >*mdev_alias);
> >
> >This alias is used to generate representor netdev's phys_port_name.
> >This alias from the mdev device's sysfs will be used by the udev/systemd to
> generate predicable netdev's name.
> >Example: enm<mdev_alias_first_12_chars>
> 
> What happens in unlikely case of 2 UUIDs collide?
> 
Since users sees two devices with same phys_port_name, user should destroy recently created mdev and recreate mdev with different UUID?
> 
> >I took Ethernet mdev as an example.
> >New prefix 'm' stands for mediated device.
> >Remaining 12 characters are first 12 chars of the mdev alias.
> 
> Does this resolve the identification of devlink port representor? 
Not sure if I understood your question correctly, attemping to answer below.
phys_port_name of devlink port is defined by the first 12 characters of mdev alias.
> I assume you want to use the same 12(or so) chars, don't you?
Mdev's netdev will also use the same mdev alias from the sysfs to rename netdev name from ethX to enm<mdev_alias>, where en=Etherenet, m=mdev.

So yes, same 12 characters are use for mdev's netdev and mdev devlink port's phys_port_name.

Is that what are you asking?

^ permalink raw reply

* Re: [PATCH v2 0/2] Simplify mtty driver and mdev core
From: Jiri Pirko @ 2019-08-22  9:29 UTC (permalink / raw)
  To: Parav Pandit
  Cc: Alex Williamson, Jiri Pirko, David S . Miller, Kirti Wankhede,
	Cornelia Huck, kvm@vger.kernel.org, linux-kernel@vger.kernel.org,
	cjia, netdev@vger.kernel.org
In-Reply-To: <AM0PR05MB4866437FAA63C447CACCD7E5D1AA0@AM0PR05MB4866.eurprd05.prod.outlook.com>

Wed, Aug 21, 2019 at 08:23:17AM CEST, parav@mellanox.com wrote:
>
>
>> -----Original Message-----
>> From: Alex Williamson <alex.williamson@redhat.com>
>> Sent: Wednesday, August 21, 2019 10:56 AM
>> To: Parav Pandit <parav@mellanox.com>
>> Cc: Jiri Pirko <jiri@mellanox.com>; David S . Miller <davem@davemloft.net>;
>> Kirti Wankhede <kwankhede@nvidia.com>; Cornelia Huck
>> <cohuck@redhat.com>; kvm@vger.kernel.org; linux-kernel@vger.kernel.org;
>> cjia <cjia@nvidia.com>; netdev@vger.kernel.org
>> Subject: Re: [PATCH v2 0/2] Simplify mtty driver and mdev core
>> 
>> > > > > Just an example of the alias, not proposing how it's set.  In
>> > > > > fact, proposing that the user does not set it, mdev-core
>> > > > > provides one
>> > > automatically.
>> > > > >
>> > > > > > > Since there seems to be some prefix overhead, as I ask about
>> > > > > > > above in how many characters we actually have to work with
>> > > > > > > in IFNAMESZ, maybe we start with 8 characters (matching your
>> > > > > > > "index" namespace) and expand as necessary for disambiguation.
>> > > > > > > If we can eliminate overhead in IFNAMESZ, let's start with 12.
>> > > > > > > Thanks,
>> > > > > > >
>> > > > > > If user is going to choose the alias, why does it have to be limited to
>> sha1?
>> > > > > > Or you just told it as an example?
>> > > > > >
>> > > > > > It can be an alpha-numeric string.
>> > > > >
>> > > > > No, I'm proposing a different solution where mdev-core creates
>> > > > > an alias based on an abbreviated sha1.  The user does not provide the
>> alias.
>> > > > >
>> > > > > > Instead of mdev imposing number of characters on the alias, it
>> > > > > > should be best
>> > > > > left to the user.
>> > > > > > Because in future if netdev improves on the naming scheme,
>> > > > > > mdev will be
>> > > > > limiting it, which is not right.
>> > > > > > So not restricting alias size seems right to me.
>> > > > > > User configuring mdev for networking devices in a given kernel
>> > > > > > knows what
>> > > > > user is doing.
>> > > > > > So user can choose alias name size as it finds suitable.
>> > > > >
>> > > > > That's not what I'm proposing, please read again.  Thanks,
>> > > >
>> > > > I understood your point. But mdev doesn't know how user is going
>> > > > to use
>> > > udev/systemd to name the netdev.
>> > > > So even if mdev chose to pick 12 characters, it could result in collision.
>> > > > Hence the proposal to provide the alias by the user, as user know
>> > > > the best
>> > > policy for its use case in the environment its using.
>> > > > So 12 character sha1 method will still work by user.
>> > >
>> > > Haven't you already provided examples where certain drivers or
>> > > subsystems have unique netdev prefixes?  If mdev provides a unique
>> > > alias within the subsystem, couldn't we simply define a netdev
>> > > prefix for the mdev subsystem and avoid all other collisions?  I'm
>> > > not in favor of the user providing both a uuid and an
>> > > alias/instance.  Thanks,
>> > >
>> > For a given prefix, say ens2f0, can two UUID->sha1 first 9 characters have
>> collision?
>> 
>> I think it would be a mistake to waste so many chars on a prefix, but 9
>> characters of sha1 likely wouldn't have a collision before we have 10s of
>> thousands of devices.  Thanks,
>> 
>> Alex
>
>Jiri, Dave,
>Are you ok with it for devlink/netdev part?
>Mdev core will create an alias from a UUID.
>
>This will be supplied during devlink port attr set such as,
>
>devlink_port_attrs_mdev_set(struct devlink_port *port, const char *mdev_alias);
>
>This alias is used to generate representor netdev's phys_port_name.
>This alias from the mdev device's sysfs will be used by the udev/systemd to generate predicable netdev's name.
>Example: enm<mdev_alias_first_12_chars>

What happens in unlikely case of 2 UUIDs collide?


>I took Ethernet mdev as an example.
>New prefix 'm' stands for mediated device.
>Remaining 12 characters are first 12 chars of the mdev alias.

Does this resolve the identification of devlink port representor? I
assume you want to use the same 12(or so) chars, don't you?


^ permalink raw reply

* RE: [PATCH v1] ocfs2/dlm: Move BITS_TO_BYTES() to bitops.h for wider use
From: Sudarsana Reddy Kalluru @ 2019-08-22  9:16 UTC (permalink / raw)
  To: Andy Shevchenko
  Cc: Joseph Qi, Mark Fasheh, Joel Becker, ocfs2-devel@oss.oracle.com,
	Ariel Elior, GR-everest-linux-l2, David S. Miller,
	netdev@vger.kernel.org, Colin Ian King
In-Reply-To: <20190822090855.GL30120@smile.fi.intel.com>


> -----Original Message-----
> From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
> Sent: Thursday, August 22, 2019 2:39 PM
> To: Sudarsana Reddy Kalluru <skalluru@marvell.com>
> Cc: Joseph Qi <joseph.qi@linux.alibaba.com>; Mark Fasheh
> <mark@fasheh.com>; Joel Becker <jlbec@evilplan.org>; ocfs2-
> devel@oss.oracle.com; Ariel Elior <aelior@marvell.com>; GR-everest-linux-l2
> <GR-everest-linux-l2@marvell.com>; David S. Miller
> <davem@davemloft.net>; netdev@vger.kernel.org; Colin Ian King
> <colin.king@canonical.com>
> Subject: Re: [PATCH v1] ocfs2/dlm: Move BITS_TO_BYTES() to bitops.h for
> wider use
> 
> On Thu, Aug 22, 2019 at 05:46:07AM +0000, Sudarsana Reddy Kalluru wrote:
> >
> > > -----Original Message-----
> > > From: netdev-owner@vger.kernel.org <netdev-owner@vger.kernel.org>
> On
> > > Behalf Of Andy Shevchenko
> > > Sent: Wednesday, August 21, 2019 2:56 PM
> > > To: Joseph Qi <joseph.qi@linux.alibaba.com>
> > > Cc: Mark Fasheh <mark@fasheh.com>; Joel Becker <jlbec@evilplan.org>;
> > > ocfs2-devel@oss.oracle.com; Ariel Elior <aelior@marvell.com>;
> > > Sudarsana Reddy Kalluru <skalluru@marvell.com>; GR-everest-linux-l2
> > > <GR-everest- linux-l2@marvell.com>; David S. Miller
> > > <davem@davemloft.net>; netdev@vger.kernel.org; Colin Ian King
> > > <colin.king@canonical.com>
> > > Subject: Re: [PATCH v1] ocfs2/dlm: Move BITS_TO_BYTES() to bitops.h
> > > for wider use
> > >
> > > On Wed, Aug 21, 2019 at 09:29:04AM +0800, Joseph Qi wrote:
> > > > On 19/8/21 00:31, Andy Shevchenko wrote:
> > > > > There are users already and will be more of BITS_TO_BYTES() macro.
> > > > > Move it to bitops.h for wider use.
> 
> > > > > -#define BITS_TO_BYTES(x) ((x)/8)>
> > > > I don't think this is a equivalent replace, or it is in fact wrong
> > > > before?
> > >
> > > I was thinking about this one and there are two applications:
> > > - calculus of the amount of structures of certain type per PAGE
> > >   (obviously off-by-one error in the original code IIUC purpose of
> > > STRUCT_SIZE)
> > > - calculus of some threshold based on line speed in bytes per second
> > >   (I dunno it will have any difference on the Gbs / 100 MBs speeds)
> > >
> > I see that both the implementations (existing vs new) yield same value for
> standard speeds 10G (i.e.,10000), 1G (1000) that device supports. Hence the
> change look to be ok.
> 
> Thank you for testing, may I use your Tested-by tag?
Sorry, I didn't test the actual driver flows. Was only referring to the output values for 1000/10000 speed values.

> 
> --
> With Best Regards,
> Andy Shevchenko
> 


^ permalink raw reply

* Re: [PATCH v2 11/11] vsock_test: wait for the remote to close the connection
From: Stefano Garzarella @ 2019-08-22  9:15 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: netdev, kvm, Dexuan Cui, virtualization, David S. Miller,
	Jorgen Hansen, linux-kernel
In-Reply-To: <20190820082828.GA9855@stefanha-x1.localdomain>

On Tue, Aug 20, 2019 at 09:28:28AM +0100, Stefan Hajnoczi wrote:
> On Thu, Aug 01, 2019 at 05:25:41PM +0200, Stefano Garzarella wrote:
> > +/* Wait for the remote to close the connection */
> > +void vsock_wait_remote_close(int fd)
> > +{
> > +	struct epoll_event ev;
> > +	int epollfd, nfds;
> > +
> > +	epollfd = epoll_create1(0);
> > +	if (epollfd == -1) {
> > +		perror("epoll_create1");
> > +		exit(EXIT_FAILURE);
> > +	}
> > +
> > +	ev.events = EPOLLRDHUP | EPOLLHUP;
> > +	ev.data.fd = fd;
> > +	if (epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &ev) == -1) {
> > +		perror("epoll_ctl");
> > +		exit(EXIT_FAILURE);
> > +	}
> > +
> > +	nfds = epoll_wait(epollfd, &ev, 1, TIMEOUT * 1000);
> > +	if (nfds == -1) {
> > +		perror("epoll_wait");
> > +		exit(EXIT_FAILURE);
> > +	}
> > +
> > +	if (nfds == 0) {
> > +		fprintf(stderr, "epoll_wait timed out\n");
> > +		exit(EXIT_FAILURE);
> > +	}
> > +
> > +	assert(nfds == 1);
> > +	assert(ev.events & (EPOLLRDHUP | EPOLLHUP));
> > +	assert(ev.data.fd == fd);
> > +
> > +	close(epollfd);
> > +}
> 
> Please use timeout_begin()/timeout_end() so that the test cannot hang.
> 

I used the TIMEOUT macro in the epoll_wait() to avoid the hang.
Do you think is better to use the timeout_begin()/timeout_end()?
In this case, should I remove the timeout in the epoll_wait()?

> > diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
> > index 64adf45501ca..a664675bec5a 100644
> > --- a/tools/testing/vsock/vsock_test.c
> > +++ b/tools/testing/vsock/vsock_test.c
> > @@ -84,6 +84,11 @@ static void test_stream_client_close_server(const struct test_opts *opts)
> >  
> >  	control_expectln("CLOSED");
> >  
> > +	/* Wait for the remote to close the connection, before check
> > +	 * -EPIPE error on send.
> > +	 */
> > +	vsock_wait_remote_close(fd);
> 
> Is control_expectln("CLOSED") still necessary now that we're waiting for
> the poll event?  The control message was an attempt to wait until the
> other side closed the socket.

Right, I'll remove it in the v3

Thanks,
Stefano

^ permalink raw reply

* Re: [PATCH] can: Delete unnecessary checks before the macro call “dev_kfree_skb”
From: Sean Nyekjaer @ 2019-08-22  9:13 UTC (permalink / raw)
  To: Markus Elfring, linux-can, netdev, Allison Randal,
	David S. Miller, Enrico Weigelt, Greg Kroah-Hartman,
	Gustavo A. R. Silva, Lukas Wunner, Marc Kleine-Budde,
	Thomas Gleixner, Weitao Hou, Wolfgang Grandegger
  Cc: LKML, kernel-janitors
In-Reply-To: <27674907-fd2a-7f0c-84fd-d8b5124739a9@web.de>



On 21/08/2019 21.30, Markus Elfring wrote:
> From: Markus Elfring <elfring@users.sourceforge.net>
> Date: Wed, 21 Aug 2019 21:16:15 +0200
> 
> The dev_kfree_skb() function performs also input parameter validation.
> Thus the test around the shown calls is not needed.
> 
> This issue was detected by using the Coccinelle software.
> 
> Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Acked-by: Sean Nyekjaer <sean@geanix.com>
> ---
>   drivers/net/can/spi/hi311x.c  | 3 +--
>   drivers/net/can/spi/mcp251x.c | 3 +--
>   2 files changed, 2 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/net/can/spi/hi311x.c b/drivers/net/can/spi/hi311x.c
> index 03a711c3221b..7c7c7e78214c 100644
> --- a/drivers/net/can/spi/hi311x.c
> +++ b/drivers/net/can/spi/hi311x.c
> @@ -184,8 +184,7 @@ static void hi3110_clean(struct net_device *net)
> 
>   	if (priv->tx_skb || priv->tx_len)
>   		net->stats.tx_errors++;
> -	if (priv->tx_skb)
> -		dev_kfree_skb(priv->tx_skb);
> +	dev_kfree_skb(priv->tx_skb);
>   	if (priv->tx_len)
>   		can_free_echo_skb(priv->net, 0);
>   	priv->tx_skb = NULL;
> diff --git a/drivers/net/can/spi/mcp251x.c b/drivers/net/can/spi/mcp251x.c
> index 12358f06d194..1c496d2adb45 100644
> --- a/drivers/net/can/spi/mcp251x.c
> +++ b/drivers/net/can/spi/mcp251x.c
> @@ -274,8 +274,7 @@ static void mcp251x_clean(struct net_device *net)
> 
>   	if (priv->tx_skb || priv->tx_len)
>   		net->stats.tx_errors++;
> -	if (priv->tx_skb)
> -		dev_kfree_skb(priv->tx_skb);
> +	dev_kfree_skb(priv->tx_skb);
>   	if (priv->tx_len)
>   		can_free_echo_skb(priv->net, 0);
>   	priv->tx_skb = NULL;
> --
> 2.23.0
> 

Good catch Markus :-)

/Sean

^ permalink raw reply

* [PATCH bpf-next 4/4] xsk: lock the control mutex in sock_diag interface
From: Björn Töpel @ 2019-08-22  9:13 UTC (permalink / raw)
  To: ast, daniel, netdev
  Cc: Björn Töpel, magnus.karlsson, magnus.karlsson, bpf,
	jonathan.lemon, syzbot+c82697e3043781e08802, hdanton, i.maximets
In-Reply-To: <20190822091306.20581-1-bjorn.topel@gmail.com>

From: Björn Töpel <bjorn.topel@intel.com>

When accessing the members of an XDP socket, the control mutex should
be held. This commit fixes that.

Fixes: a36b38aa2af6 ("xsk: add sock_diag interface for AF_XDP")
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 net/xdp/xsk_diag.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c
index d5e06c8e0cbf..c8f4f11edbbc 100644
--- a/net/xdp/xsk_diag.c
+++ b/net/xdp/xsk_diag.c
@@ -97,6 +97,7 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb,
 	msg->xdiag_ino = sk_ino;
 	sock_diag_save_cookie(sk, msg->xdiag_cookie);
 
+	mutex_lock(&xs->mutex);
 	if ((req->xdiag_show & XDP_SHOW_INFO) && xsk_diag_put_info(xs, nlskb))
 		goto out_nlmsg_trim;
 
@@ -117,10 +118,12 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb,
 	    sock_diag_put_meminfo(sk, nlskb, XDP_DIAG_MEMINFO))
 		goto out_nlmsg_trim;
 
+	mutex_unlock(&xs->mutex);
 	nlmsg_end(nlskb, nlh);
 	return 0;
 
 out_nlmsg_trim:
+	mutex_unlock(&xs->mutex);
 	nlmsg_cancel(nlskb, nlh);
 	return -EMSGSIZE;
 }
-- 
2.20.1


^ permalink raw reply related

* [PATCH bpf-next 3/4] xsk: avoid store-tearing when assigning umem
From: Björn Töpel @ 2019-08-22  9:13 UTC (permalink / raw)
  To: ast, daniel, netdev
  Cc: Björn Töpel, magnus.karlsson, magnus.karlsson, bpf,
	jonathan.lemon, syzbot+c82697e3043781e08802, hdanton, i.maximets
In-Reply-To: <20190822091306.20581-1-bjorn.topel@gmail.com>

From: Björn Töpel <bjorn.topel@intel.com>

The umem member of struct xdp_sock is read outside of the control
mutex, in the mmap implementation, and needs a WRITE_ONCE to avoid
potentional store-tearing.

Fixes: 423f38329d26 ("xsk: add umem fill queue support and mmap")
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 net/xdp/xsk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 31236e61069b..6dde1857ed52 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -718,7 +718,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 
 		/* Make sure umem is ready before it can be seen by others */
 		smp_wmb();
-		xs->umem = umem;
+		WRITE_ONCE(xs->umem, umem);
 		mutex_unlock(&xs->mutex);
 		return 0;
 	}
-- 
2.20.1


^ permalink raw reply related

* [PATCH bpf-next 2/4] xsk: add proper barriers and {READ, WRITE}_ONCE-correctness for state
From: Björn Töpel @ 2019-08-22  9:13 UTC (permalink / raw)
  To: ast, daniel, netdev
  Cc: Björn Töpel, magnus.karlsson, magnus.karlsson, bpf,
	jonathan.lemon, syzbot+c82697e3043781e08802, hdanton, i.maximets
In-Reply-To: <20190822091306.20581-1-bjorn.topel@gmail.com>

From: Björn Töpel <bjorn.topel@intel.com>

The state variable was read, and written outside the control mutex
(struct xdp_sock, mutex), without proper barriers and {READ,
WRITE}_ONCE correctness.

In this commit this issue is addressed, and the state member is now
used a point of synchronization whether the socket is setup correctly
or not.

This also fixes a race, found by syzcaller, in xsk_poll() where umem
could be accessed when stale.

Suggested-by: Hillf Danton <hdanton@sina.com>
Reported-by: syzbot+c82697e3043781e08802@syzkaller.appspotmail.com
Fixes: 77cd0d7b3f25 ("xsk: add support for need_wakeup flag in AF_XDP rings")
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 net/xdp/xsk.c | 57 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 41 insertions(+), 16 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index f3351013c2a5..31236e61069b 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -162,10 +162,23 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
 	return err;
 }
 
+static bool xsk_is_bound(struct xdp_sock *xs)
+{
+	if (READ_ONCE(xs->state) == XSK_BOUND) {
+		/* Matches smp_wmb() in bind(). */
+		smp_rmb();
+		return true;
+	}
+	return false;
+}
+
 int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
 	u32 len;
 
+	if (!xsk_is_bound(xs))
+		return -EINVAL;
+
 	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
 		return -EINVAL;
 
@@ -362,6 +375,8 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 	struct sock *sk = sock->sk;
 	struct xdp_sock *xs = xdp_sk(sk);
 
+	if (unlikely(!xsk_is_bound(xs)))
+		return -ENXIO;
 	if (unlikely(!xs->dev))
 		return -ENXIO;
 	if (unlikely(!(xs->dev->flags & IFF_UP)))
@@ -378,10 +393,15 @@ static unsigned int xsk_poll(struct file *file, struct socket *sock,
 			     struct poll_table_struct *wait)
 {
 	unsigned int mask = datagram_poll(file, sock, wait);
-	struct sock *sk = sock->sk;
-	struct xdp_sock *xs = xdp_sk(sk);
-	struct net_device *dev = xs->dev;
-	struct xdp_umem *umem = xs->umem;
+	struct xdp_sock *xs = xdp_sk(sock->sk);
+	struct net_device *dev;
+	struct xdp_umem *umem;
+
+	if (unlikely(!xsk_is_bound(xs)))
+		return mask;
+
+	dev = xs->dev;
+	umem = xs->umem;
 
 	if (umem->need_wakeup)
 		dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id,
@@ -417,10 +437,9 @@ static void xsk_unbind_dev(struct xdp_sock *xs)
 {
 	struct net_device *dev = xs->dev;
 
-	if (!dev || xs->state != XSK_BOUND)
+	if (xs->state != XSK_BOUND)
 		return;
-
-	xs->state = XSK_UNBOUND;
+	WRITE_ONCE(xs->state, XSK_UNBOUND);
 
 	/* Wait for driver to stop using the xdp socket. */
 	xdp_del_sk_umem(xs->umem, xs);
@@ -495,7 +514,9 @@ static int xsk_release(struct socket *sock)
 	local_bh_enable();
 
 	xsk_delete_from_maps(xs);
+	mutex_lock(&xs->mutex);
 	xsk_unbind_dev(xs);
+	mutex_unlock(&xs->mutex);
 
 	xskq_destroy(xs->rx);
 	xskq_destroy(xs->tx);
@@ -589,19 +610,18 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 		}
 
 		umem_xs = xdp_sk(sock->sk);
-		if (!umem_xs->umem) {
-			/* No umem to inherit. */
+		if (!xsk_is_bound(umem_xs)) {
 			err = -EBADF;
 			sockfd_put(sock);
 			goto out_unlock;
-		} else if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
+		}
+		if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
 			err = -EINVAL;
 			sockfd_put(sock);
 			goto out_unlock;
 		}
-
 		xdp_get_umem(umem_xs->umem);
-		xs->umem = umem_xs->umem;
+		WRITE_ONCE(xs->umem, umem_xs->umem);
 		sockfd_put(sock);
 	} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
 		err = -EINVAL;
@@ -626,10 +646,15 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 	xdp_add_sk_umem(xs->umem, xs);
 
 out_unlock:
-	if (err)
+	if (err) {
 		dev_put(dev);
-	else
-		xs->state = XSK_BOUND;
+	} else {
+		/* Matches smp_rmb() in bind() for shared umem
+		 * sockets, and xsk_is_bound().
+		 */
+		smp_wmb();
+		WRITE_ONCE(xs->state, XSK_BOUND);
+	}
 out_release:
 	mutex_unlock(&xs->mutex);
 	rtnl_unlock();
@@ -869,7 +894,7 @@ static int xsk_mmap(struct file *file, struct socket *sock,
 	unsigned long pfn;
 	struct page *qpg;
 
-	if (xs->state != XSK_READY)
+	if (READ_ONCE(xs->state) != XSK_READY)
 		return -EBUSY;
 
 	if (offset == XDP_PGOFF_RX_RING) {
-- 
2.20.1


^ permalink raw reply related

* [PATCH bpf-next 1/4] xsk: avoid store-tearing when assigning queues
From: Björn Töpel @ 2019-08-22  9:13 UTC (permalink / raw)
  To: ast, daniel, netdev
  Cc: Björn Töpel, magnus.karlsson, magnus.karlsson, bpf,
	jonathan.lemon, syzbot+c82697e3043781e08802, hdanton, i.maximets
In-Reply-To: <20190822091306.20581-1-bjorn.topel@gmail.com>

From: Björn Töpel <bjorn.topel@intel.com>

Use WRITE_ONCE when doing the store of tx, rx, fq, and cq, to avoid
potential store-tearing. These members are read outside of the control
mutex in the mmap implementation.

Fixes: 37b076933a8e ("xsk: add missing write- and data-dependency barrier")
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 net/xdp/xsk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index ee4428a892fa..f3351013c2a5 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -409,7 +409,7 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
 
 	/* Make sure queue is ready before it can be seen by others */
 	smp_wmb();
-	*queue = q;
+	WRITE_ONCE(*queue, q);
 	return 0;
 }
 
-- 
2.20.1


^ permalink raw reply related

* [PATCH bpf-next 0/4] xsk: various CPU barrier and {READ, WRITE}_ONCE fixes
From: Björn Töpel @ 2019-08-22  9:13 UTC (permalink / raw)
  To: ast, daniel, netdev
  Cc: Björn Töpel, magnus.karlsson, magnus.karlsson, bpf,
	bjorn.topel, jonathan.lemon, syzbot+c82697e3043781e08802, hdanton,
	i.maximets

This is a four patch series of various barrier, {READ, WRITE}_ONCE
cleanups in the AF_XDP socket code. More details can be found in the
corresponding commit message.

For an AF_XDP socket, most control plane operations are done under the
control mutex (struct xdp_sock, mutex), but there are some places
where members of the struct is read outside the control mutex. This,
as pointed out by Daniel in [1], requires proper {READ,
WRITE}_ONCE-correctness [2] [3]. To address this, and to simplify the
code, the state variable (introduced by Ilya), is now used a point of
synchronization ("is the socket in a valid state, or not").

Thanks,
Björn

[1] https://lore.kernel.org/bpf/beef16bb-a09b-40f1-7dd0-c323b4b89b17@iogearbox.net/
[2] https://lwn.net/Articles/793253/
[3] https://github.com/google/ktsan/wiki/READ_ONCE-and-WRITE_ONCE

Björn Töpel (4):
  xsk: avoid store-tearing when assigning queues
  xsk: add proper barriers and {READ, WRITE}_ONCE-correctness for state
  xsk: avoid store-tearing when assigning umem
  xsk: lock the control mutex in sock_diag interface

 net/xdp/xsk.c      | 61 ++++++++++++++++++++++++++++++++--------------
 net/xdp/xsk_diag.c |  3 +++
 2 files changed, 46 insertions(+), 18 deletions(-)

-- 
2.20.1

^ permalink raw reply

* Re: [PATCH v1] ocfs2/dlm: Move BITS_TO_BYTES() to bitops.h for wider use
From: Andy Shevchenko @ 2019-08-22  9:08 UTC (permalink / raw)
  To: Sudarsana Reddy Kalluru
  Cc: Joseph Qi, Mark Fasheh, Joel Becker, ocfs2-devel@oss.oracle.com,
	Ariel Elior, GR-everest-linux-l2, David S. Miller,
	netdev@vger.kernel.org, Colin Ian King
In-Reply-To: <MN2PR18MB2528511CEFCBC2BE07947BAAD3A50@MN2PR18MB2528.namprd18.prod.outlook.com>

On Thu, Aug 22, 2019 at 05:46:07AM +0000, Sudarsana Reddy Kalluru wrote:
> 
> > -----Original Message-----
> > From: netdev-owner@vger.kernel.org <netdev-owner@vger.kernel.org> On
> > Behalf Of Andy Shevchenko
> > Sent: Wednesday, August 21, 2019 2:56 PM
> > To: Joseph Qi <joseph.qi@linux.alibaba.com>
> > Cc: Mark Fasheh <mark@fasheh.com>; Joel Becker <jlbec@evilplan.org>;
> > ocfs2-devel@oss.oracle.com; Ariel Elior <aelior@marvell.com>; Sudarsana
> > Reddy Kalluru <skalluru@marvell.com>; GR-everest-linux-l2 <GR-everest-
> > linux-l2@marvell.com>; David S. Miller <davem@davemloft.net>;
> > netdev@vger.kernel.org; Colin Ian King <colin.king@canonical.com>
> > Subject: Re: [PATCH v1] ocfs2/dlm: Move BITS_TO_BYTES() to bitops.h for
> > wider use
> > 
> > On Wed, Aug 21, 2019 at 09:29:04AM +0800, Joseph Qi wrote:
> > > On 19/8/21 00:31, Andy Shevchenko wrote:
> > > > There are users already and will be more of BITS_TO_BYTES() macro.
> > > > Move it to bitops.h for wider use.

> > > > -#define BITS_TO_BYTES(x) ((x)/8)>
> > > I don't think this is a equivalent replace, or it is in fact wrong
> > > before?
> > 
> > I was thinking about this one and there are two applications:
> > - calculus of the amount of structures of certain type per PAGE
> >   (obviously off-by-one error in the original code IIUC purpose of
> > STRUCT_SIZE)
> > - calculus of some threshold based on line speed in bytes per second
> >   (I dunno it will have any difference on the Gbs / 100 MBs speeds)
> > 
> I see that both the implementations (existing vs new) yield same value for standard speeds 10G (i.e.,10000), 1G (1000) that device supports. Hence the change look to be ok.

Thank you for testing, may I use your Tested-by tag?

-- 
With Best Regards,
Andy Shevchenko



^ permalink raw reply

* Re: [RFC bpf-next 4/5] iproute2: Allow compiling against libbpf
From: Daniel Borkmann @ 2019-08-22  8:58 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, Stephen Hemminger,
	Alexei Starovoitov
  Cc: Martin KaFai Lau, Song Liu, Yonghong Song, David Miller,
	Jesper Dangaard Brouer, netdev, bpf, andrii.nakryiko
In-Reply-To: <20190820114706.18546-5-toke@redhat.com>

On 8/20/19 1:47 PM, Toke Høiland-Jørgensen wrote:
> This adds a configure check for libbpf and renames functions to allow
> lib/bpf.c to be compiled with it present. This makes it possible to
> port functionality piecemeal to use libbpf.
> 
> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
> ---
>   configure          | 16 ++++++++++++++++
>   include/bpf_util.h |  6 +++---
>   ip/ipvrf.c         |  4 ++--
>   lib/bpf.c          | 33 +++++++++++++++++++--------------
>   4 files changed, 40 insertions(+), 19 deletions(-)
> 
> diff --git a/configure b/configure
> index 45fcffb6..5a89ee9f 100755
> --- a/configure
> +++ b/configure
> @@ -238,6 +238,19 @@ check_elf()
>       fi
>   }
>   
> +check_libbpf()
> +{
> +    if ${PKG_CONFIG} libbpf --exists; then
> +	echo "HAVE_LIBBPF:=y" >>$CONFIG
> +	echo "yes"
> +
> +	echo 'CFLAGS += -DHAVE_LIBBPF' `${PKG_CONFIG} libbpf --cflags` >> $CONFIG
> +	echo 'LDLIBS += ' `${PKG_CONFIG} libbpf --libs` >>$CONFIG
> +    else
> +	echo "no"
> +    fi
> +}
> +
>   check_selinux()

More of an implementation detail at this point in time, but want to make sure this
doesn't get missed along the way: as discussed at bpfconf [0] best for iproute2 to
handle libbpf support would be the same way of integration as pahole does, that is,
to integrate it via submodule [1] to allow kernel and libbpf features to be in sync
with iproute2 releases and therefore easily consume extensions we're adding to libbpf
to aide iproute2 integration.

Thanks,
Daniel

   [0] http://vger.kernel.org/bpfconf2019.html#session-4
   [1] https://git.kernel.org/pub/scm/devel/pahole/pahole.git/commit/?id=21507cd3e97bc5692d97201ee68df044c6767e9a

^ permalink raw reply

* Re: [RFC v2] vsock: proposal to support multiple transports at runtime
From: Stefano Garzarella @ 2019-08-22  8:54 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: netdev, Dexuan Cui, Jorgen Hansen, David S. Miller, Vishnu Dasa,
	K. Y. Srinivasan, Haiyang Zhang, Stephen Hemminger, Sasha Levin
In-Reply-To: <20190819130911.GE28081@stefanha-x1.localdomain>

On Mon, Aug 19, 2019 at 02:09:11PM +0100, Stefan Hajnoczi wrote:
> On Thu, Jun 06, 2019 at 12:09:12PM +0200, Stefano Garzarella wrote:
> > 
> > Hi all,
> > this is a v2 of a proposal addressing the comments made by Dexuan, Stefan,
> > and Jorgen.
> > 
> > v1: https://www.spinics.net/lists/netdev/msg570274.html
> > 
> > 
> > 
> > We can define two types of transport that we have to handle at the same time
> > (e.g. in a nested VM we would have both types of transport running together):
> > 
> > - 'host->guest' transport, it runs in the host and it is used to communicate
> >   with the guests of a specific hypervisor (KVM, VMWare or Hyper-V). It also
> >   runs in the guest who has nested guests, to communicate with them.
> > 
> >   [Phase 2]
> >   We can support multiple 'host->guest' transport running at the same time,
> >   but on x86 only one hypervisor uses VMX at any given time.
> > 
> > - 'guest->host' transport, it runs in the guest and it is used to communicate
> >   with the host.
> > 
> > 
> > The main goal is to find a way to decide what transport use in these cases:
> > 1. connect() / sendto()
> > 
> >    a. use the 'host->guest' transport, if the destination is the guest
> >       (dest_cid > VMADDR_CID_HOST).
> > 
> >       [Phase 2]
> >       In order to support multiple 'host->guest' transports running at the same
> >       time, we should assign CIDs uniquely across all transports. In this way,
> >       a packet generated by the host side will get directed to the appropriate
> >       transport based on the CID.
> > 
> >    b. use the 'guest->host' transport, if the destination is the host or the
> >       hypervisor.
> >       (dest_cid == VMADDR_CID_HOST || dest_cid == VMADDR_CID_HYPERVISOR)
> > 
> > 
> > 2. listen() / recvfrom()
> > 
> >    a. use the 'host->guest' transport, if the socket is bound to
> >       VMADDR_CID_HOST, or it is bound to VMADDR_CID_ANY and there is no
> >       'guest->host' transport.
> >       We could also define a new VMADDR_CID_LISTEN_FROM_GUEST in order to
> >       address this case.
> > 
> >       [Phase 2]
> >       We can support network namespaces to create independent AF_VSOCK
> >       addressing domains:
> >       - could be used to partition VMs between hypervisors or at a finer
> >    	 granularity;
> >       - could be used to isolate host applications from guest applications
> >    	 using the same ports with CID_ANY;
> > 
> >    b. use the 'guest->host' transport, if the socket is bound to local CID
> >       different from the VMADDR_CID_HOST (guest CID get with
> >       IOCTL_VM_SOCKETS_GET_LOCAL_CID), or it is bound to VMADDR_CID_ANY (to be
> >       backward compatible).
> >       Also in this case, we could define a new VMADDR_CID_LISTEN_FROM_HOST.
> > 
> >    c. shared port space between transports
> >       For incoming requests or packets, we should be able to choose which
> >       transport use, looking at the 'port' requested.
> > 
> >       - stream sockets already support shared port space between transports
> >         (one port can be assigned to only one transport)
> > 
> >       [Phase 2]
> >       - datagram sockets will support it, but for now VMCI transport is the
> >         default transport for any host side datagram socket (KVM and Hyper-V
> >         do not yet support datagrams sockets)
> > 
> > We will make the loading of af_vsock.ko independent of the transports to
> > allow to:
> >    - create a AF_VSOCK socket without any loaded transports;
> >    - listen on a socket (e.g. bound to VMADDR_CID_ANY) without any loaded
> >      transports;
> > 
> > Hopefully, we could move MODULE_ALIAS_NETPROTO(PF_VSOCK) from the
> > vmci_transport.ko to the af_vsock.ko.
> > [Jorgen will check if this will impact the existing VMware products]
> > 
> > Notes:
> >    - For Hyper-V sockets, the host can only be Windows. No changes should
> >      be required on the Windows host to support the changes on this proposal.
> > 
> >    - Communication between guests are not allowed on any transports, so we can
> >      drop packets sent from a guest to another guest (dest_cid >
> >      VMADDR_CID_HOST) if the 'host->guest' transport is not available.
> > 
> >    - [Phase 2] tag used to identify things that can be done at a later stage,
> >      but that should be taken into account during this design.
> > 
> >    - Namespace support will be developed in [Phase 2] or in a separate project.
> > 
> > 
> > 
> > Comments and suggestions are welcome.
> > I'll be on PTO for next two weeks, so sorry in advance if I'll answer later.
> > 
> > If we agree on this proposal, when I get back, I'll start working on the code
> > to get a first PATCH RFC.
> 
> Stefano,
> I've reviewed your proposal and it looks good for solving nested
> virtualization.

Hi Stefan,
Thank you very much for the review!

> 
> The tricky implementation details will be supporting listen sockets,
> especially with VMADDR_CID_ANY so they can be accessed from both
> transports.

Yes, it will be tricky because the current implementation has 1 to 1
mapping with the transport callbacks.

Maybe I could move some logic in the core (e.g. for listening sockets)
to have a single point of control. (e.g. using vsk->pending_links in all
transports)

I'll work on it in the next weeks, I'll keep you updated.

Thanks,
Stefano

^ permalink raw reply

* Re: [RFC bpf-next 0/5] Convert iproute2 to use libbpf (WIP)
From: Daniel Borkmann @ 2019-08-22  8:33 UTC (permalink / raw)
  To: Andrii Nakryiko, Toke Høiland-Jørgensen
  Cc: Stephen Hemminger, Alexei Starovoitov, Martin KaFai Lau, Song Liu,
	Yonghong Song, David Miller, Jesper Dangaard Brouer, Networking,
	bpf
In-Reply-To: <CAEf4BzYMKPbfOu4a4UDEfJVcNW1-KvRwJ7PVo+Mf_1YUJgE4Qw@mail.gmail.com>

On 8/22/19 9:49 AM, Andrii Nakryiko wrote:
> On Wed, Aug 21, 2019 at 2:07 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>> Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:
>>> On Tue, Aug 20, 2019 at 4:47 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>>>>
>>>> iproute2 uses its own bpf loader to load eBPF programs, which has
>>>> evolved separately from libbpf. Since we are now standardising on
>>>> libbpf, this becomes a problem as iproute2 is slowly accumulating
>>>> feature incompatibilities with libbpf-based loaders. In particular,
>>>> iproute2 has its own (expanded) version of the map definition struct,
>>>> which makes it difficult to write programs that can be loaded with both
>>>> custom loaders and iproute2.
>>>>
>>>> This series seeks to address this by converting iproute2 to using libbpf
>>>> for all its bpf needs. This version is an early proof-of-concept RFC, to
>>>> get some feedback on whether people think this is the right direction.
>>>>
>>>> What this series does is the following:
>>>>
>>>> - Updates the libbpf map definition struct to match that of iproute2
>>>>    (patch 1).
>>>
>>> Thanks for taking a stab at unifying libbpf and iproute2 loaders. I'm
>>> totally in support of making iproute2 use libbpf to load/initialize
>>> BPF programs. But I'm against adding iproute2-specific fields to
>>> libbpf's bpf_map_def definitions to support this.
>>>
>>> I've proposed the plan of extending libbpf's supported features so
>>> that it can be used to load iproute2-style BPF programs earlier,
>>> please see discussions in [0] and [1].
>>
>> Yeah, I've seen that discussion, and agree that longer term this is
>> probably a better way to do map-in-map definitions.
>>
>> However, I view your proposal as complementary to this series: we'll
>> probably also want the BTF-based definition to work with iproute2, and
>> that means iproute2 needs to be ported to libbpf. But iproute2 needs to
>> be backwards compatible with the format it supports now, and, well, this
>> series is the simplest way to achieve that IMO :)
> 
> Ok, I understand that. But I'd still want to avoid adding extra cruft
> to libbpf just for backwards-compatibility with *exact* iproute2
> format. Libbpf as a whole is trying to move away from relying on
> binary bpf_map_def and into using BTF-defined map definitions, and
> this patch series is a step backwards in that regard, that adds,
> essentially, already outdated stuff that we'll need to support forever
> (I mean those extra fields in bpf_map_def, that will stay there
> forever).

Agree, adding these extensions for libbpf would be a step backwards
compared to using BTF defined map defs.

> We've discussed one way to deal with it, IMO, in a cleaner way. It can
> be done in few steps:
> 
> 1. I originally wanted BTF-defined map definitions to ignore unknown
> fields. It shouldn't be a default mode, but it should be supported
> (and of course is very easy to add). So let's add that and let libbpf
> ignore unknown stuff.
> 
> 2. Then to let iproute2 loader deal with backwards-compatibility for
> libbpf-incompatible bpf_elf_map, we need to "pass-through" all those
> fields so that users of libbpf (iproute2 loader, in this case) can
> make use of it. The easiest and cleanest way to do this is to expose
> BTF ID of a type describing each map entry and let iproute2 process
> that in whichever way it sees fit.
> 
> Luckily, bpf_elf_map is compatible in `type` field, which will let
> libbpf recognize bpf_elf_map as map definition. All the rest setup
> will be done by iproute2, by processing BTF of bpf_elf_map, which will
> let it set up map sizes, flags and do all of its map-in-map magic.
> 
> The only additions to libbpf in this case would be a new `__u32
> bpf_map__btf_id(struct bpf_map* map);` API.
> 
> I haven't written any code and haven't 100% checked that this will
> cover everything, but I think we should try. This will allow to let
> users of libbpf do custom stuff with map definitions without having to
> put all this extra logic into libbpf itself, which I think is
> desirable outcome.

Sounds reasonable in general, but all this still has the issue that we're
assuming that BTF is /always/ present. Existing object files that would load
just fine /today/ but do not have BTF attached won't be handled here. Wouldn't
it be more straight forward to allow passing callbacks to the libbpf loader
such that if the map section is not found to be bpf_map_def compatible, we
rely on external user aka callback to parse the ELF section, handle any
non-default libbpf behavior like pinning/retrieving from BPF fs, populate
related internal libbpf map data structures and pass control back to libbpf
loader afterwards. (Similar callback with prog section name handling for the
case where tail call maps get automatically populated.)

Thanks,
Daniel

^ permalink raw reply

* Re: [PATCHv2 net] ipv6/addrconf: allow adding multicast addr if IFA_F_MCAUTOJOIN is set
From: Hangbin Liu @ 2019-08-22  8:20 UTC (permalink / raw)
  To: David Ahern; +Cc: netdev, Madhu Challa, David S . Miller, Jianlin Shi
In-Reply-To: <4306235d-db31-bf06-9d26-ce19319feae3@gmail.com>

On Mon, Aug 19, 2019 at 10:33:58PM -0400, David Ahern wrote:
> On 8/19/19 10:19 PM, Hangbin Liu wrote:
> > But in ipv6_add_addr() it will check the address type and reject multicast
> > address directly. So this feature is never worked for IPv6.
> 
> If true, that is really disappointing.
> 
> We need to get a functional test script started for various address cases.

Do you mean an `ip addr add` testing for all kinds of address types?

Thanks
Hangbin

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox