Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 01/20] io_uring: add interface queue
From: David Wei @ 2023-11-07 21:40 UTC (permalink / raw)
  To: io-uring, netdev
  Cc: Jens Axboe, Pavel Begunkov, Jakub Kicinski, Paolo Abeni,
	David S. Miller, Eric Dumazet, Jesper Dangaard Brouer,
	David Ahern, Mina Almasry, Willem de Bruijn, Dragos Tatulea
In-Reply-To: <20231107214045.2172393-1-dw@davidwei.uk>

This patch introduces a new object in io_uring called an interface queue
(ifq) which contains:

* A pool region allocated by userspace and registered w/ io_uring where
  Rx data is written to.
* A net device and one specific Rx queue in it that will be configured
  for ZC Rx.
* A pair of shared ringbuffers w/ userspace, dubbed registered buf
  (rbuf) rings. Each entry contains a pool region id and an offset + len
  within that region. The kernel writes entries into the completion ring
  to tell userspace where RX data is relative to the start of a region.
  Userspace writes entries into the refill ring to tell the kernel when
  it is done with the data.

For now, each io_uring instance has a single ifq, and each ifq has a
single pool region associated with one Rx queue.

Add a new opcode to io_uring_register that sets up an ifq. Size and
offsets of shared ringbuffers are returned to userspace for it to mmap.
The implementation will be added in a later patch.

Co-developed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: David Wei <dw@davidwei.uk>
---
 include/linux/io_uring_types.h |  6 +++
 include/uapi/linux/io_uring.h  | 50 +++++++++++++++++++++
 io_uring/Makefile              |  3 +-
 io_uring/io_uring.c            |  8 ++++
 io_uring/kbuf.c                | 27 ++++++++++++
 io_uring/kbuf.h                |  5 +++
 io_uring/zc_rx.c               | 79 ++++++++++++++++++++++++++++++++++
 io_uring/zc_rx.h               | 34 +++++++++++++++
 8 files changed, 211 insertions(+), 1 deletion(-)
 create mode 100644 io_uring/zc_rx.c
 create mode 100644 io_uring/zc_rx.h

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 13d19b9be9f4..4f902e17b9c7 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -151,6 +151,10 @@ struct io_rings {
 	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
 };
 
+struct io_rbuf_ring {
+	struct io_uring		rq, cq;
+};
+
 struct io_restriction {
 	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
 	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
@@ -336,6 +340,8 @@ struct io_ring_ctx {
 	struct io_rsrc_data		*file_data;
 	struct io_rsrc_data		*buf_data;
 
+	struct io_zc_rx_ifq		*ifq;
+
 	/* protected by ->uring_lock */
 	struct list_head		rsrc_ref_list;
 	struct io_alloc_cache		rsrc_node_cache;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 8e61f8b7c2ce..84c82a789543 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -546,6 +546,9 @@ enum {
 	/* register a range of fixed file slots for automatic slot allocation */
 	IORING_REGISTER_FILE_ALLOC_RANGE	= 25,
 
+	/* register a network interface queue for zerocopy */
+	IORING_REGISTER_ZC_RX_IFQ		= 26,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
@@ -736,6 +739,53 @@ enum {
 	SOCKET_URING_OP_SIOCOUTQ,
 };
 
+struct io_uring_rbuf_rqe {
+	__u32	off;
+	__u32	len;
+	__u16	region;
+	__u8	__pad[6];
+};
+
+struct io_uring_rbuf_cqe {
+	__u32	off;
+	__u32	len;
+	__u16	region;
+	__u8	flags;
+	__u8	__pad[3];
+};
+
+struct io_rbuf_rqring_offsets {
+	__u32	head;
+	__u32	tail;
+	__u32	rqes;
+	__u8	__pad[4];
+};
+
+struct io_rbuf_cqring_offsets {
+	__u32	head;
+	__u32	tail;
+	__u32	cqes;
+	__u8	__pad[4];
+};
+
+/*
+ * Argument for IORING_REGISTER_ZC_RX_IFQ
+ */
+struct io_uring_zc_rx_ifq_reg {
+	__u32	if_idx;
+	/* hw rx descriptor ring id */
+	__u32	if_rxq_id;
+	__u32	region_id;
+	__u32	rq_entries;
+	__u32	cq_entries;
+	__u32	flags;
+	__u16	cpu;
+
+	__u32	mmap_sz;
+	struct io_rbuf_rqring_offsets rq_off;
+	struct io_rbuf_cqring_offsets cq_off;
+};
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/io_uring/Makefile b/io_uring/Makefile
index 8cc8e5387a75..7818b015a1f2 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -7,5 +7,6 @@ obj-$(CONFIG_IO_URING)		+= io_uring.o xattr.o nop.o fs.o splice.o \
 					openclose.o uring_cmd.o epoll.o \
 					statx.o net.o msg_ring.o timeout.o \
 					sqpoll.o fdinfo.o tctx.o poll.o \
-					cancel.o kbuf.o rsrc.o rw.o opdef.o notif.o
+					cancel.o kbuf.o rsrc.o rw.o opdef.o \
+					notif.o zc_rx.o
 obj-$(CONFIG_IO_WQ)		+= io-wq.o
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 783ed0fff71b..ae7f37aabe78 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -92,6 +92,7 @@
 #include "cancel.h"
 #include "net.h"
 #include "notif.h"
+#include "zc_rx.h"
 
 #include "timeout.h"
 #include "poll.h"
@@ -3160,6 +3161,7 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 	percpu_ref_kill(&ctx->refs);
 	xa_for_each(&ctx->personalities, index, creds)
 		io_unregister_personality(ctx, index);
+	io_unregister_zc_rx_ifq(ctx);
 	if (ctx->rings)
 		io_poll_remove_all(ctx, NULL, true);
 	mutex_unlock(&ctx->uring_lock);
@@ -4536,6 +4538,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_file_alloc_range(ctx, arg);
 		break;
+	case IORING_REGISTER_ZC_RX_IFQ:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_register_zc_rx_ifq(ctx, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 556f4df25b0f..30c3e5b20ab3 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -630,3 +630,30 @@ void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
 
 	return bl->buf_ring;
 }
+
+int io_allocate_rbuf_ring(struct io_zc_rx_ifq *ifq,
+			  struct io_uring_zc_rx_ifq_reg *reg)
+{
+	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
+	size_t off, size, rq_size, cq_size;
+	void *ptr;
+
+	off = sizeof(struct io_rbuf_ring);
+	rq_size = reg->rq_entries * sizeof(struct io_uring_rbuf_rqe);
+	cq_size = reg->cq_entries * sizeof(struct io_uring_rbuf_cqe);
+	size = off + rq_size + cq_size;
+	ptr = (void *) __get_free_pages(gfp, get_order(size));
+	if (!ptr)
+		return -ENOMEM;
+	ifq->ring = (struct io_rbuf_ring *)ptr;
+	ifq->rqes = (struct io_uring_rbuf_rqe *)((char *)ptr + off);
+	ifq->cqes = (struct io_uring_rbuf_cqe *)((char *)ifq->rqes + rq_size);
+
+	return 0;
+}
+
+void io_free_rbuf_ring(struct io_zc_rx_ifq *ifq)
+{
+	if (ifq->ring)
+		folio_put(virt_to_folio(ifq->ring));
+}
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index d14345ef61fc..6c8afda93646 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -4,6 +4,8 @@
 
 #include <uapi/linux/io_uring.h>
 
+#include "zc_rx.h"
+
 struct io_buffer_list {
 	/*
 	 * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not,
@@ -57,6 +59,9 @@ void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
 
 void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid);
 
+int io_allocate_rbuf_ring(struct io_zc_rx_ifq *ifq, struct io_uring_zc_rx_ifq_reg *reg);
+void io_free_rbuf_ring(struct io_zc_rx_ifq *ifq);
+
 static inline void io_kbuf_recycle_ring(struct io_kiocb *req)
 {
 	/*
diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c
new file mode 100644
index 000000000000..45dab29fe0ae
--- /dev/null
+++ b/io_uring/zc_rx.c
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: GPL-2.0
+#if defined(CONFIG_NET)
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/io_uring.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "io_uring.h"
+#include "kbuf.h"
+#include "zc_rx.h"
+
+static struct io_zc_rx_ifq *io_zc_rx_ifq_alloc(struct io_ring_ctx *ctx)
+{
+	struct io_zc_rx_ifq *ifq;
+
+	ifq = kzalloc(sizeof(*ifq), GFP_KERNEL);
+	if (!ifq)
+		return NULL;
+
+	ifq->ctx = ctx;
+
+	return ifq;
+}
+
+static void io_zc_rx_ifq_free(struct io_zc_rx_ifq *ifq)
+{
+	io_free_rbuf_ring(ifq);
+	kfree(ifq);
+}
+
+int io_register_zc_rx_ifq(struct io_ring_ctx *ctx,
+			  struct io_uring_zc_rx_ifq_reg __user *arg)
+{
+	struct io_uring_zc_rx_ifq_reg reg;
+	struct io_zc_rx_ifq *ifq;
+	int ret;
+
+	if (copy_from_user(&reg, arg, sizeof(reg)))
+		return -EFAULT;
+	if (ctx->ifq)
+		return -EBUSY;
+
+	ifq = io_zc_rx_ifq_alloc(ctx);
+	if (!ifq)
+		return -ENOMEM;
+
+	/* TODO: initialise network interface */
+
+	ret = io_allocate_rbuf_ring(ifq, &reg);
+	if (ret)
+		goto err;
+
+	/* TODO: map zc region and initialise zc pool */
+
+	ifq->rq_entries = reg.rq_entries;
+	ifq->cq_entries = reg.cq_entries;
+	ifq->if_rxq_id = reg.if_rxq_id;
+	ctx->ifq = ifq;
+
+	return 0;
+err:
+	io_zc_rx_ifq_free(ifq);
+	return ret;
+}
+
+int io_unregister_zc_rx_ifq(struct io_ring_ctx *ctx)
+{
+	struct io_zc_rx_ifq *ifq = ctx->ifq;
+
+	if (!ifq)
+		return -EINVAL;
+
+	ctx->ifq = NULL;
+	io_zc_rx_ifq_free(ifq);
+	return 0;
+}
+#endif
diff --git a/io_uring/zc_rx.h b/io_uring/zc_rx.h
new file mode 100644
index 000000000000..5f6d80c1c2b8
--- /dev/null
+++ b/io_uring/zc_rx.h
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef IOU_ZC_RX_H
+#define IOU_ZC_RX_H
+
+struct io_zc_rx_ifq {
+	struct io_ring_ctx	*ctx;
+	struct net_device	*dev;
+	struct io_rbuf_ring	*ring;
+	struct io_uring_rbuf_rqe *rqes;
+	struct io_uring_rbuf_cqe *cqes;
+	u32			rq_entries, cq_entries;
+	void			*pool;
+
+	/* hw rx descriptor ring id */
+	u32			if_rxq_id;
+};
+
+#if defined(CONFIG_NET)
+int io_register_zc_rx_ifq(struct io_ring_ctx *ctx,
+			  struct io_uring_zc_rx_ifq_reg __user *arg);
+int io_unregister_zc_rx_ifq(struct io_ring_ctx *ctx);
+#else
+static inline int io_register_zc_rx_ifq(struct io_ring_ctx *ctx,
+			  struct io_uring_zc_rx_ifq_reg __user *arg)
+{
+	return -EOPNOTSUPP;
+}
+static inline int io_unregister_zc_rx_ifq(struct io_ring_ctx *ctx)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
+#endif
-- 
2.39.3


^ permalink raw reply related

* [RFC PATCH v2 00/20] Zero copy Rx using io_uring
From: David Wei @ 2023-11-07 21:40 UTC (permalink / raw)
  To: io-uring, netdev
  Cc: Jens Axboe, Pavel Begunkov, Jakub Kicinski, Paolo Abeni,
	David S. Miller, Eric Dumazet, Jesper Dangaard Brouer,
	David Ahern, Mina Almasry, Willem de Bruijn, Dragos Tatulea

Changes in RFC v2:
------------------

* Added copy fallback support if userspace memory allocated for ZC Rx
  runs out, or if header splitting or flow steering fails.
* Added veth support for ZC Rx, for testing and demonstration. We will
  need to figure out what driver would be best for such testing
  functionality in the future. Perhaps netdevsim?
* Added socket registration API to io_uring to associate specific
  sockets with ifqs/Rx queues for ZC.
* Added multi-socket support, such that multiple connections can be
  steered into the same hardware Rx queue.
* Added Netbench server/client support.

Known deficiencies that we will address in a future patchset:

* Rebase on top of Kuba's page pool memory provider RFC.
* Proper test driver + selftests, maybe netdevsim.
* Further optimisation work.
* ...and more.

We are looking for feedback on our approach. Here are some example
points we would like to specifically discuss:

* Use of bpf_netdev_command to set up a hardware Rx queue for ZC?
* Tagging page private fields with a magic cookie to distinguish special
  userspace pages used for ZC Rx. This is used when reading skbs from a
  socket in io_uring to decide what to do.

This patchset is a proposal that adds zero copy network Rx to io_uring.
With it, userspace can register a region of host memory for receiving
data directly from a NIC using DMA, without needing a kernel to user
copy.

Full kernel tree including some out of tree BNXT changes:

https://github.com/spikeh/linux/tree/zcrx_sil

On the userspace side, support is added to both liburing and Netbench:

https://github.com/spikeh/liburing/tree/zcrx2
https://github.com/spikeh/netbench/tree/zcrx

If you would like to try out this patchset, build and run the kernel
tree then build Netbench using liburing, all from forks above.

Run setup.sh first:

https://gist.github.com/isilence/e6a28ce41a545a261566672104afa461

Then run the following commands:

sudo ip netns exec nsserv ./netbench --server_only 1 --v6 false \
    --rx "io_uring --provide_buffers 0 --use_zc 1 \
    --zc_pool_pages 16384 --zc_ifname ptp-serv" --use_port 9999

sudo ip netns exec nscl ./netbench --client_only 1 --v6 false \
    --tx "epoll --threads 1 --per_thread 1 --size 2800" \
    --host 10.10.10.20 --use_port 9999

Hardware support is added to the Broadcom BNXT driver. This patchset +
userspace code was tested on an Intel Xeon Platinum 8321HC CPU and
Broadcom BCM57504 NIC.

Early benchmarks using this prototype, with iperf3 as a load generator,
showed a ~50% reduction in overall system memory bandwidth as measured
using perf counters. Note that DDIO must be disabled on Intel systems.
Build Netbench using the modified liburing above.

This patchset is based on the work by Jonathan Lemon
<jonathan.lemon@gmail.com>:
https://lore.kernel.org/io-uring/20221108050521.3198458-1-jonathan.lemon@gmail.com/

David Wei (13):
  io_uring: add interface queue
  io_uring: add mmap support for shared ifq ringbuffers
  netdev: add XDP_SETUP_ZC_RX command
  io_uring: setup ZC for an Rx queue when registering an ifq
  io_uring: add ZC buf and pool
  io_uring: add ZC pool API
  skbuff: add SKBFL_FIXED_FRAG and skb_fixed()
  io_uring: allocate a uarg for freeing zero copy skbs
  io_uring: delay ZC pool destruction
  net: add data pool
  io_uring: add io_recvzc request
  bnxt: use data pool
  io_uring/zcrx: add multi socket support per Rx queue

Pavel Begunkov (7):
  io_uring/zcrx: implement socket registration
  io_uring/zcrx: propagate ifq down the stack
  io_uring/zcrx: introduce io_zc_get_rbuf_cqe
  io_uring/zcrx: add copy fallback
  net: execute custom callback from napi
  io_uring/zcrx: copy fallback to ring buffers
  veth: add support for io_uring zc rx

 drivers/net/ethernet/broadcom/bnxt/bnxt.c     |  61 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h     |   5 +
 drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c |   3 +
 drivers/net/veth.c                            | 179 +++-
 include/linux/io_uring.h                      |  33 +
 include/linux/io_uring_types.h                |   6 +
 include/linux/net.h                           |   2 +
 include/linux/netdevice.h                     |   7 +
 include/linux/skbuff.h                        |  10 +-
 include/net/busy_poll.h                       |   2 +
 include/net/data_pool.h                       |  74 ++
 include/net/netdev_rx_queue.h                 |   2 +
 include/uapi/linux/io_uring.h                 |  61 ++
 io_uring/Makefile                             |   3 +-
 io_uring/io_uring.c                           |  19 +
 io_uring/kbuf.c                               |  27 +
 io_uring/kbuf.h                               |   5 +
 io_uring/net.c                                | 136 ++-
 io_uring/opdef.c                              |  16 +
 io_uring/zc_rx.c                              | 967 ++++++++++++++++++
 io_uring/zc_rx.h                              |  69 ++
 net/core/dev.c                                |  51 +
 net/socket.c                                  |   1 +
 23 files changed, 1721 insertions(+), 18 deletions(-)
 create mode 100644 include/net/data_pool.h
 create mode 100644 io_uring/zc_rx.c
 create mode 100644 io_uring/zc_rx.h

-- 
2.39.3

^ permalink raw reply

* Re: [RFC PATCH v3 08/12] net: support non paged skb frags
From: Mina Almasry @ 2023-11-07 21:19 UTC (permalink / raw)
  To: Yunsheng Lin
  Cc: netdev, linux-kernel, linux-arch, linux-kselftest, linux-media,
	dri-devel, linaro-mm-sig, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Jesper Dangaard Brouer,
	Ilias Apalodimas, Arnd Bergmann, David Ahern, Willem de Bruijn,
	Shuah Khan, Sumit Semwal, Christian König, Shakeel Butt,
	Jeroen de Borst, Praveen Kaligineedi
In-Reply-To: <7e851882-9a85-3672-c3d5-73b47599873c@huawei.com>

On Tue, Nov 7, 2023 at 1:00 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2023/11/6 10:44, Mina Almasry wrote:
> > Make skb_frag_page() fail in the case where the frag is not backed
> > by a page, and fix its relevent callers to handle this case.
> >
> > Correctly handle skb_frag refcounting in the page_pool_iovs case.
> >
> > Signed-off-by: Mina Almasry <almasrymina@google.com>
> >
>
> ...
>
> >  /**
> >   * skb_frag_page - retrieve the page referred to by a paged fragment
> >   * @frag: the paged fragment
> >   *
> > - * Returns the &struct page associated with @frag.
> > + * Returns the &struct page associated with @frag. Returns NULL if this frag
> > + * has no associated page.
> >   */
> >  static inline struct page *skb_frag_page(const skb_frag_t *frag)
> >  {
> > -     return frag->bv_page;
> > +     if (!page_is_page_pool_iov(frag->bv_page))
> > +             return frag->bv_page;
> > +
> > +     return NULL;
>
> It seems most of callers don't expect NULL returning for skb_frag_page(),
> and this patch only changes a few relevant callers to handle the NULL case.
>

Yes, I did not change code that I guessed was not likely to be
affected or enable the devmem TCP case. Here is my breakdown:

➜  cos-kernel git:(tcpdevmem) ✗ ack -i "skb_frag_page\("
--ignore-dir=drivers -t cc -l
net/core/dev.c
net/core/datagram.c
net/core/xdp.c
net/core/skbuff.c
net/core/filter.c
net/core/gro.c
net/appletalk/ddp.c
net/wireless/util.c
net/tls/tls_device.c
net/tls/tls_device_fallback.c
net/ipv4/tcp.c
net/ipv4/tcp_output.c
net/bpf/test_run.c
include/linux/skbuff.h

I'm ignoring ank skb_frag_page() calls in drivers because drivers need
to add support for devmem TCP, and handle these calls at time of
adding support, I think that's reasonable.

net/core/dev.c:
I think I missed ilegal_highdma()

net/core/datagram.c:
__skb_datagram_iter() protected by not_readable(skb) check.

net/core/skbuff.c:
protected by not_readable(skb) check.

net/core/filter.c:
bpf_xdp_frags_shrink_tail seems like xdp specific, not sure it's relevant here.

net/core/gro.c:
skb_gro_reset_offset: protected by NULL check

net/ipv4/tcp.c:
tcp_zerocopy_receive protected by NULL check.

net/ipv4/tcp_output.c:
tcp_clone_payload: handles NULL return fine.

net/bpf/test_run.c:
seems xdp specific and not sure if it can run into devmem issues.

include/linux/skbuff.h:
I think the multiple calls here are being handled correctly, but let
me know if not.

All the calls in these files, I think, are code paths not possible to
hit devmem TCP with the current support, I think:
net/core/xdp.c
net/appletalk/ddp.c
net/wireless/util.c
net/tls/tls_device.c
net/tls/tls_device_fallback.c

All in all I think maybe all in all I missed illegal_highdma(). I'll
fix it in the next iteration.

> It may make more sense to add a new helper to do the above checking, and
> add a warning in skb_frag_page() to catch any missing NULL checking for
> skb_frag_page() caller, something like below?
>
>  static inline struct page *skb_frag_page(const skb_frag_t *frag)
>  {
> -       return frag->bv_page;
> +       struct page *page = frag->bv_page;
> +
> +       BUG_ON(page_is_page_pool_iov(page));
> +
> +       return page;
> +}
> +
> +static inline struct page *skb_frag_readable_page(const skb_frag_t *frag)
> +{
> +       struct page *page = frag->bv_page;
> +
> +       if (!page_is_page_pool_iov(page))
> +               return page;
> +
> +       return NULL;
>  }
>
>

My personal immediate reaction is that this may just introduce code
churn without significant benefit. If an unsuspecting caller call
skb_frag_page() on devmem frag and doesn't correctly handle NULL
return, it will crash or error out anyway, and likely in some obvious
way, so maybe the BUG_ON() isn't so useful that it's worth changing
all the call sites. But if there is consensus on adding a change like
you propose, I have no problem adding it.

-- 
Thanks,
Mina

^ permalink raw reply

* Re: [RFC PATCH v3 09/12] net: add support for skbs with unreadable frags
From: Eric Dumazet @ 2023-11-07 21:17 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: Mina Almasry, Willem de Bruijn, David Ahern, netdev, linux-kernel,
	linux-arch, linux-kselftest, linux-media, dri-devel,
	linaro-mm-sig, David S. Miller, Jakub Kicinski, Paolo Abeni,
	Jesper Dangaard Brouer, Ilias Apalodimas, Arnd Bergmann,
	Shuah Khan, Sumit Semwal, Christian König, Shakeel Butt,
	Jeroen de Borst, Praveen Kaligineedi, Willem de Bruijn,
	Kaiyuan Zhang
In-Reply-To: <ZUqms8QzQpfPQWyy@google.com>

On Tue, Nov 7, 2023 at 10:05 PM Stanislav Fomichev <sdf@google.com> wrote:

>
> I don't understand. We require an elaborate setup to receive devmem cmsgs,
> why would some random application receive those?

A TCP socket can receive 'valid TCP packets' from many different sources,
especially with BPF hooks...

Think of a bonding setup, packets being mirrored by some switches or
even from tc.

Better double check than be sorry.

We have not added a 5th component in the 4-tuple lookups, being "is
this socket a devmem one".

A mix of regular/devmem skb is supported.

^ permalink raw reply

* [PATCH net,v3, 2/2] hv_netvsc: Fix race of register_netdevice_notifier and VF register
From: Haiyang Zhang @ 2023-11-07 21:05 UTC (permalink / raw)
  To: linux-hyperv, netdev
  Cc: haiyangz, kys, wei.liu, decui, edumazet, kuba, pabeni, davem,
	linux-kernel, stable
In-Reply-To: <1699391132-30317-1-git-send-email-haiyangz@microsoft.com>

If VF NIC is registered earlier, NETDEV_REGISTER event is replayed,
but NETDEV_POST_INIT is not.

Move register_netdevice_notifier() earlier, so the call back
function is set before probing.

Cc: stable@vger.kernel.org
Fixes: e04e7a7bbd4b ("hv_netvsc: Fix a deadlock by getting rtnl lock earlier in netvsc_probe()")
Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Wojciech Drewek <wojciech.drewek@intel.com>

---
v3:
  Divide it into two patches, suggested by Jakub Kicinski.

v2:
  Fix rtnl_unlock() in error handling as found by Wojciech Drewek.
---
 drivers/net/hyperv/netvsc_drv.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 5e528a76f5f5..1d1491da303b 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -2793,11 +2793,14 @@ static int __init netvsc_drv_init(void)
 	}
 	netvsc_ring_bytes = ring_size * PAGE_SIZE;
 
+	register_netdevice_notifier(&netvsc_netdev_notifier);
+
 	ret = vmbus_driver_register(&netvsc_drv);
-	if (ret)
+	if (ret) {
+		unregister_netdevice_notifier(&netvsc_netdev_notifier);
 		return ret;
+	}
 
-	register_netdevice_notifier(&netvsc_netdev_notifier);
 	return 0;
 }
 
-- 
2.25.1


^ permalink raw reply related

* [PATCH net,v3, 1/2] hv_netvsc: Fix race of netvsc and VF register_netdevice
From: Haiyang Zhang @ 2023-11-07 21:05 UTC (permalink / raw)
  To: linux-hyperv, netdev
  Cc: haiyangz, kys, wei.liu, decui, edumazet, kuba, pabeni, davem,
	linux-kernel, stable
In-Reply-To: <1699391132-30317-1-git-send-email-haiyangz@microsoft.com>

The rtnl lock also needs to be held before rndis_filter_device_add()
which advertises nvsp_2_vsc_capability / sriov bit, and triggers
VF NIC offering and registering. If VF NIC finished register_netdev()
earlier it may cause name based config failure.

To fix this issue, move the call to rtnl_lock() before
rndis_filter_device_add(), so VF will be registered later than netvsc
/ synthetic NIC, and gets a name numbered (ethX) after netvsc.

Cc: stable@vger.kernel.org
Fixes: e04e7a7bbd4b ("hv_netvsc: Fix a deadlock by getting rtnl lock earlier in netvsc_probe()")
Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Wojciech Drewek <wojciech.drewek@intel.com>

---
v3:
  Divide it into two patches, suggested by Jakub Kicinski.

v2:
  Fix rtnl_unlock() in error handling as found by Wojciech Drewek.
---
 drivers/net/hyperv/netvsc_drv.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 3ba3c8fb28a5..5e528a76f5f5 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -2531,15 +2531,6 @@ static int netvsc_probe(struct hv_device *dev,
 		goto devinfo_failed;
 	}
 
-	nvdev = rndis_filter_device_add(dev, device_info);
-	if (IS_ERR(nvdev)) {
-		ret = PTR_ERR(nvdev);
-		netdev_err(net, "unable to add netvsc device (ret %d)\n", ret);
-		goto rndis_failed;
-	}
-
-	eth_hw_addr_set(net, device_info->mac_adr);
-
 	/* We must get rtnl lock before scheduling nvdev->subchan_work,
 	 * otherwise netvsc_subchan_work() can get rtnl lock first and wait
 	 * all subchannels to show up, but that may not happen because
@@ -2547,9 +2538,23 @@ static int netvsc_probe(struct hv_device *dev,
 	 * -> ... -> device_add() -> ... -> __device_attach() can't get
 	 * the device lock, so all the subchannels can't be processed --
 	 * finally netvsc_subchan_work() hangs forever.
+	 *
+	 * The rtnl lock also needs to be held before rndis_filter_device_add()
+	 * which advertises nvsp_2_vsc_capability / sriov bit, and triggers
+	 * VF NIC offering and registering. If VF NIC finished register_netdev()
+	 * earlier it may cause name based config failure.
 	 */
 	rtnl_lock();
 
+	nvdev = rndis_filter_device_add(dev, device_info);
+	if (IS_ERR(nvdev)) {
+		ret = PTR_ERR(nvdev);
+		netdev_err(net, "unable to add netvsc device (ret %d)\n", ret);
+		goto rndis_failed;
+	}
+
+	eth_hw_addr_set(net, device_info->mac_adr);
+
 	if (nvdev->num_chn > 1)
 		schedule_work(&nvdev->subchan_work);
 
@@ -2586,9 +2591,9 @@ static int netvsc_probe(struct hv_device *dev,
 	return 0;
 
 register_failed:
-	rtnl_unlock();
 	rndis_filter_device_remove(dev, nvdev);
 rndis_failed:
+	rtnl_unlock();
 	netvsc_devinfo_put(device_info);
 devinfo_failed:
 	free_percpu(net_device_ctx->vf_stats);
-- 
2.25.1


^ permalink raw reply related

* [PATCH net,v3, 0/2] hv_netvsc: fix race of netvsc and VF register
From: Haiyang Zhang @ 2023-11-07 21:05 UTC (permalink / raw)
  To: linux-hyperv, netdev
  Cc: haiyangz, kys, wei.liu, decui, edumazet, kuba, pabeni, davem,
	linux-kernel

There are some races between netvsc probe, set notifier and VF register.
This patch set fixes them.

Haiyang Zhang (2):
  hv_netvsc: Fix race of netvsc and VF register_netdevice
  hv_netvsc: Fix race of register_netdevice_notifier and VF register

 drivers/net/hyperv/netvsc_drv.c | 32 ++++++++++++++++++++------------
 1 file changed, 20 insertions(+), 12 deletions(-)

-- 
2.25.1


^ permalink raw reply

* Re: [RFC PATCH v3 09/12] net: add support for skbs with unreadable frags
From: Stanislav Fomichev @ 2023-11-07 21:05 UTC (permalink / raw)
  To: Mina Almasry
  Cc: Willem de Bruijn, David Ahern, netdev, linux-kernel, linux-arch,
	linux-kselftest, linux-media, dri-devel, linaro-mm-sig,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Jesper Dangaard Brouer, Ilias Apalodimas, Arnd Bergmann,
	Shuah Khan, Sumit Semwal, Christian König, Shakeel Butt,
	Jeroen de Borst, Praveen Kaligineedi, Willem de Bruijn,
	Kaiyuan Zhang
In-Reply-To: <CAHS8izNTDsHTahkd17zQVQnjzniZAk-dKNs-Mq0E4shdrXOJbg@mail.gmail.com>

On 11/07, Mina Almasry wrote:
> On Mon, Nov 6, 2023 at 5:06 PM Stanislav Fomichev <sdf@google.com> wrote:
> [..]
> > > > > And the socket has to know this association; otherwise those tokens
> > > > > are useless since they don't carry anything to identify the dmabuf.
> > > > >
> > > > > I think my other issue with MSG_SOCK_DEVMEM being on recvmsg is that
> > > > > it somehow implies that I have an option of passing or not passing it
> > > > > for an individual system call.
> > >
> > > You do have the option of passing it or not passing it per system
> > > call. The MSG_SOCK_DEVMEM says the application is willing to receive
> > > devmem cmsgs - that's all. The application doesn't get to decide
> > > whether it's actually going to receive a devmem cmsg or not, because
> > > that's dictated by the type of skb that is present in the receive
> > > queue, and not up to the application. I should explain this in the
> > > commit message...
> >
> > What would be the case of passing it or not passing it? Some fallback to
> > the host memory after flow steering update? Yeah, would be useful to
> > document those constrains. I'd lean on starting stricter and relaxing
> > those conditions if we find the use-cases.
> >
> 
> MSG_SOCK_DEVMEM (or its replacement SOCK_DEVMEM or SO_SOCK_DEVMEM),
> just says that the application is able to receive devmem cmsgs and
> will parse them. The use case for not setting that flag is existing
> applications that are not aware of devmem cmsgs. I don't want those
> applications to think they're receiving data in the linear buffer only
> to find out that the data is in devmem and they ignored the devmem
> cmsg.
> 
> So, what happens:
> 
> - MSG_SOCK_DEVMEM provided and next skb in the queue is devmem:
> application receives cmsgs.
> - MSG_SOCK_DEVMEM provided and next skb in the queue is non-devmem:
> application receives in the linear buffer.
> - MSG_SOCK_DEVMEM not provided and net skb is devmem: application
> receives EFAULT.
> - MSG_SOCK_DEVMEM not provided and next skb is non-devmem: application
> receives in the linear buffer.
> 
> My bad on not including some docs about this. The next version should
> have the commit message beefed up to explain all this, or a docs
> patch.

I don't understand. We require an elaborate setup to receive devmem cmsgs,
why would some random application receive those?

^ permalink raw reply

* [PATCH RESEND] ptp: Fixes a null pointer dereference in ptp_ioctl
From: Yuran Pereira @ 2023-11-07 20:48 UTC (permalink / raw)
  To: richardcochran, netdev
  Cc: Yuran Pereira, eadavis, davem, reibax, linux-kernel,
	linux-kernel-mentees, syzbot+8a78ecea7ac1a2ea26e5

Syzkaller found a null pointer dereference in ptp_ioctl
originating from the lack of a null check for tsevq.

```
general protection fault, probably for non-canonical
	address 0xdffffc000000020b: 0000 [#1] PREEMPT SMP KASAN
KASAN: probably user-memory-access in range
	[0x0000000000001058-0x000000000000105f]
CPU: 0 PID: 5053 Comm: syz-executor353 Not tainted
	6.6.0-syzkaller-10396-g4652b8e4f3ff #0
Hardware name: Google Google Compute Engine/Google Compute Engine,
	BIOS Google 10/09/2023
RIP: 0010:ptp_ioctl+0xcb7/0x1d10 drivers/ptp/ptp_chardev.c:476
...
Call Trace:
 <TASK>
 posix_clock_ioctl+0xf8/0x160 kernel/time/posix-clock.c:86
 vfs_ioctl fs/ioctl.c:51 [inline]
 __do_sys_ioctl fs/ioctl.c:871 [inline]
 __se_sys_ioctl fs/ioctl.c:857 [inline]
 __x64_sys_ioctl+0x18f/0x210 fs/ioctl.c:857
 do_syscall_x64 arch/x86/entry/common.c:51 [inline]
 do_syscall_64+0x3f/0x110 arch/x86/entry/common.c:82
 entry_SYSCALL_64_after_hwframe+0x63/0x6b
```

This patch fixes the issue by adding a check for tsevq and
ensuring ptp_ioctl returns with an error if tsevq is null.

Reported-by: syzbot+8a78ecea7ac1a2ea26e5@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=8a78ecea7ac1a2ea26e5
Fixes: c5a445b1e934 ("ptp: support event queue reader channel masks")
Signed-off-by: Yuran Pereira <yuran.pereira@hotmail.com>
---
 drivers/ptp/ptp_chardev.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c
index 282cd7d24077..5b36c34629a0 100644
--- a/drivers/ptp/ptp_chardev.c
+++ b/drivers/ptp/ptp_chardev.c
@@ -173,6 +173,8 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd,
 	int enable, err = 0;
 
 	tsevq = pccontext->private_clkdata;
+	if (!tsevq)
+		return -EINVAL;
 
 	switch (cmd) {
 
-- 
2.25.1


^ permalink raw reply related

* Re: [PATCH net-next] tcp: make the first N SYN RTO backoffs linear
From: Eric Dumazet @ 2023-11-07 20:26 UTC (permalink / raw)
  To: David Morley
  Cc: David Miller, Jakub Kicinski, netdev, David Morley, Yuchung Cheng,
	Neal Cardwell
In-Reply-To: <20230509180558.2541885-1-morleyd.kernel@gmail.com>

On Tue, May 9, 2023 at 8:06 PM David Morley <morleyd.kernel@gmail.com> wrote:
>
> From: David Morley <morleyd@google.com>
>
> Currently the SYN RTO schedule follows an exponential backoff
> scheme, which can be unnecessarily conservative in cases where
> there are link failures. In such cases, it's better to
> aggressively try to retransmit packets, so it takes routers
> less time to find a repath with a working link.
>
> We chose a default value for this sysctl of 4, to follow
> the macOS and IOS backoff scheme of 1,1,1,1,1,2,4,8, ...
> MacOS and IOS have used this backoff schedule for over
> a decade, since before this 2009 IETF presentation
> discussed the behavior:
> https://www.ietf.org/proceedings/75/slides/tcpm-1.pdf
>
> This commit makes the SYN RTO schedule start with a number of
> linear backoffs given by the following sysctl:
> * tcp_syn_linear_timeouts
>
> This changes the SYN RTO scheme to be: init_rto_val for
> tcp_syn_linear_timeouts, exp backoff starting at init_rto_val
>
> For example if init_rto_val = 1 and tcp_syn_linear_timeouts = 2, our
> backoff scheme would be: 1, 1, 1, 2, 4, 8, 16, ...
>
> Signed-off-by: David Morley <morleyd@google.com>
> Signed-off-by: Yuchung Cheng <ycheng@google.com>
> Signed-off-by: Neal Cardwell <ncardwell@google.com>
> Tested-by: David Morley <morleyd@google.com>
> Reviewed-by: Eric Dumazet <edumazet@google.com>
> ---
>  Documentation/networking/ip-sysctl.rst | 17 ++++++++++++++---
>  include/net/netns/ipv4.h               |  1 +
>  net/ipv4/sysctl_net_ipv4.c             | 10 ++++++++++
>  net/ipv4/tcp_ipv4.c                    |  1 +
>  net/ipv4/tcp_timer.c                   | 17 +++++++++++++----
>  5 files changed, 39 insertions(+), 7 deletions(-)
>
> diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
> index 6ec06a33688a..3f6d3d5f5626 100644
> --- a/Documentation/networking/ip-sysctl.rst
> +++ b/Documentation/networking/ip-sysctl.rst
> @@ -881,9 +881,10 @@ tcp_fastopen_key - list of comma separated 32-digit hexadecimal INTEGERs
>  tcp_syn_retries - INTEGER
>         Number of times initial SYNs for an active TCP connection attempt
>         will be retransmitted. Should not be higher than 127. Default value
> -       is 6, which corresponds to 63seconds till the last retransmission
> -       with the current initial RTO of 1second. With this the final timeout
> -       for an active TCP connection attempt will happen after 127seconds.
> +       is 6, which corresponds to 67seconds (with tcp_syn_linear_timeouts = 4)
> +       till the last retransmission with the current initial RTO of 1second.
> +       With this the final timeout for an active TCP connection attempt
> +       will happen after 131seconds.
>
>  tcp_timestamps - INTEGER
>         Enable timestamps as defined in RFC1323.
> @@ -946,6 +947,16 @@ tcp_pacing_ca_ratio - INTEGER
>
>         Default: 120
>
> +tcp_syn_linear_timeouts - INTEGER
> +       The number of times for an active TCP connection to retransmit SYNs with
> +       a linear backoff timeout before defaulting to an exponential backoff
> +       timeout. This has no effect on SYNACK at the passive TCP side.
> +
> +       With an initial RTO of 1 and tcp_syn_linear_timeouts = 4 we would
> +       expect SYN RTOs to be: 1, 1, 1, 1, 1, 2, 4, ... (4 linear timeouts,
> +       and the first exponential backoff using 2^0 * initial_RTO).
> +       Default: 4
> +
>  tcp_tso_win_divisor - INTEGER
>         This allows control over what percentage of the congestion window
>         can be consumed by a single TSO frame.
> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
> index db762e35aca9..a4efb7a2796c 100644
> --- a/include/net/netns/ipv4.h
> +++ b/include/net/netns/ipv4.h
> @@ -194,6 +194,7 @@ struct netns_ipv4 {
>         int sysctl_udp_rmem_min;
>
>         u8 sysctl_fib_notify_on_flag_change;
> +       u8 sysctl_tcp_syn_linear_timeouts;
>
>  #ifdef CONFIG_NET_L3_MASTER_DEV
>         u8 sysctl_udp_l3mdev_accept;
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index 40fe70fc2015..6ae3345a3bdf 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -34,6 +34,7 @@ static int ip_ttl_min = 1;
>  static int ip_ttl_max = 255;
>  static int tcp_syn_retries_min = 1;
>  static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
> +static int tcp_syn_linear_timeouts_max = MAX_TCP_SYNCNT;
>  static int ip_ping_group_range_min[] = { 0, 0 };
>  static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
>  static u32 u32_max_div_HZ = UINT_MAX / HZ;
> @@ -1470,6 +1471,15 @@ static struct ctl_table ipv4_net_table[] = {
>                 .extra1         = SYSCTL_ZERO,
>                 .extra2         = &tcp_plb_max_cong_thresh,
>         },
> +       {
> +               .procname = "tcp_syn_linear_timeouts",
> +               .data = &init_net.ipv4.sysctl_tcp_syn_linear_timeouts,
> +               .maxlen = sizeof(u8),
> +               .mode = 0644,
> +               .proc_handler = proc_dou8vec_minmax,
> +               .extra1 = SYSCTL_ZERO,
> +               .extra2 = &tcp_syn_linear_timeouts_max,
> +       },
>         { }
>  };
>
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 39bda2b1066e..db24ed8f8509 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -3275,6 +3275,7 @@ static int __net_init tcp_sk_init(struct net *net)
>         else
>                 net->ipv4.tcp_congestion_control = &tcp_reno;
>
> +       net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
>         return 0;
>  }
>
> diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
> index b839c2f91292..0d93a2573807 100644
> --- a/net/ipv4/tcp_timer.c
> +++ b/net/ipv4/tcp_timer.c
> @@ -234,14 +234,19 @@ static int tcp_write_timeout(struct sock *sk)
>         struct tcp_sock *tp = tcp_sk(sk);
>         struct net *net = sock_net(sk);
>         bool expired = false, do_reset;
> -       int retry_until;
> +       int retry_until, max_retransmits;
>
>         if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
>                 if (icsk->icsk_retransmits)
>                         __dst_negative_advice(sk);
>                 retry_until = icsk->icsk_syn_retries ? :
>                         READ_ONCE(net->ipv4.sysctl_tcp_syn_retries);
> -               expired = icsk->icsk_retransmits >= retry_until;
> +
> +               max_retransmits = retry_until;
> +               if (sk->sk_state == TCP_SYN_SENT)
> +                       max_retransmits += READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts);
> +
> +               expired = icsk->icsk_retransmits >= max_retransmits;
>         } else {
>                 if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1), 0)) {
>                         /* Black hole detection */
> @@ -577,8 +582,12 @@ void tcp_retransmit_timer(struct sock *sk)
>             icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
>                 icsk->icsk_backoff = 0;
>                 icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
> -       } else {
> -               /* Use normal (exponential) backoff */
> +       } else if (sk->sk_state != TCP_SYN_SENT ||
> +                  icsk->icsk_backoff >
> +                  READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts)) {
> +               /* Use normal (exponential) backoff unless linear timeouts are
> +                * activated.
> +                */

Hi David, back to this patch.

If sysctl_tcp_syn_linear_timeouts is set to a high value, we could end up with
 icsk->icsk_backoff > 64

This can generate various overflows later, eg from inet_csk_rto_backoff(),
called from tcp_ld_RTO_revert()

More generally tcp_ld_RTO_revert() is not aware of linear timeouts.

Thank you.

^ permalink raw reply

* Re: [RFC PATCH v3 09/12] net: add support for skbs with unreadable frags
From: Mina Almasry @ 2023-11-07 19:53 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: Willem de Bruijn, David Ahern, netdev, linux-kernel, linux-arch,
	linux-kselftest, linux-media, dri-devel, linaro-mm-sig,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Jesper Dangaard Brouer, Ilias Apalodimas, Arnd Bergmann,
	Shuah Khan, Sumit Semwal, Christian König, Shakeel Butt,
	Jeroen de Borst, Praveen Kaligineedi, Willem de Bruijn,
	Kaiyuan Zhang
In-Reply-To: <ZUmNk98LyO_Ntcy7@google.com>

On Mon, Nov 6, 2023 at 5:06 PM Stanislav Fomichev <sdf@google.com> wrote:
[..]
> > > > And the socket has to know this association; otherwise those tokens
> > > > are useless since they don't carry anything to identify the dmabuf.
> > > >
> > > > I think my other issue with MSG_SOCK_DEVMEM being on recvmsg is that
> > > > it somehow implies that I have an option of passing or not passing it
> > > > for an individual system call.
> >
> > You do have the option of passing it or not passing it per system
> > call. The MSG_SOCK_DEVMEM says the application is willing to receive
> > devmem cmsgs - that's all. The application doesn't get to decide
> > whether it's actually going to receive a devmem cmsg or not, because
> > that's dictated by the type of skb that is present in the receive
> > queue, and not up to the application. I should explain this in the
> > commit message...
>
> What would be the case of passing it or not passing it? Some fallback to
> the host memory after flow steering update? Yeah, would be useful to
> document those constrains. I'd lean on starting stricter and relaxing
> those conditions if we find the use-cases.
>

MSG_SOCK_DEVMEM (or its replacement SOCK_DEVMEM or SO_SOCK_DEVMEM),
just says that the application is able to receive devmem cmsgs and
will parse them. The use case for not setting that flag is existing
applications that are not aware of devmem cmsgs. I don't want those
applications to think they're receiving data in the linear buffer only
to find out that the data is in devmem and they ignored the devmem
cmsg.

So, what happens:

- MSG_SOCK_DEVMEM provided and next skb in the queue is devmem:
application receives cmsgs.
- MSG_SOCK_DEVMEM provided and next skb in the queue is non-devmem:
application receives in the linear buffer.
- MSG_SOCK_DEVMEM not provided and net skb is devmem: application
receives EFAULT.
- MSG_SOCK_DEVMEM not provided and next skb is non-devmem: application
receives in the linear buffer.

My bad on not including some docs about this. The next version should
have the commit message beefed up to explain all this, or a docs
patch.


-- 
Thanks,
Mina

^ permalink raw reply

* Re: [net-next RFC PATCH v5 4/4] dt-bindings: Document bindings for Marvell Aquantia PHY
From: Rob Herring @ 2023-11-07 19:22 UTC (permalink / raw)
  To: Christian Marangi
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Krzysztof Kozlowski, Conor Dooley, Andrew Lunn, Heiner Kallweit,
	Russell King, Robert Marko, Vladimir Oltean, netdev, devicetree,
	linux-kernel
In-Reply-To: <20231106165433.2746-4-ansuelsmth@gmail.com>

On Mon, Nov 06, 2023 at 05:54:33PM +0100, Christian Marangi wrote:
> Document bindings for Marvell Aquantia PHY.

For the subject: dt-bindings: net: Add Marvell Aquantia PHY

('Document bindings' is redundant)

> 
> The Marvell Aquantia PHY require a firmware to work correctly and there
> at least 3 way to load this firmware.
> 
> Describe all the different way and document the binding "firmware-name"
> to load the PHY firmware from userspace.
> 
> Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
> ---
> Changes v5:
> - Drop extra entry not related to HW description
> Changes v3:
> - Make DT description more OS agnostic
> - Use custom select to fix dtbs checks
> Changes v2:
> - Add DT patch
> 
>  .../bindings/net/marvell,aquantia.yaml        | 123 ++++++++++++++++++
>  1 file changed, 123 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/net/marvell,aquantia.yaml
> 
> diff --git a/Documentation/devicetree/bindings/net/marvell,aquantia.yaml b/Documentation/devicetree/bindings/net/marvell,aquantia.yaml
> new file mode 100644
> index 000000000000..7106c5bdf73c
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/net/marvell,aquantia.yaml
> @@ -0,0 +1,123 @@
> +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
> +%YAML 1.2
> +---
> +$id: http://devicetree.org/schemas/net/marvell,aquantia.yaml#
> +$schema: http://devicetree.org/meta-schemas/core.yaml#
> +
> +title: Marvell Aquantia Ethernet PHY
> +
> +maintainers:
> +  - Christian Marangi <ansuelsmth@gmail.com>
> +
> +description: |
> +  Marvell Aquantia Ethernet PHY require a firmware to be loaded to actually
> +  work.
> +
> +  This can be done and is implemented by OEM in 3 different way:
> +    - Attached SPI flash directly to the PHY with the firmware. The PHY
> +      will self load the firmware in the presence of this configuration.
> +    - Dedicated partition on system NAND with firmware in it. NVMEM
> +      subsystem will be used and the declared NVMEM cell will load
> +      the firmware to the PHY using the PHY mailbox interface.
> +    - Manually provided firmware loaded from a file in the filesystem.
> +
> +allOf:
> +  - $ref: ethernet-phy.yaml#
> +
> +select:
> +  properties:
> +    compatible:
> +      contains:
> +        enum:
> +          - ethernet-phy-id03a1.b445
> +          - ethernet-phy-id03a1.b460
> +          - ethernet-phy-id03a1.b4a2
> +          - ethernet-phy-id03a1.b4d0
> +          - ethernet-phy-id03a1.b4e0
> +          - ethernet-phy-id03a1.b5c2
> +          - ethernet-phy-id03a1.b4b0
> +          - ethernet-phy-id03a1.b662
> +          - ethernet-phy-id03a1.b712
> +          - ethernet-phy-id31c3.1c12
> +  required:
> +    - compatible
> +
> +properties:
> +  reg:
> +    maxItems: 1
> +
> +  firmware-name:
> +    description: specify the name of PHY firmware to load
> +
> +  nvmem-cells:
> +    description: phandle to the firmware nvmem cell
> +    maxItems: 1
> +
> +  nvmem-cell-names:
> +    const: firmware
> +
> +required:
> +  - compatible
> +  - reg
> +
> +unevaluatedProperties: false
> +
> +examples:
> +  - |
> +    mdio {
> +        #address-cells = <1>;
> +        #size-cells = <0>;
> +
> +        ethernet-phy@0 {
> +            /*  Only needed to make DT lint tools work. Do not copy/paste
> +             *  into real DTS files.
> +             */

I don't agree with this statement. Pretty sure we've been through this 
before...

If we have a node, we need to define what it is. The way that is done is 
with compatible. Whether some particular OS implementation (currently) 
needs compatible or not is irrelevant. It's not about dtschema needing 
it, that just exposes the issue.

These MDIO PHY bindings are all broken because they are never actually 
applied to anything.

Rob

^ permalink raw reply

* Re: [PATCH] Fixes a null pointer dereference in ptp_ioctl
From: Yuran Pereira @ 2023-11-07 19:18 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: richardcochran, netdev, linux-kernel, linux-kernel-mentees,
	syzbot+8a78ecea7ac1a2ea26e5
In-Reply-To: <20231107100034.09786df1@kernel.org>

Hey Jakub,
On Tue, Nov 07, 2023 at 10:00:34AM -0800, Jakub Kicinski wrote:
> 
> Just Link:
> 
> > Reported-by: syzbot+8a78ecea7ac1a2ea26e5@syzkaller.appspotmail.com
> > Fixes: c5a445b1e934 ("ptp: support event queue reader channel masks")
> > 
> 
> No empty lines between tags.
> 
Rookie mistake from me. I will pay better attention next time.

> 
> When you repost please make sure you CC everyone get_maintainer 
> (run on  the patch file, not the paths) points out.
Interesting, I always run get_maintainer on the modified file
will do it on the patch file too from now on.

> And CC Edward Adam Davis <eadavis@qq.com> since his fixing similar
> issues.
Alright, I'll do that.

Thank you for the feedback.

Yuran

^ permalink raw reply

* Re: net: bcmasp: Use common error handling code in bcmasp_probe()
From: Justin Chen @ 2023-11-07 18:48 UTC (permalink / raw)
  To: Markus Elfring
  Cc: Jakub Kicinski, Wojciech Drewek, Julia Lawall, David S. Miller,
	Eric Dumazet, Florian Fainelli, Paolo Abeni,
	bcm-kernel-feedback-list, netdev, kernel-janitors, cocci, LKML,
	Simon Horman
In-Reply-To: <dce77105-47ab-4ec7-8d46-b983c630dad8@web.de>

[-- Attachment #1: Type: text/plain, Size: 749 bytes --]

On Mon, Nov 6, 2023 at 10:38 PM Markus Elfring <Markus.Elfring@web.de> wrote:
>
> >> Add a jump target so that a bit of exception handling can be better
> >> reused at the end of this function.
> …
> >> ---
> >>  drivers/net/ethernet/broadcom/asp2/bcmasp.c | 10 ++++++----
> >>  1 file changed, 6 insertions(+), 4 deletions(-)
> >
> > The diffstat proves otherwise.
> > Please don't send such patches to networking.
>
> How does this feedback fit to a change possibility which was reviewed by
> Wojciech Drewek yesterday?
>
> Regards,
> Markus

We are making the code harder to follow with these changes. Also
adding more lines than removing. Don't think this patch is an
improvement IMHO. NAK on my end.

Thanks,
Justin

[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 4206 bytes --]

^ permalink raw reply

* [PATCH v9 2/2] can: esd: add support for esd GmbH PCIe/402 CAN interface family
From: Stefan Mätje @ 2023-11-07 18:41 UTC (permalink / raw)
  To: Marc Kleine-Budde, linux-can, netdev, linux-kernel
  Cc: Wolfgang Grandegger, David S . Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni
In-Reply-To: <20231107184103.2802678-1-stefan.maetje@esd.eu>

This patch adds support for the PCI based PCIe/402 CAN interface family
from esd GmbH that is available with various form factors
(https://esd.eu/en/products/402-series-can-interfaces).

All boards utilize a FPGA based CAN controller solution developed
by esd (esdACC). For more information on the esdACC see
https://esd.eu/en/products/esdacc.

This driver detects all available CAN interface board variants of
the family but atm. operates the CAN-FD capable devices in
Classic-CAN mode only! A later patch will introduce the CAN-FD
functionality in this driver.

Co-developed-by: Thomas Körper <thomas.koerper@esd.eu>
Signed-off-by: Thomas Körper <thomas.koerper@esd.eu>
Signed-off-by: Stefan Mätje <stefan.maetje@esd.eu>
---
 drivers/net/can/Kconfig                |   1 +
 drivers/net/can/Makefile               |   1 +
 drivers/net/can/esd/Kconfig            |  12 +
 drivers/net/can/esd/Makefile           |   7 +
 drivers/net/can/esd/esd_402_pci-core.c | 512 ++++++++++++++++
 drivers/net/can/esd/esdacc.c           | 771 +++++++++++++++++++++++++
 drivers/net/can/esd/esdacc.h           | 393 +++++++++++++
 7 files changed, 1697 insertions(+)
 create mode 100644 drivers/net/can/esd/Kconfig
 create mode 100644 drivers/net/can/esd/Makefile
 create mode 100644 drivers/net/can/esd/esd_402_pci-core.c
 create mode 100644 drivers/net/can/esd/esdacc.c
 create mode 100644 drivers/net/can/esd/esdacc.h

diff --git a/drivers/net/can/Kconfig b/drivers/net/can/Kconfig
index 649453a3c858..b9a65b44ccef 100644
--- a/drivers/net/can/Kconfig
+++ b/drivers/net/can/Kconfig
@@ -217,6 +217,7 @@ config CAN_XILINXCAN
 source "drivers/net/can/c_can/Kconfig"
 source "drivers/net/can/cc770/Kconfig"
 source "drivers/net/can/ctucanfd/Kconfig"
+source "drivers/net/can/esd/Kconfig"
 source "drivers/net/can/ifi_canfd/Kconfig"
 source "drivers/net/can/m_can/Kconfig"
 source "drivers/net/can/mscan/Kconfig"
diff --git a/drivers/net/can/Makefile b/drivers/net/can/Makefile
index ff8f76295d13..4669cd51e7bf 100644
--- a/drivers/net/can/Makefile
+++ b/drivers/net/can/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_CAN_VXCAN)		+= vxcan.o
 obj-$(CONFIG_CAN_SLCAN)		+= slcan/
 
 obj-y				+= dev/
+obj-y				+= esd/
 obj-y				+= rcar/
 obj-y				+= spi/
 obj-y				+= usb/
diff --git a/drivers/net/can/esd/Kconfig b/drivers/net/can/esd/Kconfig
new file mode 100644
index 000000000000..54bfc366634c
--- /dev/null
+++ b/drivers/net/can/esd/Kconfig
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config CAN_ESD_402_PCI
+	tristate "esd electronics gmbh CAN-PCI(e)/402 family"
+	depends on PCI && HAS_DMA
+	help
+	  Support for C402 card family from esd electronics gmbh.
+	  This card family is based on the ESDACC CAN controller and
+	  available in several form factors:  PCI, PCIe, PCIe Mini,
+	  M.2 PCIe, CPCIserial, PMC, XMC  (see https://esd.eu/en)
+
+	  This driver can also be built as a module. In this case the
+	  module will be called esd_402_pci.
diff --git a/drivers/net/can/esd/Makefile b/drivers/net/can/esd/Makefile
new file mode 100644
index 000000000000..5dd2d470c286
--- /dev/null
+++ b/drivers/net/can/esd/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+#  Makefile for esd gmbh ESDACC controller driver
+#
+esd_402_pci-objs := esdacc.o esd_402_pci-core.o
+
+obj-$(CONFIG_CAN_ESD_402_PCI) += esd_402_pci.o
diff --git a/drivers/net/can/esd/esd_402_pci-core.c b/drivers/net/can/esd/esd_402_pci-core.c
new file mode 100644
index 000000000000..5793d41c17c2
--- /dev/null
+++ b/drivers/net/can/esd/esd_402_pci-core.c
@@ -0,0 +1,512 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2015 - 2016 Thomas Körper, esd electronic system design gmbh
+ * Copyright (C) 2017 - 2022 Stefan Mätje, esd electronics gmbh
+ */
+
+#include <linux/can/dev.h>
+#include <linux/can.h>
+#include <linux/can/netlink.h>
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/pci.h>
+
+#include "esdacc.h"
+
+#define ESD_PCI_DEVICE_ID_PCIE402 0x0402
+
+#define PCI402_FPGA_VER_MIN 0x003d
+#define PCI402_MAX_CORES 6
+#define PCI402_BAR 0
+#define PCI402_IO_OV_OFFS 0
+#define PCI402_IO_PCIEP_OFFS 0x10000
+#define PCI402_IO_LEN_TOTAL 0x20000
+#define PCI402_IO_LEN_CORE 0x2000
+#define PCI402_PCICFG_MSICAP 0x50
+
+#define PCI402_DMA_MASK DMA_BIT_MASK(32)
+#define PCI402_DMA_SIZE ALIGN(0x10000, PAGE_SIZE)
+
+#define PCI402_PCIEP_OF_INT_ENABLE 0x0050
+#define PCI402_PCIEP_OF_BM_ADDR_LO 0x1000
+#define PCI402_PCIEP_OF_BM_ADDR_HI 0x1004
+#define PCI402_PCIEP_OF_MSI_ADDR_LO 0x1008
+#define PCI402_PCIEP_OF_MSI_ADDR_HI 0x100c
+
+/* The BTR register capabilities described by the can_bittiming_const structures
+ * below are valid since ESDACC version 0x0032.
+ */
+
+/* Used if the ESDACC FPGA is built as CAN-Classic version. */
+static const struct can_bittiming_const pci402_bittiming_const = {
+	.name = "esd_402",
+	.tseg1_min = 1,
+	.tseg1_max = 16,
+	.tseg2_min = 1,
+	.tseg2_max = 8,
+	.sjw_max = 4,
+	.brp_min = 1,
+	.brp_max = 512,
+	.brp_inc = 1,
+};
+
+/* Used if the ESDACC FPGA is built as CAN-FD version. */
+static const struct can_bittiming_const pci402_bittiming_const_canfd = {
+	.name = "esd_402fd",
+	.tseg1_min = 1,
+	.tseg1_max = 256,
+	.tseg2_min = 1,
+	.tseg2_max = 128,
+	.sjw_max = 128,
+	.brp_min = 1,
+	.brp_max = 256,
+	.brp_inc = 1,
+};
+
+static const struct net_device_ops pci402_acc_netdev_ops = {
+	.ndo_open = acc_open,
+	.ndo_stop = acc_close,
+	.ndo_start_xmit = acc_start_xmit,
+	.ndo_change_mtu = can_change_mtu
+};
+
+struct pci402_card {
+	/* Actually mapped io space, all other iomem derived from this */
+	void __iomem *addr;
+	void __iomem *addr_pciep;
+
+	void *dma_buf;
+	dma_addr_t dma_hnd;
+
+	struct acc_ov ov;
+	struct acc_core *cores;
+
+	bool msi_enabled;
+};
+
+static irqreturn_t pci402_interrupt(int irq, void *dev_id)
+{
+	struct pci_dev *pdev = dev_id;
+	struct pci402_card *card = pci_get_drvdata(pdev);
+	irqreturn_t irq_status;
+
+	irq_status = acc_card_interrupt(&card->ov, card->cores);
+
+	return irq_status;
+}
+
+static int pci402_set_msiconfig(struct pci_dev *pdev)
+{
+	struct pci402_card *card = pci_get_drvdata(pdev);
+	u32 addr_lo_offs = 0;
+	u32 addr_lo = 0;
+	u32 addr_hi = 0;
+	u32 data = 0;
+	u16 csr = 0;
+	int err;
+
+	/* The FPGA hard IP PCIe core implements a 64-bit MSI Capability
+	 * Register Format
+	 */
+	err = pci_read_config_word(pdev, PCI402_PCICFG_MSICAP + PCI_MSI_FLAGS, &csr);
+	if (err)
+		goto failed;
+
+	err = pci_read_config_dword(pdev, PCI402_PCICFG_MSICAP + PCI_MSI_ADDRESS_LO,
+				    &addr_lo);
+	if (err)
+		goto failed;
+	err = pci_read_config_dword(pdev, PCI402_PCICFG_MSICAP + PCI_MSI_ADDRESS_HI,
+				    &addr_hi);
+	if (err)
+		goto failed;
+
+	err = pci_read_config_dword(pdev, PCI402_PCICFG_MSICAP + PCI_MSI_DATA_64,
+				    &data);
+	if (err)
+		goto failed;
+
+	addr_lo_offs = addr_lo & 0x0000ffff;
+	addr_lo &= 0xffff0000;
+
+	if (addr_hi)
+		addr_lo |= 1; /* To enable 64-Bit addressing in PCIe endpoint */
+
+	if (!(csr & PCI_MSI_FLAGS_ENABLE)) {
+		err = -EINVAL;
+		goto failed;
+	}
+
+	iowrite32(addr_lo, card->addr_pciep + PCI402_PCIEP_OF_MSI_ADDR_LO);
+	iowrite32(addr_hi, card->addr_pciep + PCI402_PCIEP_OF_MSI_ADDR_HI);
+	acc_ov_write32(&card->ov, ACC_OV_OF_MSI_ADDRESSOFFSET, addr_lo_offs);
+	acc_ov_write32(&card->ov, ACC_OV_OF_MSI_DATA, data);
+
+	return 0;
+
+failed:
+	pci_warn(pdev, "Error while setting MSI configuration:\n"
+		 "CSR: 0x%.4x, addr: 0x%.8x%.8x, offs: 0x%.4x, data: 0x%.8x\n",
+		 csr, addr_hi, addr_lo, addr_lo_offs, data);
+
+	return err;
+}
+
+static int pci402_init_card(struct pci_dev *pdev)
+{
+	struct pci402_card *card = pci_get_drvdata(pdev);
+
+	card->ov.addr = card->addr + PCI402_IO_OV_OFFS;
+	card->addr_pciep = card->addr + PCI402_IO_PCIEP_OFFS;
+
+	acc_reset_fpga(&card->ov);
+	acc_init_ov(&card->ov, &pdev->dev);
+
+	if (card->ov.version < PCI402_FPGA_VER_MIN) {
+		pci_err(pdev,
+			"ESDACC version (0x%.4x) outdated, please update\n",
+			card->ov.version);
+		return -EINVAL;
+	}
+
+	if (card->ov.timestamp_frequency != ACC_TS_FREQ_80MHZ) {
+		pci_err(pdev,
+			"esdACC timestamp frequency of %uHz not supported by driver. Aborted.\n",
+			card->ov.timestamp_frequency);
+		return -EINVAL;
+	}
+
+	if (card->ov.active_cores > PCI402_MAX_CORES) {
+		pci_err(pdev,
+			"Card has more active cores (%u) than supported by driver. Aborted.\n",
+			card->ov.active_cores);
+		return -EINVAL;
+	}
+	card->cores = devm_kcalloc(&pdev->dev, card->ov.active_cores,
+				   sizeof(struct acc_core), GFP_KERNEL);
+	if (!card->cores)
+		return -ENOMEM;
+
+	if (card->ov.features & ACC_OV_REG_FEAT_MASK_CANFD) {
+		pci_warn(pdev,
+			 "ESDACC with CAN-FD feature detected. This driver doesn't support CAN-FD yet.\n");
+	}
+
+#ifdef __LITTLE_ENDIAN
+	/* So card converts all busmastered data to LE for us: */
+	acc_ov_set_bits(&card->ov, ACC_OV_OF_MODE,
+			ACC_OV_REG_MODE_MASK_ENDIAN_LITTLE);
+#endif
+
+	return 0;
+}
+
+static int pci402_init_interrupt(struct pci_dev *pdev)
+{
+	struct pci402_card *card = pci_get_drvdata(pdev);
+	int err;
+
+	err = pci_enable_msi(pdev);
+	if (!err) {
+		err = pci402_set_msiconfig(pdev);
+		if (!err) {
+			card->msi_enabled = true;
+			acc_ov_set_bits(&card->ov, ACC_OV_OF_MODE,
+					ACC_OV_REG_MODE_MASK_MSI_ENABLE);
+			pci_info(pdev, "MSI enabled\n");
+		}
+	}
+
+	err = devm_request_irq(&pdev->dev, pdev->irq, pci402_interrupt,
+			       IRQF_SHARED, dev_name(&pdev->dev), pdev);
+	if (err)
+		goto failure_msidis;
+
+	iowrite32(1, card->addr_pciep + PCI402_PCIEP_OF_INT_ENABLE);
+
+	return 0;
+
+failure_msidis:
+	if (card->msi_enabled) {
+		acc_ov_clear_bits(&card->ov, ACC_OV_OF_MODE,
+				  ACC_OV_REG_MODE_MASK_MSI_ENABLE);
+		pci_disable_msi(pdev);
+		card->msi_enabled = false;
+	}
+
+	return err;
+}
+
+static void pci402_finish_interrupt(struct pci_dev *pdev)
+{
+	struct pci402_card *card = pci_get_drvdata(pdev);
+
+	iowrite32(0, card->addr_pciep + PCI402_PCIEP_OF_INT_ENABLE);
+	devm_free_irq(&pdev->dev, pdev->irq, pdev);
+
+	if (card->msi_enabled) {
+		acc_ov_clear_bits(&card->ov, ACC_OV_OF_MODE,
+				  ACC_OV_REG_MODE_MASK_MSI_ENABLE);
+		pci_disable_msi(pdev);
+		card->msi_enabled = false;
+	}
+}
+
+static int pci402_init_dma(struct pci_dev *pdev)
+{
+	struct pci402_card *card = pci_get_drvdata(pdev);
+	int err;
+
+	err = dma_set_coherent_mask(&pdev->dev, PCI402_DMA_MASK);
+	if (err) {
+		pci_err(pdev, "DMA set mask failed!\n");
+		return err;
+	}
+
+	/* The ESDACC DMA engine needs the DMA buffer aligned to a 64k
+	 * boundary. The DMA API guarantees to align the returned buffer to the
+	 * smallest PAGE_SIZE order which is greater than or equal to the
+	 * requested size. With PCI402_DMA_SIZE == 64kB this suffices here.
+	 */
+	card->dma_buf = dma_alloc_coherent(&pdev->dev, PCI402_DMA_SIZE,
+					   &card->dma_hnd, GFP_KERNEL);
+	if (!card->dma_buf)
+		return -ENOMEM;
+
+	acc_init_bm_ptr(&card->ov, card->cores, card->dma_buf);
+
+	iowrite32(card->dma_hnd,
+		  card->addr_pciep + PCI402_PCIEP_OF_BM_ADDR_LO);
+	iowrite32(0, card->addr_pciep + PCI402_PCIEP_OF_BM_ADDR_HI);
+
+	pci_set_master(pdev);
+
+	acc_ov_set_bits(&card->ov, ACC_OV_OF_MODE,
+			ACC_OV_REG_MODE_MASK_BM_ENABLE);
+
+	return 0;
+}
+
+static void pci402_finish_dma(struct pci_dev *pdev)
+{
+	struct pci402_card *card = pci_get_drvdata(pdev);
+	int i;
+
+	acc_ov_clear_bits(&card->ov, ACC_OV_OF_MODE,
+			  ACC_OV_REG_MODE_MASK_BM_ENABLE);
+
+	pci_clear_master(pdev);
+
+	iowrite32(0, card->addr_pciep + PCI402_PCIEP_OF_BM_ADDR_LO);
+	iowrite32(0, card->addr_pciep + PCI402_PCIEP_OF_BM_ADDR_HI);
+
+	card->ov.bmfifo.messages = NULL;
+	card->ov.bmfifo.irq_cnt = NULL;
+	for (i = 0; i < card->ov.active_cores; i++) {
+		struct acc_core *core = &card->cores[i];
+
+		core->bmfifo.messages = NULL;
+		core->bmfifo.irq_cnt = NULL;
+	}
+
+	dma_free_coherent(&pdev->dev, PCI402_DMA_SIZE, card->dma_buf,
+			  card->dma_hnd);
+	card->dma_buf = NULL;
+}
+
+static int pci402_init_cores(struct pci_dev *pdev)
+{
+	struct pci402_card *card = pci_get_drvdata(pdev);
+	int err;
+	int i;
+
+	for (i = 0; i < card->ov.active_cores; i++) {
+		struct acc_core *core = &card->cores[i];
+		struct acc_net_priv *priv;
+		struct net_device *netdev;
+		u32 fifo_config;
+
+		core->addr = card->ov.addr + (i + 1) * PCI402_IO_LEN_CORE;
+
+		fifo_config = acc_read32(core, ACC_CORE_OF_TXFIFO_CONFIG);
+		core->tx_fifo_size = (fifo_config >> 24);
+		if (core->tx_fifo_size <= 1) {
+			pci_err(pdev, "Invalid tx_fifo_size!\n");
+			err = -EINVAL;
+			goto failure;
+		}
+
+		netdev = alloc_candev(sizeof(*priv), core->tx_fifo_size);
+		if (!netdev) {
+			err = -ENOMEM;
+			goto failure;
+		}
+		core->netdev = netdev;
+
+		netdev->flags |= IFF_ECHO;
+		netdev->dev_port = i;
+		netdev->netdev_ops = &pci402_acc_netdev_ops;
+		SET_NETDEV_DEV(netdev, &pdev->dev);
+
+		priv = netdev_priv(netdev);
+		priv->can.ctrlmode_supported = CAN_CTRLMODE_LOOPBACK |
+			CAN_CTRLMODE_LISTENONLY |
+			CAN_CTRLMODE_BERR_REPORTING |
+			CAN_CTRLMODE_CC_LEN8_DLC;
+
+		priv->can.clock.freq = card->ov.core_frequency;
+		if (card->ov.features & ACC_OV_REG_FEAT_MASK_CANFD)
+			priv->can.bittiming_const = &pci402_bittiming_const_canfd;
+		else
+			priv->can.bittiming_const = &pci402_bittiming_const;
+		priv->can.do_set_bittiming = acc_set_bittiming;
+		priv->can.do_set_mode = acc_set_mode;
+		priv->can.do_get_berr_counter = acc_get_berr_counter;
+
+		priv->core = core;
+		priv->ov = &card->ov;
+
+		err = register_candev(netdev);
+		if (err) {
+			free_candev(core->netdev);
+			core->netdev = NULL;
+			goto failure;
+		}
+
+		netdev_info(netdev, "registered\n");
+	}
+
+	return 0;
+
+failure:
+	for (i--; i >= 0; i--) {
+		struct acc_core *core = &card->cores[i];
+
+		netdev_info(core->netdev, "unregistering...\n");
+		unregister_candev(core->netdev);
+
+		free_candev(core->netdev);
+		core->netdev = NULL;
+	}
+
+	return err;
+}
+
+static void pci402_finish_cores(struct pci_dev *pdev)
+{
+	struct pci402_card *card = pci_get_drvdata(pdev);
+	int i;
+
+	for (i = 0; i < card->ov.active_cores; i++) {
+		struct acc_core *core = &card->cores[i];
+
+		netdev_info(core->netdev, "unregister\n");
+		unregister_candev(core->netdev);
+
+		free_candev(core->netdev);
+		core->netdev = NULL;
+	}
+}
+
+static int pci402_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+	struct pci402_card *card = NULL;
+	int err;
+
+	err = pci_enable_device(pdev);
+	if (err)
+		return err;
+
+	card = devm_kzalloc(&pdev->dev, sizeof(*card), GFP_KERNEL);
+	if (!card) {
+		err = -ENOMEM;
+		goto failure_disable_pci;
+	}
+
+	pci_set_drvdata(pdev, card);
+
+	err = pci_request_regions(pdev, pci_name(pdev));
+	if (err)
+		goto failure_disable_pci;
+
+	card->addr = pci_iomap(pdev, PCI402_BAR, PCI402_IO_LEN_TOTAL);
+	if (!card->addr) {
+		err = -ENOMEM;
+		goto failure_release_regions;
+	}
+
+	err = pci402_init_card(pdev);
+	if (err)
+		goto failure_unmap;
+
+	err = pci402_init_dma(pdev);
+	if (err)
+		goto failure_unmap;
+
+	err = pci402_init_interrupt(pdev);
+	if (err)
+		goto failure_finish_dma;
+
+	err = pci402_init_cores(pdev);
+	if (err)
+		goto failure_finish_interrupt;
+
+	return 0;
+
+failure_finish_interrupt:
+	pci402_finish_interrupt(pdev);
+
+failure_finish_dma:
+	pci402_finish_dma(pdev);
+
+failure_unmap:
+	pci_iounmap(pdev, card->addr);
+
+failure_release_regions:
+	pci_release_regions(pdev);
+
+failure_disable_pci:
+	pci_disable_device(pdev);
+
+	return err;
+}
+
+static void pci402_remove(struct pci_dev *pdev)
+{
+	struct pci402_card *card = pci_get_drvdata(pdev);
+
+	pci402_finish_interrupt(pdev);
+	pci402_finish_cores(pdev);
+	pci402_finish_dma(pdev);
+	pci_iounmap(pdev, card->addr);
+	pci_release_regions(pdev);
+	pci_disable_device(pdev);
+}
+
+static const struct pci_device_id pci402_tbl[] = {
+	{
+		.vendor = PCI_VENDOR_ID_ESDGMBH,
+		.device = ESD_PCI_DEVICE_ID_PCIE402,
+		.subvendor = PCI_VENDOR_ID_ESDGMBH,
+		.subdevice = PCI_ANY_ID,
+	},
+	{ 0, }
+};
+MODULE_DEVICE_TABLE(pci, pci402_tbl);
+
+static struct pci_driver pci402_driver = {
+	.name = KBUILD_MODNAME,
+	.id_table = pci402_tbl,
+	.probe = pci402_probe,
+	.remove = pci402_remove,
+};
+module_pci_driver(pci402_driver);
+
+MODULE_DESCRIPTION("Socket-CAN driver for esd CAN 402 card family with esdACC core on PCIe");
+MODULE_AUTHOR("Thomas Körper <socketcan@esd.eu>");
+MODULE_AUTHOR("Stefan Mätje <stefan.maetje@esd.eu>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/can/esd/esdacc.c b/drivers/net/can/esd/esdacc.c
new file mode 100644
index 000000000000..49017c986c70
--- /dev/null
+++ b/drivers/net/can/esd/esdacc.c
@@ -0,0 +1,771 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2015 - 2016 Thomas Körper, esd electronic system design gmbh
+ * Copyright (C) 2017 - 2022 Stefan Mätje, esd electronics gmbh
+ */
+
+#include "esdacc.h"
+
+#include <linux/bitfield.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/ktime.h>
+
+/* ecc value of esdACC equals SJA1000's ECC register */
+#define ACC_ECC_SEG			0x1f
+#define ACC_ECC_DIR			0x20
+#define ACC_ECC_BIT			0x00
+#define ACC_ECC_FORM			0x40
+#define ACC_ECC_STUFF			0x80
+#define ACC_ECC_MASK			0xc0
+
+#define ACC_BM_IRQ_UNMASK_ALL		0x55555555U
+#define ACC_BM_IRQ_MASK_ALL		0xaaaaaaaaU
+#define ACC_BM_IRQ_MASK			0x2U
+#define ACC_BM_IRQ_UNMASK		0x1U
+#define ACC_BM_LENFLAG_TX		0x20
+
+#define ACC_REG_STATUS_IDX_STATUS_DOS	16
+#define ACC_REG_STATUS_IDX_STATUS_ES	17
+#define ACC_REG_STATUS_IDX_STATUS_EP	18
+#define ACC_REG_STATUS_IDX_STATUS_BS	19
+#define ACC_REG_STATUS_IDX_STATUS_RBS	20
+#define ACC_REG_STATUS_IDX_STATUS_RS	21
+#define ACC_REG_STATUS_MASK_STATUS_DOS	BIT(ACC_REG_STATUS_IDX_STATUS_DOS)
+#define ACC_REG_STATUS_MASK_STATUS_ES	BIT(ACC_REG_STATUS_IDX_STATUS_ES)
+#define ACC_REG_STATUS_MASK_STATUS_EP	BIT(ACC_REG_STATUS_IDX_STATUS_EP)
+#define ACC_REG_STATUS_MASK_STATUS_BS	BIT(ACC_REG_STATUS_IDX_STATUS_BS)
+#define ACC_REG_STATUS_MASK_STATUS_RBS	BIT(ACC_REG_STATUS_IDX_STATUS_RBS)
+#define ACC_REG_STATUS_MASK_STATUS_RS	BIT(ACC_REG_STATUS_IDX_STATUS_RS)
+
+static void acc_resetmode_enter(struct acc_core *core)
+{
+	acc_set_bits(core, ACC_CORE_OF_CTRL_MODE,
+		     ACC_REG_CONTROL_MASK_MODE_RESETMODE);
+
+	/* Read back reset mode bit to flush PCI write posting */
+	acc_resetmode_entered(core);
+}
+
+static void acc_resetmode_leave(struct acc_core *core)
+{
+	acc_clear_bits(core, ACC_CORE_OF_CTRL_MODE,
+		       ACC_REG_CONTROL_MASK_MODE_RESETMODE);
+
+	/* Read back reset mode bit to flush PCI write posting */
+	acc_resetmode_entered(core);
+}
+
+static void acc_txq_put(struct acc_core *core, u32 acc_id, u8 acc_dlc,
+			const void *data)
+{
+	acc_write32_noswap(core, ACC_CORE_OF_TXFIFO_DATA_1,
+			   *((const u32 *)(data + 4)));
+	acc_write32_noswap(core, ACC_CORE_OF_TXFIFO_DATA_0,
+			   *((const u32 *)data));
+	acc_write32(core, ACC_CORE_OF_TXFIFO_DLC, acc_dlc);
+	/* CAN id must be written at last. This write starts TX. */
+	acc_write32(core, ACC_CORE_OF_TXFIFO_ID, acc_id);
+}
+
+/* Convert timestamp from esdACC time stamp ticks to ns
+ *
+ * The conversion factor ts2ns from time stamp counts to ns is basically
+ *	ts2ns = NSEC_PER_SEC / timestamp_frequency
+ *
+ * We handle here only a fixed timestamp frequency of 80MHz. The
+ * resulting ts2ns factor would be 12.5.
+ *
+ * At the end we multiply by 12 and add the half of the HW timestamp
+ * to get a multiplication by 12.5. This way any overflow is
+ * avoided until ktime_t itself overflows.
+ */
+#define ACC_TS_FACTOR (NSEC_PER_SEC / ACC_TS_FREQ_80MHZ)
+#define ACC_TS_80MHZ_SHIFT 1
+
+static ktime_t acc_ts2ktime(struct acc_ov *ov, u64 ts)
+{
+	u64 ns;
+
+	ns = (ts * ACC_TS_FACTOR) + (ts >> ACC_TS_80MHZ_SHIFT);
+
+	return ns_to_ktime(ns);
+}
+
+void acc_init_ov(struct acc_ov *ov, struct device *dev)
+{
+	u32 temp;
+
+	temp = acc_ov_read32(ov, ACC_OV_OF_VERSION);
+	ov->version = temp;
+	ov->features = (temp >> 16);
+
+	temp = acc_ov_read32(ov, ACC_OV_OF_INFO);
+	ov->total_cores = temp;
+	ov->active_cores = (temp >> 8);
+
+	ov->core_frequency = acc_ov_read32(ov, ACC_OV_OF_CANCORE_FREQ);
+	ov->timestamp_frequency = acc_ov_read32(ov, ACC_OV_OF_TS_FREQ_LO);
+
+	/* Depending on ESDACC feature NEW_PSC enable the new prescaler
+	 * or adjust core_frequency according to the implicit division by 2.
+	 */
+	if (ov->features & ACC_OV_REG_FEAT_MASK_NEW_PSC) {
+		acc_ov_set_bits(ov, ACC_OV_OF_MODE,
+				ACC_OV_REG_MODE_MASK_NEW_PSC_ENABLE);
+	} else {
+		ov->core_frequency /= 2;
+	}
+
+	dev_info(dev,
+		 "ESDACC v%u, freq: %u/%u, feat/strap: 0x%x/0x%x, cores: %u/%u\n",
+		 ov->version, ov->core_frequency, ov->timestamp_frequency,
+		 ov->features, acc_ov_read32(ov, ACC_OV_OF_INFO) >> 16,
+		 ov->active_cores, ov->total_cores);
+}
+
+void acc_init_bm_ptr(struct acc_ov *ov, struct acc_core *cores, const void *mem)
+{
+	unsigned int u;
+
+	/* DMA buffer layout as follows where N is the number of CAN cores
+	 * implemented in the FPGA, i.e. N = ov->total_cores
+	 *
+	 *   Layout                   Section size
+	 * +-----------------------+
+	 * | FIFO Card/Overview	   |  ACC_CORE_DMABUF_SIZE
+	 * |			   |
+	 * +-----------------------+
+	 * | FIFO Core0		   |  ACC_CORE_DMABUF_SIZE
+	 * |			   |
+	 * +-----------------------+
+	 * | ...		   |  ...
+	 * |			   |
+	 * +-----------------------+
+	 * | FIFO CoreN		   |  ACC_CORE_DMABUF_SIZE
+	 * |			   |
+	 * +-----------------------+
+	 * | irq_cnt Card/Overview |  sizeof(u32)
+	 * +-----------------------+
+	 * | irq_cnt Core0	   |  sizeof(u32)
+	 * +-----------------------+
+	 * | ...		   |  ...
+	 * +-----------------------+
+	 * | irq_cnt CoreN	   |  sizeof(u32)
+	 * +-----------------------+
+	 */
+	ov->bmfifo.messages = mem;
+	ov->bmfifo.irq_cnt = mem + (ov->total_cores + 1U) * ACC_CORE_DMABUF_SIZE;
+
+	for (u = 0U; u < ov->active_cores; u++) {
+		struct acc_core *core = &cores[u];
+
+		core->bmfifo.messages = mem + (u + 1U) * ACC_CORE_DMABUF_SIZE;
+		core->bmfifo.irq_cnt = ov->bmfifo.irq_cnt + (u + 1U);
+	}
+}
+
+int acc_open(struct net_device *netdev)
+{
+	struct acc_net_priv *priv = netdev_priv(netdev);
+	struct acc_core *core = priv->core;
+	u32 tx_fifo_status;
+	u32 ctrl_mode;
+	int err;
+
+	/* Retry to enter RESET mode if out of sync. */
+	if (priv->can.state != CAN_STATE_STOPPED) {
+		netdev_warn(netdev, "Entered %s() with bad can.state: %s\n",
+			    __func__, can_get_state_str(priv->can.state));
+		acc_resetmode_enter(core);
+		priv->can.state = CAN_STATE_STOPPED;
+	}
+
+	err = open_candev(netdev);
+	if (err)
+		return err;
+
+	ctrl_mode = ACC_REG_CONTROL_MASK_IE_RXTX |
+			ACC_REG_CONTROL_MASK_IE_TXERROR |
+			ACC_REG_CONTROL_MASK_IE_ERRWARN |
+			ACC_REG_CONTROL_MASK_IE_OVERRUN |
+			ACC_REG_CONTROL_MASK_IE_ERRPASS;
+
+	if (priv->can.ctrlmode & CAN_CTRLMODE_BERR_REPORTING)
+		ctrl_mode |= ACC_REG_CONTROL_MASK_IE_BUSERR;
+
+	if (priv->can.ctrlmode & CAN_CTRLMODE_LISTENONLY)
+		ctrl_mode |= ACC_REG_CONTROL_MASK_MODE_LOM;
+
+	acc_set_bits(core, ACC_CORE_OF_CTRL_MODE, ctrl_mode);
+
+	acc_resetmode_leave(core);
+	priv->can.state = CAN_STATE_ERROR_ACTIVE;
+
+	/* Resync TX FIFO indices to HW state after (re-)start. */
+	tx_fifo_status = acc_read32(core, ACC_CORE_OF_TXFIFO_STATUS);
+	core->tx_fifo_head = tx_fifo_status & 0xff;
+	core->tx_fifo_tail = (tx_fifo_status >> 8) & 0xff;
+
+	netif_start_queue(netdev);
+	return 0;
+}
+
+int acc_close(struct net_device *netdev)
+{
+	struct acc_net_priv *priv = netdev_priv(netdev);
+	struct acc_core *core = priv->core;
+
+	acc_clear_bits(core, ACC_CORE_OF_CTRL_MODE,
+		       ACC_REG_CONTROL_MASK_IE_RXTX |
+		       ACC_REG_CONTROL_MASK_IE_TXERROR |
+		       ACC_REG_CONTROL_MASK_IE_ERRWARN |
+		       ACC_REG_CONTROL_MASK_IE_OVERRUN |
+		       ACC_REG_CONTROL_MASK_IE_ERRPASS |
+		       ACC_REG_CONTROL_MASK_IE_BUSERR);
+
+	netif_stop_queue(netdev);
+	acc_resetmode_enter(core);
+	priv->can.state = CAN_STATE_STOPPED;
+
+	/* Mark pending TX requests to be aborted after controller restart. */
+	acc_write32(core, ACC_CORE_OF_TX_ABORT_MASK, 0xffff);
+
+	/* ACC_REG_CONTROL_MASK_MODE_LOM is only accessible in RESET mode */
+	acc_clear_bits(core, ACC_CORE_OF_CTRL_MODE,
+		       ACC_REG_CONTROL_MASK_MODE_LOM);
+
+	close_candev(netdev);
+	return 0;
+}
+
+netdev_tx_t acc_start_xmit(struct sk_buff *skb, struct net_device *netdev)
+{
+	struct acc_net_priv *priv = netdev_priv(netdev);
+	struct acc_core *core = priv->core;
+	struct can_frame *cf = (struct can_frame *)skb->data;
+	u8 tx_fifo_head = core->tx_fifo_head;
+	int fifo_usage;
+	u32 acc_id;
+	u8 acc_dlc;
+
+	if (can_dropped_invalid_skb(netdev, skb))
+		return NETDEV_TX_OK;
+
+	/* Access core->tx_fifo_tail only once because it may be changed
+	 * from the interrupt level.
+	 */
+	fifo_usage = tx_fifo_head - core->tx_fifo_tail;
+	if (fifo_usage < 0)
+		fifo_usage += core->tx_fifo_size;
+
+	if (fifo_usage >= core->tx_fifo_size - 1) {
+		netdev_err(core->netdev,
+			   "BUG: TX ring full when queue awake!\n");
+		netif_stop_queue(netdev);
+		return NETDEV_TX_BUSY;
+	}
+
+	if (fifo_usage == core->tx_fifo_size - 2)
+		netif_stop_queue(netdev);
+
+	acc_dlc = can_get_cc_dlc(cf, priv->can.ctrlmode);
+	if (cf->can_id & CAN_RTR_FLAG)
+		acc_dlc |= ACC_CAN_RTR_FLAG;
+
+	if (cf->can_id & CAN_EFF_FLAG) {
+		acc_id = cf->can_id & CAN_EFF_MASK;
+		acc_id |= ACC_CAN_EFF_FLAG;
+	} else {
+		acc_id = cf->can_id & CAN_SFF_MASK;
+	}
+
+	can_put_echo_skb(skb, netdev, core->tx_fifo_head, 0);
+
+	tx_fifo_head++;
+	if (tx_fifo_head >= core->tx_fifo_size)
+		tx_fifo_head = 0U;
+	core->tx_fifo_head = tx_fifo_head;
+
+	acc_txq_put(core, acc_id, acc_dlc, cf->data);
+
+	return NETDEV_TX_OK;
+}
+
+int acc_get_berr_counter(const struct net_device *netdev,
+			 struct can_berr_counter *bec)
+{
+	struct acc_net_priv *priv = netdev_priv(netdev);
+	u32 core_status = acc_read32(priv->core, ACC_CORE_OF_STATUS);
+
+	bec->txerr = (core_status >> 8) & 0xff;
+	bec->rxerr = core_status & 0xff;
+
+	return 0;
+}
+
+int acc_set_mode(struct net_device *netdev, enum can_mode mode)
+{
+	struct acc_net_priv *priv = netdev_priv(netdev);
+
+	switch (mode) {
+	case CAN_MODE_START:
+		/* Paranoid FIFO index check. */
+		{
+			const u32 tx_fifo_status =
+				acc_read32(priv->core, ACC_CORE_OF_TXFIFO_STATUS);
+			const u8 hw_fifo_head = tx_fifo_status;
+
+			if (hw_fifo_head != priv->core->tx_fifo_head ||
+			    hw_fifo_head != priv->core->tx_fifo_tail) {
+				netdev_warn(netdev,
+					    "TX FIFO mismatch: T %2u H %2u; TFHW %#08x\n",
+					    priv->core->tx_fifo_tail,
+					    priv->core->tx_fifo_head,
+					    tx_fifo_status);
+			}
+		}
+		acc_resetmode_leave(priv->core);
+		/* To leave the bus-off state the esdACC controller begins
+		 * here a grace period where it counts 128 "idle conditions" (each
+		 * of 11 consecutive recessive bits) on the bus as required
+		 * by the CAN spec.
+		 *
+		 * During this time the TX FIFO may still contain already
+		 * aborted "zombie" frames that are only drained from the FIFO
+		 * at the end of the grace period.
+		 *
+		 * To not to interfere with this drain process we don't
+		 * call netif_wake_queue() here. When the controller reaches
+		 * the error-active state again, it informs us about that
+		 * with an acc_bmmsg_errstatechange message. Then
+		 * netif_wake_queue() is called from
+		 * handle_core_msg_errstatechange() instead.
+		 */
+		break;
+
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+int acc_set_bittiming(struct net_device *netdev)
+{
+	struct acc_net_priv *priv = netdev_priv(netdev);
+	const struct can_bittiming *bt = &priv->can.bittiming;
+	u32 brp;
+	u32 btr;
+
+	if (priv->ov->features & ACC_OV_REG_FEAT_MASK_CANFD) {
+		u32 fbtr = 0;
+
+		netdev_dbg(netdev, "bit timing: brp %u, prop %u, ph1 %u ph2 %u, sjw %u\n",
+			   bt->brp, bt->prop_seg,
+			   bt->phase_seg1, bt->phase_seg2, bt->sjw);
+
+		brp = FIELD_PREP(ACC_REG_BRP_FD_MASK_BRP, bt->brp - 1);
+
+		btr = FIELD_PREP(ACC_REG_BTR_FD_MASK_TSEG1, bt->phase_seg1 + bt->prop_seg - 1);
+		btr |= FIELD_PREP(ACC_REG_BTR_FD_MASK_TSEG2, bt->phase_seg2 - 1);
+		btr |= FIELD_PREP(ACC_REG_BTR_FD_MASK_SJW, bt->sjw - 1);
+
+		/* Keep order of accesses to ACC_CORE_OF_BRP and ACC_CORE_OF_BTR. */
+		acc_write32(priv->core, ACC_CORE_OF_BRP, brp);
+		acc_write32(priv->core, ACC_CORE_OF_BTR, btr);
+
+		netdev_dbg(netdev, "ESDACC: BRP %u, NBTR 0x%08x, DBTR 0x%08x",
+			   brp, btr, fbtr);
+	} else {
+		netdev_dbg(netdev, "bit timing: brp %u, prop %u, ph1 %u ph2 %u, sjw %u\n",
+			   bt->brp, bt->prop_seg,
+			   bt->phase_seg1, bt->phase_seg2, bt->sjw);
+
+		brp = FIELD_PREP(ACC_REG_BRP_CL_MASK_BRP, bt->brp - 1);
+
+		btr = FIELD_PREP(ACC_REG_BTR_CL_MASK_TSEG1, bt->phase_seg1 + bt->prop_seg - 1);
+		btr |= FIELD_PREP(ACC_REG_BTR_CL_MASK_TSEG2, bt->phase_seg2 - 1);
+		btr |= FIELD_PREP(ACC_REG_BTR_CL_MASK_SJW, bt->sjw - 1);
+
+		/* Keep order of accesses to ACC_CORE_OF_BRP and ACC_CORE_OF_BTR. */
+		acc_write32(priv->core, ACC_CORE_OF_BRP, brp);
+		acc_write32(priv->core, ACC_CORE_OF_BTR, btr);
+
+		netdev_dbg(netdev, "ESDACC: BRP %u, BTR 0x%08x", brp, btr);
+	}
+
+	return 0;
+}
+
+static void handle_core_msg_rxtxdone(struct acc_core *core,
+				     const struct acc_bmmsg_rxtxdone *msg)
+{
+	struct acc_net_priv *priv = netdev_priv(core->netdev);
+	struct net_device_stats *stats = &core->netdev->stats;
+	struct sk_buff *skb;
+
+	if (msg->dlc.rxtx.len & ACC_BM_LENFLAG_TX) {
+		u8 tx_fifo_tail = core->tx_fifo_tail;
+
+		if (core->tx_fifo_head == tx_fifo_tail) {
+			netdev_warn(core->netdev,
+				    "TX interrupt, but queue is empty!?\n");
+			return;
+		}
+
+		/* Direct access echo skb to attach HW time stamp. */
+		skb = priv->can.echo_skb[tx_fifo_tail];
+		if (skb) {
+			skb_hwtstamps(skb)->hwtstamp =
+				acc_ts2ktime(priv->ov, msg->ts);
+		}
+
+		stats->tx_packets++;
+		stats->tx_bytes += can_get_echo_skb(core->netdev, tx_fifo_tail,
+						    NULL);
+
+		tx_fifo_tail++;
+		if (tx_fifo_tail >= core->tx_fifo_size)
+			tx_fifo_tail = 0U;
+		core->tx_fifo_tail = tx_fifo_tail;
+
+		netif_wake_queue(core->netdev);
+
+	} else {
+		struct can_frame *cf;
+
+		skb = alloc_can_skb(core->netdev, &cf);
+		if (!skb) {
+			stats->rx_dropped++;
+			return;
+		}
+
+		cf->can_id = msg->id & CAN_EFF_MASK;
+		if (msg->id & ACC_CAN_EFF_FLAG)
+			cf->can_id |= CAN_EFF_FLAG;
+
+		can_frame_set_cc_len(cf, msg->dlc.rx.len & ACC_CAN_DLC_MASK,
+				     priv->can.ctrlmode);
+
+		if (msg->dlc.rx.len & ACC_CAN_RTR_FLAG) {
+			cf->can_id |= CAN_RTR_FLAG;
+		} else {
+			memcpy(cf->data, msg->data, cf->len);
+			stats->rx_bytes += cf->len;
+		}
+		stats->rx_packets++;
+
+		skb_hwtstamps(skb)->hwtstamp = acc_ts2ktime(priv->ov, msg->ts);
+
+		netif_rx(skb);
+	}
+}
+
+static void handle_core_msg_txabort(struct acc_core *core,
+				    const struct acc_bmmsg_txabort *msg)
+{
+	struct net_device_stats *stats = &core->netdev->stats;
+	u8 tx_fifo_tail = core->tx_fifo_tail;
+	u32 abort_mask = msg->abort_mask;   /* u32 extend to avoid warnings later */
+
+	/* The abort_mask shows which frames were aborted in ESDACC's FIFO. */
+	while (tx_fifo_tail != core->tx_fifo_head && (abort_mask)) {
+		const u32 tail_mask = (1U << tx_fifo_tail);
+
+		if (!(abort_mask & tail_mask))
+			break;
+		abort_mask &= ~tail_mask;
+
+		can_free_echo_skb(core->netdev, tx_fifo_tail, NULL);
+		stats->tx_dropped++;
+		stats->tx_aborted_errors++;
+
+		tx_fifo_tail++;
+		if (tx_fifo_tail >= core->tx_fifo_size)
+			tx_fifo_tail = 0;
+	}
+	core->tx_fifo_tail = tx_fifo_tail;
+	if (abort_mask)
+		netdev_warn(core->netdev, "Unhandled aborted messages\n");
+
+	if (!acc_resetmode_entered(core))
+		netif_wake_queue(core->netdev);
+}
+
+static void handle_core_msg_overrun(struct acc_core *core,
+				    const struct acc_bmmsg_overrun *msg)
+{
+	struct acc_net_priv *priv = netdev_priv(core->netdev);
+	struct net_device_stats *stats = &core->netdev->stats;
+	struct can_frame *cf;
+	struct sk_buff *skb;
+
+	/* lost_cnt may be 0 if not supported by ESDACC version */
+	if (msg->lost_cnt) {
+		stats->rx_errors += msg->lost_cnt;
+		stats->rx_over_errors += msg->lost_cnt;
+	} else {
+		stats->rx_errors++;
+		stats->rx_over_errors++;
+	}
+
+	skb = alloc_can_err_skb(core->netdev, &cf);
+	if (!skb) {
+		stats->rx_dropped++;
+		return;
+	}
+
+	cf->can_id |= CAN_ERR_CRTL;
+	cf->data[1] = CAN_ERR_CRTL_RX_OVERFLOW;
+
+	skb_hwtstamps(skb)->hwtstamp = acc_ts2ktime(priv->ov, msg->ts);
+
+	netif_rx(skb);
+}
+
+static void handle_core_msg_buserr(struct acc_core *core,
+				   const struct acc_bmmsg_buserr *msg)
+{
+	struct acc_net_priv *priv = netdev_priv(core->netdev);
+	struct net_device_stats *stats = &core->netdev->stats;
+	struct can_frame *cf;
+	struct sk_buff *skb;
+	const u32 reg_status = msg->reg_status;
+	const u8 rxerr = reg_status;
+	const u8 txerr = (reg_status >> 8);
+	u8 can_err_prot_type = 0U;
+
+	priv->can.can_stats.bus_error++;
+
+	/* Error occurred during transmission? */
+	if ((msg->ecc & ACC_ECC_DIR) == 0) {
+		can_err_prot_type |= CAN_ERR_PROT_TX;
+		stats->tx_errors++;
+	} else {
+		stats->rx_errors++;
+	}
+	/* Determine error type */
+	switch (msg->ecc & ACC_ECC_MASK) {
+	case ACC_ECC_BIT:
+		can_err_prot_type |= CAN_ERR_PROT_BIT;
+		break;
+	case ACC_ECC_FORM:
+		can_err_prot_type |= CAN_ERR_PROT_FORM;
+		break;
+	case ACC_ECC_STUFF:
+		can_err_prot_type |= CAN_ERR_PROT_STUFF;
+		break;
+	default:
+		can_err_prot_type |= CAN_ERR_PROT_UNSPEC;
+		break;
+	}
+
+	skb = alloc_can_err_skb(core->netdev, &cf);
+	if (!skb) {
+		stats->rx_dropped++;
+		return;
+	}
+
+	cf->can_id |= CAN_ERR_PROT | CAN_ERR_BUSERROR;
+
+	/* Set protocol error type */
+	cf->data[2] = can_err_prot_type;
+	/* Set error location */
+	cf->data[3] = msg->ecc & ACC_ECC_SEG;
+
+	/* Insert CAN TX and RX error counters. */
+	cf->data[6] = txerr;
+	cf->data[7] = rxerr;
+
+	skb_hwtstamps(skb)->hwtstamp = acc_ts2ktime(priv->ov, msg->ts);
+
+	netif_rx(skb);
+}
+
+static void
+handle_core_msg_errstatechange(struct acc_core *core,
+			       const struct acc_bmmsg_errstatechange *msg)
+{
+	struct acc_net_priv *priv = netdev_priv(core->netdev);
+	struct net_device_stats *stats = &core->netdev->stats;
+	struct can_frame *cf = NULL;
+	struct sk_buff *skb;
+	const u32 reg_status = msg->reg_status;
+	const u8 rxerr = reg_status;
+	const u8 txerr = (reg_status >> 8);
+	enum can_state new_state;
+
+	if (reg_status & ACC_REG_STATUS_MASK_STATUS_BS) {
+		new_state = CAN_STATE_BUS_OFF;
+	} else if (reg_status & ACC_REG_STATUS_MASK_STATUS_EP) {
+		new_state = CAN_STATE_ERROR_PASSIVE;
+	} else if (reg_status & ACC_REG_STATUS_MASK_STATUS_ES) {
+		new_state = CAN_STATE_ERROR_WARNING;
+	} else {
+		new_state = CAN_STATE_ERROR_ACTIVE;
+		if (priv->can.state == CAN_STATE_BUS_OFF) {
+			/* See comment in acc_set_mode() for CAN_MODE_START */
+			netif_wake_queue(core->netdev);
+		}
+	}
+
+	skb = alloc_can_err_skb(core->netdev, &cf);
+
+	if (new_state != priv->can.state) {
+		enum can_state tx_state, rx_state;
+
+		tx_state = (txerr >= rxerr) ?
+			new_state : CAN_STATE_ERROR_ACTIVE;
+		rx_state = (rxerr >= txerr) ?
+			new_state : CAN_STATE_ERROR_ACTIVE;
+
+		/* Always call can_change_state() to update the state
+		 * even if alloc_can_err_skb() may have failed.
+		 * can_change_state() can cope with a NULL cf pointer.
+		 */
+		can_change_state(core->netdev, cf, tx_state, rx_state);
+	}
+
+	if (skb) {
+		cf->data[6] = txerr;
+		cf->data[7] = rxerr;
+
+		skb_hwtstamps(skb)->hwtstamp = acc_ts2ktime(priv->ov, msg->ts);
+
+		netif_rx(skb);
+	} else {
+		stats->rx_dropped++;
+	}
+
+	if (new_state == CAN_STATE_BUS_OFF) {
+		acc_write32(core, ACC_CORE_OF_TX_ABORT_MASK, 0xffff);
+		can_bus_off(core->netdev);
+	}
+}
+
+static void handle_core_interrupt(struct acc_core *core)
+{
+	u32 msg_fifo_head = core->bmfifo.local_irq_cnt & 0xff;
+
+	while (core->bmfifo.msg_fifo_tail != msg_fifo_head) {
+		const union acc_bmmsg *msg =
+			&core->bmfifo.messages[core->bmfifo.msg_fifo_tail];
+
+		switch (msg->msg_id) {
+		case BM_MSG_ID_RXTXDONE:
+			handle_core_msg_rxtxdone(core, &msg->rxtxdone);
+			break;
+
+		case BM_MSG_ID_TXABORT:
+			handle_core_msg_txabort(core, &msg->txabort);
+			break;
+
+		case BM_MSG_ID_OVERRUN:
+			handle_core_msg_overrun(core, &msg->overrun);
+			break;
+
+		case BM_MSG_ID_BUSERR:
+			handle_core_msg_buserr(core, &msg->buserr);
+			break;
+
+		case BM_MSG_ID_ERRPASSIVE:
+		case BM_MSG_ID_ERRWARN:
+			handle_core_msg_errstatechange(core,
+						       &msg->errstatechange);
+			break;
+
+		default:
+			/* Ignore all other BM messages (like the CAN-FD messages) */
+			break;
+		}
+
+		core->bmfifo.msg_fifo_tail =
+				(core->bmfifo.msg_fifo_tail + 1) & 0xff;
+	}
+}
+
+/**
+ * acc_card_interrupt() - handle the interrupts of an esdACC FPGA
+ *
+ * @ov: overview module structure
+ * @cores: array of core structures
+ *
+ * This function handles all interrupts pending for the overview module and the
+ * CAN cores of the esdACC FPGA.
+ *
+ * It examines for all cores (the overview module core and the CAN cores)
+ * the bmfifo.irq_cnt and compares it with the previously saved
+ * bmfifo.local_irq_cnt. An IRQ is pending if they differ. The esdACC FPGA
+ * updates the bmfifo.irq_cnt values by DMA.
+ *
+ * The pending interrupts are masked by writing to the IRQ mask register at
+ * ACC_OV_OF_BM_IRQ_MASK. This register has for each core a two bit command
+ * field evaluated as follows:
+ *
+ * Define,   bit pattern: meaning
+ *                    00: no action
+ * ACC_BM_IRQ_UNMASK, 01: unmask interrupt
+ * ACC_BM_IRQ_MASK,   10: mask interrupt
+ *                    11: no action
+ *
+ * For each CAN core with a pending IRQ handle_core_interrupt() handles all
+ * busmaster messages from the message FIFO. The last handled message (FIFO
+ * index) is written to the CAN core to acknowledge its handling.
+ *
+ * Last step is to unmask all interrupts in the FPGA using
+ * ACC_BM_IRQ_UNMASK_ALL.
+ */
+irqreturn_t acc_card_interrupt(struct acc_ov *ov, struct acc_core *cores)
+{
+	u32 irqmask;
+	int i;
+
+	/* First we look for whom interrupts are pending, card/overview
+	 * or any of the cores. Two bits in irqmask are used for each;
+	 * Each two bit field is set to ACC_BM_IRQ_MASK if an IRQ is
+	 * pending.
+	 */
+	irqmask = 0;
+	if (READ_ONCE(*ov->bmfifo.irq_cnt) != ov->bmfifo.local_irq_cnt) {
+		irqmask |= ACC_BM_IRQ_MASK;
+		ov->bmfifo.local_irq_cnt = READ_ONCE(*ov->bmfifo.irq_cnt);
+	}
+
+	for (i = 0; i < ov->active_cores; i++) {
+		struct acc_core *core = &cores[i];
+
+		if (READ_ONCE(*core->bmfifo.irq_cnt) != core->bmfifo.local_irq_cnt) {
+			irqmask |= (ACC_BM_IRQ_MASK << (2 * (i + 1)));
+			core->bmfifo.local_irq_cnt = READ_ONCE(*core->bmfifo.irq_cnt);
+		}
+	}
+
+	if (!irqmask)
+		return IRQ_NONE;
+
+	/* At second we tell the card we're working on them by writing irqmask,
+	 * call handle_{ov|core}_interrupt and then acknowledge the
+	 * interrupts by writing irq_cnt:
+	 */
+	acc_ov_write32(ov, ACC_OV_OF_BM_IRQ_MASK, irqmask);
+
+	if (irqmask & ACC_BM_IRQ_MASK) {
+		/* handle_ov_interrupt(); - no use yet. */
+		acc_ov_write32(ov, ACC_OV_OF_BM_IRQ_COUNTER,
+			       ov->bmfifo.local_irq_cnt);
+	}
+
+	for (i = 0; i < ov->active_cores; i++) {
+		struct acc_core *core = &cores[i];
+
+		if (irqmask & (ACC_BM_IRQ_MASK << (2 * (i + 1)))) {
+			handle_core_interrupt(core);
+			acc_write32(core, ACC_OV_OF_BM_IRQ_COUNTER,
+				    core->bmfifo.local_irq_cnt);
+		}
+	}
+
+	acc_ov_write32(ov, ACC_OV_OF_BM_IRQ_MASK, ACC_BM_IRQ_UNMASK_ALL);
+
+	return IRQ_HANDLED;
+}
diff --git a/drivers/net/can/esd/esdacc.h b/drivers/net/can/esd/esdacc.h
new file mode 100644
index 000000000000..73651bc1d52c
--- /dev/null
+++ b/drivers/net/can/esd/esdacc.h
@@ -0,0 +1,393 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (C) 2015 - 2016 Thomas Körper, esd electronic system design gmbh
+ * Copyright (C) 2017 - 2022 Stefan Mätje, esd electronics gmbh
+ */
+
+#include <linux/bits.h>
+#include <linux/can/dev.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/units.h>
+
+#define ACC_TS_FREQ_80MHZ			(80 * HZ_PER_MHZ)
+
+#define ACC_CAN_EFF_FLAG			0x20000000
+#define ACC_CAN_RTR_FLAG			0x10
+#define ACC_CAN_DLC_MASK			0x0f
+
+#define ACC_OV_OF_PROBE				0x0000
+#define ACC_OV_OF_VERSION			0x0004
+#define ACC_OV_OF_INFO				0x0008
+#define ACC_OV_OF_CANCORE_FREQ			0x000c
+#define ACC_OV_OF_TS_FREQ_LO			0x0010
+#define ACC_OV_OF_TS_FREQ_HI			0x0014
+#define ACC_OV_OF_IRQ_STATUS_CORES		0x0018
+#define ACC_OV_OF_TS_CURR_LO			0x001c
+#define ACC_OV_OF_TS_CURR_HI			0x0020
+#define ACC_OV_OF_IRQ_STATUS			0x0028
+#define ACC_OV_OF_MODE				0x002c
+#define ACC_OV_OF_BM_IRQ_COUNTER		0x0070
+#define ACC_OV_OF_BM_IRQ_MASK			0x0074
+#define ACC_OV_OF_MSI_DATA			0x0080
+#define ACC_OV_OF_MSI_ADDRESSOFFSET		0x0084
+
+/* Feature flags are contained in the upper 16 bit of the version
+ * register at ACC_OV_OF_VERSION but only used with these masks after
+ * extraction into an extra variable => (xx - 16).
+ */
+#define ACC_OV_REG_FEAT_IDX_CANFD		(27 - 16)
+#define ACC_OV_REG_FEAT_IDX_NEW_PSC		(28 - 16)
+#define ACC_OV_REG_FEAT_MASK_CANFD		BIT(ACC_OV_REG_FEAT_IDX_CANFD)
+#define ACC_OV_REG_FEAT_MASK_NEW_PSC		BIT(ACC_OV_REG_FEAT_IDX_NEW_PSC)
+
+#define ACC_OV_REG_MODE_MASK_ENDIAN_LITTLE	0x00000001
+#define ACC_OV_REG_MODE_MASK_BM_ENABLE		0x00000002
+#define ACC_OV_REG_MODE_MASK_MODE_LED		0x00000004
+#define ACC_OV_REG_MODE_MASK_TIMER		0x00000070
+#define ACC_OV_REG_MODE_MASK_TIMER_ENABLE	0x00000010
+#define ACC_OV_REG_MODE_MASK_TIMER_ONE_SHOT	0x00000020
+#define ACC_OV_REG_MODE_MASK_TIMER_ABSOLUTE	0x00000040
+#define ACC_OV_REG_MODE_MASK_TS_SRC		0x00000180
+#define ACC_OV_REG_MODE_MASK_I2C_ENABLE		0x00000800
+#define ACC_OV_REG_MODE_MASK_MSI_ENABLE		0x00004000
+#define ACC_OV_REG_MODE_MASK_NEW_PSC_ENABLE	0x00008000
+#define ACC_OV_REG_MODE_MASK_FPGA_RESET		0x80000000
+
+#define ACC_CORE_OF_CTRL_MODE			0x0000
+#define ACC_CORE_OF_STATUS_IRQ			0x0008
+#define ACC_CORE_OF_BRP				0x000c
+#define ACC_CORE_OF_BTR				0x0010
+#define ACC_CORE_OF_FBTR			0x0014
+#define ACC_CORE_OF_STATUS			0x0030
+#define ACC_CORE_OF_TXFIFO_CONFIG		0x0048
+#define ACC_CORE_OF_TXFIFO_STATUS		0x004c
+#define ACC_CORE_OF_TX_STATUS_IRQ		0x0050
+#define ACC_CORE_OF_TX_ABORT_MASK		0x0054
+#define ACC_CORE_OF_BM_IRQ_COUNTER		0x0070
+#define ACC_CORE_OF_TXFIFO_ID			0x00c0
+#define ACC_CORE_OF_TXFIFO_DLC			0x00c4
+#define ACC_CORE_OF_TXFIFO_DATA_0		0x00c8
+#define ACC_CORE_OF_TXFIFO_DATA_1		0x00cc
+
+#define ACC_REG_CONTROL_IDX_MODE_RESETMODE	0
+#define ACC_REG_CONTROL_IDX_MODE_LOM		1
+#define ACC_REG_CONTROL_IDX_MODE_STM		2
+#define ACC_REG_CONTROL_IDX_MODE_TRANSEN	5
+#define ACC_REG_CONTROL_IDX_MODE_TS		6
+#define ACC_REG_CONTROL_IDX_MODE_SCHEDULE	7
+#define ACC_REG_CONTROL_MASK_MODE_RESETMODE	\
+				BIT(ACC_REG_CONTROL_IDX_MODE_RESETMODE)
+#define ACC_REG_CONTROL_MASK_MODE_LOM		\
+				BIT(ACC_REG_CONTROL_IDX_MODE_LOM)
+#define ACC_REG_CONTROL_MASK_MODE_STM		\
+				BIT(ACC_REG_CONTROL_IDX_MODE_STM)
+#define ACC_REG_CONTROL_MASK_MODE_TRANSEN	\
+				BIT(ACC_REG_CONTROL_IDX_MODE_TRANSEN)
+#define ACC_REG_CONTROL_MASK_MODE_TS		\
+				BIT(ACC_REG_CONTROL_IDX_MODE_TS)
+#define ACC_REG_CONTROL_MASK_MODE_SCHEDULE	\
+				BIT(ACC_REG_CONTROL_IDX_MODE_SCHEDULE)
+
+#define ACC_REG_CONTROL_IDX_IE_RXTX	8
+#define ACC_REG_CONTROL_IDX_IE_TXERROR	9
+#define ACC_REG_CONTROL_IDX_IE_ERRWARN	10
+#define ACC_REG_CONTROL_IDX_IE_OVERRUN	11
+#define ACC_REG_CONTROL_IDX_IE_TSI	12
+#define ACC_REG_CONTROL_IDX_IE_ERRPASS	13
+#define ACC_REG_CONTROL_IDX_IE_BUSERR	15
+#define ACC_REG_CONTROL_MASK_IE_RXTX	BIT(ACC_REG_CONTROL_IDX_IE_RXTX)
+#define ACC_REG_CONTROL_MASK_IE_TXERROR BIT(ACC_REG_CONTROL_IDX_IE_TXERROR)
+#define ACC_REG_CONTROL_MASK_IE_ERRWARN BIT(ACC_REG_CONTROL_IDX_IE_ERRWARN)
+#define ACC_REG_CONTROL_MASK_IE_OVERRUN BIT(ACC_REG_CONTROL_IDX_IE_OVERRUN)
+#define ACC_REG_CONTROL_MASK_IE_TSI	BIT(ACC_REG_CONTROL_IDX_IE_TSI)
+#define ACC_REG_CONTROL_MASK_IE_ERRPASS BIT(ACC_REG_CONTROL_IDX_IE_ERRPASS)
+#define ACC_REG_CONTROL_MASK_IE_BUSERR	BIT(ACC_REG_CONTROL_IDX_IE_BUSERR)
+
+/* BRP and BTR register layout for CAN-Classic version */
+#define ACC_REG_BRP_CL_MASK_BRP         GENMASK(8, 0)
+#define ACC_REG_BTR_CL_MASK_TSEG1       GENMASK(3, 0)
+#define ACC_REG_BTR_CL_MASK_TSEG2       GENMASK(18, 16)
+#define ACC_REG_BTR_CL_MASK_SJW         GENMASK(25, 24)
+
+/* BRP and BTR register layout for CAN-FD version */
+#define ACC_REG_BRP_FD_MASK_BRP         GENMASK(7, 0)
+#define ACC_REG_BTR_FD_MASK_TSEG1       GENMASK(7, 0)
+#define ACC_REG_BTR_FD_MASK_TSEG2       GENMASK(22, 16)
+#define ACC_REG_BTR_FD_MASK_SJW         GENMASK(30, 24)
+
+/* 256 BM_MSGs of 32 byte size */
+#define ACC_CORE_DMAMSG_SIZE		32U
+#define ACC_CORE_DMABUF_SIZE		(256U * ACC_CORE_DMAMSG_SIZE)
+
+enum acc_bmmsg_id {
+	BM_MSG_ID_RXTXDONE = 0x01,
+	BM_MSG_ID_TXABORT = 0x02,
+	BM_MSG_ID_OVERRUN = 0x03,
+	BM_MSG_ID_BUSERR = 0x04,
+	BM_MSG_ID_ERRPASSIVE = 0x05,
+	BM_MSG_ID_ERRWARN = 0x06,
+	BM_MSG_ID_TIMESLICE = 0x07,
+	BM_MSG_ID_HWTIMER = 0x08,
+	BM_MSG_ID_HOTPLUG = 0x09,
+};
+
+/* The struct acc_bmmsg_* structure declarations that follow here provide
+ * access to the ring buffer of bus master messages maintained by the FPGA
+ * bus master engine. All bus master messages have the same size of
+ * ACC_CORE_DMAMSG_SIZE and a minimum alignment of ACC_CORE_DMAMSG_SIZE in
+ * memory.
+ *
+ * All structure members are natural aligned. Therefore we should not need
+ * a __packed attribute. All struct acc_bmmsg_* declarations have at least
+ * reserved* members to fill the structure to the full ACC_CORE_DMAMSG_SIZE.
+ *
+ * A failure of this property due padding will be detected at compile time
+ * by static_assert(sizeof(union acc_bmmsg) == ACC_CORE_DMAMSG_SIZE).
+ */
+
+struct acc_bmmsg_rxtxdone {
+	u8 msg_id;
+	u8 txfifo_level;
+	u8 reserved1[2];
+	u8 txtsfifo_level;
+	u8 reserved2[3];
+	u32 id;
+	union {
+		struct {
+			u8 len;
+			u8 reserved0;
+			u8 bits;
+			u8 state;
+		} rxtx;
+		struct {
+			u8 len;
+			u8 msg_lost;
+			u8 bits;
+			u8 state;
+		} rx;
+		struct {
+			u8 len;
+			u8 txfifo_idx;
+			u8 bits;
+			u8 state;
+		} tx;
+	} dlc;
+	u8 data[8];
+	/* Time stamps in struct acc_ov::timestamp_frequency ticks. */
+	u64 ts;
+};
+
+struct acc_bmmsg_txabort {
+	u8 msg_id;
+	u8 txfifo_level;
+	u16 abort_mask;
+	u8 txtsfifo_level;
+	u8 reserved2[1];
+	u16 abort_mask_txts;
+	u64 ts;
+	u32 reserved3[4];
+};
+
+struct acc_bmmsg_overrun {
+	u8 msg_id;
+	u8 txfifo_level;
+	u8 lost_cnt;
+	u8 reserved1;
+	u8 txtsfifo_level;
+	u8 reserved2[3];
+	u64 ts;
+	u32 reserved3[4];
+};
+
+struct acc_bmmsg_buserr {
+	u8 msg_id;
+	u8 txfifo_level;
+	u8 ecc;
+	u8 reserved1;
+	u8 txtsfifo_level;
+	u8 reserved2[3];
+	u64 ts;
+	u32 reg_status;
+	u32 reg_btr;
+	u32 reserved3[2];
+};
+
+struct acc_bmmsg_errstatechange {
+	u8 msg_id;
+	u8 txfifo_level;
+	u8 reserved1[2];
+	u8 txtsfifo_level;
+	u8 reserved2[3];
+	u64 ts;
+	u32 reg_status;
+	u32 reserved3[3];
+};
+
+struct acc_bmmsg_timeslice {
+	u8 msg_id;
+	u8 txfifo_level;
+	u8 reserved1[2];
+	u8 txtsfifo_level;
+	u8 reserved2[3];
+	u64 ts;
+	u32 reserved3[4];
+};
+
+struct acc_bmmsg_hwtimer {
+	u8 msg_id;
+	u8 reserved1[3];
+	u32 reserved2[1];
+	u64 timer;
+	u32 reserved3[4];
+};
+
+struct acc_bmmsg_hotplug {
+	u8 msg_id;
+	u8 reserved1[3];
+	u32 reserved2[7];
+};
+
+union acc_bmmsg {
+	u8 msg_id;
+	struct acc_bmmsg_rxtxdone rxtxdone;
+	struct acc_bmmsg_txabort txabort;
+	struct acc_bmmsg_overrun overrun;
+	struct acc_bmmsg_buserr buserr;
+	struct acc_bmmsg_errstatechange errstatechange;
+	struct acc_bmmsg_timeslice timeslice;
+	struct acc_bmmsg_hwtimer hwtimer;
+};
+
+/* Check size of union acc_bmmsg to be of expected size. */
+static_assert(sizeof(union acc_bmmsg) == ACC_CORE_DMAMSG_SIZE);
+
+struct acc_bmfifo {
+	const union acc_bmmsg *messages;
+	/* irq_cnt points to an u32 value where the ESDACC FPGA deposits
+	 * the bm_fifo head index in coherent DMA memory. Only bits 7..0
+	 * are valid. Use READ_ONCE() to access this memory location.
+	 */
+	const u32 *irq_cnt;
+	u32 local_irq_cnt;
+	u32 msg_fifo_tail;
+};
+
+struct acc_core {
+	void __iomem *addr;
+	struct net_device *netdev;
+	struct acc_bmfifo bmfifo;
+	u8 tx_fifo_size;
+	u8 tx_fifo_head;
+	u8 tx_fifo_tail;
+};
+
+struct acc_ov {
+	void __iomem *addr;
+	struct acc_bmfifo bmfifo;
+	u32 timestamp_frequency;
+	u32 core_frequency;
+	u16 version;
+	u16 features;
+	u8 total_cores;
+	u8 active_cores;
+};
+
+struct acc_net_priv {
+	struct can_priv can; /* must be the first member! */
+	struct acc_core *core;
+	struct acc_ov *ov;
+};
+
+static inline u32 acc_read32(struct acc_core *core, unsigned short offs)
+{
+	return ioread32be(core->addr + offs);
+}
+
+static inline void acc_write32(struct acc_core *core,
+			       unsigned short offs, u32 v)
+{
+	iowrite32be(v, core->addr + offs);
+}
+
+static inline void acc_write32_noswap(struct acc_core *core,
+				      unsigned short offs, u32 v)
+{
+	iowrite32(v, core->addr + offs);
+}
+
+static inline void acc_set_bits(struct acc_core *core,
+				unsigned short offs, u32 mask)
+{
+	u32 v = acc_read32(core, offs);
+
+	v |= mask;
+	acc_write32(core, offs, v);
+}
+
+static inline void acc_clear_bits(struct acc_core *core,
+				  unsigned short offs, u32 mask)
+{
+	u32 v = acc_read32(core, offs);
+
+	v &= ~mask;
+	acc_write32(core, offs, v);
+}
+
+static inline int acc_resetmode_entered(struct acc_core *core)
+{
+	u32 ctrl = acc_read32(core, ACC_CORE_OF_CTRL_MODE);
+
+	return (ctrl & ACC_REG_CONTROL_MASK_MODE_RESETMODE) != 0;
+}
+
+static inline u32 acc_ov_read32(struct acc_ov *ov, unsigned short offs)
+{
+	return ioread32be(ov->addr + offs);
+}
+
+static inline void acc_ov_write32(struct acc_ov *ov,
+				  unsigned short offs, u32 v)
+{
+	iowrite32be(v, ov->addr + offs);
+}
+
+static inline void acc_ov_set_bits(struct acc_ov *ov,
+				   unsigned short offs, u32 b)
+{
+	u32 v = acc_ov_read32(ov, offs);
+
+	v |= b;
+	acc_ov_write32(ov, offs, v);
+}
+
+static inline void acc_ov_clear_bits(struct acc_ov *ov,
+				     unsigned short offs, u32 b)
+{
+	u32 v = acc_ov_read32(ov, offs);
+
+	v &= ~b;
+	acc_ov_write32(ov, offs, v);
+}
+
+static inline void acc_reset_fpga(struct acc_ov *ov)
+{
+	acc_ov_write32(ov, ACC_OV_OF_MODE, ACC_OV_REG_MODE_MASK_FPGA_RESET);
+
+	/* Also reset I^2C, to re-detect card addons at every driver start: */
+	acc_ov_clear_bits(ov, ACC_OV_OF_MODE, ACC_OV_REG_MODE_MASK_I2C_ENABLE);
+	mdelay(2);
+	acc_ov_set_bits(ov, ACC_OV_OF_MODE, ACC_OV_REG_MODE_MASK_I2C_ENABLE);
+	mdelay(10);
+}
+
+void acc_init_ov(struct acc_ov *ov, struct device *dev);
+void acc_init_bm_ptr(struct acc_ov *ov, struct acc_core *cores,
+		     const void *mem);
+int acc_open(struct net_device *netdev);
+int acc_close(struct net_device *netdev);
+netdev_tx_t acc_start_xmit(struct sk_buff *skb, struct net_device *netdev);
+int acc_get_berr_counter(const struct net_device *netdev,
+			 struct can_berr_counter *bec);
+int acc_set_mode(struct net_device *netdev, enum can_mode mode);
+int acc_set_bittiming(struct net_device *netdev);
+irqreturn_t acc_card_interrupt(struct acc_ov *ov, struct acc_core *cores);
-- 
2.34.1


^ permalink raw reply related

* [PATCH v9 0/2] can: esd: add support for esd GmbH PCIe/402 CAN interface
From: Stefan Mätje @ 2023-11-07 18:41 UTC (permalink / raw)
  To: Marc Kleine-Budde, linux-can, netdev, linux-kernel
  Cc: Wolfgang Grandegger, David S . Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni

This patch picks up the work from last year. Any comments on needed or
recommended changes are appreciated.

The purpose of this patch is to introduce a new CAN driver to support
the esd GmbH 402 family of CAN interface boards. The hardware design
is based on a CAN controller implemented in a FPGA attached to a
PCIe link.

More information on these boards can be found following the links
included in the commit message.

This patch supports all boards but will operate the CAN-FD capable
boards only in Classic-CAN mode. The CAN-FD support will be added
when the initial patch has stabilized.

The patch is reuses the previous work of my former colleague:
Link: https://lore.kernel.org/linux-can/1426592308-23817-1-git-send-email-thomas.koerper@esd.eu/


The patch is based on the linux-can-next main branch.

Changes in v9:
  - Fix returning success error code in case of allocation failure in
    pci402_probe().

Changes in v8:
  - Rebased to 6.6-rc2 on linux-can-next branch main

Changes in v7:
  - Numerous changes. Find the quoted with inline comments about changes
    below after the changes list. Stuff that I don't understand and
    where I have questions is marked with ????.
    Unfortunately I will be AFK till 28th of November.

Changes in v6:
  - Fixed the statistic handling of RX overrun errors and increase 
    net_device_stats::rx_errors instead of net_device_stats::rx_dropped.
  - Added a patch to not increase rx statistics when generating a CAN
    rx error message frame as suggested on the linux-can list.
  - Added a patch to not not increase rx_bytes statistics for RTR frames
    as suggested on the linux-can list.

    The last two patches change the statistics handling from the previous
    style used in other drivers to the newly suggested one.

Changes in v5:
  - Added the initialization for netdev::dev_port as it is implemented
    for another CAN driver. See
    https://lore.kernel.org/linux-can/20211026180553.1953189-1-mailhol.vincent@wanadoo.fr/

Changes in v4:
  - Fixed the build failure on ARCH=arm64 that was found by the Intel
    kernel test robot. See
    https://lore.kernel.org/linux-can/202109120608.7ZbQXkRh-lkp@intel.com/

    Removed error monitoring code that used GCC's built-in compiler
    functions for atomic access (__sync_* functions). GCC versions
    after 9 (tested with "gcc-10 (Ubuntu 10.3.0-1ubuntu1~20.04)")
    don't implement the intrinsic atomic as in-line code but call
    "__aarch64_ldadd4_acq_rel" on arm64. This GCC support function
    is not exported by the kernel and therefore the module build
    post-processing fails.

    Removed that code because the error monitoring never showed a
    problem during the development this year.


Changes in v3:
  - Rework the bus-off restart logic in acc_set_mode() and
    handle_core_msg_errstatechange() to call netif_wake_queue() from the
    error active event.
  - Changed pci402_init_card() to allocate a variable sized array of
    struct acc_core using devm_kcalloc() instead of using a fixed size
    array in struct pci402_card.
  - Changed handle_core_msg_txabort() to release aborted TX frames in
    TX FIFO order.
  - Fixed the acc_close() function to abort all pending TX request in
    esdACC controller.
  - Fixed counting of transmit aborts in handle_core_msg_txabort().
    It is now done like in can_flush_echo_skb().
  - Fixed handle_core_msg_buserr() to create error frames including the
    CAN RX and TX error counters that were missing.
  - Fixed acc_set_bittiming() neither to touch LOM mode setting of
    esdACC controller nor to enter or leave RESET mode.
    The esdACC controller is going active on the CAN bus in acc_open()
    and is going inactive (RESET mode) again in acc_close().
  - Rely on the automatic release of memory fetched by devm_kzalloc().
    But still use devm_irq_free() explicitely to make sure that the
    interrupt handler is disconnected at that point.
    This avoids a possible crash in non-MSI mode due to the IRQ
    triggered by another device on the same PCI IRQ line.
  - Changed to use DMA map API instead of pci_*_consistent compatibility
    wrappers.
  - Fixed stale email references and updated copyright information.
  - Removed any traces of future CAN-FD support.


Changes in v2:
  - Avoid warning triggered by -Wshift-count-overflow on architectures
    with 32-bit dma_addr_t.
  - Fixed Makefile not to build the kernel module always. Doing this
    renamed esd402_pci.c to esd_402_pci-core.c as recommended by Marc.

previous versions:
v1 - https://lore.kernel.org/linux-can/20210728203647.15240-1-Stefan.Maetje@esd.eu/
v2 - https://lore.kernel.org/linux-can/20210730173805.3926-1-Stefan.Maetje@esd.eu/
v3 - https://lore.kernel.org/linux-can/20210908164640.23243-1-stefan.maetje@esd.eu/
v4 - https://lore.kernel.org/linux-can/20210916172152.5127-1-stefan.maetje@esd.eu/
v5 - https://lore.kernel.org/linux-can/20211109155326.2608822-1-stefan.maetje@esd.eu/
v6 - https://lore.kernel.org/linux-can/20211201220328.3079270-1-stefan.maetje@esd.eu/
v7 - https://lore.kernel.org/linux-can/20221106224156.3619334-1-stefan.maetje@esd.eu/
v8 - https://lore.kernel.org/linux-can/20231025141635.1459606-1-stefan.maetje@esd.eu/



Stefan Mätje (2):
  MAINTAINERS: add Stefan Mätje as maintainer for the esd electronics
    GmbH PCIe/402 CAN drivers
  can: esd: add support for esd GmbH PCIe/402 CAN interface family

 MAINTAINERS                            |   7 +
 drivers/net/can/Kconfig                |   1 +
 drivers/net/can/Makefile               |   1 +
 drivers/net/can/esd/Kconfig            |  12 +
 drivers/net/can/esd/Makefile           |   7 +
 drivers/net/can/esd/esd_402_pci-core.c | 512 ++++++++++++++++
 drivers/net/can/esd/esdacc.c           | 771 +++++++++++++++++++++++++
 drivers/net/can/esd/esdacc.h           | 393 +++++++++++++
 8 files changed, 1704 insertions(+)
 create mode 100644 drivers/net/can/esd/Kconfig
 create mode 100644 drivers/net/can/esd/Makefile
 create mode 100644 drivers/net/can/esd/esd_402_pci-core.c
 create mode 100644 drivers/net/can/esd/esdacc.c
 create mode 100644 drivers/net/can/esd/esdacc.h


base-commit: 93e7eca853ca0087b129433630ddd89288d2b8b4
-- 
2.34.1


^ permalink raw reply

* [PATCH v9 1/2] MAINTAINERS: add Stefan Mätje as maintainer for the esd electronics GmbH PCIe/402 CAN drivers
From: Stefan Mätje @ 2023-11-07 18:41 UTC (permalink / raw)
  To: Marc Kleine-Budde, linux-can, netdev, linux-kernel
  Cc: Wolfgang Grandegger, David S . Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni
In-Reply-To: <20231107184103.2802678-1-stefan.maetje@esd.eu>

Adding myself (Stefan Mätje) as a maintainer for the upcoming driver of
the PCIe/402 interface card family.

Signed-off-by: Stefan Mätje <stefan.maetje@esd.eu>
---
 MAINTAINERS | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 03011d7ee087..7db1bd399822 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7753,6 +7753,13 @@ L:	linux-can@vger.kernel.org
 S:	Maintained
 F:	drivers/net/can/usb/esd_usb.c
 
+ESD CAN NETWORK DRIVERS
+M:	Stefan Mätje <stefan.maetje@esd.eu>
+R:	socketcan@esd.eu
+L:	linux-can@vger.kernel.org
+S:	Maintained
+F:	drivers/net/can/esd/
+
 ET131X NETWORK DRIVER
 M:	Mark Einon <mark.einon@gmail.com>
 S:	Odd Fixes
-- 
2.34.1


^ permalink raw reply related

* Re: [RFC PATCH v3 09/12] net: add support for skbs with unreadable frags
From: Stanislav Fomichev @ 2023-11-07 18:14 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Mina Almasry, David Ahern, netdev, linux-kernel, linux-arch,
	linux-kselftest, linux-media, dri-devel, linaro-mm-sig,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Jesper Dangaard Brouer, Ilias Apalodimas, Arnd Bergmann,
	Shuah Khan, Sumit Semwal, Christian König, Shakeel Butt,
	Jeroen de Borst, Praveen Kaligineedi, Willem de Bruijn,
	Kaiyuan Zhang
In-Reply-To: <CAF=yD-Ltd0REhOS78q_t8bSEpefQsZuJV_Aq7zxXmFDh+BmJhg@mail.gmail.com>

On 11/07, Willem de Bruijn wrote:
> On Tue, Nov 7, 2023 at 12:44 PM Stanislav Fomichev <sdf@google.com> wrote:
> >
> > On 11/06, Willem de Bruijn wrote:
> > > > > > > I think my other issue with MSG_SOCK_DEVMEM being on recvmsg is that
> > > > > > > it somehow implies that I have an option of passing or not passing it
> > > > > > > for an individual system call.
> > > > > > > If we know that we're going to use dmabuf with the socket, maybe we
> > > > > > > should move this flag to the socket() syscall?
> > > > > > >
> > > > > > > fd = socket(AF_INET6, SOCK_STREAM, SOCK_DEVMEM);
> > > > > > >
> > > > > > > ?
> > > > > >
> > > > > > I think it should then be a setsockopt called before any data is
> > > > > > exchanged, with no change of modifying mode later. We generally use
> > > > > > setsockopts for the mode of a socket. This use of the protocol field
> > > > > > in socket() for setting a mode would be novel. Also, it might miss
> > > > > > passively opened connections, or be overly restrictive: one approach
> > > > > > for all accepted child sockets.
> > > > >
> > > > > I was thinking this is similar to SOCK_CLOEXEC or SOCK_NONBLOCK? There
> > > > > are plenty of bits we can grab. But setsockopt works as well!
> > > >
> > > > To follow up: if we have this flag on a socket, not on a per-message
> > > > basis, can we also use recvmsg for the recycling part maybe?
> > > >
> > > > while (true) {
> > > >         memset(msg, 0, ...);
> > > >
> > > >         /* receive the tokens */
> > > >         ret = recvmsg(fd, &msg, 0);
> > > >
> > > >         /* recycle the tokens from the above recvmsg() */
> > > >         ret = recvmsg(fd, &msg, MSG_RECYCLE);
> > > > }
> > > >
> > > > recvmsg + MSG_RECYCLE can parse the same format that regular recvmsg
> > > > exports (SO_DEVMEM_OFFSET) and we can also add extra cmsg option
> > > > to recycle a range.
> > > >
> > > > Will this be more straightforward than a setsockopt(SO_DEVMEM_DONTNEED)?
> > > > Or is it more confusing?
> > >
> > > It would have to be sendmsg, as recvmsg is a copy_to_user operation.
> > >
> > >
> > > I am not aware of any precedent in multiplexing the data stream and a
> > > control operation stream in this manner. It would also require adding
> > > a branch in the sendmsg hot path.
> >
> > Is it too much plumbing to copy_from_user msg_control deep in recvmsg
> > stack where we need it? Mixing in sendmsg is indeed ugly :-(
> 
> I tried exactly the inverse of that when originally adding
> MSG_ZEROCOPY: to allow piggy-backing zerocopy completion notifications
> on sendmsg calls by writing to sendmsg msg_control on return to user.
> It required significant code churn, which the performance gains did
> not warrant. Doing so also breaks the simple rule that recv is for
> reading and send is for writing.

We're breaking so many rules here, so not sure we should be super
constrained :-D

> > Regarding hot patch: aren't we already doing copy_to_user for the tokens in
> > this hot path, so having one extra condition shouldn't hurt too much?
> 
> We're doing that in the optional cmsg handling of recvmsg, which is
> already a slow path (compared to the data read() itself).
> 
> > > The memory is associated with the socket, freed when the socket is
> > > closed as well as on SO_DEVMEM_DONTNEED. Fundamentally it is a socket
> > > state operation, for which setsockopt is the socket interface.
> > >
> > > Is your request purely a dislike, or is there some technical concern
> > > with BPF and setsockopt?
> >
> > It's mostly because I've been bitten too much by custom socket options that
> > are not really on/off/update-value operations:
> >
> > 29ebbba7d461 - bpf: Don't EFAULT for {g,s}setsockopt with wrong optlen
> > 00e74ae08638 - bpf: Don't EFAULT for getsockopt with optval=NULL
> > 9cacf81f8161 - bpf: Remove extra lock_sock for TCP_ZEROCOPY_RECEIVE
> > d8fe449a9c51 - bpf: Don't return EINVAL from {get,set}sockopt when optlen > PAGE_SIZE
> >
> > I do agree that this particular case of SO_DEVMEM_DONTNEED seems ok, but
> > things tend to evolve and change.
> 
> I see. I'm a bit concerned if we start limiting what we can do in
> sockets because of dependencies that BPF processing places on them.
> The use case for BPF [gs]etsockopt is limited to specific control mode
> calls. Would it make sense to just exclude calls like
> SO_DEVMEM_DONTNEED from this interpositioning?

Yup, that's why I'm asking. We already have ->bpf_bypass_getsockopt()
to special-case tcp zerocopy. We might add another bpf_bypass_setsockopt
to special case SO_DEVMEM_DONTNEED. That's why I'm trying to see if
there is a better alternative.

> At a high level what we really want is a high rate metadata path from
> user to kernel. And there are no perfect solutions. From kernel to
> user we use the socket error queue for this. That was never intended
> for high event rate itself, dealing with ICMP errors and the like
> before timestamps and zerocopy notifications were added.
>
> If I squint hard enough I can see some prior art in mixing data and
> high rate state changes within the same channel in NIC descriptor
> queues, where some devices do this, e.g.,  { "insert encryption key",
> "send packet" }. But fundamentally I think we should keep the socket
> queues for data only.

+1, we keep taking an easy route with using sockopt for this :-(

Anyway, let's see if any better suggestions pop up. Worst case - we stick
with a socket option and will add a bypass on the bpf side.

^ permalink raw reply

* Re: [PATCH net-next v2 00/21] virtio-net: support AF_XDP zero copy
From: Jakub Kicinski @ 2023-11-07 18:01 UTC (permalink / raw)
  To: Xuan Zhuo
  Cc: netdev, David S. Miller, Eric Dumazet, Paolo Abeni,
	Michael S. Tsirkin, Jason Wang, Alexei Starovoitov,
	Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
	virtualization, bpf
In-Reply-To: <20231107031227.100015-1-xuanzhuo@linux.alibaba.com>

On Tue,  7 Nov 2023 11:12:06 +0800 Xuan Zhuo wrote:
> Please review.

## Form letter - net-next-closed

The merge window for v6.7 has begun and we have already posted our pull
request. Therefore net-next is closed for new drivers, features, code
refactoring and optimizations. We are currently accepting bug fixes only.

Please repost when net-next reopens after Nov 12th.

RFC patches sent for review only are obviously welcome at any time.

See: https://www.kernel.org/doc/html/next/process/maintainer-netdev.html#development-cycle
-- 
pw-bot: defer

^ permalink raw reply

* Re: [PATCH] Fixes a null pointer dereference in ptp_ioctl
From: Jakub Kicinski @ 2023-11-07 18:00 UTC (permalink / raw)
  To: Yuran Pereira
  Cc: richardcochran, netdev, linux-kernel, linux-kernel-mentees,
	syzbot+8a78ecea7ac1a2ea26e5
In-Reply-To: <DB3PR10MB683554F488A562C8A89286C2E8AAA@DB3PR10MB6835.EURPRD10.PROD.OUTLOOK.COM>

On Mon,  6 Nov 2023 06:18:29 +0530 Yuran Pereira wrote:
> This patch fixes the issue by adding a check for tsevq and
> ensuring ptp_ioctl returns with an error if tsevq is null.
> 
> Dashboard link: https://syzkaller.appspot.com/bug?extid=8a78ecea7ac1a2ea26e5

Just Link:

> Reported-by: syzbot+8a78ecea7ac1a2ea26e5@syzkaller.appspotmail.com
> Fixes: c5a445b1e934 ("ptp: support event queue reader channel masks")
> 

No empty lines between tags.

> Signed-off-by: Yuran Pereira <yuran.pereira@hotmail.com>

When you repost please make sure you CC everyone get_maintainer 
(run on  the patch file, not the paths) points out.
And CC Edward Adam Davis <eadavis@qq.com> since his fixing similar
issues.
-- 
pw-bot: cr

^ permalink raw reply

* Re: [RFC PATCH v3 09/12] net: add support for skbs with unreadable frags
From: Willem de Bruijn @ 2023-11-07 17:57 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: Mina Almasry, David Ahern, netdev, linux-kernel, linux-arch,
	linux-kselftest, linux-media, dri-devel, linaro-mm-sig,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Jesper Dangaard Brouer, Ilias Apalodimas, Arnd Bergmann,
	Shuah Khan, Sumit Semwal, Christian König, Shakeel Butt,
	Jeroen de Borst, Praveen Kaligineedi, Willem de Bruijn,
	Kaiyuan Zhang
In-Reply-To: <ZUp3j2TLNKhPYwch@google.com>

On Tue, Nov 7, 2023 at 12:44 PM Stanislav Fomichev <sdf@google.com> wrote:
>
> On 11/06, Willem de Bruijn wrote:
> > > > > > I think my other issue with MSG_SOCK_DEVMEM being on recvmsg is that
> > > > > > it somehow implies that I have an option of passing or not passing it
> > > > > > for an individual system call.
> > > > > > If we know that we're going to use dmabuf with the socket, maybe we
> > > > > > should move this flag to the socket() syscall?
> > > > > >
> > > > > > fd = socket(AF_INET6, SOCK_STREAM, SOCK_DEVMEM);
> > > > > >
> > > > > > ?
> > > > >
> > > > > I think it should then be a setsockopt called before any data is
> > > > > exchanged, with no change of modifying mode later. We generally use
> > > > > setsockopts for the mode of a socket. This use of the protocol field
> > > > > in socket() for setting a mode would be novel. Also, it might miss
> > > > > passively opened connections, or be overly restrictive: one approach
> > > > > for all accepted child sockets.
> > > >
> > > > I was thinking this is similar to SOCK_CLOEXEC or SOCK_NONBLOCK? There
> > > > are plenty of bits we can grab. But setsockopt works as well!
> > >
> > > To follow up: if we have this flag on a socket, not on a per-message
> > > basis, can we also use recvmsg for the recycling part maybe?
> > >
> > > while (true) {
> > >         memset(msg, 0, ...);
> > >
> > >         /* receive the tokens */
> > >         ret = recvmsg(fd, &msg, 0);
> > >
> > >         /* recycle the tokens from the above recvmsg() */
> > >         ret = recvmsg(fd, &msg, MSG_RECYCLE);
> > > }
> > >
> > > recvmsg + MSG_RECYCLE can parse the same format that regular recvmsg
> > > exports (SO_DEVMEM_OFFSET) and we can also add extra cmsg option
> > > to recycle a range.
> > >
> > > Will this be more straightforward than a setsockopt(SO_DEVMEM_DONTNEED)?
> > > Or is it more confusing?
> >
> > It would have to be sendmsg, as recvmsg is a copy_to_user operation.
> >
> >
> > I am not aware of any precedent in multiplexing the data stream and a
> > control operation stream in this manner. It would also require adding
> > a branch in the sendmsg hot path.
>
> Is it too much plumbing to copy_from_user msg_control deep in recvmsg
> stack where we need it? Mixing in sendmsg is indeed ugly :-(

I tried exactly the inverse of that when originally adding
MSG_ZEROCOPY: to allow piggy-backing zerocopy completion notifications
on sendmsg calls by writing to sendmsg msg_control on return to user.
It required significant code churn, which the performance gains did
not warrant. Doing so also breaks the simple rule that recv is for
reading and send is for writing.

> Regarding hot patch: aren't we already doing copy_to_user for the tokens in
> this hot path, so having one extra condition shouldn't hurt too much?

We're doing that in the optional cmsg handling of recvmsg, which is
already a slow path (compared to the data read() itself).

> > The memory is associated with the socket, freed when the socket is
> > closed as well as on SO_DEVMEM_DONTNEED. Fundamentally it is a socket
> > state operation, for which setsockopt is the socket interface.
> >
> > Is your request purely a dislike, or is there some technical concern
> > with BPF and setsockopt?
>
> It's mostly because I've been bitten too much by custom socket options that
> are not really on/off/update-value operations:
>
> 29ebbba7d461 - bpf: Don't EFAULT for {g,s}setsockopt with wrong optlen
> 00e74ae08638 - bpf: Don't EFAULT for getsockopt with optval=NULL
> 9cacf81f8161 - bpf: Remove extra lock_sock for TCP_ZEROCOPY_RECEIVE
> d8fe449a9c51 - bpf: Don't return EINVAL from {get,set}sockopt when optlen > PAGE_SIZE
>
> I do agree that this particular case of SO_DEVMEM_DONTNEED seems ok, but
> things tend to evolve and change.

I see. I'm a bit concerned if we start limiting what we can do in
sockets because of dependencies that BPF processing places on them.
The use case for BPF [gs]etsockopt is limited to specific control mode
calls. Would it make sense to just exclude calls like
SO_DEVMEM_DONTNEED from this interpositioning?

At a high level what we really want is a high rate metadata path from
user to kernel. And there are no perfect solutions. From kernel to
user we use the socket error queue for this. That was never intended
for high event rate itself, dealing with ICMP errors and the like
before timestamps and zerocopy notifications were added.

If I squint hard enough I can see some prior art in mixing data and
high rate state changes within the same channel in NIC descriptor
queues, where some devices do this, e.g.,  { "insert encryption key",
"send packet" }. But fundamentally I think we should keep the socket
queues for data only.

^ permalink raw reply

* Re: [PATCH net 0/2] r8169: fix DASH deviceis network lost issue
From: Jakub Kicinski @ 2023-11-07 17:53 UTC (permalink / raw)
  To: ChunHao Lin; +Cc: hkallweit1, netdev, nic_swsd
In-Reply-To: <20231106151124.9175-1-hau@realtek.com>

On Mon, 6 Nov 2023 23:11:22 +0800 ChunHao Lin wrote:
> This series are used to fix network lost issue on systems that support
> DASH.

Please use get_maintainer on the patch files to make sure you CC all
relevant people.

^ permalink raw reply

* Re: [PATCH net v2] page_pool: Add myself as page pool reviewer in MAINTAINERS
From: Jakub Kicinski @ 2023-11-07 17:49 UTC (permalink / raw)
  To: Yunsheng Lin
  Cc: davem, pabeni, netdev, linux-kernel, Jesper Dangaard Brouer,
	Ilias Apalodimas
In-Reply-To: <20231107123825.61051-1-linyunsheng@huawei.com>

On Tue, 7 Nov 2023 20:38:24 +0800 Yunsheng Lin wrote:
> I have added frag support for page pool, made some improvement
> for it recently, and reviewed some related patches too.
> 
> So add myself as reviewer so that future patch will be cc'ed
> to my email.

Not sure what to do about this, it feels somewhat wrong to add
as a reviewer someone who seem to not follow our basic process
requirements:

https://www.kernel.org/doc/html/next/process/maintainer-netdev.html

:(

^ permalink raw reply

* Re: [RFC PATCH v3 09/12] net: add support for skbs with unreadable frags
From: Stanislav Fomichev @ 2023-11-07 17:44 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Mina Almasry, David Ahern, netdev, linux-kernel, linux-arch,
	linux-kselftest, linux-media, dri-devel, linaro-mm-sig,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Jesper Dangaard Brouer, Ilias Apalodimas, Arnd Bergmann,
	Shuah Khan, Sumit Semwal, Christian König, Shakeel Butt,
	Jeroen de Borst, Praveen Kaligineedi, Willem de Bruijn,
	Kaiyuan Zhang
In-Reply-To: <CAF=yD-+tZ7xaU0rKWBuVbfdVWptj88Z=Xf4Mqx+zaC-gZ1U1mw@mail.gmail.com>

On 11/06, Willem de Bruijn wrote:
> > > > > I think my other issue with MSG_SOCK_DEVMEM being on recvmsg is that
> > > > > it somehow implies that I have an option of passing or not passing it
> > > > > for an individual system call.
> > > > > If we know that we're going to use dmabuf with the socket, maybe we
> > > > > should move this flag to the socket() syscall?
> > > > >
> > > > > fd = socket(AF_INET6, SOCK_STREAM, SOCK_DEVMEM);
> > > > >
> > > > > ?
> > > >
> > > > I think it should then be a setsockopt called before any data is
> > > > exchanged, with no change of modifying mode later. We generally use
> > > > setsockopts for the mode of a socket. This use of the protocol field
> > > > in socket() for setting a mode would be novel. Also, it might miss
> > > > passively opened connections, or be overly restrictive: one approach
> > > > for all accepted child sockets.
> > >
> > > I was thinking this is similar to SOCK_CLOEXEC or SOCK_NONBLOCK? There
> > > are plenty of bits we can grab. But setsockopt works as well!
> >
> > To follow up: if we have this flag on a socket, not on a per-message
> > basis, can we also use recvmsg for the recycling part maybe?
> >
> > while (true) {
> >         memset(msg, 0, ...);
> >
> >         /* receive the tokens */
> >         ret = recvmsg(fd, &msg, 0);
> >
> >         /* recycle the tokens from the above recvmsg() */
> >         ret = recvmsg(fd, &msg, MSG_RECYCLE);
> > }
> >
> > recvmsg + MSG_RECYCLE can parse the same format that regular recvmsg
> > exports (SO_DEVMEM_OFFSET) and we can also add extra cmsg option
> > to recycle a range.
> >
> > Will this be more straightforward than a setsockopt(SO_DEVMEM_DONTNEED)?
> > Or is it more confusing?
> 
> It would have to be sendmsg, as recvmsg is a copy_to_user operation.
>
>
> I am not aware of any precedent in multiplexing the data stream and a
> control operation stream in this manner. It would also require adding
> a branch in the sendmsg hot path.

Is it too much plumbing to copy_from_user msg_control deep in recvmsg
stack where we need it? Mixing in sendmsg is indeed ugly :-(

Regarding hot patch: aren't we already doing copy_to_user for the tokens in
this hot path, so having one extra condition shouldn't hurt too much?

> The memory is associated with the socket, freed when the socket is
> closed as well as on SO_DEVMEM_DONTNEED. Fundamentally it is a socket
> state operation, for which setsockopt is the socket interface.
> 
> Is your request purely a dislike, or is there some technical concern
> with BPF and setsockopt?

It's mostly because I've been bitten too much by custom socket options that
are not really on/off/update-value operations:

29ebbba7d461 - bpf: Don't EFAULT for {g,s}setsockopt with wrong optlen
00e74ae08638 - bpf: Don't EFAULT for getsockopt with optval=NULL
9cacf81f8161 - bpf: Remove extra lock_sock for TCP_ZEROCOPY_RECEIVE
d8fe449a9c51 - bpf: Don't return EINVAL from {get,set}sockopt when optlen > PAGE_SIZE

I do agree that this particular case of SO_DEVMEM_DONTNEED seems ok, but
things tend to evolve and change.

^ permalink raw reply

* [PATCH net v2] net: phylink: initialize carrier state at creation
From: Klaus Kudielka @ 2023-11-07 17:44 UTC (permalink / raw)
  To: Russell King, Andrew Lunn, Heiner Kallweit
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	netdev, linux-kernel, Klaus Kudielka, stable

Background: Turris Omnia (Armada 385); eth2 (mvneta) connected to SFP bus;
SFP module is present, but no fiber connected, so definitely no carrier.

After booting, eth2 is down, but netdev LED trigger surprisingly reports
link active. Then, after "ip link set eth2 up", the link indicator goes
away - as I would have expected it from the beginning.

It turns out, that the default carrier state after netdev creation is
"carrier ok". Some ethernet drivers explicitly call netif_carrier_off
during probing, others (like mvneta) don't - which explains the current
behaviour: only when the device is brought up, phylink_start calls
netif_carrier_off.

Fix this for all drivers using phylink, by calling netif_carrier_off in
phylink_create.

Fixes: 089381b27abe ("leds: initial support for Turris Omnia LEDs")
Cc: stable@vger.kernel.org
Suggested-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Klaus Kudielka <klaus.kudielka@gmail.com>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
---
v2: clarified fixed drivers; added fixes tag & cc stable

 drivers/net/phy/phylink.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 6712883498..a28da80bde 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -1616,6 +1616,7 @@ struct phylink *phylink_create(struct phylink_config *config,
 	pl->config = config;
 	if (config->type == PHYLINK_NETDEV) {
 		pl->netdev = to_net_dev(config->dev);
+		netif_carrier_off(pl->netdev);
 	} else if (config->type == PHYLINK_DEV) {
 		pl->dev = config->dev;
 	} else {
-- 
2.42.0

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox