Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH bpf-next v4 2/8] xsk: add support for need_wakeup flag in AF_XDP rings
From: Magnus Karlsson @ 2019-08-14  7:27 UTC (permalink / raw)
  To: magnus.karlsson, bjorn.topel, ast, daniel, netdev, brouer,
	maximmi
  Cc: bpf, bruce.richardson, ciara.loftus, jakub.kicinski, xiaolong.ye,
	qi.z.zhang, sridhar.samudrala, kevin.laatz, ilias.apalodimas,
	jonathan.lemon, kiran.patil, axboe, maciej.fijalkowski,
	maciejromanfijalkowski, intel-wired-lan
In-Reply-To: <1565767643-4908-1-git-send-email-magnus.karlsson@intel.com>

This commit adds support for a new flag called need_wakeup in the
AF_XDP Tx and fill rings. When this flag is set, it means that the
application has to explicitly wake up the kernel Rx (for the bit in
the fill ring) or kernel Tx (for the bit in the Tx ring) processing by
issuing a syscall. poll() can wake up both, depending on the flags
submitted, and sendto() will wake up Tx processing only.

The main reason for introducing this new flag is to be able to
efficiently support the case when the application and the driver are
executing on the same core. Previously, the driver was just
busy-spinning on the fill ring if it ran out of buffers in the HW and
there were none on the fill ring. This approach works when the
application is running on another core, as it can replenish the fill
ring while the driver is busy-spinning. However, it is a poor approach
if both of them are running on the same core, as the probability of
the fill ring getting more entries while the driver is busy-spinning
is zero. With this new feature, the driver now sets the need_wakeup
flag and returns to the application. The application can then
replenish the fill queue and then explicitly wake up the Rx processing
in the kernel using the syscall poll(). For Tx, the flag is only set
to one if the driver has no outstanding Tx completion interrupts. If
it has some, the flag is zero as it will be woken up by a completion
interrupt anyway.

As a nice side effect, this new flag also improves the performance of
the case where application and driver are running on two different
cores as it reduces the number of syscalls to the kernel. The kernel
tells user space if it needs to be woken up by a syscall, and this
eliminates many of the syscalls.

This flag needs some simple driver support. If the driver does not
support this, the Rx flag is always zero and the Tx flag is always
one. This makes any application relying on this feature default to the
old behaviour of not requiring any syscalls in the Rx path and always
having to call sendto() in the Tx path.

For backwards compatibility reasons, this feature has to be explicitly
turned on using a new bind flag (XDP_USE_NEED_WAKEUP). I recommend
that you always turn it on, as it has so far always had a positive
performance impact.

The name and inspiration of the flag have been taken from io_uring by
Jens Axboe. Details about this feature in io_uring can be found in
http://kernel.dk/io_uring.pdf, section 8.3.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
---
 include/net/xdp_sock.h      |  33 +++++++++-
 include/uapi/linux/if_xdp.h |  13 ++++
 net/xdp/xdp_umem.c          |   9 +++
 net/xdp/xsk.c               | 146 ++++++++++++++++++++++++++++++++++++++------
 net/xdp/xsk.h               |  13 ++++
 net/xdp/xsk_queue.h         |   1 +
 6 files changed, 195 insertions(+), 20 deletions(-)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 69796d2..6aebea2 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -27,6 +27,9 @@ struct xdp_umem_fq_reuse {
 	u64 handles[];
 };
 
+/* Flags for the umem flags field. */
+#define XDP_UMEM_USES_NEED_WAKEUP (1 << 0)
+
 struct xdp_umem {
 	struct xsk_queue *fq;
 	struct xsk_queue *cq;
@@ -41,10 +44,12 @@ struct xdp_umem {
 	struct work_struct work;
 	struct page **pgs;
 	u32 npgs;
+	u16 queue_id;
+	u8 need_wakeup;
+	u8 flags;
 	int id;
 	struct net_device *dev;
 	struct xdp_umem_fq_reuse *fq_reuse;
-	u16 queue_id;
 	bool zc;
 	spinlock_t xsk_list_lock;
 	struct list_head xsk_list;
@@ -95,6 +100,11 @@ struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
 					  struct xdp_umem_fq_reuse *newq);
 void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq);
 struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id);
+void xsk_set_rx_need_wakeup(struct xdp_umem *umem);
+void xsk_set_tx_need_wakeup(struct xdp_umem *umem);
+void xsk_clear_rx_need_wakeup(struct xdp_umem *umem);
+void xsk_clear_tx_need_wakeup(struct xdp_umem *umem);
+bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem);
 
 static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 {
@@ -241,6 +251,27 @@ static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr)
 {
 }
 
+static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
+{
+}
+
+static inline void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
+{
+}
+
+static inline void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
+{
+}
+
+static inline void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
+{
+}
+
+static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
+{
+	return false;
+}
+
 #endif /* CONFIG_XDP_SOCKETS */
 
 #endif /* _LINUX_XDP_SOCK_H */
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index faaa5ca..62b80d5 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -16,6 +16,15 @@
 #define XDP_SHARED_UMEM	(1 << 0)
 #define XDP_COPY	(1 << 1) /* Force copy-mode */
 #define XDP_ZEROCOPY	(1 << 2) /* Force zero-copy mode */
+/* If this option is set, the driver might go to sleep and in that case
+ * the XDP_RING_NEED_WAKEUP flag in the fill and/or Tx rings will be
+ * set. If it is set, the application needs to explicitly wake up the
+ * driver with a poll() (Rx and Tx) or sendto() (Tx only). If you are
+ * running the driver and the application on the same core, you should
+ * use this option so that the kernel will yield to the user space
+ * application.
+ */
+#define XDP_USE_NEED_WAKEUP (1 << 3)
 
 struct sockaddr_xdp {
 	__u16 sxdp_family;
@@ -25,10 +34,14 @@ struct sockaddr_xdp {
 	__u32 sxdp_shared_umem_fd;
 };
 
+/* XDP_RING flags */
+#define XDP_RING_NEED_WAKEUP (1 << 0)
+
 struct xdp_ring_offset {
 	__u64 producer;
 	__u64 consumer;
 	__u64 desc;
+	__u64 flags;
 };
 
 struct xdp_mmap_offsets {
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 6e2d4da..cda6feb 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -106,6 +106,15 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
 	umem->dev = dev;
 	umem->queue_id = queue_id;
 
+	if (flags & XDP_USE_NEED_WAKEUP) {
+		umem->flags |= XDP_UMEM_USES_NEED_WAKEUP;
+		/* Tx needs to be explicitly woken up the first time.
+		 * Also for supporting drivers that do not implement this
+		 * feature. They will always have to call sendto().
+		 */
+		xsk_set_tx_need_wakeup(umem);
+	}
+
 	dev_hold(dev);
 
 	if (force_copy)
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 1fe40a9..9f900b5 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -55,6 +55,66 @@ void xsk_umem_discard_addr(struct xdp_umem *umem)
 }
 EXPORT_SYMBOL(xsk_umem_discard_addr);
 
+void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
+{
+	if (umem->need_wakeup & XDP_WAKEUP_RX)
+		return;
+
+	umem->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
+	umem->need_wakeup |= XDP_WAKEUP_RX;
+}
+EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
+
+void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
+{
+	struct xdp_sock *xs;
+
+	if (umem->need_wakeup & XDP_WAKEUP_TX)
+		return;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
+		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
+	}
+	rcu_read_unlock();
+
+	umem->need_wakeup |= XDP_WAKEUP_TX;
+}
+EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
+
+void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
+{
+	if (!(umem->need_wakeup & XDP_WAKEUP_RX))
+		return;
+
+	umem->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
+	umem->need_wakeup &= ~XDP_WAKEUP_RX;
+}
+EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
+
+void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
+{
+	struct xdp_sock *xs;
+
+	if (!(umem->need_wakeup & XDP_WAKEUP_TX))
+		return;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
+		xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
+	}
+	rcu_read_unlock();
+
+	umem->need_wakeup &= ~XDP_WAKEUP_TX;
+}
+EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
+
+bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
+{
+	return umem->flags & XDP_UMEM_USES_NEED_WAKEUP;
+}
+EXPORT_SYMBOL(xsk_umem_uses_need_wakeup);
+
 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
 {
 	void *to_buf, *from_buf;
@@ -320,6 +380,12 @@ static unsigned int xsk_poll(struct file *file, struct socket *sock,
 	unsigned int mask = datagram_poll(file, sock, wait);
 	struct sock *sk = sock->sk;
 	struct xdp_sock *xs = xdp_sk(sk);
+	struct net_device *dev = xs->dev;
+	struct xdp_umem *umem = xs->umem;
+
+	if (umem->need_wakeup)
+		dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id,
+						umem->need_wakeup);
 
 	if (xs->rx && !xskq_empty_desc(xs->rx))
 		mask |= POLLIN | POLLRDNORM;
@@ -428,7 +494,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 		return -EINVAL;
 
 	flags = sxdp->sxdp_flags;
-	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY))
+	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
+		      XDP_USE_NEED_WAKEUP))
 		return -EINVAL;
 
 	rtnl_lock();
@@ -455,7 +522,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 		struct xdp_sock *umem_xs;
 		struct socket *sock;
 
-		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) {
+		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
+		    (flags & XDP_USE_NEED_WAKEUP)) {
 			/* Cannot specify flags for shared sockets. */
 			err = -EINVAL;
 			goto out_unlock;
@@ -550,6 +618,9 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 		}
 		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
 		err = xsk_init_queue(entries, q, false);
+		if (!err && optname == XDP_TX_RING)
+			/* Tx needs to be explicitly woken up the first time */
+			xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
 		mutex_unlock(&xs->mutex);
 		return err;
 	}
@@ -611,6 +682,20 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 	return -ENOPROTOOPT;
 }
 
+static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
+{
+	ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
+	ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
+	ring->desc = offsetof(struct xdp_rxtx_ring, desc);
+}
+
+static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
+{
+	ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
+	ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
+	ring->desc = offsetof(struct xdp_umem_ring, desc);
+}
+
 static int xsk_getsockopt(struct socket *sock, int level, int optname,
 			  char __user *optval, int __user *optlen)
 {
@@ -650,26 +735,49 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname,
 	case XDP_MMAP_OFFSETS:
 	{
 		struct xdp_mmap_offsets off;
+		struct xdp_mmap_offsets_v1 off_v1;
+		bool flags_supported = true;
+		void *to_copy;
 
-		if (len < sizeof(off))
+		if (len < sizeof(off_v1))
 			return -EINVAL;
+		else if (len < sizeof(off))
+			flags_supported = false;
+
+		if (flags_supported) {
+			/* xdp_ring_offset is identical to xdp_ring_offset_v1
+			 * except for the flags field added to the end.
+			 */
+			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
+					       &off.rx);
+			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
+					       &off.tx);
+			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
+					       &off.fr);
+			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
+					       &off.cr);
+			off.rx.flags = offsetof(struct xdp_rxtx_ring,
+						ptrs.flags);
+			off.tx.flags = offsetof(struct xdp_rxtx_ring,
+						ptrs.flags);
+			off.fr.flags = offsetof(struct xdp_umem_ring,
+						ptrs.flags);
+			off.cr.flags = offsetof(struct xdp_umem_ring,
+						ptrs.flags);
+
+			len = sizeof(off);
+			to_copy = &off;
+		} else {
+			xsk_enter_rxtx_offsets(&off_v1.rx);
+			xsk_enter_rxtx_offsets(&off_v1.tx);
+			xsk_enter_umem_offsets(&off_v1.fr);
+			xsk_enter_umem_offsets(&off_v1.cr);
+
+			len = sizeof(off_v1);
+			to_copy = &off_v1;
+		}
 
-		off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
-		off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
-		off.rx.desc	= offsetof(struct xdp_rxtx_ring, desc);
-		off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
-		off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
-		off.tx.desc	= offsetof(struct xdp_rxtx_ring, desc);
-
-		off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
-		off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
-		off.fr.desc	= offsetof(struct xdp_umem_ring, desc);
-		off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
-		off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
-		off.cr.desc	= offsetof(struct xdp_umem_ring, desc);
-
-		len = sizeof(off);
-		if (copy_to_user(optval, &off, len))
+		if (copy_to_user(optval, to_copy, len))
 			return -EFAULT;
 		if (put_user(len, optlen))
 			return -EFAULT;
diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h
index ba81206..4cfd106 100644
--- a/net/xdp/xsk.h
+++ b/net/xdp/xsk.h
@@ -4,6 +4,19 @@
 #ifndef XSK_H_
 #define XSK_H_
 
+struct xdp_ring_offset_v1 {
+	__u64 producer;
+	__u64 consumer;
+	__u64 desc;
+};
+
+struct xdp_mmap_offsets_v1 {
+	struct xdp_ring_offset_v1 rx;
+	struct xdp_ring_offset_v1 tx;
+	struct xdp_ring_offset_v1 fr;
+	struct xdp_ring_offset_v1 cr;
+};
+
 static inline struct xdp_sock *xdp_sk(struct sock *sk)
 {
 	return (struct xdp_sock *)sk;
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 909c516..dd9e985 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -16,6 +16,7 @@
 struct xdp_ring {
 	u32 producer ____cacheline_aligned_in_smp;
 	u32 consumer ____cacheline_aligned_in_smp;
+	u32 flags;
 };
 
 /* Used for the RX and TX queues for packets */
-- 
2.7.4


^ permalink raw reply related

* [PATCH bpf-next v4 3/8] i40e: add support for AF_XDP need_wakeup feature
From: Magnus Karlsson @ 2019-08-14  7:27 UTC (permalink / raw)
  To: magnus.karlsson, bjorn.topel, ast, daniel, netdev, brouer,
	maximmi
  Cc: bpf, bruce.richardson, ciara.loftus, jakub.kicinski, xiaolong.ye,
	qi.z.zhang, sridhar.samudrala, kevin.laatz, ilias.apalodimas,
	jonathan.lemon, kiran.patil, axboe, maciej.fijalkowski,
	maciejromanfijalkowski, intel-wired-lan
In-Reply-To: <1565767643-4908-1-git-send-email-magnus.karlsson@intel.com>

This patch adds support for the need_wakeup feature of AF_XDP. If the
application has told the kernel that it might sleep using the new bind
flag XDP_USE_NEED_WAKEUP, the driver will set the need_wakeup flag if
it has no more buffers on the NIC Rx ring and yield to the
application. For Tx, it will set the flag if it has no outstanding Tx
completion interrupts and return to the application.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_xsk.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index d0ff5d8..42c9012 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -626,6 +626,15 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
 
 	i40e_finalize_xdp_rx(rx_ring, xdp_xmit);
 	i40e_update_rx_stats(rx_ring, total_rx_bytes, total_rx_packets);
+
+	if (xsk_umem_uses_need_wakeup(rx_ring->xsk_umem)) {
+		if (failure || rx_ring->next_to_clean == rx_ring->next_to_use)
+			xsk_set_rx_need_wakeup(rx_ring->xsk_umem);
+		else
+			xsk_clear_rx_need_wakeup(rx_ring->xsk_umem);
+
+		return (int)total_rx_packets;
+	}
 	return failure ? budget : (int)total_rx_packets;
 }
 
@@ -681,6 +690,8 @@ static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget)
 		i40e_xdp_ring_update_tail(xdp_ring);
 
 		xsk_umem_consume_tx_done(xdp_ring->xsk_umem);
+		if (xsk_umem_uses_need_wakeup(xdp_ring->xsk_umem))
+			xsk_clear_tx_need_wakeup(xdp_ring->xsk_umem);
 	}
 
 	return !!budget && work_done;
@@ -759,6 +770,13 @@ bool i40e_clean_xdp_tx_irq(struct i40e_vsi *vsi,
 	i40e_update_tx_stats(tx_ring, completed_frames, total_bytes);
 
 out_xmit:
+	if (xsk_umem_uses_need_wakeup(tx_ring->xsk_umem)) {
+		if (tx_ring->next_to_clean == tx_ring->next_to_use)
+			xsk_set_tx_need_wakeup(tx_ring->xsk_umem);
+		else
+			xsk_clear_tx_need_wakeup(tx_ring->xsk_umem);
+	}
+
 	xmit_done = i40e_xmit_zc(tx_ring, budget);
 
 	return work_done && xmit_done;
-- 
2.7.4


^ permalink raw reply related

* [PATCH bpf-next v4 4/8] ixgbe: add support for AF_XDP need_wakeup feature
From: Magnus Karlsson @ 2019-08-14  7:27 UTC (permalink / raw)
  To: magnus.karlsson, bjorn.topel, ast, daniel, netdev, brouer,
	maximmi
  Cc: bpf, bruce.richardson, ciara.loftus, jakub.kicinski, xiaolong.ye,
	qi.z.zhang, sridhar.samudrala, kevin.laatz, ilias.apalodimas,
	jonathan.lemon, kiran.patil, axboe, maciej.fijalkowski,
	maciejromanfijalkowski, intel-wired-lan
In-Reply-To: <1565767643-4908-1-git-send-email-magnus.karlsson@intel.com>

This patch adds support for the need_wakeup feature of AF_XDP. If the
application has told the kernel that it might sleep using the new bind
flag XDP_USE_NEED_WAKEUP, the driver will set the need_wakeup flag if
it has no more buffers on the NIC Rx ring and yield to the
application. For Tx, it will set the flag if it has no outstanding Tx
completion interrupts and return to the application.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
index e598af9..9a28d98 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
@@ -547,6 +547,14 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
 	q_vector->rx.total_packets += total_rx_packets;
 	q_vector->rx.total_bytes += total_rx_bytes;
 
+	if (xsk_umem_uses_need_wakeup(rx_ring->xsk_umem)) {
+		if (failure || rx_ring->next_to_clean == rx_ring->next_to_use)
+			xsk_set_rx_need_wakeup(rx_ring->xsk_umem);
+		else
+			xsk_clear_rx_need_wakeup(rx_ring->xsk_umem);
+
+		return (int)total_rx_packets;
+	}
 	return failure ? budget : (int)total_rx_packets;
 }
 
@@ -615,6 +623,8 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget)
 	if (tx_desc) {
 		ixgbe_xdp_ring_update_tail(xdp_ring);
 		xsk_umem_consume_tx_done(xdp_ring->xsk_umem);
+		if (xsk_umem_uses_need_wakeup(xdp_ring->xsk_umem))
+			xsk_clear_tx_need_wakeup(xdp_ring->xsk_umem);
 	}
 
 	return !!budget && work_done;
@@ -688,7 +698,15 @@ bool ixgbe_clean_xdp_tx_irq(struct ixgbe_q_vector *q_vector,
 	if (xsk_frames)
 		xsk_umem_complete_tx(umem, xsk_frames);
 
+	if (xsk_umem_uses_need_wakeup(tx_ring->xsk_umem)) {
+		if (tx_ring->next_to_clean == tx_ring->next_to_use)
+			xsk_set_tx_need_wakeup(tx_ring->xsk_umem);
+		else
+			xsk_clear_tx_need_wakeup(tx_ring->xsk_umem);
+	}
+
 	xmit_done = ixgbe_xmit_zc(tx_ring, q_vector->tx.work_limit);
+
 	return budget > 0 && xmit_done;
 }
 
-- 
2.7.4


^ permalink raw reply related

* [PATCH bpf-next v4 5/8] libbpf: add support for need_wakeup flag in AF_XDP part
From: Magnus Karlsson @ 2019-08-14  7:27 UTC (permalink / raw)
  To: magnus.karlsson, bjorn.topel, ast, daniel, netdev, brouer,
	maximmi
  Cc: bpf, bruce.richardson, ciara.loftus, jakub.kicinski, xiaolong.ye,
	qi.z.zhang, sridhar.samudrala, kevin.laatz, ilias.apalodimas,
	jonathan.lemon, kiran.patil, axboe, maciej.fijalkowski,
	maciejromanfijalkowski, intel-wired-lan
In-Reply-To: <1565767643-4908-1-git-send-email-magnus.karlsson@intel.com>

This commit adds support for the new need_wakeup flag in AF_XDP. The
xsk_socket__create function is updated to handle this and a new
function is introduced called xsk_ring_prod__needs_wakeup(). This
function can be used by the application to check if Rx and/or Tx
processing needs to be explicitly woken up.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
---
 tools/include/uapi/linux/if_xdp.h | 13 +++++++++++++
 tools/lib/bpf/xsk.c               |  4 ++++
 tools/lib/bpf/xsk.h               |  6 ++++++
 3 files changed, 23 insertions(+)

diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h
index faaa5ca..62b80d5 100644
--- a/tools/include/uapi/linux/if_xdp.h
+++ b/tools/include/uapi/linux/if_xdp.h
@@ -16,6 +16,15 @@
 #define XDP_SHARED_UMEM	(1 << 0)
 #define XDP_COPY	(1 << 1) /* Force copy-mode */
 #define XDP_ZEROCOPY	(1 << 2) /* Force zero-copy mode */
+/* If this option is set, the driver might go to sleep and in that case
+ * the XDP_RING_NEED_WAKEUP flag in the fill and/or Tx rings will be
+ * set. If it is set, the application needs to explicitly wake up the
+ * driver with a poll() (Rx and Tx) or sendto() (Tx only). If you are
+ * running the driver and the application on the same core, you should
+ * use this option so that the kernel will yield to the user space
+ * application.
+ */
+#define XDP_USE_NEED_WAKEUP (1 << 3)
 
 struct sockaddr_xdp {
 	__u16 sxdp_family;
@@ -25,10 +34,14 @@ struct sockaddr_xdp {
 	__u32 sxdp_shared_umem_fd;
 };
 
+/* XDP_RING flags */
+#define XDP_RING_NEED_WAKEUP (1 << 0)
+
 struct xdp_ring_offset {
 	__u64 producer;
 	__u64 consumer;
 	__u64 desc;
+	__u64 flags;
 };
 
 struct xdp_mmap_offsets {
diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
index 680e630..17e8d79 100644
--- a/tools/lib/bpf/xsk.c
+++ b/tools/lib/bpf/xsk.c
@@ -224,6 +224,7 @@ int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
 	fill->size = umem->config.fill_size;
 	fill->producer = map + off.fr.producer;
 	fill->consumer = map + off.fr.consumer;
+	fill->flags = map + off.fr.flags;
 	fill->ring = map + off.fr.desc;
 	fill->cached_cons = umem->config.fill_size;
 
@@ -241,6 +242,7 @@ int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
 	comp->size = umem->config.comp_size;
 	comp->producer = map + off.cr.producer;
 	comp->consumer = map + off.cr.consumer;
+	comp->flags = map + off.cr.flags;
 	comp->ring = map + off.cr.desc;
 
 	*umem_ptr = umem;
@@ -564,6 +566,7 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
 		rx->size = xsk->config.rx_size;
 		rx->producer = rx_map + off.rx.producer;
 		rx->consumer = rx_map + off.rx.consumer;
+		rx->flags = rx_map + off.rx.flags;
 		rx->ring = rx_map + off.rx.desc;
 	}
 	xsk->rx = rx;
@@ -583,6 +586,7 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
 		tx->size = xsk->config.tx_size;
 		tx->producer = tx_map + off.tx.producer;
 		tx->consumer = tx_map + off.tx.consumer;
+		tx->flags = tx_map + off.tx.flags;
 		tx->ring = tx_map + off.tx.desc;
 		tx->cached_cons = xsk->config.tx_size;
 	}
diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h
index 833a6e6..aa1d612 100644
--- a/tools/lib/bpf/xsk.h
+++ b/tools/lib/bpf/xsk.h
@@ -32,6 +32,7 @@ struct name { \
 	__u32 *producer; \
 	__u32 *consumer; \
 	void *ring; \
+	__u32 *flags; \
 }
 
 DEFINE_XSK_RING(xsk_ring_prod);
@@ -76,6 +77,11 @@ xsk_ring_cons__rx_desc(const struct xsk_ring_cons *rx, __u32 idx)
 	return &descs[idx & rx->mask];
 }
 
+static inline int xsk_ring_prod__needs_wakeup(const struct xsk_ring_prod *r)
+{
+	return *r->flags & XDP_RING_NEED_WAKEUP;
+}
+
 static inline __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb)
 {
 	__u32 free_entries = r->cached_cons - r->cached_prod;
-- 
2.7.4


^ permalink raw reply related

* [PATCH bpf-next v4 6/8] samples/bpf: add use of need_wakeup flag in xdpsock
From: Magnus Karlsson @ 2019-08-14  7:27 UTC (permalink / raw)
  To: magnus.karlsson, bjorn.topel, ast, daniel, netdev, brouer,
	maximmi
  Cc: bpf, bruce.richardson, ciara.loftus, jakub.kicinski, xiaolong.ye,
	qi.z.zhang, sridhar.samudrala, kevin.laatz, ilias.apalodimas,
	jonathan.lemon, kiran.patil, axboe, maciej.fijalkowski,
	maciejromanfijalkowski, intel-wired-lan
In-Reply-To: <1565767643-4908-1-git-send-email-magnus.karlsson@intel.com>

This commit adds use of the need_wakeup flag to the xdpsock sample
application. It is turned on by default as we think it is a feature
that seems to always produce a performance benefit, if the application
has been written taking advantage of it. It can be turned off in the
sample app by using the '-m' command line option.

The txpush and l2fwd sub applications have also been updated to
support poll() with multiple sockets.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
---
 samples/bpf/xdpsock_user.c | 192 ++++++++++++++++++++++++++++-----------------
 1 file changed, 120 insertions(+), 72 deletions(-)

diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
index 93eaaf7..da84c76 100644
--- a/samples/bpf/xdpsock_user.c
+++ b/samples/bpf/xdpsock_user.c
@@ -67,8 +67,10 @@ static int opt_ifindex;
 static int opt_queue;
 static int opt_poll;
 static int opt_interval = 1;
-static u32 opt_xdp_bind_flags;
+static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP;
 static int opt_xsk_frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
+static int opt_timeout = 1000;
+static bool opt_need_wakeup = true;
 static __u32 prog_id;
 
 struct xsk_umem_info {
@@ -352,6 +354,7 @@ static struct option long_options[] = {
 	{"zero-copy", no_argument, 0, 'z'},
 	{"copy", no_argument, 0, 'c'},
 	{"frame-size", required_argument, 0, 'f'},
+	{"no-need-wakeup", no_argument, 0, 'm'},
 	{0, 0, 0, 0}
 };
 
@@ -372,6 +375,7 @@ static void usage(const char *prog)
 		"  -z, --zero-copy      Force zero-copy mode.\n"
 		"  -c, --copy           Force copy mode.\n"
 		"  -f, --frame-size=n   Set the frame size (must be a power of two, default is %d).\n"
+		"  -m, --no-need-wakeup Turn off use of driver need wakeup flag.\n"
 		"\n";
 	fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE);
 	exit(EXIT_FAILURE);
@@ -384,8 +388,9 @@ static void parse_command_line(int argc, char **argv)
 	opterr = 0;
 
 	for (;;) {
-		c = getopt_long(argc, argv, "Frtli:q:psSNn:czf:", long_options,
-				&option_index);
+
+		c = getopt_long(argc, argv, "Frtli:q:psSNn:czf:m",
+				long_options, &option_index);
 		if (c == -1)
 			break;
 
@@ -429,6 +434,9 @@ static void parse_command_line(int argc, char **argv)
 			break;
 		case 'f':
 			opt_xsk_frame_size = atoi(optarg);
+		case 'm':
+			opt_need_wakeup = false;
+			opt_xdp_bind_flags &= ~XDP_USE_NEED_WAKEUP;
 			break;
 		default:
 			usage(basename(argv[0]));
@@ -459,7 +467,8 @@ static void kick_tx(struct xsk_socket_info *xsk)
 	exit_with_error(errno);
 }
 
-static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk)
+static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk,
+				     struct pollfd *fds)
 {
 	u32 idx_cq = 0, idx_fq = 0;
 	unsigned int rcvd;
@@ -468,7 +477,9 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk)
 	if (!xsk->outstanding_tx)
 		return;
 
-	kick_tx(xsk);
+	if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx))
+		kick_tx(xsk);
+
 	ndescs = (xsk->outstanding_tx > BATCH_SIZE) ? BATCH_SIZE :
 		xsk->outstanding_tx;
 
@@ -482,6 +493,8 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk)
 		while (ret != rcvd) {
 			if (ret < 0)
 				exit_with_error(-ret);
+			if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
+				ret = poll(fds, num_socks, opt_timeout);
 			ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd,
 						     &idx_fq);
 		}
@@ -505,7 +518,8 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk)
 	if (!xsk->outstanding_tx)
 		return;
 
-	kick_tx(xsk);
+	if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx))
+		kick_tx(xsk);
 
 	rcvd = xsk_ring_cons__peek(&xsk->umem->cq, BATCH_SIZE, &idx);
 	if (rcvd > 0) {
@@ -515,20 +529,25 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk)
 	}
 }
 
-static void rx_drop(struct xsk_socket_info *xsk)
+static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds)
 {
 	unsigned int rcvd, i;
 	u32 idx_rx = 0, idx_fq = 0;
 	int ret;
 
 	rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx);
-	if (!rcvd)
+	if (!rcvd) {
+		if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
+			ret = poll(fds, num_socks, opt_timeout);
 		return;
+	}
 
 	ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
 	while (ret != rcvd) {
 		if (ret < 0)
 			exit_with_error(-ret);
+		if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
+			ret = poll(fds, num_socks, opt_timeout);
 		ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
 	}
 
@@ -549,42 +568,65 @@ static void rx_drop(struct xsk_socket_info *xsk)
 static void rx_drop_all(void)
 {
 	struct pollfd fds[MAX_SOCKS + 1];
-	int i, ret, timeout, nfds = 1;
+	int i, ret;
 
 	memset(fds, 0, sizeof(fds));
 
 	for (i = 0; i < num_socks; i++) {
 		fds[i].fd = xsk_socket__fd(xsks[i]->xsk);
 		fds[i].events = POLLIN;
-		timeout = 1000; /* 1sn */
 	}
 
 	for (;;) {
 		if (opt_poll) {
-			ret = poll(fds, nfds, timeout);
+			ret = poll(fds, num_socks, opt_timeout);
 			if (ret <= 0)
 				continue;
 		}
 
 		for (i = 0; i < num_socks; i++)
-			rx_drop(xsks[i]);
+			rx_drop(xsks[i], fds);
+	}
+}
+
+static void tx_only(struct xsk_socket_info *xsk, u32 frame_nb)
+{
+	u32 idx;
+
+	if (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) == BATCH_SIZE) {
+		unsigned int i;
+
+		for (i = 0; i < BATCH_SIZE; i++) {
+			xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr	=
+				(frame_nb + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT;
+			xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len =
+				sizeof(pkt_data) - 1;
+		}
+
+		xsk_ring_prod__submit(&xsk->tx, BATCH_SIZE);
+		xsk->outstanding_tx += BATCH_SIZE;
+		frame_nb += BATCH_SIZE;
+		frame_nb %= NUM_FRAMES;
 	}
+
+	complete_tx_only(xsk);
 }
 
-static void tx_only(struct xsk_socket_info *xsk)
+static void tx_only_all(void)
 {
-	int timeout, ret, nfds = 1;
-	struct pollfd fds[nfds + 1];
-	u32 idx, frame_nb = 0;
+	struct pollfd fds[MAX_SOCKS];
+	u32 frame_nb[MAX_SOCKS] = {};
+	int i, ret;
 
 	memset(fds, 0, sizeof(fds));
-	fds[0].fd = xsk_socket__fd(xsk->xsk);
-	fds[0].events = POLLOUT;
-	timeout = 1000; /* 1sn */
+	for (i = 0; i < num_socks; i++) {
+		fds[0].fd = xsk_socket__fd(xsks[i]->xsk);
+		fds[0].events = POLLOUT;
+	}
 
 	for (;;) {
 		if (opt_poll) {
-			ret = poll(fds, nfds, timeout);
+			ret = poll(fds, num_socks, opt_timeout);
 			if (ret <= 0)
 				continue;
 
@@ -592,69 +634,75 @@ static void tx_only(struct xsk_socket_info *xsk)
 				continue;
 		}
 
-		if (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) ==
-		    BATCH_SIZE) {
-			unsigned int i;
-
-			for (i = 0; i < BATCH_SIZE; i++) {
-				xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr
-					= (frame_nb + i) * opt_xsk_frame_size;
-				xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len =
-					sizeof(pkt_data) - 1;
-			}
-
-			xsk_ring_prod__submit(&xsk->tx, BATCH_SIZE);
-			xsk->outstanding_tx += BATCH_SIZE;
-			frame_nb += BATCH_SIZE;
-			frame_nb %= NUM_FRAMES;
-		}
-
-		complete_tx_only(xsk);
+		for (i = 0; i < num_socks; i++)
+			tx_only(xsks[i], frame_nb[i]);
 	}
 }
 
-static void l2fwd(struct xsk_socket_info *xsk)
+static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds)
 {
-	for (;;) {
-		unsigned int rcvd, i;
-		u32 idx_rx = 0, idx_tx = 0;
-		int ret;
+	unsigned int rcvd, i;
+	u32 idx_rx = 0, idx_tx = 0;
+	int ret;
 
-		for (;;) {
-			complete_tx_l2fwd(xsk);
+	complete_tx_l2fwd(xsk, fds);
 
-			rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE,
-						   &idx_rx);
-			if (rcvd > 0)
-				break;
-		}
+	rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx);
+	if (!rcvd) {
+		if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
+			ret = poll(fds, num_socks, opt_timeout);
+		return;
+	}
 
+	ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
+	while (ret != rcvd) {
+		if (ret < 0)
+			exit_with_error(-ret);
+		if (xsk_ring_prod__needs_wakeup(&xsk->tx))
+			kick_tx(xsk);
 		ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
-		while (ret != rcvd) {
-			if (ret < 0)
-				exit_with_error(-ret);
-			ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
-		}
+	}
+
+	for (i = 0; i < rcvd; i++) {
+		u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr;
+		u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len;
+		char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);
+
+		swap_mac_addresses(pkt);
 
-		for (i = 0; i < rcvd; i++) {
-			u64 addr = xsk_ring_cons__rx_desc(&xsk->rx,
-							  idx_rx)->addr;
-			u32 len = xsk_ring_cons__rx_desc(&xsk->rx,
-							 idx_rx++)->len;
-			char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);
+		hex_dump(pkt, len, addr);
+		xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = addr;
+		xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len;
+	}
 
-			swap_mac_addresses(pkt);
+	xsk_ring_prod__submit(&xsk->tx, rcvd);
+	xsk_ring_cons__release(&xsk->rx, rcvd);
 
-			hex_dump(pkt, len, addr);
-			xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = addr;
-			xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len;
-		}
+	xsk->rx_npkts += rcvd;
+	xsk->outstanding_tx += rcvd;
+}
+
+static void l2fwd_all(void)
+{
+	struct pollfd fds[MAX_SOCKS];
+	int i, ret;
+
+	memset(fds, 0, sizeof(fds));
+
+	for (i = 0; i < num_socks; i++) {
+		fds[i].fd = xsk_socket__fd(xsks[i]->xsk);
+		fds[i].events = POLLOUT | POLLIN;
+	}
 
-		xsk_ring_prod__submit(&xsk->tx, rcvd);
-		xsk_ring_cons__release(&xsk->rx, rcvd);
+	for (;;) {
+		if (opt_poll) {
+			ret = poll(fds, num_socks, opt_timeout);
+			if (ret <= 0)
+				continue;
+		}
 
-		xsk->rx_npkts += rcvd;
-		xsk->outstanding_tx += rcvd;
+		for (i = 0; i < num_socks; i++)
+			l2fwd(xsks[i], fds);
 	}
 }
 
@@ -705,9 +753,9 @@ int main(int argc, char **argv)
 	if (opt_bench == BENCH_RXDROP)
 		rx_drop_all();
 	else if (opt_bench == BENCH_TXONLY)
-		tx_only(xsks[0]);
+		tx_only_all();
 	else
-		l2fwd(xsks[0]);
+		l2fwd_all();
 
 	return 0;
 }
-- 
2.7.4


^ permalink raw reply related

* [PATCH bpf-next v4 7/8] net/mlx5e: Move the SW XSK code from NAPI poll to a separate function
From: Magnus Karlsson @ 2019-08-14  7:27 UTC (permalink / raw)
  To: magnus.karlsson, bjorn.topel, ast, daniel, netdev, brouer,
	maximmi
  Cc: bpf, bruce.richardson, ciara.loftus, jakub.kicinski, xiaolong.ye,
	qi.z.zhang, sridhar.samudrala, kevin.laatz, ilias.apalodimas,
	jonathan.lemon, kiran.patil, axboe, maciej.fijalkowski,
	maciejromanfijalkowski, intel-wired-lan
In-Reply-To: <1565767643-4908-1-git-send-email-magnus.karlsson@intel.com>

From: Maxim Mikityanskiy <maximmi@mellanox.com>

Two XSK tasks performed during NAPI polling are not bound to hardware
interrupts: TXing packets and polling for frames in the Fill Ring. They
are special in that the hardware doesn't know about these tasks, so it
doesn't trigger interrupts if there is still some work to be done; it's
the driver's responsibility to ensure NAPI will be rescheduled if
needed.

Create a new function to handle these tasks and move the corresponding
code from mlx5e_napi_poll to the new function to improve modularity and
prepare for the changes in the following patch.

Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Reviewed-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index 49b06b2..6d16dee 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -81,6 +81,16 @@ void mlx5e_trigger_irq(struct mlx5e_icosq *sq)
 	mlx5e_notify_hw(wq, sq->pc, sq->uar_map, &nopwqe->ctrl);
 }
 
+static bool mlx5e_napi_xsk_post(struct mlx5e_xdpsq *xsksq, struct mlx5e_rq *xskrq)
+{
+	bool busy_xsk = false;
+
+	busy_xsk |= mlx5e_xsk_tx(xsksq, MLX5E_TX_XSK_POLL_BUDGET);
+	busy_xsk |= xskrq->post_wqes(xskrq);
+
+	return busy_xsk;
+}
+
 int mlx5e_napi_poll(struct napi_struct *napi, int budget)
 {
 	struct mlx5e_channel *c = container_of(napi, struct mlx5e_channel,
@@ -122,8 +132,7 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
 	if (xsk_open) {
 		mlx5e_poll_ico_cq(&c->xskicosq.cq);
 		busy |= mlx5e_poll_xdpsq_cq(&xsksq->cq);
-		busy_xsk |= mlx5e_xsk_tx(xsksq, MLX5E_TX_XSK_POLL_BUDGET);
-		busy_xsk |= xskrq->post_wqes(xskrq);
+		busy_xsk |= mlx5e_napi_xsk_post(xsksq, xskrq);
 	}
 
 	busy |= busy_xsk;
-- 
2.7.4


^ permalink raw reply related

* [PATCH bpf-next v4 8/8] net/mlx5e: Add AF_XDP need_wakeup support
From: Magnus Karlsson @ 2019-08-14  7:27 UTC (permalink / raw)
  To: magnus.karlsson, bjorn.topel, ast, daniel, netdev, brouer,
	maximmi
  Cc: bpf, bruce.richardson, ciara.loftus, jakub.kicinski, xiaolong.ye,
	qi.z.zhang, sridhar.samudrala, kevin.laatz, ilias.apalodimas,
	jonathan.lemon, kiran.patil, axboe, maciej.fijalkowski,
	maciejromanfijalkowski, intel-wired-lan
In-Reply-To: <1565767643-4908-1-git-send-email-magnus.karlsson@intel.com>

From: Maxim Mikityanskiy <maximmi@mellanox.com>

This commit adds support for the new need_wakeup feature of AF_XDP. The
applications can opt-in by using the XDP_USE_NEED_WAKEUP bind() flag.
When this feature is enabled, some behavior changes:

RX side: If the Fill Ring is empty, instead of busy-polling, set the
flag to tell the application to kick the driver when it refills the Fill
Ring.

TX side: If there are pending completions or packets queued for
transmission, clear the need_wakeup flag to tell the application that it
can skip the sendto() syscall and save time. Otherwise, set the flag.

The performance testing was performed on a machine with the following
configuration:

- 24 cores of Intel Xeon E5-2620 v3 @ 2.40 GHz
- Mellanox ConnectX-5 Ex with 100 Gbit/s link

The results with retpoline disabled:

       | without need_wakeup  | with need_wakeup     |
       |----------------------|----------------------|
       | one core | two cores | one core | two cores |
-------|----------|-----------|----------|-----------|
txonly | 20.1     | 33.5      | 29.0     | 34.2      |
rxdrop | 0.065    | 14.1      | 12.0     | 14.1      |
l2fwd  | 0.032    | 7.3       | 6.6      | 7.2       |

"One core" means the application and NAPI run on the same core. "Two
cores" means they are pinned to different cores.

Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Reviewed-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h | 14 ++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h | 12 ++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c     |  7 +++++--
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c   | 20 +++++++++++++++++---
 4 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
index 307b923..cab0e93 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
@@ -5,6 +5,7 @@
 #define __MLX5_EN_XSK_RX_H__
 
 #include "en.h"
+#include <net/xdp_sock.h>
 
 /* RX data path */
 
@@ -24,4 +25,17 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
 					      struct mlx5e_wqe_frag_info *wi,
 					      u32 cqe_bcnt);
 
+static inline bool mlx5e_xsk_update_rx_wakeup(struct mlx5e_rq *rq, bool alloc_err)
+{
+	if (!xsk_umem_uses_need_wakeup(rq->umem))
+		return alloc_err;
+
+	if (unlikely(alloc_err))
+		xsk_set_rx_need_wakeup(rq->umem);
+	else
+		xsk_clear_rx_need_wakeup(rq->umem);
+
+	return false;
+}
+
 #endif /* __MLX5_EN_XSK_RX_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h
index 9c50515..79b487d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h
@@ -5,6 +5,7 @@
 #define __MLX5_EN_XSK_TX_H__
 
 #include "en.h"
+#include <net/xdp_sock.h>
 
 /* TX data path */
 
@@ -12,4 +13,15 @@ int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags);
 
 bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget);
 
+static inline void mlx5e_xsk_update_tx_wakeup(struct mlx5e_xdpsq *sq)
+{
+	if (!xsk_umem_uses_need_wakeup(sq->umem))
+		return;
+
+	if (sq->pc != sq->cc)
+		xsk_clear_tx_need_wakeup(sq->umem);
+	else
+		xsk_set_tx_need_wakeup(sq->umem);
+}
+
 #endif /* __MLX5_EN_XSK_TX_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 60570b4..fae0694 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -692,8 +692,11 @@ bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq)
 	rq->mpwqe.umr_in_progress += rq->mpwqe.umr_last_bulk;
 	rq->mpwqe.actual_wq_head   = head;
 
-	/* If XSK Fill Ring doesn't have enough frames, busy poll by
-	 * rescheduling the NAPI poll.
+	/* If XSK Fill Ring doesn't have enough frames, report the error, so
+	 * that one of the actions can be performed:
+	 * 1. If need_wakeup is used, signal that the application has to kick
+	 * the driver when it refills the Fill Ring.
+	 * 2. Otherwise, busy poll by rescheduling the NAPI poll.
 	 */
 	if (unlikely(alloc_err == -ENOMEM && rq->umem))
 		return true;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index 6d16dee..257a7c9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -33,6 +33,7 @@
 #include <linux/irq.h>
 #include "en.h"
 #include "en/xdp.h"
+#include "en/xsk/rx.h"
 #include "en/xsk/tx.h"
 
 static inline bool mlx5e_channel_no_affinity_change(struct mlx5e_channel *c)
@@ -83,10 +84,23 @@ void mlx5e_trigger_irq(struct mlx5e_icosq *sq)
 
 static bool mlx5e_napi_xsk_post(struct mlx5e_xdpsq *xsksq, struct mlx5e_rq *xskrq)
 {
-	bool busy_xsk = false;
-
+	bool busy_xsk = false, xsk_rx_alloc_err;
+
+	/* Handle the race between the application querying need_wakeup and the
+	 * driver setting it:
+	 * 1. Update need_wakeup both before and after the TX. If it goes to
+	 * "yes", it can only happen with the first update.
+	 * 2. If the application queried need_wakeup before we set it, the
+	 * packets will be transmitted anyway, even w/o a wakeup.
+	 * 3. Give a chance to clear need_wakeup after new packets were queued
+	 * for TX.
+	 */
+	mlx5e_xsk_update_tx_wakeup(xsksq);
 	busy_xsk |= mlx5e_xsk_tx(xsksq, MLX5E_TX_XSK_POLL_BUDGET);
-	busy_xsk |= xskrq->post_wqes(xskrq);
+	mlx5e_xsk_update_tx_wakeup(xsksq);
+
+	xsk_rx_alloc_err = xskrq->post_wqes(xskrq);
+	busy_xsk |= mlx5e_xsk_update_rx_wakeup(xskrq, xsk_rx_alloc_err);
 
 	return busy_xsk;
 }
-- 
2.7.4


^ permalink raw reply related

* Re: [RFC PATCH bpf-next 00/14] xdp_flow: Flow offload to XDP
From: Toshiaki Makita @ 2019-08-14  7:33 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Song Liu,
	Yonghong Song, David S. Miller, Jakub Kicinski,
	Jesper Dangaard Brouer, John Fastabend, Jamal Hadi Salim,
	Cong Wang, Jiri Pirko, netdev, bpf, William Tu
In-Reply-To: <20190814014445.3dnduyrass5jycr5@ast-mbp>

Hi Alexei, thank you for taking a look!

On 2019/08/14 10:44, Alexei Starovoitov wrote:
> On Tue, Aug 13, 2019 at 09:05:44PM +0900, Toshiaki Makita wrote:
>> This is a rough PoC for an idea to offload TC flower to XDP.
> ...
>>   xdp_flow  TC        ovs kmod
>>   --------  --------  --------
>>   4.0 Mpps  1.1 Mpps  1.1 Mpps
> 
> Is xdp_flow limited to 4 Mpps due to veth or something else?

Looking at perf, accumulation of each layer's overhead resulted in the number.
With XDP prog which only redirects packets and does not touch the data,
the drop rate is 10 Mpps. In this case the main overhead is XDP's redirect processing
and handling of 2 XDP progs (in veth and i40e).
In the xdp_flow test the overhead additionally includes flow key parse in XDP prog
and hash table lookup (including jhash calculation) which resulted in 4 Mpps.

>> So xdp_flow drop rate is roughly 4x faster than software TC or ovs kmod.
>>
>> OTOH the time to add a flow increases with xdp_flow.
>>
>> ping latency of first packet when veth1 does XDP_PASS instead of DROP:
>>
>>   xdp_flow  TC        ovs kmod
>>   --------  --------  --------
>>   25ms      12ms      0.6ms
>>
>> xdp_flow does a lot of work to emulate TC behavior including UMH
>> transaction and multiple bpf map update from UMH which I think increases
>> the latency.
> 
> make sense, but why vanilla TC is so slow ?

No idea. At least TC requires an additional syscall to insert a flow compared to ovs kmod,
but 12 ms looks too long for that.

>> * Implementation
>>
>> xdp_flow makes use of UMH to load an eBPF program for XDP, similar to
>> bpfilter. The difference is that xdp_flow does not generate the eBPF
>> program dynamically but a prebuilt program is embedded in UMH. This is
>> mainly because flow insertion is considerably frequent. If we generate
>> and load an eBPF program on each insertion of a flow, the latency of the
>> first packet of ping in the above test will increase, which I want to avoid.
> 
> I think UMH approach is a good fit for this.
> Clearly the same algorithm can be done as kernel code or kernel module, but
> bpfilter-like UMH is a safer approach.
> 
>> - patch 9
>>   Add tc-offload-xdp netdev feature and hooks to call xdp_flow kmod in
>>   TC flower offload code.
> 
> The hook into UMH from TC looks simple. Do you expect the same interface to be
> reused from OVS ?

Do you mean openvswitch kernel module by OVS?
If so, no, at this point. TC hook is simple because I reused flow offload mechanism.
OVS kmod does not have offload interface and ovs-vswitchd is using TC for offload.
I wanted to reuse this mechanism for offloading to XDP, so using TC.

>> * About alternative userland (ovs-vswitchd etc.) implementation
>>
>> Maybe a similar logic can be implemented in ovs-vswitchd offload
>> mechanism, instead of adding code to kernel. I just thought offloading
>> TC is more generic and allows wider usage with direct TC command.
>>
>> For example, considering that OVS inserts a flow to kernel only when
>> flow miss happens in kernel, we can in advance add offloaded flows via
>> tc filter to avoid flow insertion latency for certain sensitive flows.
>> TC flower usage without using OVS is also possible.
>>
>> Also as written above nftables can be offloaded to XDP with this
>> mechanism as well.
> 
> Makes sense to me.
> 
>>    bpf, hashtab: Compare keys in long
> 
> 3Mpps vs 4Mpps just from this patch ?
> or combined with i40 prefech patch ?

Combined.

>>   drivers/net/ethernet/intel/i40e/i40e_txrx.c  |    1 +
> 
> Could you share "perf report" for just hash tab optimization
> and for i40 ?

Sure, I'll get some more data and post them.

> I haven't seen memcmp to be bottle neck in hash tab.
> What is the the of the key?

typo of "size of the key"? IIRC 64 bytes.

Toshiaki Makita

^ permalink raw reply

* Re: [patch net-next] selftests: netdevsim: add devlink params tests
From: Jiri Pirko @ 2019-08-14  7:47 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: netdev, davem, mlxsw
In-Reply-To: <20190813154108.30509472@cakuba.netronome.com>

Wed, Aug 14, 2019 at 12:41:08AM CEST, jakub.kicinski@netronome.com wrote:
>On Tue, 13 Aug 2019 15:04:46 +0200, Jiri Pirko wrote:
>> From: Jiri Pirko <jiri@mellanox.com>
>> 
>> Test recently added netdevsim devlink param implementation.
>> 
>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>
>Thanks for the test, but it doesn't pass here:
>
>TEST: fw flash test                                                 [ OK ]
>TEST: params test                                                   [FAIL]
>	Failed to get test1 param value

Interesting. Works for me correctly. When I run it manually, I get this:
bash-5.0# devlink dev param show netdevsim/netdevsim11 name test1 -j | jq -e -r '.[][][].values[] | select(.cmode == "driverinit").value'
true
bash-5.0# echo $?
0
bash-5.0# devlink dev param set netdevsim/netdevsim11 name test1 cmode driverinit value false
bash-5.0# devlink dev param show netdevsim/netdevsim11 name test1 -j | jq -e -r '.[][][].values[] | select(.cmode == "driverinit").value'
false
bash-5.0# echo $?
0




>
>> diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh
>> index 9d8baf5d14b3..858ebdc8d8a3 100755
>> --- a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh
>> +++ b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh
>> @@ -3,7 +3,7 @@
>>  
>>  lib_dir=$(dirname $0)/../../../net/forwarding
>>  
>> -ALL_TESTS="fw_flash_test"
>> +ALL_TESTS="fw_flash_test params_test"
>>  NUM_NETIFS=0
>>  source $lib_dir/lib.sh
>>  
>> @@ -30,6 +30,66 @@ fw_flash_test()
>>  	log_test "fw flash test"
>>  }
>>  
>> +param_get()
>> +{
>> +	local name=$1
>> +
>> +	devlink dev param show $DL_HANDLE name $name -j | \
>> +		jq -e -r '.[][][].values[] | select(.cmode == "driverinit").value'
>
>                   ^^
>
>The -e makes jq set exit code to 1 when test1 param is false.
>
>Quoting the man page:
>
>       ·   -e / --exit-status:
>
>           Sets the exit status of jq to 0 if the last output values
>           was neither false nor null, 1 if the last output value was
>           either false or  null,  or  4  if  no valid  result  was
>           ever produced. Normally jq exits with 2 if there was any
>           usage problem or system error, 3 if there was a jq program
>           compile error, or 0 if the jq program ran.
>
>Without the -e all is well:

Not really, for non-existent param the return value would be wrong:
bash-5.0# devlink dev param show netdevsim/netdevsim11 name test2 -j | jq -e -r '.[][][].values[] | select(.cmode == "driverinit").value'
devlink answers: Invalid argument
bash-5.0# echo $?
4
bash-5.0# devlink dev param show netdevsim/netdevsim11 name test2 -j | jq -r '.[][][].values[] | select(.cmode == "driverinit").value'
devlink answers: Invalid argument
bash-5.0# echo $?
0

The return value is 0, as if everything were fine. You probably have a
different jq version (1.6). Looks like I need to use the same
workaround I have in tools/testing/selftests/net/forwarding/tc_common.sh.
I thought that -e would avoid that.



>
># ./devlink.sh 
>TEST: fw flash test                                                 [ OK ]
>TEST: params test                                                   [ OK ]
>
>> +}
>> +

^ permalink raw reply

* [PATCH 2/2] PTP: add support for one-shot output
From: Felipe Balbi @ 2019-08-14  7:47 UTC (permalink / raw)
  To: Richard Cochran; +Cc: Christopher S Hall, netdev, linux-kernel, Felipe Balbi
In-Reply-To: <20190814074712.10684-1-felipe.balbi@linux.intel.com>

Some controllers allow for a one-shot output pulse, in contrast to
periodic output. Now that we have extensible versions of our IOCTLs, we
can finally make use of the 'flags' field to pass a bit telling the
driver that we want one-shot pulse output.

Signed-off-by: Felipe Balbi <felipe.balbi@linux.intel.com>
---
 drivers/ptp/ptp_chardev.c      | 5 ++---
 include/uapi/linux/ptp_clock.h | 4 +++-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c
index 204212fc3f8c..b75a65880056 100644
--- a/drivers/ptp/ptp_chardev.c
+++ b/drivers/ptp/ptp_chardev.c
@@ -173,9 +173,8 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 			err = -EFAULT;
 			break;
 		}
-		if ((req.perout.flags || req.perout.rsv[0] || req.perout.rsv[1]
-				|| req.perout.rsv[2] || req.perout.rsv[3])
-			&& cmd == PTP_PEROUT_REQUEST2) {
+		if ((req.perout.rsv[0] || req.perout.rsv[1] || req.perout.rsv[2]
+			|| req.perout.rsv[3]) && cmd == PTP_PEROUT_REQUEST2) {
 			err = -EINVAL;
 			break;
 		} else if (cmd == PTP_PEROUT_REQUEST) {
diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h
index 039cd62ec706..9412b16cc8ed 100644
--- a/include/uapi/linux/ptp_clock.h
+++ b/include/uapi/linux/ptp_clock.h
@@ -67,7 +67,9 @@ struct ptp_perout_request {
 	struct ptp_clock_time start;  /* Absolute start time. */
 	struct ptp_clock_time period; /* Desired period, zero means disable. */
 	unsigned int index;           /* Which channel to configure. */
-	unsigned int flags;           /* Reserved for future use. */
+
+#define PTP_PEROUT_ONE_SHOT BIT(0)
+	unsigned int flags;           /* Bit 0 -> oneshot output. */
 	unsigned int rsv[4];          /* Reserved for future use. */
 };
 
-- 
2.22.0


^ permalink raw reply related

* [PATCH 1/2] PTP: introduce new versions of IOCTLs
From: Felipe Balbi @ 2019-08-14  7:47 UTC (permalink / raw)
  To: Richard Cochran; +Cc: Christopher S Hall, netdev, linux-kernel, Felipe Balbi

The current versions of the IOCTLs have a small problem which prevents us
from extending the API by making use of reserved fields: they do not
reject requests in which those fields are non-zero. In these new IOCTLs,
we now make sure that the flags and rsv fields are zero, which will
allow us to extend the API in the future.

Signed-off-by: Felipe Balbi <felipe.balbi@linux.intel.com>
---
 drivers/ptp/ptp_chardev.c      | 58 ++++++++++++++++++++++++++++++++--
 include/uapi/linux/ptp_clock.h | 12 +++++++
 2 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c
index 18ffe449efdf..204212fc3f8c 100644
--- a/drivers/ptp/ptp_chardev.c
+++ b/drivers/ptp/ptp_chardev.c
@@ -123,9 +123,11 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 	struct timespec64 ts;
 	int enable, err = 0;
 
+	memset(&req, 0, sizeof(req));
 	switch (cmd) {
 
 	case PTP_CLOCK_GETCAPS:
+	case PTP_CLOCK_GETCAPS2:
 		memset(&caps, 0, sizeof(caps));
 		caps.max_adj = ptp->info->max_adj;
 		caps.n_alarm = ptp->info->n_alarm;
@@ -139,11 +141,22 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 		break;
 
 	case PTP_EXTTS_REQUEST:
+	case PTP_EXTTS_REQUEST2:
 		if (copy_from_user(&req.extts, (void __user *)arg,
 				   sizeof(req.extts))) {
 			err = -EFAULT;
 			break;
 		}
+		if ((req.extts.flags || req.extts.rsv[0] || req.extts.rsv[1])
+			&& cmd == PTP_EXTTS_REQUEST2) {
+			err = -EINVAL;
+			break;
+		} else if (cmd == PTP_EXTTS_REQUEST) {
+			req.extts.flags = 0;
+			req.extts.rsv[0] = 0;
+			req.extts.rsv[1] = 0;
+		}
+			
 		if (req.extts.index >= ops->n_ext_ts) {
 			err = -EINVAL;
 			break;
@@ -154,11 +167,24 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 		break;
 
 	case PTP_PEROUT_REQUEST:
+	case PTP_PEROUT_REQUEST2:
 		if (copy_from_user(&req.perout, (void __user *)arg,
 				   sizeof(req.perout))) {
 			err = -EFAULT;
 			break;
 		}
+		if ((req.perout.flags || req.perout.rsv[0] || req.perout.rsv[1]
+				|| req.perout.rsv[2] || req.perout.rsv[3])
+			&& cmd == PTP_PEROUT_REQUEST2) {
+			err = -EINVAL;
+			break;
+		} else if (cmd == PTP_PEROUT_REQUEST) {
+			req.perout.flags = 0;
+			req.perout.rsv[0] = 0;
+			req.perout.rsv[1] = 0;
+			req.perout.rsv[2] = 0;
+			req.perout.rsv[3] = 0;
+		}
 		if (req.perout.index >= ops->n_per_out) {
 			err = -EINVAL;
 			break;
@@ -169,6 +195,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 		break;
 
 	case PTP_ENABLE_PPS:
+	case PTP_ENABLE_PPS2:
 		if (!capable(CAP_SYS_TIME))
 			return -EPERM;
 		req.type = PTP_CLK_REQ_PPS;
@@ -177,6 +204,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 		break;
 
 	case PTP_SYS_OFFSET_PRECISE:
+	case PTP_SYS_OFFSET_PRECISE2:
 		if (!ptp->info->getcrosststamp) {
 			err = -EOPNOTSUPP;
 			break;
@@ -201,6 +229,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 		break;
 
 	case PTP_SYS_OFFSET_EXTENDED:
+	case PTP_SYS_OFFSET_EXTENDED2:
 		if (!ptp->info->gettimex64) {
 			err = -EOPNOTSUPP;
 			break;
@@ -232,6 +261,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 		break;
 
 	case PTP_SYS_OFFSET:
+	case PTP_SYS_OFFSET2:
 		sysoff = memdup_user((void __user *)arg, sizeof(*sysoff));
 		if (IS_ERR(sysoff)) {
 			err = PTR_ERR(sysoff);
@@ -265,11 +295,23 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 			err = -EFAULT;
 		break;
 
-	case PTP_PIN_GETFUNC:
+	case PTP_PIN_GETFUNC2:
 		if (copy_from_user(&pd, (void __user *)arg, sizeof(pd))) {
 			err = -EFAULT;
 			break;
 		}
+		if ((pd.rsv[0] || pd.rsv[1] || pd.rsv[2]
+				|| pd.rsv[3] || pd.rsv[4])
+			&& cmd == PTP_PIN_GETFUNC2) {
+			err = -EINVAL;
+			break;
+		} else if (cmd == PTP_PIN_GETFUNC) {
+			pd.rsv[0] = 0;
+			pd.rsv[1] = 0;
+			pd.rsv[2] = 0;
+			pd.rsv[3] = 0;
+			pd.rsv[4] = 0;
+		}
 		pin_index = pd.index;
 		if (pin_index >= ops->n_pins) {
 			err = -EINVAL;
@@ -284,11 +326,23 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 			err = -EFAULT;
 		break;
 
-	case PTP_PIN_SETFUNC:
+	case PTP_PIN_SETFUNC2:
 		if (copy_from_user(&pd, (void __user *)arg, sizeof(pd))) {
 			err = -EFAULT;
 			break;
 		}
+		if ((pd.rsv[0] || pd.rsv[1] || pd.rsv[2]
+				|| pd.rsv[3] || pd.rsv[4])
+			&& cmd == PTP_PIN_SETFUNC2) {
+			err = -EINVAL;
+			break;
+		} else if (cmd == PTP_PIN_SETFUNC) {
+			pd.rsv[0] = 0;
+			pd.rsv[1] = 0;
+			pd.rsv[2] = 0;
+			pd.rsv[3] = 0;
+			pd.rsv[4] = 0;
+		}
 		pin_index = pd.index;
 		if (pin_index >= ops->n_pins) {
 			err = -EINVAL;
diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h
index 1bc794ad957a..039cd62ec706 100644
--- a/include/uapi/linux/ptp_clock.h
+++ b/include/uapi/linux/ptp_clock.h
@@ -149,6 +149,18 @@ struct ptp_pin_desc {
 #define PTP_SYS_OFFSET_EXTENDED \
 	_IOWR(PTP_CLK_MAGIC, 9, struct ptp_sys_offset_extended)
 
+#define PTP_CLOCK_GETCAPS2  _IOR(PTP_CLK_MAGIC, 10, struct ptp_clock_caps)
+#define PTP_EXTTS_REQUEST2  _IOW(PTP_CLK_MAGIC, 11, struct ptp_extts_request)
+#define PTP_PEROUT_REQUEST2 _IOW(PTP_CLK_MAGIC, 12, struct ptp_perout_request)
+#define PTP_ENABLE_PPS2     _IOW(PTP_CLK_MAGIC, 13, int)
+#define PTP_SYS_OFFSET2     _IOW(PTP_CLK_MAGIC, 14, struct ptp_sys_offset)
+#define PTP_PIN_GETFUNC2    _IOWR(PTP_CLK_MAGIC, 15, struct ptp_pin_desc)
+#define PTP_PIN_SETFUNC2    _IOW(PTP_CLK_MAGIC, 16, struct ptp_pin_desc)
+#define PTP_SYS_OFFSET_PRECISE2 \
+	_IOWR(PTP_CLK_MAGIC, 17, struct ptp_sys_offset_precise)
+#define PTP_SYS_OFFSET_EXTENDED2 \
+	_IOWR(PTP_CLK_MAGIC, 18, struct ptp_sys_offset_extended)
+
 struct ptp_extts_event {
 	struct ptp_clock_time t; /* Time event occured. */
 	unsigned int index;      /* Which channel produced the event. */
-- 
2.22.0


^ permalink raw reply related

* [PATCH net-next v4 0/2] qed*: Support for NVM config attributes.
From: Sudarsana Reddy Kalluru @ 2019-08-14  8:11 UTC (permalink / raw)
  To: davem; +Cc: netdev, mkalderon, aelior

The patch series adds support for managing the NVM config attributes.
Patch (1) adds functionality to update config attributes via MFW.
Patch (2) adds driver interface for updating the config attributes.

Changes from previous versions:
-------------------------------
v4: Added more details on the functionality and its usage.
v3: Removed unused variable.
v2: Removed unused API.

Please consider applying this series to "net-next".

Sudarsana Reddy Kalluru (2):
  qed: Add API for configuring NVM attributes.
  qed: Add driver API for flashing the config attributes.

 drivers/net/ethernet/qlogic/qed/qed_hsi.h  | 17 ++++++++
 drivers/net/ethernet/qlogic/qed/qed_main.c | 68 ++++++++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_mcp.c  | 32 ++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_mcp.h  | 20 +++++++++
 include/linux/qed/qed_if.h                 |  1 +
 5 files changed, 138 insertions(+)

-- 
1.8.3.1

^ permalink raw reply

* [PATCH net-next v4 1/2] qed: Add API for configuring NVM attributes.
From: Sudarsana Reddy Kalluru @ 2019-08-14  8:11 UTC (permalink / raw)
  To: davem; +Cc: netdev, mkalderon, aelior
In-Reply-To: <20190814081153.18889-1-skalluru@marvell.com>

The patch adds API for configuring the NVM config attributes using
Management FW (MFW) interfaces.

Signed-off-by: Sudarsana Reddy Kalluru <skalluru@marvell.com>
Signed-off-by: Ariel Elior <aelior@marvell.com>
---
 drivers/net/ethernet/qlogic/qed/qed_hsi.h | 17 ++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_mcp.c | 32 +++++++++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_mcp.h | 20 +++++++++++++++++++
 3 files changed, 69 insertions(+)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index e054f6c..557a12e 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -12580,6 +12580,8 @@ struct public_drv_mb {
 #define DRV_MSG_CODE_BW_UPDATE_ACK		0x32000000
 #define DRV_MSG_CODE_NIG_DRAIN			0x30000000
 #define DRV_MSG_CODE_S_TAG_UPDATE_ACK		0x3b000000
+#define DRV_MSG_CODE_GET_NVM_CFG_OPTION		0x003e0000
+#define DRV_MSG_CODE_SET_NVM_CFG_OPTION		0x003f0000
 #define DRV_MSG_CODE_INITIATE_PF_FLR            0x02010000
 #define DRV_MSG_CODE_VF_DISABLED_DONE		0xc0000000
 #define DRV_MSG_CODE_CFG_VF_MSIX		0xc0010000
@@ -12748,6 +12750,21 @@ struct public_drv_mb {
 #define DRV_MB_PARAM_FEATURE_SUPPORT_PORT_EEE		0x00000002
 #define DRV_MB_PARAM_FEATURE_SUPPORT_FUNC_VLINK		0x00010000
 
+#define DRV_MB_PARAM_NVM_CFG_OPTION_ID_SHIFT		0
+#define DRV_MB_PARAM_NVM_CFG_OPTION_ID_MASK		0x0000FFFF
+#define DRV_MB_PARAM_NVM_CFG_OPTION_ALL_SHIFT		16
+#define DRV_MB_PARAM_NVM_CFG_OPTION_ALL_MASK		0x00010000
+#define DRV_MB_PARAM_NVM_CFG_OPTION_INIT_SHIFT		17
+#define DRV_MB_PARAM_NVM_CFG_OPTION_INIT_MASK		0x00020000
+#define DRV_MB_PARAM_NVM_CFG_OPTION_COMMIT_SHIFT	18
+#define DRV_MB_PARAM_NVM_CFG_OPTION_COMMIT_MASK		0x00040000
+#define DRV_MB_PARAM_NVM_CFG_OPTION_FREE_SHIFT		19
+#define DRV_MB_PARAM_NVM_CFG_OPTION_FREE_MASK		0x00080000
+#define DRV_MB_PARAM_NVM_CFG_OPTION_ENTITY_SEL_SHIFT	20
+#define DRV_MB_PARAM_NVM_CFG_OPTION_ENTITY_SEL_MASK	0x00100000
+#define DRV_MB_PARAM_NVM_CFG_OPTION_ENTITY_ID_SHIFT	24
+#define DRV_MB_PARAM_NVM_CFG_OPTION_ENTITY_ID_MASK	0x0f000000
+
 	u32 fw_mb_header;
 #define FW_MSG_CODE_MASK			0xffff0000
 #define FW_MSG_CODE_UNSUPPORTED                 0x00000000
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index 758702c..89462c4 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -3750,3 +3750,35 @@ int qed_mcp_get_ppfid_bitmap(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 
 	return 0;
 }
+
+int qed_mcp_nvm_set_cfg(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
+			u16 option_id, u8 entity_id, u16 flags, u8 *p_buf,
+			u32 len)
+{
+	u32 mb_param = 0, resp, param;
+
+	QED_MFW_SET_FIELD(mb_param, DRV_MB_PARAM_NVM_CFG_OPTION_ID, option_id);
+	if (flags & QED_NVM_CFG_OPTION_ALL)
+		QED_MFW_SET_FIELD(mb_param,
+				  DRV_MB_PARAM_NVM_CFG_OPTION_ALL, 1);
+	if (flags & QED_NVM_CFG_OPTION_INIT)
+		QED_MFW_SET_FIELD(mb_param,
+				  DRV_MB_PARAM_NVM_CFG_OPTION_INIT, 1);
+	if (flags & QED_NVM_CFG_OPTION_COMMIT)
+		QED_MFW_SET_FIELD(mb_param,
+				  DRV_MB_PARAM_NVM_CFG_OPTION_COMMIT, 1);
+	if (flags & QED_NVM_CFG_OPTION_FREE)
+		QED_MFW_SET_FIELD(mb_param,
+				  DRV_MB_PARAM_NVM_CFG_OPTION_FREE, 1);
+	if (flags & QED_NVM_CFG_OPTION_ENTITY_SEL) {
+		QED_MFW_SET_FIELD(mb_param,
+				  DRV_MB_PARAM_NVM_CFG_OPTION_ENTITY_SEL, 1);
+		QED_MFW_SET_FIELD(mb_param,
+				  DRV_MB_PARAM_NVM_CFG_OPTION_ENTITY_ID,
+				  entity_id);
+	}
+
+	return qed_mcp_nvm_wr_cmd(p_hwfn, p_ptt,
+				  DRV_MSG_CODE_SET_NVM_CFG_OPTION,
+				  mb_param, &resp, &param, len, (u32 *)p_buf);
+}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
index e4f8fe4..83649a8 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
@@ -251,6 +251,12 @@ struct qed_mfw_tlv_generic {
 	struct qed_mfw_tlv_iscsi iscsi;
 };
 
+#define QED_NVM_CFG_OPTION_ALL		BIT(0)
+#define QED_NVM_CFG_OPTION_INIT		BIT(1)
+#define QED_NVM_CFG_OPTION_COMMIT       BIT(2)
+#define QED_NVM_CFG_OPTION_FREE		BIT(3)
+#define QED_NVM_CFG_OPTION_ENTITY_SEL	BIT(4)
+
 /**
  * @brief - returns the link params of the hw function
  *
@@ -1202,4 +1208,18 @@ void qed_mcp_resc_lock_default_init(struct qed_resc_lock_params *p_lock,
  */
 int qed_mcp_get_ppfid_bitmap(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt);
 
+/**
+ * @brief Set NVM config attribute value.
+ *
+ * @param p_hwfn
+ * @param p_ptt
+ * @param option_id
+ * @param entity_id
+ * @param flags
+ * @param p_buf
+ * @param len
+ */
+int qed_mcp_nvm_set_cfg(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
+			u16 option_id, u8 entity_id, u16 flags, u8 *p_buf,
+			u32 len);
 #endif
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH net-next v4 2/2] qed: Add driver API for flashing the config attributes.
From: Sudarsana Reddy Kalluru @ 2019-08-14  8:11 UTC (permalink / raw)
  To: davem; +Cc: netdev, mkalderon, aelior
In-Reply-To: <20190814081153.18889-1-skalluru@marvell.com>

The patch adds driver interface for reading the config attributes from user
provided buffer, and updates these values on nvm config flash partition.

This is basically an expansion of our existing ethtool -f implementation.
The management FW has exposed an additional method of configuring some of
the nvram options, and this makes use of that. This implementation will
come into use when newer FW files which contain configuration directives
employing this API will be provided to ethtool -f.

Signed-off-by: Sudarsana Reddy Kalluru <skalluru@marvell.com>
Signed-off-by: Ariel Elior <aelior@marvell.com>
---
 drivers/net/ethernet/qlogic/qed/qed_main.c | 68 ++++++++++++++++++++++++++++++
 include/linux/qed/qed_if.h                 |  1 +
 2 files changed, 69 insertions(+)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index e5ac8bd..0a76459 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -67,6 +67,8 @@
 #define QED_ROCE_QPS			(8192)
 #define QED_ROCE_DPIS			(8)
 #define QED_RDMA_SRQS                   QED_ROCE_QPS
+#define QED_NVM_CFG_SET_FLAGS		0xE
+#define QED_NVM_CFG_SET_PF_FLAGS	0x1E
 
 static char version[] =
 	"QLogic FastLinQ 4xxxx Core Module qed " DRV_MODULE_VERSION "\n";
@@ -2231,6 +2233,69 @@ static int qed_nvm_flash_image_validate(struct qed_dev *cdev,
 	return 0;
 }
 
+/* Binary file format -
+ *     /----------------------------------------------------------------------\
+ * 0B  |                       0x5 [command index]                            |
+ * 4B  | Entity ID     | Reserved        |  Number of config attributes       |
+ * 8B  | Config ID                       | Length        | Value              |
+ *     |                                                                      |
+ *     \----------------------------------------------------------------------/
+ * There can be several cfg_id-Length-Value sets as specified by 'Number of...'.
+ * Entity ID - A non zero entity value for which the config need to be updated.
+ *
+ * The API parses config attributes from the user provided buffer and flashes
+ * them to the respective NVM path using Management FW interface.
+ */
+static int qed_nvm_flash_cfg_write(struct qed_dev *cdev, const u8 **data)
+{
+	struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev);
+	u8 entity_id, len, buf[32];
+	struct qed_ptt *ptt;
+	u16 cfg_id, count;
+	int rc = 0, i;
+	u32 flags;
+
+	ptt = qed_ptt_acquire(hwfn);
+	if (!ptt)
+		return -EAGAIN;
+
+	/* NVM CFG ID attribute header */
+	*data += 4;
+	entity_id = **data;
+	*data += 2;
+	count = *((u16 *)*data);
+	*data += 2;
+
+	DP_VERBOSE(cdev, NETIF_MSG_DRV,
+		   "Read config ids: entity id %02x num _attrs = %0d\n",
+		   entity_id, count);
+	/* NVM CFG ID attributes */
+	for (i = 0; i < count; i++) {
+		cfg_id = *((u16 *)*data);
+		*data += 2;
+		len = **data;
+		(*data)++;
+		memcpy(buf, *data, len);
+		*data += len;
+
+		flags = entity_id ? QED_NVM_CFG_SET_PF_FLAGS :
+			QED_NVM_CFG_SET_FLAGS;
+
+		DP_VERBOSE(cdev, NETIF_MSG_DRV,
+			   "cfg_id = %d len = %d\n", cfg_id, len);
+		rc = qed_mcp_nvm_set_cfg(hwfn, ptt, cfg_id, entity_id, flags,
+					 buf, len);
+		if (rc) {
+			DP_ERR(cdev, "Error %d configuring %d\n", rc, cfg_id);
+			break;
+		}
+	}
+
+	qed_ptt_release(hwfn, ptt);
+
+	return rc;
+}
+
 static int qed_nvm_flash(struct qed_dev *cdev, const char *name)
 {
 	const struct firmware *image;
@@ -2272,6 +2337,9 @@ static int qed_nvm_flash(struct qed_dev *cdev, const char *name)
 			rc = qed_nvm_flash_image_access(cdev, &data,
 							&check_resp);
 			break;
+		case QED_NVM_FLASH_CMD_NVM_CFG_ID:
+			rc = qed_nvm_flash_cfg_write(cdev, &data);
+			break;
 		default:
 			DP_ERR(cdev, "Unknown command %08x\n", cmd_type);
 			rc = -EINVAL;
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 2302136..e366399 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -804,6 +804,7 @@ enum qed_nvm_flash_cmd {
 	QED_NVM_FLASH_CMD_FILE_DATA = 0x2,
 	QED_NVM_FLASH_CMD_FILE_START = 0x3,
 	QED_NVM_FLASH_CMD_NVM_CHANGE = 0x4,
+	QED_NVM_FLASH_CMD_NVM_CFG_ID = 0x5,
 	QED_NVM_FLASH_CMD_NVM_MAX,
 };
 
-- 
1.8.3.1


^ permalink raw reply related

* Re: pull-request: bpf-next 2019-08-14
From: Daniel Borkmann @ 2019-08-14  8:21 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: davem, ast, andrii.nakryiko, netdev, bpf
In-Reply-To: <20190813165902.41da0730@cakuba.netronome.com>

On 8/14/19 1:59 AM, Jakub Kicinski wrote:
> On Wed, 14 Aug 2019 01:16:39 +0200, Daniel Borkmann wrote:
>> Hi David, hi Jakub,
>>
>> The following pull-request contains BPF updates for your *net-next* tree.
> 
> Pulled, let me know if I did it wrong 🤞

LGTM, thanks a lot! :-)

^ permalink raw reply

* Re: [v5,0/4] tools: bpftool: add net attach/detach command to attach XDP prog
From: Daniel T. Lee @ 2019-08-14  8:25 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: Daniel Borkmann, Alexei Starovoitov, netdev
In-Reply-To: <20190813144303.10da8ff0@cakuba.netronome.com>

On Wed, Aug 14, 2019 at 6:43 AM Jakub Kicinski
<jakub.kicinski@netronome.com> wrote:
>
> On Tue, 13 Aug 2019 11:46:17 +0900, Daniel T. Lee wrote:
> > Currently, bpftool net only supports dumping progs attached on the
> > interface. To attach XDP prog on interface, user must use other tool
> > (eg. iproute2). By this patch, with `bpftool net attach/detach`, user
> > can attach/detach XDP prog on interface.
> >
> >     # bpftool prog
> >         16: xdp  name xdp_prog1  tag 539ec6ce11b52f98  gpl
> >         loaded_at 2019-08-07T08:30:17+0900  uid 0
> >         ...
> >         20: xdp  name xdp_fwd_prog  tag b9cb69f121e4a274  gpl
> >         loaded_at 2019-08-07T08:30:17+0900  uid 0
> >
> >     # bpftool net attach xdpdrv id 16 dev enp6s0np0
> >     # bpftool net
> >     xdp:
> >         enp6s0np0(4) driver id 16
> >
> >     # bpftool net attach xdpdrv id 20 dev enp6s0np0 overwrite
> >     # bpftool net
> >     xdp:
> >         enp6s0np0(4) driver id 20
> >
> >     # bpftool net detach xdpdrv dev enp6s0np0
> >     # bpftool net
> >     xdp:
> >
> >
> > While this patch only contains support for XDP, through `net
> > attach/detach`, bpftool can further support other prog attach types.
> >
> > XDP attach/detach tested on Mellanox ConnectX-4 and Netronome Agilio.
> >
> > ---
> > Changes in v5:
> >   - fix wrong error message, from errno to err with do_attach/detach
>
> The inconsistency in libbpf's error reporting is generally troubling,
> but not a problem of this set, so:
>
> Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
>
> In the future please keep review tags if you have only made minor
> changes to the code.

Thank you for the review.
Sorry to bother you. I'll keep that in mind.

Thanks!

^ permalink raw reply

* Re: [PATCH] net: ethernet: mediatek: Add MT7628/88 SoC support
From: Stefan Roese @ 2019-08-14  8:26 UTC (permalink / raw)
  To: René van Dorst
  Cc: netdev, linux-mediatek, Sean Wang, Felix Fietkau, John Crispin,
	Daniel Golle
In-Reply-To: <20190717125345.Horde.JcDE_nBChPFDDjEgIRfPSl3@www.vdorst.com>

Hi Rene,

On 17.07.19 14:53, René van Dorst wrote:

<snip>

>> +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
>> @@ -39,7 +39,8 @@
>>   				 NETIF_F_SG | NETIF_F_TSO | \
>>   				 NETIF_F_TSO6 | \
>>   				 NETIF_F_IPV6_CSUM)
>> -#define NEXT_RX_DESP_IDX(X, Y)	(((X) + 1) & ((Y) - 1))
>> +#define MTK_HW_FEATURES_MT7628	(NETIF_F_SG | NETIF_F_RXCSUM)
>> +#define NEXT_DESP_IDX(X, Y)	(((X) + 1) & ((Y) - 1))
>>
>>   #define MTK_MAX_RX_RING_NUM	4
>>   #define MTK_HW_LRO_DMA_SIZE	8
>> @@ -118,6 +119,7 @@
>>   /* PDMA Global Configuration Register */
>>   #define MTK_PDMA_GLO_CFG	0xa04
>>   #define MTK_MULTI_EN		BIT(10)
>> +#define MTK_PDMA_SIZE_8DWORDS	(1 << 4)
>>
>>   /* PDMA Reset Index Register */
>>   #define MTK_PDMA_RST_IDX	0xa08
>> @@ -276,11 +278,18 @@
>>   #define TX_DMA_OWNER_CPU	BIT(31)
>>   #define TX_DMA_LS0		BIT(30)
>>   #define TX_DMA_PLEN0(_x)	(((_x) & MTK_TX_DMA_BUF_LEN) << 16)
>> +#define TX_DMA_PLEN1(_x)	((_x) & MTK_TX_DMA_BUF_LEN)
>>   #define TX_DMA_SWC		BIT(14)
>>   #define TX_DMA_SDL(_x)		(((_x) & 0x3fff) << 16)
>>
>> +/* PDMA on MT7628 */
>> +#define TX_DMA_DONE		BIT(31)
>> +#define TX_DMA_LS1		BIT(14)
>> +#define TX_DMA_DESP2_DEF	(TX_DMA_LS0 | TX_DMA_DONE)
>> +
>>   /* QDMA descriptor rxd2 */
>>   #define RX_DMA_DONE		BIT(31)
>> +#define RX_DMA_LSO		BIT(30)
>>   #define RX_DMA_PLEN0(_x)	(((_x) & 0x3fff) << 16)
>>   #define RX_DMA_GET_PLEN0(_x)	(((_x) >> 16) & 0x3fff)
>>
>> @@ -289,6 +298,7 @@
>>
>>   /* QDMA descriptor rxd4 */
>>   #define RX_DMA_L4_VALID		BIT(24)
>> +#define RX_DMA_L4_VALID_PDMA	BIT(30)		/* when PDMA is used */
>>   #define RX_DMA_FPORT_SHIFT	19
>>   #define RX_DMA_FPORT_MASK	0x7
>>
>> @@ -412,6 +422,19 @@
>>   #define CO_QPHY_SEL            BIT(0)
>>   #define GEPHY_MAC_SEL          BIT(1)
>>
>> +/* MT7628/88 specific stuff */
>> +#define MT7628_PDMA_OFFSET	0x0800
>> +#define MT7628_SDM_OFFSET	0x0c00
>> +
>> +#define MT7628_TX_BASE_PTR0	(MT7628_PDMA_OFFSET + 0x00)
>> +#define MT7628_TX_MAX_CNT0	(MT7628_PDMA_OFFSET + 0x04)
>> +#define MT7628_TX_CTX_IDX0	(MT7628_PDMA_OFFSET + 0x08)
>> +#define MT7628_TX_DTX_IDX0	(MT7628_PDMA_OFFSET + 0x0c)
>> +#define MT7628_PST_DTX_IDX0	BIT(0)
>> +
>> +#define MT7628_SDM_MAC_ADRL	(MT7628_SDM_OFFSET + 0x0c)
>> +#define MT7628_SDM_MAC_ADRH	(MT7628_SDM_OFFSET + 0x10)
>> +
>>   struct mtk_rx_dma {
>>   	unsigned int rxd1;
>>   	unsigned int rxd2;
>> @@ -509,6 +532,7 @@ enum mtk_clks_map {
>>   				 BIT(MTK_CLK_SGMII_CK) | \
>>   				 BIT(MTK_CLK_ETH2PLL))
>>   #define MT7621_CLKS_BITMAP	(0)
>> +#define MT7628_CLKS_BITMAP	(0)
>>   #define MT7629_CLKS_BITMAP	(BIT(MTK_CLK_ETHIF) | BIT(MTK_CLK_ESW) |  \
>>   				 BIT(MTK_CLK_GP0) | BIT(MTK_CLK_GP1) | \
>>   				 BIT(MTK_CLK_GP2) | BIT(MTK_CLK_FE) | \
>> @@ -563,6 +587,10 @@ struct mtk_tx_ring {
>>   	struct mtk_tx_dma *last_free;
>>   	u16 thresh;
>>   	atomic_t free_count;
>> +	int dma_size;
>> +	struct mtk_tx_dma *dma_pdma;	/* For MT7628/88 PDMA handling */
>> +	dma_addr_t phys_pdma;
>> +	int cpu_idx;
>>   };
>>
>>   /* PDMA rx ring mode */
>> @@ -604,6 +632,7 @@ enum mkt_eth_capabilities {
>>   	MTK_HWLRO_BIT,
>>   	MTK_SHARED_INT_BIT,
>>   	MTK_TRGMII_MT7621_CLK_BIT,
>> +	MTK_SOC_MT7628,
> 
> This should be MTK_SOC_MT7628_BIT, this only defines the bit number!
> 
> and further on #define MTK_SOC_MT7628 BIT(MTK_SOC_MT7628_BIT)

Okay, thanks.
  
> Based on this commit [0], MT7621 also needs the PDMA for the RX path.
> I know that is not your issue but I think it is better to add a extra
> capability bit for the PDMA bits so it can also be used on other socs.

Yes, MT7621 also uses PDMA for RX. The code for RX is pretty much
shared (re-used), with slight changes for the MT7628/88 to work
correctly on this SoC.

I'll work on a capability bit for PDMA vs QDMA on TX though. This
might make things a little more transparent.
  
> Greats,
> 
> René
> 
> [0] https://lkml.org/lkml/2018/3/14/1038

Thanks,
Stefan

^ permalink raw reply

* [PATCH net-next] r8152: divide the tx and rx bottom functions
From: Hayes Wang @ 2019-08-14  8:30 UTC (permalink / raw)
  To: netdev; +Cc: nic_swsd, linux-kernel, Hayes Wang

Move the tx bottom function from NAPI to a new tasklet. Then, for
multi-cores, the bottom functions of tx and rx may be run at same
time with different cores. This is used to improve performance.

Signed-off-by: Hayes Wang <hayeswang@realtek.com>
---
 drivers/net/usb/r8152.c | 39 ++++++++++++++++++++++++++-------------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index 40d18e866269..3ed9f8e082c9 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -619,7 +619,7 @@ enum rtl8152_flags {
 	RTL8152_LINK_CHG,
 	SELECTIVE_SUSPEND,
 	PHY_RESET,
-	SCHEDULE_NAPI,
+	SCHEDULE_TASKLET,
 	GREEN_ETHERNET,
 	DELL_TB_RX_AGG_BUG,
 };
@@ -733,6 +733,7 @@ struct r8152 {
 #ifdef CONFIG_PM_SLEEP
 	struct notifier_block pm_notifier;
 #endif
+	struct tasklet_struct tx_tl;
 
 	struct rtl_ops {
 		void (*init)(struct r8152 *);
@@ -1401,7 +1402,7 @@ static void write_bulk_callback(struct urb *urb)
 		return;
 
 	if (!skb_queue_empty(&tp->tx_queue))
-		napi_schedule(&tp->napi);
+		tasklet_schedule(&tp->tx_tl);
 }
 
 static void intr_callback(struct urb *urb)
@@ -2179,8 +2180,12 @@ static void tx_bottom(struct r8152 *tp)
 	} while (res == 0);
 }
 
-static void bottom_half(struct r8152 *tp)
+static void bottom_half(unsigned long data)
 {
+	struct r8152 *tp;
+
+	tp = (struct r8152 *)data;
+
 	if (test_bit(RTL8152_UNPLUG, &tp->flags))
 		return;
 
@@ -2192,7 +2197,7 @@ static void bottom_half(struct r8152 *tp)
 	if (!netif_carrier_ok(tp->netdev))
 		return;
 
-	clear_bit(SCHEDULE_NAPI, &tp->flags);
+	clear_bit(SCHEDULE_TASKLET, &tp->flags);
 
 	tx_bottom(tp);
 }
@@ -2203,16 +2208,12 @@ static int r8152_poll(struct napi_struct *napi, int budget)
 	int work_done;
 
 	work_done = rx_bottom(tp, budget);
-	bottom_half(tp);
 
 	if (work_done < budget) {
 		if (!napi_complete_done(napi, work_done))
 			goto out;
 		if (!list_empty(&tp->rx_done))
 			napi_schedule(napi);
-		else if (!skb_queue_empty(&tp->tx_queue) &&
-			 !list_empty(&tp->tx_free))
-			napi_schedule(napi);
 	}
 
 out:
@@ -2366,11 +2367,11 @@ static netdev_tx_t rtl8152_start_xmit(struct sk_buff *skb,
 
 	if (!list_empty(&tp->tx_free)) {
 		if (test_bit(SELECTIVE_SUSPEND, &tp->flags)) {
-			set_bit(SCHEDULE_NAPI, &tp->flags);
+			set_bit(SCHEDULE_TASKLET, &tp->flags);
 			schedule_delayed_work(&tp->schedule, 0);
 		} else {
 			usb_mark_last_busy(tp->udev);
-			napi_schedule(&tp->napi);
+			tasklet_schedule(&tp->tx_tl);
 		}
 	} else if (skb_queue_len(&tp->tx_queue) > tp->tx_qlen) {
 		netif_stop_queue(netdev);
@@ -4020,9 +4021,11 @@ static void set_carrier(struct r8152 *tp)
 	} else {
 		if (netif_carrier_ok(netdev)) {
 			netif_carrier_off(netdev);
+			tasklet_disable(&tp->tx_tl);
 			napi_disable(napi);
 			tp->rtl_ops.disable(tp);
 			napi_enable(napi);
+			tasklet_enable(&tp->tx_tl);
 			netif_info(tp, link, netdev, "carrier off\n");
 		}
 	}
@@ -4055,10 +4058,10 @@ static void rtl_work_func_t(struct work_struct *work)
 	if (test_and_clear_bit(RTL8152_SET_RX_MODE, &tp->flags))
 		_rtl8152_set_rx_mode(tp->netdev);
 
-	/* don't schedule napi before linking */
-	if (test_and_clear_bit(SCHEDULE_NAPI, &tp->flags) &&
+	/* don't schedule tasklet before linking */
+	if (test_and_clear_bit(SCHEDULE_TASKLET, &tp->flags) &&
 	    netif_carrier_ok(tp->netdev))
-		napi_schedule(&tp->napi);
+		tasklet_schedule(&tp->tx_tl);
 
 	mutex_unlock(&tp->control);
 
@@ -4144,6 +4147,7 @@ static int rtl8152_open(struct net_device *netdev)
 		goto out_unlock;
 	}
 	napi_enable(&tp->napi);
+	tasklet_enable(&tp->tx_tl);
 
 	mutex_unlock(&tp->control);
 
@@ -4171,6 +4175,7 @@ static int rtl8152_close(struct net_device *netdev)
 #ifdef CONFIG_PM_SLEEP
 	unregister_pm_notifier(&tp->pm_notifier);
 #endif
+	tasklet_disable(&tp->tx_tl);
 	if (!test_bit(RTL8152_UNPLUG, &tp->flags))
 		napi_disable(&tp->napi);
 	clear_bit(WORK_ENABLE, &tp->flags);
@@ -4440,6 +4445,7 @@ static int rtl8152_pre_reset(struct usb_interface *intf)
 		return 0;
 
 	netif_stop_queue(netdev);
+	tasklet_disable(&tp->tx_tl);
 	napi_disable(&tp->napi);
 	clear_bit(WORK_ENABLE, &tp->flags);
 	usb_kill_urb(tp->intr_urb);
@@ -4483,6 +4489,7 @@ static int rtl8152_post_reset(struct usb_interface *intf)
 	}
 
 	napi_enable(&tp->napi);
+	tasklet_enable(&tp->tx_tl);
 	netif_wake_queue(netdev);
 	usb_submit_urb(tp->intr_urb, GFP_KERNEL);
 
@@ -4636,10 +4643,12 @@ static int rtl8152_system_suspend(struct r8152 *tp)
 
 		clear_bit(WORK_ENABLE, &tp->flags);
 		usb_kill_urb(tp->intr_urb);
+		tasklet_disable(&tp->tx_tl);
 		napi_disable(napi);
 		cancel_delayed_work_sync(&tp->schedule);
 		tp->rtl_ops.down(tp);
 		napi_enable(napi);
+		tasklet_enable(&tp->tx_tl);
 	}
 
 	return 0;
@@ -5499,6 +5508,8 @@ static int rtl8152_probe(struct usb_interface *intf,
 	mutex_init(&tp->control);
 	INIT_DELAYED_WORK(&tp->schedule, rtl_work_func_t);
 	INIT_DELAYED_WORK(&tp->hw_phy_work, rtl_hw_phy_work_func_t);
+	tasklet_init(&tp->tx_tl, bottom_half, (unsigned long)tp);
+	tasklet_disable(&tp->tx_tl);
 
 	netdev->netdev_ops = &rtl8152_netdev_ops;
 	netdev->watchdog_timeo = RTL8152_TX_TIMEOUT;
@@ -5585,6 +5596,7 @@ static int rtl8152_probe(struct usb_interface *intf,
 
 out1:
 	netif_napi_del(&tp->napi);
+	tasklet_kill(&tp->tx_tl);
 	usb_set_intfdata(intf, NULL);
 out:
 	free_netdev(netdev);
@@ -5601,6 +5613,7 @@ static void rtl8152_disconnect(struct usb_interface *intf)
 
 		netif_napi_del(&tp->napi);
 		unregister_netdev(tp->netdev);
+		tasklet_kill(&tp->tx_tl);
 		cancel_delayed_work_sync(&tp->hw_phy_work);
 		tp->rtl_ops.unload(tp);
 		free_netdev(tp->netdev);
-- 
2.21.0


^ permalink raw reply related

* Re: [PATCH net-next v2 6/9] net: macsec: hardware offloading infrastructure
From: Antoine Tenart @ 2019-08-14  8:31 UTC (permalink / raw)
  To: Igor Russkikh
  Cc: Andrew Lunn, Antoine Tenart, davem@davemloft.net,
	sd@queasysnail.net, f.fainelli@gmail.com, hkallweit1@gmail.com,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	thomas.petazzoni@bootlin.com, alexandre.belloni@bootlin.com,
	allan.nielsen@microchip.com, camelia.groza@nxp.com,
	Simon Edelhaus, Pavel Belous
In-Reply-To: <2e3c2307-d414-a531-26cb-064e05fa01fc@aquantia.com>

Hi Igor,

On Tue, Aug 13, 2019 at 04:18:40PM +0000, Igor Russkikh wrote:
> On 13.08.2019 16:17, Andrew Lunn wrote:
> > On Tue, Aug 13, 2019 at 10:58:17AM +0200, Antoine Tenart wrote:
> >> I think this question is linked to the use of a MACsec virtual interface
> >> when using h/w offloading. The starting point for me was that I wanted
> >> to reuse the data structures and the API exposed to the userspace by the
> >> s/w implementation of MACsec. I then had two choices: keeping the exact
> >> same interface for the user (having a virtual MACsec interface), or
> >> registering the MACsec genl ops onto the real net devices (and making
> >> the s/w implementation a virtual net dev and a provider of the MACsec
> >> "offloading" ops).
> >>
> >> The advantages of the first option were that nearly all the logic of the
> >> s/w implementation could be kept and especially that it would be
> >> transparent for the user to use both implementations of MACsec.
> > 
> > We have always talked about offloading operations to the hardware,
> > accelerating what the linux stack can do by making use of hardware
> > accelerators. The basic user API should not change because of
> > acceleration. Those are the general guidelines.
> > 
> > It would however be interesting to get comments from those who did the
> > software implementation and what they think of this architecture. I've
> > no personal experience with MACSec, so it is hard for me to say if the
> > current architecture makes sense when using accelerators.
> 
> In terms of overall concepts, I'd add the following:
> 
> 1) With current implementation it's impossible to install SW macsec engine onto
> the device which supports HW offload. That could be a strong limitation in
> cases when user sees HW macsec offload is broken or work differently, and he/she
> wants to replace it with SW one.
> MACSec is a complex feature, and it may happen something is missing in HW.
> Trivial example is 256bit encryption, which is not always a musthave in HW
> implementations.

Agreed. I'm not sure it would be possible to have both used at the same
time but there should be a way to switch between the two
implementations. That is not supported for now, but I think that would
be a good thing, and can probably come later on.

> 2) I think, Antoine, its not totally true that otherwise the user macsec API
> will be broken/changed. netlink api is the same, the only thing we may want to
> add is an optional parameter to force selection of SW macsec engine.

I meant that we can either have a virtual net device representing the
MACsec feature and being the iface used to configure it, or we could
have it only when s/w MACsec is used. That to me is part of the "API",
or at least part of what's exposed to the user.

> I'm also eager to hear from sw macsec users/devs on whats better here.

I'd like more comments as well :)

Thanks!
Antoine

-- 
Antoine Ténart, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com

^ permalink raw reply

* Re: [PATCH net-next v2 6/9] net: macsec: hardware offloading infrastructure
From: Antoine Tenart @ 2019-08-14  8:32 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: Igor Russkikh, Antoine Tenart, davem@davemloft.net,
	sd@queasysnail.net, f.fainelli@gmail.com, hkallweit1@gmail.com,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	thomas.petazzoni@bootlin.com, alexandre.belloni@bootlin.com,
	allan.nielsen@microchip.com, camelia.groza@nxp.com,
	Simon Edelhaus, Pavel Belous
In-Reply-To: <20190813162823.GH15047@lunn.ch>

Hi Andrew,

On Tue, Aug 13, 2019 at 06:28:23PM +0200, Andrew Lunn wrote:
> 
> It would also be nice to add extra information to the netlink API to
> indicate if HW or SW is being used. In other places where we offload
> to accelerators we have such additional information.

Yes, that would be very nice to have.

Thanks!
Antoine

-- 
Antoine Ténart, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com

^ permalink raw reply

* Re: [PATCH 3/3] ocelot_ace: fix action of trap
From: Allan W. Nielsen @ 2019-08-14  8:57 UTC (permalink / raw)
  To: Y.b. Lu
  Cc: Andrew Lunn, netdev@vger.kernel.org, David S . Miller,
	Alexandre Belloni, Microchip Linux Driver Support
In-Reply-To: <VI1PR0401MB2237D9358AA17400E72A776EF8AD0@VI1PR0401MB2237.eurprd04.prod.outlook.com>

Hi Y.b. and Andrew,

The 08/14/2019 04:28, Y.b. Lu wrote:
> > > I'd like to trap all IEEE 1588 PTP Ethernet frames to CPU through etype
> > 0x88f7.
> > 
> > Is this the correct way to handle PTP for this switch? For other switches we
> > don't need such traps. The switch itself identifies PTP frames and forwards
> > them to the CPU so it can process them.
> > 
> > I'm just wondering if your general approach is wrong?
> 
> [Y.b. Lu] PTP messages over Ethernet will use two multicast addresses.
> 01-80-C2-00-00-0E for peer delay messages.
Yes, and as you write, this is a BPDU which must not be forwarded (and they are
not).

> 01-1B-19-00-00-00 for other messages.
Yes, this is a normal L2 multicast address, which by default is broadcast.

> But only 01-80-C2-00-00-0E could be handled by hardware filter for BPDU frames
> (01-80-C2-00-00-0x).  For PTP messages handling, trapping them to CPU through
> VCAP IS2 is the suggested way by Ocelot/Felix.
As I see it there are at least 3 scenarios which need to be considered and
ideally supported:

1) Operate as a PTP-unaware switch. This means that Peer-delays messages are
   trapped to the CPU and not handled. End-to-End PTP sessions can still run on
   the network as 01-1B-19-00-00-00 frames are forwarded normally.

   This is what we have by default today.

2) A "passive" PTP switch (in MSCC/MCHP we call this an end-to-end transparent
   clock) where 01-80-C2-00-00-0E frames are still trapped to the CPU (and not
   handled), 01-1B-19-00-00-00 frames are forwarded, but we use the TCAM to add
   the residence time to the correction field in Sync and Delay request
   messages.

   This is a simple mechanism which allow end-to-end PTP sessions to synchronize
   their time, and compensate for the variable delay caused by the switch.

   Compared to implementing a complete boundary clock, this is much simpler, and
   causes a much lower workload on the CPU (even small switches may be serving
   many, many PTP sessions).

3) Full PTP aware switch. In this mode we need all PTP frames trapped (on the
   ports where PTP is running) to the CPU, and we need a PTP daemon in
   user-space to process them in order for things to work.

   I guess this is what you are trying to achieve.

Eventually, I hope we can get to a point where all (and maybe more) scenarios
are supported.

Lets consider them case by case:

1) This is what we have today.

2) To support this, we need a SW implementation of this, and then we can add
   hooks to offload this in HW.

   We would certainly be interested in supporting this in both SW and HW.

3) It can be done via 'tc' using the trap action, but I do not know if this is
   the desired way of doing it. Ocelot will be using a TCAM rule to do this,
   which aligns nicely with the 'tc' approach, but other chips may have
   dedicated HW for doing this.

   Also, in the current implementation we will be using a rule per port, and
   ideally we could have done it with a single rule (this is what Y.B. prepared
   in this patch series).

I'm very much against configuring option 3 in the driver initialization, as it
will prevent us from having a conforming switch if a PTP daemon is not running
in user-space, and it gives us very little room for supporting other ways in the
future without breaking backwards compatibility.

> I have a question since you are experts.
I'm not really an expert on this, but I have access to some good guidance from
colleagues who know PTP very well :-D

> For other switches, whether they are always trapping PTP messages to CPU?
Good question, I could not find anything in the SW bridge forcing option 3.

My understanding is that the SW bridge is implementing option 1, but I could be
wrong.

> Is there any common method in linux to configure switch to select trapping or
> just forwarding PTP messages?
You should be able to use TC for this. But due to the port vs port-mask
limitation you will need to install a rule per port.

I do not know if this is what others are doing, but would like to learn about
that.

/Allan

^ permalink raw reply

* Re: [PATCH v4 13/14] net: phy: adin: add ethtool get_stats support
From: Ardelean, Alexandru @ 2019-08-14  9:08 UTC (permalink / raw)
  To: andrew@lunn.ch
  Cc: davem@davemloft.net, hkallweit1@gmail.com,
	devicetree@vger.kernel.org, mark.rutland@arm.com,
	linux-kernel@vger.kernel.org, f.fainelli@gmail.com,
	netdev@vger.kernel.org, robh+dt@kernel.org
In-Reply-To: <c3fdb21c40900dae0e52b02b98fe27924a76c256.camel@analog.com>

On Tue, 2019-08-13 at 08:48 +0300, Alexandru Ardelean wrote:
> On Mon, 2019-08-12 at 16:33 +0200, Andrew Lunn wrote:
> > [External]
> > 
> > > +static int adin_read_mmd_stat_regs(struct phy_device *phydev,
> > > +				   struct adin_hw_stat *stat,
> > > +				   u32 *val)
> > > +{
> > > +	int ret;
> > > +
> > > +	ret = phy_read_mmd(phydev, MDIO_MMD_VEND1, stat->reg1);
> > > +	if (ret < 0)
> > > +		return ret;
> > > +
> > > +	*val = (ret & 0xffff);
> > > +
> > > +	if (stat->reg2 == 0)
> > > +		return 0;
> > > +
> > > +	ret = phy_read_mmd(phydev, MDIO_MMD_VEND1, stat->reg2);
> > > +	if (ret < 0)
> > > +		return ret;
> > > +
> > > +	*val <<= 16;
> > > +	*val |= (ret & 0xffff);
> > > +
> > > +	return 0;
> > > +}
> > 
> > It still looks like you have not dealt with overflow from the LSB into
> > the MSB between the two reads.
> 
> Apologies for forgetting to address this.
> I did not intentionally leave it out; this item got lost after V1 [which had the most remarks].
> Changelog V1 -> V2 was quite bulky, and I did not look at V1 remarks after I finished V2.
> 
> Thanks for snippet.

So, I have to apologize again here.
I guess I was an idiot/n00b about this.

The PHY stats do support snapshot, and I sync-ed with someone from the chip-team to confirm.

Also, from the datasheet[1] (page 29 - FRAME GENERATOR AND CHECKER - 5th paragraph):
--------------------------------------------------------------
The frame checker counts the number of CRC errors and these 
are reported in the receive error counter register (RxErrCnt 
register,  address 0x0014). To ensure synchronization between 
the frame checker error counter and frame checker frame counters,
all of the counters are latched once the receive error counter
register is read. Hence when using the frame checker, the
receive error counter should be read first and then all the other
frame counters and error counters should be read. A latched copy
of the receive frame counter register is available in the
FcFrmCntH and FcFrmCntL registers (register addresses 0x1E.0x940A
and 0x1E.0x940B).
-------------------------------------------------------------

Then in the description of these regs, it mentions (repeatedly):
-------------------------------------------------------------
This register is a latched copy of bits 31:16 of the 32-bit
receive frame counter register. When the receive error counter
(RxErrCnt register address 0x0014) is read, the receive frame
counter register is latched. A copy of the receive frame counter
register is latched when RxErrCnt is read so that the error count
and receive frame count are synchronized
-------------------------------------------------------------

I'll re-spin this with the rename of the strings, and maybe do a minor polish of the code.

Thanks & sorry for the noise/trouble
Alex

[1] https://www.analog.com/media/en/technical-documentation/data-sheets/ADIN1300.pdf


> 
> > 	do {
> > 		hi1 = phy_read_mmd(phydev, MDIO_MMD_VEND1, stat->reg2);
> > 		if (hi1 < 0)
> > 			return hi1;
> > 		
> > 		low = phy_read_mmd(phydev, MDIO_MMD_VEND1, stat->reg1);
> > 		if (low < 0)
> > 			return low;
> > 
> > 		hi2 = phy_read_mmd(phydev, MDIO_MMD_VEND1, stat->reg2);
> > 		if (hi2 < 0)
> > 			return hi1;
> > 	} while (hi1 != hi2)
> > 
> > 	return low | (hi << 16);
> > 
> > 	Andrew

^ permalink raw reply

* [PATCH net] net/packet: fix race in tpacket_snd()
From: Eric Dumazet @ 2019-08-14  9:11 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Eric Dumazet, syzbot

packet_sendmsg() checks tx_ring.pg_vec to decide
if it must call tpacket_snd().

Problem is that the check is lockless, meaning another thread
can issue a concurrent setsockopt(PACKET_TX_RING ) to flip
tx_ring.pg_vec back to NULL.

Given that tpacket_snd() grabs pg_vec_lock mutex, we can
perform the check again to solve the race.

syzbot reported :

kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] PREEMPT SMP KASAN
CPU: 1 PID: 11429 Comm: syz-executor394 Not tainted 5.3.0-rc4+ #101
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:packet_lookup_frame+0x8d/0x270 net/packet/af_packet.c:474
Code: c1 ee 03 f7 73 0c 80 3c 0e 00 0f 85 cb 01 00 00 48 8b 0b 89 c0 4c 8d 24 c1 48 b8 00 00 00 00 00 fc ff df 4c 89 e1 48 c1 e9 03 <80> 3c 01 00 0f 85 94 01 00 00 48 8d 7b 10 4d 8b 3c 24 48 b8 00 00
RSP: 0018:ffff88809f82f7b8 EFLAGS: 00010246
RAX: dffffc0000000000 RBX: ffff8880a45c7030 RCX: 0000000000000000
RDX: 0000000000000000 RSI: 1ffff110148b8e06 RDI: ffff8880a45c703c
RBP: ffff88809f82f7e8 R08: ffff888087aea200 R09: fffffbfff134ae50
R10: fffffbfff134ae4f R11: ffffffff89a5727f R12: 0000000000000000
R13: 0000000000000001 R14: ffff8880a45c6ac0 R15: 0000000000000000
FS:  00007fa04716f700(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fa04716edb8 CR3: 0000000091eb4000 CR4: 00000000001406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 packet_current_frame net/packet/af_packet.c:487 [inline]
 tpacket_snd net/packet/af_packet.c:2667 [inline]
 packet_sendmsg+0x590/0x6250 net/packet/af_packet.c:2975
 sock_sendmsg_nosec net/socket.c:637 [inline]
 sock_sendmsg+0xd7/0x130 net/socket.c:657
 ___sys_sendmsg+0x3e2/0x920 net/socket.c:2311
 __sys_sendmmsg+0x1bf/0x4d0 net/socket.c:2413
 __do_sys_sendmmsg net/socket.c:2442 [inline]
 __se_sys_sendmmsg net/socket.c:2439 [inline]
 __x64_sys_sendmmsg+0x9d/0x100 net/socket.c:2439
 do_syscall_64+0xfd/0x6a0 arch/x86/entry/common.c:296
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Fixes: 69e3c75f4d54 ("net: TX_RING and packet mmap")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
---
 net/packet/af_packet.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 8d54f3047768d2272cbd28a7bcda33df800aa589..e2742b006d255f598fc98953dbb823f615d2bf9a 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2618,6 +2618,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 
 	mutex_lock(&po->pg_vec_lock);
 
+	/* packet_sendmsg() check on tx_ring.pg_vec was lockless,
+	 * we need to confirm it under protection of pg_vec_lock.
+	 */
+	if (unlikely(!po->tx_ring.pg_vec)) {
+		err = -EBUSY;
+		goto out;
+	}
 	if (likely(saddr == NULL)) {
 		dev	= packet_cached_dev_get(po);
 		proto	= po->num;
-- 
2.23.0.rc1.153.gdeed80330f-goog


^ permalink raw reply related

* Re: [v2, 4/4] ocelot: add VCAP IS2 rule to trap PTP Ethernet frames
From: Allan W . Nielsen @ 2019-08-14  9:16 UTC (permalink / raw)
  To: Y.b. Lu
  Cc: Andrew Lunn, netdev@vger.kernel.org, David S . Miller,
	Alexandre Belloni, Microchip Linux Driver Support
In-Reply-To: <VI1PR0401MB2237E0F32D6CC719682E8C1AF8AD0@VI1PR0401MB2237.eurprd04.prod.outlook.com>

The 08/14/2019 04:56, Y.b. Lu wrote:
> > -----Original Message-----
> > From: Allan W . Nielsen <allan.nielsen@microchip.com>
> > Sent: Tuesday, August 13, 2019 2:25 PM
> > To: Y.b. Lu <yangbo.lu@nxp.com>
> > Cc: netdev@vger.kernel.org; David S . Miller <davem@davemloft.net>;
> > Alexandre Belloni <alexandre.belloni@bootlin.com>; Microchip Linux Driver
> > Support <UNGLinuxDriver@microchip.com>
> > Subject: Re: [v2, 4/4] ocelot: add VCAP IS2 rule to trap PTP Ethernet frames
> > 
> > The 08/13/2019 10:52, Yangbo Lu wrote:
> > > All the PTP messages over Ethernet have etype 0x88f7 on them.
> > > Use etype as the key to trap PTP messages.
> > >
> > > Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
> > > ---
> > > Changes for v2:
> > > 	- Added this patch.
> > > ---
> > >  drivers/net/ethernet/mscc/ocelot.c | 28 ++++++++++++++++++++++++++++
> > >  1 file changed, 28 insertions(+)
> > >
> > > diff --git a/drivers/net/ethernet/mscc/ocelot.c
> > > b/drivers/net/ethernet/mscc/ocelot.c
> > > index 6932e61..40f4e0d 100644
> > > --- a/drivers/net/ethernet/mscc/ocelot.c
> > > +++ b/drivers/net/ethernet/mscc/ocelot.c
> > > @@ -1681,6 +1681,33 @@ int ocelot_probe_port(struct ocelot *ocelot, u8
> > > port,  }  EXPORT_SYMBOL(ocelot_probe_port);
> > >
> > > +static int ocelot_ace_add_ptp_rule(struct ocelot *ocelot) {
> > > +	struct ocelot_ace_rule *rule;
> > > +
> > > +	rule = kzalloc(sizeof(*rule), GFP_KERNEL);
> > > +	if (!rule)
> > > +		return -ENOMEM;
> > > +
> > > +	/* Entry for PTP over Ethernet (etype 0x88f7)
> > > +	 * Action: trap to CPU port
> > > +	 */
> > > +	rule->ocelot = ocelot;
> > > +	rule->prio = 1;
> > > +	rule->type = OCELOT_ACE_TYPE_ETYPE;
> > > +	/* Available on all ingress port except CPU port */
> > > +	rule->ingress_port = ~BIT(ocelot->num_phys_ports);
> > > +	rule->dmac_mc = OCELOT_VCAP_BIT_1;
> > > +	rule->frame.etype.etype.value[0] = 0x88;
> > > +	rule->frame.etype.etype.value[1] = 0xf7;
> > > +	rule->frame.etype.etype.mask[0] = 0xff;
> > > +	rule->frame.etype.etype.mask[1] = 0xff;
> > > +	rule->action = OCELOT_ACL_ACTION_TRAP;
> > > +
> > > +	ocelot_ace_rule_offload_add(rule);
> > > +	return 0;
> > > +}
> > > +
> > >  int ocelot_init(struct ocelot *ocelot)  {
> > >  	u32 port;
> > > @@ -1708,6 +1735,7 @@ int ocelot_init(struct ocelot *ocelot)
> > >  	ocelot_mact_init(ocelot);
> > >  	ocelot_vlan_init(ocelot);
> > >  	ocelot_ace_init(ocelot);
> > > +	ocelot_ace_add_ptp_rule(ocelot);
> > >
> > >  	for (port = 0; port < ocelot->num_phys_ports; port++) {
> > >  		/* Clear all counters (5 groups) */
> > This seems really wrong to me, and much too hard-coded...
> > 
> > What if I want to forward the PTP frames to be forwarded like a normal
> > non-aware PTP switch?
> 
> [Y.b. Lu] As Andrew said, other switches could identify PTP messages and forward to CPU for processing.
> https://patchwork.ozlabs.org/patch/1145627/
Yes, it would be good to see some examples to understand this better.

> I'm also wondering whether there is common method in linux to address your questions.
Me too.

> If no, I think trapping all PTP messages on all ports to CPU could be used for now.
> If users require PTP synchronization, they actually don’t want a non-aware PTP switch.
Can we continue this discussion in the other thread where I listed the 3
scenarios?

> I once see other ocelot code configure ptp trap rules in ioctl timestamping
> setting. But I don’t think it's proper either.  Enable timestamping doesn’t
> mean we want to trap PTP messages.
Where did you see this?

The effort in [1] is just about the time-stamping and does not really consider
the bridge part of it, and it should not be installing any TCAM rules (I believe
it did in earlier versions, but this has been changed).

[1] https://patchwork.ozlabs.org/patch/1145777/

> > What if do not want this on all ports?
> [Y.b. Lu] Actually I don’t think there should be difference of handling PTP messages on each port.
> You don’t need to run PTP protocol application on the specific port if you don’t want.
What if you want some VLANs or some ports to be PTP unaware, and others to be PTP
aware?

> > If you do not have an application behind this implementing a boundary or
> > transparent clock, then you are breaking PTP on the network.
> [Y.b. Lu] You're right. But actually for PTP network, all PTP devices should run PTP protocol on it.
> Of course, it's better to have a way to configure it as non-aware PTP switch.
I think we agree.

In my point of view, it is the PTP daemon who should configure frames to be
trapped. Then the switch will be PTP unaware until the PTP daemon starts up and
is ready to make it aware.

If we put it in the init function, then PTP will be broken until the PTP
daemon starts.

/Allan


^ permalink raw reply

* [PATCH] can: rcar_can: Remove unused platform data support
From: Geert Uytterhoeven @ 2019-08-14  9:22 UTC (permalink / raw)
  To: Wolfgang Grandegger, Marc Kleine-Budde, Sergei Shtylyov
  Cc: David S . Miller, Wolfram Sang, linux-can, linux-renesas-soc,
	netdev, Geert Uytterhoeven

All R-Car platforms use DT for describing CAN controllers.
R-Car CAN platform data support was never used in any upstream kernel.

Move the Clock Select Register settings enum into the driver, and remove
platform data support and the corresponding header file.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 drivers/net/can/rcar/rcar_can.c       | 22 +++++++++-------------
 include/linux/can/platform/rcar_can.h | 18 ------------------
 2 files changed, 9 insertions(+), 31 deletions(-)
 delete mode 100644 include/linux/can/platform/rcar_can.h

diff --git a/drivers/net/can/rcar/rcar_can.c b/drivers/net/can/rcar/rcar_can.c
index cf218949a8fb52d5..3c5e9c2c5342147f 100644
--- a/drivers/net/can/rcar/rcar_can.c
+++ b/drivers/net/can/rcar/rcar_can.c
@@ -15,11 +15,17 @@
 #include <linux/can/led.h>
 #include <linux/can/dev.h>
 #include <linux/clk.h>
-#include <linux/can/platform/rcar_can.h>
 #include <linux/of.h>
 
 #define RCAR_CAN_DRV_NAME	"rcar_can"
 
+/* Clock Select Register settings */
+enum CLKR {
+	CLKR_CLKP1 = 0,	/* Peripheral clock (clkp1) */
+	CLKR_CLKP2 = 1,	/* Peripheral clock (clkp2) */
+	CLKR_CLKEXT = 3	/* Externally input clock */
+};
+
 #define RCAR_SUPPORTED_CLOCKS	(BIT(CLKR_CLKP1) | BIT(CLKR_CLKP2) | \
 				 BIT(CLKR_CLKEXT))
 
@@ -736,7 +742,6 @@ static const char * const clock_names[] = {
 
 static int rcar_can_probe(struct platform_device *pdev)
 {
-	struct rcar_can_platform_data *pdata;
 	struct rcar_can_priv *priv;
 	struct net_device *ndev;
 	struct resource *mem;
@@ -745,17 +750,8 @@ static int rcar_can_probe(struct platform_device *pdev)
 	int err = -ENODEV;
 	int irq;
 
-	if (pdev->dev.of_node) {
-		of_property_read_u32(pdev->dev.of_node,
-				     "renesas,can-clock-select", &clock_select);
-	} else {
-		pdata = dev_get_platdata(&pdev->dev);
-		if (!pdata) {
-			dev_err(&pdev->dev, "No platform data provided!\n");
-			goto fail;
-		}
-		clock_select = pdata->clock_select;
-	}
+	of_property_read_u32(pdev->dev.of_node, "renesas,can-clock-select",
+			     &clock_select);
 
 	irq = platform_get_irq(pdev, 0);
 	if (irq < 0) {
diff --git a/include/linux/can/platform/rcar_can.h b/include/linux/can/platform/rcar_can.h
deleted file mode 100644
index a43dcd0cf79ee3ec..0000000000000000
--- a/include/linux/can/platform/rcar_can.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _CAN_PLATFORM_RCAR_CAN_H_
-#define _CAN_PLATFORM_RCAR_CAN_H_
-
-#include <linux/types.h>
-
-/* Clock Select Register settings */
-enum CLKR {
-	CLKR_CLKP1 = 0,	/* Peripheral clock (clkp1) */
-	CLKR_CLKP2 = 1,	/* Peripheral clock (clkp2) */
-	CLKR_CLKEXT = 3	/* Externally input clock */
-};
-
-struct rcar_can_platform_data {
-	enum CLKR clock_select;	/* Clock source select */
-};
-
-#endif	/* !_CAN_PLATFORM_RCAR_CAN_H_ */
-- 
2.17.1


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox