Netdev List
 help / color / mirror / Atom feed
* [bpf-next V4 PATCH 08/15] bpf: cpumap convert to use generic xdp_frame
From: Jesper Dangaard Brouer @ 2018-03-22 14:21 UTC (permalink / raw)
  To: netdev, BjörnTöpel, magnus.karlsson
  Cc: eugenia, Jason Wang, John Fastabend, Eran Ben Elisha,
	Saeed Mahameed, galp, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, Tariq Toukan
In-Reply-To: <152172842149.20979.12110131083451936498.stgit@firesoul>

The generic xdp_frame format, was inspired by the cpumap own internal
xdp_pkt format.  It is now time to convert it over to the generic
xdp_frame format.  The cpumap needs one extra field dev_rx.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 include/net/xdp.h   |    1 +
 kernel/bpf/cpumap.c |  100 ++++++++++++++-------------------------------------
 2 files changed, 29 insertions(+), 72 deletions(-)

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 13f71a15c79f..bc0cb97e20dc 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -68,6 +68,7 @@ struct xdp_frame {
 	 * while mem info is valid on remote CPU.
 	 */
 	struct xdp_mem_info mem;
+	struct net_device *dev_rx; /* used by cpumap */
 };
 
 /* Convert xdp_buff to xdp_frame */
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 3e4bbcbe3e86..bcdc4dea5ce7 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -159,52 +159,8 @@ static void cpu_map_kthread_stop(struct work_struct *work)
 	kthread_stop(rcpu->kthread);
 }
 
-/* For now, xdp_pkt is a cpumap internal data structure, with info
- * carried between enqueue to dequeue. It is mapped into the top
- * headroom of the packet, to avoid allocating separate mem.
- */
-struct xdp_pkt {
-	void *data;
-	u16 len;
-	u16 headroom;
-	u16 metasize;
-	/* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time,
-	 * while mem info is valid on remote CPU.
-	 */
-	struct xdp_mem_info mem;
-	struct net_device *dev_rx;
-};
-
-/* Convert xdp_buff to xdp_pkt */
-static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
-{
-	struct xdp_pkt *xdp_pkt;
-	int metasize;
-	int headroom;
-
-	/* Assure headroom is available for storing info */
-	headroom = xdp->data - xdp->data_hard_start;
-	metasize = xdp->data - xdp->data_meta;
-	metasize = metasize > 0 ? metasize : 0;
-	if (unlikely((headroom - metasize) < sizeof(*xdp_pkt)))
-		return NULL;
-
-	/* Store info in top of packet */
-	xdp_pkt = xdp->data_hard_start;
-
-	xdp_pkt->data = xdp->data;
-	xdp_pkt->len  = xdp->data_end - xdp->data;
-	xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);
-	xdp_pkt->metasize = metasize;
-
-	/* rxq only valid until napi_schedule ends, convert to xdp_mem_info */
-	xdp_pkt->mem = xdp->rxq->mem;
-
-	return xdp_pkt;
-}
-
 static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
-					 struct xdp_pkt *xdp_pkt)
+					 struct xdp_frame *xdpf)
 {
 	unsigned int frame_size;
 	void *pkt_data_start;
@@ -219,7 +175,7 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
 	 * would be preferred to set frame_size to 2048 or 4096
 	 * depending on the driver.
 	 *   frame_size = 2048;
-	 *   frame_len  = frame_size - sizeof(*xdp_pkt);
+	 *   frame_len  = frame_size - sizeof(*xdp_frame);
 	 *
 	 * Instead, with info avail, skb_shared_info in placed after
 	 * packet len.  This, unfortunately fakes the truesize.
@@ -227,21 +183,21 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
 	 * is not at a fixed memory location, with mixed length
 	 * packets, which is bad for cache-line hotness.
 	 */
-	frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom +
+	frame_size = SKB_DATA_ALIGN(xdpf->len) + xdpf->headroom +
 		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 
-	pkt_data_start = xdp_pkt->data - xdp_pkt->headroom;
+	pkt_data_start = xdpf->data - xdpf->headroom;
 	skb = build_skb(pkt_data_start, frame_size);
 	if (!skb)
 		return NULL;
 
-	skb_reserve(skb, xdp_pkt->headroom);
-	__skb_put(skb, xdp_pkt->len);
-	if (xdp_pkt->metasize)
-		skb_metadata_set(skb, xdp_pkt->metasize);
+	skb_reserve(skb, xdpf->headroom);
+	__skb_put(skb, xdpf->len);
+	if (xdpf->metasize)
+		skb_metadata_set(skb, xdpf->metasize);
 
 	/* Essential SKB info: protocol and skb->dev */
-	skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx);
+	skb->protocol = eth_type_trans(skb, xdpf->dev_rx);
 
 	/* Optional SKB info, currently missing:
 	 * - HW checksum info		(skb->ip_summed)
@@ -259,11 +215,11 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
 	 * invoked cpu_map_kthread_stop(). Catch any broken behaviour
 	 * gracefully and warn once.
 	 */
-	struct xdp_pkt *xdp_pkt;
+	struct xdp_frame *xdpf;
 
-	while ((xdp_pkt = ptr_ring_consume(ring)))
-		if (WARN_ON_ONCE(xdp_pkt))
-			xdp_return_frame(xdp_pkt, &xdp_pkt->mem);
+	while ((xdpf = ptr_ring_consume(ring)))
+		if (WARN_ON_ONCE(xdpf))
+			xdp_return_frame(xdpf->data, &xdpf->mem);
 }
 
 static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
@@ -290,7 +246,7 @@ static int cpu_map_kthread_run(void *data)
 	 */
 	while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
 		unsigned int processed = 0, drops = 0, sched = 0;
-		struct xdp_pkt *xdp_pkt;
+		struct xdp_frame *xdpf;
 
 		/* Release CPU reschedule checks */
 		if (__ptr_ring_empty(rcpu->queue)) {
@@ -313,13 +269,13 @@ static int cpu_map_kthread_run(void *data)
 		 * kthread CPU pinned. Lockless access to ptr_ring
 		 * consume side valid as no-resize allowed of queue.
 		 */
-		while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) {
+		while ((xdpf = __ptr_ring_consume(rcpu->queue))) {
 			struct sk_buff *skb;
 			int ret;
 
-			skb = cpu_map_build_skb(rcpu, xdp_pkt);
+			skb = cpu_map_build_skb(rcpu, xdpf);
 			if (!skb) {
-				xdp_return_frame(xdp_pkt, &xdp_pkt->mem);
+				xdp_return_frame(xdpf->data, &xdpf->mem);
 				continue;
 			}
 
@@ -616,13 +572,13 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 	spin_lock(&q->producer_lock);
 
 	for (i = 0; i < bq->count; i++) {
-		struct xdp_pkt *xdp_pkt = bq->q[i];
+		struct xdp_frame *xdpf = bq->q[i];
 		int err;
 
-		err = __ptr_ring_produce(q, xdp_pkt);
+		err = __ptr_ring_produce(q, xdpf);
 		if (err) {
 			drops++;
-			xdp_return_frame(xdp_pkt->data, &xdp_pkt->mem);
+			xdp_return_frame(xdpf->data, &xdpf->mem);
 		}
 		processed++;
 	}
@@ -637,7 +593,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 /* Runs under RCU-read-side, plus in softirq under NAPI protection.
  * Thus, safe percpu variable access.
  */
-static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
+static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
 {
 	struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
 
@@ -648,28 +604,28 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
 	 * driver to code invoking us to finished, due to driver
 	 * (e.g. ixgbe) recycle tricks based on page-refcnt.
 	 *
-	 * Thus, incoming xdp_pkt is always queued here (else we race
+	 * Thus, incoming xdp_frame is always queued here (else we race
 	 * with another CPU on page-refcnt and remaining driver code).
 	 * Queue time is very short, as driver will invoke flush
 	 * operation, when completing napi->poll call.
 	 */
-	bq->q[bq->count++] = xdp_pkt;
+	bq->q[bq->count++] = xdpf;
 	return 0;
 }
 
 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
 		    struct net_device *dev_rx)
 {
-	struct xdp_pkt *xdp_pkt;
+	struct xdp_frame *xdpf;
 
-	xdp_pkt = convert_to_xdp_pkt(xdp);
-	if (unlikely(!xdp_pkt))
+	xdpf = convert_to_xdp_frame(xdp);
+	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
 	/* Info needed when constructing SKB on remote CPU */
-	xdp_pkt->dev_rx = dev_rx;
+	xdpf->dev_rx = dev_rx;
 
-	bq_enqueue(rcpu, xdp_pkt);
+	bq_enqueue(rcpu, xdpf);
 	return 0;
 }
 

^ permalink raw reply related

* [bpf-next V4 PATCH 07/15] virtio_net: convert to use generic xdp_frame and xdp_return_frame API
From: Jesper Dangaard Brouer @ 2018-03-22 14:21 UTC (permalink / raw)
  To: netdev, BjörnTöpel, magnus.karlsson
  Cc: eugenia, Jason Wang, John Fastabend, Eran Ben Elisha,
	Saeed Mahameed, galp, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, Tariq Toukan
In-Reply-To: <152172842149.20979.12110131083451936498.stgit@firesoul>

The virtio_net driver assumes XDP frames are always released based on
page refcnt (via put_page).  Thus, is only queues the XDP data pointer
address and uses virt_to_head_page() to retrieve struct page.

Use the XDP return API to get away from such assumptions. Instead
queue an xdp_frame, which allow us to use the xdp_return_frame API,
when releasing the frame.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 drivers/net/virtio_net.c |   31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 23374603e4d9..6c4220450506 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -419,30 +419,41 @@ static bool __virtnet_xdp_xmit(struct virtnet_info *vi,
 			       struct xdp_buff *xdp)
 {
 	struct virtio_net_hdr_mrg_rxbuf *hdr;
-	unsigned int len;
+	struct xdp_frame *xdpf, *xdpf_sent;
 	struct send_queue *sq;
+	unsigned int len;
 	unsigned int qp;
-	void *xdp_sent;
 	int err;
 
 	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
 	sq = &vi->sq[qp];
 
 	/* Free up any pending old buffers before queueing new ones. */
-	while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
-		struct page *sent_page = virt_to_head_page(xdp_sent);
+	while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
+		xdp_return_frame(xdpf_sent->data, &xdpf_sent->mem);
 
-		put_page(sent_page);
-	}
+	xdpf = convert_to_xdp_frame(xdp);
+	if (unlikely(!xdpf))
+		return -EOVERFLOW;
+
+	/* virtqueue want to use data area in-front of packet */
+	if (unlikely(xdpf->metasize > 0))
+		return -EOPNOTSUPP;
+
+	if (unlikely(xdpf->headroom < vi->hdr_len))
+		return -EOVERFLOW;
 
-	xdp->data -= vi->hdr_len;
+	/* Make room for virtqueue hdr (also change xdpf->headroom?) */
+	xdpf->data -= vi->hdr_len;
 	/* Zero header and leave csum up to XDP layers */
-	hdr = xdp->data;
+	hdr = xdpf->data;
 	memset(hdr, 0, vi->hdr_len);
+	hdr->hdr.hdr_len = xdpf->len; /* Q: is this needed? */
+	xdpf->len   += vi->hdr_len;
 
-	sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
+	sg_init_one(sq->sg, xdpf->data, xdpf->len);
 
-	err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdp->data, GFP_ATOMIC);
+	err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdpf, GFP_ATOMIC);
 	if (unlikely(err))
 		return false; /* Caller handle free/refcnt */
 

^ permalink raw reply related

* [bpf-next V4 PATCH 06/15] tun: convert to use generic xdp_frame and xdp_return_frame API
From: Jesper Dangaard Brouer @ 2018-03-22 14:21 UTC (permalink / raw)
  To: netdev, BjörnTöpel, magnus.karlsson
  Cc: eugenia, Jason Wang, John Fastabend, Eran Ben Elisha,
	Saeed Mahameed, galp, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, Tariq Toukan
In-Reply-To: <152172842149.20979.12110131083451936498.stgit@firesoul>

The tuntap driver invented it's own driver specific way of queuing
XDP packets, by storing the xdp_buff information in the top of
the XDP frame data.

Convert it over to use the more generic xdp_frame structure.  The
main problem with the in-driver method is that the xdp_rxq_info pointer
cannot be trused/used when dequeueing the frame.

V3: Remove check based on feedback from Jason

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 drivers/net/tun.c      |   43 ++++++++++++++++++++-----------------------
 drivers/vhost/net.c    |    7 ++++---
 include/linux/if_tun.h |    4 ++--
 3 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index baeafa004463..6750980d9f30 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -248,11 +248,11 @@ struct veth {
 	__be16 h_vlan_TCI;
 };
 
-bool tun_is_xdp_buff(void *ptr)
+bool tun_is_xdp_frame(void *ptr)
 {
 	return (unsigned long)ptr & TUN_XDP_FLAG;
 }
-EXPORT_SYMBOL(tun_is_xdp_buff);
+EXPORT_SYMBOL(tun_is_xdp_frame);
 
 void *tun_xdp_to_ptr(void *ptr)
 {
@@ -660,10 +660,10 @@ static void tun_ptr_free(void *ptr)
 {
 	if (!ptr)
 		return;
-	if (tun_is_xdp_buff(ptr)) {
-		struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
+	if (tun_is_xdp_frame(ptr)) {
+		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 
-		put_page(virt_to_head_page(xdp->data));
+		xdp_return_frame(xdpf->data, &xdpf->mem);
 	} else {
 		__skb_array_destroy_skb(ptr);
 	}
@@ -1290,17 +1290,14 @@ static const struct net_device_ops tun_netdev_ops = {
 static int tun_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
 {
 	struct tun_struct *tun = netdev_priv(dev);
-	struct xdp_buff *buff = xdp->data_hard_start;
-	int headroom = xdp->data - xdp->data_hard_start;
+	struct xdp_frame *frame;
 	struct tun_file *tfile;
 	u32 numqueues;
 	int ret = 0;
 
-	/* Assure headroom is available and buff is properly aligned */
-	if (unlikely(headroom < sizeof(*xdp) || tun_is_xdp_buff(xdp)))
-		return -ENOSPC;
-
-	*buff = *xdp;
+	frame = convert_to_xdp_frame(xdp);
+	if (unlikely(!frame))
+		return -EOVERFLOW;
 
 	rcu_read_lock();
 
@@ -1315,7 +1312,7 @@ static int tun_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
 	/* Encode the XDP flag into lowest bit for consumer to differ
 	 * XDP buffer from sk_buff.
 	 */
-	if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(buff))) {
+	if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(frame))) {
 		this_cpu_inc(tun->pcpu_stats->tx_dropped);
 		ret = -ENOSPC;
 	}
@@ -1993,11 +1990,11 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 static ssize_t tun_put_user_xdp(struct tun_struct *tun,
 				struct tun_file *tfile,
-				struct xdp_buff *xdp,
+				struct xdp_frame *xdp_frame,
 				struct iov_iter *iter)
 {
 	int vnet_hdr_sz = 0;
-	size_t size = xdp->data_end - xdp->data;
+	size_t size = xdp_frame->len;
 	struct tun_pcpu_stats *stats;
 	size_t ret;
 
@@ -2013,7 +2010,7 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun,
 		iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
 	}
 
-	ret = copy_to_iter(xdp->data, size, iter) + vnet_hdr_sz;
+	ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz;
 
 	stats = get_cpu_ptr(tun->pcpu_stats);
 	u64_stats_update_begin(&stats->syncp);
@@ -2181,11 +2178,11 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 			return err;
 	}
 
-	if (tun_is_xdp_buff(ptr)) {
-		struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
+	if (tun_is_xdp_frame(ptr)) {
+		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 
-		ret = tun_put_user_xdp(tun, tfile, xdp, to);
-		put_page(virt_to_head_page(xdp->data));
+		ret = tun_put_user_xdp(tun, tfile, xdpf, to);
+		xdp_return_frame(xdpf->data, &xdpf->mem);
 	} else {
 		struct sk_buff *skb = ptr;
 
@@ -2424,10 +2421,10 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
 static int tun_ptr_peek_len(void *ptr)
 {
 	if (likely(ptr)) {
-		if (tun_is_xdp_buff(ptr)) {
-			struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
+		if (tun_is_xdp_frame(ptr)) {
+			struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 
-			return xdp->data_end - xdp->data;
+			return xdpf->len;
 		}
 		return __skb_array_len_with_tag(ptr);
 	} else {
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index b5fb56b822fd..5aee3aaf6c8c 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -32,6 +32,7 @@
 #include <linux/skbuff.h>
 
 #include <net/sock.h>
+#include <net/xdp.h>
 
 #include "vhost.h"
 
@@ -177,10 +178,10 @@ static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq)
 
 static int vhost_net_buf_peek_len(void *ptr)
 {
-	if (tun_is_xdp_buff(ptr)) {
-		struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
+	if (tun_is_xdp_frame(ptr)) {
+		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 
-		return xdp->data_end - xdp->data;
+		return xdpf->len;
 	}
 
 	return __skb_array_len_with_tag(ptr);
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index c5b0a75a7812..33b817b172af 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -22,7 +22,7 @@
 #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
 struct socket *tun_get_socket(struct file *);
 struct ptr_ring *tun_get_tx_ring(struct file *file);
-bool tun_is_xdp_buff(void *ptr);
+bool tun_is_xdp_frame(void *ptr);
 void *tun_xdp_to_ptr(void *ptr);
 void *tun_ptr_to_xdp(void *ptr);
 #else
@@ -38,7 +38,7 @@ static inline struct ptr_ring *tun_get_tx_ring(struct file *f)
 {
 	return ERR_PTR(-EINVAL);
 }
-static inline bool tun_is_xdp_buff(void *ptr)
+static inline bool tun_is_xdp_frame(void *ptr)
 {
 	return false;
 }

^ permalink raw reply related

* [bpf-next V4 PATCH 05/15] xdp: introduce a new xdp_frame type
From: Jesper Dangaard Brouer @ 2018-03-22 14:21 UTC (permalink / raw)
  To: netdev, BjörnTöpel, magnus.karlsson
  Cc: eugenia, Jason Wang, John Fastabend, Eran Ben Elisha,
	Saeed Mahameed, galp, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, Tariq Toukan
In-Reply-To: <152172842149.20979.12110131083451936498.stgit@firesoul>

This is needed to convert drivers tuntap and virtio_net.

This is a generalization of what is done inside cpumap, which will be
converted later.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 include/net/xdp.h |   40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 1ee154fe0be6..13f71a15c79f 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -59,6 +59,46 @@ struct xdp_buff {
 	struct xdp_rxq_info *rxq;
 };
 
+struct xdp_frame {
+	void *data;
+	u16 len;
+	u16 headroom;
+	u16 metasize;
+	/* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time,
+	 * while mem info is valid on remote CPU.
+	 */
+	struct xdp_mem_info mem;
+};
+
+/* Convert xdp_buff to xdp_frame */
+static inline
+struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
+{
+	struct xdp_frame *xdp_frame;
+	int metasize;
+	int headroom;
+
+	/* Assure headroom is available for storing info */
+	headroom = xdp->data - xdp->data_hard_start;
+	metasize = xdp->data - xdp->data_meta;
+	metasize = metasize > 0 ? metasize : 0;
+	if (unlikely((headroom - metasize) < sizeof(*xdp_frame)))
+		return NULL;
+
+	/* Store info in top of packet */
+	xdp_frame = xdp->data_hard_start;
+
+	xdp_frame->data = xdp->data;
+	xdp_frame->len  = xdp->data_end - xdp->data;
+	xdp_frame->headroom = headroom - sizeof(*xdp_frame);
+	xdp_frame->metasize = metasize;
+
+	/* rxq only valid until napi_schedule ends, convert to xdp_mem_info */
+	xdp_frame->mem = xdp->rxq->mem;
+
+	return xdp_frame;
+}
+
 static inline
 void xdp_return_frame(void *data, struct xdp_mem_info *mem)
 {

^ permalink raw reply related

* [bpf-next V4 PATCH 04/15] xdp: move struct xdp_buff from filter.h to xdp.h
From: Jesper Dangaard Brouer @ 2018-03-22 14:21 UTC (permalink / raw)
  To: netdev, BjörnTöpel, magnus.karlsson
  Cc: eugenia, Jason Wang, John Fastabend, Eran Ben Elisha,
	Saeed Mahameed, galp, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, Tariq Toukan
In-Reply-To: <152172842149.20979.12110131083451936498.stgit@firesoul>

This is done to prepare for the next patch, and it is also
nice to move this XDP related struct out of filter.h.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 include/linux/filter.h |   24 +-----------------------
 include/net/xdp.h      |   22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 109d05ccea9a..340ba653dd80 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -30,6 +30,7 @@ struct sock;
 struct seccomp_data;
 struct bpf_prog_aux;
 struct xdp_rxq_info;
+struct xdp_buff;
 
 /* ArgX, context and stack frame pointer register positions. Note,
  * Arg1, Arg2, Arg3, etc are used as argument mappings of function
@@ -499,14 +500,6 @@ struct bpf_skb_data_end {
 	void *data_end;
 };
 
-struct xdp_buff {
-	void *data;
-	void *data_end;
-	void *data_meta;
-	void *data_hard_start;
-	struct xdp_rxq_info *rxq;
-};
-
 struct sk_msg_buff {
 	void *data;
 	void *data_end;
@@ -769,21 +762,6 @@ int xdp_do_redirect(struct net_device *dev,
 		    struct bpf_prog *prog);
 void xdp_do_flush_map(void);
 
-/* Drivers not supporting XDP metadata can use this helper, which
- * rejects any room expansion for metadata as a result.
- */
-static __always_inline void
-xdp_set_data_meta_invalid(struct xdp_buff *xdp)
-{
-	xdp->data_meta = xdp->data + 1;
-}
-
-static __always_inline bool
-xdp_data_meta_unsupported(const struct xdp_buff *xdp)
-{
-	return unlikely(xdp->data_meta > xdp->data);
-}
-
 void bpf_warn_invalid_xdp_action(u32 act);
 
 struct sock *do_sk_redirect_map(struct sk_buff *skb);
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 15b546325e31..1ee154fe0be6 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -51,6 +51,13 @@ struct xdp_rxq_info {
 	struct xdp_mem_info mem;
 } ____cacheline_aligned; /* perf critical, avoid false-sharing */
 
+struct xdp_buff {
+	void *data;
+	void *data_end;
+	void *data_meta;
+	void *data_hard_start;
+	struct xdp_rxq_info *rxq;
+};
 
 static inline
 void xdp_return_frame(void *data, struct xdp_mem_info *mem)
@@ -73,4 +80,19 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq);
 int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
 			       enum mem_type type, void *allocator);
 
+/* Drivers not supporting XDP metadata can use this helper, which
+ * rejects any room expansion for metadata as a result.
+ */
+static __always_inline void
+xdp_set_data_meta_invalid(struct xdp_buff *xdp)
+{
+	xdp->data_meta = xdp->data + 1;
+}
+
+static __always_inline bool
+xdp_data_meta_unsupported(const struct xdp_buff *xdp)
+{
+	return unlikely(xdp->data_meta > xdp->data);
+}
+
 #endif /* __LINUX_NET_XDP_H__ */

^ permalink raw reply related

* [bpf-next V4 PATCH 03/15] ixgbe: use xdp_return_frame API
From: Jesper Dangaard Brouer @ 2018-03-22 14:21 UTC (permalink / raw)
  To: netdev, BjörnTöpel, magnus.karlsson
  Cc: eugenia, Jason Wang, John Fastabend, Eran Ben Elisha,
	Saeed Mahameed, galp, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, Tariq Toukan
In-Reply-To: <152172842149.20979.12110131083451936498.stgit@firesoul>

Extend struct ixgbe_tx_buffer to store the xdp_mem_info.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h      |    1 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |    6 ++++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index c1e3a0039ea5..cbc20f199364 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -249,6 +249,7 @@ struct ixgbe_tx_buffer {
 	DEFINE_DMA_UNMAP_ADDR(dma);
 	DEFINE_DMA_UNMAP_LEN(len);
 	u32 tx_flags;
+	struct xdp_mem_info xdp_mem;
 };
 
 struct ixgbe_rx_buffer {
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 85369423452d..45520eb503ee 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1207,7 +1207,7 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
 
 		/* free the skb */
 		if (ring_is_xdp(tx_ring))
-			page_frag_free(tx_buffer->data);
+			xdp_return_frame(tx_buffer->data, &tx_buffer->xdp_mem);
 		else
 			napi_consume_skb(tx_buffer->skb, napi_budget);
 
@@ -5787,7 +5787,7 @@ static void ixgbe_clean_tx_ring(struct ixgbe_ring *tx_ring)
 
 		/* Free all the Tx ring sk_buffs */
 		if (ring_is_xdp(tx_ring))
-			page_frag_free(tx_buffer->data);
+			xdp_return_frame(tx_buffer->data, &tx_buffer->xdp_mem);
 		else
 			dev_kfree_skb_any(tx_buffer->skb);
 
@@ -8351,6 +8351,8 @@ static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
 	dma_unmap_len_set(tx_buffer, len, len);
 	dma_unmap_addr_set(tx_buffer, dma, dma);
 	tx_buffer->data = xdp->data;
+	tx_buffer->xdp_mem = xdp->rxq->mem;
+
 	tx_desc->read.buffer_addr = cpu_to_le64(dma);
 
 	/* put descriptor type bits */

^ permalink raw reply related

* [bpf-next V4 PATCH 02/15] xdp: introduce xdp_return_frame API and use in cpumap
From: Jesper Dangaard Brouer @ 2018-03-22 14:21 UTC (permalink / raw)
  To: netdev, BjörnTöpel, magnus.karlsson
  Cc: eugenia, Jason Wang, John Fastabend, Eran Ben Elisha,
	Saeed Mahameed, galp, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, Tariq Toukan
In-Reply-To: <152172842149.20979.12110131083451936498.stgit@firesoul>

Introduce an xdp_return_frame API, and convert over cpumap as
the first user, given it have queued XDP frame structure to leverage.

V3: Cleanup and remove C99 style comments, pointed out by Alex Duyck.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 include/net/xdp.h   |   28 ++++++++++++++++++++++++
 kernel/bpf/cpumap.c |   60 +++++++++++++++++++++++++++++++--------------------
 net/core/xdp.c      |   18 +++++++++++++++
 3 files changed, 82 insertions(+), 24 deletions(-)

diff --git a/include/net/xdp.h b/include/net/xdp.h
index b2362ddfa694..15b546325e31 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -33,16 +33,44 @@
  * also mandatory during RX-ring setup.
  */
 
+enum mem_type {
+	MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */
+	MEM_TYPE_PAGE_ORDER0,     /* Orig XDP full page model */
+	MEM_TYPE_MAX,
+};
+
+struct xdp_mem_info {
+	u32 type; /* enum mem_type, but known size type */
+	/* u32 id; will be added later in this patchset */
+};
+
 struct xdp_rxq_info {
 	struct net_device *dev;
 	u32 queue_index;
 	u32 reg_state;
+	struct xdp_mem_info mem;
 } ____cacheline_aligned; /* perf critical, avoid false-sharing */
 
+
+static inline
+void xdp_return_frame(void *data, struct xdp_mem_info *mem)
+{
+	if (mem->type == MEM_TYPE_PAGE_SHARED)
+		page_frag_free(data);
+
+	if (mem->type == MEM_TYPE_PAGE_ORDER0) {
+		struct page *page = virt_to_page(data); /* Assumes order0 page*/
+
+		put_page(page);
+	}
+}
+
 int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
 		     struct net_device *dev, u32 queue_index);
 void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq);
 void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq);
 bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq);
+int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
+			       enum mem_type type, void *allocator);
 
 #endif /* __LINUX_NET_XDP_H__ */
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index a4bb0b34375a..3e4bbcbe3e86 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -19,6 +19,7 @@
 #include <linux/bpf.h>
 #include <linux/filter.h>
 #include <linux/ptr_ring.h>
+#include <net/xdp.h>
 
 #include <linux/sched.h>
 #include <linux/workqueue.h>
@@ -137,27 +138,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 	return ERR_PTR(err);
 }
 
-static void __cpu_map_queue_destructor(void *ptr)
-{
-	/* The tear-down procedure should have made sure that queue is
-	 * empty.  See __cpu_map_entry_replace() and work-queue
-	 * invoked cpu_map_kthread_stop(). Catch any broken behaviour
-	 * gracefully and warn once.
-	 */
-	if (WARN_ON_ONCE(ptr))
-		page_frag_free(ptr);
-}
-
-static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
-{
-	if (atomic_dec_and_test(&rcpu->refcnt)) {
-		/* The queue should be empty at this point */
-		ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor);
-		kfree(rcpu->queue);
-		kfree(rcpu);
-	}
-}
-
 static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
 {
 	atomic_inc(&rcpu->refcnt);
@@ -188,6 +168,10 @@ struct xdp_pkt {
 	u16 len;
 	u16 headroom;
 	u16 metasize;
+	/* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time,
+	 * while mem info is valid on remote CPU.
+	 */
+	struct xdp_mem_info mem;
 	struct net_device *dev_rx;
 };
 
@@ -213,6 +197,9 @@ static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
 	xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);
 	xdp_pkt->metasize = metasize;
 
+	/* rxq only valid until napi_schedule ends, convert to xdp_mem_info */
+	xdp_pkt->mem = xdp->rxq->mem;
+
 	return xdp_pkt;
 }
 
@@ -265,6 +252,31 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
 	return skb;
 }
 
+static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
+{
+	/* The tear-down procedure should have made sure that queue is
+	 * empty.  See __cpu_map_entry_replace() and work-queue
+	 * invoked cpu_map_kthread_stop(). Catch any broken behaviour
+	 * gracefully and warn once.
+	 */
+	struct xdp_pkt *xdp_pkt;
+
+	while ((xdp_pkt = ptr_ring_consume(ring)))
+		if (WARN_ON_ONCE(xdp_pkt))
+			xdp_return_frame(xdp_pkt, &xdp_pkt->mem);
+}
+
+static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+{
+	if (atomic_dec_and_test(&rcpu->refcnt)) {
+		/* The queue should be empty at this point */
+		__cpu_map_ring_cleanup(rcpu->queue);
+		ptr_ring_cleanup(rcpu->queue, NULL);
+		kfree(rcpu->queue);
+		kfree(rcpu);
+	}
+}
+
 static int cpu_map_kthread_run(void *data)
 {
 	struct bpf_cpu_map_entry *rcpu = data;
@@ -307,7 +319,7 @@ static int cpu_map_kthread_run(void *data)
 
 			skb = cpu_map_build_skb(rcpu, xdp_pkt);
 			if (!skb) {
-				page_frag_free(xdp_pkt);
+				xdp_return_frame(xdp_pkt, &xdp_pkt->mem);
 				continue;
 			}
 
@@ -604,13 +616,13 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 	spin_lock(&q->producer_lock);
 
 	for (i = 0; i < bq->count; i++) {
-		void *xdp_pkt = bq->q[i];
+		struct xdp_pkt *xdp_pkt = bq->q[i];
 		int err;
 
 		err = __ptr_ring_produce(q, xdp_pkt);
 		if (err) {
 			drops++;
-			page_frag_free(xdp_pkt); /* Free xdp_pkt */
+			xdp_return_frame(xdp_pkt->data, &xdp_pkt->mem);
 		}
 		processed++;
 	}
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 097a0f74e004..9eee0c431126 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -71,3 +71,21 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq)
 	return (xdp_rxq->reg_state == REG_STATE_REGISTERED);
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg);
+
+int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
+			       enum mem_type type, void *allocator)
+{
+	if (type >= MEM_TYPE_MAX)
+		return -EINVAL;
+
+	xdp_rxq->mem.type = type;
+
+	if (allocator)
+		return -EOPNOTSUPP;
+
+	/* TODO: Allocate an ID that maps to allocator pointer
+	 * See: https://www.kernel.org/doc/html/latest/core-api/idr.html
+	 */
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);

^ permalink raw reply related

* [bpf-next V4 PATCH 01/15] mlx5: basic XDP_REDIRECT forward support
From: Jesper Dangaard Brouer @ 2018-03-22 14:21 UTC (permalink / raw)
  To: netdev, BjörnTöpel, magnus.karlsson
  Cc: eugenia, Jason Wang, John Fastabend, Eran Ben Elisha,
	Saeed Mahameed, galp, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, Tariq Toukan
In-Reply-To: <152172842149.20979.12110131083451936498.stgit@firesoul>

This implements basic XDP redirect support in mlx5 driver.

Notice that the ndo_xdp_xmit() is NOT implemented, because that API
need some changes that this patchset is working towards.

The main purpose of this patch is have different drivers doing
XDP_REDIRECT to show how different memory models behave in a cross
driver world.

Update(pre-RFCv2 Tariq): Need to DMA unmap page before xdp_do_redirect,
as the return API does not exist yet to to keep this mapped.

Update(pre-RFCv3 Saeed): Don't mix XDP_TX and XDP_REDIRECT flushing,
introduce xdpsq.db.redirect_flush boolian.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h    |    1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c |   27 ++++++++++++++++++++---
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 4c9360b25532..28cc26debeda 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -398,6 +398,7 @@ struct mlx5e_xdpsq {
 	struct {
 		struct mlx5e_dma_info     *di;
 		bool                       doorbell;
+		bool                       redirect_flush;
 	} db;
 
 	/* read only */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 8cce90dc461d..6dcc3e8fbd3e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -236,14 +236,20 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
 	return 0;
 }
 
+static inline void mlx5e_page_dma_unmap(struct mlx5e_rq *rq,
+					struct mlx5e_dma_info *dma_info)
+{
+	dma_unmap_page(rq->pdev, dma_info->addr, RQ_PAGE_SIZE(rq),
+		       rq->buff.map_dir);
+}
+
 void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
 			bool recycle)
 {
 	if (likely(recycle) && mlx5e_rx_cache_put(rq, dma_info))
 		return;
 
-	dma_unmap_page(rq->pdev, dma_info->addr, RQ_PAGE_SIZE(rq),
-		       rq->buff.map_dir);
+	mlx5e_page_dma_unmap(rq, dma_info);
 	put_page(dma_info->page);
 }
 
@@ -822,9 +828,10 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq,
 				   struct mlx5e_dma_info *di,
 				   void *va, u16 *rx_headroom, u32 *len)
 {
-	const struct bpf_prog *prog = READ_ONCE(rq->xdp_prog);
+	struct bpf_prog *prog = READ_ONCE(rq->xdp_prog);
 	struct xdp_buff xdp;
 	u32 act;
+	int err;
 
 	if (!prog)
 		return false;
@@ -845,6 +852,15 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq,
 		if (unlikely(!mlx5e_xmit_xdp_frame(rq, di, &xdp)))
 			trace_xdp_exception(rq->netdev, prog, act);
 		return true;
+	case XDP_REDIRECT:
+		/* When XDP enabled then page-refcnt==1 here */
+		err = xdp_do_redirect(rq->netdev, &xdp, prog);
+		if (!err) {
+			rq->wqe.xdp_xmit = true; /* XDP xmit owns page */
+			rq->xdpsq.db.redirect_flush = true;
+			mlx5e_page_dma_unmap(rq, di);
+		}
+		return true;
 	default:
 		bpf_warn_invalid_xdp_action(act);
 	case XDP_ABORTED:
@@ -1107,6 +1123,11 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
 		xdpsq->db.doorbell = false;
 	}
 
+	if (xdpsq->db.redirect_flush) {
+		xdp_do_flush_map();
+		xdpsq->db.redirect_flush = false;
+	}
+
 	mlx5_cqwq_update_db_record(&cq->wq);
 
 	/* ensure cq space is freed before enabling more cqes */

^ permalink raw reply related

* [bpf-next V4 PATCH 00/15] XDP redirect memory return API
From: Jesper Dangaard Brouer @ 2018-03-22 14:21 UTC (permalink / raw)
  To: netdev, BjörnTöpel, magnus.karlsson
  Cc: eugenia, Jason Wang, John Fastabend, Eran Ben Elisha,
	Saeed Mahameed, galp, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, Tariq Toukan

This patchset works towards supporting different XDP RX-ring memory
allocators.  As this will be needed by the AF_XDP zero-copy mode.

The patchset uses mlx5 as the sample driver, which gets implemented
XDP_REDIRECT RX-mode, but not ndo_xdp_xmit (as this API is subject to
change thought the patchset).

A new struct xdp_frame is introduced (modeled after cpumap xdp_pkt).
And both ndo_xdp_xmit and the new xdp_return_frame end-up using this.

Support for a driver supplied allocator is implemented, and a
refurbished version of page_pool is the first return allocator type
introduced.  This will be a integration point for AF_XDP zero-copy.

The mlx5 driver evolve into using the page_pool, and see a performance
increase (with ndo_xdp_xmit out ixgbe driver) from 6Mpps to 12Mpps.


The patchset stop at the 15 patches limit, but more API changes are
planned.  Specifically extending ndo_xdp_xmit and xdp_return_frame
APIs to support bulking.  As this will address some known limits.

V2: Updated according to Tariq's feedback
V3: Feedback from Jason Wang and Alex Duyck
V4: Feedback from Tariq and Jason

---

Jesper Dangaard Brouer (15):
      mlx5: basic XDP_REDIRECT forward support
      xdp: introduce xdp_return_frame API and use in cpumap
      ixgbe: use xdp_return_frame API
      xdp: move struct xdp_buff from filter.h to xdp.h
      xdp: introduce a new xdp_frame type
      tun: convert to use generic xdp_frame and xdp_return_frame API
      virtio_net: convert to use generic xdp_frame and xdp_return_frame API
      bpf: cpumap convert to use generic xdp_frame
      mlx5: register a memory model when XDP is enabled
      xdp: rhashtable with allocator ID to pointer mapping
      page_pool: refurbish version of page_pool code
      xdp: allow page_pool as an allocator type in xdp_return_frame
      mlx5: use page_pool for xdp_return_frame call
      xdp: transition into using xdp_frame for return API
      xdp: transition into using xdp_frame for ndo_xdp_xmit


 drivers/net/ethernet/intel/ixgbe/ixgbe.h          |    3 
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c     |   37 ++
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |    4 
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |   37 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   |   42 ++-
 drivers/net/tun.c                                 |   60 ++--
 drivers/net/virtio_net.c                          |   52 ++-
 drivers/vhost/net.c                               |    7 
 include/linux/filter.h                            |   24 --
 include/linux/if_tun.h                            |    4 
 include/linux/netdevice.h                         |    4 
 include/net/page_pool.h                           |  133 ++++++++
 include/net/xdp.h                                 |   83 +++++
 kernel/bpf/cpumap.c                               |  132 +++-----
 net/core/Makefile                                 |    1 
 net/core/filter.c                                 |   17 +
 net/core/page_pool.c                              |  329 +++++++++++++++++++++
 net/core/xdp.c                                    |  257 ++++++++++++++++
 18 files changed, 1050 insertions(+), 176 deletions(-)
 create mode 100644 include/net/page_pool.h
 create mode 100644 net/core/page_pool.c

--

^ permalink raw reply

* Re: WARNING: CPU: 3 PID: 0 at net/sched/sch_hfsc.c:1388 hfsc_dequeue+0x319/0x350 [sch_hfsc]
From: Marco Berizzi @ 2018-03-22 14:12 UTC (permalink / raw)
  To: Jamal Hadi Salim, Cong Wang; +Cc: Linux Kernel Network Developers, Eric Dumazet
In-Reply-To: <11d36e94-0b07-425d-78d1-dd41b60a586f@mojatatu.com>

Il 19 marzo 2018 alle 11.07 Jamal Hadi Salim <jhs@mojatatu.com> ha scritto:
> 
> On 18-03-15 08:48 PM, Cong Wang wrote:
> 
> > On Wed, Mar 14, 2018 at 1:10 AM, Marco Berizzi <pupilla@libero.it> wrote:
> > 
> > > > Il 9 marzo 2018 alle 0.14 Cong Wang <xiyou.wangcong@gmail.com> ha scritto:
> > > > 
> > > > On Thu, Mar 8, 2018 at 8:02 AM, Marco Berizzi <pupilla@libero.it> wrote:
> > > > 
> > > > > > Marco Berizzi wrote:
> > > > > > 
> > > > > > Hello everyone,
> > > > > > 
> > > > > > Yesterday I got this error on a slackware linux 4.16-rc4 system
> > > > > > running as a traffic shaping gateway and netfilter nat.
> > > > > > The error has been arisen after a partial ISP network outage,
> > > > > > so unfortunately it will not trivial for me to reproduce it again.
> > > > > 
> > > > > Hello everyone,
> > > > > 
> > > > > I'm getting this error twice/day, so fortunately I'm able to
> > > > > reproduce it.
> > > > 
> > > > IIRC, there was a patch for this, but it got lost...
> > > > 
> > > > I will take a look anyway.
> > > 
> > > ok, thanks for the response. Let me know when there will be a patch
> > > available to test.
> > 
> > It has been reported here:
> > https://bugzilla.kernel.org/show_bug.cgi?id=109581
> > 
> > And there is a workaround from Konstantin:
> > https://patchwork.ozlabs.org/patch/803885/
> > 
> > Unfortunately I don't think that is a real fix, we probably need to
> > fix HFSC itself rather than just workaround the qlen==0. It is not
> > trivial since HFSC implementation is not easy to understand.
> > Maybe Jamal knows better than me.
> 
> Sorry for the latency - I looked at this on the plane and it is very
> specific to fq/codel. It is not clear to me why codel needs this but
> i note it has been there from the initial commit and from that
> perspective the patch looks reasonable. In any case:
> Punting it to Eric (on Cc).

Hello everyone,

just for ask: it there any luck to get this patch before 4.16 final?

^ permalink raw reply

* Re: [PATCH net] virtio-net: Fix operstate for virtio when no VIRTIO_NET_F_STATUS
From: Michael S. Tsirkin @ 2018-03-22 14:08 UTC (permalink / raw)
  To: Jay Vosburgh; +Cc: netdev, Jason Wang, David Miller, Ben Hutchings
In-Reply-To: <19842.1521720130@nyx>

On Thu, Mar 22, 2018 at 12:02:10PM +0000, Jay Vosburgh wrote:
> Michael S. Tsirkin <mst@redhat.com> wrote:
> 
> >On Thu, Mar 22, 2018 at 09:05:52AM +0000, Jay Vosburgh wrote:
> >> 	The operstate update logic will leave an interface in the
> >> default UNKNOWN operstate if the interface carrier state never changes
> >> from the default carrier up state set at creation.  This includes the
> >> case of an explicit call to netif_carrier_on, as the carrier on to on
> >> transition has no effect on operstate.
> >> 
> >> 	This affects virtio-net for the case that the virtio peer does
> >> not support VIRTIO_NET_F_STATUS (the feature that provides carrier state
> >> updates).  Without this feature, the virtio specification states that
> >> "the link should be assumed active," so, logically, the operstate should
> >> be UP instead of UNKNOWN.  This has impact on user space applications
> >> that use the operstate to make availability decisions for the interface.
> >> 
> >> 	Resolve this by changing the virtio probe logic slightly to call
> >> netif_carrier_off for both the "with" and "without" VIRTIO_NET_F_STATUS
> >> cases, and then the existing call to netif_carrier_on for the "without"
> >> case will cause an operstate transition.
> >> 
> >> Cc: "Michael S. Tsirkin" <mst@redhat.com>
> >> Cc: Jason Wang <jasowang@redhat.com>
> >> Cc: Ben Hutchings <ben@decadent.org.uk>
> >> Fixes: 167c25e4c550 ("virtio-net: init link state correctly")
> >
> >I'd say that's an abuse of this notation. openstate was UNKNOWN
> >even before that fix.
> 
> 	I went back to the commit that added the dependency on
> VIRTIO_NET_F_STATUS (and that this patch would thus apply on top of).
> If that's an issue, I can resubmit without it.
> 
> 	-J

The patch can be trivially backported to any version that has virtio.

The issue was present since the original version of virtio.
VIRTIO_NET_F_STATUS fixed it for new devices.
So the tag is incorrectly blaming a partial fix for not being a full
one.

Also, I think it's more appropriate for net-next - it's a
minor ABI change (previously presence of VIRTIO_NET_F_STATUS
could be detected by looking at operstate, now it can't).
Hopefully this makes more apps work than it breaks.

So yes, pls repost without Fixes and with net-next unless
davem can make the change himself.

> >> Signed-off-by: Jay Vosburgh <jay.vosburgh@canonical.com>
> >
> >Acked-by: Michael S. Tsirkin <mst@redhat.com>
> >
> >
> >> ---
> >> 
> >> 	I considered resolving this by changing linkwatch_init_dev to
> >> unconditionally call rfc2863_policy, as that would always set operstate
> >> for all interfaces.
> >> 
> >> 	This would not have any impact on most cases (as most drivers
> >> call netif_carrier_off during probe), except for the loopback device,
> >> which currently has an operstate of UNKNOWN (because it never does any
> >> carrier state transitions).  This change would add a round trip on the
> >> dev_base_lock for every loopback device creation, which could have a
> >> negative impact when creating many loopback devices, e.g., when
> >> concurrently creating large numbers of containers.
> >> 
> >> 
> >>  drivers/net/virtio_net.c | 2 +-
> >>  1 file changed, 1 insertion(+), 1 deletion(-)
> >> 
> >> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> >> index 23374603e4d9..7b187ec7411e 100644
> >> --- a/drivers/net/virtio_net.c
> >> +++ b/drivers/net/virtio_net.c
> >> @@ -2857,8 +2857,8 @@ static int virtnet_probe(struct virtio_device *vdev)
> >>  
> >>  	/* Assume link up if device can't report link status,
> >>  	   otherwise get link status from config. */
> >> +	netif_carrier_off(dev);
> >>  	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
> >> -		netif_carrier_off(dev);
> >>  		schedule_work(&vi->config_work);
> >>  	} else {
> >>  		vi->status = VIRTIO_NET_S_LINK_UP;
> >> -- 
> >> 2.14.1

^ permalink raw reply

* [iproute PATCH] man: ip-route.8: ssthresh parameter is NUMBER
From: Phil Sutter @ 2018-03-22 14:00 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev

Synopsis section was inconsistent with regards to help text and later
description of ssthresh parameter.

Signed-off-by: Phil Sutter <phil@nwl.cc>
---
 man/man8/ip-route.8.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/man/man8/ip-route.8.in b/man/man8/ip-route.8.in
index 487a87489a46a..b28f3d2c7d117 100644
--- a/man/man8/ip-route.8.in
+++ b/man/man8/ip-route.8.in
@@ -125,7 +125,7 @@ replace " } "
 .B  cwnd
 .IR NUMBER " ] [ "
 .B  ssthresh
-.IR REALM " ] [ "
+.IR NUMBER " ] [ "
 .B  realms
 .IR REALM " ] [ "
 .B  rto_min
-- 
2.16.1

^ permalink raw reply related

* Re: DTS for our Configuration
From: Andrew Lunn @ 2018-03-22 13:55 UTC (permalink / raw)
  To: Alayev Michael
  Cc: Efter Yoram, Dror Alon, 'netdev@vger.kernel.org',
	Margalit Ofer
In-Reply-To: <48F7D4389F30BA4383F214EE802BA47101706555DE@EXS10.iai.co.il>

> As you understand, I prefer not to change the driver. 

Actually, i don't understand why you prefer not to change the driver.

> Is there a way for me to bypass this issue?
> Can I use other property than 'fixed-link'?

My quick look at the driver makes me think you are going to have to
change it. But the changes can be mainlined, if done correctly.

I think it is time you rolled up your sleeves and start really
debugging this yourselves, and make the needed changes.

	  Andrew

^ permalink raw reply

* [RFC PATCH 5/5] net: macb: Add WOL support with ARP
From: harinikatakamlinux @ 2018-03-22 13:51 UTC (permalink / raw)
  To: nicolas.ferre, davem, harinikatakamlinux
  Cc: netdev, linux-kernel, harinik, michals, appanad
In-Reply-To: <1521726700-22634-1-git-send-email-harinikatakamlinux@gmail.com>

From: Harini Katakam <harinik@xilinx.com>

This patch enables ARP wake event support in GEM through the following:

-> WOL capability can be selected based on the SoC/GEM IP version rather
than a devictree property alone. Hence add a new capability property and
set device as "wakeup capable" in probe in this case.
-> Wake source selection can be done via ethtool or by enabling wakeup
in /sys/devices/platform/..../ethx/power/
This patch adds default wake source as ARP and the existing selection of
WOL using magic packet remains unchanged.
-> When GEM is the wake device with ARP as the wake event, the current
IP address to match is written to WOL register along with other
necessary confuguration required for MAC to recognize an ARP event.
-> While RX needs to remain enabled, there is no need to process the
actual wake packet - hence tie off all RX queues to avoid unnecessary
processing by DMA in the background. This tie off is done using a
dummy buffer descriptor with used bit set. (There is no other provision
to disable RX DMA in the GEM IP version in ZynqMP)
-> TX is disabled and all interrupts except WOL on Q0 are disabled.
Clear the WOL interrupt as no other action is required from driver.
Power management of the SoC will already have got the event and will
take care of initiating resume.
-> Upon resume ARP WOL config is cleared and macb is reinitialized.

Signed-off-by: Harini Katakam <harinik@xilinx.com>
---
 drivers/net/ethernet/cadence/macb.h      |   6 ++
 drivers/net/ethernet/cadence/macb_main.c | 130 +++++++++++++++++++++++++++++--
 2 files changed, 131 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h
index 9e7fb14..e18ff34 100644
--- a/drivers/net/ethernet/cadence/macb.h
+++ b/drivers/net/ethernet/cadence/macb.h
@@ -93,6 +93,7 @@
 #define GEM_SA3T		0x009C /* Specific3 Top */
 #define GEM_SA4B		0x00A0 /* Specific4 Bottom */
 #define GEM_SA4T		0x00A4 /* Specific4 Top */
+#define GEM_WOL			0x00B8 /* Wake on LAN */
 #define GEM_EFTSH		0x00e8 /* PTP Event Frame Transmitted Seconds Register 47:32 */
 #define GEM_EFRSH		0x00ec /* PTP Event Frame Received Seconds Register 47:32 */
 #define GEM_PEFTSH		0x00f0 /* PTP Peer Event Frame Transmitted Seconds Register 47:32 */
@@ -398,6 +399,8 @@
 #define MACB_PDRSFT_SIZE	1
 #define MACB_SRI_OFFSET		26 /* TSU Seconds Register Increment */
 #define MACB_SRI_SIZE		1
+#define GEM_WOL_OFFSET		28 /* Enable wake-on-lan interrupt in GEM */
+#define GEM_WOL_SIZE		1
 
 /* Timer increment fields */
 #define MACB_TI_CNS_OFFSET	0
@@ -635,6 +638,7 @@
 #define MACB_CAPS_USRIO_DISABLED		0x00000010
 #define MACB_CAPS_JUMBO				0x00000020
 #define MACB_CAPS_GEM_HAS_PTP			0x00000040
+#define MACB_CAPS_WOL				0x00000080
 #define MACB_CAPS_FIFO_MODE			0x10000000
 #define MACB_CAPS_GIGABIT_MODE_AVAILABLE	0x20000000
 #define MACB_CAPS_SG_DISABLED			0x40000000
@@ -1147,6 +1151,8 @@ struct macb {
 	unsigned int		num_queues;
 	unsigned int		queue_mask;
 	struct macb_queue	queues[MACB_MAX_QUEUES];
+	dma_addr_t		rx_ring_tieoff_dma;
+	struct macb_dma_desc	*rx_ring_tieoff;
 
 	spinlock_t		lock;
 	struct platform_device	*pdev;
diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index bca91bd..9902654 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -36,6 +36,7 @@
 #include <linux/udp.h>
 #include <linux/tcp.h>
 #include <linux/pm_runtime.h>
+#include <linux/inetdevice.h>
 #include "macb.h"
 
 #define MACB_RX_BUFFER_SIZE	128
@@ -1400,6 +1401,12 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
 	spin_lock(&bp->lock);
 
 	while (status) {
+		if (status & GEM_BIT(WOL)) {
+			if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
+				queue_writel(queue, ISR, GEM_BIT(WOL));
+			break;
+		}
+
 		/* close possible race with dev_close */
 		if (unlikely(!netif_running(dev))) {
 			queue_writel(queue, IDR, -1);
@@ -1900,6 +1907,12 @@ static void macb_free_consistent(struct macb *bp)
 		queue->rx_ring = NULL;
 	}
 
+	if (bp->rx_ring_tieoff) {
+		dma_free_coherent(&bp->pdev->dev, macb_dma_desc_get_size(bp),
+				  bp->rx_ring_tieoff, bp->rx_ring_tieoff_dma);
+		bp->rx_ring_tieoff = NULL;
+	}
+
 	for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
 		kfree(queue->tx_skb);
 		queue->tx_skb = NULL;
@@ -1979,6 +1992,14 @@ static int macb_alloc_consistent(struct macb *bp)
 			   "Allocated RX ring of %d bytes at %08lx (mapped %p)\n",
 			   size, (unsigned long)queue->rx_ring_dma, queue->rx_ring);
 	}
+	/* Allocate one dummy descriptor to tie off RX queues when required */
+	bp->rx_ring_tieoff = dma_alloc_coherent(&bp->pdev->dev,
+						macb_dma_desc_get_size(bp),
+						&bp->rx_ring_tieoff_dma,
+						GFP_KERNEL);
+	if (!bp->rx_ring_tieoff)
+		goto out_err;
+
 	if (bp->macbgem_ops.mog_alloc_rx_buffers(bp))
 		goto out_err;
 
@@ -1989,6 +2010,34 @@ static int macb_alloc_consistent(struct macb *bp)
 	return -ENOMEM;
 }
 
+static void macb_init_tieoff(struct macb *bp)
+{
+	struct macb_dma_desc *d = bp->rx_ring_tieoff;
+
+	/* Setup a wrapping descriptor with no free slots
+	 * (WRAP and USED) to tie off/disable unused RX queues.
+	 */
+	macb_set_addr(bp, d, MACB_BIT(RX_WRAP) | MACB_BIT(RX_USED));
+	d->ctrl = 0;
+}
+
+static inline void macb_rx_tieoff(struct macb *bp)
+{
+	struct macb_queue *queue = bp->queues;
+	unsigned int q;
+
+	for (q = 0, queue = bp->queues; q < bp->num_queues;
+	     ++q, ++queue) {
+		queue_writel(queue, RBQP,
+			     lower_32_bits(bp->rx_ring_tieoff_dma));
+#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
+		if (bp->hw_dma_cap & HW_DMA_CAP_64B)
+			queue_writel(queue, RBQPH,
+				     upper_32_bits(bp->rx_ring_tieoff_dma));
+#endif
+	}
+}
+
 static void gem_init_rings(struct macb *bp)
 {
 	struct macb_queue *queue;
@@ -2011,6 +2060,7 @@ static void gem_init_rings(struct macb *bp)
 
 		gem_rx_refill(queue);
 	}
+	macb_init_tieoff(bp);
 
 }
 
@@ -2653,6 +2703,13 @@ static void macb_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol)
 		if (bp->wol & MACB_WOL_ENABLED)
 			wol->wolopts |= WAKE_MAGIC;
 	}
+
+	if (bp->caps & MACB_CAPS_WOL) {
+		wol->supported = WAKE_ARP;
+
+		if (bp->wol & MACB_WOL_ENABLED)
+			wol->wolopts |= WAKE_ARP;
+	}
 }
 
 static int macb_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol)
@@ -2660,10 +2717,11 @@ static int macb_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol)
 	struct macb *bp = netdev_priv(netdev);
 
 	if (!(bp->wol & MACB_WOL_HAS_MAGIC_PACKET) ||
-	    (wol->wolopts & ~WAKE_MAGIC))
+	    !(bp->caps & MACB_CAPS_WOL) ||
+	    (wol->wolopts & ~WAKE_MAGIC) || (wol->wolopts & ~WAKE_ARP))
 		return -EOPNOTSUPP;
 
-	if (wol->wolopts & WAKE_MAGIC)
+	if (wol->wolopts & (WAKE_MAGIC | WAKE_ARP))
 		bp->wol |= MACB_WOL_ENABLED;
 	else
 		bp->wol &= ~MACB_WOL_ENABLED;
@@ -3895,7 +3953,7 @@ static const struct macb_config np4_config = {
 static const struct macb_config zynqmp_config = {
 	.caps = MACB_CAPS_GIGABIT_MODE_AVAILABLE |
 			MACB_CAPS_JUMBO |
-			MACB_CAPS_GEM_HAS_PTP,
+			MACB_CAPS_GEM_HAS_PTP | MACB_CAPS_WOL,
 	.dma_burst_length = 16,
 	.clk_init = macb_clk_init,
 	.init = macb_init,
@@ -4093,6 +4151,9 @@ static int macb_probe(struct platform_device *pdev)
 
 	phy_attached_info(phydev);
 
+	if (bp->caps & MACB_CAPS_WOL)
+		device_set_wakeup_capable(&bp->dev->dev, 1);
+
 	netdev_info(dev, "Cadence %s rev 0x%08x at 0x%08lx irq %d (%pM)\n",
 		    macb_is_gem(bp) ? "GEM" : "MACB", macb_readl(bp, MID),
 		    dev->base_addr, dev->irq, dev->dev_addr);
@@ -4170,16 +4231,58 @@ static int __maybe_unused macb_suspend(struct device *dev)
 	struct macb_queue *queue = bp->queues;
 	unsigned long flags;
 	unsigned int q;
+	u32 ctrl, arpipmask;
 
 	if (!netif_running(netdev))
 		return 0;
 
 
-	if (bp->wol & MACB_WOL_ENABLED) {
+	if ((bp->wol & MACB_WOL_ENABLED) &&
+	    (bp->wol & MACB_WOL_HAS_MAGIC_PACKET)) {
 		macb_writel(bp, IER, MACB_BIT(WOL));
 		macb_writel(bp, WOL, MACB_BIT(MAG));
 		enable_irq_wake(bp->queues[0].irq);
 		netif_device_detach(netdev);
+	} else if (device_may_wakeup(&bp->dev->dev)) {
+		/* Use ARP as default wake source */
+		spin_lock_irqsave(&bp->lock, flags);
+		ctrl = macb_readl(bp, NCR);
+		/* Disable TX as is it not required;
+		 * Disable RX to change BD pointers and enable again
+		 */
+		ctrl &= ~(MACB_BIT(TE) | MACB_BIT(RE));
+		macb_writel(bp, NCR, ctrl);
+		/* Tie all RX queues */
+		macb_rx_tieoff(bp);
+		ctrl = macb_readl(bp, NCR);
+		ctrl |= MACB_BIT(RE);
+		macb_writel(bp, NCR, ctrl);
+		/* Broadcast should be enabled for ARP wake event */
+		gem_writel(bp, NCFGR, gem_readl(bp, NCFGR) & ~MACB_BIT(NBC));
+		macb_writel(bp, TSR, -1);
+		macb_writel(bp, RSR, -1);
+		macb_readl(bp, ISR);
+		if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
+			macb_writel(bp, ISR, -1);
+
+		/* Enable WOL (Q0 only) and disable all other interrupts */
+		queue = bp->queues;
+		queue_writel(queue, IER, GEM_BIT(WOL));
+		for (q = 0, queue = bp->queues; q < bp->num_queues;
+		     ++q, ++queue) {
+			queue_writel(queue, IDR, MACB_RX_INT_FLAGS |
+						 MACB_TX_INT_FLAGS |
+						 MACB_BIT(HRESP));
+		}
+
+		arpipmask = cpu_to_be32p(&bp->dev->ip_ptr->ifa_list->ifa_local)
+					 & 0xFFFF;
+		gem_writel(bp, WOL, MACB_BIT(ARP) | arpipmask);
+		spin_unlock_irqrestore(&bp->lock, flags);
+		enable_irq_wake(bp->queues[0].irq);
+		netif_device_detach(netdev);
+		for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue)
+			napi_disable(&queue->napi);
 	} else {
 		netif_device_detach(netdev);
 		for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue)
@@ -4206,16 +4309,33 @@ static int __maybe_unused macb_resume(struct device *dev)
 	struct macb *bp = netdev_priv(netdev);
 	struct macb_queue *queue = bp->queues;
 	unsigned int q;
+	unsigned long flags;
 
 	if (!netif_running(netdev))
 		return 0;
 
 	pm_runtime_force_resume(dev);
 
-	if (bp->wol & MACB_WOL_ENABLED) {
+	if ((bp->wol & MACB_WOL_ENABLED) &&
+	    (bp->wol & MACB_WOL_HAS_MAGIC_PACKET)) {
 		macb_writel(bp, IDR, MACB_BIT(WOL));
 		macb_writel(bp, WOL, 0);
 		disable_irq_wake(bp->queues[0].irq);
+	} else if (device_may_wakeup(&bp->dev->dev)) {
+		/* Resume after ARP wake event */
+		spin_lock_irqsave(&bp->lock, flags);
+		queue = bp->queues;
+		queue_writel(queue, IDR, GEM_BIT(WOL));
+		gem_writel(bp, WOL, 0);
+		/* Clear Q0 ISR as WOL was enabled on Q0 */
+		if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
+			macb_writel(bp, ISR, -1);
+		disable_irq_wake(bp->queues[0].irq);
+		spin_unlock_irqrestore(&bp->lock, flags);
+		macb_writel(bp, NCR, MACB_BIT(MPE));
+		for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue)
+			napi_enable(&queue->napi);
+		netif_carrier_on(netdev);
 	} else {
 		macb_writel(bp, NCR, MACB_BIT(MPE));
 		for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue)
-- 
2.7.4

^ permalink raw reply related

* [RFC PATCH 4/5] net: macb: Add support for suspend/resume with full power down
From: harinikatakamlinux @ 2018-03-22 13:51 UTC (permalink / raw)
  To: nicolas.ferre, davem, harinikatakamlinux
  Cc: netdev, linux-kernel, harinik, michals, appanad
In-Reply-To: <1521726700-22634-1-git-send-email-harinikatakamlinux@gmail.com>

From: Harini Katakam <harinik@xilinx.com>

When macb device is suspended and system is powered down, the clocks
are removed and hence macb should be closed gracefully and restored
upon resume. This patch does the same by switching off the net device,
suspending phy and performing necessary cleanup of interrupts and BDs.
Upon resume, all these are reinitialized again.

Reset of macb device is done only when GEM is not a wake device.
Even when gem is a wake device, tx queues can be stopped and ptp device
can be closed (tsu clock will be disabled in pm_runtime_suspend) as
wake event detection has no dependency on this.

Signed-off-by: Kedareswara rao Appana <appanad@xilinx.com>
Signed-off-by: Harini Katakam <harinik@xilinx.com>
---
 drivers/net/ethernet/cadence/macb_main.c | 38 ++++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index ce75088..bca91bd 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -4167,16 +4167,33 @@ static int __maybe_unused macb_suspend(struct device *dev)
 	struct platform_device *pdev = to_platform_device(dev);
 	struct net_device *netdev = platform_get_drvdata(pdev);
 	struct macb *bp = netdev_priv(netdev);
+	struct macb_queue *queue = bp->queues;
+	unsigned long flags;
+	unsigned int q;
+
+	if (!netif_running(netdev))
+		return 0;
 
-	netif_carrier_off(netdev);
-	netif_device_detach(netdev);
 
 	if (bp->wol & MACB_WOL_ENABLED) {
 		macb_writel(bp, IER, MACB_BIT(WOL));
 		macb_writel(bp, WOL, MACB_BIT(MAG));
 		enable_irq_wake(bp->queues[0].irq);
+		netif_device_detach(netdev);
+	} else {
+		netif_device_detach(netdev);
+		for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue)
+			napi_disable(&queue->napi);
+		phy_stop(netdev->phydev);
+		phy_suspend(netdev->phydev);
+		spin_lock_irqsave(&bp->lock, flags);
+		macb_reset_hw(bp);
+		spin_unlock_irqrestore(&bp->lock, flags);
 	}
 
+	netif_carrier_off(netdev);
+	if (bp->ptp_info)
+		bp->ptp_info->ptp_remove(netdev);
 	pm_runtime_force_suspend(dev);
 
 	return 0;
@@ -4187,6 +4204,11 @@ static int __maybe_unused macb_resume(struct device *dev)
 	struct platform_device *pdev = to_platform_device(dev);
 	struct net_device *netdev = platform_get_drvdata(pdev);
 	struct macb *bp = netdev_priv(netdev);
+	struct macb_queue *queue = bp->queues;
+	unsigned int q;
+
+	if (!netif_running(netdev))
+		return 0;
 
 	pm_runtime_force_resume(dev);
 
@@ -4194,9 +4216,21 @@ static int __maybe_unused macb_resume(struct device *dev)
 		macb_writel(bp, IDR, MACB_BIT(WOL));
 		macb_writel(bp, WOL, 0);
 		disable_irq_wake(bp->queues[0].irq);
+	} else {
+		macb_writel(bp, NCR, MACB_BIT(MPE));
+		for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue)
+			napi_enable(&queue->napi);
+		netif_carrier_on(netdev);
+		phy_resume(netdev->phydev);
+		phy_start(netdev->phydev);
 	}
 
+	bp->macbgem_ops.mog_init_rings(bp);
+	macb_init_hw(bp);
+	macb_set_rx_mode(netdev);
 	netif_device_attach(netdev);
+	if (bp->ptp_info)
+		bp->ptp_info->ptp_init(netdev);
 
 	return 0;
 }
-- 
2.7.4

^ permalink raw reply related

* [RFC PATCH 3/5] net: macb: Add pm runtime support
From: harinikatakamlinux @ 2018-03-22 13:51 UTC (permalink / raw)
  To: nicolas.ferre, davem, harinikatakamlinux
  Cc: netdev, linux-kernel, harinik, michals, appanad,
	Shubhrajyoti Datta
In-Reply-To: <1521726700-22634-1-git-send-email-harinikatakamlinux@gmail.com>

From: Harini Katakam <harinik@xilinx.com>

Add runtime pm functions and move clock handling there.
Enable clocks in mdio read/write functions.

Signed-off-by: Shubhrajyoti Datta <shubhrajyoti.datta@xilinx.com>
Signed-off-by: Harini Katakam <harinik@xilinx.com>
---
 drivers/net/ethernet/cadence/macb_main.c | 105 ++++++++++++++++++++++++++-----
 1 file changed, 90 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index ae61927..ce75088 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -35,6 +35,7 @@
 #include <linux/ip.h>
 #include <linux/udp.h>
 #include <linux/tcp.h>
+#include <linux/pm_runtime.h>
 #include "macb.h"
 
 #define MACB_RX_BUFFER_SIZE	128
@@ -77,6 +78,7 @@
  * 1 frame time (10 Mbits/s, full-duplex, ignoring collisions)
  */
 #define MACB_HALT_TIMEOUT	1230
+#define MACB_PM_TIMEOUT  100 /* ms */
 
 /* DMA buffer descriptor might be different size
  * depends on hardware configuration:
@@ -321,8 +323,13 @@ static int macb_mdio_read(struct mii_bus *bus, int mii_id, int regnum)
 {
 	struct macb *bp = bus->priv;
 	int value;
+	int err;
 	ulong timeout;
 
+	err = pm_runtime_get_sync(&bp->pdev->dev);
+	if (err < 0)
+		return err;
+
 	timeout = jiffies + msecs_to_jiffies(1000);
 	/* wait for end of transfer */
 	do {
@@ -334,6 +341,8 @@ static int macb_mdio_read(struct mii_bus *bus, int mii_id, int regnum)
 
 	if (time_after_eq(jiffies, timeout)) {
 		netdev_err(bp->dev, "wait for end of transfer timed out\n");
+		pm_runtime_mark_last_busy(&bp->pdev->dev);
+		pm_runtime_put_autosuspend(&bp->pdev->dev);
 		return -ETIMEDOUT;
 	}
 
@@ -354,11 +363,15 @@ static int macb_mdio_read(struct mii_bus *bus, int mii_id, int regnum)
 
 	if (time_after_eq(jiffies, timeout)) {
 		netdev_err(bp->dev, "wait for end of transfer timed out\n");
+		pm_runtime_mark_last_busy(&bp->pdev->dev);
+		pm_runtime_put_autosuspend(&bp->pdev->dev);
 		return -ETIMEDOUT;
 	}
 
 	value = MACB_BFEXT(DATA, macb_readl(bp, MAN));
 
+	pm_runtime_mark_last_busy(&bp->pdev->dev);
+	pm_runtime_put_autosuspend(&bp->pdev->dev);
 	return value;
 }
 
@@ -366,8 +379,13 @@ static int macb_mdio_write(struct mii_bus *bus, int mii_id, int regnum,
 			   u16 value)
 {
 	struct macb *bp = bus->priv;
+	int err;
 	ulong timeout;
 
+	err = pm_runtime_get_sync(&bp->pdev->dev);
+	if (err < 0)
+		return err;
+
 	timeout = jiffies + msecs_to_jiffies(1000);
 	/* wait for end of transfer */
 	do {
@@ -379,6 +397,8 @@ static int macb_mdio_write(struct mii_bus *bus, int mii_id, int regnum,
 
 	if (time_after_eq(jiffies, timeout)) {
 		netdev_err(bp->dev, "wait for end of transfer timed out\n");
+		pm_runtime_mark_last_busy(&bp->pdev->dev);
+		pm_runtime_put_autosuspend(&bp->pdev->dev);
 		return -ETIMEDOUT;
 	}
 
@@ -400,9 +420,13 @@ static int macb_mdio_write(struct mii_bus *bus, int mii_id, int regnum,
 
 	if (time_after_eq(jiffies, timeout)) {
 		netdev_err(bp->dev, "wait for end of transfer timed out\n");
+		pm_runtime_mark_last_busy(&bp->pdev->dev);
+		pm_runtime_put_autosuspend(&bp->pdev->dev);
 		return -ETIMEDOUT;
 	}
 
+	pm_runtime_mark_last_busy(&bp->pdev->dev);
+	pm_runtime_put_autosuspend(&bp->pdev->dev);
 	return 0;
 }
 
@@ -2338,6 +2362,10 @@ static int macb_open(struct net_device *dev)
 
 	netdev_dbg(bp->dev, "open\n");
 
+	err = pm_runtime_get_sync(&bp->pdev->dev);
+	if (err < 0)
+		return err;
+
 	/* carrier starts down */
 	netif_carrier_off(dev);
 
@@ -2397,6 +2425,8 @@ static int macb_close(struct net_device *dev)
 	if (bp->ptp_info)
 		bp->ptp_info->ptp_remove(dev);
 
+	pm_runtime_put(&bp->pdev->dev);
+
 	return 0;
 }
 
@@ -3949,6 +3979,11 @@ static int macb_probe(struct platform_device *pdev)
 	if (err)
 		return err;
 
+	pm_runtime_set_autosuspend_delay(&pdev->dev, MACB_PM_TIMEOUT);
+	pm_runtime_use_autosuspend(&pdev->dev);
+	pm_runtime_get_noresume(&pdev->dev);
+	pm_runtime_set_active(&pdev->dev);
+	pm_runtime_enable(&pdev->dev);
 	native_io = hw_is_native_io(mem);
 
 	macb_probe_queues(mem, native_io, &queue_mask, &num_queues);
@@ -4062,6 +4097,9 @@ static int macb_probe(struct platform_device *pdev)
 		    macb_is_gem(bp) ? "GEM" : "MACB", macb_readl(bp, MID),
 		    dev->base_addr, dev->irq, dev->dev_addr);
 
+	pm_runtime_mark_last_busy(&bp->pdev->dev);
+	pm_runtime_put_autosuspend(&bp->pdev->dev);
+
 	return 0;
 
 err_out_unregister_mdio:
@@ -4081,6 +4119,9 @@ static int macb_probe(struct platform_device *pdev)
 	clk_disable_unprepare(pclk);
 	clk_disable_unprepare(rx_clk);
 	clk_disable_unprepare(tsu_clk);
+	pm_runtime_disable(&pdev->dev);
+	pm_runtime_set_suspended(&pdev->dev);
+	pm_runtime_dont_use_autosuspend(&pdev->dev);
 
 	return err;
 }
@@ -4104,11 +4145,16 @@ static int macb_remove(struct platform_device *pdev)
 		mdiobus_free(bp->mii_bus);
 
 		unregister_netdev(dev);
-		clk_disable_unprepare(bp->tx_clk);
-		clk_disable_unprepare(bp->hclk);
-		clk_disable_unprepare(bp->pclk);
-		clk_disable_unprepare(bp->rx_clk);
-		clk_disable_unprepare(bp->tsu_clk);
+		pm_runtime_disable(&pdev->dev);
+		pm_runtime_dont_use_autosuspend(&pdev->dev);
+		if (!pm_runtime_suspended(&pdev->dev)) {
+			clk_disable_unprepare(bp->tx_clk);
+			clk_disable_unprepare(bp->hclk);
+			clk_disable_unprepare(bp->pclk);
+			clk_disable_unprepare(bp->rx_clk);
+			clk_disable_unprepare(bp->tsu_clk);
+			pm_runtime_set_suspended(&pdev->dev);
+		}
 		of_node_put(bp->phy_node);
 		free_netdev(dev);
 	}
@@ -4129,13 +4175,9 @@ static int __maybe_unused macb_suspend(struct device *dev)
 		macb_writel(bp, IER, MACB_BIT(WOL));
 		macb_writel(bp, WOL, MACB_BIT(MAG));
 		enable_irq_wake(bp->queues[0].irq);
-	} else {
-		clk_disable_unprepare(bp->tx_clk);
-		clk_disable_unprepare(bp->hclk);
-		clk_disable_unprepare(bp->pclk);
-		clk_disable_unprepare(bp->rx_clk);
 	}
-	clk_disable_unprepare(bp->tsu_clk);
+
+	pm_runtime_force_suspend(dev);
 
 	return 0;
 }
@@ -4146,11 +4188,43 @@ static int __maybe_unused macb_resume(struct device *dev)
 	struct net_device *netdev = platform_get_drvdata(pdev);
 	struct macb *bp = netdev_priv(netdev);
 
+	pm_runtime_force_resume(dev);
+
 	if (bp->wol & MACB_WOL_ENABLED) {
 		macb_writel(bp, IDR, MACB_BIT(WOL));
 		macb_writel(bp, WOL, 0);
 		disable_irq_wake(bp->queues[0].irq);
-	} else {
+	}
+
+	netif_device_attach(netdev);
+
+	return 0;
+}
+
+static int __maybe_unused macb_runtime_suspend(struct device *dev)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct net_device *netdev = platform_get_drvdata(pdev);
+	struct macb *bp = netdev_priv(netdev);
+
+	if (!(device_may_wakeup(&bp->dev->dev))) {
+		clk_disable_unprepare(bp->tx_clk);
+		clk_disable_unprepare(bp->hclk);
+		clk_disable_unprepare(bp->pclk);
+		clk_disable_unprepare(bp->rx_clk);
+	}
+	clk_disable_unprepare(bp->tsu_clk);
+
+	return 0;
+}
+
+static int __maybe_unused macb_runtime_resume(struct device *dev)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct net_device *netdev = platform_get_drvdata(pdev);
+	struct macb *bp = netdev_priv(netdev);
+
+	if (!(device_may_wakeup(&bp->dev->dev))) {
 		clk_prepare_enable(bp->pclk);
 		clk_prepare_enable(bp->hclk);
 		clk_prepare_enable(bp->tx_clk);
@@ -4158,12 +4232,13 @@ static int __maybe_unused macb_resume(struct device *dev)
 	}
 	clk_prepare_enable(bp->tsu_clk);
 
-	netif_device_attach(netdev);
-
 	return 0;
 }
 
-static SIMPLE_DEV_PM_OPS(macb_pm_ops, macb_suspend, macb_resume);
+static const struct dev_pm_ops macb_pm_ops = {
+	SET_SYSTEM_SLEEP_PM_OPS(macb_suspend, macb_resume)
+	SET_RUNTIME_PM_OPS(macb_runtime_suspend, macb_runtime_resume, NULL)
+};
 
 static struct platform_driver macb_driver = {
 	.probe		= macb_probe,
-- 
2.7.4

^ permalink raw reply related

* [RFC PATCH 2/5] net: macb: Support clock management for tsu_clk
From: harinikatakamlinux @ 2018-03-22 13:51 UTC (permalink / raw)
  To: nicolas.ferre, davem, harinikatakamlinux
  Cc: netdev, linux-kernel, harinik, michals, appanad
In-Reply-To: <1521726700-22634-1-git-send-email-harinikatakamlinux@gmail.com>

From: Harini Katakam <harinik@xilinx.com>

TSU clock needs to be enabled/disabled as per support in devicetree
and it should also be controlled during suspend/resume (WOL has no
dependency on this clock).

Signed-off-by: Harini Katakam <harinik@xilinx.com>
---
 drivers/net/ethernet/cadence/macb.h      |  3 ++-
 drivers/net/ethernet/cadence/macb_main.c | 30 +++++++++++++++++++++++++-----
 2 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h
index 8665982..9e7fb14 100644
--- a/drivers/net/ethernet/cadence/macb.h
+++ b/drivers/net/ethernet/cadence/macb.h
@@ -1074,7 +1074,7 @@ struct macb_config {
 	unsigned int		dma_burst_length;
 	int	(*clk_init)(struct platform_device *pdev, struct clk **pclk,
 			    struct clk **hclk, struct clk **tx_clk,
-			    struct clk **rx_clk);
+			    struct clk **rx_clk, struct clk **tsu_clk);
 	int	(*init)(struct platform_device *pdev);
 	int	jumbo_max_len;
 };
@@ -1154,6 +1154,7 @@ struct macb {
 	struct clk		*hclk;
 	struct clk		*tx_clk;
 	struct clk		*rx_clk;
+	struct clk		*tsu_clk;
 	struct net_device	*dev;
 	union {
 		struct macb_stats	macb;
diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index f4030c1..ae61927 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -3245,7 +3245,7 @@ static void macb_probe_queues(void __iomem *mem,
 
 static int macb_clk_init(struct platform_device *pdev, struct clk **pclk,
 			 struct clk **hclk, struct clk **tx_clk,
-			 struct clk **rx_clk)
+			 struct clk **rx_clk, struct clk **tsu_clk)
 {
 	struct macb_platform_data *pdata;
 	int err;
@@ -3279,6 +3279,10 @@ static int macb_clk_init(struct platform_device *pdev, struct clk **pclk,
 	if (IS_ERR(*rx_clk))
 		*rx_clk = NULL;
 
+	*tsu_clk = devm_clk_get(&pdev->dev, "tsu_clk");
+	if (IS_ERR(*tsu_clk))
+		*tsu_clk = NULL;
+
 	err = clk_prepare_enable(*pclk);
 	if (err) {
 		dev_err(&pdev->dev, "failed to enable pclk (%u)\n", err);
@@ -3303,8 +3307,17 @@ static int macb_clk_init(struct platform_device *pdev, struct clk **pclk,
 		goto err_disable_txclk;
 	}
 
+	err = clk_prepare_enable(*tsu_clk);
+	if (err) {
+		dev_err(&pdev->dev, "failed to enable tsu_clk (%u)\n", err);
+		goto err_disable_rxclk;
+	}
+
 	return 0;
 
+err_disable_rxclk:
+	clk_disable_unprepare(*rx_clk);
+
 err_disable_txclk:
 	clk_disable_unprepare(*tx_clk);
 
@@ -3754,13 +3767,14 @@ static const struct net_device_ops at91ether_netdev_ops = {
 
 static int at91ether_clk_init(struct platform_device *pdev, struct clk **pclk,
 			      struct clk **hclk, struct clk **tx_clk,
-			      struct clk **rx_clk)
+			      struct clk **rx_clk, struct clk **tsu_clk)
 {
 	int err;
 
 	*hclk = NULL;
 	*tx_clk = NULL;
 	*rx_clk = NULL;
+	*tsu_clk = NULL;
 
 	*pclk = devm_clk_get(&pdev->dev, "ether_clk");
 	if (IS_ERR(*pclk))
@@ -3898,11 +3912,12 @@ static int macb_probe(struct platform_device *pdev)
 {
 	const struct macb_config *macb_config = &default_gem_config;
 	int (*clk_init)(struct platform_device *, struct clk **,
-			struct clk **, struct clk **,  struct clk **)
-					      = macb_config->clk_init;
+			struct clk **, struct clk **,  struct clk **,
+			struct clk **) = macb_config->clk_init;
 	int (*init)(struct platform_device *) = macb_config->init;
 	struct device_node *np = pdev->dev.of_node;
 	struct clk *pclk, *hclk = NULL, *tx_clk = NULL, *rx_clk = NULL;
+	struct clk *tsu_clk = NULL;
 	unsigned int queue_mask, num_queues;
 	struct macb_platform_data *pdata;
 	bool native_io;
@@ -3930,7 +3945,7 @@ static int macb_probe(struct platform_device *pdev)
 		}
 	}
 
-	err = clk_init(pdev, &pclk, &hclk, &tx_clk, &rx_clk);
+	err = clk_init(pdev, &pclk, &hclk, &tx_clk, &rx_clk, &tsu_clk);
 	if (err)
 		return err;
 
@@ -3967,6 +3982,7 @@ static int macb_probe(struct platform_device *pdev)
 	bp->hclk = hclk;
 	bp->tx_clk = tx_clk;
 	bp->rx_clk = rx_clk;
+	bp->tsu_clk = tsu_clk;
 	if (macb_config)
 		bp->jumbo_max_len = macb_config->jumbo_max_len;
 
@@ -4064,6 +4080,7 @@ static int macb_probe(struct platform_device *pdev)
 	clk_disable_unprepare(hclk);
 	clk_disable_unprepare(pclk);
 	clk_disable_unprepare(rx_clk);
+	clk_disable_unprepare(tsu_clk);
 
 	return err;
 }
@@ -4091,6 +4108,7 @@ static int macb_remove(struct platform_device *pdev)
 		clk_disable_unprepare(bp->hclk);
 		clk_disable_unprepare(bp->pclk);
 		clk_disable_unprepare(bp->rx_clk);
+		clk_disable_unprepare(bp->tsu_clk);
 		of_node_put(bp->phy_node);
 		free_netdev(dev);
 	}
@@ -4117,6 +4135,7 @@ static int __maybe_unused macb_suspend(struct device *dev)
 		clk_disable_unprepare(bp->pclk);
 		clk_disable_unprepare(bp->rx_clk);
 	}
+	clk_disable_unprepare(bp->tsu_clk);
 
 	return 0;
 }
@@ -4137,6 +4156,7 @@ static int __maybe_unused macb_resume(struct device *dev)
 		clk_prepare_enable(bp->tx_clk);
 		clk_prepare_enable(bp->rx_clk);
 	}
+	clk_prepare_enable(bp->tsu_clk);
 
 	netif_device_attach(netdev);
 
-- 
2.7.4

^ permalink raw reply related

* [RFC PATCH 1/5] net: macb: Check MDIO state before read/write and use timeouts
From: harinikatakamlinux @ 2018-03-22 13:51 UTC (permalink / raw)
  To: nicolas.ferre, davem, harinikatakamlinux
  Cc: netdev, linux-kernel, harinik, michals, appanad,
	Shubhrajyoti Datta
In-Reply-To: <1521726700-22634-1-git-send-email-harinikatakamlinux@gmail.com>

From: Harini Katakam <harinik@xilinx.com>

Replace the while loop in MDIO read/write functions with a timeout.
In addition, add a check for MDIO bus busy before initiating a new
operation as well to make sure there is no ongoing MDIO operation.

Signed-off-by: Shubhrajyoti Datta <shubhrajyoti.datta@xilinx.com>
Signed-off-by: Harini Katakam <harinik@xilinx.com>
---
 drivers/net/ethernet/cadence/macb_main.c | 54 ++++++++++++++++++++++++++++++--
 1 file changed, 52 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index d09bd43..f4030c1 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -321,6 +321,21 @@ static int macb_mdio_read(struct mii_bus *bus, int mii_id, int regnum)
 {
 	struct macb *bp = bus->priv;
 	int value;
+	ulong timeout;
+
+	timeout = jiffies + msecs_to_jiffies(1000);
+	/* wait for end of transfer */
+	do {
+		if (MACB_BFEXT(IDLE, macb_readl(bp, NSR)))
+			break;
+
+		cpu_relax();
+	} while (!time_after_eq(jiffies, timeout));
+
+	if (time_after_eq(jiffies, timeout)) {
+		netdev_err(bp->dev, "wait for end of transfer timed out\n");
+		return -ETIMEDOUT;
+	}
 
 	macb_writel(bp, MAN, (MACB_BF(SOF, MACB_MAN_SOF)
 			      | MACB_BF(RW, MACB_MAN_READ)
@@ -328,9 +343,19 @@ static int macb_mdio_read(struct mii_bus *bus, int mii_id, int regnum)
 			      | MACB_BF(REGA, regnum)
 			      | MACB_BF(CODE, MACB_MAN_CODE)));
 
+	timeout = jiffies + msecs_to_jiffies(1000);
 	/* wait for end of transfer */
-	while (!MACB_BFEXT(IDLE, macb_readl(bp, NSR)))
+	do {
+		if (MACB_BFEXT(IDLE, macb_readl(bp, NSR)))
+			break;
+
 		cpu_relax();
+	} while (!time_after_eq(jiffies, timeout));
+
+	if (time_after_eq(jiffies, timeout)) {
+		netdev_err(bp->dev, "wait for end of transfer timed out\n");
+		return -ETIMEDOUT;
+	}
 
 	value = MACB_BFEXT(DATA, macb_readl(bp, MAN));
 
@@ -341,6 +366,21 @@ static int macb_mdio_write(struct mii_bus *bus, int mii_id, int regnum,
 			   u16 value)
 {
 	struct macb *bp = bus->priv;
+	ulong timeout;
+
+	timeout = jiffies + msecs_to_jiffies(1000);
+	/* wait for end of transfer */
+	do {
+		if (MACB_BFEXT(IDLE, macb_readl(bp, NSR)))
+			break;
+
+		cpu_relax();
+	} while (!time_after_eq(jiffies, timeout));
+
+	if (time_after_eq(jiffies, timeout)) {
+		netdev_err(bp->dev, "wait for end of transfer timed out\n");
+		return -ETIMEDOUT;
+	}
 
 	macb_writel(bp, MAN, (MACB_BF(SOF, MACB_MAN_SOF)
 			      | MACB_BF(RW, MACB_MAN_WRITE)
@@ -349,9 +389,19 @@ static int macb_mdio_write(struct mii_bus *bus, int mii_id, int regnum,
 			      | MACB_BF(CODE, MACB_MAN_CODE)
 			      | MACB_BF(DATA, value)));
 
+	timeout = jiffies + msecs_to_jiffies(1000);
 	/* wait for end of transfer */
-	while (!MACB_BFEXT(IDLE, macb_readl(bp, NSR)))
+	do {
+		if (MACB_BFEXT(IDLE, macb_readl(bp, NSR)))
+			break;
+
 		cpu_relax();
+	} while (!time_after_eq(jiffies, timeout));
+
+	if (time_after_eq(jiffies, timeout)) {
+		netdev_err(bp->dev, "wait for end of transfer timed out\n");
+		return -ETIMEDOUT;
+	}
 
 	return 0;
 }
-- 
2.7.4

^ permalink raw reply related

* [RFC PATCH 0/5] Macb power management support for ZynqMP
From: harinikatakamlinux @ 2018-03-22 13:51 UTC (permalink / raw)
  To: nicolas.ferre, davem, harinikatakamlinux
  Cc: netdev, linux-kernel, harinik, michals, appanad

From: Harini Katakam <harinik@xilinx.com>

This series adds support for macb suspend/resume with system power down
and wake on LAN with ARP packets.
In relation to the above, this series also updates mdio_read/write
function for PM and adds tsu clock management.

Harini Katakam (5):
  net: macb: Check MDIO state before read/write and use timeouts
  net: macb: Support clock management for tsu_clk
  net: macb: Add pm runtime support
  net: macb: Add support for suspend/resume with full power down
  net: macb: Add WOL support with ARP

 drivers/net/ethernet/cadence/macb.h      |   9 +-
 drivers/net/ethernet/cadence/macb_main.c | 349 ++++++++++++++++++++++++++++---
 2 files changed, 332 insertions(+), 26 deletions(-)

-- 
2.7.4

^ permalink raw reply

* Re: [PATCH 28/28] random: convert to ->poll_mask
From: Theodore Y. Ts'o @ 2018-03-22 13:49 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: viro, Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-api,
	linux-kernel
In-Reply-To: <20180321074032.14211-29-hch@lst.de>

On Wed, Mar 21, 2018 at 08:40:32AM +0100, Christoph Hellwig wrote:
> The big change is that random_read_wait and random_write_wait are merged
> into a single waitqueue that uses keyed wakeups.  Because wait_event_*
> doesn't know about that this will lead to occassional spurious wakeups
> in _random_read and add_hwgenerator_randomness, but wait_event_* is
> designed to handle these and were are not in a a hot path there.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Acked-by: Theodore Ts'o <tytso@mit.edu>

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply

* RE: [PATCH net 1/1] qede: Fix barrier usage after tx doorbell write.
From: Chopra, Manish @ 2018-03-22 13:48 UTC (permalink / raw)
  To: Elior, Ariel, davem@davemloft.net
  Cc: netdev@vger.kernel.org, Kalderon, Michal
In-Reply-To: <CY1PR0701MB1337ABBF94C1625F4B30BE0890AA0@CY1PR0701MB1337.namprd07.prod.outlook.com>

> -----Original Message-----
> From: Elior, Ariel
> Sent: Wednesday, March 21, 2018 7:10 PM
> To: davem@davemloft.net
> Cc: netdev@vger.kernel.org; Kalderon, Michal <Michal.Kalderon@cavium.com>;
> Chopra, Manish <Manish.Chopra@cavium.com>
> Subject: RE: [PATCH net 1/1] qede: Fix barrier usage after tx doorbell write.
> 
> > Subject: [PATCH net 1/1] qede: Fix barrier usage after tx doorbell write.
> >
> > Since commit c5ad119fb6c09b0297446be05bd66602fa564758
> > ("net: sched: pfifo_fast use skb_array") driver is exposed to an issue
> > where it is hitting NULL skbs while handling TX completions. Driver
> > uses mmiowb() to flush the writes to the doorbell bar which is a
> > write-combined bar, however on x86
> > mmiowb() does not flush the write combined buffer.
> >
> > This patch fixes this problem by replacing mmiowb() with wmb() after
> > the write combined doorbell write so that writes are flushed and
> > synchronized from more than one processor.
> >
> > Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
> > Signed-off-by: Manish Chopra <manish.chopra@cavium.com>
> > ---
> >  drivers/net/ethernet/qlogic/qede/qede_fp.c |   10 ++++------
> >  1 files changed, 4 insertions(+), 6 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c
> > b/drivers/net/ethernet/qlogic/qede/qede_fp.c
> > index dafc079..2e921ca 100644
> > --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c
> > +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c
> > @@ -320,13 +320,11 @@ static inline void
> > qede_update_tx_producer(struct qede_tx_queue *txq)
> >  	barrier();
> >  	writel(txq->tx_db.raw, txq->doorbell_addr);
> >
> > -	/* mmiowb is needed to synchronize doorbell writes from more than
> one
> > -	 * processor. It guarantees that the write arrives to the device before
> > -	 * the queue lock is released and another start_xmit is called (possibly
> > -	 * on another CPU). Without this barrier, the next doorbell can bypass
> > -	 * this doorbell. This is applicable to IA64/Altix systems.
> > +	/* Fence required to flush the write combined buffer, since another
> > +	 * CPU may write to the same doorbell address and data may be lost
> > +	 * due to relaxed order nature of write combined bar.
> >  	 */
> > -	mmiowb();
> > +	wmb();
> >  }
> >
> >  static int qede_xdp_xmit(struct qede_dev *edev, struct qede_fastpath
> > *fp,
> > --
> > 1.7.1
> 
> Hi Dave,
> This patch appears as "superseded" in patchwork. I am not really sure why that is
> - I noticed some other barrier work is going on, but none of it will solve this
> issue. This patch solves an important bug in the driver - please consider applying
> it.
> Thanks,
> Ariel

Hi Dave,

The other patchwork which is going on to use writel_relaxed doesn't solve this issue.
I think the writel_relaxed patchwork might have caused this fix as superseded since those changes are in same area/function.
This is an important fix which is after the write combined doorbell write.
Please let me know if I should re-submit this fix or not ?

Thanks,
Manish

^ permalink raw reply

* Re: [trivial PATCH V2] treewide: Align function definition open/close braces
From: Steven Rostedt @ 2018-03-22 13:37 UTC (permalink / raw)
  To: Joe Perches
  Cc: linux-wireless, Alexandre Belloni, x86, Xiubo Li, Peter Zijlstra,
	Jeff Layton, Will Deacon, Timur Tabi, dri-devel, Liam Girdwood,
	J. Bruce Fields, Adaptec OEM Raid Solutions, H. Peter Anvin,
	linux-acpi, linux-rtc, James E.J. Bottomley, Paul Moore,
	linux-scsi, Darrick J. Wong, Dept-GELinuxNICDev, Mark Fasheh,
	Sathya Prakash, amd-gfx, David Airlie, Darren Hart
In-Reply-To: <5ccbbf083e26bddfb4ea4f819ed62347ce266f39.1521669820.git.joe@perches.com>

On Wed, 21 Mar 2018 15:09:32 -0700
Joe Perches <joe@perches.com> wrote:

> diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
> index ad1d6164e946..50f44b7b2b32 100644
> --- a/kernel/trace/trace_printk.c
> +++ b/kernel/trace/trace_printk.c
> @@ -196,7 +196,7 @@ struct notifier_block module_trace_bprintk_format_nb = {
>  };
>  
>  int __trace_bprintk(unsigned long ip, const char *fmt, ...)
> - {
> +{
>  	int ret;
>  	va_list ap;
>  
> @@ -214,7 +214,7 @@ int __trace_bprintk(unsigned long ip, const char *fmt, ...)
>  EXPORT_SYMBOL_GPL(__trace_bprintk);
>  
>  int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap)
> - {
> +{
>  	if (unlikely(!fmt))
>  		return 0;
>  

Acked-by: Steven Rostedt (VMware) <rostedt@goodmis.org>

-- Steve
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply

* Re: [PATCH v2 bpf-next 4/8] tracepoint: compute num_args at build time
From: Steven Rostedt @ 2018-03-22 13:36 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linus Torvalds, David Miller, Daniel Borkmann, Peter Zijlstra,
	Network Development, kernel-team, Linux API
In-Reply-To: <2559d7cb-ec60-1200-2362-04fa34fd02bb@fb.com>

On Wed, 21 Mar 2018 15:05:46 -0700
Alexei Starovoitov <ast@fb.com> wrote:

> Like the only reason my patch is counting till 17 is because of
> trace_iwlwifi_dev_ucode_error().
> The next offenders are using 12 arguments:
> trace_mc_event()
> trace_mm_vmscan_lru_shrink_inactive()
> 
> Clearly not every efficient usage of it:
>          trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
>                          nr_scanned, nr_reclaimed,
>                          stat.nr_dirty,  stat.nr_writeback,
>                          stat.nr_congested, stat.nr_immediate,
>                          stat.nr_activate, stat.nr_ref_keep,
>                          stat.nr_unmap_fail,
>                          sc->priority, file);
> could have passed &stat instead.

Yes they should have, and if I was on the Cc for that patch, I would
have yelled at them and told them that's exactly what they needed to do.

Perhaps I should add something to keep any tracepoint from having more
than 6 arguments. That should force a clean up quickly.

I think I may start doing that.

-- Steve

^ permalink raw reply

* Re: [PATCH 1/2] bpf: Remove struct bpf_verifier_env argument from print_bpf_insn
From: Jiri Olsa @ 2018-03-22 13:32 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Quentin Monnet, Jiri Olsa, Alexei Starovoitov, lkml, netdev
In-Reply-To: <e30c5b79-c2a9-d0ed-e626-bf81a0705805@iogearbox.net>

On Thu, Mar 22, 2018 at 10:34:18AM +0100, Daniel Borkmann wrote:
> On 03/21/2018 07:37 PM, Jiri Olsa wrote:
> > On Wed, Mar 21, 2018 at 05:25:33PM +0000, Quentin Monnet wrote:
> >> 2018-03-21 16:02 UTC+0100 ~ Jiri Olsa <jolsa@kernel.org>
> >>> We use print_bpf_insn in user space (bpftool and soon perf),
> >>> so it'd be nice to keep it generic and strip it off the kernel
> >>> struct bpf_verifier_env argument.
> >>>
> >>> This argument can be safely removed, because its users can
> >>> use the struct bpf_insn_cbs::private_data to pass it.
> >>>
> >>> Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> >>> ---
> >>>  kernel/bpf/disasm.c   | 52 +++++++++++++++++++++++++--------------------------
> >>>  kernel/bpf/disasm.h   |  5 +----
> >>>  kernel/bpf/verifier.c |  6 +++---
> >>>  3 files changed, 30 insertions(+), 33 deletions(-)
> >>
> >> [...]
> >>
> >>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> >>> index c6eff108aa99..9f27d3fa7259 100644
> >>> --- a/kernel/bpf/verifier.c
> >>> +++ b/kernel/bpf/verifier.c
> >>> @@ -202,8 +202,7 @@ EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
> >>>   * generic for symbol export. The function was renamed, but not the calls in
> >>>   * the verifier to avoid complicating backports. Hence the alias below.
> >>>   */
> >>> -static __printf(2, 3) void verbose(struct bpf_verifier_env *env,
> >>> -				   const char *fmt, ...)
> >>> +static __printf(2, 3) void verbose(void *private_data, const char *fmt, ...)
> >>>  	__attribute__((alias("bpf_verifier_log_write")));
> >>
> >> Just as a note, verbose() will be aliased to a function whose prototype
> >> differs (bpf_verifier_log_write() still expects a struct
> >> bpf_verifier_env as its first argument). I am not so familiar with
> >> function aliases, could this change be a concern?
> > 
> > yea, but as it was pointer for pointer switch I did not
> > see any problem with that.. I'll check more
> 
> Ok, holding off for now until we have clarification. Other option could also
> be to make it void *private_data everywhere and for the kernel writer then
> do struct bpf_verifier_env *env = private_data.

can't find much info about the alias behaviour for this
case.. so how about having separate function for the
print_cb like below.. I still need to test it

thanks,
jirka


---
 kernel/bpf/disasm.c   | 52 +++++++++++++++++++++++++--------------------------
 kernel/bpf/disasm.h   |  5 +----
 kernel/bpf/verifier.c | 41 +++++++++++++++++++++++++++++-----------
 3 files changed, 57 insertions(+), 41 deletions(-)

diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index 8740406df2cd..d6b76377cb6e 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -113,16 +113,16 @@ static const char *const bpf_jmp_string[16] = {
 };
 
 static void print_bpf_end_insn(bpf_insn_print_t verbose,
-			       struct bpf_verifier_env *env,
+			       void *private_data,
 			       const struct bpf_insn *insn)
 {
-	verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
+	verbose(private_data, "(%02x) r%d = %s%d r%d\n",
+		insn->code, insn->dst_reg,
 		BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le",
 		insn->imm, insn->dst_reg);
 }
 
 void print_bpf_insn(const struct bpf_insn_cbs *cbs,
-		    struct bpf_verifier_env *env,
 		    const struct bpf_insn *insn,
 		    bool allow_ptr_leaks)
 {
@@ -132,23 +132,23 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
 	if (class == BPF_ALU || class == BPF_ALU64) {
 		if (BPF_OP(insn->code) == BPF_END) {
 			if (class == BPF_ALU64)
-				verbose(env, "BUG_alu64_%02x\n", insn->code);
+				verbose(cbs->private_data, "BUG_alu64_%02x\n", insn->code);
 			else
-				print_bpf_end_insn(verbose, env, insn);
+				print_bpf_end_insn(verbose, cbs->private_data, insn);
 		} else if (BPF_OP(insn->code) == BPF_NEG) {
-			verbose(env, "(%02x) r%d = %s-r%d\n",
+			verbose(cbs->private_data, "(%02x) r%d = %s-r%d\n",
 				insn->code, insn->dst_reg,
 				class == BPF_ALU ? "(u32) " : "",
 				insn->dst_reg);
 		} else if (BPF_SRC(insn->code) == BPF_X) {
-			verbose(env, "(%02x) %sr%d %s %sr%d\n",
+			verbose(cbs->private_data, "(%02x) %sr%d %s %sr%d\n",
 				insn->code, class == BPF_ALU ? "(u32) " : "",
 				insn->dst_reg,
 				bpf_alu_string[BPF_OP(insn->code) >> 4],
 				class == BPF_ALU ? "(u32) " : "",
 				insn->src_reg);
 		} else {
-			verbose(env, "(%02x) %sr%d %s %s%d\n",
+			verbose(cbs->private_data, "(%02x) %sr%d %s %s%d\n",
 				insn->code, class == BPF_ALU ? "(u32) " : "",
 				insn->dst_reg,
 				bpf_alu_string[BPF_OP(insn->code) >> 4],
@@ -157,46 +157,46 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
 		}
 	} else if (class == BPF_STX) {
 		if (BPF_MODE(insn->code) == BPF_MEM)
-			verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n",
+			verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = r%d\n",
 				insn->code,
 				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
 				insn->dst_reg,
 				insn->off, insn->src_reg);
 		else if (BPF_MODE(insn->code) == BPF_XADD)
-			verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+			verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) += r%d\n",
 				insn->code,
 				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
 				insn->dst_reg, insn->off,
 				insn->src_reg);
 		else
-			verbose(env, "BUG_%02x\n", insn->code);
+			verbose(cbs->private_data, "BUG_%02x\n", insn->code);
 	} else if (class == BPF_ST) {
 		if (BPF_MODE(insn->code) != BPF_MEM) {
-			verbose(env, "BUG_st_%02x\n", insn->code);
+			verbose(cbs->private_data, "BUG_st_%02x\n", insn->code);
 			return;
 		}
-		verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n",
+		verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n",
 			insn->code,
 			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
 			insn->dst_reg,
 			insn->off, insn->imm);
 	} else if (class == BPF_LDX) {
 		if (BPF_MODE(insn->code) != BPF_MEM) {
-			verbose(env, "BUG_ldx_%02x\n", insn->code);
+			verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code);
 			return;
 		}
-		verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n",
+		verbose(cbs->private_data, "(%02x) r%d = *(%s *)(r%d %+d)\n",
 			insn->code, insn->dst_reg,
 			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
 			insn->src_reg, insn->off);
 	} else if (class == BPF_LD) {
 		if (BPF_MODE(insn->code) == BPF_ABS) {
-			verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n",
+			verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[%d]\n",
 				insn->code,
 				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
 				insn->imm);
 		} else if (BPF_MODE(insn->code) == BPF_IND) {
-			verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n",
+			verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[r%d + %d]\n",
 				insn->code,
 				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
 				insn->src_reg, insn->imm);
@@ -212,12 +212,12 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
 			if (map_ptr && !allow_ptr_leaks)
 				imm = 0;
 
-			verbose(env, "(%02x) r%d = %s\n",
+			verbose(cbs->private_data, "(%02x) r%d = %s\n",
 				insn->code, insn->dst_reg,
 				__func_imm_name(cbs, insn, imm,
 						tmp, sizeof(tmp)));
 		} else {
-			verbose(env, "BUG_ld_%02x\n", insn->code);
+			verbose(cbs->private_data, "BUG_ld_%02x\n", insn->code);
 			return;
 		}
 	} else if (class == BPF_JMP) {
@@ -227,35 +227,35 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
 			char tmp[64];
 
 			if (insn->src_reg == BPF_PSEUDO_CALL) {
-				verbose(env, "(%02x) call pc%s\n",
+				verbose(cbs->private_data, "(%02x) call pc%s\n",
 					insn->code,
 					__func_get_name(cbs, insn,
 							tmp, sizeof(tmp)));
 			} else {
 				strcpy(tmp, "unknown");
-				verbose(env, "(%02x) call %s#%d\n", insn->code,
+				verbose(cbs->private_data, "(%02x) call %s#%d\n", insn->code,
 					__func_get_name(cbs, insn,
 							tmp, sizeof(tmp)),
 					insn->imm);
 			}
 		} else if (insn->code == (BPF_JMP | BPF_JA)) {
-			verbose(env, "(%02x) goto pc%+d\n",
+			verbose(cbs->private_data, "(%02x) goto pc%+d\n",
 				insn->code, insn->off);
 		} else if (insn->code == (BPF_JMP | BPF_EXIT)) {
-			verbose(env, "(%02x) exit\n", insn->code);
+			verbose(cbs->private_data, "(%02x) exit\n", insn->code);
 		} else if (BPF_SRC(insn->code) == BPF_X) {
-			verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n",
+			verbose(cbs->private_data, "(%02x) if r%d %s r%d goto pc%+d\n",
 				insn->code, insn->dst_reg,
 				bpf_jmp_string[BPF_OP(insn->code) >> 4],
 				insn->src_reg, insn->off);
 		} else {
-			verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n",
+			verbose(cbs->private_data, "(%02x) if r%d %s 0x%x goto pc%+d\n",
 				insn->code, insn->dst_reg,
 				bpf_jmp_string[BPF_OP(insn->code) >> 4],
 				insn->imm, insn->off);
 		}
 	} else {
-		verbose(env, "(%02x) %s\n",
+		verbose(cbs->private_data, "(%02x) %s\n",
 			insn->code, bpf_class_string[class]);
 	}
 }
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
index 266fe8ee542b..e1324a834a24 100644
--- a/kernel/bpf/disasm.h
+++ b/kernel/bpf/disasm.h
@@ -22,14 +22,12 @@
 #include <string.h>
 #endif
 
-struct bpf_verifier_env;
-
 extern const char *const bpf_alu_string[16];
 extern const char *const bpf_class_string[8];
 
 const char *func_id_name(int id);
 
-typedef __printf(2, 3) void (*bpf_insn_print_t)(struct bpf_verifier_env *env,
+typedef __printf(2, 3) void (*bpf_insn_print_t)(void *private_data,
 						const char *, ...);
 typedef const char *(*bpf_insn_revmap_call_t)(void *private_data,
 					      const struct bpf_insn *insn);
@@ -45,7 +43,6 @@ struct bpf_insn_cbs {
 };
 
 void print_bpf_insn(const struct bpf_insn_cbs *cbs,
-		    struct bpf_verifier_env *env,
 		    const struct bpf_insn *insn,
 		    bool allow_ptr_leaks);
 #endif
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e9f7c20691c1..69bf7590877c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -168,23 +168,16 @@ struct bpf_call_arg_meta {
 
 static DEFINE_MUTEX(bpf_verifier_lock);
 
-/* log_level controls verbosity level of eBPF verifier.
- * bpf_verifier_log_write() is used to dump the verification trace to the log,
- * so the user can figure out what's wrong with the program
- */
-__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
-					   const char *fmt, ...)
+static void log_write(struct bpf_verifier_env *env, const char *fmt,
+		      va_list args)
 {
 	struct bpf_verifer_log *log = &env->log;
 	unsigned int n;
-	va_list args;
 
 	if (!log->level || !log->ubuf || bpf_verifier_log_full(log))
 		return;
 
-	va_start(args, fmt);
 	n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
-	va_end(args);
 
 	WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
 		  "verifier log line truncated - local buffer too short\n");
@@ -197,7 +190,32 @@ __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
 	else
 		log->ubuf = NULL;
 }
+
+/* log_level controls verbosity level of eBPF verifier.
+ * bpf_verifier_log_write() is used to dump the verification trace to the log,
+ * so the user can figure out what's wrong with the program
+ */
+__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
+					   const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	log_write(env, fmt, args);
+	va_end(args);
+}
 EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
+
+__printf(2, 3) static void print_ins(void *private_data,
+				     const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	log_write(private_data, fmt, args);
+	va_end(args);
+}
+
 /* Historically bpf_verifier_log_write was called verbose, but the name was too
  * generic for symbol export. The function was renamed, but not the calls in
  * the verifier to avoid complicating backports. Hence the alias below.
@@ -4599,11 +4617,12 @@ static int do_check(struct bpf_verifier_env *env)
 
 		if (env->log.level) {
 			const struct bpf_insn_cbs cbs = {
-				.cb_print	= verbose,
+				.cb_print	= print_ins,
+				.private_data	= env,
 			};
 
 			verbose(env, "%d: ", insn_idx);
-			print_bpf_insn(&cbs, env, insn, env->allow_ptr_leaks);
+			print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
 		}
 
 		if (bpf_prog_is_dev_bound(env->prog->aux)) {
-- 
2.13.6

^ permalink raw reply related

* Re: [PATCH net-next 1/1] net/ipv4: disable SMC TCP option with SYN Cookies
From: Ursula Braun @ 2018-03-22 13:23 UTC (permalink / raw)
  To: Eric Dumazet, davem
  Cc: netdev, linux-s390, schwidefsky, heiko.carstens, raspl
In-Reply-To: <27701c4c-3759-6226-3307-fa03a9cc49b8@gmail.com>



On 03/20/2018 05:43 PM, Eric Dumazet wrote:
> 
> 
> On 03/20/2018 09:21 AM, Eric Dumazet wrote:
>>
>>
>> On 03/20/2018 08:53 AM, Ursula Braun wrote:
>>> From: Hans Wippel <hwippel@linux.vnet.ibm.com>
>>>
>>> Currently, the SMC experimental TCP option in a SYN packet is lost on
>>> the server side when SYN Cookies are active. However, the corresponding
>>> SYNACK sent back to the client contains the SMC option. This causes an
>>> inconsistent view of the SMC capabilities on the client and server.
>>>
>>> This patch disables the SMC option in the SYNACK when SYN Cookies are
>>> active to avoid this issue.
>>>
>>> Signed-off-by: Hans Wippel <hwippel@linux.vnet.ibm.com>
>>> Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
>>> ---
>>>  net/ipv4/tcp_output.c | 2 ++
>>>  1 file changed, 2 insertions(+)
>>>
>>> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
>>> index 383cac0ff0ec..22894514feae 100644
>>> --- a/net/ipv4/tcp_output.c
>>> +++ b/net/ipv4/tcp_output.c
>>> @@ -3199,6 +3199,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
>>>  		/* Under synflood, we do not attach skb to a socket,
>>>  		 * to avoid false sharing.
>>>  		 */
>>> +		if (IS_ENABLED(CONFIG_SMC))
>>> +			ireq->smc_ok = 0;
>>>  		break;
>>>  	case TCP_SYNACK_FASTOPEN:
>>>  		/* sk is a const pointer, because we want to express multiple
>>>
>>
>> I disagree with net-next qualification.
>>
>> This fixes a bug, so please send it for net tree, and including an appropriate Fixes: tag.
>>

Okay, I will send it for the net tree.

> 
> Also, please do not add the fix in tcp_make_synack()
> 
> tcp_make_synack() builds an skb, and really should not modify ireq, ideally.
> The only reason ireq is not const is because of the skb_set_owner_w().
> 
> I would clear it in cookie_v4_check()/cookie_v6_check()
> 
> (We could have a common helper to allocate a TCP ireq btw, but this will wait a future patch for net-next)
>

We moved the clear to cookie_v4_check()/cookie_v6_check. However, this does not seem to
be sufficient to prevent the SYNACK from containing the SMC experimental option.
We found that an additional check in tcp_conn_request() helps:

--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6248,6 +6248,9 @@ int tcp_conn_request(struct request_sock
 	if (want_cookie && !tmp_opt.saw_tstamp)
 		tcp_clear_options(&tmp_opt);
 
+	if (IS_ENABLED(CONFIG_SMC) && want_cookie && tmp_opt.smc_ok)
+		tmp_opt.smc_ok = 0;
+
 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
 	tcp_openreq_init(req, &tmp_opt, skb, sk);
 	inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent;

Do you think this could be the right place for clearing the smc_ok bit?

  

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox