Netdev List

Netdev List
 help / color / mirror / Atom feed

* [net-next V9 PATCH 10/16] mlx5: register a memory model when XDP is enabled
From: Jesper Dangaard Brouer @ 2018-04-03 11:08 UTC (permalink / raw)
  To: netdev, BjörnTöpel, magnus.karlsson
  Cc: eugenia, Jason Wang, John Fastabend, Eran Ben Elisha,
	Saeed Mahameed, galp, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, Tariq Toukan
In-Reply-To: <152275360298.1026.10333759008401281682.stgit@firesoul>

Now all the users of ndo_xdp_xmit have been converted to use xdp_return_frame.
This enables a different memory model, thus activating another code path
in the xdp_return_frame API.

V2: Fixed issues pointed out by Tariq.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |    8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 0aab3afc6885..13c1e61258a7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -512,6 +512,14 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 		rq->mkey_be = c->mkey_be;
 	}
 
+	/* This must only be activate for order-0 pages */
+	if (rq->xdp_prog) {
+		err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
+						 MEM_TYPE_PAGE_ORDER0, NULL);
+		if (err)
+			goto err_rq_wq_destroy;
+	}
+
 	for (i = 0; i < wq_sz; i++) {
 		struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
 

^ permalink raw reply related

* [net-next V9 PATCH 09/16] i40e: convert to use generic xdp_frame and xdp_return_frame API
From: Jesper Dangaard Brouer @ 2018-04-03 11:08 UTC (permalink / raw)
  To: netdev, BjörnTöpel, magnus.karlsson
  Cc: eugenia, Jason Wang, John Fastabend, Eran Ben Elisha,
	Saeed Mahameed, galp, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, Tariq Toukan
In-Reply-To: <152275360298.1026.10333759008401281682.stgit@firesoul>

Also convert driver i40e, which very recently got XDP_REDIRECT support
in commit d9314c474d4f ("i40e: add support for XDP_REDIRECT").

V7: This patch got added in V7 of this patchset.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c |   20 +++++++++++++++-----
 drivers/net/ethernet/intel/i40e/i40e_txrx.h |    1 +
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index f174c72480ab..96c54cbfb1f9 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -638,7 +638,8 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring,
 		if (tx_buffer->tx_flags & I40E_TX_FLAGS_FD_SB)
 			kfree(tx_buffer->raw_buf);
 		else if (ring_is_xdp(ring))
-			page_frag_free(tx_buffer->raw_buf);
+			xdp_return_frame(tx_buffer->xdpf->data,
+					 &tx_buffer->xdpf->mem);
 		else
 			dev_kfree_skb_any(tx_buffer->skb);
 		if (dma_unmap_len(tx_buffer, len))
@@ -841,7 +842,7 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
 
 		/* free the skb/XDP data */
 		if (ring_is_xdp(tx_ring))
-			page_frag_free(tx_buf->raw_buf);
+			xdp_return_frame(tx_buf->xdpf->data, &tx_buf->xdpf->mem);
 		else
 			napi_consume_skb(tx_buf->skb, napi_budget);
 
@@ -2225,6 +2226,8 @@ static struct sk_buff *i40e_run_xdp(struct i40e_ring *rx_ring,
 	if (!xdp_prog)
 		goto xdp_out;
 
+	prefetchw(xdp->data_hard_start); /* xdp_frame write */
+
 	act = bpf_prog_run_xdp(xdp_prog, xdp);
 	switch (act) {
 	case XDP_PASS:
@@ -3481,25 +3484,32 @@ static inline int i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
 static int i40e_xmit_xdp_ring(struct xdp_buff *xdp,
 			      struct i40e_ring *xdp_ring)
 {
-	u32 size = xdp->data_end - xdp->data;
 	u16 i = xdp_ring->next_to_use;
 	struct i40e_tx_buffer *tx_bi;
 	struct i40e_tx_desc *tx_desc;
+	struct xdp_frame *xdpf;
 	dma_addr_t dma;
+	u32 size;
+
+	xdpf = convert_to_xdp_frame(xdp);
+	if (unlikely(!xdpf))
+		return I40E_XDP_CONSUMED;
+
+	size = xdpf->len;
 
 	if (!unlikely(I40E_DESC_UNUSED(xdp_ring))) {
 		xdp_ring->tx_stats.tx_busy++;
 		return I40E_XDP_CONSUMED;
 	}
 
-	dma = dma_map_single(xdp_ring->dev, xdp->data, size, DMA_TO_DEVICE);
+	dma = dma_map_single(xdp_ring->dev, xdpf->data, size, DMA_TO_DEVICE);
 	if (dma_mapping_error(xdp_ring->dev, dma))
 		return I40E_XDP_CONSUMED;
 
 	tx_bi = &xdp_ring->tx_bi[i];
 	tx_bi->bytecount = size;
 	tx_bi->gso_segs = 1;
-	tx_bi->raw_buf = xdp->data;
+	tx_bi->xdpf = xdpf;
 
 	/* record length, and DMA address */
 	dma_unmap_len_set(tx_bi, len, size);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index 3043483ec426..857b1d743c8d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -306,6 +306,7 @@ static inline unsigned int i40e_txd_use_count(unsigned int size)
 struct i40e_tx_buffer {
 	struct i40e_tx_desc *next_to_watch;
 	union {
+		struct xdp_frame *xdpf;
 		struct sk_buff *skb;
 		void *raw_buf;
 	};

^ permalink raw reply related

* [net-next V9 PATCH 08/16] bpf: cpumap convert to use generic xdp_frame
From: Jesper Dangaard Brouer @ 2018-04-03 11:08 UTC (permalink / raw)
  To: netdev, BjörnTöpel, magnus.karlsson
  Cc: eugenia, Jason Wang, John Fastabend, Eran Ben Elisha,
	Saeed Mahameed, galp, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, Tariq Toukan
In-Reply-To: <152275360298.1026.10333759008401281682.stgit@firesoul>

The generic xdp_frame format was inspired by cpumap's own internal
xdp_pkt format.  It is now time to convert cpumap over to the generic
xdp_frame format.  The cpumap needs one extra field, dev_rx.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 include/net/xdp.h   |    1 +
 kernel/bpf/cpumap.c |  100 ++++++++++++++-------------------------------------
 2 files changed, 29 insertions(+), 72 deletions(-)

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 756c42811e78..ea3773f94f65 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -67,6 +67,7 @@ struct xdp_frame {
 	 * while mem info is valid on remote CPU.
 	 */
 	struct xdp_mem_info mem;
+	struct net_device *dev_rx; /* used by cpumap */
 };
 
 /* Convert xdp_buff to xdp_frame */
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 3e4bbcbe3e86..bcdc4dea5ce7 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -159,52 +159,8 @@ static void cpu_map_kthread_stop(struct work_struct *work)
 	kthread_stop(rcpu->kthread);
 }
 
-/* For now, xdp_pkt is a cpumap internal data structure, with info
- * carried between enqueue to dequeue. It is mapped into the top
- * headroom of the packet, to avoid allocating separate mem.
- */
-struct xdp_pkt {
-	void *data;
-	u16 len;
-	u16 headroom;
-	u16 metasize;
-	/* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time,
-	 * while mem info is valid on remote CPU.
-	 */
-	struct xdp_mem_info mem;
-	struct net_device *dev_rx;
-};
-
-/* Convert xdp_buff to xdp_pkt */
-static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
-{
-	struct xdp_pkt *xdp_pkt;
-	int metasize;
-	int headroom;
-
-	/* Assure headroom is available for storing info */
-	headroom = xdp->data - xdp->data_hard_start;
-	metasize = xdp->data - xdp->data_meta;
-	metasize = metasize > 0 ? metasize : 0;
-	if (unlikely((headroom - metasize) < sizeof(*xdp_pkt)))
-		return NULL;
-
-	/* Store info in top of packet */
-	xdp_pkt = xdp->data_hard_start;
-
-	xdp_pkt->data = xdp->data;
-	xdp_pkt->len  = xdp->data_end - xdp->data;
-	xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);
-	xdp_pkt->metasize = metasize;
-
-	/* rxq only valid until napi_schedule ends, convert to xdp_mem_info */
-	xdp_pkt->mem = xdp->rxq->mem;
-
-	return xdp_pkt;
-}
-
 static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
-					 struct xdp_pkt *xdp_pkt)
+					 struct xdp_frame *xdpf)
 {
 	unsigned int frame_size;
 	void *pkt_data_start;
@@ -219,7 +175,7 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
 	 * would be preferred to set frame_size to 2048 or 4096
 	 * depending on the driver.
 	 *   frame_size = 2048;
-	 *   frame_len  = frame_size - sizeof(*xdp_pkt);
+	 *   frame_len  = frame_size - sizeof(*xdp_frame);
 	 *
 	 * Instead, with info avail, skb_shared_info in placed after
 	 * packet len.  This, unfortunately fakes the truesize.
@@ -227,21 +183,21 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
 	 * is not at a fixed memory location, with mixed length
 	 * packets, which is bad for cache-line hotness.
 	 */
-	frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom +
+	frame_size = SKB_DATA_ALIGN(xdpf->len) + xdpf->headroom +
 		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 
-	pkt_data_start = xdp_pkt->data - xdp_pkt->headroom;
+	pkt_data_start = xdpf->data - xdpf->headroom;
 	skb = build_skb(pkt_data_start, frame_size);
 	if (!skb)
 		return NULL;
 
-	skb_reserve(skb, xdp_pkt->headroom);
-	__skb_put(skb, xdp_pkt->len);
-	if (xdp_pkt->metasize)
-		skb_metadata_set(skb, xdp_pkt->metasize);
+	skb_reserve(skb, xdpf->headroom);
+	__skb_put(skb, xdpf->len);
+	if (xdpf->metasize)
+		skb_metadata_set(skb, xdpf->metasize);
 
 	/* Essential SKB info: protocol and skb->dev */
-	skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx);
+	skb->protocol = eth_type_trans(skb, xdpf->dev_rx);
 
 	/* Optional SKB info, currently missing:
 	 * - HW checksum info		(skb->ip_summed)
@@ -259,11 +215,11 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
 	 * invoked cpu_map_kthread_stop(). Catch any broken behaviour
 	 * gracefully and warn once.
 	 */
-	struct xdp_pkt *xdp_pkt;
+	struct xdp_frame *xdpf;
 
-	while ((xdp_pkt = ptr_ring_consume(ring)))
-		if (WARN_ON_ONCE(xdp_pkt))
-			xdp_return_frame(xdp_pkt, &xdp_pkt->mem);
+	while ((xdpf = ptr_ring_consume(ring)))
+		if (WARN_ON_ONCE(xdpf))
+			xdp_return_frame(xdpf->data, &xdpf->mem);
 }
 
 static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
@@ -290,7 +246,7 @@ static int cpu_map_kthread_run(void *data)
 	 */
 	while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
 		unsigned int processed = 0, drops = 0, sched = 0;
-		struct xdp_pkt *xdp_pkt;
+		struct xdp_frame *xdpf;
 
 		/* Release CPU reschedule checks */
 		if (__ptr_ring_empty(rcpu->queue)) {
@@ -313,13 +269,13 @@ static int cpu_map_kthread_run(void *data)
 		 * kthread CPU pinned. Lockless access to ptr_ring
 		 * consume side valid as no-resize allowed of queue.
 		 */
-		while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) {
+		while ((xdpf = __ptr_ring_consume(rcpu->queue))) {
 			struct sk_buff *skb;
 			int ret;
 
-			skb = cpu_map_build_skb(rcpu, xdp_pkt);
+			skb = cpu_map_build_skb(rcpu, xdpf);
 			if (!skb) {
-				xdp_return_frame(xdp_pkt, &xdp_pkt->mem);
+				xdp_return_frame(xdpf->data, &xdpf->mem);
 				continue;
 			}
 
@@ -616,13 +572,13 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 	spin_lock(&q->producer_lock);
 
 	for (i = 0; i < bq->count; i++) {
-		struct xdp_pkt *xdp_pkt = bq->q[i];
+		struct xdp_frame *xdpf = bq->q[i];
 		int err;
 
-		err = __ptr_ring_produce(q, xdp_pkt);
+		err = __ptr_ring_produce(q, xdpf);
 		if (err) {
 			drops++;
-			xdp_return_frame(xdp_pkt->data, &xdp_pkt->mem);
+			xdp_return_frame(xdpf->data, &xdpf->mem);
 		}
 		processed++;
 	}
@@ -637,7 +593,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 /* Runs under RCU-read-side, plus in softirq under NAPI protection.
  * Thus, safe percpu variable access.
  */
-static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
+static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
 {
 	struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
 
@@ -648,28 +604,28 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
 	 * driver to code invoking us to finished, due to driver
 	 * (e.g. ixgbe) recycle tricks based on page-refcnt.
 	 *
-	 * Thus, incoming xdp_pkt is always queued here (else we race
+	 * Thus, incoming xdp_frame is always queued here (else we race
 	 * with another CPU on page-refcnt and remaining driver code).
 	 * Queue time is very short, as driver will invoke flush
 	 * operation, when completing napi->poll call.
 	 */
-	bq->q[bq->count++] = xdp_pkt;
+	bq->q[bq->count++] = xdpf;
 	return 0;
 }
 
 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
 		    struct net_device *dev_rx)
 {
-	struct xdp_pkt *xdp_pkt;
+	struct xdp_frame *xdpf;
 
-	xdp_pkt = convert_to_xdp_pkt(xdp);
-	if (unlikely(!xdp_pkt))
+	xdpf = convert_to_xdp_frame(xdp);
+	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
 	/* Info needed when constructing SKB on remote CPU */
-	xdp_pkt->dev_rx = dev_rx;
+	xdpf->dev_rx = dev_rx;
 
-	bq_enqueue(rcpu, xdp_pkt);
+	bq_enqueue(rcpu, xdpf);
 	return 0;
 }
 

^ permalink raw reply related

* [net-next V9 PATCH 07/16] virtio_net: convert to use generic xdp_frame and xdp_return_frame API
From: Jesper Dangaard Brouer @ 2018-04-03 11:08 UTC (permalink / raw)
  To: netdev, BjörnTöpel, magnus.karlsson
  Cc: eugenia, Jason Wang, John Fastabend, Eran Ben Elisha,
	Saeed Mahameed, galp, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, Tariq Toukan
In-Reply-To: <152275360298.1026.10333759008401281682.stgit@firesoul>

The virtio_net driver assumes XDP frames are always released based on
page refcnt (via put_page).  Thus, it only queues the XDP data pointer
address and uses virt_to_head_page() to retrieve struct page.

Use the XDP return API to get away from such assumptions. Instead
queue an xdp_frame, which allows us to use the xdp_return_frame API
when releasing the frame.

V8: Avoid endianness issues (found by kbuild test robot)
V9: Change __virtnet_xdp_xmit from bool to int return value (found by Dan Carpenter)

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 drivers/net/virtio_net.c |   54 +++++++++++++++++++++++++---------------------
 1 file changed, 29 insertions(+), 25 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 7b187ec7411e..f50e1ad81ad4 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -415,38 +415,48 @@ static void virtnet_xdp_flush(struct net_device *dev)
 	virtqueue_kick(sq->vq);
 }
 
-static bool __virtnet_xdp_xmit(struct virtnet_info *vi,
-			       struct xdp_buff *xdp)
+static int __virtnet_xdp_xmit(struct virtnet_info *vi,
+			      struct xdp_buff *xdp)
 {
 	struct virtio_net_hdr_mrg_rxbuf *hdr;
-	unsigned int len;
+	struct xdp_frame *xdpf, *xdpf_sent;
 	struct send_queue *sq;
+	unsigned int len;
 	unsigned int qp;
-	void *xdp_sent;
 	int err;
 
 	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
 	sq = &vi->sq[qp];
 
 	/* Free up any pending old buffers before queueing new ones. */
-	while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
-		struct page *sent_page = virt_to_head_page(xdp_sent);
+	while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
+		xdp_return_frame(xdpf_sent->data, &xdpf_sent->mem);
 
-		put_page(sent_page);
-	}
+	xdpf = convert_to_xdp_frame(xdp);
+	if (unlikely(!xdpf))
+		return -EOVERFLOW;
+
+	/* virtqueue want to use data area in-front of packet */
+	if (unlikely(xdpf->metasize > 0))
+		return -EOPNOTSUPP;
 
-	xdp->data -= vi->hdr_len;
+	if (unlikely(xdpf->headroom < vi->hdr_len))
+		return -EOVERFLOW;
+
+	/* Make room for virtqueue hdr (also change xdpf->headroom?) */
+	xdpf->data -= vi->hdr_len;
 	/* Zero header and leave csum up to XDP layers */
-	hdr = xdp->data;
+	hdr = xdpf->data;
 	memset(hdr, 0, vi->hdr_len);
+	xdpf->len   += vi->hdr_len;
 
-	sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
+	sg_init_one(sq->sg, xdpf->data, xdpf->len);
 
-	err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdp->data, GFP_ATOMIC);
+	err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdpf, GFP_ATOMIC);
 	if (unlikely(err))
-		return false; /* Caller handle free/refcnt */
+		return -ENOSPC; /* Caller handle free/refcnt */
 
-	return true;
+	return 0;
 }
 
 static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
@@ -454,7 +464,6 @@ static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
 	struct virtnet_info *vi = netdev_priv(dev);
 	struct receive_queue *rq = vi->rq;
 	struct bpf_prog *xdp_prog;
-	bool sent;
 
 	/* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
 	 * indicate XDP resources have been successfully allocated.
@@ -463,10 +472,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
 	if (!xdp_prog)
 		return -ENXIO;
 
-	sent = __virtnet_xdp_xmit(vi, xdp);
-	if (!sent)
-		return -ENOSPC;
-	return 0;
+	return __virtnet_xdp_xmit(vi, xdp);
 }
 
 static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
@@ -555,7 +561,6 @@ static struct sk_buff *receive_small(struct net_device *dev,
 	struct page *page = virt_to_head_page(buf);
 	unsigned int delta = 0;
 	struct page *xdp_page;
-	bool sent;
 	int err;
 
 	len -= vi->hdr_len;
@@ -606,8 +611,8 @@ static struct sk_buff *receive_small(struct net_device *dev,
 			delta = orig_data - xdp.data;
 			break;
 		case XDP_TX:
-			sent = __virtnet_xdp_xmit(vi, &xdp);
-			if (unlikely(!sent)) {
+			err = __virtnet_xdp_xmit(vi, &xdp);
+			if (unlikely(err)) {
 				trace_xdp_exception(vi->dev, xdp_prog, act);
 				goto err_xdp;
 			}
@@ -690,7 +695,6 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 	struct bpf_prog *xdp_prog;
 	unsigned int truesize;
 	unsigned int headroom = mergeable_ctx_to_headroom(ctx);
-	bool sent;
 	int err;
 
 	head_skb = NULL;
@@ -762,8 +766,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			}
 			break;
 		case XDP_TX:
-			sent = __virtnet_xdp_xmit(vi, &xdp);
-			if (unlikely(!sent)) {
+			err = __virtnet_xdp_xmit(vi, &xdp);
+			if (unlikely(err)) {
 				trace_xdp_exception(vi->dev, xdp_prog, act);
 				if (unlikely(xdp_page != page))
 					put_page(xdp_page);

^ permalink raw reply related

* [net-next V9 PATCH 06/16] tun: convert to use generic xdp_frame and xdp_return_frame API
From: Jesper Dangaard Brouer @ 2018-04-03 11:08 UTC (permalink / raw)
  To: netdev, BjörnTöpel, magnus.karlsson
  Cc: eugenia, Jason Wang, John Fastabend, Eran Ben Elisha,
	Saeed Mahameed, galp, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, Tariq Toukan
In-Reply-To: <152275360298.1026.10333759008401281682.stgit@firesoul>

From: Jesper Dangaard Brouer <brouer@redhat.com>

The tuntap driver invented its own driver-specific way of queuing
XDP packets, by storing the xdp_buff information in the top of
the XDP frame data.

Convert it over to use the more generic xdp_frame structure.  The
main problem with the in-driver method is that the xdp_rxq_info pointer
cannot be trusted/used when dequeueing the frame.

V3: Remove check based on feedback from Jason

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 drivers/net/tun.c      |   43 ++++++++++++++++++++-----------------------
 drivers/vhost/net.c    |    7 ++++---
 include/linux/if_tun.h |    4 ++--
 3 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index a1ba262f40ad..714735c6d3ff 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -248,11 +248,11 @@ struct veth {
 	__be16 h_vlan_TCI;
 };
 
-bool tun_is_xdp_buff(void *ptr)
+bool tun_is_xdp_frame(void *ptr)
 {
 	return (unsigned long)ptr & TUN_XDP_FLAG;
 }
-EXPORT_SYMBOL(tun_is_xdp_buff);
+EXPORT_SYMBOL(tun_is_xdp_frame);
 
 void *tun_xdp_to_ptr(void *ptr)
 {
@@ -660,10 +660,10 @@ void tun_ptr_free(void *ptr)
 {
 	if (!ptr)
 		return;
-	if (tun_is_xdp_buff(ptr)) {
-		struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
+	if (tun_is_xdp_frame(ptr)) {
+		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 
-		put_page(virt_to_head_page(xdp->data));
+		xdp_return_frame(xdpf->data, &xdpf->mem);
 	} else {
 		__skb_array_destroy_skb(ptr);
 	}
@@ -1291,17 +1291,14 @@ static const struct net_device_ops tun_netdev_ops = {
 static int tun_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
 {
 	struct tun_struct *tun = netdev_priv(dev);
-	struct xdp_buff *buff = xdp->data_hard_start;
-	int headroom = xdp->data - xdp->data_hard_start;
+	struct xdp_frame *frame;
 	struct tun_file *tfile;
 	u32 numqueues;
 	int ret = 0;
 
-	/* Assure headroom is available and buff is properly aligned */
-	if (unlikely(headroom < sizeof(*xdp) || tun_is_xdp_buff(xdp)))
-		return -ENOSPC;
-
-	*buff = *xdp;
+	frame = convert_to_xdp_frame(xdp);
+	if (unlikely(!frame))
+		return -EOVERFLOW;
 
 	rcu_read_lock();
 
@@ -1316,7 +1313,7 @@ static int tun_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
 	/* Encode the XDP flag into lowest bit for consumer to differ
 	 * XDP buffer from sk_buff.
 	 */
-	if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(buff))) {
+	if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(frame))) {
 		this_cpu_inc(tun->pcpu_stats->tx_dropped);
 		ret = -ENOSPC;
 	}
@@ -1994,11 +1991,11 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 static ssize_t tun_put_user_xdp(struct tun_struct *tun,
 				struct tun_file *tfile,
-				struct xdp_buff *xdp,
+				struct xdp_frame *xdp_frame,
 				struct iov_iter *iter)
 {
 	int vnet_hdr_sz = 0;
-	size_t size = xdp->data_end - xdp->data;
+	size_t size = xdp_frame->len;
 	struct tun_pcpu_stats *stats;
 	size_t ret;
 
@@ -2014,7 +2011,7 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun,
 		iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
 	}
 
-	ret = copy_to_iter(xdp->data, size, iter) + vnet_hdr_sz;
+	ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz;
 
 	stats = get_cpu_ptr(tun->pcpu_stats);
 	u64_stats_update_begin(&stats->syncp);
@@ -2182,11 +2179,11 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 			return err;
 	}
 
-	if (tun_is_xdp_buff(ptr)) {
-		struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
+	if (tun_is_xdp_frame(ptr)) {
+		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 
-		ret = tun_put_user_xdp(tun, tfile, xdp, to);
-		put_page(virt_to_head_page(xdp->data));
+		ret = tun_put_user_xdp(tun, tfile, xdpf, to);
+		xdp_return_frame(xdpf->data, &xdpf->mem);
 	} else {
 		struct sk_buff *skb = ptr;
 
@@ -2425,10 +2422,10 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
 static int tun_ptr_peek_len(void *ptr)
 {
 	if (likely(ptr)) {
-		if (tun_is_xdp_buff(ptr)) {
-			struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
+		if (tun_is_xdp_frame(ptr)) {
+			struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 
-			return xdp->data_end - xdp->data;
+			return xdpf->len;
 		}
 		return __skb_array_len_with_tag(ptr);
 	} else {
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index edc6fec9ad84..a29df80bf5b0 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -32,6 +32,7 @@
 #include <linux/skbuff.h>
 
 #include <net/sock.h>
+#include <net/xdp.h>
 
 #include "vhost.h"
 
@@ -177,10 +178,10 @@ static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq)
 
 static int vhost_net_buf_peek_len(void *ptr)
 {
-	if (tun_is_xdp_buff(ptr)) {
-		struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
+	if (tun_is_xdp_frame(ptr)) {
+		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 
-		return xdp->data_end - xdp->data;
+		return xdpf->len;
 	}
 
 	return __skb_array_len_with_tag(ptr);
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index fd00170b494f..3d2996dc7d85 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -22,7 +22,7 @@
 #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
 struct socket *tun_get_socket(struct file *);
 struct ptr_ring *tun_get_tx_ring(struct file *file);
-bool tun_is_xdp_buff(void *ptr);
+bool tun_is_xdp_frame(void *ptr);
 void *tun_xdp_to_ptr(void *ptr);
 void *tun_ptr_to_xdp(void *ptr);
 void tun_ptr_free(void *ptr);
@@ -39,7 +39,7 @@ static inline struct ptr_ring *tun_get_tx_ring(struct file *f)
 {
 	return ERR_PTR(-EINVAL);
 }
-static inline bool tun_is_xdp_buff(void *ptr)
+static inline bool tun_is_xdp_frame(void *ptr)
 {
 	return false;
 }

^ permalink raw reply related

* [net-next V9 PATCH 05/16] xdp: introduce a new xdp_frame type
From: Jesper Dangaard Brouer @ 2018-04-03 11:08 UTC (permalink / raw)
  To: netdev, BjörnTöpel, magnus.karlsson
  Cc: eugenia, Jason Wang, John Fastabend, Eran Ben Elisha,
	Saeed Mahameed, galp, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, Tariq Toukan
In-Reply-To: <152275360298.1026.10333759008401281682.stgit@firesoul>

This is needed to convert drivers tuntap and virtio_net.

This is a generalization of what is done inside cpumap, which will be
converted later.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 include/net/xdp.h |   40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 15f8ade008b5..756c42811e78 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -58,6 +58,46 @@ struct xdp_buff {
 	struct xdp_rxq_info *rxq;
 };
 
+struct xdp_frame {
+	void *data;
+	u16 len;
+	u16 headroom;
+	u16 metasize;
+	/* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time,
+	 * while mem info is valid on remote CPU.
+	 */
+	struct xdp_mem_info mem;
+};
+
+/* Convert xdp_buff to xdp_frame */
+static inline
+struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
+{
+	struct xdp_frame *xdp_frame;
+	int metasize;
+	int headroom;
+
+	/* Assure headroom is available for storing info */
+	headroom = xdp->data - xdp->data_hard_start;
+	metasize = xdp->data - xdp->data_meta;
+	metasize = metasize > 0 ? metasize : 0;
+	if (unlikely((headroom - metasize) < sizeof(*xdp_frame)))
+		return NULL;
+
+	/* Store info in top of packet */
+	xdp_frame = xdp->data_hard_start;
+
+	xdp_frame->data = xdp->data;
+	xdp_frame->len  = xdp->data_end - xdp->data;
+	xdp_frame->headroom = headroom - sizeof(*xdp_frame);
+	xdp_frame->metasize = metasize;
+
+	/* rxq only valid until napi_schedule ends, convert to xdp_mem_info */
+	xdp_frame->mem = xdp->rxq->mem;
+
+	return xdp_frame;
+}
+
 static inline
 void xdp_return_frame(void *data, struct xdp_mem_info *mem)
 {

^ permalink raw reply related

* [net-next V9 PATCH 04/16] xdp: move struct xdp_buff from filter.h to xdp.h
From: Jesper Dangaard Brouer @ 2018-04-03 11:07 UTC (permalink / raw)
  To: netdev, BjörnTöpel, magnus.karlsson
  Cc: eugenia, Jason Wang, John Fastabend, Eran Ben Elisha,
	Saeed Mahameed, galp, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, Tariq Toukan
In-Reply-To: <152275360298.1026.10333759008401281682.stgit@firesoul>

This is done to prepare for the next patch, and it is also
nice to move this XDP related struct out of filter.h.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 include/linux/filter.h |   24 +-----------------------
 include/net/xdp.h      |   22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index fc4e8f91b03d..4da8b2308174 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -30,6 +30,7 @@ struct sock;
 struct seccomp_data;
 struct bpf_prog_aux;
 struct xdp_rxq_info;
+struct xdp_buff;
 
 /* ArgX, context and stack frame pointer register positions. Note,
  * Arg1, Arg2, Arg3, etc are used as argument mappings of function
@@ -500,14 +501,6 @@ struct bpf_skb_data_end {
 	void *data_end;
 };
 
-struct xdp_buff {
-	void *data;
-	void *data_end;
-	void *data_meta;
-	void *data_hard_start;
-	struct xdp_rxq_info *rxq;
-};
-
 struct sk_msg_buff {
 	void *data;
 	void *data_end;
@@ -772,21 +765,6 @@ int xdp_do_redirect(struct net_device *dev,
 		    struct bpf_prog *prog);
 void xdp_do_flush_map(void);
 
-/* Drivers not supporting XDP metadata can use this helper, which
- * rejects any room expansion for metadata as a result.
- */
-static __always_inline void
-xdp_set_data_meta_invalid(struct xdp_buff *xdp)
-{
-	xdp->data_meta = xdp->data + 1;
-}
-
-static __always_inline bool
-xdp_data_meta_unsupported(const struct xdp_buff *xdp)
-{
-	return unlikely(xdp->data_meta > xdp->data);
-}
-
 void bpf_warn_invalid_xdp_action(u32 act);
 
 struct sock *do_sk_redirect_map(struct sk_buff *skb);
diff --git a/include/net/xdp.h b/include/net/xdp.h
index e4207699c410..15f8ade008b5 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -50,6 +50,13 @@ struct xdp_rxq_info {
 	struct xdp_mem_info mem;
 } ____cacheline_aligned; /* perf critical, avoid false-sharing */
 
+struct xdp_buff {
+	void *data;
+	void *data_end;
+	void *data_meta;
+	void *data_hard_start;
+	struct xdp_rxq_info *rxq;
+};
 
 static inline
 void xdp_return_frame(void *data, struct xdp_mem_info *mem)
@@ -72,4 +79,19 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq);
 int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
 			       enum xdp_mem_type type, void *allocator);
 
+/* Drivers not supporting XDP metadata can use this helper, which
+ * rejects any room expansion for metadata as a result.
+ */
+static __always_inline void
+xdp_set_data_meta_invalid(struct xdp_buff *xdp)
+{
+	xdp->data_meta = xdp->data + 1;
+}
+
+static __always_inline bool
+xdp_data_meta_unsupported(const struct xdp_buff *xdp)
+{
+	return unlikely(xdp->data_meta > xdp->data);
+}
+
 #endif /* __LINUX_NET_XDP_H__ */

^ permalink raw reply related

* [net-next V9 PATCH 03/16] ixgbe: use xdp_return_frame API
From: Jesper Dangaard Brouer @ 2018-04-03 11:07 UTC (permalink / raw)
  To: netdev, BjörnTöpel, magnus.karlsson
  Cc: eugenia, Jason Wang, John Fastabend, Eran Ben Elisha,
	Saeed Mahameed, galp, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, Tariq Toukan
In-Reply-To: <152275360298.1026.10333759008401281682.stgit@firesoul>

Extend struct ixgbe_tx_buffer to store the xdp_mem_info.

Notice that this could be optimized further by putting this into
a union in the struct ixgbe_tx_buffer, but this patchset
works towards removing this again.  Thus, this is not done.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h      |    1 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |    6 ++++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 4f08c712e58e..abb5248e917e 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -250,6 +250,7 @@ struct ixgbe_tx_buffer {
 	DEFINE_DMA_UNMAP_ADDR(dma);
 	DEFINE_DMA_UNMAP_LEN(len);
 	u32 tx_flags;
+	struct xdp_mem_info xdp_mem;
 };
 
 struct ixgbe_rx_buffer {
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index afadba99f7b8..0bfe6cf2bf8b 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1216,7 +1216,7 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
 
 		/* free the skb */
 		if (ring_is_xdp(tx_ring))
-			page_frag_free(tx_buffer->data);
+			xdp_return_frame(tx_buffer->data, &tx_buffer->xdp_mem);
 		else
 			napi_consume_skb(tx_buffer->skb, napi_budget);
 
@@ -5797,7 +5797,7 @@ static void ixgbe_clean_tx_ring(struct ixgbe_ring *tx_ring)
 
 		/* Free all the Tx ring sk_buffs */
 		if (ring_is_xdp(tx_ring))
-			page_frag_free(tx_buffer->data);
+			xdp_return_frame(tx_buffer->data, &tx_buffer->xdp_mem);
 		else
 			dev_kfree_skb_any(tx_buffer->skb);
 
@@ -8366,6 +8366,8 @@ static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
 	dma_unmap_len_set(tx_buffer, len, len);
 	dma_unmap_addr_set(tx_buffer, dma, dma);
 	tx_buffer->data = xdp->data;
+	tx_buffer->xdp_mem = xdp->rxq->mem;
+
 	tx_desc->read.buffer_addr = cpu_to_le64(dma);
 
 	/* put descriptor type bits */

^ permalink raw reply related

* [net-next V9 PATCH 02/16] xdp: introduce xdp_return_frame API and use in cpumap
From: Jesper Dangaard Brouer @ 2018-04-03 11:07 UTC (permalink / raw)
  To: netdev, BjörnTöpel, magnus.karlsson
  Cc: eugenia, Jason Wang, John Fastabend, Eran Ben Elisha,
	Saeed Mahameed, galp, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, Tariq Toukan
In-Reply-To: <152275360298.1026.10333759008401281682.stgit@firesoul>

Introduce an xdp_return_frame API, and convert over cpumap as
the first user, given it has a queued XDP frame structure to leverage.

V3: Cleanup and remove C99 style comments, pointed out by Alex Duyck.
V6: Remove comment that id will be added later (Req by Alex Duyck)
V8: Rename enum mem_type to xdp_mem_type (found by kbuild test robot)

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 include/net/xdp.h   |   27 +++++++++++++++++++++++
 kernel/bpf/cpumap.c |   60 +++++++++++++++++++++++++++++++--------------------
 net/core/xdp.c      |   18 +++++++++++++++
 3 files changed, 81 insertions(+), 24 deletions(-)

diff --git a/include/net/xdp.h b/include/net/xdp.h
index b2362ddfa694..e4207699c410 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -33,16 +33,43 @@
  * also mandatory during RX-ring setup.
  */
 
+enum xdp_mem_type {
+	MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */
+	MEM_TYPE_PAGE_ORDER0,     /* Orig XDP full page model */
+	MEM_TYPE_MAX,
+};
+
+struct xdp_mem_info {
+	u32 type; /* enum xdp_mem_type, but known size type */
+};
+
 struct xdp_rxq_info {
 	struct net_device *dev;
 	u32 queue_index;
 	u32 reg_state;
+	struct xdp_mem_info mem;
 } ____cacheline_aligned; /* perf critical, avoid false-sharing */
 
+
+static inline
+void xdp_return_frame(void *data, struct xdp_mem_info *mem)
+{
+	if (mem->type == MEM_TYPE_PAGE_SHARED)
+		page_frag_free(data);
+
+	if (mem->type == MEM_TYPE_PAGE_ORDER0) {
+		struct page *page = virt_to_page(data); /* Assumes order0 page*/
+
+		put_page(page);
+	}
+}
+
 int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
 		     struct net_device *dev, u32 queue_index);
 void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq);
 void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq);
 bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq);
+int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
+			       enum xdp_mem_type type, void *allocator);
 
 #endif /* __LINUX_NET_XDP_H__ */
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index a4bb0b34375a..3e4bbcbe3e86 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -19,6 +19,7 @@
 #include <linux/bpf.h>
 #include <linux/filter.h>
 #include <linux/ptr_ring.h>
+#include <net/xdp.h>
 
 #include <linux/sched.h>
 #include <linux/workqueue.h>
@@ -137,27 +138,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 	return ERR_PTR(err);
 }
 
-static void __cpu_map_queue_destructor(void *ptr)
-{
-	/* The tear-down procedure should have made sure that queue is
-	 * empty.  See __cpu_map_entry_replace() and work-queue
-	 * invoked cpu_map_kthread_stop(). Catch any broken behaviour
-	 * gracefully and warn once.
-	 */
-	if (WARN_ON_ONCE(ptr))
-		page_frag_free(ptr);
-}
-
-static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
-{
-	if (atomic_dec_and_test(&rcpu->refcnt)) {
-		/* The queue should be empty at this point */
-		ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor);
-		kfree(rcpu->queue);
-		kfree(rcpu);
-	}
-}
-
 static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
 {
 	atomic_inc(&rcpu->refcnt);
@@ -188,6 +168,10 @@ struct xdp_pkt {
 	u16 len;
 	u16 headroom;
 	u16 metasize;
+	/* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time,
+	 * while mem info is valid on remote CPU.
+	 */
+	struct xdp_mem_info mem;
 	struct net_device *dev_rx;
 };
 
@@ -213,6 +197,9 @@ static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
 	xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);
 	xdp_pkt->metasize = metasize;
 
+	/* rxq only valid until napi_schedule ends, convert to xdp_mem_info */
+	xdp_pkt->mem = xdp->rxq->mem;
+
 	return xdp_pkt;
 }
 
@@ -265,6 +252,31 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
 	return skb;
 }
 
+static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
+{
+	/* The tear-down procedure should have made sure that queue is
+	 * empty.  See __cpu_map_entry_replace() and work-queue
+	 * invoked cpu_map_kthread_stop(). Catch any broken behaviour
+	 * gracefully and warn once.
+	 */
+	struct xdp_pkt *xdp_pkt;
+
+	while ((xdp_pkt = ptr_ring_consume(ring)))
+		if (WARN_ON_ONCE(xdp_pkt))
+			xdp_return_frame(xdp_pkt, &xdp_pkt->mem);
+}
+
+static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+{
+	if (atomic_dec_and_test(&rcpu->refcnt)) {
+		/* The queue should be empty at this point */
+		__cpu_map_ring_cleanup(rcpu->queue);
+		ptr_ring_cleanup(rcpu->queue, NULL);
+		kfree(rcpu->queue);
+		kfree(rcpu);
+	}
+}
+
 static int cpu_map_kthread_run(void *data)
 {
 	struct bpf_cpu_map_entry *rcpu = data;
@@ -307,7 +319,7 @@ static int cpu_map_kthread_run(void *data)
 
 			skb = cpu_map_build_skb(rcpu, xdp_pkt);
 			if (!skb) {
-				page_frag_free(xdp_pkt);
+				xdp_return_frame(xdp_pkt, &xdp_pkt->mem);
 				continue;
 			}
 
@@ -604,13 +616,13 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 	spin_lock(&q->producer_lock);
 
 	for (i = 0; i < bq->count; i++) {
-		void *xdp_pkt = bq->q[i];
+		struct xdp_pkt *xdp_pkt = bq->q[i];
 		int err;
 
 		err = __ptr_ring_produce(q, xdp_pkt);
 		if (err) {
 			drops++;
-			page_frag_free(xdp_pkt); /* Free xdp_pkt */
+			xdp_return_frame(xdp_pkt->data, &xdp_pkt->mem);
 		}
 		processed++;
 	}
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 097a0f74e004..7e6b3545277d 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -71,3 +71,21 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq)
 	return (xdp_rxq->reg_state == REG_STATE_REGISTERED);
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg);
+
+int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
+			       enum xdp_mem_type type, void *allocator)
+{
+	if (type >= MEM_TYPE_MAX)
+		return -EINVAL;
+
+	xdp_rxq->mem.type = type;
+
+	if (allocator)
+		return -EOPNOTSUPP;
+
+	/* TODO: Allocate an ID that maps to allocator pointer
+	 * See: https://www.kernel.org/doc/html/latest/core-api/idr.html
+	 */
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);

^ permalink raw reply related

* [net-next V9 PATCH 00/16] XDP redirect memory return API
From: Jesper Dangaard Brouer @ 2018-04-03 11:07 UTC (permalink / raw)
  To: netdev, BjörnTöpel, magnus.karlsson
  Cc: eugenia, Jason Wang, John Fastabend, Eran Ben Elisha,
	Saeed Mahameed, galp, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, Tariq Toukan

This is V9, but it's worth mentioning that V8 was sent against
net-next, because i40e got XDP_REDIRECT support in-between V6, and it
doesn't exist in bpf-next yet.  Most significant change in V8 was that
page_pool only gets compiled into the kernel when a driver's Kconfig
'select's the feature.

This patchset works towards supporting different XDP RX-ring memory
allocators, as this will be needed by the AF_XDP zero-copy mode.

The patchset uses mlx5 as the sample driver, which gets XDP_REDIRECT
RX-mode implemented, but not ndo_xdp_xmit (as this API is subject to
change throughout the patchset).

A new struct xdp_frame is introduced (modeled after cpumap xdp_pkt).
And both ndo_xdp_xmit and the new xdp_return_frame end-up using this.

Support for a driver supplied allocator is implemented, and a
refurbished version of page_pool is the first return allocator type
introduced.  This will be an integration point for AF_XDP zero-copy.

The mlx5 driver evolves into using the page_pool, and sees a performance
increase (with ndo_xdp_xmit out ixgbe driver) from 6Mpps to 12Mpps.


The patchset stops at 16 patches (one over limit), but more API changes
are planned, specifically extending the ndo_xdp_xmit and xdp_return_frame
APIs to support bulking, as this will address some known limits.

V2: Updated according to Tariq's feedback
V3: Updated based on feedback from Jason Wang and Alex Duyck
V4: Updated based on feedback from Tariq and Jason
V5: Fix SPDX license, add Tariq's reviews, improve patch desc for perf test
V6: Updated based on feedback from Eric Dumazet and Alex Duyck
V7: Adapt to i40e that got XDP_REDIRECT support in-between
V8: Updated based on feedback kbuild test robot, and adjust for mlx5 changes
V9:
 Remove some inline statements, let compiler decide what to inline
 Fix return value in virtio_net driver
 Adjust for mlx5 changes in-between submissions

---

Jesper Dangaard Brouer (16):
      mlx5: basic XDP_REDIRECT forward support
      xdp: introduce xdp_return_frame API and use in cpumap
      ixgbe: use xdp_return_frame API
      xdp: move struct xdp_buff from filter.h to xdp.h
      xdp: introduce a new xdp_frame type
      tun: convert to use generic xdp_frame and xdp_return_frame API
      virtio_net: convert to use generic xdp_frame and xdp_return_frame API
      bpf: cpumap convert to use generic xdp_frame
      i40e: convert to use generic xdp_frame and xdp_return_frame API
      mlx5: register a memory model when XDP is enabled
      xdp: rhashtable with allocator ID to pointer mapping
      page_pool: refurbish version of page_pool code
      xdp: allow page_pool as an allocator type in xdp_return_frame
      mlx5: use page_pool for xdp_return_frame call
      xdp: transition into using xdp_frame for return API
      xdp: transition into using xdp_frame for ndo_xdp_xmit


 drivers/net/ethernet/intel/i40e/i40e_txrx.c       |   33 ++
 drivers/net/ethernet/intel/i40e/i40e_txrx.h       |    3 
 drivers/net/ethernet/intel/ixgbe/ixgbe.h          |    3 
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c     |   37 ++
 drivers/net/ethernet/mellanox/mlx5/core/Kconfig   |    1 
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |    4 
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |   37 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   |   42 ++-
 drivers/net/tun.c                                 |   60 ++--
 drivers/net/virtio_net.c                          |   67 +++-
 drivers/vhost/net.c                               |    7 
 include/linux/filter.h                            |   24 --
 include/linux/if_tun.h                            |    4 
 include/linux/netdevice.h                         |    4 
 include/net/page_pool.h                           |  143 +++++++++
 include/net/xdp.h                                 |   83 +++++
 kernel/bpf/cpumap.c                               |  132 +++------
 net/Kconfig                                       |    3 
 net/core/Makefile                                 |    1 
 net/core/filter.c                                 |   17 +
 net/core/page_pool.c                              |  317 +++++++++++++++++++++
 net/core/xdp.c                                    |  269 ++++++++++++++++++
 22 files changed, 1093 insertions(+), 198 deletions(-)
 create mode 100644 include/net/page_pool.h
 create mode 100644 net/core/page_pool.c

^ permalink raw reply

* [net-next V9 PATCH 01/16] mlx5: basic XDP_REDIRECT forward support
From: Jesper Dangaard Brouer @ 2018-04-03 11:07 UTC (permalink / raw)
  To: netdev, BjörnTöpel, magnus.karlsson
  Cc: eugenia, Jason Wang, John Fastabend, Eran Ben Elisha,
	Saeed Mahameed, galp, Jesper Dangaard Brouer, Daniel Borkmann,
	Alexei Starovoitov, Tariq Toukan
In-Reply-To: <152275360298.1026.10333759008401281682.stgit@firesoul>

This implements basic XDP redirect support in mlx5 driver.

Notice that the ndo_xdp_xmit() is NOT implemented, because that API
needs some changes that this patchset is working towards.

The main purpose of this patch is to have different drivers doing
XDP_REDIRECT to show how different memory models behave in a cross
driver world.

Update(pre-RFCv2 Tariq): Need to DMA unmap page before xdp_do_redirect,
as the return API does not exist yet to keep this mapped.

Update(pre-RFCv3 Saeed): Don't mix XDP_TX and XDP_REDIRECT flushing,
introduce xdpsq.db.redirect_flush boolean.

V9: Adjust for commit 121e89275471 ("net/mlx5e: Refactor RQ XDP_TX indication")

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h    |    1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c |   27 ++++++++++++++++++++---
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 30cad07be2b5..1a05d1072c5e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -392,6 +392,7 @@ struct mlx5e_xdpsq {
 	struct {
 		struct mlx5e_dma_info     *di;
 		bool                       doorbell;
+		bool                       redirect_flush;
 	} db;
 
 	/* read only */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 176645762e49..0e24be05907f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -236,14 +236,20 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
 	return 0;
 }
 
+static void mlx5e_page_dma_unmap(struct mlx5e_rq *rq,
+					struct mlx5e_dma_info *dma_info)
+{
+	dma_unmap_page(rq->pdev, dma_info->addr, RQ_PAGE_SIZE(rq),
+		       rq->buff.map_dir);
+}
+
 void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
 			bool recycle)
 {
 	if (likely(recycle) && mlx5e_rx_cache_put(rq, dma_info))
 		return;
 
-	dma_unmap_page(rq->pdev, dma_info->addr, RQ_PAGE_SIZE(rq),
-		       rq->buff.map_dir);
+	mlx5e_page_dma_unmap(rq, dma_info);
 	put_page(dma_info->page);
 }
 
@@ -800,9 +806,10 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq,
 				   struct mlx5e_dma_info *di,
 				   void *va, u16 *rx_headroom, u32 *len)
 {
-	const struct bpf_prog *prog = READ_ONCE(rq->xdp_prog);
+	struct bpf_prog *prog = READ_ONCE(rq->xdp_prog);
 	struct xdp_buff xdp;
 	u32 act;
+	int err;
 
 	if (!prog)
 		return false;
@@ -823,6 +830,15 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq,
 		if (unlikely(!mlx5e_xmit_xdp_frame(rq, di, &xdp)))
 			trace_xdp_exception(rq->netdev, prog, act);
 		return true;
+	case XDP_REDIRECT:
+		/* When XDP enabled then page-refcnt==1 here */
+		err = xdp_do_redirect(rq->netdev, &xdp, prog);
+		if (!err) {
+			__set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags);
+			rq->xdpsq.db.redirect_flush = true;
+			mlx5e_page_dma_unmap(rq, di);
+		}
+		return true;
 	default:
 		bpf_warn_invalid_xdp_action(act);
 	case XDP_ABORTED:
@@ -1140,6 +1156,11 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
 		xdpsq->db.doorbell = false;
 	}
 
+	if (xdpsq->db.redirect_flush) {
+		xdp_do_flush_map();
+		xdpsq->db.redirect_flush = false;
+	}
+
 	mlx5_cqwq_update_db_record(&cq->wq);
 
 	/* ensure cq space is freed before enabling more cqes */

^ permalink raw reply related

* Re: [GIT PULL] remove in-kernel calls to syscalls
From: Ingo Molnar @ 2018-04-03 10:53 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Dominik Brodowski, Linux Kernel Mailing List, Al Viro,
	Arnd Bergmann, linux-arch, hmclauchlan, tautschn, Amir Goldstein,
	Andi Kleen, Andrew Morton, Christoph Hellwig, Darren Hart,
	David S. Miller, Eric W. Biederman, H. Peter Anvin,
	Jaswinder Singh, Jeff Dike, Jiri Slaby, Kexec Mailing List,
	linux-fsdevel
In-Reply-To: <CA+55aFyaVVKKbXPFzW1Tr7CTpiLCK+1nGdhS21wnm1j64bqWPA@mail.gmail.com>


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Mon, Apr 2, 2018 at 12:04 PM, Dominik Brodowski
> <linux@dominikbrodowski.net> wrote:
> >
> > This patchset removes all in-kernel calls to syscall functions in the
> > kernel with the exception of arch/.
> 
> Ok, this finished off my arch updates for today, I'll probably move on
> to driver pulls tomorrow.
> 
> Anyway, it's in my tree, will push out once my test build finishes.

Thanks!

Dominik, if you submit the x86 ptregs conversion patches in the next 1-2 days on 
top of Linus's tree (642e7fd23353), then I can apply them and if they are 
problem-free I can perhaps tempt Linus with a pull request early next week or so.

The Spectre angle does make me want those changes as well.

Thanks,

	Ingo

^ permalink raw reply

* Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
From: Ingo Molnar @ 2018-04-03 10:36 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Thomas Gleixner, David Laight, 'Rahul Lakkireddy',
	x86@kernel.org, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, mingo@redhat.com, hpa@zytor.com,
	davem@davemloft.net, akpm@linux-foundation.org,
	torvalds@linux-foundation.org, ganeshgr@chelsio.com,
	nirranjan@chelsio.com, indranil@chelsio.com, Andy Lutomirski,
	Peter Zijlstra, Fenghua Yu
In-Reply-To: <20180403084932.GA3926@amd>


* Pavel Machek <pavel@ucw.cz> wrote:

> > > > Yeah, so generic memcpy() replacement is only feasible I think if the most 
> > > > optimistic implementation is actually correct:
> > > > 
> > > >  - if no preempt disable()/enable() is required
> > > > 
> > > >  - if direct access to the AVX[2] registers does not disturb legacy FPU state in 
> > > >    any fashion
> > > > 
> > > >  - if direct access to the AVX[2] registers cannot raise weird exceptions or have
> > > >    weird behavior if the FPU control word is modified to non-standard values by 
> > > >    untrusted user-space
> > > > 
> > > > If we have to touch the FPU tag or control words then it's probably only good for 
> > > > a specialized API.
> > > 
> > > I did not mean to have a general memcpy replacement. Rather something like
> > > magic_memcpy() which falls back to memcpy when AVX is not usable or the
> > > length does not justify the AVX stuff at all.
> > 
> > OK, fair enough.
> > 
> > Note that a generic version might still be worth trying out, if and only if it's 
> > safe to access those vector registers directly: modern x86 CPUs will do their 
> > non-constant memcpy()s via the common memcpy_erms() function - which could in 
> > theory be an easy common point to be (cpufeatures-) patched to an AVX2 variant, if 
> > size (and alignment, perhaps) is a multiple of 32 bytes or so.
> 
> How is AVX2 supposed to help the memcpy speed?
> 
> If the copy is small, constant overhead will dominate, and I don't
> think AVX2 is going to be win there.

There are several advantages:

1)

"REP; MOVS" (also called ERMS) has a significant constant "setup cost".

In the scheme I suggested (and if it's possible) then single-register AVX2 access 
on the other hand has a setup cost on the "few cycles" order of magnitude.

2)

AVX2 has various non-temporal load and store behavioral variants - while "REP; 
MOVS" doesn't (or rather, any such caching optimizations, to the extent they 
exist, are hidden in the microcode).

> If the copy is big, well, the copy loop will likely run out of L1 and maybe even 
> out of L2, and at that point speed of the loop does not matter because memory is 
> slow...?

In many cases "memory" will be something very fast, such as another level of 
cache. Also, on NUMA "memory" can also be something locally wired to the CPU - 
again accessible at ridiculous bandwidths.

Nevertheless ERMS probably wins for the regular bulk memcpy by a few percentage 
points, so I don't think AVX2 is a win in the generic large-memcpy case, as long 
as continued caching of both the loads and the stores is beneficial.

Thanks,

	Ingo

^ permalink raw reply

* Re: [PATCH 00/47] Netfilter/IPVS updates for net-next
From: Pablo Neira Ayuso @ 2018-04-03 10:19 UTC (permalink / raw)
  To: Rafał Miłecki
  Cc: netfilter-devel, Network Development, David Miller
In-Reply-To: <fbd3cd82-cece-74cb-2305-bbb6980b52cc@gmail.com>

Hi Rafal,

On Tue, Apr 03, 2018 at 08:13:49AM +0200, Rafał Miłecki wrote:
> Hi Pablo,
> 
[...]
> I see you mentioned changes from Felix in the pull request but:
> 1) I don't see any commits from Felix listed below
> 2) I don't think you sent any of these patches
> 
> Can you take a look at what has happened to them, please?

I will include them in my next pull request once net-next opens up
again.

Thanks.

^ permalink raw reply

* Re: [PATCH net-next] bridge: Allow max MTU when multiple VLANs present
From: Chas Williams @ 2018-04-03 10:14 UTC (permalink / raw)
  To: Roopa Prabhu
  Cc: Toshiaki Makita, David Miller, netdev, Stephen Hemminger,
	Nikolay Aleksandrov
In-Reply-To: <CAJieiUg5PtpZKT6cwNiAgbE8BJGB5DqW4TcOoZcXD-x1TtUHJw@mail.gmail.com>

On Tue, Apr 3, 2018 at 2:13 AM, Roopa Prabhu <roopa@cumulusnetworks.com> wrote:
> On Mon, Apr 2, 2018 at 8:26 AM, Chas Williams <3chas3@gmail.com> wrote:
>> On Mon, Apr 2, 2018 at 11:08 AM, Roopa Prabhu <roopa@cumulusnetworks.com> wrote:
>>>
>
> [snip]
>
>>> they are popular...in-fact they are the default bridge mode on our
>>> network switches.
>>> And they have been around for some time now to ignore its users.
>>> Plus it is not right to change default mtu behavior for one mode of the bridge
>>> and not the others (bridge mtu handling from user-space is complex enough today
>>> due to dynamic mtu changes on port enslave/deslave).
>>
>> I don't see the issue with one mode of bridge behaving differently
>> from another mode.
>> The VLAN behavior between the two bridge modes is completely different so having
>> a different MTU behavior doesn't seem that surprising.
>>
>> You are potentially mixing different sized VLAN on a same bridge.  The only sane
>> choice is to pick the largest MTU for the bridge.  This lets you have
>> whatever MTU
>> is appropriate on the child VLAN interfaces of the bridge.  If you
>> attempt to forward
>> from a port with a larger MTU to a smaller MTU, you get the expected behavior.
>
>
> you mean larger MTU on the vlan device on the bridge to a smaller MTU
> on the bridge port ?.
> this will result in dropping the packet. how is this supposed to be
> expected default behavior ?.

If a user configures the VLAN device to have a larger MTU than the port,
then yes, I expect the packet to be dropped.  That's a misconfiguration of either
the VLAN's or port's MTU.  We can't protect the user from that by simply making
sure they can't mismatch the MTUs because you can still get packets dropped
during ingress from the large MTU VLAN.

>> Forcing the end user to configure all the ports to the maximum MTU of
>> all the VLANs
>> on the bridge is wrong IMHO.
>> You then risk attempting to forward
>> oversize packets
>> on a network that can't support that.
>
> I am a bit confused: Are you trying to solve the config problem by
> implicitly making it the default and there by creating the oversize
> packet drop issue by default ?

I am attempting to allow a configuration that lets me choose the appropriate
MTU size for each port.  With the previous code to configure a VLAN device
with an MTU of 9000, I would need to configure all the ports
on the bridge with an MTU of 9000 regardless of whether those ports should
be passing large MTU traffic.  I am creating a potential packet drop issue
by forwarding traffic between ports that have artificially inflated MTUs.


>>>> I don't think those drops are unexpected.  If a user has misconfigured
>>>> the bridge
>>>> we can't be expected to fix that for them.  It is the user's
>>>> responsbility to ensure
>>>> that the ports on the VLAN have a size consistent with the traffic
>>>> they expect to
>>>> pass.
>>>>
>>>
>>> By default they are not expected today. The problem is changing the bridge
>>> to max mtu changes 'all' the vlan devices on top of the vlan aware bridge to
>>> max mtu by default which makes drops at the bridge driver more common if the
>>> user had mixed mtu on its ports.
>>
>> That's not been my experience.  The MTU on the vlan devices is only
>> limited by the
>> bridges's MTU.  Setting the bridge MTU doesn't change the children
>> VLAN devices MTUs.
>
> It does not, but it now allows vlan devices on the bridge to have a
> larger MTU if they need to (some or all of them).
> This is consistent with vxlan driver as well: picks default mtu to be
> lower or equal to the default dst dev mtu and allows user to override
> it with a larger MTU.

The VLAN device MTU can't be larger than the parent MTU.  The end user
is just going to set the parent bridge MTU to be larger anyway, so why not
just make that the default?

^ permalink raw reply

* Re: [PATCH net-next 02/12] clk: sunxi-ng: r40: export a regmap to access the GMAC register
From: Chen-Yu Tsai @ 2018-04-03  9:58 UTC (permalink / raw)
  To: Icenowy Zheng
  Cc: Maxime Ripard, Michael Turquette, Stephen Boyd,
	Giuseppe Cavallaro, Rob Herring, Mark Rutland, Mark Brown,
	linux-arm-kernel, linux-clk, devicetree, netdev, Corentin Labbe
In-Reply-To: <9982975F-0911-48F2-BEEB-CE93AB561A55@aosc.io>

On Tue, Apr 3, 2018 at 5:54 PM, Icenowy Zheng <icenowy@aosc.io> wrote:
>
>
> 于 2018年4月3日 GMT+08:00 下午5:53:08, Chen-Yu Tsai <wens@csie.org> 写到:
>>On Tue, Apr 3, 2018 at 5:50 PM, Maxime Ripard
>><maxime.ripard@bootlin.com> wrote:
>>> On Tue, Apr 03, 2018 at 11:48:45AM +0200, Maxime Ripard wrote:
>>>> On Tue, Mar 20, 2018 at 03:15:02PM +0800, Chen-Yu Tsai wrote:
>>>> > On Mon, Mar 19, 2018 at 5:31 AM, Maxime Ripard
>>>> > <maxime.ripard@bootlin.com> wrote:
>>>> > > On Sat, Mar 17, 2018 at 05:28:47PM +0800, Chen-Yu Tsai wrote:
>>>> > >> From: Icenowy Zheng <icenowy@aosc.io>
>>>> > >>
>>>> > >> There's a GMAC configuration register, which exists on
>>A64/A83T/H3/H5 in
>>>> > >> the syscon part, in the CCU of R40 SoC.
>>>> > >>
>>>> > >> Export a regmap of the CCU.
>>>> > >>
>>>> > >> Read access is not restricted to all registers, but only the
>>GMAC
>>>> > >> register is allowed to be written.
>>>> > >>
>>>> > >> Signed-off-by: Icenowy Zheng <icenowy@aosc.io>
>>>> > >> Signed-off-by: Chen-Yu Tsai <wens@csie.org>
>>>> > >
>>>> > > Gah, this is crazy. I'm really starting to regret letting that
>>syscon
>>>> > > in in the first place...
>>>> >
>>>> > IMHO syscon is really a better fit. It's part of the glue layer
>>and
>>>> > most other dwmac user platforms treat it as such and use a syscon.
>>>> > Plus the controls encompass delays (phase), inverters (polarity),
>>>> > and even signal routing. It's not really just a group of clock
>>controls,
>>>> > like what we poorly modeled for A20/A31. I think that was really a
>>>> > mistake.
>>>> >
>>>> > As I mentioned in the cover letter, a slightly saner approach
>>would
>>>> > be to let drivers add custom syscon entries, which would then
>>require
>>>> > less custom plumbing.
>>>>
>>>> A syscon is convenient, sure, but it also bypasses any abstraction
>>>> layer we have everywhere else, which means that we'll have to
>>maintain
>>>> the register layout in each and every driver that uses it.
>>>>
>>>> So far, it's only be the GMAC, but it can also be others (the SRAM
>>>> controller comes to my mind), and then, if there's any difference in
>>>> the design in a future SoC, we'll have to maintain that in the GMAC
>>>> driver as well.
>>>
>>> I guess I forgot to say something, I'm fine with using a syscon we
>>> already have.
>>>
>>> I'm just questionning if merging any other driver using one is the
>>> right move.
>>
>>Right. So in this case, we are not actually going through the syscon
>>API. Rather we are exporting a regmap whose properties we actually
>>define. If it makes you more acceptable to it, we could map just
>>the GMAC register in the new regmap, and also have it named. This
>>is all plumbing within the kernel so the device tree stays the same.
>
> I think my driver has already restricted the write permission
> only to GMAC register.

Correct, but it still maps the entire region out, which means the
consumer needs to know which offset to use. Maxime is saying this
is something that is troublesome to maintain. So my proposal was
to create a regmap with a base at the GMAC register offset. That
way, the consumer doesn't need to use an offset to access it.

ChenYu

^ permalink raw reply

* Re: [PATCH net-next 02/12] clk: sunxi-ng: r40: export a regmap to access the GMAC register
From: Icenowy Zheng @ 2018-04-03  9:54 UTC (permalink / raw)
  To: Chen-Yu Tsai, Maxime Ripard
  Cc: Michael Turquette, Stephen Boyd, Giuseppe Cavallaro, Rob Herring,
	Mark Rutland, Mark Brown, linux-arm-kernel, linux-clk, devicetree,
	netdev, Corentin Labbe
In-Reply-To: <CAGb2v67NO0hbATReq11Y_-2M_yF4hbj_sOtK3D_8yUQb1xi2ww@mail.gmail.com>



于 2018年4月3日 GMT+08:00 下午5:53:08, Chen-Yu Tsai <wens@csie.org> 写到:
>On Tue, Apr 3, 2018 at 5:50 PM, Maxime Ripard
><maxime.ripard@bootlin.com> wrote:
>> On Tue, Apr 03, 2018 at 11:48:45AM +0200, Maxime Ripard wrote:
>>> On Tue, Mar 20, 2018 at 03:15:02PM +0800, Chen-Yu Tsai wrote:
>>> > On Mon, Mar 19, 2018 at 5:31 AM, Maxime Ripard
>>> > <maxime.ripard@bootlin.com> wrote:
>>> > > On Sat, Mar 17, 2018 at 05:28:47PM +0800, Chen-Yu Tsai wrote:
>>> > >> From: Icenowy Zheng <icenowy@aosc.io>
>>> > >>
>>> > >> There's a GMAC configuration register, which exists on
>A64/A83T/H3/H5 in
>>> > >> the syscon part, in the CCU of R40 SoC.
>>> > >>
>>> > >> Export a regmap of the CCU.
>>> > >>
>>> > >> Read access is not restricted to all registers, but only the
>GMAC
>>> > >> register is allowed to be written.
>>> > >>
>>> > >> Signed-off-by: Icenowy Zheng <icenowy@aosc.io>
>>> > >> Signed-off-by: Chen-Yu Tsai <wens@csie.org>
>>> > >
>>> > > Gah, this is crazy. I'm really starting to regret letting that
>syscon
>>> > > in in the first place...
>>> >
>>> > IMHO syscon is really a better fit. It's part of the glue layer
>and
>>> > most other dwmac user platforms treat it as such and use a syscon.
>>> > Plus the controls encompass delays (phase), inverters (polarity),
>>> > and even signal routing. It's not really just a group of clock
>controls,
>>> > like what we poorly modeled for A20/A31. I think that was really a
>>> > mistake.
>>> >
>>> > As I mentioned in the cover letter, a slightly saner approach
>would
>>> > be to let drivers add custom syscon entries, which would then
>require
>>> > less custom plumbing.
>>>
>>> A syscon is convenient, sure, but it also bypasses any abstraction
>>> layer we have everywhere else, which means that we'll have to
>maintain
>>> the register layout in each and every driver that uses it.
>>>
>>> So far, it's only be the GMAC, but it can also be others (the SRAM
>>> controller comes to my mind), and then, if there's any difference in
>>> the design in a future SoC, we'll have to maintain that in the GMAC
>>> driver as well.
>>
>> I guess I forgot to say something, I'm fine with using a syscon we
>> already have.
>>
>> I'm just questionning if merging any other driver using one is the
>> right move.
>
>Right. So in this case, we are not actually going through the syscon
>API. Rather we are exporting a regmap whose properties we actually
>define. If it makes you more acceptable to it, we could map just
>the GMAC register in the new regmap, and also have it named. This
>is all plumbing within the kernel so the device tree stays the same.

I think my driver has already restricted the write permission
only to GMAC register.

>
>ChenYu

^ permalink raw reply

* Re: [PATCH net-next 02/12] clk: sunxi-ng: r40: export a regmap to access the GMAC register
From: Icenowy Zheng @ 2018-04-03  9:52 UTC (permalink / raw)
  To: Maxime Ripard, Chen-Yu Tsai
  Cc: Michael Turquette, Stephen Boyd, Giuseppe Cavallaro, Rob Herring,
	Mark Rutland, Mark Brown, linux-arm-kernel, linux-clk, devicetree,
	netdev, Corentin Labbe
In-Reply-To: <20180403095005.skflxb7m2qzbhjix@flea>



于 2018年4月3日 GMT+08:00 下午5:50:05, Maxime Ripard <maxime.ripard@bootlin.com> 写到:
>On Tue, Apr 03, 2018 at 11:48:45AM +0200, Maxime Ripard wrote:
>> On Tue, Mar 20, 2018 at 03:15:02PM +0800, Chen-Yu Tsai wrote:
>> > On Mon, Mar 19, 2018 at 5:31 AM, Maxime Ripard
>> > <maxime.ripard@bootlin.com> wrote:
>> > > On Sat, Mar 17, 2018 at 05:28:47PM +0800, Chen-Yu Tsai wrote:
>> > >> From: Icenowy Zheng <icenowy@aosc.io>
>> > >>
>> > >> There's a GMAC configuration register, which exists on
>A64/A83T/H3/H5 in
>> > >> the syscon part, in the CCU of R40 SoC.
>> > >>
>> > >> Export a regmap of the CCU.
>> > >>
>> > >> Read access is not restricted to all registers, but only the
>GMAC
>> > >> register is allowed to be written.
>> > >>
>> > >> Signed-off-by: Icenowy Zheng <icenowy@aosc.io>
>> > >> Signed-off-by: Chen-Yu Tsai <wens@csie.org>
>> > >
>> > > Gah, this is crazy. I'm really starting to regret letting that
>syscon
>> > > in in the first place...
>> > 
>> > IMHO syscon is really a better fit. It's part of the glue layer and
>> > most other dwmac user platforms treat it as such and use a syscon.
>> > Plus the controls encompass delays (phase), inverters (polarity),
>> > and even signal routing. It's not really just a group of clock
>controls,
>> > like what we poorly modeled for A20/A31. I think that was really a
>> > mistake.
>> > 
>> > As I mentioned in the cover letter, a slightly saner approach would
>> > be to let drivers add custom syscon entries, which would then
>require
>> > less custom plumbing.
>> 
>> A syscon is convenient, sure, but it also bypasses any abstraction
>> layer we have everywhere else, which means that we'll have to
>maintain
>> the register layout in each and every driver that uses it.
>> 
>> So far, it's only be the GMAC, but it can also be others (the SRAM
>> controller comes to my mind), and then, if there's any difference in
>> the design in a future SoC, we'll have to maintain that in the GMAC
>> driver as well.
>
>I guess I forgot to say something, I'm fine with using a syscon we
>already have.
>
>I'm just questionning if merging any other driver using one is the
>right move.

Even for the SoCs currently supported by dwmac-sun8i, there
is a syscon/sram-controller conflict: they're both at 0x1c00000.

The first example of the need for an sram-controller is the
A64, where we need to claim SRAM C for DE2 access.

>
>Maxime

^ permalink raw reply

* Re: [PATCH net-next 02/12] clk: sunxi-ng: r40: export a regmap to access the GMAC register
From: Chen-Yu Tsai @ 2018-04-03  9:53 UTC (permalink / raw)
  To: Maxime Ripard
  Cc: Michael Turquette, Stephen Boyd, Giuseppe Cavallaro, Rob Herring,
	Mark Rutland, Mark Brown, Icenowy Zheng, linux-arm-kernel,
	linux-clk, devicetree, netdev, Corentin Labbe
In-Reply-To: <20180403095005.skflxb7m2qzbhjix@flea>

On Tue, Apr 3, 2018 at 5:50 PM, Maxime Ripard <maxime.ripard@bootlin.com> wrote:
> On Tue, Apr 03, 2018 at 11:48:45AM +0200, Maxime Ripard wrote:
>> On Tue, Mar 20, 2018 at 03:15:02PM +0800, Chen-Yu Tsai wrote:
>> > On Mon, Mar 19, 2018 at 5:31 AM, Maxime Ripard
>> > <maxime.ripard@bootlin.com> wrote:
>> > > On Sat, Mar 17, 2018 at 05:28:47PM +0800, Chen-Yu Tsai wrote:
>> > >> From: Icenowy Zheng <icenowy@aosc.io>
>> > >>
>> > >> There's a GMAC configuration register, which exists on A64/A83T/H3/H5 in
>> > >> the syscon part, in the CCU of R40 SoC.
>> > >>
>> > >> Export a regmap of the CCU.
>> > >>
>> > >> Read access is not restricted to all registers, but only the GMAC
>> > >> register is allowed to be written.
>> > >>
>> > >> Signed-off-by: Icenowy Zheng <icenowy@aosc.io>
>> > >> Signed-off-by: Chen-Yu Tsai <wens@csie.org>
>> > >
>> > > Gah, this is crazy. I'm really starting to regret letting that syscon
>> > > in in the first place...
>> >
>> > IMHO syscon is really a better fit. It's part of the glue layer and
>> > most other dwmac user platforms treat it as such and use a syscon.
>> > Plus the controls encompass delays (phase), inverters (polarity),
>> > and even signal routing. It's not really just a group of clock controls,
>> > like what we poorly modeled for A20/A31. I think that was really a
>> > mistake.
>> >
>> > As I mentioned in the cover letter, a slightly saner approach would
>> > be to let drivers add custom syscon entries, which would then require
>> > less custom plumbing.
>>
>> A syscon is convenient, sure, but it also bypasses any abstraction
>> layer we have everywhere else, which means that we'll have to maintain
>> the register layout in each and every driver that uses it.
>>
>> So far, it's only be the GMAC, but it can also be others (the SRAM
>> controller comes to my mind), and then, if there's any difference in
>> the design in a future SoC, we'll have to maintain that in the GMAC
>> driver as well.
>
> I guess I forgot to say something, I'm fine with using a syscon we
> already have.
>
> I'm just questionning if merging any other driver using one is the
> right move.

Right. So in this case, we are not actually going through the syscon
API. Rather we are exporting a regmap whose properties we actually
define. If that makes it more acceptable to you, we could map just
the GMAC register in the new regmap, and also have it named. This
is all plumbing within the kernel so the device tree stays the same.

ChenYu

^ permalink raw reply

* FW: gretap tunnel redirecting 2 different networks on destination host
From: Marc Roos @ 2018-04-03  4:33 UTC (permalink / raw)
  To: netdev

I see you are quite busy with discussing the patches etc. If this is the 
incorrect place to ask for a little help please let me know. I just got 
this from some one on stack overflow who got some answers here.

-----Original Message-----
Subject: gretap tunnel redirecting 2 different networks on destination 
host

How can I get the 10.11.12.x traffic received on tun1 at server B to 
eth2, and 172.16.1.x to eth1? 

I have a server A that sends 172.16.1.x and 10.11.12.x traffic via a 
gretap tunnel 192.168.1.x to server B.

            +-------------+                             +------------+
 172.16.1.x |      B      |                             |      A     |
     -------|eth1         |         192.168.1.x GRETAP  |            |
            |         tun1|-----------------------------|tun1        |
 10.11.12.x |             |                             |            |
     -------|eth2         |                             |            |
            +-------------+                             +------------+

When I put the tun1 interface of server B in a bridge with eth1 I am 
able to ping several 172.16.1.x IPs from server A, and communication on 
this network seems to be ok.

- I cannot put eth2 on the same bridge. 
- I thought of creating a 2nd gretap tunnel and using one tunnel per 
network, but I think there is probably a better solution.

^ permalink raw reply

* FW: gretap tunnel redirecting 2 different networks on destination host
From: Marc Roos @ 2018-04-03  4:33 UTC (permalink / raw)
  To: netdev

I see you are quite busy with discussing the patches etc. If this is the 
incorrect place to ask for a little help please let me know. I just got 
this from some one on stack overflow who got some answers here.

-----Original Message-----
Subject: gretap tunnel redirecting 2 different networks on destination 
host

How can I get the 10.11.12.x traffic received on tun1 at server B to 
eth2, and 172.16.1.x to eth1? 

I have a server A that sends 172.16.1.x and 10.11.12.x traffic via a 
gretap tunnel 192.168.1.x to server B.

            +-------------+                             +------------+
 172.16.1.x |      B      |                             |      A     |
     -------|eth1         |         192.168.1.x GRETAP  |            |
            |         tun1|-----------------------------|tun1        |
 10.11.12.x |             |                             |            |
     -------|eth2         |                             |            |
            +-------------+                             +------------+

When I put the tun1 interface of server B in a bridge with eth1 I am 
able to ping several 172.16.1.x IPs from server A, and communication on 
this network seems to be ok.

- I cannot put eth2 on the same bridge. 
- I thought of creating a 2nd gretap tunnel and using one tunnel per 
network, but I think there is probably a better solution.

^ permalink raw reply

* Re: possible deadlock in skb_queue_tail
From: Kirill Tkhai @ 2018-04-03  9:50 UTC (permalink / raw)
  To: syzbot, davem, dh.herrmann, dvlasenk, dwindsor, elena.reshetova,
	ishkamiel, keescook, linux-kernel, matthew, mjurczyk, netdev,
	syzkaller-bugs, viro, xemul
In-Reply-To: <0000000000003584570568da18dd@google.com>

On 02.04.2018 12:20, syzbot wrote:
> Hello,
> 
> syzbot hit the following crash on net-next commit
> 06b19fe9a6df7aaa423cd8404ebe5ac9ec4b2960 (Sun Apr 1 03:37:33 2018 +0000)
> Merge branch 'chelsio-inline-tls'
> syzbot dashboard link: https://syzkaller.appspot.com/bug?extid=6b495100f17ca8554ab9
> 
> Unfortunately, I don't have any reproducer for this crash yet.
> Raw console output: https://syzkaller.appspot.com/x/log.txt?id=6218830443446272
> Kernel config: https://syzkaller.appspot.com/x/.config?id=3327544840960562528
> compiler: gcc (GCC) 7.1.1 20170620
> 
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+6b495100f17ca8554ab9@syzkaller.appspotmail.com
> It will help syzbot understand when the bug is fixed. See footer for details.
> If you forward the report, please keep this part and the footer.
> 
> 
> ======================================================
> WARNING: possible circular locking dependency detected
> 4.16.0-rc6+ #290 Not tainted
> ------------------------------------------------------
> syz-executor7/20971 is trying to acquire lock:
>  (&af_unix_sk_receive_queue_lock_key){+.+.}, at: [<00000000271ef0d8>] skb_queue_tail+0x26/0x150 net/core/skbuff.c:2899
> 
> but task is already holding lock:
>  (&(&u->lock)->rlock/1){+.+.}, at: [<000000004e725e14>] unix_state_double_lock+0x7b/0xb0 net/unix/af_unix.c:1088
> 
> which lock already depends on the new lock.
> 
> 
> the existing dependency chain (in reverse order) is:
> 
> -> #1 (&(&u->lock)->rlock/1){+.+.}:
>        _raw_spin_lock_nested+0x28/0x40 kernel/locking/spinlock.c:354
>        sk_diag_dump_icons net/unix/diag.c:82 [inline]
>        sk_diag_fill.isra.4+0xa52/0xfe0 net/unix/diag.c:144
>        sk_diag_dump net/unix/diag.c:178 [inline]
>        unix_diag_dump+0x400/0x4f0 net/unix/diag.c:206
>        netlink_dump+0x492/0xcf0 net/netlink/af_netlink.c:2221
>        __netlink_dump_start+0x4ec/0x710 net/netlink/af_netlink.c:2318
>        netlink_dump_start include/linux/netlink.h:214 [inline]
>        unix_diag_handler_dump+0x3e7/0x750 net/unix/diag.c:307
>        __sock_diag_cmd net/core/sock_diag.c:230 [inline]
>        sock_diag_rcv_msg+0x204/0x360 net/core/sock_diag.c:261
>        netlink_rcv_skb+0x14b/0x380 net/netlink/af_netlink.c:2443
>        sock_diag_rcv+0x2a/0x40 net/core/sock_diag.c:272
>        netlink_unicast_kernel net/netlink/af_netlink.c:1307 [inline]
>        netlink_unicast+0x4c4/0x6b0 net/netlink/af_netlink.c:1333
>        netlink_sendmsg+0xa4a/0xe80 net/netlink/af_netlink.c:1896
>        sock_sendmsg_nosec net/socket.c:629 [inline]
>        sock_sendmsg+0xca/0x110 net/socket.c:639
>        sock_write_iter+0x31a/0x5d0 net/socket.c:908
>        call_write_iter include/linux/fs.h:1782 [inline]
>        new_sync_write fs/read_write.c:469 [inline]
>        __vfs_write+0x684/0x970 fs/read_write.c:482
>        vfs_write+0x189/0x510 fs/read_write.c:544
>        SYSC_write fs/read_write.c:589 [inline]
>        SyS_write+0xef/0x220 fs/read_write.c:581
>        do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287
>        entry_SYSCALL_64_after_hwframe+0x42/0xb7
> 
> -> #0 (&af_unix_sk_receive_queue_lock_key){+.+.}:
>        lock_acquire+0x1d5/0x580 kernel/locking/lockdep.c:3920
>        __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
>        _raw_spin_lock_irqsave+0x96/0xc0 kernel/locking/spinlock.c:152
>        skb_queue_tail+0x26/0x150 net/core/skbuff.c:2899
>        unix_dgram_sendmsg+0xa30/0x1610 net/unix/af_unix.c:1807
>        sock_sendmsg_nosec net/socket.c:629 [inline]
>        sock_sendmsg+0xca/0x110 net/socket.c:639
>        ___sys_sendmsg+0x320/0x8b0 net/socket.c:2047
>        __sys_sendmmsg+0x1ee/0x620 net/socket.c:2137
>        SYSC_sendmmsg net/socket.c:2168 [inline]
>        SyS_sendmmsg+0x35/0x60 net/socket.c:2163
>        do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287
>        entry_SYSCALL_64_after_hwframe+0x42/0xb7

sk_diag_dump_icons() dumps only sockets in the TCP_LISTEN state.
The TCP_LISTEN state is assigned in only one place in net/unix/af_unix.c:
unix_listen(). That function applies only to stream and seqpacket
socket types.

The socket can't be a stream socket because of the second stack, and it
can't be seqpacket either, as I don't think gcc could inline
unix_seqpacket_sendmsg() in such a way that we would not see it in the stack.

So this looks like a false positive to me.

Kirill

> 
> other info that might help us debug this:
> 
>  Possible unsafe locking scenario:
> 
>        CPU0                    CPU1
>        ----                    ----
>   lock(&(&u->lock)->rlock/1);
>                                lock(&af_unix_sk_receive_queue_lock_key);
>                                lock(&(&u->lock)->rlock/1);
>   lock(&af_unix_sk_receive_queue_lock_key);
> 
>  *** DEADLOCK ***
> 
> 1 lock held by syz-executor7/20971:
>  #0:  (&(&u->lock)->rlock/1){+.+.}, at: [<000000004e725e14>] unix_state_double_lock+0x7b/0xb0 net/unix/af_unix.c:1088
> 
> stack backtrace:
> CPU: 0 PID: 20971 Comm: syz-executor7 Not tainted 4.16.0-rc6+ #290
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
> Call Trace:
>  __dump_stack lib/dump_stack.c:17 [inline]
>  dump_stack+0x194/0x24d lib/dump_stack.c:53
>  print_circular_bug.isra.38+0x2cd/0x2dc kernel/locking/lockdep.c:1223
>  check_prev_add kernel/locking/lockdep.c:1863 [inline]
>  check_prevs_add kernel/locking/lockdep.c:1976 [inline]
>  validate_chain kernel/locking/lockdep.c:2417 [inline]
>  __lock_acquire+0x30a8/0x3e00 kernel/locking/lockdep.c:3431
>  lock_acquire+0x1d5/0x580 kernel/locking/lockdep.c:3920
>  __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
>  _raw_spin_lock_irqsave+0x96/0xc0 kernel/locking/spinlock.c:152
>  skb_queue_tail+0x26/0x150 net/core/skbuff.c:2899
>  unix_dgram_sendmsg+0xa30/0x1610 net/unix/af_unix.c:1807
>  sock_sendmsg_nosec net/socket.c:629 [inline]
>  sock_sendmsg+0xca/0x110 net/socket.c:639
>  ___sys_sendmsg+0x320/0x8b0 net/socket.c:2047
>  __sys_sendmmsg+0x1ee/0x620 net/socket.c:2137
>  SYSC_sendmmsg net/socket.c:2168 [inline]
>  SyS_sendmmsg+0x35/0x60 net/socket.c:2163
>  do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287
>  entry_SYSCALL_64_after_hwframe+0x42/0xb7
> RIP: 0033:0x455269
> RSP: 002b:00007f71ffad6c68 EFLAGS: 00000246 ORIG_RAX: 0000000000000133
> RAX: ffffffffffffffda RBX: 00007f71ffad76d4 RCX: 0000000000455269
> RDX: 04924924924924f4 RSI: 0000000020000200 RDI: 0000000000000016
> RBP: 000000000072bf58 R08: 0000000000000000 R09: 0000000000000000
> R10: 00000000200000d4 R11: 0000000000000246 R12: 00000000ffffffff
> R13: 00000000000004ca R14: 00000000006f9390 R15: 0000000000000001
> IPVS: Unknown mcast interface: bcsh0
> IPVS: Unknown mcast interface: bcsh0
> IPVS: Unknown mcast interface: bcsh0
> IPVS: Unknown mcast interface: bcsh0
> IPVS: Unknown mcast interface: bcsh0
> IPVS: Unknown mcast interface: bcsh0
> IPVS: Unknown mcast interface: bcsh0
> IPVS: Unknown mcast interface: bcsh0
> IPVS: Unknown mcast interface: bcsh0
> IPVS: sync thread started: state = BACKUP, mcast_ifn = bcsh0, syncid = 0, id = 0
> IPVS: Unknown mcast interface: bcsh0
> IPVS: Unknown mcast interface: bcsh0
> IPVS: Unknown mcast interface: bcsh0
> IPVS: Unknown mcast interface: bcsh0
> IPVS: Unknown mcast interface: bcsh0
> IPVS: Unknown mcast interface: bcsh0
> IPVS: Unknown mcast interface: bcsh0
> IPVS: Unknown mcast interface: bcsh0
> 
> 
> ---
> This bug is generated by a dumb bot. It may contain errors.
> See https://goo.gl/tpsmEJ for details.
> Direct all questions to syzkaller@googlegroups.com.
> 
> syzbot will keep track of this bug report.
> If you forgot to add the Reported-by tag, once the fix for this bug is merged
> into any tree, please reply to this email with:
> #syz fix: exact-commit-title
> To mark this as a duplicate of another syzbot report, please reply with:
> #syz dup: exact-subject-of-another-report
> If it's a one-off invalid bug report, please reply with:
> #syz invalid
> Note: if the crash happens again, it will cause creation of a new bug report.
> Note: all commands must start from beginning of the line in the email body.

^ permalink raw reply

* Re: [PATCH net-next 02/12] clk: sunxi-ng: r40: export a regmap to access the GMAC register
From: Maxime Ripard @ 2018-04-03  9:50 UTC (permalink / raw)
  To: Chen-Yu Tsai
  Cc: Michael Turquette, Stephen Boyd, Giuseppe Cavallaro, Rob Herring,
	Mark Rutland, Mark Brown, Icenowy Zheng, linux-arm-kernel,
	linux-clk, devicetree, netdev, Corentin Labbe
In-Reply-To: <20180403094845.le2hfuxktlv66lre@flea>

[-- Attachment #1: Type: text/plain, Size: 2181 bytes --]

On Tue, Apr 03, 2018 at 11:48:45AM +0200, Maxime Ripard wrote:
> On Tue, Mar 20, 2018 at 03:15:02PM +0800, Chen-Yu Tsai wrote:
> > On Mon, Mar 19, 2018 at 5:31 AM, Maxime Ripard
> > <maxime.ripard@bootlin.com> wrote:
> > > On Sat, Mar 17, 2018 at 05:28:47PM +0800, Chen-Yu Tsai wrote:
> > >> From: Icenowy Zheng <icenowy@aosc.io>
> > >>
> > >> There's a GMAC configuration register, which exists on A64/A83T/H3/H5 in
> > >> the syscon part, in the CCU of R40 SoC.
> > >>
> > >> Export a regmap of the CCU.
> > >>
> > >> Read access is not restricted to all registers, but only the GMAC
> > >> register is allowed to be written.
> > >>
> > >> Signed-off-by: Icenowy Zheng <icenowy@aosc.io>
> > >> Signed-off-by: Chen-Yu Tsai <wens@csie.org>
> > >
> > > Gah, this is crazy. I'm really starting to regret letting that syscon
> > > in in the first place...
> > 
> > IMHO syscon is really a better fit. It's part of the glue layer and
> > most other dwmac user platforms treat it as such and use a syscon.
> > Plus the controls encompass delays (phase), inverters (polarity),
> > and even signal routing. It's not really just a group of clock controls,
> > like what we poorly modeled for A20/A31. I think that was really a
> > mistake.
> > 
> > As I mentioned in the cover letter, a slightly saner approach would
> > be to let drivers add custom syscon entries, which would then require
> > less custom plumbing.
> 
> A syscon is convenient, sure, but it also bypasses any abstraction
> layer we have everywhere else, which means that we'll have to maintain
> the register layout in each and every driver that uses it.
> 
> So far, it's only be the GMAC, but it can also be others (the SRAM
> controller comes to my mind), and then, if there's any difference in
> the design in a future SoC, we'll have to maintain that in the GMAC
> driver as well.

I guess I forgot to say something, I'm fine with using a syscon we
already have.

I'm just questioning if merging any other driver using one is the
right move.

Maxime

-- 
Maxime Ripard, Bootlin (formerly Free Electrons)
Embedded Linux and Kernel engineering
https://bootlin.com

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH net-next 02/12] clk: sunxi-ng: r40: export a regmap to access the GMAC register
From: Maxime Ripard @ 2018-04-03  9:48 UTC (permalink / raw)
  To: Chen-Yu Tsai
  Cc: Michael Turquette, Stephen Boyd, Giuseppe Cavallaro, Rob Herring,
	Mark Rutland, Mark Brown, Icenowy Zheng, linux-arm-kernel,
	linux-clk, devicetree, netdev, Corentin Labbe
In-Reply-To: <CAGb2v67Fp20wkmqWyRowpAmm4EDwENnKFG+mccqWaPa5Jj4zBw@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 2404 bytes --]

On Tue, Mar 20, 2018 at 03:15:02PM +0800, Chen-Yu Tsai wrote:
> On Mon, Mar 19, 2018 at 5:31 AM, Maxime Ripard
> <maxime.ripard@bootlin.com> wrote:
> > On Sat, Mar 17, 2018 at 05:28:47PM +0800, Chen-Yu Tsai wrote:
> >> From: Icenowy Zheng <icenowy@aosc.io>
> >>
> >> There's a GMAC configuration register, which exists on A64/A83T/H3/H5 in
> >> the syscon part, in the CCU of R40 SoC.
> >>
> >> Export a regmap of the CCU.
> >>
> >> Read access is not restricted to all registers, but only the GMAC
> >> register is allowed to be written.
> >>
> >> Signed-off-by: Icenowy Zheng <icenowy@aosc.io>
> >> Signed-off-by: Chen-Yu Tsai <wens@csie.org>
> >
> > Gah, this is crazy. I'm really starting to regret letting that syscon
> > in in the first place...
> 
> IMHO syscon is really a better fit. It's part of the glue layer and
> most other dwmac user platforms treat it as such and use a syscon.
> Plus the controls encompass delays (phase), inverters (polarity),
> and even signal routing. It's not really just a group of clock controls,
> like what we poorly modeled for A20/A31. I think that was really a
> mistake.
> 
> As I mentioned in the cover letter, a slightly saner approach would
> be to let drivers add custom syscon entries, which would then require
> less custom plumbing.

A syscon is convenient, sure, but it also bypasses any abstraction
layer we have everywhere else, which means that we'll have to maintain
the register layout in each and every driver that uses it.

So far, it's only been the GMAC, but it can also be others (the SRAM
controller comes to my mind), and then, if there's any difference in
the design in a future SoC, we'll have to maintain that in the GMAC
driver as well.

> > And I'm not really looking forward the time where SCPI et al. will be
> > mature and we'll have the clock controller completely outside of our
> > control.
> 
> I don't think it's going to happen for any of the older SoCs. The R40
> only stands out because the GMAC controls are in the clock controller
> address space, presumably to be like the A20.

SCPI (or equivalent) is a really nice feature to have when it comes to
virtualization, so even if it's less likely, it doesn't make it less
relevant on other SoCs.

Maxime

-- 
Maxime Ripard, Bootlin (formerly Free Electrons)
Embedded Linux and Kernel engineering
https://bootlin.com

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* [PATCH v2] net: phy: marvell10g: add thermal hwmon device
From: Russell King @ 2018-04-03  9:31 UTC (permalink / raw)
  To: Andrew Lunn, Florian Fainelli, Guenter Roeck; +Cc: netdev

Add a thermal monitoring device for the Marvell 88x3310, which updates
once a second.  We also need to hook into the suspend/resume mechanism
to ensure that the thermal monitoring is reconfigured when we resume.

Suggested-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
---
v2: update to apply to net-next

 drivers/net/phy/marvell10g.c | 184 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 182 insertions(+), 2 deletions(-)

diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c
index 8a0bd98fdec7..db9d66781da6 100644
--- a/drivers/net/phy/marvell10g.c
+++ b/drivers/net/phy/marvell10g.c
@@ -21,8 +21,10 @@
  * If both the fiber and copper ports are connected, the first to gain
  * link takes priority and the other port is completely locked out.
  */
-#include <linux/phy.h>
+#include <linux/ctype.h>
+#include <linux/hwmon.h>
 #include <linux/marvell_phy.h>
+#include <linux/phy.h>
 
 enum {
 	MV_PCS_BASE_T		= 0x0000,
@@ -40,6 +42,19 @@ enum {
 	 */
 	MV_AN_CTRL1000		= 0x8000, /* 1000base-T control register */
 	MV_AN_STAT1000		= 0x8001, /* 1000base-T status register */
+
+	/* Vendor2 MMD registers */
+	MV_V2_TEMP_CTRL		= 0xf08a,
+	MV_V2_TEMP_CTRL_MASK	= 0xc000,
+	MV_V2_TEMP_CTRL_SAMPLE	= 0x0000,
+	MV_V2_TEMP_CTRL_DISABLE	= 0xc000,
+	MV_V2_TEMP		= 0xf08c,
+	MV_V2_TEMP_UNKNOWN	= 0x9600, /* unknown function */
+};
+
+struct mv3310_priv {
+	struct device *hwmon_dev;
+	char *hwmon_name;
 };
 
 static int mv3310_modify(struct phy_device *phydev, int devad, u16 reg,
@@ -60,17 +75,180 @@ static int mv3310_modify(struct phy_device *phydev, int devad, u16 reg,
 	return ret < 0 ? ret : 1;
 }
 
+#ifdef CONFIG_HWMON
+static umode_t mv3310_hwmon_is_visible(const void *data,
+				       enum hwmon_sensor_types type,
+				       u32 attr, int channel)
+{
+	if (type == hwmon_chip && attr == hwmon_chip_update_interval)
+		return 0444;
+	if (type == hwmon_temp && attr == hwmon_temp_input)
+		return 0444;
+	return 0;
+}
+
+static int mv3310_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
+			     u32 attr, int channel, long *value)
+{
+	struct phy_device *phydev = dev_get_drvdata(dev);
+	int temp;
+
+	if (type == hwmon_chip && attr == hwmon_chip_update_interval) {
+		*value = MSEC_PER_SEC;
+		return 0;
+	}
+
+	if (type == hwmon_temp && attr == hwmon_temp_input) {
+		temp = phy_read_mmd(phydev, MDIO_MMD_VEND2, MV_V2_TEMP);
+		if (temp < 0)
+			return temp;
+
+		*value = ((temp & 0xff) - 75) * 1000;
+
+		return 0;
+	}
+
+	return -EOPNOTSUPP;
+}
+
+static const struct hwmon_ops mv3310_hwmon_ops = {
+	.is_visible = mv3310_hwmon_is_visible,
+	.read = mv3310_hwmon_read,
+};
+
+static u32 mv3310_hwmon_chip_config[] = {
+	HWMON_C_REGISTER_TZ | HWMON_C_UPDATE_INTERVAL,
+	0,
+};
+
+static const struct hwmon_channel_info mv3310_hwmon_chip = {
+	.type = hwmon_chip,
+	.config = mv3310_hwmon_chip_config,
+};
+
+static u32 mv3310_hwmon_temp_config[] = {
+	HWMON_T_INPUT,
+	0,
+};
+
+static const struct hwmon_channel_info mv3310_hwmon_temp = {
+	.type = hwmon_temp,
+	.config = mv3310_hwmon_temp_config,
+};
+
+static const struct hwmon_channel_info *mv3310_hwmon_info[] = {
+	&mv3310_hwmon_chip,
+	&mv3310_hwmon_temp,
+	NULL,
+};
+
+static const struct hwmon_chip_info mv3310_hwmon_chip_info = {
+	.ops = &mv3310_hwmon_ops,
+	.info = mv3310_hwmon_info,
+};
+
+static int mv3310_hwmon_config(struct phy_device *phydev, bool enable)
+{
+	u16 val;
+	int ret;
+
+	ret = phy_write_mmd(phydev, MDIO_MMD_VEND2, MV_V2_TEMP,
+			    MV_V2_TEMP_UNKNOWN);
+	if (ret < 0)
+		return ret;
+
+	val = enable ? MV_V2_TEMP_CTRL_SAMPLE : MV_V2_TEMP_CTRL_DISABLE;
+	ret = mv3310_modify(phydev, MDIO_MMD_VEND2, MV_V2_TEMP_CTRL,
+			    MV_V2_TEMP_CTRL_MASK, val);
+
+	return ret < 0 ? ret : 0;
+}
+
+static void mv3310_hwmon_disable(void *data)
+{
+	struct phy_device *phydev = data;
+
+	mv3310_hwmon_config(phydev, false);
+}
+
+static int mv3310_hwmon_probe(struct phy_device *phydev)
+{
+	struct device *dev = &phydev->mdio.dev;
+	struct mv3310_priv *priv = dev_get_drvdata(&phydev->mdio.dev);
+	int i, j, ret;
+
+	priv->hwmon_name = devm_kstrdup(dev, dev_name(dev), GFP_KERNEL);
+	if (!priv->hwmon_name)
+		return -ENODEV;
+
+	for (i = j = 0; priv->hwmon_name[i]; i++) {
+		if (isalnum(priv->hwmon_name[i])) {
+			if (i != j)
+				priv->hwmon_name[j] = priv->hwmon_name[i];
+			j++;
+		}
+	}
+	priv->hwmon_name[j] = '\0';
+
+	ret = mv3310_hwmon_config(phydev, true);
+	if (ret)
+		return ret;
+
+	ret = devm_add_action_or_reset(dev, mv3310_hwmon_disable, phydev);
+	if (ret)
+		return ret;
+
+	priv->hwmon_dev = devm_hwmon_device_register_with_info(dev,
+				priv->hwmon_name, phydev,
+				&mv3310_hwmon_chip_info, NULL);
+
+	return PTR_ERR_OR_ZERO(priv->hwmon_dev);
+}
+#else
+static inline int mv3310_hwmon_config(struct phy_device *phydev, bool enable)
+{
+	return 0;
+}
+
+static int mv3310_hwmon_probe(struct phy_device *phydev)
+{
+	return 0;
+}
+#endif
+
 static int mv3310_probe(struct phy_device *phydev)
 {
+	struct mv3310_priv *priv;
 	u32 mmd_mask = MDIO_DEVS_PMAPMD | MDIO_DEVS_AN;
+	int ret;
 
 	if (!phydev->is_c45 ||
 	    (phydev->c45_ids.devices_in_package & mmd_mask) != mmd_mask)
 		return -ENODEV;
 
+	priv = devm_kzalloc(&phydev->mdio.dev, sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	dev_set_drvdata(&phydev->mdio.dev, priv);
+
+	ret = mv3310_hwmon_probe(phydev);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int mv3310_suspend(struct phy_device *phydev)
+{
 	return 0;
 }
 
+static int mv3310_resume(struct phy_device *phydev)
+{
+	return mv3310_hwmon_config(phydev, true);
+}
+
 static int mv3310_config_init(struct phy_device *phydev)
 {
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(supported) = { 0, };
@@ -367,9 +545,11 @@ static struct phy_driver mv3310_drivers[] = {
 				  SUPPORTED_FIBRE |
 				  SUPPORTED_10000baseT_Full |
 				  SUPPORTED_Backplane,
-		.probe		= mv3310_probe,
 		.soft_reset	= gen10g_no_soft_reset,
 		.config_init	= mv3310_config_init,
+		.probe		= mv3310_probe,
+		.suspend	= mv3310_suspend,
+		.resume		= mv3310_resume,
 		.config_aneg	= mv3310_config_aneg,
 		.aneg_done	= mv3310_aneg_done,
 		.read_status	= mv3310_read_status,
-- 
2.7.4

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox