Netdev List
 help / color / mirror / Atom feed
* [PATCH RFC v2 9/9] veth: Bulk skb xmit for XDP path
From: Toshiaki Makita @ 2018-06-10 16:02 UTC (permalink / raw)
  To: netdev
  Cc: Toshiaki Makita, Jesper Dangaard Brouer, Alexei Starovoitov,
	Daniel Borkmann
In-Reply-To: <20180610160217.3146-1-toshiaki.makita1@gmail.com>

From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>

Acquire the txq lock instead of the rxq ptr_ring lock so that we avoid
per-packet locking when skb->xmit_more is true. We ensure that the number
of rxqs is never less than the number of txqs and that the txq-to-rxq
mapping is one-to-one, so we can completely remove the rxq-side lock.

Since the rxq-side lock is removed, this change does not increase the
number of lock operations even when bulk sending is not possible, e.g.
for non-GSO packets.

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
---
 drivers/net/veth.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 67debd3eafe6..376d70f983e5 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -138,7 +138,7 @@ static void __veth_xdp_flush(struct veth_rq *rq)
 
 static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb)
 {
-	if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) {
+	if (unlikely(__ptr_ring_produce(&rq->xdp_ring, skb))) {
 		dev_kfree_skb_any(skb);
 		return NET_RX_DROP;
 	}
@@ -188,7 +188,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 		atomic64_inc(&priv->dropped);
 	}
 
-	if (rcv_xdp)
+	if (rcv_xdp && !skb->xmit_more)
 		__veth_xdp_flush(rq);
 
 	rcu_read_unlock();
@@ -829,15 +829,21 @@ static netdev_features_t veth_fix_features(struct net_device *dev,
 {
 	struct veth_priv *priv = netdev_priv(dev);
 	struct net_device *peer;
+	bool xdp = false;
 
 	peer = rtnl_dereference(priv->peer);
 	if (peer) {
 		struct veth_priv *peer_priv = netdev_priv(peer);
 
 		if (rtnl_dereference(peer_priv->rq[0].xdp_prog))
-			features &= ~NETIF_F_GSO_SOFTWARE;
+			xdp = true;
 	}
 
+	if (xdp)
+		features &= ~(NETIF_F_GSO_SOFTWARE | NETIF_F_LLTX);
+	else
+		features |= NETIF_F_LLTX;
+
 	return features;
 }
 
-- 
2.14.3

^ permalink raw reply related

* [PATCH RFC v2 8/9] veth: Support per queue XDP ring
From: Toshiaki Makita @ 2018-06-10 16:02 UTC (permalink / raw)
  To: netdev
  Cc: Toshiaki Makita, Jesper Dangaard Brouer, Alexei Starovoitov,
	Daniel Borkmann
In-Reply-To: <20180610160217.3146-1-toshiaki.makita1@gmail.com>

From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>

Move XDP and napi related fields in veth_priv to newly created veth_rq
structure.

When xdp_frames are enqueued from ndo_xdp_xmit and XDP_TX, rxq is
selected by current cpu.
When skbs are enqueued from the peer device, rxq is determined by its
peer's txq. In this way we can implement bulk packet send using
skb->xmit_more later.

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
---
 drivers/net/veth.c | 290 +++++++++++++++++++++++++++++++++++------------------
 1 file changed, 191 insertions(+), 99 deletions(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index a47e1ba7d7e6..67debd3eafe6 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -38,20 +38,24 @@ struct pcpu_vstats {
 	struct u64_stats_sync	syncp;
 };
 
-struct veth_priv {
+struct veth_rq {
 	struct napi_struct	xdp_napi;
 	struct net_device	*dev;
 	struct bpf_prog __rcu	*xdp_prog;
-	struct net_device __rcu	*peer;
-	atomic64_t		dropped;
 	struct xdp_mem_info	xdp_mem;
-	unsigned		requested_headroom;
 	bool			rx_notify_masked;
 	struct ptr_ring		xdp_ring;
 	struct ptr_ring		xdp_tx_ring;
 	struct xdp_rxq_info	xdp_rxq;
 };
 
+struct veth_priv {
+	struct net_device __rcu	*peer;
+	atomic64_t		dropped;
+	struct veth_rq		*rq;
+	unsigned int		requested_headroom;
+};
+
 /*
  * ethtool interface
  */
@@ -122,19 +126,19 @@ static void veth_xdp_free(void *frame)
 	xdp_return_frame(frame);
 }
 
-static void __veth_xdp_flush(struct veth_priv *priv)
+static void __veth_xdp_flush(struct veth_rq *rq)
 {
 	/* Write ptr_ring before reading rx_notify_masked */
 	smp_mb();
-	if (!priv->rx_notify_masked) {
-		priv->rx_notify_masked = true;
-		napi_schedule(&priv->xdp_napi);
+	if (!rq->rx_notify_masked) {
+		rq->rx_notify_masked = true;
+		napi_schedule(&rq->xdp_napi);
 	}
 }
 
-static int veth_xdp_rx(struct veth_priv *priv, struct sk_buff *skb)
+static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb)
 {
-	if (unlikely(ptr_ring_produce(&priv->xdp_ring, skb))) {
+	if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) {
 		dev_kfree_skb_any(skb);
 		return NET_RX_DROP;
 	}
@@ -142,12 +146,11 @@ static int veth_xdp_rx(struct veth_priv *priv, struct sk_buff *skb)
 	return NET_RX_SUCCESS;
 }
 
-static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, bool xdp)
+static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb,
+			    struct veth_rq *rq, bool xdp)
 {
-	struct veth_priv *priv = netdev_priv(dev);
-
 	return __dev_forward_skb(dev, skb) ?: xdp ?
-		veth_xdp_rx(priv, skb) :
+		veth_xdp_rx(rq, skb) :
 		netif_rx(skb);
 }
 
@@ -157,6 +160,8 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct net_device *rcv;
 	int length = skb->len;
 	bool rcv_xdp = false;
+	struct veth_rq *rq;
+	int rxq;
 
 	rcu_read_lock();
 	rcv = rcu_dereference(priv->peer);
@@ -166,9 +171,12 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 
 	rcv_priv = netdev_priv(rcv);
-	rcv_xdp = rcu_access_pointer(rcv_priv->xdp_prog);
+	rxq = skb_get_queue_mapping(skb);
+	skb_record_rx_queue(skb, rxq);
+	rq = &rcv_priv->rq[rxq];
+	rcv_xdp = rcu_access_pointer(rq->xdp_prog);
 
-	if (likely(veth_forward_skb(rcv, skb, rcv_xdp) == NET_RX_SUCCESS)) {
+	if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) {
 		struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
 
 		u64_stats_update_begin(&stats->syncp);
@@ -181,7 +189,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 
 	if (rcv_xdp)
-		__veth_xdp_flush(rcv_priv);
+		__veth_xdp_flush(rq);
 
 	rcu_read_unlock();
 
@@ -256,11 +264,17 @@ static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
 	return skb;
 }
 
+static int veth_select_rxq(struct net_device *dev)
+{
+	return smp_processor_id() % dev->real_num_rx_queues;
+}
+
 static int veth_xdp_xmit(struct net_device *dev, int n,
 			 struct xdp_frame **frames, u32 flags)
 {
 	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
 	struct net_device *rcv;
+	struct veth_rq *rq;
 	int i, drops = 0;
 
 	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
@@ -271,24 +285,25 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
 		return -ENXIO;
 
 	rcv_priv = netdev_priv(rcv);
+	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
 	/* xdp_ring is initialized on receive side? */
-	if (!rcu_access_pointer(rcv_priv->xdp_prog))
+	if (!rcu_access_pointer(rq->xdp_prog))
 		return -ENXIO;
 
-	spin_lock(&rcv_priv->xdp_tx_ring.producer_lock);
+	spin_lock(&rq->xdp_tx_ring.producer_lock);
 	for (i = 0; i < n; i++) {
 		struct xdp_frame *frame = frames[i];
 
 		if (unlikely(xdp_ok_fwd_dev(rcv, frame->len) ||
-			     __ptr_ring_produce(&rcv_priv->xdp_tx_ring, frame))) {
+			     __ptr_ring_produce(&rq->xdp_tx_ring, frame))) {
 			xdp_return_frame_rx_napi(frame);
 			drops++;
 		}
 	}
-	spin_unlock(&rcv_priv->xdp_tx_ring.producer_lock);
+	spin_unlock(&rq->xdp_tx_ring.producer_lock);
 
 	if (flags & XDP_XMIT_FLUSH)
-		__veth_xdp_flush(rcv_priv);
+		__veth_xdp_flush(rq);
 
 	return n - drops;
 }
@@ -297,6 +312,7 @@ static void veth_xdp_flush(struct net_device *dev)
 {
 	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
 	struct net_device *rcv;
+	struct veth_rq *rq;
 
 	rcu_read_lock();
 	rcv = rcu_dereference(priv->peer);
@@ -304,11 +320,12 @@ static void veth_xdp_flush(struct net_device *dev)
 		goto out;
 
 	rcv_priv = netdev_priv(rcv);
+	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
 	/* xdp_ring is initialized on receive side? */
-	if (unlikely(!rcu_access_pointer(rcv_priv->xdp_prog)))
+	if (unlikely(!rcu_access_pointer(rq->xdp_prog)))
 		goto out;
 
-	__veth_xdp_flush(rcv_priv);
+	__veth_xdp_flush(rq);
 out:
 	rcu_read_unlock();
 }
@@ -323,7 +340,7 @@ static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
 	return veth_xdp_xmit(dev, 1, &frame, 0);
 }
 
-static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv,
+static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq,
 					struct xdp_frame *frame, bool *xdp_xmit,
 					bool *xdp_redir)
 {
@@ -334,7 +351,7 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv,
 	struct sk_buff *skb;
 
 	rcu_read_lock();
-	xdp_prog = rcu_dereference(priv->xdp_prog);
+	xdp_prog = rcu_dereference(rq->xdp_prog);
 	if (xdp_prog) {
 		struct xdp_buff xdp;
 		u32 act;
@@ -343,7 +360,7 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv,
 		xdp.data = frame->data;
 		xdp.data_end = frame->data + frame->len;
 		xdp.data_meta = frame->data - frame->metasize;
-		xdp.rxq = &priv->xdp_rxq;
+		xdp.rxq = &rq->xdp_rxq;
 
 		act = bpf_prog_run_xdp(xdp_prog, &xdp);
 
@@ -357,8 +374,8 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv,
 			xdp.data_hard_start = frame;
 			xdp.rxq->mem = frame->mem;
 			xdp.rxq->mem.flags |= XDP_MEM_RF_NO_DIRECT;
-			if (unlikely(veth_xdp_tx(priv->dev, &xdp))) {
-				trace_xdp_exception(priv->dev, xdp_prog, act);
+			if (unlikely(veth_xdp_tx(rq->dev, &xdp))) {
+				trace_xdp_exception(rq->dev, xdp_prog, act);
 				frame = &orig_frame;
 				goto err_xdp;
 			}
@@ -370,7 +387,7 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv,
 			xdp.data_hard_start = frame;
 			xdp.rxq->mem = frame->mem;
 			xdp.rxq->mem.flags |= XDP_MEM_RF_NO_DIRECT;
-			if (xdp_do_redirect(priv->dev, &xdp, xdp_prog)) {
+			if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) {
 				frame = &orig_frame;
 				goto err_xdp;
 			}
@@ -380,7 +397,7 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv,
 		default:
 			bpf_warn_invalid_xdp_action(act);
 		case XDP_ABORTED:
-			trace_xdp_exception(priv->dev, xdp_prog, act);
+			trace_xdp_exception(rq->dev, xdp_prog, act);
 		case XDP_DROP:
 			goto err_xdp;
 		}
@@ -395,7 +412,7 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv,
 	}
 
 	memset(frame, 0, sizeof(*frame));
-	skb->protocol = eth_type_trans(skb, priv->dev);
+	skb->protocol = eth_type_trans(skb, rq->dev);
 err:
 	return skb;
 err_xdp:
@@ -405,9 +422,8 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv,
 	return NULL;
 }
 
-static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv,
-					struct sk_buff *skb, bool *xdp_xmit,
-					bool *xdp_redir)
+static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb,
+					bool *xdp_xmit, bool *xdp_redir)
 {
 	u32 pktlen, headroom, act, metalen;
 	void *orig_data, *orig_data_end;
@@ -416,7 +432,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv,
 	struct xdp_buff xdp;
 
 	rcu_read_lock();
-	xdp_prog = rcu_dereference(priv->xdp_prog);
+	xdp_prog = rcu_dereference(rq->xdp_prog);
 	if (!xdp_prog) {
 		rcu_read_unlock();
 		goto out;
@@ -467,7 +483,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv,
 	xdp.data = skb_mac_header(skb);
 	xdp.data_end = xdp.data + pktlen;
 	xdp.data_meta = xdp.data;
-	xdp.rxq = &priv->xdp_rxq;
+	xdp.rxq = &rq->xdp_rxq;
 	orig_data = xdp.data;
 	orig_data_end = xdp.data_end;
 
@@ -479,9 +495,9 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv,
 	case XDP_TX:
 		get_page(virt_to_page(xdp.data));
 		dev_consume_skb_any(skb);
-		xdp.rxq->mem = priv->xdp_mem;
-		if (unlikely(veth_xdp_tx(priv->dev, &xdp))) {
-			trace_xdp_exception(priv->dev, xdp_prog, act);
+		xdp.rxq->mem = rq->xdp_mem;
+		if (unlikely(veth_xdp_tx(rq->dev, &xdp))) {
+			trace_xdp_exception(rq->dev, xdp_prog, act);
 			goto err_xdp;
 		}
 		*xdp_xmit = true;
@@ -490,8 +506,8 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv,
 	case XDP_REDIRECT:
 		get_page(virt_to_page(xdp.data));
 		dev_consume_skb_any(skb);
-		xdp.rxq->mem = priv->xdp_mem;
-		if (xdp_do_redirect(priv->dev, &xdp, xdp_prog))
+		xdp.rxq->mem = rq->xdp_mem;
+		if (xdp_do_redirect(rq->dev, &xdp, xdp_prog))
 			goto err_xdp;
 		*xdp_redir = true;
 		rcu_read_unlock();
@@ -499,7 +515,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv,
 	default:
 		bpf_warn_invalid_xdp_action(act);
 	case XDP_ABORTED:
-		trace_xdp_exception(priv->dev, xdp_prog, act);
+		trace_xdp_exception(rq->dev, xdp_prog, act);
 	case XDP_DROP:
 		goto drop;
 	}
@@ -515,7 +531,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv,
 	off = xdp.data_end - orig_data_end;
 	if (off != 0)
 		__skb_put(skb, off);
-	skb->protocol = eth_type_trans(skb, priv->dev);
+	skb->protocol = eth_type_trans(skb, rq->dev);
 
 	metalen = xdp.data - xdp.data_meta;
 	if (metalen)
@@ -533,7 +549,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv,
 	return NULL;
 }
 
-static int veth_xdp_rcv(struct veth_priv *priv, int budget, bool *xdp_xmit,
+static int veth_xdp_rcv(struct veth_rq *rq, int budget, bool *xdp_xmit,
 			bool *xdp_redir)
 {
 	int done = 0;
@@ -551,15 +567,15 @@ static int veth_xdp_rcv(struct veth_priv *priv, int budget, bool *xdp_xmit,
 			struct xdp_frame *frame;
 			struct sk_buff *skb;
 
-			frame = __ptr_ring_consume(&priv->xdp_tx_ring);
+			frame = __ptr_ring_consume(&rq->xdp_tx_ring);
 			if (!frame) {
 				curr_more = false;
 				break;
 			}
 
-			skb = veth_xdp_rcv_one(priv, frame, xdp_xmit, xdp_redir);
+			skb = veth_xdp_rcv_one(rq, frame, xdp_xmit, xdp_redir);
 			if (skb)
-				napi_gro_receive(&priv->xdp_napi, skb);
+				napi_gro_receive(&rq->xdp_napi, skb);
 
 			done++;
 		}
@@ -568,16 +584,16 @@ static int veth_xdp_rcv(struct veth_priv *priv, int budget, bool *xdp_xmit,
 		curr_more = true;
 		curr_budget = min(budget - done, budget >> 1);
 		for (i = 0; i < curr_budget; i++) {
-			struct sk_buff *skb = __ptr_ring_consume(&priv->xdp_ring);
+			struct sk_buff *skb = __ptr_ring_consume(&rq->xdp_ring);
 
 			if (!skb) {
 				curr_more = false;
 				break;
 			}
 
-			skb = veth_xdp_rcv_skb(priv, skb, xdp_xmit, xdp_redir);
+			skb = veth_xdp_rcv_skb(rq, skb, xdp_xmit, xdp_redir);
 			if (skb)
-				napi_gro_receive(&priv->xdp_napi, skb);
+				napi_gro_receive(&rq->xdp_napi, skb);
 
 			done++;
 		}
@@ -589,26 +605,26 @@ static int veth_xdp_rcv(struct veth_priv *priv, int budget, bool *xdp_xmit,
 
 static int veth_poll(struct napi_struct *napi, int budget)
 {
-	struct veth_priv *priv =
-		container_of(napi, struct veth_priv, xdp_napi);
+	struct veth_rq *rq =
+		container_of(napi, struct veth_rq, xdp_napi);
 	bool xdp_xmit = false;
 	bool xdp_redir = false;
 	int done;
 
-	done = veth_xdp_rcv(priv, budget, &xdp_xmit, &xdp_redir);
+	done = veth_xdp_rcv(rq, budget, &xdp_xmit, &xdp_redir);
 
 	if (done < budget && napi_complete_done(napi, done)) {
 		/* Write rx_notify_masked before reading ptr_ring */
-		smp_store_mb(priv->rx_notify_masked, false);
-		if (unlikely(!__ptr_ring_empty(&priv->xdp_tx_ring) ||
-			     !__ptr_ring_empty(&priv->xdp_ring))) {
-			priv->rx_notify_masked = true;
-			napi_schedule(&priv->xdp_napi);
+		smp_store_mb(rq->rx_notify_masked, false);
+		if (unlikely(!__ptr_ring_empty(&rq->xdp_tx_ring) ||
+			     !__ptr_ring_empty(&rq->xdp_ring))) {
+			rq->rx_notify_masked = true;
+			napi_schedule(&rq->xdp_napi);
 		}
 	}
 
 	if (xdp_xmit)
-		veth_xdp_flush(priv->dev);
+		veth_xdp_flush(rq->dev);
 	if (xdp_redir)
 		xdp_do_flush_map();
 
@@ -618,22 +634,36 @@ static int veth_poll(struct napi_struct *napi, int budget)
 static int veth_napi_add(struct net_device *dev)
 {
 	struct veth_priv *priv = netdev_priv(dev);
-	int err;
+	int err, i;
 
-	err = ptr_ring_init(&priv->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
-	if (err)
-		return err;
+	for (i = 0; i < dev->real_num_rx_queues; i++) {
+		struct veth_rq *rq = &priv->rq[i];
 
-	err = ptr_ring_init(&priv->xdp_tx_ring, VETH_RING_SIZE, GFP_KERNEL);
-	if (err)
-		goto err_xdp_tx_ring;
+		err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
+		if (err)
+			goto err_xdp_ring;
+
+		err = ptr_ring_init(&rq->xdp_tx_ring, VETH_RING_SIZE,
+				    GFP_KERNEL);
+		if (err)
+			goto err_xdp_tx_ring;
+	}
+
+	for (i = 0; i < dev->real_num_rx_queues; i++) {
+		struct veth_rq *rq = &priv->rq[i];
 
-	netif_napi_add(dev, &priv->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
-	napi_enable(&priv->xdp_napi);
+		netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
+		napi_enable(&rq->xdp_napi);
+	}
 
 	return 0;
 err_xdp_tx_ring:
-	ptr_ring_cleanup(&priv->xdp_ring, __skb_array_destroy_skb);
+	ptr_ring_cleanup(&priv->rq[i].xdp_ring, __skb_array_destroy_skb);
+err_xdp_ring:
+	for (i--; i >= 0; i--) {
+		ptr_ring_cleanup(&priv->rq[i].xdp_ring, __skb_array_destroy_skb);
+		ptr_ring_cleanup(&priv->rq[i].xdp_tx_ring, veth_xdp_free);
+	}
 
 	return err;
 }
@@ -641,37 +671,56 @@ static int veth_napi_add(struct net_device *dev)
 static void veth_napi_del(struct net_device *dev)
 {
 	struct veth_priv *priv = netdev_priv(dev);
+	int i;
 
-	napi_disable(&priv->xdp_napi);
-	netif_napi_del(&priv->xdp_napi);
-	ptr_ring_cleanup(&priv->xdp_ring, __skb_array_destroy_skb);
-	ptr_ring_cleanup(&priv->xdp_tx_ring, veth_xdp_free);
+	for (i = 0; i < dev->real_num_rx_queues; i++) {
+		struct veth_rq *rq = &priv->rq[i];
+
+		napi_disable(&rq->xdp_napi);
+		napi_hash_del(&rq->xdp_napi);
+	}
+	synchronize_net();
+
+	for (i = 0; i < dev->real_num_rx_queues; i++) {
+		struct veth_rq *rq = &priv->rq[i];
+
+		netif_napi_del(&rq->xdp_napi);
+		ptr_ring_cleanup(&rq->xdp_ring, __skb_array_destroy_skb);
+		ptr_ring_cleanup(&rq->xdp_tx_ring, veth_xdp_free);
+	}
 }
 
 static int veth_enable_xdp(struct net_device *dev)
 {
 	struct veth_priv *priv = netdev_priv(dev);
-	int err;
+	int err, i;
 
-	err = xdp_rxq_info_reg(&priv->xdp_rxq, dev, 0);
-	if (err < 0)
-		return err;
+	for (i = 0; i < dev->real_num_rx_queues; i++) {
+		struct veth_rq *rq = &priv->rq[i];
 
-	err = xdp_rxq_info_reg_mem_model(&priv->xdp_rxq,
-					 MEM_TYPE_PAGE_SHARED, NULL);
-	if (err < 0)
-		goto err;
+		err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i);
+		if (err < 0)
+			goto err_rxq_reg;
+
+		err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
+						 MEM_TYPE_PAGE_SHARED, NULL);
+		if (err < 0)
+			goto err_reg_mem;
 
-	/* Save original mem info as it can be overwritten */
-	priv->xdp_mem = priv->xdp_rxq.mem;
+		/* Save original mem info as it can be overwritten */
+		rq->xdp_mem = rq->xdp_rxq.mem;
+	}
 
 	err = veth_napi_add(dev);
 	if (err)
-		goto err;
+		goto err_rxq_reg;
 
 	return 0;
-err:
-	xdp_rxq_info_unreg(&priv->xdp_rxq);
+err_reg_mem:
+	xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
+err_rxq_reg:
+	for (i--; i >= 0; i--)
+		xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
 
 	return err;
 }
@@ -679,10 +728,15 @@ static int veth_enable_xdp(struct net_device *dev)
 static void veth_disable_xdp(struct net_device *dev)
 {
 	struct veth_priv *priv = netdev_priv(dev);
+	int i;
 
 	veth_napi_del(dev);
-	priv->xdp_rxq.mem = priv->xdp_mem;
-	xdp_rxq_info_unreg(&priv->xdp_rxq);
+	for (i = 0; i < dev->real_num_rx_queues; i++) {
+		struct veth_rq *rq = &priv->rq[i];
+
+		rq->xdp_rxq.mem = rq->xdp_mem;
+		xdp_rxq_info_unreg(&rq->xdp_rxq);
+	}
 }
 
 static int veth_open(struct net_device *dev)
@@ -694,7 +748,7 @@ static int veth_open(struct net_device *dev)
 	if (!peer)
 		return -ENOTCONN;
 
-	if (rtnl_dereference(priv->xdp_prog)) {
+	if (rtnl_dereference(priv->rq[0].xdp_prog)) {
 		err = veth_enable_xdp(dev);
 		if (err)
 			return err;
@@ -717,7 +771,7 @@ static int veth_close(struct net_device *dev)
 	if (peer)
 		netif_carrier_off(peer);
 
-	if (rtnl_dereference(priv->xdp_prog))
+	if (rtnl_dereference(priv->rq[0].xdp_prog))
 		veth_disable_xdp(dev);
 
 	return 0;
@@ -780,7 +834,7 @@ static netdev_features_t veth_fix_features(struct net_device *dev,
 	if (peer) {
 		struct veth_priv *peer_priv = netdev_priv(peer);
 
-		if (rtnl_dereference(peer_priv->xdp_prog))
+		if (rtnl_dereference(peer_priv->rq[0].xdp_prog))
 			features &= ~NETIF_F_GSO_SOFTWARE;
 	}
 
@@ -816,9 +870,9 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
 	struct veth_priv *priv = netdev_priv(dev);
 	struct bpf_prog *old_prog;
 	struct net_device *peer;
-	int err;
+	int err, i;
 
-	old_prog = rtnl_dereference(priv->xdp_prog);
+	old_prog = rtnl_dereference(priv->rq[0].xdp_prog);
 	peer = rtnl_dereference(priv->peer);
 
 	if (prog) {
@@ -826,6 +880,9 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
 			return -ENOTCONN;
 
 		if (!old_prog) {
+			if (dev->real_num_rx_queues < peer->real_num_tx_queues)
+				return -ENOSPC;
+
 			if (dev->flags & IFF_UP) {
 				err = veth_enable_xdp(dev);
 				if (err)
@@ -841,7 +898,8 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
 		}
 	}
 
-	rcu_assign_pointer(priv->xdp_prog, prog);
+	for (i = 0; i < dev->real_num_rx_queues; i++)
+		rcu_assign_pointer(priv->rq[i].xdp_prog, prog);
 
 	if (old_prog) {
 		bpf_prog_put(old_prog);
@@ -867,7 +925,7 @@ static u32 veth_xdp_query(struct net_device *dev)
 	struct veth_priv *priv = netdev_priv(dev);
 	const struct bpf_prog *xdp_prog;
 
-	xdp_prog = rtnl_dereference(priv->xdp_prog);
+	xdp_prog = rtnl_dereference(priv->rq[0].xdp_prog);
 	if (xdp_prog)
 		return xdp_prog->aux->id;
 
@@ -960,13 +1018,31 @@ static int veth_validate(struct nlattr *tb[], struct nlattr *data[],
 	return 0;
 }
 
+static int veth_alloc_queues(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+
+	priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL);
+	if (!priv->rq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void veth_free_queues(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+
+	kfree(priv->rq);
+}
+
 static struct rtnl_link_ops veth_link_ops;
 
 static int veth_newlink(struct net *src_net, struct net_device *dev,
 			struct nlattr *tb[], struct nlattr *data[],
 			struct netlink_ext_ack *extack)
 {
-	int err;
+	int err, i;
 	struct net_device *peer;
 	struct veth_priv *priv;
 	char ifname[IFNAMSIZ];
@@ -1019,6 +1095,12 @@ static int veth_newlink(struct net *src_net, struct net_device *dev,
 		return PTR_ERR(peer);
 	}
 
+	err = veth_alloc_queues(peer);
+	if (err) {
+		put_net(net);
+		goto err_peer_alloc_queues;
+	}
+
 	if (!ifmp || !tbp[IFLA_ADDRESS])
 		eth_hw_addr_random(peer);
 
@@ -1047,6 +1129,10 @@ static int veth_newlink(struct net *src_net, struct net_device *dev,
 	 * should be re-allocated
 	 */
 
+	err = veth_alloc_queues(dev);
+	if (err)
+		goto err_alloc_queues;
+
 	if (tb[IFLA_ADDRESS] == NULL)
 		eth_hw_addr_random(dev);
 
@@ -1066,22 +1152,28 @@ static int veth_newlink(struct net *src_net, struct net_device *dev,
 	 */
 
 	priv = netdev_priv(dev);
-	priv->dev = dev;
+	for (i = 0; i < dev->real_num_rx_queues; i++)
+		priv->rq[i].dev = dev;
 	rcu_assign_pointer(priv->peer, peer);
 
 	priv = netdev_priv(peer);
-	priv->dev = peer;
+	for (i = 0; i < peer->real_num_rx_queues; i++)
+		priv->rq[i].dev = peer;
 	rcu_assign_pointer(priv->peer, dev);
 
 	return 0;
 
 err_register_dev:
+	veth_free_queues(dev);
+err_alloc_queues:
 	/* nothing to do */
 err_configure_peer:
 	unregister_netdevice(peer);
 	return err;
 
 err_register_peer:
+	veth_free_queues(peer);
+err_peer_alloc_queues:
 	free_netdev(peer);
 	return err;
 }
-- 
2.14.3

^ permalink raw reply related

* [PATCH RFC v2 7/9] veth: Add XDP TX and REDIRECT
From: Toshiaki Makita @ 2018-06-10 16:02 UTC (permalink / raw)
  To: netdev
  Cc: Toshiaki Makita, Jesper Dangaard Brouer, Alexei Starovoitov,
	Daniel Borkmann
In-Reply-To: <20180610160217.3146-1-toshiaki.makita1@gmail.com>

From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>

This allows further redirection of xdp_frames like

 NIC   -> veth--veth -> veth--veth
 (XDP)          (XDP)         (XDP)

The intermediate XDP, redirecting packets from NIC to the other veth,
reuses xdp_mem_info from NIC so that page recycling of the NIC works on
the destination veth's XDP.
In this way return_frame is not fully guarded by NAPI, since another
NAPI handler on another cpu may use the same xdp_mem_info concurrently.
Thus disable napi_direct using the XDP_MEM_RF_NO_DIRECT flag.

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
---
 drivers/net/veth.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 103 insertions(+), 7 deletions(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index b809d609a642..a47e1ba7d7e6 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -44,6 +44,7 @@ struct veth_priv {
 	struct bpf_prog __rcu	*xdp_prog;
 	struct net_device __rcu	*peer;
 	atomic64_t		dropped;
+	struct xdp_mem_info	xdp_mem;
 	unsigned		requested_headroom;
 	bool			rx_notify_masked;
 	struct ptr_ring		xdp_ring;
@@ -292,10 +293,42 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
 	return n - drops;
 }
 
+static void veth_xdp_flush(struct net_device *dev)
+{
+	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
+	struct net_device *rcv;
+
+	rcu_read_lock();
+	rcv = rcu_dereference(priv->peer);
+	if (unlikely(!rcv))
+		goto out;
+
+	rcv_priv = netdev_priv(rcv);
+	/* xdp_ring is initialized on receive side? */
+	if (unlikely(!rcu_access_pointer(rcv_priv->xdp_prog)))
+		goto out;
+
+	__veth_xdp_flush(rcv_priv);
+out:
+	rcu_read_unlock();
+}
+
+static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
+{
+	struct xdp_frame *frame = convert_to_xdp_frame(xdp);
+
+	if (unlikely(!frame))
+		return -EOVERFLOW;
+
+	return veth_xdp_xmit(dev, 1, &frame, 0);
+}
+
 static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv,
-					struct xdp_frame *frame)
+					struct xdp_frame *frame, bool *xdp_xmit,
+					bool *xdp_redir)
 {
 	int len = frame->len, delta = 0;
+	struct xdp_frame orig_frame;
 	struct bpf_prog *xdp_prog;
 	unsigned int headroom;
 	struct sk_buff *skb;
@@ -319,6 +352,31 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv,
 			delta = frame->data - xdp.data;
 			len = xdp.data_end - xdp.data;
 			break;
+		case XDP_TX:
+			orig_frame = *frame;
+			xdp.data_hard_start = frame;
+			xdp.rxq->mem = frame->mem;
+			xdp.rxq->mem.flags |= XDP_MEM_RF_NO_DIRECT;
+			if (unlikely(veth_xdp_tx(priv->dev, &xdp))) {
+				trace_xdp_exception(priv->dev, xdp_prog, act);
+				frame = &orig_frame;
+				goto err_xdp;
+			}
+			*xdp_xmit = true;
+			rcu_read_unlock();
+			goto xdp_xmit;
+		case XDP_REDIRECT:
+			orig_frame = *frame;
+			xdp.data_hard_start = frame;
+			xdp.rxq->mem = frame->mem;
+			xdp.rxq->mem.flags |= XDP_MEM_RF_NO_DIRECT;
+			if (xdp_do_redirect(priv->dev, &xdp, xdp_prog)) {
+				frame = &orig_frame;
+				goto err_xdp;
+			}
+			*xdp_redir = true;
+			rcu_read_unlock();
+			goto xdp_xmit;
 		default:
 			bpf_warn_invalid_xdp_action(act);
 		case XDP_ABORTED:
@@ -343,12 +401,13 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv,
 err_xdp:
 	rcu_read_unlock();
 	xdp_return_frame(frame);
-
+xdp_xmit:
 	return NULL;
 }
 
 static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv,
-					struct sk_buff *skb)
+					struct sk_buff *skb, bool *xdp_xmit,
+					bool *xdp_redir)
 {
 	u32 pktlen, headroom, act, metalen;
 	void *orig_data, *orig_data_end;
@@ -417,6 +476,26 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv,
 	switch (act) {
 	case XDP_PASS:
 		break;
+	case XDP_TX:
+		get_page(virt_to_page(xdp.data));
+		dev_consume_skb_any(skb);
+		xdp.rxq->mem = priv->xdp_mem;
+		if (unlikely(veth_xdp_tx(priv->dev, &xdp))) {
+			trace_xdp_exception(priv->dev, xdp_prog, act);
+			goto err_xdp;
+		}
+		*xdp_xmit = true;
+		rcu_read_unlock();
+		goto xdp_xmit;
+	case XDP_REDIRECT:
+		get_page(virt_to_page(xdp.data));
+		dev_consume_skb_any(skb);
+		xdp.rxq->mem = priv->xdp_mem;
+		if (xdp_do_redirect(priv->dev, &xdp, xdp_prog))
+			goto err_xdp;
+		*xdp_redir = true;
+		rcu_read_unlock();
+		goto xdp_xmit;
 	default:
 		bpf_warn_invalid_xdp_action(act);
 	case XDP_ABORTED:
@@ -447,9 +526,15 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv,
 	rcu_read_unlock();
 	dev_kfree_skb_any(skb);
 	return NULL;
+err_xdp:
+	rcu_read_unlock();
+	page_frag_free(xdp.data);
+xdp_xmit:
+	return NULL;
 }
 
-static int veth_xdp_rcv(struct veth_priv *priv, int budget)
+static int veth_xdp_rcv(struct veth_priv *priv, int budget, bool *xdp_xmit,
+			bool *xdp_redir)
 {
 	int done = 0;
 	bool more;
@@ -472,7 +557,7 @@ static int veth_xdp_rcv(struct veth_priv *priv, int budget)
 				break;
 			}
 
-			skb = veth_xdp_rcv_one(priv, frame);
+			skb = veth_xdp_rcv_one(priv, frame, xdp_xmit, xdp_redir);
 			if (skb)
 				napi_gro_receive(&priv->xdp_napi, skb);
 
@@ -490,7 +575,7 @@ static int veth_xdp_rcv(struct veth_priv *priv, int budget)
 				break;
 			}
 
-			skb = veth_xdp_rcv_skb(priv, skb);
+			skb = veth_xdp_rcv_skb(priv, skb, xdp_xmit, xdp_redir);
 			if (skb)
 				napi_gro_receive(&priv->xdp_napi, skb);
 
@@ -506,9 +591,11 @@ static int veth_poll(struct napi_struct *napi, int budget)
 {
 	struct veth_priv *priv =
 		container_of(napi, struct veth_priv, xdp_napi);
+	bool xdp_xmit = false;
+	bool xdp_redir = false;
 	int done;
 
-	done = veth_xdp_rcv(priv, budget);
+	done = veth_xdp_rcv(priv, budget, &xdp_xmit, &xdp_redir);
 
 	if (done < budget && napi_complete_done(napi, done)) {
 		/* Write rx_notify_masked before reading ptr_ring */
@@ -520,6 +607,11 @@ static int veth_poll(struct napi_struct *napi, int budget)
 		}
 	}
 
+	if (xdp_xmit)
+		veth_xdp_flush(priv->dev);
+	if (xdp_redir)
+		xdp_do_flush_map();
+
 	return done;
 }
 
@@ -570,6 +662,9 @@ static int veth_enable_xdp(struct net_device *dev)
 	if (err < 0)
 		goto err;
 
+	/* Save original mem info as it can be overwritten */
+	priv->xdp_mem = priv->xdp_rxq.mem;
+
 	err = veth_napi_add(dev);
 	if (err)
 		goto err;
@@ -586,6 +681,7 @@ static void veth_disable_xdp(struct net_device *dev)
 	struct veth_priv *priv = netdev_priv(dev);
 
 	veth_napi_del(dev);
+	priv->xdp_rxq.mem = priv->xdp_mem;
 	xdp_rxq_info_unreg(&priv->xdp_rxq);
 }
 
-- 
2.14.3

^ permalink raw reply related

* [PATCH RFC v2 6/9] xdp: Add a flag for disabling napi_direct of xdp_return_frame in xdp_mem_info
From: Toshiaki Makita @ 2018-06-10 16:02 UTC (permalink / raw)
  To: netdev
  Cc: Toshiaki Makita, Jesper Dangaard Brouer, Alexei Starovoitov,
	Daniel Borkmann
In-Reply-To: <20180610160217.3146-1-toshiaki.makita1@gmail.com>

From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>

We need some mechanism to disable napi_direct on calling
xdp_return_frame_rx_napi() from some context.
When veth gets support of XDP_REDIRECT, it will redirect packets which
are redirected from other devices. On redirection veth will reuse
xdp_mem_info of the redirection source device to make return_frame work.
But in this case .ndo_xdp_xmit() called from veth redirection uses
xdp_mem_info which is not guarded by NAPI, because the .ndo_xdp_xmit is
not called directly from the rxq which owns the xdp_mem_info.

This approach introduces a flag in xdp_mem_info to indicate that
napi_direct should be disabled even when _rx_napi variant is used.

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
---
 include/net/xdp.h | 4 ++++
 net/core/xdp.c    | 6 ++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 2deea7166a34..ea0c80f6c8ee 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -41,6 +41,9 @@ enum xdp_mem_type {
 	MEM_TYPE_MAX,
 };
 
+/* XDP flags for xdp_mem_info */
+#define XDP_MEM_RF_NO_DIRECT	BIT(0)	/* don't use napi_direct */
+
 /* XDP flags for ndo_xdp_xmit */
 #define XDP_XMIT_FLUSH		(1U << 0)	/* doorbell signal consumer */
 #define XDP_XMIT_FLAGS_MASK	XDP_XMIT_FLUSH
@@ -48,6 +51,7 @@ enum xdp_mem_type {
 struct xdp_mem_info {
 	u32 type; /* enum xdp_mem_type, but known size type */
 	u32 id;
+	u32 flags;
 };
 
 struct page_pool;
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 9d1f22072d5d..e94f146360b2 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -327,10 +327,12 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
 		/* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
 		xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
 		page = virt_to_head_page(data);
-		if (xa)
+		if (xa) {
+			napi_direct &= !(mem->flags & XDP_MEM_RF_NO_DIRECT);
 			page_pool_put_page(xa->page_pool, page, napi_direct);
-		else
+		} else {
 			put_page(page);
+		}
 		rcu_read_unlock();
 		break;
 	case MEM_TYPE_PAGE_SHARED:
-- 
2.14.3

^ permalink raw reply related

* [PATCH RFC v2 5/9] veth: Add ndo_xdp_xmit
From: Toshiaki Makita @ 2018-06-10 16:02 UTC (permalink / raw)
  To: netdev
  Cc: Toshiaki Makita, Jesper Dangaard Brouer, Alexei Starovoitov,
	Daniel Borkmann
In-Reply-To: <20180610160217.3146-1-toshiaki.makita1@gmail.com>

From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>

This allows NIC's XDP to redirect packets to veth. The destination veth
device enqueues redirected packets to the napi ring of its peer, then
they are processed by XDP on its peer veth device.
This can be thought of as calling another XDP program from an XDP program using
REDIRECT, when the peer enables driver XDP.

Note that when the peer veth device does not set driver xdp, redirected
packets will be dropped because the peer is not ready for NAPI.

v2:
- Drop the part converting xdp_frame into skb when XDP is not enabled.
- Implement bulk interface of ndo_xdp_xmit.
- Implement XDP_XMIT_FLUSH bit and drop ndo_xdp_flush.

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
---
 drivers/net/veth.c     | 39 +++++++++++++++++++++++++++++++++++++++
 include/linux/filter.h | 16 ++++++++++++++++
 net/core/filter.c      | 11 +----------
 3 files changed, 56 insertions(+), 10 deletions(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index cb3fa558fbe0..b809d609a642 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -17,6 +17,7 @@
 #include <net/rtnetlink.h>
 #include <net/dst.h>
 #include <net/xfrm.h>
+#include <net/xdp.h>
 #include <linux/veth.h>
 #include <linux/module.h>
 #include <linux/bpf.h>
@@ -254,6 +255,43 @@ static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
 	return skb;
 }
 
+static int veth_xdp_xmit(struct net_device *dev, int n,
+			 struct xdp_frame **frames, u32 flags)
+{
+	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
+	struct net_device *rcv;
+	int i, drops = 0;
+
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
+		return -EINVAL;
+
+	rcv = rcu_dereference(priv->peer);
+	if (unlikely(!rcv))
+		return -ENXIO;
+
+	rcv_priv = netdev_priv(rcv);
+	/* xdp_ring is initialized on receive side? */
+	if (!rcu_access_pointer(rcv_priv->xdp_prog))
+		return -ENXIO;
+
+	spin_lock(&rcv_priv->xdp_tx_ring.producer_lock);
+	for (i = 0; i < n; i++) {
+		struct xdp_frame *frame = frames[i];
+
+		if (unlikely(xdp_ok_fwd_dev(rcv, frame->len) ||
+			     __ptr_ring_produce(&rcv_priv->xdp_tx_ring, frame))) {
+			xdp_return_frame_rx_napi(frame);
+			drops++;
+		}
+	}
+	spin_unlock(&rcv_priv->xdp_tx_ring.producer_lock);
+
+	if (flags & XDP_XMIT_FLUSH)
+		__veth_xdp_flush(rcv_priv);
+
+	return n - drops;
+}
+
 static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv,
 					struct xdp_frame *frame)
 {
@@ -770,6 +808,7 @@ static const struct net_device_ops veth_netdev_ops = {
 	.ndo_features_check	= passthru_features_check,
 	.ndo_set_rx_headroom	= veth_set_rx_headroom,
 	.ndo_bpf		= veth_xdp,
+	.ndo_xdp_xmit		= veth_xdp_xmit,
 };
 
 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 45fc0f5000d8..12777eb70b40 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -19,6 +19,7 @@
 #include <linux/cryptohash.h>
 #include <linux/set_memory.h>
 #include <linux/kallsyms.h>
+#include <linux/if_vlan.h>
 
 #include <net/sch_generic.h>
 
@@ -786,6 +787,21 @@ static inline bool bpf_dump_raw_ok(void)
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 				       const struct bpf_insn *patch, u32 len);
 
+static __always_inline int
+xdp_ok_fwd_dev(const struct net_device *fwd, unsigned int pktlen)
+{
+	unsigned int len;
+
+	if (unlikely(!(fwd->flags & IFF_UP)))
+		return -ENETDOWN;
+
+	len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN;
+	if (pktlen > len)
+		return -EMSGSIZE;
+
+	return 0;
+}
+
 /* The pair of xdp_do_redirect and xdp_do_flush_map MUST be called in the
  * same cpu context. Further for best results no more than a single map
  * for the do_redirect/do_flush pair should be used. This limitation is
diff --git a/net/core/filter.c b/net/core/filter.c
index 3d9ba7e5965a..05d9e84566a4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3216,16 +3216,7 @@ EXPORT_SYMBOL_GPL(xdp_do_redirect);
 
 static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd)
 {
-	unsigned int len;
-
-	if (unlikely(!(fwd->flags & IFF_UP)))
-		return -ENETDOWN;
-
-	len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN;
-	if (skb->len > len)
-		return -EMSGSIZE;
-
-	return 0;
+	return xdp_ok_fwd_dev(fwd, skb->len);
 }
 
 static int xdp_do_generic_redirect_map(struct net_device *dev,
-- 
2.14.3

^ permalink raw reply related

* [PATCH RFC v2 4/9] veth: Add another napi ring for ndo_xdp_xmit and handle xdp_frames
From: Toshiaki Makita @ 2018-06-10 16:02 UTC (permalink / raw)
  To: netdev
  Cc: Toshiaki Makita, Jesper Dangaard Brouer, Alexei Starovoitov,
	Daniel Borkmann
In-Reply-To: <20180610160217.3146-1-toshiaki.makita1@gmail.com>

From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>

This is preparation for XDP TX and ndo_xdp_xmit.
Add another napi ring and handle redirected xdp_frames through it.

v2:
- Use another ring instead of using flag to differentiate skb and
  xdp_frame. This approach makes bulk skb transmit possible in
  veth_xmit later.
- Clear xdp_frame fields in skb->head.
- Implement adjust_tail.

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
---
 drivers/net/veth.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 114 insertions(+), 11 deletions(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 88d349da72cc..cb3fa558fbe0 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -46,6 +46,7 @@ struct veth_priv {
 	unsigned		requested_headroom;
 	bool			rx_notify_masked;
 	struct ptr_ring		xdp_ring;
+	struct ptr_ring		xdp_tx_ring;
 	struct xdp_rxq_info	xdp_rxq;
 };
 
@@ -114,6 +115,11 @@ static const struct ethtool_ops veth_ethtool_ops = {
 
 /* general routines */
 
+static void veth_xdp_free(void *frame)
+{
+	xdp_return_frame(frame);
+}
+
 static void __veth_xdp_flush(struct veth_priv *priv)
 {
 	/* Write ptr_ring before reading rx_notify_masked */
@@ -248,6 +254,61 @@ static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
 	return skb;
 }
 
+static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv,
+					struct xdp_frame *frame)
+{
+	int len = frame->len, delta = 0;
+	struct bpf_prog *xdp_prog;
+	unsigned int headroom;
+	struct sk_buff *skb;
+
+	rcu_read_lock();
+	xdp_prog = rcu_dereference(priv->xdp_prog);
+	if (xdp_prog) {
+		struct xdp_buff xdp;
+		u32 act;
+
+		xdp.data_hard_start = frame->data - frame->headroom;
+		xdp.data = frame->data;
+		xdp.data_end = frame->data + frame->len;
+		xdp.data_meta = frame->data - frame->metasize;
+		xdp.rxq = &priv->xdp_rxq;
+
+		act = bpf_prog_run_xdp(xdp_prog, &xdp);
+
+		switch (act) {
+		case XDP_PASS:
+			delta = frame->data - xdp.data;
+			len = xdp.data_end - xdp.data;
+			break;
+		default:
+			bpf_warn_invalid_xdp_action(act);
+		case XDP_ABORTED:
+			trace_xdp_exception(priv->dev, xdp_prog, act);
+		case XDP_DROP:
+			goto err_xdp;
+		}
+	}
+	rcu_read_unlock();
+
+	headroom = frame->data - delta - (void *)frame;
+	skb = veth_build_skb(frame, headroom, len, 0);
+	if (!skb) {
+		xdp_return_frame(frame);
+		goto err;
+	}
+
+	memset(frame, 0, sizeof(*frame));
+	skb->protocol = eth_type_trans(skb, priv->dev);
+err:
+	return skb;
+err_xdp:
+	rcu_read_unlock();
+	xdp_return_frame(frame);
+
+	return NULL;
+}
+
 static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv,
 					struct sk_buff *skb)
 {
@@ -352,21 +413,53 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv,
 
 static int veth_xdp_rcv(struct veth_priv *priv, int budget)
 {
-	int i, done = 0;
+	int done = 0;
+	bool more;
 
-	for (i = 0; i < budget; i++) {
-		struct sk_buff *skb = __ptr_ring_consume(&priv->xdp_ring);
+	do {
+		int curr_budget, i;
+		bool curr_more;
 
-		if (!skb)
-			break;
+		more = false;
 
-		skb = veth_xdp_rcv_skb(priv, skb);
+		curr_more = true;
+		curr_budget = min(budget - done, budget >> 1);
+		for (i = 0; i < curr_budget; i++) {
+			struct xdp_frame *frame;
+			struct sk_buff *skb;
 
-		if (skb)
-			napi_gro_receive(&priv->xdp_napi, skb);
+			frame = __ptr_ring_consume(&priv->xdp_tx_ring);
+			if (!frame) {
+				curr_more = false;
+				break;
+			}
 
-		done++;
-	}
+			skb = veth_xdp_rcv_one(priv, frame);
+			if (skb)
+				napi_gro_receive(&priv->xdp_napi, skb);
+
+			done++;
+		}
+		more |= curr_more;
+
+		curr_more = true;
+		curr_budget = min(budget - done, budget >> 1);
+		for (i = 0; i < curr_budget; i++) {
+			struct sk_buff *skb = __ptr_ring_consume(&priv->xdp_ring);
+
+			if (!skb) {
+				curr_more = false;
+				break;
+			}
+
+			skb = veth_xdp_rcv_skb(priv, skb);
+			if (skb)
+				napi_gro_receive(&priv->xdp_napi, skb);
+
+			done++;
+		}
+		more |= curr_more;
+	} while (more && done < budget);
 
 	return done;
 }
@@ -382,7 +475,8 @@ static int veth_poll(struct napi_struct *napi, int budget)
 	if (done < budget && napi_complete_done(napi, done)) {
 		/* Write rx_notify_masked before reading ptr_ring */
 		smp_store_mb(priv->rx_notify_masked, false);
-		if (unlikely(!__ptr_ring_empty(&priv->xdp_ring))) {
+		if (unlikely(!__ptr_ring_empty(&priv->xdp_tx_ring) ||
+			     !__ptr_ring_empty(&priv->xdp_ring))) {
 			priv->rx_notify_masked = true;
 			napi_schedule(&priv->xdp_napi);
 		}
@@ -400,10 +494,18 @@ static int veth_napi_add(struct net_device *dev)
 	if (err)
 		return err;
 
+	err = ptr_ring_init(&priv->xdp_tx_ring, VETH_RING_SIZE, GFP_KERNEL);
+	if (err)
+		goto err_xdp_tx_ring;
+
 	netif_napi_add(dev, &priv->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
 	napi_enable(&priv->xdp_napi);
 
 	return 0;
+err_xdp_tx_ring:
+	ptr_ring_cleanup(&priv->xdp_ring, __skb_array_destroy_skb);
+
+	return err;
 }
 
 static void veth_napi_del(struct net_device *dev)
@@ -413,6 +515,7 @@ static void veth_napi_del(struct net_device *dev)
 	napi_disable(&priv->xdp_napi);
 	netif_napi_del(&priv->xdp_napi);
 	ptr_ring_cleanup(&priv->xdp_ring, __skb_array_destroy_skb);
+	ptr_ring_cleanup(&priv->xdp_tx_ring, veth_xdp_free);
 }
 
 static int veth_enable_xdp(struct net_device *dev)
-- 
2.14.3

^ permalink raw reply related

* [PATCH RFC v2 3/9] veth: Avoid drops by oversized packets when XDP is enabled
From: Toshiaki Makita @ 2018-06-10 16:02 UTC (permalink / raw)
  To: netdev
  Cc: Toshiaki Makita, Jesper Dangaard Brouer, Alexei Starovoitov,
	Daniel Borkmann
In-Reply-To: <20180610160217.3146-1-toshiaki.makita1@gmail.com>

From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>

All oversized packets including GSO packets are dropped if XDP is
enabled on receiver side, so don't send such packets from peer.

Drop TSO and SCTP fragmentation features so that veth devices themselves
segment packets with XDP enabled. Also cap MTU accordingly.

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
---
 drivers/net/veth.c | 49 +++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 43 insertions(+), 6 deletions(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 317ec92cf816..88d349da72cc 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -533,6 +533,23 @@ static int veth_get_iflink(const struct net_device *dev)
 	return iflink;
 }
 
+static netdev_features_t veth_fix_features(struct net_device *dev,
+					   netdev_features_t features)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	struct net_device *peer;
+
+	peer = rtnl_dereference(priv->peer);
+	if (peer) {
+		struct veth_priv *peer_priv = netdev_priv(peer);
+
+		if (rtnl_dereference(peer_priv->xdp_prog))
+			features &= ~NETIF_F_GSO_SOFTWARE;
+	}
+
+	return features;
+}
+
 static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
 {
 	struct veth_priv *peer_priv, *priv = netdev_priv(dev);
@@ -571,10 +588,19 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
 		if (!peer)
 			return -ENOTCONN;
 
-		if (!old_prog && dev->flags & IFF_UP) {
-			err = veth_enable_xdp(dev);
-			if (err)
-				return err;
+		if (!old_prog) {
+			if (dev->flags & IFF_UP) {
+				err = veth_enable_xdp(dev);
+				if (err)
+					return err;
+			}
+
+			peer->hw_features &= ~NETIF_F_GSO_SOFTWARE;
+			peer->max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM -
+				peer->hard_header_len -
+				SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+			if (peer->mtu > peer->max_mtu)
+				dev_set_mtu(peer, peer->max_mtu);
 		}
 	}
 
@@ -582,10 +608,20 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
 
 	if (old_prog) {
 		bpf_prog_put(old_prog);
-		if (!prog && dev->flags & IFF_UP)
-			veth_disable_xdp(dev);
+		if (!prog) {
+			if (dev->flags & IFF_UP)
+				veth_disable_xdp(dev);
+
+			if (peer) {
+				peer->hw_features |= NETIF_F_GSO_SOFTWARE;
+				peer->max_mtu = ETH_MAX_MTU;
+			}
+		}
 	}
 
+	if ((!!old_prog ^ !!prog) && peer)
+		netdev_update_features(peer);
+
 	return 0;
 }
 
@@ -627,6 +663,7 @@ static const struct net_device_ops veth_netdev_ops = {
 	.ndo_poll_controller	= veth_poll_controller,
 #endif
 	.ndo_get_iflink		= veth_get_iflink,
+	.ndo_fix_features	= veth_fix_features,
 	.ndo_features_check	= passthru_features_check,
 	.ndo_set_rx_headroom	= veth_set_rx_headroom,
 	.ndo_bpf		= veth_xdp,
-- 
2.14.3

^ permalink raw reply related

* [PATCH RFC v2 2/9] veth: Add driver XDP
From: Toshiaki Makita @ 2018-06-10 16:02 UTC (permalink / raw)
  To: netdev
  Cc: Toshiaki Makita, Jesper Dangaard Brouer, Alexei Starovoitov,
	Daniel Borkmann
In-Reply-To: <20180610160217.3146-1-toshiaki.makita1@gmail.com>

From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>

This is a basic implementation of veth driver XDP.

Incoming packets are sent from the peer veth device in the form of skb,
so this is generally doing the same thing as generic XDP.

This itself is not so useful, but a starting point to implement other
useful veth XDP features like TX and REDIRECT.

This introduces NAPI when XDP is enabled, because XDP now heavily
relies on NAPI context. Use ptr_ring to emulate NIC ring. Tx function
enqueues packets to the ring and peer NAPI handler drains the ring.

Currently only one ring is allocated for each veth device, so it does
not scale on multiqueue env. This can be resolved by allocating rings
on the per-queue basis later.

Note that NAPI is not used but netif_rx is used when XDP is not loaded,
so this does not change the default behaviour.

v2:
- Squashed with the patch adding NAPI.
- Implement adjust_tail.
- Don't acquire consumer lock because it is guarded by NAPI.
- Make poll_controller noop since it is unnecessary.
- Register rxq_info on enabling XDP rather than on opening the device.

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
---
 drivers/net/veth.c | 357 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 350 insertions(+), 7 deletions(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index a69ad39ee57e..317ec92cf816 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -19,10 +19,18 @@
 #include <net/xfrm.h>
 #include <linux/veth.h>
 #include <linux/module.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/ptr_ring.h>
+#include <linux/skb_array.h>
+#include <linux/bpf_trace.h>
 
 #define DRV_NAME	"veth"
 #define DRV_VERSION	"1.0"
 
+#define VETH_RING_SIZE		256
+#define VETH_XDP_HEADROOM	(XDP_PACKET_HEADROOM + NET_IP_ALIGN)
+
 struct pcpu_vstats {
 	u64			packets;
 	u64			bytes;
@@ -30,9 +38,15 @@ struct pcpu_vstats {
 };
 
 struct veth_priv {
+	struct napi_struct	xdp_napi;
+	struct net_device	*dev;
+	struct bpf_prog __rcu	*xdp_prog;
 	struct net_device __rcu	*peer;
 	atomic64_t		dropped;
 	unsigned		requested_headroom;
+	bool			rx_notify_masked;
+	struct ptr_ring		xdp_ring;
+	struct xdp_rxq_info	xdp_rxq;
 };
 
 /*
@@ -98,11 +112,43 @@ static const struct ethtool_ops veth_ethtool_ops = {
 	.get_link_ksettings	= veth_get_link_ksettings,
 };
 
-static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
+/* general routines */
+
+static void __veth_xdp_flush(struct veth_priv *priv)
+{
+	/* Write ptr_ring before reading rx_notify_masked */
+	smp_mb();
+	if (!priv->rx_notify_masked) {
+		priv->rx_notify_masked = true;
+		napi_schedule(&priv->xdp_napi);
+	}
+}
+
+static int veth_xdp_rx(struct veth_priv *priv, struct sk_buff *skb)
+{
+	if (unlikely(ptr_ring_produce(&priv->xdp_ring, skb))) {
+		dev_kfree_skb_any(skb);
+		return NET_RX_DROP;
+	}
+
+	return NET_RX_SUCCESS;
+}
+
+static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, bool xdp)
 {
 	struct veth_priv *priv = netdev_priv(dev);
+
+	return __dev_forward_skb(dev, skb) ?: xdp ?
+		veth_xdp_rx(priv, skb) :
+		netif_rx(skb);
+}
+
+static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
 	struct net_device *rcv;
 	int length = skb->len;
+	bool rcv_xdp = false;
 
 	rcu_read_lock();
 	rcv = rcu_dereference(priv->peer);
@@ -111,7 +157,10 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 		goto drop;
 	}
 
-	if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) {
+	rcv_priv = netdev_priv(rcv);
+	rcv_xdp = rcu_access_pointer(rcv_priv->xdp_prog);
+
+	if (likely(veth_forward_skb(rcv, skb, rcv_xdp) == NET_RX_SUCCESS)) {
 		struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
 
 		u64_stats_update_begin(&stats->syncp);
@@ -122,14 +171,15 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 drop:
 		atomic64_inc(&priv->dropped);
 	}
+
+	if (rcv_xdp)
+		__veth_xdp_flush(rcv_priv);
+
 	rcu_read_unlock();
+
 	return NETDEV_TX_OK;
 }
 
-/*
- * general routines
- */
-
 static u64 veth_stats_one(struct pcpu_vstats *result, struct net_device *dev)
 {
 	struct veth_priv *priv = netdev_priv(dev);
@@ -179,18 +229,245 @@ static void veth_set_multicast_list(struct net_device *dev)
 {
 }
 
+static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
+				      int buflen)
+{
+	struct sk_buff *skb;
+
+	if (!buflen) {
+		buflen = SKB_DATA_ALIGN(headroom + len) +
+			 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	}
+	skb = build_skb(head, buflen);
+	if (!skb)
+		return NULL;
+
+	skb_reserve(skb, headroom);
+	skb_put(skb, len);
+
+	return skb;
+}
+
+static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv,
+					struct sk_buff *skb)
+{
+	u32 pktlen, headroom, act, metalen;
+	void *orig_data, *orig_data_end;
+	int size, mac_len, delta, off;
+	struct bpf_prog *xdp_prog;
+	struct xdp_buff xdp;
+
+	rcu_read_lock();
+	xdp_prog = rcu_dereference(priv->xdp_prog);
+	if (!xdp_prog) {
+		rcu_read_unlock();
+		goto out;
+	}
+
+	mac_len = skb->data - skb_mac_header(skb);
+	pktlen = skb->len + mac_len;
+	size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) +
+	       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	if (size > PAGE_SIZE)
+		goto drop;
+
+	headroom = skb_headroom(skb) - mac_len;
+	if (skb_shared(skb) || skb_head_is_locked(skb) ||
+	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
+		struct sk_buff *nskb;
+		void *head, *start;
+		struct page *page;
+		int head_off;
+
+		page = alloc_page(GFP_ATOMIC);
+		if (!page)
+			goto drop;
+
+		head = page_address(page);
+		start = head + VETH_XDP_HEADROOM;
+		if (skb_copy_bits(skb, -mac_len, start, pktlen)) {
+			page_frag_free(head);
+			goto drop;
+		}
+
+		nskb = veth_build_skb(head,
+				      VETH_XDP_HEADROOM + mac_len, skb->len,
+				      PAGE_SIZE);
+		if (!nskb) {
+			page_frag_free(head);
+			goto drop;
+		}
+
+		skb_copy_header(nskb, skb);
+		head_off = skb_headroom(nskb) - skb_headroom(skb);
+		skb_headers_offset_update(nskb, head_off);
+		dev_consume_skb_any(skb);
+		skb = nskb;
+	}
+
+	xdp.data_hard_start = skb->head;
+	xdp.data = skb_mac_header(skb);
+	xdp.data_end = xdp.data + pktlen;
+	xdp.data_meta = xdp.data;
+	xdp.rxq = &priv->xdp_rxq;
+	orig_data = xdp.data;
+	orig_data_end = xdp.data_end;
+
+	act = bpf_prog_run_xdp(xdp_prog, &xdp);
+
+	switch (act) {
+	case XDP_PASS:
+		break;
+	default:
+		bpf_warn_invalid_xdp_action(act);
+	case XDP_ABORTED:
+		trace_xdp_exception(priv->dev, xdp_prog, act);
+	case XDP_DROP:
+		goto drop;
+	}
+	rcu_read_unlock();
+
+	delta = orig_data - xdp.data;
+	off = mac_len + delta;
+	if (off > 0)
+		__skb_push(skb, off);
+	else if (off < 0)
+		__skb_pull(skb, -off);
+	skb->mac_header -= delta;
+	off = xdp.data_end - orig_data_end;
+	if (off != 0)
+		__skb_put(skb, off);
+	skb->protocol = eth_type_trans(skb, priv->dev);
+
+	metalen = xdp.data - xdp.data_meta;
+	if (metalen)
+		skb_metadata_set(skb, metalen);
+out:
+	return skb;
+drop:
+	rcu_read_unlock();
+	dev_kfree_skb_any(skb);
+	return NULL;
+}
+
+static int veth_xdp_rcv(struct veth_priv *priv, int budget)
+{
+	int i, done = 0;
+
+	for (i = 0; i < budget; i++) {
+		struct sk_buff *skb = __ptr_ring_consume(&priv->xdp_ring);
+
+		if (!skb)
+			break;
+
+		skb = veth_xdp_rcv_skb(priv, skb);
+
+		if (skb)
+			napi_gro_receive(&priv->xdp_napi, skb);
+
+		done++;
+	}
+
+	return done;
+}
+
+static int veth_poll(struct napi_struct *napi, int budget)
+{
+	struct veth_priv *priv =
+		container_of(napi, struct veth_priv, xdp_napi);
+	int done;
+
+	done = veth_xdp_rcv(priv, budget);
+
+	if (done < budget && napi_complete_done(napi, done)) {
+		/* Write rx_notify_masked before reading ptr_ring */
+		smp_store_mb(priv->rx_notify_masked, false);
+		if (unlikely(!__ptr_ring_empty(&priv->xdp_ring))) {
+			priv->rx_notify_masked = true;
+			napi_schedule(&priv->xdp_napi);
+		}
+	}
+
+	return done;
+}
+
+static int veth_napi_add(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	int err;
+
+	err = ptr_ring_init(&priv->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
+	if (err)
+		return err;
+
+	netif_napi_add(dev, &priv->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
+	napi_enable(&priv->xdp_napi);
+
+	return 0;
+}
+
+static void veth_napi_del(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+
+	napi_disable(&priv->xdp_napi);
+	netif_napi_del(&priv->xdp_napi);
+	ptr_ring_cleanup(&priv->xdp_ring, __skb_array_destroy_skb);
+}
+
+static int veth_enable_xdp(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	int err;
+
+	err = xdp_rxq_info_reg(&priv->xdp_rxq, dev, 0);
+	if (err < 0)
+		return err;
+
+	err = xdp_rxq_info_reg_mem_model(&priv->xdp_rxq,
+					 MEM_TYPE_PAGE_SHARED, NULL);
+	if (err < 0)
+		goto err;
+
+	err = veth_napi_add(dev);
+	if (err)
+		goto err;
+
+	return 0;
+err:
+	xdp_rxq_info_unreg(&priv->xdp_rxq);
+
+	return err;
+}
+
+static void veth_disable_xdp(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+
+	veth_napi_del(dev);
+	xdp_rxq_info_unreg(&priv->xdp_rxq);
+}
+
 static int veth_open(struct net_device *dev)
 {
 	struct veth_priv *priv = netdev_priv(dev);
 	struct net_device *peer = rtnl_dereference(priv->peer);
+	int err;
 
 	if (!peer)
 		return -ENOTCONN;
 
+	if (rtnl_dereference(priv->xdp_prog)) {
+		err = veth_enable_xdp(dev);
+		if (err)
+			return err;
+	}
+
 	if (peer->flags & IFF_UP) {
 		netif_carrier_on(dev);
 		netif_carrier_on(peer);
 	}
+
 	return 0;
 }
 
@@ -203,6 +480,9 @@ static int veth_close(struct net_device *dev)
 	if (peer)
 		netif_carrier_off(peer);
 
+	if (rtnl_dereference(priv->xdp_prog))
+		veth_disable_xdp(dev);
+
 	return 0;
 }
 
@@ -228,7 +508,7 @@ static void veth_dev_free(struct net_device *dev)
 static void veth_poll_controller(struct net_device *dev)
 {
 	/* veth only receives frames when its peer sends one
-	 * Since it's a synchronous operation, we are guaranteed
+	 * Since it has nothing to do with disabling irqs, we are guaranteed
 	 * never to have pending data when we poll for it so
 	 * there is nothing to do here.
 	 *
@@ -276,6 +556,65 @@ static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
 	rcu_read_unlock();
 }
 
+static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
+			struct netlink_ext_ack *extack)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	struct bpf_prog *old_prog;
+	struct net_device *peer;
+	int err;
+
+	old_prog = rtnl_dereference(priv->xdp_prog);
+	peer = rtnl_dereference(priv->peer);
+
+	if (prog) {
+		if (!peer)
+			return -ENOTCONN;
+
+		if (!old_prog && dev->flags & IFF_UP) {
+			err = veth_enable_xdp(dev);
+			if (err)
+				return err;
+		}
+	}
+
+	rcu_assign_pointer(priv->xdp_prog, prog);
+
+	if (old_prog) {
+		bpf_prog_put(old_prog);
+		if (!prog && dev->flags & IFF_UP)
+			veth_disable_xdp(dev);
+	}
+
+	return 0;
+}
+
+static u32 veth_xdp_query(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	const struct bpf_prog *xdp_prog;
+
+	xdp_prog = rtnl_dereference(priv->xdp_prog);
+	if (xdp_prog)
+		return xdp_prog->aux->id;
+
+	return 0;
+}
+
+static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
+{
+	switch (xdp->command) {
+	case XDP_SETUP_PROG:
+		return veth_xdp_set(dev, xdp->prog, xdp->extack);
+	case XDP_QUERY_PROG:
+		xdp->prog_id = veth_xdp_query(dev);
+		xdp->prog_attached = !!xdp->prog_id;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
 static const struct net_device_ops veth_netdev_ops = {
 	.ndo_init            = veth_dev_init,
 	.ndo_open            = veth_open,
@@ -290,6 +629,7 @@ static const struct net_device_ops veth_netdev_ops = {
 	.ndo_get_iflink		= veth_get_iflink,
 	.ndo_features_check	= passthru_features_check,
 	.ndo_set_rx_headroom	= veth_set_rx_headroom,
+	.ndo_bpf		= veth_xdp,
 };
 
 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
@@ -451,10 +791,13 @@ static int veth_newlink(struct net *src_net, struct net_device *dev,
 	 */
 
 	priv = netdev_priv(dev);
+	priv->dev = dev;
 	rcu_assign_pointer(priv->peer, peer);
 
 	priv = netdev_priv(peer);
+	priv->dev = peer;
 	rcu_assign_pointer(priv->peer, dev);
+
 	return 0;
 
 err_register_dev:
-- 
2.14.3

^ permalink raw reply related

* [PATCH RFC v2 1/9] net: Export skb_headers_offset_update
From: Toshiaki Makita @ 2018-06-10 16:02 UTC (permalink / raw)
  To: netdev
  Cc: Toshiaki Makita, Jesper Dangaard Brouer, Alexei Starovoitov,
	Daniel Borkmann
In-Reply-To: <20180610160217.3146-1-toshiaki.makita1@gmail.com>

From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>

v2:
- Drop skb_copy_header part because it has already been exported now.

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
---
 include/linux/skbuff.h | 1 +
 net/core/skbuff.c      | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 164cdedf6012..2bdba543fda7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1030,6 +1030,7 @@ static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
 }
 
 struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src);
+void skb_headers_offset_update(struct sk_buff *skb, int off);
 int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask);
 struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority);
 void skb_copy_header(struct sk_buff *new, const struct sk_buff *old);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c642304f178c..180ab7d7f84f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1290,7 +1290,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(skb_clone);
 
-static void skb_headers_offset_update(struct sk_buff *skb, int off)
+void skb_headers_offset_update(struct sk_buff *skb, int off)
 {
 	/* Only adjust this if it actually is csum_start rather than csum */
 	if (skb->ip_summed == CHECKSUM_PARTIAL)
@@ -1304,6 +1304,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off)
 	skb->inner_network_header += off;
 	skb->inner_mac_header += off;
 }
+EXPORT_SYMBOL(skb_headers_offset_update);
 
 void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
 {
-- 
2.14.3

^ permalink raw reply related

* [PATCH RFC v2 0/9] veth: Driver XDP
From: Toshiaki Makita @ 2018-06-10 16:02 UTC (permalink / raw)
  To: netdev
  Cc: Toshiaki Makita, Jesper Dangaard Brouer, Alexei Starovoitov,
	Daniel Borkmann

From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>

This patch set introduces driver XDP for veth.
Basically this is used in conjunction with redirect action of another XDP
program.

  NIC -----------> veth===veth
 (XDP) (redirect)        (XDP)

In this case xdp_frame can be forwarded to the peer veth without
modification, so we can expect far better performance than generic XDP.

The envisioned use cases are:

* Container managed XDP program
Container host redirects frames to containers by XDP redirect action, and
privileged containers can deploy their own XDP programs.

* XDP program cascading
Two or more XDP programs can be called for each packet by redirecting
xdp frames to veth.

* Internal interface for an XDP bridge
When using XDP redirection to create a virtual bridge, veth can be used
to create an internal interface for the bridge.

With single core and simple XDP programs which only redirect and drop
packets, I got 10.5 Mpps redirect/drop rate with i40e 25G NIC + veth.

XXV710 (i40e) --- (XDP redirect) --> veth===veth (XDP drop)

This changeset is making use of NAPI to implement ndo_xdp_xmit and
XDP_TX/REDIRECT. This is mainly because XDP heavily relies on NAPI
context.

This patchset is based on top of net-next commit 75d4e704fa8d
(netdev-FAQ: clarify DaveM's position for stable backports).
Any feedback is welcome. Thanks!

v2:
- Squash NAPI patch with "Add driver XDP" patch.
- Remove conversion from xdp_frame to skb when NAPI is not enabled.
- Introduce per-queue XDP ring (patch 8).
- Introduce bulk skb xmit when XDP is enabled on the peer (patch 9).

Toshiaki Makita (9):
  net: Export skb_headers_offset_update
  veth: Add driver XDP
  veth: Avoid drops by oversized packets when XDP is enabled
  veth: Add another napi ring for ndo_xdp_xmit and handle xdp_frames
  veth: Add ndo_xdp_xmit
  xdp: Add a flag for disabling napi_direct of xdp_return_frame in
    xdp_mem_info
  veth: Add XDP TX and REDIRECT
  veth: Support per queue XDP ring
  veth: Bulk skb xmit for XDP path

 drivers/net/veth.c     | 734 ++++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/filter.h |  16 ++
 include/linux/skbuff.h |   1 +
 include/net/xdp.h      |   4 +
 net/core/filter.c      |  11 +-
 net/core/skbuff.c      |   3 +-
 net/core/xdp.c         |   6 +-
 7 files changed, 753 insertions(+), 22 deletions(-)

-- 
2.14.3

^ permalink raw reply

* Re: netdevice notifier and device private data
From: Alexander Aring @ 2018-06-10 15:39 UTC (permalink / raw)
  To: Michael Richardson; +Cc: netdev, linux-wpan, linux-bluetooth
In-Reply-To: <29706.1528570878@localhost>

Hi,

On Sat, Jun 09, 2018 at 03:01:18PM -0400, Michael Richardson wrote:
> 
> Alexander Aring <aring@mojatatu.com> wrote:
>     > Futhermore user space programs e.g. radvd will do 6lowpan specific
>     > handling on 6lowpan dev->type, it will not work either on tun
>     > devices.
> 
>     > I know that wpantund from NestLabs do this switch, I am very
>     > curious about the reason but I think they do it because the name
>     > is 6LoWPAN. But wpantund is just a SLIP like protocol with
>     > additional radio/foo commands.
> 
> How do they change it then, and what does it do?

They change it with the ioctl() of tun character device, see [0].

What it does is just change the interface type to something else;
there is also no check at all that Linux has this interface type.

User space software e.g. radvd [1] will evaluate this type and do
specific handling. Obviously changing it to 6LoWPAN and using this code
will confuse everything, because the handling makes only sense for a
6LoWPAN Linux interface which actually also use the 6LoWPAN subsystem.

They are just using tun like everyone else to feed an IPv6 stack on a remote
microcontroller e.g. openthread, contiki, riot. via slip. (wpantund also
allow some radio, foo configuration).

> It totally seems like broken behaviour.  Maybe it's not even intentional.
> Maybe they are just foobar.
> 

They simply don't know what they are doing... somebody thought 6LoWPAN needs
to be 6LoWPAN, but they actually don't use the 6LoWPAN handling inside
the kernel. _Except_ if they are doing out of tree stuff, which I don't believe.

According to [0] it also works with tun default (I suppose raw IPv6),
because ifdef. And they should not change it because they don't use
in-kernel 6LoWPAN functionality.

I really think that this tun/tap feature causes a lot of trouble for some
type changes. I will probably introduce a lowpan_dev pointer to netdevice and
then check if it's really a 6LoWPAN interface; a dev->type will no longer
guarantee that you have a 6LoWPAN interface. At least in user space
it's not possible to check whether you really have a 6LoWPAN interface.

- Alex

[0] https://github.com/openthread/wpantund/blob/master/src/util/tunnel.c#L180
[1] https://github.com/reubenhwk/radvd/blob/master/device-linux.c#L75

^ permalink raw reply

* KASAN: slab-out-of-bounds Read in bpf_skb_change_proto
From: syzbot @ 2018-06-10 15:27 UTC (permalink / raw)
  To: ast, daniel, davem, linux-kernel, netdev, syzkaller-bugs

Hello,

syzbot found the following crash on:

HEAD commit:    a16afaf7928b Merge tag 'for-v4.18' of git://git.kernel.org..
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=1338f6bf800000
kernel config:  https://syzkaller.appspot.com/x/.config?x=314f2150f36c16ca
dashboard link: https://syzkaller.appspot.com/bug?extid=d2d729bdde65dee3eae6
compiler:       gcc (GCC) 8.0.1 20180413 (experimental)
syzkaller repro:https://syzkaller.appspot.com/x/repro.syz?x=1173381f800000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=171f90cf800000

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+d2d729bdde65dee3eae6@syzkaller.appspotmail.com

random: sshd: uninitialized urandom read (32 bytes read)
random: sshd: uninitialized urandom read (32 bytes read)
random: sshd: uninitialized urandom read (32 bytes read)
random: sshd: uninitialized urandom read (32 bytes read)
==================================================================
BUG: KASAN: slab-out-of-bounds in bpf_skb_proto_xlat net/core/filter.c:2637  
[inline]
BUG: KASAN: slab-out-of-bounds in ____bpf_skb_change_proto  
net/core/filter.c:2675 [inline]
BUG: KASAN: slab-out-of-bounds in bpf_skb_change_proto+0xe37/0x1300  
net/core/filter.c:2650
Read of size 2 at addr ffff8801b04646c0 by task syz-executor241/4519

CPU: 0 PID: 4519 Comm: syz-executor241 Not tainted 4.17.0+ #93
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
Call Trace:
  __dump_stack lib/dump_stack.c:77 [inline]
  dump_stack+0x1b9/0x294 lib/dump_stack.c:113
  print_address_description+0x6c/0x20b mm/kasan/report.c:256
  kasan_report_error mm/kasan/report.c:354 [inline]
  kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412
  __asan_report_load2_noabort+0x14/0x20 mm/kasan/report.c:431
  bpf_skb_proto_xlat net/core/filter.c:2637 [inline]
  ____bpf_skb_change_proto net/core/filter.c:2675 [inline]
  bpf_skb_change_proto+0xe37/0x1300 net/core/filter.c:2650

Allocated by task 0:
(stack is not available)

Freed by task 0:
(stack is not available)

The buggy address belongs to the object at ffff8801b04646c0
  which belongs to the cache skbuff_head_cache of size 232
The buggy address is located 0 bytes inside of
  232-byte region [ffff8801b04646c0, ffff8801b04647a8)
The buggy address belongs to the page:
page:ffffea0006c11900 count:1 mapcount:0 mapping:ffff8801d9a0d080 index:0x0
flags: 0x2fffc0000000100(slab)
raw: 02fffc0000000100 ffffea0006c49388 ffffea0006ae5c48 ffff8801d9a0d080
raw: 0000000000000000 ffff8801b0464080 000000010000000c 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
  ffff8801b0464580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
  ffff8801b0464600: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
> ffff8801b0464680: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
                                            ^
  ffff8801b0464700: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
  ffff8801b0464780: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
==================================================================


---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#bug-status-tracking for how to communicate with  
syzbot.
syzbot can test patches for this bug, for details see:
https://goo.gl/tpsmEJ#testing-patches

^ permalink raw reply

* [BUG] make htmldocs failed with error after add converted RST file.
From: Masanari Iida @ 2018-06-10 14:37 UTC (permalink / raw)
  To: linux-kernel, jeffrey.t.kirsher, netdev, Jonathan Corbet,
	linux-doc, intel-wired-lan

After merging a patch, make htmldocs and make xmldocs
failed with an error.

reST markup error:
/home/iida/Repo/linux-2.6/Documentation/networking/e100.rst:90:
(SEVERE/4) Unexpected section title.

Configuring the Driver on Different Distributions
-------------------------------------------------
Documentation/Makefile:68: recipe for target 'htmldocs' failed
make[1]: *** [htmldocs] Error 1
Makefile:1542: recipe for target 'htmldocs' failed
make: *** [htmldocs] Error 2


85d63445f41125dafeddda74e5b13b7eefac9407 is the first bad commit
commit 85d63445f41125dafeddda74e5b13b7eefac9407
Author: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date:   Thu May 10 12:20:13 2018 -0700

    Documentation: e100: Update the Intel 10/100 driver doc


Reported-by: Masanari Iida <standby24x7@gmail.com>

Masanari Iida

^ permalink raw reply

* Re: UML build broken on master
From: Geert Uytterhoeven @ 2018-06-10 14:04 UTC (permalink / raw)
  To: Thomas Meyer
  Cc: alexei.starovoitov, netdev, Alexei Starovoitov, David S. Miller,
	Arnd Bergmann, Linux Kernel Mailing List, yuehaibing,
	Daniel Borkmann
In-Reply-To: <ae09ac81-c2a5-4a18-bb7f-61219b0c0ac4@email.android.com>

On Sun, Jun 10, 2018 at 3:49 PM Thomas Meyer <thomas@m3y3r.de> wrote:
>The umh stuff seems to have broken this build config:
>
> http://kisskb.ellerman.id.au/kisskb/buildresult/13394304/
>
> Bug or feature?

Bug. It's broken everywhere except for native X86.
Multiple patches to fix this are floating around.

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: WARNING: kmalloc bug in xdp_umem_create
From: Björn Töpel @ 2018-06-10 13:03 UTC (permalink / raw)
  To: penguin-kernel
  Cc: dvyukov, syzbot+4abadc5d69117b346506, Björn Töpel,
	Karlsson, Magnus, David Miller, LKML, Netdev, syzkaller-bugs
In-Reply-To: <13f6777a-2170-d0cc-1066-1b48a27ec981@i-love.sakura.ne.jp>

Den sön 10 juni 2018 kl 14:53 skrev Tetsuo Handa
<penguin-kernel@i-love.sakura.ne.jp>:
>
> On 2018/06/10 20:52, Dmitry Vyukov wrote:
> > On Sun, Jun 10, 2018 at 11:31 AM, Björn Töpel <bjorn.topel@gmail.com> wrote:
> >> Den sön 10 juni 2018 kl 04:53 skrev Tetsuo Handa
> >> <penguin-kernel@i-love.sakura.ne.jp>:
> >>>
> >>> On 2018/06/10 7:47, syzbot wrote:
> >>>> Hello,
> >>>>
> >>>> syzbot found the following crash on:
> >>>>
> >>>> HEAD commit:    7d3bf613e99a Merge tag 'libnvdimm-for-4.18' of git://git.k..
> >>>> git tree:       upstream
> >>>> console output: https://syzkaller.appspot.com/x/log.txt?x=1073f68f800000
> >>>> kernel config:  https://syzkaller.appspot.com/x/.config?x=f04d8d0a2afb789a
> >>>> dashboard link: https://syzkaller.appspot.com/bug?extid=4abadc5d69117b346506
> >>>> compiler:       gcc (GCC) 8.0.1 20180413 (experimental)
> >>>> syzkaller repro:https://syzkaller.appspot.com/x/repro.syz?x=13c9756f800000
> >>>> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=16366f9f800000
> >>>>
> >>>> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> >>>> Reported-by: syzbot+4abadc5d69117b346506@syzkaller.appspotmail.com
> >>>>
> >>>> random: sshd: uninitialized urandom read (32 bytes read)
> >>>> random: sshd: uninitialized urandom read (32 bytes read)
> >>>> random: sshd: uninitialized urandom read (32 bytes read)
> >>>> random: sshd: uninitialized urandom read (32 bytes read)
> >>>> random: sshd: uninitialized urandom read (32 bytes read)
> >>>> WARNING: CPU: 1 PID: 4537 at mm/slab_common.c:996 kmalloc_slab+0x56/0x70 mm/slab_common.c:996
> >>>> Kernel panic - not syncing: panic_on_warn set ...
> >>>
> >>> syzbot gave up upon kmalloc(), but actually error handling path has
> >>> NULL pointer dereference bug.
> >>>
> >>
> >> Thanks Tetsuo! This crash has been fixed by Daniel Borkmann in commit
> >> c09290c56376 ("bpf, xdp: fix crash in xdp_umem_unaccount_pages").
> >
> > Let's tell syzbot about this:
> >
> > #syz fix: bpf, xdp: fix crash in xdp_umem_unaccount_pages
> >
> >
> Excuse me, but that patch fixes NULL pointer dereference which occurs after kmalloc()'s
> "WARNING: CPU: 1 PID: 4537 at mm/slab_common.c:996 kmalloc_slab+0x56/0x70 mm/slab_common.c:996"
> message. That is, "Too large memory allocation" itself is not yet fixed.

The code relies on the sl{u,a,o}b layer saying no, so that the
setsockopt bails out. The warning could be suppressed using
__GFP_NOWARN. Is there another preferred way? Two get_user_pages
calls, where the first call would set pages to NULL just to fault the
region? Walk the process' VMAs? Something else?


Björn

^ permalink raw reply

* Re: WARNING: kmalloc bug in xdp_umem_create
From: Dmitry Vyukov @ 2018-06-10 12:58 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Björn Töpel, syzbot+4abadc5d69117b346506,
	Björn Töpel, Karlsson, Magnus, David Miller, LKML,
	Netdev, syzkaller-bugs
In-Reply-To: <13f6777a-2170-d0cc-1066-1b48a27ec981@i-love.sakura.ne.jp>

On Sun, Jun 10, 2018 at 2:53 PM, Tetsuo Handa
<penguin-kernel@i-love.sakura.ne.jp> wrote:
> On 2018/06/10 20:52, Dmitry Vyukov wrote:
>> On Sun, Jun 10, 2018 at 11:31 AM, Björn Töpel <bjorn.topel@gmail.com> wrote:
>>> Den sön 10 juni 2018 kl 04:53 skrev Tetsuo Handa
>>> <penguin-kernel@i-love.sakura.ne.jp>:
>>>>
>>>> On 2018/06/10 7:47, syzbot wrote:
>>>>> Hello,
>>>>>
>>>>> syzbot found the following crash on:
>>>>>
>>>>> HEAD commit:    7d3bf613e99a Merge tag 'libnvdimm-for-4.18' of git://git.k..
>>>>> git tree:       upstream
>>>>> console output: https://syzkaller.appspot.com/x/log.txt?x=1073f68f800000
>>>>> kernel config:  https://syzkaller.appspot.com/x/.config?x=f04d8d0a2afb789a
>>>>> dashboard link: https://syzkaller.appspot.com/bug?extid=4abadc5d69117b346506
>>>>> compiler:       gcc (GCC) 8.0.1 20180413 (experimental)
>>>>> syzkaller repro:https://syzkaller.appspot.com/x/repro.syz?x=13c9756f800000
>>>>> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=16366f9f800000
>>>>>
>>>>> IMPORTANT: if you fix the bug, please add the following tag to the commit:
>>>>> Reported-by: syzbot+4abadc5d69117b346506@syzkaller.appspotmail.com
>>>>>
>>>>> random: sshd: uninitialized urandom read (32 bytes read)
>>>>> random: sshd: uninitialized urandom read (32 bytes read)
>>>>> random: sshd: uninitialized urandom read (32 bytes read)
>>>>> random: sshd: uninitialized urandom read (32 bytes read)
>>>>> random: sshd: uninitialized urandom read (32 bytes read)
>>>>> WARNING: CPU: 1 PID: 4537 at mm/slab_common.c:996 kmalloc_slab+0x56/0x70 mm/slab_common.c:996
>>>>> Kernel panic - not syncing: panic_on_warn set ...
>>>>
>>>> syzbot gave up upon kmalloc(), but actually error handling path has
>>>> NULL pointer dereference bug.
>>>>
>>>
>>> Thanks Tetsuo! This crash has been fixed by Daniel Borkmann in commit
>>> c09290c56376 ("bpf, xdp: fix crash in xdp_umem_unaccount_pages").
>>
>> Let's tell syzbot about this:
>>
>> #syz fix: bpf, xdp: fix crash in xdp_umem_unaccount_pages
>>
>>
> Excuse me, but that patch fixes NULL pointer dereference which occurs after kmalloc()'s
> "WARNING: CPU: 1 PID: 4537 at mm/slab_common.c:996 kmalloc_slab+0x56/0x70 mm/slab_common.c:996"
> message. That is, "Too large memory allocation" itself is not yet fixed.

You are right! I fixed it up. Thanks

^ permalink raw reply

* Re: WARNING: kmalloc bug in xdp_umem_create
From: Tetsuo Handa @ 2018-06-10 12:53 UTC (permalink / raw)
  To: Dmitry Vyukov
  Cc: Björn Töpel, syzbot+4abadc5d69117b346506,
	Björn Töpel, Karlsson, Magnus, David Miller, LKML,
	Netdev, syzkaller-bugs
In-Reply-To: <CACT4Y+bfZK6yucoDrXPPW3Tc40RYq61u0Zcb=CKtdao8E+v1cQ@mail.gmail.com>

On 2018/06/10 20:52, Dmitry Vyukov wrote:
> On Sun, Jun 10, 2018 at 11:31 AM, Björn Töpel <bjorn.topel@gmail.com> wrote:
>> Den sön 10 juni 2018 kl 04:53 skrev Tetsuo Handa
>> <penguin-kernel@i-love.sakura.ne.jp>:
>>>
>>> On 2018/06/10 7:47, syzbot wrote:
>>>> Hello,
>>>>
>>>> syzbot found the following crash on:
>>>>
>>>> HEAD commit:    7d3bf613e99a Merge tag 'libnvdimm-for-4.18' of git://git.k..
>>>> git tree:       upstream
>>>> console output: https://syzkaller.appspot.com/x/log.txt?x=1073f68f800000
>>>> kernel config:  https://syzkaller.appspot.com/x/.config?x=f04d8d0a2afb789a
>>>> dashboard link: https://syzkaller.appspot.com/bug?extid=4abadc5d69117b346506
>>>> compiler:       gcc (GCC) 8.0.1 20180413 (experimental)
>>>> syzkaller repro:https://syzkaller.appspot.com/x/repro.syz?x=13c9756f800000
>>>> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=16366f9f800000
>>>>
>>>> IMPORTANT: if you fix the bug, please add the following tag to the commit:
>>>> Reported-by: syzbot+4abadc5d69117b346506@syzkaller.appspotmail.com
>>>>
>>>> random: sshd: uninitialized urandom read (32 bytes read)
>>>> random: sshd: uninitialized urandom read (32 bytes read)
>>>> random: sshd: uninitialized urandom read (32 bytes read)
>>>> random: sshd: uninitialized urandom read (32 bytes read)
>>>> random: sshd: uninitialized urandom read (32 bytes read)
>>>> WARNING: CPU: 1 PID: 4537 at mm/slab_common.c:996 kmalloc_slab+0x56/0x70 mm/slab_common.c:996
>>>> Kernel panic - not syncing: panic_on_warn set ...
>>>
>>> syzbot gave up upon kmalloc(), but actually error handling path has
>>> NULL pointer dereference bug.
>>>
>>
>> Thanks Tetsuo! This crash has been fixed by Daniel Borkmann in commit
>> c09290c56376 ("bpf, xdp: fix crash in xdp_umem_unaccount_pages").
> 
> Let's tell syzbot about this:
> 
> #syz fix: bpf, xdp: fix crash in xdp_umem_unaccount_pages
> 
> 
Excuse me, but that patch fixes NULL pointer dereference which occurs after kmalloc()'s
"WARNING: CPU: 1 PID: 4537 at mm/slab_common.c:996 kmalloc_slab+0x56/0x70 mm/slab_common.c:996"
message. That is, "Too large memory allocation" itself is not yet fixed.

^ permalink raw reply

* Re: WARNING: kmalloc bug in xdp_umem_create
From: Dmitry Vyukov @ 2018-06-10 11:52 UTC (permalink / raw)
  To: Björn Töpel
  Cc: Tetsuo Handa, syzbot+4abadc5d69117b346506, Björn Töpel,
	Karlsson, Magnus, David Miller, LKML, Netdev, syzkaller-bugs
In-Reply-To: <CAJ+HfNhnmvTHsE7cgi8pROjprsYvCZuzZDmYmpiUx4jqtHboUw@mail.gmail.com>

On Sun, Jun 10, 2018 at 11:31 AM, Björn Töpel <bjorn.topel@gmail.com> wrote:
> Den sön 10 juni 2018 kl 04:53 skrev Tetsuo Handa
> <penguin-kernel@i-love.sakura.ne.jp>:
>>
>> On 2018/06/10 7:47, syzbot wrote:
>> > Hello,
>> >
>> > syzbot found the following crash on:
>> >
>> > HEAD commit:    7d3bf613e99a Merge tag 'libnvdimm-for-4.18' of git://git.k..
>> > git tree:       upstream
>> > console output: https://syzkaller.appspot.com/x/log.txt?x=1073f68f800000
>> > kernel config:  https://syzkaller.appspot.com/x/.config?x=f04d8d0a2afb789a
>> > dashboard link: https://syzkaller.appspot.com/bug?extid=4abadc5d69117b346506
>> > compiler:       gcc (GCC) 8.0.1 20180413 (experimental)
>> > syzkaller repro:https://syzkaller.appspot.com/x/repro.syz?x=13c9756f800000
>> > C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=16366f9f800000
>> >
>> > IMPORTANT: if you fix the bug, please add the following tag to the commit:
>> > Reported-by: syzbot+4abadc5d69117b346506@syzkaller.appspotmail.com
>> >
>> > random: sshd: uninitialized urandom read (32 bytes read)
>> > random: sshd: uninitialized urandom read (32 bytes read)
>> > random: sshd: uninitialized urandom read (32 bytes read)
>> > random: sshd: uninitialized urandom read (32 bytes read)
>> > random: sshd: uninitialized urandom read (32 bytes read)
>> > WARNING: CPU: 1 PID: 4537 at mm/slab_common.c:996 kmalloc_slab+0x56/0x70 mm/slab_common.c:996
>> > Kernel panic - not syncing: panic_on_warn set ...
>>
>> syzbot gave up upon kmalloc(), but actually error handling path has
>> NULL pointer dereference bug.
>>
>
> Thanks Tetsuo! This crash has been fixed by Daniel Borkmann in commit
> c09290c56376 ("bpf, xdp: fix crash in xdp_umem_unaccount_pages").

Let's tell syzbot about this:

#syz fix: bpf, xdp: fix crash in xdp_umem_unaccount_pages

^ permalink raw reply

* Re: [PATCH bpf-next v2 3/3] bpf: add ability to configure BPF JIT kallsyms export at the boot time
From: kbuild test robot @ 2018-06-10 11:42 UTC (permalink / raw)
  To: Eugene Syromiatnikov
  Cc: kbuild-all, netdev, linux-kernel, linux-doc, Kees Cook,
	Kai-Heng Feng, Daniel Borkmann, Alexei Starovoitov,
	Jonathan Corbet, Jiri Olsa, Jesper Dangaard Brouer
In-Reply-To: <20180523121837.GA31550@asgard.redhat.com>

[-- Attachment #1: Type: text/plain, Size: 1290 bytes --]

Hi Eugene,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on bpf-next/master]

url:    https://github.com/0day-ci/linux/commits/Eugene-Syromiatnikov/bpf-add-boot-parameters-for-sysctl-knobs/20180526-164048
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: i386-randconfig-x074-06101602 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All errors (new ones prefixed by >>):

>> kernel//bpf/core.c:325:38: error: 'CONFIG_BPF_JIT_KALLSYMS_BOOTPARAM_VALUE' undeclared here (not in a function); did you mean 'CONFIG_BPF_JIT_KALLSYMS_BOOTPARAM'?
    int bpf_jit_kallsyms __read_mostly = CONFIG_BPF_JIT_KALLSYMS_BOOTPARAM_VALUE;
                                         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                         CONFIG_BPF_JIT_KALLSYMS_BOOTPARAM

vim +325 kernel//bpf/core.c

   323	
   324	#ifdef CONFIG_BPF_JIT_KALLSYMS_BOOTPARAM
 > 325	int bpf_jit_kallsyms __read_mostly = CONFIG_BPF_JIT_KALLSYMS_BOOTPARAM_VALUE;
   326	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 30098 bytes --]

^ permalink raw reply

* Re: [PATCH] fs: 9p: Adding new return type vm_fault_t
From: Souptick Joarder @ 2018-06-10  9:56 UTC (permalink / raw)
  To: Eric Van Hensbergen, rminnich, Latchesar Ionkov
  Cc: v9fs-developer, Matthew Wilcox, netdev, linux-kernel
In-Reply-To: <20180610095640.GA4061@jordon-HP-15-Notebook-PC>

On Sun, Jun 10, 2018 at 3:26 PM, Souptick Joarder <jrdr.linux@gmail.com> wrote:
> Use new return type vm_fault_t for page_mkwrite
> handler.
>
> Signed-off-by: Souptick Joarder <jrdr.linux@gmail.com>
> Reviewed-by: Matthew Wilcox <mawilcox@microsoft.com>
> ---
>  fs/9p/vfs_file.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
> index 03c9e32..5f2e48d 100644
> --- a/fs/9p/vfs_file.c
> +++ b/fs/9p/vfs_file.c
> @@ -533,7 +533,7 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
>         return retval;
>  }
>
> -static int
> +static vm_fault_t
>  v9fs_vm_page_mkwrite(struct vm_fault *vmf)
>  {
>         struct v9fs_inode *v9inode;
> --
> 1.9.1
>

Eric, as requested, I posted it to the netdev and lkml mailing lists.

^ permalink raw reply

* [PATCH] fs: 9p: Adding new return type vm_fault_t
From: Souptick Joarder @ 2018-06-10  9:56 UTC (permalink / raw)
  To: ericvh, rminnich, lucho; +Cc: v9fs-developer, willy, netdev, linux-kernel

Use new return type vm_fault_t for page_mkwrite
handler.

Signed-off-by: Souptick Joarder <jrdr.linux@gmail.com>
Reviewed-by: Matthew Wilcox <mawilcox@microsoft.com>
---
 fs/9p/vfs_file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 03c9e32..5f2e48d 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -533,7 +533,7 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
 	return retval;
 }

-static int
+static vm_fault_t
 v9fs_vm_page_mkwrite(struct vm_fault *vmf)
 {
 	struct v9fs_inode *v9inode;

^ permalink raw reply related

* Re: [PATCH 2/2] net-next: xsurf100: drop include of lib8390.c
From: Geert Uytterhoeven @ 2018-06-10  9:34 UTC (permalink / raw)
  To: Michael Schmitz
  Cc: netdev, Linux/m68k, Andrew Lunn, Finn Thain, Michael Karcher
In-Reply-To: <1528604559-972-3-git-send-email-schmitzmic@gmail.com>

On Sun, Jun 10, 2018 at 6:22 AM Michael Schmitz <schmitzmic@gmail.com> wrote:
> Now that ax88796.c exports the ax_NS8390_reinit() symbol, we can
> include 8390.h instead of lib8390.c, avoiding duplication of that
> function and killing a few compile warnings in the bargain.
>
> Fixes: 861928f4e60e826c ("net-next: New ax88796 platform
> driver for Amiga X-Surf 100 Zorro board (m68k)")
>
> Signed-off-by: Michael Schmitz <schmitzmic@gmail.com>

Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: [PATCH 1/2] net-next: ax88796: export ax_NS8390_init() hook
From: Geert Uytterhoeven @ 2018-06-10  9:34 UTC (permalink / raw)
  To: Michael Schmitz
  Cc: netdev, Linux/m68k, Andrew Lunn, Finn Thain, Michael Karcher
In-Reply-To: <1528604559-972-2-git-send-email-schmitzmic@gmail.com>

Hi Michael,

Thanks for the update!

On Sun, Jun 10, 2018 at 6:22 AM Michael Schmitz <schmitzmic@gmail.com> wrote:
> The block I/O code for the new X-Surf 100 ax88796 driver needs
> ax_NS8390_init() for error fixup in its block_output function.
>
> Export this static function through the ax_NS8390_reinit()
> wrapper so we can lose the lib8390.c include in the X-Surf 100
> driver.
>
> Fixes: 861928f4e60e826c ("net-next: New ax88796 platform
> driver for Amiga X-Surf 100 Zorro board (m68k)")
>
> Signed-off-by: Michael Schmitz <schmitzmic@gmail.com>

Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: WARNING: kmalloc bug in xdp_umem_create
From: Björn Töpel @ 2018-06-10  9:31 UTC (permalink / raw)
  To: penguin-kernel
  Cc: syzbot+4abadc5d69117b346506, Björn Töpel,
	Karlsson, Magnus, David Miller, LKML, Netdev, syzkaller-bugs
In-Reply-To: <10d6b170-b820-3077-8737-c9d06e98d0fb@I-love.SAKURA.ne.jp>

Den sön 10 juni 2018 kl 04:53 skrev Tetsuo Handa
<penguin-kernel@i-love.sakura.ne.jp>:
>
> On 2018/06/10 7:47, syzbot wrote:
> > Hello,
> >
> > syzbot found the following crash on:
> >
> > HEAD commit:    7d3bf613e99a Merge tag 'libnvdimm-for-4.18' of git://git.k..
> > git tree:       upstream
> > console output: https://syzkaller.appspot.com/x/log.txt?x=1073f68f800000
> > kernel config:  https://syzkaller.appspot.com/x/.config?x=f04d8d0a2afb789a
> > dashboard link: https://syzkaller.appspot.com/bug?extid=4abadc5d69117b346506
> > compiler:       gcc (GCC) 8.0.1 20180413 (experimental)
> > syzkaller repro:https://syzkaller.appspot.com/x/repro.syz?x=13c9756f800000
> > C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=16366f9f800000
> >
> > IMPORTANT: if you fix the bug, please add the following tag to the commit:
> > Reported-by: syzbot+4abadc5d69117b346506@syzkaller.appspotmail.com
> >
> > random: sshd: uninitialized urandom read (32 bytes read)
> > random: sshd: uninitialized urandom read (32 bytes read)
> > random: sshd: uninitialized urandom read (32 bytes read)
> > random: sshd: uninitialized urandom read (32 bytes read)
> > random: sshd: uninitialized urandom read (32 bytes read)
> > WARNING: CPU: 1 PID: 4537 at mm/slab_common.c:996 kmalloc_slab+0x56/0x70 mm/slab_common.c:996
> > Kernel panic - not syncing: panic_on_warn set ...
>
> syzbot gave up upon kmalloc(), but actually error handling path has
> NULL pointer dereference bug.
>

Thanks Tetsuo! This crash has been fixed by Daniel Borkmann in commit
c09290c56376 ("bpf, xdp: fix crash in xdp_umem_unaccount_pages").


Björn


> ----------
> #include <sys/socket.h>
> #include <unistd.h>
> #define PF_XDP 44
> #define SOL_XDP 283
> #define XDP_UMEM_REG 4
>
> int main(int argc, char *argv[])
> {
>         int fd = socket(PF_XDP, SOCK_RAW, 0);
>         struct xdp_umem_reg {
>                 unsigned long long addr;
>                 unsigned long long len;
>                 unsigned int chunk_size;
>                 unsigned int headroom;
>         } arg = {
>                 0x20000000,
>                 0x200002000,
>                 0x800,
>                 2
>         };
>         setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &arg, sizeof(arg));
>         return 0;
> }
> ----------
>
> [   95.172962] WARNING: CPU: 3 PID: 2891 at mm/page_alloc.c:4065 __alloc_pages_nodemask+0x283/0xdf0
> [   95.175179] Modules linked in: pcspkr sg vmw_vmci i2c_piix4 sd_mod ata_generic pata_acpi ahci libahci vmwgfx drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm ata_piix mptspi scsi_transport_spi i2c_core mptscsih e1000 mptbase libata serio_raw
> [   95.180614] CPU: 3 PID: 2891 Comm: a.out Kdump: loaded Not tainted 4.17.0+ #421
> [   95.182351] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 05/19/2017
> [   95.184909] RIP: 0010:__alloc_pages_nodemask+0x283/0xdf0
> [   95.186319] Code: 00 00 04 00 41 0f 44 c6 48 3b 5c 24 78 c6 84 24 90 00 00 00 00 0f 85 50 0b 00 00 41 83 fd 0a 76 1d f6 c4 02 0f 85 3b ff ff ff <0f> 0b e9 34 ff ff ff 0f 0b 0f 1f 40 00 e9 10 fe ff ff 0f 0b 89 c2
> [   95.190997] RSP: 0018:ffffc900008efd20 EFLAGS: 00010246
> [   95.192257] RAX: 000000000060c0c0 RBX: 0000000000000000 RCX: ffff88013f7fe920
> [   95.194005] RDX: 0000000000000000 RSI: 0000000000000002 RDI: 0000000000000000
> [   95.195697] RBP: 000000000060c0c0 R08: 0000000000000001 R09: ffffffffffffef81
> [   95.197393] R10: 000000000000000d R11: 0000000000000e8c R12: 0000000000000001
> [   95.199084] R13: 000000000000000d R14: 000000000060c0c0 R15: 0000000000000000
> [   95.200735] FS:  00007f8387e61740(0000) GS:ffff88013f4c0000(0000) knlGS:0000000000000000
> [   95.203441] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   95.205726] CR2: 0000000020000040 CR3: 0000000133e2c006 CR4: 00000000001606e0
> [   95.207743] Call Trace:
> [   95.208427]  ? __lock_acquire+0x22a/0x1830
> [   95.209391]  ? kmalloc_order+0x15/0x60
> [   95.210266]  ? __kmalloc+0x20a/0x210
> [   95.211104]  ? xdp_umem_create+0x16e/0x3c0
> [   95.212095]  ? xsk_setsockopt+0x153/0x1a0
> [   95.213143]  ? __sys_setsockopt+0x67/0xb0
> [   95.214058]  ? __x64_sys_setsockopt+0x1b/0x20
> [   95.215040]  ? do_syscall_64+0x4f/0x1f0
> [   95.215890]  ? entry_SYSCALL_64_after_hwframe+0x49/0xbe
> [   95.217079] irq event stamp: 5296
> [   95.217785] hardirqs last  enabled at (5295): [<ffffffff810b2a77>] __raw_spin_lock_init+0x17/0x50
> [   95.220381] hardirqs last disabled at (5296): [<ffffffff81800f33>] error_entry+0x73/0xc0
> [   95.222447] softirqs last  enabled at (5284): [<ffffffff81a00183>] __do_softirq+0x183/0x204
> [   95.224328] softirqs last disabled at (5277): [<ffffffff81061bcd>] irq_exit+0xcd/0xf0
> [   95.226065] ---[ end trace 75b6f67917663997 ]---
> [   95.227250] BUG: unable to handle kernel NULL pointer dereference at 0000000000000060
> [   95.229101] PGD 1342eb067 P4D 1342eb067 PUD 1314a2067 PMD 0
> [   95.230398] Oops: 0002 [#1] SMP DEBUG_PAGEALLOC
> [   95.231418] CPU: 3 PID: 2891 Comm: a.out Kdump: loaded Tainted: G        W         4.17.0+ #421
> [   95.233474] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 05/19/2017
> [   95.236636] RIP: 0010:xdp_umem_create+0x228/0x3c0
> [   95.237867] Code: f4 ff ff ff e8 b9 f9 ff ff 48 8b bb 90 00 00 00 e8 3d d9 a7 ff 48 c7 83 90 00 00 00 00 00 00 00 48 8b 43 30 8b 93 98 00 00 00 <f0> 48 29 50 60 48 8b 7b 30 49 63 ec e8 57 10 92 ff 48 8b 7b 38 e8
> [   95.241945] RSP: 0018:ffffc900008efe88 EFLAGS: 00010246
> [   95.243236] RAX: 0000000000000000 RBX: ffff880133401288 RCX: 000000000060c0c0
> [   95.244789] RDX: 0000000000200002 RSI: 0000000001000010 RDI: 0000000000000000
> [   95.247382] RBP: 0000000000200002 R08: 0000000000000001 R09: ffffffffffffef81
> [   95.249735] R10: 000000000000000d R11: 0000000000000e8c R12: 00000000fffffff4
> [   95.252391] R13: 0000000000000040 R14: 0000000020000000 R15: 00000000000007c0
> [   95.255280] FS:  00007f8387e61740(0000) GS:ffff88013f4c0000(0000) knlGS:0000000000000000
> [   95.257918] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   95.260068] CR2: 0000000000000060 CR3: 0000000133e2c006 CR4: 00000000001606e0
> [   95.262535] Call Trace:
> [   95.263900]  ? xsk_setsockopt+0x153/0x1a0
> [   95.265495]  ? __sys_setsockopt+0x67/0xb0
> [   95.267108]  ? __x64_sys_setsockopt+0x1b/0x20
> [   95.269532]  ? do_syscall_64+0x4f/0x1f0
> [   95.271474]  ? entry_SYSCALL_64_after_hwframe+0x49/0xbe
> [   95.273292] Modules linked in: pcspkr sg vmw_vmci i2c_piix4 sd_mod ata_generic pata_acpi ahci libahci vmwgfx drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm ata_piix mptspi scsi_transport_spi i2c_core mptscsih e1000 mptbase libata serio_raw
> [   95.279548] CR2: 0000000000000060
> [   95.281044] ---[ end trace 75b6f67917663998 ]---
> [   95.283132] RIP: 0010:xdp_umem_create+0x228/0x3c0
> [   95.285257] Code: f4 ff ff ff e8 b9 f9 ff ff 48 8b bb 90 00 00 00 e8 3d d9 a7 ff 48 c7 83 90 00 00 00 00 00 00 00 48 8b 43 30 8b 93 98 00 00 00 <f0> 48 29 50 60 48 8b 7b 30 49 63 ec e8 57 10 92 ff 48 8b 7b 38 e8
> [   95.291487] RSP: 0018:ffffc900008efe88 EFLAGS: 00010246
> [   95.293429] RAX: 0000000000000000 RBX: ffff880133401288 RCX: 000000000060c0c0
> [   95.295761] RDX: 0000000000200002 RSI: 0000000001000010 RDI: 0000000000000000
> [   95.298072] RBP: 0000000000200002 R08: 0000000000000001 R09: ffffffffffffef81
> [   95.300403] R10: 000000000000000d R11: 0000000000000e8c R12: 00000000fffffff4
> [   95.303699] R13: 0000000000000040 R14: 0000000020000000 R15: 00000000000007c0
> [   95.306178] FS:  00007f8387e61740(0000) GS:ffff88013f4c0000(0000) knlGS:0000000000000000
> [   95.308645] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   95.310782] CR2: 0000000000000060 CR3: 0000000133e2c006 CR4: 00000000001606e0
>
> xdp_umem_create+0x228/0x3c0:
> arch_atomic64_sub at arch/x86/include/asm/atomic64_64.h:60
> (inlined by) atomic64_sub at include/asm-generic/atomic-instrumented.h:145
> (inlined by) atomic_long_sub at include/asm-generic/atomic-long.h:199
> (inlined by) xdp_umem_unaccount_pages at net/xdp/xdp_umem.c:135
> (inlined by) xdp_umem_reg at net/xdp/xdp_umem.c:334
> (inlined by) xdp_umem_create at net/xdp/xdp_umem.c:349

^ permalink raw reply

* KASAN: slab-out-of-bounds Write in tls_push_record
From: syzbot @ 2018-06-10  6:15 UTC (permalink / raw)
  To: aviadye, borisp, davejwatson, davem, linux-kernel, netdev,
	syzkaller-bugs

Hello,

syzbot found the following crash on:

HEAD commit:    410feb75de24 Merge tag 'arm64-upstream' of git://git.kerne..
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=1308a0cf800000
kernel config:  https://syzkaller.appspot.com/x/.config?x=7c0dfd1fff57e223
dashboard link: https://syzkaller.appspot.com/bug?extid=5c74af81c547738e1684
compiler:       gcc (GCC) 8.0.1 20180413 (experimental)

Unfortunately, I don't have any reproducer for this crash yet.

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+5c74af81c547738e1684@syzkaller.appspotmail.com

RDX: 00000000fffffdef RSI: 00000000200005c0 RDI: 0000000000000013
RBP: 000000000072bea0 R08: 0000000020000000 R09: 000000000000001c
R10: 0000000000000040 R11: 0000000000000246 R12: 0000000000000015
R13: 00000000004c0d3b R14: 00000000004d07a8 R15: 0000000000000002
==================================================================
BUG: KASAN: slab-out-of-bounds in tls_fill_prepend include/net/tls.h:339 [inline]
BUG: KASAN: slab-out-of-bounds in tls_push_record+0x1023/0x13e0 net/tls/tls_sw.c:240
Write of size 1 at addr ffff8801bae68000 by task syz-executor1/24187

CPU: 1 PID: 24187 Comm: syz-executor1 Not tainted 4.17.0+ #91
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
  __dump_stack lib/dump_stack.c:77 [inline]
  dump_stack+0x1b9/0x294 lib/dump_stack.c:113
  print_address_description+0x6c/0x20b mm/kasan/report.c:256
  kasan_report_error mm/kasan/report.c:354 [inline]
  kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412
  __asan_report_store1_noabort+0x17/0x20 mm/kasan/report.c:435
  tls_fill_prepend include/net/tls.h:339 [inline]
  tls_push_record+0x1023/0x13e0 net/tls/tls_sw.c:240
  tls_sw_push_pending_record+0x22/0x30 net/tls/tls_sw.c:276
  tls_push_pending_closed_record+0x10c/0x150 net/tls/tls_main.c:211
  tls_complete_pending_work include/net/tls.h:277 [inline]
  tls_sk_proto_close+0x8f2/0xad0 net/tls/tls_main.c:263
  inet_release+0x104/0x1f0 net/ipv4/af_inet.c:427
  inet6_release+0x50/0x70 net/ipv6/af_inet6.c:459
  sock_release+0x96/0x1b0 net/socket.c:598
  sock_close+0x16/0x20 net/socket.c:1174
  __fput+0x353/0x890 fs/file_table.c:209
  ____fput+0x15/0x20 fs/file_table.c:243
  task_work_run+0x1e4/0x290 kernel/task_work.c:113
  exit_task_work include/linux/task_work.h:22 [inline]
  do_exit+0x1aee/0x2730 kernel/exit.c:865
  do_group_exit+0x16f/0x430 kernel/exit.c:968
  get_signal+0x886/0x1960 kernel/signal.c:2478
  do_signal+0x98/0x2040 arch/x86/kernel/signal.c:810
  exit_to_usermode_loop+0x28a/0x310 arch/x86/entry/common.c:162
  prepare_exit_to_usermode arch/x86/entry/common.c:196 [inline]
  syscall_return_slowpath arch/x86/entry/common.c:265 [inline]
  do_syscall_64+0x6ac/0x800 arch/x86/entry/common.c:290
  entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x4559f9
Code: 1d ba fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 eb b9 fb ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007fb96babace8 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca
RAX: fffffffffffffe00 RBX: 000000000072bec8 RCX: 00000000004559f9
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000000000072bec8
RBP: 000000000072bec8 R08: 0000000000000033 R09: 000000000072bea0
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007ffd163d2bff R14: 00007fb96babb9c0 R15: 0000000000000000

Allocated by task 24192:
  save_stack+0x43/0xd0 mm/kasan/kasan.c:448
  set_track mm/kasan/kasan.c:460 [inline]
  kasan_kmalloc+0xc4/0xe0 mm/kasan/kasan.c:553
  __do_kmalloc mm/slab.c:3718 [inline]
  __kmalloc+0x14e/0x760 mm/slab.c:3727
  kmalloc include/linux/slab.h:518 [inline]
  rw_copy_check_uvector+0x31e/0x3a0 fs/read_write.c:781
  import_iovec+0xc3/0x420 lib/iov_iter.c:1459
  vfs_readv+0xe2/0x1a0 fs/read_write.c:984
  do_preadv+0x1ba/0x270 fs/read_write.c:1070
  __do_sys_preadv fs/read_write.c:1120 [inline]
  __se_sys_preadv fs/read_write.c:1115 [inline]
  __x64_sys_preadv+0x9a/0xf0 fs/read_write.c:1115
  do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287
  entry_SYSCALL_64_after_hwframe+0x49/0xbe

Freed by task 24192:
  save_stack+0x43/0xd0 mm/kasan/kasan.c:448
  set_track mm/kasan/kasan.c:460 [inline]
  __kasan_slab_free+0x11a/0x170 mm/kasan/kasan.c:521
  kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528
  __cache_free mm/slab.c:3498 [inline]
  kfree+0xd9/0x260 mm/slab.c:3813
  import_iovec+0x32e/0x420 lib/iov_iter.c:1463
  vfs_readv+0xe2/0x1a0 fs/read_write.c:984
  do_preadv+0x1ba/0x270 fs/read_write.c:1070
  __do_sys_preadv fs/read_write.c:1120 [inline]
  __se_sys_preadv fs/read_write.c:1115 [inline]
  __x64_sys_preadv+0x9a/0xf0 fs/read_write.c:1115
  do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287
  entry_SYSCALL_64_after_hwframe+0x49/0xbe

The buggy address belongs to the object at ffff8801bae69f80
  which belongs to the cache kmalloc-16384 of size 16384
The buggy address is located 8064 bytes to the left of
  16384-byte region [ffff8801bae69f80, ffff8801bae6df80)
The buggy address belongs to the page:
page:ffffea0006eb9a00 count:1 mapcount:0 mapping:ffff8801da802200 index:0x0 compound_mapcount: 0
flags: 0x2fffc0000008100(slab|head)
raw: 02fffc0000008100 ffffea0005e2a408 ffff8801da801c48 ffff8801da802200
raw: 0000000000000000 ffff8801bae69f80 0000000100000001 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
  ffff8801bae67f00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
  ffff8801bae67f80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> ffff8801bae68000: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
                    ^
  ffff8801bae68080: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
  ffff8801bae68100: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
==================================================================


---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#bug-status-tracking for how to communicate with syzbot.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox