Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v3 2/4] vhost-vsock: add pkt cancel capability
From: Peng Tao @ 2016-12-08 17:12 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: David Miller, kvm, virtualization, netdev, Jorgen Hansen
In-Reply-To: <1481217156-7160-2-git-send-email-bergwolf@gmail.com>

To allow canceling all packets of a connection.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Peng Tao <bergwolf@gmail.com>
---
 drivers/vhost/vsock.c  | 41 +++++++++++++++++++++++++++++++++++++++++
 include/net/af_vsock.h |  3 +++
 2 files changed, 44 insertions(+)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index a504e2e0..db64d51 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -218,6 +218,46 @@ vhost_transport_send_pkt(struct virtio_vsock_pkt *pkt)
 	return len;
 }
 
+static int
+vhost_transport_cancel_pkt(struct vsock_sock *vsk)
+{
+	struct vhost_vsock *vsock;
+	struct virtio_vsock_pkt *pkt, *n;
+	int cnt = 0;
+	LIST_HEAD(freeme);
+
+	/* Find the vhost_vsock according to guest context id  */
+	vsock = vhost_vsock_get(vsk->remote_addr.svm_cid);
+	if (!vsock)
+		return -ENODEV;
+
+	spin_lock_bh(&vsock->send_pkt_list_lock);
+	list_for_each_entry_safe(pkt, n, &vsock->send_pkt_list, list) {
+		if (pkt->cancel_token != (void *)vsk)
+			continue;
+		list_move(&pkt->list, &freeme);
+	}
+	spin_unlock_bh(&vsock->send_pkt_list_lock);
+
+	list_for_each_entry_safe(pkt, n, &freeme, list) {
+		if (pkt->reply)
+			cnt++;
+		list_del(&pkt->list);
+		virtio_transport_free_pkt(pkt);
+	}
+
+	if (cnt) {
+		struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX];
+		int new_cnt;
+
+		new_cnt = atomic_sub_return(cnt, &vsock->queued_replies);
+		if (new_cnt + cnt >= tx_vq->num && new_cnt < tx_vq->num)
+			vhost_poll_queue(&tx_vq->poll);
+	}
+
+	return 0;
+}
+
 static struct virtio_vsock_pkt *
 vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq,
 		      unsigned int out, unsigned int in)
@@ -664,6 +704,7 @@ static struct virtio_transport vhost_transport = {
 		.release                  = virtio_transport_release,
 		.connect                  = virtio_transport_connect,
 		.shutdown                 = virtio_transport_shutdown,
+		.cancel_pkt               = vhost_transport_cancel_pkt,
 
 		.dgram_enqueue            = virtio_transport_dgram_enqueue,
 		.dgram_dequeue            = virtio_transport_dgram_dequeue,
diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index f275896..ce5f100 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -100,6 +100,9 @@ struct vsock_transport {
 	void (*destruct)(struct vsock_sock *);
 	void (*release)(struct vsock_sock *);
 
+	/* Cancel packets belonging the same vsock */
+	int (*cancel_pkt)(struct vsock_sock *vsk);
+
 	/* Connections. */
 	int (*connect)(struct vsock_sock *);
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH v3 1/4] vsock: track pkt owner vsock
From: Peng Tao @ 2016-12-08 17:12 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: David Miller, kvm, virtualization, netdev, Jorgen Hansen
In-Reply-To: <1481217156-7160-1-git-send-email-bergwolf@gmail.com>

So that we can cancel a queued pkt later if necessary.

Signed-off-by: Peng Tao <bergwolf@gmail.com>
---
 include/linux/virtio_vsock.h            | 2 ++
 net/vmw_vsock/virtio_transport_common.c | 7 +++++++
 2 files changed, 9 insertions(+)

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index 9638bfe..193ad3a 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -48,6 +48,7 @@ struct virtio_vsock_pkt {
 	struct virtio_vsock_hdr	hdr;
 	struct work_struct work;
 	struct list_head list;
+	void *cancel_token; /* only used for cancellation */
 	void *buf;
 	u32 len;
 	u32 off;
@@ -56,6 +57,7 @@ struct virtio_vsock_pkt {
 
 struct virtio_vsock_pkt_info {
 	u32 remote_cid, remote_port;
+	struct vsock_sock *vsk;
 	struct msghdr *msg;
 	u32 pkt_len;
 	u16 type;
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index a53b3a1..ef94eb8 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -57,6 +57,7 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
 	pkt->len		= len;
 	pkt->hdr.len		= cpu_to_le32(len);
 	pkt->reply		= info->reply;
+	pkt->cancel_token	= info->vsk;
 
 	if (info->msg && len > 0) {
 		pkt->buf = kmalloc(len, GFP_KERNEL);
@@ -180,6 +181,7 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
 	struct virtio_vsock_pkt_info info = {
 		.op = VIRTIO_VSOCK_OP_CREDIT_UPDATE,
 		.type = type,
+		.vsk = vsk,
 	};
 
 	return virtio_transport_send_pkt_info(vsk, &info);
@@ -519,6 +521,7 @@ int virtio_transport_connect(struct vsock_sock *vsk)
 	struct virtio_vsock_pkt_info info = {
 		.op = VIRTIO_VSOCK_OP_REQUEST,
 		.type = VIRTIO_VSOCK_TYPE_STREAM,
+		.vsk = vsk,
 	};
 
 	return virtio_transport_send_pkt_info(vsk, &info);
@@ -534,6 +537,7 @@ int virtio_transport_shutdown(struct vsock_sock *vsk, int mode)
 			  VIRTIO_VSOCK_SHUTDOWN_RCV : 0) |
 			 (mode & SEND_SHUTDOWN ?
 			  VIRTIO_VSOCK_SHUTDOWN_SEND : 0),
+		.vsk = vsk,
 	};
 
 	return virtio_transport_send_pkt_info(vsk, &info);
@@ -560,6 +564,7 @@ virtio_transport_stream_enqueue(struct vsock_sock *vsk,
 		.type = VIRTIO_VSOCK_TYPE_STREAM,
 		.msg = msg,
 		.pkt_len = len,
+		.vsk = vsk,
 	};
 
 	return virtio_transport_send_pkt_info(vsk, &info);
@@ -581,6 +586,7 @@ static int virtio_transport_reset(struct vsock_sock *vsk,
 		.op = VIRTIO_VSOCK_OP_RST,
 		.type = VIRTIO_VSOCK_TYPE_STREAM,
 		.reply = !!pkt,
+		.vsk = vsk,
 	};
 
 	/* Send RST only if the original pkt is not a RST pkt */
@@ -826,6 +832,7 @@ virtio_transport_send_response(struct vsock_sock *vsk,
 		.remote_cid = le32_to_cpu(pkt->hdr.src_cid),
 		.remote_port = le32_to_cpu(pkt->hdr.src_port),
 		.reply = true,
+		.vsk = vsk,
 	};
 
 	return virtio_transport_send_pkt_info(vsk, &info);
-- 
2.7.4

^ permalink raw reply related

* [PATCH v3 0/4] vsock: cancel connect packets when failing to connect
From: Peng Tao @ 2016-12-08 17:12 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: David Miller, kvm, virtualization, netdev, Jorgen Hansen

Currently, if a connect call fails on a signal or timeout (e.g., guest is still
in the process of starting up), we'll just return to caller and leave the connect
packet queued and they are sent even though the connection is considered a failure,
which can confuse applications with unwanted false connect attempt.

The patchset enables vsock (both host and guest) to cancel queued packets when
a connect attempt is considered to fail.

v3 changelog:
  - define cancel_pkt callback in struct vsock_transport rather than struct virtio_transport
  - rename virtio_vsock_pkt->vsk to virtio_vsock_pkt->cancel_token
v2 changelog:
  - fix queued_replies counting and resume tx/rx when necessary


Peng Tao (4):
  vsock: track pkt owner vsock
  vhost-vsock: add pkt cancel capability
  vsock: add pkt cancel capability
  vsock: cancel packets when failing to connect

 drivers/vhost/vsock.c                   | 41 ++++++++++++++++++++++++++++++++
 include/linux/virtio_vsock.h            |  2 ++
 include/net/af_vsock.h                  |  3 +++
 net/vmw_vsock/af_vsock.c                | 14 +++++++++++
 net/vmw_vsock/virtio_transport.c        | 42 +++++++++++++++++++++++++++++++++
 net/vmw_vsock/virtio_transport_common.c |  7 ++++++
 6 files changed, 109 insertions(+)

-- 
2.7.4

^ permalink raw reply

* [PATCH-RESEND] vhost-vsock: fix orphan connection reset
From: Peng Tao @ 2016-12-08 17:10 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: David Miller, kvm, virtualization, netdev

local_addr.svm_cid is host cid. We should check guest cid instead,
which is remote_addr.svm_cid. Otherwise we end up resetting all
connections to all guests.

Cc: stable@vger.kernel.org [4.8+]
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Peng Tao <bergwolf@gmail.com>
---
resending because the last attempt looks to be dropped by vger.
 drivers/vhost/vsock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index e3b30ea..a504e2e0 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -506,7 +506,7 @@ static void vhost_vsock_reset_orphans(struct sock *sk)
 	 * executing.
 	 */
 
-	if (!vhost_vsock_get(vsk->local_addr.svm_cid)) {
+	if (!vhost_vsock_get(vsk->remote_addr.svm_cid)) {
 		sock_set_flag(sk, SOCK_DONE);
 		vsk->peer_shutdown = SHUTDOWN_MASK;
 		sk->sk_state = SS_UNCONNECTED;
-- 
2.7.4


^ permalink raw reply related

* Re: [net-next PATCH v5 4/6] virtio_net: add dedicated XDP transmit queues
From: John Fastabend @ 2016-12-08 17:10 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: daniel, shm, davem, tgraf, alexei.starovoitov, john.r.fastabend,
	netdev, brouer
In-Reply-To: <20161208075459-mutt-send-email-mst@kernel.org>

On 16-12-07 09:59 PM, Michael S. Tsirkin wrote:
> On Wed, Dec 07, 2016 at 12:12:23PM -0800, John Fastabend wrote:
>> XDP requires using isolated transmit queues to avoid interference
>> with normal networking stack (BQL, NETDEV_TX_BUSY, etc).
>> This patch
>> adds a XDP queue per cpu when a XDP program is loaded and does not
>> expose the queues to the OS via the normal API call to
>> netif_set_real_num_tx_queues(). This way the stack will never push
>> an skb to these queues.
>>
>> However virtio/vhost/qemu implementation only allows for creating
>> TX/RX queue pairs at this time so creating only TX queues was not
>> possible. And because the associated RX queues are being created I
>> went ahead and exposed these to the stack and let the backend use
>> them. This creates more RX queues visible to the network stack than
>> TX queues which is worth mentioning but does not cause any issues as
>> far as I can tell.
>>
>> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
>> ---
>>  drivers/net/virtio_net.c |   30 ++++++++++++++++++++++++++++--
>>  1 file changed, 28 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index a009299..28b1196 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -114,6 +114,9 @@ struct virtnet_info {
>>  	/* # of queue pairs currently used by the driver */
>>  	u16 curr_queue_pairs;
>>  
>> +	/* # of XDP queue pairs currently used by the driver */
>> +	u16 xdp_queue_pairs;
>> +
>>  	/* I like... big packets and I cannot lie! */
>>  	bool big_packets;
>>  
>> @@ -1547,7 +1550,8 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
>>  	unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
>>  	struct virtnet_info *vi = netdev_priv(dev);
>>  	struct bpf_prog *old_prog;
>> -	int i;
>> +	u16 xdp_qp = 0, curr_qp;
>> +	int i, err;
>>  
>>  	if ((dev->features & NETIF_F_LRO) && prog) {
>>  		netdev_warn(dev, "can't set XDP while LRO is on, disable LRO first\n");
>> @@ -1564,12 +1568,34 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
>>  		return -EINVAL;
>>  	}
>>  
>> +	curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
>> +	if (prog)
>> +		xdp_qp = nr_cpu_ids;
>> +
>> +	/* XDP requires extra queues for XDP_TX */
>> +	if (curr_qp + xdp_qp > vi->max_queue_pairs) {
>> +		netdev_warn(dev, "request %i queues but max is %i\n",
>> +			    curr_qp + xdp_qp, vi->max_queue_pairs);
>> +		return -ENOMEM;
>> +	}
> 
> Can't we disable XDP_TX somehow? Many people might only want RX drop,
> and extra queues are not always there.
> 

Alexei, Daniel, any thoughts on this?

I know we were trying to claim some base level of feature support for
all XDP drivers. I am sympathetic to this argument though for DDOS we
do not need XDP_TX support. And virtio can become queue constrained
in some cases.

But, I do not want to silently degrade to RX mode and trying to check
this through the verifier appears challenging. And I'm not thrilled
about more knobs :/ Maybe an escape to force RX mode in sysfs or at
program load time would be OK?

I think this is an improvement that can go on my list along with LRO.

.John

^ permalink raw reply

* [PATCH 3/3] i40e: Don't reset/rebuild rings on XDP program swap
From: Björn Töpel @ 2016-12-08 17:00 UTC (permalink / raw)
  To: jeffrey.t.kirsher, intel-wired-lan
  Cc: Björn Töpel, john.r.fastabend, magnus.karlsson, netdev
In-Reply-To: <20161208170022.11555-1-bjorn.topel@gmail.com>

From: Björn Töpel <bjorn.topel@intel.com>

Previously, when swapping from one XDP program to another, a
reset/rebuild rings was triggered. Now, the XDP program is simply
changed without that requirement.

Acked-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h      |  4 +--
 drivers/net/ethernet/intel/i40e/i40e_main.c | 41 ++++++++++++++++++-----------
 drivers/net/ethernet/intel/i40e/i40e_txrx.c | 25 +++++++++++-------
 drivers/net/ethernet/intel/i40e/i40e_txrx.h |  2 +-
 4 files changed, 44 insertions(+), 28 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index adc1f3f32729..9bc2a8cf5c2e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -549,7 +549,7 @@ struct i40e_vsi {
 	 * regular rings, i.e. alloc_queue_pairs/num_queue_pairs
 	 */
 	struct i40e_ring **xdp_rings;
-	struct bpf_prog *xdp_prog;
+	bool xdp_enabled;
 
 	u32  active_filters;
 	u32  promisc_threshold;
@@ -920,6 +920,6 @@ void i40e_print_link_message(struct i40e_vsi *vsi, bool isup);
  **/
 static inline bool i40e_enabled_xdp_vsi(const struct i40e_vsi *vsi)
 {
-	return vsi->xdp_prog;
+	return vsi->xdp_enabled;
 }
 #endif /* _I40E_H_ */
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 9310a5712ae3..7cac13d1c244 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -3116,15 +3116,6 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
 	ring->tail = hw->hw_addr + I40E_QRX_TAIL(pf_q);
 	writel(0, ring->tail);
 
-	if (i40e_enabled_xdp_vsi(vsi)) {
-		struct bpf_prog *prog;
-
-		prog = bpf_prog_add(vsi->xdp_prog, 1);
-		if (IS_ERR(prog))
-			return PTR_ERR(prog);
-		ring->xdp_prog = prog;
-	}
-
 	i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring));
 
 	return 0;
@@ -9428,7 +9419,9 @@ static int i40e_xdp_setup(struct i40e_vsi *vsi,
 {
 	struct i40e_pf *pf = vsi->back;
 	struct net_device *netdev = vsi->netdev;
-	int frame_size = netdev->mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN;
+	int i, frame_size = netdev->mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN;
+	bool need_reset;
+	struct bpf_prog *old_prog;
 
 	if (frame_size > I40E_RXBUFFER_2048)
 		return -EINVAL;
@@ -9439,13 +9432,29 @@ static int i40e_xdp_setup(struct i40e_vsi *vsi,
 	if (!i40e_enabled_xdp_vsi(vsi) && !prog)
 		return 0;
 
-	i40e_prep_for_reset(pf);
+	if (prog) {
+		prog = bpf_prog_add(prog, vsi->num_queue_pairs - 1);
+		if (IS_ERR(prog))
+			return PTR_ERR(prog);
+	}
 
-	if (vsi->xdp_prog)
-		bpf_prog_put(vsi->xdp_prog);
-	vsi->xdp_prog = prog;
+	/* When turning XDP on->off/off->on we reset and rebuild the rings. */
+	need_reset = (i40e_enabled_xdp_vsi(vsi) != !!prog);
 
-	i40e_reset_and_rebuild(pf, true);
+	if (need_reset)
+		i40e_prep_for_reset(pf);
+
+	vsi->xdp_enabled = !!prog;
+
+	if (need_reset)
+		i40e_reset_and_rebuild(pf, true);
+
+	for (i = 0; i < vsi->num_queue_pairs; i++) {
+		old_prog = rtnl_dereference(vsi->rx_rings[i]->xdp_prog);
+		rcu_assign_pointer(vsi->rx_rings[i]->xdp_prog, prog);
+		if (old_prog)
+			bpf_prog_put(old_prog);
+	}
 	return 0;
 }
 
@@ -11740,7 +11749,9 @@ static void i40e_remove(struct pci_dev *pdev)
 		pf->flags &= ~I40E_FLAG_SRIOV_ENABLED;
 	}
 
+	rtnl_lock();
 	i40e_fdir_teardown(pf);
+	rtnl_unlock();
 
 	/* If there is a switch structure or any orphans, remove them.
 	 * This will leave only the PF's VSI remaining.
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index fccdec7ae102..338b4c4c0199 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -1112,6 +1112,7 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
 	struct device *dev = rx_ring->dev;
 	unsigned long bi_size;
 	u16 i;
+	struct bpf_prog *old_prog;
 
 	/* ring already cleared, nothing to do */
 	if (!rx_ring->rx_bi)
@@ -1145,10 +1146,10 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
 	rx_ring->next_to_clean = 0;
 	rx_ring->next_to_use = 0;
 
-	if (rx_ring->xdp_prog) {
-		bpf_prog_put(rx_ring->xdp_prog);
-		rx_ring->xdp_prog = NULL;
-	}
+	old_prog = rtnl_dereference(rx_ring->xdp_prog);
+	RCU_INIT_POINTER(rx_ring->xdp_prog, NULL);
+	if (old_prog)
+		bpf_prog_put(old_prog);
 }
 
 /**
@@ -1880,6 +1881,7 @@ bool i40e_fetch_rx_buffer(struct i40e_ring *rx_ring,
 {
 	struct i40e_rx_buffer *rx_buffer;
 	struct page *page;
+	struct bpf_prog *xdp_prog;
 
 	rx_buffer = &rx_ring->rx_bi[rx_ring->next_to_clean];
 	page = rx_buffer->page;
@@ -1892,14 +1894,17 @@ bool i40e_fetch_rx_buffer(struct i40e_ring *rx_ring,
 				      I40E_RXBUFFER_2048,
 				      DMA_FROM_DEVICE);
 
-	if (rx_ring->xdp_prog) {
-		bool xdp_consumed;
-
-		xdp_consumed = i40e_run_xdp(rx_ring, rx_buffer,
-					    rx_desc, rx_ring->xdp_prog);
-		if (xdp_consumed)
+	rcu_read_lock();
+	xdp_prog = rcu_dereference(rx_ring->xdp_prog);
+	if (xdp_prog) {
+		bool xdp_consumed = i40e_run_xdp(rx_ring, rx_buffer,
+						 rx_desc, xdp_prog);
+		if (xdp_consumed) {
+			rcu_read_unlock();
 			return true;
+		}
 	}
+	rcu_read_unlock();
 
 	*skb = rx_buffer->skb;
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index 4d9459134e69..cfb2c1016242 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -344,7 +344,7 @@ struct i40e_ring {
 	struct rcu_head rcu;		/* to avoid race on free */
 	u16 next_to_alloc;
 
-	struct bpf_prog *xdp_prog;
+	struct bpf_prog __rcu *xdp_prog;
 	struct i40e_ring *xdp_sibling;  /* rx to xdp, and xdp to rx */
 	bool xdp_needs_tail_bump;
 	u16 curr_in_use;
-- 
2.9.3

^ permalink raw reply related

* [PATCH 2/3] i40e: Add XDP_TX support
From: Björn Töpel @ 2016-12-08 17:00 UTC (permalink / raw)
  To: jeffrey.t.kirsher, intel-wired-lan
  Cc: Björn Töpel, john.r.fastabend, magnus.karlsson, netdev
In-Reply-To: <20161208170022.11555-1-bjorn.topel@gmail.com>

From: Björn Töpel <bjorn.topel@intel.com>

This patch adds proper XDP_TX support.

Acked-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h      |   5 +
 drivers/net/ethernet/intel/i40e/i40e_main.c | 273 ++++++++++++++++++++-----
 drivers/net/ethernet/intel/i40e/i40e_txrx.c | 304 +++++++++++++++++++++++-----
 drivers/net/ethernet/intel/i40e/i40e_txrx.h |   5 +
 4 files changed, 491 insertions(+), 96 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index 05d805f439e6..adc1f3f32729 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -545,6 +545,10 @@ struct i40e_vsi {
 	struct i40e_ring **rx_rings;
 	struct i40e_ring **tx_rings;
 
+	/* The XDP rings are Tx only, and follows the count of the
+	 * regular rings, i.e. alloc_queue_pairs/num_queue_pairs
+	 */
+	struct i40e_ring **xdp_rings;
 	struct bpf_prog *xdp_prog;
 
 	u32  active_filters;
@@ -622,6 +626,7 @@ struct i40e_q_vector {
 
 	struct i40e_ring_container rx;
 	struct i40e_ring_container tx;
+	struct i40e_ring_container xdp;
 
 	u8 num_ringpairs;	/* total number of ring pairs in vector */
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index db0240213f3b..9310a5712ae3 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -107,6 +107,18 @@ MODULE_VERSION(DRV_VERSION);
 static struct workqueue_struct *i40e_wq;
 
 /**
+ * i40e_alloc_queue_pairs_xdp_vsi - required # of XDP queue pairs
+ * @vsi: pointer to a vsi
+ **/
+static u16 i40e_alloc_queue_pairs_xdp_vsi(const struct i40e_vsi *vsi)
+{
+	if (i40e_enabled_xdp_vsi(vsi))
+		return vsi->alloc_queue_pairs;
+
+	return 0;
+}
+
+/**
  * i40e_allocate_dma_mem_d - OS specific memory alloc for shared code
  * @hw:   pointer to the HW structure
  * @mem:  ptr to mem struct to fill out
@@ -2828,6 +2840,12 @@ static int i40e_vsi_setup_tx_resources(struct i40e_vsi *vsi)
 	for (i = 0; i < vsi->num_queue_pairs && !err; i++)
 		err = i40e_setup_tx_descriptors(vsi->tx_rings[i]);
 
+	if (!i40e_enabled_xdp_vsi(vsi))
+		return err;
+
+	for (i = 0; i < vsi->num_queue_pairs && !err; i++)
+		err = i40e_setup_tx_descriptors(vsi->xdp_rings[i]);
+
 	return err;
 }
 
@@ -2841,12 +2859,17 @@ static void i40e_vsi_free_tx_resources(struct i40e_vsi *vsi)
 {
 	int i;
 
-	if (!vsi->tx_rings)
-		return;
+	if (vsi->tx_rings) {
+		for (i = 0; i < vsi->num_queue_pairs; i++)
+			if (vsi->tx_rings[i] && vsi->tx_rings[i]->desc)
+				i40e_free_tx_resources(vsi->tx_rings[i]);
+	}
 
-	for (i = 0; i < vsi->num_queue_pairs; i++)
-		if (vsi->tx_rings[i] && vsi->tx_rings[i]->desc)
-			i40e_free_tx_resources(vsi->tx_rings[i]);
+	if (vsi->xdp_rings) {
+		for (i = 0; i < vsi->num_queue_pairs; i++)
+			if (vsi->xdp_rings[i] && vsi->xdp_rings[i]->desc)
+				i40e_free_tx_resources(vsi->xdp_rings[i]);
+	}
 }
 
 /**
@@ -3121,6 +3144,12 @@ static int i40e_vsi_configure_tx(struct i40e_vsi *vsi)
 	for (i = 0; (i < vsi->num_queue_pairs) && !err; i++)
 		err = i40e_configure_tx_ring(vsi->tx_rings[i]);
 
+	if (!i40e_enabled_xdp_vsi(vsi))
+		return err;
+
+	for (i = 0; (i < vsi->num_queue_pairs) && !err; i++)
+		err = i40e_configure_tx_ring(vsi->xdp_rings[i]);
+
 	return err;
 }
 
@@ -3269,7 +3298,7 @@ static void i40e_vsi_configure_msix(struct i40e_vsi *vsi)
 	struct i40e_hw *hw = &pf->hw;
 	u16 vector;
 	int i, q;
-	u32 qp;
+	u32 qp, qp_idx = 0;
 
 	/* The interrupt indexing is offset by 1 in the PFINT_ITRn
 	 * and PFINT_LNKLSTn registers, e.g.:
@@ -3296,16 +3325,33 @@ static void i40e_vsi_configure_msix(struct i40e_vsi *vsi)
 		wr32(hw, I40E_PFINT_LNKLSTN(vector - 1), qp);
 		for (q = 0; q < q_vector->num_ringpairs; q++) {
 			u32 val;
+			u32 nqp = qp;
+
+			if (i40e_enabled_xdp_vsi(vsi)) {
+				nqp = vsi->base_queue +
+				      vsi->xdp_rings[qp_idx]->queue_index;
+			}
 
 			val = I40E_QINT_RQCTL_CAUSE_ENA_MASK |
-			      (I40E_RX_ITR << I40E_QINT_RQCTL_ITR_INDX_SHIFT)  |
-			      (vector      << I40E_QINT_RQCTL_MSIX_INDX_SHIFT) |
-			      (qp          << I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT)|
+			      (I40E_RX_ITR << I40E_QINT_RQCTL_ITR_INDX_SHIFT)   |
+			      (vector      << I40E_QINT_RQCTL_MSIX_INDX_SHIFT)  |
+			      (nqp         << I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT) |
 			      (I40E_QUEUE_TYPE_TX
 				      << I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT);
 
 			wr32(hw, I40E_QINT_RQCTL(qp), val);
 
+			if (i40e_enabled_xdp_vsi(vsi)) {
+				val = I40E_QINT_TQCTL_CAUSE_ENA_MASK |
+				      (I40E_TX_ITR << I40E_QINT_TQCTL_ITR_INDX_SHIFT)   |
+				      (vector      << I40E_QINT_TQCTL_MSIX_INDX_SHIFT)  |
+				      (qp          << I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT) |
+				      (I40E_QUEUE_TYPE_TX
+				       << I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT);
+
+				wr32(hw, I40E_QINT_TQCTL(nqp), val);
+			}
+
 			val = I40E_QINT_TQCTL_CAUSE_ENA_MASK |
 			      (I40E_TX_ITR << I40E_QINT_TQCTL_ITR_INDX_SHIFT)  |
 			      (vector      << I40E_QINT_TQCTL_MSIX_INDX_SHIFT) |
@@ -3320,6 +3366,7 @@ static void i40e_vsi_configure_msix(struct i40e_vsi *vsi)
 
 			wr32(hw, I40E_QINT_TQCTL(qp), val);
 			qp++;
+			qp_idx++;
 		}
 	}
 
@@ -3562,6 +3609,10 @@ static void i40e_vsi_disable_irq(struct i40e_vsi *vsi)
 	for (i = 0; i < vsi->num_queue_pairs; i++) {
 		wr32(hw, I40E_QINT_TQCTL(vsi->tx_rings[i]->reg_idx), 0);
 		wr32(hw, I40E_QINT_RQCTL(vsi->rx_rings[i]->reg_idx), 0);
+		if (i40e_enabled_xdp_vsi(vsi)) {
+			wr32(hw, I40E_QINT_TQCTL(vsi->xdp_rings[i]->reg_idx),
+			     0);
+		}
 	}
 
 	if (pf->flags & I40E_FLAG_MSIX_ENABLED) {
@@ -3871,6 +3922,24 @@ static void i40e_map_vector_to_qp(struct i40e_vsi *vsi, int v_idx, int qp_idx)
 }
 
 /**
+ * i40e_map_vector_to_xdp_ring - Assigns the XDP Tx queue to the vector
+ * @vsi: the VSI being configured
+ * @v_idx: vector index
+ * @xdp_idx: XDP Tx queue index
+ **/
+static void i40e_map_vector_to_xdp_ring(struct i40e_vsi *vsi, int v_idx,
+					int xdp_idx)
+{
+	struct i40e_q_vector *q_vector = vsi->q_vectors[v_idx];
+	struct i40e_ring *xdp_ring = vsi->xdp_rings[xdp_idx];
+
+	xdp_ring->q_vector = q_vector;
+	xdp_ring->next = q_vector->xdp.ring;
+	q_vector->xdp.ring = xdp_ring;
+	q_vector->xdp.count++;
+}
+
+/**
  * i40e_vsi_map_rings_to_vectors - Maps descriptor rings to vectors
  * @vsi: the VSI being configured
  *
@@ -3903,11 +3972,17 @@ static void i40e_vsi_map_rings_to_vectors(struct i40e_vsi *vsi)
 
 		q_vector->rx.count = 0;
 		q_vector->tx.count = 0;
+		q_vector->xdp.count = 0;
 		q_vector->rx.ring = NULL;
 		q_vector->tx.ring = NULL;
+		q_vector->xdp.ring = NULL;
 
 		while (num_ringpairs--) {
 			i40e_map_vector_to_qp(vsi, v_start, qp_idx);
+			if (i40e_enabled_xdp_vsi(vsi)) {
+				i40e_map_vector_to_xdp_ring(vsi, v_start,
+							    qp_idx);
+			}
 			qp_idx++;
 			qp_remaining--;
 		}
@@ -4001,56 +4076,82 @@ static int i40e_pf_txq_wait(struct i40e_pf *pf, int pf_q, bool enable)
 }
 
 /**
- * i40e_vsi_control_tx - Start or stop a VSI's rings
+ * i40e_vsi_control_txq - Start or stop a VSI's queue
  * @vsi: the VSI being configured
  * @enable: start or stop the rings
+ * @pf_q: the PF queue
  **/
-static int i40e_vsi_control_tx(struct i40e_vsi *vsi, bool enable)
+static int i40e_vsi_control_txq(struct i40e_vsi *vsi, bool enable, int pf_q)
 {
 	struct i40e_pf *pf = vsi->back;
 	struct i40e_hw *hw = &pf->hw;
-	int i, j, pf_q, ret = 0;
+	int j, ret = 0;
 	u32 tx_reg;
 
-	pf_q = vsi->base_queue;
-	for (i = 0; i < vsi->num_queue_pairs; i++, pf_q++) {
+	/* warn the TX unit of coming changes */
+	i40e_pre_tx_queue_cfg(&pf->hw, pf_q, enable);
+	if (!enable)
+		usleep_range(10, 20);
 
-		/* warn the TX unit of coming changes */
-		i40e_pre_tx_queue_cfg(&pf->hw, pf_q, enable);
-		if (!enable)
-			usleep_range(10, 20);
+	for (j = 0; j < 50; j++) {
+		tx_reg = rd32(hw, I40E_QTX_ENA(pf_q));
+		if (((tx_reg >> I40E_QTX_ENA_QENA_REQ_SHIFT) & 1) ==
+		    ((tx_reg >> I40E_QTX_ENA_QENA_STAT_SHIFT) & 1))
+			break;
+		usleep_range(1000, 2000);
+	}
+	/* Skip if the queue is already in the requested state */
+	if (enable == !!(tx_reg & I40E_QTX_ENA_QENA_STAT_MASK))
+		return 0;
 
-		for (j = 0; j < 50; j++) {
-			tx_reg = rd32(hw, I40E_QTX_ENA(pf_q));
-			if (((tx_reg >> I40E_QTX_ENA_QENA_REQ_SHIFT) & 1) ==
-			    ((tx_reg >> I40E_QTX_ENA_QENA_STAT_SHIFT) & 1))
-				break;
-			usleep_range(1000, 2000);
-		}
-		/* Skip if the queue is already in the requested state */
-		if (enable == !!(tx_reg & I40E_QTX_ENA_QENA_STAT_MASK))
-			continue;
+	/* turn on/off the queue */
+	if (enable) {
+		wr32(hw, I40E_QTX_HEAD(pf_q), 0);
+		tx_reg |= I40E_QTX_ENA_QENA_REQ_MASK;
+	} else {
+		tx_reg &= ~I40E_QTX_ENA_QENA_REQ_MASK;
+	}
 
-		/* turn on/off the queue */
-		if (enable) {
-			wr32(hw, I40E_QTX_HEAD(pf_q), 0);
-			tx_reg |= I40E_QTX_ENA_QENA_REQ_MASK;
-		} else {
-			tx_reg &= ~I40E_QTX_ENA_QENA_REQ_MASK;
-		}
+	wr32(hw, I40E_QTX_ENA(pf_q), tx_reg);
+	/* No waiting for the Tx queue to disable */
+	if (!enable && test_bit(__I40E_PORT_TX_SUSPENDED, &pf->state))
+		return 0;
 
-		wr32(hw, I40E_QTX_ENA(pf_q), tx_reg);
-		/* No waiting for the Tx queue to disable */
-		if (!enable && test_bit(__I40E_PORT_TX_SUSPENDED, &pf->state))
-			continue;
+	/* wait for the change to finish */
+	ret = i40e_pf_txq_wait(pf, pf_q, enable);
+	if (ret) {
+		dev_info(&pf->pdev->dev,
+			 "VSI seid %d Tx ring %d %sable timeout\n",
+			 vsi->seid, pf_q, (enable ? "en" : "dis"));
+		return ret;
+	}
+	return 0;
+}
 
-		/* wait for the change to finish */
-		ret = i40e_pf_txq_wait(pf, pf_q, enable);
-		if (ret) {
-			dev_info(&pf->pdev->dev,
-				 "VSI seid %d Tx ring %d %sable timeout\n",
-				 vsi->seid, pf_q, (enable ? "en" : "dis"));
+/**
+ * i40e_vsi_control_tx - Start or stop a VSI's rings
+ * @vsi: the VSI being configured
+ * @enable: start or stop the rings
+ **/
+static int i40e_vsi_control_tx(struct i40e_vsi *vsi, bool enable)
+{
+	struct i40e_pf *pf = vsi->back;
+	struct i40e_hw *hw = &pf->hw;
+	int i, pf_q, ret = 0;
+
+	pf_q = vsi->base_queue;
+	for (i = 0; i < vsi->num_queue_pairs; i++, pf_q++) {
+		ret = i40e_vsi_control_txq(vsi, enable, pf_q);
+		if (ret)
 			break;
+	}
+
+	if (!ret && i40e_enabled_xdp_vsi(vsi)) {
+		for (i = 0; i < vsi->num_queue_pairs; i++) {
+			pf_q = vsi->base_queue + vsi->xdp_rings[i]->queue_index;
+			ret = i40e_vsi_control_txq(vsi, enable, pf_q);
+			if (ret)
+				break;
 		}
 	}
 
@@ -4311,6 +4412,9 @@ static void i40e_free_q_vector(struct i40e_vsi *vsi, int v_idx)
 	i40e_for_each_ring(ring, q_vector->rx)
 		ring->q_vector = NULL;
 
+	i40e_for_each_ring(ring, q_vector->xdp)
+		ring->q_vector = NULL;
+
 	/* only VSI w/ an associated netdev is set up w/ NAPI */
 	if (vsi->netdev)
 		netif_napi_del(&q_vector->napi);
@@ -4534,6 +4638,21 @@ static int i40e_vsi_wait_queues_disabled(struct i40e_vsi *vsi)
 		}
 	}
 
+	if (!i40e_enabled_xdp_vsi(vsi))
+		return 0;
+
+	for (i = 0; i < vsi->num_queue_pairs; i++) {
+		pf_q = vsi->base_queue + vsi->xdp_rings[i]->queue_index;
+		/* Check and wait for the disable status of the queue */
+		ret = i40e_pf_txq_wait(pf, pf_q, false);
+		if (ret) {
+			dev_info(&pf->pdev->dev,
+				 "VSI seid %d XDP Tx ring %d disable timeout\n",
+				 vsi->seid, pf_q);
+			return ret;
+		}
+	}
+
 	return 0;
 }
 
@@ -5474,6 +5593,8 @@ void i40e_down(struct i40e_vsi *vsi)
 
 	for (i = 0; i < vsi->num_queue_pairs; i++) {
 		i40e_clean_tx_ring(vsi->tx_rings[i]);
+		if (i40e_enabled_xdp_vsi(vsi))
+			i40e_clean_tx_ring(vsi->xdp_rings[i]);
 		i40e_clean_rx_ring(vsi->rx_rings[i]);
 	}
 
@@ -7446,6 +7567,16 @@ static int i40e_vsi_alloc_arrays(struct i40e_vsi *vsi, bool alloc_qvectors)
 		return -ENOMEM;
 	vsi->rx_rings = &vsi->tx_rings[vsi->alloc_queue_pairs];
 
+	if (i40e_enabled_xdp_vsi(vsi)) {
+		size = sizeof(struct i40e_ring *) *
+		       i40e_alloc_queue_pairs_xdp_vsi(vsi);
+		vsi->xdp_rings = kzalloc(size, GFP_KERNEL);
+		if (!vsi->xdp_rings) {
+			ret = -ENOMEM;
+			goto err_xdp_rings;
+		}
+	}
+
 	if (alloc_qvectors) {
 		/* allocate memory for q_vector pointers */
 		size = sizeof(struct i40e_q_vector *) * vsi->num_q_vectors;
@@ -7458,6 +7589,8 @@ static int i40e_vsi_alloc_arrays(struct i40e_vsi *vsi, bool alloc_qvectors)
 	return ret;
 
 err_vectors:
+	kfree(vsi->xdp_rings);
+err_xdp_rings:
 	kfree(vsi->tx_rings);
 	return ret;
 }
@@ -7564,6 +7697,8 @@ static void i40e_vsi_free_arrays(struct i40e_vsi *vsi, bool free_qvectors)
 	kfree(vsi->tx_rings);
 	vsi->tx_rings = NULL;
 	vsi->rx_rings = NULL;
+	kfree(vsi->xdp_rings);
+	vsi->xdp_rings = NULL;
 }
 
 /**
@@ -7649,6 +7784,13 @@ static void i40e_vsi_clear_rings(struct i40e_vsi *vsi)
 			vsi->rx_rings[i] = NULL;
 		}
 	}
+
+	if (vsi->xdp_rings && vsi->xdp_rings[0]) {
+		for (i = 0; i < vsi->alloc_queue_pairs; i++) {
+			kfree_rcu(vsi->xdp_rings[i], rcu);
+			vsi->xdp_rings[i] = NULL;
+		}
+	}
 }
 
 /**
@@ -7696,6 +7838,31 @@ static int i40e_alloc_rings(struct i40e_vsi *vsi)
 		vsi->rx_rings[i] = rx_ring;
 	}
 
+	if (!i40e_enabled_xdp_vsi(vsi))
+		return 0;
+
+	for (i = 0; i < vsi->alloc_queue_pairs; i++) {
+		tx_ring = kzalloc(sizeof(*tx_ring), GFP_KERNEL);
+		if (!tx_ring)
+			goto err_out;
+
+		tx_ring->queue_index = vsi->alloc_queue_pairs + i;
+		tx_ring->reg_idx = vsi->base_queue + vsi->alloc_queue_pairs + i;
+		tx_ring->ring_active = false;
+		tx_ring->vsi = vsi;
+		tx_ring->netdev = NULL;
+		tx_ring->dev = &pf->pdev->dev;
+		tx_ring->count = vsi->num_desc;
+		tx_ring->size = 0;
+		tx_ring->dcb_tc = 0;
+		if (vsi->back->flags & I40E_FLAG_WB_ON_ITR_CAPABLE)
+			tx_ring->flags = I40E_TXR_FLAGS_WB_ON_ITR;
+		tx_ring->tx_itr_setting = pf->tx_itr_default;
+		tx_ring->xdp_sibling = vsi->rx_rings[i];
+		vsi->xdp_rings[i] = tx_ring;
+		vsi->rx_rings[i]->xdp_sibling = tx_ring;
+	}
+
 	return 0;
 
 err_out:
@@ -9921,6 +10088,7 @@ static struct i40e_vsi *i40e_vsi_reinit_setup(struct i40e_vsi *vsi)
 	struct i40e_pf *pf;
 	u8 enabled_tc;
 	int ret;
+	u16 alloc_queue_pairs;
 
 	if (!vsi)
 		return NULL;
@@ -9936,11 +10104,13 @@ static struct i40e_vsi *i40e_vsi_reinit_setup(struct i40e_vsi *vsi)
 	if (ret)
 		goto err_vsi;
 
-	ret = i40e_get_lump(pf, pf->qp_pile, vsi->alloc_queue_pairs, vsi->idx);
+	alloc_queue_pairs = vsi->alloc_queue_pairs +
+			    i40e_alloc_queue_pairs_xdp_vsi(vsi);
+	ret = i40e_get_lump(pf, pf->qp_pile, alloc_queue_pairs, vsi->idx);
 	if (ret < 0) {
 		dev_info(&pf->pdev->dev,
 			 "failed to get tracking for %d queues for VSI %d err %d\n",
-			 vsi->alloc_queue_pairs, vsi->seid, ret);
+			 alloc_queue_pairs, vsi->seid, ret);
 		goto err_vsi;
 	}
 	vsi->base_queue = ret;
@@ -9998,6 +10168,7 @@ struct i40e_vsi *i40e_vsi_setup(struct i40e_pf *pf, u8 type,
 	struct i40e_veb *veb = NULL;
 	int ret, i;
 	int v_idx;
+	u16 alloc_queue_pairs;
 
 	/* The requested uplink_seid must be either
 	 *     - the PF's port seid
@@ -10082,13 +10253,15 @@ struct i40e_vsi *i40e_vsi_setup(struct i40e_pf *pf, u8 type,
 		pf->lan_vsi = v_idx;
 	else if (type == I40E_VSI_SRIOV)
 		vsi->vf_id = param1;
+
+	alloc_queue_pairs = vsi->alloc_queue_pairs +
+			    i40e_alloc_queue_pairs_xdp_vsi(vsi);
 	/* assign it some queues */
-	ret = i40e_get_lump(pf, pf->qp_pile, vsi->alloc_queue_pairs,
-				vsi->idx);
+	ret = i40e_get_lump(pf, pf->qp_pile, alloc_queue_pairs,	vsi->idx);
 	if (ret < 0) {
 		dev_info(&pf->pdev->dev,
 			 "failed to get tracking for %d queues for VSI %d err=%d\n",
-			 vsi->alloc_queue_pairs, vsi->seid, ret);
+			 alloc_queue_pairs, vsi->seid, ret);
 		goto err_vsi;
 	}
 	vsi->base_queue = ret;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index d835a51dafa6..fccdec7ae102 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -520,6 +520,8 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring,
 	if (tx_buffer->skb) {
 		if (tx_buffer->tx_flags & I40E_TX_FLAGS_FD_SB)
 			kfree(tx_buffer->raw_buf);
+		else if (tx_buffer->tx_flags & I40E_TX_FLAGS_XDP)
+			put_page(tx_buffer->page);
 		else
 			dev_kfree_skb_any(tx_buffer->skb);
 		if (dma_unmap_len(tx_buffer, len))
@@ -620,6 +622,15 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw)
 #define WB_STRIDE 4
 
 /**
+ * i40e_page_is_reserved - check if reuse is possible
+ * @page: page struct to check
+ */
+static inline bool i40e_page_is_reserved(struct page *page)
+{
+	return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
+}
+
+/**
  * i40e_clean_tx_irq - Reclaim resources after transmit completes
  * @vsi: the VSI we care about
  * @tx_ring: Tx ring to clean
@@ -762,6 +773,98 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
 	return !!budget;
 }
 
+static bool i40e_clean_xdp_irq(struct i40e_vsi *vsi,
+			       struct i40e_ring *tx_ring)
+{
+	u16 i = tx_ring->next_to_clean;
+	struct i40e_tx_buffer *tx_buf;
+	struct i40e_tx_desc *tx_head;
+	struct i40e_tx_desc *tx_desc;
+	unsigned int total_bytes = 0, total_packets = 0;
+	unsigned int budget = vsi->work_limit;
+
+	tx_buf = &tx_ring->tx_bi[i];
+	tx_desc = I40E_TX_DESC(tx_ring, i);
+	i -= tx_ring->count;
+
+	tx_head = I40E_TX_DESC(tx_ring, i40e_get_head(tx_ring));
+
+	do {
+		struct i40e_tx_desc *eop_desc = tx_buf->next_to_watch;
+
+		/* if next_to_watch is not set then there is no work pending */
+		if (!eop_desc)
+			break;
+
+		/* prevent any other reads prior to eop_desc */
+		read_barrier_depends();
+
+		/* we have caught up to head, no work left to do */
+		if (tx_head == tx_desc)
+			break;
+
+		/* clear next_to_watch to prevent false hangs */
+		tx_buf->next_to_watch = NULL;
+
+		/* update the statistics for this packet */
+		total_bytes += tx_buf->bytecount;
+		total_packets += tx_buf->gso_segs;
+
+		put_page(tx_buf->page);
+
+		/* unmap skb header data */
+		dma_unmap_single(tx_ring->dev,
+				 dma_unmap_addr(tx_buf, dma),
+				 dma_unmap_len(tx_buf, len),
+				 DMA_TO_DEVICE);
+
+		/* clear tx_buffer data */
+		tx_buf->skb = NULL;
+		dma_unmap_len_set(tx_buf, len, 0);
+
+		/* move us one more past the eop_desc for start of next pkt */
+		tx_buf++;
+		tx_desc++;
+		i++;
+		if (unlikely(!i)) {
+			i -= tx_ring->count;
+			tx_buf = tx_ring->tx_bi;
+			tx_desc = I40E_TX_DESC(tx_ring, 0);
+		}
+
+		prefetch(tx_desc);
+
+		/* update budget accounting */
+		budget--;
+	} while (likely(budget));
+
+	i += tx_ring->count;
+	tx_ring->next_to_clean = i;
+	u64_stats_update_begin(&tx_ring->syncp);
+	tx_ring->stats.bytes += total_bytes;
+	tx_ring->stats.packets += total_packets;
+	u64_stats_update_end(&tx_ring->syncp);
+	tx_ring->q_vector->tx.total_bytes += total_bytes;
+	tx_ring->q_vector->tx.total_packets += total_packets;
+
+	if (tx_ring->flags & I40E_TXR_FLAGS_WB_ON_ITR) {
+		/* check to see if there are < 4 descriptors
+		 * waiting to be written back, then kick the hardware to force
+		 * them to be written back in case we stay in NAPI.
+		 * In this mode on X722 we do not enable Interrupt.
+		 */
+		unsigned int j = i40e_get_tx_pending(tx_ring, false);
+
+		if (budget &&
+		    ((j / WB_STRIDE) == 0) && (j > 0) &&
+		    !test_bit(__I40E_DOWN, &vsi->state) &&
+		    (I40E_DESC_UNUSED(tx_ring) != tx_ring->count))
+			tx_ring->arm_wb = true;
+	}
+
+	return !!budget;
+}
+
 /**
  * i40e_enable_wb_on_itr - Arm hardware to do a wb, interrupts are not enabled
  * @vsi: the VSI we care about
@@ -1496,35 +1599,46 @@ static bool i40e_cleanup_headers(struct i40e_ring *rx_ring, struct sk_buff *skb)
 }
 
 /**
- * i40e_reuse_rx_page - page flip buffer and store it back on the ring
- * @rx_ring: rx descriptor ring to store buffers on
- * @old_buff: donor buffer to have page reused
+ * i40e_try_flip_rx_page - attempts to flip a page for reuse
+ * @rx_buffer: The buffer to alter
  *
- * Synchronizes page for reuse by the adapter
- **/
-static void i40e_reuse_rx_page(struct i40e_ring *rx_ring,
-			       struct i40e_rx_buffer *old_buff)
+ * Returns true if the page was successfully flipped and can be
+ * reused.
+ */
+static bool i40e_try_flip_rx_page(struct i40e_rx_buffer *rx_buffer)
 {
-	struct i40e_rx_buffer *new_buff;
-	u16 nta = rx_ring->next_to_alloc;
+#if (PAGE_SIZE < 8192)
+	unsigned int truesize = I40E_RXBUFFER_2048;
+#else
+	unsigned int truesize = ALIGN(size, L1_CACHE_BYTES);
+	unsigned int last_offset = PAGE_SIZE - I40E_RXBUFFER_2048;
+#endif
 
-	new_buff = &rx_ring->rx_bi[nta];
+	/* avoid re-using remote pages */
+	if (unlikely(i40e_page_is_reserved(rx_buffer->page)))
+		return false;
 
-	/* update, and store next to alloc */
-	nta++;
-	rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
+#if (PAGE_SIZE < 8192)
+	/* if we are only owner of page we can reuse it */
+	if (unlikely(page_count(rx_buffer->page) != 1))
+		return false;
 
-	/* transfer page from old buffer to new buffer */
-	*new_buff = *old_buff;
-}
+	/* flip page offset to other buffer */
+	rx_buffer->page_offset ^= truesize;
+#else
+	/* move offset up to the next cache line */
+	rx_buffer->page_offset += truesize;
 
-/**
- * i40e_page_is_reserved - check if reuse is possible
- * @page: page struct to check
- */
-static inline bool i40e_page_is_reserved(struct page *page)
-{
-	return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
+	if (rx_buffer->page_offset > last_offset)
+		return false;
+#endif
+
+	/* Even if we own the page, we are not allowed to use atomic_set()
+	 * This would break get_page_unless_zero() users.
+	 */
+	get_page(rx_buffer->page);
+
+	return true;
 }
 
 /**
@@ -1555,7 +1669,6 @@ static bool i40e_add_rx_frag(struct i40e_ring *rx_ring,
 	unsigned int truesize = I40E_RXBUFFER_2048;
 #else
 	unsigned int truesize = ALIGN(size, L1_CACHE_BYTES);
-	unsigned int last_offset = PAGE_SIZE - I40E_RXBUFFER_2048;
 #endif
 
 	/* will the data fit in the skb we allocated? if so, just
@@ -1578,34 +1691,107 @@ static bool i40e_add_rx_frag(struct i40e_ring *rx_ring,
 	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
 			rx_buffer->page_offset, size, truesize);
 
-	/* avoid re-using remote pages */
-	if (unlikely(i40e_page_is_reserved(page)))
-		return false;
+	return i40e_try_flip_rx_page(rx_buffer);
+}
 
-#if (PAGE_SIZE < 8192)
-	/* if we are only owner of page we can reuse it */
-	if (unlikely(page_count(page) != 1))
+/**
+ * i40e_xdp_xmit_tail_bump - updates the tail and sets the RS bit
+ * @xdp_ring: XDP Tx ring
+ **/
+static
+void i40e_xdp_xmit_tail_bump(struct i40e_ring *xdp_ring)
+{
+	struct i40e_tx_desc *tx_desc;
+
+	/* Set RS and bump tail */
+	tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->curr_in_use);
+	tx_desc->cmd_type_offset_bsz |=
+		cpu_to_le64(I40E_TX_DESC_CMD_RS << I40E_TXD_QW1_CMD_SHIFT);
+	/* Force memory writes to complete before letting h/w know
+	 * there are new descriptors to fetch.  (Only applicable for
+	 * weak-ordered memory model archs, such as IA-64).
+	 */
+	wmb();
+	writel(xdp_ring->curr_in_use, xdp_ring->tail);
+
+	xdp_ring->xdp_needs_tail_bump = false;
+}
+
+/**
+ * i40e_xdp_xmit - transmit a frame on the XDP Tx queue
+ * @xdp_ring: XDP Tx ring
+ * @page: current page containing the frame
+ * @page_offset: offset where the frame resides
+ * @dma: Bus address of the frame
+ * @size: size of the frame
+ *
+ * Returns true successfully sent.
+ **/
+static bool i40e_xdp_xmit(void *data, size_t size, struct page *page,
+			  struct i40e_ring *xdp_ring)
+{
+	struct i40e_tx_buffer *tx_bi;
+	struct i40e_tx_desc *tx_desc;
+	u16 i = xdp_ring->next_to_use;
+	dma_addr_t dma;
+
+	if (unlikely(I40E_DESC_UNUSED(xdp_ring) < 1)) {
+		if (xdp_ring->xdp_needs_tail_bump)
+			i40e_xdp_xmit_tail_bump(xdp_ring);
+		xdp_ring->tx_stats.tx_busy++;
 		return false;
+	}
 
-	/* flip page offset to other buffer */
-	rx_buffer->page_offset ^= truesize;
-#else
-	/* move offset up to the next cache line */
-	rx_buffer->page_offset += truesize;
+	tx_bi = &xdp_ring->tx_bi[i];
+	tx_bi->bytecount = size;
+	tx_bi->gso_segs = 1;
+	tx_bi->tx_flags = I40E_TX_FLAGS_XDP;
+	tx_bi->page = page;
 
-	if (rx_buffer->page_offset > last_offset)
+	dma = dma_map_single(xdp_ring->dev, data, size, DMA_TO_DEVICE);
+	if (dma_mapping_error(xdp_ring->dev, dma))
 		return false;
-#endif
 
-	/* Even if we own the page, we are not allowed to use atomic_set()
-	 * This would break get_page_unless_zero() users.
-	 */
-	get_page(rx_buffer->page);
+	/* record length, and DMA address */
+	dma_unmap_len_set(tx_bi, len, size);
+	dma_unmap_addr_set(tx_bi, dma, dma);
 
+	tx_desc = I40E_TX_DESC(xdp_ring, i);
+	tx_desc->buffer_addr = cpu_to_le64(dma);
+	tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC
+						  | I40E_TX_DESC_CMD_EOP,
+						  0, size, 0);
+	tx_bi->next_to_watch = tx_desc;
+	xdp_ring->curr_in_use = i++;
+	xdp_ring->next_to_use = (i < xdp_ring->count) ? i : 0;
+	xdp_ring->xdp_needs_tail_bump = true;
 	return true;
 }
 
 /**
+ * i40e_reuse_rx_page - page flip buffer and store it back on the ring
+ * @rx_ring: rx descriptor ring to store buffers on
+ * @old_buff: donor buffer to have page reused
+ *
+ * Synchronizes page for reuse by the adapter
+ **/
+static void i40e_reuse_rx_page(struct i40e_ring *rx_ring,
+			       struct i40e_rx_buffer *old_buff)
+{
+	struct i40e_rx_buffer *new_buff;
+	u16 nta = rx_ring->next_to_alloc;
+
+	new_buff = &rx_ring->rx_bi[nta];
+
+	/* update, and store next to alloc */
+	nta++;
+	rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
+
+	/* transfer page from old buffer to new buffer */
+	*new_buff = *old_buff;
+}
+
+/**
  * i40e_run_xdp - Runs an XDP program for an Rx ring
  * @rx_ring: Rx ring used for XDP
  * @rx_buffer: current Rx buffer
@@ -1625,6 +1811,7 @@ static bool i40e_run_xdp(struct i40e_ring *rx_ring,
 			    I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
 	struct xdp_buff xdp;
 	u32 xdp_action;
+	bool tx_ok;
 
 	WARN_ON(!i40e_test_staterr(rx_desc,
 				   BIT(I40E_RX_DESC_STATUS_EOF_SHIFT)));
@@ -1636,21 +1823,34 @@ static bool i40e_run_xdp(struct i40e_ring *rx_ring,
 	switch (xdp_action) {
 	case XDP_PASS:
 		return false;
-	default:
-		bpf_warn_invalid_xdp_action(xdp_action);
-	case XDP_ABORTED:
 	case XDP_TX:
+		tx_ok = i40e_xdp_xmit(xdp.data, size, rx_buffer->page,
+				      rx_ring->xdp_sibling);
+		if (likely(tx_ok)) {
+			if (i40e_try_flip_rx_page(rx_buffer)) {
+				i40e_reuse_rx_page(rx_ring, rx_buffer);
+				rx_ring->rx_stats.page_reuse_count++;
+			} else {
+				dma_unmap_page(rx_ring->dev, rx_buffer->dma,
+					       PAGE_SIZE, DMA_FROM_DEVICE);
+			}
+			break;
+		}
+	case XDP_ABORTED:
 	case XDP_DROP:
+do_drop:
 		if (likely(!i40e_page_is_reserved(rx_buffer->page))) {
 			i40e_reuse_rx_page(rx_ring, rx_buffer);
 			rx_ring->rx_stats.page_reuse_count++;
 			break;
 		}
-
-		/* we are not reusing the buffer so unmap it */
 		dma_unmap_page(rx_ring->dev, rx_buffer->dma, PAGE_SIZE,
 			       DMA_FROM_DEVICE);
 		__free_pages(rx_buffer->page, 0);
+		break;
+	default:
+		bpf_warn_invalid_xdp_action(xdp_action);
+		goto do_drop;
 	}
 
 	/* clear contents of buffer_info */
@@ -2065,6 +2265,15 @@ int i40e_napi_poll(struct napi_struct *napi, int budget)
 		ring->arm_wb = false;
 	}
 
+	i40e_for_each_ring(ring, q_vector->xdp) {
+		if (!i40e_clean_xdp_irq(vsi, ring)) {
+			clean_complete = false;
+			continue;
+		}
+		arm_wb |= ring->arm_wb;
+		ring->arm_wb = false;
+	}
+
 	/* Handle case where we are called by netpoll with a budget of 0 */
 	if (budget <= 0)
 		goto tx_only;
@@ -2077,6 +2286,9 @@ int i40e_napi_poll(struct napi_struct *napi, int budget)
 	i40e_for_each_ring(ring, q_vector->rx) {
 		int cleaned = i40e_clean_rx_irq(ring, budget_per_ring);
 
+		if (ring->xdp_sibling && ring->xdp_sibling->xdp_needs_tail_bump)
+			i40e_xdp_xmit_tail_bump(ring->xdp_sibling);
+
 		work_done += cleaned;
 		/* if we clean as many as budgeted, we must not be done */
 		if (cleaned >= budget_per_ring)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index 957d856a82c4..4d9459134e69 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -220,6 +220,7 @@ static inline unsigned int i40e_txd_use_count(unsigned int size)
 #define I40E_TX_FLAGS_TSYN		BIT(8)
 #define I40E_TX_FLAGS_FD_SB		BIT(9)
 #define I40E_TX_FLAGS_UDP_TUNNEL	BIT(10)
+#define I40E_TX_FLAGS_XDP		BIT(11)
 #define I40E_TX_FLAGS_VLAN_MASK		0xffff0000
 #define I40E_TX_FLAGS_VLAN_PRIO_MASK	0xe0000000
 #define I40E_TX_FLAGS_VLAN_PRIO_SHIFT	29
@@ -230,6 +231,7 @@ struct i40e_tx_buffer {
 	union {
 		struct sk_buff *skb;
 		void *raw_buf;
+		struct page *page;
 	};
 	unsigned int bytecount;
 	unsigned short gso_segs;
@@ -343,6 +345,9 @@ struct i40e_ring {
 	u16 next_to_alloc;
 
 	struct bpf_prog *xdp_prog;
+	struct i40e_ring *xdp_sibling;  /* rx to xdp, and xdp to rx */
+	bool xdp_needs_tail_bump;
+	u16 curr_in_use;
 } ____cacheline_internodealigned_in_smp;
 
 enum i40e_latency_range {
-- 
2.9.3

^ permalink raw reply related

* [PATCH 1/3] i40e: Initial support for XDP
From: Björn Töpel @ 2016-12-08 17:00 UTC (permalink / raw)
  To: jeffrey.t.kirsher, intel-wired-lan
  Cc: Björn Töpel, john.r.fastabend, magnus.karlsson, netdev
In-Reply-To: <20161208170022.11555-1-bjorn.topel@gmail.com>

From: Björn Töpel <bjorn.topel@intel.com>

This commit adds basic XDP support for i40e derived NICs. All XDP
actions will end up in XDP_DROP.

Only the default/main VSI has support for enabling XDP.

Acked-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h         |  13 +++
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c |   3 +
 drivers/net/ethernet/intel/i40e/i40e_main.c    |  74 +++++++++++++
 drivers/net/ethernet/intel/i40e/i40e_txrx.c    | 146 ++++++++++++++++++++-----
 drivers/net/ethernet/intel/i40e/i40e_txrx.h    |   2 +
 5 files changed, 213 insertions(+), 25 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index ba8d30984bee..05d805f439e6 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -545,6 +545,8 @@ struct i40e_vsi {
 	struct i40e_ring **rx_rings;
 	struct i40e_ring **tx_rings;
 
+	struct bpf_prog *xdp_prog;
+
 	u32  active_filters;
 	u32  promisc_threshold;
 
@@ -904,4 +906,15 @@ i40e_status i40e_get_npar_bw_setting(struct i40e_pf *pf);
 i40e_status i40e_set_npar_bw_setting(struct i40e_pf *pf);
 i40e_status i40e_commit_npar_bw_setting(struct i40e_pf *pf);
 void i40e_print_link_message(struct i40e_vsi *vsi, bool isup);
+
+/**
+ * i40e_enabled_xdp_vsi - Check if VSI has XDP enabled
+ * @vsi: pointer to a vsi
+ *
+ * Returns true if the VSI has XDP enabled.
+ **/
+static inline bool i40e_enabled_xdp_vsi(const struct i40e_vsi *vsi)
+{
+	return vsi->xdp_prog;
+}
 #endif /* _I40E_H_ */
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index cc1465aac2ef..831bbc208fc8 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -1254,6 +1254,9 @@ static int i40e_set_ringparam(struct net_device *netdev,
 	if ((ring->rx_mini_pending) || (ring->rx_jumbo_pending))
 		return -EINVAL;
 
+	if (i40e_enabled_xdp_vsi(vsi))
+		return -EINVAL;
+
 	if (ring->tx_pending > I40E_MAX_NUM_DESCRIPTORS ||
 	    ring->tx_pending < I40E_MIN_NUM_DESCRIPTORS ||
 	    ring->rx_pending > I40E_MAX_NUM_DESCRIPTORS ||
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index da4cbe32eb86..db0240213f3b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -24,6 +24,7 @@
  *
  ******************************************************************************/
 
+#include <linux/bpf.h>
 #include <linux/etherdevice.h>
 #include <linux/of_net.h>
 #include <linux/pci.h>
@@ -2431,6 +2432,13 @@ static int i40e_change_mtu(struct net_device *netdev, int new_mtu)
 	struct i40e_netdev_priv *np = netdev_priv(netdev);
 	struct i40e_vsi *vsi = np->vsi;
 
+	if (i40e_enabled_xdp_vsi(vsi)) {
+		int max_frame = new_mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN;
+
+		if (max_frame > I40E_RXBUFFER_2048)
+			return -EINVAL;
+	}
+
 	netdev_info(netdev, "changing MTU from %d to %d\n",
 		    netdev->mtu, new_mtu);
 	netdev->mtu = new_mtu;
@@ -3085,6 +3093,15 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
 	ring->tail = hw->hw_addr + I40E_QRX_TAIL(pf_q);
 	writel(0, ring->tail);
 
+	if (i40e_enabled_xdp_vsi(vsi)) {
+		struct bpf_prog *prog;
+
+		prog = bpf_prog_add(vsi->xdp_prog, 1);
+		if (IS_ERR(prog))
+			return PTR_ERR(prog);
+		ring->xdp_prog = prog;
+	}
+
 	i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring));
 
 	return 0;
@@ -9234,6 +9251,62 @@ static netdev_features_t i40e_features_check(struct sk_buff *skb,
 	return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
 }
 
+/**
+ * i40e_xdp_setup - Add/remove an XDP program to a VSI
+ * @vsi: the VSI to add the program
+ * @prog: the XDP program
+ **/
+static int i40e_xdp_setup(struct i40e_vsi *vsi,
+			  struct bpf_prog *prog)
+{
+	struct i40e_pf *pf = vsi->back;
+	struct net_device *netdev = vsi->netdev;
+	int frame_size = netdev->mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN;
+
+	if (frame_size > I40E_RXBUFFER_2048)
+		return -EINVAL;
+
+	if (!(pf->flags & I40E_FLAG_MSIX_ENABLED))
+		return -EINVAL;
+
+	if (!i40e_enabled_xdp_vsi(vsi) && !prog)
+		return 0;
+
+	i40e_prep_for_reset(pf);
+
+	if (vsi->xdp_prog)
+		bpf_prog_put(vsi->xdp_prog);
+	vsi->xdp_prog = prog;
+
+	i40e_reset_and_rebuild(pf, true);
+	return 0;
+}
+
+/**
+ * i40e_xdp - NDO for enabled/query
+ * @dev: the netdev
+ * @xdp: XDP program
+ **/
+static int i40e_xdp(struct net_device *dev,
+		    struct netdev_xdp *xdp)
+{
+	struct i40e_netdev_priv *np = netdev_priv(dev);
+	struct i40e_vsi *vsi = np->vsi;
+
+	if (vsi->type != I40E_VSI_MAIN)
+		return -EINVAL;
+
+	switch (xdp->command) {
+	case XDP_SETUP_PROG:
+		return i40e_xdp_setup(vsi, xdp->prog);
+	case XDP_QUERY_PROG:
+		xdp->prog_attached = i40e_enabled_xdp_vsi(vsi);
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
 static const struct net_device_ops i40e_netdev_ops = {
 	.ndo_open		= i40e_open,
 	.ndo_stop		= i40e_close,
@@ -9270,6 +9343,7 @@ static const struct net_device_ops i40e_netdev_ops = {
 	.ndo_features_check	= i40e_features_check,
 	.ndo_bridge_getlink	= i40e_ndo_bridge_getlink,
 	.ndo_bridge_setlink	= i40e_ndo_bridge_setlink,
+	.ndo_xdp                = i40e_xdp,
 };
 
 /**
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 352cf7cd2ef4..d835a51dafa6 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -24,6 +24,7 @@
  *
  ******************************************************************************/
 
+#include <linux/bpf.h>
 #include <linux/prefetch.h>
 #include <net/busy_poll.h>
 #include "i40e.h"
@@ -1040,6 +1041,11 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
 	rx_ring->next_to_alloc = 0;
 	rx_ring->next_to_clean = 0;
 	rx_ring->next_to_use = 0;
+
+	if (rx_ring->xdp_prog) {
+		bpf_prog_put(rx_ring->xdp_prog);
+		rx_ring->xdp_prog = NULL;
+	}
 }
 
 /**
@@ -1600,30 +1606,104 @@ static bool i40e_add_rx_frag(struct i40e_ring *rx_ring,
 }
 
 /**
+ * i40e_run_xdp - Runs an XDP program for an Rx ring
+ * @rx_ring: Rx ring used for XDP
+ * @rx_buffer: current Rx buffer
+ * @rx_desc: current Rx descriptor
+ * @xdp_prog: the XDP program to run
+ *
+ * Returns true if the XDP program consumed the incoming frame. False
+ * means pass the frame to the good old stack.
+ **/
+static bool i40e_run_xdp(struct i40e_ring *rx_ring,
+			 struct i40e_rx_buffer *rx_buffer,
+			 union i40e_rx_desc *rx_desc,
+			 struct bpf_prog *xdp_prog)
+{
+	u64 qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
+	unsigned int size = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >>
+			    I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
+	struct xdp_buff xdp;
+	u32 xdp_action;
+
+	WARN_ON(!i40e_test_staterr(rx_desc,
+				   BIT(I40E_RX_DESC_STATUS_EOF_SHIFT)));
+
+	xdp.data = page_address(rx_buffer->page) + rx_buffer->page_offset;
+	xdp.data_end = xdp.data + size;
+	xdp_action = bpf_prog_run_xdp(xdp_prog, &xdp);
+
+	switch (xdp_action) {
+	case XDP_PASS:
+		return false;
+	default:
+		bpf_warn_invalid_xdp_action(xdp_action);
+	case XDP_ABORTED:
+	case XDP_TX:
+	case XDP_DROP:
+		if (likely(!i40e_page_is_reserved(rx_buffer->page))) {
+			i40e_reuse_rx_page(rx_ring, rx_buffer);
+			rx_ring->rx_stats.page_reuse_count++;
+			break;
+		}
+
+		/* we are not reusing the buffer so unmap it */
+		dma_unmap_page(rx_ring->dev, rx_buffer->dma, PAGE_SIZE,
+			       DMA_FROM_DEVICE);
+		__free_pages(rx_buffer->page, 0);
+	}
+
+	/* clear contents of buffer_info */
+	rx_buffer->page = NULL;
+	return true; /* Swallowed by XDP */
+}
+
+/**
  * i40e_fetch_rx_buffer - Allocate skb and populate it
  * @rx_ring: rx descriptor ring to transact packets on
  * @rx_desc: descriptor containing info written by hardware
+ * @skb: The allocated skb, if any
  *
- * This function allocates an skb on the fly, and populates it with the page
- * data from the current receive descriptor, taking care to set up the skb
- * correctly, as well as handling calling the page recycle function if
- * necessary.
+ * Unless XDP is enabled, this function allocates an skb on the fly,
+ * and populates it with the page data from the current receive
+ * descriptor, taking care to set up the skb correctly, as well as
+ * handling calling the page recycle function if necessary.
+ *
+ * If the received frame was handled by XDP, true is
+ * returned. Otherwise, the skb is returned to the caller via the skb
+ * parameter.
  */
 static inline
-struct sk_buff *i40e_fetch_rx_buffer(struct i40e_ring *rx_ring,
-				     union i40e_rx_desc *rx_desc)
+bool i40e_fetch_rx_buffer(struct i40e_ring *rx_ring,
+			  union i40e_rx_desc *rx_desc,
+			  struct sk_buff **skb)
 {
 	struct i40e_rx_buffer *rx_buffer;
-	struct sk_buff *skb;
 	struct page *page;
 
 	rx_buffer = &rx_ring->rx_bi[rx_ring->next_to_clean];
 	page = rx_buffer->page;
 	prefetchw(page);
 
-	skb = rx_buffer->skb;
+	/* we are reusing so sync this buffer for CPU use */
+	dma_sync_single_range_for_cpu(rx_ring->dev,
+				      rx_buffer->dma,
+				      rx_buffer->page_offset,
+				      I40E_RXBUFFER_2048,
+				      DMA_FROM_DEVICE);
+
+	if (rx_ring->xdp_prog) {
+		bool xdp_consumed;
+
+		xdp_consumed = i40e_run_xdp(rx_ring, rx_buffer,
+					    rx_desc, rx_ring->xdp_prog);
+		if (xdp_consumed)
+			return true;
+	}
 
-	if (likely(!skb)) {
+	*skb = rx_buffer->skb;
+
+	if (likely(!*skb)) {
 		void *page_addr = page_address(page) + rx_buffer->page_offset;
 
 		/* prefetch first cache line of first page */
@@ -1633,32 +1713,25 @@ struct sk_buff *i40e_fetch_rx_buffer(struct i40e_ring *rx_ring,
 #endif
 
 		/* allocate a skb to store the frags */
-		skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
-				       I40E_RX_HDR_SIZE,
-				       GFP_ATOMIC | __GFP_NOWARN);
-		if (unlikely(!skb)) {
+		*skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
+					I40E_RX_HDR_SIZE,
+					GFP_ATOMIC | __GFP_NOWARN);
+		if (unlikely(!*skb)) {
 			rx_ring->rx_stats.alloc_buff_failed++;
-			return NULL;
+			return false;
 		}
 
 		/* we will be copying header into skb->data in
 		 * pskb_may_pull so it is in our interest to prefetch
 		 * it now to avoid a possible cache miss
 		 */
-		prefetchw(skb->data);
+		prefetchw((*skb)->data);
 	} else {
 		rx_buffer->skb = NULL;
 	}
 
-	/* we are reusing so sync this buffer for CPU use */
-	dma_sync_single_range_for_cpu(rx_ring->dev,
-				      rx_buffer->dma,
-				      rx_buffer->page_offset,
-				      I40E_RXBUFFER_2048,
-				      DMA_FROM_DEVICE);
-
 	/* pull page into skb */
-	if (i40e_add_rx_frag(rx_ring, rx_buffer, rx_desc, skb)) {
+	if (i40e_add_rx_frag(rx_ring, rx_buffer, rx_desc, *skb)) {
 		/* hand second half of page back to the ring */
 		i40e_reuse_rx_page(rx_ring, rx_buffer);
 		rx_ring->rx_stats.page_reuse_count++;
@@ -1671,7 +1744,7 @@ struct sk_buff *i40e_fetch_rx_buffer(struct i40e_ring *rx_ring,
 	/* clear contents of buffer_info */
 	rx_buffer->page = NULL;
 
-	return skb;
+	return false;
 }
 
 /**
@@ -1716,6 +1789,20 @@ static bool i40e_is_non_eop(struct i40e_ring *rx_ring,
 }
 
 /**
+ * i40e_update_rx_next_to_clean - Bumps the next-to-clean for an Rx ing
+ * @rx_ring: Rx ring to bump
+ **/
+static void i40e_update_rx_next_to_clean(struct i40e_ring *rx_ring)
+{
+	u32 ntc = rx_ring->next_to_clean + 1;
+
+	ntc = (ntc < rx_ring->count) ? ntc : 0;
+	rx_ring->next_to_clean = ntc;
+
+	prefetch(I40E_RX_DESC(rx_ring, ntc));
+}
+
+/**
  * i40e_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
  * @rx_ring: rx descriptor ring to transact packets on
  * @budget: Total limit on number of packets to process
@@ -1739,6 +1826,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
 		u16 vlan_tag;
 		u8 rx_ptype;
 		u64 qword;
+		bool xdp_consumed;
 
 		/* return some buffers to hardware, one at a time is too slow */
 		if (cleaned_count >= I40E_RX_BUFFER_WRITE) {
@@ -1764,7 +1852,15 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
 		 */
 		dma_rmb();
 
-		skb = i40e_fetch_rx_buffer(rx_ring, rx_desc);
+		xdp_consumed = i40e_fetch_rx_buffer(rx_ring, rx_desc, &skb);
+		if (xdp_consumed) {
+			cleaned_count++;
+
+			i40e_update_rx_next_to_clean(rx_ring);
+			total_rx_packets++;
+			continue;
+		}
+
 		if (!skb)
 			break;
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index e065321ce8ed..957d856a82c4 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -341,6 +341,8 @@ struct i40e_ring {
 
 	struct rcu_head rcu;		/* to avoid race on free */
 	u16 next_to_alloc;
+
+	struct bpf_prog *xdp_prog;
 } ____cacheline_internodealigned_in_smp;
 
 enum i40e_latency_range {
-- 
2.9.3

^ permalink raw reply related

* Re: [PATCH net] phy: Don't increment MDIO bus refcount unless it's a different owner
From: Johan Hovold @ 2016-12-08 17:01 UTC (permalink / raw)
  To: Florian Fainelli; +Cc: Johan Hovold, netdev, rmk+kernel, andrew
In-Reply-To: <81ffa62a-b385-94ca-2396-f2137a320e8b@gmail.com>

On Thu, Dec 08, 2016 at 08:47:54AM -0800, Florian Fainelli wrote:
> On 12/08/2016 08:27 AM, Johan Hovold wrote:
> > On Tue, Dec 06, 2016 at 08:54:43PM -0800, Florian Fainelli wrote:
> >> Commit 3e3aaf649416 ("phy: fix mdiobus module safety") fixed the way we
> >> dealt with MDIO bus module reference count, but sort of introduced a
> >> regression in that, if an Ethernet driver registers its own MDIO bus
> >> driver, as is common, we will end up with the Ethernet driver's
> >> module->refnct set to 1, thus preventing this driver from any removal.
> >>
> >> Fix this by comparing the network device's device driver owner against
> >> the MDIO bus driver owner, and only if they are different, increment the
> >> MDIO bus module refcount.
> >>
> >> Fixes: 3e3aaf649416 ("phy: fix mdiobus module safety")
> >> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
> >> ---
> >> Russell,
> >>
> >> I verified this against the ethoc driver primarily (on a TS7300 board)
> >> and bcmgenet.
> >>
> >> Thanks!
> >>
> >>  drivers/net/phy/phy_device.c | 16 +++++++++++++---
> >>  1 file changed, 13 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
> >> index 1a4bf8acad78..c4ceb082e970 100644
> >> --- a/drivers/net/phy/phy_device.c
> >> +++ b/drivers/net/phy/phy_device.c
> >> @@ -857,11 +857,17 @@ EXPORT_SYMBOL(phy_attached_print);
> >>  int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
> >>  		      u32 flags, phy_interface_t interface)
> >>  {
> >> +	struct module *ndev_owner = dev->dev.parent->driver->owner;
> > 
> > Is this really safe? A driver does not need to set a parent device, and
> > in that case you get a NULL-deref here (I tried using cpsw).
> 
> Humm, cpsw does call SET_NETDEV_DEV() which should take care of that, is
> the call made too late? Do you have an example oops?

Sorry if I was being unclear, cpsw does set a parent device, but there
are network driver that do not. Perhaps such drivers will never hit this
code path, but I can't say for sure and everything appear to work for
cpsw if you comment out that SET_NETDEV_DEV (well, at least before this
patch).

> I don't mind safeguarding this with a check against dev->dev.parent, but
> I would like to fix the drivers where relevant too, since
> SET_NETDEV_DEV() should really be called, otherwise a number of things
> just don't work

I grepped for for register_netdev and think I saw a number of drivers
which do not call SET_NETDEV_DEV.

Again, perhaps they will never hit this path, but thought I should ask.

Johan

^ permalink raw reply

* [PATCH 0/3] i40e: Support for XDP
From: Björn Töpel @ 2016-12-08 17:00 UTC (permalink / raw)
  To: jeffrey.t.kirsher, intel-wired-lan
  Cc: Björn Töpel, john.r.fastabend, magnus.karlsson, netdev

From: Björn Töpel <bjorn.topel@intel.com>

This series adds XDP support for i40e-based NICs.

The first patch adds XDP_RX support, the second XDP_TX support and the
last patch makes it possible to change an XDP program without
rebuilding the rings.


Björn


Björn Töpel (3):
  i40e: Initial support for XDP
  i40e: Add XDP_TX support
  i40e: Don't reset/rebuild rings on XDP program swap

 drivers/net/ethernet/intel/i40e/i40e.h         |  18 +
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c |   3 +
 drivers/net/ethernet/intel/i40e/i40e_main.c    | 358 +++++++++++++++++---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c    | 445 +++++++++++++++++++++----
 drivers/net/ethernet/intel/i40e/i40e_txrx.h    |   7 +
 5 files changed, 715 insertions(+), 116 deletions(-)

-- 
2.9.3

^ permalink raw reply

* Re: [PATCH v4 net-next 1/4] bpf: xdp: Allow head adjustment in XDP prog
From: John Fastabend @ 2016-12-08 17:00 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann
  Cc: Martin KaFai Lau, netdev, Alexei Starovoitov, Brenden Blanco,
	David Miller, Jakub Kicinski, Jesper Dangaard Brouer,
	Saeed Mahameed, Tariq Toukan, Kernel Team
In-Reply-To: <20161208165515.GA92599@ast-mbp.thefacebook.com>

On 16-12-08 08:55 AM, Alexei Starovoitov wrote:
> On Thu, Dec 08, 2016 at 10:02:37AM +0100, Daniel Borkmann wrote:
>> On 12/08/2016 12:53 AM, Martin KaFai Lau wrote:
>>> This patch allows XDP prog to extend/remove the packet
>>> data at the head (like adding or removing header).  It is
>>> done by adding a new XDP helper bpf_xdp_adjust_head().
>>>
>>> It also renames bpf_helper_changes_skb_data() to
>>> bpf_helper_changes_pkt_data() to better reflect
>>> that XDP prog does not work on skb.
>>>
>>> This patch adds one "xdp_adjust_head" bit to bpf_prog for the
>>> XDP-capable driver to check if the XDP prog requires
>>> bpf_xdp_adjust_head() support.  The driver can then decide
>>> to error out during XDP_SETUP_PROG.
>>>
>>> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
>>
>> Acked-by: Daniel Borkmann <daniel@iogearbox.net>
> 
> Acked-by: Alexei Starovoitov <ast@kernel.org>
> 

Thanks for fixing up the xdp_buff struct. And for good measure,

Acked-by: John Fastabend <john.r.fastabend@intel.com>

^ permalink raw reply

* Re: [PATCH net-next v6 0/2] net/sched: cls_flower: Support matching on ICMP
From: David Miller @ 2016-12-08 16:58 UTC (permalink / raw)
  To: simon.horman; +Cc: jiri, tom, netdev, ogerlitz
In-Reply-To: <1481114908-24189-1-git-send-email-simon.horman@netronome.com>

From: Simon Horman <simon.horman@netronome.com>
Date: Wed,  7 Dec 2016 13:48:26 +0100

> this series adds support for matching on ICMP type and code to
> cls_flower.

Series applied, thanks Simon.  I guess we'll see some netronome
offloading of this stuff coming soon? :-)

Simon and Or, you both added extensions to cls_flower at the same
time.  Or's changes went in first, so his UAPI numbers did not change.
Simons, your changes went in next so your numbers did change and
therefore you will have to recompile any userland components you were
using for testing.

Just FYI...

^ permalink raw reply

* Re: [PATCH v4 net-next 1/4] bpf: xdp: Allow head adjustment in XDP prog
From: Alexei Starovoitov @ 2016-12-08 16:55 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Martin KaFai Lau, netdev, Alexei Starovoitov, Brenden Blanco,
	David Miller, Jakub Kicinski, Jesper Dangaard Brouer,
	John Fastabend, Saeed Mahameed, Tariq Toukan, Kernel Team
In-Reply-To: <584921AD.3060004@iogearbox.net>

On Thu, Dec 08, 2016 at 10:02:37AM +0100, Daniel Borkmann wrote:
> On 12/08/2016 12:53 AM, Martin KaFai Lau wrote:
> >This patch allows XDP prog to extend/remove the packet
> >data at the head (like adding or removing header).  It is
> >done by adding a new XDP helper bpf_xdp_adjust_head().
> >
> >It also renames bpf_helper_changes_skb_data() to
> >bpf_helper_changes_pkt_data() to better reflect
> >that XDP prog does not work on skb.
> >
> >This patch adds one "xdp_adjust_head" bit to bpf_prog for the
> >XDP-capable driver to check if the XDP prog requires
> >bpf_xdp_adjust_head() support.  The driver can then decide
> >to error out during XDP_SETUP_PROG.
> >
> >Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> 
> Acked-by: Daniel Borkmann <daniel@iogearbox.net>

Acked-by: Alexei Starovoitov <ast@kernel.org>

^ permalink raw reply

* Re: [PATCH net] phy: Don't increment MDIO bus refcount unless it's a different owner
From: Florian Fainelli @ 2016-12-08 16:47 UTC (permalink / raw)
  To: Johan Hovold; +Cc: netdev, rmk+kernel, andrew
In-Reply-To: <20161208162740.GI31573@localhost>

On 12/08/2016 08:27 AM, Johan Hovold wrote:
> On Tue, Dec 06, 2016 at 08:54:43PM -0800, Florian Fainelli wrote:
>> Commit 3e3aaf649416 ("phy: fix mdiobus module safety") fixed the way we
>> dealt with MDIO bus module reference count, but sort of introduced a
>> regression in that, if an Ethernet driver registers its own MDIO bus
>> driver, as is common, we will end up with the Ethernet driver's
>> module->refnct set to 1, thus preventing this driver from any removal.
>>
>> Fix this by comparing the network device's device driver owner against
>> the MDIO bus driver owner, and only if they are different, increment the
>> MDIO bus module refcount.
>>
>> Fixes: 3e3aaf649416 ("phy: fix mdiobus module safety")
>> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
>> ---
>> Russell,
>>
>> I verified this against the ethoc driver primarily (on a TS7300 board)
>> and bcmgenet.
>>
>> Thanks!
>>
>>  drivers/net/phy/phy_device.c | 16 +++++++++++++---
>>  1 file changed, 13 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
>> index 1a4bf8acad78..c4ceb082e970 100644
>> --- a/drivers/net/phy/phy_device.c
>> +++ b/drivers/net/phy/phy_device.c
>> @@ -857,11 +857,17 @@ EXPORT_SYMBOL(phy_attached_print);
>>  int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
>>  		      u32 flags, phy_interface_t interface)
>>  {
>> +	struct module *ndev_owner = dev->dev.parent->driver->owner;
> 
> Is this really safe? A driver does not need to set a parent device, and
> in that case you get a NULL-deref here (I tried using cpsw).

Humm, cpsw does call SET_NETDEV_DEV() which should take care of that, is
the call made too late? Do you have an example oops?

I don't mind safeguarding this with a check against dev->dev.parent, but
I would like to fix the drivers where relevant too, since
SET_NETDEV_DEV() should really be called, otherwise a number of things
just don't work

> 
>>  	struct mii_bus *bus = phydev->mdio.bus;
>>  	struct device *d = &phydev->mdio.dev;
>>  	int err;
>>  
>> -	if (!try_module_get(bus->owner)) {
>> +	/* For Ethernet device drivers that register their own MDIO bus, we
>> +	 * will have bus->owner match ndev_mod, so we do not want to increment
> 
> You also wanted s/ndev_mod/ndev_owner/ here.

Meh, it's merged now, but thanks, I will fix this once we find out the
proper solution for cpsw.
-- 
Florian

^ permalink raw reply

* Re: [RESEND PATCH 3/3] net: stmmac: stmmac_platform: use correct setup function for gmac4
From: David Miller @ 2016-12-08 16:36 UTC (permalink / raw)
  To: niklas.cassel
  Cc: peppe.cavallaro, alexandre.torgue, niklass, netdev, linux-kernel
In-Reply-To: <1481114469-4788-3-git-send-email-niklass@axis.com>

From: Niklas Cassel <niklas.cassel@axis.com>
Date: Wed, 7 Dec 2016 13:41:08 +0100

> From: Niklas Cassel <niklas.cassel@axis.com>
> 
> devicetree binding for stmmac states:
> - compatible: Should be "snps,dwmac-<ip_version>", "snps,dwmac"
> 	For backwards compatibility: "st,spear600-gmac" is also supported.
> 
> Previously, when specifying "snps,dwmac-4.10a", "snps,dwmac" as your
> compatible string, plat_stmmacenet_data would have both has_gmac and
> has_gmac4 set.
> 
> This would lead to stmmac_hw_init calling dwmac1000_setup rather than
> dwmac4_setup, resulting in a non-functional driver.
> This happened since the check for has_gmac is done before the check for
> has_gmac4. However, the order should not matter, so it does not make sense
> to have both set.
> 
> If something is valid for both, you should do as the stmmac_interrupt does:
> if (priv->plat->has_gmac || priv->plat->has_gmac4) ...
> 
> The places where it was obvious that the author actually meant
> if (has_gmac || has_gmac4) rather than if (has_gmac) has been updated.
> 
> Signed-off-by: Niklas Cassel <niklas.cassel@axis.com>
> Acked-by: Alexandre TORGUE <alexandre.torgue@st.com>

Applied.

^ permalink raw reply

* Re: [RESEND PATCH 2/3] net: stmmac: dwmac-generic: add missing compatible strings
From: David Miller @ 2016-12-08 16:35 UTC (permalink / raw)
  To: niklas.cassel
  Cc: peppe.cavallaro, alexandre.torgue, niklass, netdev, linux-kernel
In-Reply-To: <1481114469-4788-2-git-send-email-niklass@axis.com>

From: Niklas Cassel <niklas.cassel@axis.com>
Date: Wed, 7 Dec 2016 13:41:07 +0100

> From: Niklas Cassel <niklas.cassel@axis.com>
> 
> devicetree binding for stmmac states:
> - compatible: Should be "snps,dwmac-<ip_version>", "snps,dwmac"
> 	For backwards compatibility: "st,spear600-gmac" is also supported.
> 
> Since dwmac-generic.c calls stmmac_probe_config_dt explicitly,
> another alternative would have been to remove all compatible strings
> other than "snps,dwmac" and "st,spear600-gmac" from dwmac-generic.c.
> 
> However, that would probably do more good than harm, since when trying
> to figure out what hardware a certain driver supports, you usually look
> at the compatible strings in the struct of_device_id, and not in some
> function defined in a completely different file.
> 
> No functional change intended.
> 
> Signed-off-by: Niklas Cassel <niklas.cassel@axis.com>
> Acked-by: Giuseppe Cavallaro <peppe.cavallaro@st.com>

Applied.

^ permalink raw reply

* Re: [RESEND PATCH 1/3] bindings: net: stmmac: correct note about TSO
From: David Miller @ 2016-12-08 16:35 UTC (permalink / raw)
  To: niklas.cassel-VrBV9hrLPhE
  Cc: robh+dt-DgEjT+Ai2ygdnm+yROfE0A, mark.rutland-5wv7dgnIgG8,
	peppe.cavallaro-qxv4g6HH51o, alexandre.torgue-qxv4g6HH51o,
	preid-qgqNFa1JUf/o2iN0hyhwsIdd74u8MsAO,
	eric-op+oiCINJLTt9jDmeYuA0g, niklass-VrBV9hrLPhE,
	netdev-u79uwXL29TY76Z2rM5mHXA, devicetree-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1481114469-4788-1-git-send-email-niklass-VrBV9hrLPhE@public.gmane.org>

From: Niklas Cassel <niklas.cassel-VrBV9hrLPhE@public.gmane.org>
Date: Wed, 7 Dec 2016 13:41:06 +0100

> From: Niklas Cassel <niklas.cassel-VrBV9hrLPhE@public.gmane.org>
> 
> snps,tso was previously placed under AXI BUS Mode parameters,
> suggesting that the property should be in the stmmac-axi-config node.
> 
> TSO (TCP Segmentation Offloading) has nothing to do with AXI BUS Mode
> parameters, and the parser actually expects it to be in the root node,
> not in the stmmac-axi-config.
> 
> Also added a note about snps,tso only being available on GMAC4 and newer.
> 
> Signed-off-by: Niklas Cassel <niklas.cassel-VrBV9hrLPhE@public.gmane.org>
> Acked-by: Alexandre TORGUE <alexandre.torgue-qxv4g6HH51o@public.gmane.org>
> Acked-by: Rob Herring <robh-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
> Acked-by: Giuseppe Cavallaro <peppe.cavallaro-qxv4g6HH51o@public.gmane.org>

Applied.
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] net: axienet: Utilize of_get_mac_address()
From: David Miller @ 2016-12-08 16:34 UTC (permalink / raw)
  To: tklauser; +Cc: netdev, michal.simek, soren.brinkmann, f.fainelli
In-Reply-To: <20161207122529.29508-1-tklauser@distanz.ch>

From: Tobias Klauser <tklauser@distanz.ch>
Date: Wed,  7 Dec 2016 13:25:28 +0100

> Do not open code getting the MAC address exclusively from the
> "local-mac-address" property, but instead use of_get_mac_address()
> which looks up the MAC address using the 3 typical property names.
> 
> Also avoid casting away the const qualifier of the return value by
> making axienet_set_mac_address() take a const void* address.
> 
> Follows commit b34296a9c047 ("net: ethoc: Utilize
> of_get_mac_address()").
> 
> Cc: Florian Fainelli <f.fainelli@gmail.com>
> Signed-off-by: Tobias Klauser <tklauser@distanz.ch>

Applied.

^ permalink raw reply

* Re: [PATCH] net: ll_temac: Utilize of_get_mac_address()
From: David Miller @ 2016-12-08 16:34 UTC (permalink / raw)
  To: tklauser; +Cc: netdev, michal.simek, soren.brinkmann, f.fainelli
In-Reply-To: <20161207122529.29508-2-tklauser@distanz.ch>

From: Tobias Klauser <tklauser@distanz.ch>
Date: Wed,  7 Dec 2016 13:25:29 +0100

> Do not open code getting the MAC address exclusively from the
> "local-mac-address" property, but instead use of_get_mac_address()
> which looks up the MAC address using the 3 typical property names.
> 
> Also avoid casting away the const qualifier of the return value by
> making temac_init_mac_address() take a const void* address.
> 
> Follows commit b34296a9c047 ("net: ethoc: Utilize
> of_get_mac_address()").
> 
> Cc: Florian Fainelli <f.fainelli@gmail.com>
> Signed-off-by: Tobias Klauser <tklauser@distanz.ch>

Applied.

^ permalink raw reply

* Re: [PATCH net-next V3 0/2] net/sched: cls_flower: Add support for matching on dissection flags
From: David Miller @ 2016-12-08 16:33 UTC (permalink / raw)
  To: ogerlitz; +Cc: netdev, jiri, roid, hadarh
In-Reply-To: <1481112191-14115-1-git-send-email-ogerlitz@mellanox.com>

From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Wed,  7 Dec 2016 14:03:09 +0200

> This series add the UAPI to provide set of flags for matching, where the 
> flags provided from user-space are mapped to flow-dissector flags.
> 
> The 1st flag allows to match on whether the packet is an
> IP fragment and corresponds to the FLOW_DIS_IS_FRAGMENT flag.
  ...
> v2->v3:
>   - replace BIT() with << (kbuild test robot)
> 
> v1->v2:
>  - dropped the flow dissector patch (#1) as no changes are needed there (Jiri)
>  - applied code review comments from Jiri to the flower patch

Series applied.

^ permalink raw reply

* Re: [patch net-next] net: mvneta: Indent some statements
From: David Miller @ 2016-12-08 16:31 UTC (permalink / raw)
  To: dan.carpenter; +Cc: thomas.petazzoni, mw, netdev, kernel-janitors
In-Reply-To: <20161207113217.GE5507@elgon.mountain>

From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 7 Dec 2016 14:32:17 +0300

> These two statements were not indented correctly so it's sort of
> confusing.
> 
> Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>

Applied.

^ permalink raw reply

* Re: fs, net: deadlock between bind/splice on af_unix
From: Dmitry Vyukov @ 2016-12-08 16:30 UTC (permalink / raw)
  To: Al Viro, linux-fsdevel@vger.kernel.org, LKML, David Miller,
	Rainer Weikusat, Hannes Frederic Sowa, Cong Wang, netdev,
	Eric Dumazet
  Cc: syzkaller
In-Reply-To: <CACT4Y+Z981V+QLHr=PnQy1Dvxrpp-nCDhQtf+5HuNAusH+Vqxw@mail.gmail.com>

On Thu, Dec 8, 2016 at 3:47 PM, Dmitry Vyukov <dvyukov@google.com> wrote:
> Hello,
>
> I am getting the following deadlock reports while running syzkaller
> fuzzer on 318c8932ddec5c1c26a4af0f3c053784841c598e (Dec 7).
>
>
> [ INFO: possible circular locking dependency detected ]
> 4.9.0-rc8+ #77 Not tainted
> -------------------------------------------------------
> syz-executor0/3155 is trying to acquire lock:
>  (&u->bindlock){+.+.+.}, at: [<ffffffff871bca1a>]
> unix_autobind.isra.26+0xca/0x8a0 net/unix/af_unix.c:852
> but task is already holding lock:
>  (&pipe->mutex/1){+.+.+.}, at: [<     inline     >] pipe_lock_nested
> fs/pipe.c:66
>  (&pipe->mutex/1){+.+.+.}, at: [<ffffffff81a8ea4b>]
> pipe_lock+0x5b/0x70 fs/pipe.c:74
> which lock already depends on the new lock.
>
> the existing dependency chain (in reverse order) is:
>
>        [  202.103497] [<     inline     >] validate_chain
> kernel/locking/lockdep.c:2265
>        [  202.103497] [<ffffffff81569576>]
> __lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
>        [  202.103497] [<ffffffff8156b672>] lock_acquire+0x2a2/0x790
> kernel/locking/lockdep.c:3749
>        [  202.103497] [<     inline     >] __mutex_lock_common
> kernel/locking/mutex.c:521
>        [  202.103497] [<ffffffff88195bcf>]
> mutex_lock_nested+0x23f/0xf20 kernel/locking/mutex.c:621
>        [  202.103497] [<     inline     >] pipe_lock_nested fs/pipe.c:66
>        [  202.103497] [<ffffffff81a8ea4b>] pipe_lock+0x5b/0x70 fs/pipe.c:74
>        [  202.103497] [<ffffffff81b451f7>]
> iter_file_splice_write+0x267/0xfa0 fs/splice.c:717
>        [  202.103497] [<     inline     >] do_splice_from fs/splice.c:869
>        [  202.103497] [<     inline     >] do_splice fs/splice.c:1160
>        [  202.103497] [<     inline     >] SYSC_splice fs/splice.c:1410
>        [  202.103497] [<ffffffff81b473c7>] SyS_splice+0x7d7/0x16a0
> fs/splice.c:1393
>        [  202.103497] [<ffffffff881a5f85>] entry_SYSCALL_64_fastpath+0x23/0xc6
>
>        [  202.103497] [<     inline     >] validate_chain
> kernel/locking/lockdep.c:2265
>        [  202.103497] [<ffffffff81569576>]
> __lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
>        [  202.103497] [<ffffffff8156b672>] lock_acquire+0x2a2/0x790
> kernel/locking/lockdep.c:3749
>        [  202.103497] [<     inline     >]
> percpu_down_read_preempt_disable include/linux/percpu-rwsem.h:35
>        [  202.103497] [<     inline     >] percpu_down_read
> include/linux/percpu-rwsem.h:58
>        [  202.103497] [<ffffffff81a7bb33>]
> __sb_start_write+0x193/0x2a0 fs/super.c:1252
>        [  202.103497] [<     inline     >] sb_start_write
> include/linux/fs.h:1549
>        [  202.103497] [<ffffffff81af9954>] mnt_want_write+0x44/0xb0
> fs/namespace.c:389
>        [  202.103497] [<ffffffff81ab09f6>] filename_create+0x156/0x620
> fs/namei.c:3598
>        [  202.103497] [<ffffffff81ab0ef8>] kern_path_create+0x38/0x50
> fs/namei.c:3644
>        [  202.103497] [<     inline     >] unix_mknod net/unix/af_unix.c:967
>        [  202.103497] [<ffffffff871c0e11>] unix_bind+0x4d1/0xe60
> net/unix/af_unix.c:1035
>        [  202.103497] [<ffffffff86a76b7e>] SYSC_bind+0x20e/0x4c0
> net/socket.c:1382
>        [  202.103497] [<ffffffff86a7a509>] SyS_bind+0x29/0x30 net/socket.c:1368
>        [  202.103497] [<ffffffff881a5f85>] entry_SYSCALL_64_fastpath+0x23/0xc6
>
>        [  202.103497] [<     inline     >] check_prev_add
> kernel/locking/lockdep.c:1828
>        [  202.103497] [<ffffffff8156309b>]
> check_prevs_add+0xaab/0x1c20 kernel/locking/lockdep.c:1938
>        [  202.103497] [<     inline     >] validate_chain
> kernel/locking/lockdep.c:2265
>        [  202.103497] [<ffffffff81569576>]
> __lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
>        [  202.103497] [<ffffffff8156b672>] lock_acquire+0x2a2/0x790
> kernel/locking/lockdep.c:3749
>        [  202.103497] [<     inline     >] __mutex_lock_common
> kernel/locking/mutex.c:521
>        [  202.103497] [<ffffffff88196b82>]
> mutex_lock_interruptible_nested+0x2d2/0x11d0
> kernel/locking/mutex.c:650
>        [  202.103497] [<ffffffff871bca1a>]
> unix_autobind.isra.26+0xca/0x8a0 net/unix/af_unix.c:852
>        [  202.103497] [<ffffffff871c76dd>]
> unix_dgram_sendmsg+0x105d/0x1730 net/unix/af_unix.c:1667
>        [  202.103497] [<ffffffff871c7ea8>]
> unix_seqpacket_sendmsg+0xf8/0x170 net/unix/af_unix.c:2071
>        [  202.103497] [<     inline     >] sock_sendmsg_nosec net/socket.c:621
>        [  202.103497] [<ffffffff86a7618f>] sock_sendmsg+0xcf/0x110
> net/socket.c:631
>        [  202.103497] [<ffffffff86a7683c>] kernel_sendmsg+0x4c/0x60
> net/socket.c:639
>        [  202.103497] [<ffffffff86a8101d>]
> sock_no_sendpage+0x20d/0x310 net/core/sock.c:2321
>        [  202.103497] [<ffffffff86a74c95>] kernel_sendpage+0x95/0xf0
> net/socket.c:3289
>        [  202.103497] [<ffffffff86a74d92>] sock_sendpage+0xa2/0xd0
> net/socket.c:775
>        [  202.103497] [<ffffffff81b3ee1e>]
> pipe_to_sendpage+0x2ae/0x390 fs/splice.c:469
>        [  202.103497] [<     inline     >] splice_from_pipe_feed fs/splice.c:520
>        [  202.103497] [<ffffffff81b42f3f>]
> __splice_from_pipe+0x31f/0x750 fs/splice.c:644
>        [  202.103497] [<ffffffff81b4665c>]
> splice_from_pipe+0x1dc/0x300 fs/splice.c:679
>        [  202.103497] [<ffffffff81b467c5>]
> generic_splice_sendpage+0x45/0x60 fs/splice.c:850
>        [  202.103497] [<     inline     >] do_splice_from fs/splice.c:869
>        [  202.103497] [<     inline     >] do_splice fs/splice.c:1160
>        [  202.103497] [<     inline     >] SYSC_splice fs/splice.c:1410
>        [  202.103497] [<ffffffff81b473c7>] SyS_splice+0x7d7/0x16a0
> fs/splice.c:1393
>        [  202.103497] [<ffffffff881a5f85>] entry_SYSCALL_64_fastpath+0x23/0xc6
>
> other info that might help us debug this:
>
> Chain exists of:
>  Possible unsafe locking scenario:
>
>        CPU0                    CPU1
>        ----                    ----
>   lock(&pipe->mutex/1);
>                                lock(sb_writers#5);
>                                lock(&pipe->mutex/1);
>   lock(&u->bindlock);
>
>  *** DEADLOCK ***
>
> 1 lock held by syz-executor0/3155:
>  #0:  (&pipe->mutex/1){+.+.+.}, at: [<     inline     >]
> pipe_lock_nested fs/pipe.c:66
>  #0:  (&pipe->mutex/1){+.+.+.}, at: [<ffffffff81a8ea4b>]
> pipe_lock+0x5b/0x70 fs/pipe.c:74
>
> stack backtrace:
> CPU: 3 PID: 3155 Comm: syz-executor0 Not tainted 4.9.0-rc8+ #77
> Hardware name: Google Google/Google, BIOS Google 01/01/2011
>  ffff88004b1fe288 ffffffff834c44f9 ffffffff00000003 1ffff1000963fbe4
>  ffffed000963fbdc 0000000041b58ab3 ffffffff895816f0 ffffffff834c420b
>  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> Call Trace:
>  [<     inline     >] __dump_stack lib/dump_stack.c:15
>  [<ffffffff834c44f9>] dump_stack+0x2ee/0x3f5 lib/dump_stack.c:51
>  [<ffffffff81560cb0>] print_circular_bug+0x310/0x3c0
> kernel/locking/lockdep.c:1202
>  [<     inline     >] check_prev_add kernel/locking/lockdep.c:1828
>  [<ffffffff8156309b>] check_prevs_add+0xaab/0x1c20 kernel/locking/lockdep.c:1938
>  [<     inline     >] validate_chain kernel/locking/lockdep.c:2265
>  [<ffffffff81569576>] __lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
>  [<ffffffff8156b672>] lock_acquire+0x2a2/0x790 kernel/locking/lockdep.c:3749
>  [<     inline     >] __mutex_lock_common kernel/locking/mutex.c:521
>  [<ffffffff88196b82>] mutex_lock_interruptible_nested+0x2d2/0x11d0
> kernel/locking/mutex.c:650
>  [<ffffffff871bca1a>] unix_autobind.isra.26+0xca/0x8a0 net/unix/af_unix.c:852
>  [<ffffffff871c76dd>] unix_dgram_sendmsg+0x105d/0x1730 net/unix/af_unix.c:1667
>  [<ffffffff871c7ea8>] unix_seqpacket_sendmsg+0xf8/0x170 net/unix/af_unix.c:2071
>  [<     inline     >] sock_sendmsg_nosec net/socket.c:621
>  [<ffffffff86a7618f>] sock_sendmsg+0xcf/0x110 net/socket.c:631
>  [<ffffffff86a7683c>] kernel_sendmsg+0x4c/0x60 net/socket.c:639
>  [<ffffffff86a8101d>] sock_no_sendpage+0x20d/0x310 net/core/sock.c:2321
>  [<ffffffff86a74c95>] kernel_sendpage+0x95/0xf0 net/socket.c:3289
>  [<ffffffff86a74d92>] sock_sendpage+0xa2/0xd0 net/socket.c:775
>  [<ffffffff81b3ee1e>] pipe_to_sendpage+0x2ae/0x390 fs/splice.c:469
>  [<     inline     >] splice_from_pipe_feed fs/splice.c:520
>  [<ffffffff81b42f3f>] __splice_from_pipe+0x31f/0x750 fs/splice.c:644
>  [<ffffffff81b4665c>] splice_from_pipe+0x1dc/0x300 fs/splice.c:679
>  [<ffffffff81b467c5>] generic_splice_sendpage+0x45/0x60 fs/splice.c:850
>  [<     inline     >] do_splice_from fs/splice.c:869
>  [<     inline     >] do_splice fs/splice.c:1160
>  [<     inline     >] SYSC_splice fs/splice.c:1410
>  [<ffffffff81b473c7>] SyS_splice+0x7d7/0x16a0 fs/splice.c:1393
>  [<ffffffff881a5f85>] entry_SYSCALL_64_fastpath+0x23/0xc6


Seems to be the same, but detected in the context of the second thread:

[ INFO: possible circular locking dependency detected ]
4.9.0-rc8+ #77 Not tainted
-------------------------------------------------------
syz-executor3/24365 is trying to acquire lock:
 (&pipe->mutex/1){+.+.+.}, at: [<     inline     >] pipe_lock_nested
fs/pipe.c:66
 (&pipe->mutex/1){+.+.+.}, at: [<ffffffff81a8ea4b>]
pipe_lock+0x5b/0x70 fs/pipe.c:74
but task is already holding lock:
 (sb_writers#5){.+.+.+}, at: [<     inline     >] file_start_write
include/linux/fs.h:2592
 (sb_writers#5){.+.+.+}, at: [<     inline     >] do_splice fs/splice.c:1159
 (sb_writers#5){.+.+.+}, at: [<     inline     >] SYSC_splice fs/splice.c:1410
 (sb_writers#5){.+.+.+}, at: [<ffffffff81b47d9f>]
SyS_splice+0x11af/0x16a0 fs/splice.c:1393
which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

       [  131.709013] [<     inline     >] validate_chain
kernel/locking/lockdep.c:2265
       [  131.709013] [<ffffffff81569576>]
__lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
       [  131.709013] [<ffffffff8156b672>] lock_acquire+0x2a2/0x790
kernel/locking/lockdep.c:3749
       [  131.709013] [<     inline     >]
percpu_down_read_preempt_disable include/linux/percpu-rwsem.h:35
       [  131.709013] [<     inline     >] percpu_down_read
include/linux/percpu-rwsem.h:58
       [  131.709013] [<ffffffff81a7bb33>]
__sb_start_write+0x193/0x2a0 fs/super.c:1252
       [  131.709013] [<     inline     >] sb_start_write
include/linux/fs.h:1549
       [  131.709013] [<ffffffff81af9954>] mnt_want_write+0x44/0xb0
fs/namespace.c:389
       [  131.709013] [<ffffffff81ab09f6>] filename_create+0x156/0x620
fs/namei.c:3598
       [  131.709013] [<ffffffff81ab0ef8>] kern_path_create+0x38/0x50
fs/namei.c:3644
       [  131.709013] [<     inline     >] unix_mknod net/unix/af_unix.c:967
       [  131.709013] [<ffffffff871c0e11>] unix_bind+0x4d1/0xe60
net/unix/af_unix.c:1035
       [  131.709013] [<ffffffff86a76b7e>] SYSC_bind+0x20e/0x4c0
net/socket.c:1382
       [  131.709013] [<ffffffff86a7a509>] SyS_bind+0x29/0x30 net/socket.c:1368
       [  131.709013] [<ffffffff881a5f85>] entry_SYSCALL_64_fastpath+0x23/0xc6

       [  131.709013] [<     inline     >] validate_chain
kernel/locking/lockdep.c:2265
       [  131.709013] [<ffffffff81569576>]
__lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
       [  131.709013] [<ffffffff8156b672>] lock_acquire+0x2a2/0x790
kernel/locking/lockdep.c:3749
       [  131.709013] [<     inline     >] __mutex_lock_common
kernel/locking/mutex.c:521
       [  131.709013] [<ffffffff88196b82>]
mutex_lock_interruptible_nested+0x2d2/0x11d0
kernel/locking/mutex.c:650
       [  131.709013] [<ffffffff871bca1a>]
unix_autobind.isra.26+0xca/0x8a0 net/unix/af_unix.c:852
       [  131.709013] [<ffffffff871c76dd>]
unix_dgram_sendmsg+0x105d/0x1730 net/unix/af_unix.c:1667
       [  131.709013] [<ffffffff871c7ea8>]
unix_seqpacket_sendmsg+0xf8/0x170 net/unix/af_unix.c:2071
       [  131.709013] [<     inline     >] sock_sendmsg_nosec net/socket.c:621
       [  131.709013] [<ffffffff86a7618f>] sock_sendmsg+0xcf/0x110
net/socket.c:631
       [  131.709013] [<ffffffff86a7683c>] kernel_sendmsg+0x4c/0x60
net/socket.c:639
       [  131.709013] [<ffffffff86a8101d>]
sock_no_sendpage+0x20d/0x310 net/core/sock.c:2321
       [  131.709013] [<ffffffff86a74c95>] kernel_sendpage+0x95/0xf0
net/socket.c:3289
       [  131.709013] [<ffffffff86a74d92>] sock_sendpage+0xa2/0xd0
net/socket.c:775
       [  131.709013] [<ffffffff81b3ee1e>]
pipe_to_sendpage+0x2ae/0x390 fs/splice.c:469
       [  131.709013] [<     inline     >] splice_from_pipe_feed fs/splice.c:520
       [  131.709013] [<ffffffff81b42f3f>]
__splice_from_pipe+0x31f/0x750 fs/splice.c:644
       [  131.709013] [<ffffffff81b4665c>]
splice_from_pipe+0x1dc/0x300 fs/splice.c:679
       [  131.709013] [<ffffffff81b467c5>]
generic_splice_sendpage+0x45/0x60 fs/splice.c:850
       [  131.709013] [<     inline     >] do_splice_from fs/splice.c:869
       [  131.709013] [<     inline     >] do_splice fs/splice.c:1160
       [  131.709013] [<     inline     >] SYSC_splice fs/splice.c:1410
       [  131.709013] [<ffffffff81b473c7>] SyS_splice+0x7d7/0x16a0
fs/splice.c:1393
       [  131.709013] [<ffffffff881a5f85>] entry_SYSCALL_64_fastpath+0x23/0xc6

       [  131.709013] [<     inline     >] check_prev_add
kernel/locking/lockdep.c:1828
       [  131.709013] [<ffffffff8156309b>]
check_prevs_add+0xaab/0x1c20 kernel/locking/lockdep.c:1938
       [  131.709013] [<     inline     >] validate_chain
kernel/locking/lockdep.c:2265
       [  131.709013] [<ffffffff81569576>]
__lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
       [  131.709013] [<ffffffff8156b672>] lock_acquire+0x2a2/0x790
kernel/locking/lockdep.c:3749
       [  131.709013] [<     inline     >] __mutex_lock_common
kernel/locking/mutex.c:521
       [  131.709013] [<ffffffff88195bcf>]
mutex_lock_nested+0x23f/0xf20 kernel/locking/mutex.c:621
       [  131.709013] [<     inline     >] pipe_lock_nested fs/pipe.c:66
       [  131.709013] [<ffffffff81a8ea4b>] pipe_lock+0x5b/0x70 fs/pipe.c:74
       [  131.709013] [<ffffffff81b451f7>]
iter_file_splice_write+0x267/0xfa0 fs/splice.c:717
       [  131.709013] [<     inline     >] do_splice_from fs/splice.c:869
       [  131.709013] [<     inline     >] do_splice fs/splice.c:1160
       [  131.709013] [<     inline     >] SYSC_splice fs/splice.c:1410
       [  131.709013] [<ffffffff81b473c7>] SyS_splice+0x7d7/0x16a0
fs/splice.c:1393
       [  131.709013] [<ffffffff881a5f85>] entry_SYSCALL_64_fastpath+0x23/0xc6

other info that might help us debug this:

Chain exists of:
 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(sb_writers#5);
                               lock(&u->bindlock);
                               lock(sb_writers#5);
  lock(&pipe->mutex/1);

 *** DEADLOCK ***

1 lock held by syz-executor3/24365:
 #0:  (sb_writers#5){.+.+.+}, at: [<     inline     >]
file_start_write include/linux/fs.h:2592
 #0:  (sb_writers#5){.+.+.+}, at: [<     inline     >] do_splice
fs/splice.c:1159
 #0:  (sb_writers#5){.+.+.+}, at: [<     inline     >] SYSC_splice
fs/splice.c:1410
 #0:  (sb_writers#5){.+.+.+}, at: [<ffffffff81b47d9f>]
SyS_splice+0x11af/0x16a0 fs/splice.c:1393

stack backtrace:
CPU: 2 PID: 24365 Comm: syz-executor3 Not tainted 4.9.0-rc8+ #77
Hardware name: Google Google/Google, BIOS Google 01/01/2011
 ffff8800597b6af8 ffffffff834c44f9 ffffffff00000002 1ffff1000b2f6cf2
 ffffed000b2f6cea 0000000041b58ab3 ffffffff895816f0 ffffffff834c420b
 0000000041b58ab3 ffffffff894dbca8 ffffffff8155c780 ffff8800597b6878
Call Trace:
 [<     inline     >] __dump_stack lib/dump_stack.c:15
 [<ffffffff834c44f9>] dump_stack+0x2ee/0x3f5 lib/dump_stack.c:51
 [<ffffffff81560cb0>] print_circular_bug+0x310/0x3c0
kernel/locking/lockdep.c:1202
 [<     inline     >] check_prev_add kernel/locking/lockdep.c:1828
 [<ffffffff8156309b>] check_prevs_add+0xaab/0x1c20 kernel/locking/lockdep.c:1938
 [<     inline     >] validate_chain kernel/locking/lockdep.c:2265
 [<ffffffff81569576>] __lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
 [<ffffffff8156b672>] lock_acquire+0x2a2/0x790 kernel/locking/lockdep.c:3749
 [<     inline     >] __mutex_lock_common kernel/locking/mutex.c:521
 [<ffffffff88195bcf>] mutex_lock_nested+0x23f/0xf20 kernel/locking/mutex.c:621
 [<     inline     >] pipe_lock_nested fs/pipe.c:66
 [<ffffffff81a8ea4b>] pipe_lock+0x5b/0x70 fs/pipe.c:74
 [<ffffffff81b451f7>] iter_file_splice_write+0x267/0xfa0 fs/splice.c:717
 [<     inline     >] do_splice_from fs/splice.c:869
 [<     inline     >] do_splice fs/splice.c:1160
 [<     inline     >] SYSC_splice fs/splice.c:1410
 [<ffffffff81b473c7>] SyS_splice+0x7d7/0x16a0 fs/splice.c:1393
 [<ffffffff881a5f85>] entry_SYSCALL_64_fastpath+0x23/0xc6

^ permalink raw reply

* Re: [patch] drivers: net: xgene: uninitialized variable in xgene_enet_free_pagepool()
From: David Miller @ 2016-12-08 16:30 UTC (permalink / raw)
  To: dan.carpenter
  Cc: isubramanian, kchudgar, netdev, linux-kernel, kernel-janitors
In-Reply-To: <20161207111424.GA5507@elgon.mountain>

From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 7 Dec 2016 14:14:24 +0300

> We never set "slots" in this function.
> 
> Fixes: a9380b0f7be8 ("drivers: net: xgene: Add support for Jumbo frame")
> Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>

Applied.

^ permalink raw reply

* Re: [PATCH 2/2] vhost: remove unnecessary smp_mb from vhost_work_queue
From: David Miller @ 2016-12-08 16:30 UTC (permalink / raw)
  To: bergwolf; +Cc: stefanha, kvm, virtualization, netdev
In-Reply-To: <1481104340-77035-2-git-send-email-bergwolf@gmail.com>

From: Peng Tao <bergwolf@gmail.com>
Date: Wed,  7 Dec 2016 17:52:19 +0800

> test_and_set_bit() already implies a memory barrier.
> 
> Signed-off-by: Peng Tao <bergwolf@gmail.com>

Applied.

^ permalink raw reply

* Re: [PATCH 1/2] vhost-vsock: remove unused vq variable
From: David Miller @ 2016-12-08 16:30 UTC (permalink / raw)
  To: bergwolf; +Cc: stefanha, kvm, virtualization, netdev
In-Reply-To: <1481104340-77035-1-git-send-email-bergwolf@gmail.com>

From: Peng Tao <bergwolf@gmail.com>
Date: Wed,  7 Dec 2016 17:52:18 +0800

> Signed-off-by: Peng Tao <bergwolf@gmail.com>

Applied.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox