Netdev List
 help / color / mirror / Atom feed
* [net-next][PATCH v2 08/18] RDS: IB: split the mr registration and invalidation path
From: Santosh Shilimkar @ 2016-12-07  4:01 UTC (permalink / raw)
  To: netdev, davem; +Cc: linux-kernel, santosh.shilimkar
In-Reply-To: <1481083316-11648-1-git-send-email-santosh.shilimkar@oracle.com>

MR invalidation in RDS is done in background thread and not in
data path like registration. So break the dependency between them
which helps to remove the performance bottleneck.

Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/ib.h      |  4 +++-
 net/rds/ib_cm.c   |  9 +++++++--
 net/rds/ib_frmr.c | 11 ++++++-----
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/net/rds/ib.h b/net/rds/ib.h
index f4e8121..f14c26d 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -14,7 +14,8 @@
 
 #define RDS_IB_DEFAULT_RECV_WR		1024
 #define RDS_IB_DEFAULT_SEND_WR		256
-#define RDS_IB_DEFAULT_FR_WR		512
+#define RDS_IB_DEFAULT_FR_WR		256
+#define RDS_IB_DEFAULT_FR_INV_WR	256
 
 #define RDS_IB_DEFAULT_RETRY_COUNT	1
 
@@ -125,6 +126,7 @@ struct rds_ib_connection {
 
 	/* To control the number of wrs from fastreg */
 	atomic_t		i_fastreg_wrs;
+	atomic_t		i_fastunreg_wrs;
 
 	/* interrupt handling */
 	struct tasklet_struct	i_send_tasklet;
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index b9da1e5..3002acf 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -382,7 +382,10 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	 * completion queue and send queue. This extra space is used for FRMR
 	 * registration and invalidation work requests
 	 */
-	fr_queue_space = (rds_ibdev->use_fastreg ? RDS_IB_DEFAULT_FR_WR : 0);
+	fr_queue_space = rds_ibdev->use_fastreg ?
+			 (RDS_IB_DEFAULT_FR_WR + 1) +
+			 (RDS_IB_DEFAULT_FR_INV_WR + 1)
+			 : 0;
 
 	/* add the conn now so that connection establishment has the dev */
 	rds_ib_add_conn(rds_ibdev, conn);
@@ -444,6 +447,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	attr.send_cq = ic->i_send_cq;
 	attr.recv_cq = ic->i_recv_cq;
 	atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR);
+	atomic_set(&ic->i_fastunreg_wrs, RDS_IB_DEFAULT_FR_INV_WR);
 
 	/*
 	 * XXX this can fail if max_*_wr is too large?  Are we supposed
@@ -766,7 +770,8 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
 		wait_event(rds_ib_ring_empty_wait,
 			   rds_ib_ring_empty(&ic->i_recv_ring) &&
 			   (atomic_read(&ic->i_signaled_sends) == 0) &&
-			   (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR));
+			   (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR) &&
+			   (atomic_read(&ic->i_fastunreg_wrs) == RDS_IB_DEFAULT_FR_INV_WR));
 		tasklet_kill(&ic->i_send_tasklet);
 		tasklet_kill(&ic->i_recv_tasklet);
 
diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index 66b3d62..48332a6 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -241,8 +241,8 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
 	if (frmr->fr_state != FRMR_IS_INUSE)
 		goto out;
 
-	while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) {
-		atomic_inc(&ibmr->ic->i_fastreg_wrs);
+	while (atomic_dec_return(&ibmr->ic->i_fastunreg_wrs) <= 0) {
+		atomic_inc(&ibmr->ic->i_fastunreg_wrs);
 		cpu_relax();
 	}
 
@@ -261,7 +261,7 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
 	if (unlikely(ret)) {
 		frmr->fr_state = FRMR_IS_STALE;
 		frmr->fr_inv = false;
-		atomic_inc(&ibmr->ic->i_fastreg_wrs);
+		atomic_inc(&ibmr->ic->i_fastunreg_wrs);
 		pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret);
 		goto out;
 	}
@@ -289,9 +289,10 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
 	if (frmr->fr_inv) {
 		frmr->fr_state = FRMR_IS_FREE;
 		frmr->fr_inv = false;
+		atomic_inc(&ic->i_fastreg_wrs);
+	} else {
+		atomic_inc(&ic->i_fastunreg_wrs);
 	}
-
-	atomic_inc(&ic->i_fastreg_wrs);
 }
 
 void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed,
-- 
1.9.1

^ permalink raw reply related

* [net-next][PATCH v2 10/18] RDS: IB: track and log active side endpoint in connection
From: Santosh Shilimkar @ 2016-12-07  4:01 UTC (permalink / raw)
  To: netdev, davem; +Cc: linux-kernel, santosh.shilimkar
In-Reply-To: <1481083316-11648-1-git-send-email-santosh.shilimkar@oracle.com>

Useful to know the active and passive end points in a
RDS IB connection.

Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/ib.h    |  3 +++
 net/rds/ib_cm.c | 11 +++++++----
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/net/rds/ib.h b/net/rds/ib.h
index f14c26d..97e7696 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -181,6 +181,9 @@ struct rds_ib_connection {
 
 	/* Batched completions */
 	unsigned int		i_unsignaled_wrs;
+
+	/* Endpoint role in connection */
+	int			i_active_side;
 };
 
 /* This assumes that atomic_t is at least 32 bits */
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 3002acf..4d1bf04 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -120,16 +120,17 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 		rds_conn_destroy(conn);
 		return;
 	} else {
-		pr_notice("RDS/IB: connected <%pI4,%pI4> version %u.%u%s\n",
+		pr_notice("RDS/IB: %s conn connected <%pI4,%pI4> version %u.%u%s\n",
+			  ic->i_active_side ? "Active" : "Passive",
 			  &conn->c_laddr, &conn->c_faddr,
 			  RDS_PROTOCOL_MAJOR(conn->c_version),
 			  RDS_PROTOCOL_MINOR(conn->c_version),
 			  ic->i_flowctl ? ", flow control" : "");
 	}
 
-	/*
-	 * Init rings and fill recv. this needs to wait until protocol negotiation
-	 * is complete, since ring layout is different from 3.0 to 3.1.
+	/* Init rings and fill recv. this needs to wait until protocol
+	 * negotiation is complete, since ring layout is different
+	 * from 3.1 to 4.1.
 	 */
 	rds_ib_send_init_ring(ic);
 	rds_ib_recv_init_ring(ic);
@@ -685,6 +686,7 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
 		if (ic->i_cm_id == cm_id)
 			ret = 0;
 	}
+	ic->i_active_side = true;
 	return ret;
 }
 
@@ -859,6 +861,7 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
 	ic->i_sends = NULL;
 	vfree(ic->i_recvs);
 	ic->i_recvs = NULL;
+	ic->i_active_side = false;
 }
 
 int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
-- 
1.9.1

^ permalink raw reply related

* linux-next: manual merge of the staging tree with the net-next tree
From: Stephen Rothwell @ 2016-12-07  4:04 UTC (permalink / raw)
  To: Greg KH, David Miller, Networking; +Cc: linux-next, linux-kernel, Jarod Wilson

Hi Greg,

Today's linux-next merge of the staging tree got a conflict in:

  drivers/staging/slicoss/slicoss.c

between commit:

  a52ad514fdf3 ("net: deprecate eth_change_mtu, remove usage")

from the net-next tree and commit:

  0af72df267f2 ("staging: slicoss: remove the staging driver")

from the staging tree.

I fixed it up (I just removed the file) and can carry the fix as
necessary. This is now fixed as far as linux-next is concerned, but any
non trivial conflicts should be mentioned to your upstream maintainer
when your tree is submitted for merging.  You may also want to consider
cooperating with the maintainer of the conflicting tree to minimise any
particularly complex conflicts.

-- 
Cheers,
Stephen Rothwell

^ permalink raw reply

* Re: [PATCH 05/10] vhost: make interval tree static inline
From: Jason Wang @ 2016-12-07  4:16 UTC (permalink / raw)
  To: Michael S. Tsirkin, linux-kernel; +Cc: netdev, kvm, virtualization
In-Reply-To: <1481038106-24899-6-git-send-email-mst@redhat.com>



On 2016年12月06日 23:40, Michael S. Tsirkin wrote:
> vhost_umem_interval_tree is only used locally within vhost.c, mark it
> static. As some functions generated go unused, this triggers warnings
> unless we also mark it inline.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>   drivers/vhost/vhost.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index c6f2d89..7331ef3 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -49,7 +49,7 @@ enum {
>   
>   INTERVAL_TREE_DEFINE(struct vhost_umem_node,
>   		     rb, __u64, __subtree_last,
> -		     START, LAST, , vhost_umem_interval_tree);
> +		     START, LAST, static inline, vhost_umem_interval_tree);
>   
>   #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
>   static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)

Reviewed-by: Jason Wang <jasowang@redhat.com>
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH 06/10] vhost: add missing __user annotations
From: Jason Wang @ 2016-12-07  4:17 UTC (permalink / raw)
  To: Michael S. Tsirkin, linux-kernel; +Cc: netdev, kvm, virtualization
In-Reply-To: <1481038106-24899-7-git-send-email-mst@redhat.com>



On 2016年12月06日 23:40, Michael S. Tsirkin wrote:
> Several vhost functions were missing __user annotations
> on pointers, causing sparse warnings. Fix this up.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>   drivers/vhost/vhost.c | 10 +++++-----
>   1 file changed, 5 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 7331ef3..ba7db68 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -719,7 +719,7 @@ static int memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem,
>   static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
>   			  struct iovec iov[], int iov_size, int access);
>   
> -static int vhost_copy_to_user(struct vhost_virtqueue *vq, void *to,
> +static int vhost_copy_to_user(struct vhost_virtqueue *vq, void __user *to,
>   			      const void *from, unsigned size)
>   {
>   	int ret;
> @@ -749,7 +749,7 @@ static int vhost_copy_to_user(struct vhost_virtqueue *vq, void *to,
>   }
>   
>   static int vhost_copy_from_user(struct vhost_virtqueue *vq, void *to,
> -				void *from, unsigned size)
> +				void __user *from, unsigned size)
>   {
>   	int ret;
>   
> @@ -783,7 +783,7 @@ static int vhost_copy_from_user(struct vhost_virtqueue *vq, void *to,
>   }
>   
>   static void __user *__vhost_get_user(struct vhost_virtqueue *vq,
> -				     void *addr, unsigned size)
> +				     void __user *addr, unsigned size)
>   {
>   	int ret;
>   
> @@ -934,8 +934,8 @@ static int umem_access_ok(u64 uaddr, u64 size, int access)
>   	return 0;
>   }
>   
> -int vhost_process_iotlb_msg(struct vhost_dev *dev,
> -			    struct vhost_iotlb_msg *msg)
> +static int vhost_process_iotlb_msg(struct vhost_dev *dev,
> +				   struct vhost_iotlb_msg *msg)
>   {
>   	int ret = 0;
>   

Patch looks good but this looks like another static conversion not 
__user annotations.
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH 07/10] vsock/virtio: add a missing __le annotation
From: Jason Wang @ 2016-12-07  4:19 UTC (permalink / raw)
  To: Michael S. Tsirkin, linux-kernel
  Cc: netdev, virtualization, David S. Miller, Stefan Hajnoczi, kvm
In-Reply-To: <1481038106-24899-8-git-send-email-mst@redhat.com>



On 2016年12月06日 23:40, Michael S. Tsirkin wrote:
> guest cid is read from config space, therefore it's in little endian
> format and is treated as such, annotate it accordingly.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>   net/vmw_vsock/virtio_transport.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
> index 936d7ee..90096b9 100644
> --- a/net/vmw_vsock/virtio_transport.c
> +++ b/net/vmw_vsock/virtio_transport.c
> @@ -336,7 +336,7 @@ static void virtio_vsock_reset_sock(struct sock *sk)
>   static void virtio_vsock_update_guest_cid(struct virtio_vsock *vsock)
>   {
>   	struct virtio_device *vdev = vsock->vdev;
> -	u64 guest_cid;
> +	__le64 guest_cid;
>   
>   	vdev->config->get(vdev, offsetof(struct virtio_vsock_config, guest_cid),
>   			  &guest_cid, sizeof(guest_cid));

Reviewed-by: Jason Wang <jasowang@redhat.com>
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH 08/10] vsock/virtio: mark an internal function static
From: Jason Wang @ 2016-12-07  4:21 UTC (permalink / raw)
  To: Michael S. Tsirkin, linux-kernel
  Cc: netdev, virtualization, David S. Miller, Stefan Hajnoczi, kvm
In-Reply-To: <1481038106-24899-9-git-send-email-mst@redhat.com>



On 2016年12月06日 23:41, Michael S. Tsirkin wrote:
> virtio_transport_alloc_pkt is only used locally, make it static.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>   net/vmw_vsock/virtio_transport_common.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
> index a53b3a1..6120384 100644
> --- a/net/vmw_vsock/virtio_transport_common.c
> +++ b/net/vmw_vsock/virtio_transport_common.c
> @@ -32,7 +32,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
>   	return container_of(t, struct virtio_transport, transport);
>   }
>   
> -struct virtio_vsock_pkt *
> +static struct virtio_vsock_pkt *
>   virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
>   			   size_t len,
>   			   u32 src_cid,

Git grep shows it was used by tracing.
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* [PATCH net-next 1/1] driver: macvlan: Remove the rcu member of macvlan_port
From: fgao @ 2016-12-07  4:23 UTC (permalink / raw)
  To: davem, kaber, netdev, gfree.wind; +Cc: Gao Feng

From: Gao Feng <fgao@ikuai8.com>

When free macvlan_port in macvlan_port_destroy, it is safe to free
directly because netdev_rx_handler_unregister could enforce one
grace period.
So it is unnecessary to use kfree_rcu for macvlan_port.

Signed-off-by: Gao Feng <fgao@ikuai8.com>
---
 drivers/net/macvlan.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 3c0a171..20b3fdf2 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -43,7 +43,6 @@ struct macvlan_port {
 	struct net_device	*dev;
 	struct hlist_head	vlan_hash[MACVLAN_HASH_SIZE];
 	struct list_head	vlans;
-	struct rcu_head		rcu;
 	struct sk_buff_head	bc_queue;
 	struct work_struct	bc_work;
 	bool 			passthru;
@@ -1151,7 +1150,7 @@ static void macvlan_port_destroy(struct net_device *dev)
 	cancel_work_sync(&port->bc_work);
 	__skb_queue_purge(&port->bc_queue);
 
-	kfree_rcu(port, rcu);
+	kfree(port);
 }
 
 static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[])
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH 09/10] vsock/virtio: fix src/dst cid format
From: Jason Wang @ 2016-12-07  4:31 UTC (permalink / raw)
  To: Michael S. Tsirkin, linux-kernel
  Cc: stable, Stefan Hajnoczi, David S. Miller, kvm, virtualization,
	netdev
In-Reply-To: <1481038106-24899-10-git-send-email-mst@redhat.com>



On 2016年12月06日 23:41, Michael S. Tsirkin wrote:
> These fields are 64 bit, using le32_to_cpu and friends
> on these will not do the right thing.
> Fix this up.
>
> Cc: stable@vger.kernel.org
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>   net/vmw_vsock/virtio_transport_common.c | 14 +++++++-------
>   1 file changed, 7 insertions(+), 7 deletions(-)
>
> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
> index 6120384..22e99c4 100644
> --- a/net/vmw_vsock/virtio_transport_common.c
> +++ b/net/vmw_vsock/virtio_transport_common.c
> @@ -606,9 +606,9 @@ static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt)
>   		return 0;
>   
>   	pkt = virtio_transport_alloc_pkt(&info, 0,
> -					 le32_to_cpu(pkt->hdr.dst_cid),
> +					 le64_to_cpu(pkt->hdr.dst_cid),
>   					 le32_to_cpu(pkt->hdr.dst_port),
> -					 le32_to_cpu(pkt->hdr.src_cid),
> +					 le64_to_cpu(pkt->hdr.src_cid),
>   					 le32_to_cpu(pkt->hdr.src_port));

Looking at sockaddr_vm, svm_cid is "unsigned int", do we really want 64 
bit here?

>   	if (!pkt)
>   		return -ENOMEM;
> @@ -823,7 +823,7 @@ virtio_transport_send_response(struct vsock_sock *vsk,
>   	struct virtio_vsock_pkt_info info = {
>   		.op = VIRTIO_VSOCK_OP_RESPONSE,
>   		.type = VIRTIO_VSOCK_TYPE_STREAM,
> -		.remote_cid = le32_to_cpu(pkt->hdr.src_cid),
> +		.remote_cid = le64_to_cpu(pkt->hdr.src_cid),
>   		.remote_port = le32_to_cpu(pkt->hdr.src_port),
>   		.reply = true,
>   	};
> @@ -863,9 +863,9 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt)
>   	child->sk_state = SS_CONNECTED;
>   
>   	vchild = vsock_sk(child);
> -	vsock_addr_init(&vchild->local_addr, le32_to_cpu(pkt->hdr.dst_cid),
> +	vsock_addr_init(&vchild->local_addr, le64_to_cpu(pkt->hdr.dst_cid),
>   			le32_to_cpu(pkt->hdr.dst_port));
> -	vsock_addr_init(&vchild->remote_addr, le32_to_cpu(pkt->hdr.src_cid),
> +	vsock_addr_init(&vchild->remote_addr, le64_to_cpu(pkt->hdr.src_cid),
>   			le32_to_cpu(pkt->hdr.src_port));
>   
>   	vsock_insert_connected(vchild);
> @@ -904,9 +904,9 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
>   	struct sock *sk;
>   	bool space_available;
>   
> -	vsock_addr_init(&src, le32_to_cpu(pkt->hdr.src_cid),
> +	vsock_addr_init(&src, le64_to_cpu(pkt->hdr.src_cid),
>   			le32_to_cpu(pkt->hdr.src_port));
> -	vsock_addr_init(&dst, le32_to_cpu(pkt->hdr.dst_cid),
> +	vsock_addr_init(&dst, le64_to_cpu(pkt->hdr.dst_cid),
>   			le32_to_cpu(pkt->hdr.dst_port));
>   
>   	trace_virtio_transport_recv_pkt(src.svm_cid, src.svm_port,

^ permalink raw reply

* [PATCH net] phy: Don't increment MDIO bus refcount unless it's a different owner
From: Florian Fainelli @ 2016-12-07  4:54 UTC (permalink / raw)
  To: netdev; +Cc: johan, rmk+kernel, andrew, Florian Fainelli

Commit 3e3aaf649416 ("phy: fix mdiobus module safety") fixed the way we
dealt with MDIO bus module reference count, but sort of introduced a
regression in that, if an Ethernet driver registers its own MDIO bus
driver, as is common, we will end up with the Ethernet driver's
module->refnct set to 1, thus preventing this driver from any removal.

Fix this by comparing the network device's device driver owner against
the MDIO bus driver owner, and only if they are different, increment the
MDIO bus module refcount.

Fixes: 3e3aaf649416 ("phy: fix mdiobus module safety")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
Russell,

I verified this against the ethoc driver primarily (on a TS7300 board)
and bcmgenet.

Thanks!

 drivers/net/phy/phy_device.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 1a4bf8acad78..c4ceb082e970 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -857,11 +857,17 @@ EXPORT_SYMBOL(phy_attached_print);
 int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
 		      u32 flags, phy_interface_t interface)
 {
+	struct module *ndev_owner = dev->dev.parent->driver->owner;
 	struct mii_bus *bus = phydev->mdio.bus;
 	struct device *d = &phydev->mdio.dev;
 	int err;
 
-	if (!try_module_get(bus->owner)) {
+	/* For Ethernet device drivers that register their own MDIO bus, we
+	 * will have bus->owner match ndev_mod, so we do not want to increment
+	 * our own module->refcnt here, otherwise we would not be able to
+	 * unload later on.
+	 */
+	if (ndev_owner != bus->owner && !try_module_get(bus->owner)) {
 		dev_err(&dev->dev, "failed to get the bus module\n");
 		return -EIO;
 	}
@@ -921,7 +927,8 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
 
 error:
 	put_device(d);
-	module_put(bus->owner);
+	if (ndev_owner != bus->owner)
+		module_put(bus->owner);
 	return err;
 }
 EXPORT_SYMBOL(phy_attach_direct);
@@ -971,6 +978,8 @@ EXPORT_SYMBOL(phy_attach);
  */
 void phy_detach(struct phy_device *phydev)
 {
+	struct net_device *dev = phydev->attached_dev;
+	struct module *ndev_owner = dev->dev.parent->driver->owner;
 	struct mii_bus *bus;
 	int i;
 
@@ -998,7 +1007,8 @@ void phy_detach(struct phy_device *phydev)
 	bus = phydev->mdio.bus;
 
 	put_device(&phydev->mdio.dev);
-	module_put(bus->owner);
+	if (ndev_owner != bus->owner)
+		module_put(bus->owner);
 }
 EXPORT_SYMBOL(phy_detach);
 
-- 
2.9.3

^ permalink raw reply related

* Re: [PATCH] net: return value of skb_linearize should be handled in Linux kernel
From: Cong Wang @ 2016-12-07  5:02 UTC (permalink / raw)
  To: Zhouyi Zhou
  Cc: faisal.latif-ral2JQCrhuEAvxtiuMwx3w,
	dledford-H+wXaHxf7aLQT0dZR+AlfA,
	sean.hefty-ral2JQCrhuEAvxtiuMwx3w,
	hal.rosenstock-Re5JQEeQqe8AvxtiuMwx3w, Jeff Kirsher,
	QLogic-Storage-Upstream-h88ZbnxC6KDQT0dZR+AlfA,
	jejb-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8, Martin K. Petersen,
	jth-DgEjT+Ai2ygdnm+yROfE0A, jon.maloy-IzeFyvvaP7pWk0Htik3J/w,
	ying.xue-CWA4WttNNZF54TAoqtyWWQ, David Miller,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, LKML, intel-wired-lan,
	Linux Kernel Network Developers,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	fcoe-devel-s9riP+hp16TNLxjTenLetw,
	tipc-discussion-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
In-Reply-To: <1481008233-16777-1-git-send-email-zhouzhouyi-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

On Mon, Dec 5, 2016 at 11:10 PM, Zhouyi Zhou <zhouzhouyi-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c
> index 2a653ec..ab787cb 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c
> @@ -490,7 +490,11 @@ int ixgbe_fcoe_ddp(struct ixgbe_adapter *adapter,
>          */
>         if ((fh->fh_r_ctl == FC_RCTL_DD_SOL_DATA) &&
>             (fctl & FC_FC_END_SEQ)) {
> -               skb_linearize(skb);
> +               int err = 0;
> +
> +               err = skb_linearize(skb);
> +               if (err)
> +                       return err;


You can reuse 'rc' instead of adding 'err'.



>                 crc = (struct fcoe_crc_eof *)skb_put(skb, sizeof(*crc));
>                 crc->fcoe_eof = FC_EOF_T;
>         }
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> index fee1f29..4926d48 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> @@ -2173,8 +2173,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
>                                 total_rx_bytes += ddp_bytes;
>                                 total_rx_packets += DIV_ROUND_UP(ddp_bytes,
>                                                                  mss);
> -                       }
> -                       if (!ddp_bytes) {
> +                       } else {
>                                 dev_kfree_skb_any(skb);
>                                 continue;
>                         }


This piece doesn't seem to be related.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] net: wireless: realtek: constify rate_control_ops structures
From: Jes Sorensen @ 2016-12-07  5:07 UTC (permalink / raw)
  To: Larry Finger
  Cc: Bhumika Goyal, julia.lawall, chaoming_li, kvalo, linux-wireless,
	netdev, linux-kernel
In-Reply-To: <0f223291-734f-7658-57b7-18e962d15823@lwfinger.net>

Larry Finger <Larry.Finger@lwfinger.net> writes:
> On 12/02/2016 03:50 AM, Bhumika Goyal wrote:
>> The structures rate_control_ops are only passed as an argument to the
>> functions ieee80211_rate_control_{register/unregister}. This argument is
>> of type const, so rate_control_ops having this property can also be
>> declared as const.
>> Done using Coccinelle:
>>
>> @r1 disable optional_qualifier @
>> identifier i;
>> position p;
>> @@
>> static struct rate_control_ops i@p = {...};
>>
>> @ok1@
>> identifier r1.i;
>> position p;
>> @@
>> ieee80211_rate_control_register(&i@p)
>>
>> @ok2@
>> identifier r1.i;
>> position p;
>> @@
>> ieee80211_rate_control_unregister(&i@p)
>>
>> @bad@
>> position p!={r1.p,ok1.p,ok2.p};
>> identifier r1.i;
>> @@
>> i@p
>>
>> @depends on !bad disable optional_qualifier@
>> identifier r1.i;
>> @@
>> static
>> +const
>> struct rate_control_ops i={...};
>>
>> @depends on !bad disable optional_qualifier@
>> identifier r1.i;
>> @@
>> +const
>> struct rate_control_ops i;
>>
>> File size before:
>>    text	   data	    bss	    dec	    hex	filename
>>    1991	    104	      0	   2095	    82f wireless/realtek/rtlwifi/rc.o
>>
>> File size after:
>>    text	   data	    bss	    dec	    hex	filename
>>    2095	      0	      0	   2095	    wireless/realtek/rtlwifi/rc.o
>>
[snip]
> The content of your patch is OK; however, your subject is not. By
> convention, "net: wireless: realtek:" is assumed. We do, however,
> include "rtlwifi:" to indicate which part of
> drivers/net/wireless/realtek/ is referenced.

In addition, the first part of the description is useful and the file
size information is reasonable too, but ~20 lines of coccinelle scripts
in the commit message is rather pointless.

Jes

^ permalink raw reply

* [PATCH net-next v2 0/7] bnxt_en: Add interface to support RDMA driver.
From: Michael Chan @ 2016-12-07  5:26 UTC (permalink / raw)
  To: davem; +Cc: netdev, selvin.xavier, somnath.kotur, dledford, linux-rdma

This series adds an interface to support a brand new RDMA driver bnxt_re.
The first step is to re-arrange some code so that pci_enable_msix() can
be called during pci probe.  The purpose is to allow the RDMA driver to
initialize and stay initialized whether the netdev is up or down.

Then we make some changes to VF resource allocation so that there is
enough resources to support RDMA.

Finally the last patch adds a simple interface to allow the RDMA driver to
probe and register itself with any bnxt_en devices that support RDMA.
Once registered, the RDMA driver can request MSIX, send fw messages, and
receive some notifications.

v2: Fixed kbuild test robot warnings.

David, please consider this series for net-next.  Thanks.

Michael Chan (7):
  bnxt_en: Add bnxt_set_max_func_irqs().
  bnxt_en: Enable MSIX early in bnxt_init_one().
  bnxt_en: Move function reset to bnxt_init_one().
  bnxt_en: Improve completion ring allocation for VFs.
  bnxt_en: Reserve RDMA resources by default.
  bnxt_en: Refactor the driver registration function with firmware.
  bnxt_en: Add interface to support RDMA driver.

 drivers/net/ethernet/broadcom/bnxt/Makefile     |   2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt.c       | 360 +++++++++++++++++-------
 drivers/net/ethernet/broadcom/bnxt/bnxt.h       |  22 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c |  14 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c   | 346 +++++++++++++++++++++++
 drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h   |  93 ++++++
 6 files changed, 722 insertions(+), 115 deletions(-)
 create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
 create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h

-- 
1.8.3.1

^ permalink raw reply

* [PATCH net-next v2 1/7] bnxt_en: Add bnxt_set_max_func_irqs().
From: Michael Chan @ 2016-12-07  5:26 UTC (permalink / raw)
  To: davem; +Cc: netdev, selvin.xavier, somnath.kotur, dledford, linux-rdma
In-Reply-To: <1481088381-30411-1-git-send-email-michael.chan@broadcom.com>

By refactoring existing code into this new function.  The new function
will be used in subsequent patches.

v2: Fixed compile warning when CONFIG_BNXT_SRIOV is not set.

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 17 +++++++++++------
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  1 +
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index e84613a..269e757 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -4743,6 +4743,16 @@ static int bnxt_trim_rings(struct bnxt *bp, int *rx, int *tx, int max,
 	return 0;
 }
 
+void bnxt_set_max_func_irqs(struct bnxt *bp, unsigned int max_irqs)
+{
+#if defined(CONFIG_BNXT_SRIOV)
+	if (BNXT_VF(bp))
+		bp->vf.max_irqs = max_irqs;
+	else
+#endif
+		bp->pf.max_irqs = max_irqs;
+}
+
 static int bnxt_setup_msix(struct bnxt *bp)
 {
 	struct msix_entry *msix_ent;
@@ -6949,12 +6959,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	bnxt_set_tpa_flags(bp);
 	bnxt_set_ring_params(bp);
-	if (BNXT_PF(bp))
-		bp->pf.max_irqs = max_irqs;
-#if defined(CONFIG_BNXT_SRIOV)
-	else
-		bp->vf.max_irqs = max_irqs;
-#endif
+	bnxt_set_max_func_irqs(bp, max_irqs);
 	bnxt_set_dflt_rings(bp);
 
 	/* Default RSS hash cfg. */
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index b4abc1b..8327d0d 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1235,6 +1235,7 @@ static inline void bnxt_disable_poll(struct bnxt_napi *bnapi)
 int hwrm_send_message_silent(struct bnxt *, void *, u32, int);
 int bnxt_hwrm_set_coal(struct bnxt *);
 int bnxt_hwrm_func_qcaps(struct bnxt *);
+void bnxt_set_max_func_irqs(struct bnxt *bp, unsigned int max);
 void bnxt_tx_disable(struct bnxt *bp);
 void bnxt_tx_enable(struct bnxt *bp);
 int bnxt_hwrm_set_pause(struct bnxt *);
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next v2 2/7] bnxt_en: Enable MSIX early in bnxt_init_one().
From: Michael Chan @ 2016-12-07  5:26 UTC (permalink / raw)
  To: davem; +Cc: netdev, selvin.xavier, somnath.kotur, dledford, linux-rdma
In-Reply-To: <1481088381-30411-1-git-send-email-michael.chan@broadcom.com>

To better support the new RDMA driver, we need to move pci_enable_msix()
from bnxt_open() to bnxt_init_one().  This way, MSIX vectors are available
to the RDMA driver whether the network device is up or down.

Part of the existing bnxt_setup_int_mode() function is now refactored into
a new bnxt_init_int_mode().  bnxt_init_int_mode() is called during
bnxt_init_one() to enable MSIX.  The remaining logic in
bnxt_setup_int_mode() to map the IRQs to the completion rings is called
during bnxt_open().

v2: Fixed compile warning when CONFIG_BNXT_SRIOV is not set.

Signed-off-by: Somnath Kotur <somnath.kotur@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 183 +++++++++++++++++++-----------
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |   1 +
 2 files changed, 115 insertions(+), 69 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 269e757..da302eb 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -4743,6 +4743,80 @@ static int bnxt_trim_rings(struct bnxt *bp, int *rx, int *tx, int max,
 	return 0;
 }
 
+static void bnxt_setup_msix(struct bnxt *bp)
+{
+	const int len = sizeof(bp->irq_tbl[0].name);
+	struct net_device *dev = bp->dev;
+	int tcs, i;
+
+	tcs = netdev_get_num_tc(dev);
+	if (tcs > 1) {
+		bp->tx_nr_rings_per_tc = bp->tx_nr_rings / tcs;
+		if (bp->tx_nr_rings_per_tc == 0) {
+			netdev_reset_tc(dev);
+			bp->tx_nr_rings_per_tc = bp->tx_nr_rings;
+		} else {
+			int i, off, count;
+
+			bp->tx_nr_rings = bp->tx_nr_rings_per_tc * tcs;
+			for (i = 0; i < tcs; i++) {
+				count = bp->tx_nr_rings_per_tc;
+				off = i * count;
+				netdev_set_tc_queue(dev, i, count, off);
+			}
+		}
+	}
+
+	for (i = 0; i < bp->cp_nr_rings; i++) {
+		char *attr;
+
+		if (bp->flags & BNXT_FLAG_SHARED_RINGS)
+			attr = "TxRx";
+		else if (i < bp->rx_nr_rings)
+			attr = "rx";
+		else
+			attr = "tx";
+
+		snprintf(bp->irq_tbl[i].name, len, "%s-%s-%d", dev->name, attr,
+			 i);
+		bp->irq_tbl[i].handler = bnxt_msix;
+	}
+}
+
+static void bnxt_setup_inta(struct bnxt *bp)
+{
+	const int len = sizeof(bp->irq_tbl[0].name);
+
+	if (netdev_get_num_tc(bp->dev))
+		netdev_reset_tc(bp->dev);
+
+	snprintf(bp->irq_tbl[0].name, len, "%s-%s-%d", bp->dev->name, "TxRx",
+		 0);
+	bp->irq_tbl[0].handler = bnxt_inta;
+}
+
+static int bnxt_setup_int_mode(struct bnxt *bp)
+{
+	int rc;
+
+	if (bp->flags & BNXT_FLAG_USING_MSIX)
+		bnxt_setup_msix(bp);
+	else
+		bnxt_setup_inta(bp);
+
+	rc = bnxt_set_real_num_queues(bp);
+	return rc;
+}
+
+static unsigned int bnxt_get_max_func_irqs(struct bnxt *bp)
+{
+#if defined(CONFIG_BNXT_SRIOV)
+	if (BNXT_VF(bp))
+		return bp->vf.max_irqs;
+#endif
+	return bp->pf.max_irqs;
+}
+
 void bnxt_set_max_func_irqs(struct bnxt *bp, unsigned int max_irqs)
 {
 #if defined(CONFIG_BNXT_SRIOV)
@@ -4753,16 +4827,12 @@ void bnxt_set_max_func_irqs(struct bnxt *bp, unsigned int max_irqs)
 		bp->pf.max_irqs = max_irqs;
 }
 
-static int bnxt_setup_msix(struct bnxt *bp)
+static int bnxt_init_msix(struct bnxt *bp)
 {
-	struct msix_entry *msix_ent;
-	struct net_device *dev = bp->dev;
 	int i, total_vecs, rc = 0, min = 1;
-	const int len = sizeof(bp->irq_tbl[0].name);
-
-	bp->flags &= ~BNXT_FLAG_USING_MSIX;
-	total_vecs = bp->cp_nr_rings;
+	struct msix_entry *msix_ent;
 
+	total_vecs = bnxt_get_max_func_irqs(bp);
 	msix_ent = kcalloc(total_vecs, sizeof(struct msix_entry), GFP_KERNEL);
 	if (!msix_ent)
 		return -ENOMEM;
@@ -4783,8 +4853,10 @@ static int bnxt_setup_msix(struct bnxt *bp)
 
 	bp->irq_tbl = kcalloc(total_vecs, sizeof(struct bnxt_irq), GFP_KERNEL);
 	if (bp->irq_tbl) {
-		int tcs;
+		for (i = 0; i < total_vecs; i++)
+			bp->irq_tbl[i].vector = msix_ent[i].vector;
 
+		bp->total_irqs = total_vecs;
 		/* Trim rings based upon num of vectors allocated */
 		rc = bnxt_trim_rings(bp, &bp->rx_nr_rings, &bp->tx_nr_rings,
 				     total_vecs, min == 1);
@@ -4792,43 +4864,10 @@ static int bnxt_setup_msix(struct bnxt *bp)
 			goto msix_setup_exit;
 
 		bp->tx_nr_rings_per_tc = bp->tx_nr_rings;
-		tcs = netdev_get_num_tc(dev);
-		if (tcs > 1) {
-			bp->tx_nr_rings_per_tc = bp->tx_nr_rings / tcs;
-			if (bp->tx_nr_rings_per_tc == 0) {
-				netdev_reset_tc(dev);
-				bp->tx_nr_rings_per_tc = bp->tx_nr_rings;
-			} else {
-				int i, off, count;
+		bp->cp_nr_rings = (min == 1) ?
+				  max_t(int, bp->tx_nr_rings, bp->rx_nr_rings) :
+				  bp->tx_nr_rings + bp->rx_nr_rings;
 
-				bp->tx_nr_rings = bp->tx_nr_rings_per_tc * tcs;
-				for (i = 0; i < tcs; i++) {
-					count = bp->tx_nr_rings_per_tc;
-					off = i * count;
-					netdev_set_tc_queue(dev, i, count, off);
-				}
-			}
-		}
-		bp->cp_nr_rings = total_vecs;
-
-		for (i = 0; i < bp->cp_nr_rings; i++) {
-			char *attr;
-
-			bp->irq_tbl[i].vector = msix_ent[i].vector;
-			if (bp->flags & BNXT_FLAG_SHARED_RINGS)
-				attr = "TxRx";
-			else if (i < bp->rx_nr_rings)
-				attr = "rx";
-			else
-				attr = "tx";
-
-			snprintf(bp->irq_tbl[i].name, len,
-				 "%s-%s-%d", dev->name, attr, i);
-			bp->irq_tbl[i].handler = bnxt_msix;
-		}
-		rc = bnxt_set_real_num_queues(bp);
-		if (rc)
-			goto msix_setup_exit;
 	} else {
 		rc = -ENOMEM;
 		goto msix_setup_exit;
@@ -4838,52 +4877,54 @@ static int bnxt_setup_msix(struct bnxt *bp)
 	return 0;
 
 msix_setup_exit:
-	netdev_err(bp->dev, "bnxt_setup_msix err: %x\n", rc);
+	netdev_err(bp->dev, "bnxt_init_msix err: %x\n", rc);
+	kfree(bp->irq_tbl);
+	bp->irq_tbl = NULL;
 	pci_disable_msix(bp->pdev);
 	kfree(msix_ent);
 	return rc;
 }
 
-static int bnxt_setup_inta(struct bnxt *bp)
+static int bnxt_init_inta(struct bnxt *bp)
 {
-	int rc;
-	const int len = sizeof(bp->irq_tbl[0].name);
-
-	if (netdev_get_num_tc(bp->dev))
-		netdev_reset_tc(bp->dev);
-
 	bp->irq_tbl = kcalloc(1, sizeof(struct bnxt_irq), GFP_KERNEL);
-	if (!bp->irq_tbl) {
-		rc = -ENOMEM;
-		return rc;
-	}
+	if (!bp->irq_tbl)
+		return -ENOMEM;
+
+	bp->total_irqs = 1;
 	bp->rx_nr_rings = 1;
 	bp->tx_nr_rings = 1;
 	bp->cp_nr_rings = 1;
 	bp->tx_nr_rings_per_tc = bp->tx_nr_rings;
 	bp->flags |= BNXT_FLAG_SHARED_RINGS;
 	bp->irq_tbl[0].vector = bp->pdev->irq;
-	snprintf(bp->irq_tbl[0].name, len,
-		 "%s-%s-%d", bp->dev->name, "TxRx", 0);
-	bp->irq_tbl[0].handler = bnxt_inta;
-	rc = bnxt_set_real_num_queues(bp);
-	return rc;
+	return 0;
 }
 
-static int bnxt_setup_int_mode(struct bnxt *bp)
+static int bnxt_init_int_mode(struct bnxt *bp)
 {
 	int rc = 0;
 
 	if (bp->flags & BNXT_FLAG_MSIX_CAP)
-		rc = bnxt_setup_msix(bp);
+		rc = bnxt_init_msix(bp);
 
 	if (!(bp->flags & BNXT_FLAG_USING_MSIX) && BNXT_PF(bp)) {
 		/* fallback to INTA */
-		rc = bnxt_setup_inta(bp);
+		rc = bnxt_init_inta(bp);
 	}
 	return rc;
 }
 
+static void bnxt_clear_int_mode(struct bnxt *bp)
+{
+	if (bp->flags & BNXT_FLAG_USING_MSIX)
+		pci_disable_msix(bp->pdev);
+
+	kfree(bp->irq_tbl);
+	bp->irq_tbl = NULL;
+	bp->flags &= ~BNXT_FLAG_USING_MSIX;
+}
+
 static void bnxt_free_irq(struct bnxt *bp)
 {
 	struct bnxt_irq *irq;
@@ -4902,10 +4943,6 @@ static void bnxt_free_irq(struct bnxt *bp)
 			free_irq(irq->vector, bp->bnapi[i]);
 		irq->requested = 0;
 	}
-	if (bp->flags & BNXT_FLAG_USING_MSIX)
-		pci_disable_msix(bp->pdev);
-	kfree(bp->irq_tbl);
-	bp->irq_tbl = NULL;
 }
 
 static int bnxt_request_irq(struct bnxt *bp)
@@ -6695,6 +6732,7 @@ static void bnxt_remove_one(struct pci_dev *pdev)
 	cancel_work_sync(&bp->sp_task);
 	bp->sp_event = 0;
 
+	bnxt_clear_int_mode(bp);
 	bnxt_hwrm_func_drv_unrgtr(bp);
 	bnxt_free_hwrm_resources(bp);
 	bnxt_dcb_free(bp);
@@ -6990,10 +7028,14 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (rc)
 		goto init_err;
 
-	rc = register_netdev(dev);
+	rc = bnxt_init_int_mode(bp);
 	if (rc)
 		goto init_err;
 
+	rc = register_netdev(dev);
+	if (rc)
+		goto init_err_clr_int;
+
 	netdev_info(dev, "%s found at mem %lx, node addr %pM\n",
 		    board_info[ent->driver_data].name,
 		    (long)pci_resource_start(pdev, 0), dev->dev_addr);
@@ -7002,6 +7044,9 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	return 0;
 
+init_err_clr_int:
+	bnxt_clear_int_mode(bp);
+
 init_err:
 	pci_iounmap(pdev, bp->bar0);
 	pci_release_regions(pdev);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 8327d0d..1461355 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1024,6 +1024,7 @@ struct bnxt {
 #define BNXT_STATE_FN_RST_DONE	2
 
 	struct bnxt_irq	*irq_tbl;
+	int			total_irqs;
 	u8			mac_addr[ETH_ALEN];
 
 #ifdef CONFIG_BNXT_DCB
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next v2 3/7] bnxt_en: Move function reset to bnxt_init_one().
From: Michael Chan @ 2016-12-07  5:26 UTC (permalink / raw)
  To: davem; +Cc: netdev, selvin.xavier, somnath.kotur, dledford, linux-rdma
In-Reply-To: <1481088381-30411-1-git-send-email-michael.chan@broadcom.com>

Now that MSIX is enabled in bnxt_init_one(), resources may be allocated by
the RDMA driver before the network device is opened.  So we cannot do
function reset in bnxt_open() which will clear all the resources.

The proper place to do function reset now is in bnxt_init_one().
If we get AER, we'll do function reset as well.

Signed-off-by: Somnath Kotur <somnath.kotur@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 25 ++++++-------------------
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  1 -
 2 files changed, 6 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index da302eb..30b482b 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -5613,22 +5613,7 @@ int bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init)
 static int bnxt_open(struct net_device *dev)
 {
 	struct bnxt *bp = netdev_priv(dev);
-	int rc = 0;
 
-	if (!test_bit(BNXT_STATE_FN_RST_DONE, &bp->state)) {
-		rc = bnxt_hwrm_func_reset(bp);
-		if (rc) {
-			netdev_err(bp->dev, "hwrm chip reset failure rc: %x\n",
-				   rc);
-			rc = -EBUSY;
-			return rc;
-		}
-		/* Do func_reset during the 1st PF open only to prevent killing
-		 * the VFs when the PF is brought down and up.
-		 */
-		if (BNXT_PF(bp))
-			set_bit(BNXT_STATE_FN_RST_DONE, &bp->state);
-	}
 	return __bnxt_open_nic(bp, true, true);
 }
 
@@ -7028,6 +7013,10 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (rc)
 		goto init_err;
 
+	rc = bnxt_hwrm_func_reset(bp);
+	if (rc)
+		goto init_err;
+
 	rc = bnxt_init_int_mode(bp);
 	if (rc)
 		goto init_err;
@@ -7069,7 +7058,6 @@ static pci_ers_result_t bnxt_io_error_detected(struct pci_dev *pdev,
 					       pci_channel_state_t state)
 {
 	struct net_device *netdev = pci_get_drvdata(pdev);
-	struct bnxt *bp = netdev_priv(netdev);
 
 	netdev_info(netdev, "PCI I/O error detected\n");
 
@@ -7084,8 +7072,6 @@ static pci_ers_result_t bnxt_io_error_detected(struct pci_dev *pdev,
 	if (netif_running(netdev))
 		bnxt_close(netdev);
 
-	/* So that func_reset will be done during slot_reset */
-	clear_bit(BNXT_STATE_FN_RST_DONE, &bp->state);
 	pci_disable_device(pdev);
 	rtnl_unlock();
 
@@ -7119,7 +7105,8 @@ static pci_ers_result_t bnxt_io_slot_reset(struct pci_dev *pdev)
 	} else {
 		pci_set_master(pdev);
 
-		if (netif_running(netdev))
+		err = bnxt_hwrm_func_reset(bp);
+		if (!err && netif_running(netdev))
 			err = bnxt_open(netdev);
 
 		if (!err)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 1461355..0ee2cc4 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1021,7 +1021,6 @@ struct bnxt {
 	unsigned long		state;
 #define BNXT_STATE_OPEN		0
 #define BNXT_STATE_IN_SP_TASK	1
-#define BNXT_STATE_FN_RST_DONE	2
 
 	struct bnxt_irq	*irq_tbl;
 	int			total_irqs;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next v2 4/7] bnxt_en: Improve completion ring allocation for VFs.
From: Michael Chan @ 2016-12-07  5:26 UTC (permalink / raw)
  To: davem; +Cc: netdev, selvin.xavier, somnath.kotur, dledford, linux-rdma
In-Reply-To: <1481088381-30411-1-git-send-email-michael.chan@broadcom.com>

All available remaining completion rings not used by the PF should be
made available for the VFs so that there are enough rings in the VF to
support RDMA.  The earlier workaround code of capping the rings by the
statistics context is removed.

When SRIOV is disabled, call a new function bnxt_restore_pf_fw_resources()
to restore FW resources.  Later on we need to add some logic to account
for RDMA resources.

Signed-off-by: Somnath Kotur <somnath.kotur@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c       |  8 +++++++-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h       |  2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 14 ++++----------
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 30b482b..52b8ad4 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -4152,7 +4152,7 @@ static int bnxt_hwrm_func_qcfg(struct bnxt *bp)
 	return rc;
 }
 
-int bnxt_hwrm_func_qcaps(struct bnxt *bp)
+static int bnxt_hwrm_func_qcaps(struct bnxt *bp)
 {
 	int rc = 0;
 	struct hwrm_func_qcaps_input req = {0};
@@ -6856,6 +6856,12 @@ static int bnxt_set_dflt_rings(struct bnxt *bp)
 	return rc;
 }
 
+void bnxt_restore_pf_fw_resources(struct bnxt *bp)
+{
+	ASSERT_RTNL();
+	bnxt_hwrm_func_qcaps(bp);
+}
+
 static void bnxt_parse_log_pcie_link(struct bnxt *bp)
 {
 	enum pcie_link_width width = PCIE_LNK_WIDTH_UNKNOWN;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 0ee2cc4..43a4b17 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1234,7 +1234,6 @@ static inline void bnxt_disable_poll(struct bnxt_napi *bnapi)
 int hwrm_send_message(struct bnxt *, void *, u32, int);
 int hwrm_send_message_silent(struct bnxt *, void *, u32, int);
 int bnxt_hwrm_set_coal(struct bnxt *);
-int bnxt_hwrm_func_qcaps(struct bnxt *);
 void bnxt_set_max_func_irqs(struct bnxt *bp, unsigned int max);
 void bnxt_tx_disable(struct bnxt *bp);
 void bnxt_tx_enable(struct bnxt *bp);
@@ -1245,4 +1244,5 @@ static inline void bnxt_disable_poll(struct bnxt_napi *bnapi)
 int bnxt_close_nic(struct bnxt *, bool, bool);
 int bnxt_setup_mq_tc(struct net_device *dev, u8 tc);
 int bnxt_get_max_rings(struct bnxt *, int *, int *, bool);
+void bnxt_restore_pf_fw_resources(struct bnxt *bp);
 #endif
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
index bff626a..c696025 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
@@ -420,15 +420,7 @@ static int bnxt_hwrm_func_cfg(struct bnxt *bp, int num_vfs)
 	bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1);
 
 	/* Remaining rings are distributed equally amongs VF's for now */
-	/* TODO: the following workaroud is needed to restrict total number
-	 * of vf_cp_rings not exceed number of HW ring groups. This WA should
-	 * be removed once new HWRM provides HW ring groups capability in
-	 * hwrm_func_qcap.
-	 */
-	vf_cp_rings = min_t(u16, pf->max_cp_rings, pf->max_stat_ctxs);
-	vf_cp_rings = (vf_cp_rings - bp->cp_nr_rings) / num_vfs;
-	/* TODO: restore this logic below once the WA above is removed */
-	/* vf_cp_rings = (pf->max_cp_rings - bp->cp_nr_rings) / num_vfs; */
+	vf_cp_rings = (pf->max_cp_rings - bp->cp_nr_rings) / num_vfs;
 	vf_stat_ctx = (pf->max_stat_ctxs - bp->num_stat_ctxs) / num_vfs;
 	if (bp->flags & BNXT_FLAG_AGG_RINGS)
 		vf_rx_rings = (pf->max_rx_rings - bp->rx_nr_rings * 2) /
@@ -590,7 +582,9 @@ void bnxt_sriov_disable(struct bnxt *bp)
 
 	bp->pf.active_vfs = 0;
 	/* Reclaim all resources for the PF. */
-	bnxt_hwrm_func_qcaps(bp);
+	rtnl_lock();
+	bnxt_restore_pf_fw_resources(bp);
+	rtnl_unlock();
 }
 
 int bnxt_sriov_configure(struct pci_dev *pdev, int num_vfs)
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next v2 5/7] bnxt_en: Reserve RDMA resources by default.
From: Michael Chan @ 2016-12-07  5:26 UTC (permalink / raw)
  To: davem; +Cc: netdev, selvin.xavier, somnath.kotur, dledford, linux-rdma
In-Reply-To: <1481088381-30411-1-git-send-email-michael.chan@broadcom.com>

If the device supports RDMA, we'll setup network default rings so that
there are enough minimum resources for RDMA, if possible.  However, the
user can still increase network rings to the max if he wants.  The actual
RDMA resources won't be reserved until the RDMA driver registers.

v2: Fix compile warning when BNXT_CONFIG_SRIOV is not set.

Signed-off-by: Somnath Kotur <somnath.kotur@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 58 ++++++++++++++++++++++++++++++-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  9 +++++
 2 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 52b8ad4..57285bd 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -4166,6 +4166,11 @@ static int bnxt_hwrm_func_qcaps(struct bnxt *bp)
 	if (rc)
 		goto hwrm_func_qcaps_exit;
 
+	if (resp->flags & cpu_to_le32(FUNC_QCAPS_RESP_FLAGS_ROCE_V1_SUPPORTED))
+		bp->flags |= BNXT_FLAG_ROCEV1_CAP;
+	if (resp->flags & cpu_to_le32(FUNC_QCAPS_RESP_FLAGS_ROCE_V2_SUPPORTED))
+		bp->flags |= BNXT_FLAG_ROCEV2_CAP;
+
 	bp->tx_push_thresh = 0;
 	if (resp->flags &
 	    cpu_to_le32(FUNC_QCAPS_RESP_FLAGS_PUSH_MODE_SUPPORTED))
@@ -4808,6 +4813,24 @@ static int bnxt_setup_int_mode(struct bnxt *bp)
 	return rc;
 }
 
+unsigned int bnxt_get_max_func_stat_ctxs(struct bnxt *bp)
+{
+#if defined(CONFIG_BNXT_SRIOV)
+	if (BNXT_VF(bp))
+		return bp->vf.max_stat_ctxs;
+#endif
+	return bp->pf.max_stat_ctxs;
+}
+
+unsigned int bnxt_get_max_func_cp_rings(struct bnxt *bp)
+{
+#if defined(CONFIG_BNXT_SRIOV)
+	if (BNXT_VF(bp))
+		return bp->vf.max_cp_rings;
+#endif
+	return bp->pf.max_cp_rings;
+}
+
 static unsigned int bnxt_get_max_func_irqs(struct bnxt *bp)
 {
 #if defined(CONFIG_BNXT_SRIOV)
@@ -6832,6 +6855,39 @@ int bnxt_get_max_rings(struct bnxt *bp, int *max_rx, int *max_tx, bool shared)
 	return bnxt_trim_rings(bp, max_rx, max_tx, cp, shared);
 }
 
+static int bnxt_get_dflt_rings(struct bnxt *bp, int *max_rx, int *max_tx,
+			       bool shared)
+{
+	int rc;
+
+	rc = bnxt_get_max_rings(bp, max_rx, max_tx, shared);
+	if (rc)
+		return rc;
+
+	if (bp->flags & BNXT_FLAG_ROCE_CAP) {
+		int max_cp, max_stat, max_irq;
+
+		/* Reserve minimum resources for RoCE */
+		max_cp = bnxt_get_max_func_cp_rings(bp);
+		max_stat = bnxt_get_max_func_stat_ctxs(bp);
+		max_irq = bnxt_get_max_func_irqs(bp);
+		if (max_cp <= BNXT_MIN_ROCE_CP_RINGS ||
+		    max_irq <= BNXT_MIN_ROCE_CP_RINGS ||
+		    max_stat <= BNXT_MIN_ROCE_STAT_CTXS)
+			return 0;
+
+		max_cp -= BNXT_MIN_ROCE_CP_RINGS;
+		max_irq -= BNXT_MIN_ROCE_CP_RINGS;
+		max_stat -= BNXT_MIN_ROCE_STAT_CTXS;
+		max_cp = min_t(int, max_cp, max_irq);
+		max_cp = min_t(int, max_cp, max_stat);
+		rc = bnxt_trim_rings(bp, max_rx, max_tx, max_cp, shared);
+		if (rc)
+			rc = 0;
+	}
+	return rc;
+}
+
 static int bnxt_set_dflt_rings(struct bnxt *bp)
 {
 	int dflt_rings, max_rx_rings, max_tx_rings, rc;
@@ -6840,7 +6896,7 @@ static int bnxt_set_dflt_rings(struct bnxt *bp)
 	if (sh)
 		bp->flags |= BNXT_FLAG_SHARED_RINGS;
 	dflt_rings = netif_get_num_default_rss_queues();
-	rc = bnxt_get_max_rings(bp, &max_rx_rings, &max_tx_rings, sh);
+	rc = bnxt_get_dflt_rings(bp, &max_rx_rings, &max_tx_rings, sh);
 	if (rc)
 		return rc;
 	bp->rx_nr_rings = min_t(int, dflt_rings, max_rx_rings);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 43a4b17..d796836 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -387,6 +387,9 @@ struct rx_tpa_end_cmp_ext {
 #define DB_KEY_TX_PUSH						(0x4 << 28)
 #define DB_LONG_TX_PUSH						(0x2 << 24)
 
+#define BNXT_MIN_ROCE_CP_RINGS	2
+#define BNXT_MIN_ROCE_STAT_CTXS	1
+
 #define INVALID_HW_RING_ID	((u16)-1)
 
 /* The hardware supports certain page sizes.  Use the supported page sizes
@@ -953,6 +956,10 @@ struct bnxt {
 	#define BNXT_FLAG_PORT_STATS	0x400
 	#define BNXT_FLAG_UDP_RSS_CAP	0x800
 	#define BNXT_FLAG_EEE_CAP	0x1000
+	#define BNXT_FLAG_ROCEV1_CAP	0x8000
+	#define BNXT_FLAG_ROCEV2_CAP	0x10000
+	#define BNXT_FLAG_ROCE_CAP	(BNXT_FLAG_ROCEV1_CAP |	\
+					 BNXT_FLAG_ROCEV2_CAP)
 	#define BNXT_FLAG_CHIP_NITRO_A0	0x1000000
 
 	#define BNXT_FLAG_ALL_CONFIG_FEATS (BNXT_FLAG_TPA |		\
@@ -1234,6 +1241,8 @@ static inline void bnxt_disable_poll(struct bnxt_napi *bnapi)
 int hwrm_send_message(struct bnxt *, void *, u32, int);
 int hwrm_send_message_silent(struct bnxt *, void *, u32, int);
 int bnxt_hwrm_set_coal(struct bnxt *);
+unsigned int bnxt_get_max_func_stat_ctxs(struct bnxt *bp);
+unsigned int bnxt_get_max_func_cp_rings(struct bnxt *bp);
 void bnxt_set_max_func_irqs(struct bnxt *bp, unsigned int max);
 void bnxt_tx_disable(struct bnxt *bp);
 void bnxt_tx_enable(struct bnxt *bp);
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next v2 6/7] bnxt_en: Refactor the driver registration function with firmware.
From: Michael Chan @ 2016-12-07  5:26 UTC (permalink / raw)
  To: davem; +Cc: netdev, selvin.xavier, somnath.kotur, dledford, linux-rdma
In-Reply-To: <1481088381-30411-1-git-send-email-michael.chan@broadcom.com>

The driver register function with firmware consists of passing version
information and registering for async events.  To support the RDMA driver,
the async events that we need to register may change.  Separate the
driver register function into 2 parts so that we can just update the
async events for the RDMA driver.

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 34 ++++++++++++++++++++++++++-----
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  2 ++
 2 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 57285bd..c782942 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -3117,27 +3117,46 @@ int hwrm_send_message_silent(struct bnxt *bp, void *msg, u32 msg_len,
 	return rc;
 }
 
-static int bnxt_hwrm_func_drv_rgtr(struct bnxt *bp)
+int bnxt_hwrm_func_rgtr_async_events(struct bnxt *bp, unsigned long *bmap,
+				     int bmap_size)
 {
 	struct hwrm_func_drv_rgtr_input req = {0};
-	int i;
 	DECLARE_BITMAP(async_events_bmap, 256);
 	u32 *events = (u32 *)async_events_bmap;
+	int i;
 
 	bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_DRV_RGTR, -1, -1);
 
 	req.enables =
-		cpu_to_le32(FUNC_DRV_RGTR_REQ_ENABLES_OS_TYPE |
-			    FUNC_DRV_RGTR_REQ_ENABLES_VER |
-			    FUNC_DRV_RGTR_REQ_ENABLES_ASYNC_EVENT_FWD);
+		cpu_to_le32(FUNC_DRV_RGTR_REQ_ENABLES_ASYNC_EVENT_FWD);
 
 	memset(async_events_bmap, 0, sizeof(async_events_bmap));
 	for (i = 0; i < ARRAY_SIZE(bnxt_async_events_arr); i++)
 		__set_bit(bnxt_async_events_arr[i], async_events_bmap);
 
+	if (bmap && bmap_size) {
+		for (i = 0; i < bmap_size; i++) {
+			if (test_bit(i, bmap))
+				__set_bit(i, async_events_bmap);
+		}
+	}
+
 	for (i = 0; i < 8; i++)
 		req.async_event_fwd[i] |= cpu_to_le32(events[i]);
 
+	return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+}
+
+static int bnxt_hwrm_func_drv_rgtr(struct bnxt *bp)
+{
+	struct hwrm_func_drv_rgtr_input req = {0};
+
+	bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_DRV_RGTR, -1, -1);
+
+	req.enables =
+		cpu_to_le32(FUNC_DRV_RGTR_REQ_ENABLES_OS_TYPE |
+			    FUNC_DRV_RGTR_REQ_ENABLES_VER);
+
 	req.os_type = cpu_to_le16(FUNC_DRV_RGTR_REQ_OS_TYPE_LINUX);
 	req.ver_maj = DRV_VER_MAJ;
 	req.ver_min = DRV_VER_MIN;
@@ -3146,6 +3165,7 @@ static int bnxt_hwrm_func_drv_rgtr(struct bnxt *bp)
 	if (BNXT_PF(bp)) {
 		DECLARE_BITMAP(vf_req_snif_bmap, 256);
 		u32 *data = (u32 *)vf_req_snif_bmap;
+		int i;
 
 		memset(vf_req_snif_bmap, 0, sizeof(vf_req_snif_bmap));
 		for (i = 0; i < ARRAY_SIZE(bnxt_vf_req_snif); i++)
@@ -7023,6 +7043,10 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (rc)
 		goto init_err;
 
+	rc = bnxt_hwrm_func_rgtr_async_events(bp, NULL, 0);
+	if (rc)
+		goto init_err;
+
 	/* Get the MAX capabilities for this function */
 	rc = bnxt_hwrm_func_qcaps(bp);
 	if (rc) {
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index d796836..eec2415 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1240,6 +1240,8 @@ static inline void bnxt_disable_poll(struct bnxt_napi *bnapi)
 int _hwrm_send_message(struct bnxt *, void *, u32, int);
 int hwrm_send_message(struct bnxt *, void *, u32, int);
 int hwrm_send_message_silent(struct bnxt *, void *, u32, int);
+int bnxt_hwrm_func_rgtr_async_events(struct bnxt *bp, unsigned long *bmap,
+				     int bmap_size);
 int bnxt_hwrm_set_coal(struct bnxt *);
 unsigned int bnxt_get_max_func_stat_ctxs(struct bnxt *bp);
 unsigned int bnxt_get_max_func_cp_rings(struct bnxt *bp);
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next v2 7/7] bnxt_en: Add interface to support RDMA driver.
From: Michael Chan @ 2016-12-07  5:26 UTC (permalink / raw)
  To: davem; +Cc: netdev, selvin.xavier, somnath.kotur, dledford, linux-rdma
In-Reply-To: <1481088381-30411-1-git-send-email-michael.chan@broadcom.com>

Since the network driver and RDMA driver operate on the same PCI function,
we need to create an interface to allow the RDMA driver to share resources
with the network driver.

1. Create a new bnxt_en_dev struct which will be returned by
bnxt_ulp_probe() upon success.  After that, all calls from the RDMA driver
to bnxt_en will pass a pointer to this struct.

2. This struct contains additional function pointers to register, request
msix, send fw messages, register for async events.

3. If the RDMA driver wants to enable RDMA on the function, it needs to
call the function pointer bnxt_register_device().  A ulp_ops structure
is passed for RCU protected upcalls from bnxt_en to the RDMA driver.

4. The RDMA driver can call firmware APIs using the bnxt_send_fw_msg()
function pointer.

5. 1 stats context is reserved when the RDMA driver registers.  MSIX
and completion rings are reserved when the RDMA driver calls
bnxt_request_msix() function pointer.

6. When the RDMA driver calls bnxt_unregister_device(), all RDMA resources
will be cleaned up.

v2: Fixed 2 uninitialized variable warnings.

Signed-off-by: Somnath Kotur <somnath.kotur@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/Makefile   |   2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt.c     |  41 ++-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h     |   6 +
 drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 346 ++++++++++++++++++++++++++
 drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h |  93 +++++++
 5 files changed, 483 insertions(+), 5 deletions(-)
 create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
 create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h

diff --git a/drivers/net/ethernet/broadcom/bnxt/Makefile b/drivers/net/ethernet/broadcom/bnxt/Makefile
index b233a86..6082ed1 100644
--- a/drivers/net/ethernet/broadcom/bnxt/Makefile
+++ b/drivers/net/ethernet/broadcom/bnxt/Makefile
@@ -1,3 +1,3 @@
 obj-$(CONFIG_BNXT) += bnxt_en.o
 
-bnxt_en-y := bnxt.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o
+bnxt_en-y := bnxt.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o bnxt_ulp.o
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index c782942..9608cb4 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -52,6 +52,7 @@
 
 #include "bnxt_hsi.h"
 #include "bnxt.h"
+#include "bnxt_ulp.h"
 #include "bnxt_sriov.h"
 #include "bnxt_ethtool.h"
 #include "bnxt_dcb.h"
@@ -1528,12 +1529,11 @@ static int bnxt_async_event_process(struct bnxt *bp,
 		set_bit(BNXT_RESET_TASK_SILENT_SP_EVENT, &bp->sp_event);
 		break;
 	default:
-		netdev_err(bp->dev, "unhandled ASYNC event (id 0x%x)\n",
-			   event_id);
 		goto async_event_process_exit;
 	}
 	schedule_work(&bp->sp_task);
 async_event_process_exit:
+	bnxt_ulp_async_events(bp, cmpl);
 	return 0;
 }
 
@@ -3547,7 +3547,7 @@ static int bnxt_hwrm_vnic_ctx_alloc(struct bnxt *bp, u16 vnic_id, u16 ctx_idx)
 	return rc;
 }
 
-static int bnxt_hwrm_vnic_cfg(struct bnxt *bp, u16 vnic_id)
+int bnxt_hwrm_vnic_cfg(struct bnxt *bp, u16 vnic_id)
 {
 	unsigned int ring = 0, grp_idx;
 	struct bnxt_vnic_info *vnic = &bp->vnic_info[vnic_id];
@@ -3595,6 +3595,9 @@ static int bnxt_hwrm_vnic_cfg(struct bnxt *bp, u16 vnic_id)
 #endif
 	if ((bp->flags & BNXT_FLAG_STRIP_VLAN) || def_vlan)
 		req.flags |= cpu_to_le32(VNIC_CFG_REQ_FLAGS_VLAN_STRIP_MODE);
+	if (!vnic_id && bnxt_ulp_registered(bp->edev, BNXT_ROCE_ULP))
+		req.flags |=
+			cpu_to_le32(VNIC_CFG_REQ_FLAGS_ROCE_DUAL_VNIC_MODE);
 
 	return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
 }
@@ -4842,6 +4845,16 @@ unsigned int bnxt_get_max_func_stat_ctxs(struct bnxt *bp)
 	return bp->pf.max_stat_ctxs;
 }
 
+void bnxt_set_max_func_stat_ctxs(struct bnxt *bp, unsigned int max)
+{
+#if defined(CONFIG_BNXT_SRIOV)
+	if (BNXT_VF(bp))
+		bp->vf.max_stat_ctxs = max;
+	else
+#endif
+		bp->pf.max_stat_ctxs = max;
+}
+
 unsigned int bnxt_get_max_func_cp_rings(struct bnxt *bp)
 {
 #if defined(CONFIG_BNXT_SRIOV)
@@ -4851,6 +4864,16 @@ unsigned int bnxt_get_max_func_cp_rings(struct bnxt *bp)
 	return bp->pf.max_cp_rings;
 }
 
+void bnxt_set_max_func_cp_rings(struct bnxt *bp, unsigned int max)
+{
+#if defined(CONFIG_BNXT_SRIOV)
+	if (BNXT_VF(bp))
+		bp->vf.max_cp_rings = max;
+	else
+#endif
+		bp->pf.max_cp_rings = max;
+}
+
 static unsigned int bnxt_get_max_func_irqs(struct bnxt *bp)
 {
 #if defined(CONFIG_BNXT_SRIOV)
@@ -6767,6 +6790,8 @@ static void bnxt_remove_one(struct pci_dev *pdev)
 	pci_iounmap(pdev, bp->bar2);
 	pci_iounmap(pdev, bp->bar1);
 	pci_iounmap(pdev, bp->bar0);
+	kfree(bp->edev);
+	bp->edev = NULL;
 	free_netdev(dev);
 
 	pci_release_regions(pdev);
@@ -6936,6 +6961,7 @@ void bnxt_restore_pf_fw_resources(struct bnxt *bp)
 {
 	ASSERT_RTNL();
 	bnxt_hwrm_func_qcaps(bp);
+	bnxt_subtract_ulp_resources(bp, BNXT_ROCE_ULP);
 }
 
 static void bnxt_parse_log_pcie_link(struct bnxt *bp)
@@ -7047,6 +7073,8 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (rc)
 		goto init_err;
 
+	bp->ulp_probe = bnxt_ulp_probe;
+
 	/* Get the MAX capabilities for this function */
 	rc = bnxt_hwrm_func_qcaps(bp);
 	if (rc) {
@@ -7144,12 +7172,15 @@ static pci_ers_result_t bnxt_io_error_detected(struct pci_dev *pdev,
 					       pci_channel_state_t state)
 {
 	struct net_device *netdev = pci_get_drvdata(pdev);
+	struct bnxt *bp = netdev_priv(netdev);
 
 	netdev_info(netdev, "PCI I/O error detected\n");
 
 	rtnl_lock();
 	netif_device_detach(netdev);
 
+	bnxt_ulp_stop(bp);
+
 	if (state == pci_channel_io_perm_failure) {
 		rtnl_unlock();
 		return PCI_ERS_RESULT_DISCONNECT;
@@ -7195,8 +7226,10 @@ static pci_ers_result_t bnxt_io_slot_reset(struct pci_dev *pdev)
 		if (!err && netif_running(netdev))
 			err = bnxt_open(netdev);
 
-		if (!err)
+		if (!err) {
 			result = PCI_ERS_RESULT_RECOVERED;
+			bnxt_ulp_start(bp);
+		}
 	}
 
 	if (result != PCI_ERS_RESULT_RECOVERED && netif_running(netdev))
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index eec2415..16defe9 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -972,6 +972,9 @@ struct bnxt {
 #define BNXT_SINGLE_PF(bp)	(BNXT_PF(bp) && !BNXT_NPAR(bp))
 #define BNXT_CHIP_TYPE_NITRO_A0(bp) ((bp)->flags & BNXT_FLAG_CHIP_NITRO_A0)
 
+	struct bnxt_en_dev	*edev;
+	struct bnxt_en_dev *	(*ulp_probe)(struct net_device *);
+
 	struct bnxt_napi	**bnapi;
 
 	struct bnxt_rx_ring_info	*rx_ring;
@@ -1242,9 +1245,12 @@ static inline void bnxt_disable_poll(struct bnxt_napi *bnapi)
 int hwrm_send_message_silent(struct bnxt *, void *, u32, int);
 int bnxt_hwrm_func_rgtr_async_events(struct bnxt *bp, unsigned long *bmap,
 				     int bmap_size);
+int bnxt_hwrm_vnic_cfg(struct bnxt *bp, u16 vnic_id);
 int bnxt_hwrm_set_coal(struct bnxt *);
 unsigned int bnxt_get_max_func_stat_ctxs(struct bnxt *bp);
+void bnxt_set_max_func_stat_ctxs(struct bnxt *bp, unsigned int max);
 unsigned int bnxt_get_max_func_cp_rings(struct bnxt *bp);
+void bnxt_set_max_func_cp_rings(struct bnxt *bp, unsigned int max);
 void bnxt_set_max_func_irqs(struct bnxt *bp, unsigned int max);
 void bnxt_tx_disable(struct bnxt *bp);
 void bnxt_tx_enable(struct bnxt *bp);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
new file mode 100644
index 0000000..8b7464b
--- /dev/null
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
@@ -0,0 +1,346 @@
+/* Broadcom NetXtreme-C/E network driver.
+ *
+ * Copyright (c) 2016 Broadcom Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/bitops.h>
+#include <linux/irq.h>
+#include <asm/byteorder.h>
+#include <linux/bitmap.h>
+
+#include "bnxt_hsi.h"
+#include "bnxt.h"
+#include "bnxt_ulp.h"
+
+static int bnxt_register_dev(struct bnxt_en_dev *edev, int ulp_id,
+			     struct bnxt_ulp_ops *ulp_ops, void *handle)
+{
+	struct net_device *dev = edev->net;
+	struct bnxt *bp = netdev_priv(dev);
+	struct bnxt_ulp *ulp;
+
+	ASSERT_RTNL();
+	if (ulp_id >= BNXT_MAX_ULP)
+		return -EINVAL;
+
+	ulp = &edev->ulp_tbl[ulp_id];
+	if (rcu_access_pointer(ulp->ulp_ops)) {
+		netdev_err(bp->dev, "ulp id %d already registered\n", ulp_id);
+		return -EBUSY;
+	}
+	if (ulp_id == BNXT_ROCE_ULP) {
+		unsigned int max_stat_ctxs;
+
+		max_stat_ctxs = bnxt_get_max_func_stat_ctxs(bp);
+		if (max_stat_ctxs <= BNXT_MIN_ROCE_STAT_CTXS ||
+		    bp->num_stat_ctxs == max_stat_ctxs)
+			return -ENOMEM;
+		bnxt_set_max_func_stat_ctxs(bp, max_stat_ctxs -
+					    BNXT_MIN_ROCE_STAT_CTXS);
+	}
+
+	atomic_set(&ulp->ref_count, 0);
+	ulp->handle = handle;
+	rcu_assign_pointer(ulp->ulp_ops, ulp_ops);
+
+	if (ulp_id == BNXT_ROCE_ULP) {
+		if (test_bit(BNXT_STATE_OPEN, &bp->state))
+			bnxt_hwrm_vnic_cfg(bp, 0);
+	}
+
+	return 0;
+}
+
+static int bnxt_unregister_dev(struct bnxt_en_dev *edev, int ulp_id)
+{
+	struct net_device *dev = edev->net;
+	struct bnxt *bp = netdev_priv(dev);
+	struct bnxt_ulp *ulp;
+	int i = 0;
+
+	ASSERT_RTNL();
+	if (ulp_id >= BNXT_MAX_ULP)
+		return -EINVAL;
+
+	ulp = &edev->ulp_tbl[ulp_id];
+	if (!rcu_access_pointer(ulp->ulp_ops)) {
+		netdev_err(bp->dev, "ulp id %d not registered\n", ulp_id);
+		return -EINVAL;
+	}
+	if (ulp_id == BNXT_ROCE_ULP) {
+		unsigned int max_stat_ctxs;
+
+		max_stat_ctxs = bnxt_get_max_func_stat_ctxs(bp);
+		bnxt_set_max_func_stat_ctxs(bp, max_stat_ctxs + 1);
+	}
+	if (ulp->max_async_event_id)
+		bnxt_hwrm_func_rgtr_async_events(bp, NULL, 0);
+
+	RCU_INIT_POINTER(ulp->ulp_ops, NULL);
+	synchronize_rcu();
+	ulp->max_async_event_id = 0;
+	ulp->async_events_bmap = NULL;
+	while (atomic_read(&ulp->ref_count) != 0 && i < 10) {
+		msleep(100);
+		i++;
+	}
+	return 0;
+}
+
+static int bnxt_req_msix_vecs(struct bnxt_en_dev *edev, int ulp_id,
+			      struct bnxt_msix_entry *ent, int num_msix)
+{
+	struct net_device *dev = edev->net;
+	struct bnxt *bp = netdev_priv(dev);
+	int max_idx, max_cp_rings;
+	int avail_msix, i, idx;
+
+	ASSERT_RTNL();
+	if (ulp_id != BNXT_ROCE_ULP)
+		return -EINVAL;
+
+	if (!(bp->flags & BNXT_FLAG_USING_MSIX))
+		return -ENODEV;
+
+	max_cp_rings = bnxt_get_max_func_cp_rings(bp);
+	max_idx = min_t(int, bp->total_irqs, max_cp_rings);
+	avail_msix = max_idx - bp->cp_nr_rings;
+	if (!avail_msix)
+		return -ENOMEM;
+	if (avail_msix > num_msix)
+		avail_msix = num_msix;
+
+	idx = max_idx - avail_msix;
+	for (i = 0; i < avail_msix; i++) {
+		ent[i].vector = bp->irq_tbl[idx + i].vector;
+		ent[i].ring_idx = idx + i;
+		ent[i].db_offset = (idx + i) * 0x80;
+	}
+	bnxt_set_max_func_irqs(bp, max_idx - avail_msix);
+	bnxt_set_max_func_cp_rings(bp, max_cp_rings - avail_msix);
+	edev->ulp_tbl[ulp_id].msix_requested = avail_msix;
+	return avail_msix;
+}
+
+static int bnxt_free_msix_vecs(struct bnxt_en_dev *edev, int ulp_id)
+{
+	struct net_device *dev = edev->net;
+	struct bnxt *bp = netdev_priv(dev);
+	int max_cp_rings, msix_requested;
+
+	ASSERT_RTNL();
+	if (ulp_id != BNXT_ROCE_ULP)
+		return -EINVAL;
+
+	max_cp_rings = bnxt_get_max_func_cp_rings(bp);
+	msix_requested = edev->ulp_tbl[ulp_id].msix_requested;
+	bnxt_set_max_func_cp_rings(bp, max_cp_rings + msix_requested);
+	edev->ulp_tbl[ulp_id].msix_requested = 0;
+	bnxt_set_max_func_irqs(bp, bp->total_irqs);
+	return 0;
+}
+
+void bnxt_subtract_ulp_resources(struct bnxt *bp, int ulp_id)
+{
+	ASSERT_RTNL();
+	if (bnxt_ulp_registered(bp->edev, ulp_id)) {
+		struct bnxt_en_dev *edev = bp->edev;
+		unsigned int msix_req, max;
+
+		msix_req = edev->ulp_tbl[ulp_id].msix_requested;
+		max = bnxt_get_max_func_cp_rings(bp);
+		bnxt_set_max_func_cp_rings(bp, max - msix_req);
+		max = bnxt_get_max_func_stat_ctxs(bp);
+		bnxt_set_max_func_stat_ctxs(bp, max - 1);
+	}
+}
+
+static int bnxt_send_msg(struct bnxt_en_dev *edev, int ulp_id,
+			 struct bnxt_fw_msg *fw_msg)
+{
+	struct net_device *dev = edev->net;
+	struct bnxt *bp = netdev_priv(dev);
+	struct input *req;
+	int rc;
+
+	mutex_lock(&bp->hwrm_cmd_lock);
+	req = fw_msg->msg;
+	req->resp_addr = cpu_to_le64(bp->hwrm_cmd_resp_dma_addr);
+	rc = _hwrm_send_message(bp, fw_msg->msg, fw_msg->msg_len,
+				fw_msg->timeout);
+	if (!rc) {
+		struct output *resp = bp->hwrm_cmd_resp_addr;
+		u32 len = le16_to_cpu(resp->resp_len);
+
+		if (fw_msg->resp_max_len < len)
+			len = fw_msg->resp_max_len;
+
+		memcpy(fw_msg->resp, resp, len);
+	}
+	mutex_unlock(&bp->hwrm_cmd_lock);
+	return rc;
+}
+
+static void bnxt_ulp_get(struct bnxt_ulp *ulp)
+{
+	atomic_inc(&ulp->ref_count);
+}
+
+static void bnxt_ulp_put(struct bnxt_ulp *ulp)
+{
+	atomic_dec(&ulp->ref_count);
+}
+
+void bnxt_ulp_stop(struct bnxt *bp)
+{
+	struct bnxt_en_dev *edev = bp->edev;
+	struct bnxt_ulp_ops *ops;
+	int i;
+
+	if (!edev)
+		return;
+
+	for (i = 0; i < BNXT_MAX_ULP; i++) {
+		struct bnxt_ulp *ulp = &edev->ulp_tbl[i];
+
+		ops = rtnl_dereference(ulp->ulp_ops);
+		if (!ops || !ops->ulp_stop)
+			continue;
+		ops->ulp_stop(ulp->handle);
+	}
+}
+
+void bnxt_ulp_start(struct bnxt *bp)
+{
+	struct bnxt_en_dev *edev = bp->edev;
+	struct bnxt_ulp_ops *ops;
+	int i;
+
+	if (!edev)
+		return;
+
+	for (i = 0; i < BNXT_MAX_ULP; i++) {
+		struct bnxt_ulp *ulp = &edev->ulp_tbl[i];
+
+		ops = rtnl_dereference(ulp->ulp_ops);
+		if (!ops || !ops->ulp_start)
+			continue;
+		ops->ulp_start(ulp->handle);
+	}
+}
+
+void bnxt_ulp_sriov_cfg(struct bnxt *bp, int num_vfs)
+{
+	struct bnxt_en_dev *edev = bp->edev;
+	struct bnxt_ulp_ops *ops;
+	int i;
+
+	if (!edev)
+		return;
+
+	for (i = 0; i < BNXT_MAX_ULP; i++) {
+		struct bnxt_ulp *ulp = &edev->ulp_tbl[i];
+
+		rcu_read_lock();
+		ops = rcu_dereference(ulp->ulp_ops);
+		if (!ops || !ops->ulp_sriov_config) {
+			rcu_read_unlock();
+			continue;
+		}
+		bnxt_ulp_get(ulp);
+		rcu_read_unlock();
+		ops->ulp_sriov_config(ulp->handle, num_vfs);
+		bnxt_ulp_put(ulp);
+	}
+}
+
+void bnxt_ulp_async_events(struct bnxt *bp, struct hwrm_async_event_cmpl *cmpl)
+{
+	u16 event_id = le16_to_cpu(cmpl->event_id);
+	struct bnxt_en_dev *edev = bp->edev;
+	struct bnxt_ulp_ops *ops;
+	int i;
+
+	if (!edev)
+		return;
+
+	rcu_read_lock();
+	for (i = 0; i < BNXT_MAX_ULP; i++) {
+		struct bnxt_ulp *ulp = &edev->ulp_tbl[i];
+
+		ops = rcu_dereference(ulp->ulp_ops);
+		if (!ops || !ops->ulp_async_notifier)
+			continue;
+		if (!ulp->async_events_bmap ||
+		    event_id > ulp->max_async_event_id)
+			continue;
+
+		/* Read max_async_event_id first before testing the bitmap. */
+		smp_rmb();
+		if (test_bit(event_id, ulp->async_events_bmap))
+			ops->ulp_async_notifier(ulp->handle, cmpl);
+	}
+	rcu_read_unlock();
+}
+
+static int bnxt_register_async_events(struct bnxt_en_dev *edev, int ulp_id,
+				      unsigned long *events_bmap, u16 max_id)
+{
+	struct net_device *dev = edev->net;
+	struct bnxt *bp = netdev_priv(dev);
+	struct bnxt_ulp *ulp;
+
+	if (ulp_id >= BNXT_MAX_ULP)
+		return -EINVAL;
+
+	ulp = &edev->ulp_tbl[ulp_id];
+	ulp->async_events_bmap = events_bmap;
+	/* Make sure bnxt_ulp_async_events() sees this order */
+	smp_wmb();
+	ulp->max_async_event_id = max_id;
+	bnxt_hwrm_func_rgtr_async_events(bp, events_bmap, max_id + 1);
+	return 0;
+}
+
+static const struct bnxt_en_ops bnxt_en_ops_tbl = {
+	.bnxt_register_device	= bnxt_register_dev,
+	.bnxt_unregister_device	= bnxt_unregister_dev,
+	.bnxt_request_msix	= bnxt_req_msix_vecs,
+	.bnxt_free_msix		= bnxt_free_msix_vecs,
+	.bnxt_send_fw_msg	= bnxt_send_msg,
+	.bnxt_register_fw_async_events	= bnxt_register_async_events,
+};
+
+struct bnxt_en_dev *bnxt_ulp_probe(struct net_device *dev)
+{
+	struct bnxt *bp = netdev_priv(dev);
+	struct bnxt_en_dev *edev;
+
+	edev = bp->edev;
+	if (!edev) {
+		edev = kzalloc(sizeof(*edev), GFP_KERNEL);
+		if (!edev)
+			return ERR_PTR(-ENOMEM);
+		edev->en_ops = &bnxt_en_ops_tbl;
+		if (bp->flags & BNXT_FLAG_ROCEV1_CAP)
+			edev->flags |= BNXT_EN_FLAG_ROCEV1_CAP;
+		if (bp->flags & BNXT_FLAG_ROCEV2_CAP)
+			edev->flags |= BNXT_EN_FLAG_ROCEV2_CAP;
+		edev->net = dev;
+		edev->pdev = bp->pdev;
+		bp->edev = edev;
+	}
+	return bp->edev;
+}
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h
new file mode 100644
index 0000000..74f816e
--- /dev/null
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h
@@ -0,0 +1,93 @@
+/* Broadcom NetXtreme-C/E network driver.
+ *
+ * Copyright (c) 2016 Broadcom Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ */
+
+#ifndef BNXT_ULP_H
+#define BNXT_ULP_H
+
+#define BNXT_ROCE_ULP	0
+#define BNXT_OTHER_ULP	1
+#define BNXT_MAX_ULP	2
+
+#define BNXT_MIN_ROCE_CP_RINGS	2
+#define BNXT_MIN_ROCE_STAT_CTXS	1
+
+struct hwrm_async_event_cmpl;
+struct bnxt;
+
+struct bnxt_ulp_ops {
+	/* async_notifier() cannot sleep (in BH context) */
+	void (*ulp_async_notifier)(void *, struct hwrm_async_event_cmpl *);
+	void (*ulp_stop)(void *);
+	void (*ulp_start)(void *);
+	void (*ulp_sriov_config)(void *, int);
+};
+
+struct bnxt_msix_entry {
+	u32	vector;
+	u32	ring_idx;
+	u32	db_offset;
+};
+
+struct bnxt_fw_msg {
+	void	*msg;
+	int	msg_len;
+	void	*resp;
+	int	resp_max_len;
+	int	timeout;
+};
+
+struct bnxt_ulp {
+	void		*handle;
+	struct bnxt_ulp_ops __rcu *ulp_ops;
+	unsigned long	*async_events_bmap;
+	u16		max_async_event_id;
+	u16		msix_requested;
+	atomic_t	ref_count;
+};
+
+struct bnxt_en_dev {
+	struct net_device *net;
+	struct pci_dev *pdev;
+	u32 flags;
+	#define BNXT_EN_FLAG_ROCEV1_CAP		0x1
+	#define BNXT_EN_FLAG_ROCEV2_CAP		0x2
+	#define BNXT_EN_FLAG_ROCE_CAP		(BNXT_EN_FLAG_ROCEV1_CAP | \
+						 BNXT_EN_FLAG_ROCEV2_CAP)
+	const struct bnxt_en_ops	*en_ops;
+	struct bnxt_ulp			ulp_tbl[BNXT_MAX_ULP];
+};
+
+struct bnxt_en_ops {
+	int (*bnxt_register_device)(struct bnxt_en_dev *, int,
+				    struct bnxt_ulp_ops *, void *);
+	int (*bnxt_unregister_device)(struct bnxt_en_dev *, int);
+	int (*bnxt_request_msix)(struct bnxt_en_dev *, int,
+				 struct bnxt_msix_entry *, int);
+	int (*bnxt_free_msix)(struct bnxt_en_dev *, int);
+	int (*bnxt_send_fw_msg)(struct bnxt_en_dev *, int,
+				struct bnxt_fw_msg *);
+	int (*bnxt_register_fw_async_events)(struct bnxt_en_dev *, int,
+					     unsigned long *, u16);
+};
+
+static inline bool bnxt_ulp_registered(struct bnxt_en_dev *edev, int ulp_id)
+{
+	if (edev && rcu_access_pointer(edev->ulp_tbl[ulp_id].ulp_ops))
+		return true;
+	return false;
+}
+
+void bnxt_subtract_ulp_resources(struct bnxt *bp, int ulp_id);
+void bnxt_ulp_stop(struct bnxt *bp);
+void bnxt_ulp_start(struct bnxt *bp);
+void bnxt_ulp_sriov_cfg(struct bnxt *bp, int num_vfs);
+void bnxt_ulp_async_events(struct bnxt *bp, struct hwrm_async_event_cmpl *cmpl);
+struct bnxt_en_dev *bnxt_ulp_probe(struct net_device *dev);
+
+#endif
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH 10/10] virtio: enable endian checks for sparse builds
From: Jason Wang @ 2016-12-07  5:27 UTC (permalink / raw)
  To: Michael S. Tsirkin, linux-kernel
  Cc: kvm, Neil Armstrong, David Airlie, linux-remoteproc, dri-devel,
	virtualization, linux-s390, James E.J. Bottomley, Herbert Xu,
	linux-scsi, v9fs-developer, Asias He, Arnd Bergmann, linux-kbuild,
	Jens Axboe, Michal Marek, Stefan Hajnoczi, Matt Mackall,
	Greg Kroah-Hartman, linux-crypto, netdev, David S. Miller
In-Reply-To: <1481038106-24899-11-git-send-email-mst@redhat.com>



On 2016年12月06日 23:41, Michael S. Tsirkin wrote:
> __CHECK_ENDIAN__ isn't on by default presumably because
> it triggers too many sparse warnings for correct code.
> But virtio is now clean of these warnings, and
> we want to keep it this way - enable this for
> sparse builds.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>
> It seems that there should be a better way to do it,
> but this works too.

Reviewed-by: Jason Wang <jasowang@redhat.com>

>
>   drivers/block/Makefile          | 1 +
>   drivers/char/Makefile           | 1 +
>   drivers/char/hw_random/Makefile | 2 ++
>   drivers/gpu/drm/virtio/Makefile | 1 +
>   drivers/net/Makefile            | 3 +++
>   drivers/net/caif/Makefile       | 1 +
>   drivers/rpmsg/Makefile          | 1 +
>   drivers/s390/virtio/Makefile    | 2 ++
>   drivers/scsi/Makefile           | 1 +
>   drivers/vhost/Makefile          | 1 +
>   drivers/virtio/Makefile         | 3 +++
>   net/9p/Makefile                 | 1 +
>   net/packet/Makefile             | 1 +
>   net/vmw_vsock/Makefile          | 2 ++
>   14 files changed, 21 insertions(+)
>
> diff --git a/drivers/block/Makefile b/drivers/block/Makefile
> index 1e9661e..597481c 100644
> --- a/drivers/block/Makefile
> +++ b/drivers/block/Makefile
> @@ -27,6 +27,7 @@ obj-$(CONFIG_BLK_DEV_OSD)	+= osdblk.o
>   obj-$(CONFIG_BLK_DEV_UMEM)	+= umem.o
>   obj-$(CONFIG_BLK_DEV_NBD)	+= nbd.o
>   obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o
> +CFLAGS_virtio_blk.o += -D__CHECK_ENDIAN__
>   obj-$(CONFIG_VIRTIO_BLK)	+= virtio_blk.o
>   
>   obj-$(CONFIG_BLK_DEV_SX8)	+= sx8.o
> diff --git a/drivers/char/Makefile b/drivers/char/Makefile
> index 6e6c244..a99467d 100644
> --- a/drivers/char/Makefile
> +++ b/drivers/char/Makefile
> @@ -6,6 +6,7 @@ obj-y				+= mem.o random.o
>   obj-$(CONFIG_TTY_PRINTK)	+= ttyprintk.o
>   obj-y				+= misc.o
>   obj-$(CONFIG_ATARI_DSP56K)	+= dsp56k.o
> +CFLAGS_virtio_console.o += -D__CHECK_ENDIAN__
>   obj-$(CONFIG_VIRTIO_CONSOLE)	+= virtio_console.o
>   obj-$(CONFIG_RAW_DRIVER)	+= raw.o
>   obj-$(CONFIG_SGI_SNSC)		+= snsc.o snsc_event.o
> diff --git a/drivers/char/hw_random/Makefile b/drivers/char/hw_random/Makefile
> index 5f52b1e..a2b0931 100644
> --- a/drivers/char/hw_random/Makefile
> +++ b/drivers/char/hw_random/Makefile
> @@ -17,6 +17,8 @@ obj-$(CONFIG_HW_RANDOM_IXP4XX) += ixp4xx-rng.o
>   obj-$(CONFIG_HW_RANDOM_OMAP) += omap-rng.o
>   obj-$(CONFIG_HW_RANDOM_OMAP3_ROM) += omap3-rom-rng.o
>   obj-$(CONFIG_HW_RANDOM_PASEMI) += pasemi-rng.o
> +CFLAGS_virtio_transport.o += -D__CHECK_ENDIAN__
> +CFLAGS_virtio-rng.o += -D__CHECK_ENDIAN__
>   obj-$(CONFIG_HW_RANDOM_VIRTIO) += virtio-rng.o
>   obj-$(CONFIG_HW_RANDOM_TX4939) += tx4939-rng.o
>   obj-$(CONFIG_HW_RANDOM_MXC_RNGA) += mxc-rnga.o
> diff --git a/drivers/gpu/drm/virtio/Makefile b/drivers/gpu/drm/virtio/Makefile
> index 3fb8eac..1162366 100644
> --- a/drivers/gpu/drm/virtio/Makefile
> +++ b/drivers/gpu/drm/virtio/Makefile
> @@ -3,6 +3,7 @@
>   # Direct Rendering Infrastructure (DRI) in XFree86 4.1.0 and higher.
>   
>   ccflags-y := -Iinclude/drm
> +ccflags-y += -D__CHECK_ENDIAN__
>   
>   virtio-gpu-y := virtgpu_drv.o virtgpu_kms.o virtgpu_drm_bus.o virtgpu_gem.o \
>   	virtgpu_fb.o virtgpu_display.o virtgpu_vq.o virtgpu_ttm.o \
> diff --git a/drivers/net/Makefile b/drivers/net/Makefile
> index 7336cbd..3f587de 100644
> --- a/drivers/net/Makefile
> +++ b/drivers/net/Makefile
> @@ -12,6 +12,7 @@ obj-$(CONFIG_EQUALIZER) += eql.o
>   obj-$(CONFIG_IFB) += ifb.o
>   obj-$(CONFIG_MACSEC) += macsec.o
>   obj-$(CONFIG_MACVLAN) += macvlan.o
> +CFLAGS_macvtap.o += -D__CHECK_ENDIAN__
>   obj-$(CONFIG_MACVTAP) += macvtap.o
>   obj-$(CONFIG_MII) += mii.o
>   obj-$(CONFIG_MDIO) += mdio.o
> @@ -20,8 +21,10 @@ obj-$(CONFIG_NETCONSOLE) += netconsole.o
>   obj-$(CONFIG_PHYLIB) += phy/
>   obj-$(CONFIG_RIONET) += rionet.o
>   obj-$(CONFIG_NET_TEAM) += team/
> +CFLAGS_tun.o += -D__CHECK_ENDIAN__
>   obj-$(CONFIG_TUN) += tun.o
>   obj-$(CONFIG_VETH) += veth.o
> +CFLAGS_virtio_net.o += -D__CHECK_ENDIAN__
>   obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
>   obj-$(CONFIG_VXLAN) += vxlan.o
>   obj-$(CONFIG_GENEVE) += geneve.o
> diff --git a/drivers/net/caif/Makefile b/drivers/net/caif/Makefile
> index 9bbd453..d1a922c 100644
> --- a/drivers/net/caif/Makefile
> +++ b/drivers/net/caif/Makefile
> @@ -12,3 +12,4 @@ obj-$(CONFIG_CAIF_HSI) += caif_hsi.o
>   
>   # Virtio interface
>   obj-$(CONFIG_CAIF_VIRTIO) += caif_virtio.o
> +CFLAGS_caif_virtio.o += -D__CHECK_ENDIAN__
> diff --git a/drivers/rpmsg/Makefile b/drivers/rpmsg/Makefile
> index ae9c913..23c8b66 100644
> --- a/drivers/rpmsg/Makefile
> +++ b/drivers/rpmsg/Makefile
> @@ -1,3 +1,4 @@
>   obj-$(CONFIG_RPMSG)		+= rpmsg_core.o
>   obj-$(CONFIG_RPMSG_QCOM_SMD)	+= qcom_smd.o
>   obj-$(CONFIG_RPMSG_VIRTIO)	+= virtio_rpmsg_bus.o
> +CFLAGS_virtio_rpmsg_bus.o	+= -D__CHECK_ENDIAN__
> diff --git a/drivers/s390/virtio/Makefile b/drivers/s390/virtio/Makefile
> index df40692..270ada5 100644
> --- a/drivers/s390/virtio/Makefile
> +++ b/drivers/s390/virtio/Makefile
> @@ -6,6 +6,8 @@
>   # it under the terms of the GNU General Public License (version 2 only)
>   # as published by the Free Software Foundation.
>   
> +CFLAGS_virtio_ccw.o += -D__CHECK_ENDIAN__
> +CFLAGS_kvm_virtio.o += -D__CHECK_ENDIAN__
>   s390-virtio-objs := virtio_ccw.o
>   ifdef CONFIG_S390_GUEST_OLD_TRANSPORT
>   s390-virtio-objs += kvm_virtio.o
> diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile
> index 38d938d..9f70d46 100644
> --- a/drivers/scsi/Makefile
> +++ b/drivers/scsi/Makefile
> @@ -135,6 +135,7 @@ obj-$(CONFIG_SCSI_BNX2_ISCSI)	+= libiscsi.o bnx2i/
>   obj-$(CONFIG_BE2ISCSI)		+= libiscsi.o be2iscsi/
>   obj-$(CONFIG_SCSI_ESAS2R)	+= esas2r/
>   obj-$(CONFIG_SCSI_PMCRAID)	+= pmcraid.o
> +CFLAGS_virtio_scsi.o += -D__CHECK_ENDIAN__
>   obj-$(CONFIG_SCSI_VIRTIO)	+= virtio_scsi.o
>   obj-$(CONFIG_VMWARE_PVSCSI)	+= vmw_pvscsi.o
>   obj-$(CONFIG_XEN_SCSI_FRONTEND)	+= xen-scsifront.o
> diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
> index 6b012b9..619e2cd 100644
> --- a/drivers/vhost/Makefile
> +++ b/drivers/vhost/Makefile
> @@ -1,3 +1,4 @@
> +ccflags-y := -D__CHECK_ENDIAN__
>   obj-$(CONFIG_VHOST_NET) += vhost_net.o
>   vhost_net-y := net.o
>   
> diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
> index 41e30e3..d331f19 100644
> --- a/drivers/virtio/Makefile
> +++ b/drivers/virtio/Makefile
> @@ -1,3 +1,6 @@
> +#virtio must be kept clean wrt endian tags,
> +#otherwise we'll get to maintain broken host/guest ABIs
> +ccflags-y := -D__CHECK_ENDIAN__
>   obj-$(CONFIG_VIRTIO) += virtio.o virtio_ring.o
>   obj-$(CONFIG_VIRTIO_MMIO) += virtio_mmio.o
>   obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o
> diff --git a/net/9p/Makefile b/net/9p/Makefile
> index a0874cc..acf1225 100644
> --- a/net/9p/Makefile
> +++ b/net/9p/Makefile
> @@ -11,6 +11,7 @@ obj-$(CONFIG_NET_9P_RDMA) += 9pnet_rdma.o
>   	trans_fd.o \
>   	trans_common.o \
>   
> +CFLAGS_trans_virtio.o += -D__CHECK_ENDIAN__
>   9pnet_virtio-objs := \
>   	trans_virtio.o \
>   
> diff --git a/net/packet/Makefile b/net/packet/Makefile
> index 9df6134..a13bcb3 100644
> --- a/net/packet/Makefile
> +++ b/net/packet/Makefile
> @@ -2,6 +2,7 @@
>   # Makefile for the packet AF.
>   #
>   
> +ccflags-y := -D__CHECK_ENDIAN__
>   obj-$(CONFIG_PACKET) += af_packet.o
>   obj-$(CONFIG_PACKET_DIAG) += af_packet_diag.o
>   af_packet_diag-y += diag.o
> diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile
> index bc27c70..a61eccb 100644
> --- a/net/vmw_vsock/Makefile
> +++ b/net/vmw_vsock/Makefile
> @@ -8,6 +8,8 @@ vsock-y += af_vsock.o vsock_addr.o
>   vmw_vsock_vmci_transport-y += vmci_transport.o vmci_transport_notify.o \
>   	vmci_transport_notify_qstate.o
>   
> +CFLAGS_virtio_transport.o += -D__CHECK_ENDIAN__
> +CFLAGS_virtio_transport_common.o += -D__CHECK_ENDIAN__
>   vmw_vsock_virtio_transport-y += virtio_transport.o
>   
>   vmw_vsock_virtio_transport_common-y += virtio_transport_common.o

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* [PATCH v3 net-next 0/4]: Allow head adjustment in XDP prog
From: Martin KaFai Lau @ 2016-12-07  5:31 UTC (permalink / raw)
  To: netdev
  Cc: Alexei Starovoitov, Brenden Blanco, Daniel Borkmann, David Miller,
	Jakub Kicinski, Jesper Dangaard Brouer, John Fastabend,
	Saeed Mahameed, Tariq Toukan, Kernel Team

This series adds a helper to allow head adjusting in XDP prog.  mlx4
driver has been modified to support this feature.  An example is written
to encapsulate a packet with an IPv4/v6 header and then XDP_TX it
out.

v3:
1. Check if the driver supports head adjustment before
   setting the xdp_prog fd to the device in patch 1of4.
2. Remove the page alignment assumption on the data_hard_start.
   Instead, add data_hard_start to the struct xdp_buff and the
   driver has to fill it if it supports head adjustment.
3. Keep the wire MTU as before in mlx4
4. Set map0_byte_count to PAGE_SIZE in patch 3of4

v2:
1. Make a variable name change in bpf_xdp_adjust_head() in patch 1
2. Ensure no less than ETH_HLEN data in bpf_xdp_adjust_head() in patch 1
3. Some clarifications in commit log messages of patch 2 and 3

Thanks,
Martin

Martin KaFai Lau (4):
  bpf: xdp: Allow head adjustment in XDP prog
  mlx4: xdp: Allow raising MTU up to one page minus eth and vlan hdrs
  mlx4: xdp: Reserve headroom for receiving packet when XDP prog is
    active
  bpf: xdp: Add XDP example for head adjustment

 arch/powerpc/net/bpf_jit_comp64.c                  |   4 +-
 arch/s390/net/bpf_jit_comp.c                       |   2 +-
 arch/x86/net/bpf_jit_comp.c                        |   2 +-
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c     |  32 ++-
 drivers/net/ethernet/mellanox/mlx4/en_rx.c         |  70 +++---
 drivers/net/ethernet/mellanox/mlx4/en_tx.c         |   9 +-
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h       |   3 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   3 +
 .../net/ethernet/netronome/nfp/nfp_net_common.c    |   3 +
 drivers/net/ethernet/qlogic/qede/qede_main.c       |   3 +
 include/linux/filter.h                             |   6 +-
 include/linux/netdevice.h                          |  12 +
 include/uapi/linux/bpf.h                           |  11 +-
 kernel/bpf/core.c                                  |   2 +-
 kernel/bpf/syscall.c                               |   2 +
 kernel/bpf/verifier.c                              |   2 +-
 net/core/dev.c                                     |   9 +
 net/core/filter.c                                  |  28 ++-
 samples/bpf/Makefile                               |   4 +
 samples/bpf/bpf_helpers.h                          |   2 +
 samples/bpf/bpf_load.c                             |  94 ++++++++
 samples/bpf/bpf_load.h                             |   1 +
 samples/bpf/xdp1_user.c                            |  93 --------
 samples/bpf/xdp_tx_iptnl_common.h                  |  37 +++
 samples/bpf/xdp_tx_iptnl_kern.c                    | 232 +++++++++++++++++++
 samples/bpf/xdp_tx_iptnl_user.c                    | 253 +++++++++++++++++++++
 26 files changed, 774 insertions(+), 145 deletions(-)
 create mode 100644 samples/bpf/xdp_tx_iptnl_common.h
 create mode 100644 samples/bpf/xdp_tx_iptnl_kern.c
 create mode 100644 samples/bpf/xdp_tx_iptnl_user.c

-- 
2.5.1

^ permalink raw reply

* [PATCH v3 net-next 2/4] mlx4: xdp: Allow raising MTU up to one page minus eth and vlan hdrs
From: Martin KaFai Lau @ 2016-12-07  5:31 UTC (permalink / raw)
  To: netdev
  Cc: Alexei Starovoitov, Brenden Blanco, Daniel Borkmann, David Miller,
	Jakub Kicinski, Jesper Dangaard Brouer, John Fastabend,
	Saeed Mahameed, Tariq Toukan, Kernel Team
In-Reply-To: <1481088714-54512-1-git-send-email-kafai@fb.com>

When XDP is active in mlx4, mlx4 is using one page/pkt.
At the same time (i.e. when XDP is active), it is currently
limiting MTU to be FRAG_SZ0 - ETH_HLEN - (2 * VLAN_HLEN)
which is 1514 in x86.  AFAICT, we can at least raise the MTU
limit up to PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) which this
patch is doing.  It will be useful in the next patch which
allows XDP program to extend the packet by adding new header(s).

Note: In the earlier XDP patches, there is already existing guard
to ensure the page/pkt scheme only applies when XDP is active
in mlx4.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 28 +++++++++++-----
 drivers/net/ethernet/mellanox/mlx4/en_rx.c     | 46 ++++++++++++++------------
 2 files changed, 44 insertions(+), 30 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 6261157f444e..5482591688f8 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -51,6 +51,8 @@
 #include "mlx4_en.h"
 #include "en_port.h"
 
+#define MLX4_EN_MAX_XDP_MTU ((int)(PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN)))
+
 int mlx4_en_setup_tc(struct net_device *dev, u8 up)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
@@ -2249,6 +2251,19 @@ void mlx4_en_destroy_netdev(struct net_device *dev)
 	free_netdev(dev);
 }
 
+static bool mlx4_en_check_xdp_mtu(struct net_device *dev, int mtu)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	if (mtu > MLX4_EN_MAX_XDP_MTU) {
+		en_err(priv, "mtu:%d > max:%d when XDP prog is attached\n",
+		       mtu, MLX4_EN_MAX_XDP_MTU);
+		return false;
+	}
+
+	return true;
+}
+
 static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
@@ -2258,11 +2273,10 @@ static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu)
 	en_dbg(DRV, priv, "Change MTU called - current:%d new:%d\n",
 		 dev->mtu, new_mtu);
 
-	if (priv->tx_ring_num[TX_XDP] && MLX4_EN_EFF_MTU(new_mtu) > FRAG_SZ0) {
-		en_err(priv, "MTU size:%d requires frags but XDP running\n",
-		       new_mtu);
-		return -EOPNOTSUPP;
-	}
+	if (priv->tx_ring_num[TX_XDP] &&
+	    !mlx4_en_check_xdp_mtu(dev, new_mtu))
+		return -ENOTSUPP;
+
 	dev->mtu = new_mtu;
 
 	if (netif_running(dev)) {
@@ -2710,10 +2724,8 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog)
 		return 0;
 	}
 
-	if (priv->num_frags > 1) {
-		en_err(priv, "Cannot set XDP if MTU requires multiple frags\n");
+	if (!mlx4_en_check_xdp_mtu(dev, dev->mtu))
 		return -EOPNOTSUPP;
-	}
 
 	tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
 	if (!tmp)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 6562f78b07f4..23e9d04d1ef4 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -1164,37 +1164,39 @@ static const int frag_sizes[] = {
 
 void mlx4_en_calc_rx_buf(struct net_device *dev)
 {
-	enum dma_data_direction dma_dir = PCI_DMA_FROMDEVICE;
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	int eff_mtu = MLX4_EN_EFF_MTU(dev->mtu);
-	int order = MLX4_EN_ALLOC_PREFER_ORDER;
-	u32 align = SMP_CACHE_BYTES;
-	int buf_size = 0;
 	int i = 0;
 
 	/* bpf requires buffers to be set up as 1 packet per page.
 	 * This only works when num_frags == 1.
 	 */
 	if (priv->tx_ring_num[TX_XDP]) {
-		dma_dir = PCI_DMA_BIDIRECTIONAL;
-		/* This will gain efficient xdp frame recycling at the expense
-		 * of more costly truesize accounting
+		priv->frag_info[0].order = 0;
+		priv->frag_info[0].frag_size = eff_mtu;
+		priv->frag_info[0].frag_prefix_size = 0;
+		/* This will gain efficient xdp frame recycling at the
+		 * expense of more costly truesize accounting
 		 */
-		align = PAGE_SIZE;
-		order = 0;
-	}
-
-	while (buf_size < eff_mtu) {
-		priv->frag_info[i].order = order;
-		priv->frag_info[i].frag_size =
-			(eff_mtu > buf_size + frag_sizes[i]) ?
-				frag_sizes[i] : eff_mtu - buf_size;
-		priv->frag_info[i].frag_prefix_size = buf_size;
-		priv->frag_info[i].frag_stride =
-				ALIGN(priv->frag_info[i].frag_size, align);
-		priv->frag_info[i].dma_dir = dma_dir;
-		buf_size += priv->frag_info[i].frag_size;
-		i++;
+		priv->frag_info[0].frag_stride = PAGE_SIZE;
+		priv->frag_info[0].dma_dir = PCI_DMA_BIDIRECTIONAL;
+		i = 1;
+	} else {
+		int buf_size = 0;
+
+		while (buf_size < eff_mtu) {
+			priv->frag_info[i].order = MLX4_EN_ALLOC_PREFER_ORDER;
+			priv->frag_info[i].frag_size =
+				(eff_mtu > buf_size + frag_sizes[i]) ?
+					frag_sizes[i] : eff_mtu - buf_size;
+			priv->frag_info[i].frag_prefix_size = buf_size;
+			priv->frag_info[i].frag_stride =
+				ALIGN(priv->frag_info[i].frag_size,
+				      SMP_CACHE_BYTES);
+			priv->frag_info[i].dma_dir = PCI_DMA_FROMDEVICE;
+			buf_size += priv->frag_info[i].frag_size;
+			i++;
+		}
 	}
 
 	priv->num_frags = i;
-- 
2.5.1

^ permalink raw reply related

* [PATCH v3 net-next 4/4] bpf: xdp: Add XDP example for head adjustment
From: Martin KaFai Lau @ 2016-12-07  5:31 UTC (permalink / raw)
  To: netdev
  Cc: Alexei Starovoitov, Brenden Blanco, Daniel Borkmann, David Miller,
	Jakub Kicinski, Jesper Dangaard Brouer, John Fastabend,
	Saeed Mahameed, Tariq Toukan, Kernel Team
In-Reply-To: <1481088714-54512-1-git-send-email-kafai@fb.com>

The XDP prog checks if the incoming packet matches any VIP:PORT
combination in the BPF hashmap.  If it is, it will encapsulate
the packet with a IPv4/v6 header as instructed by the value of
the BPF hashmap and then XDP_TX it out.

The VIP:PORT -> IP-Encap-Info can be specified by the cmd args
of the user prog.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 samples/bpf/Makefile              |   4 +
 samples/bpf/bpf_helpers.h         |   2 +
 samples/bpf/bpf_load.c            |  94 ++++++++++++++
 samples/bpf/bpf_load.h            |   1 +
 samples/bpf/xdp1_user.c           |  93 --------------
 samples/bpf/xdp_tx_iptnl_common.h |  37 ++++++
 samples/bpf/xdp_tx_iptnl_kern.c   | 232 ++++++++++++++++++++++++++++++++++
 samples/bpf/xdp_tx_iptnl_user.c   | 253 ++++++++++++++++++++++++++++++++++++++
 8 files changed, 623 insertions(+), 93 deletions(-)
 create mode 100644 samples/bpf/xdp_tx_iptnl_common.h
 create mode 100644 samples/bpf/xdp_tx_iptnl_kern.c
 create mode 100644 samples/bpf/xdp_tx_iptnl_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 00cd3081c038..f78e0ef6ff10 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -33,6 +33,7 @@ hostprogs-y += trace_event
 hostprogs-y += sampleip
 hostprogs-y += tc_l2_redirect
 hostprogs-y += lwt_len_hist
+hostprogs-y += xdp_tx_iptnl
 
 test_lru_dist-objs := test_lru_dist.o libbpf.o
 sock_example-objs := sock_example.o libbpf.o
@@ -67,6 +68,7 @@ trace_event-objs := bpf_load.o libbpf.o trace_event_user.o
 sampleip-objs := bpf_load.o libbpf.o sampleip_user.o
 tc_l2_redirect-objs := bpf_load.o libbpf.o tc_l2_redirect_user.o
 lwt_len_hist-objs := bpf_load.o libbpf.o lwt_len_hist_user.o
+xdp_tx_iptnl-objs := bpf_load.o libbpf.o xdp_tx_iptnl_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -99,6 +101,7 @@ always += test_current_task_under_cgroup_kern.o
 always += trace_event_kern.o
 always += sampleip_kern.o
 always += lwt_len_hist_kern.o
+always += xdp_tx_iptnl_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/testing/selftests/bpf/
@@ -129,6 +132,7 @@ HOSTLOADLIBES_trace_event += -lelf
 HOSTLOADLIBES_sampleip += -lelf
 HOSTLOADLIBES_tc_l2_redirect += -l elf
 HOSTLOADLIBES_lwt_len_hist += -l elf
+HOSTLOADLIBES_xdp_tx_iptnl += -lelf
 
 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
 #  make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index 8370a6e3839d..faaffe2e139a 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -57,6 +57,8 @@ static int (*bpf_skb_set_tunnel_opt)(void *ctx, void *md, int size) =
 	(void *) BPF_FUNC_skb_set_tunnel_opt;
 static unsigned long long (*bpf_get_prandom_u32)(void) =
 	(void *) BPF_FUNC_get_prandom_u32;
+static int (*bpf_xdp_adjust_head)(void *ctx, int offset) =
+	(void *) BPF_FUNC_xdp_adjust_head;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 49b45ccbe153..e30b6de94f2e 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -12,6 +12,10 @@
 #include <linux/bpf.h>
 #include <linux/filter.h>
 #include <linux/perf_event.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <sys/types.h>
+#include <sys/socket.h>
 #include <sys/syscall.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
@@ -450,3 +454,93 @@ struct ksym *ksym_search(long key)
 	/* out of range. return _stext */
 	return &syms[0];
 }
+
+int set_link_xdp_fd(int ifindex, int fd)
+{
+	struct sockaddr_nl sa;
+	int sock, seq = 0, len, ret = -1;
+	char buf[4096];
+	struct nlattr *nla, *nla_xdp;
+	struct {
+		struct nlmsghdr  nh;
+		struct ifinfomsg ifinfo;
+		char             attrbuf[64];
+	} req;
+	struct nlmsghdr *nh;
+	struct nlmsgerr *err;
+
+	memset(&sa, 0, sizeof(sa));
+	sa.nl_family = AF_NETLINK;
+
+	sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+	if (sock < 0) {
+		printf("open netlink socket: %s\n", strerror(errno));
+		return -1;
+	}
+
+	if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
+		printf("bind to netlink: %s\n", strerror(errno));
+		goto cleanup;
+	}
+
+	memset(&req, 0, sizeof(req));
+	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
+	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+	req.nh.nlmsg_type = RTM_SETLINK;
+	req.nh.nlmsg_pid = 0;
+	req.nh.nlmsg_seq = ++seq;
+	req.ifinfo.ifi_family = AF_UNSPEC;
+	req.ifinfo.ifi_index = ifindex;
+	nla = (struct nlattr *)(((char *)&req)
+				+ NLMSG_ALIGN(req.nh.nlmsg_len));
+	nla->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/;
+
+	nla_xdp = (struct nlattr *)((char *)nla + NLA_HDRLEN);
+	nla_xdp->nla_type = 1/*IFLA_XDP_FD*/;
+	nla_xdp->nla_len = NLA_HDRLEN + sizeof(int);
+	memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd));
+	nla->nla_len = NLA_HDRLEN + nla_xdp->nla_len;
+
+	req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);
+
+	if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
+		printf("send to netlink: %s\n", strerror(errno));
+		goto cleanup;
+	}
+
+	len = recv(sock, buf, sizeof(buf), 0);
+	if (len < 0) {
+		printf("recv from netlink: %s\n", strerror(errno));
+		goto cleanup;
+	}
+
+	for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
+	     nh = NLMSG_NEXT(nh, len)) {
+		if (nh->nlmsg_pid != getpid()) {
+			printf("Wrong pid %d, expected %d\n",
+			       nh->nlmsg_pid, getpid());
+			goto cleanup;
+		}
+		if (nh->nlmsg_seq != seq) {
+			printf("Wrong seq %d, expected %d\n",
+			       nh->nlmsg_seq, seq);
+			goto cleanup;
+		}
+		switch (nh->nlmsg_type) {
+		case NLMSG_ERROR:
+			err = (struct nlmsgerr *)NLMSG_DATA(nh);
+			if (!err->error)
+				continue;
+			printf("nlmsg error %s\n", strerror(-err->error));
+			goto cleanup;
+		case NLMSG_DONE:
+			break;
+		}
+	}
+
+	ret = 0;
+
+cleanup:
+	close(sock);
+	return ret;
+}
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index 4adeeef53ad6..fb46a421ab41 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -31,4 +31,5 @@ struct ksym {
 
 int load_kallsyms(void);
 struct ksym *ksym_search(long key);
+int set_link_xdp_fd(int ifindex, int fd);
 #endif
diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c
index 2b2150d6d6f7..5f040a0d7712 100644
--- a/samples/bpf/xdp1_user.c
+++ b/samples/bpf/xdp1_user.c
@@ -5,111 +5,18 @@
  * License as published by the Free Software Foundation.
  */
 #include <linux/bpf.h>
-#include <linux/netlink.h>
-#include <linux/rtnetlink.h>
 #include <assert.h>
 #include <errno.h>
 #include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <sys/socket.h>
 #include <unistd.h>
 
 #include "bpf_load.h"
 #include "bpf_util.h"
 #include "libbpf.h"
 
-static int set_link_xdp_fd(int ifindex, int fd)
-{
-	struct sockaddr_nl sa;
-	int sock, seq = 0, len, ret = -1;
-	char buf[4096];
-	struct nlattr *nla, *nla_xdp;
-	struct {
-		struct nlmsghdr  nh;
-		struct ifinfomsg ifinfo;
-		char             attrbuf[64];
-	} req;
-	struct nlmsghdr *nh;
-	struct nlmsgerr *err;
-
-	memset(&sa, 0, sizeof(sa));
-	sa.nl_family = AF_NETLINK;
-
-	sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
-	if (sock < 0) {
-		printf("open netlink socket: %s\n", strerror(errno));
-		return -1;
-	}
-
-	if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
-		printf("bind to netlink: %s\n", strerror(errno));
-		goto cleanup;
-	}
-
-	memset(&req, 0, sizeof(req));
-	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
-	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
-	req.nh.nlmsg_type = RTM_SETLINK;
-	req.nh.nlmsg_pid = 0;
-	req.nh.nlmsg_seq = ++seq;
-	req.ifinfo.ifi_family = AF_UNSPEC;
-	req.ifinfo.ifi_index = ifindex;
-	nla = (struct nlattr *)(((char *)&req)
-				+ NLMSG_ALIGN(req.nh.nlmsg_len));
-	nla->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/;
-
-	nla_xdp = (struct nlattr *)((char *)nla + NLA_HDRLEN);
-	nla_xdp->nla_type = 1/*IFLA_XDP_FD*/;
-	nla_xdp->nla_len = NLA_HDRLEN + sizeof(int);
-	memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd));
-	nla->nla_len = NLA_HDRLEN + nla_xdp->nla_len;
-
-	req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);
-
-	if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
-		printf("send to netlink: %s\n", strerror(errno));
-		goto cleanup;
-	}
-
-	len = recv(sock, buf, sizeof(buf), 0);
-	if (len < 0) {
-		printf("recv from netlink: %s\n", strerror(errno));
-		goto cleanup;
-	}
-
-	for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
-	     nh = NLMSG_NEXT(nh, len)) {
-		if (nh->nlmsg_pid != getpid()) {
-			printf("Wrong pid %d, expected %d\n",
-			       nh->nlmsg_pid, getpid());
-			goto cleanup;
-		}
-		if (nh->nlmsg_seq != seq) {
-			printf("Wrong seq %d, expected %d\n",
-			       nh->nlmsg_seq, seq);
-			goto cleanup;
-		}
-		switch (nh->nlmsg_type) {
-		case NLMSG_ERROR:
-			err = (struct nlmsgerr *)NLMSG_DATA(nh);
-			if (!err->error)
-				continue;
-			printf("nlmsg error %s\n", strerror(-err->error));
-			goto cleanup;
-		case NLMSG_DONE:
-			break;
-		}
-	}
-
-	ret = 0;
-
-cleanup:
-	close(sock);
-	return ret;
-}
-
 static int ifindex;
 
 static void int_exit(int sig)
diff --git a/samples/bpf/xdp_tx_iptnl_common.h b/samples/bpf/xdp_tx_iptnl_common.h
new file mode 100644
index 000000000000..dd12cc35110f
--- /dev/null
+++ b/samples/bpf/xdp_tx_iptnl_common.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#ifndef _SAMPLES_BPF_XDP_TX_IPTNL_COMMON_H
+#define _SAMPLES_BPF_XDP_TX_IPTNL_COMMON_H
+
+#include <linux/types.h>
+
+#define MAX_IPTNL_ENTRIES 256U
+
+struct vip {
+	union {
+		__u32 v6[4];
+		__u32 v4;
+	} daddr;
+	__u16 dport;
+	__u16 family;
+	__u8 protocol;
+};
+
+struct iptnl_info {
+	union {
+		__u32 v6[4];
+		__u32 v4;
+	} saddr;
+	union {
+		__u32 v6[4];
+		__u32 v4;
+	} daddr;
+	__u16 family;
+	__u8 dmac[6];
+};
+
+#endif
diff --git a/samples/bpf/xdp_tx_iptnl_kern.c b/samples/bpf/xdp_tx_iptnl_kern.c
new file mode 100644
index 000000000000..d88c064175aa
--- /dev/null
+++ b/samples/bpf/xdp_tx_iptnl_kern.c
@@ -0,0 +1,232 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include "bpf_helpers.h"
+#include "xdp_tx_iptnl_common.h"
+
+struct bpf_map_def SEC("maps") rxcnt = {
+	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(__u64),
+	.max_entries = 256,
+};
+
+struct bpf_map_def SEC("maps") vip2tnl = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(struct vip),
+	.value_size = sizeof(struct iptnl_info),
+	.max_entries = MAX_IPTNL_ENTRIES,
+};
+
+static __always_inline void count_tx(u32 protocol)
+{
+	u64 *rxcnt_count;
+
+	rxcnt_count = bpf_map_lookup_elem(&rxcnt, &protocol);
+	if (rxcnt_count)
+		*rxcnt_count += 1;
+}
+
+static __always_inline int get_dport(void *trans_data, void *data_end,
+				     u8 protocol)
+{
+	struct tcphdr *th;
+	struct udphdr *uh;
+
+	switch (protocol) {
+	case IPPROTO_TCP:
+		th = (struct tcphdr *)trans_data;
+		if (th + 1 > data_end)
+			return -1;
+		return th->dest;
+	case IPPROTO_UDP:
+		uh = (struct udphdr *)trans_data;
+		if (uh + 1 > data_end)
+			return -1;
+		return uh->dest;
+	default:
+		return 0;
+	}
+}
+
+static __always_inline void set_ethhdr(struct ethhdr *new_eth,
+				       const struct ethhdr *old_eth,
+				       const struct iptnl_info *tnl,
+				       __be16 h_proto)
+{
+	memcpy(new_eth->h_source, old_eth->h_dest, sizeof(new_eth->h_source));
+	memcpy(new_eth->h_dest, tnl->dmac, sizeof(new_eth->h_dest));
+	new_eth->h_proto = h_proto;
+}
+
+static __always_inline int handle_ipv4(struct xdp_md *xdp)
+{
+	void *data_end = (void *)(long)xdp->data_end;
+	void *data = (void *)(long)xdp->data;
+	struct iptnl_info *tnl;
+	struct ethhdr *new_eth;
+	struct ethhdr *old_eth;
+	struct iphdr *iph = data + sizeof(struct ethhdr);
+	u16 *next_iph_u16;
+	u16 payload_len;
+	struct vip vip = {};
+	int dport;
+	u32 csum = 0;
+	int i;
+
+	if (iph + 1 > data_end)
+		return XDP_DROP;
+
+	dport = get_dport(iph + 1, data_end, iph->protocol);
+	if (dport == -1)
+		return XDP_DROP;
+
+	vip.protocol = iph->protocol;
+	vip.family = AF_INET;
+	vip.daddr.v4 = iph->daddr;
+	vip.dport = dport;
+	payload_len = ntohs(iph->tot_len);
+
+	tnl = bpf_map_lookup_elem(&vip2tnl, &vip);
+	/* It only does v4-in-v4 */
+	if (!tnl || tnl->family != AF_INET)
+		return XDP_PASS;
+
+	/* The vip key is found.  Add an IP header and send it out */
+
+	if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct iphdr)))
+		return XDP_DROP;
+
+	data = (void *)(long)xdp->data;
+	data_end = (void *)(long)xdp->data_end;
+
+	new_eth = data;
+	iph = data + sizeof(*new_eth);
+	old_eth = data + sizeof(*iph);
+
+	if (new_eth + 1 > data_end ||
+	    old_eth + 1 > data_end ||
+	    iph + 1 > data_end)
+		return XDP_DROP;
+
+	set_ethhdr(new_eth, old_eth, tnl, htons(ETH_P_IP));
+
+	iph->version = 4;
+	iph->ihl = sizeof(*iph) >> 2;
+	iph->frag_off =	0;
+	iph->protocol = IPPROTO_IPIP;
+	iph->check = 0;
+	iph->tos = 0;
+	iph->tot_len = htons(payload_len + sizeof(*iph));
+	iph->daddr = tnl->daddr.v4;
+	iph->saddr = tnl->saddr.v4;
+	iph->ttl = 8;
+
+	next_iph_u16 = (u16 *)iph;
+#pragma clang loop unroll(full)
+	for (i = 0; i < sizeof(*iph) >> 1; i++)
+		csum += *next_iph_u16++;
+
+	iph->check = ~((csum & 0xffff) + (csum >> 16));
+
+	count_tx(vip.protocol);
+
+	return XDP_TX;
+}
+
+static __always_inline int handle_ipv6(struct xdp_md *xdp)
+{
+	void *data_end = (void *)(long)xdp->data_end;
+	void *data = (void *)(long)xdp->data;
+	struct iptnl_info *tnl;
+	struct ethhdr *new_eth;
+	struct ethhdr *old_eth;
+	struct ipv6hdr *ip6h = data + sizeof(struct ethhdr);
+	__u16 payload_len;
+	struct vip vip = {};
+	int dport;
+
+	if (ip6h + 1 > data_end)
+		return XDP_DROP;
+
+	dport = get_dport(ip6h + 1, data_end, ip6h->nexthdr);
+	if (dport == -1)
+		return XDP_DROP;
+
+	vip.protocol = ip6h->nexthdr;
+	vip.family = AF_INET6;
+	memcpy(vip.daddr.v6, ip6h->daddr.s6_addr32, sizeof(vip.daddr));
+	vip.dport = dport;
+	payload_len = ip6h->payload_len;
+
+	tnl = bpf_map_lookup_elem(&vip2tnl, &vip);
+	/* It only does v6-in-v6 */
+	if (!tnl || tnl->family != AF_INET6)
+		return XDP_PASS;
+
+	/* The vip key is found.  Add an IP header and send it out */
+
+	if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct ipv6hdr)))
+		return XDP_DROP;
+
+	data = (void *)(long)xdp->data;
+	data_end = (void *)(long)xdp->data_end;
+
+	new_eth = data;
+	ip6h = data + sizeof(*new_eth);
+	old_eth = data + sizeof(*ip6h);
+
+	if (new_eth + 1 > data_end ||
+	    old_eth + 1 > data_end ||
+	    ip6h + 1 > data_end)
+		return XDP_DROP;
+
+	set_ethhdr(new_eth, old_eth, tnl, htons(ETH_P_IPV6));
+
+	ip6h->version = 6;
+	ip6h->priority = 0;
+	memset(ip6h->flow_lbl, 0, sizeof(ip6h->flow_lbl));
+	ip6h->payload_len = htons(ntohs(payload_len) + sizeof(*ip6h));
+	ip6h->nexthdr = IPPROTO_IPV6;
+	ip6h->hop_limit = 8;
+	memcpy(ip6h->saddr.s6_addr32, tnl->saddr.v6, sizeof(tnl->saddr.v6));
+	memcpy(ip6h->daddr.s6_addr32, tnl->daddr.v6, sizeof(tnl->daddr.v6));
+
+	count_tx(vip.protocol);
+
+	return XDP_TX;
+}
+
+SEC("xdp_tx_iptnl")
+int _xdp_tx_iptnl(struct xdp_md *xdp)
+{
+	void *data_end = (void *)(long)xdp->data_end;
+	void *data = (void *)(long)xdp->data;
+	struct ethhdr *eth = data;
+	__u16 h_proto;
+
+	if (eth + 1 > data_end)
+		return XDP_DROP;
+
+	h_proto = eth->h_proto;
+
+	if (h_proto == htons(ETH_P_IP))
+		return handle_ipv4(xdp);
+	else if (h_proto == htons(ETH_P_IPV6))
+
+		return handle_ipv6(xdp);
+	else
+		return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_tx_iptnl_user.c b/samples/bpf/xdp_tx_iptnl_user.c
new file mode 100644
index 000000000000..9aeef7579af4
--- /dev/null
+++ b/samples/bpf/xdp_tx_iptnl_user.c
@@ -0,0 +1,253 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/bpf.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <arpa/inet.h>
+#include <netinet/ether.h>
+#include <unistd.h>
+#include <time.h>
+#include "bpf_load.h"
+#include "libbpf.h"
+#include "bpf_util.h"
+#include "xdp_tx_iptnl_common.h"
+
+#define STATS_INTERVAL_S 2U
+
+static int ifindex = -1;
+
+static void int_exit(int sig)
+{
+	if (ifindex > -1)
+		set_link_xdp_fd(ifindex, -1);
+	exit(0);
+}
+
+/* simple per-protocol drop counter
+ */
+static void poll_stats(unsigned int kill_after_s)
+{
+	const unsigned int nr_protos = 256;
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	time_t started_at = time(NULL);
+	__u64 values[nr_cpus], prev[nr_protos][nr_cpus];
+	__u32 proto;
+	int i;
+
+	memset(prev, 0, sizeof(prev));
+
+	while (!kill_after_s || time(NULL) - started_at <= kill_after_s) {
+		sleep(STATS_INTERVAL_S);
+
+		for (proto = 0; proto < nr_protos; proto++) {
+			__u64 sum = 0;
+
+			assert(bpf_lookup_elem(map_fd[0], &proto, values) == 0);
+			for (i = 0; i < nr_cpus; i++)
+				sum += (values[i] - prev[proto][i]);
+
+			if (sum)
+				printf("proto %u: sum:%10llu pkts, rate:%10llu pkts/s\n",
+				       proto, sum, sum / STATS_INTERVAL_S);
+			memcpy(prev[proto], values, sizeof(values));
+		}
+	}
+}
+
+static void usage(const char *cmd)
+{
+	printf("Usage: %s [...]\n", cmd);
+	printf("    -i <ifindex> Interface Index\n");
+	printf("    -a <vip-service-address> IPv4 or IPv6\n");
+	printf("    -p <vip-service-port> A port range (e.g. 433-444) is also allowed\n");
+	printf("    -s <source-ip> Used in the IPTunnel Header\n");
+	printf("    -d <dest-ip> Used in the IPTunnel header>\n");
+	printf("    -m <dest-MAC> Used in sending the IP Tunneled pkt>\n");
+	printf("    -T <stop-after-X-seconds> Default: 0 (forever)\n");
+	printf("    -P <IP-Protocol> Default is TCP\n");
+	printf("    -h Display this help\n");
+}
+
+static int parse_ipstr(const char *ipstr, unsigned int *addr)
+{
+	if (inet_pton(AF_INET6, ipstr, addr) == 1) {
+		return AF_INET6;
+	} else if (inet_pton(AF_INET, ipstr, addr) == 1) {
+		addr[1] = addr[2] = addr[3] = 0;
+		return AF_INET;
+	}
+
+	fprintf(stderr, "%s is an invalid IP\n", ipstr);
+	return AF_UNSPEC;
+}
+
+static int parse_ports(const char *port_str, int *min_port, int *max_port)
+{
+	char *end;
+	long tmp_min_port;
+	long tmp_max_port;
+
+	tmp_min_port = strtol(optarg, &end, 10);
+	if (tmp_min_port < 1 || tmp_min_port > 65535) {
+		fprintf(stderr, "Invalid port(s):%s\n", optarg);
+		return 1;
+	}
+
+	if (*end == '-') {
+		end++;
+		tmp_max_port = strtol(end, NULL, 10);
+		if (tmp_max_port < 1 || tmp_max_port > 65535) {
+			fprintf(stderr, "Invalid port(s):%s\n", optarg);
+			return 1;
+		}
+	} else {
+		tmp_max_port = tmp_min_port;
+	}
+
+	if (tmp_min_port > tmp_max_port) {
+		fprintf(stderr, "Invalid port(s):%s\n", optarg);
+		return 1;
+	}
+
+	if (tmp_max_port - tmp_min_port + 1 > MAX_IPTNL_ENTRIES) {
+		fprintf(stderr, "Port range (%s) is larger than %u\n",
+			port_str, MAX_IPTNL_ENTRIES);
+		return 1;
+	}
+	*min_port = tmp_min_port;
+	*max_port = tmp_max_port;
+
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	unsigned char opt_flags[256] = {};
+	unsigned int kill_after_s = 0;
+	const char *optstr = "i:a:p:s:d:m:T:P:";
+	int min_port = 0, max_port = 0;
+	struct iptnl_info tnl = {};
+	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+	struct vip vip = {};
+	char filename[256];
+	int opt;
+	int i;
+
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)");
+		return 1;
+	}
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	if (!prog_fd[0]) {
+		printf("load_bpf_file: %s\n", strerror(errno));
+		return 1;
+	}
+
+	tnl.family = AF_UNSPEC;
+	vip.protocol = IPPROTO_TCP;
+
+	for (i = 0; i < strlen(optstr); i++)
+		if ('a' <= optstr[i] && optstr[i] <= 'z')
+			opt_flags[(unsigned char)optstr[i]] = 1;
+
+	while ((opt = getopt(argc, argv, optstr)) != -1) {
+		unsigned short family;
+		unsigned int *v6;
+
+		switch (opt) {
+		case 'i':
+			ifindex = atoi(optarg);
+			break;
+		case 'a':
+			vip.family = parse_ipstr(optarg, vip.daddr.v6);
+			if (vip.family == AF_UNSPEC)
+				return 1;
+			break;
+		case 'p':
+			if (parse_ports(optarg, &min_port, &max_port))
+				return 1;
+			break;
+		case 'P':
+			vip.protocol = atoi(optarg);
+			break;
+		case 's':
+		case 'd':
+			if (opt == 's')
+				v6 = tnl.saddr.v6;
+			else
+				v6 = tnl.daddr.v6;
+
+			family = parse_ipstr(optarg, v6);
+			if (family == AF_UNSPEC)
+				return 1;
+			if (tnl.family == AF_UNSPEC) {
+				tnl.family = family;
+			} else if (tnl.family != family) {
+				fprintf(stderr,
+					"The IP version of the src and dst addresses used in the IP encapsulation does not match\n");
+				return 1;
+			}
+			break;
+		case 'm':
+			if (!ether_aton_r(optarg,
+					  (struct ether_addr *)tnl.dmac)) {
+				fprintf(stderr, "Invalid mac address:%s\n",
+					optarg);
+				return 1;
+			}
+			break;
+		case 'T':
+			kill_after_s = atoi(optarg);
+			break;
+		default:
+			usage(argv[0]);
+			return 1;
+		}
+		opt_flags[opt] = 0;
+	}
+
+	for (i = 0; i < strlen(optstr); i++) {
+		if (opt_flags[(unsigned int)optstr[i]]) {
+			fprintf(stderr, "Missing argument -%c\n", optstr[i]);
+			usage(argv[0]);
+			return 1;
+		}
+	}
+
+	signal(SIGINT, int_exit);
+
+	while (min_port <= max_port) {
+		vip.dport = htons(min_port++);
+		if (bpf_update_elem(map_fd[1], &vip, &tnl, BPF_NOEXIST)) {
+			perror("bpf_update_elem(&vip2tnl)");
+			return 1;
+		}
+	}
+
+	if (set_link_xdp_fd(ifindex, prog_fd[0]) < 0) {
+		printf("link set xdp fd failed\n");
+		return 1;
+	}
+
+	poll_stats(kill_after_s);
+
+	set_link_xdp_fd(ifindex, -1);
+
+	return 0;
+}
-- 
2.5.1

^ permalink raw reply related

* [PATCH v3 net-next 1/4] bpf: xdp: Allow head adjustment in XDP prog
From: Martin KaFai Lau @ 2016-12-07  5:31 UTC (permalink / raw)
  To: netdev
  Cc: Alexei Starovoitov, Brenden Blanco, Daniel Borkmann, David Miller,
	Jakub Kicinski, Jesper Dangaard Brouer, John Fastabend,
	Saeed Mahameed, Tariq Toukan, Kernel Team
In-Reply-To: <1481088714-54512-1-git-send-email-kafai@fb.com>

This patch allows XDP prog to extend/remove the packet
data at the head (like adding or removing header).  It is
done by adding a new XDP helper bpf_xdp_adjust_head().

It also renames bpf_helper_changes_skb_data() to
bpf_helper_changes_pkt_data() to better reflect
that XDP prog does not work on skb.

To avoid breaking unsupported drivers, this patch
also does the needed checking before setting
the xdp_prog fd to the device.  It is done by
1) Adding a XDP_QUERY_FEATURES command
2) Adding one "xdp_adjust_head" bit to bpf_prog

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 arch/powerpc/net/bpf_jit_comp64.c                  |  4 ++--
 arch/s390/net/bpf_jit_comp.c                       |  2 +-
 arch/x86/net/bpf_jit_comp.c                        |  2 +-
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c     |  3 +++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  3 +++
 .../net/ethernet/netronome/nfp/nfp_net_common.c    |  3 +++
 drivers/net/ethernet/qlogic/qede/qede_main.c       |  3 +++
 include/linux/filter.h                             |  6 +++--
 include/linux/netdevice.h                          | 12 ++++++++++
 include/uapi/linux/bpf.h                           | 11 ++++++++-
 kernel/bpf/core.c                                  |  2 +-
 kernel/bpf/syscall.c                               |  2 ++
 kernel/bpf/verifier.c                              |  2 +-
 net/core/dev.c                                     |  9 +++++++
 net/core/filter.c                                  | 28 ++++++++++++++++++++--
 15 files changed, 81 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 0fe98a567125..73a5cf18fd84 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -766,7 +766,7 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image,
 			func = (u8 *) __bpf_call_base + imm;
 
 			/* Save skb pointer if we need to re-cache skb data */
-			if (bpf_helper_changes_skb_data(func))
+			if (bpf_helper_changes_pkt_data(func))
 				PPC_BPF_STL(3, 1, bpf_jit_stack_local(ctx));
 
 			bpf_jit_emit_func_call(image, ctx, (u64)func);
@@ -775,7 +775,7 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image,
 			PPC_MR(b2p[BPF_REG_0], 3);
 
 			/* refresh skb cache */
-			if (bpf_helper_changes_skb_data(func)) {
+			if (bpf_helper_changes_pkt_data(func)) {
 				/* reload skb pointer to r3 */
 				PPC_BPF_LL(3, 1, bpf_jit_stack_local(ctx));
 				bpf_jit_emit_skb_loads(image, ctx);
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index bee281f3163d..167b31b186c1 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -981,7 +981,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
 		EMIT2(0x0d00, REG_14, REG_W1);
 		/* lgr %b0,%r2: load return value into %b0 */
 		EMIT4(0xb9040000, BPF_REG_0, REG_2);
-		if (bpf_helper_changes_skb_data((void *)func)) {
+		if (bpf_helper_changes_pkt_data((void *)func)) {
 			jit->seen |= SEEN_SKB_CHANGE;
 			/* lg %b1,ST_OFF_SKBP(%r15) */
 			EMIT6_DISP_LH(0xe3000000, 0x0004, BPF_REG_1, REG_0,
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index fe04a04dab8e..e76d1af60f7a 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -853,7 +853,7 @@ xadd:			if (is_imm8(insn->off))
 			func = (u8 *) __bpf_call_base + imm32;
 			jmp_offset = func - (image + addrs[i]);
 			if (seen_ld_abs) {
-				reload_skb_data = bpf_helper_changes_skb_data(func);
+				reload_skb_data = bpf_helper_changes_pkt_data(func);
 				if (reload_skb_data) {
 					EMIT1(0x57); /* push %rdi */
 					jmp_offset += 22; /* pop, mov, sub, mov */
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 49a81f1fc1d6..6261157f444e 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2794,6 +2794,9 @@ static int mlx4_xdp(struct net_device *dev, struct netdev_xdp *xdp)
 	case XDP_QUERY_PROG:
 		xdp->prog_attached = mlx4_xdp_attached(dev);
 		return 0;
+	case XDP_QUERY_FEATURES:
+		xdp->features = 0;
+		return 0;
 	default:
 		return -EINVAL;
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 9def5cc378a3..f7a6b6b56d30 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3262,6 +3262,9 @@ static int mlx5e_xdp(struct net_device *dev, struct netdev_xdp *xdp)
 	case XDP_QUERY_PROG:
 		xdp->prog_attached = mlx5e_xdp_attached(dev);
 		return 0;
+	case XDP_QUERY_FEATURES:
+		xdp->features = 0;
+		return 0;
 	default:
 		return -EINVAL;
 	}
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 00d9a03be31d..89c95bb91503 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -2981,6 +2981,9 @@ static int nfp_net_xdp(struct net_device *netdev, struct netdev_xdp *xdp)
 	case XDP_QUERY_PROG:
 		xdp->prog_attached = !!nn->xdp_prog;
 		return 0;
+	case XDP_QUERY_FEATURES:
+		xdp->features = 0;
+		return 0;
 	default:
 		return -EINVAL;
 	}
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index cf1dd1436d93..d52dd83ae8a1 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -2525,6 +2525,9 @@ static int qede_xdp(struct net_device *dev, struct netdev_xdp *xdp)
 	case XDP_QUERY_PROG:
 		xdp->prog_attached = !!edev->xdp_prog;
 		return 0;
+	case XDP_QUERY_FEATURES:
+		xdp->features = 0;
+		return 0;
 	default:
 		return -EINVAL;
 	}
diff --git a/include/linux/filter.h b/include/linux/filter.h
index f078d2b1cff6..6a1658308612 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -406,7 +406,8 @@ struct bpf_prog {
 	u16			jited:1,	/* Is our filter JIT'ed? */
 				gpl_compatible:1, /* Is filter GPL compatible? */
 				cb_access:1,	/* Is control block accessed? */
-				dst_needed:1;	/* Do we need dst entry? */
+				dst_needed:1,	/* Do we need dst entry? */
+				xdp_adjust_head:1; /* Adjusting pkt head? */
 	kmemcheck_bitfield_end(meta);
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	u32			len;		/* Number of filter blocks */
@@ -440,6 +441,7 @@ struct bpf_skb_data_end {
 struct xdp_buff {
 	void *data;
 	void *data_end;
+	void *data_hard_start;
 };
 
 /* compute the linear packet data range [data, data_end) which
@@ -595,7 +597,7 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
 u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
-bool bpf_helper_changes_skb_data(void *func);
+bool bpf_helper_changes_pkt_data(void *func);
 
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 				       const struct bpf_insn *patch, u32 len);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1ff5ea6e1221..786ad7c67215 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -30,6 +30,7 @@
 #include <linux/delay.h>
 #include <linux/atomic.h>
 #include <linux/prefetch.h>
+#include <linux/bitops.h>
 #include <asm/cache.h>
 #include <asm/byteorder.h>
 
@@ -805,6 +806,13 @@ struct tc_to_netdev {
 	bool egress_dev;
 };
 
+/* Driver must allow a XDP prog to extend header by
+ * up to XDP_PACKET_HEADROOM.  It must also fill out
+ * the data_hard_start value in struct xdp_buff
+ * before calling out the xdp_prog.
+ */
+#define XDP_F_ADJUST_HEAD	BIT(0)
+
 /* These structures hold the attributes of xdp state that are being passed
  * to the netdevice through the xdp op.
  */
@@ -821,6 +829,8 @@ enum xdp_netdev_command {
 	 * return true if a program is currently attached and running.
 	 */
 	XDP_QUERY_PROG,
+	/* Check what XDP features are supported by a device */
+	XDP_QUERY_FEATURES,
 };
 
 struct netdev_xdp {
@@ -830,6 +840,8 @@ struct netdev_xdp {
 		struct bpf_prog *prog;
 		/* XDP_QUERY_PROG */
 		bool prog_attached;
+		/* XDP_QUERY_FEATURES */
+		u32 features;
 	};
 };
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6123d9b8e828..0eb0e87dbe9f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -424,6 +424,12 @@ union bpf_attr {
  *     @len: length of header to be pushed in front
  *     @flags: Flags (unused for now)
  *     Return: 0 on success or negative error
+ *
+ * int bpf_xdp_adjust_head(xdp_md, delta)
+ *     Adjust the xdp_md.data by delta
+ *     @xdp_md: pointer to xdp_md
+ *     @delta: An positive/negative integer to be added to xdp_md.data
+ *     Return: 0 on success or negative on error
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -469,7 +475,8 @@ union bpf_attr {
 	FN(csum_update),		\
 	FN(set_hash_invalid),		\
 	FN(get_numa_node_id),		\
-	FN(skb_change_head),
+	FN(skb_change_head),		\
+	FN(xdp_adjust_head),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -576,6 +583,8 @@ struct bpf_sock {
 	__u32 protocol;
 };
 
+#define XDP_PACKET_HEADROOM 256
+
 /* User return codes for XDP prog type.
  * A valid XDP program must return one of these defined values. All other
  * return codes are reserved for future use. Unknown return codes will result
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index bdcc9f4ba767..83e0d153b0b4 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1143,7 +1143,7 @@ struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog)
 	return prog;
 }
 
-bool __weak bpf_helper_changes_skb_data(void *func)
+bool __weak bpf_helper_changes_pkt_data(void *func)
 {
 	return false;
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c0d2b423ce93..add93ecd7e69 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -579,6 +579,8 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
 				prog->dst_needed = 1;
 			if (insn->imm == BPF_FUNC_get_prandom_u32)
 				bpf_user_rnd_init_once();
+			if (insn->imm == BPF_FUNC_xdp_adjust_head)
+				prog->xdp_adjust_head = 1;
 			if (insn->imm == BPF_FUNC_tail_call) {
 				/* mark bpf_tail_call as different opcode
 				 * to avoid conditional branch in
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index cb37339ca0da..f5fa326518c9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1216,7 +1216,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id)
 		return -EINVAL;
 	}
 
-	changes_data = bpf_helper_changes_skb_data(fn->func);
+	changes_data = bpf_helper_changes_pkt_data(fn->func);
 
 	memset(&meta, 0, sizeof(meta));
 	meta.pkt_access = fn->pkt_access;
diff --git a/net/core/dev.c b/net/core/dev.c
index bffb5253e778..90696f7e6b59 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6722,6 +6722,15 @@ int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags)
 		prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
 		if (IS_ERR(prog))
 			return PTR_ERR(prog);
+
+		xdp.command = XDP_QUERY_FEATURES;
+		err = ops->ndo_xdp(dev, &xdp);
+		if (err)
+			return err;
+
+		if (prog->xdp_adjust_head &&
+		    !(xdp.features & XDP_F_ADJUST_HEAD))
+			return -ENOTSUPP;
 	}
 
 	memset(&xdp, 0, sizeof(xdp));
diff --git a/net/core/filter.c b/net/core/filter.c
index b751202e12f8..b1461708a977 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2234,7 +2234,28 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
-bool bpf_helper_changes_skb_data(void *func)
+BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
+{
+	void *data = xdp->data + offset;
+
+	if (unlikely(data < xdp->data_hard_start ||
+		     data > xdp->data_end - ETH_HLEN))
+		return -EINVAL;
+
+	xdp->data = data;
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
+	.func		= bpf_xdp_adjust_head,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+
+bool bpf_helper_changes_pkt_data(void *func)
 {
 	if (func == bpf_skb_vlan_push ||
 	    func == bpf_skb_vlan_pop ||
@@ -2244,7 +2265,8 @@ bool bpf_helper_changes_skb_data(void *func)
 	    func == bpf_skb_change_tail ||
 	    func == bpf_skb_pull_data ||
 	    func == bpf_l3_csum_replace ||
-	    func == bpf_l4_csum_replace)
+	    func == bpf_l4_csum_replace ||
+	    func == bpf_xdp_adjust_head)
 		return true;
 
 	return false;
@@ -2670,6 +2692,8 @@ xdp_func_proto(enum bpf_func_id func_id)
 		return &bpf_xdp_event_output_proto;
 	case BPF_FUNC_get_smp_processor_id:
 		return &bpf_get_smp_processor_id_proto;
+	case BPF_FUNC_xdp_adjust_head:
+		return &bpf_xdp_adjust_head_proto;
 	default:
 		return sk_filter_func_proto(func_id);
 	}
-- 
2.5.1

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox