Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH net-next] net: mvpp2: phylink support
From: Antoine Tenart @ 2017-09-25 13:06 UTC (permalink / raw)
  To: Russell King - ARM Linux
  Cc: Antoine Tenart, davem, andrew, gregory.clement, thomas.petazzoni,
	miquel.raynal, nadavh, linux-kernel, mw, stefanc, netdev
In-Reply-To: <20170925121343.GO20805@n2100.armlinux.org.uk>

On Mon, Sep 25, 2017 at 01:13:43PM +0100, Russell King - ARM Linux wrote:
> On Mon, Sep 25, 2017 at 01:53:03PM +0200, Antoine Tenart wrote:
> > On Mon, Sep 25, 2017 at 11:45:32AM +0100, Russell King - ARM Linux wrote:
> > > Can you describe what the GoP link IRQ is doing please?
> > 
> > In cases where there is no PHY connected to the MAC and no SFP cage is
> > used. One example is when a SOHO switch is connected directly to a
> > serdes lane. In such cases we still need to have a minimal link
> > management. The GoP link interrupt helps doing so as it raises when the
> > serdes is in sync and AN succeeded.
> 
> Isn't this just like a fixed link scenario, or an in-band
> autonegotiation scenario (both of which phylink supports natively)?
> 
> The situation on Clearfog with the 88E6176 switch is pretty similar -
> a switch connected directly via serdes to the MAC.  Currently, we
> configure stuff there as a fixed link, but in actual fact the 88E6176
> is configured to run the CPU facing port in 1000base-X mode, and with
> appropriate tweaks, switching phylink to 1000base-X mode also works.

Hmm, I think you're right, we should be able to represent the link
between the MAC and the switch as a fixed link. And when it's not fixed,
it could be done with in-band AN. I cannot test this myself, but I've
asked someone who can to do so.

Antoine

-- 
Antoine Ténart, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

^ permalink raw reply

* [PATCH net v2 3/3] net: mvpp2: do not select the internal source clock
From: Antoine Tenart @ 2017-09-25 12:59 UTC (permalink / raw)
  To: davem
  Cc: Antoine Tenart, andrew, gregory.clement, thomas.petazzoni,
	miquel.raynal, nadavh, linux, linux-kernel, mw, stefanc, netdev
In-Reply-To: <20170925125948.13507-1-antoine.tenart@free-electrons.com>

This patch stops the internal MAC Tx clock from being enabled, as the
internal clock isn't used. The definition used for the bit controlling
this behaviour is also renamed, as it was wrongly named (bit 4 of
GMAC_CTRL_2_REG).

Fixes: 3919357fb0bb ("net: mvpp2: initialize the GMAC when using a port")
Signed-off-by: Antoine Tenart <antoine.tenart@free-electrons.com>
---
 drivers/net/ethernet/marvell/mvpp2.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
index b2f99df81e9c..161055564720 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -333,7 +333,7 @@
 #define     MVPP2_GMAC_INBAND_AN_MASK		BIT(0)
 #define     MVPP2_GMAC_FLOW_CTRL_MASK		GENMASK(2, 1)
 #define     MVPP2_GMAC_PCS_ENABLE_MASK		BIT(3)
-#define     MVPP2_GMAC_PORT_RGMII_MASK		BIT(4)
+#define     MVPP2_GMAC_INTERNAL_CLK_MASK	BIT(4)
 #define     MVPP2_GMAC_DISABLE_PADDING		BIT(5)
 #define     MVPP2_GMAC_PORT_RESET_MASK		BIT(6)
 #define MVPP2_GMAC_AUTONEG_CONFIG		0xc
@@ -4599,7 +4599,6 @@ static void mvpp2_port_mii_gmac_configure(struct mvpp2_port *port)
 	        val |= MVPP2_GMAC_INBAND_AN_MASK | MVPP2_GMAC_PCS_ENABLE_MASK;
 	} else if (phy_interface_mode_is_rgmii(port->phy_interface)) {
 		val &= ~MVPP2_GMAC_PCS_ENABLE_MASK;
-		val |= MVPP2_GMAC_PORT_RGMII_MASK;
 	}
 	writel(val, port->base + MVPP2_GMAC_CTRL_2_REG);
 
-- 
2.13.5

^ permalink raw reply related

* [PATCH net v2 2/3] net: mvpp2: fix port list indexing
From: Antoine Tenart @ 2017-09-25 12:59 UTC (permalink / raw)
  To: davem
  Cc: Yan Markman, andrew, gregory.clement, thomas.petazzoni,
	miquel.raynal, nadavh, linux, linux-kernel, mw, stefanc, netdev,
	Antoine Tenart
In-Reply-To: <20170925125948.13507-1-antoine.tenart@free-electrons.com>

From: Yan Markman <ymarkman@marvell.com>

The private port_list array has a list of pointers to mvpp2_port
instances. This list is allocated given the number of ports enabled in
the device tree, but the pointers are set using the port-id property. If
only a single port is enabled, the port_list array will be of size 1, but
when registering the port, if its id is not 0 the driver will crash.
Other crashes were encountered in various situations.

This fixes the issue by indexing the array with a sequential counter
instead of the value of the port-id property.

Fixes: 3f518509dedc ("ethernet: Add new driver for Marvell Armada 375 network unit")
Signed-off-by: Antoine Tenart <antoine.tenart@free-electrons.com>
---
 drivers/net/ethernet/marvell/mvpp2.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
index da04939a2748..b2f99df81e9c 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -7504,7 +7504,7 @@ static void mvpp2_port_copy_mac_addr(struct net_device *dev, struct mvpp2 *priv,
 /* Ports initialization */
 static int mvpp2_port_probe(struct platform_device *pdev,
 			    struct device_node *port_node,
-			    struct mvpp2 *priv)
+			    struct mvpp2 *priv, int index)
 {
 	struct device_node *phy_node;
 	struct phy *comphy;
@@ -7678,7 +7678,7 @@ static int mvpp2_port_probe(struct platform_device *pdev,
 	}
 	netdev_info(dev, "Using %s mac address %pM\n", mac_from, dev->dev_addr);
 
-	priv->port_list[id] = port;
+	priv->port_list[index] = port;
 	return 0;
 
 err_free_port_pcpu:
@@ -8013,10 +8013,12 @@ static int mvpp2_probe(struct platform_device *pdev)
 	}
 
 	/* Initialize ports */
+	i = 0;
 	for_each_available_child_of_node(dn, port_node) {
-		err = mvpp2_port_probe(pdev, port_node, priv);
+		err = mvpp2_port_probe(pdev, port_node, priv, i);
 		if (err < 0)
 			goto err_mg_clk;
+		i++;
 	}
 
 	platform_set_drvdata(pdev, priv);
-- 
2.13.5

^ permalink raw reply related

* [PATCH net v2 1/3] net: mvpp2: fix parsing fragmentation detection
From: Antoine Tenart @ 2017-09-25 12:59 UTC (permalink / raw)
  To: davem
  Cc: Stefan Chulski, andrew, gregory.clement, thomas.petazzoni,
	miquel.raynal, nadavh, linux, linux-kernel, mw, netdev,
	Antoine Tenart
In-Reply-To: <20170925125948.13507-1-antoine.tenart@free-electrons.com>

From: Stefan Chulski <stefanc@marvell.com>

Parsing fragmentation detection failed due to wrongly configured
parser TCAM entries. Some traffic was marked as fragmented in the RX
descriptor, even though it wasn't IP fragmented. The hardware also failed
to calculate checksums, which led to the use of software checksums and
caused performance degradation.

Fixes: 3f518509dedc ("ethernet: Add new driver for Marvell Armada 375 network unit")
Signed-off-by: Antoine Tenart <antoine.tenart@free-electrons.com>
---
 drivers/net/ethernet/marvell/mvpp2.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
index dd0ee2691c86..da04939a2748 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -676,6 +676,7 @@ enum mvpp2_tag_type {
 #define MVPP2_PRS_RI_L3_MCAST			BIT(15)
 #define MVPP2_PRS_RI_L3_BCAST			(BIT(15) | BIT(16))
 #define MVPP2_PRS_RI_IP_FRAG_MASK		0x20000
+#define MVPP2_PRS_RI_IP_FRAG_TRUE		BIT(17)
 #define MVPP2_PRS_RI_UDF3_MASK			0x300000
 #define MVPP2_PRS_RI_UDF3_RX_SPECIAL		BIT(21)
 #define MVPP2_PRS_RI_L4_PROTO_MASK		0x1c00000
@@ -2315,7 +2316,7 @@ static int mvpp2_prs_ip4_proto(struct mvpp2 *priv, unsigned short proto,
 	    (proto != IPPROTO_IGMP))
 		return -EINVAL;
 
-	/* Fragmented packet */
+	/* Not fragmented packet */
 	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
 					MVPP2_PE_LAST_FREE_TID);
 	if (tid < 0)
@@ -2334,8 +2335,12 @@ static int mvpp2_prs_ip4_proto(struct mvpp2 *priv, unsigned short proto,
 				  MVPP2_PRS_SRAM_OP_SEL_UDF_ADD);
 	mvpp2_prs_sram_ai_update(&pe, MVPP2_PRS_IPV4_DIP_AI_BIT,
 				 MVPP2_PRS_IPV4_DIP_AI_BIT);
-	mvpp2_prs_sram_ri_update(&pe, ri | MVPP2_PRS_RI_IP_FRAG_MASK,
-				 ri_mask | MVPP2_PRS_RI_IP_FRAG_MASK);
+	mvpp2_prs_sram_ri_update(&pe, ri, ri_mask | MVPP2_PRS_RI_IP_FRAG_MASK);
+
+	mvpp2_prs_tcam_data_byte_set(&pe, 2, 0x00,
+				     MVPP2_PRS_TCAM_PROTO_MASK_L);
+	mvpp2_prs_tcam_data_byte_set(&pe, 3, 0x00,
+				     MVPP2_PRS_TCAM_PROTO_MASK);
 
 	mvpp2_prs_tcam_data_byte_set(&pe, 5, proto, MVPP2_PRS_TCAM_PROTO_MASK);
 	mvpp2_prs_tcam_ai_update(&pe, 0, MVPP2_PRS_IPV4_DIP_AI_BIT);
@@ -2346,7 +2351,7 @@ static int mvpp2_prs_ip4_proto(struct mvpp2 *priv, unsigned short proto,
 	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP4);
 	mvpp2_prs_hw_write(priv, &pe);
 
-	/* Not fragmented packet */
+	/* Fragmented packet */
 	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
 					MVPP2_PE_LAST_FREE_TID);
 	if (tid < 0)
@@ -2358,8 +2363,11 @@ static int mvpp2_prs_ip4_proto(struct mvpp2 *priv, unsigned short proto,
 	pe.sram.word[MVPP2_PRS_SRAM_RI_CTRL_WORD] = 0x0;
 	mvpp2_prs_sram_ri_update(&pe, ri, ri_mask);
 
-	mvpp2_prs_tcam_data_byte_set(&pe, 2, 0x00, MVPP2_PRS_TCAM_PROTO_MASK_L);
-	mvpp2_prs_tcam_data_byte_set(&pe, 3, 0x00, MVPP2_PRS_TCAM_PROTO_MASK);
+	mvpp2_prs_sram_ri_update(&pe, ri | MVPP2_PRS_RI_IP_FRAG_TRUE,
+				 ri_mask | MVPP2_PRS_RI_IP_FRAG_MASK);
+
+	mvpp2_prs_tcam_data_byte_set(&pe, 2, 0x00, 0x0);
+	mvpp2_prs_tcam_data_byte_set(&pe, 3, 0x00, 0x0);
 
 	/* Update shadow table and hw entry */
 	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP4);
-- 
2.13.5

^ permalink raw reply related

* [PATCH net v2 0/3] net: mvpp2: various fixes
From: Antoine Tenart @ 2017-09-25 12:59 UTC (permalink / raw)
  To: davem
  Cc: Antoine Tenart, andrew, gregory.clement, thomas.petazzoni,
	miquel.raynal, nadavh, linux, linux-kernel, mw, stefanc, netdev

Hi all,

This series contains 3 fixes for the Marvell PPv2 driver.

Thanks!
Antoine

Since v1:
  - Removed one patch about dma masks as it would need a better fix.
  - Added one fix about the MAC Tx clock source selection.

Antoine Tenart (1):
  net: mvpp2: do not select the internal source clock

Stefan Chulski (1):
  net: mvpp2: fix parsing fragmentation detection

Yan Markman (1):
  net: mvpp2: fix port list indexing

 drivers/net/ethernet/marvell/mvpp2.c | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

-- 
2.13.5

^ permalink raw reply

* Re: [PATCH net 1/3] net: mvpp2: fix the dma_mask and coherent_dma_mask settings for PPv2.2
From: Antoine Tenart @ 2017-09-25 12:40 UTC (permalink / raw)
  To: David Miller
  Cc: antoine.tenart, andrew, gregory.clement, thomas.petazzoni,
	miquel.raynal, nadavh, linux, linux-kernel, mw, stefanc, netdev
In-Reply-To: <20170921.100718.386894052177530033.davem@davemloft.net>

On Thu, Sep 21, 2017 at 10:07:18AM -0700, David Miller wrote:
> From: Antoine Tenart <antoine.tenart@free-electrons.com>
> Date: Thu, 21 Sep 2017 16:24:13 +0200
> 
> > That's also the default when the platform does not allocate dma_mask.
> 
> That's the problem that needs to be fixed then.

OK, I'll drop this patch until I find a proper solution.

Thanks,
Antoine

-- 
Antoine Ténart, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

^ permalink raw reply

* Re: [PATCH net v2] l2tp: fix race condition in l2tp_tunnel_delete
From: Guillaume Nault @ 2017-09-25 12:33 UTC (permalink / raw)
  To: Sabrina Dubroca; +Cc: netdev, Xin Long, Tom Parkin
In-Reply-To: <20170922161624.GA31500@bistromath.localdomain>

On Fri, Sep 22, 2017 at 06:16:24PM +0200, Sabrina Dubroca wrote:
> 2017-09-19, 18:43:37 +0200, Guillaume Nault wrote:
> > On Tue, Sep 19, 2017 at 03:40:40PM +0200, Sabrina Dubroca wrote:
> > > If we try to delete the same tunnel twice, the first delete operation
> > > does a lookup (l2tp_tunnel_get), finds the tunnel, calls
> > > l2tp_tunnel_delete, which queues it for deletion by
> > > l2tp_tunnel_del_work.
> > > 
> > > The second delete operation also finds the tunnel and calls
> > > l2tp_tunnel_delete. If the workqueue has already fired and started
> > > running l2tp_tunnel_del_work, then l2tp_tunnel_delete will queue the
> > > same tunnel a second time, and try to free the socket again.
> > > 
> > > Add a dead flag to prevent firing the workqueue twice. Then we can
> > > remove the check of queue_work's result that was meant to prevent that
> > > race but doesn't.
> > > 
> > > Also check the flag in the tunnel lookup functions, to avoid returning a
> > > tunnel that is already scheduled for destruction.
> > > 
> > > Reproducer:
> > > 
> > >     ip l2tp add tunnel tunnel_id 3000 peer_tunnel_id 4000 local 192.168.0.2 remote 192.168.0.1 encap udp udp_sport 5000 udp_dport 6000
> > >     ip l2tp add session name l2tp1 tunnel_id 3000 session_id 1000 peer_session_id 2000
> > >     ip link set l2tp1 up
> > >     ip l2tp del tunnel tunnel_id 3000
> > >     ip l2tp del tunnel tunnel_id 3000
> > > 
> > > Fixes: f8ccac0e4493 ("l2tp: put tunnel socket release on a workqueue")
> > > Reported-by: Jianlin Shi <jishi@redhat.com>
> > > Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
> > > ---
> > > v2: as Tom Parkin explained, we can't remove the tunnel from the
> > >     per-net list from netlink. v2 uses only a dead flag, and adds
> > >     corresponding checks during lookups
> > > 
> > >  net/l2tp/l2tp_core.c | 18 +++++++++---------
> > >  net/l2tp/l2tp_core.h |  5 ++++-
> > >  2 files changed, 13 insertions(+), 10 deletions(-)
> > > 
> > > diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
> > > index ee485df73ccd..3891f0260f2b 100644
> > > --- a/net/l2tp/l2tp_core.c
> > > +++ b/net/l2tp/l2tp_core.c
> > > @@ -203,7 +203,8 @@ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id)
> > >  
> > >  	rcu_read_lock_bh();
> > >  	list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) {
> > > -		if (tunnel->tunnel_id == tunnel_id) {
> > > +		if (tunnel->tunnel_id == tunnel_id &&
> > > +		    !test_bit(0, &tunnel->dead)) {
> > >  			l2tp_tunnel_inc_refcount(tunnel);
> > >  			rcu_read_unlock_bh();
> > >  
> > > @@ -390,7 +391,8 @@ struct l2tp_tunnel *l2tp_tunnel_find(const struct net *net, u32 tunnel_id)
> > >  
> > >  	rcu_read_lock_bh();
> > >  	list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) {
> > > -		if (tunnel->tunnel_id == tunnel_id) {
> > > +		if (tunnel->tunnel_id == tunnel_id &&
> > > +		    !test_bit(0, &tunnel->dead)) {
> > >  			rcu_read_unlock_bh();
> > >  			return tunnel;
> > >  		}
> > > @@ -409,7 +411,7 @@ struct l2tp_tunnel *l2tp_tunnel_find_nth(const struct net *net, int nth)
> > >  
> > >  	rcu_read_lock_bh();
> > >  	list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) {
> > > -		if (++count > nth) {
> > > +		if (++count > nth && !test_bit(0, &tunnel->dead)) {
> > >  			rcu_read_unlock_bh();
> > >  			return tunnel;
> > >  		}
> > > 
> > I don't get why you're checking the dead flag in l2tp_tunnel_{get,find}*().
> > Since it can be set concurrently right after test_bit(), it doesn't
> > protect the caller from getting a tunnel that is being removed by
> > l2tp_tunnel_delete().
> > Or have I missed something?
> 
> You're right.
> 
> Then I would try going back to essentially v1, but keeping code to
> remove the tunnel from the list in l2tp_tunnel_destruct if it's not
> dead yet.
> 
> What do you think?
> 
My main question was more about why do you feel the need for preventing
other parts of the code from accessing dead tunnels? The TOCTOU issue
was just there to illustrate the fact that it couldn't be implemented
this easily.

My reasoning is that a tunnel may already be in use when
l2tp_tunnel_delete() is called. So any function using tunnels must
already work properly on dying tunnels, because l2tp_tunnel_delete()
might kill them concurrently. Getting a dying tunnel from
l2tp_tunnel_get() or having the tunnel killed by l2tp_tunnel_delete()
while in use should make no difference, as long as the user properly
holds a reference. Of course we have the problem of l2tp_tunnel_find*()
which is racy wrt. tunnel reference counting, but I'm going to continue
converting these users to the safe l2tp_tunnel_get() lookup function.

Of course, making dying tunnels inaccessible makes sense but, unless
I've missed something, it looks more like cleanup/optimisation than bug
fixing.

So what about using your v2 patch, but without the ->dead flag test in
l2tp_tunnel_get() and l2tp_tunnel_find*()?


Now for some more context, I think tunnel creation and deletion will
need to be reworked. Tunnels should be removed from the pernet list by
l2tp_udp_encap_destroy() for L2TP over UDP, and by
l2tp_ip_destroy_sock() or l2tp_ip6_destroy_sock() for L2TP over IP.

Then we could stop hooking on ->sk_destruct(), because the
l2tp_tunnel_closeall() call found in l2tp_tunnel_destruct() is already
useless (if it actually had to remove sessions, it could sleep while in
atomic context, because ->sk_destruct() is now invoked through
call_rcu() for UDP sockets).

And we should break the tight coupling of the l2tp_tunnel structure
with the tunnel socket. This situation, where they dereference one
another without any protection, complicates the deletion process.
Protecting the socket and the tunnel's structure pointers with RCU
would certainly allow for simpler deletion code.

All in all, your last patch makes a lot of sense in this bigger
picture, but for now I'd rather go for simply preventing queueing
l2tp_tunnel_del_work() twice. Unless required for accurately fixing the
current issue, I think removing tunnels in l2tp_tunnel_delete() would fit
better in a different series.

> 
> -------- 8< --------
> 
> diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
> index ee485df73ccd..63cd1f30ac7d 100644
> --- a/net/l2tp/l2tp_core.c
> +++ b/net/l2tp/l2tp_core.c
> @@ -1234,6 +1234,23 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len
>  }
>  EXPORT_SYMBOL_GPL(l2tp_xmit_skb);
>  
> +static bool __l2tp_tunnel_delete(struct l2tp_tunnel *tunnel)
> +{
> +	struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net);
> +	bool ret = false;
> +
> +	spin_lock_bh(&pn->l2tp_tunnel_list_lock);
> +	if (!tunnel->dead) {
> +		tunnel->dead = 1;
> +		list_del_rcu(&tunnel->list);
> +		atomic_dec(&l2tp_tunnel_count);
> +		ret = true;
> +	}
> +	spin_unlock_bh(&pn->l2tp_tunnel_list_lock);
> +
> +	return ret;
> +}
> +
>  /*****************************************************************************
>   * Tinnel and session create/destroy.
>   *****************************************************************************/
> @@ -1245,7 +1262,6 @@ EXPORT_SYMBOL_GPL(l2tp_xmit_skb);
>  static void l2tp_tunnel_destruct(struct sock *sk)
>  {
>  	struct l2tp_tunnel *tunnel = l2tp_tunnel(sk);
> -	struct l2tp_net *pn;
>  
>  	if (tunnel == NULL)
>  		goto end;
> @@ -1270,11 +1286,7 @@ static void l2tp_tunnel_destruct(struct sock *sk)
>  	sk->sk_user_data = NULL;
>  
>  	/* Remove the tunnel struct from the tunnel list */
> -	pn = l2tp_pernet(tunnel->l2tp_net);
> -	spin_lock_bh(&pn->l2tp_tunnel_list_lock);
> -	list_del_rcu(&tunnel->list);
> -	spin_unlock_bh(&pn->l2tp_tunnel_list_lock);
> -	atomic_dec(&l2tp_tunnel_count);
> +	__l2tp_tunnel_delete(tunnel);
>  
>  	l2tp_tunnel_closeall(tunnel);
>  
> @@ -1685,14 +1697,12 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_create);
>  
>  /* This function is used by the netlink TUNNEL_DELETE command.
>   */
> -int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel)
> +void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel)
>  {
> -	l2tp_tunnel_inc_refcount(tunnel);
> -	if (false == queue_work(l2tp_wq, &tunnel->del_work)) {
> -		l2tp_tunnel_dec_refcount(tunnel);
> -		return 1;
> +	if (__l2tp_tunnel_delete(tunnel)) {
> +		l2tp_tunnel_inc_refcount(tunnel);
> +		queue_work(l2tp_wq, &tunnel->del_work);
>  	}
> -	return 0;
>  }
>  EXPORT_SYMBOL_GPL(l2tp_tunnel_delete);
>  
> diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
> index a305e0c5925a..173e68bb8119 100644
> --- a/net/l2tp/l2tp_core.h
> +++ b/net/l2tp/l2tp_core.h
> @@ -160,6 +160,8 @@ struct l2tp_tunnel_cfg {
>  
>  struct l2tp_tunnel {
>  	int			magic;		/* Should be L2TP_TUNNEL_MAGIC */
> +	int			dead;
> +
>  	struct rcu_head rcu;
>  	rwlock_t		hlist_lock;	/* protect session_hlist */
>  	bool			acpt_newsess;	/* Indicates whether this
> @@ -254,7 +256,7 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id,
>  		       u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg,
>  		       struct l2tp_tunnel **tunnelp);
>  void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel);
> -int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel);
> +void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel);
>  struct l2tp_session *l2tp_session_create(int priv_size,
>  					 struct l2tp_tunnel *tunnel,
>  					 u32 session_id, u32 peer_session_id,
> 
> 
> -- 
> Sabrina

^ permalink raw reply

* Re: usb/wireless/rsi_91x: use-after-free write in __run_timers
From: Kalle Valo @ 2017-09-25 12:26 UTC (permalink / raw)
  To: Andrey Konovalov
  Cc: Amitkumar Karwar, Prameela Rani Garnepudi, Karun Eagalapati,
	linux-wireless, netdev, LKML, Dmitry Vyukov, Kostya Serebryany,
	syzkaller
In-Reply-To: <CAAeHK+y61FFKLpePKOhRjd=5QJEWRy9-pank64PuG+aKzafANw@mail.gmail.com>

Andrey Konovalov <andreyknvl@google.com> writes:

> On Mon, Sep 25, 2017 at 6:26 AM, Kalle Valo <kvalo@codeaurora.org> wrote:
>> Andrey Konovalov <andreyknvl@google.com> writes:
>>
>>> I've got the following report while fuzzing the kernel with syzkaller.
>>>
>>> On commit 6e80ecdddf4ea6f3cd84e83720f3d852e6624a68 (Sep 21).
>>>
>>> ==================================================================
>>> BUG: KASAN: use-after-free in __run_timers+0xc0e/0xd40
>>> Write of size 8 at addr ffff880069f701b8 by task swapper/0/0
>>>
>>> CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.14.0-rc1-42311-g6e80ecdddf4e #234
>>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
>>
>> [...]
>>
>>> Allocated by task 1845:
>>>  save_stack_trace+0x1b/0x20 arch/x86/kernel/stacktrace.c:59
>>>  save_stack+0x43/0xd0 mm/kasan/kasan.c:447
>>>  set_track mm/kasan/kasan.c:459
>>>  kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551
>>>  kmem_cache_alloc_trace+0x11e/0x2d0 mm/slub.c:2772
>>>  kmalloc ./include/linux/slab.h:493
>>>  kzalloc ./include/linux/slab.h:666
>>>  rsi_91x_init+0x98/0x510 drivers/net/wireless/rsi/rsi_91x_main.c:203
>>>  rsi_probe+0xb6/0x13b0 drivers/net/wireless/rsi/rsi_91x_usb.c:665
>>>  usb_probe_interface+0x35d/0x8e0 drivers/usb/core/driver.c:361
>>
>> I'm curious about your setup. Apparently you are running syzkaller on
>> QEMU but what I don't understand is how the rsi device comes into the
>> picture. Did you have a rsi usb device connected to the virtual machine
>> or what? Or does syzkaller do some kind of magic here?
>
> I use dummy_hcd and gadgetfs to connect random USB devices to the
> kernel from a userspace application. This happens inside a QEMU
> instance. This simplifies fuzzing, since everything is virtualized,
> but the found bugs can be triggered on a real machine by connecting a
> malicious USB device.

That's very cool, thanks for explaining the setup.

-- 
Kalle Valo

^ permalink raw reply

* Re: [PATCH net-next] net: mvpp2: phylink support
From: Russell King - ARM Linux @ 2017-09-25 12:13 UTC (permalink / raw)
  To: Antoine Tenart
  Cc: davem, andrew, gregory.clement, thomas.petazzoni, miquel.raynal,
	nadavh, linux-kernel, mw, stefanc, netdev
In-Reply-To: <20170925115303.GC19364@kwain>

On Mon, Sep 25, 2017 at 01:53:03PM +0200, Antoine Tenart wrote:
> On Mon, Sep 25, 2017 at 11:45:32AM +0100, Russell King - ARM Linux wrote:
> > Can you describe what the GoP link IRQ is doing please?
> 
> In cases where there is no PHY connected to the MAC and no SFP cage is
> used. One example is when a SOHO switch is connected directly to a
> serdes lane. In such cases we still need to have a minimal link
> management. The GoP link interrupt helps doing so as it raises when the
> serdes is in sync and AN succeeded.

Isn't this just like a fixed link scenario, or an in-band
autonegotiation scenario (both of which phylink supports natively)?

The situation on Clearfog with the 88E6176 switch is pretty similar -
a switch connected directly via serdes to the MAC.  Currently, we
configure stuff there as a fixed link, but in actual fact the 88E6176
is configured to run the CPU facing port in 1000base-X mode, and with
appropriate tweaks, switching phylink to 1000base-X mode also works.

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line in suburbia: sync at 8.8Mbps down 630kbps up
According to speedtest.net: 8.21Mbps down 510kbps up

^ permalink raw reply

* Re: [PATCH net-next] net: mvpp2: phylink support
From: Antoine Tenart @ 2017-09-25 11:53 UTC (permalink / raw)
  To: Russell King - ARM Linux
  Cc: Antoine Tenart, davem, andrew, gregory.clement, thomas.petazzoni,
	miquel.raynal, nadavh, linux-kernel, mw, stefanc, netdev
In-Reply-To: <20170925104532.GN20805@n2100.armlinux.org.uk>

On Mon, Sep 25, 2017 at 11:45:32AM +0100, Russell King - ARM Linux wrote:
> On Mon, Sep 25, 2017 at 11:55:14AM +0200, Antoine Tenart wrote:
> > On Fri, Sep 22, 2017 at 12:07:31PM +0100, Russell King - ARM Linux wrote:
> > > On Thu, Sep 21, 2017 at 03:45:22PM +0200, Antoine Tenart wrote:
> > > > Convert the PPv2 driver to use phylink, which models the MAC to PHY
> > > > link. The phylink support is made such a way the GoP link IRQ can still
> > > > be used: the two modes are incompatible and the GoP link IRQ will be
> > > > used if no PHY is described in the device tree. This is the same
> > > > behaviour as before.
> > > 
> > > This makes no sense.  The point of phylink is to be able to support SFP
> > > cages, and SFP cages do not have a PHY described in DT.  So, when you
> > > want to use phylink because of SFP, you can't, because if you omit
> > > the PHY the driver avoids using phylink.
> > 
> > Yes that's an issue. However we do need to support the GoP link IRQ
> > which is also needed in some cases where there is no PHY (and when
> > phylink cannot be used). What would you propose to differentiate those
> > two cases: no PHY using phylink, and no PHY using the GoP link IRQ?
> 
> Can you describe what the GoP link IRQ is doing please?

In cases where there is no PHY connected to the MAC and no SFP cage is
used. One example is when a SOHO switch is connected directly to a
serdes lane. In such cases we still need to have a minimal link
management. The GoP link interrupt helps doing so as it raises when the
serdes is in sync and AN succeeded.

I also wonder if this is needed when using passive cables?

Antoine

-- 
Antoine Ténart, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

^ permalink raw reply

* Re: [PATCH RESEND] wireless: iwlwifi: fix minor code style issues
From: Christoph Böhmwalder @ 2017-09-25 11:51 UTC (permalink / raw)
  To: Coelho, Luciano, linux-kernel@vger.kernel.org, trivial@kernel.org,
	Berg, Johannes, kvalo@codeaurora.org, netdev@vger.kernel.org,
	linux-wireless@vger.kernel.org, Grumbach, Emmanuel
In-Reply-To: <1506340012.3276.17.camel@intel.com>


[-- Attachment #1.1: Type: text/plain, Size: 306 bytes --]

> Why are you already resending this?

Sorry, I guess I was too impatient. I also messed up the spelling in a
"To:" line and forgot trivial@kernel.org the first time I sent it, so I
figured I'd just fix it in a resend.

I'll make sure to wait a little longer next time.

--
Regards,
Christoph


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply

* Re: [PATCH RESEND] wireless: iwlwifi: fix minor code style issues
From: Coelho, Luciano @ 2017-09-25 11:47 UTC (permalink / raw)
  To: linux-kernel@vger.kernel.org, trivial@kernel.org,
	christoph@boehmwalder.at, Berg, Johannes, kvalo@codeaurora.org,
	netdev@vger.kernel.org, linux-wireless@vger.kernel.org,
	Grumbach, Emmanuel
In-Reply-To: <912deca9-79b6-7a44-6859-dbe532d90fed@boehmwalder.at>

On Mon, 2017-09-25 at 13:37 +0200, Christoph Böhmwalder wrote:
> Fixes three trivial issues as reported by checkpatch.pl, namely two
> switch/case indentation issues and one alignment issue in a multiline
> comment.
> 
> Signed-off-by: Christoph Böhmwalder <christoph@boehmwalder.at>
> ---

Why are you already resending this? You sent the first email 2 days ago,
you can't expect that a non-critical patch be merged in such a short
time (especially during the weekend).

--
Cheers,
Luca

^ permalink raw reply

* Re: [PATCH 5/5] xfrm: eradicate size_t
From: Steffen Klassert @ 2017-09-25 11:46 UTC (permalink / raw)
  To: Alexey Dobriyan; +Cc: herbert, davem, netdev
In-Reply-To: <20170921204853.GF13550@avx2>

On Thu, Sep 21, 2017 at 11:48:54PM +0300, Alexey Dobriyan wrote:
> All netlink message sizes are a) unsigned, b) can't be >= 4GB in size
> because netlink doesn't support >= 64KB messages in the first place.
> 
> All those size_t across the code are a scam especially across networking
> which likes to work with small numbers like 1500 or 65536.
> 
> Propagate unsignedness and flip some "int" to "unsigned int" as well.
> 
> This is preparation to switching nlmsg_new() to "unsigned int".
> 
> Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>

All applied to ipsec-next, thanks Alexey!

^ permalink raw reply

* [PATCH RESEND] wireless: iwlwifi: fix minor code style issues
From: Christoph Böhmwalder @ 2017-09-25 11:37 UTC (permalink / raw)
  To: johannes.berg, emmanuel.grumbach, luciano.coelho, kvalo,
	linux-wireless, netdev, linux-kernel, trivial
  Cc: Christoph Böhmwalder


[-- Attachment #1.1: Type: text/plain, Size: 1716 bytes --]

Fixes three trivial issues as reported by checkpatch.pl, namely two
switch/case indentation issues and one alignment issue in a multiline
comment.

Signed-off-by: Christoph Böhmwalder <christoph@boehmwalder.at>
---
 drivers/net/wireless/intel/iwlwifi/iwl-drv.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-drv.c
b/drivers/net/wireless/intel/iwlwifi/iwl-drv.c
index 99676d6c4713..ccdb247d68c5 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-drv.c
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-drv.c
@@ -832,7 +832,7 @@ static int iwl_parse_tlv_firmware(struct iwl_drv *drv,
 			capa->standard_phy_calibration_size =
 					le32_to_cpup((__le32 *)tlv_data);
 			break;
-		 case IWL_UCODE_TLV_SEC_RT:
+		case IWL_UCODE_TLV_SEC_RT:
 			iwl_store_ucode_sec(pieces, tlv_data, IWL_UCODE_REGULAR,
 					    tlv_len);
 			drv->fw.type = IWL_FW_MVM;
@@ -864,7 +864,7 @@ static int iwl_parse_tlv_firmware(struct iwl_drv *drv,
 						FW_PHY_CFG_RX_CHAIN) >>
 						FW_PHY_CFG_RX_CHAIN_POS;
 			break;
-		 case IWL_UCODE_TLV_SECURE_SEC_RT:
+		case IWL_UCODE_TLV_SECURE_SEC_RT:
 			iwl_store_ucode_sec(pieces, tlv_data, IWL_UCODE_REGULAR,
 					    tlv_len);
 			drv->fw.type = IWL_FW_MVM;
@@ -1335,7 +1335,8 @@ static void iwl_req_fw_callback(const struct
firmware *ucode_raw, void *context)
  	/* Runtime instructions and 2 copies of data:
 	 * 1) unmodified from disk
-	 * 2) backup cache for save/restore during power-downs */
+	 * 2) backup cache for save/restore during power-downs
+	 */
 	for (i = 0; i < IWL_UCODE_TYPE_MAX; i++)
 		if (iwl_alloc_ucode(drv, pieces, i))
 			goto out_free_fw;
-- 
2.13.5



[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply related

* Re: [PATCH net v2] sctp: Fix a big endian bug in sctp_diag_dump()
From: Neil Horman @ 2017-09-25 11:23 UTC (permalink / raw)
  To: Dan Carpenter
  Cc: Vlad Yasevich, Xin Long, David S. Miller, linux-sctp, netdev,
	kernel-janitors
In-Reply-To: <20170925101926.db4f6x4hblh7tcvo@mwanda>

On Mon, Sep 25, 2017 at 01:19:26PM +0300, Dan Carpenter wrote:
> The sctp_for_each_transport() function takes a pointer to int.  The
> cb->args[] array holds longs so it's only using the high 32 bits.  It
> works on little endian system but will break on big endian 64 bit
> machines.
> 
> Fixes: d25adbeb0cdb ("sctp: fix an use-after-free issue in sctp_sock_dump")
> Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
> ---
> v2: The v1 patch changed the function to take a long pointer, but v2
>     just changes the caller.
> 
> diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c
> index 22ed01a76b19..a72a7d925d46 100644
> --- a/net/sctp/sctp_diag.c
> +++ b/net/sctp/sctp_diag.c
> @@ -463,6 +463,7 @@ static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
>  		.r = r,
>  		.net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN),
>  	};
> +	int pos = cb->args[2];
>  
>  	/* eps hashtable dumps
>  	 * args:
> @@ -493,7 +494,8 @@ static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
>  		goto done;
>  
>  	sctp_for_each_transport(sctp_sock_filter, sctp_sock_dump,
> -				net, (int *)&cb->args[2], &commp);
> +				net, &pos, &commp);
> +	cb->args[2] = pos;
>  
>  done:
>  	cb->args[1] = cb->args[4];
> 
Acked-by: Neil Horman <nhorman@tuxdriver.com>

^ permalink raw reply

* Re: [patch net-next v2 05/12] net: ipmr: Add MFC offload indication
From: Yotam Gigi @ 2017-09-25 11:21 UTC (permalink / raw)
  To: Nikolay Aleksandrov, Jiri Pirko, netdev; +Cc: davem, idosch, mlxsw, andrew
In-Reply-To: <7d9433b4-50c9-3ac0-4eef-c8f847897e8c@cumulusnetworks.com>

On 09/25/2017 12:36 PM, Nikolay Aleksandrov wrote:
> On 24/09/17 20:22, Jiri Pirko wrote:
>> From: Yotam Gigi <yotamg@mellanox.com>
>>
>> Allow drivers registered to the fib notification chain to indicate whether a
>> multicast MFC route is offloaded or not, similarly to unicast routes. The
>> indication of whether a route is offloaded is done using the mfc_flags
>> field on an mfc_cache struct, and the information is sent to the userspace
>> via the RTNetlink interface only.
>>
>> Currently, MFC routes are either offloaded or not, thus there is no need to
>> add per-VIF offload indication.
>>
>> Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
>> Reviewed-by: Ido Schimmel <idosch@mellanox.com>
>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>> ---
>> v1->v2:
>>  - Add comment for the MFC_OFFLOAD flag
>> ---
>>  include/linux/mroute.h | 2 ++
>>  net/ipv4/ipmr.c        | 3 +++
>>  2 files changed, 5 insertions(+)
>>
>> diff --git a/include/linux/mroute.h b/include/linux/mroute.h
>> index 54c5cb8..5566580 100644
>> --- a/include/linux/mroute.h
>> +++ b/include/linux/mroute.h
>> @@ -90,9 +90,11 @@ struct mr_table {
>>  
>>  /* mfc_flags:
>>   * MFC_STATIC - the entry was added statically (not by a routing daemon)
>> + * MFC_OFFLOAD - the entry was offloaded to the hardware
>>   */
>>  enum {
>>  	MFC_STATIC = BIT(0),
>> +	MFC_OFFLOAD = BIT(1),
>>  };
>>  
>>  struct mfc_cache_cmp_arg {
>> diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
>> index ba71bc4..2a795d2 100644
>> --- a/net/ipv4/ipmr.c
>> +++ b/net/ipv4/ipmr.c
>> @@ -2268,6 +2268,9 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
>>  	    nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
>>  		return -EMSGSIZE;
>>  
>> +	if (c->mfc_flags & MFC_OFFLOAD)
>> +		rtm->rtm_flags |= RTNH_F_OFFLOAD;
>> +
>>  	if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH)))
>>  		return -EMSGSIZE;
>>  
>>
> Thanks!

Thank you for reviewing :)

>
> Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
>

^ permalink raw reply

* Re: usb/wireless/rsi_91x: use-after-free write in __run_timers
From: Andrey Konovalov @ 2017-09-25 11:20 UTC (permalink / raw)
  To: Kalle Valo
  Cc: Amitkumar Karwar, Prameela Rani Garnepudi, Karun Eagalapati,
	linux-wireless, netdev, LKML, Dmitry Vyukov, Kostya Serebryany,
	syzkaller
In-Reply-To: <87lgl3769o.fsf@kamboji.qca.qualcomm.com>

On Mon, Sep 25, 2017 at 6:26 AM, Kalle Valo <kvalo@codeaurora.org> wrote:
> Andrey Konovalov <andreyknvl@google.com> writes:
>
>> I've got the following report while fuzzing the kernel with syzkaller.
>>
>> On commit 6e80ecdddf4ea6f3cd84e83720f3d852e6624a68 (Sep 21).
>>
>> ==================================================================
>> BUG: KASAN: use-after-free in __run_timers+0xc0e/0xd40
>> Write of size 8 at addr ffff880069f701b8 by task swapper/0/0
>>
>> CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.14.0-rc1-42311-g6e80ecdddf4e #234
>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
>
> [...]
>
>> Allocated by task 1845:
>>  save_stack_trace+0x1b/0x20 arch/x86/kernel/stacktrace.c:59
>>  save_stack+0x43/0xd0 mm/kasan/kasan.c:447
>>  set_track mm/kasan/kasan.c:459
>>  kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551
>>  kmem_cache_alloc_trace+0x11e/0x2d0 mm/slub.c:2772
>>  kmalloc ./include/linux/slab.h:493
>>  kzalloc ./include/linux/slab.h:666
>>  rsi_91x_init+0x98/0x510 drivers/net/wireless/rsi/rsi_91x_main.c:203
>>  rsi_probe+0xb6/0x13b0 drivers/net/wireless/rsi/rsi_91x_usb.c:665
>>  usb_probe_interface+0x35d/0x8e0 drivers/usb/core/driver.c:361
>
> I'm curious about your setup. Apparently you are running syzkaller on
> QEMU but what I don't understand is how the rsi device comes into the
> picture. Did you have a rsi usb device connected to the virtual machine
> or what? Or does syzkaller do some kind of magic here?

I use dummy_hcd and gadgetfs to connect random USB devices to the
kernel from a userspace application. This happens inside a QEMU
instance. This simplifies fuzzing, since everything is virtualized,
but the found bugs can be triggered on a real machine by connecting a
malicious USB device.

>
> --
> Kalle Valo
>
> --
> You received this message because you are subscribed to the Google Groups "syzkaller" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to syzkaller+unsubscribe@googlegroups.com.
> For more options, visit https://groups.google.com/d/optout.

^ permalink raw reply

* Re: [PATCH net-next 5/6] bpf, nfp: add meta data support
From: Jakub Kicinski @ 2017-09-25 11:12 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: davem, alexei.starovoitov, john.fastabend, peter.waskiewicz.jr,
	netdev
In-Reply-To: <e88b1cbcb0cbfb20a960c57cee3c873e3cadc4cb.1506297988.git.daniel@iogearbox.net>

On Mon, 25 Sep 2017 02:25:54 +0200, Daniel Borkmann wrote:
> Implement support for transferring XDP meta data into skb for
> nfp driver; before calling into the program, xdp.data_meta points
> to xdp.data, where on program return with pass verdict, we call
> into skb_metadata_set().
> 
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> Acked-by: Alexei Starovoitov <ast@kernel.org>
> Acked-by: John Fastabend <john.fastabend@gmail.com>

Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>

Thanks!

^ permalink raw reply

* [PATCH v2 02/16] thunderbolt: Add support for XDomain properties
From: Mika Westerberg @ 2017-09-25 11:07 UTC (permalink / raw)
  To: Greg Kroah-Hartman, David S . Miller
  Cc: Andreas Noever, Michael Jamet, Yehezkel Bernat, Amir Levy,
	Mario.Limonciello, Lukas Wunner, Andy Shevchenko, Andrew Lunn,
	Mika Westerberg, linux-kernel, netdev
In-Reply-To: <20170925110738.68382-1-mika.westerberg@linux.intel.com>

Thunderbolt XDomain discovery protocol uses directories which contain
properties and other directories to exchange information about what
capabilities the remote host supports. This also includes identification
information like device ID and name.

This adds support for parsing and formatting these properties and
establishes an API drivers can use in addition to the core Thunderbolt
driver. This API is exposed in a new header: include/linux/thunderbolt.h.

This code is based on the work done by Amir Levy and Michael Jamet.

Signed-off-by: Michael Jamet <michael.jamet@intel.com>
Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Yehezkel Bernat <yehezkel.bernat@intel.com>
---
 MAINTAINERS                    |   1 +
 drivers/thunderbolt/Makefile   |   2 +-
 drivers/thunderbolt/property.c | 670 +++++++++++++++++++++++++++++++++++++++++
 include/linux/thunderbolt.h    |  89 ++++++
 4 files changed, 761 insertions(+), 1 deletion(-)
 create mode 100644 drivers/thunderbolt/property.c
 create mode 100644 include/linux/thunderbolt.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 6671f375f7fc..c1c90d962012 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13278,6 +13278,7 @@ M:	Mika Westerberg <mika.westerberg@linux.intel.com>
 M:	Yehezkel Bernat <yehezkel.bernat@intel.com>
 S:	Maintained
 F:	drivers/thunderbolt/
+F:	include/linux/thunderbolt.h
 
 THUNDERX GPIO DRIVER
 M:	David Daney <david.daney@cavium.com>
diff --git a/drivers/thunderbolt/Makefile b/drivers/thunderbolt/Makefile
index 4900febc6c8a..7afd21f5383a 100644
--- a/drivers/thunderbolt/Makefile
+++ b/drivers/thunderbolt/Makefile
@@ -1,3 +1,3 @@
 obj-${CONFIG_THUNDERBOLT} := thunderbolt.o
 thunderbolt-objs := nhi.o ctl.o tb.o switch.o cap.o path.o tunnel_pci.o eeprom.o
-thunderbolt-objs += domain.o dma_port.o icm.o
+thunderbolt-objs += domain.o dma_port.o icm.o property.o
diff --git a/drivers/thunderbolt/property.c b/drivers/thunderbolt/property.c
new file mode 100644
index 000000000000..55a8aa32b1d6
--- /dev/null
+++ b/drivers/thunderbolt/property.c
@@ -0,0 +1,670 @@
+/*
+ * Thunderbolt XDomain property support
+ *
+ * Copyright (C) 2017, Intel Corporation
+ * Authors: Michael Jamet <michael.jamet@intel.com>
+ *          Mika Westerberg <mika.westerberg@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/uuid.h>
+#include <linux/thunderbolt.h>
+
+struct tb_property_entry {
+	u32 key_hi;
+	u32 key_lo;
+	u16 length;
+	u8 reserved;
+	u8 type;
+	u32 value;
+} __packed;
+
+struct tb_property_rootdir_entry {
+	u32 magic;
+	u32 length;
+	struct tb_property_entry entries[];
+} __packed;
+
+struct tb_property_dir_entry {
+	u32 uuid[4];
+	struct tb_property_entry entries[];
+} __packed;
+
+#define TB_PROPERTY_ROOTDIR_MAGIC	0x55584401
+
+static struct tb_property_dir *__tb_property_parse_dir(const u32 *block,
+	size_t block_len, unsigned int dir_offset, size_t dir_len,
+	bool is_root);
+
+static inline void parse_dwdata(void *dst, const void *src, size_t dwords)
+{
+	be32_to_cpu_array(dst, src, dwords);
+}
+
+static inline void format_dwdata(void *dst, const void *src, size_t dwords)
+{
+	cpu_to_be32_array(dst, src, dwords);
+}
+
+static bool tb_property_entry_valid(const struct tb_property_entry *entry,
+				  size_t block_len)
+{
+	switch (entry->type) {
+	case TB_PROPERTY_TYPE_DIRECTORY:
+	case TB_PROPERTY_TYPE_DATA:
+	case TB_PROPERTY_TYPE_TEXT:
+		if (entry->length > block_len)
+			return false;
+		if (entry->value + entry->length > block_len)
+			return false;
+		break;
+
+	case TB_PROPERTY_TYPE_VALUE:
+		if (entry->length != 1)
+			return false;
+		break;
+	}
+
+	return true;
+}
+
+static bool tb_property_key_valid(const char *key)
+{
+	return key && strlen(key) <= TB_PROPERTY_KEY_SIZE;
+}
+
+static struct tb_property *
+tb_property_alloc(const char *key, enum tb_property_type type)
+{
+	struct tb_property *property;
+
+	property = kzalloc(sizeof(*property), GFP_KERNEL);
+	if (!property)
+		return NULL;
+
+	strcpy(property->key, key);
+	property->type = type;
+	INIT_LIST_HEAD(&property->list);
+
+	return property;
+}
+
+static struct tb_property *tb_property_parse(const u32 *block, size_t block_len,
+					const struct tb_property_entry *entry)
+{
+	char key[TB_PROPERTY_KEY_SIZE + 1];
+	struct tb_property *property;
+	struct tb_property_dir *dir;
+
+	if (!tb_property_entry_valid(entry, block_len))
+		return NULL;
+
+	parse_dwdata(key, entry, 2);
+	key[TB_PROPERTY_KEY_SIZE] = '\0';
+
+	property = tb_property_alloc(key, entry->type);
+	if (!property)
+		return NULL;
+
+	property->length = entry->length;
+
+	switch (property->type) {
+	case TB_PROPERTY_TYPE_DIRECTORY:
+		dir = __tb_property_parse_dir(block, block_len, entry->value,
+					      entry->length, false);
+		if (!dir) {
+			kfree(property);
+			return NULL;
+		}
+		property->value.dir = dir;
+		break;
+
+	case TB_PROPERTY_TYPE_DATA:
+		property->value.data = kcalloc(property->length, sizeof(u32),
+					       GFP_KERNEL);
+		if (!property->value.data) {
+			kfree(property);
+			return NULL;
+		}
+		parse_dwdata(property->value.data, block + entry->value,
+			     entry->length);
+		break;
+
+	case TB_PROPERTY_TYPE_TEXT:
+		property->value.text = kcalloc(property->length, sizeof(u32),
+					       GFP_KERNEL);
+		if (!property->value.text) {
+			kfree(property);
+			return NULL;
+		}
+		parse_dwdata(property->value.text, block + entry->value,
+			     entry->length);
+		/* Force null termination */
+		property->value.text[property->length * 4 - 1] = '\0';
+		break;
+
+	case TB_PROPERTY_TYPE_VALUE:
+		property->value.immediate = entry->value;
+		break;
+
+	default:
+		property->type = TB_PROPERTY_TYPE_UNKNOWN;
+		break;
+	}
+
+	return property;
+}
+
+static struct tb_property_dir *__tb_property_parse_dir(const u32 *block,
+	size_t block_len, unsigned int dir_offset, size_t dir_len, bool is_root)
+{
+	const struct tb_property_entry *entries;
+	size_t i, content_len, nentries;
+	unsigned int content_offset;
+	struct tb_property_dir *dir;
+
+	dir = kzalloc(sizeof(*dir), GFP_KERNEL);
+	if (!dir)
+		return NULL;
+
+	if (is_root) {
+		content_offset = dir_offset + 2;
+		content_len = dir_len;
+	} else {
+		dir->uuid = kmemdup(&block[dir_offset], sizeof(*dir->uuid),
+				    GFP_KERNEL);
+		content_offset = dir_offset + 4;
+		content_len = dir_len - 4; /* Length includes UUID */
+	}
+
+	entries = (const struct tb_property_entry *)&block[content_offset];
+	nentries = content_len / (sizeof(*entries) / 4);
+
+	INIT_LIST_HEAD(&dir->properties);
+
+	for (i = 0; i < nentries; i++) {
+		struct tb_property *property;
+
+		property = tb_property_parse(block, block_len, &entries[i]);
+		if (!property) {
+			tb_property_free_dir(dir);
+			return NULL;
+		}
+
+		list_add_tail(&property->list, &dir->properties);
+	}
+
+	return dir;
+}
+
+/**
+ * tb_property_parse_dir() - Parses properties from given property block
+ * @block: Property block to parse
+ * @block_len: Number of dword elements in the property block
+ *
+ * This function parses the XDomain properties data block into a format that
+ * can be traversed using the helper functions provided by this module.
+ * Upon success returns the parsed directory. In case of error returns
+ * %NULL. The resulting &struct tb_property_dir needs to be released by
+ * calling tb_property_free_dir() when not needed anymore.
+ *
+ * The @block is expected to be the root directory.
+ */
+struct tb_property_dir *tb_property_parse_dir(const u32 *block,
+					      size_t block_len)
+{
+	const struct tb_property_rootdir_entry *rootdir =
+		(const struct tb_property_rootdir_entry *)block;
+
+	if (rootdir->magic != TB_PROPERTY_ROOTDIR_MAGIC)
+		return NULL;
+	if (rootdir->length > block_len)
+		return NULL;
+
+	return __tb_property_parse_dir(block, block_len, 0, rootdir->length,
+				       true);
+}
+
+/**
+ * tb_property_create_dir() - Creates new property directory
+ * @uuid: UUID used to identify the particular directory
+ *
+ * Creates new, empty property directory. If @uuid is %NULL then the
+ * directory is assumed to be root directory.
+ */
+struct tb_property_dir *tb_property_create_dir(const uuid_t *uuid)
+{
+	struct tb_property_dir *dir;
+
+	dir = kzalloc(sizeof(*dir), GFP_KERNEL);
+	if (!dir)
+		return NULL;
+
+	INIT_LIST_HEAD(&dir->properties);
+	if (uuid) {
+		dir->uuid = kmemdup(uuid, sizeof(*dir->uuid), GFP_KERNEL);
+		if (!dir->uuid) {
+			kfree(dir);
+			return NULL;
+		}
+	}
+
+	return dir;
+}
+EXPORT_SYMBOL_GPL(tb_property_create_dir);
+
+static void tb_property_free(struct tb_property *property)
+{
+	switch (property->type) {
+	case TB_PROPERTY_TYPE_DIRECTORY:
+		tb_property_free_dir(property->value.dir);
+		break;
+
+	case TB_PROPERTY_TYPE_DATA:
+		kfree(property->value.data);
+		break;
+
+	case TB_PROPERTY_TYPE_TEXT:
+		kfree(property->value.text);
+		break;
+
+	default:
+		break;
+	}
+
+	kfree(property);
+}
+
+/**
+ * tb_property_free_dir() - Release memory allocated for property directory
+ * @dir: Directory to release
+ *
+ * This will release all the memory the directory occupies including all
+ * descendants. It is OK to pass %NULL @dir, then the function does
+ * nothing.
+ */
+void tb_property_free_dir(struct tb_property_dir *dir)
+{
+	struct tb_property *property, *tmp;
+
+	if (!dir)
+		return;
+
+	list_for_each_entry_safe(property, tmp, &dir->properties, list) {
+		list_del(&property->list);
+		tb_property_free(property);
+	}
+	kfree(dir->uuid);
+	kfree(dir);
+}
+EXPORT_SYMBOL_GPL(tb_property_free_dir);
+
+static size_t tb_property_dir_length(const struct tb_property_dir *dir,
+				     bool recurse, size_t *data_len)
+{
+	const struct tb_property *property;
+	size_t len = 0;
+
+	if (dir->uuid)
+		len += sizeof(*dir->uuid) / 4;
+	else
+		len += sizeof(struct tb_property_rootdir_entry) / 4;
+
+	list_for_each_entry(property, &dir->properties, list) {
+		len += sizeof(struct tb_property_entry) / 4;
+
+		switch (property->type) {
+		case TB_PROPERTY_TYPE_DIRECTORY:
+			if (recurse) {
+				len += tb_property_dir_length(
+					property->value.dir, recurse, data_len);
+			}
+			/* Reserve dword padding after each directory */
+			if (data_len)
+				*data_len += 1;
+			break;
+
+		case TB_PROPERTY_TYPE_DATA:
+		case TB_PROPERTY_TYPE_TEXT:
+			if (data_len)
+				*data_len += property->length;
+			break;
+
+		default:
+			break;
+		}
+	}
+
+	return len;
+}
+
+static ssize_t __tb_property_format_dir(const struct tb_property_dir *dir,
+	u32 *block, unsigned int start_offset, size_t block_len)
+{
+	unsigned int data_offset, dir_end;
+	const struct tb_property *property;
+	struct tb_property_entry *entry;
+	size_t dir_len, data_len = 0;
+	int ret;
+
+	/*
+	 * The structure of property block looks like following. Leaf
+	 * data/text is included right after the directory and each
+	 * directory follows each other (even nested ones).
+	 *
+	 * +----------+ <-- start_offset
+	 * |  header  | <-- root directory header
+	 * +----------+ ---
+	 * |  entry 0 | -^--------------------.
+	 * +----------+  |                    |
+	 * |  entry 1 | -|--------------------|--.
+	 * +----------+  |                    |  |
+	 * |  entry 2 | -|-----------------.  |  |
+	 * +----------+  |                 |  |  |
+	 * :          :  |  dir_len        |  |  |
+	 * .          .  |                 |  |  |
+	 * :          :  |                 |  |  |
+	 * +----------+  |                 |  |  |
+	 * |  entry n |  v                 |  |  |
+	 * +----------+ <-- data_offset    |  |  |
+	 * |  data 0  | <------------------|--'  |
+	 * +----------+                    |     |
+	 * |  data 1  | <------------------|-----'
+	 * +----------+                    |
+	 * | 00000000 | padding            |
+	 * +----------+ <-- dir_end <------'
+	 * |   UUID   | <-- directory UUID (child directory)
+	 * +----------+
+	 * |  entry 0 |
+	 * +----------+
+	 * |  entry 1 |
+	 * +----------+
+	 * :          :
+	 * .          .
+	 * :          :
+	 * +----------+
+	 * |  entry n |
+	 * +----------+
+	 * |  data 0  |
+	 * +----------+
+	 *
+	 * We use dir_end to hold pointer to the end of the directory. It
+	 * will increase as we add directories and each directory should be
+	 * added starting from previous dir_end.
+	 */
+	dir_len = tb_property_dir_length(dir, false, &data_len);
+	data_offset = start_offset + dir_len;
+	dir_end = start_offset + data_len + dir_len;
+
+	if (data_offset > dir_end)
+		return -EINVAL;
+	if (dir_end > block_len)
+		return -EINVAL;
+
+	/* Write headers first */
+	if (dir->uuid) {
+		struct tb_property_dir_entry *pe;
+
+		pe = (struct tb_property_dir_entry *)&block[start_offset];
+		memcpy(pe->uuid, dir->uuid, sizeof(pe->uuid));
+		entry = pe->entries;
+	} else {
+		struct tb_property_rootdir_entry *re;
+
+		re = (struct tb_property_rootdir_entry *)&block[start_offset];
+		re->magic = TB_PROPERTY_ROOTDIR_MAGIC;
+		re->length = dir_len - sizeof(*re) / 4;
+		entry = re->entries;
+	}
+
+	list_for_each_entry(property, &dir->properties, list) {
+		const struct tb_property_dir *child;
+
+		format_dwdata(entry, property->key, 2);
+		entry->type = property->type;
+
+		switch (property->type) {
+		case TB_PROPERTY_TYPE_DIRECTORY:
+			child = property->value.dir;
+			ret = __tb_property_format_dir(child, block, dir_end,
+						       block_len);
+			if (ret < 0)
+				return ret;
+			entry->length = tb_property_dir_length(child, false,
+							       NULL);
+			entry->value = dir_end;
+			dir_end = ret;
+			break;
+
+		case TB_PROPERTY_TYPE_DATA:
+			format_dwdata(&block[data_offset], property->value.data,
+				      property->length);
+			entry->length = property->length;
+			entry->value = data_offset;
+			data_offset += entry->length;
+			break;
+
+		case TB_PROPERTY_TYPE_TEXT:
+			format_dwdata(&block[data_offset], property->value.text,
+				      property->length);
+			entry->length = property->length;
+			entry->value = data_offset;
+			data_offset += entry->length;
+			break;
+
+		case TB_PROPERTY_TYPE_VALUE:
+			entry->length = property->length;
+			entry->value = property->value.immediate;
+			break;
+
+		default:
+			break;
+		}
+
+		entry++;
+	}
+
+	return dir_end;
+}
+
+/**
+ * tb_property_format_dir() - Formats directory to the packed XDomain format
+ * @dir: Directory to format
+ * @block: Property block where the packed data is placed
+ * @block_len: Length of the property block
+ *
+ * This function formats the directory to the packed format that can be
+ * then sent over the thunderbolt fabric to the receiving host. Returns %0 in
+ * case of success and negative errno on failure. Passing %NULL in @block
+ * returns the number of dwords the block takes.
+ */
+ssize_t tb_property_format_dir(const struct tb_property_dir *dir, u32 *block,
+			       size_t block_len)
+{
+	ssize_t ret;
+
+	if (!block) {
+		size_t dir_len, data_len = 0;
+
+		dir_len = tb_property_dir_length(dir, true, &data_len);
+		return dir_len + data_len;
+	}
+
+	ret = __tb_property_format_dir(dir, block, 0, block_len);
+	return ret < 0 ? ret : 0;
+}
+
+/**
+ * tb_property_add_immediate() - Add immediate property to directory
+ * @parent: Directory to add the property
+ * @key: Key for the property
+ * @value: Immediate value to store with the property
+ */
+int tb_property_add_immediate(struct tb_property_dir *parent, const char *key,
+			      u32 value)
+{
+	struct tb_property *property;
+
+	if (!tb_property_key_valid(key))
+		return -EINVAL;
+
+	property = tb_property_alloc(key, TB_PROPERTY_TYPE_VALUE);
+	if (!property)
+		return -ENOMEM;
+
+	property->length = 1;
+	property->value.immediate = value;
+
+	list_add_tail(&property->list, &parent->properties);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tb_property_add_immediate);
+
+/**
+ * tb_property_add_data() - Adds arbitrary data property to directory
+ * @parent: Directory to add the property
+ * @key: Key for the property
+ * @buf: Data buffer to add
+ * @buflen: Number of bytes in the data buffer
+ *
+ * Function takes a copy of @buf and adds it to the directory.
+ */
+int tb_property_add_data(struct tb_property_dir *parent, const char *key,
+			 const void *buf, size_t buflen)
+{
+	/* Need to pad to dword boundary */
+	size_t size = round_up(buflen, 4);
+	struct tb_property *property;
+
+	if (!tb_property_key_valid(key))
+		return -EINVAL;
+
+	property = tb_property_alloc(key, TB_PROPERTY_TYPE_DATA);
+	if (!property)
+		return -ENOMEM;
+
+	property->length = size / 4;
+	property->value.data = kzalloc(size, GFP_KERNEL);
+	memcpy(property->value.data, buf, buflen);
+
+	list_add_tail(&property->list, &parent->properties);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tb_property_add_data);
+
+/**
+ * tb_property_add_text() - Adds string property to directory
+ * @parent: Directory to add the property
+ * @key: Key for the property
+ * @text: String to add
+ *
+ * Function takes a copy of @text and adds it to the directory.
+ */
+int tb_property_add_text(struct tb_property_dir *parent, const char *key,
+			 const char *text)
+{
+	/* Need to pad to dword boundary */
+	size_t size = round_up(strlen(text) + 1, 4);
+	struct tb_property *property;
+
+	if (!tb_property_key_valid(key))
+		return -EINVAL;
+
+	property = tb_property_alloc(key, TB_PROPERTY_TYPE_TEXT);
+	if (!property)
+		return -ENOMEM;
+
+	property->length = size / 4;
+	property->value.data = kzalloc(size, GFP_KERNEL);
+	strcpy(property->value.text, text);
+
+	list_add_tail(&property->list, &parent->properties);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tb_property_add_text);
+
+/**
+ * tb_property_add_dir() - Adds a directory to the parent directory
+ * @parent: Directory to add the property
+ * @key: Key for the property
+ * @dir: Directory to add
+ */
+int tb_property_add_dir(struct tb_property_dir *parent, const char *key,
+			struct tb_property_dir *dir)
+{
+	struct tb_property *property;
+
+	if (!tb_property_key_valid(key))
+		return -EINVAL;
+
+	property = tb_property_alloc(key, TB_PROPERTY_TYPE_DIRECTORY);
+	if (!property)
+		return -ENOMEM;
+
+	property->value.dir = dir;
+
+	list_add_tail(&property->list, &parent->properties);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tb_property_add_dir);
+
+/**
+ * tb_property_remove() - Removes property from a parent directory
+ * @property: Property to remove
+ *
+ * Note memory for @property is released as well so it is not allowed to
+ * touch the object after call to this function.
+ */
+void tb_property_remove(struct tb_property *property)
+{
+	list_del(&property->list);
+	kfree(property);
+}
+EXPORT_SYMBOL_GPL(tb_property_remove);
+
+/**
+ * tb_property_find() - Find a property from a directory
+ * @dir: Directory where the property is searched
+ * @key: Key to look for
+ * @type: Type of the property
+ *
+ * Finds and returns property from the given directory. Does not recurse
+ * into sub-directories. Returns %NULL if the property was not found.
+ */
+struct tb_property *tb_property_find(struct tb_property_dir *dir,
+	const char *key, enum tb_property_type type)
+{
+	struct tb_property *property;
+
+	list_for_each_entry(property, &dir->properties, list) {
+		if (property->type == type && !strcmp(property->key, key))
+			return property;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(tb_property_find);
+
+/**
+ * tb_property_get_next() - Get next property from directory
+ * @dir: Directory holding properties
+ * @prev: Previous property in the directory (%NULL returns the first)
+ */
+struct tb_property *tb_property_get_next(struct tb_property_dir *dir,
+					 struct tb_property *prev)
+{
+	if (prev) {
+		if (list_is_last(&prev->list, &dir->properties))
+			return NULL;
+		return list_next_entry(prev, list);
+	}
+	return list_first_entry_or_null(&dir->properties, struct tb_property,
+					list);
+}
+EXPORT_SYMBOL_GPL(tb_property_get_next);
diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
new file mode 100644
index 000000000000..96561c1265ae
--- /dev/null
+++ b/include/linux/thunderbolt.h
@@ -0,0 +1,89 @@
+/*
+ * Thunderbolt service API
+ *
+ * Copyright (C) 2017, Intel Corporation
+ * Authors: Michael Jamet <michael.jamet@intel.com>
+ *          Mika Westerberg <mika.westerberg@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef THUNDERBOLT_H_
+#define THUNDERBOLT_H_
+
+#include <linux/list.h>
+#include <linux/uuid.h>
+
+/**
+ * struct tb_property_dir - XDomain property directory
+ * @uuid: Directory UUID or %NULL if root directory
+ * @properties: List of properties in this directory
+ *
+ * User needs to provide serialization if needed.
+ */
+struct tb_property_dir {
+	const uuid_t *uuid;
+	struct list_head properties;
+};
+
+enum tb_property_type {
+	TB_PROPERTY_TYPE_UNKNOWN = 0x00,
+	TB_PROPERTY_TYPE_DIRECTORY = 0x44,
+	TB_PROPERTY_TYPE_DATA = 0x64,
+	TB_PROPERTY_TYPE_TEXT = 0x74,
+	TB_PROPERTY_TYPE_VALUE = 0x76,
+};
+
+#define TB_PROPERTY_KEY_SIZE	8
+
+/**
+ * struct tb_property - XDomain property
+ * @list: Used to link properties together in a directory
+ * @key: Key for the property (always terminated).
+ * @type: Type of the property
+ * @length: Length of the property data in dwords
+ * @value: Property value
+ *
+ * Users use @type to determine which field in @value is filled.
+ */
+struct tb_property {
+	struct list_head list;
+	char key[TB_PROPERTY_KEY_SIZE + 1];
+	enum tb_property_type type;
+	size_t length;
+	union {
+		struct tb_property_dir *dir;
+		u8 *data;
+		char *text;
+		u32 immediate;
+	} value;
+};
+
+struct tb_property_dir *tb_property_parse_dir(const u32 *block,
+					      size_t block_len);
+ssize_t tb_property_format_dir(const struct tb_property_dir *dir, u32 *block,
+			       size_t block_len);
+struct tb_property_dir *tb_property_create_dir(const uuid_t *uuid);
+void tb_property_free_dir(struct tb_property_dir *dir);
+int tb_property_add_immediate(struct tb_property_dir *parent, const char *key,
+			      u32 value);
+int tb_property_add_data(struct tb_property_dir *parent, const char *key,
+			 const void *buf, size_t buflen);
+int tb_property_add_text(struct tb_property_dir *parent, const char *key,
+			 const char *text);
+int tb_property_add_dir(struct tb_property_dir *parent, const char *key,
+			struct tb_property_dir *dir);
+void tb_property_remove(struct tb_property *tb_property);
+struct tb_property *tb_property_find(struct tb_property_dir *dir,
+			const char *key, enum tb_property_type type);
+struct tb_property *tb_property_get_next(struct tb_property_dir *dir,
+					 struct tb_property *prev);
+
+#define tb_property_for_each(dir, property)			\
+	for (property = tb_property_get_next(dir, NULL);	\
+	     property;						\
+	     property = tb_property_get_next(dir, property))
+
+#endif /* THUNDERBOLT_H_ */
-- 
2.14.1

^ permalink raw reply related

* [PATCH v2 16/16] net: Add support for networking over Thunderbolt cable
From: Mika Westerberg @ 2017-09-25 11:07 UTC (permalink / raw)
  To: Greg Kroah-Hartman, David S . Miller
  Cc: Andreas Noever, Michael Jamet, Yehezkel Bernat, Amir Levy,
	Mario.Limonciello, Lukas Wunner, Andy Shevchenko, Andrew Lunn,
	Mika Westerberg, linux-kernel, netdev
In-Reply-To: <20170925110738.68382-1-mika.westerberg@linux.intel.com>

From: Amir Levy <amir.jer.levy@intel.com>

ThunderboltIP is a protocol created by Apple to tunnel IP/ethernet
traffic over a Thunderbolt cable. The protocol consists of a configuration
phase where each side sends ThunderboltIP login packets (the protocol is
determined by UUID in the XDomain packet header) over the configuration
channel. Once both sides get positive acknowledgment to their login
packet, they configure high-speed DMA path accordingly. This DMA path is
then used to transmit and receive networking traffic.

This patch creates a virtual ethernet interface the host software can
use in the same way as any other networking interface. Once the
interface is brought up successfully network packets get tunneled over
the Thunderbolt cable to the remote host and back.

The connection is terminated by sending a ThunderboltIP logout packet
over the configuration channel. We do this when the network interface is
brought down by the user or the driver is unloaded.

Signed-off-by: Amir Levy <amir.jer.levy@intel.com>
Signed-off-by: Michael Jamet <michael.jamet@intel.com>
Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Yehezkel Bernat <yehezkel.bernat@intel.com>
---
 Documentation/admin-guide/thunderbolt.rst |   24 +
 MAINTAINERS                               |    6 +
 drivers/net/Kconfig                       |   12 +
 drivers/net/Makefile                      |    3 +
 drivers/net/thunderbolt.c                 | 1379 +++++++++++++++++++++++++++++
 5 files changed, 1424 insertions(+)
 create mode 100644 drivers/net/thunderbolt.c

diff --git a/Documentation/admin-guide/thunderbolt.rst b/Documentation/admin-guide/thunderbolt.rst
index 6a4cd1f159ca..5c62d11d77e8 100644
--- a/Documentation/admin-guide/thunderbolt.rst
+++ b/Documentation/admin-guide/thunderbolt.rst
@@ -197,3 +197,27 @@ information is missing.
 
 To recover from this mode, one needs to flash a valid NVM image to the
 host host controller in the same way it is done in the previous chapter.
+
+Networking over Thunderbolt cable
+---------------------------------
+Thunderbolt technology allows software communication across two hosts
+connected by a Thunderbolt cable.
+
+It is possible to tunnel any kind of traffic over Thunderbolt link but
+currently we only support Apple ThunderboltIP protocol.
+
+If the other host is running Windows or macOS, the only thing you need
+to do is to connect a Thunderbolt cable between the two hosts; the
+``thunderbolt-net`` is loaded automatically. If the other host is also
+Linux you should load ``thunderbolt-net`` manually on one host (it does
+not matter which one)::
+
+  # modprobe thunderbolt-net
+
+This triggers module load on the other host automatically. If the driver
+is built-in to the kernel image, there is no need to do anything.
+
+The driver will create one virtual ethernet interface per Thunderbolt
+port which are named like ``thunderbolt0`` and so on. From this point
+you can either use standard userspace tools like ``ifconfig`` to
+configure the interface or let your GUI handle it automatically.
diff --git a/MAINTAINERS b/MAINTAINERS
index c1c90d962012..0dfbb3b2fbf0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13280,6 +13280,12 @@ S:	Maintained
 F:	drivers/thunderbolt/
 F:	include/linux/thunderbolt.h
 
+THUNDERBOLT NETWORK DRIVER
+M:	Mika Westerberg <mika.westerberg@linux.intel.com>
+L:	netdev@vger.kernel.org
+S:	Maintained
+F:	drivers/net/thunderbolt.c
+
 THUNDERX GPIO DRIVER
 M:	David Daney <david.daney@cavium.com>
 S:	Maintained
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index aba0d652095b..0936da592e12 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -483,6 +483,18 @@ config FUJITSU_ES
 	  This driver provides support for Extended Socket network device
           on Extended Partitioning of FUJITSU PRIMEQUEST 2000 E2 series.
 
+config THUNDERBOLT_NET
+	tristate "Networking over Thunderbolt cable"
+	depends on THUNDERBOLT && INET
+	help
+	  Select this if you want to create a network between two
+	  computers over a Thunderbolt cable. The driver supports Apple
+	  ThunderboltIP protocol and allows communication with any host
+	  supporting the same protocol including Windows and macOS.
+
+	  To compile this driver as a module, choose M here. The module will be
+	  called thunderbolt-net.
+
 source "drivers/net/hyperv/Kconfig"
 
 endif # NETDEVICES
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 8dff900085d6..7c8f4dd3a7c5 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -74,3 +74,6 @@ obj-$(CONFIG_HYPERV_NET) += hyperv/
 obj-$(CONFIG_NTB_NETDEV) += ntb_netdev.o
 
 obj-$(CONFIG_FUJITSU_ES) += fjes/
+
+thunderbolt-net-y += thunderbolt.o
+obj-$(CONFIG_THUNDERBOLT_NET) += thunderbolt-net.o
diff --git a/drivers/net/thunderbolt.c b/drivers/net/thunderbolt.c
new file mode 100644
index 000000000000..0128fe7e665e
--- /dev/null
+++ b/drivers/net/thunderbolt.c
@@ -0,0 +1,1379 @@
+/*
+ * Networking over Thunderbolt cable using Apple ThunderboltIP protocol
+ *
+ * Copyright (C) 2017, Intel Corporation
+ * Authors: Amir Levy <amir.jer.levy@intel.com>
+ *          Michael Jamet <michael.jamet@intel.com>
+ *          Mika Westerberg <mika.westerberg@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/atomic.h>
+#include <linux/highmem.h>
+#include <linux/if_vlan.h>
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/sizes.h>
+#include <linux/thunderbolt.h>
+#include <linux/uuid.h>
+#include <linux/workqueue.h>
+#include <net/ip6_checksum.h>
+
+/* Protocol timeouts in ms */
+/* Delay before a failed login request is retried */
+#define TBNET_LOGIN_DELAY	4500
+/* Timeout passed to tb_xdomain_request() for a login request */
+#define TBNET_LOGIN_TIMEOUT	500
+/* Timeout passed to tb_xdomain_request() for a logout request */
+#define TBNET_LOGOUT_TIMEOUT	100
+
+/* Number of frames in each software ring. Ring indices are computed
+ * with "& (TBNET_RING_SIZE - 1)" so this must stay a power of two.
+ */
+#define TBNET_RING_SIZE		256
+/* Value announced as transmit_path in our login request and passed as
+ * the first path argument to tb_xdomain_enable_paths().
+ */
+#define TBNET_LOCAL_PATH	0xf
+/* Size of the linear area allocated for an Rx skb; also the limit
+ * given to eth_get_headlen() when pulling headers.
+ */
+#define TBNET_RX_HDR_SIZE	256
+/* Number of login retries before the login is considered timed out */
+#define TBNET_LOGIN_RETRIES	60
+/* Number of attempts for a logout request that keeps timing out */
+#define TBNET_LOGOUT_RETRIES	5
+/* Flag telling that frame_id matching is supported (see the frame
+ * header documentation).
+ */
+#define TBNET_MATCH_FRAGS_ID	BIT(1)
+/* Upper bound for the size of a packet assembled from Rx frames */
+#define TBNET_MAX_MTU		SZ_64K
+/* Size of one ring buffer, including the frame header */
+#define TBNET_FRAME_SIZE	SZ_4K
+/* Payload bytes that fit in one frame after the frame header */
+#define TBNET_MAX_PAYLOAD_SIZE		\
+	(TBNET_FRAME_SIZE - sizeof(struct thunderbolt_ip_frame_header))
+
+/* Extracts bits 5:0 of @route.
+ * NOTE(review): going by the name this is the port number at the first
+ * level of the route string - confirm against the route format.
+ */
+#define TBNET_L0_PORT_NUM(route) ((route) & GENMASK(5, 0))
+
+/**
+ * struct thunderbolt_ip_frame_header - Header for each Thunderbolt frame
+ * @frame_size: size of the payload carried in the frame, not counting
+ *		this header
+ * @frame_index: running index of the frame within its packet, starting
+ *		 from %0
+ * @frame_id: ID of the frame to match frames to specific packet
+ * @frame_count: how many frames make up the full packet
+ *
+ * Each data frame passed to the high-speed DMA ring has this header. If
+ * the XDomain network directory announces that %TBNET_MATCH_FRAGS_ID is
+ * supported then @frame_id is filled, otherwise it stays %0.
+ *
+ * The driver converts every field with the le16/le32 helpers when
+ * reading from or writing to a frame buffer, i.e. the fields are
+ * little-endian in the buffer.
+ */
+struct thunderbolt_ip_frame_header {
+	u32 frame_size;
+	u16 frame_index;
+	u16 frame_id;
+	u32 frame_count;
+} __packed;
+
+/* Start and end of frame markers. Used as the sof/eof values of Tx
+ * frames and, through BIT(), as the sof/eof masks of the Rx ring.
+ */
+enum thunderbolt_ip_frame_pdf {
+	TBIP_PDF_FRAME_START = 1,
+	TBIP_PDF_FRAME_END,
+};
+
+/* ThunderboltIP configuration packet types, stored in the type field
+ * of &struct thunderbolt_ip_header.
+ */
+enum thunderbolt_ip_type {
+	TBIP_LOGIN,
+	TBIP_LOGIN_RESPONSE,
+	TBIP_LOGOUT,
+	TBIP_STATUS,
+};
+
+/**
+ * struct thunderbolt_ip_header - Common ThunderboltIP packet header
+ * @route_hi: Upper 32 bits of the route
+ * @route_lo: Lower 32 bits of the route
+ * @length_sn: Packet length in dwords, not counting @route_hi,
+ *	       @route_lo and @length_sn, combined with the sequence
+ *	       number (see the masks below)
+ * @uuid: UUID of the ThunderboltIP configuration protocol
+ * @initiator_uuid: UUID of the host sending the packet
+ * @target_uuid: UUID of the host the packet is meant for
+ * @type: One of &enum thunderbolt_ip_type
+ * @command_id: ID of the command
+ */
+struct thunderbolt_ip_header {
+	u32 route_hi;
+	u32 route_lo;
+	u32 length_sn;
+	uuid_t uuid;
+	uuid_t initiator_uuid;
+	uuid_t target_uuid;
+	u32 type;
+	u32 command_id;
+} __packed;
+
+/* Layout of the length_sn field: length in bits 5:0, sequence number
+ * in bits 28:27.
+ */
+#define TBIP_HDR_LENGTH_MASK		GENMASK(5, 0)
+#define TBIP_HDR_SN_MASK		GENMASK(28, 27)
+#define TBIP_HDR_SN_SHIFT		27
+
+/**
+ * struct thunderbolt_ip_login - ThunderboltIP login packet
+ * @hdr: Common header, type is %TBIP_LOGIN
+ * @proto_version: Protocol version, we send %TBIP_LOGIN_PROTO_VERSION
+ * @transmit_path: Path announced by the sender. We send
+ *		   %TBNET_LOCAL_PATH; the value received from the remote
+ *		   host is stored and later used when enabling DMA paths.
+ * @reserved: Left zero in the packets we send
+ */
+struct thunderbolt_ip_login {
+	struct thunderbolt_ip_header hdr;
+	u32 proto_version;
+	u32 transmit_path;
+	u32 reserved[4];
+} __packed;
+
+#define TBIP_LOGIN_PROTO_VERSION	1
+
+/**
+ * struct thunderbolt_ip_login_response - Reply to a login packet
+ * @hdr: Common header, type is %TBIP_LOGIN_RESPONSE
+ * @status: Left zero in the replies we send
+ * @receiver_mac: MAC address of the replying network device
+ * @receiver_mac_len: Length of @receiver_mac in bytes, we send %ETH_ALEN
+ * @reserved: Left zero in the replies we send
+ */
+struct thunderbolt_ip_login_response {
+	struct thunderbolt_ip_header hdr;
+	u32 status;
+	u32 receiver_mac[2];
+	u32 receiver_mac_len;
+	u32 reserved[4];
+} __packed;
+
+/* ThunderboltIP logout packet, consists of the common header only
+ * (type %TBIP_LOGOUT).
+ */
+struct thunderbolt_ip_logout {
+	struct thunderbolt_ip_header hdr;
+} __packed;
+
+/* ThunderboltIP status packet (type %TBIP_STATUS). This is what we
+ * send as a reply to a logout packet, with @status left zero.
+ */
+struct thunderbolt_ip_status {
+	struct thunderbolt_ip_header hdr;
+	u32 status;
+} __packed;
+
+/* Counters kept by the driver. The Rx counters are updated in
+ * tbnet_check_frame() and tbnet_poll().
+ */
+struct tbnet_stats {
+	u64 tx_packets;
+	u64 rx_packets;
+	u64 tx_bytes;
+	u64 rx_bytes;
+	u64 tx_errors;
+	u64 rx_length_errors;
+	u64 rx_over_errors;
+	u64 rx_crc_errors;
+	u64 rx_missed_errors;
+};
+
+/**
+ * struct tbnet_frame - One slot of a software ring
+ * @dev: Network device owning the frame, lets the ring callback find
+ *	 its way back to the driver private data
+ * @page: Page backing the frame data, %NULL when the slot is empty
+ * @frame: Frame descriptor handed to the Thunderbolt ring
+ */
+struct tbnet_frame {
+	struct net_device *dev;
+	struct page *page;
+	struct ring_frame frame;
+};
+
+/**
+ * struct tbnet_ring - Software ring on top of a Thunderbolt ring
+ * @frames: Frame slots of the ring
+ * @cons: Free-running consumer counter, masked with
+ *	  %TBNET_RING_SIZE - 1 when used as an index to @frames
+ * @prod: Free-running producer counter, masked the same way
+ * @ring: The underlying Thunderbolt ring
+ */
+struct tbnet_ring {
+	struct tbnet_frame frames[TBNET_RING_SIZE];
+	unsigned int cons;
+	unsigned int prod;
+	struct tb_ring *ring;
+};
+
+/**
+ * struct tbnet - ThunderboltIP network driver private data
+ * @svc: XDomain service the driver is bound to
+ * @xd: XDomain the service belongs to
+ * @handler: ThunderboltIP configuration protocol handler
+ * @dev: Networking device
+ * @napi: NAPI structure for Rx polling
+ * @stats: Network statistics
+ * @skb: Network packet that is currently processed on Rx path
+ * @command_id: ID used for next configuration protocol packet
+ * @login_sent: ThunderboltIP login message successfully sent
+ * @login_received: ThunderboltIP login message received from the remote
+ *		    host
+ * @transmit_path: HopID the other end needs to use building the
+ *		   opposite side path.
+ * @connection_lock: Lock serializing access to @login_sent,
+ *		     @login_received and @transmit_path.
+ * @login_retries: Number of login retries currently done
+ * @login_work: Worker to send ThunderboltIP login packets
+ * @connected_work: Worker that finalizes the ThunderboltIP connection
+ *		    setup and enables DMA paths for high speed data
+ *		    transfers
+ * @rx_hdr: Copy of the header of the most recently processed Rx frame.
+ *	    Used when a network packet consists of multiple Thunderbolt
+ *	    frames. In host byte order.
+ * @rx_ring: Software ring holding Rx frames
+ * @frame_id: Frame ID used for next Tx packet (if
+ *	      %TBNET_MATCH_FRAGS_ID is supported in both ends)
+ * @tx_ring: Software ring holding Tx frames
+ */
+struct tbnet {
+	const struct tb_service *svc;
+	struct tb_xdomain *xd;
+	struct tb_protocol_handler handler;
+	struct net_device *dev;
+	struct napi_struct napi;
+	struct tbnet_stats stats;
+	struct sk_buff *skb;
+	atomic_t command_id;
+	bool login_sent;
+	bool login_received;
+	u32 transmit_path;
+	struct mutex connection_lock;
+	int login_retries;
+	struct delayed_work login_work;
+	struct work_struct connected_work;
+	struct thunderbolt_ip_frame_header rx_hdr;
+	struct tbnet_ring rx_ring;
+	atomic_t frame_id;
+	struct tbnet_ring tx_ring;
+};
+
+/* Network property directory UUID */
+static const uuid_t tbnet_dir_uuid =
+	UUID_INIT(0xc66189ca, 0x1cce, 0x4195,
+		  0xbd, 0xb8, 0x49, 0x59, 0x2e, 0x5f, 0x5a, 0x4f);
+
+/* ThunderboltIP configuration protocol UUID */
+static const uuid_t tbnet_svc_uuid =
+	UUID_INIT(0x798f589e, 0x3616, 0x8a47,
+		  0x97, 0xc6, 0x56, 0x64, 0xa9, 0x20, 0xc8, 0xdd);
+
+/* NOTE(review): presumably the property directory describing the
+ * network service to remote hosts; where it is set up is not visible
+ * in this part of the file - confirm.
+ */
+static struct tb_property_dir *tbnet_dir;
+
+/* Fill in the common header of a ThunderboltIP configuration packet.
+ *
+ * @size is the size of the whole packet in bytes. The length stored in
+ * the header is in dwords and leaves out the route_hi, route_lo and
+ * length_sn fields themselves. @sequence ends up in the sequence number
+ * bits of length_sn.
+ */
+static void tbnet_fill_header(struct thunderbolt_ip_header *hdr, u64 route,
+	u8 sequence, const uuid_t *initiator_uuid, const uuid_t *target_uuid,
+	enum thunderbolt_ip_type type, size_t size, u32 command_id)
+{
+	u32 len_and_sn = (size - 3 * 4) / 4;
+
+	len_and_sn |= (sequence << TBIP_HDR_SN_SHIFT) & TBIP_HDR_SN_MASK;
+
+	/* Scalar fields first */
+	hdr->route_hi = upper_32_bits(route);
+	hdr->route_lo = lower_32_bits(route);
+	hdr->length_sn = len_and_sn;
+	hdr->type = type;
+	hdr->command_id = command_id;
+
+	/* Then protocol, sender and receiver UUIDs */
+	uuid_copy(&hdr->uuid, &tbnet_svc_uuid);
+	uuid_copy(&hdr->initiator_uuid, initiator_uuid);
+	uuid_copy(&hdr->target_uuid, target_uuid);
+}
+
+/* Reply to a login packet received from the remote host.
+ *
+ * The reply echoes @route, @sequence and @command_id of the request and
+ * carries the MAC address of our network device. Returns whatever
+ * tb_xdomain_response() returns.
+ */
+static int tbnet_login_response(struct tbnet *net, u64 route, u8 sequence,
+				u32 command_id)
+{
+	struct thunderbolt_ip_login_response resp;
+	struct tb_xdomain *xd = net->xd;
+
+	/* Status and reserved fields stay zero */
+	memset(&resp, 0, sizeof(resp));
+
+	tbnet_fill_header(&resp.hdr, route, sequence, xd->local_uuid,
+			  xd->remote_uuid, TBIP_LOGIN_RESPONSE, sizeof(resp),
+			  command_id);
+
+	resp.receiver_mac_len = ETH_ALEN;
+	memcpy(resp.receiver_mac, net->dev->dev_addr, ETH_ALEN);
+
+	return tb_xdomain_response(xd, &resp, sizeof(resp),
+				   TB_CFG_PKG_XDOMAIN_RESP);
+}
+
+/* Send a login packet to the remote host and wait for the reply.
+ *
+ * A fresh command ID is taken for every request. The contents of the
+ * reply are not looked at here, only the return value of
+ * tb_xdomain_request() is passed on to the caller.
+ */
+static int tbnet_login_request(struct tbnet *net, u8 sequence)
+{
+	struct tb_xdomain *xd = net->xd;
+	struct thunderbolt_ip_login_response resp;
+	struct thunderbolt_ip_login req;
+	u32 id = atomic_inc_return(&net->command_id);
+
+	memset(&req, 0, sizeof(req));
+	tbnet_fill_header(&req.hdr, xd->route, sequence, xd->local_uuid,
+			  xd->remote_uuid, TBIP_LOGIN, sizeof(req), id);
+
+	req.transmit_path = TBNET_LOCAL_PATH;
+	req.proto_version = TBIP_LOGIN_PROTO_VERSION;
+
+	return tb_xdomain_request(xd, &req, sizeof(req),
+				  TB_CFG_PKG_XDOMAIN_RESP, &resp,
+				  sizeof(resp), TB_CFG_PKG_XDOMAIN_RESP,
+				  TBNET_LOGIN_TIMEOUT);
+}
+
+/* Reply to a logout packet with a status packet.
+ *
+ * NOTE(review): @command_id is not used; the reply gets a freshly
+ * incremented local command ID instead, unlike tbnet_login_response()
+ * which echoes the ID of the request. Confirm this is intended.
+ */
+static int tbnet_logout_response(struct tbnet *net, u64 route, u8 sequence,
+				 u32 command_id)
+{
+	struct tb_xdomain *xd = net->xd;
+	struct thunderbolt_ip_status status;
+	u32 id = atomic_inc_return(&net->command_id);
+
+	memset(&status, 0, sizeof(status));
+	tbnet_fill_header(&status.hdr, route, sequence, xd->local_uuid,
+			  xd->remote_uuid, TBIP_STATUS, sizeof(status), id);
+
+	return tb_xdomain_response(xd, &status, sizeof(status),
+				   TB_CFG_PKG_XDOMAIN_RESP);
+}
+
+/* Send a logout packet to the remote host and wait for the status
+ * reply. The sequence number is always %0. Returns whatever
+ * tb_xdomain_request() returns.
+ */
+static int tbnet_logout_request(struct tbnet *net)
+{
+	struct tb_xdomain *xd = net->xd;
+	struct thunderbolt_ip_status status;
+	struct thunderbolt_ip_logout req;
+	u32 id = atomic_inc_return(&net->command_id);
+
+	memset(&req, 0, sizeof(req));
+	tbnet_fill_header(&req.hdr, xd->route, 0, xd->local_uuid,
+			  xd->remote_uuid, TBIP_LOGOUT, sizeof(req), id);
+
+	return tb_xdomain_request(xd, &req, sizeof(req),
+				  TB_CFG_PKG_XDOMAIN_RESP, &status,
+				  sizeof(status), TB_CFG_PKG_XDOMAIN_RESP,
+				  TBNET_LOGOUT_TIMEOUT);
+}
+
+/* Reset the login state and schedule the login worker to run after one
+ * second.
+ */
+static void start_login(struct tbnet *net)
+{
+	unsigned long delay = msecs_to_jiffies(1000);
+
+	mutex_lock(&net->connection_lock);
+	net->login_received = false;
+	net->login_sent = false;
+	mutex_unlock(&net->connection_lock);
+
+	queue_delayed_work(system_long_wq, &net->login_work, delay);
+}
+
+/* Cancel the login worker and the connected worker, waiting for them
+ * to finish if they are running.
+ */
+static void stop_login(struct tbnet *net)
+{
+	cancel_delayed_work_sync(&net->login_work);
+	cancel_work_sync(&net->connected_work);
+}
+
+/* Size of the frame in bytes. A size field of zero stands for a full
+ * %TBNET_FRAME_SIZE frame.
+ */
+static inline unsigned int tbnet_frame_size(const struct tbnet_frame *tf)
+{
+	if (tf->frame.size)
+		return tf->frame.size;
+
+	return TBNET_FRAME_SIZE;
+}
+
+/* Release every page still held by @ring and reset the ring counters.
+ *
+ * Slots without a page are skipped. A page is DMA unmapped only when
+ * the slot holds a DMA address: Tx pages are allocated unmapped by
+ * tbnet_alloc_tx_buffers() and tbnet_get_tx_buffer() clears the address
+ * again, so unmapping unconditionally would hand addresses that were
+ * never mapped to dma_unmap_page(). The address is cleared after the
+ * unmap so the slot cannot be unmapped a second time.
+ */
+static void tbnet_free_buffers(struct tbnet_ring *ring)
+{
+	struct device *dma_dev = tb_ring_dma_device(ring->ring);
+	unsigned int i;
+
+	for (i = 0; i < TBNET_RING_SIZE; i++) {
+		struct tbnet_frame *tf = &ring->frames[i];
+		enum dma_data_direction dir;
+		size_t size;
+
+		if (!tf->page)
+			continue;
+
+		if (ring->ring->is_tx) {
+			dir = DMA_TO_DEVICE;
+			size = tbnet_frame_size(tf);
+		} else {
+			dir = DMA_FROM_DEVICE;
+			size = TBNET_FRAME_SIZE;
+		}
+
+		/* Zero is used by this driver to mark "not mapped" */
+		if (tf->frame.buffer_phy) {
+			dma_unmap_page(dma_dev, tf->frame.buffer_phy, size,
+				       dir);
+			tf->frame.buffer_phy = 0;
+		}
+
+		__free_page(tf->page);
+		tf->page = NULL;
+	}
+
+	ring->cons = 0;
+	ring->prod = 0;
+}
+
+/* Tear down the ThunderboltIP connection.
+ *
+ * Turns the carrier off, stops the Tx queue and cancels the login and
+ * connected workers. If both logins had completed, a logout request is
+ * sent to the remote host when @send_logout is set (repeated up to
+ * %TBNET_LOGOUT_RETRIES times as long as it times out), both rings are
+ * stopped, their buffers are freed and the DMA paths are disabled. The
+ * login state is reset in every case.
+ */
+static void tbnet_tear_down(struct tbnet *net, bool send_logout)
+{
+	netif_carrier_off(net->dev);
+	netif_stop_queue(net->dev);
+
+	stop_login(net);
+
+	mutex_lock(&net->connection_lock);
+
+	if (net->login_sent && net->login_received) {
+		int retries = TBNET_LOGOUT_RETRIES;
+
+		/* Only a timeout is worth retrying */
+		while (send_logout && retries-- > 0) {
+			int ret = tbnet_logout_request(net);
+			if (ret != -ETIMEDOUT)
+				break;
+		}
+
+		tb_ring_stop(net->rx_ring.ring);
+		tb_ring_stop(net->tx_ring.ring);
+		tbnet_free_buffers(&net->rx_ring);
+		tbnet_free_buffers(&net->tx_ring);
+
+		if (tb_xdomain_disable_paths(net->xd))
+			netdev_warn(net->dev, "failed to disable DMA paths\n");
+	}
+
+	net->login_retries = 0;
+	net->login_sent = false;
+	net->login_received = false;
+
+	mutex_unlock(&net->connection_lock);
+}
+
+/* Handle a ThunderboltIP configuration packet.
+ *
+ * @buf and @size describe the received packet and @data is the
+ * &struct tbnet the handler belongs to.
+ *
+ * Returns %0 when the packet is not for us (too short, UUIDs or route
+ * do not match) or its type is not one we handle, and %1 once a login
+ * or logout packet has been processed, even if sending the response
+ * failed.
+ *
+ * NOTE(review): only the common header is covered by the size check
+ * but the login path also reads transmit_path, which follows the
+ * header - confirm the caller guarantees a large enough buffer.
+ */
+static int tbnet_handle_packet(const void *buf, size_t size, void *data)
+{
+	const struct thunderbolt_ip_login *pkg = buf;
+	struct tbnet *net = data;
+	u32 command_id;
+	int ret = 0;
+	u8 sequence;
+	u64 route;
+
+	/* Make sure the packet is for us */
+	if (size < sizeof(struct thunderbolt_ip_header))
+		return 0;
+	if (!uuid_equal(&pkg->hdr.initiator_uuid, net->xd->remote_uuid))
+		return 0;
+	if (!uuid_equal(&pkg->hdr.target_uuid, net->xd->local_uuid))
+		return 0;
+
+	/* The topmost route bit is ignored in the comparison */
+	route = ((u64)pkg->hdr.route_hi << 32) | pkg->hdr.route_lo;
+	route &= ~BIT_ULL(63);
+	if (route != net->xd->route)
+		return 0;
+
+	/* Mask and shift in full width before narrowing to u8. The
+	 * sequence number sits in bits 28:27, so storing the masked
+	 * value into a u8 first would always leave zero.
+	 */
+	sequence = (pkg->hdr.length_sn & TBIP_HDR_SN_MASK) >>
+		   TBIP_HDR_SN_SHIFT;
+	command_id = pkg->hdr.command_id;
+
+	switch (pkg->hdr.type) {
+	case TBIP_LOGIN:
+		/* Logins are ignored while the interface is down */
+		if (!netif_running(net->dev))
+			break;
+
+		ret = tbnet_login_response(net, route, sequence, command_id);
+		if (!ret) {
+			mutex_lock(&net->connection_lock);
+			net->login_received = true;
+			net->transmit_path = pkg->transmit_path;
+
+			/* If we reached the number of max retries or
+			 * previous logout, schedule another round of
+			 * login retries
+			 */
+			if (net->login_retries >= TBNET_LOGIN_RETRIES ||
+			    !net->login_sent) {
+				net->login_retries = 0;
+				queue_delayed_work(system_long_wq,
+						   &net->login_work, 0);
+			}
+			mutex_unlock(&net->connection_lock);
+
+			queue_work(system_long_wq, &net->connected_work);
+		}
+		break;
+
+	case TBIP_LOGOUT:
+		ret = tbnet_logout_response(net, route, sequence, command_id);
+		if (!ret)
+			tbnet_tear_down(net, false);
+		break;
+
+	default:
+		return 0;
+	}
+
+	if (ret)
+		netdev_warn(net->dev, "failed to send ThunderboltIP response\n");
+
+	return 1;
+}
+
+/* Distance between the producer and consumer counters of @ring. Both
+ * are free-running unsigned counters so the subtraction also works
+ * across a wrap.
+ */
+static unsigned int tbnet_available_buffers(const struct tbnet_ring *ring)
+{
+	unsigned int produced = ring->prod;
+	unsigned int consumed = ring->cons;
+
+	return produced - consumed;
+}
+
+/* Allocate up to @nbuffers Rx buffers and queue them to the Rx ring.
+ *
+ * Slots are filled starting from the producer index. The loop stops
+ * early at the first slot that still holds a page. Returns %0 on
+ * success and -%ENOMEM if a page could not be allocated or mapped, in
+ * which case all buffers of the ring are freed.
+ *
+ * NOTE(review): when dma_map_page() fails the slot already has its new
+ * page but buffer_phy has not been updated, and tbnet_free_buffers()
+ * then runs over that slot - confirm this is handled correctly.
+ */
+static int tbnet_alloc_rx_buffers(struct tbnet *net, unsigned int nbuffers)
+{
+	struct tbnet_ring *ring = &net->rx_ring;
+	int ret;
+
+	while (nbuffers--) {
+		struct device *dma_dev = tb_ring_dma_device(ring->ring);
+		unsigned int index = ring->prod & (TBNET_RING_SIZE - 1);
+		struct tbnet_frame *tf = &ring->frames[index];
+		dma_addr_t dma_addr;
+
+		/* Slot is still in use, nothing more to refill */
+		if (tf->page)
+			break;
+
+		tf->page = dev_alloc_page();
+		if (!tf->page) {
+			ret = -ENOMEM;
+			goto err_free;
+		}
+
+		dma_addr = dma_map_page(dma_dev, tf->page, 0,
+					TBNET_FRAME_SIZE, DMA_FROM_DEVICE);
+		if (dma_mapping_error(dma_dev, dma_addr)) {
+			ret = -ENOMEM;
+			goto err_free;
+		}
+
+		tf->frame.buffer_phy = dma_addr;
+		tf->dev = net->dev;
+
+		tb_ring_rx(ring->ring, &tf->frame);
+
+		ring->prod++;
+	}
+
+	return 0;
+
+err_free:
+	tbnet_free_buffers(ring);
+	return ret;
+}
+
+/* Take the next free frame from the Tx ring.
+ *
+ * Returns %NULL when no buffers are available. The size and DMA
+ * address of the returned frame are reset to zero.
+ */
+static struct tbnet_frame *tbnet_get_tx_buffer(struct tbnet *net)
+{
+	struct tbnet_ring *txr = &net->tx_ring;
+	struct tbnet_frame *tf;
+
+	if (tbnet_available_buffers(txr) == 0)
+		return NULL;
+
+	tf = &txr->frames[txr->cons & (TBNET_RING_SIZE - 1)];
+	txr->cons++;
+
+	tf->frame.buffer_phy = 0;
+	tf->frame.size = 0;
+
+	return tf;
+}
+
+/* Callback of a Tx frame.
+ *
+ * Unmaps the frame, hands the buffer back to the software ring and
+ * wakes the Tx queue once at least half of the ring is available
+ * again. @canceled is not used.
+ */
+static void tbnet_tx_callback(struct tb_ring *ring, struct ring_frame *frame,
+			      bool canceled)
+{
+	struct tbnet_frame *tf = container_of(frame, typeof(*tf), frame);
+	struct tbnet *net = netdev_priv(tf->dev);
+	struct tbnet_ring *txr = &net->tx_ring;
+
+	dma_unmap_page(tb_ring_dma_device(ring), frame->buffer_phy,
+		       tbnet_frame_size(tf), DMA_TO_DEVICE);
+
+	/* Return buffer to the ring */
+	txr->prod++;
+
+	if (tbnet_available_buffers(txr) < TBNET_RING_SIZE / 2)
+		return;
+
+	netif_wake_queue(net->dev);
+}
+
+/* Allocate a page for every slot of the Tx ring and prepare the frame
+ * descriptors.
+ *
+ * The pages are not DMA mapped here. On success the counters are set
+ * so that %TBNET_RING_SIZE - 1 buffers are available. Returns -%ENOMEM
+ * and frees the buffers of the ring if a page allocation fails.
+ */
+static int tbnet_alloc_tx_buffers(struct tbnet *net)
+{
+	struct tbnet_ring *txr = &net->tx_ring;
+	struct tbnet_frame *tf = txr->frames;
+	unsigned int i;
+
+	for (i = 0; i < TBNET_RING_SIZE; i++, tf++) {
+		tf->page = alloc_page(GFP_KERNEL);
+		if (!tf->page)
+			goto err_free;
+
+		tf->dev = net->dev;
+		tf->frame.callback = tbnet_tx_callback;
+		tf->frame.sof = TBIP_PDF_FRAME_START;
+		tf->frame.eof = TBIP_PDF_FRAME_END;
+	}
+
+	txr->cons = 0;
+	txr->prod = TBNET_RING_SIZE - 1;
+
+	return 0;
+
+err_free:
+	tbnet_free_buffers(txr);
+	return -ENOMEM;
+}
+
+/* Worker finalizing the connection once logins have been exchanged.
+ *
+ * Does nothing if the carrier is already on or if either direction of
+ * the login is still missing. Otherwise enables the DMA paths, starts
+ * both rings, allocates Rx and Tx buffers and finally turns the
+ * carrier on and starts the Tx queue. If buffer allocation fails the
+ * rings are stopped again.
+ *
+ * NOTE(review): the error path below does not disable the DMA paths
+ * that were enabled above - confirm whether that is intentional.
+ */
+static void tbnet_connected_work(struct work_struct *work)
+{
+	struct tbnet *net = container_of(work, typeof(*net), connected_work);
+	bool connected;
+	int ret;
+
+	if (netif_carrier_ok(net->dev))
+		return;
+
+	mutex_lock(&net->connection_lock);
+	connected = net->login_sent && net->login_received;
+	mutex_unlock(&net->connection_lock);
+
+	if (!connected)
+		return;
+
+	/* Both logins successful so enable the high-speed DMA paths and
+	 * start the network device queue.
+	 */
+	ret = tb_xdomain_enable_paths(net->xd, TBNET_LOCAL_PATH,
+				      net->rx_ring.ring->hop,
+				      net->transmit_path,
+				      net->tx_ring.ring->hop);
+	if (ret) {
+		netdev_err(net->dev, "failed to enable DMA paths\n");
+		return;
+	}
+
+	tb_ring_start(net->tx_ring.ring);
+	tb_ring_start(net->rx_ring.ring);
+
+	ret = tbnet_alloc_rx_buffers(net, TBNET_RING_SIZE);
+	if (ret)
+		goto err_stop_rings;
+
+	ret = tbnet_alloc_tx_buffers(net);
+	if (ret)
+		goto err_free_rx_buffers;
+
+	netif_carrier_on(net->dev);
+	netif_start_queue(net->dev);
+	return;
+
+err_free_rx_buffers:
+	tbnet_free_buffers(&net->rx_ring);
+err_stop_rings:
+	tb_ring_stop(net->rx_ring.ring);
+	tb_ring_stop(net->tx_ring.ring);
+}
+
+/* Worker sending the ThunderboltIP login request.
+ *
+ * Does nothing when the carrier is already on. On success the login is
+ * marked as sent and the connected worker is scheduled. On failure the
+ * worker re-arms itself after %TBNET_LOGIN_DELAY ms until
+ * %TBNET_LOGIN_RETRIES retries have been used up. The sequence number
+ * of the request is the retry count modulo 4.
+ */
+static void tbnet_login_work(struct work_struct *work)
+{
+	struct tbnet *net = container_of(work, typeof(*net), login_work.work);
+	unsigned long delay = msecs_to_jiffies(TBNET_LOGIN_DELAY);
+
+	if (netif_carrier_ok(net->dev))
+		return;
+
+	if (!tbnet_login_request(net, net->login_retries % 4)) {
+		net->login_retries = 0;
+
+		mutex_lock(&net->connection_lock);
+		net->login_sent = true;
+		mutex_unlock(&net->connection_lock);
+
+		queue_work(system_long_wq, &net->connected_work);
+		return;
+	}
+
+	/* Request failed, see if there are retries left */
+	if (net->login_retries++ >= TBNET_LOGIN_RETRIES) {
+		netdev_info(net->dev, "ThunderboltIP login timed out\n");
+		return;
+	}
+
+	queue_delayed_work(system_long_wq, &net->login_work, delay);
+}
+
+/* Validate a received frame before it is added to a packet.
+ *
+ * Checks the ring descriptor error flags, the frame size and the frame
+ * header. When a packet is already being assembled the header is
+ * checked against the header of the previous frame, otherwise it is
+ * checked as the first frame of a new packet. The matching error
+ * counter is incremented on failure.
+ *
+ * Returns %true if the frame can be used, %false otherwise.
+ */
+static bool tbnet_check_frame(struct tbnet *net, const struct tbnet_frame *tf)
+{
+	u32 frame_id, frame_count, frame_size, frame_index;
+	const struct thunderbolt_ip_frame_header *hdr;
+	unsigned int size;
+
+	if (tf->frame.flags & RING_DESC_CRC_ERROR) {
+		net->stats.rx_crc_errors++;
+		return false;
+	} else if (tf->frame.flags & RING_DESC_BUFFER_OVERRUN) {
+		net->stats.rx_over_errors++;
+		return false;
+	}
+
+	/* Should be greater than just header i.e. contains data */
+	size = tbnet_frame_size(tf);
+	if (size <= sizeof(*hdr)) {
+		net->stats.rx_length_errors++;
+		return false;
+	}
+
+	hdr = page_address(tf->page);
+	frame_count = le32_to_cpu(hdr->frame_count);
+	frame_size = le32_to_cpu(hdr->frame_size);
+	frame_index = le16_to_cpu(hdr->frame_index);
+	frame_id = le16_to_cpu(hdr->frame_id);
+
+	/* Payload must be non-empty and fit in what was received */
+	if ((frame_size > size - sizeof(*hdr)) || !frame_size) {
+		net->stats.rx_length_errors++;
+		return false;
+	}
+
+	/* In case we're in the middle of packet, validate the frame
+	 * header based on first fragment of the packet.
+	 */
+	if (net->skb && net->rx_hdr.frame_count) {
+		/* Check the frame count matches the previous frame */
+		if (frame_count != net->rx_hdr.frame_count) {
+			net->stats.rx_length_errors++;
+			return false;
+		}
+
+		/* Check the frame identifiers are incremented correctly,
+		 * and id is matching.
+		 */
+		if (frame_index != net->rx_hdr.frame_index + 1 ||
+		    frame_id != net->rx_hdr.frame_id) {
+			net->stats.rx_missed_errors++;
+			return false;
+		}
+
+		/* Assembled packet must not grow past the maximum MTU */
+		if (net->skb->len + frame_size > TBNET_MAX_MTU) {
+			net->stats.rx_length_errors++;
+			return false;
+		}
+
+		return true;
+	}
+
+	/* Start of packet, validate the frame header */
+	if (frame_count == 0 || frame_count > TBNET_RING_SIZE / 4) {
+		net->stats.rx_length_errors++;
+		return false;
+	}
+	if (frame_index != 0) {
+		net->stats.rx_missed_errors++;
+		return false;
+	}
+	/* First frame of a multi-frame packet must be large enough for
+	 * the header pull done in tbnet_pull_tail().
+	 */
+	if (frame_count > 1 && frame_size < TBNET_RX_HDR_SIZE) {
+		net->stats.rx_length_errors++;
+		return false;
+	}
+
+	return true;
+}
+
+/* Move the packet headers from the first page fragment of @skb to its
+ * linear area.
+ *
+ * The header length is determined with eth_get_headlen(), looking at
+ * no more than %TBNET_RX_HDR_SIZE bytes. The copy length is rounded up
+ * to a multiple of sizeof(long) but the fragment and skb bookkeeping
+ * is adjusted by the exact header length only.
+ */
+static void tbnet_pull_tail(struct sk_buff *skb)
+{
+	skb_frag_t *frag = &skb_shinfo(skb)->frags[0];
+	unsigned int pull_len;
+	void *hdr;
+
+	hdr = skb_frag_address(frag);
+	pull_len = eth_get_headlen(hdr, TBNET_RX_HDR_SIZE);
+
+	/* Align pull length to size of long to optimize memcpy performance */
+	skb_copy_to_linear_data(skb, hdr, ALIGN(pull_len, sizeof(long)));
+
+	/* Update all of the pointers */
+	skb_frag_size_sub(frag, pull_len);
+	frag->page_offset += pull_len;
+	skb->data_len -= pull_len;
+	skb->tail += pull_len;
+}
+
+/* NAPI poll callback for the Rx ring.
+ *
+ * Pulls completed frames from the Rx ring, validates them and
+ * assembles them into network packets. A packet that fits in a single
+ * small frame is copied to the linear area of the skb, everything else
+ * is attached as page fragments. Completed packets are passed up with
+ * napi_gro_receive(). Rx buffers are handed back to the hardware in
+ * batches along the way.
+ *
+ * Note that the budget is accounted per received frame, not per
+ * assembled packet.
+ *
+ * Returns @budget if it was used up, otherwise completes NAPI,
+ * re-enables the ring interrupt and returns the number of frames
+ * processed.
+ */
+static int tbnet_poll(struct napi_struct *napi, int budget)
+{
+	struct tbnet *net = container_of(napi, struct tbnet, napi);
+	unsigned int cleaned_count = tbnet_available_buffers(&net->rx_ring);
+	struct device *dma_dev = tb_ring_dma_device(net->rx_ring.ring);
+	unsigned int rx_packets = 0;
+
+	while (rx_packets < budget) {
+		u32 size, frame_size, frame_count, frame_index;
+		const struct thunderbolt_ip_frame_header *hdr;
+		unsigned int hdr_size = sizeof(*hdr);
+		struct sk_buff *skb = NULL;
+		struct ring_frame *frame;
+		struct tbnet_frame *tf;
+		bool last = true;
+
+		/* Return some buffers to hardware, one at a time is too
+		 * slow so allocate MAX_SKB_FRAGS buffers at the same
+		 * time.
+		 */
+		if (cleaned_count >= MAX_SKB_FRAGS) {
+			tbnet_alloc_rx_buffers(net, cleaned_count);
+			cleaned_count = 0;
+		}
+
+		frame = tb_ring_poll(net->rx_ring.ring);
+		if (!frame)
+			break;
+
+		dma_unmap_page(dma_dev, frame->buffer_phy, TBNET_FRAME_SIZE,
+			       DMA_FROM_DEVICE);
+
+		tf = container_of(frame, typeof(*tf), frame);
+		size = tbnet_frame_size(tf);
+		hdr = page_address(tf->page);
+
+		/* A bad frame also drops the packet being assembled */
+		if (!tbnet_check_frame(net, tf)) {
+			__free_page(tf->page);
+			tf->page = NULL;
+			net->rx_ring.cons++;
+			cleaned_count++;
+			dev_kfree_skb_any(net->skb);
+			net->skb = NULL;
+			continue;
+		}
+
+		frame_count = le32_to_cpu(hdr->frame_count);
+		frame_size = le32_to_cpu(hdr->frame_size);
+		frame_index = le16_to_cpu(hdr->frame_index);
+		last = frame_index == frame_count - 1;
+
+		skb = net->skb;
+		if (!skb) {
+			skb = netdev_alloc_skb_ip_align(net->dev,
+							TBNET_RX_HDR_SIZE);
+			net->skb = skb;
+		}
+		if (!skb)
+			break;
+
+		/* Single small buffer we can copy directly to the
+		 * header part of the skb. Use the converted frame count
+		 * here, the raw header field is little-endian.
+		 */
+		if (frame_count == 1 && frame_size <= TBNET_RX_HDR_SIZE) {
+			const void *data = hdr + 1;
+
+			memcpy(__skb_put(skb, frame_size), data,
+			       ALIGN(frame_size, sizeof(long)));
+
+			__free_page(tf->page);
+		} else {
+			skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
+					tf->page, hdr_size, frame_size, size);
+			if (last)
+				tbnet_pull_tail(skb);
+		}
+
+		/* The page is either freed or owned by the skb now */
+		tf->page = NULL;
+		net->rx_ring.cons++;
+		cleaned_count++;
+
+		/* Remember the header for validating the next frame */
+		net->rx_hdr.frame_count = frame_count;
+		net->rx_hdr.frame_size = frame_size;
+		net->rx_hdr.frame_index = frame_index;
+		net->rx_hdr.frame_id = le16_to_cpu(hdr->frame_id);
+
+		rx_packets++;
+		net->stats.rx_bytes += frame_size;
+
+		if (last) {
+			skb->protocol = eth_type_trans(skb, net->dev);
+			napi_gro_receive(&net->napi, skb);
+			net->skb = NULL;
+		}
+	}
+
+	net->stats.rx_packets += rx_packets;
+
+	if (cleaned_count)
+		tbnet_alloc_rx_buffers(net, cleaned_count);
+
+	if (rx_packets >= budget)
+		return budget;
+
+	napi_complete_done(napi, rx_packets);
+	/* Re-enable the ring interrupt */
+	tb_ring_poll_complete(net->rx_ring.ring);
+
+	return rx_packets;
+}
+
+/* Callback given to tb_ring_alloc_rx() in tbnet_open(); schedules NAPI
+ * polling for the device passed in @data.
+ */
+static void tbnet_start_poll(void *data)
+{
+	struct tbnet *net = data;
+
+	napi_schedule(&net->napi);
+}
+
+/* Bring the interface up.
+ *
+ * Allocates the Tx and Rx rings, enables NAPI and kicks off the
+ * ThunderboltIP login. The carrier stays off until the connection has
+ * been established. Returns -%ENOMEM if either ring cannot be
+ * allocated.
+ */
+static int tbnet_open(struct net_device *dev)
+{
+	struct tbnet *net = netdev_priv(dev);
+	struct tb_xdomain *xd = net->xd;
+	u16 sof_mask = BIT(TBIP_PDF_FRAME_START);
+	u16 eof_mask = BIT(TBIP_PDF_FRAME_END);
+	struct tb_ring *tx, *rx;
+
+	netif_carrier_off(dev);
+
+	tx = tb_ring_alloc_tx(xd->tb->nhi, -1, TBNET_RING_SIZE,
+			      RING_FLAG_FRAME);
+	if (!tx) {
+		netdev_err(dev, "failed to allocate Tx ring\n");
+		return -ENOMEM;
+	}
+	net->tx_ring.ring = tx;
+
+	rx = tb_ring_alloc_rx(xd->tb->nhi, -1, TBNET_RING_SIZE,
+			      RING_FLAG_FRAME | RING_FLAG_E2E, sof_mask,
+			      eof_mask, tbnet_start_poll, net);
+	if (!rx) {
+		netdev_err(dev, "failed to allocate Rx ring\n");
+		goto err_free_tx;
+	}
+	net->rx_ring.ring = rx;
+
+	napi_enable(&net->napi);
+	start_login(net);
+
+	return 0;
+
+err_free_tx:
+	tb_ring_free(net->tx_ring.ring);
+	net->tx_ring.ring = NULL;
+	return -ENOMEM;
+}
+
+/* Bring the interface down.
+ *
+ * Disables NAPI, tears down the connection (sending a logout to the
+ * remote host) and frees both rings. Always returns %0.
+ */
+static int tbnet_stop(struct net_device *dev)
+{
+	struct tbnet *net = netdev_priv(dev);
+
+	napi_disable(&net->napi);
+
+	tbnet_tear_down(net, true);
+
+	tb_ring_free(net->rx_ring.ring);
+	net->rx_ring.ring = NULL;
+	tb_ring_free(net->tx_ring.ring);
+	net->tx_ring.ring = NULL;
+
+	return 0;
+}
+
+/* DMA map the page of a Tx frame for transfer to the device.
+ *
+ * The mapped length is the frame size. On success the DMA address is
+ * stored in the frame and %true is returned; on a mapping error the
+ * frame is left untouched and %false is returned.
+ */
+static bool tbnet_xmit_map(struct device *dma_dev, struct tbnet_frame *tf)
+{
+	dma_addr_t addr = dma_map_page(dma_dev, tf->page, 0,
+				       tbnet_frame_size(tf), DMA_TO_DEVICE);
+	bool mapped = !dma_mapping_error(dma_dev, addr);
+
+	if (mapped)
+		tf->frame.buffer_phy = addr;
+
+	return mapped;
+}
+
+/* Finish the Tx frames of a packet: fill in the frame count, compute
+ * the checksums in software when the stack asked for it
+ * (%CHECKSUM_PARTIAL) and DMA map the frames.
+ *
+ * @frames holds the @frame_count frames the packet data was copied to,
+ * the first one starting with the packet headers. The checksum fields
+ * are located through their offset in @skb and written to the copy in
+ * the first frame.
+ *
+ * Returns %true when all frames are mapped and ready to be queued.
+ * Returns %false if the protocol is not handled or mapping failed; in
+ * the latter case the frames mapped so far are unmapped again.
+ */
+static bool tbnet_xmit_csum_and_map(struct tbnet *net, struct sk_buff *skb,
+	struct tbnet_frame **frames, u32 frame_count)
+{
+	struct thunderbolt_ip_frame_header *hdr = page_address(frames[0]->page);
+	struct device *dma_dev = tb_ring_dma_device(net->tx_ring.ring);
+	__wsum wsum = htonl(skb->len - skb_transport_offset(skb));
+	unsigned int i, len, offset = skb_transport_offset(skb);
+	__be16 protocol = skb->protocol;
+	void *data = skb->data;
+	void *dest = hdr + 1;
+	__sum16 *tucso;
+
+	if (skb->ip_summed != CHECKSUM_PARTIAL) {
+		/* No need to calculate checksum so we just update the
+		 * total frame count and map the frames for DMA.
+		 */
+		for (i = 0; i < frame_count; i++) {
+			hdr = page_address(frames[i]->page);
+			hdr->frame_count = cpu_to_le32(frame_count);
+			if (!tbnet_xmit_map(dma_dev, frames[i]))
+				goto err_unmap;
+		}
+
+		return true;
+	}
+
+	/* Look through a VLAN tag to find the real protocol */
+	if (protocol == htons(ETH_P_8021Q)) {
+		struct vlan_hdr *vhdr, vh;
+
+		vhdr = skb_header_pointer(skb, ETH_HLEN, sizeof(vh), &vh);
+		if (!vhdr)
+			return false;
+
+		protocol = vhdr->h_vlan_encapsulated_proto;
+	}
+
+	/* Data points to the beginning of the packet.
+	 * The checksum fields are located by their offset from it.
+	 * ipcso will update IP checksum.
+	 * tucso will update TCP/UDP checksum.
+	 */
+	if (protocol == htons(ETH_P_IP)) {
+		__sum16 *ipcso = dest + ((void *)&(ip_hdr(skb)->check) - data);
+
+		*ipcso = 0;
+		*ipcso = ip_fast_csum(dest + skb_network_offset(skb),
+				      ip_hdr(skb)->ihl);
+
+		if (ip_hdr(skb)->protocol == IPPROTO_TCP)
+			tucso = dest + ((void *)&(tcp_hdr(skb)->check) - data);
+		else if (ip_hdr(skb)->protocol == IPPROTO_UDP)
+			tucso = dest + ((void *)&(udp_hdr(skb)->check) - data);
+		else
+			return false;
+
+		*tucso = ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
+					    ip_hdr(skb)->daddr, 0,
+					    ip_hdr(skb)->protocol, 0);
+	} else if (skb_is_gso_v6(skb)) {
+		/* NOTE(review): this branch writes the pseudo header
+		 * checksum and then returns false without mapping the
+		 * frames - confirm this is intended.
+		 */
+		tucso = dest + ((void *)&(tcp_hdr(skb)->check) - data);
+		*tucso = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+					  &ipv6_hdr(skb)->daddr, 0,
+					  IPPROTO_TCP, 0);
+		return false;
+	} else if (protocol == htons(ETH_P_IPV6)) {
+		tucso = dest + skb_checksum_start_offset(skb) + skb->csum_offset;
+		*tucso = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+					  &ipv6_hdr(skb)->daddr, 0,
+					  ipv6_hdr(skb)->nexthdr, 0);
+	} else {
+		return false;
+	}
+
+	/* First frame was headers, rest of the frames contain data.
+	 * Calculate checksum over each frame.
+	 */
+	for (i = 0; i < frame_count; i++) {
+		hdr = page_address(frames[i]->page);
+		dest = (void *)(hdr + 1) + offset;
+		len = le32_to_cpu(hdr->frame_size) - offset;
+		wsum = csum_partial(dest, len, wsum);
+		hdr->frame_count = cpu_to_le32(frame_count);
+
+		/* Only the first frame has headers to skip */
+		offset = 0;
+	}
+
+	*tucso = csum_fold(wsum);
+
+	/* Checksum is finally calculated and we don't touch the memory
+	 * anymore, so DMA map the frames now.
+	 */
+	for (i = 0; i < frame_count; i++) {
+		if (!tbnet_xmit_map(dma_dev, frames[i]))
+			goto err_unmap;
+	}
+
+	return true;
+
+err_unmap:
+	/* Undo the mappings done before the failing frame */
+	while (i--)
+		dma_unmap_page(dma_dev, frames[i]->frame.buffer_phy,
+			       tbnet_frame_size(frames[i]), DMA_TO_DEVICE);
+
+	return false;
+}
+
+/* Map page fragment @frag_num of @skb with kmap_atomic().
+ *
+ * Stores the size of the fragment in @len and returns the address of
+ * the fragment data, i.e. the mapped page plus the fragment offset.
+ */
+static void *tbnet_kmap_frag(struct sk_buff *skb, unsigned int frag_num,
+			     unsigned int *len)
+{
+	const skb_frag_t *frag = &skb_shinfo(skb)->frags[frag_num];
+	void *vaddr;
+
+	*len = skb_frag_size(frag);
+
+	vaddr = kmap_atomic(skb_frag_page(frag));
+	return vaddr + frag->page_offset;
+}
+
+static netdev_tx_t tbnet_start_xmit(struct sk_buff *skb,
+				    struct net_device *dev)
+{
+	struct tbnet *net = netdev_priv(dev);
+	struct tbnet_frame *frames[MAX_SKB_FRAGS];
+	u16 frame_id = atomic_read(&net->frame_id);
+	struct thunderbolt_ip_frame_header *hdr;
+	unsigned int len = skb_headlen(skb);
+	unsigned int data_len = skb->len;
+	unsigned int nframes, i;
+	unsigned int frag = 0;
+	void *src = skb->data;
+	u32 frame_index = 0;
+	bool unmap = false;
+	void *dest;
+
+	nframes = DIV_ROUND_UP(data_len, TBNET_MAX_PAYLOAD_SIZE);
+	if (tbnet_available_buffers(&net->tx_ring) < nframes) {
+		netif_stop_queue(net->dev);
+		return NETDEV_TX_BUSY;
+	}
+
+	frames[frame_index] = tbnet_get_tx_buffer(net);
+	if (!frames[frame_index])
+		goto err_drop;
+
+	hdr = page_address(frames[frame_index]->page);
+	dest = hdr + 1;
+
+	/* If overall packet is bigger than the frame data size */
+	while (data_len > TBNET_MAX_PAYLOAD_SIZE) {
+		unsigned int size_left = TBNET_MAX_PAYLOAD_SIZE;
+
+		hdr->frame_size = cpu_to_le32(TBNET_MAX_PAYLOAD_SIZE);
+		hdr->frame_index = cpu_to_le16(frame_index);
+		hdr->frame_id = cpu_to_le16(frame_id);
+
+		do {
+			if (len > size_left) {
+				/* Copy data onto Tx buffer data with
+				 * full frame size then break and go to
+				 * next frame
+				 */
+				memcpy(dest, src, size_left);
+				len -= size_left;
+				dest += size_left;
+				src += size_left;
+				break;
+			}
+
+			memcpy(dest, src, len);
+			size_left -= len;
+			dest += len;
+
+			if (unmap) {
+				kunmap_atomic(src);
+				unmap = false;
+			}
+
+			/* Ensure all fragments have been processed */
+			if (frag < skb_shinfo(skb)->nr_frags) {
+				/* Map and then unmap quickly */
+				src = tbnet_kmap_frag(skb, frag++, &len);
+				unmap = true;
+			} else if (unlikely(size_left > 0)) {
+				goto err_drop;
+			}
+		} while (size_left > 0);
+
+		data_len -= TBNET_MAX_PAYLOAD_SIZE;
+		frame_index++;
+
+		frames[frame_index] = tbnet_get_tx_buffer(net);
+		if (!frames[frame_index])
+			goto err_drop;
+
+		hdr = page_address(frames[frame_index]->page);
+		dest = hdr + 1;
+	}
+
+	hdr->frame_size = cpu_to_le32(data_len);
+	hdr->frame_index = cpu_to_le16(frame_index);
+	hdr->frame_id = cpu_to_le16(frame_id);
+
+	frames[frame_index]->frame.size = data_len + sizeof(*hdr);
+
+	/* In case  the remaining data_len is smaller than a frame */
+	while (len < data_len) {
+		memcpy(dest, src, len);
+		data_len -= len;
+		dest += len;
+
+		if (unmap) {
+			kunmap_atomic(src);
+			unmap = false;
+		}
+
+		if (frag < skb_shinfo(skb)->nr_frags) {
+			src = tbnet_kmap_frag(skb, frag++, &len);
+			unmap = true;
+		} else if (unlikely(data_len > 0)) {
+			goto err_drop;
+		}
+	}
+
+	memcpy(dest, src, data_len);
+
+	if (unmap)
+		kunmap_atomic(src);
+
+	if (!tbnet_xmit_csum_and_map(net, skb, frames, frame_index + 1))
+		goto err_drop;
+
+	for (i = 0; i < frame_index + 1; i++)
+		tb_ring_tx(net->tx_ring.ring, &frames[i]->frame);
+
+	if (net->svc->prtcstns & TBNET_MATCH_FRAGS_ID)
+		atomic_inc(&net->frame_id);
+
+	net->stats.tx_packets++;
+	net->stats.tx_bytes += skb->len;
+
+	dev_consume_skb_any(skb);
+
+	return NETDEV_TX_OK;
+
+err_drop:
+	/* We can re-use the buffers */
+	net->tx_ring.cons -= frame_index;
+
+	dev_kfree_skb_any(skb);
+	net->stats.tx_errors++;
+
+	return NETDEV_TX_OK;
+}
+
+static void tbnet_get_stats64(struct net_device *dev,
+			      struct rtnl_link_stats64 *stats)
+{
+	struct tbnet *net = netdev_priv(dev);
+
+	stats->tx_packets = net->stats.tx_packets;
+	stats->rx_packets = net->stats.rx_packets;
+	stats->tx_bytes = net->stats.tx_bytes;
+	stats->rx_bytes = net->stats.rx_bytes;
+	stats->tx_errors = net->stats.tx_errors;
+	stats->rx_errors = net->stats.rx_length_errors +
+		net->stats.rx_over_errors + net->stats.rx_crc_errors +
+		net->stats.rx_missed_errors;
+	stats->rx_length_errors = net->stats.rx_length_errors;
+	stats->rx_over_errors = net->stats.rx_over_errors;
+	stats->rx_crc_errors = net->stats.rx_crc_errors;
+	stats->rx_missed_errors = net->stats.rx_missed_errors;
+}
+
+static const struct net_device_ops tbnet_netdev_ops = {
+	.ndo_open = tbnet_open,
+	.ndo_stop = tbnet_stop,
+	.ndo_start_xmit = tbnet_start_xmit,
+	.ndo_get_stats64 = tbnet_get_stats64,
+};
+
+static void tbnet_generate_mac(struct net_device *dev)
+{
+	const struct tbnet *net = netdev_priv(dev);
+	const struct tb_xdomain *xd = net->xd;
+	u8 phy_port;
+	u32 hash;
+
+	phy_port = tb_phy_port_from_link(TBNET_L0_PORT_NUM(xd->route));
+
+	/* Unicast and locally administered MAC */
+	dev->dev_addr[0] = phy_port << 4 | 0x02;
+	hash = jhash2((u32 *)xd->local_uuid, 4, 0);
+	memcpy(dev->dev_addr + 1, &hash, sizeof(hash));
+	hash = jhash2((u32 *)xd->local_uuid, 4, hash);
+	dev->dev_addr[5] = hash & 0xff;
+}
+
+static int tbnet_probe(struct tb_service *svc, const struct tb_service_id *id)
+{
+	struct tb_xdomain *xd = tb_service_parent(svc);
+	struct net_device *dev;
+	struct tbnet *net;
+	int ret;
+
+	dev = alloc_etherdev(sizeof(*net));
+	if (!dev)
+		return -ENOMEM;
+
+	SET_NETDEV_DEV(dev, &svc->dev);
+
+	net = netdev_priv(dev);
+	INIT_DELAYED_WORK(&net->login_work, tbnet_login_work);
+	INIT_WORK(&net->connected_work, tbnet_connected_work);
+	mutex_init(&net->connection_lock);
+	atomic_set(&net->command_id, 0);
+	atomic_set(&net->frame_id, 0);
+	net->svc = svc;
+	net->dev = dev;
+	net->xd = xd;
+
+	tbnet_generate_mac(dev);
+
+	strcpy(dev->name, "thunderbolt%d");
+	dev->netdev_ops = &tbnet_netdev_ops;
+
+	/* ThunderboltIP takes advantage of TSO packets but instead of
+	 * segmenting them we just split the packet into Thunderbolt
+	 * frames (maximum payload size of each frame is 4084 bytes) and
+	 * calculate checksum over the whole packet here.
+	 *
+	 * The receiving side does the opposite if the host OS supports
+	 * LRO, otherwise it needs to split the large packet into MTU
+	 * sized smaller packets.
+	 *
+	 * In order to receive large packets from the networking stack,
+	 * we need to announce support for most of the offloading
+	 * features here.
+	 */
+	dev->hw_features = NETIF_F_SG | NETIF_F_ALL_TSO | NETIF_F_GRO |
+			   NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+	dev->features = dev->hw_features | NETIF_F_HIGHDMA;
+	dev->hard_header_len += sizeof(struct thunderbolt_ip_frame_header);
+
+	netif_napi_add(dev, &net->napi, tbnet_poll, NAPI_POLL_WEIGHT);
+
+	/* MTU range: 68 - 65522 */
+	dev->min_mtu = ETH_MIN_MTU;
+	dev->max_mtu = TBNET_MAX_MTU - ETH_HLEN;
+
+	ret = register_netdev(dev);
+	if (ret) {
+		free_netdev(dev);
+		return ret;
+	}
+
+	net->handler.uuid = &tbnet_svc_uuid;
+	net->handler.callback = tbnet_handle_packet;
+	net->handler.data = net;
+	tb_register_protocol_handler(&net->handler);
+
+	tb_service_set_drvdata(svc, net);
+
+	return 0;
+}
+
+static void tbnet_remove(struct tb_service *svc)
+{
+	struct tbnet *net = tb_service_get_drvdata(svc);
+
+	unregister_netdev(net->dev);
+	tb_unregister_protocol_handler(&net->handler);
+	free_netdev(net->dev);
+}
+
+static void tbnet_shutdown(struct tb_service *svc)
+{
+	tbnet_tear_down(tb_service_get_drvdata(svc), true);
+}
+
+static int __maybe_unused tbnet_suspend(struct device *dev)
+{
+	struct tb_service *svc = tb_to_service(dev);
+	struct tbnet *net = tb_service_get_drvdata(svc);
+
+	stop_login(net);
+	if (netif_running(net->dev)) {
+		netif_device_detach(net->dev);
+		tb_ring_stop(net->rx_ring.ring);
+		tb_ring_stop(net->tx_ring.ring);
+		tbnet_free_buffers(&net->rx_ring);
+		tbnet_free_buffers(&net->tx_ring);
+	}
+
+	return 0;
+}
+
+static int __maybe_unused tbnet_resume(struct device *dev)
+{
+	struct tb_service *svc = tb_to_service(dev);
+	struct tbnet *net = tb_service_get_drvdata(svc);
+
+	netif_carrier_off(net->dev);
+	if (netif_running(net->dev)) {
+		netif_device_attach(net->dev);
+		start_login(net);
+	}
+
+	return 0;
+}
+
+static const struct dev_pm_ops tbnet_pm_ops = {
+	SET_SYSTEM_SLEEP_PM_OPS(tbnet_suspend, tbnet_resume)
+};
+
+static const struct tb_service_id tbnet_ids[] = {
+	{ TB_SERVICE("network", 1) },
+	{ },
+};
+MODULE_DEVICE_TABLE(tbsvc, tbnet_ids);
+
+static struct tb_service_driver tbnet_driver = {
+	.driver = {
+		.owner = THIS_MODULE,
+		.name = "thunderbolt-net",
+		.pm = &tbnet_pm_ops,
+	},
+	.probe = tbnet_probe,
+	.remove = tbnet_remove,
+	.shutdown = tbnet_shutdown,
+	.id_table = tbnet_ids,
+};
+
+static int __init tbnet_init(void)
+{
+	int ret;
+
+	tbnet_dir = tb_property_create_dir(&tbnet_dir_uuid);
+	if (!tbnet_dir)
+		return -ENOMEM;
+
+	tb_property_add_immediate(tbnet_dir, "prtcid", 1);
+	tb_property_add_immediate(tbnet_dir, "prtcvers", 1);
+	tb_property_add_immediate(tbnet_dir, "prtcrevs", 1);
+	tb_property_add_immediate(tbnet_dir, "prtcstns",
+				  TBNET_MATCH_FRAGS_ID);
+
+	ret = tb_register_property_dir("network", tbnet_dir);
+	if (ret) {
+		tb_property_free_dir(tbnet_dir);
+		return ret;
+	}
+
+	return tb_register_service_driver(&tbnet_driver);
+}
+module_init(tbnet_init);
+
+static void __exit tbnet_exit(void)
+{
+	tb_unregister_service_driver(&tbnet_driver);
+	tb_unregister_property_dir("network", tbnet_dir);
+	tb_property_free_dir(tbnet_dir);
+}
+module_exit(tbnet_exit);
+
+MODULE_AUTHOR("Amir Levy <amir.jer.levy@intel.com>");
+MODULE_AUTHOR("Michael Jamet <michael.jamet@intel.com>");
+MODULE_AUTHOR("Mika Westerberg <mika.westerberg@linux.intel.com>");
+MODULE_DESCRIPTION("Thunderbolt network driver");
+MODULE_LICENSE("GPL v2");
-- 
2.14.1

^ permalink raw reply related

* [PATCH v2 15/16] thunderbolt: Allocate ring HopID automatically if requested
From: Mika Westerberg @ 2017-09-25 11:07 UTC (permalink / raw)
  To: Greg Kroah-Hartman, David S . Miller
  Cc: Andreas Noever, Michael Jamet, Yehezkel Bernat, Amir Levy,
	Mario.Limonciello, Lukas Wunner, Andy Shevchenko, Andrew Lunn,
	Mika Westerberg, linux-kernel, netdev
In-Reply-To: <20170925110738.68382-1-mika.westerberg@linux.intel.com>

Thunderbolt services should not care which HopID (ring) they use for
sending and receiving packets over the high-speed DMA path, so make
tb_ring_alloc_rx() and tb_ring_alloc_tx() accept negative HopID. This
means that the NHI will allocate next available HopID for the caller
automatically.

These HopIDs will be allocated from the range which is not reserved for
the Thunderbolt protocol (8 .. hop_count - 1).

The allocated HopID can be retrieved from ring->hop field after the ring
has been allocated successfully if needed.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Michael Jamet <michael.jamet@intel.com>
Reviewed-by: Yehezkel Bernat <yehezkel.bernat@intel.com>
---
 drivers/thunderbolt/nhi.c | 78 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 60 insertions(+), 18 deletions(-)

diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c
index 5bc3f77cc1f3..b9e3980aa4fc 100644
--- a/drivers/thunderbolt/nhi.c
+++ b/drivers/thunderbolt/nhi.c
@@ -26,6 +26,8 @@
  * use this ring for anything else.
  */
 #define RING_E2E_UNUSED_HOPID	2
+/* HopIDs 0-7 are reserved by the Thunderbolt protocol */
+#define RING_FIRST_USABLE_HOPID	8
 
 /*
  * Minimal number of vectors when we use MSI-X. Two for control channel
@@ -411,6 +413,62 @@ static void ring_release_msix(struct tb_ring *ring)
 	ring->irq = 0;
 }
 
+static int nhi_alloc_hop(struct tb_nhi *nhi, struct tb_ring *ring)
+{
+	int ret = 0;
+
+	spin_lock_irq(&nhi->lock);
+
+	if (ring->hop < 0) {
+		unsigned int i;
+
+		/*
+		 * Automatically allocate HopID from the non-reserved
+		 * range 8 .. hop_count - 1.
+		 */
+		for (i = RING_FIRST_USABLE_HOPID; i < nhi->hop_count; i++) {
+			if (ring->is_tx) {
+				if (!nhi->tx_rings[i]) {
+					ring->hop = i;
+					break;
+				}
+			} else {
+				if (!nhi->rx_rings[i]) {
+					ring->hop = i;
+					break;
+				}
+			}
+		}
+	}
+
+	if (ring->hop < 0 || ring->hop >= nhi->hop_count) {
+		dev_warn(&nhi->pdev->dev, "invalid hop: %d\n", ring->hop);
+		ret = -EINVAL;
+		goto err_unlock;
+	}
+	if (ring->is_tx && nhi->tx_rings[ring->hop]) {
+		dev_warn(&nhi->pdev->dev, "TX hop %d already allocated\n",
+			 ring->hop);
+		ret = -EBUSY;
+		goto err_unlock;
+	} else if (!ring->is_tx && nhi->rx_rings[ring->hop]) {
+		dev_warn(&nhi->pdev->dev, "RX hop %d already allocated\n",
+			 ring->hop);
+		ret = -EBUSY;
+		goto err_unlock;
+	}
+
+	if (ring->is_tx)
+		nhi->tx_rings[ring->hop] = ring;
+	else
+		nhi->rx_rings[ring->hop] = ring;
+
+err_unlock:
+	spin_unlock_irq(&nhi->lock);
+
+	return ret;
+}
+
 static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size,
 				     bool transmit, unsigned int flags,
 				     u16 sof_mask, u16 eof_mask,
@@ -456,28 +514,12 @@ static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size,
 	if (ring_request_msix(ring, flags & RING_FLAG_NO_SUSPEND))
 		goto err_free_descs;
 
-	spin_lock_irq(&nhi->lock);
-	if (hop >= nhi->hop_count) {
-		dev_WARN(&nhi->pdev->dev, "invalid hop: %d\n", hop);
+	if (nhi_alloc_hop(nhi, ring))
 		goto err_release_msix;
-	}
-	if (transmit && nhi->tx_rings[hop]) {
-		dev_WARN(&nhi->pdev->dev, "TX hop %d already allocated\n", hop);
-		goto err_release_msix;
-	} else if (!transmit && nhi->rx_rings[hop]) {
-		dev_WARN(&nhi->pdev->dev, "RX hop %d already allocated\n", hop);
-		goto err_release_msix;
-	}
-	if (transmit)
-		nhi->tx_rings[hop] = ring;
-	else
-		nhi->rx_rings[hop] = ring;
-	spin_unlock_irq(&nhi->lock);
 
 	return ring;
 
 err_release_msix:
-	spin_unlock_irq(&nhi->lock);
 	ring_release_msix(ring);
 err_free_descs:
 	dma_free_coherent(&ring->nhi->pdev->dev,
@@ -506,7 +548,7 @@ EXPORT_SYMBOL_GPL(tb_ring_alloc_tx);
 /**
  * tb_ring_alloc_rx() - Allocate DMA ring for receive
  * @nhi: Pointer to the NHI the ring is to be allocated
- * @hop: HopID (ring) to allocate
+ * @hop: HopID (ring) to allocate. Pass %-1 for automatic allocation.
  * @size: Number of entries in the ring
  * @flags: Flags for the ring
  * @sof_mask: Mask of PDF values that start a frame
-- 
2.14.1

^ permalink raw reply related

* [PATCH v2 14/16] thunderbolt: Add function to retrieve DMA device for the ring
From: Mika Westerberg @ 2017-09-25 11:07 UTC (permalink / raw)
  To: Greg Kroah-Hartman, David S . Miller
  Cc: Andreas Noever, Michael Jamet, Yehezkel Bernat, Amir Levy,
	Mario.Limonciello, Lukas Wunner, Andy Shevchenko, Andrew Lunn,
	Mika Westerberg, linux-kernel, netdev
In-Reply-To: <20170925110738.68382-1-mika.westerberg@linux.intel.com>

This is needed when Thunderbolt service drivers need to DMA map memory
before it is passed down to the ring.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Michael Jamet <michael.jamet@intel.com>
Reviewed-by: Yehezkel Bernat <yehezkel.bernat@intel.com>
---
 include/linux/thunderbolt.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index 36925e3aec7c..7b69853188b1 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -19,6 +19,7 @@
 #include <linux/list.h>
 #include <linux/mutex.h>
 #include <linux/mod_devicetable.h>
+#include <linux/pci.h>
 #include <linux/uuid.h>
 #include <linux/workqueue.h>
 
@@ -582,4 +583,16 @@ static inline int tb_ring_tx(struct tb_ring *ring, struct ring_frame *frame)
 struct ring_frame *tb_ring_poll(struct tb_ring *ring);
 void tb_ring_poll_complete(struct tb_ring *ring);
 
+/**
+ * tb_ring_dma_device() - Return device used for DMA mapping
+ * @ring: Ring whose DMA device is retrieved
+ *
+ * Use this function when you are mapping DMA for buffers that are
+ * passed to the ring for sending/receiving.
+ */
+static inline struct device *tb_ring_dma_device(struct tb_ring *ring)
+{
+	return &ring->nhi->pdev->dev;
+}
+
 #endif /* THUNDERBOLT_H_ */
-- 
2.14.1

^ permalink raw reply related

* [PATCH v2 13/16] thunderbolt: Add polling mode for rings
From: Mika Westerberg @ 2017-09-25 11:07 UTC (permalink / raw)
  To: Greg Kroah-Hartman, David S . Miller
  Cc: Andreas Noever, Michael Jamet, Yehezkel Bernat, Amir Levy,
	Mario.Limonciello, Lukas Wunner, Andy Shevchenko, Andrew Lunn,
	Mika Westerberg, linux-kernel, netdev
In-Reply-To: <20170925110738.68382-1-mika.westerberg@linux.intel.com>

In order to support things like networking over a Thunderbolt cable, there
needs to be a way to switch the ring to a mode where it can be polled
with the interrupt masked. We implement such a mode so that the caller can
allocate a ring by passing a pointer to a function that is then called
when an interrupt is triggered. Completed frames can be fetched using
tb_ring_poll() and the interrupt can be re-enabled when the caller is
finished with polling by using tb_ring_poll_complete().

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Michael Jamet <michael.jamet@intel.com>
Reviewed-by: Yehezkel Bernat <yehezkel.bernat@intel.com>
---
 drivers/thunderbolt/ctl.c   |   2 +-
 drivers/thunderbolt/nhi.c   | 126 ++++++++++++++++++++++++++++++++++++++++----
 include/linux/thunderbolt.h |  23 +++++---
 3 files changed, 134 insertions(+), 17 deletions(-)

diff --git a/drivers/thunderbolt/ctl.c b/drivers/thunderbolt/ctl.c
index dd10789e1dbb..d079dbba2c03 100644
--- a/drivers/thunderbolt/ctl.c
+++ b/drivers/thunderbolt/ctl.c
@@ -619,7 +619,7 @@ struct tb_ctl *tb_ctl_alloc(struct tb_nhi *nhi, event_cb cb, void *cb_data)
 		goto err;
 
 	ctl->rx = tb_ring_alloc_rx(nhi, 0, 10, RING_FLAG_NO_SUSPEND, 0xffff,
-				0xffff);
+				0xffff, NULL, NULL);
 	if (!ctl->rx)
 		goto err;
 
diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c
index cf1397afa72f..5bc3f77cc1f3 100644
--- a/drivers/thunderbolt/nhi.c
+++ b/drivers/thunderbolt/nhi.c
@@ -252,7 +252,8 @@ static void ring_work(struct work_struct *work)
 		 * Do not hold on to it.
 		 */
 		list_del_init(&frame->list);
-		frame->callback(ring, frame, canceled);
+		if (frame->callback)
+			frame->callback(ring, frame, canceled);
 	}
 }
 
@@ -273,11 +274,106 @@ int __tb_ring_enqueue(struct tb_ring *ring, struct ring_frame *frame)
 }
 EXPORT_SYMBOL_GPL(__tb_ring_enqueue);
 
+/**
+ * tb_ring_poll() - Poll one completed frame from the ring
+ * @ring: Ring to poll
+ *
+ * This function can be called when @start_poll callback of the @ring
+ * has been called. It will read one completed frame from the ring and
+ * return it to the caller. Returns %NULL if there are no more completed
+ * frames.
+ */
+struct ring_frame *tb_ring_poll(struct tb_ring *ring)
+{
+	struct ring_frame *frame = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ring->lock, flags);
+	if (!ring->running)
+		goto unlock;
+	if (ring_empty(ring))
+		goto unlock;
+
+	if (ring->descriptors[ring->tail].flags & RING_DESC_COMPLETED) {
+		frame = list_first_entry(&ring->in_flight, typeof(*frame),
+					 list);
+		list_del_init(&frame->list);
+
+		if (!ring->is_tx) {
+			frame->size = ring->descriptors[ring->tail].length;
+			frame->eof = ring->descriptors[ring->tail].eof;
+			frame->sof = ring->descriptors[ring->tail].sof;
+			frame->flags = ring->descriptors[ring->tail].flags;
+		}
+
+		ring->tail = (ring->tail + 1) % ring->size;
+	}
+
+unlock:
+	spin_unlock_irqrestore(&ring->lock, flags);
+	return frame;
+}
+EXPORT_SYMBOL_GPL(tb_ring_poll);
+
+static void __ring_interrupt_mask(struct tb_ring *ring, bool mask)
+{
+	int idx = ring_interrupt_index(ring);
+	int reg = REG_RING_INTERRUPT_BASE + idx / 32 * 4;
+	int bit = idx % 32;
+	u32 val;
+
+	val = ioread32(ring->nhi->iobase + reg);
+	if (mask)
+		val &= ~BIT(bit);
+	else
+		val |= BIT(bit);
+	iowrite32(val, ring->nhi->iobase + reg);
+}
+
+/* Masks the irq in polling mode; hold both @nhi->lock and @ring->lock */
+static void __ring_interrupt(struct tb_ring *ring)
+{
+	if (!ring->running)
+		return;
+
+	if (ring->start_poll) {
+		__ring_interrupt_mask(ring, true);
+		ring->start_poll(ring->poll_data);
+	} else {
+		schedule_work(&ring->work);
+	}
+}
+
+/**
+ * tb_ring_poll_complete() - Re-start interrupt for the ring
+ * @ring: Ring to re-start the interrupt
+ *
+ * This will re-start (unmask) the ring interrupt once the user is done
+ * with polling.
+ */
+void tb_ring_poll_complete(struct tb_ring *ring)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ring->nhi->lock, flags);
+	spin_lock(&ring->lock);
+	if (ring->start_poll)
+		__ring_interrupt_mask(ring, false);
+	spin_unlock(&ring->lock);
+	spin_unlock_irqrestore(&ring->nhi->lock, flags);
+}
+EXPORT_SYMBOL_GPL(tb_ring_poll_complete);
+
 static irqreturn_t ring_msix(int irq, void *data)
 {
 	struct tb_ring *ring = data;
 
-	schedule_work(&ring->work);
+	spin_lock(&ring->nhi->lock);
+	spin_lock(&ring->lock);
+	__ring_interrupt(ring);
+	spin_unlock(&ring->lock);
+	spin_unlock(&ring->nhi->lock);
+
 	return IRQ_HANDLED;
 }
 
@@ -317,7 +413,9 @@ static void ring_release_msix(struct tb_ring *ring)
 
 static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size,
 				     bool transmit, unsigned int flags,
-				     u16 sof_mask, u16 eof_mask)
+				     u16 sof_mask, u16 eof_mask,
+				     void (*start_poll)(void *),
+				     void *poll_data)
 {
 	struct tb_ring *ring = NULL;
 	dev_info(&nhi->pdev->dev, "allocating %s ring %d of size %d\n",
@@ -346,6 +444,8 @@ static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size,
 	ring->head = 0;
 	ring->tail = 0;
 	ring->running = false;
+	ring->start_poll = start_poll;
+	ring->poll_data = poll_data;
 
 	ring->descriptors = dma_alloc_coherent(&ring->nhi->pdev->dev,
 			size * sizeof(*ring->descriptors),
@@ -399,7 +499,7 @@ static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size,
 struct tb_ring *tb_ring_alloc_tx(struct tb_nhi *nhi, int hop, int size,
 				 unsigned int flags)
 {
-	return tb_ring_alloc(nhi, hop, size, true, flags, 0, 0);
+	return tb_ring_alloc(nhi, hop, size, true, flags, 0, 0, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(tb_ring_alloc_tx);
 
@@ -411,11 +511,17 @@ EXPORT_SYMBOL_GPL(tb_ring_alloc_tx);
  * @flags: Flags for the ring
  * @sof_mask: Mask of PDF values that start a frame
  * @eof_mask: Mask of PDF values that end a frame
+ * @start_poll: If not %NULL the ring will call this function when an
+ *		interrupt is triggered and masked, instead of callback
+ *		in each Rx frame.
+ * @poll_data: Optional data passed to @start_poll
  */
 struct tb_ring *tb_ring_alloc_rx(struct tb_nhi *nhi, int hop, int size,
-				 unsigned int flags, u16 sof_mask, u16 eof_mask)
+				 unsigned int flags, u16 sof_mask, u16 eof_mask,
+				 void (*start_poll)(void *), void *poll_data)
 {
-	return tb_ring_alloc(nhi, hop, size, false, flags, sof_mask, eof_mask);
+	return tb_ring_alloc(nhi, hop, size, false, flags, sof_mask, eof_mask,
+			     start_poll, poll_data);
 }
 EXPORT_SYMBOL_GPL(tb_ring_alloc_rx);
 
@@ -556,6 +662,7 @@ void tb_ring_free(struct tb_ring *ring)
 		dev_WARN(&ring->nhi->pdev->dev, "%s %d still running\n",
 			 RING_TYPE(ring), ring->hop);
 	}
+	spin_unlock_irq(&ring->nhi->lock);
 
 	ring_release_msix(ring);
 
@@ -572,7 +679,6 @@ void tb_ring_free(struct tb_ring *ring)
 		 RING_TYPE(ring),
 		 ring->hop);
 
-	spin_unlock_irq(&ring->nhi->lock);
 	/**
 	 * ring->work can no longer be scheduled (it is scheduled only
 	 * by nhi_interrupt_work, ring_stop and ring_msix). Wait for it
@@ -682,8 +788,10 @@ static void nhi_interrupt_work(struct work_struct *work)
 				 hop);
 			continue;
 		}
-		/* we do not check ring->running, this is done in ring->work */
-		schedule_work(&ring->work);
+
+		spin_lock(&ring->lock);
+		__ring_interrupt(ring);
+		spin_unlock(&ring->lock);
 	}
 	spin_unlock_irq(&nhi->lock);
 }
diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index d59e3f9a35c4..36925e3aec7c 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -446,6 +446,9 @@ struct tb_nhi {
  * @flags: Ring specific flags
  * @sof_mask: Bit mask used to detect start of frame PDF
  * @eof_mask: Bit mask used to detect end of frame PDF
+ * @start_poll: Called when ring interrupt is triggered to start
+ *		polling. Passing %NULL keeps the ring in interrupt mode.
+ * @poll_data: Data passed to @start_poll
  */
 struct tb_ring {
 	spinlock_t lock;
@@ -466,6 +469,8 @@ struct tb_ring {
 	unsigned int flags;
 	u16 sof_mask;
 	u16 eof_mask;
+	void (*start_poll)(void *data);
+	void *poll_data;
 };
 
 /* Leave ring interrupt enabled on suspend */
@@ -499,7 +504,7 @@ enum ring_desc_flags {
 /**
  * struct ring_frame - For use with ring_rx/ring_tx
  * @buffer_phy: DMA mapped address of the frame
- * @callback: Callback called when the frame is finished
+ * @callback: Callback called when the frame is finished (optional)
  * @list: Frame is linked to a queue using this
  * @size: Size of the frame in bytes (%0 means %4096)
  * @flags: Flags for the frame (see &enum ring_desc_flags)
@@ -522,8 +527,8 @@ struct ring_frame {
 struct tb_ring *tb_ring_alloc_tx(struct tb_nhi *nhi, int hop, int size,
 				 unsigned int flags);
 struct tb_ring *tb_ring_alloc_rx(struct tb_nhi *nhi, int hop, int size,
-				 unsigned int flags, u16 sof_mask,
-				 u16 eof_mask);
+				 unsigned int flags, u16 sof_mask, u16 eof_mask,
+				 void (*start_poll)(void *), void *poll_data);
 void tb_ring_start(struct tb_ring *ring);
 void tb_ring_stop(struct tb_ring *ring);
 void tb_ring_free(struct tb_ring *ring);
@@ -535,8 +540,8 @@ int __tb_ring_enqueue(struct tb_ring *ring, struct ring_frame *frame);
  * @ring: Ring to enqueue the frame
  * @frame: Frame to enqueue
  *
- * @frame->buffer, @frame->buffer_phy and @frame->callback have to be set. The
- * buffer must contain at least %TB_FRAME_SIZE bytes.
+ * @frame->buffer, @frame->buffer_phy have to be set. The buffer must
+ * contain at least %TB_FRAME_SIZE bytes.
  *
  * @frame->callback will be invoked with @frame->size, @frame->flags,
  * @frame->eof, @frame->sof set once the frame has been received.
@@ -557,8 +562,8 @@ static inline int tb_ring_rx(struct tb_ring *ring, struct ring_frame *frame)
  * @ring: Ring the enqueue the frame
  * @frame: Frame to enqueue
  *
- * @frame->buffer, @frame->buffer_phy, @frame->callback, @frame->size,
- * @frame->eof and @frame->sof have to be set.
+ * @frame->buffer, @frame->buffer_phy, @frame->size, @frame->eof and
+ * @frame->sof have to be set.
  *
  * @frame->callback will be invoked with once the frame has been transmitted.
  *
@@ -573,4 +578,8 @@ static inline int tb_ring_tx(struct tb_ring *ring, struct ring_frame *frame)
 	return __tb_ring_enqueue(ring, frame);
 }
 
+/* Used only when the ring is in polling mode */
+struct ring_frame *tb_ring_poll(struct tb_ring *ring);
+void tb_ring_poll_complete(struct tb_ring *ring);
+
 #endif /* THUNDERBOLT_H_ */
-- 
2.14.1

^ permalink raw reply related

* [PATCH v2 12/16] thunderbolt: Use spinlock in NHI serialization
From: Mika Westerberg @ 2017-09-25 11:07 UTC (permalink / raw)
  To: Greg Kroah-Hartman, David S . Miller
  Cc: Andreas Noever, Michael Jamet, Yehezkel Bernat, Amir Levy,
	Mario.Limonciello, Lukas Wunner, Andy Shevchenko, Andrew Lunn,
	Mika Westerberg, linux-kernel, netdev
In-Reply-To: <20170925110738.68382-1-mika.westerberg@linux.intel.com>

This is needed because ring polling functionality can be called from
atomic contexts when networking and other high-speed traffic is
transferred over a Thunderbolt cable.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Michael Jamet <michael.jamet@intel.com>
Reviewed-by: Yehezkel Bernat <yehezkel.bernat@intel.com>
---
 drivers/thunderbolt/nhi.c   | 75 +++++++++++++++++++++++++--------------------
 include/linux/thunderbolt.h |  2 +-
 2 files changed, 42 insertions(+), 35 deletions(-)

diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c
index d1ad37c6eccf..cf1397afa72f 100644
--- a/drivers/thunderbolt/nhi.c
+++ b/drivers/thunderbolt/nhi.c
@@ -327,21 +327,9 @@ static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size,
 	if (transmit && hop == RING_E2E_UNUSED_HOPID)
 		return NULL;
 
-	mutex_lock(&nhi->lock);
-	if (hop >= nhi->hop_count) {
-		dev_WARN(&nhi->pdev->dev, "invalid hop: %d\n", hop);
-		goto err;
-	}
-	if (transmit && nhi->tx_rings[hop]) {
-		dev_WARN(&nhi->pdev->dev, "TX hop %d already allocated\n", hop);
-		goto err;
-	} else if (!transmit && nhi->rx_rings[hop]) {
-		dev_WARN(&nhi->pdev->dev, "RX hop %d already allocated\n", hop);
-		goto err;
-	}
 	ring = kzalloc(sizeof(*ring), GFP_KERNEL);
 	if (!ring)
-		goto err;
+		return NULL;
 
 	spin_lock_init(&ring->lock);
 	INIT_LIST_HEAD(&ring->queue);
@@ -359,25 +347,45 @@ static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size,
 	ring->tail = 0;
 	ring->running = false;
 
-	if (ring_request_msix(ring, flags & RING_FLAG_NO_SUSPEND))
-		goto err;
-
 	ring->descriptors = dma_alloc_coherent(&ring->nhi->pdev->dev,
 			size * sizeof(*ring->descriptors),
 			&ring->descriptors_dma, GFP_KERNEL | __GFP_ZERO);
 	if (!ring->descriptors)
-		goto err;
+		goto err_free_ring;
 
+	if (ring_request_msix(ring, flags & RING_FLAG_NO_SUSPEND))
+		goto err_free_descs;
+
+	spin_lock_irq(&nhi->lock);
+	if (hop >= nhi->hop_count) {
+		dev_WARN(&nhi->pdev->dev, "invalid hop: %d\n", hop);
+		goto err_release_msix;
+	}
+	if (transmit && nhi->tx_rings[hop]) {
+		dev_WARN(&nhi->pdev->dev, "TX hop %d already allocated\n", hop);
+		goto err_release_msix;
+	} else if (!transmit && nhi->rx_rings[hop]) {
+		dev_WARN(&nhi->pdev->dev, "RX hop %d already allocated\n", hop);
+		goto err_release_msix;
+	}
 	if (transmit)
 		nhi->tx_rings[hop] = ring;
 	else
 		nhi->rx_rings[hop] = ring;
-	mutex_unlock(&nhi->lock);
+	spin_unlock_irq(&nhi->lock);
+
 	return ring;
 
-err:
+err_release_msix:
+	spin_unlock_irq(&nhi->lock);
+	ring_release_msix(ring);
+err_free_descs:
+	dma_free_coherent(&ring->nhi->pdev->dev,
+			  ring->size * sizeof(*ring->descriptors),
+			  ring->descriptors, ring->descriptors_dma);
+err_free_ring:
 	kfree(ring);
-	mutex_unlock(&nhi->lock);
+
 	return NULL;
 }
 
@@ -421,8 +429,8 @@ void tb_ring_start(struct tb_ring *ring)
 	u16 frame_size;
 	u32 flags;
 
-	mutex_lock(&ring->nhi->lock);
-	spin_lock_irq(&ring->lock);
+	spin_lock_irq(&ring->nhi->lock);
+	spin_lock(&ring->lock);
 	if (ring->nhi->going_away)
 		goto err;
 	if (ring->running) {
@@ -469,8 +477,8 @@ void tb_ring_start(struct tb_ring *ring)
 	ring_interrupt_active(ring, true);
 	ring->running = true;
 err:
-	spin_unlock_irq(&ring->lock);
-	mutex_unlock(&ring->nhi->lock);
+	spin_unlock(&ring->lock);
+	spin_unlock_irq(&ring->nhi->lock);
 }
 EXPORT_SYMBOL_GPL(tb_ring_start);
 
@@ -489,8 +497,8 @@ EXPORT_SYMBOL_GPL(tb_ring_start);
  */
 void tb_ring_stop(struct tb_ring *ring)
 {
-	mutex_lock(&ring->nhi->lock);
-	spin_lock_irq(&ring->lock);
+	spin_lock_irq(&ring->nhi->lock);
+	spin_lock(&ring->lock);
 	dev_info(&ring->nhi->pdev->dev, "stopping %s %d\n",
 		 RING_TYPE(ring), ring->hop);
 	if (ring->nhi->going_away)
@@ -511,8 +519,8 @@ void tb_ring_stop(struct tb_ring *ring)
 	ring->running = false;
 
 err:
-	spin_unlock_irq(&ring->lock);
-	mutex_unlock(&ring->nhi->lock);
+	spin_unlock(&ring->lock);
+	spin_unlock_irq(&ring->nhi->lock);
 
 	/*
 	 * schedule ring->work to invoke callbacks on all remaining frames.
@@ -534,7 +542,7 @@ EXPORT_SYMBOL_GPL(tb_ring_stop);
  */
 void tb_ring_free(struct tb_ring *ring)
 {
-	mutex_lock(&ring->nhi->lock);
+	spin_lock_irq(&ring->nhi->lock);
 	/*
 	 * Dissociate the ring from the NHI. This also ensures that
 	 * nhi_interrupt_work cannot reschedule ring->work.
@@ -564,7 +572,7 @@ void tb_ring_free(struct tb_ring *ring)
 		 RING_TYPE(ring),
 		 ring->hop);
 
-	mutex_unlock(&ring->nhi->lock);
+	spin_unlock_irq(&ring->nhi->lock);
 	/**
 	 * ring->work can no longer be scheduled (it is scheduled only
 	 * by nhi_interrupt_work, ring_stop and ring_msix). Wait for it
@@ -639,7 +647,7 @@ static void nhi_interrupt_work(struct work_struct *work)
 	int type = 0; /* current interrupt type 0: TX, 1: RX, 2: RX overflow */
 	struct tb_ring *ring;
 
-	mutex_lock(&nhi->lock);
+	spin_lock_irq(&nhi->lock);
 
 	/*
 	 * Starting at REG_RING_NOTIFY_BASE there are three status bitfields
@@ -677,7 +685,7 @@ static void nhi_interrupt_work(struct work_struct *work)
 		/* we do not check ring->running, this is done in ring->work */
 		schedule_work(&ring->work);
 	}
-	mutex_unlock(&nhi->lock);
+	spin_unlock_irq(&nhi->lock);
 }
 
 static irqreturn_t nhi_msi(int irq, void *data)
@@ -766,7 +774,6 @@ static void nhi_shutdown(struct tb_nhi *nhi)
 		devm_free_irq(&nhi->pdev->dev, nhi->pdev->irq, nhi);
 		flush_work(&nhi->interrupt_work);
 	}
-	mutex_destroy(&nhi->lock);
 	ida_destroy(&nhi->msix_ida);
 }
 
@@ -855,7 +862,7 @@ static int nhi_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		return res;
 	}
 
-	mutex_init(&nhi->lock);
+	spin_lock_init(&nhi->lock);
 
 	pci_set_master(pdev);
 
diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index cf9e42db780f..d59e3f9a35c4 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -415,7 +415,7 @@ static inline struct tb_xdomain *tb_service_parent(struct tb_service *svc)
  * @hop_count: Number of rings (end point hops) supported by NHI.
  */
 struct tb_nhi {
-	struct mutex lock;
+	spinlock_t lock;
 	struct pci_dev *pdev;
 	void __iomem *iobase;
 	struct tb_ring **tx_rings;
-- 
2.14.1

^ permalink raw reply related

* [PATCH v2 11/16] thunderbolt: Use spinlock in ring serialization
From: Mika Westerberg @ 2017-09-25 11:07 UTC (permalink / raw)
  To: Greg Kroah-Hartman, David S . Miller
  Cc: Andreas Noever, Michael Jamet, Yehezkel Bernat, Amir Levy,
	Mario.Limonciello, Lukas Wunner, Andy Shevchenko, Andrew Lunn,
	Mika Westerberg, linux-kernel, netdev
In-Reply-To: <20170925110738.68382-1-mika.westerberg@linux.intel.com>

This makes it possible to enqueue frames from atomic context as well,
which is needed, for example, when networking packets are sent over a
Thunderbolt cable.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Michael Jamet <michael.jamet@intel.com>
Reviewed-by: Yehezkel Bernat <yehezkel.bernat@intel.com>
---
 drivers/thunderbolt/nhi.c   | 26 ++++++++++++++------------
 include/linux/thunderbolt.h |  2 +-
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c
index 4704ccd20c84..d1ad37c6eccf 100644
--- a/drivers/thunderbolt/nhi.c
+++ b/drivers/thunderbolt/nhi.c
@@ -212,8 +212,10 @@ static void ring_work(struct work_struct *work)
 	struct tb_ring *ring = container_of(work, typeof(*ring), work);
 	struct ring_frame *frame;
 	bool canceled = false;
+	unsigned long flags;
 	LIST_HEAD(done);
-	mutex_lock(&ring->lock);
+
+	spin_lock_irqsave(&ring->lock, flags);
 
 	if (!ring->running) {
 		/*  Move all frames to done and mark them as canceled. */
@@ -241,7 +243,8 @@ static void ring_work(struct work_struct *work)
 	ring_write_descriptors(ring);
 
 invoke_callback:
-	mutex_unlock(&ring->lock); /* allow callbacks to schedule new work */
+	/* allow callbacks to schedule new work */
+	spin_unlock_irqrestore(&ring->lock, flags);
 	while (!list_empty(&done)) {
 		frame = list_first_entry(&done, typeof(*frame), list);
 		/*
@@ -255,15 +258,17 @@ static void ring_work(struct work_struct *work)
 
 int __tb_ring_enqueue(struct tb_ring *ring, struct ring_frame *frame)
 {
+	unsigned long flags;
 	int ret = 0;
-	mutex_lock(&ring->lock);
+
+	spin_lock_irqsave(&ring->lock, flags);
 	if (ring->running) {
 		list_add_tail(&frame->list, &ring->queue);
 		ring_write_descriptors(ring);
 	} else {
 		ret = -ESHUTDOWN;
 	}
-	mutex_unlock(&ring->lock);
+	spin_unlock_irqrestore(&ring->lock, flags);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(__tb_ring_enqueue);
@@ -338,7 +343,7 @@ static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size,
 	if (!ring)
 		goto err;
 
-	mutex_init(&ring->lock);
+	spin_lock_init(&ring->lock);
 	INIT_LIST_HEAD(&ring->queue);
 	INIT_LIST_HEAD(&ring->in_flight);
 	INIT_WORK(&ring->work, ring_work);
@@ -371,8 +376,6 @@ static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size,
 	return ring;
 
 err:
-	if (ring)
-		mutex_destroy(&ring->lock);
 	kfree(ring);
 	mutex_unlock(&nhi->lock);
 	return NULL;
@@ -419,7 +422,7 @@ void tb_ring_start(struct tb_ring *ring)
 	u32 flags;
 
 	mutex_lock(&ring->nhi->lock);
-	mutex_lock(&ring->lock);
+	spin_lock_irq(&ring->lock);
 	if (ring->nhi->going_away)
 		goto err;
 	if (ring->running) {
@@ -466,7 +469,7 @@ void tb_ring_start(struct tb_ring *ring)
 	ring_interrupt_active(ring, true);
 	ring->running = true;
 err:
-	mutex_unlock(&ring->lock);
+	spin_unlock_irq(&ring->lock);
 	mutex_unlock(&ring->nhi->lock);
 }
 EXPORT_SYMBOL_GPL(tb_ring_start);
@@ -487,7 +490,7 @@ EXPORT_SYMBOL_GPL(tb_ring_start);
 void tb_ring_stop(struct tb_ring *ring)
 {
 	mutex_lock(&ring->nhi->lock);
-	mutex_lock(&ring->lock);
+	spin_lock_irq(&ring->lock);
 	dev_info(&ring->nhi->pdev->dev, "stopping %s %d\n",
 		 RING_TYPE(ring), ring->hop);
 	if (ring->nhi->going_away)
@@ -508,7 +511,7 @@ void tb_ring_stop(struct tb_ring *ring)
 	ring->running = false;
 
 err:
-	mutex_unlock(&ring->lock);
+	spin_unlock_irq(&ring->lock);
 	mutex_unlock(&ring->nhi->lock);
 
 	/*
@@ -568,7 +571,6 @@ void tb_ring_free(struct tb_ring *ring)
 	 * to finish before freeing the ring.
 	 */
 	flush_work(&ring->work);
-	mutex_destroy(&ring->lock);
 	kfree(ring);
 }
 EXPORT_SYMBOL_GPL(tb_ring_free);
diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index e3b9af7be0ad..cf9e42db780f 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -448,7 +448,7 @@ struct tb_nhi {
  * @eof_mask: Bit mask used to detect end of frame PDF
  */
 struct tb_ring {
-	struct mutex lock;
+	spinlock_t lock;
 	struct tb_nhi *nhi;
 	int size;
 	int hop;
-- 
2.14.1

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox