Netdev List

Netdev List
 help / color / mirror / Atom feed

* RE: [PATCH v4 4/9] em28xx: fix em28xx_dvb_init for KASAN
From: David Laight @ 2017-09-25 14:41 UTC (permalink / raw)
  To: 'Arnd Bergmann', Mauro Carvalho Chehab
  Cc: Jiri Pirko, Arend van Spriel, Kalle Valo, David S. Miller,
	Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov,
	Masahiro Yamada, Michal Marek, Andrew Morton, Kees Cook,
	Geert Uytterhoeven, Greg Kroah-Hartman,
	linux-media@vger.kernel.org, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, linux-wireless@vger.kernel.org
In-Reply-To: <20170922212930.620249-5-arnd@arndb.de>

From: Arnd Bergmann
> Sent: 22 September 2017 22:29
...
> It seems that this is triggered in part by using strlcpy(), which the
> compiler doesn't recognize as copying at most 'len' bytes, since strlcpy
> is not part of the C standard.

Neither is strncpy().

It'll almost certainly be a marker in a header file somewhere,
so it should be possible to teach it about other functions.

	David

^ permalink raw reply

* [PATCH net-next v9] openvswitch: enable NSH support
From: Yi Yang @ 2017-09-25 14:16 UTC (permalink / raw)
  To: netdev; +Cc: dev, jbenc, e, davem, Yi Yang

v8->v9
 - Fix build error reported by daily intel build
   because nsh module isn't selected by openvswitch

v7->v8
 - Rework nested value and mask for OVS_KEY_ATTR_NSH
 - Change pop_nsh to adapt to nsh kernel module
 - Fix many issues per comments from Jiri Benc

v6->v7
 - Remove NSH GSO patches in v6 because Jiri Benc
   reworked it as another patch series and they have
   been merged.
 - Change it to adapt to nsh kernel module added by NSH
   GSO patch series

v5->v6
 - Fix the remaining comments for v4.
 - Add NSH GSO support for VxLAN-gpe + NSH and
   Eth + NSH.

v4->v5
 - Fix many comments by Jiri Benc and Eric Garver
   for v4.

v3->v4
 - Add new NSH match field ttl
 - Update NSH header to the latest format
   which will be final format and won't change
   per its author's confirmation.
 - Fix comments for v3.

v2->v3
 - Change OVS_KEY_ATTR_NSH to nested key to handle
   length-fixed attributes and length-variable
   attribute more flexibly.
 - Remove struct ovs_action_push_nsh completely
 - Add code to handle nested attribute for SET_MASKED
 - Change PUSH_NSH to use the nested OVS_KEY_ATTR_NSH
   to transfer NSH header data.
 - Fix comments and coding style issues by Jiri and Eric

v1->v2
 - Change encap_nsh and decap_nsh to push_nsh and pop_nsh
 - Dynamically allocate struct ovs_action_push_nsh for
   length-variable metadata.

The OVS master and 2.8 branches have merged the NSH userspace
patch series; this patch enables NSH support
in the kernel data path so that OVS can support
NSH in compat mode by porting this.

Signed-off-by: Yi Yang <yi.y.yang@intel.com>
---
 include/net/nsh.h                |   3 +
 include/uapi/linux/openvswitch.h |  29 ++++
 net/nsh/nsh.c                    |  53 +++++++
 net/openvswitch/Kconfig          |   1 +
 net/openvswitch/actions.c        | 112 ++++++++++++++
 net/openvswitch/flow.c           |  51 ++++++
 net/openvswitch/flow.h           |  11 ++
 net/openvswitch/flow_netlink.c   | 324 ++++++++++++++++++++++++++++++++++++++-
 net/openvswitch/flow_netlink.h   |   5 +
 9 files changed, 588 insertions(+), 1 deletion(-)

diff --git a/include/net/nsh.h b/include/net/nsh.h
index a1eaea2..b886d33 100644
--- a/include/net/nsh.h
+++ b/include/net/nsh.h
@@ -304,4 +304,7 @@ static inline void nsh_set_flags_ttl_len(struct nshhdr *nsh, u8 flags,
 			NSH_FLAGS_MASK | NSH_TTL_MASK | NSH_LEN_MASK);
 }
 
+int skb_push_nsh(struct sk_buff *skb, const struct nshhdr *src_nsh_hdr);
+int skb_pop_nsh(struct sk_buff *skb);
+
 #endif /* __NET_NSH_H */
diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 156ee4c..c1a785c 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -333,6 +333,7 @@ enum ovs_key_attr {
 	OVS_KEY_ATTR_CT_LABELS,	/* 16-octet connection tracking label */
 	OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4,   /* struct ovs_key_ct_tuple_ipv4 */
 	OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6,   /* struct ovs_key_ct_tuple_ipv6 */
+	OVS_KEY_ATTR_NSH,       /* Nested set of ovs_nsh_key_* */
 
 #ifdef __KERNEL__
 	OVS_KEY_ATTR_TUNNEL_INFO,  /* struct ip_tunnel_info */
@@ -491,6 +492,30 @@ struct ovs_key_ct_tuple_ipv6 {
 	__u8   ipv6_proto;
 };
 
+enum ovs_nsh_key_attr {
+	OVS_NSH_KEY_ATTR_UNSPEC,
+	OVS_NSH_KEY_ATTR_BASE,  /* struct ovs_nsh_key_base. */
+	OVS_NSH_KEY_ATTR_MD1,   /* struct ovs_nsh_key_md1. */
+	OVS_NSH_KEY_ATTR_MD2,   /* variable-length octets for MD type 2. */
+	__OVS_NSH_KEY_ATTR_MAX
+};
+
+#define OVS_NSH_KEY_ATTR_MAX (__OVS_NSH_KEY_ATTR_MAX - 1)
+
+struct ovs_nsh_key_base {
+	__u8 flags;
+	__u8 ttl;
+	__u8 mdtype;
+	__u8 np;
+	__be32 path_hdr;
+};
+
+#define NSH_MD1_CONTEXT_SIZE 4
+
+struct ovs_nsh_key_md1 {
+	__be32 context[NSH_MD1_CONTEXT_SIZE];
+};
+
 /**
  * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands.
  * @OVS_FLOW_ATTR_KEY: Nested %OVS_KEY_ATTR_* attributes specifying the flow
@@ -806,6 +831,8 @@ struct ovs_action_push_eth {
  * packet.
  * @OVS_ACTION_ATTR_POP_ETH: Pop the outermost Ethernet header off the
  * packet.
+ * @OVS_ACTION_ATTR_PUSH_NSH: push NSH header to the packet.
+ * @OVS_ACTION_ATTR_POP_NSH: pop the outermost NSH header off the packet.
  *
  * Only a single header can be set with a single %OVS_ACTION_ATTR_SET.  Not all
  * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
@@ -835,6 +862,8 @@ enum ovs_action_attr {
 	OVS_ACTION_ATTR_TRUNC,        /* u32 struct ovs_action_trunc. */
 	OVS_ACTION_ATTR_PUSH_ETH,     /* struct ovs_action_push_eth. */
 	OVS_ACTION_ATTR_POP_ETH,      /* No argument. */
+	OVS_ACTION_ATTR_PUSH_NSH,     /* Nested OVS_NSH_KEY_ATTR_*. */
+	OVS_ACTION_ATTR_POP_NSH,      /* No argument. */
 
 	__OVS_ACTION_ATTR_MAX,	      /* Nothing past this will be accepted
 				       * from userspace. */
diff --git a/net/nsh/nsh.c b/net/nsh/nsh.c
index 58fb827..54334ca 100644
--- a/net/nsh/nsh.c
+++ b/net/nsh/nsh.c
@@ -14,6 +14,59 @@
 #include <net/nsh.h>
 #include <net/tun_proto.h>
 
+int skb_push_nsh(struct sk_buff *skb, const struct nshhdr *src_nsh_hdr)
+{
+	struct nshhdr *nsh_hdr;
+	size_t length = nsh_hdr_len(src_nsh_hdr);
+	u8 next_proto;
+
+	if (skb->mac_len) {
+		next_proto = TUN_P_ETHERNET;
+	} else {
+		next_proto = tun_p_from_eth_p(skb->protocol);
+		if (!next_proto)
+			return -EAFNOSUPPORT;
+	}
+
+	/* Add the NSH header */
+	if (skb_cow_head(skb, length) < 0)
+		return -ENOMEM;
+
+	skb_push(skb, length);
+	nsh_hdr = (struct nshhdr *)(skb->data);
+	memcpy(nsh_hdr, src_nsh_hdr, length);
+	nsh_hdr->np = next_proto;
+
+	skb->protocol = htons(ETH_P_NSH);
+	skb_reset_mac_header(skb);
+	skb_reset_mac_len(skb);
+	skb_reset_network_header(skb);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(skb_push_nsh);
+
+int skb_pop_nsh(struct sk_buff *skb)
+{
+	struct nshhdr *nsh_hdr = (struct nshhdr *)(skb->data);
+	size_t length;
+	u16 inner_proto;
+
+	inner_proto = tun_p_to_eth_p(nsh_hdr->np);
+	if (!inner_proto)
+		return -EAFNOSUPPORT;
+
+	length = nsh_hdr_len(nsh_hdr);
+	skb_pull(skb, length);
+	skb_reset_mac_header(skb);
+	skb_reset_mac_len(skb);
+	skb_reset_network_header(skb);
+	skb->protocol = inner_proto;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(skb_pop_nsh);
+
 static struct sk_buff *nsh_gso_segment(struct sk_buff *skb,
 				       netdev_features_t features)
 {
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index ce94729..2650205 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -14,6 +14,7 @@ config OPENVSWITCH
 	select MPLS
 	select NET_MPLS_GSO
 	select DST_CACHE
+	select NET_NSH
 	---help---
 	  Open vSwitch is a multilayer Ethernet switch targeted at virtualized
 	  environments.  In addition to supporting a variety of features
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index a54a556..d026b85 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -43,6 +43,7 @@
 #include "flow.h"
 #include "conntrack.h"
 #include "vport.h"
+#include "flow_netlink.h"
 
 struct deferred_action {
 	struct sk_buff *skb;
@@ -380,6 +381,45 @@ static int push_eth(struct sk_buff *skb, struct sw_flow_key *key,
 	return 0;
 }
 
+static int push_nsh(struct sk_buff *skb, struct sw_flow_key *key,
+		    const struct nshhdr *src_nsh_hdr)
+{
+	int err;
+
+	err = skb_push_nsh(skb, src_nsh_hdr);
+	if (err)
+		return err;
+
+	key->eth.type = htons(ETH_P_NSH);
+
+	/* safe right before invalidate_flow_key */
+	key->mac_proto = MAC_PROTO_NONE;
+	invalidate_flow_key(key);
+	return 0;
+}
+
+static int pop_nsh(struct sk_buff *skb, struct sw_flow_key *key)
+{
+	int err;
+
+	if (ovs_key_mac_proto(key) != MAC_PROTO_NONE ||
+	    skb->protocol != htons(ETH_P_NSH)) {
+		return -EINVAL;
+	}
+
+	err = skb_pop_nsh(skb);
+	if (err)
+		return err;
+
+	/* safe right before invalidate_flow_key */
+	if (skb->protocol == htons(ETH_P_TEB))
+		key->mac_proto = MAC_PROTO_ETHERNET;
+	else
+		key->mac_proto = MAC_PROTO_NONE;
+	invalidate_flow_key(key);
+	return 0;
+}
+
 static void update_ip_l4_checksum(struct sk_buff *skb, struct iphdr *nh,
 				  __be32 addr, __be32 new_addr)
 {
@@ -602,6 +642,59 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key,
 	return 0;
 }
 
+static int set_nsh(struct sk_buff *skb, struct sw_flow_key *flow_key,
+		   const struct nlattr *a)
+{
+	struct nshhdr *nsh_hdr;
+	int err;
+	u8 flags;
+	u8 ttl;
+	int i;
+
+	struct ovs_key_nsh key;
+	struct ovs_key_nsh mask;
+
+	err = nsh_key_from_nlattr(a, &key, &mask);
+	if (err)
+		return err;
+
+	err = skb_ensure_writable(skb, skb_network_offset(skb) +
+				  sizeof(struct nshhdr));
+	if (unlikely(err))
+		return err;
+
+	nsh_hdr = (struct nshhdr *)skb_network_header(skb);
+
+	flags = nsh_get_flags(nsh_hdr);
+	flags = OVS_MASKED(flags, key.flags, mask.flags);
+	flow_key->nsh.flags = flags;
+	ttl = nsh_get_ttl(nsh_hdr);
+	ttl = OVS_MASKED(ttl, key.ttl, mask.ttl);
+	flow_key->nsh.ttl = ttl;
+	nsh_set_flags_and_ttl(nsh_hdr, flags, ttl);
+	nsh_hdr->path_hdr = OVS_MASKED(nsh_hdr->path_hdr, key.path_hdr,
+				       mask.path_hdr);
+	flow_key->nsh.path_hdr = nsh_hdr->path_hdr;
+	switch (nsh_hdr->mdtype) {
+	case NSH_M_TYPE1:
+		for (i = 0; i < NSH_MD1_CONTEXT_SIZE; i++) {
+			nsh_hdr->md1.context[i] =
+			    OVS_MASKED(nsh_hdr->md1.context[i], key.context[i],
+				       mask.context[i]);
+		}
+		memcpy(flow_key->nsh.context, nsh_hdr->md1.context,
+		       sizeof(nsh_hdr->md1.context));
+		break;
+	case NSH_M_TYPE2:
+		memset(flow_key->nsh.context, 0,
+		       sizeof(flow_key->nsh.context));
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
 /* Must follow skb_ensure_writable() since that can move the skb data. */
 static void set_tp_port(struct sk_buff *skb, __be16 *port,
 			__be16 new_port, __sum16 *check)
@@ -1024,6 +1117,10 @@ static int execute_masked_set_action(struct sk_buff *skb,
 				   get_mask(a, struct ovs_key_ethernet *));
 		break;
 
+	case OVS_KEY_ATTR_NSH:
+		err = set_nsh(skb, flow_key, a);
+		break;
+
 	case OVS_KEY_ATTR_IPV4:
 		err = set_ipv4(skb, flow_key, nla_data(a),
 			       get_mask(a, struct ovs_key_ipv4 *));
@@ -1210,6 +1307,21 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 		case OVS_ACTION_ATTR_POP_ETH:
 			err = pop_eth(skb, key);
 			break;
+
+		case OVS_ACTION_ATTR_PUSH_NSH: {
+			u8 buffer[NSH_HDR_MAX_LEN];
+			struct nshhdr *nsh_hdr = (struct nshhdr *)buffer;
+			const struct nshhdr *src_nsh_hdr = nsh_hdr;
+
+			nsh_hdr_from_nlattr(nla_data(a), nsh_hdr,
+					    NSH_HDR_MAX_LEN);
+			err = push_nsh(skb, key, src_nsh_hdr);
+			break;
+		}
+
+		case OVS_ACTION_ATTR_POP_NSH:
+			err = pop_nsh(skb, key);
+			break;
 		}
 
 		if (unlikely(err)) {
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 8c94cef..67fb6d9 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -46,6 +46,7 @@
 #include <net/ipv6.h>
 #include <net/mpls.h>
 #include <net/ndisc.h>
+#include <net/nsh.h>
 
 #include "conntrack.h"
 #include "datapath.h"
@@ -490,6 +491,52 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
 	return 0;
 }
 
+static int parse_nsh(struct sk_buff *skb, struct sw_flow_key *key)
+{
+	struct nshhdr *nsh_hdr;
+	unsigned int nh_ofs = skb_network_offset(skb);
+	u8 version, length;
+	int err;
+
+	err = check_header(skb, nh_ofs + NSH_BASE_HDR_LEN);
+	if (unlikely(err))
+		return err;
+
+	nsh_hdr = (struct nshhdr *)skb_network_header(skb);
+	version = nsh_get_ver(nsh_hdr);
+	length = nsh_hdr_len(nsh_hdr);
+
+	if (version != 0)
+		return -EINVAL;
+
+	err = check_header(skb, nh_ofs + length);
+	if (unlikely(err))
+		return err;
+
+	nsh_hdr = (struct nshhdr *)skb_network_header(skb);
+	key->nsh.flags = nsh_get_flags(nsh_hdr);
+	key->nsh.ttl = nsh_get_ttl(nsh_hdr);
+	key->nsh.mdtype = nsh_hdr->mdtype;
+	key->nsh.np = nsh_hdr->np;
+	key->nsh.path_hdr = nsh_hdr->path_hdr;
+	switch (key->nsh.mdtype) {
+	case NSH_M_TYPE1:
+		if (length != NSH_M_TYPE1_LEN)
+			return -EINVAL;
+		memcpy(key->nsh.context, nsh_hdr->md1.context,
+		       sizeof(nsh_hdr->md1));
+		break;
+	case NSH_M_TYPE2:
+		memset(key->nsh.context, 0,
+		       sizeof(nsh_hdr->md1));
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 /**
  * key_extract - extracts a flow key from an Ethernet frame.
  * @skb: sk_buff that contains the frame, with skb->data pointing to the
@@ -735,6 +782,10 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 				memset(&key->tp, 0, sizeof(key->tp));
 			}
 		}
+	} else if (key->eth.type == htons(ETH_P_NSH)) {
+		error = parse_nsh(skb, key);
+		if (error)
+			return error;
 	}
 	return 0;
 }
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 1875bba..6a3cd9c 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -35,6 +35,7 @@
 #include <net/inet_ecn.h>
 #include <net/ip_tunnels.h>
 #include <net/dst_metadata.h>
+#include <net/nsh.h>
 
 struct sk_buff;
 
@@ -66,6 +67,15 @@ struct vlan_head {
 	(offsetof(struct sw_flow_key, recirc_id) +	\
 	FIELD_SIZEOF(struct sw_flow_key, recirc_id))
 
+struct ovs_key_nsh {
+	u8 flags;
+	u8 ttl;
+	u8 mdtype;
+	u8 np;
+	__be32 path_hdr;
+	__be32 context[NSH_MD1_CONTEXT_SIZE];
+};
+
 struct sw_flow_key {
 	u8 tun_opts[IP_TUNNEL_OPTS_MAX];
 	u8 tun_opts_len;
@@ -144,6 +154,7 @@ struct sw_flow_key {
 			};
 		} ipv6;
 	};
+	struct ovs_key_nsh nsh;         /* network service header */
 	struct {
 		/* Connection tracking fields not packed above. */
 		struct {
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index e8eb427..278bbb3 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -48,6 +48,7 @@
 #include <net/ndisc.h>
 #include <net/mpls.h>
 #include <net/vxlan.h>
+#include <net/tun_proto.h>
 
 #include "flow_netlink.h"
 
@@ -78,9 +79,11 @@ static bool actions_may_change_flow(const struct nlattr *actions)
 		case OVS_ACTION_ATTR_HASH:
 		case OVS_ACTION_ATTR_POP_ETH:
 		case OVS_ACTION_ATTR_POP_MPLS:
+		case OVS_ACTION_ATTR_POP_NSH:
 		case OVS_ACTION_ATTR_POP_VLAN:
 		case OVS_ACTION_ATTR_PUSH_ETH:
 		case OVS_ACTION_ATTR_PUSH_MPLS:
+		case OVS_ACTION_ATTR_PUSH_NSH:
 		case OVS_ACTION_ATTR_PUSH_VLAN:
 		case OVS_ACTION_ATTR_SAMPLE:
 		case OVS_ACTION_ATTR_SET:
@@ -322,12 +325,27 @@ size_t ovs_tun_key_attr_size(void)
 		+ nla_total_size(2);   /* OVS_TUNNEL_KEY_ATTR_TP_DST */
 }
 
+size_t ovs_nsh_key_attr_size(void)
+{
+	/* Whenever adding new OVS_NSH_KEY_ FIELDS, we should consider
+	 * updating this function.
+	 */
+	return  nla_total_size(NSH_BASE_HDR_LEN) /* OVS_NSH_KEY_ATTR_BASE */
+		/* OVS_NSH_KEY_ATTR_MD1 and OVS_NSH_KEY_ATTR_MD2 are
+		 * mutually exclusive, so the bigger one can cover
+		 * the small one.
+		 *
+		 * OVS_NSH_KEY_ATTR_MD2
+		 */
+		+ nla_total_size(NSH_CTX_HDRS_MAX_LEN);
+}
+
 size_t ovs_key_attr_size(void)
 {
 	/* Whenever adding new OVS_KEY_ FIELDS, we should consider
 	 * updating this function.
 	 */
-	BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 28);
+	BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 29);
 
 	return    nla_total_size(4)   /* OVS_KEY_ATTR_PRIORITY */
 		+ nla_total_size(0)   /* OVS_KEY_ATTR_TUNNEL */
@@ -341,6 +359,8 @@ size_t ovs_key_attr_size(void)
 		+ nla_total_size(4)   /* OVS_KEY_ATTR_CT_MARK */
 		+ nla_total_size(16)  /* OVS_KEY_ATTR_CT_LABELS */
 		+ nla_total_size(40)  /* OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6 */
+		+ nla_total_size(0)   /* OVS_KEY_ATTR_NSH */
+		  + ovs_nsh_key_attr_size()
 		+ nla_total_size(12)  /* OVS_KEY_ATTR_ETHERNET */
 		+ nla_total_size(2)   /* OVS_KEY_ATTR_ETHERTYPE */
 		+ nla_total_size(4)   /* OVS_KEY_ATTR_VLAN */
@@ -373,6 +393,13 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1]
 	[OVS_TUNNEL_KEY_ATTR_IPV6_DST]      = { .len = sizeof(struct in6_addr) },
 };
 
+static const struct ovs_len_tbl
+ovs_nsh_key_attr_lens[OVS_NSH_KEY_ATTR_MAX + 1] = {
+	[OVS_NSH_KEY_ATTR_BASE] = { .len = sizeof(struct ovs_nsh_key_base) },
+	[OVS_NSH_KEY_ATTR_MD1]  = { .len = sizeof(struct ovs_nsh_key_md1) },
+	[OVS_NSH_KEY_ATTR_MD2]  = { .len = OVS_ATTR_VARIABLE },
+};
+
 /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute.  */
 static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
 	[OVS_KEY_ATTR_ENCAP]	 = { .len = OVS_ATTR_NESTED },
@@ -405,6 +432,8 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
 		.len = sizeof(struct ovs_key_ct_tuple_ipv4) },
 	[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6] = {
 		.len = sizeof(struct ovs_key_ct_tuple_ipv6) },
+	[OVS_KEY_ATTR_NSH]       = { .len = OVS_ATTR_NESTED,
+				     .next = ovs_nsh_key_attr_lens, },
 };
 
 static bool check_attr_len(unsigned int attr_len, unsigned int expected_len)
@@ -1179,6 +1208,222 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
 	return 0;
 }
 
+int nsh_hdr_from_nlattr(const struct nlattr *attr,
+			struct nshhdr *nsh, size_t size)
+{
+	struct nlattr *a;
+	int rem;
+	u8 flags = 0;
+	u8 ttl = 0;
+	int mdlen = 0;
+
+	/* validate_nsh has check this, so we needn't do duplicate check here
+	 */
+	nla_for_each_nested(a, attr, rem) {
+		int type = nla_type(a);
+
+		switch (type) {
+		case OVS_NSH_KEY_ATTR_BASE: {
+			const struct ovs_nsh_key_base *base =
+				(struct ovs_nsh_key_base *)nla_data(a);
+			flags = base->flags;
+			ttl = base->ttl;
+			nsh->np = base->np;
+			nsh->mdtype = base->mdtype;
+			nsh->path_hdr = base->path_hdr;
+			break;
+		}
+		case OVS_NSH_KEY_ATTR_MD1: {
+			const struct ovs_nsh_key_md1 *md1 =
+				(struct ovs_nsh_key_md1 *)nla_data(a);
+			struct nsh_md1_ctx *md1_dst = &nsh->md1;
+
+			mdlen = nla_len(a);
+			memcpy(md1_dst, md1, mdlen);
+			break;
+		}
+		case OVS_NSH_KEY_ATTR_MD2: {
+			const struct u8 *md2 = nla_data(a);
+			struct nsh_md2_tlv *md2_dst = &nsh->md2;
+
+			mdlen = nla_len(a);
+			memcpy(md2_dst, md2, mdlen);
+			break;
+		}
+		default:
+			return -EINVAL;
+		}
+	}
+
+	/* nsh header length  = NSH_BASE_HDR_LEN + mdlen */
+	nsh->ver_flags_ttl_len = 0;
+	nsh_set_flags_ttl_len(nsh, flags, ttl, NSH_BASE_HDR_LEN + mdlen);
+
+	return 0;
+}
+
+int nsh_key_from_nlattr(const struct nlattr *attr,
+			struct ovs_key_nsh *nsh, struct ovs_key_nsh *nsh_mask)
+{
+	struct nlattr *a;
+	int rem;
+
+	/* validate_nsh has check this, so we needn't do duplicate check here
+	 */
+	nla_for_each_nested(a, attr, rem) {
+		int type = nla_type(a);
+
+		switch (type) {
+		case OVS_NSH_KEY_ATTR_BASE: {
+			const struct ovs_nsh_key_base *base =
+				(struct ovs_nsh_key_base *)nla_data(a);
+			const struct ovs_nsh_key_base *base_mask = base + 1;
+
+			memcpy(nsh, base, sizeof(*base));
+			memcpy(nsh, base_mask, sizeof(*base_mask));
+			break;
+		}
+		case OVS_NSH_KEY_ATTR_MD1: {
+			const struct ovs_nsh_key_md1 *md1 =
+				(struct ovs_nsh_key_md1 *)nla_data(a);
+			const struct ovs_nsh_key_md1 *md1_mask = md1 + 1;
+
+			memcpy(nsh->context, md1->context, sizeof(*md1));
+			memcpy(nsh_mask->context, md1_mask->context,
+			       sizeof(*md1_mask));
+			break;
+		}
+		case OVS_NSH_KEY_ATTR_MD2:
+			/* Not supported yet */
+			return -ENOTSUPP;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static int nsh_key_put_from_nlattr(const struct nlattr *attr,
+				   struct sw_flow_match *match, bool is_mask,
+				   bool is_push_nsh, bool log)
+{
+	struct nlattr *a;
+	int rem;
+	bool has_base = false;
+	bool has_md1 = false;
+	bool has_md2 = false;
+	u8 mdtype = 0;
+	int mdlen = 0;
+
+	if (unlikely(is_push_nsh && is_mask))
+		return -EINVAL;
+
+	nla_for_each_nested(a, attr, rem) {
+		int type = nla_type(a);
+		int i;
+
+		if (type > OVS_NSH_KEY_ATTR_MAX) {
+			OVS_NLERR(log, "nsh attr %d is out of range max %d",
+				  type, OVS_NSH_KEY_ATTR_MAX);
+			return -EINVAL;
+		}
+
+		if (!check_attr_len(nla_len(a),
+				    ovs_nsh_key_attr_lens[type].len)) {
+			OVS_NLERR(
+			    log,
+			    "nsh attr %d has unexpected len %d expected %d",
+			    type,
+			    nla_len(a),
+			    ovs_nsh_key_attr_lens[type].len
+			);
+			return -EINVAL;
+		}
+
+		switch (type) {
+		case OVS_NSH_KEY_ATTR_BASE: {
+			const struct ovs_nsh_key_base *base =
+				(struct ovs_nsh_key_base *)nla_data(a);
+
+			has_base = true;
+			mdtype = base->mdtype;
+			SW_FLOW_KEY_PUT(match, nsh.flags,
+					base->flags, is_mask);
+			SW_FLOW_KEY_PUT(match, nsh.ttl,
+					base->ttl, is_mask);
+			SW_FLOW_KEY_PUT(match, nsh.mdtype,
+					base->mdtype, is_mask);
+			SW_FLOW_KEY_PUT(match, nsh.np,
+					base->np, is_mask);
+			SW_FLOW_KEY_PUT(match, nsh.path_hdr,
+					base->path_hdr, is_mask);
+			break;
+		}
+		case OVS_NSH_KEY_ATTR_MD1: {
+			const struct ovs_nsh_key_md1 *md1 =
+				(struct ovs_nsh_key_md1 *)nla_data(a);
+
+			has_md1 = true;
+			for (i = 0; i < NSH_MD1_CONTEXT_SIZE; i++)
+				SW_FLOW_KEY_PUT(match, nsh.context[i],
+						md1->context[i], is_mask);
+			break;
+		}
+		case OVS_NSH_KEY_ATTR_MD2:
+			if (!is_push_nsh) /* Not supported MD type 2 yet */
+				return -ENOTSUPP;
+
+			has_md2 = true;
+			mdlen = nla_len(a);
+			if ((mdlen > NSH_CTX_HDRS_MAX_LEN) ||
+			    (mdlen <= 0)) {
+				WARN_ON_ONCE(1);
+				return -EINVAL;
+			}
+			break;
+		default:
+			OVS_NLERR(log, "Unknown nsh attribute %d",
+				  type);
+			return -EINVAL;
+		}
+	}
+
+	if (rem > 0) {
+		OVS_NLERR(log, "nsh attribute has %d unknown bytes.", rem);
+		return -EINVAL;
+	}
+
+	if (has_md1 && has_md2) {
+		OVS_NLERR(
+		    1,
+		    "invalid nsh attribute: md1 and md2 are exclusive."
+		);
+		return -EINVAL;
+	}
+
+	if (!is_mask) {
+		if ((has_md1 && mdtype != NSH_M_TYPE1) ||
+		    (has_md2 && mdtype != NSH_M_TYPE2)) {
+			OVS_NLERR(1, "nsh attribute has unmatched MD type %d.",
+				  mdtype);
+			return -EINVAL;
+		}
+
+		if (is_push_nsh &&
+		    (!has_base || (!has_md1 && !has_md2))) {
+			OVS_NLERR(
+			    1,
+			    "push nsh attributes are invalid for type %d.",
+			    mdtype
+			);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match,
 				u64 attrs, const struct nlattr **a,
 				bool is_mask, bool log)
@@ -1306,6 +1551,13 @@ static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match,
 		attrs &= ~(1 << OVS_KEY_ATTR_ARP);
 	}
 
+	if (attrs & (1 << OVS_KEY_ATTR_NSH)) {
+		if (nsh_key_put_from_nlattr(a[OVS_KEY_ATTR_NSH], match,
+					    is_mask, false, log) < 0)
+			return -EINVAL;
+		attrs &= ~(1 << OVS_KEY_ATTR_NSH);
+	}
+
 	if (attrs & (1 << OVS_KEY_ATTR_MPLS)) {
 		const struct ovs_key_mpls *mpls_key;
 
@@ -1622,6 +1874,40 @@ static int ovs_nla_put_vlan(struct sk_buff *skb, const struct vlan_head *vh,
 	return 0;
 }
 
+static int nsh_key_to_nlattr(const struct ovs_key_nsh *nsh, bool is_mask,
+			     struct sk_buff *skb)
+{
+	struct nlattr *start;
+	struct ovs_nsh_key_base base;
+	struct ovs_nsh_key_md1 md1;
+
+	memcpy(&base, nsh, sizeof(base));
+
+	if (is_mask || nsh->mdtype == NSH_M_TYPE1)
+		memcpy(md1.context, nsh->context, sizeof(md1));
+
+	start = nla_nest_start(skb, OVS_KEY_ATTR_NSH);
+	if (!start)
+		return -EMSGSIZE;
+
+	if (nla_put(skb, OVS_NSH_KEY_ATTR_BASE, sizeof(base), &base))
+		goto nla_put_failure;
+
+	if (is_mask || nsh->mdtype == NSH_M_TYPE1) {
+		if (nla_put(skb, OVS_NSH_KEY_ATTR_MD1, sizeof(md1), &md1))
+			goto nla_put_failure;
+	}
+
+	/* Don't support MD type 2 yet */
+
+	nla_nest_end(skb, start);
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
 static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
 			     const struct sw_flow_key *output, bool is_mask,
 			     struct sk_buff *skb)
@@ -1750,6 +2036,9 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
 		ipv6_key->ipv6_tclass = output->ip.tos;
 		ipv6_key->ipv6_hlimit = output->ip.ttl;
 		ipv6_key->ipv6_frag = output->ip.frag;
+	} else if (swkey->eth.type == htons(ETH_P_NSH)) {
+		if (nsh_key_to_nlattr(&output->nsh, is_mask, skb))
+			goto nla_put_failure;
 	} else if (swkey->eth.type == htons(ETH_P_ARP) ||
 		   swkey->eth.type == htons(ETH_P_RARP)) {
 		struct ovs_key_arp *arp_key;
@@ -2242,6 +2531,19 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
 	return err;
 }
 
+static bool validate_nsh(const struct nlattr *attr, bool is_mask,
+			 bool is_push_nsh, bool log)
+{
+	struct sw_flow_match match;
+	struct sw_flow_key key;
+	int ret = 0;
+
+	ovs_match_init(&match, &key, true, NULL);
+	ret = nsh_key_put_from_nlattr(attr, &match, is_mask,
+				      is_push_nsh, log);
+	return ((ret != 0) ? false : true);
+}
+
 /* Return false if there are any non-masked bits set.
  * Mask follows data immediately, before any netlink padding.
  */
@@ -2384,6 +2686,11 @@ static int validate_set(const struct nlattr *a,
 
 		break;
 
+	case OVS_KEY_ATTR_NSH:
+		if (!validate_nsh(nla_data(a), masked, false, log))
+			return -EINVAL;
+		break;
+
 	default:
 		return -EINVAL;
 	}
@@ -2482,6 +2789,8 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
 			[OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc),
 			[OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct ovs_action_push_eth),
 			[OVS_ACTION_ATTR_POP_ETH] = 0,
+			[OVS_ACTION_ATTR_PUSH_NSH] = (u32)-1,
+			[OVS_ACTION_ATTR_POP_NSH] = 0,
 		};
 		const struct ovs_action_push_vlan *vlan;
 		int type = nla_type(a);
@@ -2636,6 +2945,19 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
 			mac_proto = MAC_PROTO_ETHERNET;
 			break;
 
+		case OVS_ACTION_ATTR_PUSH_NSH:
+			mac_proto = MAC_PROTO_NONE;
+			if (!validate_nsh(nla_data(a), false, true, true))
+				return -EINVAL;
+			break;
+
+		case OVS_ACTION_ATTR_POP_NSH:
+			if (key->nsh.np == TUN_P_ETHERNET)
+				mac_proto = MAC_PROTO_ETHERNET;
+			else
+				mac_proto = MAC_PROTO_NONE;
+			break;
+
 		default:
 			OVS_NLERR(log, "Unknown Action type %d", type);
 			return -EINVAL;
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index 929c665..4b80083 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -79,4 +79,9 @@ int ovs_nla_put_actions(const struct nlattr *attr,
 void ovs_nla_free_flow_actions(struct sw_flow_actions *);
 void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *);
 
+int nsh_key_from_nlattr(const struct nlattr *attr, struct ovs_key_nsh *nsh,
+			struct ovs_key_nsh *nsh_mask);
+int nsh_hdr_from_nlattr(const struct nlattr *attr, struct nshhdr *src_nsh_hdr,
+			size_t size);
+
 #endif /* flow_netlink.h */
-- 
2.5.5

^ permalink raw reply related

* Re: [patch net-next v2 06/12] net: mroute: Check if rule is a default rule
From: Yotam Gigi @ 2017-09-25 13:37 UTC (permalink / raw)
  To: Nikolay Aleksandrov, Jiri Pirko, Yunsheng Lin
  Cc: netdev, davem, idosch, mlxsw, andrew
In-Reply-To: <fc1f3324-d2fc-f95c-50d2-25773f3c7683@cumulusnetworks.com>

On 09/25/2017 01:02 PM, Nikolay Aleksandrov wrote:
> On 25/09/17 12:45, Jiri Pirko wrote:
>> Mon, Sep 25, 2017 at 03:28:21AM CEST, linyunsheng@huawei.com wrote:
>>> Hi, Jiri
>>>
>>> On 2017/9/25 1:22, Jiri Pirko wrote:
>>>> From: Yotam Gigi <yotamg@mellanox.com>
>>>>
>>>> When the ipmr starts, it adds one default FIB rule that matches all packets
>>>> and sends them to the DEFAULT (multicast) FIB table. A more complex rule
>>>> can be added by user to specify that for a specific interface, a packet
>>>> should be look up at either an arbitrary table or according to the l3mdev
>>>> of the interface.
>>>>
>>>> For drivers willing to offload the ipmr logic into a hardware but don't
>>>> want to offload all the FIB rules functionality, provide a function that
>>>> can indicate whether the FIB rule is the default multicast rule, thus only
>>>> one routing table is needed.
>>>>
>>>> This way, a driver can register to the FIB notification chain, get
>>>> notifications about FIB rules added and trigger some kind of an internal
>>>> abort mechanism when a non default rule is added by the user.
>>>>
>>>> Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
>>>> Reviewed-by: Ido Schimmel <idosch@mellanox.com>
>>>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>>>> ---
>>>>  include/linux/mroute.h |  7 +++++++
>>>>  net/ipv4/ipmr.c        | 10 ++++++++++
>>>>  2 files changed, 17 insertions(+)
>>>>
>>>> diff --git a/include/linux/mroute.h b/include/linux/mroute.h
>>>> index 5566580..b072a84 100644
>>>> --- a/include/linux/mroute.h
>>>> +++ b/include/linux/mroute.h
>>>> @@ -5,6 +5,7 @@
>>>>  #include <linux/pim.h>
>>>>  #include <linux/rhashtable.h>
>>>>  #include <net/sock.h>
>>>> +#include <net/fib_rules.h>
>>>>  #include <net/fib_notifier.h>
>>>>  #include <uapi/linux/mroute.h>
>>>>  
>>>> @@ -19,6 +20,7 @@ int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *);
>>>>  int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg);
>>>>  int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg);
>>>>  int ip_mr_init(void);
>>>> +bool ipmr_rule_default(const struct fib_rule *rule);
>>>>  #else
>>>>  static inline int ip_mroute_setsockopt(struct sock *sock, int optname,
>>>>  				       char __user *optval, unsigned int optlen)
>>>> @@ -46,6 +48,11 @@ static inline int ip_mroute_opt(int opt)
>>>>  {
>>>>  	return 0;
>>>>  }
>>>> +
>>>> +static inline bool ipmr_rule_default(const struct fib_rule *rule)
>>>> +{
>>>> +	return true;
>>>> +}
>>>>  #endif
>>>>  
>>>>  struct vif_device {
>>>> diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
>>>> index 2a795d2..a714f55 100644
>>>> --- a/net/ipv4/ipmr.c
>>>> +++ b/net/ipv4/ipmr.c
>>>> @@ -320,6 +320,16 @@ static unsigned int ipmr_rules_seq_read(struct net *net)
>>>>  }
>>>>  #endif
>>>>  
>>>> +bool ipmr_rule_default(const struct fib_rule *rule)
>>>> +{
>>>> +#if IS_ENABLED(CONFIG_FIB_RULES)
>>>> +	return fib_rule_matchall(rule) && rule->table == RT_TABLE_DEFAULT;
>>>> +#else
>>>> +	return true;
>>>> +#endif
>>> In patch 02, You have the following, can you do the same for the above?
>>> +#ifdef CONFIG_IP_MROUTE
>>> +void ipmr_cache_free(struct mfc_cache *mfc_cache);
>>> +#else
>>> +static inline void ipmr_cache_free(struct mfc_cache *mfc_cache)
>>> +{
>>> +}
>>> +#endif
>> I don't believe this is necessary. The solution you described is often
>> used in headers. But here, I'm ok with the current code.
>>
> +1


Hmm, when re-looking at it, I think I will just use the already existing
#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES rather than adding a new one. It selects
CONFIG_FIB_RULES, and if CONFIG_IP_MROUTE_MULTIPLE_TABLES is not defined
then only default rules can exist for the IPMR family.

I will fix it for v3.



>
>>>> +}
>>>> +EXPORT_SYMBOL(ipmr_rule_default);
>>>> +
>>>>  static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
>>>>  				const void *ptr)
>>>>  {
>>>>

^ permalink raw reply

* Re: [PATCH net v2 1/3] net: mvpp2: fix parsing fragmentation detection
From: Antoine Tenart @ 2017-09-25 13:10 UTC (permalink / raw)
  To: davem
  Cc: Stefan Chulski, andrew, gregory.clement, thomas.petazzoni,
	miquel.raynal, nadavh, linux, linux-kernel, mw, netdev,
	Antoine Tenart
In-Reply-To: <20170925125948.13507-2-antoine.tenart@free-electrons.com>

On Mon, Sep 25, 2017 at 02:59:46PM +0200, Antoine Tenart wrote:
> From: Stefan Chulski <stefanc@marvell.com>
> 
> Parsing fragmentation detection failed due to wrongly configured
> parser TCAM entries. Some traffic was marked as fragmented in the RX
> descriptor, even though it wasn't IP fragmented. The hardware also failed
> to calculate checksums, which led to the use of software checksums and
> caused performance degradation.
> 
> Fixes: 3f518509dedc ("ethernet: Add new driver for Marvell Armada 375 network unit")

With,

Signed-off-by: Stefan Chulski <stefanc@marvell.com>

I don't know why this SoB was removed but it should be added back.

Antoine

> Signed-off-by: Antoine Tenart <antoine.tenart@free-electrons.com>
> ---
>  drivers/net/ethernet/marvell/mvpp2.c | 20 ++++++++++++++------
>  1 file changed, 14 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
> index dd0ee2691c86..da04939a2748 100644
> --- a/drivers/net/ethernet/marvell/mvpp2.c
> +++ b/drivers/net/ethernet/marvell/mvpp2.c
> @@ -676,6 +676,7 @@ enum mvpp2_tag_type {
>  #define MVPP2_PRS_RI_L3_MCAST			BIT(15)
>  #define MVPP2_PRS_RI_L3_BCAST			(BIT(15) | BIT(16))
>  #define MVPP2_PRS_RI_IP_FRAG_MASK		0x20000
> +#define MVPP2_PRS_RI_IP_FRAG_TRUE		BIT(17)
>  #define MVPP2_PRS_RI_UDF3_MASK			0x300000
>  #define MVPP2_PRS_RI_UDF3_RX_SPECIAL		BIT(21)
>  #define MVPP2_PRS_RI_L4_PROTO_MASK		0x1c00000
> @@ -2315,7 +2316,7 @@ static int mvpp2_prs_ip4_proto(struct mvpp2 *priv, unsigned short proto,
>  	    (proto != IPPROTO_IGMP))
>  		return -EINVAL;
>  
> -	/* Fragmented packet */
> +	/* Not fragmented packet */
>  	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
>  					MVPP2_PE_LAST_FREE_TID);
>  	if (tid < 0)
> @@ -2334,8 +2335,12 @@ static int mvpp2_prs_ip4_proto(struct mvpp2 *priv, unsigned short proto,
>  				  MVPP2_PRS_SRAM_OP_SEL_UDF_ADD);
>  	mvpp2_prs_sram_ai_update(&pe, MVPP2_PRS_IPV4_DIP_AI_BIT,
>  				 MVPP2_PRS_IPV4_DIP_AI_BIT);
> -	mvpp2_prs_sram_ri_update(&pe, ri | MVPP2_PRS_RI_IP_FRAG_MASK,
> -				 ri_mask | MVPP2_PRS_RI_IP_FRAG_MASK);
> +	mvpp2_prs_sram_ri_update(&pe, ri, ri_mask | MVPP2_PRS_RI_IP_FRAG_MASK);
> +
> +	mvpp2_prs_tcam_data_byte_set(&pe, 2, 0x00,
> +				     MVPP2_PRS_TCAM_PROTO_MASK_L);
> +	mvpp2_prs_tcam_data_byte_set(&pe, 3, 0x00,
> +				     MVPP2_PRS_TCAM_PROTO_MASK);
>  
>  	mvpp2_prs_tcam_data_byte_set(&pe, 5, proto, MVPP2_PRS_TCAM_PROTO_MASK);
>  	mvpp2_prs_tcam_ai_update(&pe, 0, MVPP2_PRS_IPV4_DIP_AI_BIT);
> @@ -2346,7 +2351,7 @@ static int mvpp2_prs_ip4_proto(struct mvpp2 *priv, unsigned short proto,
>  	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP4);
>  	mvpp2_prs_hw_write(priv, &pe);
>  
> -	/* Not fragmented packet */
> +	/* Fragmented packet */
>  	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
>  					MVPP2_PE_LAST_FREE_TID);
>  	if (tid < 0)
> @@ -2358,8 +2363,11 @@ static int mvpp2_prs_ip4_proto(struct mvpp2 *priv, unsigned short proto,
>  	pe.sram.word[MVPP2_PRS_SRAM_RI_CTRL_WORD] = 0x0;
>  	mvpp2_prs_sram_ri_update(&pe, ri, ri_mask);
>  
> -	mvpp2_prs_tcam_data_byte_set(&pe, 2, 0x00, MVPP2_PRS_TCAM_PROTO_MASK_L);
> -	mvpp2_prs_tcam_data_byte_set(&pe, 3, 0x00, MVPP2_PRS_TCAM_PROTO_MASK);
> +	mvpp2_prs_sram_ri_update(&pe, ri | MVPP2_PRS_RI_IP_FRAG_TRUE,
> +				 ri_mask | MVPP2_PRS_RI_IP_FRAG_MASK);
> +
> +	mvpp2_prs_tcam_data_byte_set(&pe, 2, 0x00, 0x0);
> +	mvpp2_prs_tcam_data_byte_set(&pe, 3, 0x00, 0x0);
>  
>  	/* Update shadow table and hw entry */
>  	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP4);
> -- 
> 2.13.5
> 

-- 
Antoine Ténart, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

^ permalink raw reply

* Re: [PATCH net v2 2/3] net: mvpp2: fix port list indexing
From: Antoine Tenart @ 2017-09-25 13:09 UTC (permalink / raw)
  To: davem
  Cc: Yan Markman, andrew, gregory.clement, thomas.petazzoni,
	miquel.raynal, nadavh, linux, linux-kernel, mw, stefanc, netdev,
	Antoine Tenart
In-Reply-To: <20170925125948.13507-3-antoine.tenart@free-electrons.com>

On Mon, Sep 25, 2017 at 02:59:47PM +0200, Antoine Tenart wrote:
> From: Yan Markman <ymarkman@marvell.com>
> 
> The private port_list array has a list of pointers to mvpp2_port
> instances. This list is allocated given the number of ports enabled in
> the device tree, but the pointers are set using the port-id property. If
> only a single port is enabled, the port_list array will be of size 1, but
> when registering the port, if its id is not 0 the driver will crash.
> Other crashes were encountered in various situations.
> 
> This fixes the issue by indexing the array with a counter that is
> independent of the value of the port-id property.
> 
> Fixes: 3f518509dedc ("ethernet: Add new driver for Marvell Armada 375 network unit")

With,

Signed-off-by: Yan Markman <ymarkman@marvell.com>

I don't know why it was removed, but this SoB should be added back.

Antoine

> Signed-off-by: Antoine Tenart <antoine.tenart@free-electrons.com>
> ---
>  drivers/net/ethernet/marvell/mvpp2.c | 8 +++++---
>  1 file changed, 5 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
> index da04939a2748..b2f99df81e9c 100644
> --- a/drivers/net/ethernet/marvell/mvpp2.c
> +++ b/drivers/net/ethernet/marvell/mvpp2.c
> @@ -7504,7 +7504,7 @@ static void mvpp2_port_copy_mac_addr(struct net_device *dev, struct mvpp2 *priv,
>  /* Ports initialization */
>  static int mvpp2_port_probe(struct platform_device *pdev,
>  			    struct device_node *port_node,
> -			    struct mvpp2 *priv)
> +			    struct mvpp2 *priv, int index)
>  {
>  	struct device_node *phy_node;
>  	struct phy *comphy;
> @@ -7678,7 +7678,7 @@ static int mvpp2_port_probe(struct platform_device *pdev,
>  	}
>  	netdev_info(dev, "Using %s mac address %pM\n", mac_from, dev->dev_addr);
>  
> -	priv->port_list[id] = port;
> +	priv->port_list[index] = port;
>  	return 0;
>  
>  err_free_port_pcpu:
> @@ -8013,10 +8013,12 @@ static int mvpp2_probe(struct platform_device *pdev)
>  	}
>  
>  	/* Initialize ports */
> +	i = 0;
>  	for_each_available_child_of_node(dn, port_node) {
> -		err = mvpp2_port_probe(pdev, port_node, priv);
> +		err = mvpp2_port_probe(pdev, port_node, priv, i);
>  		if (err < 0)
>  			goto err_mg_clk;
> +		i++;
>  	}
>  
>  	platform_set_drvdata(pdev, priv);
> -- 
> 2.13.5
> 

-- 
Antoine Ténart, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

^ permalink raw reply

* Re: [PATCH net-next] net: mvpp2: phylink support
From: Antoine Tenart @ 2017-09-25 13:06 UTC (permalink / raw)
  To: Russell King - ARM Linux
  Cc: Antoine Tenart, davem, andrew, gregory.clement, thomas.petazzoni,
	miquel.raynal, nadavh, linux-kernel, mw, stefanc, netdev
In-Reply-To: <20170925121343.GO20805@n2100.armlinux.org.uk>

On Mon, Sep 25, 2017 at 01:13:43PM +0100, Russell King - ARM Linux wrote:
> On Mon, Sep 25, 2017 at 01:53:03PM +0200, Antoine Tenart wrote:
> > On Mon, Sep 25, 2017 at 11:45:32AM +0100, Russell King - ARM Linux wrote:
> > > Can you describe what the GoP link IRQ is doing please?
> > 
> > In cases where there is no PHY connected to the MAC and no SFP cage is
> > used. One example is when a SOHO switch is connected directly to a
> > serdes lane. In such cases we still need to have a minimal link
> > management. The GoP link interrupt helps doing so as it raises when the
> > serdes is in sync and AN succeeded.
> 
> Isn't this just like a fixed link scenario, or an in-band
> autonegotiation scenario (both of which phylink supports natively)?
> 
> The situation on Clearfog with the 88E6176 switch is pretty similar -
> a switch connected directly via serdes to the MAC.  Currently, we
> configure stuff there as a fixed link, but in actual fact the 88E6176
> is configured to run the CPU facing port in 1000base-X mode, and with
> appropriate tweaks, switching phylink to 1000base-X mode also works.

Hmm, I think you're right, we should be able to represent the link
between the MAC and the switch as a fixed link. And when it's not fixed,
it could be done with in-band AN. I cannot test this myself, but I've
asked someone who can to try it.

Antoine

-- 
Antoine Ténart, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

^ permalink raw reply

* [PATCH net v2 3/3] net: mvpp2: do not select the internal source clock
From: Antoine Tenart @ 2017-09-25 12:59 UTC (permalink / raw)
  To: davem
  Cc: Antoine Tenart, andrew, gregory.clement, thomas.petazzoni,
	miquel.raynal, nadavh, linux, linux-kernel, mw, stefanc, netdev
In-Reply-To: <20170925125948.13507-1-antoine.tenart@free-electrons.com>

This patch stops the internal MAC Tx clock from being enabled as the
internal clock isn't used. The definition used for the bit controlling
this behaviour is also renamed, as it was wrongly named (bit 4 of
GMAC_CTRL_2_REG).

Fixes: 3919357fb0bb ("net: mvpp2: initialize the GMAC when using a port")
Signed-off-by: Antoine Tenart <antoine.tenart@free-electrons.com>
---
 drivers/net/ethernet/marvell/mvpp2.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
index b2f99df81e9c..161055564720 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -333,7 +333,7 @@
 #define     MVPP2_GMAC_INBAND_AN_MASK		BIT(0)
 #define     MVPP2_GMAC_FLOW_CTRL_MASK		GENMASK(2, 1)
 #define     MVPP2_GMAC_PCS_ENABLE_MASK		BIT(3)
-#define     MVPP2_GMAC_PORT_RGMII_MASK		BIT(4)
+#define     MVPP2_GMAC_INTERNAL_CLK_MASK	BIT(4)
 #define     MVPP2_GMAC_DISABLE_PADDING		BIT(5)
 #define     MVPP2_GMAC_PORT_RESET_MASK		BIT(6)
 #define MVPP2_GMAC_AUTONEG_CONFIG		0xc
@@ -4599,7 +4599,6 @@ static void mvpp2_port_mii_gmac_configure(struct mvpp2_port *port)
 	        val |= MVPP2_GMAC_INBAND_AN_MASK | MVPP2_GMAC_PCS_ENABLE_MASK;
 	} else if (phy_interface_mode_is_rgmii(port->phy_interface)) {
 		val &= ~MVPP2_GMAC_PCS_ENABLE_MASK;
-		val |= MVPP2_GMAC_PORT_RGMII_MASK;
 	}
 	writel(val, port->base + MVPP2_GMAC_CTRL_2_REG);
 
-- 
2.13.5

^ permalink raw reply related

* [PATCH net v2 2/3] net: mvpp2: fix port list indexing
From: Antoine Tenart @ 2017-09-25 12:59 UTC (permalink / raw)
  To: davem
  Cc: Yan Markman, andrew, gregory.clement, thomas.petazzoni,
	miquel.raynal, nadavh, linux, linux-kernel, mw, stefanc, netdev,
	Antoine Tenart
In-Reply-To: <20170925125948.13507-1-antoine.tenart@free-electrons.com>

From: Yan Markman <ymarkman@marvell.com>

The private port_list array has a list of pointers to mvpp2_port
instances. This list is allocated given the number of ports enabled in
the device tree, but the pointers are set using the port-id property. If
only a single port is enabled, the port_list array will be of size 1, but
when registering the port, if its id is not 0 the driver will crash.
Other crashes were encountered in various situations.

This fixes the issue by indexing the array with a counter that is
independent of the value of the port-id property.

Fixes: 3f518509dedc ("ethernet: Add new driver for Marvell Armada 375 network unit")
Signed-off-by: Antoine Tenart <antoine.tenart@free-electrons.com>
---
 drivers/net/ethernet/marvell/mvpp2.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
index da04939a2748..b2f99df81e9c 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -7504,7 +7504,7 @@ static void mvpp2_port_copy_mac_addr(struct net_device *dev, struct mvpp2 *priv,
 /* Ports initialization */
 static int mvpp2_port_probe(struct platform_device *pdev,
 			    struct device_node *port_node,
-			    struct mvpp2 *priv)
+			    struct mvpp2 *priv, int index)
 {
 	struct device_node *phy_node;
 	struct phy *comphy;
@@ -7678,7 +7678,7 @@ static int mvpp2_port_probe(struct platform_device *pdev,
 	}
 	netdev_info(dev, "Using %s mac address %pM\n", mac_from, dev->dev_addr);
 
-	priv->port_list[id] = port;
+	priv->port_list[index] = port;
 	return 0;
 
 err_free_port_pcpu:
@@ -8013,10 +8013,12 @@ static int mvpp2_probe(struct platform_device *pdev)
 	}
 
 	/* Initialize ports */
+	i = 0;
 	for_each_available_child_of_node(dn, port_node) {
-		err = mvpp2_port_probe(pdev, port_node, priv);
+		err = mvpp2_port_probe(pdev, port_node, priv, i);
 		if (err < 0)
 			goto err_mg_clk;
+		i++;
 	}
 
 	platform_set_drvdata(pdev, priv);
-- 
2.13.5

^ permalink raw reply related

* [PATCH net v2 1/3] net: mvpp2: fix parsing fragmentation detection
From: Antoine Tenart @ 2017-09-25 12:59 UTC (permalink / raw)
  To: davem
  Cc: Stefan Chulski, andrew, gregory.clement, thomas.petazzoni,
	miquel.raynal, nadavh, linux, linux-kernel, mw, netdev,
	Antoine Tenart
In-Reply-To: <20170925125948.13507-1-antoine.tenart@free-electrons.com>

From: Stefan Chulski <stefanc@marvell.com>

Parsing fragmentation detection failed due to wrongly configured
parser TCAM entries. Some traffic was marked as fragmented in the RX
descriptor, even though it wasn't IP fragmented. The hardware also failed
to calculate checksums, which led to the use of software checksums and
caused performance degradation.

Fixes: 3f518509dedc ("ethernet: Add new driver for Marvell Armada 375 network unit")
Signed-off-by: Antoine Tenart <antoine.tenart@free-electrons.com>
---
 drivers/net/ethernet/marvell/mvpp2.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
index dd0ee2691c86..da04939a2748 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -676,6 +676,7 @@ enum mvpp2_tag_type {
 #define MVPP2_PRS_RI_L3_MCAST			BIT(15)
 #define MVPP2_PRS_RI_L3_BCAST			(BIT(15) | BIT(16))
 #define MVPP2_PRS_RI_IP_FRAG_MASK		0x20000
+#define MVPP2_PRS_RI_IP_FRAG_TRUE		BIT(17)
 #define MVPP2_PRS_RI_UDF3_MASK			0x300000
 #define MVPP2_PRS_RI_UDF3_RX_SPECIAL		BIT(21)
 #define MVPP2_PRS_RI_L4_PROTO_MASK		0x1c00000
@@ -2315,7 +2316,7 @@ static int mvpp2_prs_ip4_proto(struct mvpp2 *priv, unsigned short proto,
 	    (proto != IPPROTO_IGMP))
 		return -EINVAL;
 
-	/* Fragmented packet */
+	/* Not fragmented packet */
 	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
 					MVPP2_PE_LAST_FREE_TID);
 	if (tid < 0)
@@ -2334,8 +2335,12 @@ static int mvpp2_prs_ip4_proto(struct mvpp2 *priv, unsigned short proto,
 				  MVPP2_PRS_SRAM_OP_SEL_UDF_ADD);
 	mvpp2_prs_sram_ai_update(&pe, MVPP2_PRS_IPV4_DIP_AI_BIT,
 				 MVPP2_PRS_IPV4_DIP_AI_BIT);
-	mvpp2_prs_sram_ri_update(&pe, ri | MVPP2_PRS_RI_IP_FRAG_MASK,
-				 ri_mask | MVPP2_PRS_RI_IP_FRAG_MASK);
+	mvpp2_prs_sram_ri_update(&pe, ri, ri_mask | MVPP2_PRS_RI_IP_FRAG_MASK);
+
+	mvpp2_prs_tcam_data_byte_set(&pe, 2, 0x00,
+				     MVPP2_PRS_TCAM_PROTO_MASK_L);
+	mvpp2_prs_tcam_data_byte_set(&pe, 3, 0x00,
+				     MVPP2_PRS_TCAM_PROTO_MASK);
 
 	mvpp2_prs_tcam_data_byte_set(&pe, 5, proto, MVPP2_PRS_TCAM_PROTO_MASK);
 	mvpp2_prs_tcam_ai_update(&pe, 0, MVPP2_PRS_IPV4_DIP_AI_BIT);
@@ -2346,7 +2351,7 @@ static int mvpp2_prs_ip4_proto(struct mvpp2 *priv, unsigned short proto,
 	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP4);
 	mvpp2_prs_hw_write(priv, &pe);
 
-	/* Not fragmented packet */
+	/* Fragmented packet */
 	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
 					MVPP2_PE_LAST_FREE_TID);
 	if (tid < 0)
@@ -2358,8 +2363,11 @@ static int mvpp2_prs_ip4_proto(struct mvpp2 *priv, unsigned short proto,
 	pe.sram.word[MVPP2_PRS_SRAM_RI_CTRL_WORD] = 0x0;
 	mvpp2_prs_sram_ri_update(&pe, ri, ri_mask);
 
-	mvpp2_prs_tcam_data_byte_set(&pe, 2, 0x00, MVPP2_PRS_TCAM_PROTO_MASK_L);
-	mvpp2_prs_tcam_data_byte_set(&pe, 3, 0x00, MVPP2_PRS_TCAM_PROTO_MASK);
+	mvpp2_prs_sram_ri_update(&pe, ri | MVPP2_PRS_RI_IP_FRAG_TRUE,
+				 ri_mask | MVPP2_PRS_RI_IP_FRAG_MASK);
+
+	mvpp2_prs_tcam_data_byte_set(&pe, 2, 0x00, 0x0);
+	mvpp2_prs_tcam_data_byte_set(&pe, 3, 0x00, 0x0);
 
 	/* Update shadow table and hw entry */
 	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP4);
-- 
2.13.5

^ permalink raw reply related

* [PATCH net v2 0/3] net: mvpp2: various fixes
From: Antoine Tenart @ 2017-09-25 12:59 UTC (permalink / raw)
  To: davem
  Cc: Antoine Tenart, andrew, gregory.clement, thomas.petazzoni,
	miquel.raynal, nadavh, linux, linux-kernel, mw, stefanc, netdev

Hi all,

This series contains 3 fixes for the Marvell PPv2 driver.

Thanks!
Antoine

Since v1:
  - Removed one patch about dma masks as it would need a better fix.
  - Added one fix about the MAC Tx clock source selection.

Antoine Tenart (1):
  net: mvpp2: do not select the internal source clock

Stefan Chulski (1):
  net: mvpp2: fix parsing fragmentation detection

Yan Markman (1):
  net: mvpp2: fix port list indexing

 drivers/net/ethernet/marvell/mvpp2.c | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

-- 
2.13.5

^ permalink raw reply

* Re: [PATCH net 1/3] net: mvpp2: fix the dma_mask and coherent_dma_mask settings for PPv2.2
From: Antoine Tenart @ 2017-09-25 12:40 UTC (permalink / raw)
  To: David Miller
  Cc: antoine.tenart, andrew, gregory.clement, thomas.petazzoni,
	miquel.raynal, nadavh, linux, linux-kernel, mw, stefanc, netdev
In-Reply-To: <20170921.100718.386894052177530033.davem@davemloft.net>

On Thu, Sep 21, 2017 at 10:07:18AM -0700, David Miller wrote:
> From: Antoine Tenart <antoine.tenart@free-electrons.com>
> Date: Thu, 21 Sep 2017 16:24:13 +0200
> 
> > That's also the default when the platform does not allocate dma_mask.
> 
> That's the problem that needs to be fixed then.

OK, I'll drop this patch until I find a proper solution.

Thanks,
Antoine

-- 
Antoine Ténart, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

^ permalink raw reply

* Re: [PATCH net v2] l2tp: fix race condition in l2tp_tunnel_delete
From: Guillaume Nault @ 2017-09-25 12:33 UTC (permalink / raw)
  To: Sabrina Dubroca; +Cc: netdev, Xin Long, Tom Parkin
In-Reply-To: <20170922161624.GA31500@bistromath.localdomain>

On Fri, Sep 22, 2017 at 06:16:24PM +0200, Sabrina Dubroca wrote:
> 2017-09-19, 18:43:37 +0200, Guillaume Nault wrote:
> > On Tue, Sep 19, 2017 at 03:40:40PM +0200, Sabrina Dubroca wrote:
> > > If we try to delete the same tunnel twice, the first delete operation
> > > does a lookup (l2tp_tunnel_get), finds the tunnel, calls
> > > l2tp_tunnel_delete, which queues it for deletion by
> > > l2tp_tunnel_del_work.
> > > 
> > > The second delete operation also finds the tunnel and calls
> > > l2tp_tunnel_delete. If the workqueue has already fired and started
> > > running l2tp_tunnel_del_work, then l2tp_tunnel_delete will queue the
> > > same tunnel a second time, and try to free the socket again.
> > > 
> > > Add a dead flag to prevent firing the workqueue twice. Then we can
> > > remove the check of queue_work's result that was meant to prevent that
> > > race but doesn't.
> > > 
> > > Also check the flag in the tunnel lookup functions, to avoid returning a
> > > tunnel that is already scheduled for destruction.
> > > 
> > > Reproducer:
> > > 
> > >     ip l2tp add tunnel tunnel_id 3000 peer_tunnel_id 4000 local 192.168.0.2 remote 192.168.0.1 encap udp udp_sport 5000 udp_dport 6000
> > >     ip l2tp add session name l2tp1 tunnel_id 3000 session_id 1000 peer_session_id 2000
> > >     ip link set l2tp1 up
> > >     ip l2tp del tunnel tunnel_id 3000
> > >     ip l2tp del tunnel tunnel_id 3000
> > > 
> > > Fixes: f8ccac0e4493 ("l2tp: put tunnel socket release on a workqueue")
> > > Reported-by: Jianlin Shi <jishi@redhat.com>
> > > Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
> > > ---
> > > v2: as Tom Parkin explained, we can't remove the tunnel from the
> > >     per-net list from netlink. v2 uses only a dead flag, and adds
> > >     corresponding checks during lookups
> > > 
> > >  net/l2tp/l2tp_core.c | 18 +++++++++---------
> > >  net/l2tp/l2tp_core.h |  5 ++++-
> > >  2 files changed, 13 insertions(+), 10 deletions(-)
> > > 
> > > diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
> > > index ee485df73ccd..3891f0260f2b 100644
> > > --- a/net/l2tp/l2tp_core.c
> > > +++ b/net/l2tp/l2tp_core.c
> > > @@ -203,7 +203,8 @@ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id)
> > >  
> > >  	rcu_read_lock_bh();
> > >  	list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) {
> > > -		if (tunnel->tunnel_id == tunnel_id) {
> > > +		if (tunnel->tunnel_id == tunnel_id &&
> > > +		    !test_bit(0, &tunnel->dead)) {
> > >  			l2tp_tunnel_inc_refcount(tunnel);
> > >  			rcu_read_unlock_bh();
> > >  
> > > @@ -390,7 +391,8 @@ struct l2tp_tunnel *l2tp_tunnel_find(const struct net *net, u32 tunnel_id)
> > >  
> > >  	rcu_read_lock_bh();
> > >  	list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) {
> > > -		if (tunnel->tunnel_id == tunnel_id) {
> > > +		if (tunnel->tunnel_id == tunnel_id &&
> > > +		    !test_bit(0, &tunnel->dead)) {
> > >  			rcu_read_unlock_bh();
> > >  			return tunnel;
> > >  		}
> > > @@ -409,7 +411,7 @@ struct l2tp_tunnel *l2tp_tunnel_find_nth(const struct net *net, int nth)
> > >  
> > >  	rcu_read_lock_bh();
> > >  	list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) {
> > > -		if (++count > nth) {
> > > +		if (++count > nth && !test_bit(0, &tunnel->dead)) {
> > >  			rcu_read_unlock_bh();
> > >  			return tunnel;
> > >  		}
> > > 
> > I don't get why you're checking the dead flag in l2tp_tunnel_{get,find}*().
> > Since it can be set concurrently right after test_bit(), it doesn't
> > protect the caller from getting a tunnel that is being removed by
> > l2tp_tunnel_delete().
> > Or have I missed something?
> 
> You're right.
> 
> Then I would try going back to essentially v1, but keeping code to
> remove the tunnel from the list in l2tp_tunnel_destruct if it's not
> dead yet.
> 
> What do you think?
> 
My main question was more about why do you feel the need for preventing
other parts of the code from accessing dead tunnels? The TOCTOU issue
was just there to illustrate the fact that it couldn't be implemented
this easily.

My reasoning is that a tunnel may already be in use when
l2tp_tunnel_delete() is called. So any function using tunnels must
already work properly on dying tunnels, because l2tp_tunnel_delete()
might kill them concurrently. Getting a dying tunnel from
l2tp_tunnel_get() or having the tunnel killed by l2tp_tunnel_delete()
while in use should make no difference, as long as the user properly
holds a reference. Of course we have the problem of l2tp_tunnel_find*()
which is racy wrt. tunnel reference counting, but I'm going to continue
converting these users to the safe l2tp_tunnel_get() lookup function.

Of course, making dying tunnels inaccessible makes sense but, unless
I've missed something, it looks more like cleanup/optimisation than bug
fixing.

So what about using your v2 patch, but without the ->dead flag test in
l2tp_tunnel_get() and l2tp_tunnel_find*()?


Now for some more context, I think tunnel creation and deletion will
need to be reworked. Tunnels should be removed from the pernet list by
l2tp_udp_encap_destroy() for L2TP over UDP, and by
l2tp_ip_destroy_sock() or l2tp_ip6_destroy_sock() for L2TP over IP.

Then we could stop hooking on ->sk_destruct(), because the
l2tp_tunnel_closeall() call found in l2tp_tunnel_destruct() is already
useless (if it actually had to remove sessions, it could sleep while in
atomic context, because ->sk_destruct() is now invoked through
call_rcu() for UDP sockets).

And we should break the tight coupling of the l2tp_tunnel structure
with the tunnel socket. This situation, where they dereference one
another without any protection, complicates the deletion process.
Protecting the socket and the tunnel's structure pointers with RCU
would certainly allow for simpler deletion code.

All in all, your last patch makes a lot of sense in this bigger
picture, but for now I'd rather go for simply preventing queueing
l2tp_tunnel_del_work() twice. Unless required for accurately fixing the
current issue, I think removing tunnels in l2tp_tunnel_delete() would fit
better in a different series.

> 
> -------- 8< --------
> 
> diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
> index ee485df73ccd..63cd1f30ac7d 100644
> --- a/net/l2tp/l2tp_core.c
> +++ b/net/l2tp/l2tp_core.c
> @@ -1234,6 +1234,23 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len
>  }
>  EXPORT_SYMBOL_GPL(l2tp_xmit_skb);
>  
> +static bool __l2tp_tunnel_delete(struct l2tp_tunnel *tunnel)
> +{
> +	struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net);
> +	bool ret = false;
> +
> +	spin_lock_bh(&pn->l2tp_tunnel_list_lock);
> +	if (!tunnel->dead) {
> +		tunnel->dead = 1;
> +		list_del_rcu(&tunnel->list);
> +		atomic_dec(&l2tp_tunnel_count);
> +		ret = true;
> +	}
> +	spin_unlock_bh(&pn->l2tp_tunnel_list_lock);
> +
> +	return ret;
> +}
> +
>  /*****************************************************************************
>   * Tinnel and session create/destroy.
>   *****************************************************************************/
> @@ -1245,7 +1262,6 @@ EXPORT_SYMBOL_GPL(l2tp_xmit_skb);
>  static void l2tp_tunnel_destruct(struct sock *sk)
>  {
>  	struct l2tp_tunnel *tunnel = l2tp_tunnel(sk);
> -	struct l2tp_net *pn;
>  
>  	if (tunnel == NULL)
>  		goto end;
> @@ -1270,11 +1286,7 @@ static void l2tp_tunnel_destruct(struct sock *sk)
>  	sk->sk_user_data = NULL;
>  
>  	/* Remove the tunnel struct from the tunnel list */
> -	pn = l2tp_pernet(tunnel->l2tp_net);
> -	spin_lock_bh(&pn->l2tp_tunnel_list_lock);
> -	list_del_rcu(&tunnel->list);
> -	spin_unlock_bh(&pn->l2tp_tunnel_list_lock);
> -	atomic_dec(&l2tp_tunnel_count);
> +	__l2tp_tunnel_delete(tunnel);
>  
>  	l2tp_tunnel_closeall(tunnel);
>  
> @@ -1685,14 +1697,12 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_create);
>  
>  /* This function is used by the netlink TUNNEL_DELETE command.
>   */
> -int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel)
> +void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel)
>  {
> -	l2tp_tunnel_inc_refcount(tunnel);
> -	if (false == queue_work(l2tp_wq, &tunnel->del_work)) {
> -		l2tp_tunnel_dec_refcount(tunnel);
> -		return 1;
> +	if (__l2tp_tunnel_delete(tunnel)) {
> +		l2tp_tunnel_inc_refcount(tunnel);
> +		queue_work(l2tp_wq, &tunnel->del_work);
>  	}
> -	return 0;
>  }
>  EXPORT_SYMBOL_GPL(l2tp_tunnel_delete);
>  
> diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
> index a305e0c5925a..173e68bb8119 100644
> --- a/net/l2tp/l2tp_core.h
> +++ b/net/l2tp/l2tp_core.h
> @@ -160,6 +160,8 @@ struct l2tp_tunnel_cfg {
>  
>  struct l2tp_tunnel {
>  	int			magic;		/* Should be L2TP_TUNNEL_MAGIC */
> +	int			dead;
> +
>  	struct rcu_head rcu;
>  	rwlock_t		hlist_lock;	/* protect session_hlist */
>  	bool			acpt_newsess;	/* Indicates whether this
> @@ -254,7 +256,7 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id,
>  		       u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg,
>  		       struct l2tp_tunnel **tunnelp);
>  void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel);
> -int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel);
> +void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel);
>  struct l2tp_session *l2tp_session_create(int priv_size,
>  					 struct l2tp_tunnel *tunnel,
>  					 u32 session_id, u32 peer_session_id,
> 
> 
> -- 
> Sabrina

^ permalink raw reply

* Re: usb/wireless/rsi_91x: use-after-free write in __run_timers
From: Kalle Valo @ 2017-09-25 12:26 UTC (permalink / raw)
  To: Andrey Konovalov
  Cc: Amitkumar Karwar, Prameela Rani Garnepudi, Karun Eagalapati,
	linux-wireless, netdev, LKML, Dmitry Vyukov, Kostya Serebryany,
	syzkaller
In-Reply-To: <CAAeHK+y61FFKLpePKOhRjd=5QJEWRy9-pank64PuG+aKzafANw@mail.gmail.com>

Andrey Konovalov <andreyknvl@google.com> writes:

> On Mon, Sep 25, 2017 at 6:26 AM, Kalle Valo <kvalo@codeaurora.org> wrote:
>> Andrey Konovalov <andreyknvl@google.com> writes:
>>
>>> I've got the following report while fuzzing the kernel with syzkaller.
>>>
>>> On commit 6e80ecdddf4ea6f3cd84e83720f3d852e6624a68 (Sep 21).
>>>
>>> ==================================================================
>>> BUG: KASAN: use-after-free in __run_timers+0xc0e/0xd40
>>> Write of size 8 at addr ffff880069f701b8 by task swapper/0/0
>>>
>>> CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.14.0-rc1-42311-g6e80ecdddf4e #234
>>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
>>
>> [...]
>>
>>> Allocated by task 1845:
>>>  save_stack_trace+0x1b/0x20 arch/x86/kernel/stacktrace.c:59
>>>  save_stack+0x43/0xd0 mm/kasan/kasan.c:447
>>>  set_track mm/kasan/kasan.c:459
>>>  kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551
>>>  kmem_cache_alloc_trace+0x11e/0x2d0 mm/slub.c:2772
>>>  kmalloc ./include/linux/slab.h:493
>>>  kzalloc ./include/linux/slab.h:666
>>>  rsi_91x_init+0x98/0x510 drivers/net/wireless/rsi/rsi_91x_main.c:203
>>>  rsi_probe+0xb6/0x13b0 drivers/net/wireless/rsi/rsi_91x_usb.c:665
>>>  usb_probe_interface+0x35d/0x8e0 drivers/usb/core/driver.c:361
>>
>> I'm curious about your setup. Apparently you are running syzkaller on
>> QEMU but what I don't understand is how the rsi device comes into the
>> picture. Did you have a rsi usb device connected to the virtual machine
>> or what? Or does syzkaller do some kind of magic here?
>
> I use dummy_hcd and gadgetfs to connect random USB devices to the
> kernel from a userspace application. This happens inside a QEMU
> instance. This simplifies fuzzing, since everything is virtualized,
> but the found bugs can be triggered on a real machine by connecting a
> malicious USB device.

That's very cool, thanks for explaining the setup.

-- 
Kalle Valo

^ permalink raw reply

* Re: [PATCH net-next] net: mvpp2: phylink support
From: Russell King - ARM Linux @ 2017-09-25 12:13 UTC (permalink / raw)
  To: Antoine Tenart
  Cc: davem, andrew, gregory.clement, thomas.petazzoni, miquel.raynal,
	nadavh, linux-kernel, mw, stefanc, netdev
In-Reply-To: <20170925115303.GC19364@kwain>

On Mon, Sep 25, 2017 at 01:53:03PM +0200, Antoine Tenart wrote:
> On Mon, Sep 25, 2017 at 11:45:32AM +0100, Russell King - ARM Linux wrote:
> > Can you describe what the GoP link IRQ is doing please?
> 
> In cases where there is no PHY connected to the MAC and no SFP cage is
> used. One example is when a SOHO switch is connected directly to a
> serdes lane. In such cases we still need to have a minimal link
> management. The GoP link interrupt helps doing so as it raises when the
> serdes is in sync and AN succeeded.

Isn't this just like a fixed link scenario, or an in-band
autonegotiation scenario (both of which phylink supports natively)?

The situation on Clearfog with the 88E6176 switch is pretty similar -
a switch connected directly via serdes to the MAC.  Currently, we
configure stuff there as a fixed link, but in actual fact the 88E6176
is configured to run the CPU facing port in 1000base-X mode, and with
appropriate tweaks, switching phylink to 1000base-X mode also works.

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line in suburbia: sync at 8.8Mbps down 630kbps up
According to speedtest.net: 8.21Mbps down 510kbps up

^ permalink raw reply

* Re: [PATCH net-next] net: mvpp2: phylink support
From: Antoine Tenart @ 2017-09-25 11:53 UTC (permalink / raw)
  To: Russell King - ARM Linux
  Cc: Antoine Tenart, davem, andrew, gregory.clement, thomas.petazzoni,
	miquel.raynal, nadavh, linux-kernel, mw, stefanc, netdev
In-Reply-To: <20170925104532.GN20805@n2100.armlinux.org.uk>

On Mon, Sep 25, 2017 at 11:45:32AM +0100, Russell King - ARM Linux wrote:
> On Mon, Sep 25, 2017 at 11:55:14AM +0200, Antoine Tenart wrote:
> > On Fri, Sep 22, 2017 at 12:07:31PM +0100, Russell King - ARM Linux wrote:
> > > On Thu, Sep 21, 2017 at 03:45:22PM +0200, Antoine Tenart wrote:
> > > > Convert the PPv2 driver to use phylink, which models the MAC to PHY
> > > > link. The phylink support is made such a way the GoP link IRQ can still
> > > > be used: the two modes are incompatible and the GoP link IRQ will be
> > > > used if no PHY is described in the device tree. This is the same
> > > > behaviour as before.
> > > 
> > > This makes no sense.  The point of phylink is to be able to support SFP
> > > cages, and SFP cages do not have a PHY described in DT.  So, when you
> > > want to use phylink because of SFP, you can't, because if you omit
> > > the PHY the driver avoids using phylink.
> > 
> > Yes that's an issue. However we do need to support the GoP link IRQ
> > which is also needed in some cases where there is no PHY (and when
> > phylink cannot be used). What would you propose to differentiate those
> > two cases: no PHY using phylink, and no PHY using the GoP link IRQ?
> 
> Can you describe what the GoP link IRQ is doing please?

It is used in cases where there is no PHY connected to the MAC and no SFP
cage is used. One example is when a SOHO switch is connected directly to a
serdes lane. In such cases we still need to have minimal link
management. The GoP link interrupt helps with that, as it is raised when
the serdes is in sync and AN has succeeded.

I also wonder if this is needed when using passive cables?

Antoine

-- 
Antoine Ténart, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

^ permalink raw reply

* Re: [PATCH RESEND] wireless: iwlwifi: fix minor code style issues
From: Christoph Böhmwalder @ 2017-09-25 11:51 UTC (permalink / raw)
  To: Coelho, Luciano, linux-kernel@vger.kernel.org, trivial@kernel.org,
	Berg, Johannes, kvalo@codeaurora.org, netdev@vger.kernel.org,
	linux-wireless@vger.kernel.org, Grumbach, Emmanuel
In-Reply-To: <1506340012.3276.17.camel@intel.com>


[-- Attachment #1.1: Type: text/plain, Size: 306 bytes --]

> Why are you already resending this?

Sorry, I guess I was too impatient. I also messed up the spelling in a
"To:" line and forgot trivial@kernel.org the first time I sent it, so I
figured I'd just fix it in a resend.

I'll make sure to wait a little longer next time.

--
Regards,
Christoph


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply

* Re: [PATCH RESEND] wireless: iwlwifi: fix minor code style issues
From: Coelho, Luciano @ 2017-09-25 11:47 UTC (permalink / raw)
  To: linux-kernel@vger.kernel.org, trivial@kernel.org,
	christoph@boehmwalder.at, Berg, Johannes, kvalo@codeaurora.org,
	netdev@vger.kernel.org, linux-wireless@vger.kernel.org,
	Grumbach, Emmanuel
In-Reply-To: <912deca9-79b6-7a44-6859-dbe532d90fed@boehmwalder.at>

On Mon, 2017-09-25 at 13:37 +0200, Christoph Böhmwalder wrote:
> Fixes three trivial issues as reported by checkpatch.pl, namely two
> switch/case indentation issues and one alignment issue in a multiline
> comment.
> 
> Signed-off-by: Christoph Böhmwalder <christoph@boehmwalder.at>
> ---

Why are you already resending this? You sent the first email 2 days ago,
you can't expect that a non-critical patch be merged in such a short
time (especially during the weekend).

--
Cheers,
Luca

^ permalink raw reply

* Re: [PATCH 5/5] xfrm: eradicate size_t
From: Steffen Klassert @ 2017-09-25 11:46 UTC (permalink / raw)
  To: Alexey Dobriyan; +Cc: herbert, davem, netdev
In-Reply-To: <20170921204853.GF13550@avx2>

On Thu, Sep 21, 2017 at 11:48:54PM +0300, Alexey Dobriyan wrote:
> All netlink message sizes are a) unsigned, b) can't be >= 4GB in size
> because netlink doesn't support >= 64KB messages in the first place.
> 
> All those size_t across the code are a scam especially across networking
> which likes to work with small numbers like 1500 or 65536.
> 
> Propagate unsignedness and flip some "int" to "unsigned int" as well.
> 
> This is preparation to switching nlmsg_new() to "unsigned int".
> 
> Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>

All applied to ipsec-next, thanks Alexey!

^ permalink raw reply

* [PATCH RESEND] wireless: iwlwifi: fix minor code style issues
From: Christoph Böhmwalder @ 2017-09-25 11:37 UTC (permalink / raw)
  To: johannes.berg, emmanuel.grumbach, luciano.coelho, kvalo,
	linux-wireless, netdev, linux-kernel, trivial
  Cc: Christoph Böhmwalder


[-- Attachment #1.1: Type: text/plain, Size: 1716 bytes --]

Fixes three trivial issues as reported by checkpatch.pl, namely two
switch/case indentation issues and one alignment issue in a multiline
comment.

Signed-off-by: Christoph Böhmwalder <christoph@boehmwalder.at>
---
 drivers/net/wireless/intel/iwlwifi/iwl-drv.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-drv.c
b/drivers/net/wireless/intel/iwlwifi/iwl-drv.c
index 99676d6c4713..ccdb247d68c5 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-drv.c
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-drv.c
@@ -832,7 +832,7 @@ static int iwl_parse_tlv_firmware(struct iwl_drv *drv,
 			capa->standard_phy_calibration_size =
 					le32_to_cpup((__le32 *)tlv_data);
 			break;
-		 case IWL_UCODE_TLV_SEC_RT:
+		case IWL_UCODE_TLV_SEC_RT:
 			iwl_store_ucode_sec(pieces, tlv_data, IWL_UCODE_REGULAR,
 					    tlv_len);
 			drv->fw.type = IWL_FW_MVM;
@@ -864,7 +864,7 @@ static int iwl_parse_tlv_firmware(struct iwl_drv *drv,
 						FW_PHY_CFG_RX_CHAIN) >>
 						FW_PHY_CFG_RX_CHAIN_POS;
 			break;
-		 case IWL_UCODE_TLV_SECURE_SEC_RT:
+		case IWL_UCODE_TLV_SECURE_SEC_RT:
 			iwl_store_ucode_sec(pieces, tlv_data, IWL_UCODE_REGULAR,
 					    tlv_len);
 			drv->fw.type = IWL_FW_MVM;
@@ -1335,7 +1335,8 @@ static void iwl_req_fw_callback(const struct
firmware *ucode_raw, void *context)
  	/* Runtime instructions and 2 copies of data:
 	 * 1) unmodified from disk
-	 * 2) backup cache for save/restore during power-downs */
+	 * 2) backup cache for save/restore during power-downs
+	 */
 	for (i = 0; i < IWL_UCODE_TYPE_MAX; i++)
 		if (iwl_alloc_ucode(drv, pieces, i))
 			goto out_free_fw;
-- 
2.13.5



[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply related

* Re: [PATCH net v2] sctp: Fix a big endian bug in sctp_diag_dump()
From: Neil Horman @ 2017-09-25 11:23 UTC (permalink / raw)
  To: Dan Carpenter
  Cc: Vlad Yasevich, Xin Long, David S. Miller, linux-sctp, netdev,
	kernel-janitors
In-Reply-To: <20170925101926.db4f6x4hblh7tcvo@mwanda>

On Mon, Sep 25, 2017 at 01:19:26PM +0300, Dan Carpenter wrote:
> The sctp_for_each_transport() function takes a pointer to int.  The
> cb->args[] array holds longs so it's only using the high 32 bits.  It
> works on little endian system but will break on big endian 64 bit
> machines.
> 
> Fixes: d25adbeb0cdb ("sctp: fix an use-after-free issue in sctp_sock_dump")
> Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
> ---
> v2: The v1 patch changed the function to take a long pointer, but v2
>     just changes the caller.
> 
> diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c
> index 22ed01a76b19..a72a7d925d46 100644
> --- a/net/sctp/sctp_diag.c
> +++ b/net/sctp/sctp_diag.c
> @@ -463,6 +463,7 @@ static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
>  		.r = r,
>  		.net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN),
>  	};
> +	int pos = cb->args[2];
>  
>  	/* eps hashtable dumps
>  	 * args:
> @@ -493,7 +494,8 @@ static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
>  		goto done;
>  
>  	sctp_for_each_transport(sctp_sock_filter, sctp_sock_dump,
> -				net, (int *)&cb->args[2], &commp);
> +				net, &pos, &commp);
> +	cb->args[2] = pos;
>  
>  done:
>  	cb->args[1] = cb->args[4];
> 
Acked-by: Neil Horman <nhorman@tuxdriver.com>

^ permalink raw reply

* Re: [patch net-next v2 05/12] net: ipmr: Add MFC offload indication
From: Yotam Gigi @ 2017-09-25 11:21 UTC (permalink / raw)
  To: Nikolay Aleksandrov, Jiri Pirko, netdev; +Cc: davem, idosch, mlxsw, andrew
In-Reply-To: <7d9433b4-50c9-3ac0-4eef-c8f847897e8c@cumulusnetworks.com>

On 09/25/2017 12:36 PM, Nikolay Aleksandrov wrote:
> On 24/09/17 20:22, Jiri Pirko wrote:
>> From: Yotam Gigi <yotamg@mellanox.com>
>>
>> Allow drivers registered to the fib notification chain to indicate whether a
>> multicast MFC route is offloaded or not, similarly to unicast routes. The
>> indication of whether a route is offloaded is done using the mfc_flags
>> field on an mfc_cache struct, and the information is sent to the userspace
>> via the RTNetlink interface only.
>>
>> Currently, MFC routes are either offloaded or not, thus there is no need to
>> add per-VIF offload indication.
>>
>> Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
>> Reviewed-by: Ido Schimmel <idosch@mellanox.com>
>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>> ---
>> v1->v2:
>>  - Add comment for the MFC_OFFLOAD flag
>> ---
>>  include/linux/mroute.h | 2 ++
>>  net/ipv4/ipmr.c        | 3 +++
>>  2 files changed, 5 insertions(+)
>>
>> diff --git a/include/linux/mroute.h b/include/linux/mroute.h
>> index 54c5cb8..5566580 100644
>> --- a/include/linux/mroute.h
>> +++ b/include/linux/mroute.h
>> @@ -90,9 +90,11 @@ struct mr_table {
>>  
>>  /* mfc_flags:
>>   * MFC_STATIC - the entry was added statically (not by a routing daemon)
>> + * MFC_OFFLOAD - the entry was offloaded to the hardware
>>   */
>>  enum {
>>  	MFC_STATIC = BIT(0),
>> +	MFC_OFFLOAD = BIT(1),
>>  };
>>  
>>  struct mfc_cache_cmp_arg {
>> diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
>> index ba71bc4..2a795d2 100644
>> --- a/net/ipv4/ipmr.c
>> +++ b/net/ipv4/ipmr.c
>> @@ -2268,6 +2268,9 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
>>  	    nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
>>  		return -EMSGSIZE;
>>  
>> +	if (c->mfc_flags & MFC_OFFLOAD)
>> +		rtm->rtm_flags |= RTNH_F_OFFLOAD;
>> +
>>  	if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH)))
>>  		return -EMSGSIZE;
>>  
>>
> Thanks!

Thank you for reviewing :)

>
> Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
>

^ permalink raw reply

* Re: usb/wireless/rsi_91x: use-after-free write in __run_timers
From: Andrey Konovalov @ 2017-09-25 11:20 UTC (permalink / raw)
  To: Kalle Valo
  Cc: Amitkumar Karwar, Prameela Rani Garnepudi, Karun Eagalapati,
	linux-wireless, netdev, LKML, Dmitry Vyukov, Kostya Serebryany,
	syzkaller
In-Reply-To: <87lgl3769o.fsf@kamboji.qca.qualcomm.com>

On Mon, Sep 25, 2017 at 6:26 AM, Kalle Valo <kvalo@codeaurora.org> wrote:
> Andrey Konovalov <andreyknvl@google.com> writes:
>
>> I've got the following report while fuzzing the kernel with syzkaller.
>>
>> On commit 6e80ecdddf4ea6f3cd84e83720f3d852e6624a68 (Sep 21).
>>
>> ==================================================================
>> BUG: KASAN: use-after-free in __run_timers+0xc0e/0xd40
>> Write of size 8 at addr ffff880069f701b8 by task swapper/0/0
>>
>> CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.14.0-rc1-42311-g6e80ecdddf4e #234
>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
>
> [...]
>
>> Allocated by task 1845:
>>  save_stack_trace+0x1b/0x20 arch/x86/kernel/stacktrace.c:59
>>  save_stack+0x43/0xd0 mm/kasan/kasan.c:447
>>  set_track mm/kasan/kasan.c:459
>>  kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551
>>  kmem_cache_alloc_trace+0x11e/0x2d0 mm/slub.c:2772
>>  kmalloc ./include/linux/slab.h:493
>>  kzalloc ./include/linux/slab.h:666
>>  rsi_91x_init+0x98/0x510 drivers/net/wireless/rsi/rsi_91x_main.c:203
>>  rsi_probe+0xb6/0x13b0 drivers/net/wireless/rsi/rsi_91x_usb.c:665
>>  usb_probe_interface+0x35d/0x8e0 drivers/usb/core/driver.c:361
>
> I'm curious about your setup. Apparently you are running syzkaller on
> QEMU but what I don't understand is how the rsi device comes into the
> picture. Did you have a rsi usb device connected to the virtual machine
> or what? Or does syzkaller do some kind of magic here?

I use dummy_hcd and gadgetfs to connect random USB devices to the
kernel from a userspace application. This happens inside a QEMU
instance. This simplifies fuzzing, since everything is virtualized,
but the found bugs can be triggered on a real machine by connecting a
malicious USB device.

>
> --
> Kalle Valo
>
> --
> You received this message because you are subscribed to the Google Groups "syzkaller" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to syzkaller+unsubscribe@googlegroups.com.
> For more options, visit https://groups.google.com/d/optout.

^ permalink raw reply

* Re: [PATCH net-next 5/6] bpf, nfp: add meta data support
From: Jakub Kicinski @ 2017-09-25 11:12 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: davem, alexei.starovoitov, john.fastabend, peter.waskiewicz.jr,
	netdev
In-Reply-To: <e88b1cbcb0cbfb20a960c57cee3c873e3cadc4cb.1506297988.git.daniel@iogearbox.net>

On Mon, 25 Sep 2017 02:25:54 +0200, Daniel Borkmann wrote:
> Implement support for transferring XDP meta data into skb for
> nfp driver; before calling into the program, xdp.data_meta points
> to xdp.data, where on program return with pass verdict, we call
> into skb_metadata_set().
> 
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> Acked-by: Alexei Starovoitov <ast@kernel.org>
> Acked-by: John Fastabend <john.fastabend@gmail.com>

Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>

Thanks!

^ permalink raw reply

* [PATCH v2 02/16] thunderbolt: Add support for XDomain properties
From: Mika Westerberg @ 2017-09-25 11:07 UTC (permalink / raw)
  To: Greg Kroah-Hartman, David S . Miller
  Cc: Andreas Noever, Michael Jamet, Yehezkel Bernat, Amir Levy,
	Mario.Limonciello, Lukas Wunner, Andy Shevchenko, Andrew Lunn,
	Mika Westerberg, linux-kernel, netdev
In-Reply-To: <20170925110738.68382-1-mika.westerberg@linux.intel.com>

Thunderbolt XDomain discovery protocol uses directories which contain
properties and other directories to exchange information about what
capabilities the remote host supports. This also includes identification
information like device ID and name.

This adds support for parsing and formatting these properties and
establishes an API drivers can use in addition to the core Thunderbolt
driver. This API is exposed in a new header: include/linux/thunderbolt.h.

This code is based on the work done by Amir Levy and Michael Jamet.

Signed-off-by: Michael Jamet <michael.jamet@intel.com>
Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Yehezkel Bernat <yehezkel.bernat@intel.com>
---
 MAINTAINERS                    |   1 +
 drivers/thunderbolt/Makefile   |   2 +-
 drivers/thunderbolt/property.c | 670 +++++++++++++++++++++++++++++++++++++++++
 include/linux/thunderbolt.h    |  89 ++++++
 4 files changed, 761 insertions(+), 1 deletion(-)
 create mode 100644 drivers/thunderbolt/property.c
 create mode 100644 include/linux/thunderbolt.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 6671f375f7fc..c1c90d962012 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13278,6 +13278,7 @@ M:	Mika Westerberg <mika.westerberg@linux.intel.com>
 M:	Yehezkel Bernat <yehezkel.bernat@intel.com>
 S:	Maintained
 F:	drivers/thunderbolt/
+F:	include/linux/thunderbolt.h
 
 THUNDERX GPIO DRIVER
 M:	David Daney <david.daney@cavium.com>
diff --git a/drivers/thunderbolt/Makefile b/drivers/thunderbolt/Makefile
index 4900febc6c8a..7afd21f5383a 100644
--- a/drivers/thunderbolt/Makefile
+++ b/drivers/thunderbolt/Makefile
@@ -1,3 +1,3 @@
 obj-${CONFIG_THUNDERBOLT} := thunderbolt.o
 thunderbolt-objs := nhi.o ctl.o tb.o switch.o cap.o path.o tunnel_pci.o eeprom.o
-thunderbolt-objs += domain.o dma_port.o icm.o
+thunderbolt-objs += domain.o dma_port.o icm.o property.o
diff --git a/drivers/thunderbolt/property.c b/drivers/thunderbolt/property.c
new file mode 100644
index 000000000000..55a8aa32b1d6
--- /dev/null
+++ b/drivers/thunderbolt/property.c
@@ -0,0 +1,670 @@
+/*
+ * Thunderbolt XDomain property support
+ *
+ * Copyright (C) 2017, Intel Corporation
+ * Authors: Michael Jamet <michael.jamet@intel.com>
+ *          Mika Westerberg <mika.westerberg@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/uuid.h>
+#include <linux/thunderbolt.h>
+
+struct tb_property_entry {
+	u32 key_hi;
+	u32 key_lo;
+	u16 length;
+	u8 reserved;
+	u8 type;
+	u32 value;
+} __packed;
+
+struct tb_property_rootdir_entry {
+	u32 magic;
+	u32 length;
+	struct tb_property_entry entries[];
+} __packed;
+
+struct tb_property_dir_entry {
+	u32 uuid[4];
+	struct tb_property_entry entries[];
+} __packed;
+
+#define TB_PROPERTY_ROOTDIR_MAGIC	0x55584401
+
+static struct tb_property_dir *__tb_property_parse_dir(const u32 *block,
+	size_t block_len, unsigned int dir_offset, size_t dir_len,
+	bool is_root);
+
+static inline void parse_dwdata(void *dst, const void *src, size_t dwords)
+{
+	be32_to_cpu_array(dst, src, dwords);
+}
+
+static inline void format_dwdata(void *dst, const void *src, size_t dwords)
+{
+	cpu_to_be32_array(dst, src, dwords);
+}
+
+static bool tb_property_entry_valid(const struct tb_property_entry *entry,
+				  size_t block_len)
+{
+	switch (entry->type) {
+	case TB_PROPERTY_TYPE_DIRECTORY:
+	case TB_PROPERTY_TYPE_DATA:
+	case TB_PROPERTY_TYPE_TEXT:
+		if (entry->length > block_len)
+			return false;
+		if (entry->value + entry->length > block_len)
+			return false;
+		break;
+
+	case TB_PROPERTY_TYPE_VALUE:
+		if (entry->length != 1)
+			return false;
+		break;
+	}
+
+	return true;
+}
+
+static bool tb_property_key_valid(const char *key)
+{
+	return key && strlen(key) <= TB_PROPERTY_KEY_SIZE;
+}
+
+static struct tb_property *
+tb_property_alloc(const char *key, enum tb_property_type type)
+{
+	struct tb_property *property;
+
+	property = kzalloc(sizeof(*property), GFP_KERNEL);
+	if (!property)
+		return NULL;
+
+	strcpy(property->key, key);
+	property->type = type;
+	INIT_LIST_HEAD(&property->list);
+
+	return property;
+}
+
+static struct tb_property *tb_property_parse(const u32 *block, size_t block_len,
+					const struct tb_property_entry *entry)
+{
+	char key[TB_PROPERTY_KEY_SIZE + 1];
+	struct tb_property *property;
+	struct tb_property_dir *dir;
+
+	if (!tb_property_entry_valid(entry, block_len))
+		return NULL;
+
+	parse_dwdata(key, entry, 2);
+	key[TB_PROPERTY_KEY_SIZE] = '\0';
+
+	property = tb_property_alloc(key, entry->type);
+	if (!property)
+		return NULL;
+
+	property->length = entry->length;
+
+	switch (property->type) {
+	case TB_PROPERTY_TYPE_DIRECTORY:
+		dir = __tb_property_parse_dir(block, block_len, entry->value,
+					      entry->length, false);
+		if (!dir) {
+			kfree(property);
+			return NULL;
+		}
+		property->value.dir = dir;
+		break;
+
+	case TB_PROPERTY_TYPE_DATA:
+		property->value.data = kcalloc(property->length, sizeof(u32),
+					       GFP_KERNEL);
+		if (!property->value.data) {
+			kfree(property);
+			return NULL;
+		}
+		parse_dwdata(property->value.data, block + entry->value,
+			     entry->length);
+		break;
+
+	case TB_PROPERTY_TYPE_TEXT:
+		property->value.text = kcalloc(property->length, sizeof(u32),
+					       GFP_KERNEL);
+		if (!property->value.text) {
+			kfree(property);
+			return NULL;
+		}
+		parse_dwdata(property->value.text, block + entry->value,
+			     entry->length);
+		/* Force null termination */
+		property->value.text[property->length * 4 - 1] = '\0';
+		break;
+
+	case TB_PROPERTY_TYPE_VALUE:
+		property->value.immediate = entry->value;
+		break;
+
+	default:
+		property->type = TB_PROPERTY_TYPE_UNKNOWN;
+		break;
+	}
+
+	return property;
+}
+
+static struct tb_property_dir *__tb_property_parse_dir(const u32 *block,
+	size_t block_len, unsigned int dir_offset, size_t dir_len, bool is_root)
+{
+	const struct tb_property_entry *entries;
+	size_t i, content_len, nentries;
+	unsigned int content_offset;
+	struct tb_property_dir *dir;
+
+	dir = kzalloc(sizeof(*dir), GFP_KERNEL);
+	if (!dir)
+		return NULL;
+
+	if (is_root) {
+		content_offset = dir_offset + 2;
+		content_len = dir_len;
+	} else {
+		dir->uuid = kmemdup(&block[dir_offset], sizeof(*dir->uuid),
+				    GFP_KERNEL);
+		content_offset = dir_offset + 4;
+		content_len = dir_len - 4; /* Length includes UUID */
+	}
+
+	entries = (const struct tb_property_entry *)&block[content_offset];
+	nentries = content_len / (sizeof(*entries) / 4);
+
+	INIT_LIST_HEAD(&dir->properties);
+
+	for (i = 0; i < nentries; i++) {
+		struct tb_property *property;
+
+		property = tb_property_parse(block, block_len, &entries[i]);
+		if (!property) {
+			tb_property_free_dir(dir);
+			return NULL;
+		}
+
+		list_add_tail(&property->list, &dir->properties);
+	}
+
+	return dir;
+}
+
+/**
+ * tb_property_parse_dir() - Parses properties from given property block
+ * @block: Property block to parse
+ * @block_len: Number of dword elements in the property block
+ *
+ * This function parses the XDomain properties data block into a format that
+ * can be traversed using the helper functions provided by this module.
+ * Upon success returns the parsed directory. In case of error returns
+ * %NULL. The resulting &struct tb_property_dir needs to be released by
+ * calling tb_property_free_dir() when not needed anymore.
+ *
+ * The @block is expected to be root directory.
+ */
+struct tb_property_dir *tb_property_parse_dir(const u32 *block,
+					      size_t block_len)
+{
+	const struct tb_property_rootdir_entry *rootdir =
+		(const struct tb_property_rootdir_entry *)block;
+
+	if (rootdir->magic != TB_PROPERTY_ROOTDIR_MAGIC)
+		return NULL;
+	if (rootdir->length > block_len)
+		return NULL;
+
+	return __tb_property_parse_dir(block, block_len, 0, rootdir->length,
+				       true);
+}
+
+/**
+ * tb_property_create_dir() - Creates new property directory
+ * @uuid: UUID used to identify the particular directory
+ *
+ * Creates new, empty property directory. If @uuid is %NULL then the
+ * directory is assumed to be root directory.
+ */
+struct tb_property_dir *tb_property_create_dir(const uuid_t *uuid)
+{
+	struct tb_property_dir *dir;
+
+	dir = kzalloc(sizeof(*dir), GFP_KERNEL);
+	if (!dir)
+		return NULL;
+
+	INIT_LIST_HEAD(&dir->properties);
+	if (uuid) {
+		dir->uuid = kmemdup(uuid, sizeof(*dir->uuid), GFP_KERNEL);
+		if (!dir->uuid) {
+			kfree(dir);
+			return NULL;
+		}
+	}
+
+	return dir;
+}
+EXPORT_SYMBOL_GPL(tb_property_create_dir);
+
+static void tb_property_free(struct tb_property *property)
+{
+	switch (property->type) {
+	case TB_PROPERTY_TYPE_DIRECTORY:
+		tb_property_free_dir(property->value.dir);
+		break;
+
+	case TB_PROPERTY_TYPE_DATA:
+		kfree(property->value.data);
+		break;
+
+	case TB_PROPERTY_TYPE_TEXT:
+		kfree(property->value.text);
+		break;
+
+	default:
+		break;
+	}
+
+	kfree(property);
+}
+
+/**
+ * tb_property_free_dir() - Release memory allocated for property directory
+ * @dir: Directory to release
+ *
+ * This will release all the memory the directory occupies including all
+ * descendants. It is OK to pass %NULL @dir, then the function does
+ * nothing.
+ */
+void tb_property_free_dir(struct tb_property_dir *dir)
+{
+	struct tb_property *property, *tmp;
+
+	if (!dir)
+		return;
+
+	list_for_each_entry_safe(property, tmp, &dir->properties, list) {
+		list_del(&property->list);
+		tb_property_free(property);
+	}
+	kfree(dir->uuid);
+	kfree(dir);
+}
+EXPORT_SYMBOL_GPL(tb_property_free_dir);
+
+static size_t tb_property_dir_length(const struct tb_property_dir *dir,
+				     bool recurse, size_t *data_len)
+{
+	const struct tb_property *property;
+	size_t len = 0;
+
+	if (dir->uuid)
+		len += sizeof(*dir->uuid) / 4;
+	else
+		len += sizeof(struct tb_property_rootdir_entry) / 4;
+
+	list_for_each_entry(property, &dir->properties, list) {
+		len += sizeof(struct tb_property_entry) / 4;
+
+		switch (property->type) {
+		case TB_PROPERTY_TYPE_DIRECTORY:
+			if (recurse) {
+				len += tb_property_dir_length(
+					property->value.dir, recurse, data_len);
+			}
+			/* Reserve dword padding after each directory */
+			if (data_len)
+				*data_len += 1;
+			break;
+
+		case TB_PROPERTY_TYPE_DATA:
+		case TB_PROPERTY_TYPE_TEXT:
+			if (data_len)
+				*data_len += property->length;
+			break;
+
+		default:
+			break;
+		}
+	}
+
+	return len;
+}
+
+static ssize_t __tb_property_format_dir(const struct tb_property_dir *dir,
+	u32 *block, unsigned int start_offset, size_t block_len)
+{
+	unsigned int data_offset, dir_end;
+	const struct tb_property *property;
+	struct tb_property_entry *entry;
+	size_t dir_len, data_len = 0;
+	int ret;
+
+	/*
+	 * The structure of property block looks like following. Leaf
+	 * data/text is included right after the directory and each
+	 * directory follows each other (even nested ones).
+	 *
+	 * +----------+ <-- start_offset
+	 * |  header  | <-- root directory header
+	 * +----------+ ---
+	 * |  entry 0 | -^--------------------.
+	 * +----------+  |                    |
+	 * |  entry 1 | -|--------------------|--.
+	 * +----------+  |                    |  |
+	 * |  entry 2 | -|-----------------.  |  |
+	 * +----------+  |                 |  |  |
+	 * :          :  |  dir_len        |  |  |
+	 * .          .  |                 |  |  |
+	 * :          :  |                 |  |  |
+	 * +----------+  |                 |  |  |
+	 * |  entry n |  v                 |  |  |
+	 * +----------+ <-- data_offset    |  |  |
+	 * |  data 0  | <------------------|--'  |
+	 * +----------+                    |     |
+	 * |  data 1  | <------------------|-----'
+	 * +----------+                    |
+	 * | 00000000 | padding            |
+	 * +----------+ <-- dir_end <------'
+	 * |   UUID   | <-- directory UUID (child directory)
+	 * +----------+
+	 * |  entry 0 |
+	 * +----------+
+	 * |  entry 1 |
+	 * +----------+
+	 * :          :
+	 * .          .
+	 * :          :
+	 * +----------+
+	 * |  entry n |
+	 * +----------+
+	 * |  data 0  |
+	 * +----------+
+	 *
+	 * We use dir_end to hold pointer to the end of the directory. It
+	 * will increase as we add directories and each directory should be
+	 * added starting from previous dir_end.
+	 */
+	dir_len = tb_property_dir_length(dir, false, &data_len);
+	data_offset = start_offset + dir_len;
+	dir_end = start_offset + data_len + dir_len;
+
+	if (data_offset > dir_end)
+		return -EINVAL;
+	if (dir_end > block_len)
+		return -EINVAL;
+
+	/* Write headers first */
+	if (dir->uuid) {
+		struct tb_property_dir_entry *pe;
+
+		pe = (struct tb_property_dir_entry *)&block[start_offset];
+		memcpy(pe->uuid, dir->uuid, sizeof(pe->uuid));
+		entry = pe->entries;
+	} else {
+		struct tb_property_rootdir_entry *re;
+
+		re = (struct tb_property_rootdir_entry *)&block[start_offset];
+		re->magic = TB_PROPERTY_ROOTDIR_MAGIC;
+		re->length = dir_len - sizeof(*re) / 4;
+		entry = re->entries;
+	}
+
+	list_for_each_entry(property, &dir->properties, list) {
+		const struct tb_property_dir *child;
+
+		format_dwdata(entry, property->key, 2);
+		entry->type = property->type;
+
+		switch (property->type) {
+		case TB_PROPERTY_TYPE_DIRECTORY:
+			child = property->value.dir;
+			ret = __tb_property_format_dir(child, block, dir_end,
+						       block_len);
+			if (ret < 0)
+				return ret;
+			entry->length = tb_property_dir_length(child, false,
+							       NULL);
+			entry->value = dir_end;
+			dir_end = ret;
+			break;
+
+		case TB_PROPERTY_TYPE_DATA:
+			format_dwdata(&block[data_offset], property->value.data,
+				      property->length);
+			entry->length = property->length;
+			entry->value = data_offset;
+			data_offset += entry->length;
+			break;
+
+		case TB_PROPERTY_TYPE_TEXT:
+			format_dwdata(&block[data_offset], property->value.text,
+				      property->length);
+			entry->length = property->length;
+			entry->value = data_offset;
+			data_offset += entry->length;
+			break;
+
+		case TB_PROPERTY_TYPE_VALUE:
+			entry->length = property->length;
+			entry->value = property->value.immediate;
+			break;
+
+		default:
+			break;
+		}
+
+		entry++;
+	}
+
+	return dir_end;
+}
+
+/**
+ * tb_property_format_dir() - Formats directory to the packed XDomain format
+ * @dir: Directory to format
+ * @block: Property block where the packed data is placed
+ * @block_len: Length of the property block
+ *
+ * This function formats the directory to the packed format that can then
+ * be sent over the thunderbolt fabric to the receiving host. Returns %0 in
+ * case of success and negative errno on failure. Passing %NULL in @block
+ * returns the number of dwords the block takes.
+ */
+ssize_t tb_property_format_dir(const struct tb_property_dir *dir, u32 *block,
+			       size_t block_len)
+{
+	ssize_t ret;
+
+	if (!block) {
+		size_t dir_len, data_len = 0;
+
+		dir_len = tb_property_dir_length(dir, true, &data_len);
+		return dir_len + data_len;
+	}
+
+	ret = __tb_property_format_dir(dir, block, 0, block_len);
+	return ret < 0 ? ret : 0;
+}
+
+/**
+ * tb_property_add_immediate() - Add immediate property to directory
+ * @parent: Directory to add the property
+ * @key: Key for the property
+ * @value: Immediate value to store with the property
+ */
+int tb_property_add_immediate(struct tb_property_dir *parent, const char *key,
+			      u32 value)
+{
+	struct tb_property *property;
+
+	if (!tb_property_key_valid(key))
+		return -EINVAL;
+
+	property = tb_property_alloc(key, TB_PROPERTY_TYPE_VALUE);
+	if (!property)
+		return -ENOMEM;
+
+	property->length = 1;
+	property->value.immediate = value;
+
+	list_add_tail(&property->list, &parent->properties);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tb_property_add_immediate);
+
+/**
+ * tb_property_add_data() - Adds arbitrary data property to directory
+ * @parent: Directory to add the property
+ * @key: Key for the property
+ * @buf: Data buffer to add
+ * @buflen: Number of bytes in the data buffer
+ *
+ * Function takes a copy of @buf and adds it to the directory.
+ */
+int tb_property_add_data(struct tb_property_dir *parent, const char *key,
+			 const void *buf, size_t buflen)
+{
+	/* Need to pad to dword boundary */
+	size_t size = round_up(buflen, 4);
+	struct tb_property *property;
+
+	if (!tb_property_key_valid(key))
+		return -EINVAL;
+
+	property = tb_property_alloc(key, TB_PROPERTY_TYPE_DATA);
+	if (!property)
+		return -ENOMEM;
+
+	property->length = size / 4;
+	property->value.data = kzalloc(size, GFP_KERNEL);
+	memcpy(property->value.data, buf, buflen);
+
+	list_add_tail(&property->list, &parent->properties);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tb_property_add_data);
+
+/**
+ * tb_property_add_text() - Adds string property to directory
+ * @parent: Directory to add the property
+ * @key: Key for the property
+ * @text: String to add
+ *
+ * Function takes a copy of @text and adds it to the directory.
+ */
+int tb_property_add_text(struct tb_property_dir *parent, const char *key,
+			 const char *text)
+{
+	/* Need to pad to dword boundary */
+	size_t size = round_up(strlen(text) + 1, 4);
+	struct tb_property *property;
+
+	if (!tb_property_key_valid(key))
+		return -EINVAL;
+
+	property = tb_property_alloc(key, TB_PROPERTY_TYPE_TEXT);
+	if (!property)
+		return -ENOMEM;
+
+	property->length = size / 4;
+	property->value.data = kzalloc(size, GFP_KERNEL);
+	strcpy(property->value.text, text);
+
+	list_add_tail(&property->list, &parent->properties);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tb_property_add_text);
+
+/**
+ * tb_property_add_dir() - Adds a directory to the parent directory
+ * @parent: Directory to add the property
+ * @key: Key for the property
+ * @dir: Directory to add
+ */
+int tb_property_add_dir(struct tb_property_dir *parent, const char *key,
+			struct tb_property_dir *dir)
+{
+	struct tb_property *property;
+
+	if (!tb_property_key_valid(key))
+		return -EINVAL;
+
+	property = tb_property_alloc(key, TB_PROPERTY_TYPE_DIRECTORY);
+	if (!property)
+		return -ENOMEM;
+
+	property->value.dir = dir;
+
+	list_add_tail(&property->list, &parent->properties);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tb_property_add_dir);
+
+/**
+ * tb_property_remove() - Removes property from a parent directory
+ * @property: Property to remove
+ *
+ * Note memory for @property is released as well so it is not allowed to
+ * touch the object after call to this function.
+ */
+void tb_property_remove(struct tb_property *property)
+{
+	list_del(&property->list);
+	kfree(property);
+}
+EXPORT_SYMBOL_GPL(tb_property_remove);
+
+/**
+ * tb_property_find() - Find a property from a directory
+ * @dir: Directory where the property is searched
+ * @key: Key to look for
+ * @type: Type of the property
+ *
+ * Finds and returns property from the given directory. Does not recurse
+ * into sub-directories. Returns %NULL if the property was not found.
+ */
+struct tb_property *tb_property_find(struct tb_property_dir *dir,
+	const char *key, enum tb_property_type type)
+{
+	struct tb_property *property;
+
+	list_for_each_entry(property, &dir->properties, list) {
+		if (property->type == type && !strcmp(property->key, key))
+			return property;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(tb_property_find);
+
+/**
+ * tb_property_get_next() - Get next property from directory
+ * @dir: Directory holding properties
+ * @prev: Previous property in the directory (%NULL returns the first)
+ */
+struct tb_property *tb_property_get_next(struct tb_property_dir *dir,
+					 struct tb_property *prev)
+{
+	if (prev) {
+		if (list_is_last(&prev->list, &dir->properties))
+			return NULL;
+		return list_next_entry(prev, list);
+	}
+	return list_first_entry_or_null(&dir->properties, struct tb_property,
+					list);
+}
+EXPORT_SYMBOL_GPL(tb_property_get_next);
diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
new file mode 100644
index 000000000000..96561c1265ae
--- /dev/null
+++ b/include/linux/thunderbolt.h
@@ -0,0 +1,89 @@
+/*
+ * Thunderbolt service API
+ *
+ * Copyright (C) 2017, Intel Corporation
+ * Authors: Michael Jamet <michael.jamet@intel.com>
+ *          Mika Westerberg <mika.westerberg@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef THUNDERBOLT_H_
+#define THUNDERBOLT_H_
+
+#include <linux/list.h>
+#include <linux/uuid.h>
+
+/**
+ * struct tb_property_dir - XDomain property directory
+ * @uuid: Directory UUID or %NULL if root directory
+ * @properties: List of properties in this directory
+ *
+ * User needs to provide serialization if needed.
+ */
+struct tb_property_dir {
+	const uuid_t *uuid;
+	struct list_head properties;
+};
+
+enum tb_property_type {
+	TB_PROPERTY_TYPE_UNKNOWN = 0x00,
+	TB_PROPERTY_TYPE_DIRECTORY = 0x44,
+	TB_PROPERTY_TYPE_DATA = 0x64,
+	TB_PROPERTY_TYPE_TEXT = 0x74,
+	TB_PROPERTY_TYPE_VALUE = 0x76,
+};
+
+#define TB_PROPERTY_KEY_SIZE	8
+
+/**
+ * struct tb_property - XDomain property
+ * @list: Used to link properties together in a directory
+ * @key: Key for the property (always terminated).
+ * @type: Type of the property
+ * @length: Length of the property data in dwords
+ * @value: Property value
+ *
+ * Users use @type to determine which field in @value is filled.
+ */
+struct tb_property {
+	struct list_head list;
+	char key[TB_PROPERTY_KEY_SIZE + 1];
+	enum tb_property_type type;
+	size_t length;
+	union {
+		struct tb_property_dir *dir;
+		u8 *data;
+		char *text;
+		u32 immediate;
+	} value;
+};
+
+struct tb_property_dir *tb_property_parse_dir(const u32 *block,
+					      size_t block_len);
+ssize_t tb_property_format_dir(const struct tb_property_dir *dir, u32 *block,
+			       size_t block_len);
+struct tb_property_dir *tb_property_create_dir(const uuid_t *uuid);
+void tb_property_free_dir(struct tb_property_dir *dir);
+int tb_property_add_immediate(struct tb_property_dir *parent, const char *key,
+			      u32 value);
+int tb_property_add_data(struct tb_property_dir *parent, const char *key,
+			 const void *buf, size_t buflen);
+int tb_property_add_text(struct tb_property_dir *parent, const char *key,
+			 const char *text);
+int tb_property_add_dir(struct tb_property_dir *parent, const char *key,
+			struct tb_property_dir *dir);
+void tb_property_remove(struct tb_property *tb_property);
+struct tb_property *tb_property_find(struct tb_property_dir *dir,
+			const char *key, enum tb_property_type type);
+struct tb_property *tb_property_get_next(struct tb_property_dir *dir,
+					 struct tb_property *prev);
+
+#define tb_property_for_each(dir, property)			\
+	for (property = tb_property_get_next(dir, NULL);	\
+	     property;						\
+	     property = tb_property_get_next(dir, property))
+
+#endif /* THUNDERBOLT_H_ */
-- 
2.14.1

^ permalink raw reply related

* [PATCH v2 16/16] net: Add support for networking over Thunderbolt cable
From: Mika Westerberg @ 2017-09-25 11:07 UTC (permalink / raw)
  To: Greg Kroah-Hartman, David S . Miller
  Cc: Andreas Noever, Michael Jamet, Yehezkel Bernat, Amir Levy,
	Mario.Limonciello, Lukas Wunner, Andy Shevchenko, Andrew Lunn,
	Mika Westerberg, linux-kernel, netdev
In-Reply-To: <20170925110738.68382-1-mika.westerberg@linux.intel.com>

From: Amir Levy <amir.jer.levy@intel.com>

ThunderboltIP is a protocol created by Apple to tunnel IP/ethernet
traffic over a Thunderbolt cable. The protocol consists of a configuration
phase where each side sends ThunderboltIP login packets (the protocol is
determined by the UUID in the XDomain packet header) over the configuration
channel. Once both sides get a positive acknowledgment to their login
packet, they configure the high-speed DMA path accordingly. This DMA path is
then used to transmit and receive networking traffic.

This patch creates a virtual ethernet interface the host software can
use in the same way as any other networking interface. Once the
interface is brought up successfully network packets get tunneled over
the Thunderbolt cable to the remote host and back.

The connection is terminated by sending a ThunderboltIP logout packet
over the configuration channel. We do this when the network interface is
brought down by the user or the driver is unloaded.

Signed-off-by: Amir Levy <amir.jer.levy@intel.com>
Signed-off-by: Michael Jamet <michael.jamet@intel.com>
Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Yehezkel Bernat <yehezkel.bernat@intel.com>
---
 Documentation/admin-guide/thunderbolt.rst |   24 +
 MAINTAINERS                               |    6 +
 drivers/net/Kconfig                       |   12 +
 drivers/net/Makefile                      |    3 +
 drivers/net/thunderbolt.c                 | 1379 +++++++++++++++++++++++++++++
 5 files changed, 1424 insertions(+)
 create mode 100644 drivers/net/thunderbolt.c

diff --git a/Documentation/admin-guide/thunderbolt.rst b/Documentation/admin-guide/thunderbolt.rst
index 6a4cd1f159ca..5c62d11d77e8 100644
--- a/Documentation/admin-guide/thunderbolt.rst
+++ b/Documentation/admin-guide/thunderbolt.rst
@@ -197,3 +197,27 @@ information is missing.
 
 To recover from this mode, one needs to flash a valid NVM image to the
 host host controller in the same way it is done in the previous chapter.
+
+Networking over Thunderbolt cable
+---------------------------------
+Thunderbolt technology allows software communication across two hosts
+connected by a Thunderbolt cable.
+
+It is possible to tunnel any kind of traffic over Thunderbolt link but
+currently we only support Apple ThunderboltIP protocol.
+
+If the other host is running Windows or macOS, the only thing you need to
+do is connect a Thunderbolt cable between the two hosts; the
+``thunderbolt-net`` driver is loaded automatically. If the other host is
+also running Linux you should load ``thunderbolt-net`` manually on one host
+(it does not matter which one)::
+
+  # modprobe thunderbolt-net
+
+This triggers module load on the other host automatically. If the driver
+is built-in to the kernel image, there is no need to do anything.
+
+The driver will create one virtual ethernet interface per Thunderbolt
+port; these are named ``thunderbolt0`` and so on. From this point
+you can either use standard userspace tools like ``ifconfig`` to
+configure the interface or let your GUI handle it automatically.
diff --git a/MAINTAINERS b/MAINTAINERS
index c1c90d962012..0dfbb3b2fbf0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13280,6 +13280,12 @@ S:	Maintained
 F:	drivers/thunderbolt/
 F:	include/linux/thunderbolt.h
 
+THUNDERBOLT NETWORK DRIVER
+M:	Mika Westerberg <mika.westerberg@linux.intel.com>
+L:	netdev@vger.kernel.org
+S:	Maintained
+F:	drivers/net/thunderbolt.c
+
 THUNDERX GPIO DRIVER
 M:	David Daney <david.daney@cavium.com>
 S:	Maintained
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index aba0d652095b..0936da592e12 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -483,6 +483,18 @@ config FUJITSU_ES
 	  This driver provides support for Extended Socket network device
           on Extended Partitioning of FUJITSU PRIMEQUEST 2000 E2 series.
 
+config THUNDERBOLT_NET
+	tristate "Networking over Thunderbolt cable"
+	depends on THUNDERBOLT && INET
+	help
+	  Select this if you want to create network between two
+	  computers over a Thunderbolt cable. The driver supports Apple
+	  ThunderboltIP protocol and allows communication with any host
+	  supporting the same protocol including Windows and macOS.
+
+	  To compile this driver as a module, choose M here. The module will be
+	  called thunderbolt-net.
+
 source "drivers/net/hyperv/Kconfig"
 
 endif # NETDEVICES
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 8dff900085d6..7c8f4dd3a7c5 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -74,3 +74,6 @@ obj-$(CONFIG_HYPERV_NET) += hyperv/
 obj-$(CONFIG_NTB_NETDEV) += ntb_netdev.o
 
 obj-$(CONFIG_FUJITSU_ES) += fjes/
+
+thunderbolt-net-y += thunderbolt.o
+obj-$(CONFIG_THUNDERBOLT_NET) += thunderbolt-net.o
diff --git a/drivers/net/thunderbolt.c b/drivers/net/thunderbolt.c
new file mode 100644
index 000000000000..0128fe7e665e
--- /dev/null
+++ b/drivers/net/thunderbolt.c
@@ -0,0 +1,1379 @@
+/*
+ * Networking over Thunderbolt cable using Apple ThunderboltIP protocol
+ *
+ * Copyright (C) 2017, Intel Corporation
+ * Authors: Amir Levy <amir.jer.levy@intel.com>
+ *          Michael Jamet <michael.jamet@intel.com>
+ *          Mika Westerberg <mika.westerberg@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/atomic.h>
+#include <linux/highmem.h>
+#include <linux/if_vlan.h>
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/sizes.h>
+#include <linux/thunderbolt.h>
+#include <linux/uuid.h>
+#include <linux/workqueue.h>
+#include <net/ip6_checksum.h>
+
+/* Protocol timeouts in ms */
+#define TBNET_LOGIN_DELAY	4500
+#define TBNET_LOGIN_TIMEOUT	500
+#define TBNET_LOGOUT_TIMEOUT	100
+
+#define TBNET_RING_SIZE		256
+#define TBNET_LOCAL_PATH	0xf
+#define TBNET_RX_HDR_SIZE	256
+#define TBNET_LOGIN_RETRIES	60
+#define TBNET_LOGOUT_RETRIES	5
+#define TBNET_MATCH_FRAGS_ID	BIT(1)
+#define TBNET_MAX_MTU		SZ_64K
+#define TBNET_FRAME_SIZE	SZ_4K
+#define TBNET_MAX_PAYLOAD_SIZE		\
+	(TBNET_FRAME_SIZE - sizeof(struct thunderbolt_ip_frame_header))
+
+#define TBNET_L0_PORT_NUM(route) ((route) & GENMASK(5, 0))
+
+/**
+ * struct thunderbolt_ip_frame_header - Header for each Thunderbolt frame
+ * @frame_size: size of the data in the frame
+ * @frame_index: running index on the frames
+ * @frame_id: ID of the frame to match frames to a specific packet
+ * @frame_count: how many frames make up a full packet
+ *
+ * Each data frame passed to the high-speed DMA ring has this header. If
+ * the XDomain network directory announces that %TBNET_MATCH_FRAGS_ID is
+ * supported then @frame_id is filled, otherwise it stays %0.
+ */
+struct thunderbolt_ip_frame_header {
+	u32 frame_size;
+	u16 frame_index;
+	u16 frame_id;
+	u32 frame_count;
+} __packed;
+
+enum thunderbolt_ip_frame_pdf {
+	TBIP_PDF_FRAME_START = 1,
+	TBIP_PDF_FRAME_END,
+};
+
+enum thunderbolt_ip_type {
+	TBIP_LOGIN,
+	TBIP_LOGIN_RESPONSE,
+	TBIP_LOGOUT,
+	TBIP_STATUS,
+};
+
+struct thunderbolt_ip_header {
+	u32 route_hi;
+	u32 route_lo;
+	u32 length_sn;
+	uuid_t uuid;
+	uuid_t initiator_uuid;
+	uuid_t target_uuid;
+	u32 type;
+	u32 command_id;
+} __packed;
+
+#define TBIP_HDR_LENGTH_MASK		GENMASK(5, 0)
+#define TBIP_HDR_SN_MASK		GENMASK(28, 27)
+#define TBIP_HDR_SN_SHIFT		27
+
+struct thunderbolt_ip_login {
+	struct thunderbolt_ip_header hdr;
+	u32 proto_version;
+	u32 transmit_path;
+	u32 reserved[4];
+} __packed;
+
+#define TBIP_LOGIN_PROTO_VERSION	1
+
+struct thunderbolt_ip_login_response {
+	struct thunderbolt_ip_header hdr;
+	u32 status;
+	u32 receiver_mac[2];
+	u32 receiver_mac_len;
+	u32 reserved[4];
+} __packed;
+
+struct thunderbolt_ip_logout {
+	struct thunderbolt_ip_header hdr;
+} __packed;
+
+struct thunderbolt_ip_status {
+	struct thunderbolt_ip_header hdr;
+	u32 status;
+} __packed;
+
+struct tbnet_stats {
+	u64 tx_packets;
+	u64 rx_packets;
+	u64 tx_bytes;
+	u64 rx_bytes;
+	u64 tx_errors;
+	u64 rx_length_errors;
+	u64 rx_over_errors;
+	u64 rx_crc_errors;
+	u64 rx_missed_errors;
+};
+
+struct tbnet_frame {
+	struct net_device *dev;
+	struct page *page;
+	struct ring_frame frame;
+};
+
+struct tbnet_ring {
+	struct tbnet_frame frames[TBNET_RING_SIZE];
+	unsigned int cons;
+	unsigned int prod;
+	struct tb_ring *ring;
+};
+
+/**
+ * struct tbnet - ThunderboltIP network driver private data
+ * @svc: XDomain service the driver is bound to
+ * @xd: XDomain the service belongs to
+ * @handler: ThunderboltIP configuration protocol handler
+ * @dev: Networking device
+ * @napi: NAPI structure for Rx polling
+ * @stats: Network statistics
+ * @skb: Network packet that is currently processed on Rx path
+ * @command_id: ID used for next configuration protocol packet
+ * @login_sent: ThunderboltIP login message successfully sent
+ * @login_received: ThunderboltIP login message received from the remote
+ *		    host
+ * @transmit_path: HopID the other end needs to use building the
+ *		   opposite side path.
+ * @connection_lock: Lock serializing access to @login_sent,
+ *		     @login_received and @transmit_path.
+ * @login_retries: Number of login retries currently done
+ * @login_work: Worker to send ThunderboltIP login packets
+ * @connected_work: Worker that finalizes the ThunderboltIP connection
+ *		    setup and enables DMA paths for high speed data
+ *		    transfers
+ * @rx_hdr: Copy of the currently processed Rx frame. Used when a
+ *	    network packet consists of multiple Thunderbolt frames.
+ *	    In host byte order.
+ * @rx_ring: Software ring holding Rx frames
+ * @frame_id: Frame ID used for the next Tx packet (if
+ *	      %TBNET_MATCH_FRAGS_ID is supported in both ends)
+ * @tx_ring: Software ring holding Tx frames
+ */
+struct tbnet {
+	const struct tb_service *svc;
+	struct tb_xdomain *xd;
+	struct tb_protocol_handler handler;
+	struct net_device *dev;
+	struct napi_struct napi;
+	struct tbnet_stats stats;
+	struct sk_buff *skb;
+	atomic_t command_id;
+	bool login_sent;
+	bool login_received;
+	u32 transmit_path;
+	struct mutex connection_lock;
+	int login_retries;
+	struct delayed_work login_work;
+	struct work_struct connected_work;
+	struct thunderbolt_ip_frame_header rx_hdr;
+	struct tbnet_ring rx_ring;
+	atomic_t frame_id;
+	struct tbnet_ring tx_ring;
+};
+
+/* Network property directory UUID */
+static const uuid_t tbnet_dir_uuid =
+	UUID_INIT(0xc66189ca, 0x1cce, 0x4195,
+		  0xbd, 0xb8, 0x49, 0x59, 0x2e, 0x5f, 0x5a, 0x4f);
+
+/* ThunderboltIP configuration protocol UUID */
+static const uuid_t tbnet_svc_uuid =
+	UUID_INIT(0x798f589e, 0x3616, 0x8a47,
+		  0x97, 0xc6, 0x56, 0x64, 0xa9, 0x20, 0xc8, 0xdd);
+
+static struct tb_property_dir *tbnet_dir;
+
+static void tbnet_fill_header(struct thunderbolt_ip_header *hdr, u64 route,
+	u8 sequence, const uuid_t *initiator_uuid, const uuid_t *target_uuid,
+	enum thunderbolt_ip_type type, size_t size, u32 command_id)
+{
+	u32 length_sn;
+
+	/* Length does not include route_hi/lo and length_sn fields */
+	length_sn = (size - 3 * 4) / 4;
+	length_sn |= (sequence << TBIP_HDR_SN_SHIFT) & TBIP_HDR_SN_MASK;
+
+	hdr->route_hi = upper_32_bits(route);
+	hdr->route_lo = lower_32_bits(route);
+	hdr->length_sn = length_sn;
+	uuid_copy(&hdr->uuid, &tbnet_svc_uuid);
+	uuid_copy(&hdr->initiator_uuid, initiator_uuid);
+	uuid_copy(&hdr->target_uuid, target_uuid);
+	hdr->type = type;
+	hdr->command_id = command_id;
+}
+
+static int tbnet_login_response(struct tbnet *net, u64 route, u8 sequence,
+				u32 command_id)
+{
+	struct thunderbolt_ip_login_response reply;
+	struct tb_xdomain *xd = net->xd;
+
+	memset(&reply, 0, sizeof(reply));
+	tbnet_fill_header(&reply.hdr, route, sequence, xd->local_uuid,
+			  xd->remote_uuid, TBIP_LOGIN_RESPONSE, sizeof(reply),
+			  command_id);
+	memcpy(reply.receiver_mac, net->dev->dev_addr, ETH_ALEN);
+	reply.receiver_mac_len = ETH_ALEN;
+
+	return tb_xdomain_response(xd, &reply, sizeof(reply),
+				   TB_CFG_PKG_XDOMAIN_RESP);
+}
+
+static int tbnet_login_request(struct tbnet *net, u8 sequence)
+{
+	struct thunderbolt_ip_login_response reply;
+	struct thunderbolt_ip_login request;
+	struct tb_xdomain *xd = net->xd;
+
+	memset(&request, 0, sizeof(request));
+	tbnet_fill_header(&request.hdr, xd->route, sequence, xd->local_uuid,
+			  xd->remote_uuid, TBIP_LOGIN, sizeof(request),
+			  atomic_inc_return(&net->command_id));
+
+	request.proto_version = TBIP_LOGIN_PROTO_VERSION;
+	request.transmit_path = TBNET_LOCAL_PATH;
+
+	return tb_xdomain_request(xd, &request, sizeof(request),
+				  TB_CFG_PKG_XDOMAIN_RESP, &reply,
+				  sizeof(reply), TB_CFG_PKG_XDOMAIN_RESP,
+				  TBNET_LOGIN_TIMEOUT);
+}
+
+static int tbnet_logout_response(struct tbnet *net, u64 route, u8 sequence,
+				 u32 command_id)
+{
+	struct thunderbolt_ip_status reply;
+	struct tb_xdomain *xd = net->xd;
+
+	memset(&reply, 0, sizeof(reply));
+	tbnet_fill_header(&reply.hdr, route, sequence, xd->local_uuid,
+			  xd->remote_uuid, TBIP_STATUS, sizeof(reply),
+			  atomic_inc_return(&net->command_id));
+	return tb_xdomain_response(xd, &reply, sizeof(reply),
+				   TB_CFG_PKG_XDOMAIN_RESP);
+}
+
+static int tbnet_logout_request(struct tbnet *net)
+{
+	struct thunderbolt_ip_logout request;
+	struct thunderbolt_ip_status reply;
+	struct tb_xdomain *xd = net->xd;
+
+	memset(&request, 0, sizeof(request));
+	tbnet_fill_header(&request.hdr, xd->route, 0, xd->local_uuid,
+			  xd->remote_uuid, TBIP_LOGOUT, sizeof(request),
+			  atomic_inc_return(&net->command_id));
+
+	return tb_xdomain_request(xd, &request, sizeof(request),
+				  TB_CFG_PKG_XDOMAIN_RESP, &reply,
+				  sizeof(reply), TB_CFG_PKG_XDOMAIN_RESP,
+				  TBNET_LOGOUT_TIMEOUT);
+}
+
+static void start_login(struct tbnet *net)
+{
+	mutex_lock(&net->connection_lock);
+	net->login_sent = false;
+	net->login_received = false;
+	mutex_unlock(&net->connection_lock);
+
+	queue_delayed_work(system_long_wq, &net->login_work,
+			   msecs_to_jiffies(1000));
+}
+
+static void stop_login(struct tbnet *net)
+{
+	cancel_delayed_work_sync(&net->login_work);
+	cancel_work_sync(&net->connected_work);
+}
+
+static inline unsigned int tbnet_frame_size(const struct tbnet_frame *tf)
+{
+	return tf->frame.size ? : TBNET_FRAME_SIZE;
+}
+
+static void tbnet_free_buffers(struct tbnet_ring *ring)
+{
+	unsigned int i;
+
+	for (i = 0; i < TBNET_RING_SIZE; i++) {
+		struct device *dma_dev = tb_ring_dma_device(ring->ring);
+		struct tbnet_frame *tf = &ring->frames[i];
+		enum dma_data_direction dir;
+		size_t size;
+
+		if (!tf->page)
+			continue;
+
+		if (ring->ring->is_tx) {
+			dir = DMA_TO_DEVICE;
+			size = tbnet_frame_size(tf);
+		} else {
+			dir = DMA_FROM_DEVICE;
+			size = TBNET_FRAME_SIZE;
+		}
+
+		dma_unmap_page(dma_dev, tf->frame.buffer_phy, size, dir);
+		__free_page(tf->page);
+		tf->page = NULL;
+	}
+
+	ring->cons = 0;
+	ring->prod = 0;
+}
+
+static void tbnet_tear_down(struct tbnet *net, bool send_logout)
+{
+	netif_carrier_off(net->dev);
+	netif_stop_queue(net->dev);
+
+	stop_login(net);
+
+	mutex_lock(&net->connection_lock);
+
+	if (net->login_sent && net->login_received) {
+		int retries = TBNET_LOGOUT_RETRIES;
+
+		while (send_logout && retries-- > 0) {
+			int ret = tbnet_logout_request(net);
+			if (ret != -ETIMEDOUT)
+				break;
+		}
+
+		tb_ring_stop(net->rx_ring.ring);
+		tb_ring_stop(net->tx_ring.ring);
+		tbnet_free_buffers(&net->rx_ring);
+		tbnet_free_buffers(&net->tx_ring);
+
+		if (tb_xdomain_disable_paths(net->xd))
+			netdev_warn(net->dev, "failed to disable DMA paths\n");
+	}
+
+	net->login_retries = 0;
+	net->login_sent = false;
+	net->login_received = false;
+
+	mutex_unlock(&net->connection_lock);
+}
+
+static int tbnet_handle_packet(const void *buf, size_t size, void *data)
+{
+	const struct thunderbolt_ip_login *pkg = buf;
+	struct tbnet *net = data;
+	u32 command_id;
+	int ret = 0;
+	u8 sequence;
+	u64 route;
+
+	/* Make sure the packet is for us */
+	if (size < sizeof(struct thunderbolt_ip_header))
+		return 0;
+	if (!uuid_equal(&pkg->hdr.initiator_uuid, net->xd->remote_uuid))
+		return 0;
+	if (!uuid_equal(&pkg->hdr.target_uuid, net->xd->local_uuid))
+		return 0;
+
+	route = ((u64)pkg->hdr.route_hi << 32) | pkg->hdr.route_lo;
+	route &= ~BIT_ULL(63);
+	if (route != net->xd->route)
+		return 0;
+
+	sequence = pkg->hdr.length_sn & TBIP_HDR_SN_MASK;
+	sequence >>= TBIP_HDR_SN_SHIFT;
+	command_id = pkg->hdr.command_id;
+
+	switch (pkg->hdr.type) {
+	case TBIP_LOGIN:
+		if (!netif_running(net->dev))
+			break;
+
+		ret = tbnet_login_response(net, route, sequence,
+					   pkg->hdr.command_id);
+		if (!ret) {
+			mutex_lock(&net->connection_lock);
+			net->login_received = true;
+			net->transmit_path = pkg->transmit_path;
+
+			/* If we reached the number of max retries or
+			 * previous logout, schedule another round of
+			 * login retries
+			 */
+			if (net->login_retries >= TBNET_LOGIN_RETRIES ||
+			    !net->login_sent) {
+				net->login_retries = 0;
+				queue_delayed_work(system_long_wq,
+						   &net->login_work, 0);
+			}
+			mutex_unlock(&net->connection_lock);
+
+			queue_work(system_long_wq, &net->connected_work);
+		}
+		break;
+
+	case TBIP_LOGOUT:
+		ret = tbnet_logout_response(net, route, sequence, command_id);
+		if (!ret)
+			tbnet_tear_down(net, false);
+		break;
+
+	default:
+		return 0;
+	}
+
+	if (ret)
+		netdev_warn(net->dev, "failed to send ThunderboltIP response\n");
+
+	return 1;
+}
+
+static unsigned int tbnet_available_buffers(const struct tbnet_ring *ring)
+{
+	return ring->prod - ring->cons;
+}
+
+static int tbnet_alloc_rx_buffers(struct tbnet *net, unsigned int nbuffers)
+{
+	struct tbnet_ring *ring = &net->rx_ring;
+	int ret;
+
+	while (nbuffers--) {
+		struct device *dma_dev = tb_ring_dma_device(ring->ring);
+		unsigned int index = ring->prod & (TBNET_RING_SIZE - 1);
+		struct tbnet_frame *tf = &ring->frames[index];
+		dma_addr_t dma_addr;
+
+		if (tf->page)
+			break;
+
+		tf->page = dev_alloc_page();
+		if (!tf->page) {
+			ret = -ENOMEM;
+			goto err_free;
+		}
+
+		dma_addr = dma_map_page(dma_dev, tf->page, 0,
+					TBNET_FRAME_SIZE, DMA_FROM_DEVICE);
+		if (dma_mapping_error(dma_dev, dma_addr)) {
+			ret = -ENOMEM;
+			goto err_free;
+		}
+
+		tf->frame.buffer_phy = dma_addr;
+		tf->dev = net->dev;
+
+		tb_ring_rx(ring->ring, &tf->frame);
+
+		ring->prod++;
+	}
+
+	return 0;
+
+err_free:
+	tbnet_free_buffers(ring);
+	return ret;
+}
+
+static struct tbnet_frame *tbnet_get_tx_buffer(struct tbnet *net)
+{
+	struct tbnet_ring *ring = &net->tx_ring;
+	struct tbnet_frame *tf;
+	unsigned int index;
+
+	if (!tbnet_available_buffers(ring))
+		return NULL;
+
+	index = ring->cons++ & (TBNET_RING_SIZE - 1);
+
+	tf = &ring->frames[index];
+	tf->frame.size = 0;
+	tf->frame.buffer_phy = 0;
+
+	return tf;
+}
+
+static void tbnet_tx_callback(struct tb_ring *ring, struct ring_frame *frame,
+			      bool canceled)
+{
+	struct tbnet_frame *tf = container_of(frame, typeof(*tf), frame);
+	struct device *dma_dev = tb_ring_dma_device(ring);
+	struct tbnet *net = netdev_priv(tf->dev);
+
+	dma_unmap_page(dma_dev, tf->frame.buffer_phy, tbnet_frame_size(tf),
+		       DMA_TO_DEVICE);
+
+	/* Return buffer to the ring */
+	net->tx_ring.prod++;
+
+	if (tbnet_available_buffers(&net->tx_ring) >= TBNET_RING_SIZE / 2)
+		netif_wake_queue(net->dev);
+}
+
+static int tbnet_alloc_tx_buffers(struct tbnet *net)
+{
+	struct tbnet_ring *ring = &net->tx_ring;
+	unsigned int i;
+
+	for (i = 0; i < TBNET_RING_SIZE; i++) {
+		struct tbnet_frame *tf = &ring->frames[i];
+
+		tf->page = alloc_page(GFP_KERNEL);
+		if (!tf->page) {
+			tbnet_free_buffers(ring);
+			return -ENOMEM;
+		}
+
+		tf->dev = net->dev;
+		tf->frame.callback = tbnet_tx_callback;
+		tf->frame.sof = TBIP_PDF_FRAME_START;
+		tf->frame.eof = TBIP_PDF_FRAME_END;
+	}
+
+	ring->cons = 0;
+	ring->prod = TBNET_RING_SIZE - 1;
+
+	return 0;
+}
+
+static void tbnet_connected_work(struct work_struct *work)
+{
+	struct tbnet *net = container_of(work, typeof(*net), connected_work);
+	bool connected;
+	int ret;
+
+	if (netif_carrier_ok(net->dev))
+		return;
+
+	mutex_lock(&net->connection_lock);
+	connected = net->login_sent && net->login_received;
+	mutex_unlock(&net->connection_lock);
+
+	if (!connected)
+		return;
+
+	/* Both logins successful so enable the high-speed DMA paths and
+	 * start the network device queue.
+	 */
+	ret = tb_xdomain_enable_paths(net->xd, TBNET_LOCAL_PATH,
+				      net->rx_ring.ring->hop,
+				      net->transmit_path,
+				      net->tx_ring.ring->hop);
+	if (ret) {
+		netdev_err(net->dev, "failed to enable DMA paths\n");
+		return;
+	}
+
+	tb_ring_start(net->tx_ring.ring);
+	tb_ring_start(net->rx_ring.ring);
+
+	ret = tbnet_alloc_rx_buffers(net, TBNET_RING_SIZE);
+	if (ret)
+		goto err_stop_rings;
+
+	ret = tbnet_alloc_tx_buffers(net);
+	if (ret)
+		goto err_free_rx_buffers;
+
+	netif_carrier_on(net->dev);
+	netif_start_queue(net->dev);
+	return;
+
+err_free_rx_buffers:
+	tbnet_free_buffers(&net->rx_ring);
+err_stop_rings:
+	tb_ring_stop(net->rx_ring.ring);
+	tb_ring_stop(net->tx_ring.ring);
+}
+
+static void tbnet_login_work(struct work_struct *work)
+{
+	struct tbnet *net = container_of(work, typeof(*net), login_work.work);
+	unsigned long delay = msecs_to_jiffies(TBNET_LOGIN_DELAY);
+	int ret;
+
+	if (netif_carrier_ok(net->dev))
+		return;
+
+	ret = tbnet_login_request(net, net->login_retries % 4);
+	if (ret) {
+		if (net->login_retries++ < TBNET_LOGIN_RETRIES) {
+			queue_delayed_work(system_long_wq, &net->login_work,
+					   delay);
+		} else {
+			netdev_info(net->dev, "ThunderboltIP login timed out\n");
+		}
+	} else {
+		net->login_retries = 0;
+
+		mutex_lock(&net->connection_lock);
+		net->login_sent = true;
+		mutex_unlock(&net->connection_lock);
+
+		queue_work(system_long_wq, &net->connected_work);
+	}
+}
+
+static bool tbnet_check_frame(struct tbnet *net, const struct tbnet_frame *tf)
+{
+	u32 frame_id, frame_count, frame_size, frame_index;
+	const struct thunderbolt_ip_frame_header *hdr;
+	unsigned int size;
+
+	if (tf->frame.flags & RING_DESC_CRC_ERROR) {
+		net->stats.rx_crc_errors++;
+		return false;
+	} else if (tf->frame.flags & RING_DESC_BUFFER_OVERRUN) {
+		net->stats.rx_over_errors++;
+		return false;
+	}
+
+	/* Should be greater than just header i.e. contains data */
+	size = tbnet_frame_size(tf);
+	if (size <= sizeof(*hdr)) {
+		net->stats.rx_length_errors++;
+		return false;
+	}
+
+	hdr = page_address(tf->page);
+	frame_count = le32_to_cpu(hdr->frame_count);
+	frame_size = le32_to_cpu(hdr->frame_size);
+	frame_index = le16_to_cpu(hdr->frame_index);
+	frame_id = le16_to_cpu(hdr->frame_id);
+
+	if ((frame_size > size - sizeof(*hdr)) || !frame_size) {
+		net->stats.rx_length_errors++;
+		return false;
+	}
+
+	/* In case we're in the middle of packet, validate the frame
+	 * header based on first fragment of the packet.
+	 */
+	if (net->skb && net->rx_hdr.frame_count) {
+		/* Check the frame count fits the count field */
+		if (frame_count != net->rx_hdr.frame_count) {
+			net->stats.rx_length_errors++;
+			return false;
+		}
+
+		/* Check the frame identifiers are incremented correctly,
+		 * and id is matching.
+		 */
+		if (frame_index != net->rx_hdr.frame_index + 1 ||
+		    frame_id != net->rx_hdr.frame_id) {
+			net->stats.rx_missed_errors++;
+			return false;
+		}
+
+		if (net->skb->len + frame_size > TBNET_MAX_MTU) {
+			net->stats.rx_length_errors++;
+			return false;
+		}
+
+		return true;
+	}
+
+	/* Start of packet, validate the frame header */
+	if (frame_count == 0 || frame_count > TBNET_RING_SIZE / 4) {
+		net->stats.rx_length_errors++;
+		return false;
+	}
+	if (frame_index != 0) {
+		net->stats.rx_missed_errors++;
+		return false;
+	}
+	if (frame_count > 1 && frame_size < TBNET_RX_HDR_SIZE) {
+		net->stats.rx_length_errors++;
+		return false;
+	}
+
+	return true;
+}
+
+static void tbnet_pull_tail(struct sk_buff *skb)
+{
+	skb_frag_t *frag = &skb_shinfo(skb)->frags[0];
+	unsigned int pull_len;
+	void *hdr;
+
+	hdr = skb_frag_address(frag);
+	pull_len = eth_get_headlen(hdr, TBNET_RX_HDR_SIZE);
+
+	/* Align pull length to size of long to optimize memcpy performance */
+	skb_copy_to_linear_data(skb, hdr, ALIGN(pull_len, sizeof(long)));
+
+	/* Update all of the pointers */
+	skb_frag_size_sub(frag, pull_len);
+	frag->page_offset += pull_len;
+	skb->data_len -= pull_len;
+	skb->tail += pull_len;
+}
+
+static int tbnet_poll(struct napi_struct *napi, int budget)
+{
+	struct tbnet *net = container_of(napi, struct tbnet, napi);
+	unsigned int cleaned_count = tbnet_available_buffers(&net->rx_ring);
+	struct device *dma_dev = tb_ring_dma_device(net->rx_ring.ring);
+	unsigned int rx_packets = 0;
+
+	while (rx_packets < budget) {
+		u32 size, frame_size, frame_count, frame_index;
+		const struct thunderbolt_ip_frame_header *hdr;
+		unsigned int hdr_size = sizeof(*hdr);
+		struct sk_buff *skb = NULL;
+		struct ring_frame *frame;
+		struct tbnet_frame *tf;
+		bool last = true;
+
+		/* Return some buffers to hardware, one at a time is too
+		 * slow so allocate MAX_SKB_FRAGS buffers at the same
+		 * time.
+		 */
+		if (cleaned_count >= MAX_SKB_FRAGS) {
+			tbnet_alloc_rx_buffers(net, cleaned_count);
+			cleaned_count = 0;
+		}
+
+		frame = tb_ring_poll(net->rx_ring.ring);
+		if (!frame)
+			break;
+
+		dma_unmap_page(dma_dev, frame->buffer_phy, TBNET_FRAME_SIZE,
+			       DMA_FROM_DEVICE);
+
+		tf = container_of(frame, typeof(*tf), frame);
+		size = tbnet_frame_size(tf);
+		hdr = page_address(tf->page);
+
+		if (!tbnet_check_frame(net, tf)) {
+			__free_page(tf->page);
+			tf->page = NULL;
+			net->rx_ring.cons++;
+			cleaned_count++;
+			dev_kfree_skb_any(net->skb);
+			net->skb = NULL;
+			continue;
+		}
+
+		frame_count = le32_to_cpu(hdr->frame_count);
+		frame_size = le32_to_cpu(hdr->frame_size);
+		frame_index = le16_to_cpu(hdr->frame_index);
+		last = frame_index == frame_count - 1;
+
+		skb = net->skb;
+		if (!skb) {
+			skb = netdev_alloc_skb_ip_align(net->dev,
+							TBNET_RX_HDR_SIZE);
+			net->skb = skb;
+		}
+		if (!skb)
+			break;
+
+		/* Single small buffer we can copy directly to the
+		 * header part of the skb.
+		 */
+		if (hdr->frame_count == 1 && frame_size <= TBNET_RX_HDR_SIZE) {
+			const void *data = hdr + 1;
+
+			memcpy(__skb_put(skb, frame_size), data,
+			       ALIGN(frame_size, sizeof(long)));
+
+			__free_page(tf->page);
+		} else {
+			skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
+					tf->page, hdr_size, frame_size, size);
+			if (last)
+				tbnet_pull_tail(skb);
+		}
+
+		tf->page = NULL;
+		net->rx_ring.cons++;
+		cleaned_count++;
+
+		net->rx_hdr.frame_count = frame_count;
+		net->rx_hdr.frame_size = frame_size;
+		net->rx_hdr.frame_index = frame_index;
+		net->rx_hdr.frame_id = le16_to_cpu(hdr->frame_id);
+
+		rx_packets++;
+		net->stats.rx_bytes += frame_size;
+
+		if (last) {
+			skb->protocol = eth_type_trans(skb, net->dev);
+			napi_gro_receive(&net->napi, skb);
+			net->skb = NULL;
+		}
+	}
+
+	net->stats.rx_packets += rx_packets;
+
+	if (cleaned_count)
+		tbnet_alloc_rx_buffers(net, cleaned_count);
+
+	if (rx_packets >= budget)
+		return budget;
+
+	napi_complete_done(napi, rx_packets);
+	/* Re-enable the ring interrupt */
+	tb_ring_poll_complete(net->rx_ring.ring);
+
+	return rx_packets;
+}
+
+static void tbnet_start_poll(void *data)
+{
+	struct tbnet *net = data;
+
+	napi_schedule(&net->napi);
+}
+
+static int tbnet_open(struct net_device *dev)
+{
+	struct tbnet *net = netdev_priv(dev);
+	struct tb_xdomain *xd = net->xd;
+	u16 sof_mask, eof_mask;
+	struct tb_ring *ring;
+
+	netif_carrier_off(dev);
+
+	ring = tb_ring_alloc_tx(xd->tb->nhi, -1, TBNET_RING_SIZE,
+				RING_FLAG_FRAME);
+	if (!ring) {
+		netdev_err(dev, "failed to allocate Tx ring\n");
+		return -ENOMEM;
+	}
+	net->tx_ring.ring = ring;
+
+	sof_mask = BIT(TBIP_PDF_FRAME_START);
+	eof_mask = BIT(TBIP_PDF_FRAME_END);
+
+	ring = tb_ring_alloc_rx(xd->tb->nhi, -1, TBNET_RING_SIZE,
+				RING_FLAG_FRAME | RING_FLAG_E2E, sof_mask,
+				eof_mask, tbnet_start_poll, net);
+	if (!ring) {
+		netdev_err(dev, "failed to allocate Rx ring\n");
+		tb_ring_free(net->tx_ring.ring);
+		net->tx_ring.ring = NULL;
+		return -ENOMEM;
+	}
+	net->rx_ring.ring = ring;
+
+	napi_enable(&net->napi);
+	start_login(net);
+
+	return 0;
+}
+
+static int tbnet_stop(struct net_device *dev)
+{
+	struct tbnet *net = netdev_priv(dev);
+
+	napi_disable(&net->napi);
+
+	tbnet_tear_down(net, true);
+
+	tb_ring_free(net->rx_ring.ring);
+	net->rx_ring.ring = NULL;
+	tb_ring_free(net->tx_ring.ring);
+	net->tx_ring.ring = NULL;
+
+	return 0;
+}
+
+/*
+ * DMA maps the page backing @tf for transfer to the device. On success the
+ * DMA address is stored in tf->frame.buffer_phy and true is returned; on a
+ * mapping error @tf is left untouched and false is returned.
+ */
+static bool tbnet_xmit_map(struct device *dma_dev, struct tbnet_frame *tf)
+{
+	dma_addr_t mapped = dma_map_page(dma_dev, tf->page, 0,
+					 tbnet_frame_size(tf), DMA_TO_DEVICE);
+
+	if (!dma_mapping_error(dma_dev, mapped)) {
+		tf->frame.buffer_phy = mapped;
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Finalizes the frames of one packet before transmission: writes the total
+ * frame count into every frame header, fills in the checksums when the
+ * stack asked for it (CHECKSUM_PARTIAL) and DMA maps each frame.
+ *
+ * @net: device the frames belong to
+ * @skb: packet the frames were copied from, used to locate the headers
+ * @frames: Tx frames holding the copied packet data
+ * @frame_count: number of entries in @frames
+ *
+ * Returns true when all frames are mapped and ready to be queued. Returns
+ * false when the protocol is not supported, the VLAN header cannot be read
+ * or a DMA mapping fails; in the last case the frames mapped so far are
+ * unmapped again.
+ */
+static bool tbnet_xmit_csum_and_map(struct tbnet *net, struct sk_buff *skb,
+	struct tbnet_frame **frames, u32 frame_count)
+{
+	struct thunderbolt_ip_frame_header *hdr = page_address(frames[0]->page);
+	struct device *dma_dev = tb_ring_dma_device(net->tx_ring.ring);
+	__wsum wsum = htonl(skb->len - skb_transport_offset(skb));
+	unsigned int i, len, offset = skb_transport_offset(skb);
+	__be16 protocol = skb->protocol;
+	void *data = skb->data;
+	void *dest = hdr + 1;
+	__sum16 *tucso;
+
+	if (skb->ip_summed != CHECKSUM_PARTIAL) {
+		/* No need to calculate checksum so we just update the
+		 * total frame count and map the frames for DMA.
+		 */
+		for (i = 0; i < frame_count; i++) {
+			hdr = page_address(frames[i]->page);
+			hdr->frame_count = cpu_to_le32(frame_count);
+			if (!tbnet_xmit_map(dma_dev, frames[i]))
+				goto err_unmap;
+		}
+
+		return true;
+	}
+
+	/* For VLAN tagged packets use the encapsulated protocol instead */
+	if (protocol == htons(ETH_P_8021Q)) {
+		struct vlan_hdr *vhdr, vh;
+
+		vhdr = skb_header_pointer(skb, ETH_HLEN, sizeof(vh), &vh);
+		if (!vhdr)
+			return false;
+
+		protocol = vhdr->h_vlan_encapsulated_proto;
+	}
+
+	/* data points to the beginning of the packet and dest to the copy
+	 * of it in the first frame, so the offset of a checksum field from
+	 * data gives its location in the frame.
+	 * ipcso points to the IP checksum.
+	 * tucso points to the TCP/UDP checksum.
+	 */
+	if (protocol == htons(ETH_P_IP)) {
+		__sum16 *ipcso = dest + ((void *)&(ip_hdr(skb)->check) - data);
+
+		*ipcso = 0;
+		*ipcso = ip_fast_csum(dest + skb_network_offset(skb),
+				      ip_hdr(skb)->ihl);
+
+		if (ip_hdr(skb)->protocol == IPPROTO_TCP)
+			tucso = dest + ((void *)&(tcp_hdr(skb)->check) - data);
+		else if (ip_hdr(skb)->protocol == IPPROTO_UDP)
+			tucso = dest + ((void *)&(udp_hdr(skb)->check) - data);
+		else
+			return false;
+
+		*tucso = ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
+					    ip_hdr(skb)->daddr, 0,
+					    ip_hdr(skb)->protocol, 0);
+	} else if (skb_is_gso(skb) && skb_is_gso_v6(skb)) {
+		/* TCPv6 GSO packet: seed the checksum with the pseudo
+		 * header and fall through to the payload checksum below
+		 * instead of rejecting the packet.
+		 */
+		tucso = dest + ((void *)&(tcp_hdr(skb)->check) - data);
+		*tucso = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+					  &ipv6_hdr(skb)->daddr, 0,
+					  IPPROTO_TCP, 0);
+	} else if (protocol == htons(ETH_P_IPV6)) {
+		tucso = dest + skb_checksum_start_offset(skb) + skb->csum_offset;
+		*tucso = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+					  &ipv6_hdr(skb)->daddr, 0,
+					  ipv6_hdr(skb)->nexthdr, 0);
+	} else {
+		return false;
+	}
+
+	/* The first frame starts with the packet headers, so checksumming
+	 * begins at the transport header there; the remaining frames are
+	 * summed from the start of their payload.
+	 */
+	for (i = 0; i < frame_count; i++) {
+		hdr = page_address(frames[i]->page);
+		dest = (void *)(hdr + 1) + offset;
+		len = le32_to_cpu(hdr->frame_size) - offset;
+		wsum = csum_partial(dest, len, wsum);
+		hdr->frame_count = cpu_to_le32(frame_count);
+
+		offset = 0;
+	}
+
+	*tucso = csum_fold(wsum);
+
+	/* Checksum is finally calculated and we don't touch the memory
+	 * anymore, so DMA map the frames now.
+	 */
+	for (i = 0; i < frame_count; i++) {
+		if (!tbnet_xmit_map(dma_dev, frames[i]))
+			goto err_unmap;
+	}
+
+	return true;
+
+err_unmap:
+	/* Undo the mappings of the frames preceding the one that failed */
+	while (i--)
+		dma_unmap_page(dma_dev, frames[i]->frame.buffer_phy,
+			       tbnet_frame_size(frames[i]), DMA_TO_DEVICE);
+
+	return false;
+}
+
+/*
+ * Maps fragment @frag_num of @skb with kmap_atomic() and returns the
+ * address of the fragment data inside the mapped page. The fragment size
+ * is stored in @len.
+ */
+static void *tbnet_kmap_frag(struct sk_buff *skb, unsigned int frag_num,
+			     unsigned int *len)
+{
+	const skb_frag_t *f = &skb_shinfo(skb)->frags[frag_num];
+	void *vaddr = kmap_atomic(skb_frag_page(f));
+
+	*len = skb_frag_size(f);
+
+	return vaddr + f->page_offset;
+}
+
+/*
+ * ndo_start_xmit handler: copies the skb (linear part first, then its page
+ * fragments) into one or more Tx buffers, each starting with a
+ * thunderbolt_ip_frame_header, then has them checksummed and DMA mapped
+ * and queues them on the Tx ring.
+ *
+ * Returns NETDEV_TX_BUSY (after stopping the queue) when fewer Tx buffers
+ * are available than the packet needs. Otherwise returns NETDEV_TX_OK,
+ * also when the packet is dropped and counted in tx_errors.
+ */
+static netdev_tx_t tbnet_start_xmit(struct sk_buff *skb,
+				    struct net_device *dev)
+{
+	struct tbnet *net = netdev_priv(dev);
+	struct tbnet_frame *frames[MAX_SKB_FRAGS];
+	u16 frame_id = atomic_read(&net->frame_id);
+	struct thunderbolt_ip_frame_header *hdr;
+	unsigned int len = skb_headlen(skb);
+	unsigned int data_len = skb->len;
+	unsigned int nframes, i;
+	unsigned int frag = 0;
+	void *src = skb->data;
+	u32 frame_index = 0;
+	bool unmap = false;
+	void *dest;
+
+	/* Frames needed when each carries at most TBNET_MAX_PAYLOAD_SIZE */
+	nframes = DIV_ROUND_UP(data_len, TBNET_MAX_PAYLOAD_SIZE);
+	if (tbnet_available_buffers(&net->tx_ring) < nframes) {
+		netif_stop_queue(net->dev);
+		return NETDEV_TX_BUSY;
+	}
+
+	frames[frame_index] = tbnet_get_tx_buffer(net);
+	if (!frames[frame_index])
+		goto err_drop;
+
+	/* Payload is written right after the frame header */
+	hdr = page_address(frames[frame_index]->page);
+	dest = hdr + 1;
+
+	/* If overall packet is bigger than the frame data size */
+	while (data_len > TBNET_MAX_PAYLOAD_SIZE) {
+		unsigned int size_left = TBNET_MAX_PAYLOAD_SIZE;
+
+		hdr->frame_size = cpu_to_le32(TBNET_MAX_PAYLOAD_SIZE);
+		hdr->frame_index = cpu_to_le16(frame_index);
+		hdr->frame_id = cpu_to_le16(frame_id);
+
+		do {
+			if (len > size_left) {
+				/* Copy data onto Tx buffer data with
+				 * full frame size then break and go to
+				 * next frame
+				 */
+				memcpy(dest, src, size_left);
+				len -= size_left;
+				dest += size_left;
+				src += size_left;
+				break;
+			}
+
+			/* Current source chunk fits: copy all of it */
+			memcpy(dest, src, len);
+			size_left -= len;
+			dest += len;
+
+			if (unmap) {
+				kunmap_atomic(src);
+				unmap = false;
+			}
+
+			/* Ensure all fragments have been processed */
+			if (frag < skb_shinfo(skb)->nr_frags) {
+				/* Map and then unmap quickly */
+				src = tbnet_kmap_frag(skb, frag++, &len);
+				unmap = true;
+			} else if (unlikely(size_left > 0)) {
+				goto err_drop;
+			}
+		} while (size_left > 0);
+
+		data_len -= TBNET_MAX_PAYLOAD_SIZE;
+		frame_index++;
+
+		frames[frame_index] = tbnet_get_tx_buffer(net);
+		if (!frames[frame_index])
+			goto err_drop;
+
+		hdr = page_address(frames[frame_index]->page);
+		dest = hdr + 1;
+	}
+
+	/* Header of the last (or only) frame carrying the remaining data */
+	hdr->frame_size = cpu_to_le32(data_len);
+	hdr->frame_index = cpu_to_le16(frame_index);
+	hdr->frame_id = cpu_to_le16(frame_id);
+
+	frames[frame_index]->frame.size = data_len + sizeof(*hdr);
+
+	/* In case the remaining data_len is smaller than a frame */
+	while (len < data_len) {
+		memcpy(dest, src, len);
+		data_len -= len;
+		dest += len;
+
+		if (unmap) {
+			kunmap_atomic(src);
+			unmap = false;
+		}
+
+		if (frag < skb_shinfo(skb)->nr_frags) {
+			src = tbnet_kmap_frag(skb, frag++, &len);
+			unmap = true;
+		} else if (unlikely(data_len > 0)) {
+			goto err_drop;
+		}
+	}
+
+	/* The current source chunk now holds at least data_len bytes */
+	memcpy(dest, src, data_len);
+
+	if (unmap)
+		kunmap_atomic(src);
+
+	if (!tbnet_xmit_csum_and_map(net, skb, frames, frame_index + 1))
+		goto err_drop;
+
+	for (i = 0; i < frame_index + 1; i++)
+		tb_ring_tx(net->tx_ring.ring, &frames[i]->frame);
+
+	/* The frame ID only advances when TBNET_MATCH_FRAGS_ID is set */
+	if (net->svc->prtcstns & TBNET_MATCH_FRAGS_ID)
+		atomic_inc(&net->frame_id);
+
+	net->stats.tx_packets++;
+	net->stats.tx_bytes += skb->len;
+
+	dev_consume_skb_any(skb);
+
+	return NETDEV_TX_OK;
+
+err_drop:
+	/* NOTE(review): the jump taken when tbnet_get_tx_buffer() fails
+	 * inside the loop above can get here with a fragment still mapped
+	 * (unmap == true) - confirm whether kunmap_atomic() is needed.
+	 */
+	/* We can re-use the buffers */
+	net->tx_ring.cons -= frame_index;
+
+	dev_kfree_skb_any(skb);
+	net->stats.tx_errors++;
+
+	return NETDEV_TX_OK;
+}
+
+/*
+ * ndo_get_stats64 handler: copies the driver's packet, byte and error
+ * counters into @stats. rx_errors is reported as the sum of the length,
+ * overrun, CRC and missed error counters.
+ */
+static void tbnet_get_stats64(struct net_device *dev,
+			      struct rtnl_link_stats64 *stats)
+{
+	struct tbnet *net = netdev_priv(dev);
+
+	/* Receive side */
+	stats->rx_packets = net->stats.rx_packets;
+	stats->rx_bytes = net->stats.rx_bytes;
+	stats->rx_length_errors = net->stats.rx_length_errors;
+	stats->rx_over_errors = net->stats.rx_over_errors;
+	stats->rx_crc_errors = net->stats.rx_crc_errors;
+	stats->rx_missed_errors = net->stats.rx_missed_errors;
+	stats->rx_errors = net->stats.rx_length_errors +
+			   net->stats.rx_over_errors +
+			   net->stats.rx_crc_errors +
+			   net->stats.rx_missed_errors;
+
+	/* Transmit side */
+	stats->tx_packets = net->stats.tx_packets;
+	stats->tx_bytes = net->stats.tx_bytes;
+	stats->tx_errors = net->stats.tx_errors;
+}
+
+/* Network device operations installed on the device in tbnet_probe() */
+static const struct net_device_ops tbnet_netdev_ops = {
+	.ndo_open = tbnet_open,
+	.ndo_stop = tbnet_stop,
+	.ndo_start_xmit = tbnet_start_xmit,
+	.ndo_get_stats64 = tbnet_get_stats64,
+};
+
+/*
+ * Builds the MAC address of @dev: byte 0 is the physical port number
+ * shifted left by four with bit 0x02 set, bytes 1-4 hold a jhash2() of the
+ * local UUID and byte 5 is the low byte of a second jhash2() of the same
+ * UUID seeded with the first hash.
+ */
+static void tbnet_generate_mac(struct net_device *dev)
+{
+	const struct tbnet *net = netdev_priv(dev);
+	const struct tb_xdomain *xd = net->xd;
+	u32 first, second;
+	u8 port;
+
+	port = tb_phy_port_from_link(TBNET_L0_PORT_NUM(xd->route));
+	first = jhash2((u32 *)xd->local_uuid, 4, 0);
+	second = jhash2((u32 *)xd->local_uuid, 4, first);
+
+	/* Unicast and locally administered MAC */
+	dev->dev_addr[0] = (port << 4) | 0x02;
+	memcpy(dev->dev_addr + 1, &first, sizeof(first));
+	dev->dev_addr[5] = second & 0xff;
+}
+
+/*
+ * Service driver probe: allocates and registers the network device for
+ * @svc, then registers the protocol handler and stores the driver data.
+ *
+ * Returns 0 on success, -ENOMEM if the net_device cannot be allocated, or
+ * the error returned by register_netdev().
+ */
+static int tbnet_probe(struct tb_service *svc, const struct tb_service_id *id)
+{
+	struct tb_xdomain *xd = tb_service_parent(svc);
+	struct net_device *dev;
+	struct tbnet *net;
+	int ret;
+
+	dev = alloc_etherdev(sizeof(*net));
+	if (!dev)
+		return -ENOMEM;
+
+	SET_NETDEV_DEV(dev, &svc->dev);
+
+	net = netdev_priv(dev);
+	INIT_DELAYED_WORK(&net->login_work, tbnet_login_work);
+	INIT_WORK(&net->connected_work, tbnet_connected_work);
+	mutex_init(&net->connection_lock);
+	atomic_set(&net->command_id, 0);
+	atomic_set(&net->frame_id, 0);
+	net->svc = svc;
+	net->dev = dev;
+	net->xd = xd;
+
+	tbnet_generate_mac(dev);
+
+	strcpy(dev->name, "thunderbolt%d");
+	dev->netdev_ops = &tbnet_netdev_ops;
+
+	/* ThunderboltIP takes advantage of TSO packets but instead of
+	 * segmenting them we just split the packet into Thunderbolt
+	 * frames (maximum payload size of each frame is 4084 bytes) and
+	 * calculate checksum over the whole packet here.
+	 *
+	 * The receiving side does the opposite if the host OS supports
+	 * LRO, otherwise it needs to split the large packet into MTU
+	 * sized smaller packets.
+	 *
+	 * In order to receive large packets from the networking stack,
+	 * we need to announce support for most of the offloading
+	 * features here.
+	 */
+	dev->hw_features = NETIF_F_SG | NETIF_F_ALL_TSO | NETIF_F_GRO |
+			   NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+	dev->features = dev->hw_features | NETIF_F_HIGHDMA;
+	/* Every frame starts with a ThunderboltIP frame header */
+	dev->hard_header_len += sizeof(struct thunderbolt_ip_frame_header);
+
+	netif_napi_add(dev, &net->napi, tbnet_poll, NAPI_POLL_WEIGHT);
+
+	/* MTU range: 68 - 65522 */
+	dev->min_mtu = ETH_MIN_MTU;
+	dev->max_mtu = TBNET_MAX_MTU - ETH_HLEN;
+
+	ret = register_netdev(dev);
+	if (ret) {
+		free_netdev(dev);
+		return ret;
+	}
+
+	net->handler.uuid = &tbnet_svc_uuid;
+	net->handler.callback = tbnet_handle_packet;
+	net->handler.data = net;
+	tb_register_protocol_handler(&net->handler);
+
+	tb_service_set_drvdata(svc, net);
+
+	return 0;
+}
+
+/*
+ * Service driver remove: unregisters the network device and the protocol
+ * handler, then frees the net_device.
+ */
+static void tbnet_remove(struct tb_service *svc)
+{
+	struct tbnet *priv = tb_service_get_drvdata(svc);
+	struct net_device *ndev = priv->dev;
+
+	unregister_netdev(ndev);
+	tb_unregister_protocol_handler(&priv->handler);
+	free_netdev(ndev);
+}
+
+/* Service driver shutdown: calls tbnet_tear_down() for the bound device. */
+static void tbnet_shutdown(struct tb_service *svc)
+{
+	struct tbnet *net = tb_service_get_drvdata(svc);
+
+	tbnet_tear_down(net, true);
+}
+
+/*
+ * System sleep suspend callback: calls stop_login() and, if the interface
+ * is running, detaches it, stops both rings and frees their buffers.
+ * Always returns 0.
+ */
+static int __maybe_unused tbnet_suspend(struct device *dev)
+{
+	struct tbnet *net = tb_service_get_drvdata(tb_to_service(dev));
+
+	stop_login(net);
+	if (!netif_running(net->dev))
+		return 0;
+
+	netif_device_detach(net->dev);
+	tb_ring_stop(net->rx_ring.ring);
+	tb_ring_stop(net->tx_ring.ring);
+	tbnet_free_buffers(&net->rx_ring);
+	tbnet_free_buffers(&net->tx_ring);
+
+	return 0;
+}
+
+/*
+ * System sleep resume callback: marks the carrier off and, if the
+ * interface is running, re-attaches it and calls start_login() again.
+ * Always returns 0.
+ */
+static int __maybe_unused tbnet_resume(struct device *dev)
+{
+	struct tbnet *net = tb_service_get_drvdata(tb_to_service(dev));
+
+	netif_carrier_off(net->dev);
+	if (!netif_running(net->dev))
+		return 0;
+
+	netif_device_attach(net->dev);
+	start_login(net);
+
+	return 0;
+}
+
+/* System sleep PM callbacks: tbnet_suspend() and tbnet_resume() */
+static const struct dev_pm_ops tbnet_pm_ops = {
+	SET_SYSTEM_SLEEP_PM_OPS(tbnet_suspend, tbnet_resume)
+};
+
+/* Service IDs matched by this driver; the empty entry ends the table */
+static const struct tb_service_id tbnet_ids[] = {
+	{ TB_SERVICE("network", 1) },
+	{ },
+};
+MODULE_DEVICE_TABLE(tbsvc, tbnet_ids);
+
+/*
+ * Thunderbolt service driver definition: ties the probe, remove and
+ * shutdown callbacks and the PM ops to the services listed in tbnet_ids.
+ */
+static struct tb_service_driver tbnet_driver = {
+	.driver = {
+		.owner = THIS_MODULE,
+		.name = "thunderbolt-net",
+		.pm = &tbnet_pm_ops,
+	},
+	.probe = tbnet_probe,
+	.remove = tbnet_remove,
+	.shutdown = tbnet_shutdown,
+	.id_table = tbnet_ids,
+};
+
+/*
+ * Module init: creates the "network" property directory, fills in the
+ * protocol properties, registers the directory and finally registers the
+ * service driver.
+ *
+ * Returns 0 on success, -ENOMEM if the directory cannot be created, or the
+ * error of the failing registration. On failure everything set up so far
+ * is undone so no registered or allocated directory is left behind.
+ */
+static int __init tbnet_init(void)
+{
+	int ret;
+
+	tbnet_dir = tb_property_create_dir(&tbnet_dir_uuid);
+	if (!tbnet_dir)
+		return -ENOMEM;
+
+	tb_property_add_immediate(tbnet_dir, "prtcid", 1);
+	tb_property_add_immediate(tbnet_dir, "prtcvers", 1);
+	tb_property_add_immediate(tbnet_dir, "prtcrevs", 1);
+	tb_property_add_immediate(tbnet_dir, "prtcstns",
+				  TBNET_MATCH_FRAGS_ID);
+
+	ret = tb_register_property_dir("network", tbnet_dir);
+	if (ret)
+		goto err_free_dir;
+
+	ret = tb_register_service_driver(&tbnet_driver);
+	if (ret)
+		goto err_unregister;
+
+	return 0;
+
+err_unregister:
+	/* Driver registration failed: do not leave the directory behind */
+	tb_unregister_property_dir("network", tbnet_dir);
+err_free_dir:
+	tb_property_free_dir(tbnet_dir);
+
+	return ret;
+}
+module_init(tbnet_init);
+
+/*
+ * Module exit: unregisters the service driver, then unregisters and frees
+ * the "network" property directory created in tbnet_init().
+ */
+static void __exit tbnet_exit(void)
+{
+	tb_unregister_service_driver(&tbnet_driver);
+	tb_unregister_property_dir("network", tbnet_dir);
+	tb_property_free_dir(tbnet_dir);
+}
+module_exit(tbnet_exit);
+
+MODULE_AUTHOR("Amir Levy <amir.jer.levy@intel.com>");
+MODULE_AUTHOR("Michael Jamet <michael.jamet@intel.com>");
+MODULE_AUTHOR("Mika Westerberg <mika.westerberg@linux.intel.com>");
+MODULE_DESCRIPTION("Thunderbolt network driver");
+MODULE_LICENSE("GPL v2");
-- 
2.14.1

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox