Netdev List
 help / color / mirror / Atom feed
* [net-next 5/6] openvswitch: Factor out allocation and verification of actions.
From: Andy Zhou @ 2014-10-02  8:04 UTC (permalink / raw)
  To: davem; +Cc: netdev, Jesse Gross, Andy Zhou
In-Reply-To: <1412237085-27215-1-git-send-email-azhou@nicira.com>

From: Jesse Gross <jesse@nicira.com>

As the size of the flow key grows, it can put some pressure on the
stack. This is particularly true in ovs_flow_cmd_set(), which needs several
copies of the key on the stack. One of those uses is logically separate,
so this factors it out to reduce stack pressure and improve readability.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: Andy Zhou <azhou@nicira.com>
---
 net/openvswitch/datapath.c |   38 +++++++++++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index f6bd93d..010125c 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -933,11 +933,34 @@ error:
 	return error;
 }
 
+static struct sw_flow_actions *get_flow_actions(const struct nlattr *a,
+						const struct sw_flow_key *key,
+						const struct sw_flow_mask *mask)
+{
+	struct sw_flow_actions *acts;
+	struct sw_flow_key masked_key;
+	int error;
+
+	acts = ovs_nla_alloc_flow_actions(nla_len(a));
+	if (IS_ERR(acts))
+		return acts;
+
+	ovs_flow_mask_key(&masked_key, key, mask);
+	error = ovs_nla_copy_actions(a, &masked_key, 0, &acts);
+	if (error) {
+		OVS_NLERR("Flow actions may not be safe on all matching packets.\n");
+		kfree(acts);
+		return ERR_PTR(error);
+	}
+
+	return acts;
+}
+
 static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
 {
 	struct nlattr **a = info->attrs;
 	struct ovs_header *ovs_header = info->userhdr;
-	struct sw_flow_key key, masked_key;
+	struct sw_flow_key key;
 	struct sw_flow *flow;
 	struct sw_flow_mask mask;
 	struct sk_buff *reply = NULL;
@@ -959,17 +982,10 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
 
 	/* Validate actions. */
 	if (a[OVS_FLOW_ATTR_ACTIONS]) {
-		acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_FLOW_ATTR_ACTIONS]));
-		error = PTR_ERR(acts);
-		if (IS_ERR(acts))
+		acts = get_flow_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, &mask);
+		if (IS_ERR(acts)) {
+			error = PTR_ERR(acts);
 			goto error;
-
-		ovs_flow_mask_key(&masked_key, &key, &mask);
-		error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS],
-					     &masked_key, 0, &acts);
-		if (error) {
-			OVS_NLERR("Flow actions may not be safe on all matching packets.\n");
-			goto err_kfree_acts;
 		}
 	}
 
-- 
1.7.9.5

^ permalink raw reply related

* [net-next 4/6] openvswitch: Wrap struct ovs_key_ipv4_tunnel in a new structure.
From: Andy Zhou @ 2014-10-02  8:04 UTC (permalink / raw)
  To: davem; +Cc: netdev, Jesse Gross, Andy Zhou
In-Reply-To: <1412237085-27215-1-git-send-email-azhou@nicira.com>

From: Jesse Gross <jesse@nicira.com>

Currently, the flow information that is matched for tunnels and
the tunnel data passed around with packets is the same. However,
as additional information is added this is not necessarily desirable,
as in the case of pointers.

This adds a new structure for tunnel metadata which currently contains
only the existing struct. This change is purely internal to the kernel
since the current OVS_KEY_ATTR_IPV4_TUNNEL is simply a compressed version
of OVS_KEY_ATTR_TUNNEL that is translated at flow setup.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: Andy Zhou <azhou@nicira.com>
---
 include/uapi/linux/openvswitch.h |    2 +-
 net/openvswitch/actions.c        |    5 +++--
 net/openvswitch/datapath.h       |    2 +-
 net/openvswitch/flow.c           |    6 +++---
 net/openvswitch/flow.h           |   30 +++++++++++++++++-------------
 net/openvswitch/flow_netlink.c   |   38 +++++++++++++++++++++++++++++++-------
 net/openvswitch/vport-gre.c      |   16 +++++++++-------
 net/openvswitch/vport-vxlan.c    |   10 +++++-----
 net/openvswitch/vport.c          |    6 +++---
 net/openvswitch/vport.h          |    2 +-
 10 files changed, 74 insertions(+), 43 deletions(-)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 7c06106..6753032 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -294,7 +294,7 @@ enum ovs_key_attr {
 	OVS_KEY_ATTR_RECIRC_ID, /* u32 recirc id */
 
 #ifdef __KERNEL__
-	OVS_KEY_ATTR_IPV4_TUNNEL,  /* struct ovs_key_ipv4_tunnel */
+	OVS_KEY_ATTR_TUNNEL_INFO,  /* struct ovs_tunnel_info */
 #endif
 	__OVS_KEY_ATTR_MAX
 };
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 6932a42..006886d 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -590,8 +590,8 @@ static int execute_set_action(struct sk_buff *skb,
 		skb->mark = nla_get_u32(nested_attr);
 		break;
 
-	case OVS_KEY_ATTR_IPV4_TUNNEL:
-		OVS_CB(skb)->egress_tun_key = nla_data(nested_attr);
+	case OVS_KEY_ATTR_TUNNEL_INFO:
+		OVS_CB(skb)->egress_tun_info = nla_data(nested_attr);
 		break;
 
 	case OVS_KEY_ATTR_ETHERNET:
@@ -778,6 +778,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
 	acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts);
 
 	this_cpu_inc(exec_actions_level);
+	OVS_CB(skb)->egress_tun_info = NULL;
 	err = do_execute_actions(dp, skb, key,
 				 acts->actions, acts->actions_len);
 
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index ac3f3df..9741354 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -102,8 +102,8 @@ struct datapath {
  */
 struct ovs_skb_cb {
 	struct sw_flow		*flow;
+	struct ovs_tunnel_info  *egress_tun_info;
 	struct vport		*input_vport;
-	struct ovs_key_ipv4_tunnel  *egress_tun_key;
 };
 #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)
 
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 913bdc1..2924cb3 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -642,12 +642,12 @@ int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key)
 	return key_extract(skb, key);
 }
 
-int ovs_flow_key_extract(struct ovs_key_ipv4_tunnel *tun_key,
+int ovs_flow_key_extract(struct ovs_tunnel_info *tun_info,
 			 struct sk_buff *skb, struct sw_flow_key *key)
 {
 	/* Extract metadata from packet. */
-	if (tun_key)
-		memcpy(&key->tun_key, tun_key, sizeof(key->tun_key));
+	if (tun_info)
+		memcpy(&key->tun_key, &tun_info->tunnel, sizeof(key->tun_key));
 	else
 		memset(&key->tun_key, 0, sizeof(key->tun_key));
 
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 0f5db4e..fe5a71b 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -49,20 +49,24 @@ struct ovs_key_ipv4_tunnel {
 	u8   ipv4_ttl;
 } __packed __aligned(4); /* Minimize padding. */
 
-static inline void ovs_flow_tun_key_init(struct ovs_key_ipv4_tunnel *tun_key,
-					 const struct iphdr *iph, __be64 tun_id,
-					 __be16 tun_flags)
+struct ovs_tunnel_info {
+	struct ovs_key_ipv4_tunnel tunnel;
+};
+
+static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
+					  const struct iphdr *iph,
+					  __be64 tun_id, __be16 tun_flags)
 {
-	tun_key->tun_id = tun_id;
-	tun_key->ipv4_src = iph->saddr;
-	tun_key->ipv4_dst = iph->daddr;
-	tun_key->ipv4_tos = iph->tos;
-	tun_key->ipv4_ttl = iph->ttl;
-	tun_key->tun_flags = tun_flags;
+	tun_info->tunnel.tun_id = tun_id;
+	tun_info->tunnel.ipv4_src = iph->saddr;
+	tun_info->tunnel.ipv4_dst = iph->daddr;
+	tun_info->tunnel.ipv4_tos = iph->tos;
+	tun_info->tunnel.ipv4_ttl = iph->ttl;
+	tun_info->tunnel.tun_flags = tun_flags;
 
 	/* clear struct padding. */
-	memset((unsigned char *) tun_key + OVS_TUNNEL_KEY_SIZE, 0,
-	       sizeof(*tun_key) - OVS_TUNNEL_KEY_SIZE);
+	memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE, 0,
+	       sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE);
 }
 
 struct sw_flow_key {
@@ -190,8 +194,8 @@ void ovs_flow_stats_clear(struct sw_flow *);
 u64 ovs_flow_used_time(unsigned long flow_jiffies);
 
 int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key);
-int ovs_flow_key_extract(struct ovs_key_ipv4_tunnel *tun_key,
-			 struct sk_buff *skb, struct sw_flow_key *key);
+int ovs_flow_key_extract(struct ovs_tunnel_info *tun_info, struct sk_buff *skb,
+			 struct sw_flow_key *key);
 /* Extract key from packet coming from userspace. */
 int ovs_flow_key_extract_userspace(const struct nlattr *attr,
 				   struct sk_buff *skb,
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 22c855f..5d6194d 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -1148,13 +1148,14 @@ out:
 	return  (struct nlattr *) ((unsigned char *)(*sfa) + next_offset);
 }
 
-static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, int len)
+static struct nlattr *__add_action(struct sw_flow_actions **sfa,
+				   int attrtype, void *data, int len)
 {
 	struct nlattr *a;
 
 	a = reserve_sfa_size(sfa, nla_attr_size(len));
 	if (IS_ERR(a))
-		return PTR_ERR(a);
+		return a;
 
 	a->nla_type = attrtype;
 	a->nla_len = nla_attr_size(len);
@@ -1163,6 +1164,18 @@ static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, in
 		memcpy(nla_data(a), data, len);
 	memset((unsigned char *) a + a->nla_len, 0, nla_padlen(len));
 
+	return a;
+}
+
+static int add_action(struct sw_flow_actions **sfa, int attrtype,
+		      void *data, int len)
+{
+	struct nlattr *a;
+
+	a = __add_action(sfa, attrtype, data, len);
+	if (IS_ERR(a))
+		return PTR_ERR(a);
+
 	return 0;
 }
 
@@ -1268,6 +1281,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
 {
 	struct sw_flow_match match;
 	struct sw_flow_key key;
+	struct ovs_tunnel_info *tun_info;
+	struct nlattr *a;
 	int err, start;
 
 	ovs_match_init(&match, &key, NULL);
@@ -1279,8 +1294,14 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
 	if (start < 0)
 		return start;
 
-	err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &match.key->tun_key,
-			sizeof(match.key->tun_key));
+	a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL,
+			 sizeof(*tun_info));
+	if (IS_ERR(a))
+		return PTR_ERR(a);
+
+	tun_info = nla_data(a);
+	tun_info->tunnel = key.tun_key;
+
 	add_nested_action_end(*sfa, start);
 
 	return err;
@@ -1563,17 +1584,20 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
 	int err;
 
 	switch (key_type) {
-	case OVS_KEY_ATTR_IPV4_TUNNEL:
+	case OVS_KEY_ATTR_TUNNEL_INFO: {
+		struct ovs_tunnel_info *tun_info = nla_data(ovs_key);
+
 		start = nla_nest_start(skb, OVS_ACTION_ATTR_SET);
 		if (!start)
 			return -EMSGSIZE;
 
-		err = ipv4_tun_to_nlattr(skb, nla_data(ovs_key),
-					     nla_data(ovs_key));
+		err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel,
+					 nla_data(ovs_key));
 		if (err)
 			return err;
 		nla_nest_end(skb, start);
 		break;
+	}
 	default:
 		if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key))
 			return -EMSGSIZE;
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index 309cca6..fe768bd 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -63,8 +63,10 @@ static __be16 filter_tnl_flags(__be16 flags)
 static struct sk_buff *__build_header(struct sk_buff *skb,
 				      int tunnel_hlen)
 {
-	const struct ovs_key_ipv4_tunnel *tun_key = OVS_CB(skb)->egress_tun_key;
 	struct tnl_ptk_info tpi;
+	const struct ovs_key_ipv4_tunnel *tun_key;
+
+	tun_key = &OVS_CB(skb)->egress_tun_info->tunnel;
 
 	skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM));
 	if (IS_ERR(skb))
@@ -92,7 +94,7 @@ static __be64 key_to_tunnel_id(__be32 key, __be32 seq)
 static int gre_rcv(struct sk_buff *skb,
 		   const struct tnl_ptk_info *tpi)
 {
-	struct ovs_key_ipv4_tunnel tun_key;
+	struct ovs_tunnel_info tun_info;
 	struct ovs_net *ovs_net;
 	struct vport *vport;
 	__be64 key;
@@ -103,10 +105,10 @@ static int gre_rcv(struct sk_buff *skb,
 		return PACKET_REJECT;
 
 	key = key_to_tunnel_id(tpi->key, tpi->seq);
-	ovs_flow_tun_key_init(&tun_key, ip_hdr(skb), key,
-			      filter_tnl_flags(tpi->flags));
+	ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), key,
+			       filter_tnl_flags(tpi->flags));
 
-	ovs_vport_receive(vport, skb, &tun_key);
+	ovs_vport_receive(vport, skb, &tun_info);
 	return PACKET_RCVD;
 }
 
@@ -137,12 +139,12 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb)
 	__be16 df;
 	int err;
 
-	if (unlikely(!OVS_CB(skb)->egress_tun_key)) {
+	if (unlikely(!OVS_CB(skb)->egress_tun_info)) {
 		err = -EINVAL;
 		goto error;
 	}
 
-	tun_key = OVS_CB(skb)->egress_tun_key;
+	tun_key = &OVS_CB(skb)->egress_tun_info->tunnel;
 	/* Route lookup */
 	memset(&fl, 0, sizeof(fl));
 	fl.daddr = tun_key->ipv4_dst;
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index f19539b..5fbff2c 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -58,7 +58,7 @@ static inline struct vxlan_port *vxlan_vport(const struct vport *vport)
 /* Called with rcu_read_lock and BH disabled. */
 static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni)
 {
-	struct ovs_key_ipv4_tunnel tun_key;
+	struct ovs_tunnel_info tun_info;
 	struct vport *vport = vs->data;
 	struct iphdr *iph;
 	__be64 key;
@@ -66,9 +66,9 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni)
 	/* Save outer tunnel values */
 	iph = ip_hdr(skb);
 	key = cpu_to_be64(ntohl(vx_vni) >> 8);
-	ovs_flow_tun_key_init(&tun_key, iph, key, TUNNEL_KEY);
+	ovs_flow_tun_info_init(&tun_info, iph, key, TUNNEL_KEY);
 
-	ovs_vport_receive(vport, skb, &tun_key);
+	ovs_vport_receive(vport, skb, &tun_info);
 }
 
 static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb)
@@ -147,12 +147,12 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
 	__be16 df;
 	int err;
 
-	if (unlikely(!OVS_CB(skb)->egress_tun_key)) {
+	if (unlikely(!OVS_CB(skb)->egress_tun_info)) {
 		err = -EINVAL;
 		goto error;
 	}
 
-	tun_key = OVS_CB(skb)->egress_tun_key;
+	tun_key = &OVS_CB(skb)->egress_tun_info->tunnel;
 	/* Route lookup */
 	memset(&fl, 0, sizeof(fl));
 	fl.daddr = tun_key->ipv4_dst;
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 5df8377..3e50ee8 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -432,7 +432,7 @@ u32 ovs_vport_find_upcall_portid(const struct vport *p, struct sk_buff *skb)
  * skb->data should point to the Ethernet header.
  */
 void ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
-		       struct ovs_key_ipv4_tunnel *tun_key)
+		       struct ovs_tunnel_info *tun_info)
 {
 	struct pcpu_sw_netstats *stats;
 	struct sw_flow_key key;
@@ -445,9 +445,9 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
 	u64_stats_update_end(&stats->syncp);
 
 	OVS_CB(skb)->input_vport = vport;
-	OVS_CB(skb)->egress_tun_key = NULL;
+	OVS_CB(skb)->egress_tun_info = NULL;
 	/* Extract flow from 'skb' into 'key'. */
-	error = ovs_flow_key_extract(tun_key, skb, &key);
+	error = ovs_flow_key_extract(tun_info, skb, &key);
 	if (unlikely(error)) {
 		kfree_skb(skb);
 		return;
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index 0efd62f..e28964a 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -207,7 +207,7 @@ static inline struct vport *vport_from_priv(void *priv)
 }
 
 void ovs_vport_receive(struct vport *, struct sk_buff *,
-		       struct ovs_key_ipv4_tunnel *);
+		       struct ovs_tunnel_info *);
 
 /* List of statically compiled vport implementations.  Don't forget to also
  * add yours to the list at the top of vport.c. */
-- 
1.7.9.5

^ permalink raw reply related

* [net-next 3/6] openvswitch: Add support for matching on OAM packets.
From: Andy Zhou @ 2014-10-02  8:04 UTC (permalink / raw)
  To: davem; +Cc: netdev, Jesse Gross, Andy Zhou
In-Reply-To: <1412237085-27215-1-git-send-email-azhou@nicira.com>

From: Jesse Gross <jesse@nicira.com>

Some tunnel formats have mechanisms for indicating that packets are
OAM frames that should be handled specially (either as high priority or
not forwarded beyond an endpoint). This provides support for allowing
those types of packets to be matched.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: Andy Zhou <azhou@nicira.com>
---
 include/uapi/linux/openvswitch.h |    1 +
 net/openvswitch/datapath.c       |    1 +
 net/openvswitch/flow_netlink.c   |   17 ++++++++++++-----
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index f7fc507..7c06106 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -309,6 +309,7 @@ enum ovs_tunnel_key_attr {
 	OVS_TUNNEL_KEY_ATTR_TTL,                /* u8 Tunnel IP TTL. */
 	OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT,      /* No argument, set DF. */
 	OVS_TUNNEL_KEY_ATTR_CSUM,               /* No argument. CSUM packet. */
+	OVS_TUNNEL_KEY_ATTR_OAM,                /* No argument. OAM frame.  */
 	__OVS_TUNNEL_KEY_ATTR_MAX
 };
 
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 9e3a2fa..f6bd93d 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -369,6 +369,7 @@ static size_t key_attr_size(void)
 		  + nla_total_size(1)   /* OVS_TUNNEL_KEY_ATTR_TTL */
 		  + nla_total_size(0)   /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */
 		  + nla_total_size(0)   /* OVS_TUNNEL_KEY_ATTR_CSUM */
+		  + nla_total_size(0)   /* OVS_TUNNEL_KEY_ATTR_OAM */
 		+ nla_total_size(4)   /* OVS_KEY_ATTR_IN_PORT */
 		+ nla_total_size(4)   /* OVS_KEY_ATTR_SKB_MARK */
 		+ nla_total_size(12)  /* OVS_KEY_ATTR_ETHERNET */
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index f4c8daa..22c855f 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -346,6 +346,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
 			[OVS_TUNNEL_KEY_ATTR_TTL] = 1,
 			[OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0,
 			[OVS_TUNNEL_KEY_ATTR_CSUM] = 0,
+			[OVS_TUNNEL_KEY_ATTR_OAM] = 0,
 		};
 
 		if (type > OVS_TUNNEL_KEY_ATTR_MAX) {
@@ -390,6 +391,9 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
 		case OVS_TUNNEL_KEY_ATTR_CSUM:
 			tun_flags |= TUNNEL_CSUM;
 			break;
+		case OVS_TUNNEL_KEY_ATTR_OAM:
+			tun_flags |= TUNNEL_OAM;
+			break;
 		default:
 			return -EINVAL;
 		}
@@ -431,21 +435,24 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb,
 	    nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id))
 		return -EMSGSIZE;
 	if (output->ipv4_src &&
-		nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src))
+	    nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src))
 		return -EMSGSIZE;
 	if (output->ipv4_dst &&
-		nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst))
+	    nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst))
 		return -EMSGSIZE;
 	if (output->ipv4_tos &&
-		nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos))
+	    nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos))
 		return -EMSGSIZE;
 	if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl))
 		return -EMSGSIZE;
 	if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) &&
-		nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT))
+	    nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT))
 		return -EMSGSIZE;
 	if ((output->tun_flags & TUNNEL_CSUM) &&
-		nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM))
+	    nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM))
+		return -EMSGSIZE;
+	if ((output->tun_flags & TUNNEL_OAM) &&
+	    nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM))
 		return -EMSGSIZE;
 
 	nla_nest_end(skb, nla);
-- 
1.7.9.5

^ permalink raw reply related

* [net-next 2/6] openvswitch: Eliminate memset() from flow_extract.
From: Andy Zhou @ 2014-10-02  8:04 UTC (permalink / raw)
  To: davem; +Cc: netdev, Jesse Gross, Andy Zhou
In-Reply-To: <1412237085-27215-1-git-send-email-azhou@nicira.com>

From: Jesse Gross <jesse@nicira.com>

As new protocols are added, the size of the flow key tends to
increase although few protocols care about all of the fields. In
order to optimize this for hashing and matching, OVS uses a variable
length portion of the key. However, when fields are extracted from
the packet we must still zero out the entire key.

This is no longer necessary now that OVS implements masking. Any
fields (or holes in the structure) which are not part of a given
protocol will be by definition not part of the mask and zeroed out
during lookup. Furthermore, since masking already uses variable
length keys this zeroing operation automatically benefits as well.

In principle, the only thing that needs to be done at this point
is to remove the memset() at the beginning of flow extraction. However, some
fields assume that they are initialized to zero, which now must be
done explicitly. In addition, in the event of an error we must also
zero out corresponding fields to signal that there is no valid data
present. These increase the total amount of code but very little of
it is executed in non-error situations.

Removing the memset() reduces the profile of ovs_flow_extract()
from 0.64% to 0.56% when tested with large packets on a 10G link.

Suggested-by: Pravin Shelar <pshelar@nicira.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: Andy Zhou <azhou@nicira.com>
---
 net/openvswitch/flow.c |   54 ++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 45 insertions(+), 9 deletions(-)

diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 4010423..913bdc1 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -462,6 +462,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 	 * update skb->csum here.
 	 */
 
+	key->eth.tci = 0;
 	if (vlan_tx_tag_present(skb))
 		key->eth.tci = htons(skb->vlan_tci);
 	else if (eth->h_proto == htons(ETH_P_8021Q))
@@ -482,6 +483,8 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 
 		error = check_iphdr(skb);
 		if (unlikely(error)) {
+			memset(&key->ip, 0, sizeof(key->ip));
+			memset(&key->ipv4, 0, sizeof(key->ipv4));
 			if (error == -EINVAL) {
 				skb->transport_header = skb->network_header;
 				error = 0;
@@ -503,8 +506,10 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 			return 0;
 		}
 		if (nh->frag_off & htons(IP_MF) ||
-			 skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
+			skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
 			key->ip.frag = OVS_FRAG_TYPE_FIRST;
+		else
+			key->ip.frag = OVS_FRAG_TYPE_NONE;
 
 		/* Transport layer. */
 		if (key->ip.proto == IPPROTO_TCP) {
@@ -513,18 +518,25 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 				key->tp.src = tcp->source;
 				key->tp.dst = tcp->dest;
 				key->tp.flags = TCP_FLAGS_BE16(tcp);
+			} else {
+				memset(&key->tp, 0, sizeof(key->tp));
 			}
+
 		} else if (key->ip.proto == IPPROTO_UDP) {
 			if (udphdr_ok(skb)) {
 				struct udphdr *udp = udp_hdr(skb);
 				key->tp.src = udp->source;
 				key->tp.dst = udp->dest;
+			} else {
+				memset(&key->tp, 0, sizeof(key->tp));
 			}
 		} else if (key->ip.proto == IPPROTO_SCTP) {
 			if (sctphdr_ok(skb)) {
 				struct sctphdr *sctp = sctp_hdr(skb);
 				key->tp.src = sctp->source;
 				key->tp.dst = sctp->dest;
+			} else {
+				memset(&key->tp, 0, sizeof(key->tp));
 			}
 		} else if (key->ip.proto == IPPROTO_ICMP) {
 			if (icmphdr_ok(skb)) {
@@ -534,33 +546,44 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 				 * them in 16-bit network byte order. */
 				key->tp.src = htons(icmp->type);
 				key->tp.dst = htons(icmp->code);
+			} else {
+				memset(&key->tp, 0, sizeof(key->tp));
 			}
 		}
 
-	} else if ((key->eth.type == htons(ETH_P_ARP) ||
-		   key->eth.type == htons(ETH_P_RARP)) && arphdr_ok(skb)) {
+	} else if (key->eth.type == htons(ETH_P_ARP) ||
+		   key->eth.type == htons(ETH_P_RARP)) {
 		struct arp_eth_header *arp;
 
 		arp = (struct arp_eth_header *)skb_network_header(skb);
 
-		if (arp->ar_hrd == htons(ARPHRD_ETHER)
-				&& arp->ar_pro == htons(ETH_P_IP)
-				&& arp->ar_hln == ETH_ALEN
-				&& arp->ar_pln == 4) {
+		if (arphdr_ok(skb) &&
+		    arp->ar_hrd == htons(ARPHRD_ETHER) &&
+		    arp->ar_pro == htons(ETH_P_IP) &&
+		    arp->ar_hln == ETH_ALEN &&
+		    arp->ar_pln == 4) {
 
 			/* We only match on the lower 8 bits of the opcode. */
 			if (ntohs(arp->ar_op) <= 0xff)
 				key->ip.proto = ntohs(arp->ar_op);
+			else
+				key->ip.proto = 0;
+
 			memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src));
 			memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst));
 			ether_addr_copy(key->ipv4.arp.sha, arp->ar_sha);
 			ether_addr_copy(key->ipv4.arp.tha, arp->ar_tha);
+		} else {
+			memset(&key->ip, 0, sizeof(key->ip));
+			memset(&key->ipv4, 0, sizeof(key->ipv4));
 		}
 	} else if (key->eth.type == htons(ETH_P_IPV6)) {
 		int nh_len;             /* IPv6 Header + Extensions */
 
 		nh_len = parse_ipv6hdr(skb, key);
 		if (unlikely(nh_len < 0)) {
+			memset(&key->ip, 0, sizeof(key->ip));
+			memset(&key->ipv6.addr, 0, sizeof(key->ipv6.addr));
 			if (nh_len == -EINVAL) {
 				skb->transport_header = skb->network_header;
 				error = 0;
@@ -582,24 +605,32 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 				key->tp.src = tcp->source;
 				key->tp.dst = tcp->dest;
 				key->tp.flags = TCP_FLAGS_BE16(tcp);
+			} else {
+				memset(&key->tp, 0, sizeof(key->tp));
 			}
 		} else if (key->ip.proto == NEXTHDR_UDP) {
 			if (udphdr_ok(skb)) {
 				struct udphdr *udp = udp_hdr(skb);
 				key->tp.src = udp->source;
 				key->tp.dst = udp->dest;
+			} else {
+				memset(&key->tp, 0, sizeof(key->tp));
 			}
 		} else if (key->ip.proto == NEXTHDR_SCTP) {
 			if (sctphdr_ok(skb)) {
 				struct sctphdr *sctp = sctp_hdr(skb);
 				key->tp.src = sctp->source;
 				key->tp.dst = sctp->dest;
+			} else {
+				memset(&key->tp, 0, sizeof(key->tp));
 			}
 		} else if (key->ip.proto == NEXTHDR_ICMP) {
 			if (icmp6hdr_ok(skb)) {
 				error = parse_icmpv6(skb, key, nh_len);
 				if (error)
 					return error;
+			} else {
+				memset(&key->tp, 0, sizeof(key->tp));
 			}
 		}
 	}
@@ -615,13 +646,19 @@ int ovs_flow_key_extract(struct ovs_key_ipv4_tunnel *tun_key,
 			 struct sk_buff *skb, struct sw_flow_key *key)
 {
 	/* Extract metadata from packet. */
-	memset(key, 0, sizeof(*key));
 	if (tun_key)
 		memcpy(&key->tun_key, tun_key, sizeof(key->tun_key));
+	else
+		memset(&key->tun_key, 0, sizeof(key->tun_key));
 
 	key->phy.priority = skb->priority;
 	key->phy.in_port = OVS_CB(skb)->input_vport->port_no;
 	key->phy.skb_mark = skb->mark;
+	key->ovs_flow_hash = 0;
+	key->recirc_id = 0;
+
+	/* Flags are always used as part of stats */
+	key->tp.flags = 0;
 
 	return key_extract(skb, key);
 }
@@ -632,7 +669,6 @@ int ovs_flow_key_extract_userspace(const struct nlattr *attr,
 {
 	int err;
 
-	memset(key, 0, sizeof(*key));
 	/* Extract metadata from netlink attributes. */
 	err = ovs_nla_get_flow_metadata(attr, key);
 	if (err)
-- 
1.7.9.5

^ permalink raw reply related

* [net-next 1/6] net: Add Geneve tunneling protocol driver
From: Andy Zhou @ 2014-10-02  8:04 UTC (permalink / raw)
  To: davem; +Cc: netdev, Andy Zhou, Jesse Gross
In-Reply-To: <1412237085-27215-1-git-send-email-azhou@nicira.com>

This adds device-level support for Geneve -- Generic Network
Virtualization Encapsulation. The protocol is documented at
http://tools.ietf.org/html/draft-gross-geneve-01

Only protocol layer Geneve support is provided by this driver.
Openvswitch can be used to configure, set up and tear down
functional Geneve tunnels.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: Andy Zhou <azhou@nicira.com>
---
 include/net/geneve.h     |   91 +++++++++++
 include/net/ip_tunnels.h |    2 +
 net/ipv4/Kconfig         |   14 ++
 net/ipv4/Makefile        |    1 +
 net/ipv4/geneve.c        |  373 ++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 481 insertions(+)
 create mode 100644 include/net/geneve.h
 create mode 100644 net/ipv4/geneve.c

diff --git a/include/net/geneve.h b/include/net/geneve.h
new file mode 100644
index 0000000..ce98865
--- /dev/null
+++ b/include/net/geneve.h
@@ -0,0 +1,91 @@
+#ifndef __NET_GENEVE_H
+#define __NET_GENEVE_H  1
+
+#include <net/udp_tunnel.h>
+
+struct geneve_sock;
+
+typedef void (geneve_rcv_t)(struct geneve_sock *gs, struct sk_buff *skb);
+
+struct geneve_sock {
+	struct hlist_node	hlist;
+	geneve_rcv_t		*rcv;
+	void			*rcv_data;
+	struct work_struct	del_work;
+	struct socket		*sock;
+	struct rcu_head		rcu;
+	atomic_t		refcnt;
+	struct udp_offload	udp_offloads;
+};
+
+/* Geneve Header:
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |Ver|  Opt Len  |O|C|    Rsvd.  |          Protocol Type        |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |        Virtual Network Identifier (VNI)       |    Reserved   |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |                    Variable Length Options                    |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * Option Header:
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |          Option Class         |      Type     |R|R|R| Length  |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |                      Variable Option Data                     |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+
+struct geneve_opt {
+	__be16	opt_class;
+	u8	type;
+#ifdef __LITTLE_ENDIAN_BITFIELD
+	u8	length:5;
+	u8	r3:1;
+	u8	r2:1;
+	u8	r1:1;
+#else
+	u8	r1:1;
+	u8	r2:1;
+	u8	r3:1;
+	u8	length:5;
+#endif
+	u8	opt_data[];
+};
+
+#define GENEVE_CRIT_OPT_TYPE (1 << 7)
+
+struct genevehdr {
+#ifdef __LITTLE_ENDIAN_BITFIELD
+	u8 opt_len:6;
+	u8 ver:2;
+	u8 rsvd1:6;
+	u8 critical:1;
+	u8 oam:1;
+#else
+	u8 ver:2;
+	u8 opt_len:6;
+	u8 oam:1;
+	u8 critical:1;
+	u8 rsvd1:6;
+#endif
+	__be16 proto_type;
+	u8 vni[3];
+	u8 rsvd2;
+	struct geneve_opt options[];
+};
+
+#define GENEVE_VER 0
+#define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr))
+
+struct geneve_sock *geneve_sock_add(struct net *net, __be16 port,
+				    geneve_rcv_t *rcv, void *data,
+				    bool no_share, bool ipv6);
+
+void geneve_sock_release(struct geneve_sock *vs);
+
+int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
+		    struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos,
+		    __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
+		    __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
+		    bool xnet);
+#endif
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 7f538ba..a9ce155 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -95,6 +95,8 @@ struct ip_tunnel {
 #define TUNNEL_VERSION	__cpu_to_be16(0x40)
 #define TUNNEL_NO_KEY	__cpu_to_be16(0x80)
 #define TUNNEL_DONT_FRAGMENT    __cpu_to_be16(0x0100)
+#define TUNNEL_OAM	__cpu_to_be16(0x0200)
+#define TUNNEL_CRIT_OPT	__cpu_to_be16(0x0400)
 
 struct tnl_ptk_info {
 	__be16 flags;
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 69fb378..15ce6b0 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -453,6 +453,20 @@ config TCP_CONG_BIC
 	increase provides TCP friendliness.
 	See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
 
+config GENEVE
+       tristate "Generic Network Virtualization Encapsulation (Geneve)"
+       depends on INET
+       select NET_IP_TUNNEL
+       select NET_UDP_TUNNEL
+       ---help---
+	  This allows one to create Geneve virtual interfaces that provide
+	  Layer 2 Networks over Layer 3 Networks. Geneve is often used
+	  to tunnel virtual network infrastructure in virtualized environments.
+	  For more information see:
+	    http://tools.ietf.org/html/draft-gross-geneve-01
+
+	  To compile this driver as a module, choose M here: the module will be called geneve.
+
 config TCP_CONG_CUBIC
 	tristate "CUBIC TCP"
 	default y
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index d810578..518c04e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -56,6 +56,7 @@ obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
 obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
 obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
 obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
+obj-$(CONFIG_GENEVE) += geneve.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
 		      xfrm4_output.o xfrm4_protocol.o
diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c
new file mode 100644
index 0000000..f008c55
--- /dev/null
+++ b/net/ipv4/geneve.c
@@ -0,0 +1,373 @@
+/*
+ * Geneve: Generic Network Virtualization Encapsulation
+ *
+ * Copyright (c) 2014 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/rculist.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/igmp.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/hash.h>
+#include <linux/ethtool.h>
+#include <net/arp.h>
+#include <net/ndisc.h>
+#include <net/ip.h>
+#include <net/ip_tunnels.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/rtnetlink.h>
+#include <net/route.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/geneve.h>
+#include <net/protocol.h>
+#include <net/udp_tunnel.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+#include <net/ip6_tunnel.h>
+#include <net/ip6_checksum.h>
+#endif
+
+#define PORT_HASH_BITS 8
+#define PORT_HASH_SIZE (1<<PORT_HASH_BITS)
+
+/* per-network namespace private data for this module */
+struct geneve_net {
+	struct hlist_head	sock_list[PORT_HASH_SIZE];
+	spinlock_t		sock_lock;   /* Protects sock_list */
+};
+
+static int geneve_net_id;
+
+static struct workqueue_struct *geneve_wq;
+
+static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
+{
+	return (struct genevehdr *)(udp_hdr(skb) + 1);
+}
+
+static struct hlist_head *gs_head(struct net *net, __be16 port)
+{
+	struct geneve_net *gn = net_generic(net, geneve_net_id);
+
+	return &gn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)];
+}
+
+/* Find geneve socket based on network namespace and UDP port */
+static struct geneve_sock *geneve_find_sock(struct net *net, __be16 port)
+{
+	struct geneve_sock *gs;
+
+	hlist_for_each_entry_rcu(gs, gs_head(net, port), hlist) {
+		if (inet_sk(gs->sock->sk)->inet_sport == port)
+			return gs;
+	}
+
+	return NULL;
+}
+
+static void geneve_build_header(struct genevehdr *geneveh,
+				__be16 tun_flags, u8 vni[3],
+				u8 options_len, u8 *options)
+{
+	geneveh->ver = GENEVE_VER;
+	geneveh->opt_len = options_len / 4;
+	geneveh->oam = !!(tun_flags & TUNNEL_OAM);
+	geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT);
+	geneveh->rsvd1 = 0;
+	memcpy(geneveh->vni, vni, 3);
+	geneveh->proto_type = htons(ETH_P_TEB);
+	geneveh->rsvd2 = 0;
+
+	memcpy(geneveh->options, options, options_len);
+}
+
+/* Transmit a fully formatted Geneve frame.
+ *
+ * When calling this function, skb->data should point
+ * to the geneve header which is fully formed.
+ *
+ * This function will add other UDP tunnel headers.
+ */
+int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
+		    struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos,
+		    __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
+		    __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
+		    bool xnet)
+{
+	struct genevehdr *gnvh;
+	int min_headroom;
+	int err;
+
+	skb = udp_tunnel_handle_offloads(skb, !gs->sock->sk->sk_no_check_tx);
+
+	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
+			+ GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr)
+			+ (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
+
+	err = skb_cow_head(skb, min_headroom);
+	if (unlikely(err))
+		return err;
+
+	if (vlan_tx_tag_present(skb)) {
+		if (unlikely(!__vlan_put_tag(skb,
+					     skb->vlan_proto,
+					     vlan_tx_tag_get(skb)))) {
+			err = -ENOMEM;
+			return err;
+		}
+		skb->vlan_tci = 0;
+	}
+
+	gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len);
+	geneve_build_header(gnvh, tun_flags, vni, opt_len, opt);
+
+	return udp_tunnel_xmit_skb(gs->sock, rt, skb, src, dst,
+				   tos, ttl, df, src_port, dst_port, xnet);
+}
+EXPORT_SYMBOL_GPL(geneve_xmit_skb);
+
+static void geneve_notify_add_rx_port(struct geneve_sock *gs)
+{
+	struct sock *sk = gs->sock->sk;
+	sa_family_t sa_family = sk->sk_family;
+	int err;
+
+	if (sa_family == AF_INET) {
+		err = udp_add_offload(&gs->udp_offloads);
+		if (err)
+			pr_warn("geneve: udp_add_offload failed with status %d\n",
+				err);
+	}
+}
+
+/* Callback from net/ipv4/udp.c to receive packets */
+static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
+{
+	struct genevehdr *geneveh;
+	struct geneve_sock *gs;
+	int opts_len;
+
+	/* Need UDP and Geneve base headers to be present */
+	if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN)))
+		goto error;
+
+	/* Return packets with reserved bits set */
+	geneveh = geneve_hdr(skb);
+
+	if (unlikely(geneveh->ver != GENEVE_VER))
+		goto error;
+
+	if (unlikely(geneveh->proto_type != htons(ETH_P_TEB)))
+		goto error;
+
+	opts_len = geneveh->opt_len * 4;
+	if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len,
+				 htons(ETH_P_TEB)))
+		goto drop;
+
+	gs = rcu_dereference_sk_user_data(sk);
+	if (!gs)
+		goto drop;
+
+	gs->rcv(gs, skb);
+	return 0;
+
+drop:
+	/* Consume bad packet */
+	kfree_skb(skb);
+	return 0;
+
+error:
+	/* Let the UDP layer deal with the skb */
+	return 1;
+}
+
+static void geneve_del_work(struct work_struct *work)
+{
+	struct geneve_sock *gs = container_of(work, struct geneve_sock,
+					      del_work);
+
+	udp_tunnel_sock_release(gs->sock);
+	kfree_rcu(gs, rcu);
+}
+
+static struct socket *geneve_create_sock(struct net *net, bool ipv6,
+					 __be16 port)
+{
+	struct socket *sock;
+	struct udp_port_cfg udp_conf;
+	int err;
+
+	memset(&udp_conf, 0, sizeof(udp_conf));
+
+	if (ipv6) {
+		udp_conf.family = AF_INET6;
+	} else {
+		udp_conf.family = AF_INET;
+		udp_conf.local_ip.s_addr = INADDR_ANY;
+	}
+
+	udp_conf.local_udp_port = port;
+
+	/* Open UDP socket */
+	err = udp_sock_create(net, &udp_conf, &sock);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	return sock;
+}
+
+/* Create new listen socket if needed */
+static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
+						geneve_rcv_t *rcv, void *data,
+						bool ipv6)
+{
+	struct geneve_net *gn = net_generic(net, geneve_net_id);
+	struct geneve_sock *gs;
+	struct socket *sock;
+	struct udp_tunnel_sock_cfg tunnel_cfg;
+
+	gs = kzalloc(sizeof(*gs), GFP_KERNEL);
+	if (!gs)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_WORK(&gs->del_work, geneve_del_work);
+
+	sock = geneve_create_sock(net, ipv6, port);
+	if (IS_ERR(sock)) {
+		kfree(gs);
+		return ERR_CAST(sock);
+	}
+
+	gs->sock = sock;
+	atomic_set(&gs->refcnt, 1);
+	gs->rcv = rcv;
+	gs->rcv_data = data;
+
+	/* Initialize the geneve udp offloads structure */
+	gs->udp_offloads.port = port;
+	gs->udp_offloads.callbacks.gro_receive = NULL;
+	gs->udp_offloads.callbacks.gro_complete = NULL;
+
+	spin_lock(&gn->sock_lock);
+	hlist_add_head_rcu(&gs->hlist, gs_head(net, port));
+	geneve_notify_add_rx_port(gs);
+	spin_unlock(&gn->sock_lock);
+
+	/* Mark socket as an encapsulation socket */
+	tunnel_cfg.sk_user_data = gs;
+	tunnel_cfg.encap_type = 1;
+	tunnel_cfg.encap_rcv = geneve_udp_encap_recv;
+	tunnel_cfg.encap_destroy = NULL;
+	setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
+
+	return gs;
+}
+
+struct geneve_sock *geneve_sock_add(struct net *net, __be16 port,
+				    geneve_rcv_t *rcv, void *data,
+				    bool no_share, bool ipv6)
+{
+	struct geneve_sock *gs;
+
+	gs = geneve_socket_create(net, port, rcv, data, ipv6);
+	if (!IS_ERR(gs))
+		return gs;
+
+	if (no_share)	/* Return error if sharing is not allowed. */
+		return ERR_PTR(-EINVAL);
+
+	gs = geneve_find_sock(net, port);
+	if (gs) {
+		if (gs->rcv == rcv)
+			atomic_inc(&gs->refcnt);
+		else
+			gs = ERR_PTR(-EBUSY);
+	} else {
+		gs = ERR_PTR(-EINVAL);
+	}
+
+	return gs;
+}
+EXPORT_SYMBOL_GPL(geneve_sock_add);
+
+void geneve_sock_release(struct geneve_sock *gs)
+{
+	if (!atomic_dec_and_test(&gs->refcnt))
+		return;
+
+	queue_work(geneve_wq, &gs->del_work);
+}
+EXPORT_SYMBOL_GPL(geneve_sock_release);
+
+static __net_init int geneve_init_net(struct net *net)
+{
+	struct geneve_net *gn = net_generic(net, geneve_net_id);
+	unsigned int h;
+
+	spin_lock_init(&gn->sock_lock);
+
+	for (h = 0; h < PORT_HASH_SIZE; ++h)
+		INIT_HLIST_HEAD(&gn->sock_list[h]);
+
+	return 0;
+}
+
+static struct pernet_operations geneve_net_ops = {
+	.init = geneve_init_net,
+	.exit = NULL,
+	.id   = &geneve_net_id,
+	.size = sizeof(struct geneve_net),
+};
+
+static int __init geneve_init_module(void)
+{
+	int rc;
+
+	geneve_wq = alloc_workqueue("geneve", 0, 0);
+	if (!geneve_wq)
+		return -ENOMEM;
+
+	rc = register_pernet_subsys(&geneve_net_ops);
+	if (rc)
+		return rc;
+
+	pr_info("Geneve driver\n");
+
+	return 0;
+}
+late_initcall(geneve_init_module);
+
+static void __exit geneve_cleanup_module(void)
+{
+	destroy_workqueue(geneve_wq);
+}
+module_exit(geneve_cleanup_module);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jesse Gross <jesse@nicira.com>");
+MODULE_DESCRIPTION("Driver for GENEVE encapsulated traffic");
+MODULE_ALIAS_RTNL_LINK("geneve");
-- 
1.7.9.5

^ permalink raw reply related

* [net-next 0/6] Add Geneve tunnel protocol support
From: Andy Zhou @ 2014-10-02  8:04 UTC (permalink / raw)
  To: davem; +Cc: netdev, Andy Zhou

This patch series adds kernel support for Geneve (Generic Network
Virtualization Encapsulation) based on Geneve IETF draft: 
http://www.ietf.org/id/draft-gross-geneve-01.txt

Patch 1 implements the Geneve tunneling protocol driver.

Patches 2-6 add openvswitch support for creating and using
Geneve tunnels by OVS user space.


Andy Zhou (1):
  net: Add Geneve tunneling protocol driver

Jesse Gross (5):
  openvswitch: Eliminate memset() from flow_extract.
  openvswitch: Add support for matching on OAM packets.
  openvswitch: Wrap struct ovs_key_ipv4_tunnel in a new structure.
  openvswitch: Factor out allocation and verification of actions.
  openvswitch: Add support for Geneve tunneling.

 include/net/geneve.h             |   91 ++++++++++
 include/net/ip_tunnels.h         |   19 +-
 include/uapi/linux/openvswitch.h |    5 +-
 net/ipv4/Kconfig                 |   14 ++
 net/ipv4/Makefile                |    1 +
 net/ipv4/geneve.c                |  373 ++++++++++++++++++++++++++++++++++++++
 net/openvswitch/Kconfig          |   11 ++
 net/openvswitch/Makefile         |    4 +
 net/openvswitch/actions.c        |    5 +-
 net/openvswitch/datapath.c       |   44 +++--
 net/openvswitch/datapath.h       |    2 +-
 net/openvswitch/flow.c           |   76 ++++++--
 net/openvswitch/flow.h           |   48 +++--
 net/openvswitch/flow_netlink.c   |  227 +++++++++++++++++++----
 net/openvswitch/vport-geneve.c   |  236 ++++++++++++++++++++++++
 net/openvswitch/vport-gre.c      |   16 +-
 net/openvswitch/vport-vxlan.c    |   10 +-
 net/openvswitch/vport.c          |    9 +-
 net/openvswitch/vport.h          |    3 +-
 19 files changed, 1093 insertions(+), 101 deletions(-)
 create mode 100644 include/net/geneve.h
 create mode 100644 net/ipv4/geneve.c
 create mode 100644 net/openvswitch/vport-geneve.c

-- 
1.7.9.5

^ permalink raw reply

* Re: [PATCH v2 net-next] mlx4: optimize xmit path
From: Amir Vadai @ 2014-10-02  8:03 UTC (permalink / raw)
  To: Eric Dumazet, Or Gerlitz
  Cc: Alexei Starovoitov, David S. Miller, Jesper Dangaard Brouer,
	Eric Dumazet, John Fastabend, Linux Netdev List, Or Gerlitz,
	amira, idos, Yevgeny Petrilin, eyalpe
In-Reply-To: <1412224524.16704.75.camel@edumazet-glaptop2.roam.corp.google.com>

On 10/2/2014 7:35 AM, Eric Dumazet wrote:
> On Sun, 2014-09-28 at 21:19 -0700, Eric Dumazet wrote:
>> From: Eric Dumazet <edumazet@google.com>
> 

[...]

> Sorry, there is a missing replacement of 
> 
> 	iowrite32be(ring->doorbell_qpn,
> 		    ring->bf.uar->map + MLX4_SEND_DOORBELL);
> 
> by iowrite32(ring->doorbell_qpn,
> 	     ring->bf.uar->map + MLX4_SEND_DOORBELL);
> 
> Since doorbell_qpn was changed to a __be32 and set up in
> mlx4_en_activate_tx_ring()
> 

Hi,

Will take it into the split patchset - we just hit this bug when we tried
to run benchmarks with blueflame disabled (easy to test by using the ethtool
priv flag blueflame).

I'm still working on it, but I can't reproduce the numbers that you
show. On my development machine, I get ~5.5Mpps with burst=8 and ~2Mpps
with burst=1.

In addition, I see no improvements when adding the optimization to the
xmit path.
I use the net-next kernel + pktgen burst support patch, with and without
this xmit path optimization patch.

Do you use other patches not upstream in your environment?
Can you share the .config/pktgen configuration?

One other note: we're now checking whether blueflame can be used with
xmit_more. It might result in packet reordering/drops. Still under
investigation.

Thanks,
Amir

^ permalink raw reply

* [PATCH net-next] r8152: nway reset after setting eee
From: Hayes Wang @ 2014-10-02  8:01 UTC (permalink / raw)
  To: netdev; +Cc: nic_swsd, linux-kernel, linux-usb, Hayes Wang

Restart autonegotiation is necessary after setting EEE.

Signed-off-by: Hayes Wang <hayeswang@realtek.com>
---
 drivers/net/usb/r8152.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index a4d4c4a..e4dcfc1 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -3480,6 +3480,9 @@ rtl_ethtool_set_eee(struct net_device *net, struct ethtool_eee *edata)
 
 	ret = tp->rtl_ops.eee_set(tp, edata);
 
+	if (ret == 0)
+		ret = mii_nway_restart(&tp->mii);
+
 	usb_autopm_put_interface(tp->intf);
 
 out:
-- 
1.9.3

^ permalink raw reply related

* RE: ASIX 88772
From: Stam, Michel [FINT] @ 2014-10-02  7:50 UTC (permalink / raw)
  To: Stam, Michel [FINT], netdev
In-Reply-To: <C89EFD3CD56F64468D3D206D683A8D2203863DD8@ldam-msx2.fugro-nl.local>

LS,

Is it possible for someone to apply this patch?

Best regards,

Michel Stam

-----Original Message-----
From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org]
On Behalf Of Stam, Michel [FINT]
Sent: Monday, September 29, 2014 15:46 PM
To: netdev@vger.kernel.org
Subject: ASIX 88772

Dear list,

A while back we did an upgrade of the firmware running on our embedded
devices. These devices, amongst others, use an ASIX chip, model 88772A.

What we noticed, was that ethtool settings would be negated when the
interface was set to 'up'. This, effectively voided any control we tried
to exercise over the autonegotiation process (it always returns back to
100 Mbps/Full duplex). After comparing with the kernel we used before
(we used 2.6.22 before, now 3.10.34), I discovered that the difference
was the .link_reset function pointer defined in the driver_info struct
in drivers/net/usb/asix_devices.c:888. By setting it back to its
previous value, ax88772_link_reset, the functionality is restored and
ethtool behaves as expected.

Apparently this change to drivers/net/usb/asix_devices.c happened at
line 888 as part of commit 2e55cc721, which moved the driver into its
own file with that commit. It apparently also addresses some link reset
problems.

Would anyone have any idea what kind of link reset problems these were?

I have attached a git format-patch which works for us, but I would like
to make sure it does not break other devices instead. I've attached it
to this email because the mail client seems to corrupt the patches
occasionally.

Best regards,

Michel Stam

^ permalink raw reply

* Re: [net-next PATCH V5] qdisc: bulk dequeue support for qdiscs with TCQ_F_ONETXQUEUE
From: Jesper Dangaard Brouer @ 2014-10-02  7:44 UTC (permalink / raw)
  To: Dave Taht
  Cc: Jamal Hadi Salim, Tom Herbert, David Miller, Linux Netdev List,
	Eric Dumazet, Hannes Frederic Sowa, Florian Westphal,
	Daniel Borkmann, Alexander Duyck, John Fastabend,
	Toke Høiland-Jørgensen, brouer, Florian Fainelli
In-Reply-To: <CAA93jw5orK2FeD20zcFpXGkqGa=kF8E-vGtkdUs7vr54R5AE6g@mail.gmail.com>

On Wed, 1 Oct 2014 22:18:05 -0700 Dave Taht <dave.taht@gmail.com> wrote:

> > I usually also monitor the BQL limits during these tests.
> >
> >  grep -H . /sys/class/net/eth4/queues/tx-*/byte_queue_limits/{inflight,limit}
> >
> > To Toke:
> >  Perhaps we could convince Toke, to add a netperf-wrapper recorder for
> > the BQL inflight and limit?  (It would be really cool to plot together)
> 
> I just added a command line mode and support for timestamped limit and
> inflight output to bqlmon
> 
> Get it here:
> 
> https://github.com/dtaht/bqlmon
>
> That should be sufficient for netperf-wrapper to poll it efficiently.

https://github.com/dtaht/bqlmon/commit/ace44b55e2e521af88ba9927bf30fe615024073d#diff-aacbd6867cca49695bfe76ad2bcd45d1R407

Now that we are in control of the output format, I would output
something that is easier for netperf-wrapper to parse, e.g. a JSON
compliant string (it is extensible in the future and easy for Toke to parse).
Besides the output format, I like your patch to bqlmon.

Perhaps Toke has an opinion on the output formatting from bqlmon?

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Sr. Network Kernel Developer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: RFC: ixgbe+build_skb+extra performance experiments
From: Jesper Dangaard Brouer @ 2014-10-02  7:36 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: brouer, David S. Miller, Jeff Kirsher, Alexander Duyck,
	Ben Hutchings, Eric Dumazet, netdev
In-Reply-To: <1412229642-10555-1-git-send-email-ast@plumgrid.com>

On Wed,  1 Oct 2014 23:00:42 -0700 Alexei Starovoitov <ast@plumgrid.com> wrote:

> I'm trying to speed up single core packet per second.

Great, welcome to the club ;-)
 
> I took dual port ixgbe and added both ports to a linux bridge.
> Only one port is connected to another system running pktgen at 10G rate.
> I disabled gro to measure pure RX speed of ixgbe.

It is great that you are attacking the RX side; I planned to look at it
after finishing the qdisc bulking.  It is really lagging behind,
especially now that we have almost "fixed" the TX side (the driver layer
can now do 14.8Mpps, if ignoring the rest of the stack, alloc etc.).


> Out of the box I see 6.5 Mpps and the following stack:
>   21.83%    ksoftirqd/0  [kernel.kallsyms]  [k] memcpy
>   17.58%    ksoftirqd/0  [ixgbe]            [k] ixgbe_clean_rx_irq
>   10.07%    ksoftirqd/0  [kernel.kallsyms]  [k] build_skb
>    6.40%    ksoftirqd/0  [kernel.kallsyms]  [k] __netdev_alloc_frag
>    5.18%    ksoftirqd/0  [kernel.kallsyms]  [k] put_compound_page
>    4.93%    ksoftirqd/0  [kernel.kallsyms]  [k] kmem_cache_alloc
>    4.55%    ksoftirqd/0  [kernel.kallsyms]  [k] __netif_receive_skb_core
> 
> Obviously driver spends huge amount of time copying data from
> hw buffers into skb.
> 
> Then I applied buggy but working in this case patch:
> http://patchwork.ozlabs.org/patch/236044/
> that is trying to use build_skb() API in ixgbe.

I also expected that it would be a huge win to use the build_skb() API.
Good to see this confirmed! :-)

I've been playing with a faster memory pool/allocator (implemented via a
ring_queue), and my experiments show I could save 52ns when using it
for the skb->data.  And you basically avoid this skb->data alloc with
build_skb().


> RX speed jumped to 7.6 Mpps with the following stack:
>   27.02%    ksoftirqd/0  [kernel.kallsyms]  [k] eth_type_trans
>   16.68%    ksoftirqd/0  [ixgbe]            [k] ixgbe_clean_rx_irq
>   11.45%    ksoftirqd/0  [kernel.kallsyms]  [k] build_skb
>    5.20%    ksoftirqd/0  [kernel.kallsyms]  [k] __netif_receive_skb_core
>    4.72%    ksoftirqd/0  [kernel.kallsyms]  [k] kmem_cache_alloc
>    3.96%    ksoftirqd/0  [kernel.kallsyms]  [k] kmem_cache_free

My faster memory pool/allocator could save 8ns for the kmem_cache/slub
calls, which is also high in your perf top.  8ns out of 40ns which is
the micro benchmarked cost of the kmem_cache_{alloc,free} calls.


> packets no longer copied and performance is higher.
> It's doing the following:
> - build_skb out of hw buffer and prefetch packet data
> - eth_type_trans
> - napi_gro_receive
> 
> but build_skb() is too fast and cpu doesn't have enough time
> to prefetch packet data before eth_type_trans() is called,
> so I added mini skb bursting of 2 skbs (patch below) that does:
> - build_skb1 out of hw buffer and prefetch packet data
> - build_skb2 out of hw buffer and prefetch packet data
> - eth_type_trans(skb1)
> - napi_gro_receive(skb1)
> - eth_type_trans(skb2)
> - napi_gro_receive(skb2)
> and performance jumped to 9.0 Mpps with stack:
>   20.54%    ksoftirqd/0  [ixgbe]            [k] ixgbe_clean_rx_irq
>   13.15%    ksoftirqd/0  [kernel.kallsyms]  [k] build_skb
>    8.35%    ksoftirqd/0  [kernel.kallsyms]  [k] __netif_receive_skb_core
>    7.16%    ksoftirqd/0  [kernel.kallsyms]  [k] eth_type_trans
>    4.73%    ksoftirqd/0  [kernel.kallsyms]  [k] kmem_cache_free
>    4.50%    ksoftirqd/0  [kernel.kallsyms]  [k] kmem_cache_alloc
> 
> with further instruction tuning inside ixgbe_clean_rx_irq()
> I could push it to 9.4 Mpps.
> 
> From 6.5 Mpps to 9.4 Mpps via build_skb() and tuning.

Cool, quite impressive performance boost! - good work! :-)


> Is there a way to fix the issue Ben pointed a year ago?
> A brute force fix could be: avoid half-page buffers.
> We'll be wasting 16Mbyte of memory. Sure, but in some cases
> extra performance might be worth it.
> Other options?
>
> I think we need to try harder to switch to build_skb()
> It will open up a lot of possibilities for further performance
> improvements.
> Thoughts?

Yes, we should really work on converting more drivers to use
build_skb().

> ---
>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   34 +++++++++++++++++++++----
>  1 file changed, 29 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> index 21d1a65..1d1e37f 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> @@ -1590,8 +1590,6 @@ static void ixgbe_process_skb_fields(struct ixgbe_ring *rx_ring,
>  	}
> 
>  	skb_record_rx_queue(skb, rx_ring->queue_index);
> -
> -	skb->protocol = eth_type_trans(skb, dev);
>  }
> 
>  static void ixgbe_rx_skb(struct ixgbe_q_vector *q_vector,
> @@ -2063,6 +2061,24 @@ dma_sync:
>  	return skb;
>  }
> 
> +#define BURST_SIZE 2
> +static void ixgbe_rx_skb_burst(struct sk_buff *skbs[BURST_SIZE],
> +			       unsigned int skb_burst,
> +			       struct ixgbe_q_vector *q_vector,
> +			       struct net_device *dev)
> +{
> +	int i;
> +
> +	for (i = 0; i < skb_burst; i++) {
> +		struct sk_buff *skb = skbs[i];
> +
> +		skb->protocol = eth_type_trans(skb, dev);
> +
> +		skb_mark_napi_id(skb, &q_vector->napi);
> +		ixgbe_rx_skb(q_vector, skb);
> +	}
> +}
> +
>  /**
>   * ixgbe_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
>   * @q_vector: structure containing interrupt and ring information
> @@ -2087,6 +2103,8 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
>  	unsigned int mss = 0;
>  #endif /* IXGBE_FCOE */
>  	u16 cleaned_count = ixgbe_desc_unused(rx_ring);
> +	struct sk_buff *skbs[BURST_SIZE];
> +	unsigned int skb_burst = 0;
> 
>  	while (likely(total_rx_packets < budget)) {
>  		union ixgbe_adv_rx_desc *rx_desc;
> @@ -2161,13 +2179,19 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
>  		}
>  
>  #endif /* IXGBE_FCOE */
> -		skb_mark_napi_id(skb, &q_vector->napi);
> -		ixgbe_rx_skb(q_vector, skb);
> -
>  		/* update budget accounting */
>  		total_rx_packets++;
> +		skbs[skb_burst++] = skb;
> +
> +		if (skb_burst == BURST_SIZE) {
> +			ixgbe_rx_skb_burst(skbs, skb_burst, q_vector,
> +					   rx_ring->netdev);
> +			skb_burst = 0;
> +		}
>  	}
>  
> +	ixgbe_rx_skb_burst(skbs, skb_burst, q_vector, rx_ring->netdev);
> +
>  	u64_stats_update_begin(&rx_ring->syncp);
>  	rx_ring->stats.packets += total_rx_packets;
>  	rx_ring->stats.bytes += total_rx_bytes;


-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Sr. Network Kernel Developer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* [RFC net 0/2] ipv6: Avoid restaring fib6_lookup() for RTF_CACHE hit
From: Martin KaFai Lau @ 2014-10-02  6:49 UTC (permalink / raw)
  To: netdev; +Cc: kafai
In-Reply-To: <1412230168-19901-1-git-send-email-kafai@fb.com>

I am trying to understand why there is a need to restart fib6_lookup() after
getting rt with RTF_CACHE.

I have adapted the davem's udpflood test
(https://git.kernel.org/pub/scm/linux/kernel/git/davem/net_test_tools.git) to
support IPv6 and here is the result:

#root > time ./udpflood -l 20000000 -c 250 2401:db00:face:face::2

Before:
real    0m33.224s
user    0m2.941s
sys     0m30.232s

After:
real    0m31.517s
user    0m2.938s
sys     0m28.536s

^ permalink raw reply

* Re: [PATCH] team: add rescheduling jiffy delay on !rtnl_trylock
From: Paul E. McKenney @ 2014-10-02  6:43 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Joe Lawrence, netdev, Jiri Pirko
In-Reply-To: <20140929160601.GD15925@htj.dyndns.org>

On Mon, Sep 29, 2014 at 12:06:01PM -0400, Tejun Heo wrote:
> (cc'ing Paul and quoting the whole body)
> 
> Paul, this is a fix for RCU sched stall observed w/ a work item
> requeueing itself waiting for the RCU grace period.  As the self
> requeueing work item ends up being executed by the same kworker, the
> worker task never stops running in the absence of a higher priority
> task and it seems to delay RCU grace period for a very long time on
> !PREEMPT kernels.  As each work item denotes a boundary which no
> synchronization construct stretches across, I wonder whether it'd be a
> good idea to add a notification for the end of RCU critical section
> between executions of work items.

It sounds like a great idea to me!  I suggest invoking
rcu_note_context_switch() between executions of work items.

							Thanx, Paul

> Thanks.
> 
> On Mon, Sep 29, 2014 at 11:54:45AM -0400, Joe Lawrence wrote:
> > Hello Jiri,
> > 
> > I've been debugging a hang on RHEL7 that seems to originate in the
> > teaming driver and the team_notify_peers_work/team_mcast_rejoin_work
> > rtnl_trylock rescheduling logic.  Running a stand-alone minimal driver
> > mimicking the same schedule_delayed_work(.., 0) reproduces the problem on
> > RHEL7 and upstream kernels [1].
> > 
> > A quick summary of the hang:
> > 
> > 1 - systemd-udevd issues an ioctl that heads down dev_ioctl (grabs the
> >     rtnl_mutex), dev_ifsioc, dev_change_name and finally
> >     synchronize_sched.  In every vmcore I've taken of the hang, this
> >     thread is waiting on the RCU.
> > 
> > 2 - A kworker thread goes to 100% CPU.
> > 
> > 3 - Inspecting the running thread on the CPU that rcusched reported as
> >     holding up the RCU grace period usually shows it in either
> >     team_notify_peers_work, team_mcast_rejoin_work, or somewhere in the
> >     workqueue code (process_one_work).  This is the same CPU/thread as
> >     #2.
> > 
> > 4 - team_notify_peers_work and team_mcast_rejoin_work want the rtnl_lock
> >     that systemd-udevd in #1 has, so they try to play nice by calling
> >     rtnl_trylock and rescheduling on failure.  Unfortunately with 0
> >     jiffy delay, process_one_work will "execute immediately" (ie, after
> >     others already in queue, but before the next tick).  With the stock
> >     RHEL7 !CONFIG_PREEMPT at least, this creates a tight loop on
> >     process_one_work + rtnl_trylock that spins the CPU in #2.
> > 
> > 5 - Sometime minutes later, RCU seems to be kicked by a side effect of
> >     a smp_apic_timer_interrupt.  (This was the only other interesting
> >     function reported by ftrace function tracer).
> > 
> > See the patch below for a potential workaround.  Giving at least 1 jiffy
> > should give process_one_work some breathing room before calling back
> > into team_notify_peers_work/team_mcast_rejoin_work and attempting to
> > acquire the rtnl_lock mutex.
> > 
> > Regards,
> > 
> > -- Joe
> > 
> > [1] http://marc.info/?l=linux-kernel&m=141192244232345&w=2
> > 
> > -->8--- -->8--- -->8--- -->8---
> > 
> > From fc5bbf5771b5732f7479ac6e84bbfdde05710023 Mon Sep 17 00:00:00 2001
> > From: Joe Lawrence <joe.lawrence@stratus.com>
> > Date: Mon, 29 Sep 2014 11:09:05 -0400
> > Subject: [PATCH] team: add rescheduling jiffy delay on !rtnl_trylock
> > 
> > Give the CPU running the kworker handling team_notify_peers_work and
> > team_mcast_rejoin_work functions some scheduling air by specifying a
> > non-zero delay.
> > 
> > Signed-off-by: Joe Lawrence <joe.lawrence@stratus.com>
> > ---
> >  drivers/net/team/team.c |    4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> > 
> > diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
> > index ef10302..d46df38 100644
> > --- a/drivers/net/team/team.c
> > +++ b/drivers/net/team/team.c
> > @@ -633,7 +633,7 @@ static void team_notify_peers_work(struct work_struct *work)
> >  	team = container_of(work, struct team, notify_peers.dw.work);
> >  
> >  	if (!rtnl_trylock()) {
> > -		schedule_delayed_work(&team->notify_peers.dw, 0);
> > +		schedule_delayed_work(&team->notify_peers.dw, 1);
> >  		return;
> >  	}
> >  	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, team->dev);
> > @@ -673,7 +673,7 @@ static void team_mcast_rejoin_work(struct work_struct *work)
> >  	team = container_of(work, struct team, mcast_rejoin.dw.work);
> >  
> >  	if (!rtnl_trylock()) {
> > -		schedule_delayed_work(&team->mcast_rejoin.dw, 0);
> > +		schedule_delayed_work(&team->mcast_rejoin.dw, 1);
> >  		return;
> >  	}
> >  	call_netdevice_notifiers(NETDEV_RESEND_IGMP, team->dev);
> > -- 
> > 1.7.10.4
> > 
> 
> -- 
> tejun
> 

^ permalink raw reply

* [PATCH net-next] net: Cleanup skb cloning  by adding SKB_FCLONE_FREE
From: Vijay Subramanian @ 2014-10-02  6:33 UTC (permalink / raw)
  To: netdev; +Cc: davem, edumazet, Vijay Subramanian

SKB_FCLONE_UNAVAILABLE has overloaded meaning depending on type of skb.
1: If skb is allocated from head_cache, it indicates fclone is not available.
2: If skb is a companion fclone skb (allocated from fclone_cache), it indicates
it is available to be used.

To avoid confusion for case 2 above, this patch  replaces
SKB_FCLONE_UNAVAILABLE with SKB_FCLONE_FREE where appropriate. For fclone
companion skbs, this indicates it is free for use.

SKB_FCLONE_UNAVAILABLE will now simply indicate skb is from head_cache and
cannot / will not have a companion fclone.

Signed-off-by: Vijay Subramanian <subramanian.vijay@gmail.com>
---
 include/linux/skbuff.h | 3 ++-
 net/core/skbuff.c      | 8 ++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7c5036d..6c3fb9a 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -339,9 +339,10 @@ struct skb_shared_info {
 
 
 enum {
-	SKB_FCLONE_UNAVAILABLE,
+	SKB_FCLONE_UNAVAILABLE,	/* skb has no fclone */
 	SKB_FCLONE_ORIG,
 	SKB_FCLONE_CLONE,
+	SKB_FCLONE_FREE,	/* this fclone skb is available */
 };
 
 enum {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f77e648..6f4e359 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -265,7 +265,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 		skb->fclone = SKB_FCLONE_ORIG;
 		atomic_set(&fclones->fclone_ref, 1);
 
-		fclones->skb2.fclone = SKB_FCLONE_UNAVAILABLE;
+		fclones->skb2.fclone = SKB_FCLONE_FREE;
 		fclones->skb2.pfmemalloc = pfmemalloc;
 	}
 out:
@@ -542,7 +542,7 @@ static void kfree_skbmem(struct sk_buff *skb)
 		fclones = container_of(skb, struct sk_buff_fclones, skb2);
 
 		/* Warning : We must perform the atomic_dec_and_test() before
-		 * setting skb->fclone back to SKB_FCLONE_UNAVAILABLE, otherwise
+		 * setting skb->fclone back to SKB_FCLONE_FREE, otherwise
 		 * skb_clone() could set clone_ref to 2 before our decrement.
 		 * Anyway, if we are going to free the structure, no need to
 		 * rewrite skb->fclone.
@@ -553,7 +553,7 @@ static void kfree_skbmem(struct sk_buff *skb)
 			/* The clone portion is available for
 			 * fast-cloning again.
 			 */
-			skb->fclone = SKB_FCLONE_UNAVAILABLE;
+			skb->fclone = SKB_FCLONE_FREE;
 		}
 		break;
 	}
@@ -874,7 +874,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 		return NULL;
 
 	if (skb->fclone == SKB_FCLONE_ORIG &&
-	    n->fclone == SKB_FCLONE_UNAVAILABLE) {
+	    n->fclone == SKB_FCLONE_FREE) {
 		n->fclone = SKB_FCLONE_CLONE;
 		/* As our fastclone was free, clone_ref must be 1 at this point.
 		 * We could use atomic_inc() here, but it is faster
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH v1 5/5] driver-core: add driver asynchronous probe support
From: Tom Gundersen @ 2014-10-02  6:12 UTC (permalink / raw)
  To: Luis R. Rodriguez
  Cc: Luis R. Rodriguez, Michal Hocko, Greg KH, Dmitry Torokhov,
	Takashi Iwai, Tejun Heo, Arjan van de Ven, Robert Milasan, werner,
	Oleg Nesterov, hare, Benjamin Poirier, Santosh Rastapur, pmladek,
	dbueso, LKML, Tetsuo Handa, Joseph Salisbury, Kay Sievers,
	One Thousand Gnomes, Tim Gardner, Pierre Fersing, Andrew Morton,
	Nagalakshmi Nandigama
In-Reply-To: <20140930152419.GF14081@wotan.suse.de>

On Tue, Sep 30, 2014 at 5:24 PM, Luis R. Rodriguez <mcgrof@suse.com> wrote:
>> > commit e64fae5573e566ce4fd9b23c68ac8f3096603314
>> > Author: Kay Sievers <kay.sievers@vrfy.org>
>> > Date:   Wed Jan 18 05:06:18 2012 +0100
>> >
>> >     udevd: kill hanging event processes after 30 seconds
>> >
>> >     Some broken kernel drivers load firmware synchronously in the module init
>> >     path and block modprobe until the firmware request is fulfilled.
>> >     <...>
>>
>> This was a workaround to avoid a deadlock between udev and the kernel.
>> The 180 s timeout was already in place before this change, and was not
>> motivated by firmware loading. Also note that this patch was not about
>> "tracking device drivers", just about avoiding dead-lock.
>
> Thanks, can you elaborate on how a deadlock can occur if the kmod
> worker is not at some point sigkilled?

This was only relevant when udev did the firmware loading. modprobe
would wait for the kernel, which would wait for the firmware loading,
which would wait for modprobe. This is no longer a problem as udev
does not do firmware loading any more.

> Is the issue that if there is no extra worker available and all are
> idling on sleep / synchronous long work boot will potentially hang
> unless a new worker becomes available to do more work?

Correct.

> If so I can
> see the sigkill helping for hanging tasks but it doesn't necessarily
> mean its a good idea to kill modules loading taking a while. Also
> what if the sigkill is just avoided for *just* kmod workers?

Depending on the number of devices you have, I suppose we could still
exhaust the workers.

>> The way I see it, the current status from systemd's side is: our
>> short-term work-around is to increase the timeout, and at the moment
>> it appears no long-term solution is needed (i.e., it seems like the
>> right thing to do is to make sure insmod can be near instantaneous, it
>> appears people are working towards this goal, and so far no examples
>> have cropped up showing that it is fundamentally impossible (once/if
>> they do, we should of course revisit the problem)).
>
> That again would be reactive behaviour, what would prevent avoiding the
> sigkill only for kmod workers? Is it known the deadlock is imminent?
> If the amount of workers for kmod that would hit the timeout is
> considered low I don't see how that's possible and why not just lift
> the sigkill.

Making kmod a special case is of course possible. However, as long as
there is no fundamental reason why kmod should get this special
treatment, this just looks like a work-around to me. We already have a
work-around, which is to increase the global timeout. If you still
think we should do something different in systemd, it is probably best
to take the discussion to systemd-devel to make sure all the relevant
people are involved.

Cheers,

Tom

^ permalink raw reply

* RFC: ixgbe+build_skb+extra performance experiments
From: Alexei Starovoitov @ 2014-10-02  6:00 UTC (permalink / raw)
  To: David S. Miller
  Cc: Jeff Kirsher, Alexander Duyck, Ben Hutchings, Eric Dumazet,
	netdev

Hi All,

I'm trying to speed up single core packet per second.

I took dual port ixgbe and added both ports to a linux bridge.
Only one port is connected to another system running pktgen at 10G rate.
I disabled gro to measure pure RX speed of ixgbe.

Out of the box I see 6.5 Mpps and the following stack:
  21.83%    ksoftirqd/0  [kernel.kallsyms]  [k] memcpy
  17.58%    ksoftirqd/0  [ixgbe]            [k] ixgbe_clean_rx_irq
  10.07%    ksoftirqd/0  [kernel.kallsyms]  [k] build_skb
   6.40%    ksoftirqd/0  [kernel.kallsyms]  [k] __netdev_alloc_frag
   5.18%    ksoftirqd/0  [kernel.kallsyms]  [k] put_compound_page
   4.93%    ksoftirqd/0  [kernel.kallsyms]  [k] kmem_cache_alloc
   4.55%    ksoftirqd/0  [kernel.kallsyms]  [k] __netif_receive_skb_core

Obviously driver spends huge amount of time copying data from
hw buffers into skb.

Then I applied buggy but working in this case patch:
http://patchwork.ozlabs.org/patch/236044/
that is trying to use build_skb() API in ixgbe.

RX speed jumped to 7.6 Mpps with the following stack:
  27.02%    ksoftirqd/0  [kernel.kallsyms]  [k] eth_type_trans
  16.68%    ksoftirqd/0  [ixgbe]            [k] ixgbe_clean_rx_irq
  11.45%    ksoftirqd/0  [kernel.kallsyms]  [k] build_skb
   5.20%    ksoftirqd/0  [kernel.kallsyms]  [k] __netif_receive_skb_core
   4.72%    ksoftirqd/0  [kernel.kallsyms]  [k] kmem_cache_alloc
   3.96%    ksoftirqd/0  [kernel.kallsyms]  [k] kmem_cache_free

packets no longer copied and performance is higher.
It's doing the following:
- build_skb out of hw buffer and prefetch packet data
- eth_type_trans
- napi_gro_receive

but build_skb() is too fast and cpu doesn't have enough time
to prefetch packet data before eth_type_trans() is called,
so I added mini skb bursting of 2 skbs (patch below) that does:
- build_skb1 out of hw buffer and prefetch packet data
- build_skb2 out of hw buffer and prefetch packet data
- eth_type_trans(skb1)
- napi_gro_receive(skb1)
- eth_type_trans(skb2)
- napi_gro_receive(skb2)
and performance jumped to 9.0 Mpps with stack:
  20.54%    ksoftirqd/0  [ixgbe]            [k] ixgbe_clean_rx_irq
  13.15%    ksoftirqd/0  [kernel.kallsyms]  [k] build_skb
   8.35%    ksoftirqd/0  [kernel.kallsyms]  [k] __netif_receive_skb_core
   7.16%    ksoftirqd/0  [kernel.kallsyms]  [k] eth_type_trans
   4.73%    ksoftirqd/0  [kernel.kallsyms]  [k] kmem_cache_free
   4.50%    ksoftirqd/0  [kernel.kallsyms]  [k] kmem_cache_alloc

With further instruction tuning inside ixgbe_clean_rx_irq()
I could push it to 9.4 Mpps.

From 6.5 Mpps to 9.4 Mpps via build_skb() and tuning.

Is there a way to fix the issue Ben pointed out a year ago?
A brute-force fix could be: avoid half-page buffers.
We'll be wasting 16 Mbyte of memory. Sure, but in some cases
extra performance might be worth it.
Other options?
I think we need to try harder to switch to build_skb()
It will open up a lot of possibilities for further performance
improvements.
Thoughts?

---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   34 +++++++++++++++++++++----
 1 file changed, 29 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 21d1a65..1d1e37f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1590,8 +1590,6 @@ static void ixgbe_process_skb_fields(struct ixgbe_ring *rx_ring,
 	}

 	skb_record_rx_queue(skb, rx_ring->queue_index);
-
-	skb->protocol = eth_type_trans(skb, dev);
 }

 static void ixgbe_rx_skb(struct ixgbe_q_vector *q_vector,
@@ -2063,6 +2061,24 @@ dma_sync:
 	return skb;
 }

+#define BURST_SIZE 2
+static void ixgbe_rx_skb_burst(struct sk_buff *skbs[BURST_SIZE],
+			       unsigned int skb_burst,
+			       struct ixgbe_q_vector *q_vector,
+			       struct net_device *dev)
+{
+	int i;
+
+	for (i = 0; i < skb_burst; i++) {
+		struct sk_buff *skb = skbs[i];
+
+		skb->protocol = eth_type_trans(skb, dev);
+
+		skb_mark_napi_id(skb, &q_vector->napi);
+		ixgbe_rx_skb(q_vector, skb);
+	}
+}
+
 /**
  * ixgbe_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
  * @q_vector: structure containing interrupt and ring information
@@ -2087,6 +2103,8 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 	unsigned int mss = 0;
 #endif /* IXGBE_FCOE */
 	u16 cleaned_count = ixgbe_desc_unused(rx_ring);
+	struct sk_buff *skbs[BURST_SIZE];
+	unsigned int skb_burst = 0;

 	while (likely(total_rx_packets < budget)) {
 		union ixgbe_adv_rx_desc *rx_desc;
@@ -2161,13 +2179,19 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 		}
 
 #endif /* IXGBE_FCOE */
-		skb_mark_napi_id(skb, &q_vector->napi);
-		ixgbe_rx_skb(q_vector, skb);
-
 		/* update budget accounting */
 		total_rx_packets++;
+		skbs[skb_burst++] = skb;
+
+		if (skb_burst == BURST_SIZE) {
+			ixgbe_rx_skb_burst(skbs, skb_burst, q_vector,
+					   rx_ring->netdev);
+			skb_burst = 0;
+		}
 	}
 
+	ixgbe_rx_skb_burst(skbs, skb_burst, q_vector, rx_ring->netdev);
+
 	u64_stats_update_begin(&rx_ring->syncp);
 	rx_ring->stats.packets += total_rx_packets;
 	rx_ring->stats.bytes += total_rx_bytes;
-- 
1.7.9.5

^ permalink raw reply related

* Re: [PATCHv9 net-next 2/4] sunvnet: make transmit path zero-copy in the kernel
From: Raghuram Kothakota @ 2014-10-02  5:50 UTC (permalink / raw)
  To: David L Stevens; +Cc: David Miller, netdev, Sowmini Varadhan
In-Reply-To: <5429EFBB.2090909@oracle.com>

Sorry I am late in providing my comments, but I feel it is important
to share this comment.

> static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
> {
> 	struct vnet *vp = netdev_priv(dev);
> @@ -788,12 +899,20 @@ static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
> 	struct vio_net_desc *d;
> 	unsigned long flags;
> 	unsigned int len;
> -	void *tx_buf;
> -	int i, err;
> +	struct sk_buff *freeskbs = NULL;
> +	int i, err, txi;
> +	void *start = NULL;
> +	int nlen = 0;
> +	unsigned pending = 0;
> 
> 	if (unlikely(!port))
> 		goto out_dropped;
> 
> +	skb = vnet_skb_shape(skb, &start, &nlen);
> +
> +	if (unlikely(!skb))
> +		goto out_dropped;
> +
> 	spin_lock_irqsave(&port->vio.lock, flags);
> 
> 	dr = &port->vio.drings[VIO_DRIVER_TX_RING];
> @@ -811,14 +930,27 @@ static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
> 
> 	d = vio_dring_cur(dr);
> 
> -	tx_buf = port->tx_bufs[dr->prod].buf;
> -	skb_copy_from_linear_data(skb, tx_buf + VNET_PACKET_SKIP, skb->len);
> +	txi = dr->prod;
> +
> +	freeskbs = vnet_clean_tx_ring(port, &pending);
> +
> +	BUG_ON(port->tx_bufs[txi].skb);
> 
> 	len = skb->len;
> -	if (len < ETH_ZLEN) {
> +	if (len < ETH_ZLEN)
> 		len = ETH_ZLEN;
> -		memset(tx_buf+VNET_PACKET_SKIP+skb->len, 0, len - skb->len);
> +
> +	port->tx_bufs[txi].skb = skb;
> +	skb = NULL;
> +
> +	err = ldc_map_single(port->vio.lp, start, nlen,
> +			     port->tx_bufs[txi].cookies, 2,
> +			     (LDC_MAP_SHADOW | LDC_MAP_DIRECT | LDC_MAP_RW));


The LDC sharing protection mechanism is at a page level. If I understand
correctly, the vnet_skb_shape() function only addresses the alignment requirement
but it still leaves the possibility of exporting a lot more data than required to the
peer. This can be treated as a security issue; I am wondering if you have considered it.


-Raghuram



> +	if (err < 0) {
> +		netdev_info(dev, "tx buffer map error %d\n", err);
> +		goto out_dropped_unlock;
> 	}
> +	port->tx_bufs[txi].ncookies = err;
> 
> 	/* We don't rely on the ACKs to free the skb in vnet_start_xmit(),
> 	 * thus it is safe to not set VIO_ACK_ENABLE for each transmission:
> @@ -830,9 +962,9 @@ static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
> 	 */
> 	d->hdr.ack = VIO_ACK_DISABLE;
> 	d->size = len;
> -	d->ncookies = port->tx_bufs[dr->prod].ncookies;
> +	d->ncookies = port->tx_bufs[txi].ncookies;
> 	for (i = 0; i < d->ncookies; i++)
> -		d->cookies[i] = port->tx_bufs[dr->prod].cookies[i];
> +		d->cookies[i] = port->tx_bufs[txi].cookies[i];
> 
> 	/* This has to be a non-SMP write barrier because we are writing
> 	 * to memory which is shared with the peer LDOM.
> @@ -876,7 +1008,7 @@ ldc_start_done:
> 	port->start_cons = false;
> 
> 	dev->stats.tx_packets++;
> -	dev->stats.tx_bytes += skb->len;
> +	dev->stats.tx_bytes += port->tx_bufs[txi].skb->len;
> 
> 	dr->prod = (dr->prod + 1) & (VNET_TX_RING_SIZE - 1);
> 	if (unlikely(vnet_tx_dring_avail(dr) < 2)) {
> @@ -887,7 +1019,9 @@ ldc_start_done:
> 
> 	spin_unlock_irqrestore(&port->vio.lock, flags);
> 
> -	dev_kfree_skb(skb);
> +	vnet_free_skbs(freeskbs);
> +
> +	(void)mod_timer(&port->clean_timer, jiffies + VNET_CLEAN_TIMEOUT);
> 
> 	return NETDEV_TX_OK;
> 
> @@ -895,7 +1029,14 @@ out_dropped_unlock:
> 	spin_unlock_irqrestore(&port->vio.lock, flags);
> 
> out_dropped:
> -	dev_kfree_skb(skb);
> +	if (skb)
> +		dev_kfree_skb(skb);
> +	vnet_free_skbs(freeskbs);
> +	if (pending)
> +		(void)mod_timer(&port->clean_timer,
> +				jiffies + VNET_CLEAN_TIMEOUT);
> +	else
> +		del_timer(&port->clean_timer);
> 	dev->stats.tx_dropped++;
> 	return NETDEV_TX_OK;
> }
> @@ -1097,17 +1238,22 @@ static void vnet_port_free_tx_bufs(struct vnet_port *port)
> 	}
> 
> 	for (i = 0; i < VNET_TX_RING_SIZE; i++) {
> -		void *buf = port->tx_bufs[i].buf;
> +		struct vio_net_desc *d;
> +		void *skb = port->tx_bufs[i].skb;
> 
> -		if (!buf)
> +		if (!skb)
> 			continue;
> 
> +		d = vio_dring_entry(dr, i);
> +		if (d->hdr.state == VIO_DESC_READY)
> +			pr_warn("active transmit buffers freed\n");
> +
> 		ldc_unmap(port->vio.lp,
> 			  port->tx_bufs[i].cookies,
> 			  port->tx_bufs[i].ncookies);
> -
> -		kfree(buf);
> -		port->tx_bufs[i].buf = NULL;
> +		dev_kfree_skb(skb);
> +		port->tx_bufs[i].skb = NULL;
> +		d->hdr.state = VIO_DESC_FREE;
> 	}
> }
> 
> @@ -1118,34 +1264,6 @@ static int vnet_port_alloc_tx_bufs(struct vnet_port *port)
> 	int i, err, ncookies;
> 	void *dring;
> 
> -	for (i = 0; i < VNET_TX_RING_SIZE; i++) {
> -		void *buf = kzalloc(VNET_MAXPACKET + 8, GFP_KERNEL);
> -		int map_len = (VNET_MAXPACKET + 7) & ~7;
> -
> -		err = -ENOMEM;
> -		if (!buf)
> -			goto err_out;
> -
> -		err = -EFAULT;
> -		if ((unsigned long)buf & (8UL - 1)) {
> -			pr_err("TX buffer misaligned\n");
> -			kfree(buf);
> -			goto err_out;
> -		}
> -
> -		err = ldc_map_single(port->vio.lp, buf, map_len,
> -				     port->tx_bufs[i].cookies, 2,
> -				     (LDC_MAP_SHADOW |
> -				      LDC_MAP_DIRECT |
> -				      LDC_MAP_RW));
> -		if (err < 0) {
> -			kfree(buf);
> -			goto err_out;
> -		}
> -		port->tx_bufs[i].buf = buf;
> -		port->tx_bufs[i].ncookies = err;
> -	}
> -
> 	dr = &port->vio.drings[VIO_DRIVER_TX_RING];
> 
> 	len = (VNET_TX_RING_SIZE *
> @@ -1172,6 +1290,12 @@ static int vnet_port_alloc_tx_bufs(struct vnet_port *port)
> 	dr->pending = VNET_TX_RING_SIZE;
> 	dr->ncookies = ncookies;
> 
> +	for (i = 0; i < VNET_TX_RING_SIZE; ++i) {
> +		struct vio_net_desc *d;
> +
> +		d = vio_dring_entry(dr, i);
> +		d->hdr.state = VIO_DESC_FREE;
> +	}
> 	return 0;
> 
> err_out:
> @@ -1203,6 +1327,8 @@ static struct vnet *vnet_new(const u64 *local_mac)
> 	dev = alloc_etherdev(sizeof(*vp));
> 	if (!dev)
> 		return ERR_PTR(-ENOMEM);
> +	dev->needed_headroom = VNET_PACKET_SKIP + 8;
> +	dev->needed_tailroom = 8;
> 
> 	for (i = 0; i < ETH_ALEN; i++)
> 		dev->dev_addr[i] = (*local_mac >> (5 - i) * 8) & 0xff;
> @@ -1397,6 +1523,9 @@ static int vnet_port_probe(struct vio_dev *vdev, const struct vio_device_id *id)
> 	pr_info("%s: PORT ( remote-mac %pM%s )\n",
> 		vp->dev->name, port->raddr, switch_port ? " switch-port" : "");
> 
> +	setup_timer(&port->clean_timer, vnet_clean_timer_expire,
> +		    (unsigned long)port);
> +
> 	vio_port_up(&port->vio);
> 
> 	mdesc_release(hp);
> @@ -1423,6 +1552,7 @@ static int vnet_port_remove(struct vio_dev *vdev)
> 		unsigned long flags;
> 
> 		del_timer_sync(&port->vio.timer);
> +		del_timer_sync(&port->clean_timer);
> 
> 		spin_lock_irqsave(&vp->lock, flags);
> 		list_del(&port->list);
> diff --git a/drivers/net/ethernet/sun/sunvnet.h b/drivers/net/ethernet/sun/sunvnet.h
> index 986e04b..02f507d 100644
> --- a/drivers/net/ethernet/sun/sunvnet.h
> +++ b/drivers/net/ethernet/sun/sunvnet.h
> @@ -11,6 +11,11 @@
>  */
> #define VNET_TX_TIMEOUT			(5 * HZ)
> 
> +/* length of time (or less) we expect pending descriptors to be marked
> + * as VIO_DESC_DONE and skbs ready to be freed
> + */
> +#define	VNET_CLEAN_TIMEOUT		((HZ/100)+1)
> +
> #define VNET_MAXPACKET			1518ULL /* ETH_FRAMELEN + VLAN_HDR */
> #define VNET_TX_RING_SIZE		512
> #define VNET_TX_WAKEUP_THRESH(dr)	((dr)->pending / 4)
> @@ -22,7 +27,7 @@
> #define VNET_PACKET_SKIP		6
> 
> struct vnet_tx_entry {
> -	void			*buf;
> +	struct sk_buff		*skb;
> 	unsigned int		ncookies;
> 	struct ldc_trans_cookie	cookies[2];
> };
> @@ -46,6 +51,8 @@ struct vnet_port {
> 	bool			stop_rx;
> 	bool			start_cons;
> 
> +	struct timer_list	clean_timer;
> +
> 	u64			rmtu;
> };
> 
> -- 
> 1.7.1
> 

^ permalink raw reply

* RE: [PATCH v2 net-next 01/10] r8169:change uppercase numbertolowercase nubmer
From: Hau @ 2014-10-02  5:35 UTC (permalink / raw)
  To: David Miller
  Cc: netdev@vger.kernel.org, nic_swsd, linux-kernel@vger.kernel.org
In-Reply-To: <20141001.153434.1528027414789643819.davem@davemloft.net>

I will do that next time.

Thanks.

-----Original Message-----
From: David Miller [mailto:davem@davemloft.net] 
Sent: Thursday, October 02, 2014 3:35 AM
To: Hau
Cc: netdev@vger.kernel.org; nic_swsd; linux-kernel@vger.kernel.org
Subject: Re: [PATCH v2 net-next 01/10] r8169:change uppercase number tolowercase nubmer

From: Chun-Hao Lin <hau@realtek.com>
Date: Wed, 1 Oct 2014 23:17:12 +0800

> Signed-off-by: Chun-Hao Lin <hau@realtek.com>

Series applied, and I fixed the typo in the Subject line here.

But you _(REALLY)_ need to provide a [PATCH net-next 00/xx] posting at the beginning of the patch series which gives an overview of what the patch series does at a high level, and why.

This also allows me to have a sensible post to reply to when I just need to say what I've done with the entire series rather than providing comments on a specific patch.

Thanks.

------Please consider the environment before printing this e-mail.

^ permalink raw reply

* Re: [net-next PATCH V5] qdisc: bulk dequeue support for qdiscs with TCQ_F_ONETXQUEUE
From: Dave Taht @ 2014-10-02  5:18 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Jamal Hadi Salim, Tom Herbert, David Miller, Linux Netdev List,
	Eric Dumazet, Hannes Frederic Sowa, Florian Westphal,
	Daniel Borkmann, Alexander Duyck, John Fastabend,
	Toke Høiland-Jørgensen
In-Reply-To: <20141001223229.6cbaac07@redhat.com>

>> > I'm monitoring backlog of qdiscs, and I always see >1 backlog, I never
>> > saw a standing queue of 1 packet in my testing.  Either the backlog
>> > area is high 100-200 packets, or 0 backlog.  (With fake pktgen/trafgen
>> > style tests, it's possible to cause 1000 backlog).
>>
>> It would be nice to actually collect such stats. Monitoring the backlog
>> via dumping qdisc stats is a good start - but actually keeping traces
>> of average bulk size is more useful.

I am not huge on averages. Network theorists tend to think about things in
terms of fluid models. Van Jacobson's analogy of a water fountain's
operation is very profound...

While it is nearly impossible for a conventional von Neumann time-sliced
CPU + network to actually act that way, things like BQL and dedicated
pipelining systems like those in DPDK are getting closer to that ideal.

An example of where averages let you down is the classic 5-minute
data reduction that tools like mrtg do, where you might see 60% of the
bandwidth (capacity/5 minutes) in use, yet still see drops because
over shorter intervals (capacity/10ms) you have bursts arriving.

Far better to sample queue occupancy at the highest rate you can
manage without heisenbugging the results and look at the detailed
curves.

> I usually also monitor the BQL limits during these tests.
>
>  grep -H . /sys/class/net/eth4/queues/tx-*/byte_queue_limits/{inflight,limit}
>
> To Toke:
>  Perhaps we could convince Toke, to add a netperf-wrapper recorder for
> the BQL inflight and limit?  (It would be really cool to plot together)

I just added a command line mode and support for timestamped limit and
inflight output to bqlmon

Get it here:

https://github.com/dtaht/bqlmon

That should be sufficient for netperf-wrapper to poll it efficiently.
As to how to graph it,
I suppose finding the max(limit) across the entire sample set and
scaling inflight appropriately would be good.

What test(s) would be best to combine it with? rrul? tcp_square_wave?

-- 
Dave Täht

https://www.bufferbloat.net/projects/make-wifi-fast

^ permalink raw reply

* FOU RX interface?
From: Andy Lutomirski @ 2014-10-02  5:14 UTC (permalink / raw)
  To: Network Development, Tom Herbert, David S. Miller

Hi-

Sorry for the lack of proper threading here -- I lost the original message.

If I'm understanding the FOU use case correctly, if I set up a FOU
tunnel tun0 that is encapsulated in UDP on eth0, then tun0 packets
will be transmitted on tun0, but incoming packets will show up on eth0
when they're reinjected after stripping the FOU header.

Is this right?  I think that, without a way to reinject the received
packets on the tunnel interface, using FOU will be annoying.  For
example, writing firewall rules might be tricky.  And programs that
use packet sockets or SO_BINDTODEVICE could have a hard time being
configured such that they notice the received packets.

Also, is it even possible to assign a FOU tunnel to a different
network namespace than the device that's being tunneled over?  How
will the received packets end up in the right netns?

--Andy

^ permalink raw reply

* [PATCH net-next 4/4] ip_tunnel: Add GUE support
From: Tom Herbert @ 2014-10-02  4:46 UTC (permalink / raw)
  To: davem, netdev
In-Reply-To: <1412225199-24942-1-git-send-email-therbert@google.com>

This patch allows configuring IPIP, sit, and GRE tunnels to use GUE.
This is very similar to fou except that we need to insert the GUE header
in addition to the UDP header on transmit.

Signed-off-by: Tom Herbert <therbert@google.com>
---
 include/uapi/linux/if_tunnel.h |  1 +
 net/ipv4/ip_tunnel.c           | 13 +++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h
index 7c832af..280d9e0 100644
--- a/include/uapi/linux/if_tunnel.h
+++ b/include/uapi/linux/if_tunnel.h
@@ -64,6 +64,7 @@ enum {
 enum tunnel_encap_types {
 	TUNNEL_ENCAP_NONE,
 	TUNNEL_ENCAP_FOU,
+	TUNNEL_ENCAP_GUE,
 };
 
 #define TUNNEL_ENCAP_FLAG_CSUM		(1<<0)
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 54ace25..79f2ac0 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -56,6 +56,7 @@
 #include <net/netns/generic.h>
 #include <net/rtnetlink.h>
 #include <net/udp.h>
+#include <net/gue.h>
 
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ipv6.h>
@@ -495,6 +496,8 @@ static int ip_encap_hlen(struct ip_tunnel_encap *e)
 		return 0;
 	case TUNNEL_ENCAP_FOU:
 		return sizeof(struct udphdr);
+	case TUNNEL_ENCAP_GUE:
+		return sizeof(struct udphdr) + sizeof(struct guehdr);
 	default:
 		return -EINVAL;
 	}
@@ -546,6 +549,15 @@ static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
 	skb_reset_transport_header(skb);
 	uh = udp_hdr(skb);
 
+	if (e->type == TUNNEL_ENCAP_GUE) {
+		struct guehdr *guehdr = (struct guehdr *)&uh[1];
+
+		guehdr->version = 0;
+		guehdr->hlen = 0;
+		guehdr->flags = 0;
+		guehdr->next_hdr = *protocol;
+	}
+
 	uh->dest = e->dport;
 	uh->source = sport;
 	uh->len = htons(skb->len);
@@ -565,6 +577,7 @@ int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
 	case TUNNEL_ENCAP_NONE:
 		return 0;
 	case TUNNEL_ENCAP_FOU:
+	case TUNNEL_ENCAP_GUE:
 		return fou_build_header(skb, &t->encap, t->encap_hlen,
 					protocol, fl4);
 	default:
-- 
2.1.0.rc2.206.gedb03e5

^ permalink raw reply related

* [PATCH net-next 3/4] gue: Receive side for Generic UDP Encapsulation
From: Tom Herbert @ 2014-10-02  4:46 UTC (permalink / raw)
  To: davem, netdev
In-Reply-To: <1412225199-24942-1-git-send-email-therbert@google.com>

This patch adds support for receiving GUE packets in the fou module. The
fou module now supports direct foo-over-udp (no encapsulation header)
and GUE. To support this a type parameter is added to the fou netlink
parameters.

For a GUE socket we define gue_udp_recv, gue_gro_receive, and
gue_gro_complete to handle the specifics of the GUE protocol. Most
of the code to manage and configure sockets is common with the fou.

Signed-off-by: Tom Herbert <therbert@google.com>
---
 include/uapi/linux/fou.h |   7 ++
 net/ipv4/fou.c           | 196 ++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 194 insertions(+), 9 deletions(-)

diff --git a/include/uapi/linux/fou.h b/include/uapi/linux/fou.h
index e03376d..8df0689 100644
--- a/include/uapi/linux/fou.h
+++ b/include/uapi/linux/fou.h
@@ -13,6 +13,7 @@ enum {
 	FOU_ATTR_PORT,				/* u16 */
 	FOU_ATTR_AF,				/* u8 */
 	FOU_ATTR_IPPROTO,			/* u8 */
+	FOU_ATTR_TYPE,				/* u8 */
 
 	__FOU_ATTR_MAX,
 };
@@ -27,6 +28,12 @@ enum {
 	__FOU_CMD_MAX,
 };
 
+enum {
+	FOU_ENCAP_UNSPEC,
+	FOU_ENCAP_DIRECT,
+	FOU_ENCAP_GUE,
+};
+
 #define FOU_CMD_MAX	(__FOU_CMD_MAX - 1)
 
 #endif /* _UAPI_LINUX_FOU_H */
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 7e2126a..efa70ad 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -7,6 +7,7 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <net/genetlink.h>
+#include <net/gue.h>
 #include <net/ip.h>
 #include <net/protocol.h>
 #include <net/udp.h>
@@ -27,6 +28,7 @@ struct fou {
 };
 
 struct fou_cfg {
+	u16 type;
 	u8 protocol;
 	struct udp_port_cfg udp_config;
 };
@@ -64,6 +66,41 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb)
 					  sizeof(struct udphdr));
 }
 
+static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
+{
+	struct fou *fou = fou_from_sock(sk);
+	size_t len;
+	struct guehdr *guehdr;
+	struct udphdr *uh;
+
+	if (!fou)
+		return 1;
+
+	len = sizeof(struct udphdr) + sizeof(struct guehdr);
+	if (!pskb_may_pull(skb, len))
+		goto drop;
+
+	uh = udp_hdr(skb);
+	guehdr = (struct guehdr *)&uh[1];
+
+	len += guehdr->hlen << 2;
+	if (!pskb_may_pull(skb, len))
+		goto drop;
+
+	if (guehdr->version != 0)
+		goto drop;
+
+	if (guehdr->flags) {
+		/* No support yet */
+		goto drop;
+	}
+
+	return fou_udp_encap_recv_deliver(skb, guehdr->next_hdr, len);
+drop:
+	kfree_skb(skb);
+	return 0;
+}
+
 static struct sk_buff **fou_gro_receive(struct sk_buff **head,
 					struct sk_buff *skb)
 {
@@ -107,6 +144,112 @@ out_unlock:
 	return err;
 }
 
+static struct sk_buff **gue_gro_receive(struct sk_buff **head,
+					struct sk_buff *skb)
+{
+	const struct net_offload **offloads;
+	const struct net_offload *ops;
+	struct sk_buff **pp = NULL;
+	struct sk_buff *p;
+	u8 proto;
+	struct guehdr *guehdr;
+	unsigned int hlen, guehlen;
+	unsigned int off;
+	int flush = 1;
+
+	off = skb_gro_offset(skb);
+	hlen = off + sizeof(*guehdr);
+	guehdr = skb_gro_header_fast(skb, off);
+	if (skb_gro_header_hard(skb, hlen)) {
+		guehdr = skb_gro_header_slow(skb, hlen, off);
+		if (unlikely(!guehdr))
+			goto out;
+	}
+
+	proto = guehdr->next_hdr;
+
+	rcu_read_lock();
+	offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
+	ops = rcu_dereference(offloads[proto]);
+	if (WARN_ON(!ops || !ops->callbacks.gro_receive))
+		goto out_unlock;
+
+	guehlen = sizeof(*guehdr) + (guehdr->hlen << 2);
+
+	hlen = off + guehlen;
+	if (skb_gro_header_hard(skb, hlen)) {
+		guehdr = skb_gro_header_slow(skb, hlen, off);
+		if (unlikely(!guehdr))
+			goto out_unlock;
+	}
+
+	flush = 0;
+
+	for (p = *head; p; p = p->next) {
+		const struct guehdr *guehdr2;
+
+		if (!NAPI_GRO_CB(p)->same_flow)
+			continue;
+
+		guehdr2 = (struct guehdr *)(p->data + off);
+
+		/* Compare base GUE header to be equal (covers
+		 * hlen, version, next_hdr, and flags.
+		 */
+		if (guehdr->word != guehdr2->word) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+
+		/* Compare optional fields are the same. */
+		if (guehdr->hlen && memcmp(&guehdr[1], &guehdr2[1],
+					   guehdr->hlen << 2)) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+	}
+
+	skb_gro_pull(skb, guehlen);
+
+	/* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/
+	skb_gro_postpull_rcsum(skb, guehdr, guehlen);
+
+	pp = ops->callbacks.gro_receive(head, skb);
+
+out_unlock:
+	rcu_read_unlock();
+out:
+	NAPI_GRO_CB(skb)->flush |= flush;
+
+	return pp;
+}
+
+static int gue_gro_complete(struct sk_buff *skb, int nhoff)
+{
+	const struct net_offload **offloads;
+	struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff);
+	const struct net_offload *ops;
+	unsigned int guehlen;
+	u8 proto;
+	int err = -ENOENT;
+
+	proto = guehdr->next_hdr;
+
+	guehlen = sizeof(*guehdr) + (guehdr->hlen << 2);
+
+	rcu_read_lock();
+	offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
+	ops = rcu_dereference(offloads[proto]);
+	if (WARN_ON(!ops || !ops->callbacks.gro_complete))
+		goto out_unlock;
+
+	err = ops->callbacks.gro_complete(skb, nhoff + guehlen);
+
+out_unlock:
+	rcu_read_unlock();
+	return err;
+}
+
 static int fou_add_to_port_list(struct fou *fou)
 {
 	struct fou *fout;
@@ -142,6 +285,28 @@ static void fou_release(struct fou *fou)
 	kfree(fou);
 }
 
+static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
+{
+	udp_sk(sk)->encap_rcv = fou_udp_recv;
+	fou->protocol = cfg->protocol;
+	fou->udp_offloads.callbacks.gro_receive = fou_gro_receive;
+	fou->udp_offloads.callbacks.gro_complete = fou_gro_complete;
+	fou->udp_offloads.port = cfg->udp_config.local_udp_port;
+	fou->udp_offloads.ipproto = cfg->protocol;
+
+	return 0;
+}
+
+static int gue_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
+{
+	udp_sk(sk)->encap_rcv = gue_udp_recv;
+	fou->udp_offloads.callbacks.gro_receive = gue_gro_receive;
+	fou->udp_offloads.callbacks.gro_complete = gue_gro_complete;
+	fou->udp_offloads.port = cfg->udp_config.local_udp_port;
+
+	return 0;
+}
+
 static int fou_create(struct net *net, struct fou_cfg *cfg,
 		      struct socket **sockp)
 {
@@ -164,10 +329,24 @@ static int fou_create(struct net *net, struct fou_cfg *cfg,
 
 	sk = sock->sk;
 
-	/* Mark socket as an encapsulation socket. See net/ipv4/udp.c */
-	fou->protocol = cfg->protocol;
-	fou->port =  cfg->udp_config.local_udp_port;
-	udp_sk(sk)->encap_rcv = fou_udp_recv;
+	fou->port = cfg->udp_config.local_udp_port;
+
+	/* Initial for fou type */
+	switch (cfg->type) {
+	case FOU_ENCAP_DIRECT:
+		err = fou_encap_init(sk, fou, cfg);
+		if (err)
+			goto error;
+		break;
+	case FOU_ENCAP_GUE:
+		err = gue_encap_init(sk, fou, cfg);
+		if (err)
+			goto error;
+		break;
+	default:
+		err = -EINVAL;
+		goto error;
+	}
 
 	udp_sk(sk)->encap_type = 1;
 	udp_encap_enable();
@@ -179,11 +358,6 @@ static int fou_create(struct net *net, struct fou_cfg *cfg,
 
 	sk->sk_allocation = GFP_ATOMIC;
 
-	fou->udp_offloads.callbacks.gro_receive = fou_gro_receive;
-	fou->udp_offloads.callbacks.gro_complete = fou_gro_complete;
-	fou->udp_offloads.port = cfg->udp_config.local_udp_port;
-	fou->udp_offloads.ipproto = cfg->protocol;
-
 	if (cfg->udp_config.family == AF_INET) {
 		err = udp_add_offload(&fou->udp_offloads);
 		if (err)
@@ -240,6 +414,7 @@ static struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = {
 	[FOU_ATTR_PORT] = { .type = NLA_U16, },
 	[FOU_ATTR_AF] = { .type = NLA_U8, },
 	[FOU_ATTR_IPPROTO] = { .type = NLA_U8, },
+	[FOU_ATTR_TYPE] = { .type = NLA_U8, },
 };
 
 static int parse_nl_config(struct genl_info *info,
@@ -267,6 +442,9 @@ static int parse_nl_config(struct genl_info *info,
 	if (info->attrs[FOU_ATTR_IPPROTO])
 		cfg->protocol = nla_get_u8(info->attrs[FOU_ATTR_IPPROTO]);
 
+	if (info->attrs[FOU_ATTR_TYPE])
+		cfg->type = nla_get_u8(info->attrs[FOU_ATTR_TYPE]);
+
 	return 0;
 }
 
-- 
2.1.0.rc2.206.gedb03e5

^ permalink raw reply related

* [PATCH net-next 2/4] fou: eliminate IPv4,v6 specific GRO functions
From: Tom Herbert @ 2014-10-02  4:46 UTC (permalink / raw)
  To: davem, netdev
In-Reply-To: <1412225199-24942-1-git-send-email-therbert@google.com>

This patch removes the fou[46]_gro_receive and fou[46]_gro_complete
functions. The v4 or v6 variants were chosen for the UDP offloads
based on the address family of the socket; this is not necessary
or correct. Instead, this patch adds is_ipv6 to napi_gro_cb.
This is set in udp6_gro_receive and unset in udp4_gro_receive. In
fou_gro_receive the value is used to select the correct inet_offloads
for the protocol of the outer IP header.

Signed-off-by: Tom Herbert <therbert@google.com>
---
 include/linux/netdevice.h |  3 +++
 net/ipv4/fou.c            | 48 ++++++++---------------------------------------
 net/ipv4/udp_offload.c    |  1 +
 net/ipv6/udp_offload.c    |  1 +
 4 files changed, 13 insertions(+), 40 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9b7fbac..640f8d8 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1886,6 +1886,9 @@ struct napi_gro_cb {
 	/* Number of checksums via CHECKSUM_UNNECESSARY */
 	u8	csum_cnt:3;
 
+	/* Used in foo-over-udp, set in udp[46]_gro_receive */
+	u8	is_ipv6:1;
+
 	/* used to support CHECKSUM_COMPLETE for tunneling protocols */
 	__wsum	csum;
 
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index dced89f..7e2126a 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -65,14 +65,15 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb)
 }
 
 static struct sk_buff **fou_gro_receive(struct sk_buff **head,
-					struct sk_buff *skb,
-					const struct net_offload **offloads)
+					struct sk_buff *skb)
 {
 	const struct net_offload *ops;
 	struct sk_buff **pp = NULL;
 	u8 proto = NAPI_GRO_CB(skb)->proto;
+	const struct net_offload **offloads;
 
 	rcu_read_lock();
+	offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
 	ops = rcu_dereference(offloads[proto]);
 	if (!ops || !ops->callbacks.gro_receive)
 		goto out_unlock;
@@ -85,14 +86,15 @@ out_unlock:
 	return pp;
 }
 
-static int fou_gro_complete(struct sk_buff *skb, int nhoff,
-			    const struct net_offload **offloads)
+static int fou_gro_complete(struct sk_buff *skb, int nhoff)
 {
 	const struct net_offload *ops;
 	u8 proto = NAPI_GRO_CB(skb)->proto;
 	int err = -ENOSYS;
+	const struct net_offload **offloads;
 
 	rcu_read_lock();
+	offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
 	ops = rcu_dereference(offloads[proto]);
 	if (WARN_ON(!ops || !ops->callbacks.gro_complete))
 		goto out_unlock;
@@ -105,28 +107,6 @@ out_unlock:
 	return err;
 }
 
-static struct sk_buff **fou4_gro_receive(struct sk_buff **head,
-					 struct sk_buff *skb)
-{
-	return fou_gro_receive(head, skb, inet_offloads);
-}
-
-static int fou4_gro_complete(struct sk_buff *skb, int nhoff)
-{
-	return fou_gro_complete(skb, nhoff, inet_offloads);
-}
-
-static struct sk_buff **fou6_gro_receive(struct sk_buff **head,
-					 struct sk_buff *skb)
-{
-	return fou_gro_receive(head, skb, inet6_offloads);
-}
-
-static int fou6_gro_complete(struct sk_buff *skb, int nhoff)
-{
-	return fou_gro_complete(skb, nhoff, inet6_offloads);
-}
-
 static int fou_add_to_port_list(struct fou *fou)
 {
 	struct fou *fout;
@@ -199,20 +179,8 @@ static int fou_create(struct net *net, struct fou_cfg *cfg,
 
 	sk->sk_allocation = GFP_ATOMIC;
 
-	switch (cfg->udp_config.family) {
-	case AF_INET:
-		fou->udp_offloads.callbacks.gro_receive = fou4_gro_receive;
-		fou->udp_offloads.callbacks.gro_complete = fou4_gro_complete;
-		break;
-	case AF_INET6:
-		fou->udp_offloads.callbacks.gro_receive = fou6_gro_receive;
-		fou->udp_offloads.callbacks.gro_complete = fou6_gro_complete;
-		break;
-	default:
-		err = -EPFNOSUPPORT;
-		goto error;
-	}
-
+	fou->udp_offloads.callbacks.gro_receive = fou_gro_receive;
+	fou->udp_offloads.callbacks.gro_complete = fou_gro_complete;
 	fou->udp_offloads.port = cfg->udp_config.local_udp_port;
 	fou->udp_offloads.ipproto = cfg->protocol;
 
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 8c35f2c..507310e 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -334,6 +334,7 @@ static struct sk_buff **udp4_gro_receive(struct sk_buff **head,
 		skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
 					     inet_gro_compute_pseudo);
 skip:
+	NAPI_GRO_CB(skb)->is_ipv6 = 0;
 	return udp_gro_receive(head, skb, uh);
 
 flush:
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 8f96988..6b8f543 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -140,6 +140,7 @@ static struct sk_buff **udp6_gro_receive(struct sk_buff **head,
 					     ip6_gro_compute_pseudo);
 
 skip:
+	NAPI_GRO_CB(skb)->is_ipv6 = 1;
 	return udp_gro_receive(head, skb, uh);
 
 flush:
-- 
2.1.0.rc2.206.gedb03e5

^ permalink raw reply related

* [PATCH net-next 1/4] ip_tunnel: Account for secondary encapsulation header in max_headroom
From: Tom Herbert @ 2014-10-02  4:46 UTC (permalink / raw)
  To: davem, netdev
In-Reply-To: <1412225199-24942-1-git-send-email-therbert@google.com>

When adjusting max_headroom for the tunnel interface based on the egress
device, we need to account for any extra bytes in secondary encapsulation
(e.g. FOU).

Signed-off-by: Tom Herbert <therbert@google.com>
---
 net/ipv4/ip_tunnel.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index b75b47b..54ace25 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -759,7 +759,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 		df |= (inner_iph->frag_off&htons(IP_DF));
 
 	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
-			+ rt->dst.header_len;
+			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
 	if (max_headroom > dev->needed_headroom)
 		dev->needed_headroom = max_headroom;
 
-- 
2.1.0.rc2.206.gedb03e5

^ permalink raw reply related

* [PATCH net-next 0/4] net: Generic UDP Encapsulation
From: Tom Herbert @ 2014-10-02  4:46 UTC (permalink / raw)
  To: davem, netdev

Generic UDP Encapsulation (GUE) is a UDP encapsulation protocol that
encapsulates packets of various IP protocols. The GUE protocol is
described in http://tools.ietf.org/html/draft-herbert-gue-01.

The receive path of GUE is implemented in the Foo over UDP module (FOU).
This includes a UDP encap receive function for GUE as well as GUE
specific GRO functions. Management and configuration of GUE ports shares
most of the same code with FOU.

For the transmit path, the previous FOU support for IPIP, sit, and GRE
was simply extended for GUE (when GUE is enabled insert the GUE
header on transmit in addition to UDP header inserted for FOU).

Semantically GUE is the same as FOU in that the encapsulation (UDP
and GUE headers) is inserted on transmission and removed on
reception so that the IP packet is processed with the inner header.

This patch set includes:
 - Some fixes to FOU, removal of IPv4,v6 specific GRO functions
 - Support to configure a GUE receive port
 - Implementation of GUE receive path (normal and GRO)
 - Additions to ip_tunnel netlink to configure GUE
 - GUE header insertion in ip_tunnel transmit path

Follow on patches will include

Testing:

I ran performance numbers using netperf TCP_RR with 200 streams,
comparing encapsulation without GUE, encapsulation with GUE, and
encapsulation with FOU.

 GRE
    TCP_STREAM
      IPv4, FOU, UDP checksum enabled
        14.04% TX CPU utilization
        13.17% RX CPU utilization
        9211 Mbps
      IPv4, GUE, UDP checksum enabled
        14.99% TX CPU utilization
        13.79% RX CPU utilization
        9185 Mbps
      IPv4, FOU, UDP checksum disabled
        13.14% TX CPU utilization
        23.18% RX CPU utilization
        9277 Mbps
      IPv4, GUE, UDP checksum disabled
        13.66% TX CPU utilization
        23.57% RX CPU utilization
        9184 Mbps
    TCP_RR
      IPv4, FOU, UDP checksum enabled
        94.2% CPU utilization
        155/249/460 90/95/99% latencies
        1.17018e+06 tps
      IPv4, GUE, UDP checksum enabled
        93.9% CPU utilization
        158/253/472 90/95/99% latencies
        1.15045e+06 tps


  IPIP
    TCP_STREAM
      FOU, UDP checksum enabled
        15.28% TX CPU utilization
        13.92% RX CPU utilization
        9342 Mbps
      GUE, UDP checksum enabled
        13.99% TX CPU utilization
        13.34% RX CPU utilization
        9210 Mbps
      FOU, UDP checksum disabled
        15.08% TX CPU utilization
        24.64% RX CPU utilization
        9226 Mbps
      GUE, UDP checksum disabled
        15.90% TX CPU utilization
        24.77% RX CPU utilization
        9197 Mbps
    TCP_RR
      FOU, UDP checksum enabled
        94.23% CPU utilization
        149/237/429 90/95/99% latencies
        1.19553e+06 tps
      GUE, UDP checksum enabled
        93.75% CPU utilization
        152/243/442 90/95/99% latencies
        1.17027e+06 tps

  SIT
    TCP_STREAM
      FOU, UDP checksum enabled
        14.47% TX CPU utilization
        14.58% RX CPU utilization
        9106 Mbps
      GUE, UDP checksum enabled
        15.09% TX CPU utilization
        14.84% RX CPU utilization
        9080 Mbps
      FOU, UDP checksum disabled
        15.70% TX CPU utilization
        27.93% RX CPU utilization
        9097 Mbps
      GUE, UDP checksum disabled
        15.04% TX CPU utilization
        27.54% RX CPU utilization
        9073 Mbps
    TCP_RR
      FOU, UDP checksum enabled
        96.9% CPU utilization
        170/281/581 90/95/99% latencies
        1.03372e+06 tps
      GUE, UDP checksum enabled
        97.16% CPU utilization
        172/286/576 90/95/99% latencies
        1.00469e+06 tps

Tom Herbert (4):
  ip_tunnel: Account for secondary encapsulation header in max_headroom
  fou: eliminate IPv4,v6 specific GRO functions
  gue: Receive side for Generic UDP Encapsulation
  ip_tunnel: Add GUE support

 include/linux/netdevice.h      |   3 +
 include/uapi/linux/fou.h       |   7 ++
 include/uapi/linux/if_tunnel.h |   1 +
 net/ipv4/fou.c                 | 224 ++++++++++++++++++++++++++++++++++-------
 net/ipv4/ip_tunnel.c           |  15 ++-
 net/ipv4/udp_offload.c         |   1 +
 net/ipv6/udp_offload.c         |   1 +
 7 files changed, 212 insertions(+), 40 deletions(-)

-- 
2.1.0.rc2.206.gedb03e5

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox