Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next v2 13/14] openvswitch: Refactor action alloc and copy api.
From: Pravin B Shelar @ 2014-11-06  9:19 UTC (permalink / raw)
  To: davem; +Cc: netdev, Pravin B Shelar

There are two separate APIs for allocating and copying an actions list.
Any time OVS needs to copy an action list, it has to call both functions.
This patch moves the action allocation into the copy function to avoid
code duplication.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
---
 net/openvswitch/datapath.c     | 25 ++++---------------------
 net/openvswitch/flow_netlink.c | 24 +++++++++++++++++-------
 net/openvswitch/flow_netlink.h |  1 -
 3 files changed, 21 insertions(+), 29 deletions(-)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 5101780..014485e 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -543,18 +543,12 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 	if (err)
 		goto err_flow_free;
 
-	acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_PACKET_ATTR_ACTIONS]));
-	err = PTR_ERR(acts);
-	if (IS_ERR(acts))
-		goto err_flow_free;
-
 	err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS],
 				   &flow->key, &acts);
 	if (err)
 		goto err_flow_free;
 
 	rcu_assign_pointer(flow->sf_acts, acts);
-
 	OVS_CB(packet)->egress_tun_info = NULL;
 	packet->priority = flow->key.phy.priority;
 	packet->mark = flow->key.phy.skb_mark;
@@ -872,16 +866,11 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
 	ovs_flow_mask_key(&new_flow->key, &new_flow->unmasked_key, &mask);
 
 	/* Validate actions. */
-	acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_FLOW_ATTR_ACTIONS]));
-	error = PTR_ERR(acts);
-	if (IS_ERR(acts))
-		goto err_kfree_flow;
-
 	error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key,
 				     &acts);
 	if (error) {
 		OVS_NLERR("Flow actions may not be safe on all matching packets.\n");
-		goto err_kfree_acts;
+		goto err_kfree_flow;
 	}
 
 	reply = ovs_flow_cmd_alloc_info(acts, info, false);
@@ -972,6 +961,7 @@ error:
 	return error;
 }
 
+/* Factor out action copy to avoid "Wframe-larger-than=1024" warning. */
 static struct sw_flow_actions *get_flow_actions(const struct nlattr *a,
 						const struct sw_flow_key *key,
 						const struct sw_flow_mask *mask)
@@ -980,15 +970,10 @@ static struct sw_flow_actions *get_flow_actions(const struct nlattr *a,
 	struct sw_flow_key masked_key;
 	int error;
 
-	acts = ovs_nla_alloc_flow_actions(nla_len(a));
-	if (IS_ERR(acts))
-		return acts;
-
 	ovs_flow_mask_key(&masked_key, key, mask);
 	error = ovs_nla_copy_actions(a, &masked_key, &acts);
 	if (error) {
-		OVS_NLERR("Flow actions may not be safe on all matching packets.\n");
-		kfree(acts);
+		OVS_NLERR("Actions may not be safe on all matching packets.\n");
 		return ERR_PTR(error);
 	}
 
@@ -1028,10 +1013,8 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
 			error = PTR_ERR(acts);
 			goto error;
 		}
-	}
 
-	/* Can allocate before locking if have acts. */
-	if (acts) {
+		/* Can allocate before locking if have acts. */
 		reply = ovs_flow_cmd_alloc_info(acts, info, false);
 		if (IS_ERR(reply)) {
 			error = PTR_ERR(reply);
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 1050b28..482a0cb 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -1284,7 +1284,7 @@ nla_put_failure:
 
 #define MAX_ACTIONS_BUFSIZE	(32 * 1024)
 
-struct sw_flow_actions *ovs_nla_alloc_flow_actions(int size)
+static struct sw_flow_actions *nla_alloc_flow_actions(int size)
 {
 	struct sw_flow_actions *sfa;
 
@@ -1329,7 +1329,7 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa,
 		new_acts_size = MAX_ACTIONS_BUFSIZE;
 	}
 
-	acts = ovs_nla_alloc_flow_actions(new_acts_size);
+	acts = nla_alloc_flow_actions(new_acts_size);
 	if (IS_ERR(acts))
 		return (void *)acts;
 
@@ -1396,7 +1396,7 @@ static inline void add_nested_action_end(struct sw_flow_actions *sfa,
 	a->nla_len = sfa->actions_len - st_offset;
 }
 
-static int ovs_nla_copy_actions__(const struct nlattr *attr,
+static int __ovs_nla_copy_actions(const struct nlattr *attr,
 				  const struct sw_flow_key *key,
 				  int depth, struct sw_flow_actions **sfa,
 				  __be16 eth_type, __be16 vlan_tci);
@@ -1441,7 +1441,7 @@ static int validate_and_copy_sample(const struct nlattr *attr,
 	if (st_acts < 0)
 		return st_acts;
 
-	err = ovs_nla_copy_actions__(actions, key, depth + 1, sfa,
+	err = __ovs_nla_copy_actions(actions, key, depth + 1, sfa,
 				     eth_type, vlan_tci);
 	if (err)
 		return err;
@@ -1684,7 +1684,7 @@ static int copy_action(const struct nlattr *from,
 	return 0;
 }
 
-static int ovs_nla_copy_actions__(const struct nlattr *attr,
+static int __ovs_nla_copy_actions(const struct nlattr *attr,
 				  const struct sw_flow_key *key,
 				  int depth, struct sw_flow_actions **sfa,
 				  __be16 eth_type, __be16 vlan_tci)
@@ -1846,8 +1846,18 @@ int ovs_nla_copy_actions(const struct nlattr *attr,
 			 const struct sw_flow_key *key,
 			 struct sw_flow_actions **sfa)
 {
-	return ovs_nla_copy_actions__(attr, key, 0, sfa, key->eth.type,
-				      key->eth.tci);
+	int err;
+
+	*sfa = nla_alloc_flow_actions(nla_len(attr));
+	if (IS_ERR(*sfa))
+		return PTR_ERR(*sfa);
+
+	err = __ovs_nla_copy_actions(attr, key, 0, sfa, key->eth.type,
+				     key->eth.tci);
+	if (err)
+		kfree(*sfa);
+
+	return err;
 }
 
 static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb)
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index 4f03706..eb0b177 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -56,7 +56,6 @@ int ovs_nla_copy_actions(const struct nlattr *attr,
 int ovs_nla_put_actions(const struct nlattr *attr,
 			int len, struct sk_buff *skb);
 
-struct sw_flow_actions *ovs_nla_alloc_flow_actions(int actions_len);
 void ovs_nla_free_flow_actions(struct sw_flow_actions *);
 
 #endif /* flow_netlink.h */
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next v2 12/14] openvswitch: Move key_attr_size() to flow_netlink.h.
From: Pravin B Shelar @ 2014-11-06  9:19 UTC (permalink / raw)
  To: davem; +Cc: netdev, Joe Stringer, Pravin B Shelar

From: Joe Stringer <joestringer@nicira.com>

flow-netlink has netlink related code.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
 net/openvswitch/datapath.c     | 31 +++----------------------------
 net/openvswitch/flow_netlink.c | 32 ++++++++++++++++++++++++++++++++
 net/openvswitch/flow_netlink.h |  2 ++
 3 files changed, 37 insertions(+), 28 deletions(-)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 4fd8a45..5101780 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -375,37 +375,12 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
 	return err;
 }
 
-static size_t key_attr_size(void)
-{
-	return    nla_total_size(4)   /* OVS_KEY_ATTR_PRIORITY */
-		+ nla_total_size(0)   /* OVS_KEY_ATTR_TUNNEL */
-		  + nla_total_size(8)   /* OVS_TUNNEL_KEY_ATTR_ID */
-		  + nla_total_size(4)   /* OVS_TUNNEL_KEY_ATTR_IPV4_SRC */
-		  + nla_total_size(4)   /* OVS_TUNNEL_KEY_ATTR_IPV4_DST */
-		  + nla_total_size(1)   /* OVS_TUNNEL_KEY_ATTR_TOS */
-		  + nla_total_size(1)   /* OVS_TUNNEL_KEY_ATTR_TTL */
-		  + nla_total_size(0)   /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */
-		  + nla_total_size(0)   /* OVS_TUNNEL_KEY_ATTR_CSUM */
-		  + nla_total_size(0)   /* OVS_TUNNEL_KEY_ATTR_OAM */
-		  + nla_total_size(256)   /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */
-		+ nla_total_size(4)   /* OVS_KEY_ATTR_IN_PORT */
-		+ nla_total_size(4)   /* OVS_KEY_ATTR_SKB_MARK */
-		+ nla_total_size(12)  /* OVS_KEY_ATTR_ETHERNET */
-		+ nla_total_size(2)   /* OVS_KEY_ATTR_ETHERTYPE */
-		+ nla_total_size(4)   /* OVS_KEY_ATTR_8021Q */
-		+ nla_total_size(0)   /* OVS_KEY_ATTR_ENCAP */
-		+ nla_total_size(2)   /* OVS_KEY_ATTR_ETHERTYPE */
-		+ nla_total_size(40)  /* OVS_KEY_ATTR_IPV6 */
-		+ nla_total_size(2)   /* OVS_KEY_ATTR_ICMPV6 */
-		+ nla_total_size(28); /* OVS_KEY_ATTR_ND */
-}
-
 static size_t upcall_msg_size(const struct nlattr *userdata,
 			      unsigned int hdrlen)
 {
 	size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
 		+ nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
-		+ nla_total_size(key_attr_size()); /* OVS_PACKET_ATTR_KEY */
+		+ nla_total_size(ovs_key_attr_size()); /* OVS_PACKET_ATTR_KEY */
 
 	/* OVS_PACKET_ATTR_USERDATA */
 	if (userdata)
@@ -678,8 +653,8 @@ static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats,
 static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts)
 {
 	return NLMSG_ALIGN(sizeof(struct ovs_header))
-		+ nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_KEY */
-		+ nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_MASK */
+		+ nla_total_size(ovs_key_attr_size()) /* OVS_FLOW_ATTR_KEY */
+		+ nla_total_size(ovs_key_attr_size()) /* OVS_FLOW_ATTR_MASK */
 		+ nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */
 		+ nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */
 		+ nla_total_size(8) /* OVS_FLOW_ATTR_USED */
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 1b29ea7..1050b28 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -252,6 +252,38 @@ static bool match_validate(const struct sw_flow_match *match,
 	return true;
 }
 
+size_t ovs_key_attr_size(void)
+{
+	/* Whenever adding new OVS_KEY_ FIELDS, we should consider
+	 * updating this function.
+	 */
+	BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 22);
+
+	return    nla_total_size(4)   /* OVS_KEY_ATTR_PRIORITY */
+		+ nla_total_size(0)   /* OVS_KEY_ATTR_TUNNEL */
+		  + nla_total_size(8)   /* OVS_TUNNEL_KEY_ATTR_ID */
+		  + nla_total_size(4)   /* OVS_TUNNEL_KEY_ATTR_IPV4_SRC */
+		  + nla_total_size(4)   /* OVS_TUNNEL_KEY_ATTR_IPV4_DST */
+		  + nla_total_size(1)   /* OVS_TUNNEL_KEY_ATTR_TOS */
+		  + nla_total_size(1)   /* OVS_TUNNEL_KEY_ATTR_TTL */
+		  + nla_total_size(0)   /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */
+		  + nla_total_size(0)   /* OVS_TUNNEL_KEY_ATTR_CSUM */
+		  + nla_total_size(0)   /* OVS_TUNNEL_KEY_ATTR_OAM */
+		  + nla_total_size(256) /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */
+		+ nla_total_size(4)   /* OVS_KEY_ATTR_IN_PORT */
+		+ nla_total_size(4)   /* OVS_KEY_ATTR_SKB_MARK */
+		+ nla_total_size(4)   /* OVS_KEY_ATTR_DP_HASH */
+		+ nla_total_size(4)   /* OVS_KEY_ATTR_RECIRC_ID */
+		+ nla_total_size(12)  /* OVS_KEY_ATTR_ETHERNET */
+		+ nla_total_size(2)   /* OVS_KEY_ATTR_ETHERTYPE */
+		+ nla_total_size(4)   /* OVS_KEY_ATTR_VLAN */
+		+ nla_total_size(0)   /* OVS_KEY_ATTR_ENCAP */
+		+ nla_total_size(2)   /* OVS_KEY_ATTR_ETHERTYPE */
+		+ nla_total_size(40)  /* OVS_KEY_ATTR_IPV6 */
+		+ nla_total_size(2)   /* OVS_KEY_ATTR_ICMPV6 */
+		+ nla_total_size(28); /* OVS_KEY_ATTR_ND */
+}
+
 /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute.  */
 static const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
 	[OVS_KEY_ATTR_ENCAP] = -1,
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index 6355b1d..4f03706 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -37,6 +37,8 @@
 
 #include "flow.h"
 
+size_t ovs_key_attr_size(void);
+
 void ovs_match_init(struct sw_flow_match *match,
 		    struct sw_flow_key *key, struct sw_flow_mask *mask);
 
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next v2 11/14] openvswitch: Remove flow member from struct ovs_skb_cb
From: Pravin B Shelar @ 2014-11-06  9:19 UTC (permalink / raw)
  To: davem; +Cc: netdev, Lorand Jakab, Pravin B Shelar

From: Lorand Jakab <lojakab@cisco.com>

The 'flow' member was chosen for removal because it is only used
in ovs_execute_actions(), so we can pass it as an argument to that
function instead.

Signed-off-by: Lorand Jakab <lojakab@cisco.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
 net/openvswitch/actions.c  |  5 +----
 net/openvswitch/datapath.c | 12 +++++++-----
 net/openvswitch/datapath.h |  4 +---
 3 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 9fd33c0..f7e5891 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -865,14 +865,11 @@ static void process_deferred_actions(struct datapath *dp)
 
 /* Execute a list of actions against 'skb'. */
 int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
-			struct sw_flow_key *key)
+			struct sw_flow_actions *acts, struct sw_flow_key *key)
 {
 	int level = this_cpu_read(exec_actions_level);
-	struct sw_flow_actions *acts;
 	int err;
 
-	acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts);
-
 	this_cpu_inc(exec_actions_level);
 	OVS_CB(skb)->egress_tun_info = NULL;
 	err = do_execute_actions(dp, skb, key,
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index cdbc44c..4fd8a45 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -257,6 +257,7 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
 	const struct vport *p = OVS_CB(skb)->input_vport;
 	struct datapath *dp = p->dp;
 	struct sw_flow *flow;
+	struct sw_flow_actions *sf_acts;
 	struct dp_stats_percpu *stats;
 	u64 *stats_counter;
 	u32 n_mask_hit;
@@ -282,10 +283,10 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
 		goto out;
 	}
 
-	OVS_CB(skb)->flow = flow;
+	ovs_flow_stats_update(flow, key->tp.flags, skb);
+	sf_acts = rcu_dereference(flow->sf_acts);
+	ovs_execute_actions(dp, skb, sf_acts, key);
 
-	ovs_flow_stats_update(OVS_CB(skb)->flow, key->tp.flags, skb);
-	ovs_execute_actions(dp, skb, key);
 	stats_counter = &stats->n_hit;
 
 out:
@@ -524,6 +525,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 	struct sw_flow_actions *acts;
 	struct sk_buff *packet;
 	struct sw_flow *flow;
+	struct sw_flow_actions *sf_acts;
 	struct datapath *dp;
 	struct ethhdr *eth;
 	struct vport *input_vport;
@@ -579,7 +581,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 	rcu_assign_pointer(flow->sf_acts, acts);
 
 	OVS_CB(packet)->egress_tun_info = NULL;
-	OVS_CB(packet)->flow = flow;
 	packet->priority = flow->key.phy.priority;
 	packet->mark = flow->key.phy.skb_mark;
 
@@ -597,9 +598,10 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 		goto err_unlock;
 
 	OVS_CB(packet)->input_vport = input_vport;
+	sf_acts = rcu_dereference(flow->sf_acts);
 
 	local_bh_disable();
-	err = ovs_execute_actions(dp, packet, &flow->key);
+	err = ovs_execute_actions(dp, packet, sf_acts, &flow->key);
 	local_bh_enable();
 	rcu_read_unlock();
 
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 9741354..1c56a80 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -94,14 +94,12 @@ struct datapath {
 
 /**
  * struct ovs_skb_cb - OVS data in skb CB
- * @flow: The flow associated with this packet.  May be %NULL if no flow.
  * @egress_tun_key: Tunnel information about this packet on egress path.
  * NULL if the packet is not being tunneled.
  * @input_vport: The original vport packet came in on. This value is cached
  * when a packet is received by OVS.
  */
 struct ovs_skb_cb {
-	struct sw_flow		*flow;
 	struct ovs_tunnel_info  *egress_tun_info;
 	struct vport		*input_vport;
 };
@@ -194,7 +192,7 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq,
 					 u8 cmd);
 
 int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
-			struct sw_flow_key *);
+			struct sw_flow_actions *acts, struct sw_flow_key *);
 
 void ovs_dp_notify_wq(struct work_struct *work);
 
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next v2 10/14] openvswitch: Fix the type of struct ovs_key_nd nd_target field.
From: Pravin B Shelar @ 2014-11-06  9:19 UTC (permalink / raw)
  To: davem; +Cc: netdev, Jarno Rajahalme, Pravin B Shelar

From: Jarno Rajahalme <jrajahalme@nicira.com>

Should be the same as other IPv6 address fields.

Current master produces sparse warnings without this change.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
 include/uapi/linux/openvswitch.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 631056b..26c36c4 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -400,9 +400,9 @@ struct ovs_key_arp {
 };

 struct ovs_key_nd {
-	__u32 nd_target[4];
-	__u8  nd_sll[ETH_ALEN];
-	__u8  nd_tll[ETH_ALEN];
+	__be32	nd_target[4];
+	__u8	nd_sll[ETH_ALEN];
+	__u8	nd_tll[ETH_ALEN];
 };

 /**
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next v2 09/14] openvswitch: Drop packets when interdev is not up
From: Pravin B Shelar @ 2014-11-06  9:19 UTC (permalink / raw)
  To: davem; +Cc: netdev, Chunhe Li, Pravin B Shelar

From: Chunhe Li <lichunhe@huawei.com>

If the internal device is not up, it should drop received
packets. Sometimes it receives broadcast or multicast
packets, and the IP protocol stack then wastes CPU time
processing them.

Signed-off-by: Chunhe Li <lichunhe@huawei.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
 net/openvswitch/vport-internal_dev.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index 10dc07e..6a55f71 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -224,6 +224,11 @@ static int internal_dev_recv(struct vport *vport, struct sk_buff *skb)
 	struct net_device *netdev = netdev_vport_priv(vport)->dev;
 	int len;
 
+	if (unlikely(!(netdev->flags & IFF_UP))) {
+		kfree_skb(skb);
+		return 0;
+	}
+
 	len = skb->len;
 
 	skb_dst_drop(skb);
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next v2 08/14] openvswitch: Refactor get_dp() function into multiple access APIs.
From: Pravin B Shelar @ 2014-11-06  9:19 UTC (permalink / raw)
  To: davem; +Cc: netdev, Andy Zhou, Pravin B Shelar

From: Andy Zhou <azhou@nicira.com>

Avoid recursive rcu_read_lock() by using the lighter weight
get_dp_rcu() API. Add proper locking assertions to get_dp().

Signed-off-by: Andy Zhou <azhou@nicira.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
 net/openvswitch/datapath.c | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index bbb920b..cdbc44c 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -140,19 +140,30 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *,
 static int queue_userspace_packet(struct datapath *dp, struct sk_buff *,
 				  const struct dp_upcall_info *);
 
-/* Must be called with rcu_read_lock or ovs_mutex. */
-static struct datapath *get_dp(struct net *net, int dp_ifindex)
+/* Must be called with rcu_read_lock. */
+static struct datapath *get_dp_rcu(struct net *net, int dp_ifindex)
 {
-	struct datapath *dp = NULL;
-	struct net_device *dev;
+	struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex);
 
-	rcu_read_lock();
-	dev = dev_get_by_index_rcu(net, dp_ifindex);
 	if (dev) {
 		struct vport *vport = ovs_internal_dev_get_vport(dev);
 		if (vport)
-			dp = vport->dp;
+			return vport->dp;
 	}
+
+	return NULL;
+}
+
+/* The caller must hold either ovs_mutex or rcu_read_lock to keep the
+ * returned dp pointer valid.
+ */
+static inline struct datapath *get_dp(struct net *net, int dp_ifindex)
+{
+	struct datapath *dp;
+
+	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held());
+	rcu_read_lock();
+	dp = get_dp_rcu(net, dp_ifindex);
 	rcu_read_unlock();
 
 	return dp;
@@ -573,7 +584,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 	packet->mark = flow->key.phy.skb_mark;
 
 	rcu_read_lock();
-	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
+	dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex);
 	err = -ENODEV;
 	if (!dp)
 		goto err_unlock;
@@ -1227,7 +1238,7 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	struct datapath *dp;
 
 	rcu_read_lock();
-	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
+	dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex);
 	if (!dp) {
 		rcu_read_unlock();
 		return -ENODEV;
@@ -1989,7 +2000,7 @@ static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	int i, j = 0;
 
 	rcu_read_lock();
-	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
+	dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex);
 	if (!dp) {
 		rcu_read_unlock();
 		return -ENODEV;
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next v2 07/14] openvswitch: Refactor ovs_flow_cmd_fill_info().
From: Pravin B Shelar @ 2014-11-06  9:19 UTC (permalink / raw)
  To: davem; +Cc: netdev, Joe Stringer, Pravin B Shelar

From: Joe Stringer <joestringer@nicira.com>

Split up ovs_flow_cmd_fill_info() to make it easier to cache parts of a
dump reply. This will be used to streamline flow_dump in a future patch.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Thomas Graf <tgraf@noironetworks.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
 net/openvswitch/datapath.c | 93 ++++++++++++++++++++++++++++++++--------------
 1 file changed, 66 insertions(+), 27 deletions(-)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 04a26ae..bbb920b 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -674,58 +674,67 @@ static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts)
 }
 
 /* Called with ovs_mutex or RCU read lock. */
-static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex,
-				  struct sk_buff *skb, u32 portid,
-				  u32 seq, u32 flags, u8 cmd)
+static int ovs_flow_cmd_fill_match(const struct sw_flow *flow,
+				   struct sk_buff *skb)
 {
-	const int skb_orig_len = skb->len;
-	struct nlattr *start;
-	struct ovs_flow_stats stats;
-	__be16 tcp_flags;
-	unsigned long used;
-	struct ovs_header *ovs_header;
 	struct nlattr *nla;
 	int err;
 
-	ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family, flags, cmd);
-	if (!ovs_header)
-		return -EMSGSIZE;
-
-	ovs_header->dp_ifindex = dp_ifindex;
-
 	/* Fill flow key. */
 	nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY);
 	if (!nla)
-		goto nla_put_failure;
+		return -EMSGSIZE;
 
 	err = ovs_nla_put_flow(&flow->unmasked_key, &flow->unmasked_key, skb);
 	if (err)
-		goto error;
+		return err;
+
 	nla_nest_end(skb, nla);
 
+	/* Fill flow mask. */
 	nla = nla_nest_start(skb, OVS_FLOW_ATTR_MASK);
 	if (!nla)
-		goto nla_put_failure;
+		return -EMSGSIZE;
 
 	err = ovs_nla_put_flow(&flow->key, &flow->mask->key, skb);
 	if (err)
-		goto error;
+		return err;
 
 	nla_nest_end(skb, nla);
+	return 0;
+}
+
+/* Called with ovs_mutex or RCU read lock. */
+static int ovs_flow_cmd_fill_stats(const struct sw_flow *flow,
+				   struct sk_buff *skb)
+{
+	struct ovs_flow_stats stats;
+	__be16 tcp_flags;
+	unsigned long used;
 
 	ovs_flow_stats_get(flow, &stats, &used, &tcp_flags);
 
 	if (used &&
 	    nla_put_u64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used)))
-		goto nla_put_failure;
+		return -EMSGSIZE;
 
 	if (stats.n_packets &&
 	    nla_put(skb, OVS_FLOW_ATTR_STATS, sizeof(struct ovs_flow_stats), &stats))
-		goto nla_put_failure;
+		return -EMSGSIZE;
 
 	if ((u8)ntohs(tcp_flags) &&
 	     nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, (u8)ntohs(tcp_flags)))
-		goto nla_put_failure;
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+/* Called with ovs_mutex or RCU read lock. */
+static int ovs_flow_cmd_fill_actions(const struct sw_flow *flow,
+				     struct sk_buff *skb, int skb_orig_len)
+{
+	struct nlattr *start;
+	int err;
 
 	/* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if
 	 * this is the first flow to be dumped into 'skb'.  This is unusual for
@@ -749,17 +758,47 @@ static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex,
 			nla_nest_end(skb, start);
 		else {
 			if (skb_orig_len)
-				goto error;
+				return err;
 
 			nla_nest_cancel(skb, start);
 		}
-	} else if (skb_orig_len)
-		goto nla_put_failure;
+	} else if (skb_orig_len) {
+		return -EMSGSIZE;
+	}
+
+	return 0;
+}
+
+/* Called with ovs_mutex or RCU read lock. */
+static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex,
+				  struct sk_buff *skb, u32 portid,
+				  u32 seq, u32 flags, u8 cmd)
+{
+	const int skb_orig_len = skb->len;
+	struct ovs_header *ovs_header;
+	int err;
+
+	ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family,
+				 flags, cmd);
+	if (!ovs_header)
+		return -EMSGSIZE;
+
+	ovs_header->dp_ifindex = dp_ifindex;
+
+	err = ovs_flow_cmd_fill_match(flow, skb);
+	if (err)
+		goto error;
+
+	err = ovs_flow_cmd_fill_stats(flow, skb);
+	if (err)
+		goto error;
+
+	err = ovs_flow_cmd_fill_actions(flow, skb, skb_orig_len);
+	if (err)
+		goto error;
 
 	return genlmsg_end(skb, ovs_header);
 
-nla_put_failure:
-	err = -EMSGSIZE;
 error:
 	genlmsg_cancel(skb, ovs_header);
 	return err;
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next v2 06/14] openvswitch: refactor do_output() to move NULL check out of fast path
From: Pravin B Shelar @ 2014-11-06  9:19 UTC (permalink / raw)
  To: davem; +Cc: netdev, Andy Zhou, Pravin B Shelar

From: Andy Zhou <azhou@nicira.com>

The skb_clone() NULL check is implemented in do_output(), as part of the
common (fast) path. Refactor so that the NULL check is done in the
slow path, immediately after skb_clone() is called.

Besides optimization, this change also improves code readability by
making the skb_clone() NULL check consistent within OVS datapath
module.

Signed-off-by: Andy Zhou <azhou@nicira.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
 net/openvswitch/actions.c | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 930b1b6..9fd33c0 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -551,21 +551,14 @@ static int set_sctp(struct sk_buff *skb,
 	return 0;
 }
 
-static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port)
+static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port)
 {
-	struct vport *vport;
+	struct vport *vport = ovs_vport_rcu(dp, out_port);
 
-	if (unlikely(!skb))
-		return -ENOMEM;
-
-	vport = ovs_vport_rcu(dp, out_port);
-	if (unlikely(!vport)) {
+	if (likely(vport))
+		ovs_vport_send(vport, skb);
+	else
 		kfree_skb(skb);
-		return -ENODEV;
-	}
-
-	ovs_vport_send(vport, skb);
-	return 0;
 }
 
 static int output_userspace(struct datapath *dp, struct sk_buff *skb,
@@ -768,8 +761,12 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 	     a = nla_next(a, &rem)) {
 		int err = 0;
 
-		if (prev_port != -1) {
-			do_output(dp, skb_clone(skb, GFP_ATOMIC), prev_port);
+		if (unlikely(prev_port != -1)) {
+			struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC);
+
+			if (out_skb)
+				do_output(dp, out_skb, prev_port);
+
 			prev_port = -1;
 		}
 
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next v2 05/14] openvswitch: Additional logging for -EINVAL on flow setups.
From: Pravin B Shelar @ 2014-11-06  9:19 UTC (permalink / raw)
  To: davem; +Cc: netdev, Jesse Gross, Federico Iezzi, Pravin B Shelar

From: Jesse Gross <jesse@nicira.com>

There are many possible ways that a flow can be invalid so we've
added logging for most of them. This adds logs for the remaining
possible cases so there isn't any ambiguity while debugging.

CC: Federico Iezzi <fiezzi@enter.it>
Signed-off-by: Jesse Gross <jesse@nicira.com>
Acked-by: Thomas Graf <tgraf@noironetworks.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
 net/openvswitch/datapath.c     | 12 +++++++++---
 net/openvswitch/flow_netlink.c | 17 +++++++++++++----
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index a532a9c..04a26ae 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -817,10 +817,14 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
 
 	/* Must have key and actions. */
 	error = -EINVAL;
-	if (!a[OVS_FLOW_ATTR_KEY])
+	if (!a[OVS_FLOW_ATTR_KEY]) {
+		OVS_NLERR("Flow key attribute not present in new flow.\n");
 		goto error;
-	if (!a[OVS_FLOW_ATTR_ACTIONS])
+	}
+	if (!a[OVS_FLOW_ATTR_ACTIONS]) {
+		OVS_NLERR("Flow actions attribute not present in new flow.\n");
 		goto error;
+	}
 
 	/* Most of the time we need to allocate a new flow, do it before
 	 * locking.
@@ -979,8 +983,10 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
 
 	/* Extract key. */
 	error = -EINVAL;
-	if (!a[OVS_FLOW_ATTR_KEY])
+	if (!a[OVS_FLOW_ATTR_KEY]) {
+		OVS_NLERR("Flow key attribute not present in set flow.\n");
 		goto error;
+	}
 
 	ovs_match_init(&match, &key, &mask);
 	error = ovs_nla_get_match(&match,
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 5a91d79..1b29ea7 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -581,10 +581,13 @@ static int metadata_from_nlattrs(struct sw_flow_match *match,  u64 *attrs,
 	if (*attrs & (1 << OVS_KEY_ATTR_IN_PORT)) {
 		u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]);
 
-		if (is_mask)
+		if (is_mask) {
 			in_port = 0xffffffff; /* Always exact match in_port. */
-		else if (in_port >= DP_MAX_PORTS)
+		} else if (in_port >= DP_MAX_PORTS) {
+			OVS_NLERR("Port (%d) exceeds maximum allowable (%d).\n",
+				  in_port, DP_MAX_PORTS);
 			return -EINVAL;
+		}
 
 		SW_FLOW_KEY_PUT(match, phy.in_port, in_port, is_mask);
 		*attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT);
@@ -824,8 +827,11 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs,
 		attrs &= ~(1 << OVS_KEY_ATTR_ND);
 	}
 
-	if (attrs != 0)
+	if (attrs != 0) {
+		OVS_NLERR("Unknown key attributes (%llx).\n",
+			  (unsigned long long)attrs);
 		return -EINVAL;
+	}
 
 	return 0;
 }
@@ -1250,8 +1256,10 @@ struct sw_flow_actions *ovs_nla_alloc_flow_actions(int size)
 {
 	struct sw_flow_actions *sfa;
 
-	if (size > MAX_ACTIONS_BUFSIZE)
+	if (size > MAX_ACTIONS_BUFSIZE) {
+		OVS_NLERR("Flow action size (%u bytes) exceeds maximum", size);
 		return ERR_PTR(-EINVAL);
+	}
 
 	sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL);
 	if (!sfa)
@@ -1786,6 +1794,7 @@ static int ovs_nla_copy_actions__(const struct nlattr *attr,
 			break;
 
 		default:
+			OVS_NLERR("Unknown tunnel attribute (%d).\n", type);
 			return -EINVAL;
 		}
 		if (!skip_copy) {
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next v2 04/14] openvswitch: Remove redundant tcp_flags code.
From: Pravin B Shelar @ 2014-11-06  9:19 UTC (permalink / raw)
  To: davem; +Cc: netdev, Joe Stringer, Pravin B Shelar

From: Joe Stringer <joestringer@nicira.com>

These two cases used to be treated differently for IPv4/IPv6,
but they are now identical.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
 net/openvswitch/flow_netlink.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 569309c..5a91d79 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -611,7 +611,6 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs,
 				const struct nlattr **a, bool is_mask)
 {
 	int err;
-	u64 orig_attrs = attrs;
 
 	err = metadata_from_nlattrs(match, &attrs, a, is_mask);
 	if (err)
@@ -764,15 +763,9 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs,
 	}
 
 	if (attrs & (1 << OVS_KEY_ATTR_TCP_FLAGS)) {
-		if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) {
-			SW_FLOW_KEY_PUT(match, tp.flags,
-					nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]),
-					is_mask);
-		} else {
-			SW_FLOW_KEY_PUT(match, tp.flags,
-					nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]),
-					is_mask);
-		}
+		SW_FLOW_KEY_PUT(match, tp.flags,
+				nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]),
+				is_mask);
 		attrs &= ~(1 << OVS_KEY_ATTR_TCP_FLAGS);
 	}
 
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next v2 03/14] openvswitch: Move table destroy to dp-rcu callback.
From: Pravin B Shelar @ 2014-11-06  9:19 UTC (permalink / raw)
  To: davem; +Cc: netdev, Pravin B Shelar

This simplifies the flow-table-destroy API. There is no need to pass
an explicit parameter about the calling context.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>
---
 net/openvswitch/datapath.c   |  5 ++---
 net/openvswitch/flow_table.c | 11 +++++++----
 net/openvswitch/flow_table.h |  2 +-
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 688cb9b..a532a9c 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -187,6 +187,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu)
 {
 	struct datapath *dp = container_of(rcu, struct datapath, rcu);
 
+	ovs_flow_tbl_destroy(&dp->table);
 	free_percpu(dp->stats_percpu);
 	release_net(ovs_dp_get_net(dp));
 	kfree(dp->ports);
@@ -1444,7 +1445,7 @@ err_destroy_ports_array:
 err_destroy_percpu:
 	free_percpu(dp->stats_percpu);
 err_destroy_table:
-	ovs_flow_tbl_destroy(&dp->table, false);
+	ovs_flow_tbl_destroy(&dp->table);
 err_free_dp:
 	release_net(ovs_dp_get_net(dp));
 	kfree(dp);
@@ -1476,8 +1477,6 @@ static void __dp_destroy(struct datapath *dp)
 	ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL));
 
 	/* RCU destroy the flow table */
-	ovs_flow_tbl_destroy(&dp->table, true);
-
 	call_rcu(&dp->rcu, destroy_dp_rcu);
 }
 
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index cf2d853..90f8b40 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2013 Nicira, Inc.
+ * Copyright (c) 2007-2014 Nicira, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -250,11 +250,14 @@ skip_flows:
 		__table_instance_destroy(ti);
 }
 
-void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred)
+/* No need for locking this function is called from RCU callback or
+ * error path.
+ */
+void ovs_flow_tbl_destroy(struct flow_table *table)
 {
-	struct table_instance *ti = ovsl_dereference(table->ti);
+	struct table_instance *ti = rcu_dereference_raw(table->ti);
 
-	table_instance_destroy(ti, deferred);
+	table_instance_destroy(ti, false);
 }
 
 struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti,
diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h
index 5918bff..f682c8c 100644
--- a/net/openvswitch/flow_table.h
+++ b/net/openvswitch/flow_table.h
@@ -62,7 +62,7 @@ void ovs_flow_free(struct sw_flow *, bool deferred);
 
 int ovs_flow_tbl_init(struct flow_table *);
 int ovs_flow_tbl_count(struct flow_table *table);
-void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred);
+void ovs_flow_tbl_destroy(struct flow_table *table);
 int ovs_flow_tbl_flush(struct flow_table *flow_table);
 
 int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next v2 02/14] openvswitch: Add basic MPLS support to kernel
From: Pravin B Shelar @ 2014-11-06  9:18 UTC (permalink / raw)
  To: davem
  Cc: netdev, Simon Horman, Ravi K, Leo Alterman, Isaku Yamahata,
	Joe Stringer, Jesse Gross, Pravin B Shelar

From: Simon Horman <horms@verge.net.au>

Allow datapath to recognize and extract MPLS labels into flow keys
and execute actions which push, pop, and set labels on packets.

Based heavily on work by Leo Alterman, Ravi K, Isaku Yamahata and Joe Stringer.

Cc: Ravi K <rkerur@gmail.com>
Cc: Leo Alterman <lalterman@nicira.com>
Cc: Isaku Yamahata <yamahata@valinux.co.jp>
Cc: Joe Stringer <joe@wand.net.nz>
Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
 include/net/mpls.h               |  39 +++++++++++
 include/uapi/linux/openvswitch.h |  32 +++++++++
 net/core/dev.c                   |   3 +-
 net/openvswitch/Kconfig          |   1 +
 net/openvswitch/actions.c        | 106 ++++++++++++++++++++++++++++-
 net/openvswitch/datapath.c       |   6 +-
 net/openvswitch/flow.c           |  30 +++++++++
 net/openvswitch/flow.h           |  17 +++--
 net/openvswitch/flow_netlink.c   | 139 ++++++++++++++++++++++++++++++++++-----
 net/openvswitch/flow_netlink.h   |   2 +-
 10 files changed, 345 insertions(+), 30 deletions(-)
 create mode 100644 include/net/mpls.h

diff --git a/include/net/mpls.h b/include/net/mpls.h
new file mode 100644
index 0000000..5b3b5ad
--- /dev/null
+++ b/include/net/mpls.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2014 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef _NET_MPLS_H
+#define _NET_MPLS_H 1
+
+#include <linux/if_ether.h>
+#include <linux/netdevice.h>
+
+#define MPLS_HLEN 4
+
+static inline bool eth_p_mpls(__be16 eth_type)
+{
+	return eth_type == htons(ETH_P_MPLS_UC) ||
+		eth_type == htons(ETH_P_MPLS_MC);
+}
+
+/*
+ * For non-MPLS skbs this will correspond to the network header.
+ * For MPLS skbs it will be before the network_header as the MPLS
+ * label stack lies between the end of the mac header and the network
+ * header. That is, for MPLS skbs the end of the mac header
+ * is the top of the MPLS label stack.
+ */
+static inline unsigned char *skb_mpls_header(struct sk_buff *skb)
+{
+	return skb_mac_header(skb) + skb->mac_len;
+}
+#endif
diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 435eabc..631056b 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -293,6 +293,9 @@ enum ovs_key_attr {
 	OVS_KEY_ATTR_DP_HASH,      /* u32 hash value. Value 0 indicates the hash
 				   is not computed by the datapath. */
 	OVS_KEY_ATTR_RECIRC_ID, /* u32 recirc id */
+	OVS_KEY_ATTR_MPLS,      /* array of struct ovs_key_mpls.
+				 * The implementation may restrict
+				 * the accepted length of the array. */
 
 #ifdef __KERNEL__
 	OVS_KEY_ATTR_TUNNEL_INFO,  /* struct ovs_tunnel_info */
@@ -340,6 +343,10 @@ struct ovs_key_ethernet {
 	__u8	 eth_dst[ETH_ALEN];
 };
 
+struct ovs_key_mpls {
+	__be32 mpls_lse;
+};
+
 struct ovs_key_ipv4 {
 	__be32 ipv4_src;
 	__be32 ipv4_dst;
@@ -484,6 +491,19 @@ enum ovs_userspace_attr {
 #define OVS_USERSPACE_ATTR_MAX (__OVS_USERSPACE_ATTR_MAX - 1)
 
 /**
+ * struct ovs_action_push_mpls - %OVS_ACTION_ATTR_PUSH_MPLS action argument.
+ * @mpls_lse: MPLS label stack entry to push.
+ * @mpls_ethertype: Ethertype to set in the encapsulating ethernet frame.
+ *
+ * The only values @mpls_ethertype should ever be given are %ETH_P_MPLS_UC and
+ * %ETH_P_MPLS_MC, indicating MPLS unicast or multicast. Other are rejected.
+ */
+struct ovs_action_push_mpls {
+	__be32 mpls_lse;
+	__be16 mpls_ethertype; /* Either %ETH_P_MPLS_UC or %ETH_P_MPLS_MC */
+};
+
+/**
  * struct ovs_action_push_vlan - %OVS_ACTION_ATTR_PUSH_VLAN action argument.
  * @vlan_tpid: Tag protocol identifier (TPID) to push.
  * @vlan_tci: Tag control identifier (TCI) to push.  The CFI bit must be set
@@ -534,6 +554,15 @@ struct ovs_action_hash {
  * @OVS_ACTION_ATTR_POP_VLAN: Pop the outermost 802.1Q header off the packet.
  * @OVS_ACTION_ATTR_SAMPLE: Probabilitically executes actions, as specified in
  * the nested %OVS_SAMPLE_ATTR_* attributes.
+ * @OVS_ACTION_ATTR_PUSH_MPLS: Push a new MPLS label stack entry onto the
+ * top of the packets MPLS label stack.  Set the ethertype of the
+ * encapsulating frame to either %ETH_P_MPLS_UC or %ETH_P_MPLS_MC to
+ * indicate the new packet contents.
+ * @OVS_ACTION_ATTR_POP_MPLS: Pop an MPLS label stack entry off of the
+ * packet's MPLS label stack.  Set the encapsulating frame's ethertype to
+ * indicate the new packet contents. This could potentially still be
+ * %ETH_P_MPLS if the resulting MPLS label stack is not empty.  If there
+ * is no MPLS label stack, as determined by ethertype, no action is taken.
  *
  * Only a single header can be set with a single %OVS_ACTION_ATTR_SET.  Not all
  * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
@@ -550,6 +579,9 @@ enum ovs_action_attr {
 	OVS_ACTION_ATTR_SAMPLE,       /* Nested OVS_SAMPLE_ATTR_*. */
 	OVS_ACTION_ATTR_RECIRC,       /* u32 recirc_id. */
 	OVS_ACTION_ATTR_HASH,	      /* struct ovs_action_hash. */
+	OVS_ACTION_ATTR_PUSH_MPLS,    /* struct ovs_action_push_mpls. */
+	OVS_ACTION_ATTR_POP_MPLS,     /* __be16 ethertype. */
+
 	__OVS_ACTION_ATTR_MAX
 };
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 40be481..70bb609 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -118,6 +118,7 @@
 #include <linux/if_vlan.h>
 #include <linux/ip.h>
 #include <net/ip.h>
+#include <net/mpls.h>
 #include <linux/ipv6.h>
 #include <linux/in.h>
 #include <linux/jhash.h>
@@ -2530,7 +2531,7 @@ static netdev_features_t net_mpls_features(struct sk_buff *skb,
 					   netdev_features_t features,
 					   __be16 type)
 {
-	if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC))
+	if (eth_p_mpls(type))
 		features &= skb->dev->mpls_features;
 
 	return features;
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index 2a9673e..454ce12 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -30,6 +30,7 @@ config OPENVSWITCH
 
 config OPENVSWITCH_GRE
 	tristate "Open vSwitch GRE tunneling support"
+	select NET_MPLS_GSO
 	depends on INET
 	depends on OPENVSWITCH
 	depends on NET_IPGRE_DEMUX
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 922c133..930b1b6 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -28,10 +28,12 @@
 #include <linux/in6.h>
 #include <linux/if_arp.h>
 #include <linux/if_vlan.h>
+
 #include <net/ip.h>
 #include <net/ipv6.h>
 #include <net/checksum.h>
 #include <net/dsfield.h>
+#include <net/mpls.h>
 #include <net/sctp/checksum.h>
 
 #include "datapath.h"
@@ -118,6 +120,92 @@ static int make_writable(struct sk_buff *skb, int write_len)
 	return pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
 }
 
+static int push_mpls(struct sk_buff *skb,
+		     const struct ovs_action_push_mpls *mpls)
+{
+	__be32 *new_mpls_lse;
+	struct ethhdr *hdr;
+
+	/* Networking stack do not allow simultaneous Tunnel and MPLS GSO. */
+	if (skb->encapsulation)
+		return -ENOTSUPP;
+
+	if (skb_cow_head(skb, MPLS_HLEN) < 0)
+		return -ENOMEM;
+
+	skb_push(skb, MPLS_HLEN);
+	memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
+		skb->mac_len);
+	skb_reset_mac_header(skb);
+
+	new_mpls_lse = (__be32 *)skb_mpls_header(skb);
+	*new_mpls_lse = mpls->mpls_lse;
+
+	if (skb->ip_summed == CHECKSUM_COMPLETE)
+		skb->csum = csum_add(skb->csum, csum_partial(new_mpls_lse,
+							     MPLS_HLEN, 0));
+
+	hdr = eth_hdr(skb);
+	hdr->h_proto = mpls->mpls_ethertype;
+
+	skb_set_inner_protocol(skb, skb->protocol);
+	skb->protocol = mpls->mpls_ethertype;
+
+	return 0;
+}
+
+static int pop_mpls(struct sk_buff *skb, const __be16 ethertype)
+{
+	struct ethhdr *hdr;
+	int err;
+
+	err = make_writable(skb, skb->mac_len + MPLS_HLEN);
+	if (unlikely(err))
+		return err;
+
+	if (skb->ip_summed == CHECKSUM_COMPLETE)
+		skb->csum = csum_sub(skb->csum,
+				     csum_partial(skb_mpls_header(skb),
+						  MPLS_HLEN, 0));
+
+	memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb),
+		skb->mac_len);
+
+	__skb_pull(skb, MPLS_HLEN);
+	skb_reset_mac_header(skb);
+
+	/* skb_mpls_header() is used to locate the ethertype
+	 * field correctly in the presence of VLAN tags.
+	 */
+	hdr = (struct ethhdr *)(skb_mpls_header(skb) - ETH_HLEN);
+	hdr->h_proto = ethertype;
+	if (eth_p_mpls(skb->protocol))
+		skb->protocol = ethertype;
+	return 0;
+}
+
+static int set_mpls(struct sk_buff *skb, const __be32 *mpls_lse)
+{
+	__be32 *stack;
+	int err;
+
+	err = make_writable(skb, skb->mac_len + MPLS_HLEN);
+	if (unlikely(err))
+		return err;
+
+	stack = (__be32 *)skb_mpls_header(skb);
+	if (skb->ip_summed == CHECKSUM_COMPLETE) {
+		__be32 diff[] = { ~(*stack), *mpls_lse };
+
+		skb->csum = ~csum_partial((char *)diff, sizeof(diff),
+					  ~skb->csum);
+	}
+
+	*stack = *mpls_lse;
+
+	return 0;
+}
+
 /* remove VLAN header from packet and update csum accordingly. */
 static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci)
 {
@@ -140,10 +228,12 @@ static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci)
 
 	vlan_set_encap_proto(skb, vhdr);
 	skb->mac_header += VLAN_HLEN;
+
 	if (skb_network_offset(skb) < ETH_HLEN)
 		skb_set_network_header(skb, ETH_HLEN);
-	skb_reset_mac_len(skb);
 
+	/* Update mac_len for subsequent MPLS actions */
+	skb_reset_mac_len(skb);
 	return 0;
 }
 
@@ -186,6 +276,8 @@ static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vla
 
 		if (!__vlan_put_tag(skb, skb->vlan_proto, current_tag))
 			return -ENOMEM;
+		/* Update mac_len for subsequent MPLS actions */
+		skb->mac_len += VLAN_HLEN;
 
 		if (skb->ip_summed == CHECKSUM_COMPLETE)
 			skb->csum = csum_add(skb->csum, csum_partial(skb->data
@@ -612,6 +704,10 @@ static int execute_set_action(struct sk_buff *skb,
 	case OVS_KEY_ATTR_SCTP:
 		err = set_sctp(skb, nla_data(nested_attr));
 		break;
+
+	case OVS_KEY_ATTR_MPLS:
+		err = set_mpls(skb, nla_data(nested_attr));
+		break;
 	}
 
 	return err;
@@ -690,6 +786,14 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 			execute_hash(skb, key, a);
 			break;
 
+		case OVS_ACTION_ATTR_PUSH_MPLS:
+			err = push_mpls(skb, nla_data(a));
+			break;
+
+		case OVS_ACTION_ATTR_POP_MPLS:
+			err = pop_mpls(skb, nla_get_be16(a));
+			break;
+
 		case OVS_ACTION_ATTR_PUSH_VLAN:
 			err = push_vlan(skb, nla_data(a));
 			if (unlikely(err)) /* skb already freed. */
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index f18302f..688cb9b 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -560,7 +560,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 		goto err_flow_free;
 
 	err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS],
-				   &flow->key, 0, &acts);
+				   &flow->key, &acts);
 	if (err)
 		goto err_flow_free;
 
@@ -846,7 +846,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
 		goto err_kfree_flow;
 
 	error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key,
-				     0, &acts);
+				     &acts);
 	if (error) {
 		OVS_NLERR("Flow actions may not be safe on all matching packets.\n");
 		goto err_kfree_acts;
@@ -953,7 +953,7 @@ static struct sw_flow_actions *get_flow_actions(const struct nlattr *a,
 		return acts;
 
 	ovs_flow_mask_key(&masked_key, key, mask);
-	error = ovs_nla_copy_actions(a, &masked_key, 0, &acts);
+	error = ovs_nla_copy_actions(a, &masked_key, &acts);
 	if (error) {
 		OVS_NLERR("Flow actions may not be safe on all matching packets.\n");
 		kfree(acts);
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 2b78789..90a2101 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -32,6 +32,7 @@
 #include <linux/if_arp.h>
 #include <linux/ip.h>
 #include <linux/ipv6.h>
+#include <linux/mpls.h>
 #include <linux/sctp.h>
 #include <linux/smp.h>
 #include <linux/tcp.h>
@@ -42,6 +43,7 @@
 #include <net/ip.h>
 #include <net/ip_tunnels.h>
 #include <net/ipv6.h>
+#include <net/mpls.h>
 #include <net/ndisc.h>
 
 #include "datapath.h"
@@ -480,6 +482,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 		return -ENOMEM;
 
 	skb_reset_network_header(skb);
+	skb_reset_mac_len(skb);
 	__skb_push(skb, skb->data - skb_mac_header(skb));
 
 	/* Network layer. */
@@ -584,6 +587,33 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 			memset(&key->ip, 0, sizeof(key->ip));
 			memset(&key->ipv4, 0, sizeof(key->ipv4));
 		}
+	} else if (eth_p_mpls(key->eth.type)) {
+		size_t stack_len = MPLS_HLEN;
+
+		/* In the presence of an MPLS label stack the end of the L2
+		 * header and the beginning of the L3 header differ.
+		 *
+		 * Advance network_header to the beginning of the L3
+		 * header. mac_len corresponds to the end of the L2 header.
+		 */
+		while (1) {
+			__be32 lse;
+
+			error = check_header(skb, skb->mac_len + stack_len);
+			if (unlikely(error))
+				return 0;
+
+			memcpy(&lse, skb_network_header(skb), MPLS_HLEN);
+
+			if (stack_len == MPLS_HLEN)
+				memcpy(&key->mpls.top_lse, &lse, MPLS_HLEN);
+
+			skb_set_network_header(skb, skb->mac_len + stack_len);
+			if (lse & htonl(MPLS_LS_S_MASK))
+				break;
+
+			stack_len += MPLS_HLEN;
+		}
 	} else if (key->eth.type == htons(ETH_P_IPV6)) {
 		int nh_len;             /* IPv6 Header + Extensions */
 
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 7181331..4962bee 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -102,12 +102,17 @@ struct sw_flow_key {
 		__be16 tci;		/* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */
 		__be16 type;		/* Ethernet frame type. */
 	} eth;
-	struct {
-		u8     proto;		/* IP protocol or lower 8 bits of ARP opcode. */
-		u8     tos;		/* IP ToS. */
-		u8     ttl;		/* IP TTL/hop limit. */
-		u8     frag;		/* One of OVS_FRAG_TYPE_*. */
-	} ip;
+	union {
+		struct {
+			__be32 top_lse;	/* top label stack entry */
+		} mpls;
+		struct {
+			u8     proto;	/* IP protocol or lower 8 bits of ARP opcode. */
+			u8     tos;	    /* IP ToS. */
+			u8     ttl;	    /* IP TTL/hop limit. */
+			u8     frag;	/* One of OVS_FRAG_TYPE_*. */
+		} ip;
+	};
 	struct {
 		__be16 src;		/* TCP/UDP/SCTP source port. */
 		__be16 dst;		/* TCP/UDP/SCTP destination port. */
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 939bcb3..569309c 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -46,6 +46,7 @@
 #include <net/ip.h>
 #include <net/ipv6.h>
 #include <net/ndisc.h>
+#include <net/mpls.h>
 
 #include "flow_netlink.h"
 
@@ -134,7 +135,8 @@ static bool match_validate(const struct sw_flow_match *match,
 			| (1 << OVS_KEY_ATTR_ICMP)
 			| (1 << OVS_KEY_ATTR_ICMPV6)
 			| (1 << OVS_KEY_ATTR_ARP)
-			| (1 << OVS_KEY_ATTR_ND));
+			| (1 << OVS_KEY_ATTR_ND)
+			| (1 << OVS_KEY_ATTR_MPLS));
 
 	/* Always allowed mask fields. */
 	mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL)
@@ -149,6 +151,12 @@ static bool match_validate(const struct sw_flow_match *match,
 			mask_allowed |= 1 << OVS_KEY_ATTR_ARP;
 	}
 
+	if (eth_p_mpls(match->key->eth.type)) {
+		key_expected |= 1 << OVS_KEY_ATTR_MPLS;
+		if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+			mask_allowed |= 1 << OVS_KEY_ATTR_MPLS;
+	}
+
 	if (match->key->eth.type == htons(ETH_P_IP)) {
 		key_expected |= 1 << OVS_KEY_ATTR_IPV4;
 		if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
@@ -266,6 +274,7 @@ static const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
 	[OVS_KEY_ATTR_RECIRC_ID] = sizeof(u32),
 	[OVS_KEY_ATTR_DP_HASH] = sizeof(u32),
 	[OVS_KEY_ATTR_TUNNEL] = -1,
+	[OVS_KEY_ATTR_MPLS] = sizeof(struct ovs_key_mpls),
 };
 
 static bool is_all_zero(const u8 *fp, size_t size)
@@ -735,6 +744,16 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs,
 		attrs &= ~(1 << OVS_KEY_ATTR_ARP);
 	}
 
+	if (attrs & (1 << OVS_KEY_ATTR_MPLS)) {
+		const struct ovs_key_mpls *mpls_key;
+
+		mpls_key = nla_data(a[OVS_KEY_ATTR_MPLS]);
+		SW_FLOW_KEY_PUT(match, mpls.top_lse,
+				mpls_key->mpls_lse, is_mask);
+
+		attrs &= ~(1 << OVS_KEY_ATTR_MPLS);
+	 }
+
 	if (attrs & (1 << OVS_KEY_ATTR_TCP)) {
 		const struct ovs_key_tcp *tcp_key;
 
@@ -1140,6 +1159,14 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey,
 		arp_key->arp_op = htons(output->ip.proto);
 		ether_addr_copy(arp_key->arp_sha, output->ipv4.arp.sha);
 		ether_addr_copy(arp_key->arp_tha, output->ipv4.arp.tha);
+	} else if (eth_p_mpls(swkey->eth.type)) {
+		struct ovs_key_mpls *mpls_key;
+
+		nla = nla_reserve(skb, OVS_KEY_ATTR_MPLS, sizeof(*mpls_key));
+		if (!nla)
+			goto nla_put_failure;
+		mpls_key = nla_data(nla);
+		mpls_key->mpls_lse = output->mpls.top_lse;
 	}
 
 	if ((swkey->eth.type == htons(ETH_P_IP) ||
@@ -1336,9 +1363,15 @@ static inline void add_nested_action_end(struct sw_flow_actions *sfa,
 	a->nla_len = sfa->actions_len - st_offset;
 }
 
+static int ovs_nla_copy_actions__(const struct nlattr *attr,
+				  const struct sw_flow_key *key,
+				  int depth, struct sw_flow_actions **sfa,
+				  __be16 eth_type, __be16 vlan_tci);
+
 static int validate_and_copy_sample(const struct nlattr *attr,
 				    const struct sw_flow_key *key, int depth,
-				    struct sw_flow_actions **sfa)
+				    struct sw_flow_actions **sfa,
+				    __be16 eth_type, __be16 vlan_tci)
 {
 	const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1];
 	const struct nlattr *probability, *actions;
@@ -1375,7 +1408,8 @@ static int validate_and_copy_sample(const struct nlattr *attr,
 	if (st_acts < 0)
 		return st_acts;
 
-	err = ovs_nla_copy_actions(actions, key, depth + 1, sfa);
+	err = ovs_nla_copy_actions__(actions, key, depth + 1, sfa,
+				     eth_type, vlan_tci);
 	if (err)
 		return err;
 
@@ -1385,10 +1419,10 @@ static int validate_and_copy_sample(const struct nlattr *attr,
 	return 0;
 }
 
-static int validate_tp_port(const struct sw_flow_key *flow_key)
+static int validate_tp_port(const struct sw_flow_key *flow_key,
+			    __be16 eth_type)
 {
-	if ((flow_key->eth.type == htons(ETH_P_IP) ||
-	     flow_key->eth.type == htons(ETH_P_IPV6)) &&
+	if ((eth_type == htons(ETH_P_IP) || eth_type == htons(ETH_P_IPV6)) &&
 	    (flow_key->tp.src || flow_key->tp.dst))
 		return 0;
 
@@ -1483,7 +1517,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
 static int validate_set(const struct nlattr *a,
 			const struct sw_flow_key *flow_key,
 			struct sw_flow_actions **sfa,
-			bool *set_tun)
+			bool *set_tun, __be16 eth_type)
 {
 	const struct nlattr *ovs_key = nla_data(a);
 	int key_type = nla_type(ovs_key);
@@ -1508,6 +1542,9 @@ static int validate_set(const struct nlattr *a,
 		break;
 
 	case OVS_KEY_ATTR_TUNNEL:
+		if (eth_p_mpls(eth_type))
+			return -EINVAL;
+
 		*set_tun = true;
 		err = validate_and_copy_set_tun(a, sfa);
 		if (err)
@@ -1515,7 +1552,7 @@ static int validate_set(const struct nlattr *a,
 		break;
 
 	case OVS_KEY_ATTR_IPV4:
-		if (flow_key->eth.type != htons(ETH_P_IP))
+		if (eth_type != htons(ETH_P_IP))
 			return -EINVAL;
 
 		if (!flow_key->ip.proto)
@@ -1531,7 +1568,7 @@ static int validate_set(const struct nlattr *a,
 		break;
 
 	case OVS_KEY_ATTR_IPV6:
-		if (flow_key->eth.type != htons(ETH_P_IPV6))
+		if (eth_type != htons(ETH_P_IPV6))
 			return -EINVAL;
 
 		if (!flow_key->ip.proto)
@@ -1553,19 +1590,24 @@ static int validate_set(const struct nlattr *a,
 		if (flow_key->ip.proto != IPPROTO_TCP)
 			return -EINVAL;
 
-		return validate_tp_port(flow_key);
+		return validate_tp_port(flow_key, eth_type);
 
 	case OVS_KEY_ATTR_UDP:
 		if (flow_key->ip.proto != IPPROTO_UDP)
 			return -EINVAL;
 
-		return validate_tp_port(flow_key);
+		return validate_tp_port(flow_key, eth_type);
+
+	case OVS_KEY_ATTR_MPLS:
+		if (!eth_p_mpls(eth_type))
+			return -EINVAL;
+		break;
 
 	case OVS_KEY_ATTR_SCTP:
 		if (flow_key->ip.proto != IPPROTO_SCTP)
 			return -EINVAL;
 
-		return validate_tp_port(flow_key);
+		return validate_tp_port(flow_key, eth_type);
 
 	default:
 		return -EINVAL;
@@ -1609,12 +1651,13 @@ static int copy_action(const struct nlattr *from,
 	return 0;
 }
 
-int ovs_nla_copy_actions(const struct nlattr *attr,
-			 const struct sw_flow_key *key,
-			 int depth,
-			 struct sw_flow_actions **sfa)
+static int ovs_nla_copy_actions__(const struct nlattr *attr,
+				  const struct sw_flow_key *key,
+				  int depth, struct sw_flow_actions **sfa,
+				  __be16 eth_type, __be16 vlan_tci)
 {
 	const struct nlattr *a;
+	bool out_tnl_port = false;
 	int rem, err;
 
 	if (depth >= SAMPLE_ACTION_DEPTH)
@@ -1626,6 +1669,8 @@ int ovs_nla_copy_actions(const struct nlattr *attr,
 			[OVS_ACTION_ATTR_OUTPUT] = sizeof(u32),
 			[OVS_ACTION_ATTR_RECIRC] = sizeof(u32),
 			[OVS_ACTION_ATTR_USERSPACE] = (u32)-1,
+			[OVS_ACTION_ATTR_PUSH_MPLS] = sizeof(struct ovs_action_push_mpls),
+			[OVS_ACTION_ATTR_POP_MPLS] = sizeof(__be16),
 			[OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan),
 			[OVS_ACTION_ATTR_POP_VLAN] = 0,
 			[OVS_ACTION_ATTR_SET] = (u32)-1,
@@ -1655,6 +1700,8 @@ int ovs_nla_copy_actions(const struct nlattr *attr,
 		case OVS_ACTION_ATTR_OUTPUT:
 			if (nla_get_u32(a) >= DP_MAX_PORTS)
 				return -EINVAL;
+			out_tnl_port = false;
+
 			break;
 
 		case OVS_ACTION_ATTR_HASH: {
@@ -1671,6 +1718,7 @@ int ovs_nla_copy_actions(const struct nlattr *attr,
 		}
 
 		case OVS_ACTION_ATTR_POP_VLAN:
+			vlan_tci = htons(0);
 			break;
 
 		case OVS_ACTION_ATTR_PUSH_VLAN:
@@ -1679,19 +1727,66 @@ int ovs_nla_copy_actions(const struct nlattr *attr,
 				return -EINVAL;
 			if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT)))
 				return -EINVAL;
+			vlan_tci = vlan->vlan_tci;
 			break;
 
 		case OVS_ACTION_ATTR_RECIRC:
 			break;
 
+		case OVS_ACTION_ATTR_PUSH_MPLS: {
+			const struct ovs_action_push_mpls *mpls = nla_data(a);
+
+			/* Networking stack do not allow simultaneous Tunnel
+			 * and MPLS GSO.
+			 */
+			if (out_tnl_port)
+				return -EINVAL;
+
+			if (!eth_p_mpls(mpls->mpls_ethertype))
+				return -EINVAL;
+			/* Prohibit push MPLS other than to a white list
+			 * for packets that have a known tag order.
+			 */
+			if (vlan_tci & htons(VLAN_TAG_PRESENT) ||
+			    (eth_type != htons(ETH_P_IP) &&
+			     eth_type != htons(ETH_P_IPV6) &&
+			     eth_type != htons(ETH_P_ARP) &&
+			     eth_type != htons(ETH_P_RARP) &&
+			     !eth_p_mpls(eth_type)))
+				return -EINVAL;
+			eth_type = mpls->mpls_ethertype;
+			break;
+		}
+
+		case OVS_ACTION_ATTR_POP_MPLS:
+			if (vlan_tci & htons(VLAN_TAG_PRESENT) ||
+			    !eth_p_mpls(eth_type))
+				return -EINVAL;
+
+			/* Disallow subsequent L2.5+ set and mpls_pop actions
+			 * as there is no check here to ensure that the new
+			 * eth_type is valid and thus set actions could
+			 * write off the end of the packet or otherwise
+			 * corrupt it.
+			 *
+			 * Support for these actions is planned using packet
+			 * recirculation.
+			 */
+			eth_type = htons(0);
+			break;
+
 		case OVS_ACTION_ATTR_SET:
-			err = validate_set(a, key, sfa, &skip_copy);
+			err = validate_set(a, key, sfa,
+					   &out_tnl_port, eth_type);
 			if (err)
 				return err;
+
+			skip_copy = out_tnl_port;
 			break;
 
 		case OVS_ACTION_ATTR_SAMPLE:
-			err = validate_and_copy_sample(a, key, depth, sfa);
+			err = validate_and_copy_sample(a, key, depth, sfa,
+						       eth_type, vlan_tci);
 			if (err)
 				return err;
 			skip_copy = true;
@@ -1713,6 +1808,14 @@ int ovs_nla_copy_actions(const struct nlattr *attr,
 	return 0;
 }
 
+int ovs_nla_copy_actions(const struct nlattr *attr,
+			 const struct sw_flow_key *key,
+			 struct sw_flow_actions **sfa)
+{
+	return ovs_nla_copy_actions__(attr, key, 0, sfa, key->eth.type,
+				      key->eth.tci);
+}
+
 static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb)
 {
 	const struct nlattr *a;
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index 206e45a..6355b1d 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -49,7 +49,7 @@ int ovs_nla_get_match(struct sw_flow_match *match,
 		      const struct nlattr *);
 
 int ovs_nla_copy_actions(const struct nlattr *attr,
-			 const struct sw_flow_key *key, int depth,
+			 const struct sw_flow_key *key,
 			 struct sw_flow_actions **sfa);
 int ovs_nla_put_actions(const struct nlattr *attr,
 			int len, struct sk_buff *skb);
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next v2 01/14] net: Remove MPLS GSO feature.
From: Pravin B Shelar @ 2014-11-06  9:18 UTC (permalink / raw)
  To: davem; +Cc: netdev, Pravin B Shelar

A device can export MPLS GSO support in dev->mpls_features the same way
it exports VLAN features in dev->vlan_features, so it is safe to
remove the redundant NETIF_F_GSO_MPLS flag.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
v1-v2:
Fixed conflicts for latest net-next.
---
 include/linux/netdev_features.h | 5 +----
 include/linux/netdevice.h       | 1 -
 include/linux/skbuff.h          | 4 +---
 net/core/ethtool.c              | 1 -
 net/ipv4/af_inet.c              | 1 -
 net/ipv4/tcp_offload.c          | 1 -
 net/ipv4/udp_offload.c          | 3 +--
 net/ipv6/ip6_offload.c          | 1 -
 net/ipv6/udp_offload.c          | 3 +--
 net/mpls/mpls_gso.c             | 3 +--
 10 files changed, 5 insertions(+), 18 deletions(-)

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 8c94b07..8e30685 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -47,7 +47,6 @@ enum {
 	NETIF_F_GSO_SIT_BIT,		/* ... SIT tunnel with TSO */
 	NETIF_F_GSO_UDP_TUNNEL_BIT,	/* ... UDP TUNNEL with TSO */
 	NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT,/* ... UDP TUNNEL with TSO & CSUM */
-	NETIF_F_GSO_MPLS_BIT,		/* ... MPLS segmentation */
 	NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */
 	/**/NETIF_F_GSO_LAST =		/* last bit, see GSO_MASK */
 		NETIF_F_GSO_TUNNEL_REMCSUM_BIT,
@@ -119,7 +118,6 @@ enum {
 #define NETIF_F_GSO_SIT		__NETIF_F(GSO_SIT)
 #define NETIF_F_GSO_UDP_TUNNEL	__NETIF_F(GSO_UDP_TUNNEL)
 #define NETIF_F_GSO_UDP_TUNNEL_CSUM __NETIF_F(GSO_UDP_TUNNEL_CSUM)
-#define NETIF_F_GSO_MPLS	__NETIF_F(GSO_MPLS)
 #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM)
 #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
 #define NETIF_F_HW_VLAN_STAG_RX	__NETIF_F(HW_VLAN_STAG_RX)
@@ -183,7 +181,6 @@ enum {
 				 NETIF_F_GSO_IPIP |			\
 				 NETIF_F_GSO_SIT |			\
 				 NETIF_F_GSO_UDP_TUNNEL |		\
-				 NETIF_F_GSO_UDP_TUNNEL_CSUM |		\
-				 NETIF_F_GSO_MPLS)
+				 NETIF_F_GSO_UDP_TUNNEL_CSUM)
 
 #endif	/* _LINUX_NETDEV_FEATURES_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4767f54..90ac959 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3583,7 +3583,6 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type)
 	BUILD_BUG_ON(SKB_GSO_SIT     != (NETIF_F_GSO_SIT >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT));
-	BUILD_BUG_ON(SKB_GSO_MPLS    != (NETIF_F_GSO_MPLS >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT));
 
 	return (features & feature) == feature;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 39ec753..53f4f6c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -372,9 +372,7 @@ enum {
 
 	SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11,
 
-	SKB_GSO_MPLS = 1 << 12,
-
-	SKB_GSO_TUNNEL_REMCSUM = 1 << 13,
+	SKB_GSO_TUNNEL_REMCSUM = 1 << 12,
 };
 
 #if BITS_PER_LONG > 32
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 06dfb29..b0f84f5 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -84,7 +84,6 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_GSO_IPIP_BIT] =	 "tx-ipip-segmentation",
 	[NETIF_F_GSO_SIT_BIT] =		 "tx-sit-segmentation",
 	[NETIF_F_GSO_UDP_TUNNEL_BIT] =	 "tx-udp_tnl-segmentation",
-	[NETIF_F_GSO_MPLS_BIT] =	 "tx-mpls-segmentation",
 
 	[NETIF_F_FCOE_CRC_BIT] =         "tx-checksum-fcoe-crc",
 	[NETIF_F_SCTP_CSUM_BIT] =        "tx-checksum-sctp",
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index ed2c672..3a096bb 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1223,7 +1223,6 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 		       SKB_GSO_UDP_TUNNEL |
 		       SKB_GSO_UDP_TUNNEL_CSUM |
 		       SKB_GSO_TUNNEL_REMCSUM |
-		       SKB_GSO_MPLS |
 		       0)))
 		goto out;
 
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index a1b2a56..9d7930b 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -94,7 +94,6 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 			       SKB_GSO_GRE_CSUM |
 			       SKB_GSO_IPIP |
 			       SKB_GSO_SIT |
-			       SKB_GSO_MPLS |
 			       SKB_GSO_UDP_TUNNEL |
 			       SKB_GSO_UDP_TUNNEL_CSUM |
 			       SKB_GSO_TUNNEL_REMCSUM |
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 0a5a70d..d3e537e 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -207,8 +207,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
 				      SKB_GSO_UDP_TUNNEL_CSUM |
 				      SKB_GSO_TUNNEL_REMCSUM |
 				      SKB_GSO_IPIP |
-				      SKB_GSO_GRE | SKB_GSO_GRE_CSUM |
-				      SKB_GSO_MPLS) ||
+				      SKB_GSO_GRE | SKB_GSO_GRE_CSUM) ||
 			     !(type & (SKB_GSO_UDP))))
 			goto out;
 
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index e976707..fd76ce9 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -79,7 +79,6 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 		       SKB_GSO_UDP_TUNNEL |
 		       SKB_GSO_UDP_TUNNEL_CSUM |
 		       SKB_GSO_TUNNEL_REMCSUM |
-		       SKB_GSO_MPLS |
 		       SKB_GSO_TCPV6 |
 		       0)))
 		goto out;
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 637ba2e..b6aa8ed 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -46,8 +46,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 				      SKB_GSO_GRE |
 				      SKB_GSO_GRE_CSUM |
 				      SKB_GSO_IPIP |
-				      SKB_GSO_SIT |
-				      SKB_GSO_MPLS) ||
+				      SKB_GSO_SIT) ||
 			     !(type & (SKB_GSO_UDP))))
 			goto out;
 
diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c
index e3545f2..ca27837 100644
--- a/net/mpls/mpls_gso.c
+++ b/net/mpls/mpls_gso.c
@@ -34,8 +34,7 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb,
 				  SKB_GSO_TCP_ECN |
 				  SKB_GSO_GRE |
 				  SKB_GSO_GRE_CSUM |
-				  SKB_GSO_IPIP |
-				  SKB_GSO_MPLS)))
+				  SKB_GSO_IPIP)))
 		goto out;
 
 	/* Setup inner SKB. */
-- 
1.9.3

^ permalink raw reply related

* [GIT net-next v2] Open vSwitch
From: Pravin B Shelar @ 2014-11-06  9:18 UTC (permalink / raw)
  To: davem; +Cc: netdev

First two patches are related to OVS MPLS support. Rest of patches
are mostly refactoring and minor improvements to openvswitch.

v1-v2:
 - Fix conflicts due to "gue: Remote checksum offload"
----------------------------------------------------------------

The following changes since commit e1b2cb655060e081e73b384b1fc8b2e978f73467:

  fou: Fix typo in returning flags in netlink (2014-11-05 22:18:20 -0500)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/pshelar/openvswitch.git net_next_ovs

for you to fetch changes up to a85311bf1f9f8185682990cafdd4e0572c0ed373:

  openvswitch: Avoid NULL mask check while building mask (2014-11-05 23:52:35 -0800)

----------------------------------------------------------------
Andy Zhou (2):
      openvswitch: refactor do_output() to move NULL check out of fast path
      openvswitch: Refactor get_dp() function into multiple access APIs.

Chunhe Li (1):
      openvswitch: Drop packets when interdev is not up

Jarno Rajahalme (1):
      openvswitch: Fix the type of struct ovs_key_nd nd_target field.

Jesse Gross (1):
      openvswitch: Additional logging for -EINVAL on flow setups.

Joe Stringer (3):
      openvswitch: Remove redundant tcp_flags code.
      openvswitch: Refactor ovs_flow_cmd_fill_info().
      openvswitch: Move key_attr_size() to flow_netlink.h.

Lorand Jakab (1):
      openvswitch: Remove flow member from struct ovs_skb_cb

Pravin B Shelar (4):
      net: Remove MPLS GSO feature.
      openvswitch: Move table destroy to dp-rcu callback.
      openvswitch: Refactor action alloc and copy api.
      openvswitch: Avoid NULL mask check while building mask

Simon Horman (1):
      openvswitch: Add basic MPLS support to kernel

 include/linux/netdev_features.h      |   5 +-
 include/linux/netdevice.h            |   1 -
 include/linux/skbuff.h               |   4 +-
 include/net/mpls.h                   |  39 +++++
 include/uapi/linux/openvswitch.h     |  38 ++++-
 net/core/dev.c                       |   3 +-
 net/core/ethtool.c                   |   1 -
 net/ipv4/af_inet.c                   |   1 -
 net/ipv4/tcp_offload.c               |   1 -
 net/ipv4/udp_offload.c               |   3 +-
 net/ipv6/ip6_offload.c               |   1 -
 net/ipv6/udp_offload.c               |   3 +-
 net/mpls/mpls_gso.c                  |   3 +-
 net/openvswitch/Kconfig              |   1 +
 net/openvswitch/actions.c            | 136 ++++++++++++---
 net/openvswitch/datapath.c           | 215 ++++++++++++-----------
 net/openvswitch/datapath.h           |   4 +-
 net/openvswitch/flow.c               |  30 ++++
 net/openvswitch/flow.h               |  17 +-
 net/openvswitch/flow_netlink.c       | 322 +++++++++++++++++++++++++----------
 net/openvswitch/flow_netlink.h       |   5 +-
 net/openvswitch/flow_table.c         |  11 +-
 net/openvswitch/flow_table.h         |   2 +-
 net/openvswitch/vport-internal_dev.c |   5 +
 24 files changed, 606 insertions(+), 245 deletions(-)
 create mode 100644 include/net/mpls.h

^ permalink raw reply

* Re: bifurcated driver
From: Nicolas Dichtel @ 2014-11-06  9:10 UTC (permalink / raw)
  To: Alex Markuze, Zhou, Danny
  Cc: dev-VfR2kkLFssw@public.gmane.org, Fastabend, John R, Or Gerlitz,
	netdev
In-Reply-To: <CAKfHP0VaCW_zBb9-uJYwwDQ-+sz-DZ=b6hcWn0HfMmMzhiOfUA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

Also CC netdev, this thread may interest network folks.

Le 06/11/2014 09:13, Alex Markuze a écrit :
> Danny sums up the issue perfectly IMHO.
> While both verbs and DPDK aim to provide generic user space networking, the
> similarities end there.
> verbs and RDMA HW are closely coupled and behave differently than standard
> eth nics and are not related to netdev mechanisms.
>
> Or, welcome to this discussion.
>
> Those interested can read the IB spec's (+1K pages) available from
> openfabrics*.
> *https://www.openfabrics.org/index.php
>
>
>
>
> On Thu, Nov 6, 2014 at 6:45 AM, Zhou, Danny <danny.zhou-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
>
>> I roughly read libibverbs related code and relevant infiniband/rdma
>> documents, and found though
>> many concepts in libibverbs look similar to bifurcated driver, but there
>> are still lots of differences as
>> illustrated below based on my understanding:
>>
>> 1) Queue pair defined in RDMA specification are abstract concept, where
>> the queue pairs term used in
>>    bifurcated driver are rx/tx queue pairs in the NIC.
>> 2) Bifurcated PMD in DPDK directly access NIC resources as a slave driver
>> (no NIC control), while libibverbs
>>    as a user space library rather than driver offloads certain operations
>> to kernel driver and NIC by invoking
>>    "verbs" APIs.
>> 3) Libibverbs invokes infiniband specific system calls to allow
>> user/kernel space communication based on
>>    "verbs" defined in infiniband/RDMA spec, while bifurcated driver build
>> on top of af_packet module
>>    and new socket options to do things like hw queue split-off , map
>> certain pages on I/O space to user space
>>    operations, etc.
>> 4) There is a specific embedded MMU unit in Infiniband/RDMA to provide
>> memory protection, while
>>    bifurcated driver uses IOMMU rather than NIC to provide memory
>> protection.
>>
>> IMHO, libibverbs and corresponding kernel modules/drivers are specifically
>> designed and implemented for
>> direct access to RDMA hardware from userspace, and it highly depends on
>> "verbs" related system calls
>> supported by infiniband/rdma mechanism in kernel, rather than netdev
>> mechanism that bifurcated driver
>> solution depends on.
>>
>>> -----Original Message-----
>>> From: Vincent JARDIN [mailto:vincent.jardin-pdR9zngts4EAvxtiuMwx3w@public.gmane.org]
>>> Sent: Thursday, November 06, 2014 9:31 AM
>>> To: Zhou, Danny
>>> Cc: Thomas Monjalon; dev-VfR2kkLFssw@public.gmane.org; Fastabend, John R; Or Gerlitz
>>> Subject: Re: [dpdk-dev] bifurcated driver
>>>
>>> +Or
>>>
>>> On 05/11/2014 23:48, Zhou, Danny wrote:
>>>> Hi Thomas,
>>>>
>>>> Thanks for sharing the links to ibverbs, I will take a close look at
>> it and compare it to bifurcated driver. My take
>>>> after a rough review is that idea is very much similar, but bifurcated
>> driver implementation is generic for any
>>>> Ethernet device based on existing af_packet mechanism, with extension
>> of exchanging the messages between
>>>> user space and kernel space driver.
>>>>
>>>> I have an internal document to summarize the pros and cons of below
>> solutions, except for ibverbs, but
>>>> will be adding it shortly.
>>>>
>>>> - igb_uio
>>>> - uio_pci_generic
>>>> - VFIO
>>>> - bifurcated driver
>>>>
>>>> Short answers to your questions:
>>>>>     - upstream status
>>>> Adding IOMMU based memory protection and generic descriptor
>> description support now, into version 2
>>>> kernel patches.
>>>>
>>>>>     - usable with kernel netdev
>>>> af_packet based, and relevant patchset will be submitted to netdev for
>> sure.
>>>>
>>>>>     - usable in a vm
>>>> No, it does not coexist with SRIOV for a number of reasons, but if you
>> pass-through a PF to a VM, it works perfect.
>>>>
>>>>>     - usable for Ethernet
>>>> It could work with all Ethernet NICs, as flow director is available
>> and NIC driver support new net_ops to split off
>>>> queue pairs for user space.
>>>>
>>>>>     - hardware requirements
>>>> No specific hardware requirements. All mainstream NICs have multiple
>> qpairs and flow director support.
>>>>
>>>>>     - security protection
>>>> Leverage IOMMU to provide memory protection on Intel platform. Other
>> archs provide similar memory protection
>>>> mechanism, so we only use arch-agnostic DMA memory allocation APIs in
>> kernel to support memory protection.
>>>>
>>>>>     - performance
>>>> DPDK native performance on user space queues, as long as drop_en is
>> enabled to avoid head-of-line blocking.
>>>>
>>>> -Danny
>>>>
>>>>> -----Original Message-----
>>>>> From: Thomas Monjalon [mailto:thomas.monjalon-pdR9zngts4EAvxtiuMwx3w@public.gmane.org]
>>>>> Sent: Wednesday, November 05, 2014 9:01 PM
>>>>> To: Zhou, Danny
>>>>> Cc: dev-VfR2kkLFssw@public.gmane.org; Fastabend, John R
>>>>> Subject: Re: [dpdk-dev] bifurcated driver
>>>>>
>>>>> Hi Danny,
>>>>>
>>>>> 2014-10-31 17:36, O'driscoll, Tim:
>>>>>> Bifurcated Driver (Danny.Zhou-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org)
>>>>>
>>>>> Thanks for the presentation of bifurcated driver during the community
>> call.
>>>>> I asked if you looked at ibverbs and you wanted a link to check.
>>>>> The kernel module is here:
>>>>>
>> http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/drivers/infiniband/core
>>>>> The userspace library:
>>>>>     http://git.kernel.org/cgit/libs/infiniband/libibverbs.git
>>>>>
>>>>> Extract from Kconfig:
>>>>> "
>>>>> config INFINIBAND_USER_ACCESS
>>>>>     tristate "InfiniBand userspace access (verbs and CM)"
>>>>>     select ANON_INODES
>>>>>     ---help---
>>>>>       Userspace InfiniBand access support.  This enables the
>>>>>       kernel side of userspace verbs and the userspace
>>>>>       communication manager (CM).  This allows userspace processes
>>>>>       to set up connections and directly access InfiniBand
>>>>>       hardware for fast-path operations.  You will also need
>>>>>       libibverbs, libibcm and a hardware driver library from
>>>>>       <http://www.openfabrics.org/git/>.
>>>>> "
>>>>>
>>>>> It seems to be close to the bifurcated driver needs.
>>>>> Not sure if it can solve the security issues if there is no dedicated
>> MMU
>>>>> in the NIC.
>>>>>
>>>>> I feel we should sum up pros and cons of
>>>>>     - igb_uio
>>>>>     - uio_pci_generic
>>>>>     - VFIO
>>>>>     - ibverbs
>>>>>     - bifurcated driver
>>>>> I suggest to consider these criteria:
>>>>>     - upstream status
>>>>>     - usable with kernel netdev
>>>>>     - usable in a vm
>>>>>     - usable for ethernet
>>>>>     - hardware requirements
>>>>>     - security protection
>>>>>     - performance
>>>>>
>>>>> --
>>>>> Thomas
>>
>>

^ permalink raw reply

* Re: "asix: Don't reset PHY on if_up for ASIX 88772" breaks net on arndale platform
From: Riku Voipio @ 2014-11-06  9:06 UTC (permalink / raw)
  To: Charles Keepax
  Cc: Stam, Michel [FINT], freddy, davem, linux-usb, netdev,
	linux-kernel, linux-samsung-soc
In-Reply-To: <20141105150258.GR23178@opensource.wolfsonmicro.com>

On Wed, Nov 05, 2014 at 03:02:58PM +0000, Charles Keepax wrote:
> On Wed, Nov 05, 2014 at 01:04:37PM +0100, Stam, Michel [FINT] wrote:
> > Hello Charles,
> > 
> > After looking around I found the reset value for the 8772 chip, which
> > seems to be 0x1E1 (ANAR register).
> > 
> > This equates to (according to include/uapi/linux/mii.h)
> > ADVERTISE_ALL | ADVERTISE_CSMA.
> > 
> > The register only seems to become 0 if the software reset fails.
 
> Odd it definitely reads back as zero on Arndale. I am guessing
> that the root of the problem here is that for some reason Arndale
> POR of the ethernet is pants and it needs a full software reset
> before it will work and the patch removes the full reset
> callback.

The asix on arndale comes semi-configured from u-boot, which I guess is
not the state kernel expects it to come in. At least in my case where
I use tftp from u-boot to load my kernel.

So probably the full reset is needed here to make the asix chip come
to a truly pristine state.

The commit that Michel partially reverted (by returning to use
ax88772_link_reset instead of ax88772_reset), indicates that a strong reset
is needed for suspend/resume as well:

commit 4ad1438f025ed8d1e4e95a796ca7f0ad5a22c378
Author: Grant Grundler <grundler@chromium.org>
Date:   Tue Oct 4 09:55:16 2011 +0000

    NET: fix phy init for AX88772 USB ethernet
        
    Fix phy initialization for AX88772 (USB 2.0 100BT). Failure
    was occasionally DHCP wouldn't work after reboot or
    suspend/resume cycle.
 
> > Unfortunately, this is exactly what I get when the patch is applied;
> > asix 1-2:1.0 eth1: Failed to send software reset: ffffffb5
> > asix 1-2:1.0 eth1: link reset failed (-75) usbnet usb-0000:00:1d.0-2, 
> > ASIX AX88772 USB 2.0 Ethernet
> > asix 1-2:1.0 eth1: Failed to send software reset: ffffffb5
> > asix 1-2:1.0 eth1: link reset failed (-75) usbnet usb-0000:00:1d.0-2, 
> > ASIX AX88772 USB 2.0 Ethernet
> 
> Ok so I am guessing you have a value in the register which is
> neither the reset value or 0 and this causing problems later in
> the reset/on the next reset. I do find the naming confusing in
> the error message there as it says link reset failed but the
> link_reset callback can't fail in the driver and I modified the
> reset callback. But I guess that is just oddities of the network
> stack I am not familiar with.
> 
> The other thing that feels odd is (and again apologies as I know
> next to nothing about the networking stack) how come it is
> unexpected that the reset callback destroys the state of the
> device. Naively I would have expected that a reset callback would
> reset the device back to its default state. Here we seem to be
> trying to avoid that happening.

Indeed, it would seem some tracing would be needed to figure out in
which order the .reset and .link_reset callbacks are being called.

^ permalink raw reply

* Re: M_CAN message RAM initialization AppNote  - was: Re: [PATCH V3 3/3] can: m_can: workaround for transmit data less than 4 bytes
From: Marc Kleine-Budde @ 2014-11-06  9:00 UTC (permalink / raw)
  To: Oliver Hartkopp, Dong Aisheng
  Cc: linux-can, wg, varkabhadram, netdev, linux-arm-kernel
In-Reply-To: <545B1D71.4000408@hartkopp.net>

[-- Attachment #1: Type: text/plain, Size: 2447 bytes --]

On 11/06/2014 08:04 AM, Oliver Hartkopp wrote:
> On 06.11.2014 02:57, Dong Aisheng wrote:
>> On Wed, Nov 05, 2014 at 07:15:10PM +0100, Oliver Hartkopp wrote:
> 
>>> The Message RAM is usually equipped with a parity or ECC functionality.
>>> But RAM cells suffer a hardware reset and can therefore hold
>>> arbitrary content at startup - including parity and/or ECC bits.
>>>
>>> So when you write only the CAN ID and the first four bytes the last
>>> four bytes remain untouched. Then the M_CAN starts to read in 32bit
>>> words from the start of the Tx Message element. So it is very likely
>>> to trigger the message RAM error when reading the uninitialized
>>> 32bit word from the last four bytes.
>>>
>>> Finally it turns out that an initial writing (with any kind of data)
>>> to the entire message RAM is mandatory to create valid parity/ECC
>>> checksums.
>>>
>>> That's it.
>>>
>>
>> Thanks for sharing this information.
>> Does it mean this issue is related to the nature of Message RAM and is
>> supposed to exist on all M_CAN IP versions?
> 
> From what I know from the 3.1.x revision there's no change regarding
> IR.BRU and IR.BEC - so I would assume this to stay on all M_CAN IP
> revisions.
> 
> But after some sleep I wonder if this patch [3/3] would need an update too.
> 
> Writing to the TX message RAM is obviously no workaround but a valid and
> needed initialization process.
> 
> I would tend to make this patch:
> 
> ---
> 
> can: m_can: add missing TX message RAM initialization
> 
> The M_CAN message RAM is usually equipped with a parity or ECC
> functionality.
> But RAM cells suffer a hardware reset and can therefore hold arbitrary
> content at startup - including parity and/or ECC bits.
> 
> To prevent the M_CAN controller detecting checksum errors when reading
> potentially uninitialized TX message RAM content to transmit CAN frames
> the TX message RAM has to be written with (any kind of) initial data.
> 
> ---
> 
> Then the code should memset() the entire TX FIFO element - and not only
> the 8 data bytes we are addressing now.

No literal memset() as this is iomem

Marc

-- 
Pengutronix e.K.                  | Marc Kleine-Budde           |
Industrial Linux Solutions        | Phone: +49-231-2826-924     |
Vertretung West/Dortmund          | Fax:   +49-5121-206917-5555 |
Amtsgericht Hildesheim, HRA 2686  | http://www.pengutronix.de   |


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply

* Re: [PATCH] bridge: missing null bridge device check causing null pointer dereference (bugfix)
From: Toshiaki Makita @ 2014-11-06  8:28 UTC (permalink / raw)
  To: 박수현, Stephen Hemminger, David S. Miller
  Cc: netdev@vger.kernel.org, bridge@lists.linux-foundation.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <8D1F1238A24CE743B8F3CED0F137C69E408AA087@EXMB02.ahnbang.ahnlab.com>

On 2014/11/06 16:58, 박수현 wrote:
>> -----Original Message-----
>> From: Toshiaki Makita [mailto:makita.toshiaki@lab.ntt.co.jp]
>> Sent: Thursday, November 06, 2014 4:07 PM
>> To: 박수현; Stephen Hemminger; David S. Miller
>> Cc: bridge@lists.linux-foundation.org; netdev@vger.kernel.org; linux-
>> kernel@vger.kernel.org
>> Subject: Re: [PATCH] bridge: missing null bridge device check causing null
>> pointer dereference (bugfix)
>>
>> On 2014/11/06 15:26, Su-Hyun Park wrote:
>>> the bridge device can be null if the bridge is being deleted while
>>> processing the packet, which causes the null pointer dereference in
>> switch statement.
>>
>> How can this happen??
>> It is guarded by rcu.
>> netdev_rx_handler_unregister() ensures rx_handler_data is non NULL.
>>
> 
> RCU protects rx_handler_data, not the bridge member port. It can be NULL according to the code below.
> 
> static inline struct net_bridge_port *br_port_get_rcu(const struct net_device *dev) {
> 	struct net_bridge_port *port = rcu_dereference(dev->rx_handler_data);
> 	return br_port_exists(dev) ? port : NULL; 
> }

Seems to have been fixed for a year.
716ec052d228 ("bridge: fix NULL pointer deref of br_port_get_rcu")

Thanks,
Toshiaki Makita

^ permalink raw reply

* [PATCH 4/4] net: Kill skb_copy_datagram_const_iovec
From: Herbert Xu @ 2014-11-06  8:28 UTC (permalink / raw)
  To: David Miller, viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki
In-Reply-To: <20141106082704.GB29800@gondor.apana.org.au>

Now that both macvtap and tun are using skb_copy_datagram_iter, we
can kill the abomination that is skb_copy_datagram_const_iovec.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 include/linux/skbuff.h |    3 -
 net/core/datagram.c    |   89 -------------------------------------------------
 2 files changed, 92 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a405013..da59580 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2653,9 +2653,6 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
 				 int len);
 int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *frm,
 			   int offset, size_t count);
-int skb_copy_datagram_const_iovec(const struct sk_buff *from, int offset,
-				  const struct iovec *to, int to_offset,
-				  int size);
 int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
 			   struct iov_iter *to, int size);
 void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 45a9d4d..93054b9 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -394,95 +394,6 @@ fault:
 EXPORT_SYMBOL(skb_copy_datagram_iovec);
 
 /**
- *	skb_copy_datagram_const_iovec - Copy a datagram to an iovec.
- *	@skb: buffer to copy
- *	@offset: offset in the buffer to start copying from
- *	@to: io vector to copy to
- *	@to_offset: offset in the io vector to start copying to
- *	@len: amount of data to copy from buffer to iovec
- *
- *	Returns 0 or -EFAULT.
- *	Note: the iovec is not modified during the copy.
- */
-int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset,
-				  const struct iovec *to, int to_offset,
-				  int len)
-{
-	int start = skb_headlen(skb);
-	int i, copy = start - offset;
-	struct sk_buff *frag_iter;
-
-	/* Copy header. */
-	if (copy > 0) {
-		if (copy > len)
-			copy = len;
-		if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy))
-			goto fault;
-		if ((len -= copy) == 0)
-			return 0;
-		offset += copy;
-		to_offset += copy;
-	}
-
-	/* Copy paged appendix. Hmm... why does this look so complicated? */
-	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-		int end;
-		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-
-		WARN_ON(start > offset + len);
-
-		end = start + skb_frag_size(frag);
-		if ((copy = end - offset) > 0) {
-			int err;
-			u8  *vaddr;
-			struct page *page = skb_frag_page(frag);
-
-			if (copy > len)
-				copy = len;
-			vaddr = kmap(page);
-			err = memcpy_toiovecend(to, vaddr + frag->page_offset +
-						offset - start, to_offset, copy);
-			kunmap(page);
-			if (err)
-				goto fault;
-			if (!(len -= copy))
-				return 0;
-			offset += copy;
-			to_offset += copy;
-		}
-		start = end;
-	}
-
-	skb_walk_frags(skb, frag_iter) {
-		int end;
-
-		WARN_ON(start > offset + len);
-
-		end = start + frag_iter->len;
-		if ((copy = end - offset) > 0) {
-			if (copy > len)
-				copy = len;
-			if (skb_copy_datagram_const_iovec(frag_iter,
-							  offset - start,
-							  to, to_offset,
-							  copy))
-				goto fault;
-			if ((len -= copy) == 0)
-				return 0;
-			offset += copy;
-			to_offset += copy;
-		}
-		start = end;
-	}
-	if (!len)
-		return 0;
-
-fault:
-	return -EFAULT;
-}
-EXPORT_SYMBOL(skb_copy_datagram_const_iovec);
-
-/**
  *	skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
  *	@skb: buffer to copy
  *	@offset: offset in the buffer to start copying from

^ permalink raw reply related

* [PATCH 3/4] macvtap: Use iovec iterators
From: Herbert Xu @ 2014-11-06  8:28 UTC (permalink / raw)
  To: David Miller, viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki
In-Reply-To: <20141106082704.GB29800@gondor.apana.org.au>

This patch removes the use of skb_copy_datagram_const_iovec in
favour of the iovec iterator-based skb_copy_datagram_iter.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 drivers/net/macvtap.c |   45 ++++++++++++++++++++-------------------------
 1 file changed, 20 insertions(+), 25 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 880cc09..a0e1dd7 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -15,6 +15,7 @@
 #include <linux/cdev.h>
 #include <linux/idr.h>
 #include <linux/fs.h>
+#include <linux/uio.h>
 
 #include <net/ipv6.h>
 #include <net/net_namespace.h>
@@ -778,31 +779,28 @@ static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
 /* Put packet to the user space buffer */
 static ssize_t macvtap_put_user(struct macvtap_queue *q,
 				const struct sk_buff *skb,
-				const struct iovec *iv, int len)
+				struct iov_iter *iter)
 {
 	int ret;
 	int vnet_hdr_len = 0;
 	int vlan_offset = 0;
-	int copied, total;
+	int total;
 
 	if (q->flags & IFF_VNET_HDR) {
 		struct virtio_net_hdr vnet_hdr;
 		vnet_hdr_len = q->vnet_hdr_sz;
-		if ((len -= vnet_hdr_len) < 0)
+		if (iov_iter_count(iter) < vnet_hdr_len)
 			return -EINVAL;
 
 		macvtap_skb_to_vnet_hdr(skb, &vnet_hdr);
 
-		if (memcpy_toiovecend(iv, (void *)&vnet_hdr, 0, sizeof(vnet_hdr)))
+		if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter))
 			return -EFAULT;
 	}
-	total = copied = vnet_hdr_len;
+	total = vnet_hdr_len;
 	total += skb->len;
 
-	if (!vlan_tx_tag_present(skb))
-		len = min_t(int, skb->len, len);
-	else {
-		int copy;
+	if (vlan_tx_tag_present(skb)) {
 		struct {
 			__be16 h_vlan_proto;
 			__be16 h_vlan_TCI;
@@ -811,37 +809,33 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
 		veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb));
 
 		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
-		len = min_t(int, skb->len + VLAN_HLEN, len);
 		total += VLAN_HLEN;
 
-		copy = min_t(int, vlan_offset, len);
-		ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy);
-		len -= copy;
-		copied += copy;
-		if (ret || !len)
+		ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
+		if (ret || !iov_iter_count(iter))
 			goto done;
 
-		copy = min_t(int, sizeof(veth), len);
-		ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy);
-		len -= copy;
-		copied += copy;
-		if (ret || !len)
+		ret = copy_to_iter(&veth, sizeof(veth), iter);
+		if (ret || !iov_iter_count(iter))
 			goto done;
 	}
 
-	ret = skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len);
+	ret = skb_copy_datagram_iter(skb, vlan_offset, iter,
+				     skb->len - vlan_offset);
 
 done:
 	return ret ? ret : total;
 }
 
 static ssize_t macvtap_do_read(struct macvtap_queue *q,
-			       const struct iovec *iv, unsigned long len,
+			       const struct iovec *iv, unsigned long segs,
+			       unsigned long len,
 			       int noblock)
 {
 	DEFINE_WAIT(wait);
 	struct sk_buff *skb;
 	ssize_t ret = 0;
+	struct iov_iter iter;
 
 	while (len) {
 		if (!noblock)
@@ -863,7 +857,8 @@ static ssize_t macvtap_do_read(struct macvtap_queue *q,
 			schedule();
 			continue;
 		}
-		ret = macvtap_put_user(q, skb, iv, len);
+		iov_iter_init(&iter, READ, iv, segs, len);
+		ret = macvtap_put_user(q, skb, &iter);
 		kfree_skb(skb);
 		break;
 	}
@@ -886,7 +881,7 @@ static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
 		goto out;
 	}
 
-	ret = macvtap_do_read(q, iv, len, file->f_flags & O_NONBLOCK);
+	ret = macvtap_do_read(q, iv, count, len, file->f_flags & O_NONBLOCK);
 	ret = min_t(ssize_t, ret, len);
 	if (ret > 0)
 		iocb->ki_pos = ret;
@@ -1117,7 +1112,7 @@ static int macvtap_recvmsg(struct kiocb *iocb, struct socket *sock,
 	int ret;
 	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
 		return -EINVAL;
-	ret = macvtap_do_read(q, m->msg_iov, total_len,
+	ret = macvtap_do_read(q, m->msg_iov, m->msg_iovlen, total_len,
 			  flags & MSG_DONTWAIT);
 	if (ret > total_len) {
 		m->msg_flags |= MSG_TRUNC;

^ permalink raw reply related

* [PATCH 2/4] tun: Use iovec iterators
From: Herbert Xu @ 2014-11-06  8:28 UTC (permalink / raw)
  To: David Miller, viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki
In-Reply-To: <20141106082704.GB29800@gondor.apana.org.au>

This patch removes the use of skb_copy_datagram_const_iovec in
favour of the iovec iterator-based skb_copy_datagram_iter.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 drivers/net/tun.c |   65 ++++++++++++++++++++++++------------------------------
 1 file changed, 30 insertions(+), 35 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 9dd3746..b4ac4d5 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -71,6 +71,7 @@
 #include <net/rtnetlink.h>
 #include <net/sock.h>
 #include <linux/seq_file.h>
+#include <linux/uio.h>
 
 #include <asm/uaccess.h>
 
@@ -1230,11 +1231,11 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
 static ssize_t tun_put_user(struct tun_struct *tun,
 			    struct tun_file *tfile,
 			    struct sk_buff *skb,
-			    const struct iovec *iv, int len)
+			    struct iov_iter *iter)
 {
 	struct tun_pi pi = { 0, skb->protocol };
-	ssize_t total = 0;
-	int vlan_offset = 0, copied;
+	ssize_t total;
+	int vlan_offset;
 	int vlan_hlen = 0;
 	int vnet_hdr_sz = 0;
 
@@ -1244,23 +1245,25 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 	if (tun->flags & TUN_VNET_HDR)
 		vnet_hdr_sz = tun->vnet_hdr_sz;
 
+	total = skb->len + vlan_hlen + vnet_hdr_sz;
+
 	if (!(tun->flags & TUN_NO_PI)) {
-		if ((len -= sizeof(pi)) < 0)
+		if (iov_iter_count(iter) < sizeof(pi))
 			return -EINVAL;
 
-		if (len < skb->len + vlan_hlen + vnet_hdr_sz) {
+		total += sizeof(pi);
+		if (iov_iter_count(iter) < total) {
 			/* Packet will be striped */
 			pi.flags |= TUN_PKT_STRIP;
 		}
 
-		if (memcpy_toiovecend(iv, (void *) &pi, 0, sizeof(pi)))
+		if (copy_to_iter(&pi, sizeof(pi), iter))
 			return -EFAULT;
-		total += sizeof(pi);
 	}
 
 	if (vnet_hdr_sz) {
 		struct virtio_net_hdr gso = { 0 }; /* no info leak */
-		if ((len -= vnet_hdr_sz) < 0)
+		if (iov_iter_count(iter) < vnet_hdr_sz)
 			return -EINVAL;
 
 		if (skb_is_gso(skb)) {
@@ -1299,17 +1302,12 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 			gso.flags = VIRTIO_NET_HDR_F_DATA_VALID;
 		} /* else everything is zero */
 
-		if (unlikely(memcpy_toiovecend(iv, (void *)&gso, total,
-					       sizeof(gso))))
+		if (copy_to_iter(&gso, sizeof(gso), iter))
 			return -EFAULT;
-		total += vnet_hdr_sz;
 	}
 
-	copied = total;
-	len = min_t(int, skb->len + vlan_hlen, len);
-	total += skb->len + vlan_hlen;
 	if (vlan_hlen) {
-		int copy, ret;
+		int ret;
 		struct {
 			__be16 h_vlan_proto;
 			__be16 h_vlan_TCI;
@@ -1320,36 +1318,32 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 
 		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
 
-		copy = min_t(int, vlan_offset, len);
-		ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy);
-		len -= copy;
-		copied += copy;
-		if (ret || !len)
+		ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
+		if (ret || !iov_iter_count(iter))
 			goto done;
 
-		copy = min_t(int, sizeof(veth), len);
-		ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy);
-		len -= copy;
-		copied += copy;
-		if (ret || !len)
+		ret = copy_to_iter(&veth, sizeof(veth), iter);
+		if (ret || !iov_iter_count(iter))
 			goto done;
 	}
 
-	skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len);
+	skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset);
 
 done:
 	tun->dev->stats.tx_packets++;
-	tun->dev->stats.tx_bytes += len;
+	tun->dev->stats.tx_bytes += skb->len + vlan_hlen;
 
 	return total;
 }
 
 static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
-			   const struct iovec *iv, ssize_t len, int noblock)
+			   const struct iovec *iv, unsigned long segs,
+			   ssize_t len, int noblock)
 {
 	struct sk_buff *skb;
 	ssize_t ret = 0;
 	int peeked, err, off = 0;
+	struct iov_iter iter;
 
 	tun_debug(KERN_INFO, tun, "tun_do_read\n");
 
@@ -1362,11 +1356,12 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 	/* Read frames from queue */
 	skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0,
 				  &peeked, &off, &err);
-	if (skb) {
-		ret = tun_put_user(tun, tfile, skb, iv, len);
-		kfree_skb(skb);
-	} else
-		ret = err;
+	if (!skb)
+		return ret;
+
+	iov_iter_init(&iter, READ, iv, segs, len);
+	ret = tun_put_user(tun, tfile, skb, &iter);
+	kfree_skb(skb);
 
 	return ret;
 }
@@ -1387,7 +1382,7 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
 		goto out;
 	}
 
-	ret = tun_do_read(tun, tfile, iv, len,
+	ret = tun_do_read(tun, tfile, iv, count, len,
 			  file->f_flags & O_NONBLOCK);
 	ret = min_t(ssize_t, ret, len);
 	if (ret > 0)
@@ -1488,7 +1483,7 @@ static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
 					 SOL_PACKET, TUN_TX_TIMESTAMP);
 		goto out;
 	}
-	ret = tun_do_read(tun, tfile, m->msg_iov, total_len,
+	ret = tun_do_read(tun, tfile, m->msg_iov, m->msg_iovlen, total_len,
 			  flags & MSG_DONTWAIT);
 	if (ret > total_len) {
 		m->msg_flags |= MSG_TRUNC;

^ permalink raw reply related

* [PATCH 1/4] inet: Add skb_copy_datagram_iter
From: Herbert Xu @ 2014-11-06  8:28 UTC (permalink / raw)
  To: David Miller, viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki
In-Reply-To: <20141106082704.GB29800@gondor.apana.org.au>

This patch adds skb_copy_datagram_iter, which is identical to
skb_copy_datagram_iovec except that it operates on iov_iter
instead of iovec.

Eventually all users of skb_copy_datagram_iovec should switch
over to iov_iter and then we can remove skb_copy_datagram_iovec.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 include/linux/skbuff.h |    3 +
 net/core/datagram.c    |   82 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 85 insertions(+)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 39ec753..a405013 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -150,6 +150,7 @@
 struct net_device;
 struct scatterlist;
 struct pipe_inode_info;
+struct iov_iter;
 
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 struct nf_conntrack {
@@ -2655,6 +2656,8 @@ int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *frm,
 int skb_copy_datagram_const_iovec(const struct sk_buff *from, int offset,
 				  const struct iovec *to, int to_offset,
 				  int size);
+int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
+			   struct iov_iter *to, int size);
 void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
 void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb);
 int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index fdbc9a8..45a9d4d 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -49,6 +49,7 @@
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>
+#include <linux/uio.h>
 
 #include <net/protocol.h>
 #include <linux/skbuff.h>
@@ -482,6 +483,87 @@ fault:
 EXPORT_SYMBOL(skb_copy_datagram_const_iovec);
 
 /**
+ *	skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
+ *	@skb: buffer to copy
+ *	@offset: offset in the buffer to start copying from
+ *	@to: iovec iterator to copy to
+ *	@len: amount of data to copy from buffer to iovec
+ */
+int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
+			   struct iov_iter *to, int len)
+{
+	int start = skb_headlen(skb);
+	int i, copy = start - offset;
+	struct sk_buff *frag_iter;
+
+	trace_skb_copy_datagram_iovec(skb, len);
+
+	/* Copy header; copy_to_iter() returns bytes copied, not 0/-errno. */
+	if (copy > 0) {
+		if (copy > len)
+			copy = len;
+		if (copy_to_iter(skb->data + offset, copy, to) != copy)
+			goto fault;
+		if ((len -= copy) == 0)
+			return 0;
+		offset += copy;
+	}
+
+	/* Copy paged appendix. Hmm... why does this look so complicated? */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		int end;
+		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+		WARN_ON(start > offset + len);
+
+		end = start + skb_frag_size(frag);
+		if ((copy = end - offset) > 0) {
+			int n;
+			u8  *vaddr;
+			struct page *page = skb_frag_page(frag);
+
+			if (copy > len)
+				copy = len;
+			vaddr = kmap(page);
+			n = copy_to_iter(vaddr + frag->page_offset +
+					 offset - start, copy, to);
+			kunmap(page);
+			if (n != copy)
+				goto fault;
+			if (!(len -= copy))
+				return 0;
+			offset += copy;
+		}
+		start = end;
+	}
+
+	skb_walk_frags(skb, frag_iter) {
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + frag_iter->len;
+		if ((copy = end - offset) > 0) {
+			if (copy > len)
+				copy = len;
+			if (skb_copy_datagram_iter(frag_iter, offset - start,
+						   to, copy))
+				goto fault;
+			if ((len -= copy) == 0)
+				return 0;
+			offset += copy;
+		}
+		start = end;
+	}
+	if (!len)
+		return 0;
+
+fault:
+	return -EFAULT;
+}
+EXPORT_SYMBOL(skb_copy_datagram_iter);
+
+/**
  *	skb_copy_datagram_from_iovec - Copy a datagram from an iovec.
  *	@skb: buffer to copy
  *	@offset: offset in the buffer to start copying to

^ permalink raw reply related

* [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version
From: Herbert Xu @ 2014-11-06  8:27 UTC (permalink / raw)
  To: David Miller; +Cc: viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki
In-Reply-To: <20141105.152410.1775725940252546246.davem@davemloft.net>

Hi Dave:

This patch series adds the helper skb_copy_datagram_iter, which
is meant to replace both skb_copy_datagram_iovec and its evil
twin skb_copy_datagram_const_iovec.

It then converts tun and macvtap over to the new helper and finally
removes skb_copy_datagram_const_iovec which is only used by tun
and macvtap.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
From: Herbert Xu @ 2014-11-06  8:23 UTC (permalink / raw)
  To: David Miller; +Cc: viro, netdev, linux-kernel, bcrl
In-Reply-To: <20141105.152410.1775725940252546246.davem@davemloft.net>

On Wed, Nov 05, 2014 at 03:24:10PM -0500, David Miller wrote:
> 
> Herbert, please provide a cover letter for this series, and the most recent
> version of patch #2 gets various rejects when I try to apply it to net-next.

Sure, I'll regenerate them.  However, while doing so I noticed that
a number of my patches on tun/macvtap that you have previously set
as accepted are missing from net-next.  Could this be why you got
the rejects?

For example, this patch wasn't in net-next when I just did a pull.

https://patchwork.ozlabs.org/patch/405966/

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH net 0/5] Implement ndo_gso_check() for vxlan nics
From: Or Gerlitz @ 2014-11-06  8:23 UTC (permalink / raw)
  To: Tom Herbert
  Cc: David Miller, Joe Stringer, Linux Netdev List, Sathya Perla,
	Jeff Kirsher, linux.nics, Amir Vadai, shahed.shaikh,
	dept-gelinuxnicdev, LKML
In-Reply-To: <CA+mtBx-Js62X3BpfjQkcsh=w=-cgvSyM5ueCpufGVv0uu1nphw@mail.gmail.com>

On Thu, Nov 6, 2014 at 4:44 AM, Tom Herbert <therbert@google.com> wrote:
> On Wed, Nov 5, 2014 at 6:15 PM, David Miller <davem@davemloft.net> wrote:
>> From: Joe Stringer <joestringer@nicira.com>
>> Date: Wed, 5 Nov 2014 17:06:46 -0800
>>
>>> My impression was that the changes are more likely to be
>>> hardware-specific (like the i40e changes) rather than software-specific,
>>> like changes that might be integrated into the helper.
>>
>> I think there is more commonality amongst hardware capabilities,
>> and this is why I want the helper to play itself out.
>>
>>> That said, I can rework for one helper. The way I see it would be the
>>> same code as these patches, as "vxlan_gso_check(struct sk_buff *)" in
>>> drivers/net/vxlan.c which would be called from each driver. Is that what
>>> you had in mind?
>>
>> Yes.
>
> Note that this code is not VXLAN specific, it will also accept NVGRE
> and GRE/UDP with keyid and TEB. I imagine all these cases should be
> indistinguishable to the hardware so they probably just work (which
> would be cool!). It might be better to name and locate the helper
> function to reflect that.

Unlike the VXLAN case, currently there's no indication from the
network stack towards the driver that an NVGRE tunnel was set, so in
our case we're not arming the HW offloads for NVGRE. I'll look into
that, maybe we can make them work always. Also, regarding the math being
the same for VXLAN/NVGRE -- skb_inner_mac_header(skb) -
skb_transport_header(skb) is exactly 8 (sizeof struct gre_base_hdr),
isn't it?

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox