Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH ipsec-next 02/11] xfrm: security: iterate all, not inexact lists
From: Florian Westphal @ 2018-11-07 22:00 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal
In-Reply-To: <20181107220041.26205-1-fw@strlen.de>

currently all non-socket policies are either hashed in the dst table,
or placed on the 'inexact list'.  When flushing, we first walk the
table, then the (per-direction) inexact lists.

When we try and get rid of the inexact lists to having "n" inexact
lists (e.g. per-af inexact lists, or sorted into a tree), this walk
would become more complicated.

Simplify this: walk the 'all' list and skip socket policies during
traversal so we don't need to handle exact and inexact policies
separately anymore.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/xfrm/xfrm_policy.c | 93 ++++++++++++------------------------------
 1 file changed, 26 insertions(+), 67 deletions(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 119a427d9b2b..39d0db2a50d9 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -892,36 +892,19 @@ EXPORT_SYMBOL(xfrm_policy_byid);
 static inline int
 xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid)
 {
-	int dir, err = 0;
+	struct xfrm_policy *pol;
+	int err = 0;
 
-	for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
-		struct xfrm_policy *pol;
-		int i;
+	list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
+		if (pol->walk.dead ||
+		    xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX ||
+		    pol->type != type)
+			continue;
 
-		hlist_for_each_entry(pol,
-				     &net->xfrm.policy_inexact[dir], bydst) {
-			if (pol->type != type)
-				continue;
-			err = security_xfrm_policy_delete(pol->security);
-			if (err) {
-				xfrm_audit_policy_delete(pol, 0, task_valid);
-				return err;
-			}
-		}
-		for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) {
-			hlist_for_each_entry(pol,
-					     net->xfrm.policy_bydst[dir].table + i,
-					     bydst) {
-				if (pol->type != type)
-					continue;
-				err = security_xfrm_policy_delete(
-								pol->security);
-				if (err) {
-					xfrm_audit_policy_delete(pol, 0,
-								 task_valid);
-					return err;
-				}
-			}
+		err = security_xfrm_policy_delete(pol->security);
+		if (err) {
+			xfrm_audit_policy_delete(pol, 0, task_valid);
+			return err;
 		}
 	}
 	return err;
@@ -937,6 +920,7 @@ xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid)
 int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
 {
 	int dir, err = 0, cnt = 0;
+	struct xfrm_policy *pol;
 
 	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
 
@@ -944,46 +928,21 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
 	if (err)
 		goto out;
 
-	for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
-		struct xfrm_policy *pol;
-		int i;
-
-	again1:
-		hlist_for_each_entry(pol,
-				     &net->xfrm.policy_inexact[dir], bydst) {
-			if (pol->type != type)
-				continue;
-			__xfrm_policy_unlink(pol, dir);
-			spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
-			cnt++;
-
-			xfrm_audit_policy_delete(pol, 1, task_valid);
-
-			xfrm_policy_kill(pol);
-
-			spin_lock_bh(&net->xfrm.xfrm_policy_lock);
-			goto again1;
-		}
-
-		for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) {
-	again2:
-			hlist_for_each_entry(pol,
-					     net->xfrm.policy_bydst[dir].table + i,
-					     bydst) {
-				if (pol->type != type)
-					continue;
-				__xfrm_policy_unlink(pol, dir);
-				spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
-				cnt++;
-
-				xfrm_audit_policy_delete(pol, 1, task_valid);
-				xfrm_policy_kill(pol);
-
-				spin_lock_bh(&net->xfrm.xfrm_policy_lock);
-				goto again2;
-			}
-		}
+again:
+	list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
+		dir = xfrm_policy_id2dir(pol->index);
+		if (pol->walk.dead ||
+		    dir >= XFRM_POLICY_MAX ||
+		    pol->type != type)
+			continue;
 
+		__xfrm_policy_unlink(pol, dir);
+		spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+		cnt++;
+		xfrm_audit_policy_delete(pol, 1, task_valid);
+		xfrm_policy_kill(pol);
+		spin_lock_bh(&net->xfrm.xfrm_policy_lock);
+		goto again;
 	}
 	if (!cnt)
 		err = -ESRCH;
-- 
2.18.1

^ permalink raw reply related

* [PATCH ipsec-next 03/11] xfrm: policy: split list insertion into a helper
From: Florian Westphal @ 2018-11-07 22:00 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal
In-Reply-To: <20181107220041.26205-1-fw@strlen.de>

... so we can reuse this later without code duplication when we add
policy to a second inexact list.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/xfrm/xfrm_policy.c | 43 ++++++++++++++++++++++++++----------------
 1 file changed, 27 insertions(+), 16 deletions(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 39d0db2a50d9..20d6815be0d7 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -740,18 +740,12 @@ static bool xfrm_policy_mark_match(struct xfrm_policy *policy,
 	return false;
 }
 
-int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
+static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain,
+						   struct xfrm_policy *policy,
+						   bool excl)
 {
-	struct net *net = xp_net(policy);
-	struct xfrm_policy *pol;
-	struct xfrm_policy *delpol;
-	struct hlist_head *chain;
-	struct hlist_node *newpos;
+	struct xfrm_policy *pol, *newpos = NULL, *delpol = NULL;
 
-	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
-	chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
-	delpol = NULL;
-	newpos = NULL;
 	hlist_for_each_entry(pol, chain, bydst) {
 		if (pol->type == policy->type &&
 		    pol->if_id == policy->if_id &&
@@ -759,24 +753,41 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 		    xfrm_policy_mark_match(policy, pol) &&
 		    xfrm_sec_ctx_match(pol->security, policy->security) &&
 		    !WARN_ON(delpol)) {
-			if (excl) {
-				spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
-				return -EEXIST;
-			}
+			if (excl)
+				return ERR_PTR(-EEXIST);
 			delpol = pol;
 			if (policy->priority > pol->priority)
 				continue;
 		} else if (policy->priority >= pol->priority) {
-			newpos = &pol->bydst;
+			newpos = pol;
 			continue;
 		}
 		if (delpol)
 			break;
 	}
 	if (newpos)
-		hlist_add_behind_rcu(&policy->bydst, newpos);
+		hlist_add_behind_rcu(&policy->bydst, &newpos->bydst);
 	else
 		hlist_add_head_rcu(&policy->bydst, chain);
+
+	return delpol;
+}
+
+int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
+{
+	struct net *net = xp_net(policy);
+	struct xfrm_policy *delpol;
+	struct hlist_head *chain;
+
+	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
+	chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
+	delpol = xfrm_policy_insert_list(chain, policy, excl);
+
+	if (IS_ERR(delpol)) {
+		spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+		return PTR_ERR(delpol);
+	}
+
 	__xfrm_policy_link(policy, dir);
 
 	/* After previous checking, family can either be AF_INET or AF_INET6 */
-- 
2.18.1

^ permalink raw reply related

* [PATCH ipsec-next 04/11] xfrm: policy: return NULL when inexact search needed
From: Florian Westphal @ 2018-11-07 22:00 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal
In-Reply-To: <20181107220041.26205-1-fw@strlen.de>

currently policy_hash_bysel() returns the hash bucket list
(for exact policies), or the inexact list (when policy uses a prefix).

Searching this inexact list is slow, so it might be better to pre-sort
inexact lists into a tree or another data structure for faster
searching.

However, due to 'any' policies, that need to be searched in any case,
doing so will require that 'inexact' policies need to be handled
specially to decide the best search strategy.  So change hash_bysel()
and return NULL if the policy can't be handled via the policy hash
table.

Right now, we simply use the inexact list when this happens, but
future patch can then implement a different strategy.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/xfrm/xfrm_policy.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 20d6815be0d7..b00c265f6be3 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -365,7 +365,7 @@ static struct hlist_head *policy_hash_bysel(struct net *net,
 	hash = __sel_hash(sel, family, hmask, dbits, sbits);
 
 	if (hash == hmask + 1)
-		return &net->xfrm.policy_inexact[dir];
+		return NULL;
 
 	return rcu_dereference_check(net->xfrm.policy_bydst[dir].table,
 		     lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash;
@@ -625,6 +625,8 @@ static void xfrm_hash_rebuild(struct work_struct *work)
 		chain = policy_hash_bysel(net, &policy->selector,
 					  policy->family,
 					  xfrm_policy_id2dir(policy->index));
+		if (!chain)
+			chain = &net->xfrm.policy_inexact[dir];
 		hlist_for_each_entry(pol, chain, bydst) {
 			if (policy->priority >= pol->priority)
 				newpos = &pol->bydst;
@@ -781,7 +783,12 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 
 	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
 	chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
-	delpol = xfrm_policy_insert_list(chain, policy, excl);
+	if (chain) {
+		delpol = xfrm_policy_insert_list(chain, policy, excl);
+	} else {
+		chain = &net->xfrm.policy_inexact[dir];
+		delpol = xfrm_policy_insert_list(chain, policy, excl);
+	}
 
 	if (IS_ERR(delpol)) {
 		spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
@@ -829,6 +836,8 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id,
 	*err = 0;
 	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
 	chain = policy_hash_bysel(net, sel, sel->family, dir);
+	if (!chain)
+		chain = &net->xfrm.policy_inexact[dir];
 	ret = NULL;
 	hlist_for_each_entry(pol, chain, bydst) {
 		if (pol->type == type &&
-- 
2.18.1

^ permalink raw reply related

* [PATCH ipsec-next 05/11] xfrm: policy: store inexact policies in an rhashtable
From: Florian Westphal @ 2018-11-07 22:00 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal
In-Reply-To: <20181107220041.26205-1-fw@strlen.de>

Switch packet-path lookups for inexact policies to rhashtable.

In this initial version, we now no longer need to search policies with
non-matching address family and type.

Next patch will add the if_id as well so lookups from the xfrm interface
driver only need to search inexact policies for that device.

Future patches will augment the hlist in each rhash bucket with a tree
and pre-sort policies according to daddr/prefix.

A single rhashtable is used.  In order to avoid a full rhashtable walk on
netns exit, the bins get placed on a pernet list, i.e. we add almost no
cost for network namespaces that had no xfrm policies.

The inexact lists are kept in place, and policies are added to both the
per-rhash-inexact list and a pernet one.

The latter is needed for the control plane to handle migrate -- these
requests do not consider the if_id, so if we'd remove the inexact_list
now we would have to search all hash buckets and then figure
out which matching policy candidate is the most recent one -- this appears
a bit harder than just keeping the 'old' inexact list for this purpose.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/net/netns/xfrm.h |   2 +
 include/net/xfrm.h       |   1 +
 net/xfrm/xfrm_policy.c   | 350 +++++++++++++++++++++++++++++++++++++--
 3 files changed, 335 insertions(+), 18 deletions(-)

diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h
index 9991e5ef52cc..59f45b1e9dac 100644
--- a/include/net/netns/xfrm.h
+++ b/include/net/netns/xfrm.h
@@ -5,6 +5,7 @@
 #include <linux/list.h>
 #include <linux/wait.h>
 #include <linux/workqueue.h>
+#include <linux/rhashtable-types.h>
 #include <linux/xfrm.h>
 #include <net/dst_ops.h>
 
@@ -53,6 +54,7 @@ struct netns_xfrm {
 	unsigned int		policy_count[XFRM_POLICY_MAX * 2];
 	struct work_struct	policy_hash_work;
 	struct xfrm_policy_hthresh policy_hthresh;
+	struct list_head	inexact_bins;
 
 
 	struct sock		*nlsk;
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 0eb390c205af..870fa9b27f7e 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -596,6 +596,7 @@ struct xfrm_policy {
 	u16			family;
 	struct xfrm_sec_ctx	*security;
 	struct xfrm_tmpl       	xfrm_vec[XFRM_MAX_DEPTH];
+	struct hlist_node	bydst_inexact_list;
 	struct rcu_head		rcu;
 };
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index b00c265f6be3..5c7e7399323f 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -26,6 +26,7 @@
 #include <linux/cache.h>
 #include <linux/cpu.h>
 #include <linux/audit.h>
+#include <linux/rhashtable.h>
 #include <net/dst.h>
 #include <net/flow.h>
 #include <net/xfrm.h>
@@ -45,6 +46,22 @@ struct xfrm_flo {
 	u8 flags;
 };
 
+struct xfrm_pol_inexact_key {
+	possible_net_t net;
+	u16 family;
+	u8 dir, type;
+};
+
+struct xfrm_pol_inexact_bin {
+	struct xfrm_pol_inexact_key k;
+	struct rhash_head head;
+	struct hlist_head hhead;
+
+	/* slow path below */
+	struct list_head inexact_bins;
+	struct rcu_head rcu;
+};
+
 static DEFINE_SPINLOCK(xfrm_if_cb_lock);
 static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly;
 
@@ -55,6 +72,9 @@ static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
 static struct kmem_cache *xfrm_dst_cache __ro_after_init;
 static __read_mostly seqcount_t xfrm_policy_hash_generation;
 
+static struct rhashtable xfrm_policy_inexact_table;
+static const struct rhashtable_params xfrm_pol_inexact_params;
+
 static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr);
 static int stale_bundle(struct dst_entry *dst);
 static int xfrm_bundle_ok(struct xfrm_dst *xdst);
@@ -64,6 +84,18 @@ static void __xfrm_policy_link(struct xfrm_policy *pol, int dir);
 static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
 						int dir);
 
+static struct xfrm_pol_inexact_bin *
+xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir);
+
+static struct xfrm_pol_inexact_bin *
+xfrm_policy_inexact_lookup_rcu(struct net *net,
+			       u8 type, u16 family, u8 dir);
+static struct xfrm_policy *
+xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy,
+			bool excl);
+static void xfrm_policy_insert_inexact_list(struct hlist_head *chain,
+					    struct xfrm_policy *policy);
+
 static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy)
 {
 	return refcount_inc_not_zero(&policy->refcnt);
@@ -269,6 +301,7 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp)
 	if (policy) {
 		write_pnet(&policy->xp_net, net);
 		INIT_LIST_HEAD(&policy->walk.all);
+		INIT_HLIST_NODE(&policy->bydst_inexact_list);
 		INIT_HLIST_NODE(&policy->bydst);
 		INIT_HLIST_NODE(&policy->byidx);
 		rwlock_init(&policy->lock);
@@ -563,6 +596,107 @@ static void xfrm_hash_resize(struct work_struct *work)
 	mutex_unlock(&hash_resize_mutex);
 }
 
+static void xfrm_hash_reset_inexact_table(struct net *net)
+{
+	struct xfrm_pol_inexact_bin *b;
+
+	lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+
+	list_for_each_entry(b, &net->xfrm.inexact_bins, inexact_bins)
+		INIT_HLIST_HEAD(&b->hhead);
+}
+
+/* Make sure *pol can be inserted into fastbin.
+ * Useful to check that later insert requests will be sucessful
+ * (provided xfrm_policy_lock is held throughout).
+ */
+static struct xfrm_pol_inexact_bin *
+xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir)
+{
+	struct xfrm_pol_inexact_bin *bin, *prev;
+	struct xfrm_pol_inexact_key k = {
+		.family = pol->family,
+		.type = pol->type,
+		.dir = dir,
+	};
+	struct net *net = xp_net(pol);
+
+	lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+
+	write_pnet(&k.net, net);
+	bin = rhashtable_lookup_fast(&xfrm_policy_inexact_table, &k,
+				     xfrm_pol_inexact_params);
+	if (bin)
+		return bin;
+
+	bin = kzalloc(sizeof(*bin), GFP_ATOMIC);
+	if (!bin)
+		return NULL;
+
+	bin->k = k;
+	INIT_HLIST_HEAD(&bin->hhead);
+
+	prev = rhashtable_lookup_get_insert_key(&xfrm_policy_inexact_table,
+						&bin->k, &bin->head,
+						xfrm_pol_inexact_params);
+	if (!prev) {
+		list_add(&bin->inexact_bins, &net->xfrm.inexact_bins);
+		return bin;
+	}
+
+	kfree(bin);
+
+	return IS_ERR(prev) ? NULL : prev;
+}
+
+static void xfrm_policy_inexact_delete_bin(struct net *net,
+					   struct xfrm_pol_inexact_bin *b)
+{
+	lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+
+	if (!hlist_empty(&b->hhead))
+		return;
+
+	if (rhashtable_remove_fast(&xfrm_policy_inexact_table, &b->head,
+				   xfrm_pol_inexact_params) == 0) {
+		list_del(&b->inexact_bins);
+		kfree_rcu(b, rcu);
+	}
+}
+
+static void __xfrm_policy_inexact_flush(struct net *net)
+{
+	struct xfrm_pol_inexact_bin *bin;
+
+	lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+
+	list_for_each_entry(bin, &net->xfrm.inexact_bins, inexact_bins)
+		xfrm_policy_inexact_delete_bin(net, bin);
+}
+
+static struct xfrm_policy *
+xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl)
+{
+	struct xfrm_pol_inexact_bin *bin;
+	struct xfrm_policy *delpol;
+	struct hlist_head *chain;
+	struct net *net;
+
+	bin = xfrm_policy_inexact_alloc_bin(policy, dir);
+	if (!bin)
+		return ERR_PTR(-ENOMEM);
+
+	delpol = xfrm_policy_insert_list(&bin->hhead, policy, excl);
+	if (delpol && excl)
+		return ERR_PTR(-EEXIST);
+
+	net = xp_net(policy);
+	chain = &net->xfrm.policy_inexact[dir];
+	xfrm_policy_insert_inexact_list(chain, policy);
+
+	return delpol;
+}
+
 static void xfrm_hash_rebuild(struct work_struct *work)
 {
 	struct net *net = container_of(work, struct net,
@@ -592,7 +726,45 @@ static void xfrm_hash_rebuild(struct work_struct *work)
 
 	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
 
+	/* make sure that we can insert the indirect policies again before
+	 * we start with destructive action.
+	 */
+	list_for_each_entry(policy, &net->xfrm.policy_all, walk.all) {
+		u8 dbits, sbits;
+
+		dir = xfrm_policy_id2dir(policy->index);
+		if (policy->walk.dead || dir >= XFRM_POLICY_MAX)
+			continue;
+
+		if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) {
+			if (policy->family == AF_INET) {
+				dbits = rbits4;
+				sbits = lbits4;
+			} else {
+				dbits = rbits6;
+				sbits = lbits6;
+			}
+		} else {
+			if (policy->family == AF_INET) {
+				dbits = lbits4;
+				sbits = rbits4;
+			} else {
+				dbits = lbits6;
+				sbits = rbits6;
+			}
+		}
+
+		if (policy->selector.prefixlen_d < dbits ||
+		    policy->selector.prefixlen_s < sbits)
+			continue;
+
+		if (!xfrm_policy_inexact_alloc_bin(policy, dir))
+			goto out_unlock;
+	}
+
 	/* reset the bydst and inexact table in all directions */
+	xfrm_hash_reset_inexact_table(net);
+
 	for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
 		INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]);
 		hmask = net->xfrm.policy_bydst[dir].hmask;
@@ -625,8 +797,13 @@ static void xfrm_hash_rebuild(struct work_struct *work)
 		chain = policy_hash_bysel(net, &policy->selector,
 					  policy->family,
 					  xfrm_policy_id2dir(policy->index));
-		if (!chain)
-			chain = &net->xfrm.policy_inexact[dir];
+		if (!chain) {
+			void *p = xfrm_policy_inexact_insert(policy, dir, 0);
+
+			WARN_ONCE(IS_ERR(p), "reinsert: %ld\n", PTR_ERR(p));
+			continue;
+		}
+
 		hlist_for_each_entry(pol, chain, bydst) {
 			if (policy->priority >= pol->priority)
 				newpos = &pol->bydst;
@@ -639,6 +816,7 @@ static void xfrm_hash_rebuild(struct work_struct *work)
 			hlist_add_head_rcu(&policy->bydst, chain);
 	}
 
+out_unlock:
 	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 
 	mutex_unlock(&hash_resize_mutex);
@@ -742,6 +920,84 @@ static bool xfrm_policy_mark_match(struct xfrm_policy *policy,
 	return false;
 }
 
+static u32 xfrm_pol_bin_key(const void *data, u32 len, u32 seed)
+{
+	const struct xfrm_pol_inexact_key *k = data;
+	u32 a = k->type << 24 | k->dir << 16 | k->family;
+
+	return jhash_2words(a, net_hash_mix(read_pnet(&k->net)), seed);
+}
+
+static u32 xfrm_pol_bin_obj(const void *data, u32 len, u32 seed)
+{
+	const struct xfrm_pol_inexact_bin *b = data;
+
+	return xfrm_pol_bin_key(&b->k, 0, seed);
+}
+
+static int xfrm_pol_bin_cmp(struct rhashtable_compare_arg *arg,
+			    const void *ptr)
+{
+	const struct xfrm_pol_inexact_key *key = arg->key;
+	const struct xfrm_pol_inexact_bin *b = ptr;
+	int ret;
+
+	if (!net_eq(read_pnet(&b->k.net), read_pnet(&key->net)))
+		return -1;
+
+	ret = b->k.dir ^ key->dir;
+	if (ret)
+		return ret;
+
+	ret = b->k.type ^ key->type;
+	if (ret)
+		return ret;
+
+	ret = b->k.family ^ key->family;
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static const struct rhashtable_params xfrm_pol_inexact_params = {
+	.head_offset		= offsetof(struct xfrm_pol_inexact_bin, head),
+	.hashfn			= xfrm_pol_bin_key,
+	.obj_hashfn		= xfrm_pol_bin_obj,
+	.obj_cmpfn		= xfrm_pol_bin_cmp,
+	.automatic_shrinking	= true,
+};
+
+static void xfrm_policy_insert_inexact_list(struct hlist_head *chain,
+					    struct xfrm_policy *policy)
+{
+	struct xfrm_policy *pol, *delpol = NULL;
+	struct hlist_node *newpos = NULL;
+
+	hlist_for_each_entry(pol, chain, bydst_inexact_list) {
+		if (pol->type == policy->type &&
+		    pol->if_id == policy->if_id &&
+		    !selector_cmp(&pol->selector, &policy->selector) &&
+		    xfrm_policy_mark_match(policy, pol) &&
+		    xfrm_sec_ctx_match(pol->security, policy->security) &&
+		    !WARN_ON(delpol)) {
+			delpol = pol;
+			if (policy->priority > pol->priority)
+				continue;
+		} else if (policy->priority >= pol->priority) {
+			newpos = &pol->bydst_inexact_list;
+			continue;
+		}
+		if (delpol)
+			break;
+	}
+
+	if (newpos)
+		hlist_add_behind_rcu(&policy->bydst_inexact_list, newpos);
+	else
+		hlist_add_head_rcu(&policy->bydst_inexact_list, chain);
+}
+
 static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain,
 						   struct xfrm_policy *policy,
 						   bool excl)
@@ -767,6 +1023,7 @@ static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain,
 		if (delpol)
 			break;
 	}
+
 	if (newpos)
 		hlist_add_behind_rcu(&policy->bydst, &newpos->bydst);
 	else
@@ -783,12 +1040,10 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 
 	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
 	chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
-	if (chain) {
+	if (chain)
 		delpol = xfrm_policy_insert_list(chain, policy, excl);
-	} else {
-		chain = &net->xfrm.policy_inexact[dir];
-		delpol = xfrm_policy_insert_list(chain, policy, excl);
-	}
+	else
+		delpol = xfrm_policy_inexact_insert(policy, dir, excl);
 
 	if (IS_ERR(delpol)) {
 		spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
@@ -830,14 +1085,24 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id,
 					  struct xfrm_sec_ctx *ctx, int delete,
 					  int *err)
 {
-	struct xfrm_policy *pol, *ret;
+	struct xfrm_pol_inexact_bin *bin = NULL;
+	struct xfrm_policy *pol, *ret = NULL;
 	struct hlist_head *chain;
 
 	*err = 0;
 	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
 	chain = policy_hash_bysel(net, sel, sel->family, dir);
-	if (!chain)
-		chain = &net->xfrm.policy_inexact[dir];
+	if (!chain) {
+		bin = xfrm_policy_inexact_lookup(net, type,
+						 sel->family, dir);
+		if (!bin) {
+			spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+			return NULL;
+		}
+
+		chain = &bin->hhead;
+	}
+
 	ret = NULL;
 	hlist_for_each_entry(pol, chain, bydst) {
 		if (pol->type == type &&
@@ -854,6 +1119,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id,
 					return pol;
 				}
 				__xfrm_policy_unlink(pol, dir);
+				xfrm_policy_inexact_delete_bin(net, bin);
 			}
 			ret = pol;
 			break;
@@ -964,7 +1230,9 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
 		spin_lock_bh(&net->xfrm.xfrm_policy_lock);
 		goto again;
 	}
-	if (!cnt)
+	if (cnt)
+		__xfrm_policy_inexact_flush(net);
+	else
 		err = -ESRCH;
 out:
 	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
@@ -1063,21 +1331,50 @@ static int xfrm_policy_match(const struct xfrm_policy *pol,
 	if (match)
 		ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid,
 						  dir);
-
 	return ret;
 }
 
+static struct xfrm_pol_inexact_bin *
+xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family, u8 dir)
+{
+	struct xfrm_pol_inexact_key k = {
+		.family = family,
+		.type = type,
+		.dir = dir,
+	};
+
+	write_pnet(&k.net, net);
+
+	return rhashtable_lookup(&xfrm_policy_inexact_table, &k,
+				 xfrm_pol_inexact_params);
+}
+
+static struct xfrm_pol_inexact_bin *
+xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir)
+{
+	struct xfrm_pol_inexact_bin *bin;
+
+	lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+
+	rcu_read_lock();
+	bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir);
+	rcu_read_unlock();
+
+	return bin;
+}
+
 static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
 						     const struct flowi *fl,
 						     u16 family, u8 dir,
 						     u32 if_id)
 {
-	int err;
-	struct xfrm_policy *pol, *ret;
 	const xfrm_address_t *daddr, *saddr;
+	struct xfrm_pol_inexact_bin *bin;
+	struct xfrm_policy *pol, *ret;
 	struct hlist_head *chain;
 	unsigned int sequence;
 	u32 priority;
+	int err;
 
 	daddr = xfrm_flowi_daddr(fl, family);
 	saddr = xfrm_flowi_saddr(fl, family);
@@ -1108,7 +1405,10 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
 			break;
 		}
 	}
-	chain = &net->xfrm.policy_inexact[dir];
+	bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir);
+	if (!bin)
+		goto skip_inexact;
+	chain = &bin->hhead;
 	hlist_for_each_entry_rcu(pol, chain, bydst) {
 		if ((pol->priority >= priority) && ret)
 			break;
@@ -1127,6 +1427,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
 		}
 	}
 
+skip_inexact:
 	if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence))
 		goto retry;
 
@@ -1218,6 +1519,7 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
 	/* Socket policies are not hashed. */
 	if (!hlist_unhashed(&pol->bydst)) {
 		hlist_del_rcu(&pol->bydst);
+		hlist_del_init(&pol->bydst_inexact_list);
 		hlist_del(&pol->byidx);
 	}
 
@@ -2795,13 +3097,17 @@ static void xfrm_statistics_fini(struct net *net)
 static int __net_init xfrm_policy_init(struct net *net)
 {
 	unsigned int hmask, sz;
-	int dir;
+	int dir, err;
 
-	if (net_eq(net, &init_net))
+	if (net_eq(net, &init_net)) {
 		xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache",
 					   sizeof(struct xfrm_dst),
 					   0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
 					   NULL);
+		err = rhashtable_init(&xfrm_policy_inexact_table,
+				      &xfrm_pol_inexact_params);
+		BUG_ON(err);
+	}
 
 	hmask = 8 - 1;
 	sz = (hmask+1) * sizeof(struct hlist_head);
@@ -2836,6 +3142,7 @@ static int __net_init xfrm_policy_init(struct net *net)
 	seqlock_init(&net->xfrm.policy_hthresh.lock);
 
 	INIT_LIST_HEAD(&net->xfrm.policy_all);
+	INIT_LIST_HEAD(&net->xfrm.inexact_bins);
 	INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize);
 	INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild);
 	return 0;
@@ -2854,6 +3161,7 @@ static int __net_init xfrm_policy_init(struct net *net)
 
 static void xfrm_policy_fini(struct net *net)
 {
+	struct xfrm_pol_inexact_bin *bin, *tmp;
 	unsigned int sz;
 	int dir;
 
@@ -2879,6 +3187,12 @@ static void xfrm_policy_fini(struct net *net)
 	sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head);
 	WARN_ON(!hlist_empty(net->xfrm.policy_byidx));
 	xfrm_hash_free(net->xfrm.policy_byidx, sz);
+
+	list_for_each_entry_safe(bin, tmp, &net->xfrm.inexact_bins,
+				 inexact_bins) {
+		WARN_ON(!hlist_empty(&bin->hhead));
+		xfrm_policy_inexact_delete_bin(net, bin);
+	}
 }
 
 static int __net_init xfrm_net_init(struct net *net)
@@ -3044,7 +3358,7 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *
 		}
 	}
 	chain = &net->xfrm.policy_inexact[dir];
-	hlist_for_each_entry(pol, chain, bydst) {
+	hlist_for_each_entry(pol, chain, bydst_inexact_list) {
 		if ((pol->priority >= priority) && ret)
 			break;
 
-- 
2.18.1

^ permalink raw reply related

* [PATCH ipsec-next 06/11] xfrm: policy: consider if_id when hashing inexact policy
From: Florian Westphal @ 2018-11-07 22:00 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal
In-Reply-To: <20181107220041.26205-1-fw@strlen.de>

This avoids searches of polices that cannot match in the first
place due to different interface id by placing them in different bins.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/xfrm/xfrm_policy.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 5c7e7399323f..dda27fd7b8a4 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -48,6 +48,7 @@ struct xfrm_flo {
 
 struct xfrm_pol_inexact_key {
 	possible_net_t net;
+	u32 if_id;
 	u16 family;
 	u8 dir, type;
 };
@@ -85,11 +86,12 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
 						int dir);
 
 static struct xfrm_pol_inexact_bin *
-xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir);
+xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir,
+			   u32 if_id);
 
 static struct xfrm_pol_inexact_bin *
 xfrm_policy_inexact_lookup_rcu(struct net *net,
-			       u8 type, u16 family, u8 dir);
+			       u8 type, u16 family, u8 dir, u32 if_id);
 static struct xfrm_policy *
 xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy,
 			bool excl);
@@ -618,6 +620,7 @@ xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir)
 		.family = pol->family,
 		.type = pol->type,
 		.dir = dir,
+		.if_id = pol->if_id,
 	};
 	struct net *net = xp_net(pol);
 
@@ -925,7 +928,8 @@ static u32 xfrm_pol_bin_key(const void *data, u32 len, u32 seed)
 	const struct xfrm_pol_inexact_key *k = data;
 	u32 a = k->type << 24 | k->dir << 16 | k->family;
 
-	return jhash_2words(a, net_hash_mix(read_pnet(&k->net)), seed);
+	return jhash_3words(a, k->if_id, net_hash_mix(read_pnet(&k->net)),
+			    seed);
 }
 
 static u32 xfrm_pol_bin_obj(const void *data, u32 len, u32 seed)
@@ -957,7 +961,7 @@ static int xfrm_pol_bin_cmp(struct rhashtable_compare_arg *arg,
 	if (ret)
 		return ret;
 
-	return 0;
+	return b->k.if_id ^ key->if_id;
 }
 
 static const struct rhashtable_params xfrm_pol_inexact_params = {
@@ -1094,7 +1098,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id,
 	chain = policy_hash_bysel(net, sel, sel->family, dir);
 	if (!chain) {
 		bin = xfrm_policy_inexact_lookup(net, type,
-						 sel->family, dir);
+						 sel->family, dir, if_id);
 		if (!bin) {
 			spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 			return NULL;
@@ -1335,12 +1339,14 @@ static int xfrm_policy_match(const struct xfrm_policy *pol,
 }
 
 static struct xfrm_pol_inexact_bin *
-xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family, u8 dir)
+xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family,
+			       u8 dir, u32 if_id)
 {
 	struct xfrm_pol_inexact_key k = {
 		.family = family,
 		.type = type,
 		.dir = dir,
+		.if_id = if_id,
 	};
 
 	write_pnet(&k.net, net);
@@ -1350,14 +1356,15 @@ xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family, u8 dir)
 }
 
 static struct xfrm_pol_inexact_bin *
-xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir)
+xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family,
+			   u8 dir, u32 if_id)
 {
 	struct xfrm_pol_inexact_bin *bin;
 
 	lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
 
 	rcu_read_lock();
-	bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir);
+	bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id);
 	rcu_read_unlock();
 
 	return bin;
@@ -1405,7 +1412,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
 			break;
 		}
 	}
-	bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir);
+	bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id);
 	if (!bin)
 		goto skip_inexact;
 	chain = &bin->hhead;
-- 
2.18.1

^ permalink raw reply related

* [PATCH ipsec-next 07/11] xfrm: policy: add inexact policy search tree infrastructure
From: Florian Westphal @ 2018-11-07 22:00 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal
In-Reply-To: <20181107220041.26205-1-fw@strlen.de>

At this time inexact policies are all searched in-order until the first
match is found.  After removal of the flow cache, this resolution has
to be performed for every packetm resulting in major slowdown when
number of inexact policies is high.

This adds infrastructure to later sort inexact policies into a tree.
This only introduces a single class: any:any.

Next patch will add a search tree to pre-sort policies that
have a fixed daddr/prefixlen, so in this patch the any:any class
will still be used for all policies.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/net/xfrm.h     |   1 +
 net/xfrm/xfrm_policy.c | 301 +++++++++++++++++++++++++++++++++--------
 2 files changed, 248 insertions(+), 54 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 870fa9b27f7e..9df6dca17155 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -577,6 +577,7 @@ struct xfrm_policy {
 	/* This lock only affects elements except for entry. */
 	rwlock_t		lock;
 	refcount_t		refcnt;
+	u32			pos;
 	struct timer_list	timer;
 
 	atomic_t		genid;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index dda27fd7b8a4..4eb12e9b40c2 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -46,6 +46,9 @@ struct xfrm_flo {
 	u8 flags;
 };
 
+/* prefixes smaller than this are stored in lists, not trees. */
+#define INEXACT_PREFIXLEN_IPV4	16
+#define INEXACT_PREFIXLEN_IPV6	48
 struct xfrm_pol_inexact_key {
 	possible_net_t net;
 	u32 if_id;
@@ -56,6 +59,7 @@ struct xfrm_pol_inexact_key {
 struct xfrm_pol_inexact_bin {
 	struct xfrm_pol_inexact_key k;
 	struct rhash_head head;
+	/* list containing '*:*' policies */
 	struct hlist_head hhead;
 
 	/* slow path below */
@@ -63,6 +67,16 @@ struct xfrm_pol_inexact_bin {
 	struct rcu_head rcu;
 };
 
+enum xfrm_pol_inexact_candidate_type {
+	XFRM_POL_CAND_ANY,
+
+	XFRM_POL_CAND_MAX,
+};
+
+struct xfrm_pol_inexact_candidates {
+	struct hlist_head *res[XFRM_POL_CAND_MAX];
+};
+
 static DEFINE_SPINLOCK(xfrm_if_cb_lock);
 static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly;
 
@@ -98,6 +112,12 @@ xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy,
 static void xfrm_policy_insert_inexact_list(struct hlist_head *chain,
 					    struct xfrm_policy *policy);
 
+static bool
+xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand,
+				    struct xfrm_pol_inexact_bin *b,
+				    const xfrm_address_t *saddr,
+				    const xfrm_address_t *daddr);
+
 static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy)
 {
 	return refcount_inc_not_zero(&policy->refcnt);
@@ -652,13 +672,48 @@ xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir)
 	return IS_ERR(prev) ? NULL : prev;
 }
 
-static void xfrm_policy_inexact_delete_bin(struct net *net,
-					   struct xfrm_pol_inexact_bin *b)
+static bool xfrm_pol_inexact_addr_use_any_list(const xfrm_address_t *addr,
+					       int family, u8 prefixlen)
 {
-	lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+	if (xfrm_addr_any(addr, family))
+		return true;
+
+	if (family == AF_INET6 && prefixlen < INEXACT_PREFIXLEN_IPV6)
+		return true;
+
+	if (family == AF_INET && prefixlen < INEXACT_PREFIXLEN_IPV4)
+		return true;
+
+	return false;
+}
+
+static bool
+xfrm_policy_inexact_insert_use_any_list(const struct xfrm_policy *policy)
+{
+	const xfrm_address_t *addr;
+	bool saddr_any, daddr_any;
+	u8 prefixlen;
+
+	addr = &policy->selector.saddr;
+	prefixlen = policy->selector.prefixlen_s;
 
-	if (!hlist_empty(&b->hhead))
+	saddr_any = xfrm_pol_inexact_addr_use_any_list(addr,
+						       policy->family,
+						       prefixlen);
+	addr = &policy->selector.daddr;
+	prefixlen = policy->selector.prefixlen_d;
+	daddr_any = xfrm_pol_inexact_addr_use_any_list(addr,
+						       policy->family,
+						       prefixlen);
+	return saddr_any && daddr_any;
+}
+
+static void __xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b, bool net_exit)
+{
+	if (!hlist_empty(&b->hhead)) {
+		WARN_ON_ONCE(net_exit);
 		return;
+	}
 
 	if (rhashtable_remove_fast(&xfrm_policy_inexact_table, &b->head,
 				   xfrm_pol_inexact_params) == 0) {
@@ -667,14 +722,23 @@ static void xfrm_policy_inexact_delete_bin(struct net *net,
 	}
 }
 
+static void xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b)
+{
+	struct net *net = read_pnet(&b->k.net);
+
+	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
+	__xfrm_policy_inexact_prune_bin(b, false);
+	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+}
+
 static void __xfrm_policy_inexact_flush(struct net *net)
 {
-	struct xfrm_pol_inexact_bin *bin;
+	struct xfrm_pol_inexact_bin *bin, *t;
 
 	lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
 
-	list_for_each_entry(bin, &net->xfrm.inexact_bins, inexact_bins)
-		xfrm_policy_inexact_delete_bin(net, bin);
+	list_for_each_entry_safe(bin, t, &net->xfrm.inexact_bins, inexact_bins)
+		__xfrm_policy_inexact_prune_bin(bin, false);
 }
 
 static struct xfrm_policy *
@@ -689,14 +753,28 @@ xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl)
 	if (!bin)
 		return ERR_PTR(-ENOMEM);
 
-	delpol = xfrm_policy_insert_list(&bin->hhead, policy, excl);
-	if (delpol && excl)
+	net = xp_net(policy);
+	lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+
+	if (xfrm_policy_inexact_insert_use_any_list(policy)) {
+		chain = &bin->hhead;
+		goto insert_to_list;
+	}
+
+	chain = &bin->hhead;
+insert_to_list:
+	delpol = xfrm_policy_insert_list(chain, policy, excl);
+	if (delpol && excl) {
+		__xfrm_policy_inexact_prune_bin(bin, false);
 		return ERR_PTR(-EEXIST);
+	}
 
-	net = xp_net(policy);
 	chain = &net->xfrm.policy_inexact[dir];
 	xfrm_policy_insert_inexact_list(chain, policy);
 
+	if (delpol)
+		__xfrm_policy_inexact_prune_bin(bin, false);
+
 	return delpol;
 }
 
@@ -733,6 +811,7 @@ static void xfrm_hash_rebuild(struct work_struct *work)
 	 * we start with destructive action.
 	 */
 	list_for_each_entry(policy, &net->xfrm.policy_all, walk.all) {
+		struct xfrm_pol_inexact_bin *bin;
 		u8 dbits, sbits;
 
 		dir = xfrm_policy_id2dir(policy->index);
@@ -761,7 +840,8 @@ static void xfrm_hash_rebuild(struct work_struct *work)
 		    policy->selector.prefixlen_s < sbits)
 			continue;
 
-		if (!xfrm_policy_inexact_alloc_bin(policy, dir))
+		bin = xfrm_policy_inexact_alloc_bin(policy, dir);
+		if (!bin)
 			goto out_unlock;
 	}
 
@@ -820,6 +900,7 @@ static void xfrm_hash_rebuild(struct work_struct *work)
 	}
 
 out_unlock:
+	__xfrm_policy_inexact_flush(net);
 	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 
 	mutex_unlock(&hash_resize_mutex);
@@ -977,6 +1058,7 @@ static void xfrm_policy_insert_inexact_list(struct hlist_head *chain,
 {
 	struct xfrm_policy *pol, *delpol = NULL;
 	struct hlist_node *newpos = NULL;
+	int i = 0;
 
 	hlist_for_each_entry(pol, chain, bydst_inexact_list) {
 		if (pol->type == policy->type &&
@@ -1000,6 +1082,11 @@ static void xfrm_policy_insert_inexact_list(struct hlist_head *chain,
 		hlist_add_behind_rcu(&policy->bydst_inexact_list, newpos);
 	else
 		hlist_add_head_rcu(&policy->bydst_inexact_list, chain);
+
+	hlist_for_each_entry(pol, chain, bydst_inexact_list) {
+		pol->pos = i;
+		i++;
+	}
 }
 
 static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain,
@@ -1083,6 +1170,29 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 }
 EXPORT_SYMBOL(xfrm_policy_insert);
 
+static struct xfrm_policy *
+__xfrm_policy_bysel_ctx(struct hlist_head *chain, u32 mark, u32 if_id,
+			u8 type, int dir,
+			struct xfrm_selector *sel,
+			struct xfrm_sec_ctx *ctx)
+{
+	struct xfrm_policy *pol;
+
+	if (!chain)
+		return NULL;
+
+	hlist_for_each_entry(pol, chain, bydst) {
+		if (pol->type == type &&
+		    pol->if_id == if_id &&
+		    (mark & pol->mark.m) == pol->mark.v &&
+		    !selector_cmp(sel, &pol->selector) &&
+		    xfrm_sec_ctx_match(ctx, pol->security))
+			return pol;
+	}
+
+	return NULL;
+}
+
 struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id,
 					  u8 type, int dir,
 					  struct xfrm_selector *sel,
@@ -1097,6 +1207,9 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id,
 	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
 	chain = policy_hash_bysel(net, sel, sel->family, dir);
 	if (!chain) {
+		struct xfrm_pol_inexact_candidates cand;
+		int i;
+
 		bin = xfrm_policy_inexact_lookup(net, type,
 						 sel->family, dir, if_id);
 		if (!bin) {
@@ -1104,35 +1217,46 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id,
 			return NULL;
 		}
 
-		chain = &bin->hhead;
+		if (!xfrm_policy_find_inexact_candidates(&cand, bin,
+							 &sel->saddr,
+							 &sel->daddr)) {
+			spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+			return NULL;
+		}
+
+		pol = NULL;
+		for (i = 0; i < ARRAY_SIZE(cand.res); i++) {
+			struct xfrm_policy *tmp;
+
+			tmp = __xfrm_policy_bysel_ctx(cand.res[i], mark,
+						      if_id, type, dir,
+						      sel, ctx);
+			if (tmp && pol && tmp->pos < pol->pos)
+				pol = tmp;
+		}
+	} else {
+		pol = __xfrm_policy_bysel_ctx(chain, mark, if_id, type, dir,
+					      sel, ctx);
 	}
 
-	ret = NULL;
-	hlist_for_each_entry(pol, chain, bydst) {
-		if (pol->type == type &&
-		    pol->if_id == if_id &&
-		    (mark & pol->mark.m) == pol->mark.v &&
-		    !selector_cmp(sel, &pol->selector) &&
-		    xfrm_sec_ctx_match(ctx, pol->security)) {
-			xfrm_pol_hold(pol);
-			if (delete) {
-				*err = security_xfrm_policy_delete(
-								pol->security);
-				if (*err) {
-					spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
-					return pol;
-				}
-				__xfrm_policy_unlink(pol, dir);
-				xfrm_policy_inexact_delete_bin(net, bin);
+	if (pol) {
+		xfrm_pol_hold(pol);
+		if (delete) {
+			*err = security_xfrm_policy_delete(pol->security);
+			if (*err) {
+				spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+				return pol;
 			}
-			ret = pol;
-			break;
+			__xfrm_policy_unlink(pol, dir);
 		}
+		ret = pol;
 	}
 	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 
 	if (ret && delete)
 		xfrm_policy_kill(ret);
+	if (bin && delete)
+		xfrm_policy_inexact_prune_bin(bin);
 	return ret;
 }
 EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
@@ -1338,6 +1462,20 @@ static int xfrm_policy_match(const struct xfrm_policy *pol,
 	return ret;
 }
 
+static bool
+xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand,
+				    struct xfrm_pol_inexact_bin *b,
+				    const xfrm_address_t *saddr,
+				    const xfrm_address_t *daddr)
+{
+	if (!b)
+		return false;
+
+	memset(cand, 0, sizeof(*cand));
+	cand->res[XFRM_POL_CAND_ANY] = &b->hhead;
+	return true;
+}
+
 static struct xfrm_pol_inexact_bin *
 xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family,
 			       u8 dir, u32 if_id)
@@ -1370,11 +1508,76 @@ xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family,
 	return bin;
 }
 
+static struct xfrm_policy *
+__xfrm_policy_eval_candidates(struct hlist_head *chain,
+			      struct xfrm_policy *prefer,
+			      const struct flowi *fl,
+			      u8 type, u16 family, int dir, u32 if_id)
+{
+	u32 priority = prefer ? prefer->priority : ~0u;
+	struct xfrm_policy *pol;
+
+	if (!chain)
+		return NULL;
+
+	hlist_for_each_entry_rcu(pol, chain, bydst) {
+		int err;
+
+		if (pol->priority > priority)
+			break;
+
+		err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
+		if (err) {
+			if (err != -ESRCH)
+				return ERR_PTR(err);
+
+			continue;
+		}
+
+		if (prefer) {
+			/* matches.  Is it older than *prefer? */
+			if (pol->priority == priority &&
+			    prefer->pos < pol->pos)
+				return prefer;
+		}
+
+		return pol;
+	}
+
+	return NULL;
+}
+
+static struct xfrm_policy *
+xfrm_policy_eval_candidates(struct xfrm_pol_inexact_candidates *cand,
+			    struct xfrm_policy *prefer,
+			    const struct flowi *fl,
+			    u8 type, u16 family, int dir, u32 if_id)
+{
+	struct xfrm_policy *tmp;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(cand->res); i++) {
+		tmp = __xfrm_policy_eval_candidates(cand->res[i],
+						    prefer,
+						    fl, type, family, dir,
+						    if_id);
+		if (!tmp)
+			continue;
+
+		if (IS_ERR(tmp))
+			return tmp;
+		prefer = tmp;
+	}
+
+	return prefer;
+}
+
 static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
 						     const struct flowi *fl,
 						     u16 family, u8 dir,
 						     u32 if_id)
 {
+	struct xfrm_pol_inexact_candidates cand;
 	const xfrm_address_t *daddr, *saddr;
 	struct xfrm_pol_inexact_bin *bin;
 	struct xfrm_policy *pol, *ret;
@@ -1413,25 +1616,16 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
 		}
 	}
 	bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id);
-	if (!bin)
+	if (!bin || !xfrm_policy_find_inexact_candidates(&cand, bin, saddr,
+							 daddr))
 		goto skip_inexact;
-	chain = &bin->hhead;
-	hlist_for_each_entry_rcu(pol, chain, bydst) {
-		if ((pol->priority >= priority) && ret)
-			break;
 
-		err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
-		if (err) {
-			if (err == -ESRCH)
-				continue;
-			else {
-				ret = ERR_PTR(err);
-				goto fail;
-			}
-		} else {
-			ret = pol;
-			break;
-		}
+	pol = xfrm_policy_eval_candidates(&cand, ret, fl, type,
+					  family, dir, if_id);
+	if (pol) {
+		ret = pol;
+		if (IS_ERR(pol))
+			goto fail;
 	}
 
 skip_inexact:
@@ -3168,7 +3362,7 @@ static int __net_init xfrm_policy_init(struct net *net)
 
 static void xfrm_policy_fini(struct net *net)
 {
-	struct xfrm_pol_inexact_bin *bin, *tmp;
+	struct xfrm_pol_inexact_bin *b, *t;
 	unsigned int sz;
 	int dir;
 
@@ -3195,11 +3389,10 @@ static void xfrm_policy_fini(struct net *net)
 	WARN_ON(!hlist_empty(net->xfrm.policy_byidx));
 	xfrm_hash_free(net->xfrm.policy_byidx, sz);
 
-	list_for_each_entry_safe(bin, tmp, &net->xfrm.inexact_bins,
-				 inexact_bins) {
-		WARN_ON(!hlist_empty(&bin->hhead));
-		xfrm_policy_inexact_delete_bin(net, bin);
-	}
+	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
+	list_for_each_entry_safe(b, t, &net->xfrm.inexact_bins, inexact_bins)
+		__xfrm_policy_inexact_prune_bin(b, true);
+	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 }
 
 static int __net_init xfrm_net_init(struct net *net)
-- 
2.18.1

^ permalink raw reply related

* [PATCH ipsec-next 08/11] xfrm: policy: store inexact policies in a tree ordered by destination address
From: Florian Westphal @ 2018-11-07 22:00 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal
In-Reply-To: <20181107220041.26205-1-fw@strlen.de>

This adds inexact lists per destination network, stored in a search tree.

Inexact lookups now return two 'candidate lists', the 'any' policies
('any' destionations), and a list of policies that share same
daddr/prefix.

Next patch will add a second search tree for 'saddr:any' policies
so we can avoid placing those on the 'any:any' list too.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/net/xfrm.h     |   1 +
 net/xfrm/xfrm_policy.c | 333 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 328 insertions(+), 6 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 9df6dca17155..fa4b3c877fcf 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -590,6 +590,7 @@ struct xfrm_policy {
 	struct xfrm_lifetime_cur curlft;
 	struct xfrm_policy_walk_entry walk;
 	struct xfrm_policy_queue polq;
+	bool                    bydst_reinsert;
 	u8			type;
 	u8			action;
 	u8			flags;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 4eb12e9b40c2..81447d5d02e6 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -49,6 +49,38 @@ struct xfrm_flo {
 /* prefixes smaller than this are stored in lists, not trees. */
 #define INEXACT_PREFIXLEN_IPV4	16
 #define INEXACT_PREFIXLEN_IPV6	48
+
+struct xfrm_pol_inexact_node {
+	struct rb_node node;
+	union {
+		xfrm_address_t addr;
+		struct rcu_head rcu;
+	};
+	u8 prefixlen;
+
+	/* the policies matching this node, can be empty list */
+	struct hlist_head hhead;
+};
+
+/* xfrm inexact policy search tree:
+ * xfrm_pol_inexact_bin = hash(dir,type,family,if_id);
+ *  |
+ * +---- root_d: sorted by daddr:prefix
+ * |                 |
+ * |        xfrm_pol_inexact_node
+ * |                 |
+ * |                 +- coarse policies and all any:daddr policies
+ * |
+ * +---- coarse policies and all any:any policies
+ *
+ * Lookups return two candidate lists:
+ * 1. any:any list from top-level xfrm_pol_inexact_bin
+ * 2. any:daddr list from daddr tree
+ *
+ * This result set then needs to be searched for the policy with
+ * the lowest priority.  If two results have same prio, youngest one wins.
+ */
+
 struct xfrm_pol_inexact_key {
 	possible_net_t net;
 	u32 if_id;
@@ -62,12 +94,17 @@ struct xfrm_pol_inexact_bin {
 	/* list containing '*:*' policies */
 	struct hlist_head hhead;
 
+	seqcount_t count;
+	/* tree sorted by daddr/prefix */
+	struct rb_root root_d;
+
 	/* slow path below */
 	struct list_head inexact_bins;
 	struct rcu_head rcu;
 };
 
 enum xfrm_pol_inexact_candidate_type {
+	XFRM_POL_CAND_DADDR,
 	XFRM_POL_CAND_ANY,
 
 	XFRM_POL_CAND_MAX,
@@ -658,6 +695,8 @@ xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir)
 
 	bin->k = k;
 	INIT_HLIST_HEAD(&bin->hhead);
+	bin->root_d = RB_ROOT;
+	seqcount_init(&bin->count);
 
 	prev = rhashtable_lookup_get_insert_key(&xfrm_policy_inexact_table,
 						&bin->k, &bin->head,
@@ -708,9 +747,211 @@ xfrm_policy_inexact_insert_use_any_list(const struct xfrm_policy *policy)
 	return saddr_any && daddr_any;
 }
 
+static void xfrm_pol_inexact_node_init(struct xfrm_pol_inexact_node *node,
+				       const xfrm_address_t *addr, u8 prefixlen)
+{
+	node->addr = *addr;
+	node->prefixlen = prefixlen;
+}
+
+static struct xfrm_pol_inexact_node *
+xfrm_pol_inexact_node_alloc(const xfrm_address_t *addr, u8 prefixlen)
+{
+	struct xfrm_pol_inexact_node *node;
+
+	node = kzalloc(sizeof(*node), GFP_ATOMIC);
+	if (node)
+		xfrm_pol_inexact_node_init(node, addr, prefixlen);
+
+	return node;
+}
+
+static int xfrm_policy_addr_delta(const xfrm_address_t *a,
+				  const xfrm_address_t *b,
+				  u8 prefixlen, u16 family)
+{
+	unsigned int pdw, pbi;
+	int delta = 0;
+
+	switch (family) {
+	case AF_INET:
+		if (sizeof(long) == 4 && prefixlen == 0)
+			return ntohl(a->a4) - ntohl(b->a4);
+		return (ntohl(a->a4) & ((~0UL << (32 - prefixlen)))) -
+		       (ntohl(b->a4) & ((~0UL << (32 - prefixlen))));
+	case AF_INET6:
+		pdw = prefixlen >> 5;
+		pbi = prefixlen & 0x1f;
+
+		if (pdw) {
+			delta = memcmp(a->a6, b->a6, pdw << 2);
+			if (delta)
+				return delta;
+		}
+		if (pbi) {
+			u32 mask = ~0u << (32 - pbi);
+
+			delta = (ntohl(a->a6[pdw]) & mask) -
+				(ntohl(b->a6[pdw]) & mask);
+		}
+		break;
+	default:
+		break;
+	}
+
+	return delta;
+}
+
+static void xfrm_policy_inexact_list_reinsert(struct net *net,
+					      struct xfrm_pol_inexact_node *n,
+					      u16 family)
+{
+	struct hlist_node *newpos = NULL;
+	struct xfrm_policy *policy, *p;
+
+	list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
+		if (!policy->bydst_reinsert)
+			continue;
+
+		WARN_ON_ONCE(policy->family != family);
+
+		policy->bydst_reinsert = false;
+		hlist_for_each_entry(p, &n->hhead, bydst) {
+			if (policy->priority >= p->priority)
+				newpos = &p->bydst;
+			else
+				break;
+		}
+
+		if (newpos)
+			hlist_add_behind(&policy->bydst, newpos);
+		else
+			hlist_add_head(&policy->bydst, &n->hhead);
+	}
+}
+
+/* merge nodes v and n */
+static void xfrm_policy_inexact_node_merge(struct net *net,
+					   struct xfrm_pol_inexact_node *v,
+					   struct xfrm_pol_inexact_node *n,
+					   u16 family)
+{
+	struct xfrm_policy *tmp;
+
+	hlist_for_each_entry(tmp, &v->hhead, bydst)
+		tmp->bydst_reinsert = true;
+	hlist_for_each_entry(tmp, &n->hhead, bydst)
+		tmp->bydst_reinsert = true;
+
+	INIT_HLIST_HEAD(&n->hhead);
+	xfrm_policy_inexact_list_reinsert(net, n, family);
+}
+
+static struct xfrm_pol_inexact_node *
+xfrm_policy_inexact_insert_node(struct net *net,
+				struct rb_root *root,
+				xfrm_address_t *addr,
+				u16 family, u8 prefixlen, u8 dir)
+{
+	struct xfrm_pol_inexact_node *cached = NULL;
+	struct rb_node **p, *parent = NULL;
+	struct xfrm_pol_inexact_node *node;
+
+	p = &root->rb_node;
+	while (*p) {
+		int delta;
+
+		parent = *p;
+		node = rb_entry(*p, struct xfrm_pol_inexact_node, node);
+
+		delta = xfrm_policy_addr_delta(addr, &node->addr,
+					       node->prefixlen,
+					       family);
+		if (delta == 0 && prefixlen >= node->prefixlen) {
+			WARN_ON_ONCE(cached); /* ipsec policies got lost */
+			return node;
+		}
+
+		if (delta < 0)
+			p = &parent->rb_left;
+		else
+			p = &parent->rb_right;
+
+		if (prefixlen < node->prefixlen) {
+			delta = xfrm_policy_addr_delta(addr, &node->addr,
+						       prefixlen,
+						       family);
+			if (delta)
+				continue;
+
+			/* This node is a subnet of the new prefix. It needs
+			 * to be removed and re-inserted with the smaller
+			 * prefix and all nodes that are now also covered
+			 * by the reduced prefixlen.
+			 */
+			rb_erase(&node->node, root);
+
+			if (!cached) {
+				xfrm_pol_inexact_node_init(node, addr,
+							   prefixlen);
+				cached = node;
+			} else {
+				/* This node also falls within the new
+				 * prefixlen. Merge the to-be-reinserted
+				 * node and this one.
+				 */
+				xfrm_policy_inexact_node_merge(net, node,
+							       cached, family);
+				kfree_rcu(node, rcu);
+			}
+
+			/* restart */
+			p = &root->rb_node;
+			parent = NULL;
+		}
+	}
+
+	node = cached;
+	if (!node) {
+		node = xfrm_pol_inexact_node_alloc(addr, prefixlen);
+		if (!node)
+			return NULL;
+	}
+
+	rb_link_node_rcu(&node->node, parent, p);
+	rb_insert_color(&node->node, root);
+
+	return node;
+}
+
+static void xfrm_policy_inexact_gc_tree(struct rb_root *r, bool rm)
+{
+	struct xfrm_pol_inexact_node *node;
+	struct rb_node *rn = rb_first(r);
+
+	while (rn) {
+		node = rb_entry(rn, struct xfrm_pol_inexact_node, node);
+
+		rn = rb_next(rn);
+
+		if (!hlist_empty(&node->hhead)) {
+			WARN_ON_ONCE(rm);
+			continue;
+		}
+
+		rb_erase(&node->node, r);
+		kfree_rcu(node, rcu);
+	}
+}
+
 static void __xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b, bool net_exit)
 {
-	if (!hlist_empty(&b->hhead)) {
+	write_seqcount_begin(&b->count);
+	xfrm_policy_inexact_gc_tree(&b->root_d, net_exit);
+	write_seqcount_end(&b->count);
+
+	if (!RB_EMPTY_ROOT(&b->root_d) ||
+	    !hlist_empty(&b->hhead)) {
 		WARN_ON_ONCE(net_exit);
 		return;
 	}
@@ -741,6 +982,37 @@ static void __xfrm_policy_inexact_flush(struct net *net)
 		__xfrm_policy_inexact_prune_bin(bin, false);
 }
 
+static struct hlist_head *
+xfrm_policy_inexact_alloc_chain(struct xfrm_pol_inexact_bin *bin,
+				struct xfrm_policy *policy, u8 dir)
+{
+	struct xfrm_pol_inexact_node *n;
+	struct net *net;
+
+	net = xp_net(policy);
+	lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+
+	if (xfrm_policy_inexact_insert_use_any_list(policy))
+		return &bin->hhead;
+
+	if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.daddr,
+					       policy->family,
+					       policy->selector.prefixlen_d))
+		return &bin->hhead;
+
+	/* daddr is fixed */
+	write_seqcount_begin(&bin->count);
+	n = xfrm_policy_inexact_insert_node(net,
+					    &bin->root_d,
+					    &policy->selector.daddr,
+					    policy->family,
+					    policy->selector.prefixlen_d, dir);
+	write_seqcount_end(&bin->count);
+	if (!n)
+		return NULL;
+	return &n->hhead;
+}
+
 static struct xfrm_policy *
 xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl)
 {
@@ -756,13 +1028,12 @@ xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl)
 	net = xp_net(policy);
 	lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
 
-	if (xfrm_policy_inexact_insert_use_any_list(policy)) {
-		chain = &bin->hhead;
-		goto insert_to_list;
+	chain = xfrm_policy_inexact_alloc_chain(bin, policy, dir);
+	if (!chain) {
+		__xfrm_policy_inexact_prune_bin(bin, false);
+		return ERR_PTR(-ENOMEM);
 	}
 
-	chain = &bin->hhead;
-insert_to_list:
 	delpol = xfrm_policy_insert_list(chain, policy, excl);
 	if (delpol && excl) {
 		__xfrm_policy_inexact_prune_bin(bin, false);
@@ -843,6 +1114,9 @@ static void xfrm_hash_rebuild(struct work_struct *work)
 		bin = xfrm_policy_inexact_alloc_bin(policy, dir);
 		if (!bin)
 			goto out_unlock;
+
+		if (!xfrm_policy_inexact_alloc_chain(bin, policy, dir))
+			goto out_unlock;
 	}
 
 	/* reset the bydst and inexact table in all directions */
@@ -1462,17 +1736,64 @@ static int xfrm_policy_match(const struct xfrm_policy *pol,
 	return ret;
 }
 
+static struct xfrm_pol_inexact_node *
+xfrm_policy_lookup_inexact_addr(const struct rb_root *r,
+				seqcount_t *count,
+				const xfrm_address_t *addr, u16 family)
+{
+	const struct rb_node *parent;
+	int seq;
+
+again:
+	seq = read_seqcount_begin(count);
+
+	parent = rcu_dereference_raw(r->rb_node);
+	while (parent) {
+		struct xfrm_pol_inexact_node *node;
+		int delta;
+
+		node = rb_entry(parent, struct xfrm_pol_inexact_node, node);
+
+		delta = xfrm_policy_addr_delta(addr, &node->addr,
+					       node->prefixlen, family);
+		if (delta < 0) {
+			parent = rcu_dereference_raw(parent->rb_left);
+			continue;
+		} else if (delta > 0) {
+			parent = rcu_dereference_raw(parent->rb_right);
+			continue;
+		}
+
+		return node;
+	}
+
+	if (read_seqcount_retry(count, seq))
+		goto again;
+
+	return NULL;
+}
+
 static bool
 xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand,
 				    struct xfrm_pol_inexact_bin *b,
 				    const xfrm_address_t *saddr,
 				    const xfrm_address_t *daddr)
 {
+	struct xfrm_pol_inexact_node *n;
+	u16 family;
+
 	if (!b)
 		return false;
 
+	family = b->k.family;
 	memset(cand, 0, sizeof(*cand));
 	cand->res[XFRM_POL_CAND_ANY] = &b->hhead;
+
+	n = xfrm_policy_lookup_inexact_addr(&b->root_d, &b->count, daddr,
+					    family);
+	if (n)
+		cand->res[XFRM_POL_CAND_DADDR] = &n->hhead;
+
 	return true;
 }
 
-- 
2.18.1

^ permalink raw reply related

* [PATCH ipsec-next 09/11] xfrm: policy: check reinserted policies match their node
From: Florian Westphal @ 2018-11-07 22:00 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal
In-Reply-To: <20181107220041.26205-1-fw@strlen.de>

validate the re-inserted policies match the lookup node.
Policies that fail this test won't be returned in the candidate set.

This is enabled by default for now, it should not cause noticeable
reinsert slow down.

Such reinserts are needed when we have to merge an existing node
(e.g. for 10.0.0.0/28 because a overlapping subnet was added (e.g.
10.0.0.0/24), so whenever this happens existing policies have to
be placed on the list of the new node.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/xfrm/xfrm_policy.c | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 81447d5d02e6..57e28dcd7c53 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -806,10 +806,16 @@ static void xfrm_policy_inexact_list_reinsert(struct net *net,
 					      struct xfrm_pol_inexact_node *n,
 					      u16 family)
 {
+	unsigned int matched_s, matched_d;
 	struct hlist_node *newpos = NULL;
 	struct xfrm_policy *policy, *p;
 
+	matched_s = 0;
+	matched_d = 0;
+
 	list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
+		bool matches_s, matches_d;
+
 		if (!policy->bydst_reinsert)
 			continue;
 
@@ -827,6 +833,32 @@ static void xfrm_policy_inexact_list_reinsert(struct net *net,
 			hlist_add_behind(&policy->bydst, newpos);
 		else
 			hlist_add_head(&policy->bydst, &n->hhead);
+
+		/* paranoia checks follow.
+		 * Check that the reinserted policy matches at least
+		 * saddr or daddr for current node prefix.
+		 *
+		 * Matching both is fine, matching saddr in one policy
+		 * (but not daddr) and then matching only daddr in another
+		 * is a bug.
+		 */
+		matches_s = xfrm_policy_addr_delta(&policy->selector.saddr,
+						   &n->addr,
+						   n->prefixlen,
+						   family) == 0;
+		matches_d = xfrm_policy_addr_delta(&policy->selector.daddr,
+						   &n->addr,
+						   n->prefixlen,
+						   family) == 0;
+		if (matches_s && matches_d)
+			continue;
+
+		WARN_ON_ONCE(!matches_s && !matches_d);
+		if (matches_s)
+			matched_s++;
+		if (matches_d)
+			matched_d++;
+		WARN_ON_ONCE(matched_s && matched_d);
 	}
 }
 
-- 
2.18.1

^ permalink raw reply related

* [PATCH ipsec-next 10/11] xfrm: policy: store inexact policies in a tree ordered by source address
From: Florian Westphal @ 2018-11-07 22:00 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal
In-Reply-To: <20181107220041.26205-1-fw@strlen.de>

This adds the 'saddr:any' search class.  It contains all policies that have
a fixed saddr/prefixlen, but 'any' destination.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/xfrm/xfrm_policy.c | 46 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 4 deletions(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 57e28dcd7c53..38e33326c856 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -71,11 +71,20 @@ struct xfrm_pol_inexact_node {
  * |                 |
  * |                 +- coarse policies and all any:daddr policies
  * |
+ * +---- root_s: sorted by saddr:prefix
+ * |                 |
+ * |        xfrm_pol_inexact_node
+ * |                 |
+ * |                 + root: unused
+ * |                 |
+ * |                 + hhead: saddr:any policies
+ * |
  * +---- coarse policies and all any:any policies
  *
- * Lookups return two candidate lists:
+ * Lookups return three candidate lists:
  * 1. any:any list from top-level xfrm_pol_inexact_bin
  * 2. any:daddr list from daddr tree
+ * 2. saddr:any list from saddr tree
  *
  * This result set then needs to be searched for the policy with
  * the lowest priority.  If two results have same prio, youngest one wins.
@@ -98,12 +107,16 @@ struct xfrm_pol_inexact_bin {
 	/* tree sorted by daddr/prefix */
 	struct rb_root root_d;
 
+	/* tree sorted by saddr/prefix */
+	struct rb_root root_s;
+
 	/* slow path below */
 	struct list_head inexact_bins;
 	struct rcu_head rcu;
 };
 
 enum xfrm_pol_inexact_candidate_type {
+	XFRM_POL_CAND_SADDR,
 	XFRM_POL_CAND_DADDR,
 	XFRM_POL_CAND_ANY,
 
@@ -696,6 +709,7 @@ xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir)
 	bin->k = k;
 	INIT_HLIST_HEAD(&bin->hhead);
 	bin->root_d = RB_ROOT;
+	bin->root_s = RB_ROOT;
 	seqcount_init(&bin->count);
 
 	prev = rhashtable_lookup_get_insert_key(&xfrm_policy_inexact_table,
@@ -980,9 +994,10 @@ static void __xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b, bool
 {
 	write_seqcount_begin(&b->count);
 	xfrm_policy_inexact_gc_tree(&b->root_d, net_exit);
+	xfrm_policy_inexact_gc_tree(&b->root_s, net_exit);
 	write_seqcount_end(&b->count);
 
-	if (!RB_EMPTY_ROOT(&b->root_d) ||
+	if (!RB_EMPTY_ROOT(&b->root_d) || !RB_EMPTY_ROOT(&b->root_s) ||
 	    !hlist_empty(&b->hhead)) {
 		WARN_ON_ONCE(net_exit);
 		return;
@@ -1027,11 +1042,29 @@ xfrm_policy_inexact_alloc_chain(struct xfrm_pol_inexact_bin *bin,
 	if (xfrm_policy_inexact_insert_use_any_list(policy))
 		return &bin->hhead;
 
-	if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.daddr,
+	/* saddr is wildcard */
+	if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.saddr,
 					       policy->family,
-					       policy->selector.prefixlen_d))
+					       policy->selector.prefixlen_s))
 		return &bin->hhead;
 
+	if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.daddr,
+					       policy->family,
+					       policy->selector.prefixlen_d)) {
+		write_seqcount_begin(&bin->count);
+		n = xfrm_policy_inexact_insert_node(net,
+						    &bin->root_s,
+						    &policy->selector.saddr,
+						    policy->family,
+						    policy->selector.prefixlen_s,
+						    dir);
+		write_seqcount_end(&bin->count);
+		if (!n)
+			return NULL;
+
+		return &n->hhead;
+	}
+
 	/* daddr is fixed */
 	write_seqcount_begin(&bin->count);
 	n = xfrm_policy_inexact_insert_node(net,
@@ -1826,6 +1859,11 @@ xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand,
 	if (n)
 		cand->res[XFRM_POL_CAND_DADDR] = &n->hhead;
 
+	n = xfrm_policy_lookup_inexact_addr(&b->root_s, &b->count, saddr,
+					    family);
+	if (n)
+		cand->res[XFRM_POL_CAND_SADDR] = &n->hhead;
+
 	return true;
 }
 
-- 
2.18.1

^ permalink raw reply related

* [PATCH ipsec-next 11/11] xfrm: policy: add 2nd-level saddr trees for inexact policies
From: Florian Westphal @ 2018-11-07 22:00 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal
In-Reply-To: <20181107220041.26205-1-fw@strlen.de>

This adds the fourth and final search class, containing policies
where both saddr and daddr have prefix lengths (i.e., not wildcards).

Inexact policies now end up in one of the following four search classes:

1. "Any:Any" list, containing policies where both saddr and daddr are
   wildcards or have very coarse prefixes, e.g. 10.0.0.0/8 and the like.
2. "saddr:any" list, containing policies with a fixed saddr/prefixlen,
   but without destination restrictions.
   These lists are stored in rbtree nodes; each node contains those
   policies matching saddr/prefixlen.
3. "Any:daddr" list. Similar to 2), except for policies where only the
   destinations are specified.
4. "saddr:daddr" lists, containing only those policies that
   match the given source/destination network.
   The root of the saddr/daddr nodes gets stored in the nodes of the
   'daddr' tree.

This diagram illustrates the list classes, and their
placement in the lookup hierarchy:

    xfrm_pol_inexact_bin = hash(dir,type,family,if_id);
      |
      +---- root_d: sorted by daddr:prefix
      |                 |
      |        xfrm_pol_inexact_node
      |                 |
      |                 +- root: sorted by saddr/prefix
      |                 |              |
      |                 |         xfrm_pol_inexact_node
      |                 |              |
      |                 |              + root: unused
      |                 |              |
      |                 |              + hhead: saddr:daddr policies
      |                 |
      |                 +- coarse policies and all any:daddr policies
      |
      +---- root_s: sorted by saddr:prefix
      |                 |
      |        xfrm_pol_inexact_node
      |                 |
      |                 + root: unused
      |                 |
      |                 + hhead: saddr:any policies
      |
      +---- coarse policies and all any:any policies

lookup for an inexact policy returns pointers to the four relevant list
classes, after which each of the lists needs to be searched for the policy
with the higher priority.

This will only speed up lookups in case we have many policies and a
sizeable portion of these have disjunct saddr/daddr addresses.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/xfrm/xfrm_policy.c | 118 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 108 insertions(+), 10 deletions(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 38e33326c856..bd80b8a4322f 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -58,6 +58,8 @@ struct xfrm_pol_inexact_node {
 	};
 	u8 prefixlen;
 
+	struct rb_root root;
+
 	/* the policies matching this node, can be empty list */
 	struct hlist_head hhead;
 };
@@ -69,6 +71,14 @@ struct xfrm_pol_inexact_node {
  * |                 |
  * |        xfrm_pol_inexact_node
  * |                 |
+ * |                 +- root: sorted by saddr/prefix
+ * |                 |              |
+ * |                 |         xfrm_pol_inexact_node
+ * |                 |              |
+ * |                 |              + root: unused
+ * |                 |              |
+ * |                 |              + hhead: saddr:daddr policies
+ * |                 |
  * |                 +- coarse policies and all any:daddr policies
  * |
  * +---- root_s: sorted by saddr:prefix
@@ -81,10 +91,11 @@ struct xfrm_pol_inexact_node {
  * |
  * +---- coarse policies and all any:any policies
  *
- * Lookups return three candidate lists:
+ * Lookups return four candidate lists:
  * 1. any:any list from top-level xfrm_pol_inexact_bin
  * 2. any:daddr list from daddr tree
- * 2. saddr:any list from saddr tree
+ * 3. saddr:daddr list from 2nd level daddr tree
+ * 4. saddr:any list from saddr tree
  *
  * This result set then needs to be searched for the policy with
  * the lowest priority.  If two results have same prio, youngest one wins.
@@ -116,6 +127,7 @@ struct xfrm_pol_inexact_bin {
 };
 
 enum xfrm_pol_inexact_candidate_type {
+	XFRM_POL_CAND_BOTH,
 	XFRM_POL_CAND_SADDR,
 	XFRM_POL_CAND_DADDR,
 	XFRM_POL_CAND_ANY,
@@ -876,13 +888,82 @@ static void xfrm_policy_inexact_list_reinsert(struct net *net,
 	}
 }
 
+static void xfrm_policy_inexact_node_reinsert(struct net *net,
+					      struct xfrm_pol_inexact_node *n,
+					      struct rb_root *new,
+					      u16 family)
+{
+	struct rb_node **p, *parent = NULL;
+	struct xfrm_pol_inexact_node *node;
+
+	/* we should not have another subtree here */
+	WARN_ON_ONCE(!RB_EMPTY_ROOT(&n->root));
+
+	p = &new->rb_node;
+	while (*p) {
+		u8 prefixlen;
+		int delta;
+
+		parent = *p;
+		node = rb_entry(*p, struct xfrm_pol_inexact_node, node);
+
+		prefixlen = min(node->prefixlen, n->prefixlen);
+
+		delta = xfrm_policy_addr_delta(&n->addr, &node->addr,
+					       prefixlen, family);
+		if (delta < 0) {
+			p = &parent->rb_left;
+		} else if (delta > 0) {
+			p = &parent->rb_right;
+		} else {
+			struct xfrm_policy *tmp;
+
+			hlist_for_each_entry(tmp, &node->hhead, bydst)
+				tmp->bydst_reinsert = true;
+			hlist_for_each_entry(tmp, &n->hhead, bydst)
+				tmp->bydst_reinsert = true;
+
+			INIT_HLIST_HEAD(&node->hhead);
+			xfrm_policy_inexact_list_reinsert(net, node, family);
+
+			if (node->prefixlen == n->prefixlen) {
+				kfree_rcu(n, rcu);
+				return;
+			}
+
+			rb_erase(*p, new);
+			kfree_rcu(n, rcu);
+			n = node;
+			n->prefixlen = prefixlen;
+			*p = new->rb_node;
+			parent = NULL;
+		}
+	}
+
+	rb_link_node_rcu(&n->node, parent, p);
+	rb_insert_color(&n->node, new);
+}
+
 /* merge nodes v and n */
 static void xfrm_policy_inexact_node_merge(struct net *net,
 					   struct xfrm_pol_inexact_node *v,
 					   struct xfrm_pol_inexact_node *n,
 					   u16 family)
 {
+	struct xfrm_pol_inexact_node *node;
 	struct xfrm_policy *tmp;
+	struct rb_node *rnode;
+
+	/* To-be-merged node v has a subtree.
+	 *
+	 * Dismantle it and insert its nodes to n->root.
+	 */
+	while ((rnode = rb_first(&v->root)) != NULL) {
+		node = rb_entry(rnode, struct xfrm_pol_inexact_node, node);
+		rb_erase(&node->node, &v->root);
+		xfrm_policy_inexact_node_reinsert(net, node, &n->root,
+						  family);
+	}
 
 	hlist_for_each_entry(tmp, &v->hhead, bydst)
 		tmp->bydst_reinsert = true;
@@ -978,9 +1059,10 @@ static void xfrm_policy_inexact_gc_tree(struct rb_root *r, bool rm)
 	while (rn) {
 		node = rb_entry(rn, struct xfrm_pol_inexact_node, node);
 
+		xfrm_policy_inexact_gc_tree(&node->root, rm);
 		rn = rb_next(rn);
 
-		if (!hlist_empty(&node->hhead)) {
+		if (!hlist_empty(&node->hhead) || !RB_EMPTY_ROOT(&node->root)) {
 			WARN_ON_ONCE(rm);
 			continue;
 		}
@@ -1042,12 +1124,6 @@ xfrm_policy_inexact_alloc_chain(struct xfrm_pol_inexact_bin *bin,
 	if (xfrm_policy_inexact_insert_use_any_list(policy))
 		return &bin->hhead;
 
-	/* saddr is wildcard */
-	if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.saddr,
-					       policy->family,
-					       policy->selector.prefixlen_s))
-		return &bin->hhead;
-
 	if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.daddr,
 					       policy->family,
 					       policy->selector.prefixlen_d)) {
@@ -1075,6 +1151,23 @@ xfrm_policy_inexact_alloc_chain(struct xfrm_pol_inexact_bin *bin,
 	write_seqcount_end(&bin->count);
 	if (!n)
 		return NULL;
+
+	/* saddr is wildcard */
+	if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.saddr,
+					       policy->family,
+					       policy->selector.prefixlen_s))
+		return &n->hhead;
+
+	write_seqcount_begin(&bin->count);
+	n = xfrm_policy_inexact_insert_node(net,
+					    &n->root,
+					    &policy->selector.saddr,
+					    policy->family,
+					    policy->selector.prefixlen_s, dir);
+	write_seqcount_end(&bin->count);
+	if (!n)
+		return NULL;
+
 	return &n->hhead;
 }
 
@@ -1856,8 +1949,13 @@ xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand,
 
 	n = xfrm_policy_lookup_inexact_addr(&b->root_d, &b->count, daddr,
 					    family);
-	if (n)
+	if (n) {
 		cand->res[XFRM_POL_CAND_DADDR] = &n->hhead;
+		n = xfrm_policy_lookup_inexact_addr(&n->root, &b->count, saddr,
+						    family);
+		if (n)
+			cand->res[XFRM_POL_CAND_BOTH] = &n->hhead;
+	}
 
 	n = xfrm_policy_lookup_inexact_addr(&b->root_s, &b->count, saddr,
 					    family);
-- 
2.18.1

^ permalink raw reply related

* Re: [PATCH rdma] net/mlx5: Fix XRC SRQ umem valid bits
From: Jason Gunthorpe @ 2018-11-07 22:07 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: Doug Ledford, Yishai Hadas, RDMA mailing list, Artemy Kovalyov,
	Saeed Mahameed, linux-netdev
In-Reply-To: <20181107073434.GJ9230@mtr-leonro.mtl.com>

On Wed, Nov 07, 2018 at 09:34:34AM +0200, Leon Romanovsky wrote:
> On Tue, Nov 06, 2018 at 03:11:53PM -0700, Jason Gunthorpe wrote:
> > On Tue, Nov 06, 2018 at 05:10:53PM -0500, Doug Ledford wrote:
> > > On Tue, 2018-11-06 at 22:02 +0000, Jason Gunthorpe wrote:
> > > > On Tue, Nov 06, 2018 at 04:31:08PM -0500, Doug Ledford wrote:
> > > > > On Wed, 2018-10-31 at 12:20 +0200, Leon Romanovsky wrote:
> > > > > > From: Yishai Hadas <yishaih@mellanox.com>
> > > > > >
> > > > > > Adapt XRC SRQ to the latest HW specification with fixed definition
> > > > > > around umem valid bits. The previous definition relied on a bit which
> > > > > > was taken for other purposes in legacy FW.
> > > > > >
> > > > > > Fixes: bd37197554eb ("net/mlx5: Update mlx5_ifc with DEVX UID bits")
> > > > > > Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
> > > > > > Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
> > > > > > Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
> > > > > > Hi Doug, Jason
> > > > > >
> > > > > > This commit fixes code sent in this merge window, so I'm not marking it
> > > > > > with any rdma-rc/rdma-next. It will be better to be sent during this merge
> > > > > > window if you have extra pull request to issue, or as a -rc material, if
> > > > > > not.
> > > > > >
> > > > > > BTW, we didn't combine reserved fields, because our convention is to align such
> > > > > > fields to 32 bits for better readability.
> > > > > >
> > > > > > Thanks
> > > > >
> > > > > This looks fine.  Let me know when it's in the mlx5-next tree to pull.
> > > >
> > > > It needs to go to -rc...
> > > >
> > > > This needs a mlx5-rc branch for this I guess?
> > >
> > > I don't think so.  As long as it's the first commit in mlx5-next, and
> > > mlx5-next is 4.20-rc1 based, then pulling this commit into the -rc tree
> > > will only pull the single commit.  Then when we pull into for-next for
> > > the first time, we will get this in for-next too.  That seems best to
> > > me.
> >
> > That works too, if Leon is fast :)
> 
> Thank you both for suggestion.
> 
> I did it.
> 99b77fef3c6c net/mlx5: Fix XRC SRQ umem valid bits
> 
> It is first commit and it is based on -rc1.

Okay, I put this branch in -rc

Jason

^ permalink raw reply

* Re: [PATCH v3 1/2] kretprobe: produce sane stack traces
From: Aleksa Sarai @ 2018-11-08  7:46 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Naveen N. Rao, Anil S Keshavamurthy, David S. Miller,
	Masami Hiramatsu, Jonathan Corbet, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Alexander Shishkin, Jiri Olsa,
	Namhyung Kim, Shuah Khan, Alexei Starovoitov, Daniel Borkmann,
	Brendan Gregg, Christian Brauner, Aleksa Sarai, netdev, linux-doc,
	linux-kernel
In-Reply-To: <20181106171501.59ccabbc@gandalf.local.home>

[-- Attachment #1: Type: text/plain, Size: 1979 bytes --]

On 2018-11-06, Steven Rostedt <rostedt@goodmis.org> wrote:
> On Sun, 4 Nov 2018 22:59:13 +1100
> Aleksa Sarai <cyphar@cyphar.com> wrote:
> 
> > The same issue is present in __save_stack_trace
> > (arch/x86/kernel/stacktrace.c). This is likely the only reason that --
> > as Steven said -- stacktraces wouldn't work with ftrace-graph (and thus
> > with the refactor both of you are discussing).
> 
> By the way, I was playing with the the orc unwinder and stack traces
> from the function graph tracer return code, and got it working with the
> below patch. Caution, that patch also has a stack trace hardcoded in
> the return path of the function graph tracer, so you don't want to run
> function graph tracing without filtering.

Neat!

> diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
> index 169b3c44ee97..aaeca73218cc 100644
> --- a/kernel/trace/trace_functions_graph.c
> +++ b/kernel/trace/trace_functions_graph.c
> @@ -242,13 +242,16 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
>  	trace->calltime = current->ret_stack[index].calltime;
>  	trace->overrun = atomic_read(&current->trace_overrun);
>  	trace->depth = index;
> +
> +	trace_dump_stack(0);

Right, this works because save_stack is not being passed a pt_regs. But if
you pass a pt_regs (as happens with bpf_getstackid -- which is what
spawned this discussion) then the top-most entry of the stack will still
be a trampoline because there is no ftrace_graph_ret_addr call.

(I'm struggling with how to fix this -- I can't figure out what retp
should be if you have a pt_regs. ->sp doesn't appear to work -- it's off
by a few bytes.)

I will attach what I have at the moment to hopefully explain what the
issue I've found is (re-using the kretprobe architecture but with the
shadow-stack idea).

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
<https://www.cyphar.com/>

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH bpf-next 2/2] bpftool: support loading flow dissector
From: Stanislav Fomichev @ 2018-11-07 22:15 UTC (permalink / raw)
  To: Quentin Monnet
  Cc: Jakub Kicinski, Stanislav Fomichev, netdev, linux-kselftest, ast,
	daniel, shuah, guro, jiong.wang, bhole_prashant_q7,
	john.fastabend, jbenc, treeze.taeung, yhs, osk, sandipan
In-Reply-To: <6628387d-cf89-573f-7b9c-2d49ef19634e@netronome.com>

On 11/07, Quentin Monnet wrote:
> 2018-11-07 12:32 UTC-0800 ~ Jakub Kicinski <jakub.kicinski@netronome.com>
> > On Wed, 7 Nov 2018 20:08:53 +0000, Quentin Monnet wrote:
> > > > +		err = bpf_obj_pin(bpf_program__fd(prog), pinfile);
> > > > +		if (err) {
> > > > +			p_err("failed to pin program %s",
> > > > +			      bpf_program__title(prog, false));
> > > > +			goto err_close_obj;
> > > > +		}
> > > 
> > > I don't have the same opinion as Jakub for pinning :). I was hoping we
> > > could also load additional programs (for tail calls) for
> > > non-flow_dissector programs. Could this be an occasion to update the
> > > code in that direction?
> > 
> > Do you mean having the bpftool construct an array for tail calling
> > automatically when loading an object?  Or do a "mass pin" of all
> > programs in an object file?
> > 
> > I'm not convinced about this strategy of auto assembling a tail call
> > array by assuming that a flow dissector object carries programs for
> > protocols in order (apart from the main program which doesn't have to
> > be first, for some reason).
> 
> Not constructing the prog array, I don't think this should be the role of
> bpftool either. Much more a "mass pin", so that you have a link to each
> program loaded from the object file and can later add them to a prog array
> map with subsequent calls to bpftool.
I agree, constructing the jmp_table is a bit fragile with all the
dependencies on the order of the progs. I'll drop that and will send a
v2 that pins all the programs from the obj file instead and offloads
jmp_table construction to the user. So the supposed use case would be
something like the following:

bpftool prog load bpf_flow.o /sys/fs/bpf/flow type flow_dissector
bpftool map update pinned /sys/fs/bpf/flow/jmp_table \
	key ... value pinned /sys/fs/bpf/flow/<PROTO>/0
bpftool map update ...
bpftool prog attach pinned /sys/fs/bpf/flow/flow_dissector/0 flow_dissector

^ permalink raw reply

* Re: [PATCH v3 1/2] kretprobe: produce sane stack traces
From: Aleksa Sarai @ 2018-11-08  8:04 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Naveen N. Rao, Anil S Keshavamurthy, David S. Miller,
	Masami Hiramatsu, Jonathan Corbet, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Alexander Shishkin, Jiri Olsa,
	Namhyung Kim, Shuah Khan, Alexei Starovoitov, Daniel Borkmann,
	Brendan Gregg, Christian Brauner, Aleksa Sarai, netdev, linux-doc,
	linux-kernel
In-Reply-To: <20181108074612.ldy6rozdpsdps6bf@yavin>

[-- Attachment #1: Type: text/plain, Size: 12219 bytes --]

On 2018-11-08, Aleksa Sarai <cyphar@cyphar.com> wrote:
> I will attach what I have at the moment to hopefully explain what the
> issue I've found is (re-using the kretprobe architecture but with the
> shadow-stack idea).

Here is the patch I have at the moment (it works, except for the
question I have about how to handle the top-level pt_regs -- I've marked
that code with XXX).

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
<https://www.cyphar.com/>

--8<---------------------------------------------------------------------

Since the return address is modified by kretprobe, the various unwinders
can produce invalid and confusing stack traces. ftrace mostly solved
this problem by teaching each unwinder how to find the original return
address for stack trace purposes. This same technique can be applied to
kretprobes by simply adding a pointer to where the return address was
replaced in the stack, and then looking up the relevant
kretprobe_instance when a stack trace is requested.

[WIP: This is currently broken because the *first entry* will not be
      overwritten since it looks like the stack pointer is different
      when we are provided pt_regs. All other addresses are correctly
      handled.]

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 arch/x86/events/core.c         |  6 +++-
 arch/x86/include/asm/ptrace.h  |  1 +
 arch/x86/kernel/kprobes/core.c |  5 ++--
 arch/x86/kernel/stacktrace.c   | 10 +++++--
 arch/x86/kernel/unwind_frame.c |  2 ++
 arch/x86/kernel/unwind_guess.c |  5 +++-
 arch/x86/kernel/unwind_orc.c   |  2 ++
 include/linux/kprobes.h        | 15 +++++++++-
 kernel/kprobes.c               | 55 ++++++++++++++++++++++++++++++++++
 9 files changed, 93 insertions(+), 8 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index de32741d041a..d71062702179 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2371,7 +2371,11 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re
 		return;
 	}
 
-	if (perf_callchain_store(entry, regs->ip))
+	/* XXX: Currently broken -- stack_addr(regs) doesn't match entry. */
+	addr = regs->ip;
+	//addr = ftrace_graph_ret_addr(current, &state.graph_idx, addr, stack_addr(regs));
+	addr = kretprobe_ret_addr(current, addr, stack_addr(regs));
+	if (perf_callchain_store(entry, addr))
 		return;
 
 	for (unwind_start(&state, current, regs, NULL); !unwind_done(&state);
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index ee696efec99f..c4dfafd43e11 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -172,6 +172,7 @@ static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
 	return regs->sp;
 }
 #endif
+#define stack_addr(regs) ((unsigned long *) kernel_stack_pointer(regs))
 
 #define GET_IP(regs) ((regs)->ip)
 #define GET_FP(regs) ((regs)->bp)
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index b0d1e81c96bb..eb4da885020c 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -69,8 +69,6 @@
 DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
 DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
 
-#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs))
-
 #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
 	(((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
 	  (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) |   \
@@ -568,7 +566,8 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
 {
 	unsigned long *sara = stack_addr(regs);
 
-	ri->ret_addr = (kprobe_opcode_t *) *sara;
+	ri->ret_addrp = (kprobe_opcode_t **) sara;
+	ri->ret_addr = *ri->ret_addrp;
 
 	/* Replace the return addr with trampoline addr */
 	*sara = (unsigned long) &kretprobe_trampoline;
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 7627455047c2..8a4fb3109d6b 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -8,6 +8,7 @@
 #include <linux/sched/task_stack.h>
 #include <linux/stacktrace.h>
 #include <linux/export.h>
+#include <linux/kprobes.h>
 #include <linux/uaccess.h>
 #include <asm/stacktrace.h>
 #include <asm/unwind.h>
@@ -37,8 +38,13 @@ static void noinline __save_stack_trace(struct stack_trace *trace,
 	struct unwind_state state;
 	unsigned long addr;
 
-	if (regs)
-		save_stack_address(trace, regs->ip, nosched);
+	if (regs) {
+		/* XXX: Currently broken -- stack_addr(regs) doesn't match entry. */
+		addr = regs->ip;
+		//addr = ftrace_graph_ret_addr(current, &state.graph_idx, addr, stack_addr(regs));
+		addr = kretprobe_ret_addr(current, addr, stack_addr(regs));
+		save_stack_address(trace, addr, nosched);
+	}
 
 	for (unwind_start(&state, task, regs, NULL); !unwind_done(&state);
 	     unwind_next_frame(&state)) {
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index 3dc26f95d46e..47062427e9a3 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -1,4 +1,5 @@
 #include <linux/sched.h>
+#include <linux/kprobes.h>
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
 #include <linux/interrupt.h>
@@ -270,6 +271,7 @@ static bool update_stack_state(struct unwind_state *state,
 		addr = READ_ONCE_TASK_STACK(state->task, *addr_p);
 		state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx,
 						  addr, addr_p);
+		state->ip = kretprobe_ret_addr(state->task, state->ip, addr_p);
 	}
 
 	/* Save the original stack pointer for unwind_dump(): */
diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c
index 4f0e17b90463..554fd7c5c331 100644
--- a/arch/x86/kernel/unwind_guess.c
+++ b/arch/x86/kernel/unwind_guess.c
@@ -1,5 +1,6 @@
 #include <linux/sched.h>
 #include <linux/ftrace.h>
+#include <linux/kprobes.h>
 #include <asm/ptrace.h>
 #include <asm/bitops.h>
 #include <asm/stacktrace.h>
@@ -14,8 +15,10 @@ unsigned long unwind_get_return_address(struct unwind_state *state)
 
 	addr = READ_ONCE_NOCHECK(*state->sp);
 
-	return ftrace_graph_ret_addr(state->task, &state->graph_idx,
+	addr = ftrace_graph_ret_addr(state->task, &state->graph_idx,
 				     addr, state->sp);
+	addr = kretprobe_ret_addr(state->task, addr, state->sp);
+	return addr;
 }
 EXPORT_SYMBOL_GPL(unwind_get_return_address);
 
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 26038eacf74a..b6393500d505 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -1,5 +1,6 @@
 #include <linux/module.h>
 #include <linux/sort.h>
+#include <linux/kprobes.h>
 #include <asm/ptrace.h>
 #include <asm/stacktrace.h>
 #include <asm/unwind.h>
@@ -462,6 +463,7 @@ bool unwind_next_frame(struct unwind_state *state)
 
 		state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx,
 						  state->ip, (void *)ip_p);
+		state->ip = kretprobe_ret_addr(state->task, state->ip, (void *)ip_p);
 
 		state->sp = sp;
 		state->regs = NULL;
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index e909413e4e38..3a01f9998064 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -172,6 +172,7 @@ struct kretprobe_instance {
 	struct hlist_node hlist;
 	struct kretprobe *rp;
 	kprobe_opcode_t *ret_addr;
+	kprobe_opcode_t **ret_addrp;
 	struct task_struct *task;
 	char data[0];
 };
@@ -203,6 +204,7 @@ static inline int kprobes_built_in(void)
 extern void arch_prepare_kretprobe(struct kretprobe_instance *ri,
 				   struct pt_regs *regs);
 extern int arch_trampoline_kprobe(struct kprobe *p);
+extern void kretprobe_trampoline(void);
 #else /* CONFIG_KRETPROBES */
 static inline void arch_prepare_kretprobe(struct kretprobe *rp,
 					struct pt_regs *regs)
@@ -212,6 +214,9 @@ static inline int arch_trampoline_kprobe(struct kprobe *p)
 {
 	return 0;
 }
+static inline void kretprobe_trampoline(void)
+{
+}
 #endif /* CONFIG_KRETPROBES */
 
 extern struct kretprobe_blackpoint kretprobe_blacklist[];
@@ -341,7 +346,7 @@ struct kprobe *get_kprobe(void *addr);
 void kretprobe_hash_lock(struct task_struct *tsk,
 			 struct hlist_head **head, unsigned long *flags);
 void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags);
-struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk);
+struct hlist_head *kretprobe_inst_table_head(struct task_struct *tsk);
 
 /* kprobe_running() will just return the current_kprobe on this CPU */
 static inline struct kprobe *kprobe_running(void)
@@ -371,6 +376,9 @@ void unregister_kretprobe(struct kretprobe *rp);
 int register_kretprobes(struct kretprobe **rps, int num);
 void unregister_kretprobes(struct kretprobe **rps, int num);
 
+unsigned long kretprobe_ret_addr(struct task_struct *tsk, unsigned long ret,
+				 unsigned long *retp);
+
 void kprobe_flush_task(struct task_struct *tk);
 void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head);
 
@@ -425,6 +433,11 @@ static inline void unregister_kretprobe(struct kretprobe *rp)
 static inline void unregister_kretprobes(struct kretprobe **rps, int num)
 {
 }
+unsigned long kretprobe_ret_addr(struct task_struct *task, unsigned long ret,
+				 unsigned long *retp)
+{
+	return ret;
+}
 static inline void kprobe_flush_task(struct task_struct *tk)
 {
 }
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 90e98e233647..ed78141664ec 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -83,6 +83,11 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
 	return &(kretprobe_table_locks[hash].lock);
 }
 
+struct hlist_head *kretprobe_inst_table_head(struct task_struct *tsk)
+{
+	return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
+}
+
 /* Blacklist -- list of struct kprobe_blacklist_entry */
 static LIST_HEAD(kprobe_blacklist);
 
@@ -1206,6 +1211,15 @@ __releases(hlist_lock)
 }
 NOKPROBE_SYMBOL(kretprobe_table_unlock);
 
+static bool kretprobe_hash_is_locked(struct task_struct *tsk)
+{
+	unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
+	raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
+
+	return raw_spin_is_locked(hlist_lock);
+}
+NOKPROBE_SYMBOL(kretprobe_hash_is_locked);
+
 /*
  * This function is called from finish_task_switch when task tk becomes dead,
  * so that we can recycle any function-return probe instances associated
@@ -1856,6 +1870,41 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
 }
 NOKPROBE_SYMBOL(pre_handler_kretprobe);
 
+unsigned long kretprobe_ret_addr(struct task_struct *tsk, unsigned long ret,
+				 unsigned long *retp)
+{
+	struct kretprobe_instance *ri;
+	unsigned long flags = 0;
+	struct hlist_head *head;
+	bool need_lock;
+
+	if (likely(ret != (unsigned long) &kretprobe_trampoline))
+		return ret;
+
+	need_lock = !kretprobe_hash_is_locked(tsk);
+	if (WARN_ON(need_lock))
+		kretprobe_hash_lock(tsk, &head, &flags);
+	else
+		head = kretprobe_inst_table_head(tsk);
+
+	hlist_for_each_entry(ri, head, hlist) {
+		if (ri->task != current)
+			continue;
+		if (ri->ret_addr == (kprobe_opcode_t *) &kretprobe_trampoline)
+			continue;
+		if (ri->ret_addrp == (kprobe_opcode_t **) retp) {
+			ret = (unsigned long) ri->ret_addr;
+			goto out;
+		}
+	}
+
+out:
+	if (need_lock)
+		kretprobe_hash_unlock(tsk, &flags);
+	return ret;
+}
+NOKPROBE_SYMBOL(kretprobe_ret_addr);
+
 bool __weak arch_kprobe_on_func_entry(unsigned long offset)
 {
 	return !offset;
@@ -2005,6 +2054,12 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
 }
 NOKPROBE_SYMBOL(pre_handler_kretprobe);
 
+unsigned long kretprobe_ret_addr(struct task_struct *tsk, unsigned long ret,
+				 unsigned long *retp)
+{
+	return ret;
+}
+NOKPROBE_SYMBOL(kretprobe_ret_addr);
 #endif /* CONFIG_KRETPROBES */
 
 /* Set the kprobe gone and remove its instruction buffer. */
-- 
2.19.1

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply related

* [PATCH net-next] sock: Reset dst when changing sk_mark via setsockopt
From: David Barmann @ 2018-11-07 22:33 UTC (permalink / raw)
  To: netdev

When setting the SO_MARK socket option, the dst needs to be reset so
that a new route lookup is performed.

This fixes the case where an application wants to change routing by
setting a new sk_mark.  If this is done after some packets have already
been sent, the dst is cached and has no effect.

Signed-off-by: David Barmann <david.barmann@stackpath.com>
---
 net/core/sock.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index 6fcc4bc07d19..187badac24a3 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -950,10 +950,14 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 			clear_bit(SOCK_PASSSEC, &sock->flags);
 		break;
 	case SO_MARK:
-		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
 			ret = -EPERM;
-		else
+		} else {
+			struct dst_entry *dst = sk_dst_get(sk);
 			sk->sk_mark = val;
+			sk_dst_reset(sk);
+			dst_release(dst);
+		}
 		break;
 
 	case SO_RXQ_OVFL:
-- 
2.14.4

^ permalink raw reply related

* [PATCH v2 bpf-next 0/3] bpftool: support loading flow dissector
From: Stanislav Fomichev @ 2018-11-07 22:43 UTC (permalink / raw)
  To: netdev, linux-kselftest, ast, daniel, shuah, jakub.kicinski,
	quentin.monnet
  Cc: guro, jiong.wang, sdf, bhole_prashant_q7, john.fastabend, jbenc,
	treeze.taeung, yhs, osk, sandipan

v2 changes:
* addressed comments/style issues from Jakub Kicinski & Quentin Monnet
* removed logic that populates jump table
* added cleanup for partial failure in bpf_object__pin

This patch series adds support for loading and attaching flow dissector
programs from the bpftool:

* first patch fixes flow dissector section name in the selftests (so
  libbpf auto-detection works)
* second patch adds proper cleanup to bpf_object__pin which is now being
  used to attach all flow dissector progs/maps
* third patch adds actual support to the bpftool

See third patch for the description/details.

Stanislav Fomichev (3):
  selftests/bpf: rename flow dissector section to flow_dissector
  libbpf: cleanup after partial failure in bpf_object__pin
  bpftool: support loading flow dissector

 .../bpftool/Documentation/bpftool-prog.rst    | 26 +++--
 tools/bpf/bpftool/bash-completion/bpftool     |  2 +-
 tools/bpf/bpftool/common.c                    | 30 +++---
 tools/bpf/bpftool/main.h                      |  1 +
 tools/bpf/bpftool/prog.c                      | 94 ++++++++++++++-----
 tools/lib/bpf/libbpf.c                        | 58 ++++++++++--
 tools/testing/selftests/bpf/bpf_flow.c        |  2 +-
 .../selftests/bpf/test_flow_dissector.sh      |  2 +-
 8 files changed, 151 insertions(+), 64 deletions(-)

-- 
2.19.1.930.g4563a0d9d0-goog

^ permalink raw reply

* [PATCH bpf-next 1/3] selftests/bpf: rename flow dissector section to flow_dissector
From: Stanislav Fomichev @ 2018-11-07 22:43 UTC (permalink / raw)
  To: netdev, linux-kselftest, ast, daniel, shuah, jakub.kicinski,
	quentin.monnet
  Cc: guro, jiong.wang, sdf, bhole_prashant_q7, john.fastabend, jbenc,
	treeze.taeung, yhs, osk, sandipan
In-Reply-To: <20181107224356.73080-1-sdf@google.com>

Makes it compatible with the logic that derives program type
from section name in libbpf_prog_type_by_name.

Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 tools/testing/selftests/bpf/bpf_flow.c             | 2 +-
 tools/testing/selftests/bpf/test_flow_dissector.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/bpf_flow.c b/tools/testing/selftests/bpf/bpf_flow.c
index 107350a7821d..b9798f558ca7 100644
--- a/tools/testing/selftests/bpf/bpf_flow.c
+++ b/tools/testing/selftests/bpf/bpf_flow.c
@@ -116,7 +116,7 @@ static __always_inline int parse_eth_proto(struct __sk_buff *skb, __be16 proto)
 	return BPF_DROP;
 }
 
-SEC("dissect")
+SEC("flow_dissector")
 int _dissect(struct __sk_buff *skb)
 {
 	if (!skb->vlan_present)
diff --git a/tools/testing/selftests/bpf/test_flow_dissector.sh b/tools/testing/selftests/bpf/test_flow_dissector.sh
index c0fb073b5eab..d23d4da66b83 100755
--- a/tools/testing/selftests/bpf/test_flow_dissector.sh
+++ b/tools/testing/selftests/bpf/test_flow_dissector.sh
@@ -59,7 +59,7 @@ else
 fi
 
 # Attach BPF program
-./flow_dissector_load -p bpf_flow.o -s dissect
+./flow_dissector_load -p bpf_flow.o -s flow_dissector
 
 # Setup
 tc qdisc add dev lo ingress
-- 
2.19.1.930.g4563a0d9d0-goog

^ permalink raw reply related

* [PATCH bpf-next 2/3] libbpf: cleanup after partial failure in bpf_object__pin
From: Stanislav Fomichev @ 2018-11-07 22:43 UTC (permalink / raw)
  To: netdev, linux-kselftest, ast, daniel, shuah, jakub.kicinski,
	quentin.monnet
  Cc: guro, jiong.wang, sdf, bhole_prashant_q7, john.fastabend, jbenc,
	treeze.taeung, yhs, osk, sandipan
In-Reply-To: <20181107224356.73080-1-sdf@google.com>

bpftool will use bpf_object__pin in the next commit to pin all programs
and maps from the file; in case of a partial failure, we need to get
back to the clean state (undo previous program/map pins).

Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 tools/lib/bpf/libbpf.c | 58 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 48 insertions(+), 10 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index d6e62e90e8d4..309abe7196f3 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1803,14 +1803,17 @@ int bpf_object__pin(struct bpf_object *obj, const char *path)
 
 		len = snprintf(buf, PATH_MAX, "%s/%s", path,
 			       bpf_map__name(map));
-		if (len < 0)
-			return -EINVAL;
-		else if (len >= PATH_MAX)
-			return -ENAMETOOLONG;
+		if (len < 0) {
+			err = -EINVAL;
+			goto err_unpin_maps;
+		} else if (len >= PATH_MAX) {
+			err = -ENAMETOOLONG;
+			goto err_unpin_maps;
+		}
 
 		err = bpf_map__pin(map, buf);
 		if (err)
-			return err;
+			goto err_unpin_maps;
 	}
 
 	bpf_object__for_each_program(prog, obj) {
@@ -1819,17 +1822,52 @@ int bpf_object__pin(struct bpf_object *obj, const char *path)
 
 		len = snprintf(buf, PATH_MAX, "%s/%s", path,
 			       prog->section_name);
-		if (len < 0)
-			return -EINVAL;
-		else if (len >= PATH_MAX)
-			return -ENAMETOOLONG;
+		if (len < 0) {
+			err = -EINVAL;
+			goto err_unpin_programs;
+		} else if (len >= PATH_MAX) {
+			err = -ENAMETOOLONG;
+			goto err_unpin_programs;
+		}
 
 		err = bpf_program__pin(prog, buf);
 		if (err)
-			return err;
+			goto err_unpin_programs;
 	}
 
 	return 0;
+
+err_unpin_programs:
+	bpf_object__for_each_program(prog, obj) {
+		char buf[PATH_MAX];
+		int len;
+
+		len = snprintf(buf, PATH_MAX, "%s/%s", path,
+			       prog->section_name);
+		if (len < 0)
+			continue;
+		else if (len >= PATH_MAX)
+			continue;
+
+		unlink(buf);
+	}
+
+err_unpin_maps:
+	bpf_map__for_each(map, obj) {
+		char buf[PATH_MAX];
+		int len;
+
+		len = snprintf(buf, PATH_MAX, "%s/%s", path,
+			       bpf_map__name(map));
+		if (len < 0)
+			continue;
+		else if (len >= PATH_MAX)
+			continue;
+
+		unlink(buf);
+	}
+
+	return err;
 }
 
 void bpf_object__close(struct bpf_object *obj)
-- 
2.19.1.930.g4563a0d9d0-goog

^ permalink raw reply related

* [PATCH bpf-next 3/3] bpftool: support loading flow dissector
From: Stanislav Fomichev @ 2018-11-07 22:43 UTC (permalink / raw)
  To: netdev, linux-kselftest, ast, daniel, shuah, jakub.kicinski,
	quentin.monnet
  Cc: guro, jiong.wang, sdf, bhole_prashant_q7, john.fastabend, jbenc,
	treeze.taeung, yhs, osk, sandipan
In-Reply-To: <20181107224356.73080-1-sdf@google.com>

This commit adds support for loading/attaching/detaching flow
dissector program. The structure of the flow dissector program is
assumed to be the same as in the selftests:

* flow_dissector section with the main entry point
* a bunch of tail call progs
* a jmp_table map that is populated with the tail call progs

When `bpftool load` is called with a flow_dissector prog (i.e. when the
first section is flow_dissector of 'type flow_dissector' argument is
passed), we load and pin all the programs/maps. User is responsible to
construct the jump table for the tail calls.

The last argument of `bpftool attach` is made optional for this use
case.

Example:
bpftool prog load tools/testing/selftests/bpf/bpf_flow.o \
	/sys/fs/bpf/flow type flow_dissector

bpftool map update pinned /sys/fs/bpf/flow/jmp_table \
        key 0 0 0 0 \
        value pinned /sys/fs/bpf/flow/IP/0

bpftool map update pinned /sys/fs/bpf/flow/jmp_table \
        key 1 0 0 0 \
        value pinned /sys/fs/bpf/flow/IPV6/0

bpftool map update pinned /sys/fs/bpf/flow/jmp_table \
        key 2 0 0 0 \
        value pinned /sys/fs/bpf/flow/IPV6OP/0

bpftool map update pinned /sys/fs/bpf/flow/jmp_table \
        key 3 0 0 0 \
        value pinned /sys/fs/bpf/flow/IPV6FR/0

bpftool map update pinned /sys/fs/bpf/flow/jmp_table \
        key 4 0 0 0 \
        value pinned /sys/fs/bpf/flow/MPLS/0

bpftool map update pinned /sys/fs/bpf/flow/jmp_table \
        key 5 0 0 0 \
        value pinned /sys/fs/bpf/flow/VLAN/0

bpftool prog attach pinned /sys/fs/bpf/flow/flow_dissector/0 flow_dissector

Tested by using the above lines to load the prog in
the test_flow_dissector.sh selftest.

Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 .../bpftool/Documentation/bpftool-prog.rst    | 26 +++--
 tools/bpf/bpftool/bash-completion/bpftool     |  2 +-
 tools/bpf/bpftool/common.c                    | 30 +++---
 tools/bpf/bpftool/main.h                      |  1 +
 tools/bpf/bpftool/prog.c                      | 94 ++++++++++++++-----
 5 files changed, 101 insertions(+), 52 deletions(-)

diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index ac4e904b10fb..a7b6934a8ac9 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -25,8 +25,8 @@ MAP COMMANDS
 |	**bpftool** **prog dump jited**  *PROG* [{**file** *FILE* | **opcodes**}]
 |	**bpftool** **prog pin** *PROG* *FILE*
 |	**bpftool** **prog load** *OBJ* *FILE* [**type** *TYPE*] [**map** {**idx** *IDX* | **name** *NAME*} *MAP*] [**dev** *NAME*]
-|       **bpftool** **prog attach** *PROG* *ATTACH_TYPE* *MAP*
-|       **bpftool** **prog detach** *PROG* *ATTACH_TYPE* *MAP*
+|       **bpftool** **prog attach** *PROG* *ATTACH_TYPE* [*MAP*]
+|       **bpftool** **prog detach** *PROG* *ATTACH_TYPE* [*MAP*]
 |	**bpftool** **prog help**
 |
 |	*MAP* := { **id** *MAP_ID* | **pinned** *FILE* }
@@ -39,7 +39,9 @@ MAP COMMANDS
 |		**cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** |
 |		**cgroup/connect4** | **cgroup/connect6** | **cgroup/sendmsg4** | **cgroup/sendmsg6**
 |	}
-|       *ATTACH_TYPE* := { **msg_verdict** | **skb_verdict** | **skb_parse** }
+|       *ATTACH_TYPE* := {
+|		**msg_verdict** | **skb_verdict** | **skb_parse** | **flow_dissector**
+|	}
 
 
 DESCRIPTION
@@ -97,13 +99,17 @@ DESCRIPTION
 		  contain a dot character ('.'), which is reserved for future
 		  extensions of *bpffs*.
 
-        **bpftool prog attach** *PROG* *ATTACH_TYPE* *MAP*
-                  Attach bpf program *PROG* (with type specified by *ATTACH_TYPE*)
-                  to the map *MAP*.
-
-        **bpftool prog detach** *PROG* *ATTACH_TYPE* *MAP*
-                  Detach bpf program *PROG* (with type specified by *ATTACH_TYPE*)
-                  from the map *MAP*.
+        **bpftool prog attach** *PROG* *ATTACH_TYPE* [*MAP*]
+                  Attach bpf program *PROG* (with type specified by
+                  *ATTACH_TYPE*). Most *ATTACH_TYPEs* require a *MAP*
+                  parameter, with the exception of *flow_dissector* which is
+                  attached to current networking name space.
+
+        **bpftool prog detach** *PROG* *ATTACH_TYPE* [*MAP*]
+                  Detach bpf program *PROG* (with type specified by
+                  *ATTACH_TYPE*). Most *ATTACH_TYPEs* require a *MAP*
+                  parameter, with the exception of *flow_dissector* which is
+                  detached from the current networking name space.
 
 	**bpftool prog help**
 		  Print short help message.
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index 3f78e6404589..4ab892adfa9f 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -299,7 +299,7 @@ _bpftool()
                     fi
 
                     if [[ ${#words[@]} == 6 ]]; then
-                        COMPREPLY=( $( compgen -W "msg_verdict skb_verdict skb_parse" -- "$cur" ) )
+                        COMPREPLY=( $( compgen -W "msg_verdict skb_verdict skb_parse flow_dissector" -- "$cur" ) )
                         return 0
                     fi
 
diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
index 25af85304ebe..f671a921dec5 100644
--- a/tools/bpf/bpftool/common.c
+++ b/tools/bpf/bpftool/common.c
@@ -169,34 +169,24 @@ int open_obj_pinned_any(char *path, enum bpf_obj_type exp_type)
 	return fd;
 }
 
-int do_pin_fd(int fd, const char *name)
+int mount_bpffs_for_pin(const char *name)
 {
 	char err_str[ERR_MAX_LEN];
 	char *file;
 	char *dir;
 	int err = 0;
 
-	err = bpf_obj_pin(fd, name);
-	if (!err)
-		goto out;
-
 	file = malloc(strlen(name) + 1);
 	strcpy(file, name);
 	dir = dirname(file);
 
-	if (errno != EPERM || is_bpffs(dir)) {
-		p_err("can't pin the object (%s): %s", name, strerror(errno));
+	if (is_bpffs(dir)) {
+		/* nothing to do if already mounted */
 		goto out_free;
 	}
 
-	/* Attempt to mount bpffs, then retry pinning. */
 	err = mnt_bpffs(dir, err_str, ERR_MAX_LEN);
-	if (!err) {
-		err = bpf_obj_pin(fd, name);
-		if (err)
-			p_err("can't pin the object (%s): %s", name,
-			      strerror(errno));
-	} else {
+	if (err) {
 		err_str[ERR_MAX_LEN - 1] = '\0';
 		p_err("can't mount BPF file system to pin the object (%s): %s",
 		      name, err_str);
@@ -204,10 +194,20 @@ int do_pin_fd(int fd, const char *name)
 
 out_free:
 	free(file);
-out:
 	return err;
 }
 
+int do_pin_fd(int fd, const char *name)
+{
+	int err;
+
+	err = mount_bpffs_for_pin(name);
+	if (err)
+		return err;
+
+	return bpf_obj_pin(fd, name);
+}
+
 int do_pin_any(int argc, char **argv, int (*get_fd_by_id)(__u32))
 {
 	unsigned int id;
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 28322ace2856..1383824c9baf 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -129,6 +129,7 @@ const char *get_fd_type_name(enum bpf_obj_type type);
 char *get_fdinfo(int fd, const char *key);
 int open_obj_pinned(char *path);
 int open_obj_pinned_any(char *path, enum bpf_obj_type exp_type);
+int mount_bpffs_for_pin(const char *name);
 int do_pin_any(int argc, char **argv, int (*get_fd_by_id)(__u32));
 int do_pin_fd(int fd, const char *name);
 
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 5302ee282409..feee1d184433 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -81,6 +81,7 @@ static const char * const attach_type_strings[] = {
 	[BPF_SK_SKB_STREAM_PARSER] = "stream_parser",
 	[BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict",
 	[BPF_SK_MSG_VERDICT] = "msg_verdict",
+	[BPF_FLOW_DISSECTOR] = "flow_dissector",
 	[__MAX_BPF_ATTACH_TYPE] = NULL,
 };
 
@@ -724,10 +725,11 @@ int map_replace_compar(const void *p1, const void *p2)
 static int do_attach(int argc, char **argv)
 {
 	enum bpf_attach_type attach_type;
-	int err, mapfd, progfd;
+	int err, progfd;
+	int mapfd = 0;
 
-	if (!REQ_ARGS(5)) {
-		p_err("too few parameters for map attach");
+	if (!REQ_ARGS(3)) {
+		p_err("too few parameters for attach");
 		return -EINVAL;
 	}
 
@@ -740,11 +742,17 @@ static int do_attach(int argc, char **argv)
 		p_err("invalid attach type");
 		return -EINVAL;
 	}
-	NEXT_ARG();
+	if (attach_type != BPF_FLOW_DISSECTOR) {
+		NEXT_ARG();
+		if (!REQ_ARGS(2)) {
+			p_err("too few parameters for map attach");
+			return -EINVAL;
+		}
 
-	mapfd = map_parse_fd(&argc, &argv);
-	if (mapfd < 0)
-		return mapfd;
+		mapfd = map_parse_fd(&argc, &argv);
+		if (mapfd < 0)
+			return mapfd;
+	}
 
 	err = bpf_prog_attach(progfd, mapfd, attach_type, 0);
 	if (err) {
@@ -760,10 +768,11 @@ static int do_attach(int argc, char **argv)
 static int do_detach(int argc, char **argv)
 {
 	enum bpf_attach_type attach_type;
-	int err, mapfd, progfd;
+	int err, progfd;
+	int mapfd = 0;
 
-	if (!REQ_ARGS(5)) {
-		p_err("too few parameters for map detach");
+	if (!REQ_ARGS(3)) {
+		p_err("too few parameters for detach");
 		return -EINVAL;
 	}
 
@@ -776,11 +785,17 @@ static int do_detach(int argc, char **argv)
 		p_err("invalid attach type");
 		return -EINVAL;
 	}
-	NEXT_ARG();
+	if (attach_type != BPF_FLOW_DISSECTOR) {
+		NEXT_ARG();
+		if (!REQ_ARGS(2)) {
+			p_err("too few parameters for map detach");
+			return -EINVAL;
+		}
 
-	mapfd = map_parse_fd(&argc, &argv);
-	if (mapfd < 0)
-		return mapfd;
+		mapfd = map_parse_fd(&argc, &argv);
+		if (mapfd < 0)
+			return mapfd;
+	}
 
 	err = bpf_prog_detach2(progfd, mapfd, attach_type);
 	if (err) {
@@ -792,6 +807,7 @@ static int do_detach(int argc, char **argv)
 		jsonw_null(json_wtr);
 	return 0;
 }
+
 static int do_load(int argc, char **argv)
 {
 	enum bpf_attach_type expected_attach_type;
@@ -799,8 +815,8 @@ static int do_load(int argc, char **argv)
 		.prog_type	= BPF_PROG_TYPE_UNSPEC,
 	};
 	struct map_replace *map_replace = NULL;
+	struct bpf_program *prog = NULL, *pos;
 	unsigned int old_map_fds = 0;
-	struct bpf_program *prog;
 	struct bpf_object *obj;
 	struct bpf_map *map;
 	const char *pinfile;
@@ -918,13 +934,14 @@ static int do_load(int argc, char **argv)
 		goto err_free_reuse_maps;
 	}
 
-	prog = bpf_program__next(NULL, obj);
-	if (!prog) {
-		p_err("object file doesn't contain any bpf program");
-		goto err_close_obj;
+	if (attr.prog_type != BPF_PROG_TYPE_FLOW_DISSECTOR) {
+		prog = bpf_program__next(NULL, obj);
+		if (!prog) {
+			p_err("object file doesn't contain any bpf program");
+			goto err_close_obj;
+		}
 	}
 
-	bpf_program__set_ifindex(prog, ifindex);
 	if (attr.prog_type == BPF_PROG_TYPE_UNSPEC) {
 		const char *sec_name = bpf_program__title(prog, false);
 
@@ -936,8 +953,13 @@ static int do_load(int argc, char **argv)
 			goto err_close_obj;
 		}
 	}
-	bpf_program__set_type(prog, attr.prog_type);
-	bpf_program__set_expected_attach_type(prog, expected_attach_type);
+
+	bpf_object__for_each_program(pos, obj) {
+		bpf_program__set_ifindex(pos, ifindex);
+		bpf_program__set_type(pos, attr.prog_type);
+		bpf_program__set_expected_attach_type(pos,
+						      expected_attach_type);
+	}
 
 	qsort(map_replace, old_map_fds, sizeof(*map_replace),
 	      map_replace_compar);
@@ -1001,9 +1023,28 @@ static int do_load(int argc, char **argv)
 		goto err_close_obj;
 	}
 
-	if (do_pin_fd(bpf_program__fd(prog), pinfile))
+	err = mount_bpffs_for_pin(pinfile);
+	if (err)
 		goto err_close_obj;
 
+	if (attr.prog_type == BPF_PROG_TYPE_FLOW_DISSECTOR) {
+		/* flow dissector consist of multiple programs,
+		 * we want to pin them all
+		 */
+		err = bpf_object__pin(obj, pinfile);
+		if (err) {
+			p_err("failed to pin flow dissector object");
+			goto err_close_obj;
+		}
+	} else {
+		err = bpf_obj_pin(bpf_program__fd(prog), pinfile);
+		if (err) {
+			p_err("failed to pin program %s",
+			      bpf_program__title(prog, false));
+			goto err_close_obj;
+		}
+	}
+
 	if (json_output)
 		jsonw_null(json_wtr);
 
@@ -1037,8 +1078,8 @@ static int do_help(int argc, char **argv)
 		"       %s %s pin   PROG FILE\n"
 		"       %s %s load  OBJ  FILE [type TYPE] [dev NAME] \\\n"
 		"                         [map { idx IDX | name NAME } MAP]\n"
-		"       %s %s attach PROG ATTACH_TYPE MAP\n"
-		"       %s %s detach PROG ATTACH_TYPE MAP\n"
+		"       %s %s attach PROG ATTACH_TYPE [MAP]\n"
+		"       %s %s detach PROG ATTACH_TYPE [MAP]\n"
 		"       %s %s help\n"
 		"\n"
 		"       " HELP_SPEC_MAP "\n"
@@ -1050,7 +1091,8 @@ static int do_help(int argc, char **argv)
 		"                 cgroup/bind4 | cgroup/bind6 | cgroup/post_bind4 |\n"
 		"                 cgroup/post_bind6 | cgroup/connect4 | cgroup/connect6 |\n"
 		"                 cgroup/sendmsg4 | cgroup/sendmsg6 }\n"
-		"       ATTACH_TYPE := { msg_verdict | skb_verdict | skb_parse }\n"
+		"       ATTACH_TYPE := { msg_verdict | skb_verdict | skb_parse |\n"
+		"                        flow_dissector }\n"
 		"       " HELP_SPEC_OPTIONS "\n"
 		"",
 		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
-- 
2.19.1.930.g4563a0d9d0-goog

^ permalink raw reply related

* Re: [PATCH net-next v2 3/5] virtio_ring: add packed ring support
From: Jason Wang @ 2018-11-08  8:18 UTC (permalink / raw)
  To: Tiwei Bie, Michael S. Tsirkin
  Cc: virtualization, linux-kernel, netdev, virtio-dev, wexu, jfreimann
In-Reply-To: <20181108013759.GA20591@debian>


On 2018/11/8 上午9:38, Tiwei Bie wrote:
>>> +
>>> +	if (vq->vq.num_free < descs_used) {
>>> +		pr_debug("Can't add buf len %i - avail = %i\n",
>>> +			 descs_used, vq->vq.num_free);
>>> +		/* FIXME: for historical reasons, we force a notify here if
>>> +		 * there are outgoing parts to the buffer.  Presumably the
>>> +		 * host should service the ring ASAP. */
>> I don't think we have a reason to do this for packed ring.
>> No historical baggage there, right?
> Based on the original commit log, it seems that the notify here
> is just an "optimization". But I don't quite understand what does
> the "the heuristics which KVM uses" refer to. If it's safe to drop
> this in packed ring, I'd like to do it.


According to the commit log, it seems like a workaround of lguest 
networking backend. I agree to drop it, we should not have such burden.

But we should notice that, with this removed, the compare between packed 
vs split is kind of unfair. Consider the removal of lguest support 
recently, maybe we can drop this for split ring as well?

Thanks


>
> commit 44653eae1407f79dff6f52fcf594ae84cb165ec4
> Author: Rusty Russell<rusty@rustcorp.com.au>
> Date:   Fri Jul 25 12:06:04 2008 -0500
>
>      virtio: don't always force a notification when ring is full
>      
>      We force notification when the ring is full, even if the host has
>      indicated it doesn't want to know.  This seemed like a good idea at
>      the time: if we fill the transmit ring, we should tell the host
>      immediately.
>      
>      Unfortunately this logic also applies to the receiving ring, which is
>      refilled constantly.  We should introduce real notification thesholds
>      to replace this logic.  Meanwhile, removing the logic altogether breaks
>      the heuristics which KVM uses, so we use a hack: only notify if there are
>      outgoing parts of the new buffer.
>      
>      Here are the number of exits with lguest's crappy network implementation:
>      Before:
>              network xmit 7859051 recv 236420
>      After:
>              network xmit 7858610 recv 118136
>      
>      Signed-off-by: Rusty Russell<rusty@rustcorp.com.au>
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 72bf8bc09014..21d9a62767af 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -87,8 +87,11 @@ static int vring_add_buf(struct virtqueue *_vq,
>   	if (vq->num_free < out + in) {
>   		pr_debug("Can't add buf len %i - avail = %i\n",
>   			 out + in, vq->num_free);
> -		/* We notify*even if*  VRING_USED_F_NO_NOTIFY is set here. */
> -		vq->notify(&vq->vq);
> +		/* FIXME: for historical reasons, we force a notify here if
> +		 * there are outgoing parts to the buffer.  Presumably the
> +		 * host should service the ring ASAP. */
> +		if (out)
> +			vq->notify(&vq->vq);
>   		END_USE(vq);
>   		return -ENOSPC;
>   	}
>
>

^ permalink raw reply

* [net-next 00/12][pull request] Intel Wired LAN Driver Updates 2018-11-07
From: Jeff Kirsher @ 2018-11-07 22:48 UTC (permalink / raw)
  To: davem; +Cc: Jeff Kirsher, netdev, nhorman, sassmann

This series contains updates to almost all of the Intel wired LAN
drivers.

Lance Roy replaces a spin lock with lockdep_assert_held() for igbvf
driver in move toward trying to remove spin_is_locked().

Colin Ian King fixes a potential null pointer dereference by adding a
check in ixgbe.  Also fixed the igc driver by properly assigning the
return error code of a function call, so that we can properly check it.

Shannon Nelson updates the ixgbe driver to not block IPsec offload when
in VEPA mode, in VEB mode, IPsec offload is still blocked because the
device drops packets into a black hole.

Jake adds support for software timestamping for packets sent over
ixgbevf.  Also modifies i40e, iavf, igb, igc, and ixgbe to delay calling
skb_tx_timestamp() to the latest point possible, which is just prior to
notifying the hardware of the new Tx packet.

Todd adds the new WoL filter flag so that we properly report that we do
not support this new feature.

YueHaibing from Huawei fixes the igc driver by cleaning up variables
that are not "really" used.

Dan Carpenter cleans up igc whitespace issues.

Miroslav Lichvar fixes e1000e for potential underflow issue in the
timecounter, so modify the driver to use timecounter_cyc2time() to allow
non-monotonic SYSTIM readings.

Sasha provides additional igc cleanups based on community feedback.

The following are changes since commit 7c588c7468ea3f9b2fc8fa6840bed6262b5d1b00:
  Merge branch 'net-systemport-Unmap-queues-upon-DSA-unregister-event'
and are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue 1GbE

Colin Ian King (2):
  ixgbe: don't clear_bit on xdp_ring->state if xdp_ring is null
  igc: fix error return handling from call to
    netif_set_real_num_tx_queues

Dan Carpenter (1):
  igc: Tidy up some white space

Jacob Keller (2):
  ixgbevf: add support for software timestamps
  intel-ethernet: software timestamp skbs as late as possible

Lance Roy (1):
  igbvf: Replace spin_is_locked() with lockdep

Miroslav Lichvar (1):
  e1000e: allow non-monotonic SYSTIM readings

Sasha Neftin (1):
  igc: Clean up code

Shannon Nelson (1):
  ixgbe: allow IPsec Tx offload in VEPA mode

Todd Fujinaka (1):
  i40e/ixgbe/igb: fail on new WoL flag setting WAKE_MAGICSECURE

YueHaibing (2):
  igc: Remove set but not used variables 'ctrl_ext, link_mode'
  igc: Remove set but not used variable 'pci_using_dac'

 drivers/net/ethernet/intel/e1000e/ptp.c       | 13 +++++--
 .../net/ethernet/intel/i40e/i40e_ethtool.c    |  3 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   |  4 +--
 drivers/net/ethernet/intel/iavf/iavf_txrx.c   |  4 +--
 drivers/net/ethernet/intel/igb/igb_ethtool.c  |  2 +-
 drivers/net/ethernet/intel/igb/igb_main.c     |  4 +--
 drivers/net/ethernet/intel/igbvf/mbx.c        |  4 +--
 drivers/net/ethernet/intel/igc/igc.h          |  9 -----
 drivers/net/ethernet/intel/igc/igc_base.c     |  8 -----
 drivers/net/ethernet/intel/igc/igc_main.c     | 36 +++++--------------
 .../net/ethernet/intel/ixgbe/ixgbe_ethtool.c  |  3 +-
 .../net/ethernet/intel/ixgbe/ixgbe_ipsec.c    |  4 ++-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  7 ++--
 .../net/ethernet/intel/ixgbevf/ixgbevf_main.c |  2 ++
 14 files changed, 41 insertions(+), 62 deletions(-)

-- 
2.19.1

^ permalink raw reply

* [net-next 01/12] igbvf: Replace spin_is_locked() with lockdep
From: Jeff Kirsher @ 2018-11-07 22:48 UTC (permalink / raw)
  To: davem; +Cc: Lance Roy, netdev, nhorman, sassmann, Jeff Kirsher
In-Reply-To: <20181107224830.9737-1-jeffrey.t.kirsher@intel.com>

From: Lance Roy <ldr709@gmail.com>

lockdep_assert_held() is better suited to checking locking requirements,
since it won't get confused when someone else holds the lock. This is
also a step towards possibly removing spin_is_locked().

Signed-off-by: Lance Roy <ldr709@gmail.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igbvf/mbx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/igbvf/mbx.c b/drivers/net/ethernet/intel/igbvf/mbx.c
index 163e5838f7c2..a3cd7ac48d4b 100644
--- a/drivers/net/ethernet/intel/igbvf/mbx.c
+++ b/drivers/net/ethernet/intel/igbvf/mbx.c
@@ -241,7 +241,7 @@ static s32 e1000_write_mbx_vf(struct e1000_hw *hw, u32 *msg, u16 size)
 	s32 err;
 	u16 i;
 
-	WARN_ON_ONCE(!spin_is_locked(&hw->mbx_lock));
+	lockdep_assert_held(&hw->mbx_lock);
 
 	/* lock the mailbox to prevent pf/vf race condition */
 	err = e1000_obtain_mbx_lock_vf(hw);
@@ -279,7 +279,7 @@ static s32 e1000_read_mbx_vf(struct e1000_hw *hw, u32 *msg, u16 size)
 	s32 err;
 	u16 i;
 
-	WARN_ON_ONCE(!spin_is_locked(&hw->mbx_lock));
+	lockdep_assert_held(&hw->mbx_lock);
 
 	/* lock the mailbox to prevent pf/vf race condition */
 	err = e1000_obtain_mbx_lock_vf(hw);
-- 
2.19.1

^ permalink raw reply related

* [net-next 02/12] ixgbe: don't clear_bit on xdp_ring->state if xdp_ring is null
From: Jeff Kirsher @ 2018-11-07 22:48 UTC (permalink / raw)
  To: davem; +Cc: Colin Ian King, netdev, nhorman, sassmann, Jeff Kirsher
In-Reply-To: <20181107224830.9737-1-jeffrey.t.kirsher@intel.com>

From: Colin Ian King <colin.king@canonical.com>

There is an earlier check to see if xdp_ring is null when configuring
the tx ring, so assuming that it can still be null, the clearing of
the xdp_ring->state currently could end up with a null pointer
dereference.  Fix this by only clearing the bit if xdp_ring is not null.

Detected by CoverityScan, CID#1473795 ("Dereference after null check")

Fixes: 024aa5800f32 ("ixgbe: added Rx/Tx ring disable/enable functions")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 113b38e0defb..aeda1834e66a 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -10517,7 +10517,8 @@ void ixgbe_txrx_ring_enable(struct ixgbe_adapter *adapter, int ring)
 	ixgbe_configure_rx_ring(adapter, rx_ring);
 
 	clear_bit(__IXGBE_TX_DISABLED, &tx_ring->state);
-	clear_bit(__IXGBE_TX_DISABLED, &xdp_ring->state);
+	if (xdp_ring)
+		clear_bit(__IXGBE_TX_DISABLED, &xdp_ring->state);
 }
 
 /**
-- 
2.19.1

^ permalink raw reply related

* [net-next 05/12] intel-ethernet: software timestamp skbs as late as possible
From: Jeff Kirsher @ 2018-11-07 22:48 UTC (permalink / raw)
  To: davem; +Cc: Jacob Keller, netdev, nhorman, sassmann, Jeff Kirsher
In-Reply-To: <20181107224830.9737-1-jeffrey.t.kirsher@intel.com>

From: Jacob Keller <jacob.e.keller@intel.com>

Many of the Intel Ethernet drivers call skb_tx_timestamp() earlier than
necessary. Move the calls to this function to the latest point possible,
just prior to notifying hardware of the new Tx packet when we bump the
tail register.

This affects i40e, iavf, igb, igc, and ixgbe.

The e100, e1000, e1000e, fm10k, and ice drivers already call the
skb_tx_timestamp() function just prior to indicating the Tx packet to
hardware, so they do not need to be changed.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   | 4 ++--
 drivers/net/ethernet/intel/iavf/iavf_txrx.c   | 4 ++--
 drivers/net/ethernet/intel/igb/igb_main.c     | 4 ++--
 drivers/net/ethernet/intel/igc/igc_main.c     | 4 ++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 4 ++--
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index aef3c89ee79c..1384a5a006a4 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -3473,6 +3473,8 @@ static inline int i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
 	tx_desc->cmd_type_offset_bsz =
 			build_ctob(td_cmd, td_offset, size, td_tag);
 
+	skb_tx_timestamp(skb);
+
 	/* Force memory writes to complete before letting h/w know there
 	 * are new descriptors to fetch.
 	 *
@@ -3652,8 +3654,6 @@ static netdev_tx_t i40e_xmit_frame_ring(struct sk_buff *skb,
 	if (tsyn)
 		tx_flags |= I40E_TX_FLAGS_TSYN;
 
-	skb_tx_timestamp(skb);
-
 	/* always enable CRC insertion offload */
 	td_cmd |= I40E_TX_DESC_CMD_ICRC;
 
diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
index fb9bfad96daf..3b1dc77ae368 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
@@ -2343,6 +2343,8 @@ static inline void iavf_tx_map(struct iavf_ring *tx_ring, struct sk_buff *skb,
 	tx_desc->cmd_type_offset_bsz =
 			build_ctob(td_cmd, td_offset, size, td_tag);
 
+	skb_tx_timestamp(skb);
+
 	/* Force memory writes to complete before letting h/w know there
 	 * are new descriptors to fetch.
 	 *
@@ -2461,8 +2463,6 @@ static netdev_tx_t iavf_xmit_frame_ring(struct sk_buff *skb,
 	if (tso < 0)
 		goto out_drop;
 
-	skb_tx_timestamp(skb);
-
 	/* always enable CRC insertion offload */
 	td_cmd |= IAVF_TX_DESC_CMD_ICRC;
 
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 5df88ad8ac81..4584ebc9e8fe 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -6019,6 +6019,8 @@ static int igb_tx_map(struct igb_ring *tx_ring,
 	/* set the timestamp */
 	first->time_stamp = jiffies;
 
+	skb_tx_timestamp(skb);
+
 	/* Force memory writes to complete before letting h/w know there
 	 * are new descriptors to fetch.  (Only applicable for weak-ordered
 	 * memory model archs, such as IA-64).
@@ -6147,8 +6149,6 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
 	else if (!tso)
 		igb_tx_csum(tx_ring, first);
 
-	skb_tx_timestamp(skb);
-
 	if (igb_tx_map(tx_ring, first, hdr_len))
 		goto cleanup_tx_tstamp;
 
diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index 9d85707e8a81..615a5fcd5a00 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -865,6 +865,8 @@ static int igc_tx_map(struct igc_ring *tx_ring,
 	/* set the timestamp */
 	first->time_stamp = jiffies;
 
+	skb_tx_timestamp(skb);
+
 	/* Force memory writes to complete before letting h/w know there
 	 * are new descriptors to fetch.  (Only applicable for weak-ordered
 	 * memory model archs, such as IA-64).
@@ -959,8 +961,6 @@ static netdev_tx_t igc_xmit_frame_ring(struct sk_buff *skb,
 	first->bytecount = skb->len;
 	first->gso_segs = 1;
 
-	skb_tx_timestamp(skb);
-
 	/* record initial flags and protocol */
 	first->tx_flags = tx_flags;
 	first->protocol = protocol;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index aeda1834e66a..cfb83687c3d8 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8269,6 +8269,8 @@ static int ixgbe_tx_map(struct ixgbe_ring *tx_ring,
 	/* set the timestamp */
 	first->time_stamp = jiffies;
 
+	skb_tx_timestamp(skb);
+
 	/*
 	 * Force memory writes to complete before letting h/w know there
 	 * are new descriptors to fetch.  (Only applicable for weak-ordered
@@ -8646,8 +8648,6 @@ netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb,
 		}
 	}
 
-	skb_tx_timestamp(skb);
-
 #ifdef CONFIG_PCI_IOV
 	/*
 	 * Use the l2switch_enable flag - would be false if the DMA
-- 
2.19.1

^ permalink raw reply related

* [net-next 03/12] ixgbe: allow IPsec Tx offload in VEPA mode
From: Jeff Kirsher @ 2018-11-07 22:48 UTC (permalink / raw)
  To: davem; +Cc: Shannon Nelson, netdev, nhorman, sassmann, Jeff Kirsher
In-Reply-To: <20181107224830.9737-1-jeffrey.t.kirsher@intel.com>

From: Shannon Nelson <shannon.nelson@oracle.com>

When it's possible that the PF might end up trying to send a
packet to one of its own VFs, we have to forbid IPsec offload
because the device drops the packets into a black hole.
See commit 47b6f50077e6 ("ixgbe: disallow IPsec Tx offload
when in SR-IOV mode") for more info.

This really is only necessary when the device is in the default
VEB mode.  If instead the device is running in VEPA mode,
the packets will go through the encryption engine and out the
MAC/PHY as normal, and get "hairpinned" as needed by the switch.

So let's not block IPsec offload when in VEPA mode.  To get
there with the ixgbe device, use the handy 'bridge' command:
	bridge link set dev eth1 hwmode vepa

Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index fd1b0546fd67..4d77f42e035c 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -4,6 +4,7 @@
 #include "ixgbe.h"
 #include <net/xfrm.h>
 #include <crypto/aead.h>
+#include <linux/if_bridge.h>
 
 #define IXGBE_IPSEC_KEY_BITS  160
 static const char aes_gcm_name[] = "rfc4106(gcm(aes))";
@@ -693,7 +694,8 @@ static int ixgbe_ipsec_add_sa(struct xfrm_state *xs)
 	} else {
 		struct tx_sa tsa;
 
-		if (adapter->num_vfs)
+		if (adapter->num_vfs &&
+		    adapter->bridge_mode != BRIDGE_MODE_VEPA)
 			return -EOPNOTSUPP;
 
 		/* find the first unused index */
-- 
2.19.1

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox