Netdev List
 help / color / mirror / Atom feed
* [PATCH 2/6] rhashtable: remove incorrect comment on r{hl, hash}table_walk_enter()
From: NeilBrown @ 2018-04-18  6:47 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152403346237.16895.8767189357062722046.stgit@noble>

Neither rhashtable_walk_enter() nor rhltable_walk_enter() sleeps, so
remove the comments which suggest that they do.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 include/linux/rhashtable.h |    3 ---
 lib/rhashtable.c           |    3 ---
 2 files changed, 6 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 87d443a5b11d..b01d88e196c2 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -1268,9 +1268,6 @@ static inline int rhashtable_walk_init(struct rhashtable *ht,
  * For a completely stable walk you should construct your own data
  * structure outside the hash table.
  *
- * This function may sleep so you must not call it from interrupt
- * context or with spin locks held.
- *
  * You must call rhashtable_walk_exit after this function returns.
  */
 static inline void rhltable_walk_enter(struct rhltable *hlt,
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 2b2b79974b61..19db8e563c40 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -668,9 +668,6 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
  * For a completely stable walk you should construct your own data
  * structure outside the hash table.
  *
- * This function may sleep so you must not call it from interrupt
- * context or with spin locks held.
- *
  * You must call rhashtable_walk_exit after this function returns.
  */
 void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter)

^ permalink raw reply related

* [PATCH 3/6] rhashtable: reset iter when rhashtable_walk_start sees new table
From: NeilBrown @ 2018-04-18  6:47 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152403346237.16895.8767189357062722046.stgit@noble>

The documentation claims that when rhashtable_walk_start_check()
detects a resize event, it will rewind back to the beginning
of the table.  This is not true.  We need to set ->slot and
->skip to be zero for it to be true.

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 lib/rhashtable.c |    2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 19db8e563c40..28e1be9f681b 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -733,6 +733,8 @@ int rhashtable_walk_start_check(struct rhashtable_iter *iter)
 
 	if (!iter->walker.tbl && !iter->end_of_table) {
 		iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht);
+		iter->slot = 0;
+		iter->skip = 0;
 		return -EAGAIN;
 	}
 

^ permalink raw reply related

* [PATCH 4/6] rhashtable: improve rhashtable_walk stability when stop/start used.
From: NeilBrown @ 2018-04-18  6:47 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152403346237.16895.8767189357062722046.stgit@noble>

When a walk of an rhashtable is interrupted with rhashtable_walk_stop()
and then rhashtable_walk_start(), the location to restart from is based
on a 'skip' count in the current hash chain, and this can be incorrect
if insertions or deletions have happened.  This does not happen when
the walk is not stopped and started as iter->p is a placeholder which
is safe to use while holding the RCU read lock.

In rhashtable_walk_start() we can revalidate that 'p' is still in the
same hash chain.  If it isn't then the current method is still used.

With this patch, if a rhashtable walker ensures that the current
object remains in the table over a stop/start period (possibly by
elevating the reference count if that is sufficient), it can be sure
that a walk will not miss objects that were in the hashtable for the
whole time of the walk.

rhashtable_walk_start() may not find the object even though it is
still in the hashtable if a rehash has moved it to a new table.  In
this case it will (eventually) get -EAGAIN and will need to proceed
through the whole table again to be sure to see everything at least
once.

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 lib/rhashtable.c |   44 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 41 insertions(+), 3 deletions(-)

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 28e1be9f681b..16cde54a553b 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -723,6 +723,7 @@ int rhashtable_walk_start_check(struct rhashtable_iter *iter)
 	__acquires(RCU)
 {
 	struct rhashtable *ht = iter->ht;
+	bool rhlist = ht->rhlist;
 
 	rcu_read_lock();
 
@@ -731,13 +732,52 @@ int rhashtable_walk_start_check(struct rhashtable_iter *iter)
 		list_del(&iter->walker.list);
 	spin_unlock(&ht->lock);
 
-	if (!iter->walker.tbl && !iter->end_of_table) {
+	if (iter->end_of_table)
+		return 0;
+	if (!iter->walker.tbl) {
 		iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht);
 		iter->slot = 0;
 		iter->skip = 0;
 		return -EAGAIN;
 	}
 
+	if (iter->p && !rhlist) {
+		/*
+		 * We need to validate that 'p' is still in the table, and
+		 * if so, update 'skip'
+		 */
+		struct rhash_head *p;
+		int skip = 0;
+		rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
+			skip++;
+			if (p == iter->p) {
+				iter->skip = skip;
+				goto found;
+			}
+		}
+		iter->p = NULL;
+	} else if (iter->p && rhlist) {
+		/* Need to validate that 'list' is still in the table, and
+		 * if so, update 'skip' and 'p'.
+		 */
+		struct rhash_head *p;
+		struct rhlist_head *list;
+		int skip = 0;
+		rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
+			for (list = container_of(p, struct rhlist_head, rhead);
+			     list;
+			     list = rcu_dereference(list->next)) {
+				skip++;
+				if (list == iter->list) {
+					iter->p = p;
+					skip = skip;
+					goto found;
+				}
+			}
+		}
+		iter->p = NULL;
+	}
+found:
 	return 0;
 }
 EXPORT_SYMBOL_GPL(rhashtable_walk_start_check);
@@ -913,8 +953,6 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter)
 		iter->walker.tbl = NULL;
 	spin_unlock(&ht->lock);
 
-	iter->p = NULL;
-
 out:
 	rcu_read_unlock();
 }

^ permalink raw reply related

* [PATCH 5/6] rhashtable: further improve stability of rhashtable_walk
From: NeilBrown @ 2018-04-18  6:47 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152403346237.16895.8767189357062722046.stgit@noble>

If the sequence:
   obj = rhashtable_walk_next(iter);
   rhashtable_walk_stop(iter);
   rhashtable_remove_fast(ht, &obj->head, params);
   rhashtable_walk_start(iter);

 races with another thread inserting or removing
 an object on the same hash chain, a subsequent
 rhashtable_walk_next() is not guaranteed to get the "next"
 object. It is possible that an object could be
 repeated, or missed.

 This can be made more reliable by keeping the objects in a hash chain
 sorted by memory address.  A subsequent rhashtable_walk_next()
 call can reliably find the correct position in the list, and thus
 find the 'next' object.

 It is not possible (certainly not so easy) to achieve this with an
 rhltable as keeping the hash chain in order is not so easy.  When the
 first object with a given key is removed, it is replaced in the chain
 with the next object with the same key, and the address of that
 object may not be correctly ordered.
 No current user of rhltable_walk_enter() calls
 rhashtable_walk_start() more than once, so no current code
 could benefit from a more reliable walk of rhltables.

 This patch only attempts to improve walks for rhashtables.
 - a new object is always inserted after the last object with a
   smaller address, or at the start
 - when rhashtable_walk_start() is called, it records that 'p' is not
   'safe', meaning that it cannot be dereferenced.  The revalidation
   that was previously done here is moved to rhashtable_walk_next()
 - when rhashtable_walk_next() is called while p is not NULL and not
   safe, it walks the chain looking for the first object with an
   address greater than p and returns that.  If there is none, it moves
   to the next hash chain.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 include/linux/rhashtable.h |   12 ++++++--
 lib/rhashtable.c           |   66 +++++++++++++++++++++++++++-----------------
 2 files changed, 50 insertions(+), 28 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index b01d88e196c2..5ce6201f246e 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -207,6 +207,7 @@ struct rhashtable_iter {
 	struct rhashtable_walker walker;
 	unsigned int slot;
 	unsigned int skip;
+	bool p_is_unsafe;
 	bool end_of_table;
 };
 
@@ -730,6 +731,7 @@ static inline void *__rhashtable_insert_fast(
 		.ht = ht,
 		.key = key,
 	};
+	struct rhash_head __rcu **inspos;
 	struct rhash_head __rcu **pprev;
 	struct bucket_table *tbl;
 	struct rhash_head *head;
@@ -757,6 +759,7 @@ static inline void *__rhashtable_insert_fast(
 	data = ERR_PTR(-ENOMEM);
 	if (!pprev)
 		goto out;
+	inspos = pprev;
 
 	rht_for_each_continue(head, *pprev, tbl, hash) {
 		struct rhlist_head *plist;
@@ -768,6 +771,8 @@ static inline void *__rhashtable_insert_fast(
 		     params.obj_cmpfn(&arg, rht_obj(ht, head)) :
 		     rhashtable_compare(&arg, rht_obj(ht, head)))) {
 			pprev = &head->next;
+			if (head < obj)
+				inspos = &head->next;
 			continue;
 		}
 
@@ -798,7 +803,7 @@ static inline void *__rhashtable_insert_fast(
 	if (unlikely(rht_grow_above_100(ht, tbl)))
 		goto slow_path;
 
-	head = rht_dereference_bucket(*pprev, tbl, hash);
+	head = rht_dereference_bucket(*inspos, tbl, hash);
 
 	RCU_INIT_POINTER(obj->next, head);
 	if (rhlist) {
@@ -808,7 +813,7 @@ static inline void *__rhashtable_insert_fast(
 		RCU_INIT_POINTER(list->next, NULL);
 	}
 
-	rcu_assign_pointer(*pprev, obj);
+	rcu_assign_pointer(*inspos, obj);
 
 	atomic_inc(&ht->nelems);
 	if (rht_grow_above_75(ht, tbl))
@@ -1263,7 +1268,8 @@ static inline int rhashtable_walk_init(struct rhashtable *ht,
  * Note that if you restart a walk after rhashtable_walk_stop you
  * may see the same object twice.  Also, you may miss objects if
  * there are removals in between rhashtable_walk_stop and the next
- * call to rhashtable_walk_start.
+ * call to rhashtable_walk_start.  Note that this is different to
+ * rhashtable_walk_enter() which never leads to missing objects.
  *
  * For a completely stable walk you should construct your own data
  * structure outside the hash table.
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 16cde54a553b..be7eb57d9398 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -566,6 +566,10 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
 		return ERR_PTR(-ENOMEM);
 
 	head = rht_dereference_bucket(*pprev, tbl, hash);
+	while (!rht_is_a_nulls(head) && head < obj) {
+		pprev = &head->next;
+		head = rht_dereference_bucket(*pprev, tbl, hash);
+	}
 
 	RCU_INIT_POINTER(obj->next, head);
 	if (ht->rhlist) {
@@ -660,10 +664,10 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
  *
  * This function prepares a hash table walk.
  *
- * Note that if you restart a walk after rhashtable_walk_stop you
- * may see the same object twice.  Also, you may miss objects if
- * there are removals in between rhashtable_walk_stop and the next
- * call to rhashtable_walk_start.
+ * A walk is guaranteed to return every object that was in
+ * the table before this call, and is still in the table when
+ * rhashtable_walk_next() returns NULL.  Duplicates can be
+ * seen, but only if there is a rehash event during the walk.
  *
  * For a completely stable walk you should construct your own data
  * structure outside the hash table.
@@ -743,19 +747,10 @@ int rhashtable_walk_start_check(struct rhashtable_iter *iter)
 
 	if (iter->p && !rhlist) {
 		/*
-		 * We need to validate that 'p' is still in the table, and
-		 * if so, update 'skip'
+		 * 'p' will be revalidated when rhashtable_walk_next()
+		 * is called.
 		 */
-		struct rhash_head *p;
-		int skip = 0;
-		rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
-			skip++;
-			if (p == iter->p) {
-				iter->skip = skip;
-				goto found;
-			}
-		}
-		iter->p = NULL;
+		iter->p_is_unsafe = true;
 	} else if (iter->p && rhlist) {
 		/* Need to validate that 'list' is still in the table, and
 		 * if so, update 'skip' and 'p'.
@@ -872,15 +867,35 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter)
 	bool rhlist = ht->rhlist;
 
 	if (p) {
-		if (!rhlist || !(list = rcu_dereference(list->next))) {
-			p = rcu_dereference(p->next);
-			list = container_of(p, struct rhlist_head, rhead);
-		}
-		if (!rht_is_a_nulls(p)) {
-			iter->skip++;
-			iter->p = p;
-			iter->list = list;
-			return rht_obj(ht, rhlist ? &list->rhead : p);
+		if (!rhlist && iter->p_is_unsafe) {
+			/*
+			 * First time next() was called after start().
+			 * Need to find location of 'p' in the list.
+			 */
+			struct rhash_head *p;
+
+			iter->skip = 0;
+			rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
+				iter->skip++;
+				if (p <= iter->p)
+					continue;
+
+				/* p is the next object after iter->p */
+				iter->p = p;
+				iter->p_is_unsafe = false;
+				return rht_obj(ht, p);
+			}
+		} else {
+			if (!rhlist || !(list = rcu_dereference(list->next))) {
+				p = rcu_dereference(p->next);
+				list = container_of(p, struct rhlist_head, rhead);
+			}
+			if (!rht_is_a_nulls(p)) {
+				iter->skip++;
+				iter->p = p;
+				iter->list = list;
+				return rht_obj(ht, rhlist ? &list->rhead : p);
+			}
 		}
 
 		/* At the end of this slot, switch to next one and then find
@@ -890,6 +905,7 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter)
 		iter->slot++;
 	}
 
+	iter->p_is_unsafe = false;
 	return __rhashtable_walk_find_next(iter);
 }
 EXPORT_SYMBOL_GPL(rhashtable_walk_next);

^ permalink raw reply related

* [PATCH 6/6] rhashtable: add rhashtable_walk_prev()
From: NeilBrown @ 2018-04-18  6:47 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152403346237.16895.8767189357062722046.stgit@noble>

rhashtable_walk_prev() returns the object returned by
the previous rhashtable_walk_next(), providing it is still in the
table (or was during this grace period).
This works even if rhashtable_walk_stop() and rhashtable_walk_start()
have been called since the last rhashtable_walk_next().

If there have been no calls to rhashtable_walk_next(), or if the
object is gone from the table, then NULL is returned.

This can usefully be used in a seq_file ->start() function.
If the pos is the same as was returned by the last ->next() call,
then rhashtable_walk_prev() can be used to re-establish the
current location in the table.  If it returns NULL, then
rhashtable_walk_next() should be used.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 include/linux/rhashtable.h |    1 +
 lib/rhashtable.c           |   30 ++++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 5ce6201f246e..b1ad2b6a3f3f 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -397,6 +397,7 @@ static inline void rhashtable_walk_start(struct rhashtable_iter *iter)
 
 void *rhashtable_walk_next(struct rhashtable_iter *iter);
 void *rhashtable_walk_peek(struct rhashtable_iter *iter);
+void *rhashtable_walk_prev(struct rhashtable_iter *iter);
 void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU);
 
 void rhashtable_free_and_destroy(struct rhashtable *ht,
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index be7eb57d9398..d2f941146ea3 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -910,6 +910,36 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter)
 }
 EXPORT_SYMBOL_GPL(rhashtable_walk_next);
 
+/**
+ * rhashtable_walk_prev - Return the previously returned object, if available
+ * @iter:	Hash table iterator
+ *
+ * If rhashtable_walk_next() has previously been called and the object
+ * it returned is still in the hash table, that object is returned again,
+ * otherwise %NULL is returned.
+ *
+ * If the recent rhashtable_walk_next() call was since the most recent
+ * rhashtable_walk_start() call then the returned object may not, strictly
+ * speaking, still be in the table.  It will be safe to dereference.
+ *
+ * Note that the iterator is not changed and in particular it does not
+ * step backwards.
+ */
+void *rhashtable_walk_prev(struct rhashtable_iter *iter)
+{
+	struct rhashtable *ht = iter->ht;
+	struct rhash_head *p = iter->p;
+
+	if (!p)
+		return NULL;
+	if (!iter->p_is_unsafe || ht->rhlist)
+		return p;
+	rht_for_each_rcu(p, iter->walker.tbl, iter->slot)
+		if (p == iter->p)
+			return p;
+	return NULL;
+}
+
 /**
  * rhashtable_walk_peek - Return the next object but don't advance the iterator
  * @iter:	Hash table iterator

^ permalink raw reply related

* Re: [PATCH net-next 0/5] virtio-net: Add SCTP checksum offload support
From: Xin Long @ 2018-04-18  6:57 UTC (permalink / raw)
  To: Marcelo Ricardo Leitner
  Cc: Vlad Yasevich, Vladislav Yasevich, network dev, linux-sctp,
	virtualization, Michael S. Tsirkin, Jason Wang, Neil Horman
In-Reply-To: <20180418013333.GO4716@localhost.localdomain>

On Wed, Apr 18, 2018 at 9:33 AM, Marcelo Ricardo Leitner
<marcelo.leitner@gmail.com> wrote:
> On Tue, Apr 17, 2018 at 04:35:18PM -0400, Vlad Yasevich wrote:
>> On 04/02/2018 10:47 AM, Marcelo Ricardo Leitner wrote:
>> > On Mon, Apr 02, 2018 at 09:40:01AM -0400, Vladislav Yasevich wrote:
>> >> Now that we have SCTP offload capabilities in the kernel, we can add
>> >> them to virtio as well.  First step is SCTP checksum.
>> >
>> > Thanks.
>> >
>> >> As for GSO, the way sctp GSO is currently implemented buys us nothing
>> >> in added support to virtio.  To add true GSO, would require a lot of
>> >> re-work inside of SCTP and would require extensions to the virtio
>> >> net header to carry extra sctp data.
>> >
>> > Can you please elaborate more on this? Is this because SCTP GSO relies
>> > on the gso skb format for knowing how to segment it instead of having
>> > a list of sizes?
>> >
>>
>> it's mainly because all the true segmentation, placing data into chunks,
>> has already happened.  All that GSO does is allow for higher bundling
>> rate between VMs. If that is all SCTP GSO is ever going to do, that's fine,
>> but the goal is to do real GSO eventually and potentially reduce the
>> amount of memory copying we are doing.
The memory copying for bundling can't be avoided, as the chunks in
one packet may be from some different messages, as Marcelo said below.

>> If we do that, any current attempt at GSO in virtio would have to be
>> deprecated and we'd need GSO2 or something like that.
Why would it be deprecated? virtio_net actually also supports frag_list,
all we need to do is to enable it.
I already have a patch for sctp gso over virtio_net in my local tree:

patch on qemu-2.9.0(el7):
  https://paste.fedoraproject.org/paste/IgnSbW74L6P9aUZpnWHBsA
patch on kernel-4.16(linus):
  https://paste.fedoraproject.org/paste/L7AhFIVVp3k8VfEV29QMiw

The performance has been improved quite a lot with this:
testing over virtio-net(guest):
  https://paste.fedoraproject.org/paste/cxQMnrDQDf9AoNgBpA-twA
testing over tap(host):
  https://paste.fedoraproject.org/paste/QPdc22pgvF-x68oKDjWa9g

>>
>> This is why, after doing the GSO support, I decided not to include it.
>
> Gotcha. I don't think it will ever go further than what we have now.
> Placing data into chunks later is not really feasible/wanted,
> especially now with stream schedulers and idata chunks. Doesn't seem
> worth the hassle... we would have to support things like, "segment
> half of this message plus a third of this other one from that other
> stream." (in case it's round robin).
>
>   Marcelo

^ permalink raw reply

* Re: [PATCH v3 07/20] i2c: Remove depends on HAS_DMA in case of platform dependency
From: Wolfram Sang @ 2018-04-18  7:01 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: Ulf Hansson, linux-iio-u79uwXL29TY76Z2rM5mHXA,
	linux-fpga-u79uwXL29TY76Z2rM5mHXA,
	linux-remoteproc-u79uwXL29TY76Z2rM5mHXA,
	alsa-devel-K7yf7f+aM1XWsZ/bQMPhNw, Bjorn Andersson, Eric Anholt,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-mtd-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-i2c-u79uwXL29TY76Z2rM5mHXA,
	linux1394-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	Christoph Hellwig, Stefan Wahren, Boris Brezillon, Herbert Xu,
	Richard Weinberger, Jassi Brar, Marek Vasut,
	linux-serial-u79uwXL29TY76Z2rM5mHXA, Matias Bjorling,
	linux-media-u79uwXL29TY76Z2rM5mHXA, Ohad Ben-Cohen,
	devel-gWbeCf7V1WCQmaza687I9mD2FQJk+8+b, Alan Tull,
	Bartlomiej Zolnierkiewicz, linux-k
In-Reply-To: <1523987360-18760-8-git-send-email-geert-Td1EMuHUCqxL1ZNQvxDV9g@public.gmane.org>


[-- Attachment #1.1: Type: text/plain, Size: 838 bytes --]

On Tue, Apr 17, 2018 at 07:49:07PM +0200, Geert Uytterhoeven wrote:
> Remove dependencies on HAS_DMA where a Kconfig symbol depends on another
> symbol that implies HAS_DMA, and, optionally, on "|| COMPILE_TEST".
> In most cases this other symbol is an architecture or platform specific
> symbol, or PCI.
> 
> Generic symbols and drivers without platform dependencies keep their
> dependencies on HAS_DMA, to prevent compiling subsystems or drivers that
> cannot work anyway.
> 
> This simplifies the dependencies, and allows to improve compile-testing.
> 
> Signed-off-by: Geert Uytterhoeven <geert-Td1EMuHUCqxL1ZNQvxDV9g@public.gmane.org>
> Reviewed-by: Mark Brown <broonie-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
> Acked-by: Robin Murphy <robin.murphy-5wv7dgnIgG8@public.gmane.org>

Applied to for-current, thanks!


[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

[-- Attachment #2: Type: text/plain, Size: 0 bytes --]



^ permalink raw reply

* Re: [PATCH v3 1/9] net: phy: new Asix Electronics PHY driver
From: John Paul Adrian Glaubitz @ 2018-04-18  7:04 UTC (permalink / raw)
  To: Michael Schmitz, netdev
  Cc: andrew, fthain, geert, f.fainelli, linux-m68k, Michael.Karcher
In-Reply-To: <1524025616-3722-2-git-send-email-schmitzmic@gmail.com>

On 04/18/2018 06:26 AM, Michael Schmitz wrote:
> The Asix Electronics PHY found on the X-Surf 100 Amiga Zorro network
> card by Individual Computers is buggy, and needs the reset bit toggled
> as workaround to make a PHY soft reset succed.
                                         ^^^^^^
                                         typo


-- 
 .''`.  John Paul Adrian Glaubitz
: :' :  Debian Developer - glaubitz@debian.org
`. `'   Freie Universitaet Berlin - glaubitz@physik.fu-berlin.de
  `-    GPG: 62FF 8A75 84E0 2956 9546  0006 7426 3B37 F5B5 F913

^ permalink raw reply

* [PATCH 4/6] rhashtable: improve rhashtable_walk stability when stop/start used.
From: NeilBrown @ 2018-04-18  6:47 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152403346237.16895.8767189357062722046.stgit2@noble>

When a walk of an rhashtable is interrupted with rhashtable_walk_stop()
and then rhashtable_walk_start(), the location to restart from is based
on a 'skip' count in the current hash chain, and this can be incorrect
if insertions or deletions have happened.  This does not happen when
the walk is not stopped and started as iter->p is a placeholder which
is safe to use while holding the RCU read lock.

In rhashtable_walk_start() we can revalidate that 'p' is still in the
same hash chain.  If it isn't then the current method is still used.

With this patch, if a rhashtable walker ensures that the current
object remains in the table over a stop/start period (possibly by
elevating the reference count if that is sufficient), it can be sure
that a walk will not miss objects that were in the hashtable for the
whole time of the walk.

rhashtable_walk_start() may not find the object even though it is
still in the hashtable if a rehash has moved it to a new table.  In
this case it will (eventually) get -EAGAIN and will need to proceed
through the whole table again to be sure to see everything at least
once.

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 lib/rhashtable.c |   44 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 41 insertions(+), 3 deletions(-)

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 28e1be9f681b..16cde54a553b 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -723,6 +723,7 @@ int rhashtable_walk_start_check(struct rhashtable_iter *iter)
 	__acquires(RCU)
 {
 	struct rhashtable *ht = iter->ht;
+	bool rhlist = ht->rhlist;
 
 	rcu_read_lock();
 
@@ -731,13 +732,52 @@ int rhashtable_walk_start_check(struct rhashtable_iter *iter)
 		list_del(&iter->walker.list);
 	spin_unlock(&ht->lock);
 
-	if (!iter->walker.tbl && !iter->end_of_table) {
+	if (iter->end_of_table)
+		return 0;
+	if (!iter->walker.tbl) {
 		iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht);
 		iter->slot = 0;
 		iter->skip = 0;
 		return -EAGAIN;
 	}
 
+	if (iter->p && !rhlist) {
+		/*
+		 * We need to validate that 'p' is still in the table, and
+		 * if so, update 'skip'
+		 */
+		struct rhash_head *p;
+		int skip = 0;
+		rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
+			skip++;
+			if (p == iter->p) {
+				iter->skip = skip;
+				goto found;
+			}
+		}
+		iter->p = NULL;
+	} else if (iter->p && rhlist) {
+		/* Need to validate that 'list' is still in the table, and
+		 * if so, update 'skip' and 'p'.
+		 */
+		struct rhash_head *p;
+		struct rhlist_head *list;
+		int skip = 0;
+		rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
+			for (list = container_of(p, struct rhlist_head, rhead);
+			     list;
+			     list = rcu_dereference(list->next)) {
+				skip++;
+				if (list == iter->list) {
+					iter->p = p;
+					skip = skip;
+					goto found;
+				}
+			}
+		}
+		iter->p = NULL;
+	}
+found:
 	return 0;
 }
 EXPORT_SYMBOL_GPL(rhashtable_walk_start_check);
@@ -913,8 +953,6 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter)
 		iter->walker.tbl = NULL;
 	spin_unlock(&ht->lock);
 
-	iter->p = NULL;
-
 out:
 	rcu_read_unlock();
 }

^ permalink raw reply related

* Re: [net PATCH v2] net: sched, fix OOO packets with pfifo_fast
From: Paolo Abeni @ 2018-04-18  7:28 UTC (permalink / raw)
  To: John Fastabend, Cong Wang
  Cc: Eric Dumazet, Jiri Pirko, David Miller,
	Linux Kernel Network Developers
In-Reply-To: <7f8636e3-c04f-18b6-7e6c-0f28bc54edbb@gmail.com>

Hi,

let me revive this old thread...

On Mon, 2018-03-26 at 11:16 -0700, John Fastabend wrote:
> On 03/26/2018 10:30 AM, Cong Wang wrote:
> > On Sat, Mar 24, 2018 at 10:25 PM, John Fastabend
> > <john.fastabend@gmail.com> wrote:
> > > After the qdisc lock was dropped in pfifo_fast we allow multiple
> > > enqueue threads and dequeue threads to run in parallel. On the
> > > enqueue side the skb bit ooo_okay is used to ensure all related
> > > skbs are enqueued in-order. On the dequeue side though there is
> > > no similar logic. What we observe is with fewer queues than CPUs
> > > it is possible to re-order packets when two instances of
> > > __qdisc_run() are running in parallel. Each thread will dequeue
> > > a skb and then whichever thread calls the ndo op first will
> > > be sent on the wire. This doesn't typically happen because
> > > qdisc_run() is usually triggered by the same core that did the
> > > enqueue. However, drivers will trigger __netif_schedule()
> > > when queues are transitioning from stopped to awake using the
> > > netif_tx_wake_* APIs. When this happens netif_schedule() calls
> > > qdisc_run() on the same CPU that did the netif_tx_wake_* which
> > > is usually done in the interrupt completion context. This CPU
> > > is selected with the irq affinity which is unrelated to the
> > > enqueue operations.
> > 
> > Interesting. Why this is unique to pfifo_fast? For me it could
> > happen to other qdisc's too, when we release the qdisc root
> > lock in sch_direct_xmit(), another CPU could dequeue from
> > the same qdisc and transmit the skb in parallel too?
> > 
> 
> Agreed, my guess is it never happens because the timing is
> tighter in the lock case. Or if it is happening it's infrequent
> enough that no one noticed the OOO packets.

I think the above could not happen due to the qdisc seqlock - which is
not acquired by NOLOCK qdiscs.

> For net-next we probably could clean this up. I was just
> going for something simple in net that didn't penalize all
> qdiscs as Eric noted. This patch doesn't make it any worse
> at least. And we have been living with the above race for
> years.

I've benchmarked this patch in some different scenarios, and in my
testing it introduces a measurable regression in uncontended/lightly
contended scenarios. The measured peak negative delta is with a pktgen
thread using "xmit_mode queue_xmit":

before: 27674032 pps
after: 23809052 pps

I spent some time searching for a way to improve this, without success.

John, did you have any chance to look at this again?

Thanks,

Paolo

^ permalink raw reply

* Re: [PATCH net 5/5] nfp: remove false positive offloads in flower vxlan
From: Or Gerlitz @ 2018-04-18  7:43 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: Linux Netdev List, oss-drivers, John Hurley
In-Reply-To: <20171117010643.18308-6-jakub.kicinski@netronome.com>

On Fri, Nov 17, 2017 at 4:06 AM, Jakub Kicinski
<jakub.kicinski@netronome.com> wrote:
> From: John Hurley <john.hurley@netronome.com>
>
> Pass information to the match offload on whether or not the repr is the
> ingress or egress dev. Only accept tunnel matches if repr is the egress dev.
>
> This means rules such as the following are successfully offloaded:
> tc .. add dev vxlan0 .. enc_dst_port 4789 .. action redirect dev nfp_p0
>
> While rules such as the following are rejected:
> tc .. add dev nfp_p0 .. enc_dst_port 4789 .. action redirect dev vxlan0

cool


> Also reject non tunnel flows that are offloaded to an egress dev.
> Non tunnel matches assume that the offload dev is the ingress port and
> offload a match accordingly.

not following on the "Also" here, see below


> diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c
> index a0193e0c24a0..f5d73b83dcc2 100644
> --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c
> +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c
> @@ -131,7 +131,8 @@ static bool nfp_flower_check_higher_than_mac(struct tc_cls_flower_offload *f)
>
>  static int
>  nfp_flower_calculate_key_layers(struct nfp_fl_key_ls *ret_key_ls,
> -                               struct tc_cls_flower_offload *flow)
> +                               struct tc_cls_flower_offload *flow,
> +                               bool egress)
>  {
>         struct flow_dissector_key_basic *mask_basic = NULL;
>         struct flow_dissector_key_basic *key_basic = NULL;
> @@ -167,6 +168,9 @@ nfp_flower_calculate_key_layers(struct nfp_fl_key_ls *ret_key_ls,
>                         skb_flow_dissector_target(flow->dissector,
>                                                   FLOW_DISSECTOR_KEY_ENC_CONTROL,
>                                                   flow->key);
> +               if (!egress)
> +                       return -EOPNOTSUPP;
> +
>                 if (mask_enc_ctl->addr_type != 0xffff ||
>                     enc_ctl->addr_type != FLOW_DISSECTOR_KEY_IPV4_ADDRS)
>                         return -EOPNOTSUPP;
> @@ -194,6 +198,9 @@ nfp_flower_calculate_key_layers(struct nfp_fl_key_ls *ret_key_ls,
>
>                 key_layer |= NFP_FLOWER_LAYER_VXLAN;
>                 key_size += sizeof(struct nfp_flower_vxlan);
> +       } else if (egress) {
> +               /* Reject non tunnel matches offloaded to egress repr. */
> +               return -EOPNOTSUPP;
>         }

with these two hunks we get: egress <- IFF -> encap match, right?

(1) we can't offload the egress way if there isn't matching on encap headers
(2) we can't go the matching on encap headers way if we are not egress

what other cases are rejected by this logic?

e.g. if we add a rule with a SW device (veth, tap) being the ingress, and
HW device (vf rep)
being the egress while not using skip_sw (just no flags == both) we
get the TC stack
go along the egdev callback from the vf rep hw device and add an
(uplink --> vf rep) rule
which will not be rejected if there is matching on tunnel headers, it
will also not be rejected
by some driver logic such as the one we discussed to identify and ignore
rules that are attempted to be added twice.

Or.

^ permalink raw reply

* Re: [PATCH 10/10] net: New ax88796 platform driver for Amiga X-Surf 100 Zorro board (m68k)
From: Geert Uytterhoeven @ 2018-04-18  7:54 UTC (permalink / raw)
  To: Michael Schmitz; +Cc: netdev, Linux/m68k, Michael Karcher, Michael Karcher
In-Reply-To: <CAOmrzkJVPn+zpaJTGhCpc=sSSZhBtbO_2BTGaVDpmNVm+VzAEg@mail.gmail.com>

Hi Michael,

On Wed, Apr 18, 2018 at 12:35 AM, Michael Schmitz <schmitzmic@gmail.com> wrote:
> On Wed, Apr 18, 2018 at 1:53 AM, Geert Uytterhoeven
> <geert@linux-m68k.org> wrote:
>> On Tue, Apr 17, 2018 at 12:04 AM, Michael Schmitz <schmitzmic@gmail.com> wrote:
>>> Add platform device driver to populate the ax88796 platform data from
>>> information provided by the XSurf100 zorro device driver.
>>> This driver will have to be loaded before loading the ax88796 module,
>>> or compiled as built-in.

>>> --- /dev/null
>>> +++ b/drivers/net/ethernet/8390/xsurf100.c

>>> +static int xsurf100_probe(struct zorro_dev *zdev,
>>> +                         const struct zorro_device_id *ent)
>>> +{
>>
>>> +       /* error handling for ioremap regs */
>>> +       if (!ax88796_data.base_regs) {
>>> +               dev_err(&zdev->dev, "Cannot ioremap area %p (registers)\n",
>>> +                       (void *)zdev->resource.start);
>>
>> Please use %pR to format struct resource.
>> Documentation/core-api/printk-formats.rst
>
> The driver uses ioremap to map two subsections of the mem resource for
> two different purposes - control register access, and ring buffer
> access. The output of %pR may be misleading here (wrong size), and
> even more so below.

Sorry, I missed it's the same resource.

FWIW, if you want to print a phys_addr_t or resource_size_t, you can
use %pa, e.g.

   dev_err(..., "... %pa ...", ... &zdev->resource.start ...);

>>> +       /* error handling for ioremap data */
>>> +       if (!ax88796_data.data_area) {
>>> +               dev_err(&zdev->dev, "Cannot ioremap area %p (32-bit access)\n",
>>> +                       (void *)zdev->resource.start + XS100_8390_DATA32_BASE);
>>
>> %pR
>
> I've added the offset into the mem resource here to clarify what we've
> tried to map.

That's an alternative solution, fine for me.

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: tcp hang when socket fills up ?
From: Jozsef Kadlecsik @ 2018-04-18  8:13 UTC (permalink / raw)
  To: Florian Westphal
  Cc: Michal Kubecek, netdev, Marcelo Ricardo Leitner, Eric Dumazet
In-Reply-To: <20180417132941.cutzhgbrveatrdsp@breakpoint.cc>

Hi,

On Tue, 17 Apr 2018, Florian Westphal wrote:

> Dominique Martinet <asmadeus@codewreck.org> wrote:
> 
> [ CC Jozsef ]
> 
> > Could it have something to do with the way I setup the connection?
> > I don't think the "both remotes call connect() with carefully selected
> > source/dest port" is a very common case..
> > 
> > If you look at the tcpdump outputs I attached the sequence usually is
> > something like
> >  server > client SYN
> >  client > server SYN
> >  server > client SYNACK
> >  client > server ACK
> > 
> > ultimately it IS a connection, but with an extra SYN packet in front of
> > it (that first SYN opens up the conntrack of the nat so that the
> > client's syn can come in, the client's conntrack will be that of a
> > normal connection since its first SYN goes in directly after the
> > server's (it didn't see the server's SYN))
> > 
> > Looking at my logs again, I'm seeing the same as you:
> > 
> > This looks like the actual SYN/SYN/SYNACK/ACK:
> >  - 14.364090 seq=505004283 likely SYN coming out of server
> >  - 14.661731 seq=1913287797 on next line it says receiver
> > end=505004284 so likely the matching SYN from client
> > Which this time gets a proper SYNACK from server:
> > 14.662020 seq=505004283 ack=1913287798
> > And following final dataless ACK:
> > 14.687570 seq=1913287798 ack=505004284
> > 
> > Then as you point out some data ACK, where the scale poofs:
> > 14.688762 seq=1913287798 ack=505004284+(0) sack=505004284+(0) win=229 end=1913287819
> > 14.688793 tcp_in_window: sender end=1913287798 maxend=1913316998 maxwin=29312 scale=7 receiver end=505004284 maxend=505033596 maxwin=29200 scale=7
> > 14.688824 tcp_in_window: 
> > 14.688852 seq=1913287798 ack=505004284+(0) sack=505004284+(0) win=229 end=1913287819
> > 14.688882 tcp_in_window: sender end=1913287819 maxend=1913287819 maxwin=229 scale=0 receiver end=505004284 maxend=505033596 maxwin=29200 scale=7
> >
> > As you say, only tcp_options() will clear only one side of the scales.
> > We don't have sender->td_maxwin == 0 (printed) so I see no other way
> > than we are in the last else if:
> >  - we have after(end, sender->td_end) (end=1913287819 > sender
> > end=1913287798)
> >  - I assume the tcp state machine must be confused because of the
> > SYN/SYN/SYNACK/ACK pattern and we probably enter the next check, 
> > but since this is a data packet it doesn't have the tcp option for scale
> > thus scale resets.
> 
> Yes, this looks correct. Jozsef, can you please have a look?
> 
> Problem seems to be that conntrack believes that ACK packet
> re-initializes the connection:
> 
>  595                 /*
>  596                  * RFC 793: "if a TCP is reinitialized ... then it need
>  597                  * not wait at all; it must only be sure to use sequence
>  598                  * numbers larger than those recently used."
>  599                  */
>  600                 sender->td_end =
>  601                 sender->td_maxend = end;
>  602                 sender->td_maxwin = (win == 0 ? 1 : win);
>  603 
>  604                 tcp_options(skb, dataoff, tcph, sender);
> 
> and last line clears the scale value (no wscale option in data packet).
> 
> 
> Transitions are:
>  server > client SYN          sNO -> sSS
>  client > server SYN          sSS -> sS2
>  server > client SYNACK       sS2 -> sSR /* here */
>  client > server ACK          sSR -> sES
> 
> SYN/ACK was observed in original direction so we hit
> state->state == TCP_CONNTRACK_SYN_RECV && dir == IP_CT_DIR_REPLY test
> when we see the ack packet and end up in the 'TCP is reinitialized' branch.
> 
> AFAICS, without this, connection would move to sES just fine,
> as the data ack is in window.

Yes, the state transition is wrong for simultaneous open, because the 
tcp_conntracks table is not (cannot be) smart enough. Could you verify the 
next untested patch?

diff --git a/include/uapi/linux/netfilter/nf_conntrack_tcp.h b/include/uapi/linux/netfilter/nf_conntrack_tcp.h
index 74b9115..bcba72d 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_tcp.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_tcp.h
@@ -46,6 +46,9 @@ enum tcp_conntrack {
 /* Marks possibility for expected RFC5961 challenge ACK */
 #define IP_CT_EXP_CHALLENGE_ACK 		0x40
 
+/* Simultaneous open initialized */
+#define IP_CT_TCP_SIMULTANEOUS_OPEN		0x80
+
 struct nf_ct_tcp_flags {
 	__u8 flags;
 	__u8 mask;
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index e97cdc1..8e67910 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -981,6 +981,17 @@ static int tcp_packet(struct nf_conn *ct,
 			return NF_ACCEPT; /* Don't change state */
 		}
 		break;
+	case TCP_CONNTRACK_SYN_SENT2:
+		/* tcp_conntracks table is not smart enough to handle
+		 * simultaneous open.
+		 */
+		ct->proto.tcp.last_flags |= IP_CT_TCP_SIMULTANEOUS_OPEN;
+		break;
+	case TCP_CONNTRACK_SYN_RECV:
+		if (dir == IP_CT_DIR_REPLY && index == TCP_ACK_SET &&
+		    ct->proto.tcp.last_flags & IP_CT_TCP_SIMULTANEOUS_OPEN)
+			new_state = TCP_CONNTRACK_ESTABLISHED;
+		break;
 	case TCP_CONNTRACK_CLOSE:
 		if (index == TCP_RST_SET
 		    && (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET)

Best regards,
Jozsef
-
E-mail  : kadlec@blackhole.kfki.hu, kadlecsik.jozsef@wigner.mta.hu
PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt
Address : Wigner Research Centre for Physics, Hungarian Academy of Sciences
          H-1525 Budapest 114, POB. 49, Hungary

^ permalink raw reply related

* Re: tcp hang when socket fills up ?
From: Dominique Martinet @ 2018-04-18  8:30 UTC (permalink / raw)
  To: Jozsef Kadlecsik
  Cc: Florian Westphal, Michal Kubecek, netdev, Marcelo Ricardo Leitner,
	Eric Dumazet
In-Reply-To: <alpine.DEB.2.11.1804181007430.4316@blackhole.kfki.hu>

Jozsef Kadlecsik wrote on Wed, Apr 18, 2018:
> Yes, the state transition is wrong for simultaneous open, because the 
> tcp_conntracks table is not (cannot be) smart enough. Could you verify the 
> next untested patch?

Thanks for the patch; I'll give it a try (probably won't make it today
so will report tomorrow)

-- 
Dominique Martinet | Asmadeus

^ permalink raw reply

* [PATCH net] net: mvpp2: Fix DMA address mask size
From: Maxime Chevallier @ 2018-04-18  9:14 UTC (permalink / raw)
  To: davem
  Cc: Maxime Chevallier, netdev, linux-kernel, Antoine Tenart,
	thomas.petazzoni, gregory.clement, miquel.raynal, nadavh, stefanc,
	ymarkman, mw

PPv2 TX/RX descriptors use 40-bit DMA addresses, but 41-bit masks were
used (GENMASK_ULL(40, 0)).

This commit fixes that by using the correct mask.

Fixes: e7c5359f2eed ("net: mvpp2: introduce PPv2.2 HW descriptors and adapt accessors")
Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
---
 drivers/net/ethernet/marvell/mvpp2.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
index 9deb79b6dcc8..4202f9b5b966 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -916,6 +916,8 @@ static struct {
 
 #define MVPP2_MIB_COUNTERS_STATS_DELAY		(1 * HZ)
 
+#define MVPP2_DESC_DMA_MASK	DMA_BIT_MASK(40)
+
 /* Definitions */
 
 /* Shared Packet Processor resources */
@@ -1429,7 +1431,7 @@ static dma_addr_t mvpp2_txdesc_dma_addr_get(struct mvpp2_port *port,
 	if (port->priv->hw_version == MVPP21)
 		return tx_desc->pp21.buf_dma_addr;
 	else
-		return tx_desc->pp22.buf_dma_addr_ptp & GENMASK_ULL(40, 0);
+		return tx_desc->pp22.buf_dma_addr_ptp & MVPP2_DESC_DMA_MASK;
 }
 
 static void mvpp2_txdesc_dma_addr_set(struct mvpp2_port *port,
@@ -1447,7 +1449,7 @@ static void mvpp2_txdesc_dma_addr_set(struct mvpp2_port *port,
 	} else {
 		u64 val = (u64)addr;
 
-		tx_desc->pp22.buf_dma_addr_ptp &= ~GENMASK_ULL(40, 0);
+		tx_desc->pp22.buf_dma_addr_ptp &= ~MVPP2_DESC_DMA_MASK;
 		tx_desc->pp22.buf_dma_addr_ptp |= val;
 		tx_desc->pp22.packet_offset = offset;
 	}
@@ -1507,7 +1509,7 @@ static dma_addr_t mvpp2_rxdesc_dma_addr_get(struct mvpp2_port *port,
 	if (port->priv->hw_version == MVPP21)
 		return rx_desc->pp21.buf_dma_addr;
 	else
-		return rx_desc->pp22.buf_dma_addr_key_hash & GENMASK_ULL(40, 0);
+		return rx_desc->pp22.buf_dma_addr_key_hash & MVPP2_DESC_DMA_MASK;
 }
 
 static unsigned long mvpp2_rxdesc_cookie_get(struct mvpp2_port *port,
@@ -1516,7 +1518,7 @@ static unsigned long mvpp2_rxdesc_cookie_get(struct mvpp2_port *port,
 	if (port->priv->hw_version == MVPP21)
 		return rx_desc->pp21.buf_cookie;
 	else
-		return rx_desc->pp22.buf_cookie_misc & GENMASK_ULL(40, 0);
+		return rx_desc->pp22.buf_cookie_misc & MVPP2_DESC_DMA_MASK;
 }
 
 static size_t mvpp2_rxdesc_size_get(struct mvpp2_port *port,
@@ -8789,7 +8791,7 @@ static int mvpp2_probe(struct platform_device *pdev)
 	}
 
 	if (priv->hw_version == MVPP22) {
-		err = dma_set_mask(&pdev->dev, DMA_BIT_MASK(40));
+		err = dma_set_mask(&pdev->dev, MVPP2_DESC_DMA_MASK);
 		if (err)
 			goto err_mg_clk;
 		/* Sadly, the BM pools all share the same register to
-- 
2.11.0

^ permalink raw reply related

* Re: [RFC PATCH net-next v6 2/4] net: Introduce generic bypass module
From: Jiri Pirko @ 2018-04-18  9:25 UTC (permalink / raw)
  To: Samudrala, Sridhar
  Cc: mst, stephen, davem, netdev, virtualization, virtio-dev,
	jesse.brandeburg, alexander.h.duyck, kubakici, jasowang,
	loseweigh
In-Reply-To: <6a8c1ff5-153a-e40a-91b3-48532b8d3a38@intel.com>

Wed, Apr 11, 2018 at 09:13:52PM CEST, sridhar.samudrala@intel.com wrote:
>On 4/11/2018 8:51 AM, Jiri Pirko wrote:
>> Tue, Apr 10, 2018 at 08:59:48PM CEST, sridhar.samudrala@intel.com wrote:
>> > This provides a generic interface for paravirtual drivers to listen
>> > for netdev register/unregister/link change events from pci ethernet
>> > devices with the same MAC and takeover their datapath. The notifier and
>> > event handling code is based on the existing netvsc implementation.
>> > 
>> > It exposes 2 sets of interfaces to the paravirtual drivers.
>> > 1. existing netvsc driver that uses 2 netdev model. In this model, no
>> > master netdev is created. The paravirtual driver registers each bypass
>> > instance along with a set of ops to manage the slave events.
>> >      bypass_master_register()
>> >      bypass_master_unregister()
>> > 2. new virtio_net based solution that uses 3 netdev model. In this model,
>> > the bypass module provides interfaces to create/destroy additional master
>> > netdev and all the slave events are managed internally.
>> >       bypass_master_create()
>> >       bypass_master_destroy()
>> > 
>> > Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>> > ---
>> > include/linux/netdevice.h |  14 +
>> > include/net/bypass.h      |  96 ++++++
>> > net/Kconfig               |  18 +
>> > net/core/Makefile         |   1 +
>> > net/core/bypass.c         | 844 ++++++++++++++++++++++++++++++++++++++++++++++
>> > 5 files changed, 973 insertions(+)
>> > create mode 100644 include/net/bypass.h
>> > create mode 100644 net/core/bypass.c
>> > 
>> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>> > index cf44503ea81a..587293728f70 100644
>> > --- a/include/linux/netdevice.h
>> > +++ b/include/linux/netdevice.h
>> > @@ -1430,6 +1430,8 @@ enum netdev_priv_flags {
>> > 	IFF_PHONY_HEADROOM		= 1<<24,
>> > 	IFF_MACSEC			= 1<<25,
>> > 	IFF_NO_RX_HANDLER		= 1<<26,
>> > +	IFF_BYPASS			= 1 << 27,
>> > +	IFF_BYPASS_SLAVE		= 1 << 28,
>> I wonder why you don't follow the existing coding style... Also, please
>> add these into the comment above.
>
>To avoid checkpatch warnings. If it is OK to ignore these warnings, I can switch back
>to the existing coding style to be consistent.

Please do.


>
>> 
>> 
>> > };
>> > 
>> > #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
>> > @@ -1458,6 +1460,8 @@ enum netdev_priv_flags {
>> > #define IFF_RXFH_CONFIGURED		IFF_RXFH_CONFIGURED
>> > #define IFF_MACSEC			IFF_MACSEC
>> > #define IFF_NO_RX_HANDLER		IFF_NO_RX_HANDLER
>> > +#define IFF_BYPASS			IFF_BYPASS
>> > +#define IFF_BYPASS_SLAVE		IFF_BYPASS_SLAVE
>> > 
>> > /**
>> >   *	struct net_device - The DEVICE structure.
>> > @@ -4308,6 +4312,16 @@ static inline bool netif_is_rxfh_configured(const struct net_device *dev)
>> > 	return dev->priv_flags & IFF_RXFH_CONFIGURED;
>> > }
>> > 
>> > +static inline bool netif_is_bypass_master(const struct net_device *dev)
>> > +{
>> > +	return dev->priv_flags & IFF_BYPASS;
>> > +}
>> > +
>> > +static inline bool netif_is_bypass_slave(const struct net_device *dev)
>> > +{
>> > +	return dev->priv_flags & IFF_BYPASS_SLAVE;
>> > +}
>> > +
>> > /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */
>> > static inline void netif_keep_dst(struct net_device *dev)
>> > {
>> > diff --git a/include/net/bypass.h b/include/net/bypass.h
>> > new file mode 100644
>> > index 000000000000..86b02cb894cf
>> > --- /dev/null
>> > +++ b/include/net/bypass.h
>> > @@ -0,0 +1,96 @@
>> > +// SPDX-License-Identifier: GPL-2.0
>> > +/* Copyright (c) 2018, Intel Corporation. */
>> > +
>> > +#ifndef _NET_BYPASS_H
>> > +#define _NET_BYPASS_H
>> > +
>> > +#include <linux/netdevice.h>
>> > +
>> > +struct bypass_ops {
>> > +	int (*slave_pre_register)(struct net_device *slave_netdev,
>> > +				  struct net_device *bypass_netdev);
>> > +	int (*slave_join)(struct net_device *slave_netdev,
>> > +			  struct net_device *bypass_netdev);
>> > +	int (*slave_pre_unregister)(struct net_device *slave_netdev,
>> > +				    struct net_device *bypass_netdev);
>> > +	int (*slave_release)(struct net_device *slave_netdev,
>> > +			     struct net_device *bypass_netdev);
>> > +	int (*slave_link_change)(struct net_device *slave_netdev,
>> > +				 struct net_device *bypass_netdev);
>> > +	rx_handler_result_t (*handle_frame)(struct sk_buff **pskb);
>> > +};
>> > +
>> > +struct bypass_master {
>> > +	struct list_head list;
>> > +	struct net_device __rcu *bypass_netdev;
>> > +	struct bypass_ops __rcu *ops;
>> > +};
>> > +
>> > +/* bypass state */
>> > +struct bypass_info {
>> > +	/* passthru netdev with same MAC */
>> > +	struct net_device __rcu *active_netdev;
>> You still use "active"/"backup" names, which is highly misleading as
>> they have a completely different meaning than in bond, for example.
>> I noted that in my previous review already. Please change it.
>
>I guess the issue is with only the 'active'  name. 'backup' should be fine as it also
>matches with the BACKUP feature bit we are adding to virtio_net.

I think that "backup" is also misleading. Both "active" and "backup"
mean a *state* of slaves. This should be named differently.



>
>With regards to alternate names for 'active', you suggested 'stolen', but i
>am not too happy with it.
>netvsc uses vf_netdev, are you OK with this? Or another option is 'passthru'

No. The netdev could be any netdevice. It does not have to be a "VF".
I think "stolen" is quite appropriate since it describes the modus
operandi. The bypass master steals some netdevice according to some
match.

But I don't insist on "stolen". Just sounds right.



>
>
>
>> 
>> 
>> > +
>> > +	/* virtio_net netdev */
>> > +	struct net_device __rcu *backup_netdev;
>> > +
>> > +	/* active netdev stats */
>> > +	struct rtnl_link_stats64 active_stats;
>> > +
>> > +	/* backup netdev stats */
>> > +	struct rtnl_link_stats64 backup_stats;
>> > +
>> > +	/* aggregated stats */
>> > +	struct rtnl_link_stats64 bypass_stats;
>> > +
>> > +	/* spinlock while updating stats */
>> > +	spinlock_t stats_lock;
>> > +};
>> > +
>> > +#if IS_ENABLED(CONFIG_NET_BYPASS)
>> > +
>> > +int bypass_master_create(struct net_device *backup_netdev,
>> > +			 struct bypass_master **pbypass_master);
>> > +void bypass_master_destroy(struct bypass_master *bypass_master);
>> > +
>> > +int bypass_master_register(struct net_device *dev, struct bypass_ops *ops,
>> > +			   struct bypass_master **pbypass_master);
>> > +void bypass_master_unregister(struct bypass_master *bypass_master);
>> > +
>> > +int bypass_slave_unregister(struct net_device *slave_netdev);
>> > +
>> > +#else
>> > +
>> > +static inline
>> > +int bypass_master_create(struct net_device *backup_netdev,
>> > +			 struct bypass_master **pbypass_master);
>> > +{
>> > +	return 0;
>> > +}
>> > +
>> > +static inline
>> > +void bypass_master_destroy(struct bypass_master *bypass_master)
>> > +{
>> > +}
>> > +
>> > +static inline
>> > +int bypass_master_register(struct net_device *dev, struct bypass_ops *ops,
>> > +			   struct pbypass_master **pbypass_master);
>> > +{
>> > +	return 0;
>> > +}
>> > +
>> > +static inline
>> > +void bypass_master_unregister(struct bypass_master *bypass_master)
>> > +{
>> > +}
>> > +
>> > +static inline
>> > +int bypass_slave_unregister(struct net_device *slave_netdev)
>> > +{
>> > +	return 0;
>> > +}
>> > +
>> > +#endif
>> > +
>> > +#endif /* _NET_BYPASS_H */
>> > diff --git a/net/Kconfig b/net/Kconfig
>> > index 0428f12c25c2..994445f4a96a 100644
>> > --- a/net/Kconfig
>> > +++ b/net/Kconfig
>> > @@ -423,6 +423,24 @@ config MAY_USE_DEVLINK
>> > 	  on MAY_USE_DEVLINK to ensure they do not cause link errors when
>> > 	  devlink is a loadable module and the driver using it is built-in.
>> > 
>> > +config NET_BYPASS
>> > +	tristate "Bypass interface"
>> > +	---help---
>> > +	  This provides a generic interface for paravirtual drivers to listen
>> > +	  for netdev register/unregister/link change events from pci ethernet
>> > +	  devices with the same MAC and takeover their datapath. This also
>> > +	  enables live migration of a VM with direct attached VF by failing
>> > +	  over to the paravirtual datapath when the VF is unplugged.
>> > +
>> > +config MAY_USE_BYPASS
>> > +	tristate
>> > +	default m if NET_BYPASS=m
>> > +	default y if NET_BYPASS=y || NET_BYPASS=n
>> > +	help
>> > +	  Drivers using the bypass infrastructure should have a dependency
>> > +	  on MAY_USE_BYPASS to ensure they do not cause link errors when
>> > +	  bypass is a loadable module and the driver using it is built-in.
>> > +
>> > endif   # if NET
>> > 
>> > # Used by archs to tell that they support BPF JIT compiler plus which flavour.
>> > diff --git a/net/core/Makefile b/net/core/Makefile
>> > index 6dbbba8c57ae..a9727ed1c8fc 100644
>> > --- a/net/core/Makefile
>> > +++ b/net/core/Makefile
>> > @@ -30,3 +30,4 @@ obj-$(CONFIG_DST_CACHE) += dst_cache.o
>> > obj-$(CONFIG_HWBM) += hwbm.o
>> > obj-$(CONFIG_NET_DEVLINK) += devlink.o
>> > obj-$(CONFIG_GRO_CELLS) += gro_cells.o
>> > +obj-$(CONFIG_NET_BYPASS) += bypass.o
>> > diff --git a/net/core/bypass.c b/net/core/bypass.c
>> > new file mode 100644
>> > index 000000000000..b5b9cb554c3f
>> > --- /dev/null
>> > +++ b/net/core/bypass.c
>> > @@ -0,0 +1,844 @@
>> > +// SPDX-License-Identifier: GPL-2.0
>> > +/* Copyright (c) 2018, Intel Corporation. */
>> > +
>> > +/* A common module to handle registrations and notifications for paravirtual
>> > + * drivers to enable accelerated datapath and support VF live migration.
>> > + *
>> > + * The notifier and event handling code is based on netvsc driver.
>> > + */
>> > +
>> > +#include <linux/netdevice.h>
>> > +#include <linux/etherdevice.h>
>> > +#include <linux/ethtool.h>
>> > +#include <linux/module.h>
>> > +#include <linux/slab.h>
>> > +#include <linux/netdevice.h>
>> > +#include <linux/netpoll.h>
>> > +#include <linux/rtnetlink.h>
>> > +#include <linux/if_vlan.h>
>> > +#include <linux/pci.h>
>> > +#include <net/sch_generic.h>
>> > +#include <uapi/linux/if_arp.h>
>> > +#include <net/bypass.h>
>> > +
>> > +static LIST_HEAD(bypass_master_list);
>> > +static DEFINE_SPINLOCK(bypass_lock);
>> > +
>> > +static int bypass_slave_pre_register(struct net_device *slave_netdev,
>> > +				     struct net_device *bypass_netdev,
>> > +				     struct bypass_ops *bypass_ops)
>> > +{
>> > +	struct bypass_info *bi;
>> > +	bool backup;
>> > +
>> > +	if (bypass_ops) {
>> > +		if (!bypass_ops->slave_pre_register)
>> > +			return -EINVAL;
>> > +
>> > +		return bypass_ops->slave_pre_register(slave_netdev,
>> > +						      bypass_netdev);
>> > +	}
>> > +
>> > +	bi = netdev_priv(bypass_netdev);
>> > +	backup = (slave_netdev->dev.parent == bypass_netdev->dev.parent);
>> > +	if (backup ? rtnl_dereference(bi->backup_netdev) :
>> > +			rtnl_dereference(bi->active_netdev)) {
>> > +		netdev_err(bypass_netdev, "%s attempting to register as slave dev when %s already present\n",
>> > +			   slave_netdev->name, backup ? "backup" : "active");
>> > +		return -EEXIST;
>> > +	}
>> > +
>> > +	/* Avoid non pci devices as active netdev */
>> > +	if (!backup && (!slave_netdev->dev.parent ||
>> > +			!dev_is_pci(slave_netdev->dev.parent)))
>> > +		return -EINVAL;
>> > +
>> > +	return 0;
>> > +}
>> > +
>> > +static int bypass_slave_join(struct net_device *slave_netdev,
>> > +			     struct net_device *bypass_netdev,
>> > +			     struct bypass_ops *bypass_ops)
>> > +{
>> > +	struct bypass_info *bi;
>> > +	bool backup;
>> > +
>> > +	if (bypass_ops) {
>> > +		if (!bypass_ops->slave_join)
>> > +			return -EINVAL;
>> > +
>> > +		return bypass_ops->slave_join(slave_netdev, bypass_netdev);
>> > +	}
>> > +
>> > +	bi = netdev_priv(bypass_netdev);
>> > +	backup = (slave_netdev->dev.parent == bypass_netdev->dev.parent);
>> > +
>> > +	dev_hold(slave_netdev);
>> > +
>> > +	if (backup) {
>> > +		rcu_assign_pointer(bi->backup_netdev, slave_netdev);
>> > +		dev_get_stats(bi->backup_netdev, &bi->backup_stats);
>> > +	} else {
>> > +		rcu_assign_pointer(bi->active_netdev, slave_netdev);
>> > +		dev_get_stats(bi->active_netdev, &bi->active_stats);
>> > +		bypass_netdev->min_mtu = slave_netdev->min_mtu;
>> > +		bypass_netdev->max_mtu = slave_netdev->max_mtu;
>> > +	}
>> > +
>> > +	netdev_info(bypass_netdev, "bypass slave:%s joined\n",
>> > +		    slave_netdev->name);
>> > +
>> > +	return 0;
>> > +}
>> > +
>> > +/* Called when slave dev is injecting data into network stack.
>> > + * Change the associated network device from lower dev to virtio.
>> > + * note: already called with rcu_read_lock
>> > + */
>> > +static rx_handler_result_t bypass_handle_frame(struct sk_buff **pskb)
>> > +{
>> > +	struct sk_buff *skb = *pskb;
>> > +	struct net_device *ndev = rcu_dereference(skb->dev->rx_handler_data);
>> > +
>> > +	skb->dev = ndev;
>> > +
>> > +	return RX_HANDLER_ANOTHER;
>> > +}
>> > +
>> > +static struct net_device *bypass_master_get_bymac(u8 *mac,
>> > +						  struct bypass_ops **ops)
>> > +{
>> > +	struct bypass_master *bypass_master;
>> > +	struct net_device *bypass_netdev;
>> > +
>> > +	spin_lock(&bypass_lock);
>> > +	list_for_each_entry(bypass_master, &bypass_master_list, list) {
>> As I wrote the last time, you don't need this list, spinlock.
>> You can do just something like:
>>          for_each_net(net) {
>>                  for_each_netdev(net, dev) {
>> 			if (netif_is_bypass_master(dev)) {
>
>This function returns the upper netdev as well as the ops associated
>with that netdev.
>bypass_master_list is a list of 'struct bypass_master' that associates

Well, can't you have it in netdev priv?


>'bypass_netdev' with 'bypass_ops' and gets added via bypass_master_register().
>We need 'ops' only to support the 2 netdev model of netvsc. ops will be
>NULL for 3-netdev model.

I see :(


>
>
>> 
>> 
>> 
>> 
>> > +		bypass_netdev = rcu_dereference(bypass_master->bypass_netdev);
>> > +		if (ether_addr_equal(bypass_netdev->perm_addr, mac)) {
>> > +			*ops = rcu_dereference(bypass_master->ops);
>> I don't see how rcu_dereference is ok here.
>> 1) I don't see rcu_read_lock taken
>> 2) Looks like bypass_master->ops has the same value across the whole
>>     existence.
>
>We hold rtnl_lock(), i think i need to change this to rtnl_dereference.
>Yes. ops doesn't change.

If it does not change, you can just access it directly.


>
>> 
>> 
>> > +			spin_unlock(&bypass_lock);
>> > +			return bypass_netdev;
>> > +		}
>> > +	}
>> > +	spin_unlock(&bypass_lock);
>> > +	return NULL;
>> > +}
>> > +
>> > +static int bypass_slave_register(struct net_device *slave_netdev)
>> > +{
>> > +	struct net_device *bypass_netdev;
>> > +	struct bypass_ops *bypass_ops;
>> > +	int ret, orig_mtu;
>> > +
>> > +	ASSERT_RTNL();
>> > +
>> > +	bypass_netdev = bypass_master_get_bymac(slave_netdev->perm_addr,
>> > +						&bypass_ops);
>> For master, could you use word "master" in the variables so it is clear?
>> Also, "dev" is fine instead of "netdev".
>> Something like "bpmaster_dev"
>
>bypass_master is of  type struct bypass_master,  bypass_netdev is of type struct net_device.

I was trying to point out, that "bypass_netdev" represents a "master"
netdev, yet it does not say master. That is why I suggested
"bpmaster_dev"


>I can change all _netdev suffixes to _dev to make the names shorter.

ok.


>
>
>> 
>> 
>> > +	if (!bypass_netdev)
>> > +		goto done;
>> > +
>> > +	ret = bypass_slave_pre_register(slave_netdev, bypass_netdev,
>> > +					bypass_ops);
>> > +	if (ret != 0)
>> 	Just "if (ret)" will do. You have this in more places.
>
>OK.
>
>
>> 
>> 
>> > +		goto done;
>> > +
>> > +	ret = netdev_rx_handler_register(slave_netdev,
>> > +					 bypass_ops ? bypass_ops->handle_frame :
>> > +					 bypass_handle_frame, bypass_netdev);
>> > +	if (ret != 0) {
>> > +		netdev_err(slave_netdev, "can not register bypass rx handler (err = %d)\n",
>> > +			   ret);
>> > +		goto done;
>> > +	}
>> > +
>> > +	ret = netdev_upper_dev_link(slave_netdev, bypass_netdev, NULL);
>> > +	if (ret != 0) {
>> > +		netdev_err(slave_netdev, "can not set master device %s (err = %d)\n",
>> > +			   bypass_netdev->name, ret);
>> > +		goto upper_link_failed;
>> > +	}
>> > +
>> > +	slave_netdev->priv_flags |= IFF_BYPASS_SLAVE;
>> > +
>> > +	if (netif_running(bypass_netdev)) {
>> > +		ret = dev_open(slave_netdev);
>> > +		if (ret && (ret != -EBUSY)) {
>> > +			netdev_err(bypass_netdev, "Opening slave %s failed ret:%d\n",
>> > +				   slave_netdev->name, ret);
>> > +			goto err_interface_up;
>> > +		}
>> > +	}
>> > +
>> > +	/* Align MTU of slave with master */
>> > +	orig_mtu = slave_netdev->mtu;
>> > +	ret = dev_set_mtu(slave_netdev, bypass_netdev->mtu);
>> > +	if (ret != 0) {
>> > +		netdev_err(bypass_netdev, "unable to change mtu of %s to %u register failed\n",
>> > +			   slave_netdev->name, bypass_netdev->mtu);
>> > +		goto err_set_mtu;
>> > +	}
>> > +
>> > +	ret = bypass_slave_join(slave_netdev, bypass_netdev, bypass_ops);
>> > +	if (ret != 0)
>> > +		goto err_join;
>> > +
>> > +	call_netdevice_notifiers(NETDEV_JOIN, slave_netdev);
>> > +
>> > +	netdev_info(bypass_netdev, "bypass slave:%s registered\n",
>> > +		    slave_netdev->name);
>> > +
>> > +	goto done;
>> > +
>> > +err_join:
>> > +	dev_set_mtu(slave_netdev, orig_mtu);
>> > +err_set_mtu:
>> > +	dev_close(slave_netdev);
>> > +err_interface_up:
>> > +	netdev_upper_dev_unlink(slave_netdev, bypass_netdev);
>> > +	slave_netdev->priv_flags &= ~IFF_BYPASS_SLAVE;
>> > +upper_link_failed:
>> > +	netdev_rx_handler_unregister(slave_netdev);
>> > +done:
>> > +	return NOTIFY_DONE;
>> > +}
>> > +
>> > +static int bypass_slave_pre_unregister(struct net_device *slave_netdev,
>> > +				       struct net_device *bypass_netdev,
>> > +				       struct bypass_ops *bypass_ops)
>> > +{
>> > +	struct net_device *backup_netdev, *active_netdev;
>> > +	struct bypass_info *bi;
>> > +
>> > +	if (bypass_ops) {
>> > +		if (!bypass_ops->slave_pre_unregister)
>> > +			return -EINVAL;
>> > +
>> > +		return bypass_ops->slave_pre_unregister(slave_netdev,
>> > +							bypass_netdev);
>> > +	}
>> > +
>> > +	bi = netdev_priv(bypass_netdev);
>> > +	active_netdev = rtnl_dereference(bi->active_netdev);
>> > +	backup_netdev = rtnl_dereference(bi->backup_netdev);
>> > +
>> > +	if (slave_netdev != active_netdev && slave_netdev != backup_netdev)
>> > +		return -EINVAL;
>> > +
>> > +	return 0;
>> > +}
>> > +
>> > +static int bypass_slave_release(struct net_device *slave_netdev,
>> > +				struct net_device *bypass_netdev,
>> > +				struct bypass_ops *bypass_ops)
>> > +{
>> > +	struct net_device *backup_netdev, *active_netdev;
>> > +	struct bypass_info *bi;
>> > +
>> > +	if (bypass_ops) {
>> > +		if (!bypass_ops->slave_release)
>> > +			return -EINVAL;
>> I think it would be good to make the API to the driver more strict and
>> have a separate set of ops for "active" and "backup" netdevices.
>> That should stop people thinking about extending this to more slaves in
>> the future.
>
>We have checks in slave_pre_register() that allows only 1 'backup' and 1
>'active' slave.

I'm very well aware of that. I just thought that explicit ops for the
two slaves would make this more clear.


>
>
>> 
>> 
>> 
>> > +
>> > +		return bypass_ops->slave_release(slave_netdev, bypass_netdev);
>> > +	}
>> > +
>> > +	bi = netdev_priv(bypass_netdev);
>> > +	active_netdev = rtnl_dereference(bi->active_netdev);
>> > +	backup_netdev = rtnl_dereference(bi->backup_netdev);
>> > +
>> > +	if (slave_netdev == backup_netdev) {
>> > +		RCU_INIT_POINTER(bi->backup_netdev, NULL);
>> > +	} else {
>> > +		RCU_INIT_POINTER(bi->active_netdev, NULL);
>> > +		if (backup_netdev) {
>> > +			bypass_netdev->min_mtu = backup_netdev->min_mtu;
>> > +			bypass_netdev->max_mtu = backup_netdev->max_mtu;
>> > +		}
>> > +	}
>> > +
>> > +	dev_put(slave_netdev);
>> > +
>> > +	netdev_info(bypass_netdev, "bypass slave:%s released\n",
>> > +		    slave_netdev->name);
>> > +
>> > +	return 0;
>> > +}
>> > +
>> > +int bypass_slave_unregister(struct net_device *slave_netdev)
>> > +{
>> > +	struct net_device *bypass_netdev;
>> > +	struct bypass_ops *bypass_ops;
>> > +	int ret;
>> > +
>> > +	if (!netif_is_bypass_slave(slave_netdev))
>> > +		goto done;
>> > +
>> > +	ASSERT_RTNL();
>> > +
>> > +	bypass_netdev = bypass_master_get_bymac(slave_netdev->perm_addr,
>> > +						&bypass_ops);
>> > +	if (!bypass_netdev)
>> > +		goto done;
>> > +
>> > +	ret = bypass_slave_pre_unregister(slave_netdev, bypass_netdev,
>> > +					  bypass_ops);
>> > +	if (ret != 0)
>> > +		goto done;
>> > +
>> > +	netdev_rx_handler_unregister(slave_netdev);
>> > +	netdev_upper_dev_unlink(slave_netdev, bypass_netdev);
>> > +	slave_netdev->priv_flags &= ~IFF_BYPASS_SLAVE;
>> > +
>> > +	bypass_slave_release(slave_netdev, bypass_netdev, bypass_ops);
>> > +
>> > +	netdev_info(bypass_netdev, "bypass slave:%s unregistered\n",
>> > +		    slave_netdev->name);
>> > +
>> > +done:
>> > +	return NOTIFY_DONE;
>> > +}
>> > +EXPORT_SYMBOL_GPL(bypass_slave_unregister);
>> > +
>> > +static bool bypass_xmit_ready(struct net_device *dev)
>> > +{
>> > +	return netif_running(dev) && netif_carrier_ok(dev);
>> > +}
>> > +
>> > +static int bypass_slave_link_change(struct net_device *slave_netdev)
>> > +{
>> > +	struct net_device *bypass_netdev, *active_netdev, *backup_netdev;
>> > +	struct bypass_ops *bypass_ops;
>> > +	struct bypass_info *bi;
>> > +
>> > +	if (!netif_is_bypass_slave(slave_netdev))
>> > +		goto done;
>> > +
>> > +	ASSERT_RTNL();
>> > +
>> > +	bypass_netdev = bypass_master_get_bymac(slave_netdev->perm_addr,
>> > +						&bypass_ops);
>> > +	if (!bypass_netdev)
>> > +		goto done;
>> > +
>> > +	if (bypass_ops) {
>> > +		if (!bypass_ops->slave_link_change)
>> > +			goto done;
>> > +
>> > +		return bypass_ops->slave_link_change(slave_netdev,
>> > +						     bypass_netdev);
>> > +	}
>> > +
>> > +	if (!netif_running(bypass_netdev))
>> > +		return 0;
>> > +
>> > +	bi = netdev_priv(bypass_netdev);
>> > +
>> > +	active_netdev = rtnl_dereference(bi->active_netdev);
>> > +	backup_netdev = rtnl_dereference(bi->backup_netdev);
>> > +
>> > +	if (slave_netdev != active_netdev && slave_netdev != backup_netdev)
>> > +		goto done;
>> You don't need this check. "if (!netif_is_bypass_slave(slave_netdev))"
>> above is enough.
>
>I think we need this check to not allow events from a slave that is not
>attached to this master but has the same MAC.

Why do we need such events? Seems wrong to me. Consider:

bp1      bp2
a1 b1    a2 b2


a1 and a2 have the same mac and bp1 and bp2 have the same mac.
Now bypass_master_get_bymac() will always return bp1 or bp2 - depending on
the order of creation.
Let's say it will return bp1. Then when we have event for a2, the
bypass_ops->slave_link_change is called with (a2, bp1). That is wrong.


You cannot use bypass_master_get_bymac() here.



>
>> 
>> 
>> > +
>> > +	if ((active_netdev && bypass_xmit_ready(active_netdev)) ||
>> > +	    (backup_netdev && bypass_xmit_ready(backup_netdev))) {
>> > +		netif_carrier_on(bypass_netdev);
>> > +		netif_tx_wake_all_queues(bypass_netdev);
>> > +	} else {
>> > +		netif_carrier_off(bypass_netdev);
>> > +		netif_tx_stop_all_queues(bypass_netdev);
>> > +	}
>> > +
>> > +done:
>> > +	return NOTIFY_DONE;
>> > +}
>> > +
>> > +static bool bypass_validate_event_dev(struct net_device *dev)
>> > +{
>> > +	/* Skip parent events */
>> > +	if (netif_is_bypass_master(dev))
>> > +		return false;
>> > +
>> > +	/* Avoid non-Ethernet type devices */
>> > +	if (dev->type != ARPHRD_ETHER)
>> > +		return false;
>> > +
>> > +	/* Avoid Vlan dev with same MAC registering as VF */
>> > +	if (is_vlan_dev(dev))
>> > +		return false;
>> > +
>> > +	/* Avoid Bonding master dev with same MAC registering as slave dev */
>> > +	if ((dev->priv_flags & IFF_BONDING) && (dev->flags & IFF_MASTER))
>> Yeah, this is certainly incorrect. One thing is, you should be using the
>> helpers netif_is_bond_master().
>> But what about the rest? macsec, macvlan, team, bridge, ovs and others?
>> 
>> You need to do it not by blacklisting, but with whitelisting. You need
>> to whitelist VF devices. My port flavours patchset might help with this.
>
>May be i can use netdev_has_lower_dev() helper to make sure that the slave

I don't see such function in the code.


>device is not an upper dev.
>Can you point to your port flavours patchset? Is it upstream?

I sent an RFC a couple of weeks ago:
[patch net-next RFC 00/12] devlink: introduce port flavours and common phys_port_name generation


>
>> 
>> 
>> > +		return false;
>> > +
>> > +	return true;
>> > +}
>> > +
>> > +static int
>> > +bypass_event(struct notifier_block *this, unsigned long event, void *ptr)
>> > +{
>> > +	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
>> > +
>> > +	if (!bypass_validate_event_dev(event_dev))
>> > +		return NOTIFY_DONE;
>> > +
>> > +	switch (event) {
>> > +	case NETDEV_REGISTER:
>> > +		return bypass_slave_register(event_dev);
>> > +	case NETDEV_UNREGISTER:
>> > +		return bypass_slave_unregister(event_dev);
>> > +	case NETDEV_UP:
>> > +	case NETDEV_DOWN:
>> > +	case NETDEV_CHANGE:
>> > +		return bypass_slave_link_change(event_dev);
>> > +	default:
>> > +		return NOTIFY_DONE;
>> > +	}
>> > +}
>> > +
>> > +static struct notifier_block bypass_notifier = {
>> > +	.notifier_call = bypass_event,
>> > +};
>> > +
>> > +int bypass_open(struct net_device *dev)
>> > +{
>> > +	struct bypass_info *bi = netdev_priv(dev);
>> > +	struct net_device *active_netdev, *backup_netdev;
>> > +	int err;
>> > +
>> > +	netif_carrier_off(dev);
>> > +	netif_tx_wake_all_queues(dev);
>> > +
>> > +	active_netdev = rtnl_dereference(bi->active_netdev);
>> > +	if (active_netdev) {
>> > +		err = dev_open(active_netdev);
>> > +		if (err)
>> > +			goto err_active_open;
>> > +	}
>> > +
>> > +	backup_netdev = rtnl_dereference(bi->backup_netdev);
>> > +	if (backup_netdev) {
>> > +		err = dev_open(backup_netdev);
>> > +		if (err)
>> > +			goto err_backup_open;
>> > +	}
>> > +
>> > +	return 0;
>> > +
>> > +err_backup_open:
>> > +	dev_close(active_netdev);
>> > +err_active_open:
>> > +	netif_tx_disable(dev);
>> > +	return err;
>> > +}
>> > +EXPORT_SYMBOL_GPL(bypass_open);
>> > +
>> > +int bypass_close(struct net_device *dev)
>> > +{
>> > +	struct bypass_info *vi = netdev_priv(dev);
>> This should be probably "bi"
>
>Yes.
>
>
>> 
>> 
>> > +	struct net_device *slave_netdev;
>> > +
>> > +	netif_tx_disable(dev);
>> > +
>> > +	slave_netdev = rtnl_dereference(vi->active_netdev);
>> > +	if (slave_netdev)
>> > +		dev_close(slave_netdev);
>> > +
>> > +	slave_netdev = rtnl_dereference(vi->backup_netdev);
>> > +	if (slave_netdev)
>> > +		dev_close(slave_netdev);
>> > +
>> > +	return 0;
>> > +}
>> > +EXPORT_SYMBOL_GPL(bypass_close);
>> > +
>> > +static netdev_tx_t bypass_drop_xmit(struct sk_buff *skb, struct net_device *dev)
>> > +{
>> > +	atomic_long_inc(&dev->tx_dropped);
>> > +	dev_kfree_skb_any(skb);
>> > +	return NETDEV_TX_OK;
>> > +}
>> > +
>> > +netdev_tx_t bypass_start_xmit(struct sk_buff *skb, struct net_device *dev)
>> > +{
>> > +	struct bypass_info *bi = netdev_priv(dev);
>> If you rename the other variable to "bpmaster_dev", it would be nice to
>> rename this to bpinfo or something more descriptive. "bi" is too short
>> to know what that is right away.
>
>Will rename bypass_netdev to bypass_dev. bypass indicates that it is
>an upper master dev.
>
>
>> 
>> 
>> > +	struct net_device *xmit_dev;
>> Don't mix "dev" and "netdev" in one .c file. Just use "dev" for all.
>
>OK.
>
>
>> 
>> 
>> 
>> > +
>> > +	/* Try xmit via active netdev followed by backup netdev */
>> > +	xmit_dev = rcu_dereference_bh(bi->active_netdev);
>> > +	if (!xmit_dev || !bypass_xmit_ready(xmit_dev)) {
>> > +		xmit_dev = rcu_dereference_bh(bi->backup_netdev);
>> > +		if (!xmit_dev || !bypass_xmit_ready(xmit_dev))
>> > +			return bypass_drop_xmit(skb, dev);
>> > +	}
>> > +
>> > +	skb->dev = xmit_dev;
>> > +	skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
>> > +
>> > +	return dev_queue_xmit(skb);
>> > +}
>> > +EXPORT_SYMBOL_GPL(bypass_start_xmit);
>> > +
>> > +u16 bypass_select_queue(struct net_device *dev, struct sk_buff *skb,
>> > +			void *accel_priv, select_queue_fallback_t fallback)
>> > +{
>> > +	/* This helper function exists to help dev_pick_tx get the correct
>> > +	 * destination queue.  Using a helper function skips a call to
>> > +	 * skb_tx_hash and will put the skbs in the queue we expect on their
>> > +	 * way down to the bonding driver.
>> > +	 */
>> > +	u16 txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0;
>> > +
>> > +	/* Save the original txq to restore before passing to the driver */
>> > +	qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping;
>> > +
>> > +	if (unlikely(txq >= dev->real_num_tx_queues)) {
>> > +		do {
>> > +			txq -= dev->real_num_tx_queues;
>> > +		} while (txq >= dev->real_num_tx_queues);
>> > +	}
>> > +
>> > +	return txq;
>> > +}
>> > +EXPORT_SYMBOL_GPL(bypass_select_queue);
>> > +
>> > +/* fold stats, assuming all rtnl_link_stats64 fields are u64, but
>> > + * that some drivers can provide 32bit values only.
>> > + */
>> > +static void bypass_fold_stats(struct rtnl_link_stats64 *_res,
>> > +			      const struct rtnl_link_stats64 *_new,
>> > +			      const struct rtnl_link_stats64 *_old)
>> > +{
>> > +	const u64 *new = (const u64 *)_new;
>> > +	const u64 *old = (const u64 *)_old;
>> > +	u64 *res = (u64 *)_res;
>> > +	int i;
>> > +
>> > +	for (i = 0; i < sizeof(*_res) / sizeof(u64); i++) {
>> > +		u64 nv = new[i];
>> > +		u64 ov = old[i];
>> > +		s64 delta = nv - ov;
>> > +
>> > +		/* detects if this particular field is 32bit only */
>> > +		if (((nv | ov) >> 32) == 0)
>> > +			delta = (s64)(s32)((u32)nv - (u32)ov);
>> > +
>> > +		/* filter anomalies, some drivers reset their stats
>> > +		 * at down/up events.
>> > +		 */
>> > +		if (delta > 0)
>> > +			res[i] += delta;
>> > +	}
>> > +}
>> > +
>> > +void bypass_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
>> > +{
>> > +	struct bypass_info *bi = netdev_priv(dev);
>> You can WARN_ON and return in case the dev is not bypass master, just
>> to catch buggy drivers. Same with other helpers.
>
>I can make this static and not export this helper as well as all
>bypass_netdev ops.

Ok.


>
>> 
>> 
>> > +	const struct rtnl_link_stats64 *new;
>> > +	struct rtnl_link_stats64 temp;
>> > +	struct net_device *slave_netdev;
>> > +
>> > +	spin_lock(&bi->stats_lock);
>> > +	memcpy(stats, &bi->bypass_stats, sizeof(*stats));
>> > +
>> > +	rcu_read_lock();
>> > +
>> > +	slave_netdev = rcu_dereference(bi->active_netdev);
>> > +	if (slave_netdev) {
>> > +		new = dev_get_stats(slave_netdev, &temp);
>> > +		bypass_fold_stats(stats, new, &bi->active_stats);
>> > +		memcpy(&bi->active_stats, new, sizeof(*new));
>> > +	}
>> > +
>> > +	slave_netdev = rcu_dereference(bi->backup_netdev);
>> > +	if (slave_netdev) {
>> > +		new = dev_get_stats(slave_netdev, &temp);
>> > +		bypass_fold_stats(stats, new, &bi->backup_stats);
>> > +		memcpy(&bi->backup_stats, new, sizeof(*new));
>> > +	}
>> > +
>> > +	rcu_read_unlock();
>> > +
>> > +	memcpy(&bi->bypass_stats, stats, sizeof(*stats));
>> > +	spin_unlock(&bi->stats_lock);
>> > +}
>> > +EXPORT_SYMBOL_GPL(bypass_get_stats);
>> > +
>> > +int bypass_change_mtu(struct net_device *dev, int new_mtu)
>> > +{
>> > +	struct bypass_info *bi = netdev_priv(dev);
>> > +	struct net_device *active_netdev, *backup_netdev;
>> > +	int ret = 0;
>> Pointless initialization.
>> 
>> 
>> > +
>> > +	active_netdev = rcu_dereference(bi->active_netdev);
>> > +	if (active_netdev) {
>> > +		ret = dev_set_mtu(active_netdev, new_mtu);
>> > +		if (ret)
>> > +			return ret;
>> > +	}
>> > +
>> > +	backup_netdev = rcu_dereference(bi->backup_netdev);
>> > +	if (backup_netdev) {
>> > +		ret = dev_set_mtu(backup_netdev, new_mtu);
>> > +		if (ret) {
>> > +			dev_set_mtu(active_netdev, dev->mtu);
>> > +			return ret;
>> > +		}
>> > +	}
>> > +
>> > +	dev->mtu = new_mtu;
>> > +	return 0;
>> > +}
>> > +EXPORT_SYMBOL_GPL(bypass_change_mtu);
>> > +
>> > +void bypass_set_rx_mode(struct net_device *dev)
>> > +{
>> > +	struct bypass_info *bi = netdev_priv(dev);
>> > +	struct net_device *slave_netdev;
>> > +
>> > +	rcu_read_lock();
>> > +
>> > +	slave_netdev = rcu_dereference(bi->active_netdev);
>> > +	if (slave_netdev) {
>> > +		dev_uc_sync_multiple(slave_netdev, dev);
>> > +		dev_mc_sync_multiple(slave_netdev, dev);
>> > +	}
>> > +
>> > +	slave_netdev = rcu_dereference(bi->backup_netdev);
>> > +	if (slave_netdev) {
>> > +		dev_uc_sync_multiple(slave_netdev, dev);
>> > +		dev_mc_sync_multiple(slave_netdev, dev);
>> > +	}
>> > +
>> > +	rcu_read_unlock();
>> > +}
>> > +EXPORT_SYMBOL_GPL(bypass_set_rx_mode);
>> > +
>> > +static const struct net_device_ops bypass_netdev_ops = {
>> > +	.ndo_open		= bypass_open,
>> > +	.ndo_stop		= bypass_close,
>> > +	.ndo_start_xmit		= bypass_start_xmit,
>> > +	.ndo_select_queue	= bypass_select_queue,
>> > +	.ndo_get_stats64	= bypass_get_stats,
>> > +	.ndo_change_mtu		= bypass_change_mtu,
>> > +	.ndo_set_rx_mode	= bypass_set_rx_mode,
>> > +	.ndo_validate_addr	= eth_validate_addr,
>> > +	.ndo_features_check	= passthru_features_check,
>> > +};
>> > +
>> > +#define BYPASS_DRV_NAME "bypass"
>> > +#define BYPASS_DRV_VERSION "0.1"
>> > +
>> > +static void bypass_ethtool_get_drvinfo(struct net_device *dev,
>> > +				       struct ethtool_drvinfo *drvinfo)
>> > +{
>> > +	strlcpy(drvinfo->driver, BYPASS_DRV_NAME, sizeof(drvinfo->driver));
>> > +	strlcpy(drvinfo->version, BYPASS_DRV_VERSION, sizeof(drvinfo->version));
>> > +}
>> > +
>> > +int bypass_ethtool_get_link_ksettings(struct net_device *dev,
>> > +				      struct ethtool_link_ksettings *cmd)
>> > +{
>> > +	struct bypass_info *bi = netdev_priv(dev);
>> > +	struct net_device *slave_netdev;
>> > +
>> > +	slave_netdev = rtnl_dereference(bi->active_netdev);
>> > +	if (!slave_netdev || !bypass_xmit_ready(slave_netdev)) {
>> > +		slave_netdev = rtnl_dereference(bi->backup_netdev);
>> > +		if (!slave_netdev || !bypass_xmit_ready(slave_netdev)) {
>> > +			cmd->base.duplex = DUPLEX_UNKNOWN;
>> > +			cmd->base.port = PORT_OTHER;
>> > +			cmd->base.speed = SPEED_UNKNOWN;
>> > +
>> > +			return 0;
>> > +		}
>> > +	}
>> > +
>> > +	return __ethtool_get_link_ksettings(slave_netdev, cmd);
>> > +}
>> > +EXPORT_SYMBOL_GPL(bypass_ethtool_get_link_ksettings);
>> > +
>> > +static const struct ethtool_ops bypass_ethtool_ops = {
>> > +	.get_drvinfo            = bypass_ethtool_get_drvinfo,
>> > +	.get_link               = ethtool_op_get_link,
>> > +	.get_link_ksettings     = bypass_ethtool_get_link_ksettings,
>> > +};
>> > +
>> > +static void bypass_register_existing_slave(struct net_device *bypass_netdev)
>> > +{
>> > +	struct net *net = dev_net(bypass_netdev);
>> > +	struct net_device *dev;
>> > +
>> > +	rtnl_lock();
>> > +	for_each_netdev(net, dev) {
>> > +		if (dev == bypass_netdev)
>> > +			continue;
>> > +		if (!bypass_validate_event_dev(dev))
>> > +			continue;
>> > +		if (ether_addr_equal(bypass_netdev->perm_addr, dev->perm_addr))
>> > +			bypass_slave_register(dev);
>> > +	}
>> > +	rtnl_unlock();
>> > +}
>> > +
>> > +int bypass_master_register(struct net_device *dev, struct bypass_ops *ops,
>> > +			   struct bypass_master **pbypass_master)
>> > +{
>> > +	struct bypass_master *bypass_master;
>> > +
>> > +	bypass_master = kzalloc(sizeof(*bypass_master), GFP_KERNEL);
>> > +	if (!bypass_master)
>> > +		return -ENOMEM;
>> > +
>> > +	rcu_assign_pointer(bypass_master->ops, ops);
>> > +	dev_hold(dev);
>> > +	dev->priv_flags |= IFF_BYPASS;
>> > +	rcu_assign_pointer(bypass_master->bypass_netdev, dev);
>> > +
>> > +	spin_lock(&bypass_lock);
>> > +	list_add_tail(&bypass_master->list, &bypass_master_list);
>> > +	spin_unlock(&bypass_lock);
>> > +
>> > +	bypass_register_existing_slave(dev);
>> > +
>> > +	*pbypass_master = bypass_master;
>> > +	return 0;
>> > +}
>> > +EXPORT_SYMBOL_GPL(bypass_master_register);
>> > +
>> > +void bypass_master_unregister(struct bypass_master *bypass_master)
>> > +{
>> > +	struct net_device *bypass_netdev;
>> > +
>> > +	bypass_netdev = rcu_dereference(bypass_master->bypass_netdev);
>> > +
>> > +	bypass_netdev->priv_flags &= ~IFF_BYPASS;
>> > +	dev_put(bypass_netdev);
>> > +
>> > +	spin_lock(&bypass_lock);
>> > +	list_del(&bypass_master->list);
>> > +	spin_unlock(&bypass_lock);
>> > +
>> > +	kfree(bypass_master);
>> > +}
>> > +EXPORT_SYMBOL_GPL(bypass_master_unregister);
>> > +
>> > +int bypass_master_create(struct net_device *backup_netdev,
>> > +			 struct bypass_master **pbypass_master)
>> > +{
>> > +	struct device *dev = backup_netdev->dev.parent;
>> > +	struct net_device *bypass_netdev;
>> > +	int err;
>> > +
>> > +	/* Alloc at least 2 queues, for now we are going with 16 assuming
>> > +	 * that most devices being bonded won't have too many queues.
>> > +	 */
>> > +	bypass_netdev = alloc_etherdev_mq(sizeof(struct bypass_info), 16);
>> > +	if (!bypass_netdev) {
>> > +		dev_err(dev, "Unable to allocate bypass_netdev!\n");
>> > +		return -ENOMEM;
>> > +	}
>> > +
>> > +	dev_net_set(bypass_netdev, dev_net(backup_netdev));
>> > +	SET_NETDEV_DEV(bypass_netdev, dev);
>> > +
>> > +	bypass_netdev->netdev_ops = &bypass_netdev_ops;
>> > +	bypass_netdev->ethtool_ops = &bypass_ethtool_ops;
>> > +
>> > +	/* Initialize the device options */
>> > +	bypass_netdev->priv_flags |= IFF_UNICAST_FLT | IFF_NO_QUEUE;
>> > +	bypass_netdev->priv_flags &= ~(IFF_XMIT_DST_RELEASE |
>> > +				       IFF_TX_SKB_SHARING);
>> > +
>> > +	/* don't acquire bypass netdev's netif_tx_lock when transmitting */
>> > +	bypass_netdev->features |= NETIF_F_LLTX;
>> > +
>> > +	/* Don't allow bypass devices to change network namespaces. */
>> > +	bypass_netdev->features |= NETIF_F_NETNS_LOCAL;
>> > +
>> > +	bypass_netdev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG |
>> > +				     NETIF_F_FRAGLIST | NETIF_F_ALL_TSO |
>> > +				     NETIF_F_HIGHDMA | NETIF_F_LRO;
>> > +
>> > +	bypass_netdev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
>> > +	bypass_netdev->features |= bypass_netdev->hw_features;
>> > +
>> > +	memcpy(bypass_netdev->dev_addr, backup_netdev->dev_addr,
>> > +	       bypass_netdev->addr_len);
>> > +
>> > +	bypass_netdev->min_mtu = backup_netdev->min_mtu;
>> > +	bypass_netdev->max_mtu = backup_netdev->max_mtu;
>> > +
>> > +	err = register_netdev(bypass_netdev);
>> > +	if (err < 0) {
>> > +		dev_err(dev, "Unable to register bypass_netdev!\n");
>> > +		goto err_register_netdev;
>> > +	}
>> > +
>> > +	netif_carrier_off(bypass_netdev);
>> > +
>> > +	err = bypass_master_register(bypass_netdev, NULL, pbypass_master);
>> > +	if (err < 0)
>> just "if (err)" would do.
>
>OK
>
>> 
>> 
>> > +		goto err_bypass;
>> > +
>> > +	return 0;
>> > +
>> > +err_bypass:
>> > +	unregister_netdev(bypass_netdev);
>> > +err_register_netdev:
>> > +	free_netdev(bypass_netdev);
>> > +
>> > +	return err;
>> > +}
>> > +EXPORT_SYMBOL_GPL(bypass_master_create);
>> > +
>> > +void bypass_master_destroy(struct bypass_master *bypass_master)
>> > +{
>> > +	struct net_device *bypass_netdev;
>> > +	struct net_device *slave_netdev;
>> > +	struct bypass_info *bi;
>> > +
>> > +	if (!bypass_master)
>> > +		return;
>> > +
>> > +	bypass_netdev = rcu_dereference(bypass_master->bypass_netdev);
>> > +	bi = netdev_priv(bypass_netdev);
>> > +
>> > +	netif_device_detach(bypass_netdev);
>> > +
>> > +	rtnl_lock();
>> > +
>> > +	slave_netdev = rtnl_dereference(bi->active_netdev);
>> > +	if (slave_netdev)
>> > +		bypass_slave_unregister(slave_netdev);
>> > +
>> > +	slave_netdev = rtnl_dereference(bi->backup_netdev);
>> > +	if (slave_netdev)
>> > +		bypass_slave_unregister(slave_netdev);
>> > +
>> > +	bypass_master_unregister(bypass_master);
>> > +
>> > +	unregister_netdevice(bypass_netdev);
>> > +
>> > +	rtnl_unlock();
>> > +
>> > +	free_netdev(bypass_netdev);
>> > +}
>> > +EXPORT_SYMBOL_GPL(bypass_master_destroy);
>> > +
>> > +static __init int
>> > +bypass_init(void)
>> > +{
>> > +	register_netdevice_notifier(&bypass_notifier);
>> > +
>> > +	return 0;
>> > +}
>> > +module_init(bypass_init);
>> > +
>> > +static __exit
>> > +void bypass_exit(void)
>> > +{
>> > +	unregister_netdevice_notifier(&bypass_notifier);
>> > +}
>> > +module_exit(bypass_exit);
>> > +
>> > +MODULE_DESCRIPTION("Bypass infrastructure/interface for Paravirtual drivers");
>> > +MODULE_LICENSE("GPL v2");
>> > -- 
>> > 2.14.3
>> > 
>

^ permalink raw reply

* [PATCH] bpf, x86_32: add eBPF JIT compiler for ia32 (x86_32)
From: Wang YanQing @ 2018-04-18  9:31 UTC (permalink / raw)
  To: daniel; +Cc: ast, illusionist.neo, tglx, mingo, hpa, x86, netdev, linux-kernel

The JIT compiler emits ia32 instructions. Currently, it supports eBPF
only; classic BPF is supported through the conversion done by the BPF core.

Almost all instructions from the eBPF ISA are supported except the following:
BPF_ALU64 | BPF_DIV | BPF_K
BPF_ALU64 | BPF_DIV | BPF_X
BPF_ALU64 | BPF_MOD | BPF_K
BPF_ALU64 | BPF_MOD | BPF_X
BPF_STX | BPF_XADD | BPF_W
BPF_STX | BPF_XADD | BPF_DW

It doesn't support BPF_JMP|BPF_CALL with BPF_PSEUDO_CALL either.

IA32 has few general purpose registers, EAX|EDX|ECX|EBX|ESI|EDI,
and even among these six registers, we can't treat all of them as real
general purpose registers:
MUL instructions need EAX:EDX, shift instructions need ECX, and string
manipulation instructions need ESI|EDI.

So I decided to use the stack to emulate all the 64-bit eBPF registers. This
simplifies the implementation a great deal, because we don't need to deal
with the flexible memory addressing modes on ia32; for example, we don't need
to write the code below for a single BPF_ADD instruction:
if (src_reg is a register && dst_reg is a register)
{
   //one instruction encoding for ADD instruction
} else if (only src is a register)
{
   //another different instruction encoding for ADD instruction
} else if (only dst is a register)
{
   //another different instruction encoding for ADD instruction
} else
{
   //src and dst are all on stack.
   //another different instruction encoding for ADD instruction
}

If you think the above if-else chain isn't so painful, try to imagine
it for the BPF_ALU64|BPF_*SHIFT* instructions :)

Tested on my PC(Intel(R) Core(TM) i5-5200U CPU) and virtualbox.

Testing results on i5-5200U:

1) test_bpf: Summary: 349 PASSED, 0 FAILED, [319/341 JIT'ed]
2) test_progs: Summary: 81 PASSED, 2 FAILED.
   test_progs report "libbpf: incorrect bpf_call opcode" for
   test_l4lb_noinline and test_xdp_noinline, because there is
   no llvm-6.0 on my machine, and current implementation doesn't
   support BPF_CALL, so I think we can ignore it.
3) test_lpm: OK
4) test_lru_map: OK
5) test_verifier: Summary: 823 PASSED, 5 FAILED
   test_verifier report "invalid bpf_context access off=68 size=1/2/4/8"
   for all the 5 FAILED testcases, and test_verifier report them when
   we turn off the jit, so I think the jit can do nothing to fix them.

The above tests were all done with each of the following flags enabled separately:
bpf_jit_enable=1 and bpf_jit_harden=2

Below are some numbers for this jit implementation:
Note:
  I ran test_progs 100 times in a loop for every testcase; the numbers
  are in the format: total/times=avg. The numbers that test_bpf reports
  show almost the same relation.

a:jit_enable=0 and jit_harden=0            b:jit_enable=1 and jit_harden=0
  test_pkt_access:PASS:ipv4:15622/100=156  test_pkt_access:PASS:ipv4:10057/100=100
  test_pkt_access:PASS:ipv6:9130/100=91    test_pkt_access:PASS:ipv6:5055/100=50
  test_xdp:PASS:ipv4:240198/100=2401       test_xdp:PASS:ipv4:145945/100=1459
  test_xdp:PASS:ipv6:137326/100=1373       test_xdp:PASS:ipv6:67337/100=673
  test_l4lb:PASS:ipv4:61100/100=611        test_l4lb:PASS:ipv4:38137/100=381
  test_l4lb:PASS:ipv6:101000/100=1010      test_l4lb:PASS:ipv6:57779/100=577

c:jit_enable=0 and jit_harden=2            d:jit_enable=1 and jit_harden=2
  test_pkt_access:PASS:ipv4:15214/100=152  test_pkt_access:PASS:ipv4:12650/100=126
  test_pkt_access:PASS:ipv6:9132/100=91    test_pkt_access:PASS:ipv6:7074/100=70
  test_xdp:PASS:ipv4:237252/100=2372       test_xdp:PASS:ipv4:147211/100=1472
  test_xdp:PASS:ipv6:135977/100=1359       test_xdp:PASS:ipv6:85783/100=857
  test_l4lb:PASS:ipv4:61324/100=613        test_l4lb:PASS:ipv4:53222/100=532
  test_l4lb:PASS:ipv6:100833/100=1008      test_l4lb:PASS:ipv6:76322/100=763

Yes, the numbers are pretty good without turning on jit_harden. If we want to
speed up jit_harden, then we need to move BPF_REG_AX to a *real* register instead
of stack emulation, but if we do that, we need to face all the pain I described
above. We can do it in a next step.

See Documentation/networking/filter.txt for more information.

Signed-off-by: Wang YanQing <udknight@gmail.com>
---
 arch/x86/Kconfig                     |    2 +-
 arch/x86/include/asm/nospec-branch.h |   26 +-
 arch/x86/net/Makefile                |   10 +-
 arch/x86/net/bpf_jit32.S             |  147 +++
 arch/x86/net/bpf_jit_comp32.c        | 2239 ++++++++++++++++++++++++++++++++++
 5 files changed, 2419 insertions(+), 5 deletions(-)
 create mode 100644 arch/x86/net/bpf_jit32.S
 create mode 100644 arch/x86/net/bpf_jit_comp32.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 00fcf81..1f5fa2f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -137,7 +137,7 @@ config X86
 	select HAVE_DMA_CONTIGUOUS
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS
-	select HAVE_EBPF_JIT			if X86_64
+	select HAVE_EBPF_JIT
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
 	select HAVE_EXIT_THREAD
 	select HAVE_FENTRY			if X86_64 || DYNAMIC_FTRACE
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index f928ad9..a4c7ca4 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -291,14 +291,17 @@ static inline void indirect_branch_prediction_barrier(void)
  *    lfence
  *    jmp spec_trap
  *  do_rop:
- *    mov %rax,(%rsp)
+ *    mov %rax,(%rsp) for x86_64
+ *    mov %edx,(%esp) for x86_32
  *    retq
  *
  * Without retpolines configured:
  *
- *    jmp *%rax
+ *    jmp *%rax for x86_64
+ *    jmp *%edx for x86_32
  */
 #ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_X86_64
 # define RETPOLINE_RAX_BPF_JIT_SIZE	17
 # define RETPOLINE_RAX_BPF_JIT()				\
 	EMIT1_off32(0xE8, 7);	 /* callq do_rop */		\
@@ -310,9 +313,28 @@ static inline void indirect_branch_prediction_barrier(void)
 	EMIT4(0x48, 0x89, 0x04, 0x24); /* mov %rax,(%rsp) */	\
 	EMIT1(0xC3);             /* retq */
 #else
+# define RETPOLINE_EDX_BPF_JIT()				\
+do {								\
+	EMIT1_off32(0xE8, 7);	 /* call do_rop */		\
+	/* spec_trap: */					\
+	EMIT2(0xF3, 0x90);       /* pause */			\
+	EMIT3(0x0F, 0xAE, 0xE8); /* lfence */			\
+	EMIT2(0xEB, 0xF9);       /* jmp spec_trap */		\
+	/* do_rop: */						\
+	EMIT3(0x89, 0x14, 0x24); /* mov %edx,(%esp) */		\
+	EMIT1(0xC3);             /* ret */			\
+} while (0)
+#endif
+#else /* !CONFIG_RETPOLINE */
+
+#ifdef CONFIG_X86_64
 # define RETPOLINE_RAX_BPF_JIT_SIZE	2
 # define RETPOLINE_RAX_BPF_JIT()				\
 	EMIT2(0xFF, 0xE0);	 /* jmp *%rax */
+#else
+# define RETPOLINE_EDX_BPF_JIT()				\
+	EMIT2(0xFF, 0xE2) /* jmp *%edx */
+#endif
 #endif
 
 #endif /* _ASM_X86_NOSPEC_BRANCH_H_ */
diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile
index fefb4b6..adcadc6 100644
--- a/arch/x86/net/Makefile
+++ b/arch/x86/net/Makefile
@@ -1,6 +1,12 @@
 #
 # Arch-specific network modules
 #
-OBJECT_FILES_NON_STANDARD_bpf_jit.o += y
 
-obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o
+
+ifeq ($(CONFIG_X86_32),y)
+        OBJECT_FILES_NON_STANDARD_bpf_jit32.o += y
+        obj-$(CONFIG_BPF_JIT) += bpf_jit32.o bpf_jit_comp32.o
+else
+        OBJECT_FILES_NON_STANDARD_bpf_jit.o += y
+        obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o
+endif
diff --git a/arch/x86/net/bpf_jit32.S b/arch/x86/net/bpf_jit32.S
new file mode 100644
index 0000000..8dac424
--- /dev/null
+++ b/arch/x86/net/bpf_jit32.S
@@ -0,0 +1,147 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* bpf_jit.S : BPF JIT helper functions
+ *
+ * Copyright (C) 2018 Wang YanQing (udknight@gmail.com)
+ * Copyright (C) 2011 Eric Dumazet (eric.dumazet@gmail.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+/*
+ * Calling convention :
+ * eax : skb pointer (caller-saved)
+ * edx : offset of byte(s) to fetch in skb (caller-saved)
+ * esi : copy of skb->data (callee-saved)
+ * edi : hlen = skb->len - skb->data_len (callee-saved)
+ *
+ * We don't need to push/pop eax,edx,ecx before calling kernel function,
+ * because jit always prepare eax,edx before calling helper functions,
+ * and jit uses ecx as a temporary register.
+ */
+#define SKBDATA	%esi
+#define SKF_MAX_NEG_OFF    $(-0x200000) /* SKF_LL_OFF from filter.h */
+
+#define FUNC(name) \
+	.globl name; \
+	.type name, @function; \
+	name:
+
+FUNC(sk_load_word)
+	test	%edx,%edx
+	js	bpf_slow_path_word_neg
+
+FUNC(sk_load_word_positive_offset)
+	mov	%edi,%ecx		# hlen
+	sub	%edx,%ecx		# hlen - offset
+	cmp	$3,%ecx
+	jle	bpf_slow_path_word
+	mov     (SKBDATA,%edx),%eax
+	bswap   %eax  			/* ntohl() */
+	ret
+
+FUNC(sk_load_half)
+	test	%edx,%edx
+	js	bpf_slow_path_half_neg
+
+FUNC(sk_load_half_positive_offset)
+	mov	%edi,%ecx
+	sub	%edx,%ecx		#	hlen - offset
+	cmp	$1,%ecx
+	jle	bpf_slow_path_half
+	movzwl	(SKBDATA,%edx),%eax
+	rol	$8,%ax			# ntohs()
+	ret
+
+FUNC(sk_load_byte)
+	test	%edx,%edx
+	js	bpf_slow_path_byte_neg
+
+FUNC(sk_load_byte_positive_offset)
+	cmp	%edx,%edi   /* if (offset >= hlen) goto bpf_slow_path_byte */
+	jle	bpf_slow_path_byte
+	movzbl	(SKBDATA,%edx),%eax
+	ret
+
+#define bpf_slow_path_common(LEN)		\
+	lea	104(%ebp), %ecx;		\
+	FRAME_BEGIN;				\
+	push    $LEN;				\
+	call	skb_copy_bits;			\
+	add	$4,%esp;			\
+	test    %eax,%eax;			\
+	FRAME_END
+
+
+bpf_slow_path_word:
+	bpf_slow_path_common(4)
+	js	bpf_error
+	mov	104(%ebp),%eax
+	bswap	%eax
+	ret
+
+bpf_slow_path_half:
+	bpf_slow_path_common(2)
+	js	bpf_error
+	mov	104(%ebp),%ax
+	rol	$8,%ax
+	movzwl	%ax,%eax
+	ret
+
+bpf_slow_path_byte:
+	bpf_slow_path_common(1)
+	js	bpf_error
+	movzbl	104(%ebp),%eax
+	ret
+
+#define sk_negative_common(SIZE)				\
+	FRAME_BEGIN;						\
+	mov	$SIZE,%ecx;	/* size */			\
+	call	bpf_internal_load_pointer_neg_helper;		\
+	test	%eax,%eax;					\
+	FRAME_END;						\
+	jz	bpf_error
+
+bpf_slow_path_word_neg:
+	cmp	SKF_MAX_NEG_OFF, %edx	/* test range */
+	jl	bpf_error	/* offset lower -> error  */
+
+FUNC(sk_load_word_negative_offset)
+	sk_negative_common(4)
+	mov	(%eax), %eax
+	bswap	%eax
+	ret
+
+bpf_slow_path_half_neg:
+	cmp	SKF_MAX_NEG_OFF, %edx
+	jl	bpf_error
+
+FUNC(sk_load_half_negative_offset)
+	sk_negative_common(2)
+	mov	(%eax),%ax
+	rol	$8,%ax
+	movzwl	%ax,%eax
+	ret
+
+bpf_slow_path_byte_neg:
+	cmp	SKF_MAX_NEG_OFF, %edx
+	jl	bpf_error
+
+FUNC(sk_load_byte_negative_offset)
+	sk_negative_common(1)
+	movzbl	(%eax), %eax
+	ret
+
+bpf_error:
+# force a return 0 from jit handler
+	xor	%eax,%eax
+	mov	108(%ebp),%ebx
+	mov	112(%ebp),%esi
+	mov	116(%ebp),%edi
+	add	$120, %ebp
+	leave
+	ret
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
new file mode 100644
index 0000000..39f3803
--- /dev/null
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -0,0 +1,2239 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Just-In-Time compiler for eBPF filters on ia32 (32bit x86)
+ *
+ * Copyright (c) 2018 Wang YanQing <udknight@gmail.com>
+ * Copyright (C) 2011-2013 Eric Dumazet (eric.dumazet@gmail.com)
+ * Internal BPF Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/filter.h>
+#include <linux/if_vlan.h>
+#include <asm/cacheflush.h>
+#include <asm/set_memory.h>
+#include <asm/nospec-branch.h>
+#include <linux/bpf.h>
+
+/*
+ * eBPF prog stack layout:
+ *
+ *                         high
+ * original ESP =>        +-----+
+ *                        |     | callee saved registers
+ *                        +-----+
+ *                        | ... | eBPF JIT scratch space
+ * BPF_FP,IA32_EBP  =>    +-----+
+ *                        | ... | eBPF prog stack
+ *                        +-----+
+ *                        |RSVD | JIT scratchpad
+ * current ESP =>         +-----+
+ *                        |     |
+ *                        | ... | Function call stack
+ *                        |     |
+ *                        +-----+
+ *                          low
+ *
+ * The callee saved registers:
+ *
+ *                                high
+ * original ESP =>        +------------------+ \
+ *                        |        ebp       | |
+ * current EBP =>         +------------------+ } callee saved registers
+ *                        |    ebx,esi,edi   | |
+ *                        +------------------+ /
+ *                                low
+ */
+
+/*
+ * assembly code in arch/x86/net/bpf_jit32.S
+ */
+extern u8 sk_load_word[], sk_load_half[], sk_load_byte[];
+extern u8 sk_load_word_positive_offset[], sk_load_half_positive_offset[];
+extern u8 sk_load_byte_positive_offset[];
+extern u8 sk_load_word_negative_offset[], sk_load_half_negative_offset[];
+extern u8 sk_load_byte_negative_offset[];
+
+/*
+ * Store the low 'len' bytes of 'bytes' at 'ptr' and return the position
+ * just past them. Lengths other than 1 and 2 are written as one 32-bit
+ * store followed by a compiler barrier, so a 3-byte emit also touches the
+ * fourth byte at ptr[3].
+ */
+static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
+{
+	switch (len) {
+	case 1:
+		*ptr = bytes;
+		break;
+	case 2:
+		*(u16 *)ptr = bytes;
+		break;
+	default:
+		*(u32 *)ptr = bytes;
+		barrier();
+		break;
+	}
+
+	return ptr + len;
+}
+
+/*
+ * Byte emitters. Every EMIT*() appends through the local 'prog' pointer
+ * and adds the emitted length to the local 'cnt', so both variables must
+ * be in scope wherever these macros are used.
+ */
+#define EMIT(bytes, len) \
+	do { prog = emit_code(prog, (bytes), (len)); cnt += (len); } while (0)
+
+/* The bytes are packed little-endian, so b1 lands at the lowest address. */
+#define EMIT1(b1)		EMIT(b1, 1)
+#define EMIT2(b1, b2)		EMIT((b1) + ((b2) << 8), 2)
+#define EMIT3(b1, b2, b3)	EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
+#define EMIT4(b1, b2, b3, b4)   \
+	EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)
+
+/* Opcode bytes followed by a 32-bit immediate or displacement. */
+#define EMIT1_off32(b1, off) \
+	do {EMIT1(b1); EMIT(off, 4); } while (0)
+#define EMIT2_off32(b1, b2, off) \
+	do {EMIT2(b1, b2); EMIT(off, 4); } while (0)
+#define EMIT3_off32(b1, b2, b3, off) \
+	do {EMIT3(b1, b2, b3); EMIT(off, 4); } while (0)
+#define EMIT4_off32(b1, b2, b3, b4, off) \
+	do {EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
+
+/*
+ * Displacement from the end of a jump of 'jmp_insn_len' bytes emitted at
+ * the current 'cnt' to the offset 'label'. Arguments are parenthesized so
+ * that expressions such as "a + b" can be passed safely.
+ */
+#define jmp_label(label, jmp_insn_len) ((label) - cnt - (jmp_insn_len))
+
+/* True when 'value' fits in a signed 8-bit immediate (-128..127). */
+static bool is_imm8(int value)
+{
+	return value >= -128 && value <= 127;
+}
+
+/* True when 'value' survives a round trip through a signed 32-bit integer. */
+static bool is_simm32(s64 value)
+{
+	const s32 truncated = (s32) value;
+
+	return (s64) truncated == value;
+}
+
+/* Identity wrapper: k is used directly as a slot's byte offset from EBP. */
+#define STACK_OFFSET(k)	(k)
+/* Extra bpf2ia32[] indices placed right after MAX_BPF_JIT_REG. */
+#define TCALL_CNT	(MAX_BPF_JIT_REG + 0)	/* Tail Call Count */
+#define TMP_REG_1	(MAX_BPF_JIT_REG + 1)	/* TEMP Register 1 */
+#define TMP_REG_2	(MAX_BPF_JIT_REG + 2)	/* TEMP Register 2 */
+
+/* Register numbers as folded into opcode bytes by add_1reg()/add_2reg(). */
+#define IA32_EAX	(0x0)
+#define IA32_EBX	(0x3)
+#define IA32_ECX	(0x1)
+#define IA32_EDX	(0x2)
+#define IA32_ESI	(0x6)
+#define IA32_EDI	(0x7)
+#define IA32_EBP	(0x5)
+#define IA32_ESP	(0x4)
+
+/* list of x86 cond jumps opcodes (. + s8)
+ * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32)
+ */
+#define IA32_JB  0x72
+#define IA32_JAE 0x73
+#define IA32_JE  0x74
+#define IA32_JNE 0x75
+#define IA32_JBE 0x76
+#define IA32_JA  0x77
+#define IA32_JL  0x7C
+#define IA32_JGE 0x7D
+#define IA32_JLE 0x7E
+#define IA32_JG  0x7F
+
+/*
+ * Map eBPF registers to x86_32 32bit registers or stack scratch space.
+ *
+ * 1. Every eBPF register (R0-R9, FP, AX) and the tail call count is kept
+ *    in the stack scratch space; the entries below are byte offsets from
+ *    EBP. The first argument arrives in EAX and is copied by the prologue
+ *    into the BPF_REG_1 slot.
+ * 2. Only the two 64 bit temp registers used for complex operations are
+ *    backed by real x86_32 registers (ESI:EDI and EAX:EDX).
+ *
+ * As the eBPF registers are all 64 bit registers and x86_32 has only 32 bit
+ * registers, each eBPF register is described by a {lo, hi} pair of 32 bit
+ * halves, from which the 64 bit value is built.
+ *
+ */
+static const u8 bpf2ia32[][2] = {
+	/* return value from in-kernel function, and exit value from eBPF */
+	[BPF_REG_0] = {STACK_OFFSET(0), STACK_OFFSET(4)},
+
+	/* arguments from eBPF program to in-kernel function */
+	/* Stored on stack scratch space */
+	[BPF_REG_1] = {STACK_OFFSET(8), STACK_OFFSET(12)},
+	[BPF_REG_2] = {STACK_OFFSET(16), STACK_OFFSET(20)},
+	[BPF_REG_3] = {STACK_OFFSET(24), STACK_OFFSET(28)},
+	[BPF_REG_4] = {STACK_OFFSET(32), STACK_OFFSET(36)},
+	[BPF_REG_5] = {STACK_OFFSET(40), STACK_OFFSET(44)},
+
+	/* callee saved registers that in-kernel function will preserve */
+	/* Stored on stack scratch space */
+	[BPF_REG_6] = {STACK_OFFSET(48), STACK_OFFSET(52)},
+	[BPF_REG_7] = {STACK_OFFSET(56), STACK_OFFSET(60)},
+	[BPF_REG_8] = {STACK_OFFSET(64), STACK_OFFSET(68)},
+	[BPF_REG_9] = {STACK_OFFSET(72), STACK_OFFSET(76)},
+
+	/* Read only Frame Pointer to access Stack */
+	[BPF_REG_FP] = {STACK_OFFSET(80), STACK_OFFSET(84)},
+
+	/* temporary register for blinding constants.
+	 * Stored on stack scratch space.
+	 */
+	[BPF_REG_AX] = {STACK_OFFSET(88), STACK_OFFSET(92)},
+
+	/* Tail call count. Stored on stack scratch space. */
+	[TCALL_CNT] = {STACK_OFFSET(96), STACK_OFFSET(100)},
+
+	/* Temporary Register for internal BPF JIT, can be used
+	 * for constant blindings and others.
+	 */
+	[TMP_REG_1] = {IA32_ESI, IA32_EDI},
+	[TMP_REG_2] = {IA32_EAX, IA32_EDX},
+};
+
+/* Shorthands for the {lo, hi} halves of a bpf2ia32[] entry named dst/src. */
+#define dst_lo	dst[0]
+#define dst_hi	dst[1]
+#define src_lo	src[0]
+#define src_hi	src[1]
+
+#define STACK_ALIGNMENT	8
+/* Stack space for BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4,
+ * BPF_REG_5, BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9,
+ * BPF_REG_FP, BPF_REG_AX and Tail call counts: 13 slots of 8 bytes.
+ */
+#define SCRATCH_SIZE 104
+
+/* total stack size used in JITed code; expects 'stack_depth' in scope */
+#define _STACK_SIZE \
+	(stack_depth + \
+	 + SCRATCH_SIZE + \
+	 + 4 /* extra for skb_copy_bits buffer */)
+
+#define STACK_SIZE ALIGN(_STACK_SIZE, STACK_ALIGNMENT)
+
+/* Get the offset of eBPF REGISTERs stored on scratch space. */
+#define STACK_VAR(off) (off)
+
+/* Offset of skb_copy_bits buffer */
+#define SKB_BUFFER STACK_VAR(SCRATCH_SIZE)
+
+/* Fold register number 'dst_reg' into the x86_32 opcode/ModRM 'byte'. */
+static u8 add_1reg(u8 byte, u32 dst_reg)
+{
+	byte += dst_reg;
+
+	return byte;
+}
+
+/*
+ * Fold two register numbers into the x86_32 opcode/ModRM 'byte':
+ * 'dst_reg' is added as is, 'src_reg' is added shifted left by three.
+ */
+static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
+{
+	byte += dst_reg;
+	byte += src_reg << 3;
+
+	return byte;
+}
+
+/* Pad 'size' bytes starting at 'area' with int3 (0xcc) instructions. */
+static void jit_fill_hole(void *area, unsigned int size)
+{
+	const int int3_opcode = 0xcc;
+
+	memset(area, int3_opcode, size);
+}
+
+/* dst = imm (4 bytes): store 'val' into the scratch slot at offset 'dst' */
+static inline void emit_ia32_mov_i(const u8 dst, const u32 val,
+				   u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+
+	/* mov dword ptr [ebp+off],imm32 */
+	EMIT3(0xC7, add_1reg(0x40, IA32_EBP), STACK_VAR(dst));
+	EMIT(val, 4);
+
+	*pprog = prog;
+}
+
+/* dst = src (4 bytes): copy one scratch slot to another through esi */
+static inline void emit_ia32_mov_r(const u8 dst, const u8 src, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	const u8 scratch = bpf2ia32[TMP_REG_1][0];
+	const u8 modrm = add_2reg(0x40, IA32_EBP, scratch);
+
+	/* mov esi,dword ptr [ebp+off] */
+	EMIT3(0x8B, modrm, STACK_VAR(src));
+	/* mov dword ptr [ebp+off],esi */
+	EMIT3(0x89, modrm, STACK_VAR(dst));
+
+	*pprog = prog;
+}
+
+/* dst = src; a 32 bit move (!is64) clears the high half of dst instead */
+static inline void emit_ia32_mov_r64(const bool is64, const u8 dst[],
+				     const u8 src[], u8 **pprog)
+{
+	emit_ia32_mov_r(dst_lo, src_lo, pprog);
+
+	if (!is64) {
+		/* Zero out high 4 bytes */
+		emit_ia32_mov_i(dst_hi, 0, pprog);
+		return;
+	}
+
+	/* complete 8 byte move */
+	emit_ia32_mov_r(dst_hi, src_hi, pprog);
+}
+
+/* Sign extended move: dst = (s64)(s32)val when is64, else dst = (u32)val.
+ * The sign-bit test uses 1U << 31 because 1 << 31 overflows a signed int.
+ */
+static inline void emit_ia32_mov_i64(const bool is64, const u8 dst[],
+				     const u32 val, u8 **pprog)
+{
+	u32 hi = 0;
+
+	if (is64 && (val & (1U << 31)))
+		hi = (u32)~0;
+
+	emit_ia32_mov_i(dst_lo, val, pprog);
+	emit_ia32_mov_i(dst_hi, hi, pprog);
+}
+
+/* ALU operation (32 bit)
+ * dst = dst (op) src
+ *
+ * 'dst' and 'src' are scratch-slot offsets from EBP. When both 'hi' and
+ * 'is64' are set, ADD/SUB use the carry-consuming adc/sbb forms so that
+ * this call can follow the one that handled the low half. No code is
+ * emitted for an op that is not listed.
+ */
+static inline void emit_ia32_alu_r(const bool is64, const bool hi, const u8 op,
+				   const u8 dst, const u8 src, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	const u8 *tmp = bpf2ia32[TMP_REG_1];
+
+	switch (BPF_OP(op)) {
+	/* dst = dst + src */
+	case BPF_ADD: {
+		/* mov esi,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(src));
+
+		if (hi && is64)
+			/* adc dword ptr [ebp+off],esi */
+			EMIT3(0x11, add_2reg(0x40, IA32_EBP, tmp[0]),
+			      STACK_VAR(dst));
+		else
+			/* add dword ptr [ebp+off],esi */
+			EMIT3(0x01, add_2reg(0x40, IA32_EBP, tmp[0]),
+			      STACK_VAR(dst));
+		break;
+	}
+	/* dst = dst - src */
+	case BPF_SUB: {
+		/* mov esi,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(src));
+
+		if (hi && is64)
+			/* sbb dword ptr [ebp+off],esi */
+			EMIT3(0x19, add_2reg(0x40, IA32_EBP, tmp[0]),
+			      STACK_VAR(dst));
+		else
+			/* sub dword ptr [ebp+off],esi */
+			EMIT3(0x29, add_2reg(0x40, IA32_EBP, tmp[0]),
+			      STACK_VAR(dst));
+		break;
+	}
+	/* dst = dst | src */
+	case BPF_OR: {
+		/* mov esi,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(src));
+		/* or dword ptr [ebp+off],esi */
+		EMIT3(0x09, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(dst));
+		break;
+	}
+	/* dst = dst & src */
+	case BPF_AND: {
+		/* mov esi,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(src));
+		/* and dword ptr [ebp+off],esi */
+		EMIT3(0x21, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(dst));
+		break;
+	}
+	/* dst = dst ^ src */
+	case BPF_XOR: {
+		/* mov esi,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(src));
+		/* xor dword ptr [ebp+off],esi */
+		EMIT3(0x31, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(dst));
+		break;
+	}
+	/* dst = dst * src */
+	case BPF_MUL: {
+		/* shadows the outer tmp: tmp[0] is eax inside this case */
+		const u8 *tmp = bpf2ia32[TMP_REG_2];
+
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(dst));
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(src));
+		/* mov dword ptr [ebp+off],eax */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(dst));
+		break;
+	}
+	/* dst = dst << src */
+	case BPF_LSH: {
+		/* mov ecx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src));
+		/* shl dword ptr [ebp+off],cl */
+		EMIT3(0xD3, add_1reg(0x60, IA32_EBP), STACK_VAR(dst));
+		break;
+	}
+	/* dst = dst >> src */
+	case BPF_RSH: {
+		/* mov ecx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src));
+		/* shr dword ptr [ebp+off],cl */
+		EMIT3(0xD3, add_1reg(0x68, IA32_EBP), STACK_VAR(dst));
+		break;
+	}
+	/* dst = dst >> src (signed)*/
+	case BPF_ARSH:
+		/* mov ecx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src));
+		/* sar dword ptr [ebp+off],cl */
+		EMIT3(0xD3, add_1reg(0x78, IA32_EBP), STACK_VAR(dst));
+		break;
+	}
+	*pprog = prog;
+}
+
+/* ALU operation (64 bit): dst = dst (op) src
+ * The low halves are always combined; the high halves are combined only
+ * when is64 is set, otherwise the high half of dst is cleared.
+ */
+static inline void emit_ia32_alu_r64(const bool is64, const u8 op,
+				     const u8 dst[], const u8 src[],
+				     u8 **pprog)
+{
+	emit_ia32_alu_r(is64, false, op, dst_lo, src_lo, pprog);
+
+	if (!is64) {
+		emit_ia32_mov_i(dst_hi, 0, pprog);
+		return;
+	}
+
+	emit_ia32_alu_r(is64, true, op, dst_hi, src_hi, pprog);
+}
+
+/* ALU operation (32 bit)
+ * dst = dst (op) val
+ *
+ * 'dst' is a scratch-slot offset from EBP. Immediates that fit in a signed
+ * byte use the short imm8 encodings; anything else is first loaded into a
+ * register. When both 'hi' and 'is64' are set, ADD/SUB use adc/sbb so that
+ * this call can follow the one that handled the low half. The op is masked
+ * with BPF_OP(), as emit_ia32_alu_r() does; no code is emitted for an op
+ * that is not listed.
+ */
+static inline void emit_ia32_alu_i(const bool is64, const bool hi, const u8 op,
+				  const u8 dst, const s32 val, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	const u8 *tmp = bpf2ia32[TMP_REG_1];
+
+	switch (BPF_OP(op)) {
+	/* dst = dst + val */
+	case BPF_ADD: {
+		if (hi && is64) {
+			if (is_imm8(val)) {
+				/* adc dword ptr [ebp+off],imm8 */
+				EMIT3(0x83, add_1reg(0x50, IA32_EBP),
+				      STACK_VAR(dst));
+				EMIT(val, 1);
+			} else {
+				/* mov esi,imm32*/
+				EMIT2_off32(0xC7, add_1reg(0xC0, tmp[0]), val);
+				/* adc dword ptr [ebp+off],esi */
+				EMIT3(0x11, add_2reg(0x40, IA32_EBP, tmp[0]),
+				      STACK_VAR(dst));
+			}
+		} else {
+			if (is_imm8(val)) {
+				/* add dword ptr [ebp+off],imm8 */
+				EMIT4(0x83, add_1reg(0x40, IA32_EBP),
+				      STACK_VAR(dst), val);
+			} else {
+				/* mov esi,imm32*/
+				EMIT2_off32(0xC7, add_1reg(0xC0, tmp[0]), val);
+				/* add dword ptr [ebp+off],esi */
+				EMIT3(0x01, add_2reg(0x40, IA32_EBP, tmp[0]),
+				      STACK_VAR(dst));
+			}
+		}
+		break;
+	}
+	/* dst = dst - val */
+	case BPF_SUB: {
+		if (hi && is64) {
+			if (is_imm8(val)) {
+				/* sbb dword ptr [ebp+off],imm8 */
+				EMIT4(0x83, add_1reg(0x58, IA32_EBP),
+				      STACK_VAR(dst), val);
+			} else {
+				/* mov esi,imm32*/
+				EMIT2_off32(0xC7, add_1reg(0xC0, tmp[0]), val);
+				/* sbb dword ptr [ebp+off],esi */
+				EMIT3(0x19, add_2reg(0x40, IA32_EBP, tmp[0]),
+				      STACK_VAR(dst));
+			}
+		} else {
+			if (is_imm8(val)) {
+				/* sub dword ptr [ebp+off],imm8 */
+				EMIT4(0x83, add_1reg(0x68, IA32_EBP),
+				      STACK_VAR(dst), val);
+			} else {
+				/* mov esi,imm32*/
+				EMIT2_off32(0xC7, add_1reg(0xC0, tmp[0]), val);
+				/* sub dword ptr [ebp+off],esi */
+				EMIT3(0x29, add_2reg(0x40, IA32_EBP, tmp[0]),
+				      STACK_VAR(dst));
+			}
+		}
+		break;
+	}
+	/* dst = dst | val */
+	case BPF_OR: {
+		if (is_imm8(val)) {
+			/* or dword ptr [ebp+off],imm8 */
+			EMIT4(0x83, add_1reg(0x48, IA32_EBP), STACK_VAR(dst),
+			      val);
+		} else {
+			/* mov esi,imm32*/
+			EMIT2_off32(0xC7, add_1reg(0xC0, tmp[0]), val);
+			/* or dword ptr [ebp+off],esi */
+			EMIT3(0x09, add_2reg(0x40, IA32_EBP, tmp[0]),
+			      STACK_VAR(dst));
+		}
+		break;
+	}
+	/* dst = dst & val */
+	case BPF_AND: {
+		if (is_imm8(val)) {
+			/* and dword ptr [ebp+off],imm8 */
+			EMIT4(0x83, add_1reg(0x60, IA32_EBP), STACK_VAR(dst),
+			      val);
+		} else {
+			/* mov esi,imm32*/
+			EMIT2_off32(0xC7, add_1reg(0xC0, tmp[0]), val);
+			/* and dword ptr [ebp+off],esi */
+			EMIT3(0x21, add_2reg(0x40, IA32_EBP, tmp[0]),
+			      STACK_VAR(dst));
+		}
+		break;
+	}
+	/* dst = dst ^ val */
+	case BPF_XOR: {
+		if (is_imm8(val)) {
+			/* xor dword ptr [ebp+off],imm8 */
+			EMIT4(0x83, add_1reg(0x70, IA32_EBP),
+			      STACK_VAR(dst), val);
+		} else {
+			/* mov esi,imm32*/
+			EMIT2_off32(0xC7, add_1reg(0xC0, tmp[0]), val);
+			/* xor dword ptr [ebp+off],esi */
+			EMIT3(0x31, add_2reg(0x40, IA32_EBP, tmp[0]),
+			      STACK_VAR(dst));
+		}
+		break;
+	}
+	/* dst = dst * val */
+	case BPF_MUL: {
+		/* shadows the outer tmp: tmp[0] is eax inside this case */
+		const u8 *tmp = bpf2ia32[TMP_REG_2];
+
+		/* mov eax,val */
+		EMIT2_off32(0xC7, add_1reg(0xC0, tmp[0]), val);
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(dst));
+		/* mov dword ptr [ebp+off],eax */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(dst));
+		break;
+	}
+	/* dst = dst << val */
+	case BPF_LSH: {
+		if (is_imm8(val)) {
+			/* shl dword ptr [ebp+off],imm8 */
+			EMIT4(0xC1, add_1reg(0x60, IA32_EBP), STACK_VAR(dst),
+			      val);
+		} else {
+			/* The D3 shift forms take their count from cl, so
+			 * the immediate has to be loaded into ecx.
+			 */
+			/* mov ecx,imm32*/
+			EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), val);
+			/* shl dword ptr [ebp+off],cl */
+			EMIT3(0xD3, add_1reg(0x60, IA32_EBP), STACK_VAR(dst));
+		}
+		break;
+	}
+	/* dst = dst >> val */
+	case BPF_RSH: {
+		if (is_imm8(val)) {
+			/* shr dword ptr [ebp+off],imm8 */
+			EMIT4(0xC1, add_1reg(0x68, IA32_EBP), STACK_VAR(dst),
+			      val);
+		} else {
+			/* mov ecx,imm32*/
+			EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), val);
+			/* shr dword ptr [ebp+off],cl */
+			EMIT3(0xD3, add_1reg(0x68, IA32_EBP), STACK_VAR(dst));
+		}
+		break;
+	}
+	/* dst = dst >> val (signed)*/
+	case BPF_ARSH:
+		if (is_imm8(val)) {
+			/* sar dword ptr [ebp+off],imm8 */
+			EMIT4(0xC1, add_1reg(0x78, IA32_EBP), STACK_VAR(dst),
+			      val);
+		} else {
+			/* mov ecx,imm32*/
+			EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), val);
+			/* sar dword ptr [ebp+off],cl */
+			EMIT3(0xD3, add_1reg(0x78, IA32_EBP), STACK_VAR(dst));
+		}
+		break;
+	/* dst = -dst */
+	case BPF_NEG:
+		/* xor esi,esi */
+		EMIT2(0x31, add_2reg(0xC0, tmp[0], tmp[0]));
+		/* sub esi,dword ptr [ebp+off] */
+		EMIT3(0x2B, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(dst));
+		/* mov dword ptr [ebp+off],esi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(dst));
+		break;
+	}
+
+	*pprog = prog;
+}
+
+/* ALU operation (64 bit): dst = dst (op) val
+ * For is64 the immediate is sign extended: the high half is combined with
+ * all ones when bit 31 of val is set, with zero otherwise. For !is64 the
+ * high half of dst is cleared. The sign-bit test uses 1U << 31 because
+ * 1 << 31 overflows a signed int.
+ */
+static inline void emit_ia32_alu_i64(const bool is64, const u8 op,
+				     const u8 dst[], const u32 val,
+				     u8 **pprog)
+{
+	u8 *prog = *pprog;
+	u32 hi = 0;
+
+	if (is64 && (val & (1U << 31)))
+		hi = (u32)~0;
+
+	emit_ia32_alu_i(is64, false, op, dst_lo, val, &prog);
+	if (is64)
+		emit_ia32_alu_i(is64, true, op, dst_hi, hi, &prog);
+	else
+		emit_ia32_mov_i(dst_hi, 0, &prog);
+
+	*pprog = prog;
+}
+
+/* dst = -dst (64 bit)
+ *
+ * Two's complement negation done in place on the scratch slots:
+ * -(hi:lo) = (-(hi + borrow)):(-lo), where borrow is 1 iff lo != 0.
+ * The previous sequence applied sbb with esi still holding -lo and then
+ * stored esi over the high word, leaving hi == -lo.
+ */
+static inline void emit_ia32_neg64(const u8 dst[], u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+
+	/* neg dword ptr [ebp+off] -- sets CF iff the low word was non-zero */
+	EMIT3(0xF7, add_1reg(0x58, IA32_EBP), STACK_VAR(dst_lo));
+	/* adc dword ptr [ebp+off],0 -- fold the borrow into the high word */
+	EMIT4(0x83, add_1reg(0x50, IA32_EBP), STACK_VAR(dst_hi), 0);
+	/* neg dword ptr [ebp+off] */
+	EMIT3(0xF7, add_1reg(0x58, IA32_EBP), STACK_VAR(dst_hi));
+
+	*pprog = prog;
+}
+
+/* dst = dst << src
+ *
+ * The count is taken from the low word of src and dispatched at run time:
+ *   count < 32        : shld/shl pair on hi:lo
+ *   32 <= count < 64  : hi = lo << (count - 32), lo = 0
+ *   count >= 64       : hi = lo = 0
+ *
+ * The < 32 case uses shld so that a count of 0 leaves dst untouched; the
+ * earlier "shr edi,(32 - count)" form had its count masked to 0 by the CPU
+ * and OR-ed the whole low word into the high word.
+ *
+ * The jump targets are function-static offsets recorded the first time
+ * the sequence is emitted and reused by every later emission.
+ */
+static inline void emit_ia32_lsh_r64(const u8 dst[], const u8 src[], u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	const u8 *tmp = bpf2ia32[TMP_REG_1];
+	static int jmp_label1 = -1;
+	static int jmp_label2 = -1;
+	static int jmp_label3 = -1;
+
+	/* mov ecx,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src_lo));
+
+	/* cmp ecx,32 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32);
+	/* jumps when >= 32 */
+	if (is_imm8(jmp_label(jmp_label1, 2)))
+		EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
+	else
+		EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label1, 6));
+
+	/* < 32 */
+	/* mov edi,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[1]), STACK_VAR(dst_lo));
+	/* shld dword ptr [ebp+off],edi,cl */
+	EMIT4(0x0F, 0xA5, add_2reg(0x40, IA32_EBP, tmp[1]),
+	      STACK_VAR(dst_hi));
+	/* shl dword ptr [ebp+off],cl */
+	EMIT3(0xD3, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_lo));
+	if (is_imm8(jmp_label(jmp_label3, 2)))
+		EMIT2(0xEB, jmp_label(jmp_label3, 2));
+	else
+		EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+	/* >= 32 */
+	if (jmp_label1 == -1)
+		jmp_label1 = cnt;
+	/* cmp ecx,64 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64);
+	/* jumps when >= 64 */
+	if (is_imm8(jmp_label(jmp_label2, 2)))
+		EMIT2(IA32_JAE, jmp_label(jmp_label2, 2));
+	else
+		EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label2, 6));
+
+	/* >= 32 && < 64 */
+	/* sub ecx,32 */
+	EMIT3(0x83, add_1reg(0xE8, IA32_ECX), 32);
+	/* mov esi,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(dst_lo));
+	/* shl esi,cl */
+	EMIT2(0xD3, add_1reg(0xE0, tmp[0]));
+	/* mov dword ptr [ebp+off],esi */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(dst_hi));
+
+	/* mov dword ptr [ebp+off],0 */
+	EMIT3(0xC7, add_1reg(0x40, IA32_EBP), STACK_VAR(dst_lo));
+	EMIT(0x0, 4);
+	if (is_imm8(jmp_label(jmp_label3, 2)))
+		EMIT2(0xEB, jmp_label(jmp_label3, 2));
+	else
+		EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+	/* >= 64 */
+	if (jmp_label2 == -1)
+		jmp_label2 = cnt;
+	/* mov dword ptr [ebp+off],0 (both halves) */
+	EMIT3(0xC7, add_1reg(0x40, IA32_EBP), STACK_VAR(dst_hi));
+	EMIT(0x0, 4);
+	EMIT3(0xC7, add_1reg(0x40, IA32_EBP), STACK_VAR(dst_lo));
+	EMIT(0x0, 4);
+
+	if (jmp_label3 == -1)
+		jmp_label3 = cnt;
+
+	*pprog = prog;
+}
+
+/* dst = dst >> src (signed)
+ *
+ * The count is taken from the low word of src and dispatched at run time:
+ *   count < 32        : shrd/sar pair on hi:lo
+ *   32 <= count < 64  : lo = hi >> (count - 32) (signed), hi = sign fill
+ *   count >= 64       : hi = lo = sign fill
+ *
+ * The < 32 case previously never merged the bits shifted out of the high
+ * word into the low word (the "or" was missing after its comment), and a
+ * count of 0 would have been mishandled by the 32 - count shift. shrd
+ * covers both.
+ *
+ * The jump targets are function-static offsets recorded the first time
+ * the sequence is emitted and reused by every later emission.
+ */
+static inline void emit_ia32_arsh_r64(const u8 dst[], const u8 src[],
+				      u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	const u8 *tmp = bpf2ia32[TMP_REG_1];
+	static int jmp_label1 = -1;
+	static int jmp_label2 = -1;
+	static int jmp_label3 = -1;
+
+	/* mov ecx,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src_lo));
+
+	/* cmp ecx,32 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32);
+	/* jumps when >= 32 */
+	if (is_imm8(jmp_label(jmp_label1, 2)))
+		EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
+	else
+		EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label1, 6));
+
+	/* < 32 */
+	/* mov edi,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[1]), STACK_VAR(dst_hi));
+	/* shrd dword ptr [ebp+off],edi,cl */
+	EMIT4(0x0F, 0xAD, add_2reg(0x40, IA32_EBP, tmp[1]),
+	      STACK_VAR(dst_lo));
+	/* sar dword ptr [ebp+off],cl */
+	EMIT3(0xD3, add_1reg(0x78, IA32_EBP), STACK_VAR(dst_hi));
+	if (is_imm8(jmp_label(jmp_label3, 2)))
+		EMIT2(0xEB, jmp_label(jmp_label3, 2));
+	else
+		EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+	/* >= 32 */
+	if (jmp_label1 == -1)
+		jmp_label1 = cnt;
+	/* cmp ecx,64 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64);
+	/* jumps when >= 64 */
+	if (is_imm8(jmp_label(jmp_label2, 2)))
+		EMIT2(IA32_JAE, jmp_label(jmp_label2, 2));
+	else
+		EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label2, 6));
+
+	/* >= 32 && < 64 */
+	/* sub ecx,32 */
+	EMIT3(0x83, add_1reg(0xE8, IA32_ECX), 32);
+	/* mov esi,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(dst_hi));
+	/* sar esi,cl */
+	EMIT2(0xD3, add_1reg(0xF8, tmp[0]));
+	/* mov dword ptr [ebp+off],esi */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(dst_lo));
+
+	/* sar dword ptr [ebp+off],31 */
+	EMIT3(0xC1, add_1reg(0x78, IA32_EBP), STACK_VAR(dst_hi));
+	EMIT(31, 1);
+	if (is_imm8(jmp_label(jmp_label3, 2)))
+		EMIT2(0xEB, jmp_label(jmp_label3, 2));
+	else
+		EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+	/* >= 64 */
+	if (jmp_label2 == -1)
+		jmp_label2 = cnt;
+	/* mov esi,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(dst_hi));
+	/* sar esi,31 */
+	EMIT3(0xC1, add_1reg(0xF8, tmp[0]), 31);
+	/* mov dword ptr [ebp+off],esi */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(dst_hi));
+	/* mov dword ptr [ebp+off],esi */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(dst_lo));
+
+	if (jmp_label3 == -1)
+		jmp_label3 = cnt;
+
+	*pprog = prog;
+}
+
+/* dst = dst >> src
+ *
+ * The count is taken from the low word of src and dispatched at run time:
+ *   count < 32        : shrd/shr pair on hi:lo
+ *   32 <= count < 64  : lo = hi >> (count - 32), hi = 0
+ *   count >= 64       : hi = lo = 0
+ *
+ * The < 32 case uses shrd so that a count of 0 leaves dst untouched; the
+ * earlier "shl edi,(32 - count)" form had its count masked to 0 by the CPU
+ * and OR-ed the whole high word into the low word.
+ *
+ * The jump targets are function-static offsets recorded the first time
+ * the sequence is emitted and reused by every later emission.
+ */
+static inline void emit_ia32_rsh_r64(const u8 dst[], const u8 src[], u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	const u8 *tmp = bpf2ia32[TMP_REG_1];
+	static int jmp_label1 = -1;
+	static int jmp_label2 = -1;
+	static int jmp_label3 = -1;
+
+	/* mov ecx,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src_lo));
+
+	/* cmp ecx,32 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32);
+	/* jumps when >= 32 */
+	if (is_imm8(jmp_label(jmp_label1, 2)))
+		EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
+	else
+		EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label1, 6));
+
+	/* < 32 */
+	/* mov edi,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[1]), STACK_VAR(dst_hi));
+	/* shrd dword ptr [ebp+off],edi,cl */
+	EMIT4(0x0F, 0xAD, add_2reg(0x40, IA32_EBP, tmp[1]),
+	      STACK_VAR(dst_lo));
+	/* shr dword ptr [ebp+off],cl */
+	EMIT3(0xD3, add_1reg(0x68, IA32_EBP), STACK_VAR(dst_hi));
+	if (is_imm8(jmp_label(jmp_label3, 2)))
+		EMIT2(0xEB, jmp_label(jmp_label3, 2));
+	else
+		EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+	/* >= 32 */
+	if (jmp_label1 == -1)
+		jmp_label1 = cnt;
+	/* cmp ecx,64 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64);
+	/* jumps when >= 64 */
+	if (is_imm8(jmp_label(jmp_label2, 2)))
+		EMIT2(IA32_JAE, jmp_label(jmp_label2, 2));
+	else
+		EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label2, 6));
+
+	/* >= 32 && < 64 */
+	/* sub ecx,32 */
+	EMIT3(0x83, add_1reg(0xE8, IA32_ECX), 32);
+	/* mov esi,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(dst_hi));
+	/* shr esi,cl */
+	EMIT2(0xD3, add_1reg(0xE8, tmp[0]));
+	/* mov dword ptr [ebp+off],esi */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(dst_lo));
+
+	/* mov dword ptr [ebp+off],0 */
+	EMIT3(0xC7, add_1reg(0x40, IA32_EBP), STACK_VAR(dst_hi));
+	EMIT(0x0, 4);
+	if (is_imm8(jmp_label(jmp_label3, 2)))
+		EMIT2(0xEB, jmp_label(jmp_label3, 2));
+	else
+		EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+	/* >= 64 */
+	if (jmp_label2 == -1)
+		jmp_label2 = cnt;
+	/* mov dword ptr [ebp+off],0 (both halves) */
+	EMIT3(0xC7, add_1reg(0x40, IA32_EBP), STACK_VAR(dst_hi));
+	EMIT(0x0, 4);
+	EMIT3(0xC7, add_1reg(0x40, IA32_EBP), STACK_VAR(dst_lo));
+	EMIT(0x0, 4);
+
+	if (jmp_label3 == -1)
+		jmp_label3 = cnt;
+
+	*pprog = prog;
+}
+
+/* dst = dst << val
+ *
+ * The code is selected at JIT time from the constant count:
+ *   val < 32        : shld/shl pair on hi:lo
+ *   32 <= val < 64  : hi = lo << (val - 32), lo = 0
+ *   val >= 64       : hi = lo = 0
+ *
+ * shld is used for the first case so that val == 0 leaves dst untouched;
+ * the earlier "shr edi,cl" with cl = 32 - val was masked to a shift by 0
+ * for val == 0 and OR-ed the whole low word into the high word.
+ */
+static inline void emit_ia32_lsh_i64(const u8 dst[], const u32 val, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	const u8 *tmp = bpf2ia32[TMP_REG_1];
+
+	/* Do LSH operation */
+	if (val < 32) {
+		/* mov edi,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[1]),
+		      STACK_VAR(dst_lo));
+		/* shld dword ptr [ebp+off],edi,imm8 */
+		EMIT4(0x0F, 0xA4, add_2reg(0x40, IA32_EBP, tmp[1]),
+		      STACK_VAR(dst_hi));
+		EMIT(val, 1);
+		/* shl dword ptr [ebp+off],imm8 */
+		EMIT3(0xC1, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_lo));
+		EMIT(val, 1);
+	} else if (val >= 32 && val < 64) {
+		u32 value = val - 32;
+
+		/* mov esi,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]),
+		      STACK_VAR(dst_lo));
+		/* shl esi,imm8 */
+		EMIT3(0xC1, add_1reg(0xE0, tmp[0]), value);
+		/* mov dword ptr [ebp+off],esi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp[0]),
+		      STACK_VAR(dst_hi));
+		/* mov dword ptr [ebp+off],0 */
+		EMIT3(0xC7, add_1reg(0x40, IA32_EBP), STACK_VAR(dst_lo));
+		EMIT(0x0, 4);
+	} else {
+		/* mov dword ptr [ebp+off],0 (both halves) */
+		EMIT3(0xC7, add_1reg(0x40, IA32_EBP), STACK_VAR(dst_hi));
+		EMIT(0x0, 4);
+		EMIT3(0xC7, add_1reg(0x40, IA32_EBP), STACK_VAR(dst_lo));
+		EMIT(0x0, 4);
+	}
+
+	*pprog = prog;
+}
+
+/* dst = dst >> val
+ *
+ * The code is selected at JIT time from the constant count:
+ *   val < 32        : shrd/shr pair on hi:lo
+ *   32 <= val < 64  : lo = hi >> (val - 32), hi = 0
+ *   val >= 64       : hi = lo = 0
+ *
+ * shrd is used for the first case so that val == 0 leaves dst untouched;
+ * the earlier "shl edi,cl" with cl = 32 - val was masked to a shift by 0
+ * for val == 0 and OR-ed the whole high word into the low word.
+ */
+static inline void emit_ia32_rsh_i64(const u8 dst[], const u32 val, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	const u8 *tmp = bpf2ia32[TMP_REG_1];
+
+	/* Do RSH operation */
+	if (val < 32) {
+		/* mov edi,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[1]),
+		      STACK_VAR(dst_hi));
+		/* shrd dword ptr [ebp+off],edi,imm8 */
+		EMIT4(0x0F, 0xAC, add_2reg(0x40, IA32_EBP, tmp[1]),
+		      STACK_VAR(dst_lo));
+		EMIT(val, 1);
+		/* shr dword ptr [ebp+off],imm8 */
+		EMIT3(0xC1, add_1reg(0x68, IA32_EBP), STACK_VAR(dst_hi));
+		EMIT(val, 1);
+	} else if (val >= 32 && val < 64) {
+		u32 value = val - 32;
+
+		/* mov esi,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]),
+		      STACK_VAR(dst_hi));
+		/* shr esi,imm8 */
+		EMIT3(0xC1, add_1reg(0xE8, tmp[0]), value);
+		/* mov dword ptr [ebp+off],esi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp[0]),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],0 */
+		EMIT3(0xC7, add_1reg(0x40, IA32_EBP), STACK_VAR(dst_hi));
+		EMIT(0x0, 4);
+	} else {
+		/* mov dword ptr [ebp+off],0 (both halves) */
+		EMIT3(0xC7, add_1reg(0x40, IA32_EBP), STACK_VAR(dst_hi));
+		EMIT(0x0, 4);
+		EMIT3(0xC7, add_1reg(0x40, IA32_EBP), STACK_VAR(dst_lo));
+		EMIT(0x0, 4);
+	}
+
+	*pprog = prog;
+}
+
+/* dst = dst >> val (signed)
+ *
+ * The code is selected at JIT time from the constant count:
+ *   val < 32        : shrd/sar pair on hi:lo
+ *   32 <= val < 64  : lo = hi >> (val - 32) (signed), hi = sign fill
+ *   val >= 64       : hi = lo = sign fill
+ *
+ * shrd is used for the first case so that val == 0 leaves dst untouched;
+ * the earlier "shl edi,cl" with cl = 32 - val was masked to a shift by 0
+ * for val == 0 and OR-ed the whole high word into the low word.
+ */
+static inline void emit_ia32_arsh_i64(const u8 dst[], const u32 val, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	const u8 *tmp = bpf2ia32[TMP_REG_1];
+
+	/* Do ARSH operation */
+	if (val < 32) {
+		/* mov edi,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[1]),
+		      STACK_VAR(dst_hi));
+		/* shrd dword ptr [ebp+off],edi,imm8 */
+		EMIT4(0x0F, 0xAC, add_2reg(0x40, IA32_EBP, tmp[1]),
+		      STACK_VAR(dst_lo));
+		EMIT(val, 1);
+		/* sar dword ptr [ebp+off],imm8 */
+		EMIT3(0xC1, add_1reg(0x78, IA32_EBP), STACK_VAR(dst_hi));
+		EMIT(val, 1);
+	} else if (val >= 32 && val < 64) {
+		u32 value = val - 32;
+
+		/* mov esi,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]),
+		      STACK_VAR(dst_hi));
+		/* sar esi,imm8 */
+		EMIT3(0xC1, add_1reg(0xF8, tmp[0]), value);
+		/* mov dword ptr [ebp+off],esi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp[0]),
+		      STACK_VAR(dst_lo));
+
+		/* sar dword ptr [ebp+off],31 */
+		EMIT3(0xC1, add_1reg(0x78, IA32_EBP), STACK_VAR(dst_hi));
+		EMIT(31, 1);
+	} else {
+		/* mov esi,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]),
+		      STACK_VAR(dst_hi));
+		/* sar esi,31 */
+		EMIT3(0xC1, add_1reg(0xF8, tmp[0]), 31);
+		/* mov dword ptr [ebp+off],esi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp[0]),
+		      STACK_VAR(dst_hi));
+		/* mov dword ptr [ebp+off],esi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp[0]),
+		      STACK_VAR(dst_lo));
+	}
+
+	*pprog = prog;
+}
+
+/* dst = dst * src (64 bit)
+ *
+ * Built from three 32x32 multiplies. The low words of the two cross
+ * products (dst_hi * src_lo and dst_lo * src_hi) are summed in esi; the
+ * final dst_lo * src_lo leaves its low word in eax and its high word in
+ * edx, which is added to esi to form the new high word. eax and edx are
+ * overwritten by every mul, hence the copies into esi/edi in between.
+ */
+static inline void emit_ia32_mul_r64(const u8 dst[], const u8 src[], u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	const u8 *tmp = bpf2ia32[TMP_REG_1];
+	const u8 *tmp2 = bpf2ia32[TMP_REG_2];
+
+	/* mov eax,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp2[0]), STACK_VAR(dst_hi));
+	/* mul dword ptr [ebp+off] */
+	EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(src_lo));
+	/* mov esi,eax */
+	EMIT2(0x89, add_2reg(0xC0, tmp[0], tmp2[0]));
+
+	/* mov eax,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp2[0]), STACK_VAR(dst_lo));
+	/* mul dword ptr [ebp+off] */
+	EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(src_hi));
+	/* mov edi,eax */
+	EMIT2(0x89, add_2reg(0xC0, tmp[1], tmp2[0]));
+
+	/* add esi,edi */
+	EMIT2(0x01, add_2reg(0xC0, tmp[0], tmp[1]));
+
+	/* mov eax,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp2[0]), STACK_VAR(dst_lo));
+	/* mul dword ptr [ebp+off] */
+	EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(src_lo));
+
+	/* add esi,edx */
+	EMIT2(0x01, add_2reg(0xC0, tmp[0], tmp2[1]));
+
+	/* mov dword ptr [ebp+off],eax */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp2[0]), STACK_VAR(dst_lo));
+
+	/* mov dword ptr [ebp+off],esi */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(dst_hi));
+
+	*pprog = prog;
+}
+
+/* dst = dst * (s64)(s32)val (64 bit)
+ *
+ * Same three-multiply scheme as the register variant, with the immediate
+ * sign extended to 64 bit: 'hi' is all ones when bit 31 of val is set and
+ * zero otherwise. The sign-bit test uses 1U << 31 because 1 << 31
+ * overflows a signed int.
+ */
+static inline void emit_ia32_mul_i64(const u8 dst[], const u32 val, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	const u8 *tmp = bpf2ia32[TMP_REG_1];
+	const u8 *tmp2 = bpf2ia32[TMP_REG_2];
+	u32 hi;
+
+	hi = (val & (1U << 31)) ? (u32)~0 : 0;
+	/* movl eax,imm32 */
+	EMIT2_off32(0xC7, add_1reg(0xC0, tmp2[0]), val);
+	/* mul dword ptr [ebp+off] */
+	EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_hi));
+	/* mov esi,eax */
+	EMIT2(0x89, add_2reg(0xC0, tmp[0], tmp2[0]));
+
+	/* movl eax,imm32 */
+	EMIT2_off32(0xC7, add_1reg(0xC0, tmp2[0]), hi);
+	/* mul dword ptr [ebp+off] */
+	EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_lo));
+	/* mov edi,eax */
+	EMIT2(0x89, add_2reg(0xC0, tmp[1], tmp2[0]));
+
+	/* add esi,edi */
+	EMIT2(0x01, add_2reg(0xC0, tmp[0], tmp[1]));
+
+	/* movl eax,imm32 */
+	EMIT2_off32(0xC7, add_1reg(0xC0, tmp2[0]), val);
+	/* mul dword ptr [ebp+off] */
+	EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_lo));
+
+	/* add esi,edx */
+	EMIT2(0x01, add_2reg(0xC0, tmp[0], tmp2[1]));
+
+	/* mov dword ptr [ebp+off],eax */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp2[0]), STACK_VAR(dst_lo));
+
+	/* mov dword ptr [ebp+off],esi */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(dst_hi));
+
+	*pprog = prog;
+}
+
+/* Return the width in bytes of the immediate operand that a store of the
+ * given BPF size code takes, or 0 for an unrecognised size code.
+ */
+static int bpf_size_to_x86_bytes(int bpf_size)
+{
+	switch (bpf_size) {
+	case BPF_B:
+		return 1;
+	case BPF_H:
+		return 2;
+	case BPF_W:
+	case BPF_DW:
+		/* a BPF_DW store is still emitted with a 32-bit immediate */
+		return 4;
+	default:
+		return 0;
+	}
+}
+
+/* Select the packet-load helper variant for a constant offset K:
+ *   K >= 0               -> func##_positive_offset
+ *   SKF_LL_OFF <= K < 0  -> func##_negative_offset
+ *   K < SKF_LL_OFF       -> func (the generic helper)
+ * K is parenthesised so that an expression argument is cast as a whole.
+ */
+#define CHOOSE_LOAD_FUNC(K, func) \
+	((int)(K) < 0 ? ((int)(K) >= SKF_LL_OFF ? func##_negative_offset : func) : \
+	 func##_positive_offset)
+
+/* State carried across JIT passes: do_jit() records where the epilogue was
+ * emitted so that later BPF_EXIT instructions can jump to it.
+ */
+struct jit_context {
+	int cleanup_addr; /* epilogue code offset */
+};
+
+/* Maximum number of bytes emitted while JITing one eBPF insn; do_jit()
+ * rejects any instruction whose encoding exceeds it.
+ */
+#define BPF_MAX_INSN_SIZE	128
+/* extra slack added to do_jit()'s per-instruction scratch buffer */
+#define BPF_INSN_SAFETY		64
+
+/* Exact byte length of the code emitted by emit_prologue() (checked there
+ * with BUILD_BUG_ON); a tail call jumps this many bytes past bpf_func.
+ */
+#define PROLOGUE_SIZE 35
+
+/* Emit prologue code for a BPF program and check its size.
+ * bpf_tail_call helper will skip it while jumping into another program,
+ * so the emitted length must be exactly PROLOGUE_SIZE bytes.
+ *
+ * The prologue saves ebp/edi/esi/ebx, reserves STACK_SIZE bytes, rebases
+ * ebp, then initialises the stack slots of BPF_REG_FP, BPF_REG_1 (context
+ * pointer arriving in eax, high word zero) and the tail-call counter.
+ *
+ * @pprog:       in/out cursor into the instruction buffer
+ * @stack_depth: not used by this function
+ */
+static void emit_prologue(u8 **pprog, u32 stack_depth)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	const u8 *r1 = bpf2ia32[BPF_REG_1];
+	const u8 *tmp = bpf2ia32[TMP_REG_1];
+	const u8 *tmp2 = bpf2ia32[TMP_REG_2];
+	const u8 fplo = bpf2ia32[BPF_REG_FP][0];
+	const u8 fphi = bpf2ia32[BPF_REG_FP][1];
+	const u8 *tcc = bpf2ia32[TCALL_CNT];
+
+	/* push ebp */
+	EMIT1(0x55);
+	/* mov ebp,esp */
+	EMIT2(0x89, 0xE5);
+	/* push edi */
+	EMIT1(0x57);
+	/* push esi */
+	EMIT1(0x56);
+	/* push ebx */
+	EMIT1(0x53);
+
+	/* sub esp,STACK_SIZE */
+	EMIT2_off32(0x81, 0xEC, STACK_SIZE);
+	/* sub ebp,SCRATCH_SIZE+4+12 (undone by the add in emit_epilogue) */
+	EMIT3(0x83, add_1reg(0xE8, IA32_EBP), SCRATCH_SIZE + 16);
+	/* xor esi,esi: esi supplies the zero stored into the high words below */
+	EMIT2(0x31, add_2reg(0xC0, tmp[0], tmp[0]));
+
+	/* Set up BPF prog stack base register */
+	/* mov dword ptr [ebp+off],ebp */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBP), STACK_VAR(fplo));
+	/* mov dword ptr [ebp+off],esi */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(fphi));
+
+	/* Move BPF_CTX (EAX) to BPF_REG_R1 */
+	/* mov dword ptr [ebp+off],eax */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp2[0]), STACK_VAR(r1[0]));
+	/* mov dword ptr [ebp+off],esi */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(r1[1]));
+
+	/* Initialize Tail Count: both words are set from esi (zero) */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(tcc[0]));
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(tcc[1]));
+
+	/* tail calls jump PROLOGUE_SIZE bytes into the program, so the
+	 * byte count emitted above has to match the constant exactly
+	 */
+	BUILD_BUG_ON(cnt != PROLOGUE_SIZE);
+	*pprog = prog;
+}
+
+/* Emit epilogue code for BPF program.
+ *
+ * Loads the 64-bit BPF_REG_0 stack slots into edx:eax, undoes the ebp
+ * adjustment made by the prologue, reloads the callee-saved registers
+ * pushed there (ebx, esi, edi) and returns.
+ *
+ * @pprog:       in/out cursor into the instruction buffer
+ * @stack_depth: not used by this function
+ */
+static void emit_epilogue(u8 **pprog, u32 stack_depth)
+{
+	u8 *prog = *pprog;
+	const u8 *r0 = bpf2ia32[BPF_REG_0];
+	int cnt = 0;
+
+	/* mov eax,dword ptr [ebp+off]*/
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r0[0]));
+	/* mov edx,dword ptr [ebp+off]*/
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(r0[1]));
+
+	/* add ebp,SCRATCH_SIZE+4+12 (reverses the sub in emit_prologue) */
+	EMIT3(0x83, add_1reg(0xC0, IA32_EBP), SCRATCH_SIZE + 16);
+
+	/* mov ebx,dword ptr [ebp-12]*/
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX), -12);
+	/* mov esi,dword ptr [ebp-8]*/
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ESI), -8);
+	/* mov edi,dword ptr [ebp-4]*/
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDI), -4);
+
+	EMIT1(0xC9); /* leave */
+	EMIT1(0xC3); /* ret */
+	*pprog = prog;
+}
+
+/* generate the following code:
+ * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
+ *   if (index >= array->map.max_entries)
+ *     goto out;
+ *   if (tail_call_cnt >= MAX_TAIL_CALL_CNT)
+ *     goto out;
+ *   tail_call_cnt++;
+ *   prog = array->ptrs[index];
+ *   if (prog == NULL)
+ *     goto out;
+ *   goto *(prog->bpf_func + prologue_size);
+ * out:
+ *
+ * Only the low 32 bits of the array pointer (r2) and of the index (r3) are
+ * loaded.  The offset of "out" is remembered in the function-static
+ * jmp_label1 the first time this function runs and is reused by every later
+ * call; how jmp_label() behaves while it is still -1 is defined outside
+ * this function.
+ */
+static void emit_bpf_tail_call(u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	const u8 *r1 = bpf2ia32[BPF_REG_1];
+	const u8 *r2 = bpf2ia32[BPF_REG_2];
+	const u8 *r3 = bpf2ia32[BPF_REG_3];
+	const u8 *tmp = bpf2ia32[TMP_REG_1];
+	const u8 *tmp2 = bpf2ia32[TMP_REG_2];
+	const u8 *tcc = bpf2ia32[TCALL_CNT];
+	u32 lo, hi;
+	static int jmp_label1 = -1;
+
+	/* if (index >= array->map.max_entries)
+	 *   goto out;
+	 */
+
+	/* mov esi,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(r2[0]));
+	/* mov edi,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[1]), STACK_VAR(r3[0]));
+
+	/* cmp dword ptr [esi + offsetof(max_entries)], edi */
+	EMIT3(0x39, add_2reg(0x40, tmp[0], tmp[1]),
+	      offsetof(struct bpf_array, map.max_entries));
+	/* jbe out */
+	EMIT2(IA32_JBE, jmp_label(jmp_label1, 2));
+
+	/* if (tail_call_cnt >= MAX_TAIL_CALL_CNT)
+	 *   goto out;
+	 */
+	lo = (u32)MAX_TAIL_CALL_CNT;
+	hi = (u32)((u64)MAX_TAIL_CALL_CNT >> 32);
+	/* mov eax,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp2[0]), STACK_VAR(tcc[0]));
+	/* mov edx,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp2[1]), STACK_VAR(tcc[1]));
+
+	EMIT3(0x83, add_1reg(0xF8, tmp2[1]), hi);   /* cmp edx, hi */
+	EMIT2(IA32_JNE, 3);   /* high words differ: skip the 3-byte low cmp */
+	EMIT3(0x83, add_1reg(0xF8, tmp2[0]), lo);   /* cmp eax, lo */
+
+	EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));   /* jae out */
+
+	EMIT3(0x83, add_1reg(0xC0, tmp2[0]), 0x01);   /* add eax, 0x1 */
+	EMIT3(0x83, add_1reg(0xD0, tmp2[1]), 0x00);   /* adc edx, 0x0 */
+
+	/* mov dword ptr [ebp + off], eax */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp2[0]), STACK_VAR(tcc[0]));
+	/* mov dword ptr [ebp + off], edx */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp2[1]), STACK_VAR(tcc[1]));
+
+	/* prog = array->ptrs[index]; */
+	/* mov edx, [esi + edi * 4 + offsetof(...)]
+	 * (ModRM 0x94 / SIB 0xBE are hard-coded for edx, esi and edi)
+	 */
+	EMIT3_off32(0x8B, 0x94, 0xBE, offsetof(struct bpf_array, ptrs));
+
+	/* if (prog == NULL)
+	 *   goto out;
+	 */
+	EMIT2(0x85, add_2reg(0xC0, tmp2[1], tmp2[1])); /* test edx,edx */
+	EMIT2(IA32_JE, jmp_label(jmp_label1, 2)); /* je out */
+
+	/* goto *(prog->bpf_func + prologue_size); */
+	/* mov edx, dword ptr [edx + offsetof(struct bpf_prog, bpf_func)] */
+	EMIT3(0x8B, add_2reg(0x40, tmp2[1], tmp2[1]),
+	      offsetof(struct bpf_prog, bpf_func));
+	/* add edx, prologue_size */
+	EMIT3(0x83, add_1reg(0xC0, tmp2[1]), PROLOGUE_SIZE);
+
+	/* mov eax,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp2[0]), STACK_VAR(r1[0]));
+
+	/* now we're ready to jump into next BPF program
+	 * eax == ctx (1st arg)
+	 * edx == prog->bpf_func + prologue_size
+	 */
+	RETPOLINE_EDX_BPF_JIT();
+
+	/* remember where "out" is, first emission only */
+	if (jmp_label1 == -1)
+		jmp_label1 = cnt;
+
+	/* out: */
+	*pprog = prog;
+}
+
+/* Emit code that prepares the registers read by the sk_load_* helpers:
+ * the skb pointer is taken from the low word of BPF_REG_6, and skb->data
+ * and the linear header length are loaded from it.
+ */
+static void emit_load_skb_data_hlen(u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	const u8 *reg6 = bpf2ia32[BPF_REG_6];
+	const u8 *tmp = bpf2ia32[TMP_REG_1];
+	const u8 *tmp2 = bpf2ia32[TMP_REG_2];
+
+	/*
+	 * eax : skb pointer
+	 * esi : copy of skb->data
+	 * edi : hlen = skb->len - skb->data_len
+	 */
+
+	/* mov eax,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp2[0]), STACK_VAR(reg6[0]));
+
+	/* mov %edi, dword ptr [eax+off] */
+	EMIT2_off32(0x8B, add_2reg(0x80, tmp2[0], tmp[1]),
+		    offsetof(struct sk_buff, len));
+
+	/* sub %edi, dword ptr [eax+off] */
+	EMIT2_off32(0x2B, add_2reg(0x80, tmp2[0], tmp[1]),
+		    offsetof(struct sk_buff, data_len));
+
+	/* mov %esi, dword ptr [eax+off] */
+	EMIT2_off32(0x8B, add_2reg(0x80, tmp2[0], tmp[0]),
+		    offsetof(struct sk_buff, data));
+
+	*pprog = prog;
+}
+
+/* Push the 64-bit BPF register held in stack slots src[] onto the machine
+ * stack, high word first, so the low word ends up at the lower address.
+ * Each half goes through tmp[0]; the push opcode 0x56 is hard-coded as
+ * "push esi", which assumes tmp[0] is esi (as the load comments state).
+ */
+static inline void emit_push_r64(const u8 src[], u8 **pprog)
+{
+	const u8 *tmp = bpf2ia32[TMP_REG_1];
+	u8 *prog = *pprog;
+	int cnt = 0;
+
+	/* mov esi,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(src_hi));
+	/* push esi */
+	EMIT1(0x56);
+
+	/* mov esi,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]), STACK_VAR(src_lo));
+	/* push esi */
+	EMIT1(0x56);
+
+	*pprog = prog;
+}
+
+/* Translate every instruction of bpf_prog into IA-32 machine code.
+ *
+ * Each BPF instruction is first emitted into the on-stack buffer temp[];
+ * when image is non-NULL the bytes are then copied to image + proglen.
+ * addrs[i] is updated to the end offset of instruction i, and jump/call
+ * displacements are computed from addrs[].  The prologue is emitted into
+ * temp[] ahead of the first instruction and is therefore accounted to it.
+ *
+ * Register names in the per-instruction comments assume tmp = esi:edi and
+ * tmp2 = eax:edx.
+ *
+ * @bpf_prog:   program to translate
+ * @addrs:      per-instruction end offsets (read and updated)
+ * @image:      destination buffer, or NULL for a sizing-only pass
+ * @oldproglen: size of image; exceeding it is a fatal error
+ * @ctx:        carries the epilogue offset between passes
+ *
+ * Returns the total code length in bytes, or a negative errno: -EINVAL for
+ * an unknown opcode, an out-of-range shift immediate or an unreachable
+ * helper; -EFAULT for opcodes not implemented yet, jump displacements that
+ * do not fit, an oversized instruction, or code that outgrows oldproglen.
+ */
+static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
+		  int oldproglen, struct jit_context *ctx)
+{
+	struct bpf_insn *insn = bpf_prog->insnsi;
+	int insn_cnt = bpf_prog->len;
+	bool seen_exit = false;
+	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
+	int i, cnt = 0;
+	int proglen = 0;
+	u8 *prog = temp;
+
+	emit_prologue(&prog, bpf_prog->aux->stack_depth);
+
+	for (i = 0; i < insn_cnt; i++, insn++) {
+		const s32 imm32 = insn->imm;
+		const bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
+		const u8 code = insn->code;
+		const u8 *dst = bpf2ia32[insn->dst_reg];
+		const u8 *src = bpf2ia32[insn->src_reg];
+		const u8 *tmp = bpf2ia32[TMP_REG_1];
+		const u8 *tmp2 = bpf2ia32[TMP_REG_2];
+		const u8 *r0 = bpf2ia32[BPF_REG_0];
+		s64 jmp_offset;
+		u8 jmp_cond;
+		int ilen;
+		u8 *func;
+
+		switch (code) {
+		/* ALU operations */
+		/* dst = src */
+		case BPF_ALU | BPF_MOV | BPF_K:
+		case BPF_ALU | BPF_MOV | BPF_X:
+		case BPF_ALU64 | BPF_MOV | BPF_K:
+		case BPF_ALU64 | BPF_MOV | BPF_X:
+			switch (BPF_SRC(code)) {
+			case BPF_X:
+				emit_ia32_mov_r64(is64, dst, src, &prog);
+				break;
+			case BPF_K:
+				/* Sign-extend immediate value to dst reg */
+				emit_ia32_mov_i64(is64, dst, imm32, &prog);
+				break;
+			}
+			break;
+		/* dst = dst + src/imm */
+		/* dst = dst - src/imm */
+		/* dst = dst | src/imm */
+		/* dst = dst & src/imm */
+		/* dst = dst ^ src/imm */
+		/* dst = dst * src/imm */
+		/* dst = dst << src */
+		/* dst = dst >> src */
+		case BPF_ALU | BPF_ADD | BPF_K:
+		case BPF_ALU | BPF_ADD | BPF_X:
+		case BPF_ALU | BPF_SUB | BPF_K:
+		case BPF_ALU | BPF_SUB | BPF_X:
+		case BPF_ALU | BPF_OR | BPF_K:
+		case BPF_ALU | BPF_OR | BPF_X:
+		case BPF_ALU | BPF_AND | BPF_K:
+		case BPF_ALU | BPF_AND | BPF_X:
+		case BPF_ALU | BPF_XOR | BPF_K:
+		case BPF_ALU | BPF_XOR | BPF_X:
+		case BPF_ALU | BPF_MUL | BPF_K:
+		case BPF_ALU | BPF_MUL | BPF_X:
+		case BPF_ALU | BPF_LSH | BPF_X:
+		case BPF_ALU | BPF_RSH | BPF_X:
+		case BPF_ALU | BPF_ARSH | BPF_K:
+		case BPF_ALU | BPF_ARSH | BPF_X:
+		case BPF_ALU64 | BPF_ADD | BPF_K:
+		case BPF_ALU64 | BPF_ADD | BPF_X:
+		case BPF_ALU64 | BPF_SUB | BPF_K:
+		case BPF_ALU64 | BPF_SUB | BPF_X:
+		case BPF_ALU64 | BPF_OR | BPF_K:
+		case BPF_ALU64 | BPF_OR | BPF_X:
+		case BPF_ALU64 | BPF_AND | BPF_K:
+		case BPF_ALU64 | BPF_AND | BPF_X:
+		case BPF_ALU64 | BPF_XOR | BPF_K:
+		case BPF_ALU64 | BPF_XOR | BPF_X:
+			switch (BPF_SRC(code)) {
+			case BPF_X:
+				emit_ia32_alu_r64(is64, BPF_OP(code), dst,
+						  src, &prog);
+				break;
+			case BPF_K:
+				emit_ia32_alu_i64(is64, BPF_OP(code), dst,
+						  imm32, &prog);
+				break;
+			}
+			break;
+		/* dst = dst / src(imm) */
+		/* dst = dst % src(imm) */
+		case BPF_ALU | BPF_DIV | BPF_K:
+		case BPF_ALU | BPF_DIV | BPF_X:
+		case BPF_ALU | BPF_MOD | BPF_K:
+		case BPF_ALU | BPF_MOD | BPF_X:
+			if (BPF_SRC(code) == BPF_X)
+				/* mov esi,dword ptr [ebp+off] */
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]),
+				      STACK_VAR(src_lo));
+			else
+				/* mov esi,imm32*/
+				EMIT2_off32(0xC7, add_1reg(0xC0, tmp[0]),
+					    imm32);
+
+			/* mov eax,dword ptr [ebp+off] */
+			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp2[0]),
+			      STACK_VAR(dst_lo));
+			/* xor edx,edx: clear the upper half of the dividend */
+			EMIT2(0x31, add_2reg(0xC0, tmp2[1], tmp2[1]));
+
+			/* div esi */
+			EMIT2(0xF7, 0xF6);
+
+			if (BPF_OP(code) == BPF_MOD)
+				/* mov dword ptr [ebp+off],edx (remainder) */
+				EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp2[1]),
+				      STACK_VAR(dst_lo));
+			else
+				/* mov dword ptr [ebp+off],eax (quotient) */
+				EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp2[0]),
+				      STACK_VAR(dst_lo));
+
+			/* mov dword ptr [ebp+off],0: zero the high word */
+			EMIT3(0xC7, add_1reg(0x40, IA32_EBP),
+			      STACK_VAR(dst_hi));
+			EMIT(0x0, 4);
+			break;
+		case BPF_ALU64 | BPF_DIV | BPF_K:
+		case BPF_ALU64 | BPF_DIV | BPF_X:
+		case BPF_ALU64 | BPF_MOD | BPF_K:
+		case BPF_ALU64 | BPF_MOD | BPF_X:
+			goto notyet;
+		/* dst = dst >> imm */
+		/* dst = dst << imm */
+		case BPF_ALU | BPF_RSH | BPF_K:
+		case BPF_ALU | BPF_LSH | BPF_K:
+			if (unlikely(imm32 > 31))
+				return -EINVAL;
+			if (imm32)
+				emit_ia32_alu_i(false, false, BPF_OP(code),
+						dst_lo, imm32, &prog);
+			emit_ia32_mov_i(dst_hi, 0, &prog);
+			break;
+		/* dst = dst << imm */
+		case BPF_ALU64 | BPF_LSH | BPF_K:
+			if (unlikely(imm32 > 63))
+				return -EINVAL;
+			emit_ia32_lsh_i64(dst, imm32, &prog);
+			break;
+		/* dst = dst >> imm */
+		case BPF_ALU64 | BPF_RSH | BPF_K:
+			if (unlikely(imm32 > 63))
+				return -EINVAL;
+			emit_ia32_rsh_i64(dst, imm32, &prog);
+			break;
+		/* dst = dst << src */
+		case BPF_ALU64 | BPF_LSH | BPF_X:
+			emit_ia32_lsh_r64(dst, src, &prog);
+			break;
+		/* dst = dst >> src */
+		case BPF_ALU64 | BPF_RSH | BPF_X:
+			emit_ia32_rsh_r64(dst, src, &prog);
+			break;
+		/* dst = dst >> src (signed) */
+		case BPF_ALU64 | BPF_ARSH | BPF_X:
+			emit_ia32_arsh_r64(dst, src, &prog);
+			break;
+		/* dst = dst >> imm (signed) */
+		case BPF_ALU64 | BPF_ARSH | BPF_K:
+			if (unlikely(imm32 > 63))
+				return -EINVAL;
+			emit_ia32_arsh_i64(dst, imm32, &prog);
+			break;
+		/* dst = -dst */
+		case BPF_ALU | BPF_NEG:
+			emit_ia32_alu_i(is64, false, BPF_OP(code),
+					dst_lo, 0, &prog);
+			emit_ia32_mov_i(dst_hi, 0, &prog);
+			break;
+		/* dst = -dst (64 bit) */
+		case BPF_ALU64 | BPF_NEG:
+			emit_ia32_neg64(dst, &prog);
+			break;
+		/* dst = dst * src/imm */
+		case BPF_ALU64 | BPF_MUL | BPF_X:
+		case BPF_ALU64 | BPF_MUL | BPF_K:
+			switch (BPF_SRC(code)) {
+			case BPF_X:
+				emit_ia32_mul_r64(dst, src, &prog);
+				break;
+			case BPF_K:
+				emit_ia32_mul_i64(dst, imm32, &prog);
+				break;
+			}
+			break;
+		/* dst = htole(dst) */
+		case BPF_ALU | BPF_END | BPF_FROM_LE:
+			switch (imm32) {
+			case 16:
+				/* emit 'movzx esi,si' to zero extend 16-bit
+				 * into 64 bit
+				 */
+				/* mov esi,dword ptr [ebp+off] */
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]),
+				      STACK_VAR(dst_lo));
+				EMIT2(0x0F, 0xB7);
+				EMIT1(add_2reg(0xC0, tmp[0], tmp[0]));
+
+				/* mov dword ptr [ebp+off],esi */
+				EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp[0]),
+				      STACK_VAR(dst_lo));
+				/* mov dword ptr [ebp+off],0 */
+				EMIT3(0xC7, add_1reg(0x40, IA32_EBP),
+				      STACK_VAR(dst_hi));
+				EMIT(0x0, 4);
+				break;
+			case 32:
+				/* mov dword ptr [ebp+off],0 */
+				EMIT3(0xC7, add_1reg(0x40, IA32_EBP),
+				      STACK_VAR(dst_hi));
+				EMIT(0x0, 4);
+				break;
+			case 64:
+				/* nop */
+				break;
+			}
+			break;
+		/* dst = htobe(dst) */
+		case BPF_ALU | BPF_END | BPF_FROM_BE:
+			switch (imm32) {
+			case 16:
+				/* mov esi,dword ptr [ebp+off] */
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]),
+				      STACK_VAR(dst_lo));
+
+				/* emit 'ror %si, 8' to swap lower 2 bytes */
+				EMIT1(0x66);
+				EMIT3(0xC1, add_1reg(0xC8, tmp[0]), 8);
+
+				/* movzx esi,si */
+				EMIT2(0x0F, 0xB7);
+				EMIT1(add_2reg(0xC0, tmp[0], tmp[0]));
+				/* mov dword ptr [ebp+off],esi */
+				EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp[0]),
+				      STACK_VAR(dst_lo));
+
+				/* mov dword ptr [ebp+off],0 */
+				EMIT3(0xC7, add_1reg(0x40, IA32_EBP),
+				      STACK_VAR(dst_hi));
+				EMIT(0x0, 4);
+				break;
+			case 32:
+				/* mov esi,dword ptr [ebp+off] */
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]),
+				      STACK_VAR(dst_lo));
+
+				/* emit 'bswap esi' to swap lower 4 bytes */
+				EMIT1(0x0F);
+				EMIT1(add_1reg(0xC8, tmp[0]));
+
+				/* mov dword ptr [ebp+off],esi */
+				EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp[0]),
+				      STACK_VAR(dst_lo));
+
+				/* mov dword ptr [ebp+off],0 */
+				EMIT3(0xC7, add_1reg(0x40, IA32_EBP),
+				      STACK_VAR(dst_hi));
+				EMIT(0x0, 4);
+				break;
+			case 64:
+				/* mov esi,dword ptr [ebp+off] */
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]),
+				      STACK_VAR(dst_lo));
+				/* emit 'bswap esi' to swap the low word */
+				EMIT1(0x0F);
+				EMIT1(add_1reg(0xC8, tmp[0]));
+
+				/* mov edi,dword ptr [ebp+off] */
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[1]),
+				      STACK_VAR(dst_hi));
+				/* emit 'bswap edi' to swap the high word */
+				EMIT1(0x0F);
+				EMIT1(add_1reg(0xC8, tmp[1]));
+
+				/* store the swapped words in exchanged slots */
+				EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp[1]),
+				      STACK_VAR(dst_lo));
+				EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp[0]),
+				      STACK_VAR(dst_hi));
+				break;
+			}
+			break;
+		/* dst = imm64 */
+		case BPF_LD | BPF_IMM | BPF_DW:
+			EMIT3(0xC7, add_1reg(0x40, IA32_EBP),
+			      STACK_VAR(dst_lo));
+			EMIT(insn[0].imm, 4);
+
+			EMIT3(0xC7, add_1reg(0x40, IA32_EBP),
+			      STACK_VAR(dst_hi));
+			EMIT(insn[1].imm, 4);
+
+			/* the 64-bit immediate occupies two BPF insns */
+			insn++;
+			i++;
+			break;
+		/* ST: *(u8*)(dst_reg + off) = imm */
+		case BPF_ST | BPF_MEM | BPF_B:
+			/* mov esi,dword ptr [ebp+off] */
+			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]),
+			      STACK_VAR(dst_lo));
+			EMIT1(0xC6);
+			goto st;
+		case BPF_ST | BPF_MEM | BPF_H:
+			/* mov esi,dword ptr [ebp+off] */
+			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]),
+			      STACK_VAR(dst_lo));
+			EMIT2(0x66, 0xC7);
+			goto st;
+		case BPF_ST | BPF_MEM | BPF_W:
+		case BPF_ST | BPF_MEM | BPF_DW:
+			/* mov esi,dword ptr [ebp+off] */
+			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]),
+			      STACK_VAR(dst_lo));
+			EMIT1(0xC7);
+
+st:
+			if (is_imm8(insn->off))
+				EMIT2(add_1reg(0x40, tmp[0]), insn->off);
+			else
+				EMIT1_off32(add_1reg(0x80, tmp[0]), insn->off);
+			EMIT(imm32, bpf_size_to_x86_bytes(BPF_SIZE(code)));
+
+			if (BPF_SIZE(code) == BPF_DW) {
+				u32 hi;
+
+				/* high word of the sign-extended immediate */
+				hi = imm32 & (1<<31) ? (u32)~0 : 0;
+				EMIT2_off32(0xC7, add_1reg(0x80, tmp[0]),
+					    insn->off + 4);
+				EMIT(hi, 4);
+			}
+			break;
+
+		/* STX: *(u8*)(dst_reg + off) = src_reg */
+		case BPF_STX | BPF_MEM | BPF_B:
+			/* mov eax,dword ptr [ebp+off] */
+			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp2[0]),
+			      STACK_VAR(dst_lo));
+			/* mov edx,dword ptr [ebp+off] */
+			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp2[1]),
+			      STACK_VAR(src_lo));
+			/* emit 'mov byte ptr [eax + off], dl' */
+			EMIT1(0x88);
+			goto stx;
+		case BPF_STX | BPF_MEM | BPF_H:
+			/* mov eax,dword ptr [ebp+off] */
+			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp2[0]),
+			      STACK_VAR(dst_lo));
+			/* mov edx,dword ptr [ebp+off] */
+			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp2[1]),
+			      STACK_VAR(src_lo));
+			EMIT2(0x66, 0x89);
+			goto stx;
+		case BPF_STX | BPF_MEM | BPF_W:
+		case BPF_STX | BPF_MEM | BPF_DW:
+			/* mov eax,dword ptr [ebp+off] */
+			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp2[0]),
+			      STACK_VAR(dst_lo));
+			/* mov edx,dword ptr [ebp+off] */
+			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp2[1]),
+			      STACK_VAR(src_lo));
+			EMIT1(0x89);
+
+stx:
+			if (is_imm8(insn->off))
+				EMIT2(add_2reg(0x40, tmp2[0], tmp2[1]),
+				      insn->off);
+			else
+				EMIT1_off32(add_2reg(0x80, tmp2[0], tmp2[1]),
+					    insn->off);
+
+			if (BPF_SIZE(code) == BPF_DW) {
+				/* mov edx,dword ptr [ebp+off] */
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp2[1]),
+				      STACK_VAR(src_hi));
+				EMIT1(0x89);
+
+				if (is_imm8(insn->off + 4)) {
+					EMIT2(add_2reg(0x40, tmp2[0], tmp2[1]),
+					      insn->off + 4);
+				} else {
+					EMIT1(add_2reg(0x80, tmp2[0], tmp2[1]));
+					EMIT(insn->off + 4, 4);
+				}
+			}
+			break;
+
+		/* LDX: dst_reg = *(u8*)(src_reg + off) */
+		case BPF_LDX | BPF_MEM | BPF_B:
+			/* mov esi,dword ptr [ebp+off] */
+			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]),
+			      STACK_VAR(src_lo));
+			/* emit 'movzx edi, byte ptr [esi+off]' */
+			EMIT2(0x0F, 0xB6);
+			goto ldx;
+		case BPF_LDX | BPF_MEM | BPF_H:
+			/* mov esi,dword ptr [ebp+off] */
+			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]),
+			      STACK_VAR(src_lo));
+			/* emit 'movzx edi, word ptr [esi+off]' */
+			EMIT2(0x0F, 0xB7);
+			goto ldx;
+		case BPF_LDX | BPF_MEM | BPF_W:
+		case BPF_LDX | BPF_MEM | BPF_DW:
+			/* mov esi,dword ptr [ebp+off] */
+			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]),
+			      STACK_VAR(src_lo));
+			/* emit 'mov edi, dword ptr [esi+off]' */
+			EMIT1(0x8B);
+ldx:
+
+			if (is_imm8(insn->off))
+				EMIT2(add_2reg(0x40, tmp[0], tmp[1]),
+				      insn->off);
+			else
+				EMIT1_off32(add_2reg(0x80, tmp[0], tmp[1]),
+					    insn->off);
+
+			/* mov dword ptr [ebp+off],edi */
+			EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp[1]),
+			      STACK_VAR(dst_lo));
+			switch (BPF_SIZE(code)) {
+			case BPF_B:
+			case BPF_H:
+			case BPF_W:
+				/* mov dword ptr [ebp+off],0 */
+				EMIT3(0xC7, add_1reg(0x40, IA32_EBP),
+				      STACK_VAR(dst_hi));
+				EMIT(0x0, 4);
+				break;
+			case BPF_DW:
+				/* mov edi,dword ptr [esi+off+4] */
+				EMIT2_off32(0x8B,
+					    add_2reg(0x80, tmp[0], tmp[1]),
+					    insn->off + 4);
+				/* mov dword ptr [ebp+off],edi */
+				EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp[1]),
+				      STACK_VAR(dst_hi));
+				break;
+			default:
+				break;
+			}
+			break;
+		/* call */
+		case BPF_JMP | BPF_CALL:
+		{
+			const u8 *r1 = bpf2ia32[BPF_REG_1];
+			const u8 *r2 = bpf2ia32[BPF_REG_2];
+			const u8 *r3 = bpf2ia32[BPF_REG_3];
+			const u8 *r4 = bpf2ia32[BPF_REG_4];
+			const u8 *r5 = bpf2ia32[BPF_REG_5];
+
+			if (insn->src_reg == BPF_PSEUDO_CALL)
+				goto notyet;
+
+			func = (u8 *) __bpf_call_base + imm32;
+			jmp_offset = func - (image + addrs[i]);
+
+			if (!imm32 || !is_simm32(jmp_offset)) {
+				pr_err("unsupported bpf func %d addr %p image %p\n",
+				       imm32, func, image);
+				return -EINVAL;
+			}
+
+			/* mov eax,dword ptr [ebp+off] */
+			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp2[0]),
+			      STACK_VAR(r1[0]));
+			/* mov edx,dword ptr [ebp+off] */
+			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp2[1]),
+			      STACK_VAR(r1[1]));
+
+			emit_push_r64(r5, &prog);
+			emit_push_r64(r4, &prog);
+			emit_push_r64(r3, &prog);
+			emit_push_r64(r2, &prog);
+
+			/* addrs[i] is the end of this insn; the call is
+			 * followed by 9 more bytes (two 3-byte stores and
+			 * the 3-byte stack adjustment), hence the + 9
+			 */
+			EMIT1_off32(0xE8, jmp_offset + 9);
+
+			/* mov dword ptr [ebp+off],eax */
+			EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp2[0]),
+			      STACK_VAR(r0[0]));
+			/* mov dword ptr [ebp+off],edx */
+			EMIT3(0x89, add_2reg(0x40, IA32_EBP, tmp2[1]),
+			      STACK_VAR(r0[1]));
+
+			/* add esp,32: drop the four 64-bit arguments */
+			EMIT3(0x83, add_1reg(0xC0, IA32_ESP), 32);
+			break;
+		}
+		case BPF_JMP | BPF_TAIL_CALL:
+			emit_bpf_tail_call(&prog);
+			break;
+
+		/* cond jump */
+		/* NOTE(review): the high words are compared first and the
+		 * low words only when they are equal, but the same jmp_cond
+		 * is applied to whichever compare ran last.  For the signed
+		 * conditions this evaluates the low words as signed values,
+		 * which looks wrong when they differ in bit 31 - confirm.
+		 */
+		case BPF_JMP | BPF_JEQ | BPF_X:
+		case BPF_JMP | BPF_JNE | BPF_X:
+		case BPF_JMP | BPF_JGT | BPF_X:
+		case BPF_JMP | BPF_JLT | BPF_X:
+		case BPF_JMP | BPF_JGE | BPF_X:
+		case BPF_JMP | BPF_JLE | BPF_X:
+		case BPF_JMP | BPF_JSGT | BPF_X:
+		case BPF_JMP | BPF_JSLE | BPF_X:
+		case BPF_JMP | BPF_JSLT | BPF_X:
+		case BPF_JMP | BPF_JSGE | BPF_X:
+			/* mov esi,dword ptr [ebp+off] */
+			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]),
+			      STACK_VAR(src_hi));
+			/* cmp dword ptr [ebp+off], esi */
+			EMIT3(0x39, add_2reg(0x40, IA32_EBP, tmp[0]),
+			      STACK_VAR(dst_hi));
+
+			/* high words differ: skip the 6-byte low compare */
+			EMIT2(IA32_JNE, 6);
+			/* mov esi,dword ptr [ebp+off] */
+			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]),
+			      STACK_VAR(src_lo));
+			/* cmp dword ptr [ebp+off], esi */
+			EMIT3(0x39, add_2reg(0x40, IA32_EBP, tmp[0]),
+			      STACK_VAR(dst_lo));
+			goto emit_cond_jmp;
+
+		case BPF_JMP | BPF_JSET | BPF_X:
+			/* mov esi,dword ptr [ebp+off] */
+			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[0]),
+			      STACK_VAR(dst_lo));
+			/* and esi,dword ptr [ebp+off]*/
+			EMIT3(0x23, add_2reg(0x40, IA32_EBP, tmp[0]),
+			      STACK_VAR(src_lo));
+
+			/* mov edi,dword ptr [ebp+off] */
+			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, tmp[1]),
+			      STACK_VAR(dst_hi));
+			/* and edi,dword ptr [ebp+off] */
+			EMIT3(0x23, add_2reg(0x40, IA32_EBP, tmp[1]),
+			      STACK_VAR(src_hi));
+			/* or esi,edi */
+			EMIT2(0x09, add_2reg(0xC0, tmp[0], tmp[1]));
+			goto emit_cond_jmp;
+
+		case BPF_JMP | BPF_JSET | BPF_K: {
+			u32 hi;
+
+			hi = imm32 & (1<<31) ? (u32)~0 : 0;
+			/* mov esi,imm32 */
+			EMIT2_off32(0xC7, add_1reg(0xC0, tmp[0]), imm32);
+			/* and esi,dword ptr [ebp+off]*/
+			EMIT3(0x23, add_2reg(0x40, IA32_EBP, tmp[0]),
+			      STACK_VAR(dst_lo));
+
+			/* mov edi,imm32 */
+			EMIT2_off32(0xC7, add_1reg(0xC0, tmp[1]), hi);
+			/* and edi,dword ptr [ebp+off] */
+			EMIT3(0x23, add_2reg(0x40, IA32_EBP, tmp[1]),
+			      STACK_VAR(dst_hi));
+			/* or esi,edi */
+			EMIT2(0x09, add_2reg(0xC0, tmp[0], tmp[1]));
+			goto emit_cond_jmp;
+		}
+		case BPF_JMP | BPF_JEQ | BPF_K:
+		case BPF_JMP | BPF_JNE | BPF_K:
+		case BPF_JMP | BPF_JGT | BPF_K:
+		case BPF_JMP | BPF_JLT | BPF_K:
+		case BPF_JMP | BPF_JGE | BPF_K:
+		case BPF_JMP | BPF_JLE | BPF_K:
+		case BPF_JMP | BPF_JSGT | BPF_K:
+		case BPF_JMP | BPF_JSLE | BPF_K:
+		case BPF_JMP | BPF_JSLT | BPF_K:
+		case BPF_JMP | BPF_JSGE | BPF_K: {
+			u32 hi;
+
+			hi = imm32 & (1<<31) ? (u32)~0 : 0;
+			/* mov esi,imm32 */
+			EMIT2_off32(0xC7, add_1reg(0xC0, tmp[0]), hi);
+			/* cmp dword ptr [ebp+off],esi */
+			EMIT3(0x39, add_2reg(0x40, IA32_EBP, tmp[0]),
+			      STACK_VAR(dst_hi));
+
+			/* High words differ: skip the whole low-word compare
+			 * below, i.e. 9 bytes (6-byte mov esi,imm32 plus
+			 * 3-byte cmp), so that the flags of the high-word
+			 * compare decide the branch.
+			 */
+			EMIT2(IA32_JNE, 9);
+			/* mov esi,imm32 */
+			EMIT2_off32(0xC7, add_1reg(0xC0, tmp[0]), imm32);
+			/* cmp dword ptr [ebp+off],esi */
+			EMIT3(0x39, add_2reg(0x40, IA32_EBP, tmp[0]),
+			      STACK_VAR(dst_lo));
+
+emit_cond_jmp:		/* convert BPF opcode to x86 */
+			switch (BPF_OP(code)) {
+			case BPF_JEQ:
+				jmp_cond = IA32_JE;
+				break;
+			case BPF_JSET:
+			case BPF_JNE:
+				jmp_cond = IA32_JNE;
+				break;
+			case BPF_JGT:
+				/* GT is unsigned '>', JA in x86 */
+				jmp_cond = IA32_JA;
+				break;
+			case BPF_JLT:
+				/* LT is unsigned '<', JB in x86 */
+				jmp_cond = IA32_JB;
+				break;
+			case BPF_JGE:
+				/* GE is unsigned '>=', JAE in x86 */
+				jmp_cond = IA32_JAE;
+				break;
+			case BPF_JLE:
+				/* LE is unsigned '<=', JBE in x86 */
+				jmp_cond = IA32_JBE;
+				break;
+			case BPF_JSGT:
+				/* signed '>', GT in x86 */
+				jmp_cond = IA32_JG;
+				break;
+			case BPF_JSLT:
+				/* signed '<', LT in x86 */
+				jmp_cond = IA32_JL;
+				break;
+			case BPF_JSGE:
+				/* signed '>=', GE in x86 */
+				jmp_cond = IA32_JGE;
+				break;
+			case BPF_JSLE:
+				/* signed '<=', LE in x86 */
+				jmp_cond = IA32_JLE;
+				break;
+			default: /* to silence gcc warning */
+				return -EFAULT;
+			}
+			jmp_offset = addrs[i + insn->off] - addrs[i];
+			if (is_imm8(jmp_offset)) {
+				EMIT2(jmp_cond, jmp_offset);
+			} else if (is_simm32(jmp_offset)) {
+				EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset);
+			} else {
+				pr_err("cond_jmp gen bug %llx\n", jmp_offset);
+				return -EFAULT;
+			}
+
+			break;
+		}
+		case BPF_JMP | BPF_JA:
+			jmp_offset = addrs[i + insn->off] - addrs[i];
+			if (!jmp_offset)
+				/* optimize out nop jumps */
+				break;
+emit_jmp:
+			if (is_imm8(jmp_offset)) {
+				EMIT2(0xEB, jmp_offset);
+			} else if (is_simm32(jmp_offset)) {
+				EMIT1_off32(0xE9, jmp_offset);
+			} else {
+				pr_err("jmp gen bug %llx\n", jmp_offset);
+				return -EFAULT;
+			}
+			break;
+
+		case BPF_LD | BPF_IND | BPF_W:
+			func = sk_load_word;
+			goto common_load;
+		case BPF_LD | BPF_ABS | BPF_W:
+			func = CHOOSE_LOAD_FUNC(imm32, sk_load_word);
+common_load:
+			jmp_offset = func - (image + addrs[i]);
+			if (!func || !is_simm32(jmp_offset)) {
+				pr_err("unsupported bpf func %d addr %p image %p\n",
+				       imm32, func, image);
+				return -EINVAL;
+			}
+			if (BPF_MODE(code) == BPF_ABS) {
+				/* mov %edx, imm32 */
+				EMIT1_off32(0xBA, imm32);
+			} else {
+				/* mov edx,dword ptr [ebp+off] */
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+				      STACK_VAR(src_lo));
+				if (imm32) {
+					if (is_imm8(imm32))
+						/* add %edx, imm8 */
+						EMIT3(0x83, 0xC2, imm32);
+					else
+						/* add %edx, imm32 */
+						EMIT2_off32(0x81, 0xC2, imm32);
+				}
+			}
+			emit_load_skb_data_hlen(&prog);
+			/* the call is followed by 10 more bytes of this insn
+			 * (3-byte store of eax, 7-byte store of zero)
+			 */
+			EMIT1_off32(0xE8, jmp_offset + 10); /* call */
+
+			/* mov dword ptr [ebp+off],eax */
+			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+			      STACK_VAR(r0[0]));
+			/* mov dword ptr [ebp+off],0 */
+			EMIT3(0xC7, add_1reg(0x40, IA32_EBP), STACK_VAR(r0[1]));
+			EMIT(0x0, 4);
+			break;
+
+		case BPF_LD | BPF_IND | BPF_H:
+			func = sk_load_half;
+			goto common_load;
+		case BPF_LD | BPF_ABS | BPF_H:
+			func = CHOOSE_LOAD_FUNC(imm32, sk_load_half);
+			goto common_load;
+		case BPF_LD | BPF_IND | BPF_B:
+			func = sk_load_byte;
+			goto common_load;
+		case BPF_LD | BPF_ABS | BPF_B:
+			func = CHOOSE_LOAD_FUNC(imm32, sk_load_byte);
+			goto common_load;
+		/* STX XADD: lock *(u32 *)(dst + off) += src */
+		case BPF_STX | BPF_XADD | BPF_W:
+		/* STX XADD: lock *(u64 *)(dst + off) += src */
+		case BPF_STX | BPF_XADD | BPF_DW:
+			goto notyet;
+		case BPF_JMP | BPF_EXIT:
+			if (seen_exit) {
+				/* later exits jump to the one epilogue */
+				jmp_offset = ctx->cleanup_addr - addrs[i];
+				goto emit_jmp;
+			}
+			seen_exit = true;
+			/* update cleanup_addr */
+			ctx->cleanup_addr = proglen;
+			emit_epilogue(&prog, bpf_prog->aux->stack_depth);
+			break;
+notyet:
+			pr_info_once("*** NOT YET: opcode %02x ***\n", code);
+			return -EFAULT;
+		default:
+			/* This error will be seen if new instruction was added
+			 * to interpreter, but not to JIT
+			 * or if there is junk in bpf_prog
+			 */
+			pr_err("bpf_jit: unknown opcode %02x\n", code);
+			return -EINVAL;
+		}
+
+		ilen = prog - temp;
+		if (ilen > BPF_MAX_INSN_SIZE) {
+			pr_err("bpf_jit: fatal insn size error\n");
+			return -EFAULT;
+		}
+
+		if (image) {
+			if (unlikely(proglen + ilen > oldproglen)) {
+				pr_err("bpf_jit: fatal error\n");
+				return -EFAULT;
+			}
+			memcpy(image + proglen, temp, ilen);
+		}
+		proglen += ilen;
+		addrs[i] = proglen;
+		prog = temp;
+	}
+	return proglen;
+}
+
+/* JIT state that bpf_int_jit_compile() stashes in prog->aux->jit_data for
+ * a prog->is_func program, so that a later call (the "extra pass") can
+ * resume with the same image instead of starting over.
+ */
+struct ia32_jit_data {
+	struct bpf_binary_header *header; /* allocation backing image */
+	int *addrs;			  /* per-insn end offsets */
+	u8 *image;			  /* start of the generated code */
+	int proglen;			  /* length of the generated code */
+	struct jit_context ctx;		  /* saved do_jit() context */
+};
+
+/* JIT-compile a BPF program to IA-32 machine code.
+ *
+ * do_jit() is run repeatedly with image == NULL until the program length
+ * stops changing, then once more to write the final bytes into a freshly
+ * allocated image.  For prog->is_func programs the state is saved in
+ * prog->aux->jit_data so that a second call performs only that final pass.
+ *
+ * Returns the program to run: the JITed program on success (possibly the
+ * constant-blinded copy), or the original @prog, left for the interpreter,
+ * whenever JITing is not requested or fails.
+ */
+struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
+{
+	struct bpf_binary_header *header = NULL;
+	struct bpf_prog *tmp, *orig_prog = prog;
+	struct ia32_jit_data *jit_data;
+	int proglen, oldproglen = 0;
+	struct jit_context ctx = {};
+	bool tmp_blinded = false;
+	bool extra_pass = false;
+	u8 *image = NULL;
+	int *addrs;
+	int pass;
+	int i;
+
+	if (!prog->jit_requested)
+		return orig_prog;
+
+	tmp = bpf_jit_blind_constants(prog);
+	/* If blinding was requested and we failed during blinding,
+	 * we must fall back to the interpreter.
+	 */
+	if (IS_ERR(tmp))
+		return orig_prog;
+	if (tmp != prog) {
+		tmp_blinded = true;
+		prog = tmp;
+	}
+
+	jit_data = prog->aux->jit_data;
+	if (!jit_data) {
+		jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL);
+		if (!jit_data) {
+			prog = orig_prog;
+			goto out;
+		}
+		prog->aux->jit_data = jit_data;
+	}
+	addrs = jit_data->addrs;
+	if (addrs) {
+		/* state saved by an earlier call: resume with its image */
+		ctx = jit_data->ctx;
+		oldproglen = jit_data->proglen;
+		image = jit_data->image;
+		header = jit_data->header;
+		extra_pass = true;
+		goto skip_init_addrs;
+	}
+	/* kmalloc_array() fails cleanly if len * size would overflow */
+	addrs = kmalloc_array(prog->len, sizeof(*addrs), GFP_KERNEL);
+	if (!addrs) {
+		prog = orig_prog;
+		goto out_addrs;
+	}
+
+	/* Before first pass, make a rough estimation of addrs[]
+	 * each bpf instruction is translated to less than 64 bytes
+	 */
+	for (proglen = 0, i = 0; i < prog->len; i++) {
+		proglen += 64;
+		addrs[i] = proglen;
+	}
+	ctx.cleanup_addr = proglen;
+skip_init_addrs:
+
+	/* JITed image shrinks with every pass and the loop iterates
+	 * until the image stops shrinking. Very large bpf programs
+	 * may converge on the last pass. In such case do one more
+	 * pass to emit the final image
+	 */
+	for (pass = 0; pass < 20 || image; pass++) {
+		proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
+		if (proglen <= 0) {
+out_image:
+			/* drop the image on every failure path so that the
+			 * binary header is never leaked
+			 */
+			image = NULL;
+			if (header)
+				bpf_jit_binary_free(header);
+			prog = orig_prog;
+			goto out_addrs;
+		}
+		if (image) {
+			if (proglen != oldproglen) {
+				pr_err("bpf_jit: proglen=%d != oldproglen=%d\n",
+				       proglen, oldproglen);
+				goto out_image;
+			}
+			break;
+		}
+		if (proglen == oldproglen) {
+			/* length converged: allocate, then emit for real */
+			header = bpf_jit_binary_alloc(proglen, &image,
+						      1, jit_fill_hole);
+			if (!header) {
+				prog = orig_prog;
+				goto out_addrs;
+			}
+		}
+		oldproglen = proglen;
+		cond_resched();
+	}
+
+	if (bpf_jit_enable > 1)
+		bpf_jit_dump(prog->len, proglen, pass + 1, image);
+
+	if (image) {
+		if (!prog->is_func || extra_pass) {
+			bpf_jit_binary_lock_ro(header);
+		} else {
+			/* keep the state for the extra pass */
+			jit_data->addrs = addrs;
+			jit_data->ctx = ctx;
+			jit_data->proglen = proglen;
+			jit_data->image = image;
+			jit_data->header = header;
+		}
+		prog->bpf_func = (void *)image;
+		prog->jited = 1;
+		prog->jited_len = proglen;
+	} else {
+		prog = orig_prog;
+	}
+
+	if (!prog->is_func || extra_pass) {
+out_addrs:
+		kfree(addrs);
+		kfree(jit_data);
+		prog->aux->jit_data = NULL;
+	}
+out:
+	if (tmp_blinded)
+		bpf_jit_prog_release_other(prog, prog == orig_prog ?
+					   tmp : orig_prog);
+	return prog;
+}
-- 
1.8.5.6.2.g3d8a54e.dirty

^ permalink raw reply related

* Re: tcp hang when socket fills up ?
From: Dominique Martinet @ 2018-04-18  9:36 UTC (permalink / raw)
  To: Jozsef Kadlecsik, Florian Westphal, Michal Kubecek, netdev,
	Marcelo Ricardo Leitner, Eric Dumazet
In-Reply-To: <20180418083054.GA29358@nautica>

Dominique Martinet wrote on Wed, Apr 18, 2018:
> Jozsef Kadlecsik wrote on Wed, Apr 18, 2018:
> > Yes, the state transition is wrong for simultaneous open, because the 
> > tcp_conntracks table is not (cannot be) smart enough. Could you verify the 
> > next untested patch?
> 
> Thanks for the patch; I'll give it a try (probably won't make it today
> so will report tomorrow)

Actually had time; I can confirm (added printks) we did get in that if
that was pointed at, and we no longer get there now.
The connection no longer gets in invalid state, so that looks like it
nailed it.

I'm now confused what this has to do with tcp_timestamp though, since
setting that off also seemed to work around the issue, but if we get
something like that in I'll be happy anyway.


Big props to everyone involved, I would have taken much longer alone,
-- 
Dominique Martinet | Asmadeus

^ permalink raw reply

* RE: [PATCH net-next 3/3] net: phy: Enable C45 PHYs with vendor specific address space
From: Vicenţiu Galanopulo @ 2018-04-18  9:38 UTC (permalink / raw)
  To: Florian Fainelli, andrew@lunn.ch, robh@kernel.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	mark.rutland@arm.com, davem@davemloft.net, marcel@holtmann.org,
	devicetree@vger.kernel.org
  Cc: Alexandru Marginean, Madalin-cristian Bucur
In-Reply-To: <b62735bd-d411-2d21-16a2-f6cf7e5d7e4d@gmail.com>



> > Having dev-addr stored in devices_addrs, in get_phy_c45_ids(), when
> > probing the identifiers, dev-addr can be extracted from devices_addrs
> > and probed if devices_addrs[current_identifier] is not 0.
> 
> I must clearly be missing something, but why are you introducing all these
> conditionals instead of updating the existing code to be able to operate against
> an arbitrary dev-addr value, and then just making sure the first thing you do is
> fetch that property from Device Tree? There is no way someone is going to be
> testing with your specific use case in the future (except yourselves) so unless you
> make supporting an arbitrary "dev-addr" value become part of how the code
> works, this is going to be breaking badly.
>

Hi Florian,

My intention was to have this patch as "plugin" and modify the existing kernel API little to none. I was thinking that with a #ifdef, ideally,  all changes could be part of a CONFIG kernel option.
Updating the existing code, instead of the conditionals, might run into just that, and the change could propagate across multiple modules. This is from my first RFC patch, review by Andrew:
        of_mdiobus_register(), when it loops over the children, looks for 
        the new property. If found, it passed dev-id to of_mdiobus_register_phy().
        That passes it to get_phy_device(). I think get_phy_device() can 
        then set the ID in c45_ids, before passing it to get_phy_id().
        get_phy_c45_ids() will first look at devices in package and can add 
        further devices to c45_ids. It will then probe both those found, and 
        the static
        one you added.
                                                                                              Andrew

        [Vicenţiu Galanopulo]
        Just to make sure I understand. Do you want me to change the signature 
        of all of_mdiobus_register_phy(), get_phy_device(), get_phy_id() and
        get_phy_c45_ids() and include the dev_addr parameter obtained from the 
        device tree?  (a propagation of this parameter across all functions 
        all the way to
        get_phy_c45_devs_in_pkg?) This will impact xgbe-mdio.c, fixed_phy.c 
        because get_phy_device() is used in these files. 

 
 The "catch" is to transport the dev-addr value from of_mdio.c (location of the  loop of the PHY device tree node which reads all PHY node properties) to phy_device.c (this is where you can get the PHY ID).
My understanding from Andrew's comment is that the key here is the c45_ids, and that these could be filled in of_mdio.c, first, with the IDs from dev-addr (he called them "static" as they are queried directly by using the value of dev-addr) and afterwards, in phy_device.c (following the lookup loop - in a "dynamic" way).
There's nothing more to this patch than some functionality from phy_device.c ported to of_mdio.c, to enable the extraction of the PHY IDs. 
I guess the code redundancy could be reduced (between of_mdio.c and phy_device.c) and maybe you or Andrew could comment on this if you would like to go with this patch approach.

Not sure I understand your comment about the specific use case and the breaking badly part.  
Right now I'm able to test because I have access to a PHY with dev-addr = 0x1e. 
But the whole mechanism in this patch starts to work the moment you set <dev-addr> in the device tree. If you don't set that, nothing happens. If you set it to a bogus value, no PHY ID will be found at that address. Besides that, the PHY ID extraction code is the same as what is currently working in phy_device.c. 
80-90% of the patch is based on what already exists in phy_device.c and of_mdio.c. Where is the breaking badly part supposed to happen? 

> And please, can you keep me copied for next submissions?
Yes, the "to" list was pretty long and I somehow missed you. Sorry.
 
Vicentiu

^ permalink raw reply

* Re: [PATCH] bpf, x86_32: add eBPF JIT compiler for ia32 (x86_32)
From: Wang YanQing @ 2018-04-18  9:49 UTC (permalink / raw)
  To: daniel, ast, illusionist.neo, tglx, mingo, hpa, x86, netdev,
	linux-kernel
  Cc: davem
In-Reply-To: <20180418093118.GA4184@udknight>

On Wed, Apr 18, 2018 at 05:31:18PM +0800, Wang YanQing wrote:
> The JIT compiler emits ia32 bit instructions. Currently, It supports
> eBPF only. Classic BPF is supported because of the conversion by BPF core.
> 
> Almost all instructions from eBPF ISA supported except the following:
> BPF_ALU64 | BPF_DIV | BPF_K
> BPF_ALU64 | BPF_DIV | BPF_X
> BPF_ALU64 | BPF_MOD | BPF_K
> BPF_ALU64 | BPF_MOD | BPF_X
> BPF_STX | BPF_XADD | BPF_W
> BPF_STX | BPF_XADD | BPF_DW
> 
> It doesn't support BPF_JMP|BPF_CALL with BPF_PSEUDO_CALL too.
> 
> IA32 has few general purpose registers, EAX|EDX|ECX|EBX|ESI|EDI,
> and for these six registers, we can't treat all of them as real
> general purpose registers:
> MUL instructions need EAX:EDX, shift instructions need ECX, ESI|EDI
> for string manipulation instructions.
> 
> So I decide to use stack to emulate all eBPF 64 registers, this will
> simplify the implementation very much, because we don't need to face
> the flexible memory address modes on ia32, for example, we don't need
> to write below codes for one BPF_ADD instruction:
> if (src_reg is a register && dst_reg is a register)
> {
>    //one instruction encoding for ADD instruction
> } else if (only src is a register)
> {
>    //another different instruction encoding for ADD instruction
> } else if (only dst is a register)
> {
>    //another different instruction encoding for ADD instruction
> } else
> {
>    //src and dst are all on stack.
>    //another different instruction encoding for ADD instruction
> }
> 
> If you think above if-else-else-else isn't so painful, try to think
> it for BPF_ALU64|BPF_*SHIFT* instruction:)
> 
> Tested on my PC(Intel(R) Core(TM) i5-5200U CPU) and virtualbox.
> 
> Testing results on i5-5200U:
> 
> 1) test_bpf: Summary: 349 PASSED, 0 FAILED, [319/341 JIT'ed]
> 2) test_progs: Summary: 81 PASSED, 2 FAILED.
>    test_progs report "libbpf: incorrect bpf_call opcode" for
>    test_l4lb_noinline and test_xdp_noinline, because there is
>    no llvm-6.0 on my machine, and current implementation doesn't
>    support BPF_CALL, so I think we can ignore it.
> 3) test_lpm: OK
> 4) test_lru_map: OK
> 5) test_verifier: Summary: 823 PASSED, 5 FAILED
>    test_verifier report "invalid bpf_context access off=68 size=1/2/4/8"
>    for all the 5 FAILED testcases, and test_verifier report them when
>    we turn off the jit, so I think the jit can do nothing to fix them.
> 
> Above tests are all done with following flags enabled discretely:
> bpf_jit_enable=1 and bpf_jit_harden=2
> 
> Below are some numbers for this jit implementation:
> Note:
>   I run test_progs 100 times in loop for every testcase, the numbers
>   are in format: total/times=avg. The numbers that test_bpf report
>   almost show the same relation.
> 
> a:jit_enable=0 and jit_harden=0            b:jit_enable=1 and jit_harden=0
>   test_pkt_access:PASS:ipv4:15622/100=156  test_pkt_access:PASS:ipv4:10057/100=100
>   test_pkt_access:PASS:ipv6:9130/100=91    test_pkt_access:PASS:ipv6:5055/100=50
>   test_xdp:PASS:ipv4:240198/100=2401       test_xdp:PASS:ipv4:145945/100=1459
>   test_xdp:PASS:ipv6:137326/100=1373       test_xdp:PASS:ipv6:67337/100=673
>   test_l4lb:PASS:ipv4:61100/100=611        test_l4lb:PASS:ipv4:38137/100=381
>   test_l4lb:PASS:ipv6:101000/100=1010      test_l4lb:PASS:ipv6:57779/100=577
> 
> c:jit_enable=0 and jit_harden=2            d:jit_enable=1 and jit_harden=2
>   test_pkt_access:PASS:ipv4:15214/100=152  test_pkt_access:PASS:ipv4:12650/100=126
>   test_pkt_access:PASS:ipv6:9132/100=91    test_pkt_access:PASS:ipv6:7074/100=70
>   test_xdp:PASS:ipv4:237252/100=2372       test_xdp:PASS:ipv4:147211/100=1472
>   test_xdp:PASS:ipv6:135977/100=1359       test_xdp:PASS:ipv6:85783/100=857
>   test_l4lb:PASS:ipv4:61324/100=613        test_l4lb:PASS:ipv4:53222/100=532
>   test_l4lb:PASS:ipv6:100833/100=1008      test_l4lb:PASS:ipv6:76322/100=763
> 
> Yes, the numbers are pretty without turn on jit_harden, if we want to speedup
> jit_harden, then we need to move BPF_REG_AX to *real* register instead of stack
> emulation, but If we do it, we need to face all the pain I describe above. We
> can do it in next step.
> 
> See Documentation/networking/filter.txt for more information.
> 
> Signed-off-by: Wang YanQing <udknight@gmail.com>
> ---
>  arch/x86/Kconfig                     |    2 +-
>  arch/x86/include/asm/nospec-branch.h |   26 +-
>  arch/x86/net/Makefile                |   10 +-
>  arch/x86/net/bpf_jit32.S             |  147 +++
>  arch/x86/net/bpf_jit_comp32.c        | 2239 ++++++++++++++++++++++++++++++++++
>  5 files changed, 2419 insertions(+), 5 deletions(-)
>  create mode 100644 arch/x86/net/bpf_jit32.S
>  create mode 100644 arch/x86/net/bpf_jit_comp32.c
Add CC to davem@davemloft.net

^ permalink raw reply

* [PATCH net] net: stmmac: Disable ACS Feature for GMAC >= 4
From: Jose Abreu @ 2018-04-18  9:57 UTC (permalink / raw)
  To: netdev
  Cc: Jose Abreu, David S. Miller, Joao Pinto, Giuseppe Cavallaro,
	Alexandre Torgue

ACS Feature is currently enabled for GMAC >= 4 but the llc_snap status
is never checked in descriptor rx_status callback. This will cause
stmmac to always strip packets even though the ACS feature is already
stripping them.

Let's be safe and disable the ACS feature for GMAC >= 4 and always strip
the packets for this GMAC version.

Fixes: 477286b53f55 ("stmmac: add GMAC4 core support")
Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
---
Hi David,

Requesting this to go up for stable also. I think it's for v4.7+ but if you
could cross-check it would be great! Let me know if you need any more
info for queueing this.

Thanks
---
 drivers/net/ethernet/stmicro/stmmac/dwmac4.h      |    2 +-
 drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c |    7 -------
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c |    7 ++++++-
 3 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
index c7bff59..dedd406 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
@@ -347,7 +347,7 @@ enum power_event {
 #define MTL_RX_OVERFLOW_INT		BIT(16)
 
 /* Default operating mode of the MAC */
-#define GMAC_CORE_INIT (GMAC_CONFIG_JD | GMAC_CONFIG_PS | GMAC_CONFIG_ACS | \
+#define GMAC_CORE_INIT (GMAC_CONFIG_JD | GMAC_CONFIG_PS | \
 			GMAC_CONFIG_BE | GMAC_CONFIG_DCRS)
 
 /* To dump the core regs excluding  the Address Registers */
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
index a3af92e..517b1f6 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
@@ -31,13 +31,6 @@ static void dwmac4_core_init(struct mac_device_info *hw,
 
 	value |= GMAC_CORE_INIT;
 
-	/* Clear ACS bit because Ethernet switch tagging formats such as
-	 * Broadcom tags can look like invalid LLC/SNAP packets and cause the
-	 * hardware to truncate packets on reception.
-	 */
-	if (netdev_uses_dsa(dev))
-		value &= ~GMAC_CONFIG_ACS;
-
 	if (mtu > 1500)
 		value |= GMAC_CONFIG_2K;
 	if (mtu > 2000)
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 9a16931..b65e2d1 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -3495,8 +3495,13 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue)
 
 			/* ACS is set; GMAC core strips PAD/FCS for IEEE 802.3
 			 * Type frames (LLC/LLC-SNAP)
+			 *
+			 * llc_snap is never checked in GMAC >= 4, so this ACS
+			 * feature is always disabled and packets need to be
+			 * stripped manually.
 			 */
-			if (unlikely(status != llc_snap))
+			if (unlikely(priv->synopsys_id >= DWMAC_CORE_4_00) ||
+			    unlikely(status != llc_snap))
 				frame_len -= ETH_FCS_LEN;
 
 			if (netif_msg_rx_status(priv)) {
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next] net-next/hinic: add arm64 support
From: Zhao Chen @ 2018-04-18 10:07 UTC (permalink / raw)
  To: davem
  Cc: linux-kernel, netdev, aviad.krawczyk, zhaochen6, tony.qu,
	yin.yinshi, luoshaokai

This patch enables arm64 platform support for the HINIC driver.

Signed-off-by: Zhao Chen <zhaochen6@huawei.com>
---
 drivers/net/ethernet/huawei/hinic/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/huawei/hinic/Kconfig b/drivers/net/ethernet/huawei/hinic/Kconfig
index 08db24954f7e..e4e8b24c1a5d 100644
--- a/drivers/net/ethernet/huawei/hinic/Kconfig
+++ b/drivers/net/ethernet/huawei/hinic/Kconfig
@@ -4,7 +4,7 @@
 
 config HINIC
 	tristate "Huawei Intelligent PCIE Network Interface Card"
-	depends on (PCI_MSI && X86)
+	depends on (PCI_MSI && (X86 || ARM64))
 	---help---
 	  This driver supports HiNIC PCIE Ethernet cards.
 	  To compile this driver as part of the kernel, choose Y here.
-- 
2.17.0

^ permalink raw reply related

* [PATCH net-next 1/2] udp: if the rx queue is full, free the skb in __udp_enqueue_schedule_skb()
From: Paolo Abeni @ 2018-04-18 10:22 UTC (permalink / raw)
  To: netdev; +Cc: David S. Miller, Eric Dumazet
In-Reply-To: <cover.1524045911.git.pabeni@redhat.com>

This commit moves the kfree_skb() call on queue full event from the
ipv4/ipv6 caller into __udp_enqueue_schedule_skb(), cleaning up the
code and avoiding any reference to the skb after
__udp_enqueue_schedule_skb() completes, so that we can modify the skb
ptr itself inside the latter function.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/ipv4/udp.c | 2 +-
 net/ipv6/udp.c | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 24b5c59b1c53..3fb0fbf4977d 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1341,6 +1341,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 drop:
 	atomic_inc(&sk->sk_drops);
 	busylock_release(busy);
+	kfree_skb(skb);
 	return err;
 }
 EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb);
@@ -1802,7 +1803,6 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 			UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
 					is_udplite);
 		UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
-		kfree_skb(skb);
 		trace_udp_fail_queue_rcv_skb(rc, sk);
 		return -1;
 	}
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 6861ed479469..c113222f7670 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -532,7 +532,6 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 			UDP6_INC_STATS(sock_net(sk),
 					 UDP_MIB_RCVBUFERRORS, is_udplite);
 		UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
-		kfree_skb(skb);
 		return -1;
 	}
 
-- 
2.14.3

^ permalink raw reply related

* [PATCH net-next 0/2] UDP: introduce RX skb cache
From: Paolo Abeni @ 2018-04-18 10:22 UTC (permalink / raw)
  To: netdev; +Cc: David S. Miller, Eric Dumazet

The goal of this series is to improve UDP performance in the RX path, that
got worse when spectre/meltdown mitigations were introduced.

The main idea is to move almost entirely the cost of skb handling from the 
receiver process context into the BH processing, leveraging, for small
packets, a newly introduced, BH managed, skb cache. This is somewhat
similar to copy-break and the implementation details are in the main patch.

Overall this gives a performance improvement of up to 20% for the UDP flood
stress test and above 5% for real-life DNS performance tests.

Paolo Abeni (2):
  udp: if the rx queue is full, free the skb in
    __udp_enqueue_schedule_skb()
  udp: implement and use per cpu rx skbs cache

 net/ipv4/udp.c | 162 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 net/ipv6/udp.c |   1 -
 2 files changed, 160 insertions(+), 3 deletions(-)

-- 
2.14.3

^ permalink raw reply

* [PATCH net-next 2/2] udp: implement and use per cpu rx skbs cache
From: Paolo Abeni @ 2018-04-18 10:22 UTC (permalink / raw)
  To: netdev; +Cc: David S. Miller, Eric Dumazet
In-Reply-To: <cover.1524045911.git.pabeni@redhat.com>

This changeset extends the idea behind commit c8c8b127091b ("udp:
under rx pressure, try to condense skbs"), trading more BH cpu
time and memory bandwidth to decrease the load on the user space
receiver.

At boot time we allocate a limited amount of skbs with small
data buffer, storing them in per cpu arrays. Such skbs are never
freed.

At run time, under rx pressure, the BH tries to copy the current
skb contents into the cache - if the current cache skb is available,
and the ingress skb is small enough and without any head states.

When using the cache skb, the ingress skb is dropped by the BH
- while still hot on cache - and the cache skb is inserted into
the rx queue, after increasing its usage count. Also, the cache
array index is moved to the next entry.

The receive side is unmodified: in udp_recvmsg() the skb
usage count is decreased and the skb is _not_ freed - since the
cache keeps usage > 0. Since skb->usage is hot in the cache of the
receiver at consume time - the receiver has just read skb->data,
which lies in the same cacheline - the whole skb_consume_udp() becomes
really cheap.

UDP receive performance under flood improves as follows:

NR RX queues	Kpps	Kpps	Delta (%)
		Before	After

1		2252	2305	2
2		2151	2569	19
4		2033	2396	17
8		1969	2329	18

Overall performance of knotd DNS server under real traffic flood
improves as follows:

		Kpps	Kpps	Delta (%)
		Before	After

		3777	3981	5

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
--
Performances figures are with both PAGE_TABLE_ISOLATION and
RETPOLINES enabled, this is way the baseline
---
 net/ipv4/udp.c | 160 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 159 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 3fb0fbf4977d..bb1879cd51b4 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -125,6 +125,26 @@ EXPORT_SYMBOL(sysctl_udp_mem);
 atomic_long_t udp_memory_allocated;
 EXPORT_SYMBOL(udp_memory_allocated);
 
+struct skb_cache_entry {
+	int size;
+	int head;
+	struct sk_buff *skbs[0];
+};
+
+static struct skb_cache_entry __percpu *skb_cache;
+
+/* Under socket memory pressure, small packets are copied to a percpu cache
+ * before enqueuing them, to decrease the load on the receiver process.
+ * To avoid excessive copy overhead we use a small skb size threshold.
+ * Each percpu cache should be able to cope with at least a socket under
+ * memory pressure. It doesn't need to handle many of them: if there are
+ * more than a few sockets under memory pressure, the user-space is most
+ * probably too lazy and there is no gain using the cache
+ */
+#define UDP_CACHE_MAX_SKB_LEN		512
+#define UDP_CACHE_MIN_SIZE		_SK_MEM_PACKETS
+#define UDP_CACHE_MAX_SIZE		(_SK_MEM_PACKETS * 3)
+
 #define MAX_UDP_PORTS 65536
 #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
 
@@ -1246,6 +1266,82 @@ static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb)
 	udp_rmem_release(sk, udp_skb_truesize(skb), 1, true);
 }
 
+static inline struct sk_buff *udp_cache_get_skb(void)
+{
+	struct skb_cache_entry *cache;
+	struct sk_buff *skb;
+
+	if (unlikely(!skb_cache))
+		return NULL;
+
+	cache = this_cpu_ptr(skb_cache);
+	skb = cache->skbs[cache->head];
+	if (refcount_read(&skb->users) != 1)
+		return NULL;
+
+	/* peeking with offset clones the queued skbs, we must check that all
+	 * the cloned references are gone.
+	 * This barrier is paired with the implicit one in skb_unref(), while
+	 * decrementing skb->users.
+	 */
+	rmb();
+	if (unlikely(skb->cloned)) {
+		if (atomic_read(&skb_shinfo(skb)->dataref) != 1)
+			return NULL;
+		skb->cloned = 0;
+	}
+
+	cache->head++;
+	if (cache->head == cache->size)
+		cache->head = 0;
+	refcount_inc(&skb->users);
+	return skb;
+}
+
+static bool udp_copy_to_cache(struct sk_buff **s)
+{
+	struct sk_buff *skb2, *skb = *s;
+	int hlen;
+
+	/* check if we can copy the specified skb into the cache: data + l3 +
+	 * l4 must be below the cached skb size and no head states must
+	 * be attached.
+	 */
+	hlen = skb_network_header_len(skb) + sizeof(struct udphdr);
+	if ((hlen + skb->len) >= UDP_CACHE_MAX_SKB_LEN || skb_sec_path(skb))
+		return false;
+
+	skb2 = udp_cache_get_skb();
+	if (!skb2)
+		return false;
+
+	/* copy the relevant header: we skip the head states - we know no state
+	 * is attached to 'skb' - the unrelevant part of the CB, and
+	 * skb->dev - will be overwritten later by udp_set_dev_scratch()
+	 */
+	skb2->tstamp	    = skb->tstamp;
+	*UDP_SKB_CB(skb2)   = *UDP_SKB_CB(skb);
+	skb2->queue_mapping = skb->queue_mapping;
+	memcpy(&skb2->headers_start, &skb->headers_start,
+	       offsetof(struct sk_buff, headers_end) -
+	       offsetof(struct sk_buff, headers_start));
+
+	/* skip the mac header, we don't need it */
+	skb_copy_bits(skb, -hlen, skb2->head, skb->len + hlen);
+
+	/* override the relevant offsets: skb2 starts from the network hdr */
+	skb2->transport_header = hlen - sizeof(struct udphdr);
+	skb2->network_header  = 0;
+	skb2->mac_header = 0;
+	skb2->data = skb2->head + hlen;
+	skb_set_tail_pointer(skb2, skb->len);
+	skb2->len = skb->len;
+	consume_skb(skb);
+
+	*s = skb2;
+	return true;
+}
+
 /* Idea of busylocks is to let producers grab an extra spinlock
  * to relieve pressure on the receive_queue spinlock shared by consumer.
  * Under flood, this means that only one producer can be in line
@@ -1290,9 +1386,12 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 	 * - Reduce memory overhead and thus increase receive queue capacity
 	 * - Less cache line misses at copyout() time
 	 * - Less work at consume_skb() (less alien page frag freeing)
+	 * Additionally, processing skbs from the cache allows udp_recvmsg()
+	 * to 'free' them with a single atomic operation on a hot cacheline
 	 */
 	if (rmem > (sk->sk_rcvbuf >> 1)) {
-		skb_condense(skb);
+		if (!udp_copy_to_cache(&skb))
+			skb_condense(skb);
 
 		busy = busylock_acquire(sk);
 	}
@@ -2858,6 +2957,64 @@ static struct pernet_operations __net_initdata udp_sysctl_ops = {
 	.init	= udp_sysctl_init,
 };
 
+static void udp_free_cache(int nr)
+{
+	int i, cpu;
+
+	for_each_possible_cpu(cpu)
+		for (i = 0; i < nr; ++i)
+			kfree_skb(per_cpu_ptr(skb_cache, cpu)->skbs[i]);
+
+	free_percpu(skb_cache);
+	skb_cache = NULL;
+}
+
+static void udp_init_cache(unsigned long max_size)
+{
+	size_t skb_guessed_size, per_cpu_size;
+	unsigned long total_size = 0;
+	struct sk_buff *skb;
+	int i, nr, cpu = 0;
+
+	/* try to fill the cache only if we can allocate a reasonable number
+	 * of skbs
+	 */
+	skb_guessed_size = SKB_TRUESIZE(UDP_CACHE_MAX_SKB_LEN);
+	nr = min_t(unsigned long, UDP_CACHE_MAX_SIZE,
+		   max_size / (nr_cpu_ids * skb_guessed_size));
+	if (nr < UDP_CACHE_MIN_SIZE) {
+		pr_info("low memory, UDP skbs cache will not be allocated\n");
+		return;
+	}
+
+	per_cpu_size = nr * sizeof(void *) + sizeof(struct skb_cache_entry);
+	skb_cache = __alloc_percpu_gfp(per_cpu_size, L1_CACHE_BYTES,
+				       GFP_KERNEL | __GFP_ZERO);
+	if (!skb_cache) {
+		pr_warn("Can't allocate UDP skb cache\n");
+		return;
+	}
+
+	pr_info("allocating %d skbs on %d CPUs for rx cache\n", nr, nr_cpu_ids);
+	for (i = 0; i < nr && total_size < max_size; ++i) {
+		for_each_possible_cpu(cpu) {
+			skb = __alloc_skb(UDP_CACHE_MAX_SKB_LEN, GFP_KERNEL,
+					  0, cpu_to_node(cpu));
+			if (!skb) {
+				pr_warn("allocation failure, cache disabled");
+				udp_free_cache(nr);
+				return;
+			}
+
+			total_size += skb->truesize;
+			per_cpu_ptr(skb_cache, cpu)->skbs[i] = skb;
+		}
+	}
+
+	for_each_possible_cpu(cpu)
+		per_cpu_ptr(skb_cache, cpu)->size = nr;
+}
+
 void __init udp_init(void)
 {
 	unsigned long limit;
@@ -2871,6 +3028,7 @@ void __init udp_init(void)
 	sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2;
 
 	__udp_sysctl_init(&init_net);
+	udp_init_cache(sysctl_udp_mem[0] / 100 * PAGE_SIZE);
 
 	/* 16 spinlocks per cpu */
 	udp_busylocks_log = ilog2(nr_cpu_ids) + 4;
-- 
2.14.3

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox