* [PATCH net-next v2 1/3] net: skb: plumb napi state thru skb freeing paths
2023-04-13 4:26 [PATCH net-next v2 0/3] page_pool: allow caching from safely localized NAPI Jakub Kicinski
@ 2023-04-13 4:26 ` Jakub Kicinski
2023-04-13 7:49 ` Yunsheng Lin
2023-04-13 4:26 ` [PATCH net-next v2 2/3] page_pool: allow caching from safely localized NAPI Jakub Kicinski
` (3 subsequent siblings)
4 siblings, 1 reply; 13+ messages in thread
From: Jakub Kicinski @ 2023-04-13 4:26 UTC (permalink / raw)
To: davem
Cc: netdev, edumazet, pabeni, hawk, ilias.apalodimas, linyunsheng,
alexander.duyck, Jakub Kicinski, Tariq Toukan
We maintain a NAPI-local cache of skbs which is fed by napi_consume_skb().
Going forward we will also try to cache head and data pages.
Plumb the "are we in a normal NAPI context" information thru
deeper into the freeing path, up to skb_release_data() and
skb_free_head()/skb_pp_recycle(). The "not normal NAPI context"
comes from netpoll which passes budget of 0 to try to reap
the Tx completions but not perform any Rx.
Use "bool napi_safe" rather than bare "int budget",
the further we get from NAPI the more confusing the budget
argument may seem (particularly whether 0 or MAX is the
correct value to pass in when not in NAPI).
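To illustrate (a hypothetical driver sketch, not code from this series;
the example_* ring and helper names are made up), the budget a poll
handler receives is what ultimately decides napi_safe:

static void example_clean_tx_ring(struct example_ring *ring, int budget)
{
	struct sk_buff *skb;

	/* budget == 0 means netpoll invoked us: not a normal NAPI context */
	while ((skb = example_next_completed_skb(ring)) != NULL)
		napi_consume_skb(skb, budget);
}

static int example_poll(struct napi_struct *napi, int budget)
{
	struct example_ring *ring = container_of(napi, struct example_ring, napi);
	int work_done = 0;

	example_clean_tx_ring(ring, budget);
	if (budget)	/* netpoll passes 0: reap Tx only, skip Rx */
		work_done = example_clean_rx_ring(ring, budget);

	if (work_done < budget)
		napi_complete_done(napi, work_done);
	return work_done;
}

napi_consume_skb() then turns that budget into the napi_safe bool plumbed
through below.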
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
v2:
- clarify the budget 0 and fix the name of the argument in
the commit message
v1:
- feed the cache in __kfree_skb_defer(), it's in NAPI
- s/in_normal_napi/napi_safe/
---
net/core/skbuff.c | 38 ++++++++++++++++++++------------------
1 file changed, 20 insertions(+), 18 deletions(-)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 050a875d09c5..b2092166f7e2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -839,7 +839,7 @@ static void skb_clone_fraglist(struct sk_buff *skb)
skb_get(list);
}
-static bool skb_pp_recycle(struct sk_buff *skb, void *data)
+static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
{
if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
return false;
@@ -856,12 +856,12 @@ static void skb_kfree_head(void *head, unsigned int end_offset)
kfree(head);
}
-static void skb_free_head(struct sk_buff *skb)
+static void skb_free_head(struct sk_buff *skb, bool napi_safe)
{
unsigned char *head = skb->head;
if (skb->head_frag) {
- if (skb_pp_recycle(skb, head))
+ if (skb_pp_recycle(skb, head, napi_safe))
return;
skb_free_frag(head);
} else {
@@ -869,7 +869,8 @@ static void skb_free_head(struct sk_buff *skb)
}
}
-static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
+static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
+ bool napi_safe)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
int i;
@@ -894,7 +895,7 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
if (shinfo->frag_list)
kfree_skb_list_reason(shinfo->frag_list, reason);
- skb_free_head(skb);
+ skb_free_head(skb, napi_safe);
exit:
/* When we clone an SKB we copy the reycling bit. The pp_recycle
* bit is only set on the head though, so in order to avoid races
@@ -955,11 +956,12 @@ void skb_release_head_state(struct sk_buff *skb)
}
/* Free everything but the sk_buff shell. */
-static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
+static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
+ bool napi_safe)
{
skb_release_head_state(skb);
if (likely(skb->head))
- skb_release_data(skb, reason);
+ skb_release_data(skb, reason, napi_safe);
}
/**
@@ -973,7 +975,7 @@ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
void __kfree_skb(struct sk_buff *skb)
{
- skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
+ skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false);
kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);
@@ -1027,7 +1029,7 @@ static void kfree_skb_add_bulk(struct sk_buff *skb,
return;
}
- skb_release_all(skb, reason);
+ skb_release_all(skb, reason, false);
sa->skb_array[sa->skb_count++] = skb;
if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) {
@@ -1201,7 +1203,7 @@ EXPORT_SYMBOL(consume_skb);
void __consume_stateless_skb(struct sk_buff *skb)
{
trace_consume_skb(skb, __builtin_return_address(0));
- skb_release_data(skb, SKB_CONSUMED);
+ skb_release_data(skb, SKB_CONSUMED, false);
kfree_skbmem(skb);
}
@@ -1226,7 +1228,7 @@ static void napi_skb_cache_put(struct sk_buff *skb)
void __kfree_skb_defer(struct sk_buff *skb)
{
- skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
+ skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, true);
napi_skb_cache_put(skb);
}
@@ -1264,7 +1266,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
return;
}
- skb_release_all(skb, SKB_CONSUMED);
+ skb_release_all(skb, SKB_CONSUMED, !!budget);
napi_skb_cache_put(skb);
}
EXPORT_SYMBOL(napi_consume_skb);
@@ -1395,7 +1397,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
*/
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
- skb_release_all(dst, SKB_CONSUMED);
+ skb_release_all(dst, SKB_CONSUMED, false);
return __skb_clone(dst, src);
}
EXPORT_SYMBOL_GPL(skb_morph);
@@ -2018,9 +2020,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
if (skb_has_frag_list(skb))
skb_clone_fraglist(skb);
- skb_release_data(skb, SKB_CONSUMED);
+ skb_release_data(skb, SKB_CONSUMED, false);
} else {
- skb_free_head(skb);
+ skb_free_head(skb, false);
}
off = (data + nhead) - skb->head;
@@ -6389,12 +6391,12 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
skb_frag_ref(skb, i);
if (skb_has_frag_list(skb))
skb_clone_fraglist(skb);
- skb_release_data(skb, SKB_CONSUMED);
+ skb_release_data(skb, SKB_CONSUMED, false);
} else {
/* we can reuse existing recount- all we did was
* relocate values
*/
- skb_free_head(skb);
+ skb_free_head(skb, false);
}
skb->head = data;
@@ -6529,7 +6531,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
skb_kfree_head(data, size);
return -ENOMEM;
}
- skb_release_data(skb, SKB_CONSUMED);
+ skb_release_data(skb, SKB_CONSUMED, false);
skb->head = data;
skb->head_frag = 0;
--
2.39.2
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH net-next v2 1/3] net: skb: plumb napi state thru skb freeing paths
2023-04-13 4:26 ` [PATCH net-next v2 1/3] net: skb: plumb napi state thru skb freeing paths Jakub Kicinski
@ 2023-04-13 7:49 ` Yunsheng Lin
2023-04-13 8:51 ` Eric Dumazet
0 siblings, 1 reply; 13+ messages in thread
From: Yunsheng Lin @ 2023-04-13 7:49 UTC (permalink / raw)
To: Jakub Kicinski, davem
Cc: netdev, edumazet, pabeni, hawk, ilias.apalodimas, alexander.duyck,
Tariq Toukan
On 2023/4/13 12:26, Jakub Kicinski wrote:
> We maintain a NAPI-local cache of skbs which is fed by napi_consume_skb().
> Going forward we will also try to cache head and data pages.
> Plumb the "are we in a normal NAPI context" information thru
> deeper into the freeing path, up to skb_release_data() and
> skb_free_head()/skb_pp_recycle(). The "not normal NAPI context"
> comes from netpoll which passes budget of 0 to try to reap
> the Tx completions but not perform any Rx.
Maybe I missed something obvious about netpoll here.
Does that mean the "normal NAPI context" and "not normal NAPI context"
will call napi->poll() concurrently with different budget? Doesn't
that mean two different contexts may do the tx completion concurrently?
Does it break the single-producer single-consumer assumption of tx queue?
>
> Use "bool napi_safe" rather than bare "int budget",
> the further we get from NAPI the more confusing the budget
> argument may seem (particularly whether 0 or MAX is the
> correct value to pass in when not in NAPI).
>
> Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
> ---
> v2:
> - clarify the budget 0 and fix the name of the argument in
> the commit message
> v1:
> - feed the cache in __kfree_skb_defer(), it's in NAPI
> - s/in_normal_napi/napi_safe/
> ---
> net/core/skbuff.c | 38 ++++++++++++++++++++------------------
> 1 file changed, 20 insertions(+), 18 deletions(-)
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 050a875d09c5..b2092166f7e2 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -839,7 +839,7 @@ static void skb_clone_fraglist(struct sk_buff *skb)
> skb_get(list);
> }
>
> -static bool skb_pp_recycle(struct sk_buff *skb, void *data)
> +static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
> {
> if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
> return false;
> @@ -856,12 +856,12 @@ static void skb_kfree_head(void *head, unsigned int end_offset)
> kfree(head);
> }
>
> -static void skb_free_head(struct sk_buff *skb)
> +static void skb_free_head(struct sk_buff *skb, bool napi_safe)
> {
> unsigned char *head = skb->head;
>
> if (skb->head_frag) {
> - if (skb_pp_recycle(skb, head))
> + if (skb_pp_recycle(skb, head, napi_safe))
> return;
> skb_free_frag(head);
> } else {
> @@ -869,7 +869,8 @@ static void skb_free_head(struct sk_buff *skb)
> }
> }
>
> -static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
> +static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
> + bool napi_safe)
> {
> struct skb_shared_info *shinfo = skb_shinfo(skb);
> int i;
> @@ -894,7 +895,7 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
> if (shinfo->frag_list)
> kfree_skb_list_reason(shinfo->frag_list, reason);
>
> - skb_free_head(skb);
> + skb_free_head(skb, napi_safe);
> exit:
> /* When we clone an SKB we copy the reycling bit. The pp_recycle
> * bit is only set on the head though, so in order to avoid races
> @@ -955,11 +956,12 @@ void skb_release_head_state(struct sk_buff *skb)
> }
>
> /* Free everything but the sk_buff shell. */
> -static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
> +static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
> + bool napi_safe)
> {
> skb_release_head_state(skb);
> if (likely(skb->head))
> - skb_release_data(skb, reason);
> + skb_release_data(skb, reason, napi_safe);
> }
>
> /**
> @@ -973,7 +975,7 @@ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
>
> void __kfree_skb(struct sk_buff *skb)
> {
> - skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
> + skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false);
> kfree_skbmem(skb);
> }
> EXPORT_SYMBOL(__kfree_skb);
> @@ -1027,7 +1029,7 @@ static void kfree_skb_add_bulk(struct sk_buff *skb,
> return;
> }
>
> - skb_release_all(skb, reason);
> + skb_release_all(skb, reason, false);
> sa->skb_array[sa->skb_count++] = skb;
>
> if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) {
> @@ -1201,7 +1203,7 @@ EXPORT_SYMBOL(consume_skb);
> void __consume_stateless_skb(struct sk_buff *skb)
> {
> trace_consume_skb(skb, __builtin_return_address(0));
> - skb_release_data(skb, SKB_CONSUMED);
> + skb_release_data(skb, SKB_CONSUMED, false);
> kfree_skbmem(skb);
> }
>
> @@ -1226,7 +1228,7 @@ static void napi_skb_cache_put(struct sk_buff *skb)
>
> void __kfree_skb_defer(struct sk_buff *skb)
> {
> - skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
> + skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, true);
> napi_skb_cache_put(skb);
> }
>
> @@ -1264,7 +1266,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
> return;
> }
>
> - skb_release_all(skb, SKB_CONSUMED);
> + skb_release_all(skb, SKB_CONSUMED, !!budget);
If it is not a normal NAPI context, dev_consume_skb_any() is called and
we return at the beginning, so we could just call skb_release_all() with
napi_safe set to true here.
> napi_skb_cache_put(skb);
> }
> EXPORT_SYMBOL(napi_consume_skb);
> @@ -1395,7 +1397,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
> */
> struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
> {
> - skb_release_all(dst, SKB_CONSUMED);
> + skb_release_all(dst, SKB_CONSUMED, false);
> return __skb_clone(dst, src);
> }
> EXPORT_SYMBOL_GPL(skb_morph);
> @@ -2018,9 +2020,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
> if (skb_has_frag_list(skb))
> skb_clone_fraglist(skb);
>
> - skb_release_data(skb, SKB_CONSUMED);
> + skb_release_data(skb, SKB_CONSUMED, false);
> } else {
> - skb_free_head(skb);
> + skb_free_head(skb, false);
> }
> off = (data + nhead) - skb->head;
>
> @@ -6389,12 +6391,12 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
> skb_frag_ref(skb, i);
> if (skb_has_frag_list(skb))
> skb_clone_fraglist(skb);
> - skb_release_data(skb, SKB_CONSUMED);
> + skb_release_data(skb, SKB_CONSUMED, false);
> } else {
> /* we can reuse existing recount- all we did was
> * relocate values
> */
> - skb_free_head(skb);
> + skb_free_head(skb, false);
> }
>
> skb->head = data;
> @@ -6529,7 +6531,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
> skb_kfree_head(data, size);
> return -ENOMEM;
> }
> - skb_release_data(skb, SKB_CONSUMED);
> + skb_release_data(skb, SKB_CONSUMED, false);
>
> skb->head = data;
> skb->head_frag = 0;
>
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [PATCH net-next v2 1/3] net: skb: plumb napi state thru skb freeing paths
2023-04-13 7:49 ` Yunsheng Lin
@ 2023-04-13 8:51 ` Eric Dumazet
2023-04-14 0:57 ` Yunsheng Lin
0 siblings, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2023-04-13 8:51 UTC (permalink / raw)
To: Yunsheng Lin
Cc: Jakub Kicinski, davem, netdev, pabeni, hawk, ilias.apalodimas,
alexander.duyck, Tariq Toukan
On Thu, Apr 13, 2023 at 9:49 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> Maybe I missed something obvious about netpoll here.
> Does that mean the "normal NAPI context" and "not normal NAPI context"
> will call napi->poll() concurrently with different budget? Doesn't
> that mean two different contexts may do the tx completion concurrently?
Please take a look at netpoll code:
netpoll_poll_lock, poll_napi() and poll_one_napi()
> Does it break the single-producer single-consumer assumption of tx queue?
We do not think so.
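For reference, roughly how that serialization works (a simplified
paraphrase of net/core/netpoll.c, not verbatim kernel code): netpoll
claims the NAPI instance via napi->poll_owner before calling ->poll()
with a budget of 0, and the regular softirq poll path takes the same
ownership through netpoll_poll_lock(), so the two never run ->poll()
on the same NAPI concurrently:

static void poll_one_napi(struct napi_struct *napi)
{
	/* Bail if NAPI is being disabled */
	if (test_and_set_bit(NAPI_STATE_NPSVC, &napi->state))
		return;

	/* Budget of 0: clear the Tx path only, no Rx processing */
	napi->poll(napi, 0);

	clear_bit(NAPI_STATE_NPSVC, &napi->state);
}

static void poll_napi(struct net_device *dev)
{
	struct napi_struct *napi;
	int cpu = smp_processor_id();

	list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) {
		/* Only poll a NAPI that nobody else currently owns */
		if (cmpxchg(&napi->poll_owner, -1, cpu) == -1) {
			poll_one_napi(napi);
			smp_store_release(&napi->poll_owner, -1);
		}
	}
}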
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH net-next v2 1/3] net: skb: plumb napi state thru skb freeing paths
2023-04-13 8:51 ` Eric Dumazet
@ 2023-04-14 0:57 ` Yunsheng Lin
2023-04-14 5:20 ` Jakub Kicinski
0 siblings, 1 reply; 13+ messages in thread
From: Yunsheng Lin @ 2023-04-14 0:57 UTC (permalink / raw)
To: Eric Dumazet
Cc: Jakub Kicinski, davem, netdev, pabeni, hawk, ilias.apalodimas,
alexander.duyck, Tariq Toukan
On 2023/4/13 16:51, Eric Dumazet wrote:
> On Thu, Apr 13, 2023 at 9:49 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>
>> Maybe I missed something obvious about netpoll here.
>> Does that mean the "normal NAPI context" and "not normal NAPI context"
>> will call napi->poll() concurrently with different budget? Doesn't
>> that mean two different contexts may do the tx completion concurrently?
>
> Please take a look at netpoll code:
> netpoll_poll_lock, poll_napi() and poll_one_napi()
>
>> Does it break the single-producer single-consumer assumption of tx queue?
>
> We do not think so.
Then I guess it is ok to do direct recycling for page pool case as it is
per napi?
Is it the per-CPU cache case where we are using !!budget to protect it from preemption
while netpoll_poll_dev() is running?
> .
>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH net-next v2 1/3] net: skb: plumb napi state thru skb freeing paths
2023-04-14 0:57 ` Yunsheng Lin
@ 2023-04-14 5:20 ` Jakub Kicinski
2023-04-14 9:40 ` Yunsheng Lin
0 siblings, 1 reply; 13+ messages in thread
From: Jakub Kicinski @ 2023-04-14 5:20 UTC (permalink / raw)
To: Yunsheng Lin
Cc: Eric Dumazet, davem, netdev, pabeni, hawk, ilias.apalodimas,
alexander.duyck, Tariq Toukan
On Fri, 14 Apr 2023 08:57:03 +0800 Yunsheng Lin wrote:
> >> Does it break the single-producer single-consumer assumption of tx queue?
> >
> > We do not think so.
>
> Then I guess it is ok to do direct recycling for page pool case as it is
> per napi?
We're talking about the tx queue or the pp cache?
Those have different producers and consumers.
> Is it the per-CPU cache case where we are using !!budget to protect it from preemption
> while netpoll_poll_dev() is running?
This is the scenario - We are feeding the page pool cache from the
deferred pages. An IRQ comes and interrupts us half way thru.
netpoll then tries to also feed the page pool cache. Unhappiness.
Note that netpoll is activated extremely rarely (only when something
writes to the console), and even more rarely does it actually poll
for space in the Tx queue.
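To spell out the unhappiness (a simplified sketch of the lockless
direct-recycle path in net/core/page_pool.c, paraphrased rather than
quoted): the per-pool cache is a plain array plus a count, with no
protection against being re-entered on the same CPU, so a
netpoll-triggered free landing between the check and the store could
clobber a slot or corrupt the count:

static bool page_pool_recycle_in_cache(struct page *page,
				       struct page_pool *pool)
{
	if (pool->alloc.count == PP_ALLOC_CACHE_SIZE)
		return false;

	/* An IRQ -> netpoll -> skb free here would run this same
	 * unserialized sequence and race with the store below.
	 */
	pool->alloc.cache[pool->alloc.count++] = page;
	return true;
}

Hence direct recycling is only allowed when napi_safe and the pool's
NAPI is scheduled on the local CPU.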
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH net-next v2 1/3] net: skb: plumb napi state thru skb freeing paths
2023-04-14 5:20 ` Jakub Kicinski
@ 2023-04-14 9:40 ` Yunsheng Lin
0 siblings, 0 replies; 13+ messages in thread
From: Yunsheng Lin @ 2023-04-14 9:40 UTC (permalink / raw)
To: Jakub Kicinski
Cc: Eric Dumazet, davem, netdev, pabeni, hawk, ilias.apalodimas,
alexander.duyck, Tariq Toukan
On 2023/4/14 13:20, Jakub Kicinski wrote:
> On Fri, 14 Apr 2023 08:57:03 +0800 Yunsheng Lin wrote:
>>>> Does it break the single-producer single-consumer assumption of tx queue?
>>>
>>> We do not think so.
>>
>> Then I guess it is ok to do direct recycling for page pool case as it is
>> per napi?
>
> We're talking about the tx queue or the pp cache?
> Those have different producers and consumers.
I was trying to confirm whether more than one context can call
napi->poll() concurrently; if not, then it seems we only have one consumer
for the pp cache. And it does not really matter much here anyway, since
napi->poll() from netpoll only does Tx completion now.
If we are able to limit the producer context for the pp cache to be
the same as the consumer's, then we might avoid the !!budget check.
I do acknowledge that it is a bigger change, considering we might need to
hold a persistent and safe reference to the NAPI instance for this too,
so we might leave it for now like you mentioned in the cover letter.
>
>> Is it the per-CPU cache case where we are using !!budget to protect it from preemption
>> while netpoll_poll_dev() is running?
>
> This is the scenario - We are feeding the page pool cache from the
> deferred pages. An IRQ comes and interrupts us half way thru.
> netpoll then tries to also feed the page pool cache. Unhappiness.
I assume feeding the page pool cache means being a producer for the pp cache?
If netpoll only does Tx completion, using a zero budget, it does not
seem to have any direct relation to the page pool yet.
>
> Note that netpoll is activated extremely rarely (only when something
> writes to the console), and even more rarely does it actually poll
> for space in the Tx queue.
That is my point: we turn out to need a check in the fast path in order
to handle the netpoll case, which is an extremely rare case.
> .
>
^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH net-next v2 2/3] page_pool: allow caching from safely localized NAPI
2023-04-13 4:26 [PATCH net-next v2 0/3] page_pool: allow caching from safely localized NAPI Jakub Kicinski
2023-04-13 4:26 ` [PATCH net-next v2 1/3] net: skb: plumb napi state thru skb freeing paths Jakub Kicinski
@ 2023-04-13 4:26 ` Jakub Kicinski
2023-04-14 8:45 ` Jesper Dangaard Brouer
2023-04-13 4:26 ` [PATCH net-next v2 3/3] bnxt: hook NAPIs to page pools Jakub Kicinski
` (2 subsequent siblings)
4 siblings, 1 reply; 13+ messages in thread
From: Jakub Kicinski @ 2023-04-13 4:26 UTC (permalink / raw)
To: davem
Cc: netdev, edumazet, pabeni, hawk, ilias.apalodimas, linyunsheng,
alexander.duyck, Jakub Kicinski, Tariq Toukan
Recent patches to mlx5 mentioned a regression when moving from
a driver-local page pool to only using the generic page pool code.
Page pool has two recycling paths: (1) a direct one, which runs in
safe NAPI context (basically consumer context, so producing
can be lockless); and (2) via a ptr_ring, which takes a spin
lock because the freeing can happen from any CPU; producer
and consumer may run concurrently.
Since the page pool code was added, Eric introduced a revised version
of deferred skb freeing. TCP skbs are now usually returned to the CPU
which allocated them, and freed in softirq context. This places the
freeing (producing of pages back to the pool) enticingly close to
the allocation (consumer).
If we can prove that we're freeing in the same softirq context in which
the consumer NAPI will run - lockless use of the cache is perfectly fine,
no need for the lock.
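Concretely (a simplified paraphrase of the existing deferred-free path,
not new code in this series): skb_attempt_defer_free() parks TCP skbs on
the allocating CPU's softnet_data, and net_rx_action() later flushes them
via napi_consume_skb() in softirq context on that CPU, right where the
consumer NAPI runs:

/* Roughly what net/core/dev.c does today, condensed: */
static void skb_defer_free_flush(struct softnet_data *sd)
{
	struct sk_buff *skb, *next;

	spin_lock(&sd->defer_lock);
	skb = sd->defer_list;
	sd->defer_list = NULL;
	sd->defer_count = 0;
	spin_unlock(&sd->defer_lock);

	while (skb) {
		next = skb->next;
		napi_consume_skb(skb, 1);	/* softirq on the allocating CPU */
		skb = next;
	}
}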
Let drivers link the page pool to a NAPI instance. If the NAPI instance
is scheduled on the same CPU on which we're freeing - place the pages
in the direct cache.
With that and patched bnxt (XDP enabled to engage the page pool, sigh,
bnxt really needs page pool work :() I see a 2.6% perf boost with
a TCP stream test (app on a different physical core than softirq).
The CPU use of relevant functions decreases as expected:
page_pool_refill_alloc_cache 1.17% -> 0%
_raw_spin_lock 2.41% -> 0.98%
Only consider the lockless path to be safe when NAPI is scheduled
- in practice this should cover the majority, if not all, of steady state
workloads. It's usually the NAPI kicking in that causes the skb flush.
The main case we'll miss out on is when application runs on the same
CPU as NAPI. In that case we don't use the deferred skb free path.
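From a driver's point of view the opt-in is a single extra field at pool
creation time; a hypothetical example (names made up, mirroring the
documentation hunk below) could look like:

static int example_alloc_rx_page_pool(struct example_rx_ring *rxr)
{
	struct page_pool_params pp_params = {
		.order		= 0,
		.pool_size	= rxr->ring_size,
		.nid		= NUMA_NO_NODE,
		.dev		= rxr->dev,
		/* Sole NAPI consuming pages from this pool; enables
		 * lockless direct recycling when skbs are freed in its
		 * softirq context on the same CPU.
		 */
		.napi		= &rxr->napi,
		.dma_dir	= DMA_FROM_DEVICE,
	};

	rxr->page_pool = page_pool_create(&pp_params);
	return PTR_ERR_OR_ZERO(rxr->page_pool);
}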
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
v1:
- s/in_napi/napi_safe/
rfc v2: https://lore.kernel.org/all/20230405232100.103392-1-kuba@kernel.org/
- plumb thru "are we in NAPI" bool rather than guessing based
on softirq && !hardirq
CC: hawk@kernel.org
CC: ilias.apalodimas@linaro.org
---
Documentation/networking/page_pool.rst | 1 +
include/linux/netdevice.h | 3 +++
include/linux/skbuff.h | 20 +++++++++++++-------
include/net/page_pool.h | 3 ++-
net/core/dev.c | 3 +++
net/core/page_pool.c | 15 +++++++++++++--
net/core/skbuff.c | 4 ++--
7 files changed, 37 insertions(+), 12 deletions(-)
diff --git a/Documentation/networking/page_pool.rst b/Documentation/networking/page_pool.rst
index 30f1344e7cca..873efd97f822 100644
--- a/Documentation/networking/page_pool.rst
+++ b/Documentation/networking/page_pool.rst
@@ -165,6 +165,7 @@ Registration
pp_params.pool_size = DESC_NUM;
pp_params.nid = NUMA_NO_NODE;
pp_params.dev = priv->dev;
+ pp_params.napi = napi; /* only if locking is tied to NAPI */
pp_params.dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
page_pool = page_pool_create(&pp_params);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fe355592dfde..a15cbc449630 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -360,8 +360,11 @@ struct napi_struct {
unsigned long gro_bitmask;
int (*poll)(struct napi_struct *, int);
#ifdef CONFIG_NETPOLL
+ /* CPU actively polling if netpoll is configured */
int poll_owner;
#endif
+ /* CPU on which NAPI has been scheduled for processing */
+ int list_owner;
struct net_device *dev;
struct gro_list gro_hash[GRO_HASH_BUCKETS];
struct sk_buff *skb;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 82511b2f61ea..f7628ea94bcb 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3386,6 +3386,18 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
__skb_frag_ref(&skb_shinfo(skb)->frags[f]);
}
+static inline void
+napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe)
+{
+ struct page *page = skb_frag_page(frag);
+
+#ifdef CONFIG_PAGE_POOL
+ if (recycle && page_pool_return_skb_page(page, napi_safe))
+ return;
+#endif
+ put_page(page);
+}
+
/**
* __skb_frag_unref - release a reference on a paged fragment.
* @frag: the paged fragment
@@ -3396,13 +3408,7 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
*/
static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
{
- struct page *page = skb_frag_page(frag);
-
-#ifdef CONFIG_PAGE_POOL
- if (recycle && page_pool_return_skb_page(page))
- return;
-#endif
- put_page(page);
+ napi_frag_unref(frag, recycle, false);
}
/**
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index ddfa0b328677..91b808dade82 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -77,6 +77,7 @@ struct page_pool_params {
unsigned int pool_size;
int nid; /* Numa node id to allocate from pages from */
struct device *dev; /* device, for DMA pre-mapping purposes */
+ struct napi_struct *napi; /* Sole consumer of pages, otherwise NULL */
enum dma_data_direction dma_dir; /* DMA mapping direction */
unsigned int max_len; /* max DMA sync memory size */
unsigned int offset; /* DMA addr offset */
@@ -239,7 +240,7 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool)
return pool->p.dma_dir;
}
-bool page_pool_return_skb_page(struct page *page);
+bool page_pool_return_skb_page(struct page *page, bool napi_safe);
struct page_pool *page_pool_create(const struct page_pool_params *params);
diff --git a/net/core/dev.c b/net/core/dev.c
index 480600a075ce..5af075458efb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4358,6 +4358,7 @@ static inline void ____napi_schedule(struct softnet_data *sd,
}
list_add_tail(&napi->poll_list, &sd->poll_list);
+ WRITE_ONCE(napi->list_owner, smp_processor_id());
/* If not called from net_rx_action()
* we have to raise NET_RX_SOFTIRQ.
*/
@@ -6068,6 +6069,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
list_del_init(&n->poll_list);
local_irq_restore(flags);
}
+ WRITE_ONCE(n->list_owner, -1);
val = READ_ONCE(n->state);
do {
@@ -6383,6 +6385,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
#ifdef CONFIG_NETPOLL
napi->poll_owner = -1;
#endif
+ napi->list_owner = -1;
set_bit(NAPI_STATE_SCHED, &napi->state);
set_bit(NAPI_STATE_NPSVC, &napi->state);
list_add_rcu(&napi->dev_list, &dev->napi_list);
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 193c18799865..2f6bf422ed30 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -19,6 +19,7 @@
#include <linux/mm.h> /* for put_page() */
#include <linux/poison.h>
#include <linux/ethtool.h>
+#include <linux/netdevice.h>
#include <trace/events/page_pool.h>
@@ -874,9 +875,11 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
}
EXPORT_SYMBOL(page_pool_update_nid);
-bool page_pool_return_skb_page(struct page *page)
+bool page_pool_return_skb_page(struct page *page, bool napi_safe)
{
+ struct napi_struct *napi;
struct page_pool *pp;
+ bool allow_direct;
page = compound_head(page);
@@ -892,12 +895,20 @@ bool page_pool_return_skb_page(struct page *page)
pp = page->pp;
+ /* Allow direct recycle if we have reasons to believe that we are
+ * in the same context as the consumer would run, so there's
+ * no possible race.
+ */
+ napi = pp->p.napi;
+ allow_direct = napi_safe && napi &&
+ READ_ONCE(napi->list_owner) == smp_processor_id();
+
/* Driver set this to memory recycling info. Reset it on recycle.
* This will *not* work for NIC using a split-page memory model.
* The page will be returned to the pool here regardless of the
* 'flipped' fragment being in use or not.
*/
- page_pool_put_full_page(pp, page, false);
+ page_pool_put_full_page(pp, page, allow_direct);
return true;
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b2092166f7e2..3bf3c6540f33 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -843,7 +843,7 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
{
if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
return false;
- return page_pool_return_skb_page(virt_to_page(data));
+ return page_pool_return_skb_page(virt_to_page(data), napi_safe);
}
static void skb_kfree_head(void *head, unsigned int end_offset)
@@ -889,7 +889,7 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
}
for (i = 0; i < shinfo->nr_frags; i++)
- __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
+ napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe);
free_head:
if (shinfo->frag_list)
--
2.39.2
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH net-next v2 2/3] page_pool: allow caching from safely localized NAPI
2023-04-13 4:26 ` [PATCH net-next v2 2/3] page_pool: allow caching from safely localized NAPI Jakub Kicinski
@ 2023-04-14 8:45 ` Jesper Dangaard Brouer
2023-04-14 14:50 ` Jakub Kicinski
0 siblings, 1 reply; 13+ messages in thread
From: Jesper Dangaard Brouer @ 2023-04-14 8:45 UTC (permalink / raw)
To: Jakub Kicinski, davem
Cc: brouer, netdev, edumazet, pabeni, hawk, ilias.apalodimas,
linyunsheng, alexander.duyck, Tariq Toukan
On 13/04/2023 06.26, Jakub Kicinski wrote:
> Recent patches to mlx5 mentioned a regression when moving from
> driver local page pool to only using the generic page pool code.
> Page pool has two recycling paths (1) direct one, which runs in
> safe NAPI context (basically consumer context, so producing
> can be lockless); and (2) via a ptr_ring, which takes a spin
> lock because the freeing can happen from any CPU; producer
> and consumer may run concurrently.
>
> Since the page pool code was added, Eric introduced a revised version
> of deferred skb freeing. TCP skbs are now usually returned to the CPU
> which allocated them, and freed in softirq context. This places the
> freeing (producing of pages back to the pool) enticingly close to
> the allocation (consumer).
>
> If we can prove that we're freeing in the same softirq context in which
> the consumer NAPI will run - lockless use of the cache is perfectly fine,
> no need for the lock.
>
> Let drivers link the page pool to a NAPI instance. If the NAPI instance
> is scheduled on the same CPU on which we're freeing - place the pages
> in the direct cache.
>
> With that and patched bnxt (XDP enabled to engage the page pool, sigh,
> bnxt really needs page pool work :() I see a 2.6% perf boost with
> a TCP stream test (app on a different physical core than softirq).
>
> The CPU use of relevant functions decreases as expected:
>
> page_pool_refill_alloc_cache 1.17% -> 0%
> _raw_spin_lock 2.41% -> 0.98%
>
> Only consider lockless path to be safe when NAPI is scheduled
> - in practice this should cover majority if not all of steady state
> workloads. It's usually the NAPI kicking in that causes the skb flush.
>
> The main case we'll miss out on is when application runs on the same
> CPU as NAPI. In that case we don't use the deferred skb free path.
>
> Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
> ---
> v1:
> - s/in_napi/napi_safe/
> rfc v2: https://lore.kernel.org/all/20230405232100.103392-1-kuba@kernel.org/
> - plumb thru "are we in NAPI" bool rather than guessing based
> on softirq && !hardirq
>
> CC: hawk@kernel.org
> CC: ilias.apalodimas@linaro.org
> ---
> Documentation/networking/page_pool.rst | 1 +
> include/linux/netdevice.h | 3 +++
> include/linux/skbuff.h | 20 +++++++++++++-------
> include/net/page_pool.h | 3 ++-
> net/core/dev.c | 3 +++
> net/core/page_pool.c | 15 +++++++++++++--
> net/core/skbuff.c | 4 ++--
> 7 files changed, 37 insertions(+), 12 deletions(-)
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
BTW, does it matter if I ack with <hawk@kernel.org> or <brouer@redhat.com>?
e.g. do the patchwork automation scripts need the maintainer email to
match?
--Jesper
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH net-next v2 2/3] page_pool: allow caching from safely localized NAPI
2023-04-14 8:45 ` Jesper Dangaard Brouer
@ 2023-04-14 14:50 ` Jakub Kicinski
0 siblings, 0 replies; 13+ messages in thread
From: Jakub Kicinski @ 2023-04-14 14:50 UTC (permalink / raw)
To: Jesper Dangaard Brouer
Cc: davem, brouer, netdev, edumazet, pabeni, hawk, ilias.apalodimas,
linyunsheng, alexander.duyck, Tariq Toukan
On Fri, 14 Apr 2023 10:45:44 +0200 Jesper Dangaard Brouer wrote:
> Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Thanks!
> BTW, does it matter if I ack with <hawk@kernel.org> or <brouer@redhat.com> ?
>
> e.g. does the patchwork automation scripts, need to maintainer email to
> match?
For better or worse we don't have any substantial automation in
the area of maintainer ack tracking. But if we did we'd run it thru
various mail maps for sure, so it should be fine even then. I hope.
^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH net-next v2 3/3] bnxt: hook NAPIs to page pools
2023-04-13 4:26 [PATCH net-next v2 0/3] page_pool: allow caching from safely localized NAPI Jakub Kicinski
2023-04-13 4:26 ` [PATCH net-next v2 1/3] net: skb: plumb napi state thru skb freeing paths Jakub Kicinski
2023-04-13 4:26 ` [PATCH net-next v2 2/3] page_pool: allow caching from safely localized NAPI Jakub Kicinski
@ 2023-04-13 4:26 ` Jakub Kicinski
2023-04-14 13:09 ` [PATCH net-next v2 0/3] page_pool: allow caching from safely localized NAPI Dragos Tatulea
2023-04-15 2:20 ` patchwork-bot+netdevbpf
4 siblings, 0 replies; 13+ messages in thread
From: Jakub Kicinski @ 2023-04-13 4:26 UTC (permalink / raw)
To: davem
Cc: netdev, edumazet, pabeni, hawk, ilias.apalodimas, linyunsheng,
alexander.duyck, Jakub Kicinski, Tariq Toukan, michael.chan
bnxt has a 1:1 mapping of page pools and NAPIs, so it's safe
to hook them up together.
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
CC: michael.chan@broadcom.com
---
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index f7602d8d79e3..b79c6780c752 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -3211,6 +3211,7 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
pp.pool_size = bp->rx_ring_size;
pp.nid = dev_to_node(&bp->pdev->dev);
+ pp.napi = &rxr->bnapi->napi;
pp.dev = &bp->pdev->dev;
pp.dma_dir = DMA_BIDIRECTIONAL;
--
2.39.2
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH net-next v2 0/3] page_pool: allow caching from safely localized NAPI
2023-04-13 4:26 [PATCH net-next v2 0/3] page_pool: allow caching from safely localized NAPI Jakub Kicinski
` (2 preceding siblings ...)
2023-04-13 4:26 ` [PATCH net-next v2 3/3] bnxt: hook NAPIs to page pools Jakub Kicinski
@ 2023-04-14 13:09 ` Dragos Tatulea
2023-04-15 2:20 ` patchwork-bot+netdevbpf
4 siblings, 0 replies; 13+ messages in thread
From: Dragos Tatulea @ 2023-04-14 13:09 UTC (permalink / raw)
To: kuba@kernel.org
Cc: davem@davemloft.net, alexander.duyck@gmail.com,
ilias.apalodimas@linaro.org, linyunsheng@huawei.com,
hawk@kernel.org, pabeni@redhat.com, netdev@vger.kernel.org,
edumazet@google.com, ttoukan@nvidia.com
On Wed, 2023-04-12 at 21:26 -0700, Jakub Kicinski wrote:
> I went back to the explicit "are we in NAPI" method, mostly
> because I don't like having both around :( (even tho I maintain
> that in_softirq() && !in_hardirq() is as safe, as softirqs do
> not nest).
>
> Still returning the skbs to a CPU, tho, not to the NAPI instance.
> I reckon we could create a small refcounted struct per NAPI instance
> which would allow sockets and other users to hold a persistent
> and safe reference. But that's a bigger change, and I get 90+%
> recycling thru the cache with just these patches (for RR and
> streaming tests with 100% CPU use it's almost 100%).
>
> Some numbers for streaming test with 100% CPU use (from previous
> version, but really they perform the same):
>
>                          HW-GRO                    page=page
>                      before      after         before      after
> recycle:
> cached:                   0  138669686              0  150197505
> cache_full:               0     223391              0      74582
> ring:             138551933    9997191      149299454          0
> ring_full:                0        488           3154     127590
> released_refcnt:          0          0              0          0
>
> alloc:
> fast:             136491361  148615710      146969587  150322859
> slow:                  1772       1799            144        105
> slow_high_order:          0          0              0          0
> empty:                 1772       1799            144        105
> refill:             2165245     156302        2332880       2128
> waive:                    0          0              0          0
>
Enabled this on the mlx5 driver and seeing the following page_pool
cache usage improvements for a single-stream iperf test case:
- For 1500 MTU and legacy rq, seeing a 20% improvement in cache usage.
- For 9K MTU, seeing 33-40% page_pool cache usage improvements for
both striding and legacy rq (depending on whether the app is running on
the same core as the rq or not).
One thing to note is that the page_pool cache seems to get filled more
often for striding rq now, which is something that we could potentially
improve on the mlx5 side.
Regarding my earlier comment:
> After enabling this in the mlx5 driver, there is already improved
> page_pool cache usage for our test with the application running on
> the same CPU with the receive queue NAPI (0 -> 98 % cache usage).
I was testing without the deferred release optimizations that we did in
the mlx5 driver.
> v2:
> - minor commit message fixes (patch 1)
> v1:
> https://lore.kernel.org/all/20230411201800.596103-1-kuba@kernel.org/
> - rename the arg in_normal_napi -> napi_safe
> - also allow recycling in __kfree_skb_defer()
> rfcv2:
> https://lore.kernel.org/all/20230405232100.103392-1-kuba@kernel.org/
>
> Jakub Kicinski (3):
> net: skb: plumb napi state thru skb freeing paths
> page_pool: allow caching from safely localized NAPI
> bnxt: hook NAPIs to page pools
>
> Documentation/networking/page_pool.rst | 1 +
> drivers/net/ethernet/broadcom/bnxt/bnxt.c | 1 +
> include/linux/netdevice.h | 3 ++
> include/linux/skbuff.h | 20 +++++++----
> include/net/page_pool.h | 3 +-
> net/core/dev.c | 3 ++
> net/core/page_pool.c | 15 ++++++--
> net/core/skbuff.c | 42 ++++++++++++-----------
> 8 files changed, 58 insertions(+), 30 deletions(-)
>
Tested-by: Dragos Tatulea <dtatulea@nvidia.com>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH net-next v2 0/3] page_pool: allow caching from safely localized NAPI
2023-04-13 4:26 [PATCH net-next v2 0/3] page_pool: allow caching from safely localized NAPI Jakub Kicinski
` (3 preceding siblings ...)
2023-04-14 13:09 ` [PATCH net-next v2 0/3] page_pool: allow caching from safely localized NAPI Dragos Tatulea
@ 2023-04-15 2:20 ` patchwork-bot+netdevbpf
4 siblings, 0 replies; 13+ messages in thread
From: patchwork-bot+netdevbpf @ 2023-04-15 2:20 UTC (permalink / raw)
To: Jakub Kicinski
Cc: davem, netdev, edumazet, pabeni, hawk, ilias.apalodimas,
linyunsheng, alexander.duyck
Hello:
This series was applied to netdev/net-next.git (main)
by Jakub Kicinski <kuba@kernel.org>:
On Wed, 12 Apr 2023 21:26:02 -0700 you wrote:
> I went back to the explicit "are we in NAPI" method, mostly
> because I don't like having both around :( (even tho I maintain
> that in_softirq() && !in_hardirq() is as safe, as softirqs do
> not nest).
>
> Still returning the skbs to a CPU, tho, not to the NAPI instance.
> I reckon we could create a small refcounted struct per NAPI instance
> which would allow sockets and other users to hold a persistent
> and safe reference. But that's a bigger change, and I get 90+%
> recycling thru the cache with just these patches (for RR and
> streaming tests with 100% CPU use it's almost 100%).
>
> [...]
Here is the summary with links:
- [net-next,v2,1/3] net: skb: plumb napi state thru skb freeing paths
https://git.kernel.org/netdev/net-next/c/b07a2d97ba5e
- [net-next,v2,2/3] page_pool: allow caching from safely localized NAPI
https://git.kernel.org/netdev/net-next/c/8c48eea3adf3
- [net-next,v2,3/3] bnxt: hook NAPIs to page pools
https://git.kernel.org/netdev/net-next/c/294e39e0d034
You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html
^ permalink raw reply [flat|nested] 13+ messages in thread