netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Mat Martineau <martineau@kernel.org>
To: Yunsheng Lin <linyunsheng@huawei.com>
Cc: Paolo Abeni <pabeni@redhat.com>,
	davem@davemloft.net, kuba@kernel.org,  netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org,
	 Ayush Sawal <ayush.sawal@chelsio.com>,
	Eric Dumazet <edumazet@google.com>,
	 Willem de Bruijn <willemdebruijn.kernel@gmail.com>,
	 Jason Wang <jasowang@redhat.com>, Ingo Molnar <mingo@redhat.com>,
	 Peter Zijlstra <peterz@infradead.org>,
	Juri Lelli <juri.lelli@redhat.com>,
	 Vincent Guittot <vincent.guittot@linaro.org>,
	 Dietmar Eggemann <dietmar.eggemann@arm.com>,
	 Steven Rostedt <rostedt@goodmis.org>,
	Ben Segall <bsegall@google.com>,  Mel Gorman <mgorman@suse.de>,
	 Daniel Bristot de Oliveira <bristot@redhat.com>,
	 Valentin Schneider <vschneid@redhat.com>,
	 John Fastabend <john.fastabend@gmail.com>,
	 Jakub Sitnicki <jakub@cloudflare.com>,
	David Ahern <dsahern@kernel.org>,
	 Matthieu Baerts <matttbe@kernel.org>,
	Geliang Tang <geliang@kernel.org>,
	 Boris Pismenny <borisp@nvidia.com>,
	bpf@vger.kernel.org,  mptcp@lists.linux.dev
Subject: Re: [PATCH net-next v2 13/15] net: replace page_frag with page_frag_cache
Date: Tue, 16 Apr 2024 14:40:17 -0700 (PDT)	[thread overview]
Message-ID: <83991c67-8e4a-c287-b4a5-5dbba8835947@kernel.org> (raw)
In-Reply-To: <cb541985-a06d-7a71-9e6d-38827ccdf875@huawei.com>

On Tue, 16 Apr 2024, Yunsheng Lin wrote:

> On 2024/4/16 9:37, Mat Martineau wrote:
>> On Mon, 15 Apr 2024, Yunsheng Lin wrote:
>>
>>> Use the newly introduced prepare/commit API to replace
>>> page_frag with page_frag_cache for sk_page_frag().
>>>
>>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
>>> ---
>>> .../chelsio/inline_crypto/chtls/chtls.h       |   3 -
>>> .../chelsio/inline_crypto/chtls/chtls_io.c    | 101 ++++---------
>>> .../chelsio/inline_crypto/chtls/chtls_main.c  |   3 -
>>> drivers/net/tun.c                             |  34 ++---
>>> include/linux/sched.h                         |   4 +-
>>> include/net/sock.h                            |  14 +-
>>> kernel/exit.c                                 |   3 +-
>>> kernel/fork.c                                 |   2 +-
>>> net/core/skbuff.c                             |  32 ++--
>>> net/core/skmsg.c                              |  22 +--
>>> net/core/sock.c                               |  46 ++++--
>>> net/ipv4/ip_output.c                          |  35 +++--
>>> net/ipv4/tcp.c                                |  35 ++---
>>> net/ipv4/tcp_output.c                         |  28 ++--
>>> net/ipv6/ip6_output.c                         |  35 +++--
>>> net/kcm/kcmsock.c                             |  30 ++--
>>> net/mptcp/protocol.c                          |  74 ++++++----
>>> net/tls/tls_device.c                          | 139 ++++++++++--------
>>> 18 files changed, 342 insertions(+), 298 deletions(-)
>>
>> Hi Yunsheng,
>>
>> Just focusing on mptcp:
>
> Thanks for reviewing.
>
>>
>>> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
>>> index f8bc34f0d973..368dd480c4cd 100644
>>> --- a/net/mptcp/protocol.c
>>> +++ b/net/mptcp/protocol.c
>>> @@ -959,17 +959,16 @@ static bool mptcp_skb_can_collapse_to(u64 write_seq,
>>> }
>>>
>>> /* we can append data to the given data frag if:
>>> - * - there is space available in the backing page_frag
>>> - * - the data frag tail matches the current page_frag free offset
>>> + * - the data frag tail matches the current page and offset
>>>  * - the data frag end sequence number matches the current write seq
>>>  */
>>> static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
>>> -                       const struct page_frag *pfrag,
>>> +                       const struct page *page,
>>> +                       const unsigned int offset,
>>>                        const struct mptcp_data_frag *df)
>>> {
>>> -    return df && pfrag->page == df->page &&
>>> -        pfrag->size - pfrag->offset > 0 &&
>>> -        pfrag->offset == (df->offset + df->data_len) &&
>>> +    return df && page == df->page &&
>>> +        offset == (df->offset + df->data_len) &&
>>>         df->data_seq + df->data_len == msk->write_seq;
>>> }
>>>
>>> @@ -1084,30 +1083,36 @@ static void mptcp_enter_memory_pressure(struct sock *sk)
>>> /* ensure we get enough memory for the frag hdr, beyond some minimal amount of
>>>  * data
>>>  */
>>> -static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
>>> +static struct page *mptcp_page_frag_alloc_prepare(struct sock *sk,
>>> +                          struct page_frag_cache *pfrag,
>>> +                          unsigned int *offset,
>>> +                          unsigned int *size, void **va)
>>> {
>>> -    if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
>>> -                    pfrag, sk->sk_allocation)))
>>> -        return true;
>>> +    struct page *page;
>>> +
>>> +    page = page_frag_alloc_prepare(pfrag, offset, size, va,
>>> +                       sk->sk_allocation);
>>> +    if (likely(page))
>>> +        return page;
>>>
>>>     mptcp_enter_memory_pressure(sk);
>>> -    return false;
>>> +    return NULL;
>>> }
>>>
>>> static struct mptcp_data_frag *
>>> -mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
>>> -              int orig_offset)
>>> +mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page *page,
>>> +              unsigned int orig_offset)
>>> {
>>>     int offset = ALIGN(orig_offset, sizeof(long));
>>>     struct mptcp_data_frag *dfrag;
>>>
>>> -    dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset);
>>> +    dfrag = (struct mptcp_data_frag *)(page_to_virt(page) + offset);
>>>     dfrag->data_len = 0;
>>>     dfrag->data_seq = msk->write_seq;
>>>     dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag);
>>>     dfrag->offset = offset + sizeof(struct mptcp_data_frag);
>>>     dfrag->already_sent = 0;
>>> -    dfrag->page = pfrag->page;
>>> +    dfrag->page = page;
>>>
>>>     return dfrag;
>>> }
>>> @@ -1792,7 +1797,7 @@ static u32 mptcp_send_limit(const struct sock *sk)
>>> static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>>> {
>>>     struct mptcp_sock *msk = mptcp_sk(sk);
>>> -    struct page_frag *pfrag;
>>> +    struct page_frag_cache *pfrag;
>>>     size_t copied = 0;
>>>     int ret = 0;
>>>     long timeo;
>>> @@ -1831,9 +1836,12 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>>>     while (msg_data_left(msg)) {
>>>         int total_ts, frag_truesize = 0;
>>>         struct mptcp_data_frag *dfrag;
>>> -        bool dfrag_collapsed;
>>> -        size_t psize, offset;
>>> +        bool dfrag_collapsed = false;
>>> +        unsigned int offset, size;
>>> +        struct page *page;
>>> +        size_t psize;
>>>         u32 copy_limit;
>>> +        void *va;
>>>
>>>         /* ensure fitting the notsent_lowat() constraint */
>>>         copy_limit = mptcp_send_limit(sk);
>>> @@ -1844,21 +1852,31 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>>>          * page allocator
>>>          */
>>>         dfrag = mptcp_pending_tail(sk);
>>> -        dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
>>> +        size = 32U;
>>> +        page = mptcp_page_frag_alloc_prepare(sk, pfrag, &offset, &size,
>>> +                             &va);
>>> +        if (!page)
>>> +            goto wait_for_memory;
>>> +
>>> +        dfrag_collapsed = mptcp_frag_can_collapse_to(msk, page, offset,
>>> +                                 dfrag);
>>>         if (!dfrag_collapsed) {
>>> -            if (!mptcp_page_frag_refill(sk, pfrag))
>>> +            size = 32U + sizeof(struct mptcp_data_frag);
>>> +            page = mptcp_page_frag_alloc_prepare(sk, pfrag, &offset,
>>> +                                 &size, &va);
>>
>> Since 'size' was updated to contain the maximum available space on the 
>> first call to mptcp_page_frag_alloc_prepare(), is it necessary to call 
>> it again instead of checking to see if 'size' is large enough for the 
>> mptcp_data_frag struct?
>
> As the first call to the mptcp_page_frag_alloc_prepare() with the size
> being 32U, the maximum available space might less than '32U +
> sizeof(struct mptcp_data_frag)', in that case we need to call the
> mptcp_page_frag_alloc_prepare() with the size being '32U +
> sizeof(struct mptcp_data_frag)' anyway, so I am not sure if checking
> the maximum available space on the first call to
> mptcp_page_frag_alloc_prepare() before making the second call will
> make the thing simpler.

Ah, ok. If the larger amount of space is available the underlying call to 
page_frag_cache_refill() ends up being very low overhead. So I agree with 
you: it's ok to call mptcp_page_alloc_prepare() a second time.

>
>>
>>> +            if (!page)
>>>                 goto wait_for_memory;
>>>
>>> -            dfrag = mptcp_carve_data_frag(msk, pfrag, pfrag->offset);
>>> +            dfrag = mptcp_carve_data_frag(msk, page, offset);
>>>             frag_truesize = dfrag->overhead;
>>> +            va += dfrag->overhead;
>>>         }
>>>
>>>         /* we do not bound vs wspace, to allow a single packet.
>>>          * memory accounting will prevent execessive memory usage
>>>          * anyway
>>>          */
>>> -        offset = dfrag->offset + dfrag->data_len;
>>> -        psize = pfrag->size - offset;
>>> +        psize = size - frag_truesize;
>>>         psize = min_t(size_t, psize, msg_data_left(msg));
>>>         psize = min_t(size_t, psize, copy_limit);
>>>         total_ts = psize + frag_truesize;
>>> @@ -1866,8 +1884,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>>>         if (!sk_wmem_schedule(sk, total_ts))
>>>             goto wait_for_memory;
>>>
>>> -        ret = do_copy_data_nocache(sk, psize, &msg->msg_iter,
>>> -                       page_address(dfrag->page) + offset);
>>> +        ret = do_copy_data_nocache(sk, psize, &msg->msg_iter, va);
>>>         if (ret)
>>>             goto do_error;
>>>
>>> @@ -1876,7 +1893,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>>>         copied += psize;
>>>         dfrag->data_len += psize;
>>>         frag_truesize += psize;
>>> -        pfrag->offset += frag_truesize;
>>>         WRITE_ONCE(msk->write_seq, msk->write_seq + psize);
>>>
>>>         /* charge data on mptcp pending queue to the msk socket
>>> @@ -1884,11 +1900,15 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>>>          */
>>>         sk_wmem_queued_add(sk, frag_truesize);
>>>         if (!dfrag_collapsed) {
>>> -            get_page(dfrag->page);
>>> +            page_frag_alloc_commit(pfrag, offset, frag_truesize);
>>
>> It would be more efficient (but more complicated) to defer the commit 
>> until the loop is done or the maximum frag size is reached. This would 
>> perform more like the older code, which only had to call refill when 
>> mptcp_frag_can_collapse_to() returned false.
>
> page_frag_alloc_commit() is a inlined helper, it does not seems
> to be an issue here as it is updating the reference counting
> and offset as the old code does with less overhead.
>

I wasn't concerned as much about the direct cost of the inlined 
page_frag_alloc_commit() helper, it was that we could make fewer prepare 
calls if the commit was deferred as long as possible. As we discussed 
above, I see now that the prepare is not expensive when there is more 
space available in the current frag.

> Maybe what we could do is to do the prepare in the inline
> helper instead of a function when cache is enough, so that
> we can avoid a function call as the old code does, as an
> inlined function requires less overhead and is generally
> faster than a function call.
>
> But that requires more refactoring, as this patchset is bigger
> enough now, I guess we try it later if it is possible.

A more generic (possible) optimization would be to inline some of 
page_frag_cache_refill(), but I'm not sure the code size tradeoff is 
worth it - would have to collect some data to find out for sure!

Thanks,

Mat

  reply	other threads:[~2024-04-16 21:40 UTC|newest]

Thread overview: 42+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-04-15 13:19 [PATCH net-next v2 00/15] First try to replace page_frag with page_frag_cache Yunsheng Lin
2024-04-15 13:19 ` [PATCH net-next v2 01/15] mm: page_frag: add a test module for page_frag Yunsheng Lin
2024-04-15 13:19 ` [PATCH net-next v2 02/15] xtensa: remove the get_order() implementation Yunsheng Lin
2024-04-15 15:04   ` Max Filippov
2024-04-15 13:19 ` [PATCH net-next v2 03/15] mm: page_frag: use free_unref_page() to free page fragment Yunsheng Lin
2024-04-15 13:19 ` [PATCH net-next v2 04/15] mm: move the page fragment allocator from page_alloc into its own file Yunsheng Lin
2024-04-15 13:19 ` [PATCH net-next v2 05/15] mm: page_frag: use initial zero offset for page_frag_alloc_align() Yunsheng Lin
2024-04-15 23:55   ` Alexander H Duyck
2024-04-16 13:11     ` Yunsheng Lin
2024-04-16 15:51       ` Alexander H Duyck
2024-04-17 13:17         ` Yunsheng Lin
2024-04-15 13:19 ` [PATCH net-next v2 06/15] mm: page_frag: change page_frag_alloc_* API to accept align param Yunsheng Lin
2024-04-16 16:08   ` Alexander Duyck
2024-04-17 13:18     ` Yunsheng Lin
2024-04-15 13:19 ` [PATCH net-next v2 07/15] mm: page_frag: add '_va' suffix to page_frag API Yunsheng Lin
2024-04-16 16:12   ` Alexander H Duyck
2024-04-17 13:18     ` Yunsheng Lin
2024-04-15 13:19 ` [PATCH net-next v2 08/15] mm: page_frag: add two inline helper for " Yunsheng Lin
2024-04-15 13:19 ` [PATCH net-next v2 09/15] mm: page_frag: reuse MSB of 'size' field for pfmemalloc Yunsheng Lin
2024-04-16 16:22   ` Alexander H Duyck
2024-04-17 13:19     ` Yunsheng Lin
2024-04-17 15:11       ` Alexander H Duyck
2024-04-18  9:39         ` Yunsheng Lin
2024-04-26  9:38           ` Yunsheng Lin
2024-04-29 14:49             ` Alexander Duyck
2024-04-30 12:05               ` Yunsheng Lin
2024-04-30 14:54                 ` Alexander Duyck
2024-05-06 12:33                   ` Yunsheng Lin
2024-04-15 13:19 ` [PATCH net-next v2 10/15] mm: page_frag: reuse existing bit field of 'va' for pagecnt_bias Yunsheng Lin
2024-04-16 16:33   ` Alexander H Duyck
2024-04-17 13:23     ` Yunsheng Lin
2024-04-15 13:19 ` [PATCH net-next v2 11/15] net: introduce the skb_copy_to_va_nocache() helper Yunsheng Lin
2024-04-15 13:19 ` [PATCH net-next v2 12/15] mm: page_frag: introduce prepare/commit API for page_frag Yunsheng Lin
2024-04-15 13:19 ` [PATCH net-next v2 13/15] net: replace page_frag with page_frag_cache Yunsheng Lin
2024-04-16  1:37   ` Mat Martineau
2024-04-16 13:11     ` Yunsheng Lin
2024-04-16 21:40       ` Mat Martineau [this message]
2024-04-19 12:37         ` Yunsheng Lin
2024-04-15 13:19 ` [PATCH net-next v2 14/15] mm: page_frag: update documentation for page_frag Yunsheng Lin
2024-04-16  6:13   ` Bagas Sanjaya
2024-04-16 13:11     ` Yunsheng Lin
2024-04-15 13:19 ` [PATCH net-next v2 15/15] mm: page_frag: add a entry in MAINTAINERS " Yunsheng Lin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=83991c67-8e4a-c287-b4a5-5dbba8835947@kernel.org \
    --to=martineau@kernel.org \
    --cc=ayush.sawal@chelsio.com \
    --cc=borisp@nvidia.com \
    --cc=bpf@vger.kernel.org \
    --cc=bristot@redhat.com \
    --cc=bsegall@google.com \
    --cc=davem@davemloft.net \
    --cc=dietmar.eggemann@arm.com \
    --cc=dsahern@kernel.org \
    --cc=edumazet@google.com \
    --cc=geliang@kernel.org \
    --cc=jakub@cloudflare.com \
    --cc=jasowang@redhat.com \
    --cc=john.fastabend@gmail.com \
    --cc=juri.lelli@redhat.com \
    --cc=kuba@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linyunsheng@huawei.com \
    --cc=matttbe@kernel.org \
    --cc=mgorman@suse.de \
    --cc=mingo@redhat.com \
    --cc=mptcp@lists.linux.dev \
    --cc=netdev@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=peterz@infradead.org \
    --cc=rostedt@goodmis.org \
    --cc=vincent.guittot@linaro.org \
    --cc=vschneid@redhat.com \
    --cc=willemdebruijn.kernel@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).