From: Mat Martineau <martineau@kernel.org>
To: Yunsheng Lin <linyunsheng@huawei.com>
Cc: Paolo Abeni <pabeni@redhat.com>,
davem@davemloft.net, kuba@kernel.org, netdev@vger.kernel.org,
linux-kernel@vger.kernel.org,
Ayush Sawal <ayush.sawal@chelsio.com>,
Eric Dumazet <edumazet@google.com>,
Willem de Bruijn <willemdebruijn.kernel@gmail.com>,
Jason Wang <jasowang@redhat.com>, Ingo Molnar <mingo@redhat.com>,
Peter Zijlstra <peterz@infradead.org>,
Juri Lelli <juri.lelli@redhat.com>,
Vincent Guittot <vincent.guittot@linaro.org>,
Dietmar Eggemann <dietmar.eggemann@arm.com>,
Steven Rostedt <rostedt@goodmis.org>,
Ben Segall <bsegall@google.com>, Mel Gorman <mgorman@suse.de>,
Daniel Bristot de Oliveira <bristot@redhat.com>,
Valentin Schneider <vschneid@redhat.com>,
John Fastabend <john.fastabend@gmail.com>,
Jakub Sitnicki <jakub@cloudflare.com>,
David Ahern <dsahern@kernel.org>,
Matthieu Baerts <matttbe@kernel.org>,
Geliang Tang <geliang@kernel.org>,
Boris Pismenny <borisp@nvidia.com>,
bpf@vger.kernel.org, mptcp@lists.linux.dev
Subject: Re: [PATCH net-next v2 13/15] net: replace page_frag with page_frag_cache
Date: Tue, 16 Apr 2024 14:40:17 -0700 (PDT) [thread overview]
Message-ID: <83991c67-8e4a-c287-b4a5-5dbba8835947@kernel.org> (raw)
In-Reply-To: <cb541985-a06d-7a71-9e6d-38827ccdf875@huawei.com>
On Tue, 16 Apr 2024, Yunsheng Lin wrote:
> On 2024/4/16 9:37, Mat Martineau wrote:
>> On Mon, 15 Apr 2024, Yunsheng Lin wrote:
>>
>>> Use the newly introduced prepare/commit API to replace
>>> page_frag with page_frag_cache for sk_page_frag().
>>>
>>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
>>> ---
>>> .../chelsio/inline_crypto/chtls/chtls.h | 3 -
>>> .../chelsio/inline_crypto/chtls/chtls_io.c | 101 ++++---------
>>> .../chelsio/inline_crypto/chtls/chtls_main.c | 3 -
>>> drivers/net/tun.c | 34 ++---
>>> include/linux/sched.h | 4 +-
>>> include/net/sock.h | 14 +-
>>> kernel/exit.c | 3 +-
>>> kernel/fork.c | 2 +-
>>> net/core/skbuff.c | 32 ++--
>>> net/core/skmsg.c | 22 +--
>>> net/core/sock.c | 46 ++++--
>>> net/ipv4/ip_output.c | 35 +++--
>>> net/ipv4/tcp.c | 35 ++---
>>> net/ipv4/tcp_output.c | 28 ++--
>>> net/ipv6/ip6_output.c | 35 +++--
>>> net/kcm/kcmsock.c | 30 ++--
>>> net/mptcp/protocol.c | 74 ++++++----
>>> net/tls/tls_device.c | 139 ++++++++++--------
>>> 18 files changed, 342 insertions(+), 298 deletions(-)
>>
>> Hi Yunsheng,
>>
>> Just focusing on mptcp:
>
> Thanks for reviewing.
>
>>
>>> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
>>> index f8bc34f0d973..368dd480c4cd 100644
>>> --- a/net/mptcp/protocol.c
>>> +++ b/net/mptcp/protocol.c
>>> @@ -959,17 +959,16 @@ static bool mptcp_skb_can_collapse_to(u64 write_seq,
>>> }
>>>
>>> /* we can append data to the given data frag if:
>>> - * - there is space available in the backing page_frag
>>> - * - the data frag tail matches the current page_frag free offset
>>> + * - the data frag tail matches the current page and offset
>>> * - the data frag end sequence number matches the current write seq
>>> */
>>> static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
>>> - const struct page_frag *pfrag,
>>> + const struct page *page,
>>> + const unsigned int offset,
>>> const struct mptcp_data_frag *df)
>>> {
>>> - return df && pfrag->page == df->page &&
>>> - pfrag->size - pfrag->offset > 0 &&
>>> - pfrag->offset == (df->offset + df->data_len) &&
>>> + return df && page == df->page &&
>>> + offset == (df->offset + df->data_len) &&
>>> df->data_seq + df->data_len == msk->write_seq;
>>> }
>>>
>>> @@ -1084,30 +1083,36 @@ static void mptcp_enter_memory_pressure(struct sock *sk)
>>> /* ensure we get enough memory for the frag hdr, beyond some minimal amount of
>>> * data
>>> */
>>> -static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
>>> +static struct page *mptcp_page_frag_alloc_prepare(struct sock *sk,
>>> + struct page_frag_cache *pfrag,
>>> + unsigned int *offset,
>>> + unsigned int *size, void **va)
>>> {
>>> - if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
>>> - pfrag, sk->sk_allocation)))
>>> - return true;
>>> + struct page *page;
>>> +
>>> + page = page_frag_alloc_prepare(pfrag, offset, size, va,
>>> + sk->sk_allocation);
>>> + if (likely(page))
>>> + return page;
>>>
>>> mptcp_enter_memory_pressure(sk);
>>> - return false;
>>> + return NULL;
>>> }
>>>
>>> static struct mptcp_data_frag *
>>> -mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
>>> - int orig_offset)
>>> +mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page *page,
>>> + unsigned int orig_offset)
>>> {
>>> int offset = ALIGN(orig_offset, sizeof(long));
>>> struct mptcp_data_frag *dfrag;
>>>
>>> - dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset);
>>> + dfrag = (struct mptcp_data_frag *)(page_to_virt(page) + offset);
>>> dfrag->data_len = 0;
>>> dfrag->data_seq = msk->write_seq;
>>> dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag);
>>> dfrag->offset = offset + sizeof(struct mptcp_data_frag);
>>> dfrag->already_sent = 0;
>>> - dfrag->page = pfrag->page;
>>> + dfrag->page = page;
>>>
>>> return dfrag;
>>> }
>>> @@ -1792,7 +1797,7 @@ static u32 mptcp_send_limit(const struct sock *sk)
>>> static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>>> {
>>> struct mptcp_sock *msk = mptcp_sk(sk);
>>> - struct page_frag *pfrag;
>>> + struct page_frag_cache *pfrag;
>>> size_t copied = 0;
>>> int ret = 0;
>>> long timeo;
>>> @@ -1831,9 +1836,12 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>>> while (msg_data_left(msg)) {
>>> int total_ts, frag_truesize = 0;
>>> struct mptcp_data_frag *dfrag;
>>> - bool dfrag_collapsed;
>>> - size_t psize, offset;
>>> + bool dfrag_collapsed = false;
>>> + unsigned int offset, size;
>>> + struct page *page;
>>> + size_t psize;
>>> u32 copy_limit;
>>> + void *va;
>>>
>>> /* ensure fitting the notsent_lowat() constraint */
>>> copy_limit = mptcp_send_limit(sk);
>>> @@ -1844,21 +1852,31 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>>> * page allocator
>>> */
>>> dfrag = mptcp_pending_tail(sk);
>>> - dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
>>> + size = 32U;
>>> + page = mptcp_page_frag_alloc_prepare(sk, pfrag, &offset, &size,
>>> + &va);
>>> + if (!page)
>>> + goto wait_for_memory;
>>> +
>>> + dfrag_collapsed = mptcp_frag_can_collapse_to(msk, page, offset,
>>> + dfrag);
>>> if (!dfrag_collapsed) {
>>> - if (!mptcp_page_frag_refill(sk, pfrag))
>>> + size = 32U + sizeof(struct mptcp_data_frag);
>>> + page = mptcp_page_frag_alloc_prepare(sk, pfrag, &offset,
>>> + &size, &va);
>>
>> Since 'size' was updated to contain the maximum available space on the
>> first call to mptcp_page_frag_alloc_prepare(), is it necessary to call
>> it again instead of checking to see if 'size' is large enough for the
>> mptcp_data_frag struct?
>
> As the first call to the mptcp_page_frag_alloc_prepare() with the size
> being 32U, the maximum available space might be less than '32U +
> sizeof(struct mptcp_data_frag)', in that case we need to call the
> mptcp_page_frag_alloc_prepare() with the size being '32U +
> sizeof(struct mptcp_data_frag)' anyway, so I am not sure if checking
> the maximum available space on the first call to
> mptcp_page_frag_alloc_prepare() before making the second call will
> make the thing simpler.
Ah, ok. If the larger amount of space is available the underlying call to
page_frag_cache_refill() ends up being very low overhead. So I agree with
you: it's ok to call mptcp_page_frag_alloc_prepare() a second time.
>
>>
>>> + if (!page)
>>> goto wait_for_memory;
>>>
>>> - dfrag = mptcp_carve_data_frag(msk, pfrag, pfrag->offset);
>>> + dfrag = mptcp_carve_data_frag(msk, page, offset);
>>> frag_truesize = dfrag->overhead;
>>> + va += dfrag->overhead;
>>> }
>>>
>>> /* we do not bound vs wspace, to allow a single packet.
>>> * memory accounting will prevent execessive memory usage
>>> * anyway
>>> */
>>> - offset = dfrag->offset + dfrag->data_len;
>>> - psize = pfrag->size - offset;
>>> + psize = size - frag_truesize;
>>> psize = min_t(size_t, psize, msg_data_left(msg));
>>> psize = min_t(size_t, psize, copy_limit);
>>> total_ts = psize + frag_truesize;
>>> @@ -1866,8 +1884,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>>> if (!sk_wmem_schedule(sk, total_ts))
>>> goto wait_for_memory;
>>>
>>> - ret = do_copy_data_nocache(sk, psize, &msg->msg_iter,
>>> - page_address(dfrag->page) + offset);
>>> + ret = do_copy_data_nocache(sk, psize, &msg->msg_iter, va);
>>> if (ret)
>>> goto do_error;
>>>
>>> @@ -1876,7 +1893,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>>> copied += psize;
>>> dfrag->data_len += psize;
>>> frag_truesize += psize;
>>> - pfrag->offset += frag_truesize;
>>> WRITE_ONCE(msk->write_seq, msk->write_seq + psize);
>>>
>>> /* charge data on mptcp pending queue to the msk socket
>>> @@ -1884,11 +1900,15 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>>> */
>>> sk_wmem_queued_add(sk, frag_truesize);
>>> if (!dfrag_collapsed) {
>>> - get_page(dfrag->page);
>>> + page_frag_alloc_commit(pfrag, offset, frag_truesize);
>>
>> It would be more efficient (but more complicated) to defer the commit
>> until the loop is done or the maximum frag size is reached. This would
>> perform more like the older code, which only had to call refill when
>> mptcp_frag_can_collapse_to() returned false.
>
> page_frag_alloc_commit() is an inlined helper; it does not seem
> to be an issue here as it is updating the reference counting
> and offset as the old code does with less overhead.
>
I wasn't concerned as much about the direct cost of the inlined
page_frag_alloc_commit() helper; it was that we could make fewer prepare
calls if the commit was deferred as long as possible. As we discussed
above, I see now that the prepare is not expensive when there is more
space available in the current frag.
> Maybe what we could do is to do the prepare in the inline
> helper instead of a function when cache is enough, so that
> we can avoid a function call as the old code does, as an
> inlined function requires less overhead and is generally
> faster than a function call.
>
> But that requires more refactoring; as this patchset is big
> enough now, I guess we can try it later if it is possible.
A more generic (possible) optimization would be to inline some of
page_frag_cache_refill(), but I'm not sure the code size tradeoff is
worth it - would have to collect some data to find out for sure!
Thanks,
Mat
next prev parent reply other threads:[~2024-04-16 21:40 UTC|newest]
Thread overview: 42+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-04-15 13:19 [PATCH net-next v2 00/15] First try to replace page_frag with page_frag_cache Yunsheng Lin
2024-04-15 13:19 ` [PATCH net-next v2 01/15] mm: page_frag: add a test module for page_frag Yunsheng Lin
2024-04-15 13:19 ` [PATCH net-next v2 02/15] xtensa: remove the get_order() implementation Yunsheng Lin
2024-04-15 15:04 ` Max Filippov
2024-04-15 13:19 ` [PATCH net-next v2 03/15] mm: page_frag: use free_unref_page() to free page fragment Yunsheng Lin
2024-04-15 13:19 ` [PATCH net-next v2 04/15] mm: move the page fragment allocator from page_alloc into its own file Yunsheng Lin
2024-04-15 13:19 ` [PATCH net-next v2 05/15] mm: page_frag: use initial zero offset for page_frag_alloc_align() Yunsheng Lin
2024-04-15 23:55 ` Alexander H Duyck
2024-04-16 13:11 ` Yunsheng Lin
2024-04-16 15:51 ` Alexander H Duyck
2024-04-17 13:17 ` Yunsheng Lin
2024-04-15 13:19 ` [PATCH net-next v2 06/15] mm: page_frag: change page_frag_alloc_* API to accept align param Yunsheng Lin
2024-04-16 16:08 ` Alexander Duyck
2024-04-17 13:18 ` Yunsheng Lin
2024-04-15 13:19 ` [PATCH net-next v2 07/15] mm: page_frag: add '_va' suffix to page_frag API Yunsheng Lin
2024-04-16 16:12 ` Alexander H Duyck
2024-04-17 13:18 ` Yunsheng Lin
2024-04-15 13:19 ` [PATCH net-next v2 08/15] mm: page_frag: add two inline helper for " Yunsheng Lin
2024-04-15 13:19 ` [PATCH net-next v2 09/15] mm: page_frag: reuse MSB of 'size' field for pfmemalloc Yunsheng Lin
2024-04-16 16:22 ` Alexander H Duyck
2024-04-17 13:19 ` Yunsheng Lin
2024-04-17 15:11 ` Alexander H Duyck
2024-04-18 9:39 ` Yunsheng Lin
2024-04-26 9:38 ` Yunsheng Lin
2024-04-29 14:49 ` Alexander Duyck
2024-04-30 12:05 ` Yunsheng Lin
2024-04-30 14:54 ` Alexander Duyck
2024-05-06 12:33 ` Yunsheng Lin
2024-04-15 13:19 ` [PATCH net-next v2 10/15] mm: page_frag: reuse existing bit field of 'va' for pagecnt_bias Yunsheng Lin
2024-04-16 16:33 ` Alexander H Duyck
2024-04-17 13:23 ` Yunsheng Lin
2024-04-15 13:19 ` [PATCH net-next v2 11/15] net: introduce the skb_copy_to_va_nocache() helper Yunsheng Lin
2024-04-15 13:19 ` [PATCH net-next v2 12/15] mm: page_frag: introduce prepare/commit API for page_frag Yunsheng Lin
2024-04-15 13:19 ` [PATCH net-next v2 13/15] net: replace page_frag with page_frag_cache Yunsheng Lin
2024-04-16 1:37 ` Mat Martineau
2024-04-16 13:11 ` Yunsheng Lin
2024-04-16 21:40 ` Mat Martineau [this message]
2024-04-19 12:37 ` Yunsheng Lin
2024-04-15 13:19 ` [PATCH net-next v2 14/15] mm: page_frag: update documentation for page_frag Yunsheng Lin
2024-04-16 6:13 ` Bagas Sanjaya
2024-04-16 13:11 ` Yunsheng Lin
2024-04-15 13:19 ` [PATCH net-next v2 15/15] mm: page_frag: add a entry in MAINTAINERS " Yunsheng Lin
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=83991c67-8e4a-c287-b4a5-5dbba8835947@kernel.org \
--to=martineau@kernel.org \
--cc=ayush.sawal@chelsio.com \
--cc=borisp@nvidia.com \
--cc=bpf@vger.kernel.org \
--cc=bristot@redhat.com \
--cc=bsegall@google.com \
--cc=davem@davemloft.net \
--cc=dietmar.eggemann@arm.com \
--cc=dsahern@kernel.org \
--cc=edumazet@google.com \
--cc=geliang@kernel.org \
--cc=jakub@cloudflare.com \
--cc=jasowang@redhat.com \
--cc=john.fastabend@gmail.com \
--cc=juri.lelli@redhat.com \
--cc=kuba@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linyunsheng@huawei.com \
--cc=matttbe@kernel.org \
--cc=mgorman@suse.de \
--cc=mingo@redhat.com \
--cc=mptcp@lists.linux.dev \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=peterz@infradead.org \
--cc=rostedt@goodmis.org \
--cc=vincent.guittot@linaro.org \
--cc=vschneid@redhat.com \
--cc=willemdebruijn.kernel@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).