From: Kundan Kumar <kundan.kumar@samsung.com>
To: Hannes Reinecke <hare@suse.de>
Cc: axboe@kernel.dk, hch@lst.de, willy@infradead.org,
kbusch@kernel.org, linux-block@vger.kernel.org,
joshi.k@samsung.com, mcgrof@kernel.org, anuj20.g@samsung.com,
nj.shetty@samsung.com, c.gameti@samsung.com,
gost.dev@samsung.com
Subject: Re: [PATCH v5 2/3] block: add folio awareness instead of looping through pages
Date: Thu, 20 Jun 2024 10:18:42 +0530 [thread overview]
Message-ID: <20240620044842.oxbryv2i7q4muiwf@green245> (raw)
In-Reply-To: <c29524a5-f3c7-4236-968c-58b5f3004b66@suse.de>
[-- Attachment #1: Type: text/plain, Size: 8284 bytes --]
On 19/06/24 09:47AM, Hannes Reinecke wrote:
>On 6/19/24 04:34, Kundan Kumar wrote:
>>Add a bigger size from folio to bio and skip merge processing for pages.
>>
>>Fetch the offset of page within a folio. Depending on the size of folio
>>and folio_offset, fetch a larger length. This length may consist of
>>multiple contiguous pages if folio is multiorder.
>>
>>Using the length calculate number of pages which will be added to bio and
>>increment the loop counter to skip those pages.
>>
>>Using a helper function check if pages are contiguous and belong to same
>>folio, this is done as a COW may happen and change contiguous mapping of
>>pages of folio.
>>
>>This technique helps to avoid overhead of merging pages which belong to
>>same large order folio.
>>
>>Also folio-lize the functions bio_iov_add_page() and
>>bio_iov_add_zone_append_page()
>>
>>Signed-off-by: Kundan Kumar <kundan.kumar@samsung.com>
>>---
>> block/bio.c | 72 ++++++++++++++++++++++++++++++++++++++++++-----------
>> 1 file changed, 58 insertions(+), 14 deletions(-)
>>
>>diff --git a/block/bio.c b/block/bio.c
>>index c8914febb16e..3e75b5b0eb6e 100644
>>--- a/block/bio.c
>>+++ b/block/bio.c
>>@@ -1224,7 +1224,7 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
>> bio_set_flag(bio, BIO_CLONED);
>> }
>>
>>-static int bio_iov_add_page(struct bio *bio, struct page *page,
>>+static int bio_iov_add_folio(struct bio *bio, struct folio *folio,
>> unsigned int len, unsigned int offset)
>> {
>> bool same_page = false;
>>@@ -1234,30 +1234,60 @@ static int bio_iov_add_page(struct bio *bio, struct page *page,
>>
>> if (bio->bi_vcnt > 0 &&
>> bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
>>- page, len, offset, &same_page)) {
>>+ folio_page(folio, 0), len, offset,
>>+ &same_page)) {
>> bio->bi_iter.bi_size += len;
>> if (same_page)
>>- bio_release_page(bio, page);
>>+ bio_release_page(bio, folio_page(folio, 0));
>> return 0;
>> }
>>- __bio_add_page(bio, page, len, offset);
>>+ bio_add_folio_nofail(bio, folio, len, offset);
>> return 0;
>> }
>>
>>-static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page,
>>+static int bio_iov_add_zone_append_folio(struct bio *bio, struct folio *folio,
>> unsigned int len, unsigned int offset)
>> {
>> struct request_queue *q = bdev_get_queue(bio->bi_bdev);
>> bool same_page = false;
>>
>>- if (bio_add_hw_page(q, bio, page, len, offset,
>>+ if (bio_add_hw_folio(q, bio, folio, len, offset,
>> queue_max_zone_append_sectors(q), &same_page) != len)
>> return -EINVAL;
>> if (same_page)
>>- bio_release_page(bio, page);
>>+ bio_release_page(bio, folio_page(folio, 0));
>> return 0;
>> }
>>
>>+static unsigned int get_contig_folio_len(int *num_pages, struct page **pages,
>>+ int i, struct folio *folio,
>>+ ssize_t left, size_t offset)
>>+{
>>+ ssize_t bytes = left;
>>+ size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, bytes);
>>+ unsigned int j;
>>+
>>+ /*
>>+ * We might COW a single page in the middle of
>>+ * a large folio, so we have to check that all
>>+ * pages belong to the same folio.
>>+ */
>>+ bytes -= contig_sz;
>>+ for (j = i + 1; j < i + *num_pages; j++) {
>>+ size_t next = min_t(size_t, PAGE_SIZE, bytes);
>>+
>>+ if (page_folio(pages[j]) != folio ||
>>+ pages[j] != pages[j - 1] + 1) {
>>+ break;
>>+ }
>>+ contig_sz += next;
>>+ bytes -= next;
>>+ }
>>+ *num_pages = j - i;
>>+
>>+ return contig_sz;
>>+}
>>+
>> #define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *))
>>
>> /**
>>@@ -1277,9 +1307,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>> unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
>> struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
>> struct page **pages = (struct page **)bv;
>>- ssize_t size, left;
>>- unsigned len, i = 0;
>>- size_t offset;
>>+ ssize_t size, left, len;
>>+ unsigned int i = 0, num_pages;
>>+ size_t offset, folio_offset;
>> int ret = 0;
>>
>> /*
>>@@ -1321,15 +1351,29 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>>
>> for (left = size, i = 0; left > 0; left -= len, i++) {
>> struct page *page = pages[i];
>>+ struct folio *folio = page_folio(page);
>>+
>>+ folio_offset = (folio_page_idx(folio, page) << PAGE_SHIFT) +
>>+ offset;
>>+
>>+ len = min_t(size_t, (folio_size(folio) - folio_offset), left);
>>+
>>+ num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
>>+
>>+ if (num_pages > 1)
>>+ len = get_contig_folio_len(&num_pages, pages, i,
>>+ folio, left, offset);
>>
>>- len = min_t(size_t, PAGE_SIZE - offset, left);
>> if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
>>- ret = bio_iov_add_zone_append_page(bio, page, len,
>>- offset);
>>+ ret = bio_iov_add_zone_append_folio(bio, folio, len,
>>+ folio_offset);
>> if (ret)
>> break;
>> } else
>>- bio_iov_add_page(bio, page, len, offset);
>>+ bio_iov_add_folio(bio, folio, len, folio_offset);
>>+
>>+ /* Skip the pages which got added */
>>+ i = i + (num_pages - 1);
>>
>> offset = 0;
>> }
>>--
>>2.25.1
>>
>>
>
>Well. The issue here is that bvecs really only use the 'struct page'
>entry as an address to the data; the page size itself is completely
>immaterial. So from that perspective it doesn't really matter whether
>we use 'struct page' or 'struct folio' to get to that address.
>However, what matters is whether we _iterate_ over pages or folios.
>The current workflow is to first allocate an array of pages,
>call one of the _get_pages() variants, and the iterate over all
>pages.
>What we should be doing is to add _get_folios() variants, working
>on folio batches and not pre-allocated arrays.
The XXX_get_pages() functions do a page table walk and fill the pages
array with the pages corresponding to a user space addr. The _get_folios()
variants shall return a folio_vec, rather than a folio array, as every folio
entry will need a folio_offset and len per folio. If we convert to
_get_folios() variants, the number of folios may be smaller than the number
of pages. But we will need allocation of a folio_vec array as a replacement
for the pages array.
Am I missing something?
Down in the page table walk (gup_fast_pte_range), we fill the pages array
addr -> pte -> page. This shall be modified to fill a folio_vec array. The
page table walk also deals with huge pages, and it looks like the huge page
functions shall also be modified to fill a folio_vec array. Also, handling
the gup slow path will need a modification to fill the folio_vec array.
--
Kundan
>Then we could iterate over all folios in the batch, and can modify
>the 'XXX_get_pages()' variants to extract pages from the folio batch.
>And then gradually move over all callers to work on folio batches.
>
>Tall order, but I fear this is the best way going forward.
>Matthew? Christoph? Is that what you had in mind?
>
>Cheers,
>
>Hannes
>--
>Dr. Hannes Reinecke Kernel Storage Architect
>hare@suse.de +49 911 74053 688
>SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
>HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
>
[-- Attachment #2: Type: text/plain, Size: 0 bytes --]
next prev parent reply other threads:[~2024-06-20 5:17 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <CGME20240619024142epcas5p22b2e0f83e526fc74fda62a4837bed544@epcas5p2.samsung.com>
2024-06-19 2:34 ` [PATCH v5 0/3] block: add larger order folio instead of pages Kundan Kumar
2024-06-19 2:34 ` [PATCH v5 1/3] block: Added folio-lized version of bio_add_hw_page() Kundan Kumar
2024-06-20 3:47 ` Matthew Wilcox
2024-06-19 2:34 ` [PATCH v5 2/3] block: add folio awareness instead of looping through pages Kundan Kumar
2024-06-19 7:47 ` Hannes Reinecke
2024-06-20 4:48 ` Kundan Kumar [this message]
2024-06-20 7:21 ` Hannes Reinecke
2024-06-19 2:34 ` [PATCH v5 3/3] block: unpin user pages belonging to a folio Kundan Kumar
2024-06-20 3:51 ` Matthew Wilcox
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240620044842.oxbryv2i7q4muiwf@green245 \
--to=kundan.kumar@samsung.com \
--cc=anuj20.g@samsung.com \
--cc=axboe@kernel.dk \
--cc=c.gameti@samsung.com \
--cc=gost.dev@samsung.com \
--cc=hare@suse.de \
--cc=hch@lst.de \
--cc=joshi.k@samsung.com \
--cc=kbusch@kernel.org \
--cc=linux-block@vger.kernel.org \
--cc=mcgrof@kernel.org \
--cc=nj.shetty@samsung.com \
--cc=willy@infradead.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox