Re: [PATCH v5 2/3] block: add folio awareness instead of looping through pages

Linux block layer
 help / color / mirror / Atom feed

From: Kundan Kumar <kundan.kumar@samsung.com>
To: Hannes Reinecke <hare@suse.de>
Cc: axboe@kernel.dk, hch@lst.de, willy@infradead.org,
	kbusch@kernel.org, linux-block@vger.kernel.org,
	joshi.k@samsung.com, mcgrof@kernel.org, anuj20.g@samsung.com,
	nj.shetty@samsung.com, c.gameti@samsung.com,
	gost.dev@samsung.com
Subject: Re: [PATCH v5 2/3] block: add folio awareness instead of looping through pages
Date: Thu, 20 Jun 2024 10:18:42 +0530	[thread overview]
Message-ID: <20240620044842.oxbryv2i7q4muiwf@green245> (raw)
In-Reply-To: <c29524a5-f3c7-4236-968c-58b5f3004b66@suse.de>

[-- Attachment #1: Type: text/plain, Size: 8284 bytes --]

On 19/06/24 09:47AM, Hannes Reinecke wrote:
>On 6/19/24 04:34, Kundan Kumar wrote:
>>Add a bigger size from folio to bio and skip merge processing for pages.
>>
>>Fetch the offset of page within a folio. Depending on the size of folio
>>and folio_offset, fetch a larger length. This length may consist of
>>multiple contiguous pages if folio is multiorder.
>>
>>Using the length calculate number of pages which will be added to bio and
>>increment the loop counter to skip those pages.
>>
>>Using a helper function check if pages are contiguous and belong to same
>>folio, this is done as a COW may happen and change contiguous mapping of
>>pages of folio.
>>
>>This technique helps to avoid overhead of merging pages which belong to
>>same large order folio.
>>
>>Also folio-lize the functions bio_iov_add_page() and
>>bio_iov_add_zone_append_page()
>>
>>Signed-off-by: Kundan Kumar <kundan.kumar@samsung.com>
>>---
>>  block/bio.c | 72 ++++++++++++++++++++++++++++++++++++++++++-----------
>>  1 file changed, 58 insertions(+), 14 deletions(-)
>>
>>diff --git a/block/bio.c b/block/bio.c
>>index c8914febb16e..3e75b5b0eb6e 100644
>>--- a/block/bio.c
>>+++ b/block/bio.c
>>@@ -1224,7 +1224,7 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
>>         bio_set_flag(bio, BIO_CLONED);
>>  }
>>
>>-static int bio_iov_add_page(struct bio *bio, struct page *page,
>>+static int bio_iov_add_folio(struct bio *bio, struct folio *folio,
>>                 unsigned int len, unsigned int offset)
>>  {
>>         bool same_page = false;
>>@@ -1234,30 +1234,60 @@ static int bio_iov_add_page(struct bio *bio, struct page *page,
>>
>>         if (bio->bi_vcnt > 0 &&
>>             bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
>>-                               page, len, offset, &same_page)) {
>>+                               folio_page(folio, 0), len, offset,
>>+                               &same_page)) {
>>                 bio->bi_iter.bi_size += len;
>>                 if (same_page)
>>-                       bio_release_page(bio, page);
>>+                       bio_release_page(bio, folio_page(folio, 0));
>>                 return 0;
>>         }
>>-       __bio_add_page(bio, page, len, offset);
>>+       bio_add_folio_nofail(bio, folio, len, offset);
>>         return 0;
>>  }
>>
>>-static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page,
>>+static int bio_iov_add_zone_append_folio(struct bio *bio, struct folio *folio,
>>                 unsigned int len, unsigned int offset)
>>  {
>>         struct request_queue *q = bdev_get_queue(bio->bi_bdev);
>>         bool same_page = false;
>>
>>-       if (bio_add_hw_page(q, bio, page, len, offset,
>>+       if (bio_add_hw_folio(q, bio, folio, len, offset,
>>                         queue_max_zone_append_sectors(q), &same_page) != len)
>>                 return -EINVAL;
>>         if (same_page)
>>-               bio_release_page(bio, page);
>>+               bio_release_page(bio, folio_page(folio, 0));
>>         return 0;
>>  }
>>
>>+static unsigned int get_contig_folio_len(int *num_pages, struct page **pages,
>>+                                        int i, struct folio *folio,
>>+                                        ssize_t left, size_t offset)
>>+{
>>+       ssize_t bytes = left;
>>+       size_t contig_sz = min_t(size_t,  PAGE_SIZE - offset, bytes);
>>+       unsigned int j;
>>+
>>+       /*
>>+        * We might COW a single page in the middle of
>>+        * a large folio, so we have to check that all
>>+        * pages belong to the same folio.
>>+        */
>>+       bytes -= contig_sz;
>>+       for (j = i + 1; j < i + *num_pages; j++) {
>>+               size_t next = min_t(size_t, PAGE_SIZE, bytes);
>>+
>>+               if (page_folio(pages[j]) != folio ||
>>+                   pages[j] != pages[j - 1] + 1) {
>>+                       break;
>>+               }
>>+               contig_sz += next;
>>+               bytes -= next;
>>+       }
>>+       *num_pages = j - i;
>>+
>>+       return contig_sz;
>>+}
>>+
>>  #define PAGE_PTRS_PER_BVEC     (sizeof(struct bio_vec) / sizeof(struct page *))
>>
>>  /**
>>@@ -1277,9 +1307,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>>         unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
>>         struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
>>         struct page **pages = (struct page **)bv;
>>-       ssize_t size, left;
>>-       unsigned len, i = 0;
>>-       size_t offset;
>>+       ssize_t size, left, len;
>>+       unsigned int i = 0, num_pages;
>>+       size_t offset, folio_offset;
>>         int ret = 0;
>>
>>         /*
>>@@ -1321,15 +1351,29 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>>
>>         for (left = size, i = 0; left > 0; left -= len, i++) {
>>                 struct page *page = pages[i];
>>+               struct folio *folio = page_folio(page);
>>+
>>+               folio_offset = (folio_page_idx(folio, page) << PAGE_SHIFT) +
>>+                               offset;
>>+
>>+               len = min_t(size_t, (folio_size(folio) - folio_offset), left);
>>+
>>+               num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
>>+
>>+               if (num_pages > 1)
>>+                       len = get_contig_folio_len(&num_pages, pages, i,
>>+                                                  folio, left, offset);
>>
>>-               len = min_t(size_t, PAGE_SIZE - offset, left);
>>                 if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
>>-                       ret = bio_iov_add_zone_append_page(bio, page, len,
>>-                                       offset);
>>+                       ret = bio_iov_add_zone_append_folio(bio, folio, len,
>>+                                       folio_offset);
>>                         if (ret)
>>                                 break;
>>                 } else
>>-                       bio_iov_add_page(bio, page, len, offset);
>>+                       bio_iov_add_folio(bio, folio, len, folio_offset);
>>+
>>+               /* Skip the pages which got added */
>>+               i = i + (num_pages - 1);
>>
>>                 offset = 0;
>>         }
>>--
>>2.25.1
>>
>>
>
>Well. The issue here is that bvecs really only use the 'struct page' 
>entry as an address to the data; the page size itself is completely
>immaterial. So from that perspective it doesn't really matter whether
>we use 'struct page' or 'struct folio' to get to that address.
>However, what matters is whether we _iterate_ over pages or folios.
>The current workflow is to first allocate an array of pages,
>call one of the _get_pages() variants, and the iterate over all
>pages.
>What we should be doing is to add _get_folios() variants, working
>on folio batches and not pre-allocated arrays.

The XXX_get_pages() functions do page table walk and fill the pages
corresponding to a user space addr in pages array. The _get_folios()
variants shall return a folio_vec, rather than folio array, as every folio
entry will need a folio_offset and len per folio. If we convert to
_get_folios() variants, number of folios may be lesser than number of
pages. But we will need allocation of folio_vec array as a replacement
of pages array.

Am I missing something?

Down in the page table walk (gup_fast_pte_range), we fill the pages array
addr -> pte -> page. This shall be modified to fill a folio_vec array. The
page table walk also deals with huge pages, and looks like huge page
functions shall also be modified to fill folio_vec array. Also handling
the gup slow path will need a modification to fill the folio_vec array.
--
Kundan

>Then we could iterate over all folios in the batch, and can modify
>the 'XXX_get_pages()' variants to extract pages from the folio batch.
>And then gradually move over all callers to work on folio batches.
>
>Tall order, but I fear this is the best way going forward.
>Matthew? Christoph? Is that what you had in mind?
>
>Cheers,
>
>Hannes
>-- 
>Dr. Hannes Reinecke                  Kernel Storage Architect
>hare@suse.de                                +49 911 74053 688
>SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
>HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
>

[-- Attachment #2: Type: text/plain, Size: 0 bytes --]

next prev parent reply	other threads:[~2024-06-20  5:17 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <CGME20240619024142epcas5p22b2e0f83e526fc74fda62a4837bed544@epcas5p2.samsung.com>
2024-06-19  2:34 ` [PATCH v5 0/3] block: add larger order folio instead of pages Kundan Kumar
2024-06-19  2:34   ` [PATCH v5 1/3] block: Added folio-lized version of bio_add_hw_page() Kundan Kumar
2024-06-20  3:47     ` Matthew Wilcox
2024-06-19  2:34   ` [PATCH v5 2/3] block: add folio awareness instead of looping through pages Kundan Kumar
2024-06-19  7:47     ` Hannes Reinecke
2024-06-20  4:48       ` Kundan Kumar [this message]
2024-06-20  7:21         ` Hannes Reinecke
2024-06-19  2:34   ` [PATCH v5 3/3] block: unpin user pages belonging to a folio Kundan Kumar
2024-06-20  3:51     ` Matthew Wilcox

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240620044842.oxbryv2i7q4muiwf@green245 \
    --to=kundan.kumar@samsung.com \
    --cc=anuj20.g@samsung.com \
    --cc=axboe@kernel.dk \
    --cc=c.gameti@samsung.com \
    --cc=gost.dev@samsung.com \
    --cc=hare@suse.de \
    --cc=hch@lst.de \
    --cc=joshi.k@samsung.com \
    --cc=kbusch@kernel.org \
    --cc=linux-block@vger.kernel.org \
    --cc=mcgrof@kernel.org \
    --cc=nj.shetty@samsung.com \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox