* [PATCH] btrfs: raid56: use async_tx API for parity operations
@ 2026-06-29 2:39 Rosen Penev
2026-06-29 5:11 ` Qu Wenruo
0 siblings, 1 reply; 4+ messages in thread
From: Rosen Penev @ 2026-06-29 2:39 UTC (permalink / raw)
To: linux-btrfs; +Cc: Chris Mason, David Sterba, open list
Replace the kmap-local + synchronous library call + kunmap-local
pattern in raid56 compute and recovery paths with the async_tx API
(async_xor_offs, async_gen_syndrome, async_raid6_datap_recov,
async_raid6_2data_recov, async_memcpy). The async_tx API provides
DMA offloading when a suitable engine is available and falls back to
the same software implementations (xor_gen, raid6_gen_syndrome, etc.)
otherwise.
Step-loop operations inside each vertical stripe are chained as DMA
dependency chains with ASYNC_TX_FENCE. The chains are then extended
across all sectors in a stripe so that the DMA engine can pipeline
the entire stripe's parity computation before the CPU waits for
completion. For RMW writes, all sectors' parity generation is
issued as a single chain; for recovery reads, the recovery chains
for every sector are submitted together, then the CPU verifies
checksums and marks uptodate bits after a single wait.
Each DMA chain is fully waited on before the caller touches any
buffer pages, so there is never an in-flight DMA operation during
cleanup. For the typical stripe size of 4 KiB--64 KiB, per-operation
overhead from DMA submission may outweigh the offload benefit, making
the software fallback (which runs synchronously, returning NULL) the
common fast path. This trade-off is inherent in the async_tx API's
unified interface, but on platforms with a DMA engine (e.g. Marvell
mv_xor) and larger sectors the batching provides measurable offload.
Also select ASYNC_RAID6_RECOV and ASYNC_MEMCPY in Kconfig so that
the async_tx function implementations are linked; previously the
code used the synchronous RAID6_PQ / XOR_BLOCKS libraries directly,
but async_tx functions require their own Kconfig symbols.
Converted functions:
- generate_pq_vertical_step() / generate_pq_vertical()
- recover_vertical_step() / recover_vertical() / recover_sectors()
- verify_one_parity_step() / verify_one_parity_sector()
- finish_parity_scrub()
- recover_scrub_rbio()
- memcpy_from_bio_to_stripe()
- raid56_parity_cache_data_folios()
Removed the now-unused btrfs_raid_bio::finish_pointers field and
the no-longer-needed #include <linux/raid/pq.h> and
#include <linux/raid/xor.h>.
Assisted-by: opencode:big-pickle
Signed-off-by: Rosen Penev <rosenp@gmail.com>
---
fs/btrfs/Kconfig | 4 +-
fs/btrfs/raid56.c | 654 ++++++++++++++++++++++++++++++++--------------
fs/btrfs/raid56.h | 3 -
3 files changed, 454 insertions(+), 207 deletions(-)
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 43dd8ef763b8..8223b40d8ecb 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -15,7 +15,9 @@ config BTRFS_FS
select ZSTD_DECOMPRESS
select FS_IOMAP
select RAID6_PQ
- select XOR_BLOCKS
+ select ASYNC_RAID6_RECOV
+ select ASYNC_MEMCPY
+ select ASYNC_XOR
select XXHASH
depends on PAGE_SIZE_LESS_THAN_256KB
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 00a01b97cc0c..6e98e46df752 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -8,11 +8,10 @@
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
-#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
-#include <linux/raid/xor.h>
#include <linux/mm.h>
+#include <linux/async_tx.h>
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
@@ -154,7 +153,6 @@ static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
kfree(rbio->stripe_pages);
kfree(rbio->bio_paddrs);
kfree(rbio->stripe_paddrs);
- kfree(rbio->finish_pointers);
}
static void free_raid_bio(struct btrfs_raid_bio *rbio)
@@ -231,18 +229,33 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
static void memcpy_from_bio_to_stripe(struct btrfs_raid_bio *rbio, unsigned int sector_nr)
{
const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
+ struct dma_async_tx_descriptor *tx = NULL;
ASSERT(sector_nr < rbio->nr_sectors);
for (int i = 0; i < rbio->sector_nsteps; i++) {
unsigned int index = sector_nr * rbio->sector_nsteps + i;
phys_addr_t dst = rbio->stripe_paddrs[index];
phys_addr_t src = rbio->bio_paddrs[index];
+ struct async_submit_ctl submit;
ASSERT(dst != INVALID_PADDR);
ASSERT(src != INVALID_PADDR);
- memcpy_page(phys_to_page(dst), offset_in_page(dst),
- phys_to_page(src), offset_in_page(src), step);
+ init_async_submit(&submit, ASYNC_TX_FENCE, tx, NULL, NULL, NULL);
+ tx = async_memcpy(phys_to_page(dst), phys_to_page(src),
+ offset_in_page(dst), offset_in_page(src),
+ step, &submit);
+ }
+ /*
+ * All steps are chained via ASYNC_TX_FENCE. issue_pending and
+ * dma_wait_for_async_tx on the last descriptor walk the entire
+ * chain, so earlier descriptors are submitted and waited on too.
+ * If every step completed synchronously, tx is NULL and no wait
+ * is needed.
+ */
+ if (tx) {
+ async_tx_issue_pending(tx);
+ dma_wait_for_async_tx(tx);
}
}
@@ -1079,12 +1092,11 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
rbio->stripe_paddrs = kzalloc_objs(phys_addr_t,
num_sectors * sector_nsteps,
GFP_NOFS);
- rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
rbio->stripe_uptodate_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
if (!rbio->stripe_pages || !rbio->bio_paddrs || !rbio->stripe_paddrs ||
- !rbio->finish_pointers || !rbio->error_bitmap || !rbio->stripe_uptodate_bitmap) {
+ !rbio->error_bitmap || !rbio->stripe_uptodate_bitmap) {
free_raid_bio_pointers(rbio);
kfree(rbio);
return ERR_PTR(-ENOMEM);
@@ -1385,49 +1397,98 @@ static inline void *kmap_local_paddr(phys_addr_t paddr)
return kmap_local_page(phys_to_page(paddr)) + offset_in_page(paddr);
}
-static void generate_pq_vertical_step(struct btrfs_raid_bio *rbio, unsigned int sector_nr,
- unsigned int step_nr)
+/*
+ * The page/offset array model used by the async_tx helpers below replaces the
+ * old kmap-local pointer array. Each entry in pages[] + offsets[] represents
+ * what was previously a single kmap_local_paddr() return value.
+ *
+ * For example, the old pattern:
+ * pointers[i] = kmap_local_paddr(paddr);
+ * raid6_gen_syndrome(..., pointers);
+ * kunmap_local(pointers[i]);
+ *
+ * becomes:
+ * pages[i] = phys_to_page(paddr);
+ * offsets[i] = offset_in_page(paddr);
+ * async_gen_syndrome(pages, offsets, ...);
+ *
+ * The async_tx API handles any necessary kmap internally.
+ */
+
+/*
+ * Maximum number of sectors per stripe.
+ * BTRFS_STRIPE_LEN (64 KiB) divided by the minimum sector size (PAGE_SIZE,
+ * typically 4 KiB) gives at most 16 entries. If stripe length ever changes,
+ * update this to BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits.
+ */
+#define RECOVER_MAX_STRIPE_SECTORS 16
+static struct dma_async_tx_descriptor *
+generate_pq_vertical_step(struct btrfs_raid_bio *rbio, unsigned int sector_nr,
+ unsigned int step_nr,
+ struct page **pages, unsigned int *offsets,
+ struct dma_async_tx_descriptor *depend_tx)
{
- void **pointers = rbio->finish_pointers;
const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
- int stripe;
const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;
+ struct async_submit_ctl submit;
+ struct dma_async_tx_descriptor *tx;
+ int stripe;
- /* First collect one sector from each data stripe */
- for (stripe = 0; stripe < rbio->nr_data; stripe++)
- pointers[stripe] = kmap_local_paddr(
- sector_paddr_in_rbio(rbio, stripe, sector_nr, step_nr, 0));
+ for (stripe = 0; stripe < rbio->nr_data; stripe++) {
+ phys_addr_t paddr = sector_paddr_in_rbio(rbio, stripe, sector_nr, step_nr, 0);
- /* Then add the parity stripe */
- pointers[stripe++] = kmap_local_paddr(rbio_pstripe_paddr(rbio, sector_nr, step_nr));
+ pages[stripe] = phys_to_page(paddr);
+ offsets[stripe] = offset_in_page(paddr);
+ }
+
+ {
+ phys_addr_t paddr = rbio_pstripe_paddr(rbio, sector_nr, step_nr);
+
+ pages[stripe] = phys_to_page(paddr);
+ offsets[stripe] = offset_in_page(paddr);
+ }
+ stripe++;
if (has_qstripe) {
- /*
- * RAID6, add the qstripe and call the library function
- * to fill in our p/q
- */
- pointers[stripe++] = kmap_local_paddr(
- rbio_qstripe_paddr(rbio, sector_nr, step_nr));
+ phys_addr_t paddr = rbio_qstripe_paddr(rbio, sector_nr, step_nr);
+
+ pages[stripe] = phys_to_page(paddr);
+ offsets[stripe] = offset_in_page(paddr);
assert_rbio(rbio);
- raid6_gen_syndrome(rbio->real_stripes, step, pointers);
+ init_async_submit(&submit, ASYNC_TX_FENCE, depend_tx, NULL, NULL, NULL);
+ tx = async_gen_syndrome(pages, offsets, rbio->real_stripes, step, &submit);
} else {
- /* raid5 */
- memcpy(pointers[rbio->nr_data], pointers[0], step);
- xor_gen(pointers[rbio->nr_data], pointers + 1, rbio->nr_data - 1,
- step);
+ init_async_submit(&submit, ASYNC_TX_FENCE | ASYNC_TX_XOR_ZERO_DST,
+ depend_tx, NULL, NULL, NULL);
+ tx = async_xor_offs(pages[rbio->nr_data], offsets[rbio->nr_data],
+ pages, offsets, rbio->nr_data, step, &submit);
}
- for (stripe = stripe - 1; stripe >= 0; stripe--)
- kunmap_local(pointers[stripe]);
+ return tx;
}
-/* Generate PQ for one vertical stripe. */
-static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
+/*
+ * Generate PQ for one vertical stripe, chaining into @depend_tx.
+ * The caller must issue_pending + wait for the returned descriptor
+ * (which may be NULL if the operation completed synchronously),
+ * then call generate_pq_vertical_finish() to mark parity uptodate.
+ */
+static struct dma_async_tx_descriptor *
+generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr,
+ struct page **pages, unsigned int *offsets,
+ struct dma_async_tx_descriptor *depend_tx)
{
- const bool has_qstripe = (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6);
+ struct dma_async_tx_descriptor *tx = depend_tx;
for (int i = 0; i < rbio->sector_nsteps; i++)
- generate_pq_vertical_step(rbio, sectornr, i);
+ tx = generate_pq_vertical_step(rbio, sectornr, i, pages, offsets, tx);
+
+ return tx;
+}
+
+static void generate_pq_vertical_finish(struct btrfs_raid_bio *rbio, int sectornr)
+{
+ const bool has_qstripe = (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6);
set_bit(rbio_sector_index(rbio, rbio->nr_data, sectornr),
rbio->stripe_uptodate_bitmap);
@@ -1910,39 +1971,33 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio,
return 0;
}
-static void recover_vertical_step(struct btrfs_raid_bio *rbio,
- unsigned int sector_nr,
- unsigned int step_nr,
- int faila, int failb,
- void **pointers, void **unmap_array)
+static struct dma_async_tx_descriptor *
+recover_vertical_step(struct btrfs_raid_bio *rbio,
+ unsigned int sector_nr,
+ unsigned int step_nr,
+ int faila, int failb,
+ struct page **pages, unsigned int *offsets,
+ struct page **src_pages, unsigned int *src_offsets,
+ struct dma_async_tx_descriptor *depend_tx)
{
struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
- int stripe_nr;
+ struct async_submit_ctl submit;
+ struct dma_async_tx_descriptor *tx;
ASSERT(step_nr < rbio->sector_nsteps);
ASSERT(sector_nr < rbio->stripe_nsectors);
- /*
- * Setup our array of pointers with sectors from each stripe
- *
- * NOTE: store a duplicate array of pointers to preserve the
- * pointer order.
- */
- for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
+ for (int i = 0; i < rbio->real_stripes; i++) {
phys_addr_t paddr;
- /*
- * If we're rebuilding a read, we have to use pages from the
- * bio list if possible.
- */
- if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
- paddr = sector_paddr_in_rbio(rbio, stripe_nr, sector_nr, step_nr, 0);
- } else {
- paddr = rbio_stripe_paddr(rbio, stripe_nr, sector_nr, step_nr);
- }
- pointers[stripe_nr] = kmap_local_paddr(paddr);
- unmap_array[stripe_nr] = pointers[stripe_nr];
+ if (rbio->operation == BTRFS_RBIO_READ_REBUILD)
+ paddr = sector_paddr_in_rbio(rbio, i, sector_nr, step_nr, 0);
+ else
+ paddr = rbio_stripe_paddr(rbio, i, sector_nr, step_nr);
+
+ pages[i] = phys_to_page(paddr);
+ offsets[i] = offset_in_page(paddr);
}
/* All raid6 handling here */
@@ -1951,85 +2006,100 @@ static void recover_vertical_step(struct btrfs_raid_bio *rbio,
if (failb < 0) {
if (faila == rbio->nr_data)
/*
- * Just the P stripe has failed, without
- * a bad data or Q stripe.
- * We have nothing to do, just skip the
- * recovery for this stripe.
+ * Only P stripe has failed without a bad data or
+ * Q stripe. No recovery needed for this step.
+ * Returning NULL breaks the async_tx dependency
+ * chain, but this path issues no DMA writes
+ * (it is a pure no-op), so no ordering between
+ * steps is required.
*/
- goto cleanup;
- /*
- * a single failure in raid6 is rebuilt
- * in the pstripe code below
- */
+ return NULL;
goto pstripe;
}
- /*
- * If the q stripe is failed, do a pstripe reconstruction from
- * the xors.
- * If both the q stripe and the P stripe are failed, we're
- * here due to a crc mismatch and we can't give them the
- * data they want.
- */
if (failb == rbio->real_stripes - 1) {
if (faila == rbio->real_stripes - 2)
/*
- * Only P and Q are corrupted.
- * We only care about data stripes recovery,
- * can skip this vertical stripe.
+ * Only P and Q are corrupted; data stripes are
+ * intact. No recovery needed for this step.
+ * Safe to break the chain because this path
+ * issues no DMA writes (pure no-op).
*/
- goto cleanup;
- /*
- * Otherwise we have one bad data stripe and
- * a good P stripe. raid5!
- */
+ return NULL;
goto pstripe;
}
if (failb == rbio->real_stripes - 2) {
- raid6_recov_datap(rbio->real_stripes, step,
- faila, pointers);
+ init_async_submit(&submit, ASYNC_TX_FENCE,
+ depend_tx, NULL, NULL, NULL);
+ tx = async_raid6_datap_recov(rbio->real_stripes, step,
+ faila, pages, offsets, &submit);
} else {
- raid6_recov_2data(rbio->real_stripes, step,
- faila, failb, pointers);
+ init_async_submit(&submit, ASYNC_TX_FENCE,
+ depend_tx, NULL, NULL, NULL);
+ tx = async_raid6_2data_recov(rbio->real_stripes, step,
+ faila, failb, pages, offsets,
+ &submit);
}
- } else {
- void *p;
+ return tx;
+ }
- /* Rebuild from P stripe here (raid5 or raid6). */
- ASSERT(failb == -1);
+ /* Rebuild from P stripe here (raid5 or raid6 single failure). */
+ ASSERT(failb == -1);
pstripe:
- /* Copy parity block into failed block to start with */
- memcpy(pointers[faila], pointers[rbio->nr_data], step);
+ {
+ const unsigned int nr_data = rbio->nr_data;
+ int src_cnt = 0;
- /* Rearrange the pointer array */
- p = pointers[faila];
- for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1;
- stripe_nr++)
- pointers[stripe_nr] = pointers[stripe_nr + 1];
- pointers[rbio->nr_data - 1] = p;
+ /*
+ * Build source list: P stripe + all non-failed data stripes.
+ * The result is written into pages[faila].
+ * The caller provides src_pages/src_offsets (sized for nr_data).
+ */
+ src_pages[src_cnt] = pages[nr_data];
+ src_offsets[src_cnt] = offsets[nr_data];
+ src_cnt++;
+ for (int i = 0; i < nr_data; i++) {
+ if (i != faila) {
+ src_pages[src_cnt] = pages[i];
+ src_offsets[src_cnt] = offsets[i];
+ src_cnt++;
+ }
+ }
- /* Xor in the rest */
- xor_gen(p, pointers, rbio->nr_data - 1, step);
+ init_async_submit(&submit, ASYNC_TX_FENCE | ASYNC_TX_XOR_ZERO_DST,
+ depend_tx, NULL, NULL, NULL);
+ tx = async_xor_offs(pages[faila], offsets[faila],
+ src_pages, src_offsets, src_cnt,
+ step, &submit);
+ return tx;
}
-
-cleanup:
- for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
- kunmap_local(unmap_array[stripe_nr]);
}
/*
- * Recover a vertical stripe specified by @sector_nr.
- * @*pointers are the pre-allocated pointers by the caller, so we don't
- * need to allocate/free the pointers again and again.
+ * Per-sector issue step for vertical stripe recovery.
+ *
+ * Determines which stripes need recovery, then submits the async_tx chain
+ * (chained to @depend_tx for cross-sector batching). Sets @faila and @failb
+ * for the subsequent finish step.
+ *
+ * Return value:
+ * 0 : success, @tx_ret describes pending work (or NULL for none)
+ * -EIO: too many errors, no work submitted
*/
-static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
- void **pointers, void **unmap_array)
+static int recover_vertical_submit(struct btrfs_raid_bio *rbio, int sector_nr,
+ struct page **pages, unsigned int *offsets,
+ struct page **src_pages, unsigned int *src_offsets,
+ int *faila, int *failb,
+ struct dma_async_tx_descriptor **tx_ret,
+ struct dma_async_tx_descriptor *depend_tx)
{
int found_errors;
- int faila;
- int failb;
- int ret = 0;
+ struct dma_async_tx_descriptor *tx = depend_tx;
+
+ *tx_ret = NULL;
+ *faila = -1;
+ *failb = -1;
/*
* Now we just use bitmap to mark the horizontal stripes in
@@ -2039,8 +2109,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
!test_bit(sector_nr, &rbio->dbitmap))
return 0;
- found_errors = get_rbio_vertical_errors(rbio, sector_nr, &faila,
- &failb);
+ found_errors = get_rbio_vertical_errors(rbio, sector_nr, faila, failb);
/*
* No errors in the vertical stripe, skip it. Can happen for recovery
* which only part of a stripe failed csum check.
@@ -2051,9 +2120,27 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
if (unlikely(found_errors > rbio->bioc->max_errors))
return -EIO;
- for (int i = 0; i < rbio->sector_nsteps; i++)
- recover_vertical_step(rbio, sector_nr, i, faila, failb,
- pointers, unmap_array);
+ for (int i = 0; i < rbio->sector_nsteps; i++) {
+ tx = recover_vertical_step(rbio, sector_nr, i,
+ *faila, *failb,
+ pages, offsets,
+ src_pages, src_offsets,
+ tx);
+ }
+ *tx_ret = tx;
+ return 0;
+}
+
+/*
+ * Finish step for recover_vertical_submit().
+ * Must be called after the DMA chain (if any) has completed.
+ * Verifies csums for the recovered stripes and sets the uptodate bits.
+ */
+static int recover_vertical_finish(struct btrfs_raid_bio *rbio, int sector_nr,
+ int faila, int failb)
+{
+ int ret = 0;
+
if (faila >= 0) {
ret = verify_one_sector(rbio, faila, sector_nr);
if (ret < 0)
@@ -2073,22 +2160,61 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
return ret;
}
+/*
+ * Sequential wrapper: submit, wait, finish for one sector.
+ * Used by callers that do their own complex error filtering
+ * (e.g. scrub recovery) and cannot batch across sectors.
+ */
+static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
+ struct page **pages, unsigned int *offsets,
+ struct page **src_pages, unsigned int *src_offsets)
+{
+ struct dma_async_tx_descriptor *tx = NULL;
+ int faila, failb;
+ int ret;
+
+ ret = recover_vertical_submit(rbio, sector_nr, pages, offsets,
+ src_pages, src_offsets,
+ &faila, &failb, &tx, NULL);
+ if (ret < 0)
+ return ret;
+
+ if (tx) {
+ async_tx_issue_pending(tx);
+ dma_wait_for_async_tx(tx);
+ }
+
+ return recover_vertical_finish(rbio, sector_nr, faila, failb);
+}
+
static int recover_sectors(struct btrfs_raid_bio *rbio)
{
- void **pointers = NULL;
- void **unmap_array = NULL;
+ struct page **pages = NULL;
+ unsigned int *offsets = NULL;
+ struct page **src_pages = NULL;
+ unsigned int *src_offsets = NULL;
+ int faila_arr[RECOVER_MAX_STRIPE_SECTORS];
+ int failb_arr[RECOVER_MAX_STRIPE_SECTORS];
int sectornr;
int ret = 0;
+ ASSERT(rbio->stripe_nsectors <= RECOVER_MAX_STRIPE_SECTORS);
+
+ pages = kzalloc_objs(struct page *, rbio->real_stripes, GFP_NOFS);
+ offsets = kzalloc_objs(unsigned int, rbio->real_stripes, GFP_NOFS);
+ if (!pages || !offsets) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
/*
- * @pointers array stores the pointer for each sector.
- *
- * @unmap_array stores copy of pointers that does not get reordered
- * during reconstruction so that kunmap_local works.
+ * src_pages/src_offsets hold the pstripe source list:
+ * P stripe (1) + non-failed data stripes (at most nr_data - 1)
+ * = at most nr_data entries.
*/
- pointers = kzalloc_objs(void *, rbio->real_stripes, GFP_NOFS);
- unmap_array = kzalloc_objs(void *, rbio->real_stripes, GFP_NOFS);
- if (!pointers || !unmap_array) {
+ src_pages = kzalloc_objs(struct page *, rbio->nr_data, GFP_NOFS);
+ src_offsets = kzalloc_objs(unsigned int, rbio->nr_data, GFP_NOFS);
+ if (!src_pages || !src_offsets) {
ret = -ENOMEM;
goto out;
}
@@ -2101,15 +2227,58 @@ static int recover_sectors(struct btrfs_raid_bio *rbio)
index_rbio_pages(rbio);
- for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
- ret = recover_vertical(rbio, sectornr, pointers, unmap_array);
- if (ret < 0)
- break;
+ {
+ struct dma_async_tx_descriptor *tx = NULL;
+
+ /*
+ * Phase 1: submit all sectors' recovery chains.
+ *
+ * pages[] / offsets[] are overwritten by each sector's submit
+ * call. This is safe because the async_tx API consumes the
+ * arrays synchronously during each call: either the DMA prep
+ * function converts page+offset to DMA addresses and stores
+ * them in the hardware descriptor, or the synchronous fallback
+ * reads the data immediately. No async_tx function retains a
+ * reference to the arrays after it returns.
+ */
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ int f_a = -1, f_b = -1;
+
+ ret = recover_vertical_submit(rbio, sectornr, pages, offsets,
+ src_pages, src_offsets,
+ &f_a, &f_b, &tx, tx);
+ if (ret < 0) {
+ if (tx) {
+ async_tx_issue_pending(tx);
+ dma_wait_for_async_tx(tx);
+ }
+ goto out;
+ }
+ faila_arr[sectornr] = f_a;
+ failb_arr[sectornr] = f_b;
+ }
+
+ /* Phase 2: wait for all DMAs to complete. */
+ if (tx) {
+ async_tx_issue_pending(tx);
+ dma_wait_for_async_tx(tx);
+ }
+
+ /* Phase 3: verify recovered stripes and set uptodate bits. */
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ ret = recover_vertical_finish(rbio, sectornr,
+ faila_arr[sectornr],
+ failb_arr[sectornr]);
+ if (ret < 0)
+ goto out;
+ }
}
out:
- kfree(pointers);
- kfree(unmap_array);
+ kfree(src_pages);
+ kfree(src_offsets);
+ kfree(pages);
+ kfree(offsets);
return ret;
}
@@ -2436,6 +2605,8 @@ static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
static void rmw_rbio(struct btrfs_raid_bio *rbio)
{
struct bio_list bio_list;
+ struct page **pages = NULL;
+ unsigned int *offsets = NULL;
int sectornr;
int ret = 0;
@@ -2491,8 +2662,28 @@ static void rmw_rbio(struct btrfs_raid_bio *rbio)
else
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
- for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
- generate_pq_vertical(rbio, sectornr);
+ pages = kzalloc_objs(struct page *, rbio->real_stripes, GFP_NOFS);
+ offsets = kzalloc_objs(unsigned int, rbio->real_stripes, GFP_NOFS);
+ if (!pages || !offsets) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ {
+ struct dma_async_tx_descriptor *tx = NULL;
+
+ /* Chain all sectors so a DMA engine can pipeline across them. */
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
+ tx = generate_pq_vertical(rbio, sectornr, pages, offsets, tx);
+
+ if (tx) {
+ async_tx_issue_pending(tx);
+ dma_wait_for_async_tx(tx);
+ }
+
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
+ generate_pq_vertical_finish(rbio, sectornr);
+ }
bio_list_init(&bio_list);
ret = rmw_assemble_write_bios(rbio, &bio_list);
@@ -2515,6 +2706,8 @@ static void rmw_rbio(struct btrfs_raid_bio *rbio)
}
}
out:
+ kfree(pages);
+ kfree(offsets);
rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}
@@ -2622,60 +2815,91 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
return 0;
}
-/* Return true if the content of the step matches the caclulated one. */
+/* Return true if the content of the step matches the calculated one. */
static bool verify_one_parity_step(struct btrfs_raid_bio *rbio,
- void *pointers[], unsigned int sector_nr,
+ struct page **pages, unsigned int *offsets,
+ unsigned int sector_nr,
unsigned int step_nr)
{
const unsigned int nr_data = rbio->nr_data;
const bool has_qstripe = (rbio->real_stripes - rbio->nr_data == 2);
const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
- void *parity;
+ struct async_submit_ctl submit;
+ struct dma_async_tx_descriptor *tx;
+ void *scrub_kaddr, *calc_kaddr;
bool ret = false;
ASSERT(step_nr < rbio->sector_nsteps);
- /* First collect one page from each data stripe. */
- for (int stripe = 0; stripe < nr_data; stripe++)
- pointers[stripe] = kmap_local_paddr(
- sector_paddr_in_rbio(rbio, stripe, sector_nr,
- step_nr, 0));
+ /* Fill page/offset arrays for data stripes. */
+ for (int stripe = 0; stripe < nr_data; stripe++) {
+ phys_addr_t paddr = sector_paddr_in_rbio(rbio, stripe, sector_nr,
+ step_nr, 0);
+ pages[stripe] = phys_to_page(paddr);
+ offsets[stripe] = offset_in_page(paddr);
+ }
+
+ ASSERT(rbio->scrubp >= nr_data);
+ ASSERT(rbio->scrubp < rbio->real_stripes);
+
+ /* The parity destination pages must be set by the caller. */
+ ASSERT(pages[nr_data]);
+ ASSERT(offsets[nr_data] == 0);
+ if (has_qstripe) {
+ ASSERT(pages[rbio->real_stripes - 1]);
+ ASSERT(offsets[rbio->real_stripes - 1] == 0);
+ }
if (has_qstripe) {
assert_rbio(rbio);
- /* RAID6, call the library function to fill in our P/Q. */
- raid6_gen_syndrome(rbio->real_stripes, step, pointers);
+ init_async_submit(&submit, 0, NULL, NULL, NULL, NULL);
+ tx = async_gen_syndrome(pages, offsets, rbio->real_stripes,
+ step, &submit);
} else {
- /* RAID5. */
- memcpy(pointers[nr_data], pointers[0], step);
- xor_gen(pointers[nr_data], pointers + 1, nr_data - 1, step);
+ init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL, NULL, NULL, NULL);
+ /*
+ * async_xor_offs takes src_cnt = nr_data. pages[0..nr_data-1]
+ * are data sources; pages[nr_data] (the destination) is
+ * excluded from the source list by the count.
+ * ASYNC_TX_XOR_ZERO_DST zeroes the destination first, then
+ * XORs all sources: dest = 0 ^ D0 ^ ... ^ D{n-1} = parity.
+ */
+ tx = async_xor_offs(pages[rbio->nr_data], offsets[rbio->nr_data],
+ pages, offsets, rbio->nr_data, step, &submit);
+ }
+ /* async_tx returns NULL when the operation completed synchronously. */
+ if (tx) {
+ async_tx_issue_pending(tx);
+ dma_wait_for_async_tx(tx);
}
/* Check scrubbing parity and repair it. */
- parity = kmap_local_paddr(rbio_stripe_paddr(rbio, rbio->scrubp, sector_nr, step_nr));
- if (memcmp(parity, pointers[rbio->scrubp], step) != 0)
- memcpy(parity, pointers[rbio->scrubp], step);
+ scrub_kaddr = kmap_local_paddr(rbio_stripe_paddr(rbio, rbio->scrubp,
+ sector_nr, step_nr));
+ calc_kaddr = kmap_local_page(pages[rbio->scrubp]);
+ if (memcmp(scrub_kaddr, (void *)((char *)calc_kaddr + offsets[rbio->scrubp]), step) != 0)
+ memcpy(scrub_kaddr, (void *)((char *)calc_kaddr + offsets[rbio->scrubp]), step);
else
ret = true;
- kunmap_local(parity);
+ kunmap_local(calc_kaddr);
+ kunmap_local(scrub_kaddr);
- for (int stripe = nr_data - 1; stripe >= 0; stripe--)
- kunmap_local(pointers[stripe]);
return ret;
}
/*
- * The @pointers array should have the P/Q parity already mapped.
+ * The @pages and @offsets arrays should have the P/Q parity pages already set.
*/
static void verify_one_parity_sector(struct btrfs_raid_bio *rbio,
- void *pointers[], unsigned int sector_nr)
+ struct page **pages, unsigned int *offsets,
+ unsigned int sector_nr)
{
bool found_error = false;
for (int step_nr = 0; step_nr < rbio->sector_nsteps; step_nr++) {
bool match;
- match = verify_one_parity_step(rbio, pointers, sector_nr, step_nr);
+ match = verify_one_parity_step(rbio, pages, offsets, sector_nr, step_nr);
if (!match)
found_error = true;
}
@@ -2686,17 +2910,17 @@ static void verify_one_parity_sector(struct btrfs_raid_bio *rbio,
static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
{
struct btrfs_io_context *bioc = rbio->bioc;
- void **pointers = rbio->finish_pointers;
unsigned long *pbitmap = &rbio->finish_pbitmap;
int nr_data = rbio->nr_data;
int sectornr;
bool has_qstripe;
- struct page *page;
- phys_addr_t p_paddr = INVALID_PADDR;
- phys_addr_t q_paddr = INVALID_PADDR;
+ struct page **pages;
+ unsigned int *offsets;
+ struct page *p_page = NULL;
+ struct page *q_page = NULL;
struct bio_list bio_list;
bool is_replace = false;
- int ret;
+ int ret = 0;
bio_list_init(&bio_list);
@@ -2723,40 +2947,46 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
*/
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
- page = alloc_page(GFP_NOFS);
- if (!page)
- return -ENOMEM;
- p_paddr = page_to_phys(page);
- page = NULL;
- pointers[nr_data] = kmap_local_paddr(p_paddr);
+ pages = kzalloc_objs(struct page *, rbio->real_stripes, GFP_NOFS);
+ offsets = kzalloc_objs(unsigned int, rbio->real_stripes, GFP_NOFS);
+ if (!pages || !offsets) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ p_page = alloc_page(GFP_NOFS);
+ if (!p_page) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ pages[nr_data] = p_page;
+ offsets[nr_data] = 0;
if (has_qstripe) {
- /* RAID6, allocate and map temp space for the Q stripe */
- page = alloc_page(GFP_NOFS);
- if (!page) {
- __free_page(phys_to_page(p_paddr));
- p_paddr = INVALID_PADDR;
- return -ENOMEM;
+ q_page = alloc_page(GFP_NOFS);
+ if (!q_page) {
+ ret = -ENOMEM;
+ goto out;
}
- q_paddr = page_to_phys(page);
- page = NULL;
- pointers[rbio->real_stripes - 1] = kmap_local_paddr(q_paddr);
+ pages[rbio->real_stripes - 1] = q_page;
+ offsets[rbio->real_stripes - 1] = 0;
}
bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
- /* Map the parity stripe just once */
-
+ /* Temporary parity destination pages are set once and reused for all sectors */
for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors)
- verify_one_parity_sector(rbio, pointers, sectornr);
+ verify_one_parity_sector(rbio, pages, offsets, sectornr);
- kunmap_local(pointers[nr_data]);
- __free_page(phys_to_page(p_paddr));
- p_paddr = INVALID_PADDR;
- if (q_paddr != INVALID_PADDR) {
- __free_page(phys_to_page(q_paddr));
- q_paddr = INVALID_PADDR;
- }
+out:
+ if (p_page)
+ __free_page(p_page);
+ if (q_page)
+ __free_page(q_page);
+ kfree(pages);
+ kfree(offsets);
+ if (ret)
+ return ret;
/*
* time to start writing. Make bios for everything from the
@@ -2809,20 +3039,28 @@ static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
{
- void **pointers = NULL;
- void **unmap_array = NULL;
+ struct page **pages = NULL;
+ unsigned int *offsets = NULL;
+ struct page **src_pages = NULL;
+ unsigned int *src_offsets = NULL;
int sector_nr;
int ret = 0;
+ pages = kzalloc_objs(struct page *, rbio->real_stripes, GFP_NOFS);
+ offsets = kzalloc_objs(unsigned int, rbio->real_stripes, GFP_NOFS);
+ if (!pages || !offsets) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
/*
- * @pointers array stores the pointer for each sector.
- *
- * @unmap_array stores copy of pointers that does not get reordered
- * during reconstruction so that kunmap_local works.
+ * src_pages/src_offsets hold the pstripe source list:
+ * P stripe (1) + non-failed data stripes (at most nr_data - 1)
+ * = at most nr_data entries.
*/
- pointers = kzalloc_objs(void *, rbio->real_stripes, GFP_NOFS);
- unmap_array = kzalloc_objs(void *, rbio->real_stripes, GFP_NOFS);
- if (!pointers || !unmap_array) {
+ src_pages = kzalloc_objs(struct page *, rbio->nr_data, GFP_NOFS);
+ src_offsets = kzalloc_objs(unsigned int, rbio->nr_data, GFP_NOFS);
+ if (!src_pages || !src_offsets) {
ret = -ENOMEM;
goto out;
}
@@ -2881,13 +3119,16 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
goto out;
}
- ret = recover_vertical(rbio, sector_nr, pointers, unmap_array);
+ ret = recover_vertical(rbio, sector_nr, pages, offsets,
+ src_pages, src_offsets);
if (ret < 0)
goto out;
}
out:
- kfree(pointers);
- kfree(unmap_array);
+ kfree(src_pages);
+ kfree(src_offsets);
+ kfree(pages);
+ kfree(offsets);
return ret;
}
@@ -3025,11 +3266,18 @@ void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio,
cur_off < offset_in_full_stripe + BTRFS_STRIPE_LEN;
cur_off += PAGE_SIZE) {
const unsigned int pindex = cur_off >> PAGE_SHIFT;
- void *kaddr;
-
- kaddr = kmap_local_page(rbio->stripe_pages[pindex]);
- memcpy_from_folio(kaddr, data_folios[findex], foffset, PAGE_SIZE);
- kunmap_local(kaddr);
+ struct dma_async_tx_descriptor *tx;
+ struct async_submit_ctl submit;
+
+ ASSERT(IS_ALIGNED(foffset, PAGE_SIZE));
+ init_async_submit(&submit, 0, NULL, NULL, NULL, NULL);
+ tx = async_memcpy(rbio->stripe_pages[pindex],
+ folio_page(data_folios[findex], foffset >> PAGE_SHIFT),
+ 0, 0, PAGE_SIZE, &submit);
+ if (tx) {
+ async_tx_issue_pending(tx);
+ dma_wait_for_async_tx(tx);
+ }
foffset += PAGE_SIZE;
ASSERT(foffset <= folio_size(data_folios[findex]));
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 1f463ecf7e41..d8a63edd38b7 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -209,9 +209,6 @@ struct btrfs_raid_bio {
/* Each set bit means the corresponding sector in stripe_sectors[] is uptodate. */
unsigned long *stripe_uptodate_bitmap;
- /* Allocated with real_stripes-many pointers for finish_*() calls */
- void **finish_pointers;
-
/*
* The bitmap recording where IO errors happened.
* Each bit is corresponding to one sector in either bio_sectors[] or
--
2.54.0
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH] btrfs: raid56: use async_tx API for parity operations
2026-06-29 2:39 [PATCH] btrfs: raid56: use async_tx API for parity operations Rosen Penev
@ 2026-06-29 5:11 ` Qu Wenruo
2026-06-29 12:40 ` Christoph Hellwig
2026-06-29 20:01 ` Rosen Penev
0 siblings, 2 replies; 4+ messages in thread
From: Qu Wenruo @ 2026-06-29 5:11 UTC (permalink / raw)
To: Rosen Penev, linux-btrfs; +Cc: Chris Mason, David Sterba, open list
在 2026/6/29 12:09, Rosen Penev 写道:
> Replace the kmap-local + synchronous library call + kunmap-local
> pattern in raid56 compute and recovery paths with the async_tx API
> (async_xor_offs, async_gen_syndrome, async_raid6_datap_recov,
> async_raid6_2data_recov, async_memcpy). The async_tx API provides
> DMA offloading when a suitable engine is available and falls back to
> the same software implementations (xor_gen, raid6_gen_syndrome, etc.)
> otherwise.
>
> Step-loop operations inside each vertical stripe are chained as DMA
> dependency chains with ASYNC_TX_FENCE. The chains are then extended
> across all sectors in a stripe so that the DMA engine can pipeline
> the entire stripe's parity computation before the CPU waits for
> completion. For RMW writes, all sectors' parity generation is
> issued as a single chain; for recovery reads, the recovery chains
> for every sector are submitted together, then the CPU verifies
> checksums and marks uptodate bits after a single wait.
>
> Each DMA chain is fully waited on before the caller touches any
> buffer pages, so there is never an in-flight DMA operation during
> cleanup. For the typical stripe size of 4 KiB--64 KiB, per-operation
> overhead from DMA submission may outweigh the offload benefit, making
> the software fallback (which runs synchronously, returning NULL) the
> common fast path. This trade-off is inherent in the async_tx API's
> unified interface, but on platforms with a DMA engine (e.g. Marvell
> mv_xor) and larger sectors the batching provides measurable offload.
Do you have any benchmark?
You do not need to do a full comprehensive workload, just some fixed
basic workloads, then measure the runtime of the converted functions,
and major entrances like rmw_rbio()/recover_rbio().
And do you have ever run the fstests on a system with async offload?
>
> Also select ASYNC_RAID6_RECOV and ASYNC_MEMCPY in Kconfig so that
> the async_tx function implementations are linked; previously the
> code used the synchronous RAID6_PQ / XOR_BLOCKS libraries directly,
> but async_tx functions require their own Kconfig symbols.
>
> Converted functions:
> - generate_pq_vertical_step() / generate_pq_vertical()
> - recover_vertical_step() / recover_vertical() / recover_sectors()
> - verify_one_parity_step() / verify_one_parity_sector()
> - finish_parity_scrub()
> - recover_scrub_rbio()
> - memcpy_from_bio_to_stripe()
> - raid56_parity_cache_data_folios()
>
> Removed the now-unused btrfs_raid_bio::finish_pointers field and
> the no-longer-needed #include <linux/raid/pq.h> and
> #include <linux/raid/xor.h>.
>
> Assisted-by: opencode:big-pickle
> Signed-off-by: Rosen Penev <rosenp@gmail.com>
> ---
> fs/btrfs/Kconfig | 4 +-
> fs/btrfs/raid56.c | 654 ++++++++++++++++++++++++++++++++--------------
> fs/btrfs/raid56.h | 3 -
> 3 files changed, 454 insertions(+), 207 deletions(-)
>
> diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
> index 43dd8ef763b8..8223b40d8ecb 100644
> --- a/fs/btrfs/Kconfig
> +++ b/fs/btrfs/Kconfig
> @@ -15,7 +15,9 @@ config BTRFS_FS
> select ZSTD_DECOMPRESS
> select FS_IOMAP
> select RAID6_PQ
> - select XOR_BLOCKS
> + select ASYNC_RAID6_RECOV
> + select ASYNC_MEMCPY
> + select ASYNC_XOR
> select XXHASH
> depends on PAGE_SIZE_LESS_THAN_256KB
>
> diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
> index 00a01b97cc0c..6e98e46df752 100644
> --- a/fs/btrfs/raid56.c
> +++ b/fs/btrfs/raid56.c
> @@ -8,11 +8,10 @@
> #include <linux/bio.h>
> #include <linux/slab.h>
> #include <linux/blkdev.h>
> -#include <linux/raid/pq.h>
> #include <linux/hash.h>
> #include <linux/list_sort.h>
> -#include <linux/raid/xor.h>
> #include <linux/mm.h>
> +#include <linux/async_tx.h>
> #include "messages.h"
> #include "ctree.h"
> #include "disk-io.h"
> @@ -154,7 +153,6 @@ static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
> kfree(rbio->stripe_pages);
> kfree(rbio->bio_paddrs);
> kfree(rbio->stripe_paddrs);
> - kfree(rbio->finish_pointers);
> }
>
> static void free_raid_bio(struct btrfs_raid_bio *rbio)
> @@ -231,18 +229,33 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
> static void memcpy_from_bio_to_stripe(struct btrfs_raid_bio *rbio, unsigned int sector_nr)
> {
> const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
> + struct dma_async_tx_descriptor *tx = NULL;
>
> ASSERT(sector_nr < rbio->nr_sectors);
> for (int i = 0; i < rbio->sector_nsteps; i++) {
> unsigned int index = sector_nr * rbio->sector_nsteps + i;
> phys_addr_t dst = rbio->stripe_paddrs[index];
> phys_addr_t src = rbio->bio_paddrs[index];
> + struct async_submit_ctl submit;
>
> ASSERT(dst != INVALID_PADDR);
> ASSERT(src != INVALID_PADDR);
>
> - memcpy_page(phys_to_page(dst), offset_in_page(dst),
> - phys_to_page(src), offset_in_page(src), step);
> + init_async_submit(&submit, ASYNC_TX_FENCE, tx, NULL, NULL, NULL);
> + tx = async_memcpy(phys_to_page(dst), phys_to_page(src),
> + offset_in_page(dst), offset_in_page(src),
> + step, &submit);
This doesn't look sane at all.
If async_memcpy() returned an @tx, the next iteration will just overwrite.
I believe most LLM review can detect this problem, you may want to
choose a better model.
> + }
> + /*
> + * All steps are chained via ASYNC_TX_FENCE. issue_pending and
> + * dma_wait_for_async_tx on the last descriptor walk the entire
> + * chain, so earlier descriptors are submitted and waited on too.
> + * If every step completed synchronously, tx is NULL and no wait
> + * is needed.
> + */
> + if (tx) {
> + async_tx_issue_pending(tx);
> + dma_wait_for_async_tx(tx);
> }
> }
>
> @@ -1079,12 +1092,11 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
> rbio->stripe_paddrs = kzalloc_objs(phys_addr_t,
> num_sectors * sector_nsteps,
> GFP_NOFS);
> - rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
> rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
> rbio->stripe_uptodate_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
>
> if (!rbio->stripe_pages || !rbio->bio_paddrs || !rbio->stripe_paddrs ||
> - !rbio->finish_pointers || !rbio->error_bitmap || !rbio->stripe_uptodate_bitmap) {
> + !rbio->error_bitmap || !rbio->stripe_uptodate_bitmap) {
> free_raid_bio_pointers(rbio);
> kfree(rbio);
> return ERR_PTR(-ENOMEM);
> @@ -1385,49 +1397,98 @@ static inline void *kmap_local_paddr(phys_addr_t paddr)
> return kmap_local_page(phys_to_page(paddr)) + offset_in_page(paddr);
> }
>
> -static void generate_pq_vertical_step(struct btrfs_raid_bio *rbio, unsigned int sector_nr,
> - unsigned int step_nr)
> +/*
> + * The page/offset array model used by the async_tx helpers below replaces the
> + * old kmap-local pointer array. Each entry in pages[] + offsets[] represents
> + * what was previously a single kmap_local_paddr() return value.
> + *
> + * For example, the old pattern:
> + * pointers[i] = kmap_local_paddr(paddr);
> + * raid6_gen_syndrome(..., pointers);
> + * kunmap_local(pointers[i]);
> + *
> + * becomes:
> + * pages[i] = phys_to_page(paddr);
> + * offsets[i] = offset_in_page(paddr);
> + * async_gen_syndrome(pages, offsets, ...);
> + *
> + * The async_tx API handles any necessary kmap internally.
> + */
What the comment is for? There is no code.
If you want to explain the change, commit message is your choice.
> +
> +/*
> + * Maximum number of sectors per stripe.
> + * BTRFS_STRIPE_LEN (64 KiB) divided by the minimum sector size (PAGE_SIZE,
> + * typically 4 KiB) gives at most 16 entries. If stripe length ever changes,
> + * update this to BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits.
> + */
> +#define RECOVER_MAX_STRIPE_SECTORS 16
Leave a new line between macro and function.
Move the macro either at the beginning of the file, or just before where
it is utilized.
And do not use immediate number, use (BTRFS_STRIPE_LEN /
BTRFS_MIN_BLOCKSIZE) instead.
> +static struct dma_async_tx_descriptor *
> +generate_pq_vertical_step(struct btrfs_raid_bio *rbio, unsigned int sector_nr,
> + unsigned int step_nr,
> + struct page **pages, unsigned int *offsets,
> + struct dma_async_tx_descriptor *depend_tx)
> {
> - void **pointers = rbio->finish_pointers;
> const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
> - int stripe;
> const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;
> + struct async_submit_ctl submit;
> + struct dma_async_tx_descriptor *tx;
> + int stripe;
>
> - /* First collect one sector from each data stripe */
> - for (stripe = 0; stripe < rbio->nr_data; stripe++)
> - pointers[stripe] = kmap_local_paddr(
> - sector_paddr_in_rbio(rbio, stripe, sector_nr, step_nr, 0));
> + for (stripe = 0; stripe < rbio->nr_data; stripe++) {
> + phys_addr_t paddr = sector_paddr_in_rbio(rbio, stripe, sector_nr, step_nr, 0);
>
> - /* Then add the parity stripe */
> - pointers[stripe++] = kmap_local_paddr(rbio_pstripe_paddr(rbio, sector_nr, step_nr));
> + pages[stripe] = phys_to_page(paddr);
> + offsets[stripe] = offset_in_page(paddr);
> + }
> +
> + {
> + phys_addr_t paddr = rbio_pstripe_paddr(rbio, sector_nr, step_nr);
> +
> + pages[stripe] = phys_to_page(paddr);
> + offsets[stripe] = offset_in_page(paddr);
> + }
A code block for what? Just to keep paddr as a local variable?
Just big NO NO, declare @paddr at the beginning of the function and
remove the brackets.
> + stripe++;
>
> if (has_qstripe) {
> - /*
> - * RAID6, add the qstripe and call the library function
> - * to fill in our p/q
> - */
> - pointers[stripe++] = kmap_local_paddr(
> - rbio_qstripe_paddr(rbio, sector_nr, step_nr));
> + phys_addr_t paddr = rbio_qstripe_paddr(rbio, sector_nr, step_nr);
> +
> + pages[stripe] = phys_to_page(paddr);
> + offsets[stripe] = offset_in_page(paddr);
>
> assert_rbio(rbio);
> - raid6_gen_syndrome(rbio->real_stripes, step, pointers);
> + init_async_submit(&submit, ASYNC_TX_FENCE, depend_tx, NULL, NULL, NULL);
> + tx = async_gen_syndrome(pages, offsets, rbio->real_stripes, step, &submit);
> } else {
> - /* raid5 */
> - memcpy(pointers[rbio->nr_data], pointers[0], step);
> - xor_gen(pointers[rbio->nr_data], pointers + 1, rbio->nr_data - 1,
> - step);
> + init_async_submit(&submit, ASYNC_TX_FENCE | ASYNC_TX_XOR_ZERO_DST,
> + depend_tx, NULL, NULL, NULL);
> + tx = async_xor_offs(pages[rbio->nr_data], offsets[rbio->nr_data],
> + pages, offsets, rbio->nr_data, step, &submit);
> }
> - for (stripe = stripe - 1; stripe >= 0; stripe--)
> - kunmap_local(pointers[stripe]);
> + return tx;
I do not like the returning of an tx.
Can't you just wait for the @tx to finish and keep the old void return type?
> }
>
> -/* Generate PQ for one vertical stripe. */
> -static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
> +/*
> + * Generate PQ for one vertical stripe, chaining into @depend_tx.
> + * The caller must issue_pending + wait for the returned descriptor
> + * (which may be NULL if the operation completed synchronously),
> + * then call generate_pq_vertical_finish() to mark parity uptodate.
> + */
> +static struct dma_async_tx_descriptor *
> +generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr,
> + struct page **pages, unsigned int *offsets,
> + struct dma_async_tx_descriptor *depend_tx)
> {
> - const bool has_qstripe = (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6);
> + struct dma_async_tx_descriptor *tx = depend_tx;
>
> for (int i = 0; i < rbio->sector_nsteps; i++)
> - generate_pq_vertical_step(rbio, sectornr, i);
> + tx = generate_pq_vertical_step(rbio, sectornr, i, pages, offsets, tx);
Again, isn't @tx overwritten for each step?
Also, sashiko is reporting several critical level problems, please refer
to the review:
https://sashiko.dev/#/patchset/20260629023912.2102700-1-rosenp%40gmail.com
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] btrfs: raid56: use async_tx API for parity operations
2026-06-29 5:11 ` Qu Wenruo
@ 2026-06-29 12:40 ` Christoph Hellwig
2026-06-29 20:01 ` Rosen Penev
1 sibling, 0 replies; 4+ messages in thread
From: Christoph Hellwig @ 2026-06-29 12:40 UTC (permalink / raw)
To: Qu Wenruo; +Cc: Rosen Penev, linux-btrfs, Chris Mason, David Sterba, open list
On Mon, Jun 29, 2026 at 02:41:44PM +0930, Qu Wenruo wrote:
> Do you have any benchmark?
Yeah. The async_tx stuff is a mess, and will cause real slow downs
for CPU native operation, while in general offloads have proven to
not be very useful. There might be a tinty use case if your data is
all P2P and never touches the CPU at all, but I don't think btrfs even
supports that right now.
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] btrfs: raid56: use async_tx API for parity operations
2026-06-29 5:11 ` Qu Wenruo
2026-06-29 12:40 ` Christoph Hellwig
@ 2026-06-29 20:01 ` Rosen Penev
1 sibling, 0 replies; 4+ messages in thread
From: Rosen Penev @ 2026-06-29 20:01 UTC (permalink / raw)
To: Qu Wenruo, Rosen Penev, linux-btrfs; +Cc: Chris Mason, David Sterba, open list
On Sun Jun 28, 2026 at 10:11 PM PDT, Qu Wenruo wrote:
>
>
> 在 2026/6/29 12:09, Rosen Penev 写道:
>> Replace the kmap-local + synchronous library call + kunmap-local
>> pattern in raid56 compute and recovery paths with the async_tx API
>> (async_xor_offs, async_gen_syndrome, async_raid6_datap_recov,
>> async_raid6_2data_recov, async_memcpy). The async_tx API provides
>> DMA offloading when a suitable engine is available and falls back to
>> the same software implementations (xor_gen, raid6_gen_syndrome, etc.)
>> otherwise.
>>
>> Step-loop operations inside each vertical stripe are chained as DMA
>> dependency chains with ASYNC_TX_FENCE. The chains are then extended
>> across all sectors in a stripe so that the DMA engine can pipeline
>> the entire stripe's parity computation before the CPU waits for
>> completion. For RMW writes, all sectors' parity generation is
>> issued as a single chain; for recovery reads, the recovery chains
>> for every sector are submitted together, then the CPU verifies
>> checksums and marks uptodate bits after a single wait.
>>
>> Each DMA chain is fully waited on before the caller touches any
>> buffer pages, so there is never an in-flight DMA operation during
>> cleanup. For the typical stripe size of 4 KiB--64 KiB, per-operation
>> overhead from DMA submission may outweigh the offload benefit, making
>> the software fallback (which runs synchronously, returning NULL) the
>> common fast path. This trade-off is inherent in the async_tx API's
>> unified interface, but on platforms with a DMA engine (e.g. Marvell
>> mv_xor) and larger sectors the batching provides measurable offload.
>
> Do you have any benchmark?
Not at this time.
>
> You do not need to do a full comprehensive workload, just some fixed
> basic workloads, then measure the runtime of the converted functions,
> and major entrances like rmw_rbio()/recover_rbio().
>
> And do you have ever run the fstests on a system with async offload?
No. The system in question runs Debian 12, so I assume it would be
trivial to do so.
>
>>
>> Also select ASYNC_RAID6_RECOV and ASYNC_MEMCPY in Kconfig so that
>> the async_tx function implementations are linked; previously the
>> code used the synchronous RAID6_PQ / XOR_BLOCKS libraries directly,
>> but async_tx functions require their own Kconfig symbols.
>>
>> Converted functions:
>> - generate_pq_vertical_step() / generate_pq_vertical()
>> - recover_vertical_step() / recover_vertical() / recover_sectors()
>> - verify_one_parity_step() / verify_one_parity_sector()
>> - finish_parity_scrub()
>> - recover_scrub_rbio()
>> - memcpy_from_bio_to_stripe()
>> - raid56_parity_cache_data_folios()
>>
>> Removed the now-unused btrfs_raid_bio::finish_pointers field and
>> the no-longer-needed #include <linux/raid/pq.h> and
>> #include <linux/raid/xor.h>.
>>
>> Assisted-by: opencode:big-pickle
>> Signed-off-by: Rosen Penev <rosenp@gmail.com>
>> ---
>> fs/btrfs/Kconfig | 4 +-
>> fs/btrfs/raid56.c | 654 ++++++++++++++++++++++++++++++++--------------
>> fs/btrfs/raid56.h | 3 -
>> 3 files changed, 454 insertions(+), 207 deletions(-)
>>
>> diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
>> index 43dd8ef763b8..8223b40d8ecb 100644
>> --- a/fs/btrfs/Kconfig
>> +++ b/fs/btrfs/Kconfig
>> @@ -15,7 +15,9 @@ config BTRFS_FS
>> select ZSTD_DECOMPRESS
>> select FS_IOMAP
>> select RAID6_PQ
>> - select XOR_BLOCKS
>> + select ASYNC_RAID6_RECOV
>> + select ASYNC_MEMCPY
>> + select ASYNC_XOR
>> select XXHASH
>> depends on PAGE_SIZE_LESS_THAN_256KB
>>
>> diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
>> index 00a01b97cc0c..6e98e46df752 100644
>> --- a/fs/btrfs/raid56.c
>> +++ b/fs/btrfs/raid56.c
>> @@ -8,11 +8,10 @@
>> #include <linux/bio.h>
>> #include <linux/slab.h>
>> #include <linux/blkdev.h>
>> -#include <linux/raid/pq.h>
>> #include <linux/hash.h>
>> #include <linux/list_sort.h>
>> -#include <linux/raid/xor.h>
>> #include <linux/mm.h>
>> +#include <linux/async_tx.h>
>> #include "messages.h"
>> #include "ctree.h"
>> #include "disk-io.h"
>> @@ -154,7 +153,6 @@ static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
>> kfree(rbio->stripe_pages);
>> kfree(rbio->bio_paddrs);
>> kfree(rbio->stripe_paddrs);
>> - kfree(rbio->finish_pointers);
>> }
>>
>> static void free_raid_bio(struct btrfs_raid_bio *rbio)
>> @@ -231,18 +229,33 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
>> static void memcpy_from_bio_to_stripe(struct btrfs_raid_bio *rbio, unsigned int sector_nr)
>> {
>> const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
>> + struct dma_async_tx_descriptor *tx = NULL;
>>
>> ASSERT(sector_nr < rbio->nr_sectors);
>> for (int i = 0; i < rbio->sector_nsteps; i++) {
>> unsigned int index = sector_nr * rbio->sector_nsteps + i;
>> phys_addr_t dst = rbio->stripe_paddrs[index];
>> phys_addr_t src = rbio->bio_paddrs[index];
>> + struct async_submit_ctl submit;
>>
>> ASSERT(dst != INVALID_PADDR);
>> ASSERT(src != INVALID_PADDR);
>>
>> - memcpy_page(phys_to_page(dst), offset_in_page(dst),
>> - phys_to_page(src), offset_in_page(src), step);
>> + init_async_submit(&submit, ASYNC_TX_FENCE, tx, NULL, NULL, NULL);
>> + tx = async_memcpy(phys_to_page(dst), phys_to_page(src),
>> + offset_in_page(dst), offset_in_page(src),
>> + step, &submit);
>
> This doesn't look sane at all.
>
> If async_memcpy() returned an @tx, the next iteration will just overwrite.
>
> I believe most LLM review can detect this problem, you may want to
> choose a better model.
Probably.
>> + }
>> + /*
>> + * All steps are chained via ASYNC_TX_FENCE. issue_pending and
>> + * dma_wait_for_async_tx on the last descriptor walk the entire
>> + * chain, so earlier descriptors are submitted and waited on too.
>> + * If every step completed synchronously, tx is NULL and no wait
>> + * is needed.
>> + */
>> + if (tx) {
>> + async_tx_issue_pending(tx);
>> + dma_wait_for_async_tx(tx);
>> }
>> }
>>
>> @@ -1079,12 +1092,11 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
>> rbio->stripe_paddrs = kzalloc_objs(phys_addr_t,
>> num_sectors * sector_nsteps,
>> GFP_NOFS);
>> - rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
>> rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
>> rbio->stripe_uptodate_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
>>
>> if (!rbio->stripe_pages || !rbio->bio_paddrs || !rbio->stripe_paddrs ||
>> - !rbio->finish_pointers || !rbio->error_bitmap || !rbio->stripe_uptodate_bitmap) {
>> + !rbio->error_bitmap || !rbio->stripe_uptodate_bitmap) {
>> free_raid_bio_pointers(rbio);
>> kfree(rbio);
>> return ERR_PTR(-ENOMEM);
>> @@ -1385,49 +1397,98 @@ static inline void *kmap_local_paddr(phys_addr_t paddr)
>> return kmap_local_page(phys_to_page(paddr)) + offset_in_page(paddr);
>> }
>>
>> -static void generate_pq_vertical_step(struct btrfs_raid_bio *rbio, unsigned int sector_nr,
>> - unsigned int step_nr)
>> +/*
>> + * The page/offset array model used by the async_tx helpers below replaces the
>> + * old kmap-local pointer array. Each entry in pages[] + offsets[] represents
>> + * what was previously a single kmap_local_paddr() return value.
>> + *
>> + * For example, the old pattern:
>> + * pointers[i] = kmap_local_paddr(paddr);
>> + * raid6_gen_syndrome(..., pointers);
>> + * kunmap_local(pointers[i]);
>> + *
>> + * becomes:
>> + * pages[i] = phys_to_page(paddr);
>> + * offsets[i] = offset_in_page(paddr);
>> + * async_gen_syndrome(pages, offsets, ...);
>> + *
>> + * The async_tx API handles any necessary kmap internally.
>> + */
>
> What the comment is for? There is no code.
Probably stale from various iterations.
>
> If you want to explain the change, commit message is your choice.
>
>> +
>> +/*
>> + * Maximum number of sectors per stripe.
>> + * BTRFS_STRIPE_LEN (64 KiB) divided by the minimum sector size (PAGE_SIZE,
>> + * typically 4 KiB) gives at most 16 entries. If stripe length ever changes,
>> + * update this to BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits.
>> + */
>> +#define RECOVER_MAX_STRIPE_SECTORS 16
>
> Leave a new line between macro and function.
>
> Move the macro either at the beginning of the file, or just before where
> it is utilized.
>
> And do not use immediate number, use (BTRFS_STRIPE_LEN /
> BTRFS_MIN_BLOCKSIZE) instead.
Sure.
>
>> +static struct dma_async_tx_descriptor *
>> +generate_pq_vertical_step(struct btrfs_raid_bio *rbio, unsigned int sector_nr,
>> + unsigned int step_nr,
>> + struct page **pages, unsigned int *offsets,
>> + struct dma_async_tx_descriptor *depend_tx)
>> {
>> - void **pointers = rbio->finish_pointers;
>> const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
>> - int stripe;
>> const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;
>> + struct async_submit_ctl submit;
>> + struct dma_async_tx_descriptor *tx;
>> + int stripe;
>>
>> - /* First collect one sector from each data stripe */
>> - for (stripe = 0; stripe < rbio->nr_data; stripe++)
>> - pointers[stripe] = kmap_local_paddr(
>> - sector_paddr_in_rbio(rbio, stripe, sector_nr, step_nr, 0));
>> + for (stripe = 0; stripe < rbio->nr_data; stripe++) {
>> + phys_addr_t paddr = sector_paddr_in_rbio(rbio, stripe, sector_nr, step_nr, 0);
>>
>> - /* Then add the parity stripe */
>> - pointers[stripe++] = kmap_local_paddr(rbio_pstripe_paddr(rbio, sector_nr, step_nr));
>> + pages[stripe] = phys_to_page(paddr);
>> + offsets[stripe] = offset_in_page(paddr);
>> + }
>> +
>> + {
>> + phys_addr_t paddr = rbio_pstripe_paddr(rbio, sector_nr, step_nr);
>> +
>> + pages[stripe] = phys_to_page(paddr);
>> + offsets[stripe] = offset_in_page(paddr);
>> + }
>
> A code block for what? Just to keep paddr as a local variable?
>
> Just big NO NO, declare @paddr at the beginning of the function and
> remove the brackets.
Will do.
>
>> + stripe++;
>>
>> if (has_qstripe) {
>> - /*
>> - * RAID6, add the qstripe and call the library function
>> - * to fill in our p/q
>> - */
>> - pointers[stripe++] = kmap_local_paddr(
>> - rbio_qstripe_paddr(rbio, sector_nr, step_nr));
>> + phys_addr_t paddr = rbio_qstripe_paddr(rbio, sector_nr, step_nr);
>> +
>> + pages[stripe] = phys_to_page(paddr);
>> + offsets[stripe] = offset_in_page(paddr);
>>
>> assert_rbio(rbio);
>> - raid6_gen_syndrome(rbio->real_stripes, step, pointers);
>> + init_async_submit(&submit, ASYNC_TX_FENCE, depend_tx, NULL, NULL, NULL);
>> + tx = async_gen_syndrome(pages, offsets, rbio->real_stripes, step, &submit);
>> } else {
>> - /* raid5 */
>> - memcpy(pointers[rbio->nr_data], pointers[0], step);
>> - xor_gen(pointers[rbio->nr_data], pointers + 1, rbio->nr_data - 1,
>> - step);
>> + init_async_submit(&submit, ASYNC_TX_FENCE | ASYNC_TX_XOR_ZERO_DST,
>> + depend_tx, NULL, NULL, NULL);
>> + tx = async_xor_offs(pages[rbio->nr_data], offsets[rbio->nr_data],
>> + pages, offsets, rbio->nr_data, step, &submit);
>> }
>> - for (stripe = stripe - 1; stripe >= 0; stripe--)
>> - kunmap_local(pointers[stripe]);
>> + return tx;
>
> I do not like the returning of an tx.
>
> Can't you just wait for the @tx to finish and keep the old void return type?
>
>
>> }
>>
>> -/* Generate PQ for one vertical stripe. */
>> -static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
>> +/*
>> + * Generate PQ for one vertical stripe, chaining into @depend_tx.
>> + * The caller must issue_pending + wait for the returned descriptor
>> + * (which may be NULL if the operation completed synchronously),
>> + * then call generate_pq_vertical_finish() to mark parity uptodate.
>> + */
>> +static struct dma_async_tx_descriptor *
>> +generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr,
>> + struct page **pages, unsigned int *offsets,
>> + struct dma_async_tx_descriptor *depend_tx)
>> {
>> - const bool has_qstripe = (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6);
>> + struct dma_async_tx_descriptor *tx = depend_tx;
>>
>> for (int i = 0; i < rbio->sector_nsteps; i++)
>> - generate_pq_vertical_step(rbio, sectornr, i);
>> + tx = generate_pq_vertical_step(rbio, sectornr, i, pages, offsets, tx);
>
> Again, isn't @tx overwritten for each step?
>
> Also, sashiko is reporting several critical level problems, please refer
> to the review:
>
> https://sashiko.dev/#/patchset/20260629023912.2102700-1-rosenp%40gmail.com
will check out.
Although based on Cristoph's comment, this should probably be abandoned.
I originally wanted to implement:
https://lore.kernel.org/all/20210106123738.GS6430@twin.jikos.cz/
The LLM told me it would result in worse performance as the async-tx API
requires batching. So I moved on to XOR.
My current setup for my RAID is md-raid5 > btrfs instead of the latter
directly as I assume the former performs better based on
/proc/interrupts output.
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2026-06-29 20:01 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-06-29 2:39 [PATCH] btrfs: raid56: use async_tx API for parity operations Rosen Penev
2026-06-29 5:11 ` Qu Wenruo
2026-06-29 12:40 ` Christoph Hellwig
2026-06-29 20:01 ` Rosen Penev
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox