From: Dongsheng Yang <dongsheng.yang@linux.dev>
To: mpatocka@redhat.com, agk@redhat.com, snitzer@kernel.org,
axboe@kernel.dk, hch@lst.de, dan.j.williams@intel.com,
Jonathan.Cameron@Huawei.com
Cc: linux-block@vger.kernel.org, linux-kernel@vger.kernel.org,
linux-cxl@vger.kernel.org, nvdimm@lists.linux.dev,
dm-devel@lists.linux.dev,
Dongsheng Yang <dongsheng.yang@linux.dev>
Subject: [RFC PATCH 06/11] dm-pcache: add cache_writeback
Date: Thu, 5 Jun 2025 14:23:01 +0000 [thread overview]
Message-ID: <20250605142306.1930831-7-dongsheng.yang@linux.dev> (raw)
In-Reply-To: <20250605142306.1930831-1-dongsheng.yang@linux.dev>
Introduce cache_writeback.c, which implements the asynchronous write-back
path for pcache. The new file is responsible for detecting dirty data,
organising it into an in-memory tree, issuing bios to the backing block
device, and advancing the cache’s *dirty tail* pointer once data has
been safely persisted.
* Dirty-state detection
- `__is_cache_clean()` reads the kset header at `dirty_tail`, checks
magic and CRC, and thus decides whether there is anything to flush.
* Write-back scheduler
- `cache_writeback_work` is queued on the cache task-workqueue and
re-arms itself at `PCACHE_CACHE_WRITEBACK_INTERVAL`.
- Uses an internal spin-protected `writeback_key_tree` to batch keys
belonging to the same stripe before IO.
* Key processing
- `cache_kset_insert_tree()` decodes each key inside the on-media
kset, allocates an in-memory key object, and inserts it into the
writeback_key_tree.
- `cache_key_writeback()` builds a *KMEM-type* backing request that
maps the persistent-memory range directly into a WRITE bio and
submits it with `submit_bio_noacct()`.
- After all keys from the writeback_key_tree have been flushed,
`backing_dev_flush()` issues a single FLUSH to ensure durability.
* Tail advancement
- Once a kset is written back, `cache_pos_advance()` moves
`cache->dirty_tail` by the exact on-disk size and the new position is
persisted via `cache_encode_dirty_tail()`.
- When the `PCACHE_KSET_FLAGS_LAST` flag is seen, the write-back
engine switches to the next segment indicated by `next_cache_seg_id`.
Signed-off-by: Dongsheng Yang <dongsheng.yang@linux.dev>
---
drivers/md/dm-pcache/cache_writeback.c | 239 +++++++++++++++++++++++++
1 file changed, 239 insertions(+)
create mode 100644 drivers/md/dm-pcache/cache_writeback.c
diff --git a/drivers/md/dm-pcache/cache_writeback.c b/drivers/md/dm-pcache/cache_writeback.c
new file mode 100644
index 000000000000..fe07e7fad15e
--- /dev/null
+++ b/drivers/md/dm-pcache/cache_writeback.c
@@ -0,0 +1,239 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/bio.h>
+
+#include "cache.h"
+#include "backing_dev.h"
+#include "cache_dev.h"
+#include "dm_pcache.h"
+
+static inline bool is_cache_clean(struct pcache_cache *cache, struct pcache_cache_pos *dirty_tail)
+{
+ struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
+ struct pcache_cache_kset_onmedia *kset_onmedia;
+ u32 to_copy;
+ void *addr;
+ int ret;
+
+ addr = cache_pos_addr(dirty_tail);
+ kset_onmedia = (struct pcache_cache_kset_onmedia *)cache->wb_kset_onmedia_buf;
+
+ to_copy = min(PCACHE_KSET_ONMEDIA_SIZE_MAX, PCACHE_SEG_SIZE - dirty_tail->seg_off);
+ ret = copy_mc_to_kernel(kset_onmedia, addr, to_copy);
+ if (ret) {
+ pcache_dev_err(pcache, "error to read kset: %d", ret);
+ return true;
+ }
+
+ /* Check if the magic number matches the expected value */
+ if (kset_onmedia->magic != PCACHE_KSET_MAGIC) {
+ pcache_dev_debug(pcache, "dirty_tail: %u:%u magic: %llx, not expected: %llx\n",
+ dirty_tail->cache_seg->cache_seg_id, dirty_tail->seg_off,
+ kset_onmedia->magic, PCACHE_KSET_MAGIC);
+ return true;
+ }
+
+ /* Verify the CRC checksum for data integrity */
+ if (kset_onmedia->crc != cache_kset_crc(kset_onmedia)) {
+ pcache_dev_debug(pcache, "dirty_tail: %u:%u crc: %x, not expected: %x\n",
+ dirty_tail->cache_seg->cache_seg_id, dirty_tail->seg_off,
+ cache_kset_crc(kset_onmedia), kset_onmedia->crc);
+ return true;
+ }
+
+ return false;
+}
+
+void cache_writeback_exit(struct pcache_cache *cache)
+{
+ cancel_delayed_work_sync(&cache->writeback_work);
+ cache_tree_exit(&cache->writeback_key_tree);
+}
+
+int cache_writeback_init(struct pcache_cache *cache)
+{
+ int ret;
+
+ ret = cache_tree_init(cache, &cache->writeback_key_tree, 1);
+ if (ret)
+ goto err;
+
+ /* Queue delayed work to start writeback handling */
+ queue_delayed_work(cache_get_wq(cache), &cache->writeback_work, 0);
+
+ return 0;
+err:
+ return ret;
+}
+
+static int cache_key_writeback(struct pcache_cache *cache, struct pcache_cache_key *key)
+{
+ struct pcache_backing_dev_req *writeback_req;
+ struct pcache_backing_dev_req_opts writeback_req_opts = { 0 };
+ struct pcache_cache_pos *pos;
+ void *addr;
+ u32 seg_remain;
+ u64 off;
+
+ if (cache_key_clean(key))
+ return 0;
+
+ pos = &key->cache_pos;
+
+ seg_remain = cache_seg_remain(pos);
+ BUG_ON(seg_remain < key->len);
+
+ addr = cache_pos_addr(pos);
+ off = key->off;
+
+ writeback_req_opts.type = BACKING_DEV_REQ_TYPE_KMEM;
+ writeback_req_opts.end_fn = NULL;
+ writeback_req_opts.gfp_mask = GFP_NOIO;
+
+ writeback_req_opts.kmem.data = addr;
+ writeback_req_opts.kmem.opf = REQ_OP_WRITE;
+ writeback_req_opts.kmem.len = key->len;
+ writeback_req_opts.kmem.backing_off = off;
+
+ writeback_req = backing_dev_req_create(cache->backing_dev, &writeback_req_opts);
+ if (!writeback_req)
+ return -EIO;
+
+ backing_dev_req_submit(writeback_req, true);
+
+ return 0;
+}
+
+static int cache_wb_tree_writeback(struct pcache_cache *cache)
+{
+ struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
+ struct pcache_cache_tree *cache_tree = &cache->writeback_key_tree;
+ struct pcache_cache_subtree *cache_subtree;
+ struct rb_node *node;
+ struct pcache_cache_key *key;
+ int ret;
+ u32 i;
+
+ for (i = 0; i < cache_tree->n_subtrees; i++) {
+ cache_subtree = &cache_tree->subtrees[i];
+
+ node = rb_first(&cache_subtree->root);
+ while (node) {
+ key = CACHE_KEY(node);
+ node = rb_next(node);
+
+ ret = cache_key_writeback(cache, key);
+ if (ret) {
+ pcache_dev_err(pcache, "writeback error: %d\n", ret);
+ return ret;
+ }
+
+ cache_key_delete(key);
+ }
+ }
+
+ backing_dev_flush(cache->backing_dev);
+
+ return 0;
+}
+
+static int cache_kset_insert_tree(struct pcache_cache *cache, struct pcache_cache_kset_onmedia *kset_onmedia)
+{
+ struct pcache_cache_key_onmedia *key_onmedia;
+ struct pcache_cache_key *key;
+ int ret;
+ u32 i;
+
+ /* Iterate through all keys in the kset and write each back to storage */
+ for (i = 0; i < kset_onmedia->key_num; i++) {
+ key_onmedia = &kset_onmedia->data[i];
+
+ key = cache_key_alloc(&cache->writeback_key_tree);
+ if (!key)
+ return -ENOMEM;
+
+ ret = cache_key_decode(cache, key_onmedia, key);
+ if (ret) {
+ cache_key_delete(key);
+ return ret;
+ }
+
+ ret = cache_key_insert(&cache->writeback_key_tree, key, true);
+ if (ret) {
+ cache_key_delete(key);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static void last_kset_writeback(struct pcache_cache *cache,
+ struct pcache_cache_kset_onmedia *last_kset_onmedia)
+{
+ struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
+ struct pcache_cache_segment *next_seg;
+
+ pcache_dev_debug(pcache, "last kset, next: %u\n", last_kset_onmedia->next_cache_seg_id);
+
+ next_seg = &cache->segments[last_kset_onmedia->next_cache_seg_id];
+
+ mutex_lock(&cache->dirty_tail_lock);
+ cache->dirty_tail.cache_seg = next_seg;
+ cache->dirty_tail.seg_off = 0;
+ cache_encode_dirty_tail(cache);
+ mutex_unlock(&cache->dirty_tail_lock);
+}
+
+void cache_writeback_fn(struct work_struct *work)
+{
+ struct pcache_cache *cache = container_of(work, struct pcache_cache, writeback_work.work);
+ struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
+ struct pcache_cache_pos dirty_tail;
+ struct pcache_cache_kset_onmedia *kset_onmedia;
+ int ret = 0;
+
+ mutex_lock(&cache->writeback_lock);
+ kset_onmedia = (struct pcache_cache_kset_onmedia *)cache->wb_kset_onmedia_buf;
+ /* Loop until all dirty data is written back and the cache is clean */
+ while (true) {
+ if (pcache_is_stopping(pcache)) {
+ mutex_unlock(&cache->writeback_lock);
+ return;
+ }
+
+ /* Get new dirty tail */
+ mutex_lock(&cache->dirty_tail_lock);
+ cache_pos_copy(&dirty_tail, &cache->dirty_tail);
+ mutex_unlock(&cache->dirty_tail_lock);
+
+ if (is_cache_clean(cache, &dirty_tail))
+ break;
+
+ if (kset_onmedia->flags & PCACHE_KSET_FLAGS_LAST) {
+ last_kset_writeback(cache, kset_onmedia);
+ continue;
+ }
+
+ ret = cache_kset_insert_tree(cache, kset_onmedia);
+ if (ret)
+ break;
+
+ ret = cache_wb_tree_writeback(cache);
+ if (ret)
+ break;
+
+ pcache_dev_debug(pcache, "writeback advance: %u:%u %u\n",
+ dirty_tail.cache_seg->cache_seg_id,
+ dirty_tail.seg_off,
+ get_kset_onmedia_size(kset_onmedia));
+
+ mutex_lock(&cache->dirty_tail_lock);
+ cache_pos_advance(&cache->dirty_tail, get_kset_onmedia_size(kset_onmedia));
+ cache_encode_dirty_tail(cache);
+ mutex_unlock(&cache->dirty_tail_lock);
+ }
+ mutex_unlock(&cache->writeback_lock);
+
+ queue_delayed_work(cache_get_wq(cache), &cache->writeback_work, PCACHE_CACHE_WRITEBACK_INTERVAL);
+}
--
2.34.1
next prev parent reply other threads:[~2025-06-05 14:24 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-06-05 14:22 [RFC v2 00/11] dm-pcache – persistent-memory cache for block devices Dongsheng Yang
2025-06-05 14:22 ` [RFC PATCH 01/11] dm-pcache: add pcache_internal.h Dongsheng Yang
2025-06-05 14:22 ` [RFC PATCH 02/11] dm-pcache: add backing device management Dongsheng Yang
2025-06-05 14:22 ` [RFC PATCH 03/11] dm-pcache: add cache device Dongsheng Yang
2025-06-05 14:22 ` [RFC PATCH 04/11] dm-pcache: add segment layer Dongsheng Yang
2025-06-05 14:23 ` [RFC PATCH 05/11] dm-pcache: add cache_segment Dongsheng Yang
2025-06-05 14:23 ` Dongsheng Yang [this message]
2025-06-05 14:23 ` [RFC PATCH 07/11] dm-pcache: add cache_gc Dongsheng Yang
2025-06-05 14:23 ` [RFC PATCH 08/11] dm-pcache: add cache_key Dongsheng Yang
2025-06-05 14:23 ` [RFC PATCH 09/11] dm-pcache: add cache_req Dongsheng Yang
2025-06-05 14:23 ` [RFC PATCH 10/11] dm-pcache: add cache core Dongsheng Yang
2025-06-05 14:23 ` [RFC PATCH 11/11] dm-pcache: initial dm-pcache target Dongsheng Yang
2025-06-12 16:57 ` [RFC v2 00/11] dm-pcache – persistent-memory cache for block devices Mikulas Patocka
2025-06-13 3:39 ` Dongsheng Yang
2025-06-23 3:13 ` Dongsheng Yang
2025-06-23 4:18 ` Dongsheng Yang
2025-06-30 15:57 ` Mikulas Patocka
2025-06-30 16:28 ` Dongsheng Yang
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250605142306.1930831-7-dongsheng.yang@linux.dev \
--to=dongsheng.yang@linux.dev \
--cc=Jonathan.Cameron@Huawei.com \
--cc=agk@redhat.com \
--cc=axboe@kernel.dk \
--cc=dan.j.williams@intel.com \
--cc=dm-devel@lists.linux.dev \
--cc=hch@lst.de \
--cc=linux-block@vger.kernel.org \
--cc=linux-cxl@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mpatocka@redhat.com \
--cc=nvdimm@lists.linux.dev \
--cc=snitzer@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.