From: Marcelo Tosatti <mtosatti@redhat.com>
To: linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org
Cc: Alexander Viro <viro@zeniv.linux.org.uk>,
Matthew Wilcox <willy@infradead.org>,
Christoph Hellwig <hch@lst.de>, Jens Axboe <axboe@kernel.dk>,
Frederic Weisbecker <frederic@kernel.org>,
Leonardo Bras <leobras@redhat.com>,
Yair Podemsky <ypodemsk@redhat.com>, P J P <ppandit@redhat.com>
Subject: [PATCH v3] fs/buffer.c: update per-CPU bh_lru cache via RCU
Date: Wed, 1 Feb 2023 13:01:47 -0300 [thread overview]
Message-ID: <Y9qM68F+nDSYfrJ1@tpad> (raw)
umount calls invalidate_bh_lrus which IPIs each
CPU that has non empty per-CPU buffer_head cache:
on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
This interrupts CPUs which might be executing code sensitive
to interferences.
To avoid the IPI, free the per-CPU caches remotely via RCU.
Two bh_lrus structures for each CPU are allocated: one is being
used (assigned to per-CPU bh_lru pointer), and the other is
being freed (or idle).
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
v3: fix CPU hotplug
v2: fix sparse warnings (kernel test robot)
diff --git a/fs/buffer.c b/fs/buffer.c
index d9c6d1fbb6dd..0c54ffe9fd62 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1203,7 +1203,21 @@ struct bh_lru {
struct buffer_head *bhs[BH_LRU_SIZE];
};
-static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
+
+/*
+ * Allocate two bh_lrus structures for each CPU. bh_lru points to the
+ * one that is currently in use, and the update path does
+ * (consider cpu->bh_lru = bh_lrus[0]).
+ *
+ * cpu->bh_lrup = bh_lrus[1]
+ * synchronize_rcu()
+ * free bh's in bh_lrus[0]
+ */
+static unsigned int bh_lru_idx;
+static DEFINE_PER_CPU(struct bh_lru, bh_lrus[2]) = {{{ NULL }}, {{NULL}}};
+static DEFINE_PER_CPU(struct bh_lru __rcu *, bh_lrup);
+
+static DEFINE_MUTEX(bh_lru_invalidate_mutex);
#ifdef CONFIG_SMP
#define bh_lru_lock() local_irq_disable()
@@ -1245,16 +1259,19 @@ static void bh_lru_install(struct buffer_head *bh)
return;
}
- b = this_cpu_ptr(&bh_lrus);
+ rcu_read_lock();
+ b = rcu_dereference(per_cpu(bh_lrup, smp_processor_id()));
for (i = 0; i < BH_LRU_SIZE; i++) {
swap(evictee, b->bhs[i]);
if (evictee == bh) {
+ rcu_read_unlock();
bh_lru_unlock();
return;
}
}
get_bh(bh);
+ rcu_read_unlock();
bh_lru_unlock();
brelse(evictee);
}
@@ -1266,28 +1283,32 @@ static struct buffer_head *
lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
{
struct buffer_head *ret = NULL;
+ struct bh_lru *lru;
unsigned int i;
check_irqs_on();
bh_lru_lock();
+ rcu_read_lock();
+
+ lru = rcu_dereference(per_cpu(bh_lrup, smp_processor_id()));
for (i = 0; i < BH_LRU_SIZE; i++) {
- struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
+ struct buffer_head *bh = lru->bhs[i];
if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
bh->b_size == size) {
if (i) {
while (i) {
- __this_cpu_write(bh_lrus.bhs[i],
- __this_cpu_read(bh_lrus.bhs[i - 1]));
+ lru->bhs[i] = lru->bhs[i - 1];
i--;
}
- __this_cpu_write(bh_lrus.bhs[0], bh);
+ lru->bhs[0] = bh;
}
get_bh(bh);
ret = bh;
break;
}
}
+ rcu_read_unlock();
bh_lru_unlock();
return ret;
}
@@ -1381,35 +1402,54 @@ static void __invalidate_bh_lrus(struct bh_lru *b)
b->bhs[i] = NULL;
}
}
-/*
- * invalidate_bh_lrus() is called rarely - but not only at unmount.
- * This doesn't race because it runs in each cpu either in irq
- * or with preempt disabled.
- */
-static void invalidate_bh_lru(void *arg)
-{
- struct bh_lru *b = &get_cpu_var(bh_lrus);
-
- __invalidate_bh_lrus(b);
- put_cpu_var(bh_lrus);
-}
bool has_bh_in_lru(int cpu, void *dummy)
{
- struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
+ struct bh_lru *b;
int i;
-
+
+ rcu_read_lock();
+ b = rcu_dereference(per_cpu(bh_lrup, cpu));
for (i = 0; i < BH_LRU_SIZE; i++) {
- if (b->bhs[i])
+ if (b->bhs[i]) {
+ rcu_read_unlock();
return true;
+ }
}
+ rcu_read_unlock();
return false;
}
+/*
+ * invalidate_bh_lrus() is called rarely - but not only at unmount.
+ */
void invalidate_bh_lrus(void)
{
- on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
+ int cpu, oidx;
+
+ mutex_lock(&bh_lru_invalidate_mutex);
+ cpus_read_lock();
+ oidx = bh_lru_idx;
+ bh_lru_idx++;
+ if (bh_lru_idx >= 2)
+ bh_lru_idx = 0;
+
+ /* Assign the per-CPU bh_lru pointer */
+ for_each_online_cpu(cpu)
+ rcu_assign_pointer(per_cpu(bh_lrup, cpu),
+ per_cpu_ptr(&bh_lrus[bh_lru_idx], cpu));
+ synchronize_rcu_expedited();
+
+ for_each_online_cpu(cpu) {
+ struct bh_lru *b = per_cpu_ptr(&bh_lrus[oidx], cpu);
+
+ bh_lru_lock();
+ __invalidate_bh_lrus(b);
+ bh_lru_unlock();
+ }
+ cpus_read_unlock();
+ mutex_unlock(&bh_lru_invalidate_mutex);
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
@@ -1422,8 +1462,10 @@ void invalidate_bh_lrus_cpu(void)
struct bh_lru *b;
bh_lru_lock();
- b = this_cpu_ptr(&bh_lrus);
+ rcu_read_lock();
+ b = rcu_dereference(per_cpu(bh_lrup, smp_processor_id()));
__invalidate_bh_lrus(b);
+ rcu_read_unlock();
bh_lru_unlock();
}
@@ -2920,15 +2962,25 @@ void free_buffer_head(struct buffer_head *bh)
}
EXPORT_SYMBOL(free_buffer_head);
+static int buffer_cpu_online(unsigned int cpu)
+{
+ rcu_assign_pointer(per_cpu(bh_lrup, cpu),
+ per_cpu_ptr(&bh_lrus[bh_lru_idx], cpu));
+ return 0;
+}
+
static int buffer_exit_cpu_dead(unsigned int cpu)
{
int i;
- struct bh_lru *b = &per_cpu(bh_lrus, cpu);
+ struct bh_lru *b;
+ rcu_read_lock();
+ b = rcu_dereference(per_cpu(bh_lrup, cpu));
for (i = 0; i < BH_LRU_SIZE; i++) {
brelse(b->bhs[i]);
b->bhs[i] = NULL;
}
+ rcu_read_unlock();
this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
per_cpu(bh_accounting, cpu).nr = 0;
return 0;
@@ -3021,7 +3073,7 @@ EXPORT_SYMBOL(__bh_read_batch);
void __init buffer_init(void)
{
unsigned long nrpages;
- int ret;
+ int ret, cpu;
bh_cachep = kmem_cache_create("buffer_head",
sizeof(struct buffer_head), 0,
@@ -3029,6 +3081,11 @@ void __init buffer_init(void)
SLAB_MEM_SPREAD),
NULL);
+ cpus_read_lock();
+ for_each_online_cpu(cpu)
+ rcu_assign_pointer(per_cpu(bh_lrup, cpu), per_cpu_ptr(&bh_lrus[0], cpu));
+ cpus_read_unlock();
+
/*
* Limit the bh occupancy to 10% of ZONE_NORMAL
*/
@@ -3037,4 +3094,7 @@ void __init buffer_init(void)
ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
NULL, buffer_exit_cpu_dead);
WARN_ON(ret < 0);
+ ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "fs/buffer:online",
+ NULL, buffer_cpu_online);
+ WARN_ON(ret < 0);
}
next reply other threads:[~2023-02-01 16:04 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-02-01 16:01 Marcelo Tosatti [this message]
2023-02-02 22:36 ` [PATCH v3] fs/buffer.c: update per-CPU bh_lru cache via RCU Dave Chinner
2023-02-02 22:51 ` Matthew Wilcox
2023-02-03 2:04 ` Marcelo Tosatti
2023-02-03 5:49 ` Christoph Hellwig
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=Y9qM68F+nDSYfrJ1@tpad \
--to=mtosatti@redhat.com \
--cc=axboe@kernel.dk \
--cc=frederic@kernel.org \
--cc=hch@lst.de \
--cc=leobras@redhat.com \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=ppandit@redhat.com \
--cc=viro@zeniv.linux.org.uk \
--cc=willy@infradead.org \
--cc=ypodemsk@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.