From: Thomas Gleixner <tglx@linutronix.de>
To: LKML <linux-kernel@vger.kernel.org>
Cc: Anna-Maria Behnsen <anna-maria@linutronix.de>,
Frederic Weisbecker <frederic@kernel.org>,
Benjamin Segall <bsegall@google.com>,
Eric Dumazet <edumazet@google.com>,
Andrey Vagin <avagin@openvz.org>,
Pavel Tikhomirov <ptikhomirov@virtuozzo.com>,
Peter Zijlstra <peterz@infradead.org>
Subject: Re: [patch 07/11] posix-timers: Improve hash table performance
Date: Mon, 24 Feb 2025 20:45:56 +0100 [thread overview]
Message-ID: <87msebf7nv.ffs@tglx> (raw)
In-Reply-To: <20250224101343.410413967@linutronix.de>
On Mon, Feb 24 2025 at 11:15, Thomas Gleixner wrote:
There are two more low hanging fruits:
1) The hashing is suboptimal and can simply be improved by using
jhash2(), which gives a way better distribution in the
pathological test case with 1.2M timers
2) Avoid false sharing
struct k_itimer has the hlist_node which is used for lookup in
the hash bucket and the timer lock in the same cache line.
That's obviously bad if one CPU fiddles with a timer while another
CPU is walking the hash bucket on which that timer is queued.
That can be avoided by restructuring struct k_itimer, so that
the read-mostly (only modified during setup and teardown)
fields are in the first cache line and the lock and the rest of
the fields which get written to are in cacheline 2-N.
Combo patch below.
Thanks,
tglx
---
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -179,23 +179,26 @@ static inline void posix_cputimers_init_
* @rcu: RCU head for freeing the timer.
*/
struct k_itimer {
- struct hlist_node list;
- struct hlist_node ignored_list;
+ /* 1st cacheline contains read-mostly fields */
struct hlist_node t_hash;
- spinlock_t it_lock;
- const struct k_clock *kclock;
- clockid_t it_clock;
+ struct hlist_node list;
timer_t it_id;
+ clockid_t it_clock;
+ int it_sigev_notify;
+ enum pid_type it_pid_type;
+ struct signal_struct *it_signal;
+ const struct k_clock *kclock;
+
+ /* 2nd cacheline and above contain fields which are modified regularly */
+ spinlock_t it_lock;
int it_status;
bool it_sig_periodic;
s64 it_overrun;
s64 it_overrun_last;
unsigned int it_signal_seq;
unsigned int it_sigqueue_seq;
- int it_sigev_notify;
- enum pid_type it_pid_type;
ktime_t it_interval;
- struct signal_struct *it_signal;
+ struct hlist_node ignored_list;
union {
struct pid *it_pid;
struct task_struct *it_process;
@@ -212,7 +215,7 @@ struct k_itimer {
} alarm;
} it;
struct rcu_head rcu;
-};
+} ____cacheline_aligned_in_smp;
void run_posix_cpu_timers(void);
void posix_cpu_timers_exit(struct task_struct *task);
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -11,8 +11,8 @@
*/
#include <linux/compat.h>
#include <linux/compiler.h>
-#include <linux/hash.h>
#include <linux/init.h>
+#include <linux/jhash.h>
#include <linux/interrupt.h>
#include <linux/list.h>
#include <linux/memblock.h>
@@ -48,11 +48,11 @@ struct timer_hash_bucket {
static struct {
struct timer_hash_bucket *buckets;
- unsigned long bits;
+ unsigned long mask;
} __timer_data __ro_after_init __aligned(2*sizeof(long));
#define timer_buckets (__timer_data.buckets)
-#define timer_hashbits (__timer_data.bits)
+#define timer_hashmask (__timer_data.mask)
static const struct k_clock * const posix_clocks[];
static const struct k_clock *clockid_to_kclock(const clockid_t id);
@@ -74,15 +74,15 @@ static struct k_itimer *__lock_timer(tim
__timr; \
})
-static int hash(struct signal_struct *sig, unsigned int nr)
+static struct timer_hash_bucket *hash_bucket(struct signal_struct *sig, unsigned int nr)
{
- return hash_32(hash32_ptr(sig) ^ nr, timer_hashbits);
+ return &timer_buckets[jhash2((u32 *)&sig, sizeof(sig) / sizeof(u32), nr) & timer_hashmask];
}
static struct k_itimer *posix_timer_by_id(timer_t id)
{
struct signal_struct *sig = current->signal;
- struct timer_hash_bucket *bucket = &timer_buckets[hash(sig, id)];
+ struct timer_hash_bucket *bucket = hash_bucket(sig, id);
struct k_itimer *timer;
hlist_for_each_entry_rcu(timer, &bucket->head, t_hash) {
@@ -119,7 +119,7 @@ static bool posix_timer_hashed(struct ti
static bool posix_timer_add_at(struct k_itimer *timer, struct signal_struct *sig, unsigned int id)
{
- struct timer_hash_bucket *bucket = &timer_buckets[hash(sig, id)];
+ struct timer_hash_bucket *bucket = hash_bucket(sig, id);
scoped_guard (spinlock, &bucket->lock) {
/*
@@ -260,9 +260,9 @@ static int posix_get_hrtimer_res(clockid
static __init int init_posix_timers(void)
{
- posix_timers_cache = kmem_cache_create("posix_timers_cache",
- sizeof(struct k_itimer), 0,
- SLAB_PANIC | SLAB_ACCOUNT, NULL);
+ posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof(struct k_itimer),
+ __alignof__(struct k_itimer),
+ SLAB_PANIC | SLAB_ACCOUNT, NULL);
return 0;
}
__initcall(init_posix_timers);
@@ -424,8 +424,7 @@ void posixtimer_free_timer(struct k_itim
static void posix_timer_unhash_and_free(struct k_itimer *tmr)
{
- unsigned int idx = hash(posix_sig_owner(tmr), tmr->it_id);
- struct timer_hash_bucket *bucket = &timer_buckets[idx];
+ struct timer_hash_bucket *bucket = hash_bucket(posix_sig_owner(tmr), tmr->it_id);
scoped_guard (spinlock, &bucket->lock)
hlist_del_rcu(&tmr->t_hash);
@@ -1611,7 +1610,7 @@ static int __init posixtimer_init(void)
timer_buckets = alloc_large_system_hash("posixtimers", sizeof(*timer_buckets),
size, 0, 0, &shift, NULL, size, size);
size = 1UL << shift;
- timer_hashbits = ilog2(size);
+ timer_hashmask = size - 1;
for (i = 0; i < size; i++) {
spin_lock_init(&timer_buckets[i].lock);
next prev parent reply other threads:[~2025-02-24 19:45 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-02-24 10:15 [patch 00/11] posix-timers: Rework the global hash table and provide a sane mechanism for CRIU Thomas Gleixner
2025-02-24 10:15 ` [patch 01/11] posix-timers: Initialise timer before adding it to the hash table Thomas Gleixner
2025-02-24 10:15 ` [patch 02/11] posix-timers: Add cond_resched() to posix_timer_add() search loop Thomas Gleixner
2025-02-24 10:15 ` [patch 03/11] posix-timers: Cleanup includes Thomas Gleixner
2025-02-24 10:15 ` [patch 04/11] posix-timers: Remove pointless unlock_timer() wrapper Thomas Gleixner
2025-02-24 16:21 ` Peter Zijlstra
2025-02-24 18:43 ` Thomas Gleixner
2025-02-24 21:55 ` Peter Zijlstra
2025-02-24 10:15 ` [patch 05/11] posix-timers: Rework timer removal Thomas Gleixner
2025-02-24 10:15 ` [patch 06/11] posix-timers: Make signal_struct::next_posix_timer_id an atomic_t Thomas Gleixner
2025-02-24 13:20 ` Peter Zijlstra
2025-02-24 13:34 ` Eric Dumazet
2025-02-24 19:38 ` Thomas Gleixner
2025-02-24 10:15 ` [patch 07/11] posix-timers: Improve hash table performance Thomas Gleixner
2025-02-24 19:45 ` Thomas Gleixner [this message]
2025-02-24 10:15 ` [patch 08/11] posix-timers: Make per process list RCU safe Thomas Gleixner
2025-02-24 10:15 ` [patch 09/11] posix-timers: Dont iterate /proc/$PID/timers with sighand::siglock held Thomas Gleixner
2025-02-24 10:15 ` [patch 10/11] posix-timers: Provide a mechanism to allocate a given timer ID Thomas Gleixner
2025-02-24 10:15 ` [patch 11/11] selftests/timers/posix-timers: Add a test for exact allocation mode Thomas Gleixner
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=87msebf7nv.ffs@tglx \
--to=tglx@linutronix.de \
--cc=anna-maria@linutronix.de \
--cc=avagin@openvz.org \
--cc=bsegall@google.com \
--cc=edumazet@google.com \
--cc=frederic@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=peterz@infradead.org \
--cc=ptikhomirov@virtuozzo.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox