* Re: [patch]smp: make smp_call_function_many use the similar logic like smp_call_function_single
2012-12-24 7:03 [patch]smp: make smp_call_function_many use the similar logic like smp_call_function_single Shaohua Li
@ 2013-01-07 8:14 ` Shaohua Li
2013-01-08 22:38 ` Andrew Morton
1 sibling, 0 replies; 3+ messages in thread
From: Shaohua Li @ 2013-01-07 8:14 UTC (permalink / raw)
To: linux-kernel; +Cc: akpm, a.p.zijlstra, rostedt, axboe
ping!
On Mon, Dec 24, 2012 at 03:03:30PM +0800, Shaohua Li wrote:
>
> I'm testing a swapout workload on a two-socket Xeon machine. The workload has 10
> threads, each thread sequentially accesses a separate memory region. TLB flush
> overhead is very big in the workload. For each page, page reclaim needs to move it
> from the active lru list and then unmap it. Both need a TLB flush. And this is a
> multithreaded workload, so TLB flushes happen on 10 CPUs. On X86, TLB flush uses
> the generic smp_call_function. So this workload stresses smp_call_function_many
> heavily.
>
> Without patch, perf shows:
> + 24.49% [k] generic_smp_call_function_interrupt
> - 21.72% [k] _raw_spin_lock
> - _raw_spin_lock
> + 79.80% __page_check_address
> + 6.42% generic_smp_call_function_interrupt
> + 3.31% get_swap_page
> + 2.37% free_pcppages_bulk
> + 1.75% handle_pte_fault
> + 1.54% put_super
> + 1.41% grab_super_passive
> + 1.36% __swap_duplicate
> + 0.68% blk_flush_plug_list
> + 0.62% swap_info_get
> + 6.55% [k] flush_tlb_func
> + 6.46% [k] smp_call_function_many
> + 5.09% [k] call_function_interrupt
> + 4.75% [k] default_send_IPI_mask_sequence_phys
> + 2.18% [k] find_next_bit
>
> swapout throughput is around 1300M/s.
>
> With the patch, perf shows:
> - 27.23% [k] _raw_spin_lock
> - _raw_spin_lock
> + 80.53% __page_check_address
> + 8.39% generic_smp_call_function_single_interrupt
> + 2.44% get_swap_page
> + 1.76% free_pcppages_bulk
> + 1.40% handle_pte_fault
> + 1.15% __swap_duplicate
> + 1.05% put_super
> + 0.98% grab_super_passive
> + 0.86% blk_flush_plug_list
> + 0.57% swap_info_get
> + 8.25% [k] default_send_IPI_mask_sequence_phys
> + 7.55% [k] call_function_interrupt
> + 7.47% [k] smp_call_function_many
> + 7.25% [k] flush_tlb_func
> + 3.81% [k] _raw_spin_lock_irqsave
> + 3.78% [k] generic_smp_call_function_single_interrupt
>
> swapout throughput is around 1400M/s. So there is around a 7% improvement, and
> total cpu utilization doesn't change.
>
> Without the patch, cfd_data is shared by all CPUs.
> generic_smp_call_function_interrupt does read/write cfd_data several times
> which will create a lot of cache ping-pong. With the patch, the data becomes
> per-cpu. The ping-pong is avoided. And from the perf data, this doesn't cause
> contention on the call_single_queue lock.
>
> Next step is to remove generic_smp_call_function_interrupt from arch code.
>
> Signed-off-by: Shaohua Li <shli@fusionio.com>
> ---
> include/linux/smp.h | 3
> kernel/smp.c | 184 ++++++++--------------------------------------------
> 2 files changed, 32 insertions(+), 155 deletions(-)
>
> Index: linux/kernel/smp.c
> ===================================================================
> --- linux.orig/kernel/smp.c 2012-12-24 11:37:28.150604463 +0800
> +++ linux/kernel/smp.c 2012-12-24 14:57:11.807949527 +0800
> @@ -16,22 +16,12 @@
> #include "smpboot.h"
>
> #ifdef CONFIG_USE_GENERIC_SMP_HELPERS
> -static struct {
> - struct list_head queue;
> - raw_spinlock_t lock;
> -} call_function __cacheline_aligned_in_smp =
> - {
> - .queue = LIST_HEAD_INIT(call_function.queue),
> - .lock = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock),
> - };
> -
> enum {
> CSD_FLAG_LOCK = 0x01,
> };
>
> struct call_function_data {
> - struct call_single_data csd;
> - atomic_t refs;
> + struct call_single_data __percpu *csd;
> cpumask_var_t cpumask;
> };
>
> @@ -56,6 +46,11 @@ hotplug_cfd(struct notifier_block *nfb,
> if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
> cpu_to_node(cpu)))
> return notifier_from_errno(-ENOMEM);
> + cfd->csd = alloc_percpu(struct call_single_data);
> + if (!cfd->csd) {
> + free_cpumask_var(cfd->cpumask);
> + return notifier_from_errno(-ENOMEM);
> + }
> break;
>
> #ifdef CONFIG_HOTPLUG_CPU
> @@ -65,6 +60,7 @@ hotplug_cfd(struct notifier_block *nfb,
> case CPU_DEAD:
> case CPU_DEAD_FROZEN:
> free_cpumask_var(cfd->cpumask);
> + free_percpu(cfd->csd);
> break;
> #endif
> };
> @@ -166,85 +162,6 @@ void generic_exec_single(int cpu, struct
> }
>
> /*
> - * Invoked by arch to handle an IPI for call function. Must be called with
> - * interrupts disabled.
> - */
> -void generic_smp_call_function_interrupt(void)
> -{
> - struct call_function_data *data;
> - int cpu = smp_processor_id();
> -
> - /*
> - * Shouldn't receive this interrupt on a cpu that is not yet online.
> - */
> - WARN_ON_ONCE(!cpu_online(cpu));
> -
> - /*
> - * Ensure entry is visible on call_function_queue after we have
> - * entered the IPI. See comment in smp_call_function_many.
> - * If we don't have this, then we may miss an entry on the list
> - * and never get another IPI to process it.
> - */
> - smp_mb();
> -
> - /*
> - * It's ok to use list_for_each_rcu() here even though we may
> - * delete 'pos', since list_del_rcu() doesn't clear ->next
> - */
> - list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
> - int refs;
> - smp_call_func_t func;
> -
> - /*
> - * Since we walk the list without any locks, we might
> - * see an entry that was completed, removed from the
> - * list and is in the process of being reused.
> - *
> - * We must check that the cpu is in the cpumask before
> - * checking the refs, and both must be set before
> - * executing the callback on this cpu.
> - */
> -
> - if (!cpumask_test_cpu(cpu, data->cpumask))
> - continue;
> -
> - smp_rmb();
> -
> - if (atomic_read(&data->refs) == 0)
> - continue;
> -
> - func = data->csd.func; /* save for later warn */
> - func(data->csd.info);
> -
> - /*
> - * If the cpu mask is not still set then func enabled
> - * interrupts (BUG), and this cpu took another smp call
> - * function interrupt and executed func(info) twice
> - * on this cpu. That nested execution decremented refs.
> - */
> - if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
> - WARN(1, "%pf enabled interrupts and double executed\n", func);
> - continue;
> - }
> -
> - refs = atomic_dec_return(&data->refs);
> - WARN_ON(refs < 0);
> -
> - if (refs)
> - continue;
> -
> - WARN_ON(!cpumask_empty(data->cpumask));
> -
> - raw_spin_lock(&call_function.lock);
> - list_del_rcu(&data->csd.list);
> - raw_spin_unlock(&call_function.lock);
> -
> - csd_unlock(&data->csd);
> - }
> -
> -}
> -
> -/*
> * Invoked by arch to handle an IPI for call function single. Must be
> * called from the arch with interrupts disabled.
> */
> @@ -448,8 +365,7 @@ void smp_call_function_many(const struct
> smp_call_func_t func, void *info, bool wait)
> {
> struct call_function_data *data;
> - unsigned long flags;
> - int refs, cpu, next_cpu, this_cpu = smp_processor_id();
> + int cpu, next_cpu, this_cpu = smp_processor_id();
>
> /*
> * Can deadlock when called with interrupts disabled.
> @@ -481,79 +397,39 @@ void smp_call_function_many(const struct
> }
>
> data = &__get_cpu_var(cfd_data);
> - csd_lock(&data->csd);
> -
> - /* This BUG_ON verifies our reuse assertions and can be removed */
> - BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
> -
> - /*
> - * The global call function queue list add and delete are protected
> - * by a lock, but the list is traversed without any lock, relying
> - * on the rcu list add and delete to allow safe concurrent traversal.
> - * We reuse the call function data without waiting for any grace
> - * period after some other cpu removes it from the global queue.
> - * This means a cpu might find our data block as it is being
> - * filled out.
> - *
> - * We hold off the interrupt handler on the other cpu by
> - * ordering our writes to the cpu mask vs our setting of the
> - * refs counter. We assert only the cpu owning the data block
> - * will set a bit in cpumask, and each bit will only be cleared
> - * by the subject cpu. Each cpu must first find its bit is
> - * set and then check that refs is set indicating the element is
> - * ready to be processed, otherwise it must skip the entry.
> - *
> - * On the previous iteration refs was set to 0 by another cpu.
> - * To avoid the use of transitivity, set the counter to 0 here
> - * so the wmb will pair with the rmb in the interrupt handler.
> - */
> - atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */
>
> - data->csd.func = func;
> - data->csd.info = info;
> -
> - /* Ensure 0 refs is visible before mask. Also orders func and info */
> - smp_wmb();
> -
> - /* We rely on the "and" being processed before the store */
> cpumask_and(data->cpumask, mask, cpu_online_mask);
> cpumask_clear_cpu(this_cpu, data->cpumask);
> - refs = cpumask_weight(data->cpumask);
>
> /* Some callers race with other cpus changing the passed mask */
> - if (unlikely(!refs)) {
> - csd_unlock(&data->csd);
> + if (unlikely(!cpumask_weight(data->cpumask)))
> return;
> - }
> -
> - raw_spin_lock_irqsave(&call_function.lock, flags);
> - /*
> - * Place entry at the _HEAD_ of the list, so that any cpu still
> - * observing the entry in generic_smp_call_function_interrupt()
> - * will not miss any other list entries:
> - */
> - list_add_rcu(&data->csd.list, &call_function.queue);
> - /*
> - * We rely on the wmb() in list_add_rcu to complete our writes
> - * to the cpumask before this write to refs, which indicates
> - * data is on the list and is ready to be processed.
> - */
> - atomic_set(&data->refs, refs);
> - raw_spin_unlock_irqrestore(&call_function.lock, flags);
>
> - /*
> - * Make the list addition visible before sending the ipi.
> - * (IPIs must obey or appear to obey normal Linux cache
> - * coherency rules -- see comment in generic_exec_single).
> - */
> - smp_mb();
> + for_each_cpu(cpu, data->cpumask) {
> + struct call_single_data *csd = per_cpu_ptr(data->csd, cpu);
> + struct call_single_queue *dst =
> + &per_cpu(call_single_queue, cpu);
> + unsigned long flags;
> +
> + csd_lock(csd);
> + csd->func = func;
> + csd->info = info;
> +
> + raw_spin_lock_irqsave(&dst->lock, flags);
> + list_add_tail(&csd->list, &dst->list);
> + raw_spin_unlock_irqrestore(&dst->lock, flags);
> + }
>
> /* Send a message to all CPUs in the map */
> arch_send_call_function_ipi_mask(data->cpumask);
>
> - /* Optionally wait for the CPUs to complete */
> - if (wait)
> - csd_lock_wait(&data->csd);
> + if (wait) {
> + for_each_cpu(cpu, data->cpumask) {
> + struct call_single_data *csd =
> + per_cpu_ptr(data->csd, cpu);
> + csd_lock_wait(csd);
> + }
> + }
> }
> EXPORT_SYMBOL(smp_call_function_many);
>
> Index: linux/include/linux/smp.h
> ===================================================================
> --- linux.orig/include/linux/smp.h 2012-12-24 11:38:28.901840885 +0800
> +++ linux/include/linux/smp.h 2012-12-24 11:39:16.557243069 +0800
> @@ -89,7 +89,8 @@ void kick_all_cpus_sync(void);
> #ifdef CONFIG_USE_GENERIC_SMP_HELPERS
> void __init call_function_init(void);
> void generic_smp_call_function_single_interrupt(void);
> -void generic_smp_call_function_interrupt(void);
> +#define generic_smp_call_function_interrupt \
> + generic_smp_call_function_single_interrupt
> #else
> static inline void call_function_init(void) { }
> #endif
^ permalink raw reply [flat|nested] 3+ messages in thread* Re: [patch]smp: make smp_call_function_many use the similar logic like smp_call_function_single
2012-12-24 7:03 [patch]smp: make smp_call_function_many use the similar logic like smp_call_function_single Shaohua Li
2013-01-07 8:14 ` Shaohua Li
@ 2013-01-08 22:38 ` Andrew Morton
1 sibling, 0 replies; 3+ messages in thread
From: Andrew Morton @ 2013-01-08 22:38 UTC (permalink / raw)
To: Shaohua Li; +Cc: linux-kernel, a.p.zijlstra, rostedt, axboe
On Mon, 24 Dec 2012 15:03:30 +0800
Shaohua Li <shli@kernel.org> wrote:
>
> I'm testing a swapout workload on a two-socket Xeon machine. The workload has 10
> threads, each thread sequentially accesses a separate memory region. TLB flush
> overhead is very big in the workload. For each page, page reclaim needs to move it
> from the active lru list and then unmap it. Both need a TLB flush. And this is a
> multithreaded workload, so TLB flushes happen on 10 CPUs. On X86, TLB flush uses
> the generic smp_call_function. So this workload stresses smp_call_function_many
> heavily.
>
> Without patch, perf shows:
> + 24.49% [k] generic_smp_call_function_interrupt
> - 21.72% [k] _raw_spin_lock
> - _raw_spin_lock
> + 79.80% __page_check_address
> + 6.42% generic_smp_call_function_interrupt
> + 3.31% get_swap_page
> + 2.37% free_pcppages_bulk
> + 1.75% handle_pte_fault
> + 1.54% put_super
> + 1.41% grab_super_passive
> + 1.36% __swap_duplicate
> + 0.68% blk_flush_plug_list
> + 0.62% swap_info_get
> + 6.55% [k] flush_tlb_func
> + 6.46% [k] smp_call_function_many
> + 5.09% [k] call_function_interrupt
> + 4.75% [k] default_send_IPI_mask_sequence_phys
> + 2.18% [k] find_next_bit
>
> swapout throughput is around 1300M/s.
>
> With the patch, perf shows:
> - 27.23% [k] _raw_spin_lock
> - _raw_spin_lock
> + 80.53% __page_check_address
> + 8.39% generic_smp_call_function_single_interrupt
> + 2.44% get_swap_page
> + 1.76% free_pcppages_bulk
> + 1.40% handle_pte_fault
> + 1.15% __swap_duplicate
> + 1.05% put_super
> + 0.98% grab_super_passive
> + 0.86% blk_flush_plug_list
> + 0.57% swap_info_get
> + 8.25% [k] default_send_IPI_mask_sequence_phys
> + 7.55% [k] call_function_interrupt
> + 7.47% [k] smp_call_function_many
> + 7.25% [k] flush_tlb_func
> + 3.81% [k] _raw_spin_lock_irqsave
> + 3.78% [k] generic_smp_call_function_single_interrupt
>
> swapout throughput is around 1400M/s. So there is around a 7% improvement, and
> total cpu utilization doesn't change.
Boy, that was a biiiig cleanup. I hope it works ;)
The naming of locals in that code makes my eyes hurt. How's this?
From: Andrew Morton <akpm@linux-foundation.org>
Subject: kernel/smp.c: cleanups
We sometimes use "struct call_single_data *data" and sometimes "struct
call_single_data *csd". Use "cfd" consistently.
We sometimes use "struct call_function_data *data" and sometimes "struct
call_function_data *cfd". Use "cfd" consistently.
Also, avoid some 80-col layout tricks.
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Shaohua Li <shli@fusionio.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
kernel/smp.c | 87 ++++++++++++++++++++++++-------------------------
1 file changed, 44 insertions(+), 43 deletions(-)
diff -puN include/linux/smp.h~kernel-smpc-cleanups include/linux/smp.h
diff -puN kernel/smp.c~kernel-smpc-cleanups kernel/smp.c
--- a/kernel/smp.c~kernel-smpc-cleanups
+++ a/kernel/smp.c
@@ -95,16 +95,16 @@ void __init call_function_init(void)
* previous function call. For multi-cpu calls its even more interesting
* as we'll have to ensure no other cpu is observing our csd.
*/
-static void csd_lock_wait(struct call_single_data *data)
+static void csd_lock_wait(struct call_single_data *csd)
{
- while (data->flags & CSD_FLAG_LOCK)
+ while (csd->flags & CSD_FLAG_LOCK)
cpu_relax();
}
-static void csd_lock(struct call_single_data *data)
+static void csd_lock(struct call_single_data *csd)
{
- csd_lock_wait(data);
- data->flags = CSD_FLAG_LOCK;
+ csd_lock_wait(csd);
+ csd->flags = CSD_FLAG_LOCK;
/*
* prevent CPU from reordering the above assignment
@@ -114,16 +114,16 @@ static void csd_lock(struct call_single_
smp_mb();
}
-static void csd_unlock(struct call_single_data *data)
+static void csd_unlock(struct call_single_data *csd)
{
- WARN_ON(!(data->flags & CSD_FLAG_LOCK));
+ WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
/*
* ensure we're all done before releasing data:
*/
smp_mb();
- data->flags &= ~CSD_FLAG_LOCK;
+ csd->flags &= ~CSD_FLAG_LOCK;
}
/*
@@ -132,7 +132,7 @@ static void csd_unlock(struct call_singl
* ->func, ->info, and ->flags set.
*/
static
-void generic_exec_single(int cpu, struct call_single_data *data, int wait)
+void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
{
struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
unsigned long flags;
@@ -140,7 +140,7 @@ void generic_exec_single(int cpu, struct
raw_spin_lock_irqsave(&dst->lock, flags);
ipi = list_empty(&dst->list);
- list_add_tail(&data->list, &dst->list);
+ list_add_tail(&csd->list, &dst->list);
raw_spin_unlock_irqrestore(&dst->lock, flags);
/*
@@ -158,7 +158,7 @@ void generic_exec_single(int cpu, struct
arch_send_call_function_single_ipi(cpu);
if (wait)
- csd_lock_wait(data);
+ csd_lock_wait(csd);
}
/*
@@ -168,7 +168,6 @@ void generic_exec_single(int cpu, struct
void generic_smp_call_function_single_interrupt(void)
{
struct call_single_queue *q = &__get_cpu_var(call_single_queue);
- unsigned int data_flags;
LIST_HEAD(list);
/*
@@ -181,25 +180,26 @@ void generic_smp_call_function_single_in
raw_spin_unlock(&q->lock);
while (!list_empty(&list)) {
- struct call_single_data *data;
+ struct call_single_data *csd;
+ unsigned int csd_flags;
- data = list_entry(list.next, struct call_single_data, list);
- list_del(&data->list);
+ csd = list_entry(list.next, struct call_single_data, list);
+ list_del(&csd->list);
/*
- * 'data' can be invalid after this call if flags == 0
+ * 'csd' can be invalid after this call if flags == 0
* (when called through generic_exec_single()),
* so save them away before making the call:
*/
- data_flags = data->flags;
+ csd_flags = csd->flags;
- data->func(data->info);
+ csd->func(csd->info);
/*
* Unlocked CSDs are valid through generic_exec_single():
*/
- if (data_flags & CSD_FLAG_LOCK)
- csd_unlock(data);
+ if (csd_flags & CSD_FLAG_LOCK)
+ csd_unlock(csd);
}
}
@@ -244,16 +244,16 @@ int smp_call_function_single(int cpu, sm
local_irq_restore(flags);
} else {
if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
- struct call_single_data *data = &d;
+ struct call_single_data *csd = &d;
if (!wait)
- data = &__get_cpu_var(csd_data);
+ csd = &__get_cpu_var(csd_data);
- csd_lock(data);
+ csd_lock(csd);
- data->func = func;
- data->info = info;
- generic_exec_single(cpu, data, wait);
+ csd->func = func;
+ csd->info = info;
+ generic_exec_single(cpu, csd, wait);
} else {
err = -ENXIO; /* CPU not online */
}
@@ -320,7 +320,7 @@ EXPORT_SYMBOL_GPL(smp_call_function_any)
* pre-allocated data structure. Useful for embedding @data inside
* other structures, for instance.
*/
-void __smp_call_function_single(int cpu, struct call_single_data *data,
+void __smp_call_function_single(int cpu, struct call_single_data *csd,
int wait)
{
unsigned int this_cpu;
@@ -338,11 +338,11 @@ void __smp_call_function_single(int cpu,
if (cpu == this_cpu) {
local_irq_save(flags);
- data->func(data->info);
+ csd->func(csd->info);
local_irq_restore(flags);
} else {
- csd_lock(data);
- generic_exec_single(cpu, data, wait);
+ csd_lock(csd);
+ generic_exec_single(cpu, csd, wait);
}
put_cpu();
}
@@ -364,7 +364,7 @@ void __smp_call_function_single(int cpu,
void smp_call_function_many(const struct cpumask *mask,
smp_call_func_t func, void *info, bool wait)
{
- struct call_function_data *data;
+ struct call_function_data *cfd;
int cpu, next_cpu, this_cpu = smp_processor_id();
/*
@@ -396,21 +396,21 @@ void smp_call_function_many(const struct
return;
}
- data = &__get_cpu_var(cfd_data);
+ cfd = &__get_cpu_var(cfd_data);
- cpumask_and(data->cpumask, mask, cpu_online_mask);
- cpumask_clear_cpu(this_cpu, data->cpumask);
+ cpumask_and(cfd->cpumask, mask, cpu_online_mask);
+ cpumask_clear_cpu(this_cpu, cfd->cpumask);
/* Some callers race with other cpus changing the passed mask */
- if (unlikely(!cpumask_weight(data->cpumask)))
+ if (unlikely(!cpumask_weight(cfd->cpumask)))
return;
- for_each_cpu(cpu, data->cpumask) {
- struct call_single_data *csd = per_cpu_ptr(data->csd, cpu);
- struct call_single_queue *dst =
- &per_cpu(call_single_queue, cpu);
+ for_each_cpu(cpu, cfd->cpumask) {
+ struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
+ struct call_single_queue *dst;
unsigned long flags;
+ dst = &per_cpu(call_single_queue, cpu);
csd_lock(csd);
csd->func = func;
csd->info = info;
@@ -421,12 +421,13 @@ void smp_call_function_many(const struct
}
/* Send a message to all CPUs in the map */
- arch_send_call_function_ipi_mask(data->cpumask);
+ arch_send_call_function_ipi_mask(cfd->cpumask);
if (wait) {
- for_each_cpu(cpu, data->cpumask) {
- struct call_single_data *csd =
- per_cpu_ptr(data->csd, cpu);
+ for_each_cpu(cpu, cfd->cpumask) {
+ struct call_single_data *csd;
+
+ csd = per_cpu_ptr(cfd->csd, cpu);
csd_lock_wait(csd);
}
}
_
^ permalink raw reply [flat|nested] 3+ messages in thread