* [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
@ 2008-08-22 0:29 Jeremy Fitzhardinge
2008-08-22 1:53 ` Nick Piggin
2008-08-22 6:28 ` Ingo Molnar
0 siblings, 2 replies; 33+ messages in thread
From: Jeremy Fitzhardinge @ 2008-08-22 0:29 UTC (permalink / raw)
To: Ingo Molnar
Cc: Nick Piggin, Andi Kleen, Pallipadi, Venkatesh, Suresh Siddha,
Jens Axboe, Rusty Russell, Linux Kernel Mailing List
RCU can only control the lifetime of allocated memory blocks, which
forces all the call structures to be allocated. This is expensive
compared to allocating them on the stack, which is the common case for
synchronous calls.
This patch takes a different approach. Rather than using RCU, the
queues are managed under rwlocks. Adding or removing from the queue
requires holding the lock for writing, but multiple CPUs can walk the
queues to process function calls under read locks. In the common
case, where the structures are stack allocated, the calling CPU need
only wait for its call to be done, take the lock for writing and
remove the call structure.
Lock contention - particularly write vs read - is reduced by using
multiple queues.
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
kernel/smp.c | 140 +++++++++++++++++++++++-----------------------------------
1 file changed, 56 insertions(+), 84 deletions(-)
===================================================================
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -7,8 +7,6 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/percpu.h>
-#include <linux/rcupdate.h>
-#include <linux/rculist.h>
#include <linux/smp.h>
#include <asm/atomic.h>
@@ -26,7 +24,7 @@
static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
struct ____cacheline_aligned queue {
struct list_head list;
- spinlock_t lock;
+ rwlock_t rwlock;
};
static __cacheline_aligned struct queue call_function_queues[NQUEUES];
@@ -40,7 +38,7 @@
struct call_single_data csd;
atomic_t refs;
cpumask_t cpumask;
- struct rcu_head rcu_head;
+ struct list_head cull_list;
};
struct call_single_queue {
@@ -98,15 +96,6 @@
csd_flag_wait(data);
}
-static void rcu_free_call_data(struct rcu_head *head)
-{
- struct call_function_data *data;
-
- data = container_of(head, struct call_function_data, rcu_head);
-
- kfree(data);
-}
-
/*
* Invoked by arch to handle an IPI for call function. Must be called with
* interrupts disabled.
@@ -115,17 +104,14 @@
{
struct call_function_data *data;
struct queue *queue;
+ LIST_HEAD(cull_list);
int cpu = get_cpu();
queue = &call_function_queues[queue_no];
-
- /*
- * It's ok to use list_for_each_rcu() here even though we may delete
- * 'pos', since list_del_rcu() doesn't clear ->next
- */
- rcu_read_lock();
- list_for_each_entry_rcu(data, &queue->list, csd.list) {
- if (!cpu_isset(cpu, data->cpumask))
+
+ read_lock(&queue->rwlock);
+ list_for_each_entry(data, &queue->list, csd.list) {
+ if (!cpu_isset(cpu, data->cpumask))
continue;
data->csd.func(data->csd.info);
@@ -134,22 +120,36 @@
if (!atomic_dec_and_test(&data->refs))
continue;
- spin_lock(&queue->lock);
- list_del_rcu(&data->csd.list);
- spin_unlock(&queue->lock);
-
if (data->csd.flags & CSD_FLAG_WAIT) {
/*
- * serialize stores to data with the flag clear
- * and wakeup
+ * Serialize stores to data with the flag
+ * clear and wakeup. Waiter will remove us
+ * from the list.
*/
smp_wmb();
data->csd.flags &= ~CSD_FLAG_WAIT;
+ } else {
+ /*
+ * If there's no waiter, then the data must
+ * have been heap-allocated.
+ */
+ BUG_ON(!(data->csd.flags & CSD_FLAG_ALLOC));
+
+ list_add_tail(&data->cull_list, &cull_list);
}
- if (data->csd.flags & CSD_FLAG_ALLOC)
- call_rcu(&data->rcu_head, rcu_free_call_data);
}
- rcu_read_unlock();
+ read_unlock(&queue->rwlock);
+
+ if (!list_empty(&cull_list)) {
+ struct call_function_data *next;
+
+ write_lock(&queue->rwlock);
+ list_for_each_entry_safe(data, next, &cull_list, cull_list) {
+ list_del(&data->csd.list);
+ kfree(data);
+ }
+ write_unlock(&queue->rwlock);
+ }
put_cpu();
}
@@ -271,42 +271,6 @@
generic_exec_single(cpu, data);
}
-/* Dummy function */
-static void quiesce_dummy(void *unused)
-{
-}
-
-/*
- * Ensure stack based data used in call function mask is safe to free.
- *
- * This is needed by smp_call_function_mask when using on-stack data, because
- * a single call function queue is shared by all CPUs, and any CPU may pick up
- * the data item on the queue at any time before it is deleted. So we need to
- * ensure that all CPUs have transitioned through a quiescent state after
- * this call.
- *
- * This is a very slow function, implemented by sending synchronous IPIs to
- * all possible CPUs. For this reason, we have to alloc data rather than use
- * stack based data even in the case of synchronous calls. The stack based
- * data is then just used for deadlock/oom fallback which will be very rare.
- *
- * If a faster scheme can be made, we could go back to preferring stack based
- * data -- the data allocation/free is non-zero cost.
- */
-static void smp_call_function_mask_quiesce_stack(cpumask_t mask)
-{
- struct call_single_data data;
- int cpu;
-
- data.func = quiesce_dummy;
- data.info = NULL;
-
- for_each_cpu_mask(cpu, mask) {
- data.flags = CSD_FLAG_WAIT;
- generic_exec_single(cpu, &data);
- }
-}
-
/**
* smp_call_function_mask(): Run a function on a set of other CPUs.
* @mask: The set of cpus to run on.
@@ -332,7 +296,6 @@
cpumask_t allbutself;
unsigned long flags;
int cpu, num_cpus;
- int slowpath = 0;
unsigned queue_no;
struct queue *queue;
@@ -359,16 +322,20 @@
return smp_call_function_single(cpu, func, info, wait);
}
- data = kmalloc(sizeof(*data), GFP_ATOMIC);
- if (data) {
- data->csd.flags = CSD_FLAG_ALLOC;
- if (wait)
- data->csd.flags |= CSD_FLAG_WAIT;
- } else {
+ /*
+ * Allocate data if it's an async call, otherwise use stack.
+ * If the allocation fails, then convert it to a sync call and
+ * use the stack anyway.
+ */
+ if (!wait) {
+ data = kmalloc(sizeof(*data), GFP_ATOMIC);
+ if (data)
+ data->csd.flags = CSD_FLAG_ALLOC;
+ }
+ if (!data) {
data = &d;
data->csd.flags = CSD_FLAG_WAIT;
wait = 1;
- slowpath = 1;
}
data->csd.func = func;
@@ -376,9 +343,9 @@
atomic_set(&data->refs, num_cpus);
data->cpumask = mask;
- spin_lock_irqsave(&queue->lock, flags);
- list_add_tail_rcu(&data->csd.list, &queue->list);
- spin_unlock_irqrestore(&queue->lock, flags);
+ write_lock_irqsave(&queue->rwlock, flags);
+ list_add_tail(&data->csd.list, &queue->list);
+ write_unlock_irqrestore(&queue->rwlock, flags);
/* Send a message to all CPUs in the map */
arch_send_call_function_ipi(mask);
@@ -386,8 +353,13 @@
/* optionally wait for the CPUs to complete */
if (wait) {
csd_flag_wait(&data->csd);
- if (unlikely(slowpath))
- smp_call_function_mask_quiesce_stack(mask);
+
+ write_lock_irqsave(&queue->rwlock, flags);
+ list_del(&data->csd.list);
+ write_unlock_irqrestore(&queue->rwlock, flags);
+
+ /* We should never wait for allocated data. */
+ BUG_ON(data->csd.flags & CSD_FLAG_ALLOC);
}
return 0;
@@ -425,7 +397,7 @@
int i;
for(i = 0; i < NQUEUES; i++)
- spin_lock(&call_function_queues[i].lock);
+ write_lock(&call_function_queues[i].rwlock);
}
void ipi_call_unlock(void)
@@ -433,7 +405,7 @@
int i;
for(i = 0; i < NQUEUES; i++)
- spin_unlock(&call_function_queues[i].lock);
+ write_unlock(&call_function_queues[i].rwlock);
}
void ipi_call_lock_irq(void)
@@ -443,7 +415,7 @@
spin_lock_irq(&queues_lock);
for(i = 0; i < NQUEUES; i++)
- spin_lock_nest_lock(&call_function_queues[i].lock, &queues_lock);
+ write_lock(&call_function_queues[i].rwlock);
}
void ipi_call_unlock_irq(void)
@@ -451,7 +423,7 @@
int i;
for(i = 0; i < NQUEUES; i++)
- spin_unlock(&call_function_queues[i].lock);
+ write_unlock(&call_function_queues[i].rwlock);
spin_unlock_irq(&queues_lock);
}
@@ -463,7 +435,7 @@
for(i = 0; i < NQUEUES; i++) {
INIT_LIST_HEAD(&call_function_queues[i].list);
- spin_lock_init(&call_function_queues[i].lock);
+ rwlock_init(&call_function_queues[i].rwlock);
}
printk(KERN_INFO "smp function calls: using %d/%d queues\n",
^ permalink raw reply [flat|nested] 33+ messages in thread* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-22 0:29 [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu Jeremy Fitzhardinge
@ 2008-08-22 1:53 ` Nick Piggin
2008-08-22 6:28 ` Ingo Molnar
1 sibling, 0 replies; 33+ messages in thread
From: Nick Piggin @ 2008-08-22 1:53 UTC (permalink / raw)
To: Jeremy Fitzhardinge
Cc: Ingo Molnar, Andi Kleen, Pallipadi, Venkatesh, Suresh Siddha,
Jens Axboe, Rusty Russell, Linux Kernel Mailing List
On Friday 22 August 2008 10:29, Jeremy Fitzhardinge wrote:
> RCU can only control the lifetime of allocated memory blocks, which
> forces all the call structures to be allocated. This is expensive
> compared to allocating them on the stack, which is the common case for
> synchronous calls.
>
> This patch takes a different approach. Rather than using RCU, the
> queues are managed under rwlocks. Adding or removing from the queue
> requires holding the lock for writing, but multiple CPUs can walk the
> queues to process function calls under read locks. In the common
> case, where the structures are stack allocated, the calling CPU need
> only wait for its call to be done, take the lock for writing and
> remove the call structure.
>
> Lock contention - particularly write vs read - is reduced by using
> multiple queues.
Could be reasonable. Still, it's adding like 4 or 5 more atomic
operations, which will be approaching the cost of a slab allocation.
Another approach to reduce slab allocation cost would be to do the
call_rcu at the caller, in the wait case so we hit the CPU-local
freeing path in slab.
>
> Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
> ---
> kernel/smp.c | 140
> +++++++++++++++++++++++----------------------------------- 1 file changed,
> 56 insertions(+), 84 deletions(-)
>
> ===================================================================
> --- a/kernel/smp.c
> +++ b/kernel/smp.c
> @@ -7,8 +7,6 @@
> #include <linux/init.h>
> #include <linux/module.h>
> #include <linux/percpu.h>
> -#include <linux/rcupdate.h>
> -#include <linux/rculist.h>
> #include <linux/smp.h>
> #include <asm/atomic.h>
>
> @@ -26,7 +24,7 @@
> static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
> struct ____cacheline_aligned queue {
> struct list_head list;
> - spinlock_t lock;
> + rwlock_t rwlock;
> };
>
> static __cacheline_aligned struct queue call_function_queues[NQUEUES];
> @@ -40,7 +38,7 @@
> struct call_single_data csd;
> atomic_t refs;
> cpumask_t cpumask;
> - struct rcu_head rcu_head;
> + struct list_head cull_list;
> };
>
> struct call_single_queue {
> @@ -98,15 +96,6 @@
> csd_flag_wait(data);
> }
>
> -static void rcu_free_call_data(struct rcu_head *head)
> -{
> - struct call_function_data *data;
> -
> - data = container_of(head, struct call_function_data, rcu_head);
> -
> - kfree(data);
> -}
> -
> /*
> * Invoked by arch to handle an IPI for call function. Must be called with
> * interrupts disabled.
> @@ -115,17 +104,14 @@
> {
> struct call_function_data *data;
> struct queue *queue;
> + LIST_HEAD(cull_list);
> int cpu = get_cpu();
>
> queue = &call_function_queues[queue_no];
> -
> - /*
> - * It's ok to use list_for_each_rcu() here even though we may delete
> - * 'pos', since list_del_rcu() doesn't clear ->next
> - */
> - rcu_read_lock();
> - list_for_each_entry_rcu(data, &queue->list, csd.list) {
> - if (!cpu_isset(cpu, data->cpumask))
> +
> + read_lock(&queue->rwlock);
> + list_for_each_entry(data, &queue->list, csd.list) {
> + if (!cpu_isset(cpu, data->cpumask))
> continue;
>
> data->csd.func(data->csd.info);
> @@ -134,22 +120,36 @@
> if (!atomic_dec_and_test(&data->refs))
> continue;
>
> - spin_lock(&queue->lock);
> - list_del_rcu(&data->csd.list);
> - spin_unlock(&queue->lock);
> -
> if (data->csd.flags & CSD_FLAG_WAIT) {
> /*
> - * serialize stores to data with the flag clear
> - * and wakeup
> + * Serialize stores to data with the flag
> + * clear and wakeup. Waiter will remove us
> + * from the list.
> */
> smp_wmb();
> data->csd.flags &= ~CSD_FLAG_WAIT;
> + } else {
> + /*
> + * If there's no waiter, then the data must
> + * have been heap-allocated.
> + */
> + BUG_ON(!(data->csd.flags & CSD_FLAG_ALLOC));
> +
> + list_add_tail(&data->cull_list, &cull_list);
> }
> - if (data->csd.flags & CSD_FLAG_ALLOC)
> - call_rcu(&data->rcu_head, rcu_free_call_data);
> }
> - rcu_read_unlock();
> + read_unlock(&queue->rwlock);
> +
> + if (!list_empty(&cull_list)) {
> + struct call_function_data *next;
> +
> + write_lock(&queue->rwlock);
> + list_for_each_entry_safe(data, next, &cull_list, cull_list) {
> + list_del(&data->csd.list);
> + kfree(data);
> + }
> + write_unlock(&queue->rwlock);
> + }
>
> put_cpu();
> }
> @@ -271,42 +271,6 @@
> generic_exec_single(cpu, data);
> }
>
> -/* Dummy function */
> -static void quiesce_dummy(void *unused)
> -{
> -}
> -
> -/*
> - * Ensure stack based data used in call function mask is safe to free.
> - *
> - * This is needed by smp_call_function_mask when using on-stack data,
> because - * a single call function queue is shared by all CPUs, and any CPU
> may pick up - * the data item on the queue at any time before it is
> deleted. So we need to - * ensure that all CPUs have transitioned through a
> quiescent state after - * this call.
> - *
> - * This is a very slow function, implemented by sending synchronous IPIs
> to - * all possible CPUs. For this reason, we have to alloc data rather
> than use - * stack based data even in the case of synchronous calls. The
> stack based - * data is then just used for deadlock/oom fallback which will
> be very rare. - *
> - * If a faster scheme can be made, we could go back to preferring stack
> based - * data -- the data allocation/free is non-zero cost.
> - */
> -static void smp_call_function_mask_quiesce_stack(cpumask_t mask)
> -{
> - struct call_single_data data;
> - int cpu;
> -
> - data.func = quiesce_dummy;
> - data.info = NULL;
> -
> - for_each_cpu_mask(cpu, mask) {
> - data.flags = CSD_FLAG_WAIT;
> - generic_exec_single(cpu, &data);
> - }
> -}
> -
> /**
> * smp_call_function_mask(): Run a function on a set of other CPUs.
> * @mask: The set of cpus to run on.
> @@ -332,7 +296,6 @@
> cpumask_t allbutself;
> unsigned long flags;
> int cpu, num_cpus;
> - int slowpath = 0;
> unsigned queue_no;
> struct queue *queue;
>
> @@ -359,16 +322,20 @@
> return smp_call_function_single(cpu, func, info, wait);
> }
>
> - data = kmalloc(sizeof(*data), GFP_ATOMIC);
> - if (data) {
> - data->csd.flags = CSD_FLAG_ALLOC;
> - if (wait)
> - data->csd.flags |= CSD_FLAG_WAIT;
> - } else {
> + /*
> + * Allocate data if it's an async call, otherwise use stack.
> + * If the allocation fails, then convert it to a sync call and
> + * use the stack anyway.
> + */
> + if (!wait) {
> + data = kmalloc(sizeof(*data), GFP_ATOMIC);
> + if (data)
> + data->csd.flags = CSD_FLAG_ALLOC;
> + }
> + if (!data) {
> data = &d;
> data->csd.flags = CSD_FLAG_WAIT;
> wait = 1;
> - slowpath = 1;
> }
>
> data->csd.func = func;
> @@ -376,9 +343,9 @@
> atomic_set(&data->refs, num_cpus);
> data->cpumask = mask;
>
> - spin_lock_irqsave(&queue->lock, flags);
> - list_add_tail_rcu(&data->csd.list, &queue->list);
> - spin_unlock_irqrestore(&queue->lock, flags);
> + write_lock_irqsave(&queue->rwlock, flags);
> + list_add_tail(&data->csd.list, &queue->list);
> + write_unlock_irqrestore(&queue->rwlock, flags);
>
> /* Send a message to all CPUs in the map */
> arch_send_call_function_ipi(mask);
> @@ -386,8 +353,13 @@
> /* optionally wait for the CPUs to complete */
> if (wait) {
> csd_flag_wait(&data->csd);
> - if (unlikely(slowpath))
> - smp_call_function_mask_quiesce_stack(mask);
> +
> + write_lock_irqsave(&queue->rwlock, flags);
> + list_del(&data->csd.list);
> + write_unlock_irqrestore(&queue->rwlock, flags);
> +
> + /* We should never wait for allocated data. */
> + BUG_ON(data->csd.flags & CSD_FLAG_ALLOC);
> }
>
> return 0;
> @@ -425,7 +397,7 @@
> int i;
>
> for(i = 0; i < NQUEUES; i++)
> - spin_lock(&call_function_queues[i].lock);
> + write_lock(&call_function_queues[i].rwlock);
> }
>
> void ipi_call_unlock(void)
> @@ -433,7 +405,7 @@
> int i;
>
> for(i = 0; i < NQUEUES; i++)
> - spin_unlock(&call_function_queues[i].lock);
> + write_unlock(&call_function_queues[i].rwlock);
> }
>
> void ipi_call_lock_irq(void)
> @@ -443,7 +415,7 @@
> spin_lock_irq(&queues_lock);
>
> for(i = 0; i < NQUEUES; i++)
> - spin_lock_nest_lock(&call_function_queues[i].lock, &queues_lock);
> + write_lock(&call_function_queues[i].rwlock);
> }
>
> void ipi_call_unlock_irq(void)
> @@ -451,7 +423,7 @@
> int i;
>
> for(i = 0; i < NQUEUES; i++)
> - spin_unlock(&call_function_queues[i].lock);
> + write_unlock(&call_function_queues[i].rwlock);
>
> spin_unlock_irq(&queues_lock);
> }
> @@ -463,7 +435,7 @@
>
> for(i = 0; i < NQUEUES; i++) {
> INIT_LIST_HEAD(&call_function_queues[i].list);
> - spin_lock_init(&call_function_queues[i].lock);
> + rwlock_init(&call_function_queues[i].rwlock);
> }
>
> printk(KERN_INFO "smp function calls: using %d/%d queues\n",
^ permalink raw reply [flat|nested] 33+ messages in thread* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-22 0:29 [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu Jeremy Fitzhardinge
2008-08-22 1:53 ` Nick Piggin
@ 2008-08-22 6:28 ` Ingo Molnar
2008-08-22 7:06 ` Pekka Enberg
1 sibling, 1 reply; 33+ messages in thread
From: Ingo Molnar @ 2008-08-22 6:28 UTC (permalink / raw)
To: Jeremy Fitzhardinge
Cc: Nick Piggin, Andi Kleen, Pallipadi, Venkatesh, Suresh Siddha,
Jens Axboe, Rusty Russell, Linux Kernel Mailing List,
Paul E. McKenney, Pekka Enberg
* Jeremy Fitzhardinge <jeremy@goop.org> wrote:
> RCU can only control the lifetime of allocated memory blocks, which
> forces all the call structures to be allocated. This is expensive
> compared to allocating them on the stack, which is the common case for
> synchronous calls.
>
> This patch takes a different approach. Rather than using RCU, the
> queues are managed under rwlocks. Adding or removing from the queue
> requires holding the lock for writing, but multiple CPUs can walk the
> queues to process function calls under read locks. In the common
> case, where the structures are stack allocated, the calling CPU need
> only wait for its call to be done, take the lock for writing and
> remove the call structure.
>
> Lock contention - particularly write vs read - is reduced by using
> multiple queues.
hm, is there any authoritative data on what is cheaper on a big box, a
full-blown MESI cache miss that occurs for every reader in this new
fastpath, or a local SLAB/SLUB allocation+free that occurs with the
current RCU approach?
Ingo
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-22 6:28 ` Ingo Molnar
@ 2008-08-22 7:06 ` Pekka Enberg
2008-08-22 7:12 ` Ingo Molnar
2008-08-22 14:01 ` Christoph Lameter
0 siblings, 2 replies; 33+ messages in thread
From: Pekka Enberg @ 2008-08-22 7:06 UTC (permalink / raw)
To: Ingo Molnar
Cc: Jeremy Fitzhardinge, Nick Piggin, Andi Kleen,
Pallipadi, Venkatesh, Suresh Siddha, Jens Axboe, Rusty Russell,
Linux Kernel Mailing List, Paul E. McKenney, Christoph Lameter
Hi Ingo,
On Fri, Aug 22, 2008 at 9:28 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Jeremy Fitzhardinge <jeremy@goop.org> wrote:
>
>> RCU can only control the lifetime of allocated memory blocks, which
>> forces all the call structures to be allocated. This is expensive
>> compared to allocating them on the stack, which is the common case for
>> synchronous calls.
>>
>> This patch takes a different approach. Rather than using RCU, the
>> queues are managed under rwlocks. Adding or removing from the queue
>> requires holding the lock for writing, but multiple CPUs can walk the
>> queues to process function calls under read locks. In the common
>> case, where the structures are stack allocated, the calling CPU need
>> only wait for its call to be done, take the lock for writing and
>> remove the call structure.
>>
>> Lock contention - particularly write vs read - is reduced by using
>> multiple queues.
>
> hm, is there any authorative data on what is cheaper on a big box, a
> full-blown MESI cache miss that occurs for every reader in this new
> fastpath, or a local SLAB/SLUB allocation+free that occurs with the
> current RCU approach?
Christoph might have an idea about it.
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-22 7:06 ` Pekka Enberg
@ 2008-08-22 7:12 ` Ingo Molnar
2008-08-22 9:12 ` Nick Piggin
2008-08-22 14:01 ` Christoph Lameter
1 sibling, 1 reply; 33+ messages in thread
From: Ingo Molnar @ 2008-08-22 7:12 UTC (permalink / raw)
To: Pekka Enberg
Cc: Jeremy Fitzhardinge, Nick Piggin, Andi Kleen,
Pallipadi, Venkatesh, Suresh Siddha, Jens Axboe, Rusty Russell,
Linux Kernel Mailing List, Paul E. McKenney, Christoph Lameter
* Pekka Enberg <penberg@cs.helsinki.fi> wrote:
> Hi Ingo,
>
> On Fri, Aug 22, 2008 at 9:28 AM, Ingo Molnar <mingo@elte.hu> wrote:
> >
> > * Jeremy Fitzhardinge <jeremy@goop.org> wrote:
> >
> >> RCU can only control the lifetime of allocated memory blocks, which
> >> forces all the call structures to be allocated. This is expensive
> >> compared to allocating them on the stack, which is the common case for
> >> synchronous calls.
> >>
> >> This patch takes a different approach. Rather than using RCU, the
> >> queues are managed under rwlocks. Adding or removing from the queue
> >> requires holding the lock for writing, but multiple CPUs can walk the
> >> queues to process function calls under read locks. In the common
> >> case, where the structures are stack allocated, the calling CPU need
> >> only wait for its call to be done, take the lock for writing and
> >> remove the call structure.
> >>
> >> Lock contention - particularly write vs read - is reduced by using
> >> multiple queues.
> >
> > hm, is there any authorative data on what is cheaper on a big box, a
> > full-blown MESI cache miss that occurs for every reader in this new
> > fastpath, or a local SLAB/SLUB allocation+free that occurs with the
> > current RCU approach?
>
> Christoph might have an idea about it.
... thought of that missing Cc: line entry exactly 1.3 seconds after
having sent the mail :)
Christoph, any preferences/suggestions?
Ingo
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-22 7:12 ` Ingo Molnar
@ 2008-08-22 9:12 ` Nick Piggin
0 siblings, 0 replies; 33+ messages in thread
From: Nick Piggin @ 2008-08-22 9:12 UTC (permalink / raw)
To: Ingo Molnar
Cc: Pekka Enberg, Jeremy Fitzhardinge, Andi Kleen,
Pallipadi, Venkatesh, Suresh Siddha, Jens Axboe, Rusty Russell,
Linux Kernel Mailing List, Paul E. McKenney, Christoph Lameter
On Friday 22 August 2008 17:12, Ingo Molnar wrote:
> * Pekka Enberg <penberg@cs.helsinki.fi> wrote:
> > Hi Ingo,
> >
> > On Fri, Aug 22, 2008 at 9:28 AM, Ingo Molnar <mingo@elte.hu> wrote:
> > > * Jeremy Fitzhardinge <jeremy@goop.org> wrote:
> > >> RCU can only control the lifetime of allocated memory blocks, which
> > >> forces all the call structures to be allocated. This is expensive
> > >> compared to allocating them on the stack, which is the common case for
> > >> synchronous calls.
> > >>
> > >> This patch takes a different approach. Rather than using RCU, the
> > >> queues are managed under rwlocks. Adding or removing from the queue
> > >> requires holding the lock for writing, but multiple CPUs can walk the
> > >> queues to process function calls under read locks. In the common
> > >> case, where the structures are stack allocated, the calling CPU need
> > >> only wait for its call to be done, take the lock for writing and
> > >> remove the call structure.
> > >>
> > >> Lock contention - particularly write vs read - is reduced by using
> > >> multiple queues.
> > >
> > > hm, is there any authorative data on what is cheaper on a big box, a
> > > full-blown MESI cache miss that occurs for every reader in this new
> > > fastpath, or a local SLAB/SLUB allocation+free that occurs with the
> > > current RCU approach?
> >
> > Christoph might have an idea about it.
>
> ... thought of that missing Cc: line entry exactly 1.3 seconds after
> having sent the mail :)
>
> Christoph, any preferences/suggestions?
I think it's just going to be a matter of benchmarking it and seeing.
And small/medium systems are probably more important than huge ones
unless there is a pathological scalability problem with one of the
approaches (which there probably isn't seeing as there is already
locking there).
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-22 7:06 ` Pekka Enberg
2008-08-22 7:12 ` Ingo Molnar
@ 2008-08-22 14:01 ` Christoph Lameter
2008-08-22 15:11 ` Paul E. McKenney
1 sibling, 1 reply; 33+ messages in thread
From: Christoph Lameter @ 2008-08-22 14:01 UTC (permalink / raw)
To: Pekka Enberg
Cc: Ingo Molnar, Jeremy Fitzhardinge, Nick Piggin, Andi Kleen,
Pallipadi, Venkatesh, Suresh Siddha, Jens Axboe, Rusty Russell,
Linux Kernel Mailing List, Paul E. McKenney
Pekka Enberg wrote:
> Hi Ingo,
>
> On Fri, Aug 22, 2008 at 9:28 AM, Ingo Molnar <mingo@elte.hu> wrote:
>> * Jeremy Fitzhardinge <jeremy@goop.org> wrote:
>>
>>> RCU can only control the lifetime of allocated memory blocks, which
>>> forces all the call structures to be allocated. This is expensive
>>> compared to allocating them on the stack, which is the common case for
>>> synchronous calls.
>>>
>>> This patch takes a different approach. Rather than using RCU, the
>>> queues are managed under rwlocks. Adding or removing from the queue
>>> requires holding the lock for writing, but multiple CPUs can walk the
>>> queues to process function calls under read locks. In the common
>>> case, where the structures are stack allocated, the calling CPU need
>>> only wait for its call to be done, take the lock for writing and
>>> remove the call structure.
>>>
>>> Lock contention - particularly write vs read - is reduced by using
>>> multiple queues.
>> hm, is there any authorative data on what is cheaper on a big box, a
>> full-blown MESI cache miss that occurs for every reader in this new
>> fastpath, or a local SLAB/SLUB allocation+free that occurs with the
>> current RCU approach?
>
> Christoph might have an idea about it.
Its on the stack which is presumably hot so no cache miss? If its async then
presumably we do not need to wait so its okay to call an allocator.
Generally: The larger the box (longer cacheline acquisition latencies) and the
higher the contention (cannot get cacheline because of contention) the better
a slab allocation will be compared to a cacheline miss.
RCU is problematic because it lets cachelines get cold. A hot cacheline that
is used frequently read and written to by the same cpu is a very good thing for
performance.
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-22 14:01 ` Christoph Lameter
@ 2008-08-22 15:11 ` Paul E. McKenney
2008-08-22 17:14 ` Christoph Lameter
0 siblings, 1 reply; 33+ messages in thread
From: Paul E. McKenney @ 2008-08-22 15:11 UTC (permalink / raw)
To: Christoph Lameter
Cc: Pekka Enberg, Ingo Molnar, Jeremy Fitzhardinge, Nick Piggin,
Andi Kleen, Pallipadi, Venkatesh, Suresh Siddha, Jens Axboe,
Rusty Russell, Linux Kernel Mailing List
On Fri, Aug 22, 2008 at 09:01:22AM -0500, Christoph Lameter wrote:
> Pekka Enberg wrote:
> > Hi Ingo,
> >
> > On Fri, Aug 22, 2008 at 9:28 AM, Ingo Molnar <mingo@elte.hu> wrote:
> >> * Jeremy Fitzhardinge <jeremy@goop.org> wrote:
> >>
> >>> RCU can only control the lifetime of allocated memory blocks, which
> >>> forces all the call structures to be allocated. This is expensive
> >>> compared to allocating them on the stack, which is the common case for
> >>> synchronous calls.
> >>>
> >>> This patch takes a different approach. Rather than using RCU, the
> >>> queues are managed under rwlocks. Adding or removing from the queue
> >>> requires holding the lock for writing, but multiple CPUs can walk the
> >>> queues to process function calls under read locks. In the common
> >>> case, where the structures are stack allocated, the calling CPU need
> >>> only wait for its call to be done, take the lock for writing and
> >>> remove the call structure.
> >>>
> >>> Lock contention - particularly write vs read - is reduced by using
> >>> multiple queues.
> >> hm, is there any authorative data on what is cheaper on a big box, a
> >> full-blown MESI cache miss that occurs for every reader in this new
> >> fastpath, or a local SLAB/SLUB allocation+free that occurs with the
> >> current RCU approach?
> >
> > Christoph might have an idea about it.
>
> Its on the stack which is presumably hot so no cache miss? If its async then
> presumably we do not need to wait so its okay to call an allocator.
>
> Generally: The larger the box (longer cacheline acquisition latencies) and the
> higher the contention (cannot get cacheline because of contention) the better
> a slab allocation will be compared to a cacheline miss.
>
> RCU is problematic because it lets cachelines get cold. A hot cacheline that
> is used frequently read and written to by the same cpu is very good thing for
> performace.
So on these large boxes, read-only cachelines are preferentially
ejected from the cache, so that one should write to per-CPU data
occasionally to keep it resident? Or is the issue the long RCU grace
periods which allow the structure being freed to age out of all relevant
caches? (My guess would be the second.)
Thanx, Paul
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-22 15:11 ` Paul E. McKenney
@ 2008-08-22 17:14 ` Christoph Lameter
2008-08-22 18:29 ` Paul E. McKenney
0 siblings, 1 reply; 33+ messages in thread
From: Christoph Lameter @ 2008-08-22 17:14 UTC (permalink / raw)
To: paulmck
Cc: Pekka Enberg, Ingo Molnar, Jeremy Fitzhardinge, Nick Piggin,
Andi Kleen, Pallipadi, Venkatesh, Suresh Siddha, Jens Axboe,
Rusty Russell, Linux Kernel Mailing List
Paul E. McKenney wrote:
>> RCU is problematic because it lets cachelines get cold. A hot cacheline that
>> is used frequently read and written to by the same cpu is very good thing for
>> performace.
>
> So on your these large boxes, read-only cachelines are preferentially
> ejected from the cache, so that one should write to per-CPU data
> occasionally to keep it resident? Or is the issue the long RCU grace
> periods which allow the structure being freed to age out of all relevant
> caches? (My guess would be the second.)
The issue is the RCU grace periods, which are generally long enough to make the
cacheline fall out of all caches.
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-22 17:14 ` Christoph Lameter
@ 2008-08-22 18:29 ` Paul E. McKenney
2008-08-22 18:33 ` Andi Kleen
2008-08-22 18:36 ` Christoph Lameter
0 siblings, 2 replies; 33+ messages in thread
From: Paul E. McKenney @ 2008-08-22 18:29 UTC (permalink / raw)
To: Christoph Lameter
Cc: Pekka Enberg, Ingo Molnar, Jeremy Fitzhardinge, Nick Piggin,
Andi Kleen, Pallipadi, Venkatesh, Suresh Siddha, Jens Axboe,
Rusty Russell, Linux Kernel Mailing List
On Fri, Aug 22, 2008 at 12:14:37PM -0500, Christoph Lameter wrote:
> Paul E. McKenney wrote:
>
> >> RCU is problematic because it lets cachelines get cold. A hot cacheline that
> >> is used frequently read and written to by the same cpu is very good thing for
> >> performace.
> >
> > So on your these large boxes, read-only cachelines are preferentially
> > ejected from the cache, so that one should write to per-CPU data
> > occasionally to keep it resident? Or is the issue the long RCU grace
> > periods which allow the structure being freed to age out of all relevant
> > caches? (My guess would be the second.)
>
> The issue are the RCU grace period that are generally long enough to make the
> cacheline fall out of all caches.
Would it make sense to push the freed-by-RCU memory further up the
hierarchy, so that such memory is not mistaken for recently freed
hot-in-cache memory?
Thanx, Paul
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-22 18:29 ` Paul E. McKenney
@ 2008-08-22 18:33 ` Andi Kleen
2008-08-22 18:35 ` Jeremy Fitzhardinge
2008-08-22 22:40 ` Paul E. McKenney
2008-08-22 18:36 ` Christoph Lameter
1 sibling, 2 replies; 33+ messages in thread
From: Andi Kleen @ 2008-08-22 18:33 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Christoph Lameter, Pekka Enberg, Ingo Molnar, Jeremy Fitzhardinge,
Nick Piggin, Andi Kleen, Pallipadi, Venkatesh, Suresh Siddha,
Jens Axboe, Rusty Russell, Linux Kernel Mailing List
> Would it make sense to push the freed-by-RCU memory further up the
> hierarchy, so that such memory is not mistaken for recently freed
> hot-in-cache memory?
Right now my impression is that it is not well understood why
the kmalloc makes the IPI that much slower. In theory a kmalloc
shouldn't be all that slow, it's essentially just a
"disable interrupts; unlink object from cpu cache; enable interrupts"
with some window dressing. kfree() is similar.
Does it bounce a cache line on freeing perhaps?
-Andi
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-22 18:33 ` Andi Kleen
@ 2008-08-22 18:35 ` Jeremy Fitzhardinge
2008-08-23 7:34 ` Andi Kleen
2008-08-22 22:40 ` Paul E. McKenney
1 sibling, 1 reply; 33+ messages in thread
From: Jeremy Fitzhardinge @ 2008-08-22 18:35 UTC (permalink / raw)
To: Andi Kleen
Cc: Paul E. McKenney, Christoph Lameter, Pekka Enberg, Ingo Molnar,
Nick Piggin, Pallipadi, Venkatesh, Suresh Siddha, Jens Axboe,
Rusty Russell, Linux Kernel Mailing List
Andi Kleen wrote:
> Right now my impression is that it is not well understood why
> the kmalloc makes the IPI that much slower. In theory a kmalloc
> shouldn't be all that slow, it's essentially just a
> "disable interrupts; unlink object from cpu cache; enable interrupts"
> with some window dressing. kfree() is similar.
>
> Does it bounce a cache line on freeing perhaps?
I think it's just an assumption that it would be slower. Has anyone
measured it?
(Note: The measurements I posted do not cover this path, because it was
on a two cpu system, and it was always using the call-single path.)
J
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-22 18:35 ` Jeremy Fitzhardinge
@ 2008-08-23 7:34 ` Andi Kleen
2008-08-24 4:55 ` Jeremy Fitzhardinge
0 siblings, 1 reply; 33+ messages in thread
From: Andi Kleen @ 2008-08-23 7:34 UTC (permalink / raw)
To: Jeremy Fitzhardinge
Cc: Andi Kleen, Paul E. McKenney, Christoph Lameter, Pekka Enberg,
Ingo Molnar, Nick Piggin, Pallipadi, Venkatesh, Suresh Siddha,
Jens Axboe, Rusty Russell, Linux Kernel Mailing List
On Fri, Aug 22, 2008 at 11:35:46AM -0700, Jeremy Fitzhardinge wrote:
> Andi Kleen wrote:
> > Right now my impression is that it is not well understood why
> > the kmalloc makes the IPI that much slower. In theory a kmalloc
> > shouldn't be all that slow, it's essentially just a
> > "disable interrupts; unlink object from cpu cache; enable interrupts"
> > with some window dressing. kfree() is similar.
> >
> > Does it bounce a cache line on freeing perhaps?
>
> I think it's just an assumption that it would be slower. Has anyone
> measured it?
It's likely slower than no kmalloc because
there will be more instructions executed, the question is just how much.
>
> (Note: The measurements I posted do not cover this path, because it was
> on a two cpu system, and it was always using the call-single path.)
Ah so it was already 25% slower even without kmalloc? I thought
that was with kmalloc already. That doesn't sound good. Any idea where that slowdown
comes from?
-Andi
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-23 7:34 ` Andi Kleen
@ 2008-08-24 4:55 ` Jeremy Fitzhardinge
2008-08-24 9:01 ` Andi Kleen
0 siblings, 1 reply; 33+ messages in thread
From: Jeremy Fitzhardinge @ 2008-08-24 4:55 UTC (permalink / raw)
To: Andi Kleen
Cc: Paul E. McKenney, Christoph Lameter, Pekka Enberg, Ingo Molnar,
Nick Piggin, Pallipadi, Venkatesh, Suresh Siddha, Jens Axboe,
Rusty Russell, Linux Kernel Mailing List
Andi Kleen wrote:
> On Fri, Aug 22, 2008 at 11:35:46AM -0700, Jeremy Fitzhardinge wrote:
>
>> Andi Kleen wrote:
>>
>>> Right now my impression is that it is not well understood why
>>> the kmalloc makes the IPI that much slower. In theory a kmalloc
>>> shouldn't be all that slow, it's essentially just a
>>> "disable interrupts; unlink object from cpu cache; enable interrupts"
>>> with some window dressing. kfree() is similar.
>>>
>>> Does it bounce a cache line on freeing perhaps?
>>>
>> I think it's just an assumption that it would be slower. Has anyone
>> measured it?
>>
>
> It's likely slower than no kmalloc because
> there will be more instructions executed, the question is just how much.
>
>
>> (Note: The measurements I posted do not cover this path, because it was
>> on a two cpu system, and it was always using the call-single path.)
>>
>
> Ah so it was already 25% slower even without kmalloc? I thought
> that was with already. That doesn't sound good. Any idea where that slowdown
> comes from?
Just longer code path, I think. It calls the generic
smp_call_function_mask(), which then does a popcount on the cpu mask
(which it needs to do anyway), sees only one bit set, and then punts to
the smp_call_function_single() path. If we maintained a cpus_online
count, then we could fast-path the call to smp_call_function_single() in
the two core/cpu case more efficiently (would still need to scan the
mask to extract the cpu number).
Or alternatively, maybe it isn't actually worth special casing
smp_call_function_single() with a multi-queue smp_call_function_mask()
implementation?
J
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-24 4:55 ` Jeremy Fitzhardinge
@ 2008-08-24 9:01 ` Andi Kleen
0 siblings, 0 replies; 33+ messages in thread
From: Andi Kleen @ 2008-08-24 9:01 UTC (permalink / raw)
To: Jeremy Fitzhardinge
Cc: Andi Kleen, Paul E. McKenney, Christoph Lameter, Pekka Enberg,
Ingo Molnar, Nick Piggin, Pallipadi, Venkatesh, Suresh Siddha,
Jens Axboe, Rusty Russell, Linux Kernel Mailing List
> > Ah so it was already 25% slower even without kmalloc? I thought
> > that was with already. That doesn't sound good. Any idea where that slowdown
> > comes from?
>
> Just longer code path, I think. It calls the generic
I did IPI measurements quite some time ago and what I remember
from them is that IPI latencies were in the low multiple thousands cycle
ballpark.
> smp_call_function_mask(), which then does a popcount on the cpu mask
> (which it needs to do anyway), sees only one bit set, and then punts to
> the smp_call_function_single() path.
But that is more in the range of a few tens of cycles (or maybe 1-2 hundred
if you have a NR_CPU==4096 kernel with really large cpumasks)
Doesn't really explain a 25% slowdown I would say.
Are you sure there isn't a new cache miss in there or something? Actually
it must be even multiple ones to account for such a slow down.
-Andi
--
ak@linux.intel.com
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-22 18:33 ` Andi Kleen
2008-08-22 18:35 ` Jeremy Fitzhardinge
@ 2008-08-22 22:40 ` Paul E. McKenney
1 sibling, 0 replies; 33+ messages in thread
From: Paul E. McKenney @ 2008-08-22 22:40 UTC (permalink / raw)
To: Andi Kleen
Cc: Christoph Lameter, Pekka Enberg, Ingo Molnar, Jeremy Fitzhardinge,
Nick Piggin, Pallipadi, Venkatesh, Suresh Siddha, Jens Axboe,
Rusty Russell, Linux Kernel Mailing List
On Fri, Aug 22, 2008 at 08:33:46PM +0200, Andi Kleen wrote:
> > Would it make sense to push the freed-by-RCU memory further up the
> > hierarchy, so that such memory is not mistaken for recently freed
> > hot-in-cache memory?
>
> Right now my impression is that it is not well understood why
> the kmalloc makes the IPI that much slower. In theory a kmalloc
> shouldn't be all that slow, it's essentially just a
> "disable interrupts; unlink object from cpu cache; enable interrupts"
> with some window dressing. kfree() is similar.
>
> Does it bounce a cache line on freeing perhaps?
It is indeed easy to focus on the wrong area when attempting to
improve performance. Done it many times myself... :-/
Thanx, Paul
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-22 18:29 ` Paul E. McKenney
2008-08-22 18:33 ` Andi Kleen
@ 2008-08-22 18:36 ` Christoph Lameter
2008-08-22 19:52 ` Paul E. McKenney
1 sibling, 1 reply; 33+ messages in thread
From: Christoph Lameter @ 2008-08-22 18:36 UTC (permalink / raw)
To: paulmck
Cc: Pekka Enberg, Ingo Molnar, Jeremy Fitzhardinge, Nick Piggin,
Andi Kleen, Pallipadi, Venkatesh, Suresh Siddha, Jens Axboe,
Rusty Russell, Linux Kernel Mailing List
Paul E. McKenney wrote:
>>> So on your these large boxes, read-only cachelines are preferentially
>>> ejected from the cache, so that one should write to per-CPU data
>>> occasionally to keep it resident? Or is the issue the long RCU grace
>>> periods which allow the structure being freed to age out of all relevant
>>> caches? (My guess would be the second.)
>> The issue are the RCU grace period that are generally long enough to make the
>> cacheline fall out of all caches.
>
> Would it make sense to push the freed-by-RCU memory further up the
> hierarchy, so that such memory is not mistaken for recently freed
> hot-in-cache memory?
That would mean passing a gfp flag like __GFP_COLD on free from RCU? Or how
would that work at the higher levels?
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-22 18:36 ` Christoph Lameter
@ 2008-08-22 19:52 ` Paul E. McKenney
2008-08-22 20:03 ` Christoph Lameter
0 siblings, 1 reply; 33+ messages in thread
From: Paul E. McKenney @ 2008-08-22 19:52 UTC (permalink / raw)
To: Christoph Lameter
Cc: Pekka Enberg, Ingo Molnar, Jeremy Fitzhardinge, Nick Piggin,
Andi Kleen, Pallipadi, Venkatesh, Suresh Siddha, Jens Axboe,
Rusty Russell, Linux Kernel Mailing List
On Fri, Aug 22, 2008 at 01:36:37PM -0500, Christoph Lameter wrote:
> Paul E. McKenney wrote:
>
> >>> So on your these large boxes, read-only cachelines are preferentially
> >>> ejected from the cache, so that one should write to per-CPU data
> >>> occasionally to keep it resident? Or is the issue the long RCU grace
> >>> periods which allow the structure being freed to age out of all relevant
> >>> caches? (My guess would be the second.)
> >> The issue are the RCU grace period that are generally long enough to make the
> >> cacheline fall out of all caches.
> >
> > Would it make sense to push the freed-by-RCU memory further up the
> > hierarchy, so that such memory is not mistaken for recently freed
> > hot-in-cache memory?
>
> That would mean passing a gfp flag like __GFP_COLD on free from RCU? Or how
> would that work at the higher levels?
I was indeed thinking in terms of the free from RCU being specially marked.
Thanx, Paul
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-22 19:52 ` Paul E. McKenney
@ 2008-08-22 20:03 ` Christoph Lameter
2008-08-22 20:53 ` Paul E. McKenney
0 siblings, 1 reply; 33+ messages in thread
From: Christoph Lameter @ 2008-08-22 20:03 UTC (permalink / raw)
To: paulmck
Cc: Pekka Enberg, Ingo Molnar, Jeremy Fitzhardinge, Nick Piggin,
Andi Kleen, Pallipadi, Venkatesh, Suresh Siddha, Jens Axboe,
Rusty Russell, Linux Kernel Mailing List
Paul E. McKenney wrote:
> I was indeed thinking in terms of the free from RCU being specially marked.
Isn't there some way to shorten the RCU grace periods significantly? Critical
sections do not take that long after all.
If the RCU periods are much shorter then the chance of cache hotness of the
objects is increased.
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-22 20:03 ` Christoph Lameter
@ 2008-08-22 20:53 ` Paul E. McKenney
2008-08-25 10:31 ` Peter Zijlstra
0 siblings, 1 reply; 33+ messages in thread
From: Paul E. McKenney @ 2008-08-22 20:53 UTC (permalink / raw)
To: Christoph Lameter
Cc: Pekka Enberg, Ingo Molnar, Jeremy Fitzhardinge, Nick Piggin,
Andi Kleen, Pallipadi, Venkatesh, Suresh Siddha, Jens Axboe,
Rusty Russell, Linux Kernel Mailing List
On Fri, Aug 22, 2008 at 03:03:13PM -0500, Christoph Lameter wrote:
> Paul E. McKenney wrote:
>
> > I was indeed thinking in terms of the free from RCU being specially marked.
>
> Isnt there some way to shorten the rcu periods significantly? Critical
> sections do not take that long after all.
In theory, yes. However, the shorter the grace period, the greater the
per-update overhead of grace-period detection -- the general approach
is to use a per-CPU high-resolution timer to force RCU grace period
processing every 100 microseconds or so. Also, by definition, the RCU
grace period can be no shorter than the longest active RCU read-side
critical section. Nevertheless, I have designed my current hierarchical
RCU patch with expedited grace periods in mind, though more for the
purpose of reducing latency of long strings of operations that involve
synchronize_rcu() than for cache locality.
> If the RCU periods are much shorter then the chance of cache hotness of the
> objects is increased.
How short does the grace period need to be to significantly increase the
chance of an RCU-protected data element remaining in cache across an RCU
grace period? The last time I calculated this, the knee of the curve was
at a few tens of milliseconds, but to give you an idea of how long ago
that was, the workload I used was TPC/A. Which might no longer be very
representative. ;-)
Thanx, Paul
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-22 20:53 ` Paul E. McKenney
@ 2008-08-25 10:31 ` Peter Zijlstra
2008-08-25 15:12 ` Paul E. McKenney
0 siblings, 1 reply; 33+ messages in thread
From: Peter Zijlstra @ 2008-08-25 10:31 UTC (permalink / raw)
To: paulmck
Cc: Christoph Lameter, Pekka Enberg, Ingo Molnar, Jeremy Fitzhardinge,
Nick Piggin, Andi Kleen, Pallipadi, Venkatesh, Suresh Siddha,
Jens Axboe, Rusty Russell, Linux Kernel Mailing List
On Fri, 2008-08-22 at 13:53 -0700, Paul E. McKenney wrote:
> On Fri, Aug 22, 2008 at 03:03:13PM -0500, Christoph Lameter wrote:
> > Paul E. McKenney wrote:
> >
> > > I was indeed thinking in terms of the free from RCU being specially marked.
> >
> > Isnt there some way to shorten the rcu periods significantly? Critical
> > sections do not take that long after all.
>
> In theory, yes. However, the shorter the grace period, the greater the
> per-update overhead of grace-period detection -- the general approach
> is to use a per-CPU high-resolution timer to force RCU grace period
> processing every 100 microseconds or so.
You could of course also drive the rcu state machine from
rcu_read_unlock().
> Also, by definition, the RCU
> grace period can be no shorter than the longest active RCU read-side
> critical section. Nevertheless, I have designed my current hierarchical
> RCU patch with expedited grace periods in mind, though more for the
> purpose of reducing latency of long strings of operations that involve
> synchronize_rcu() than for cache locality.
Another thing that could be done is more often force a grace period by
flipping the counters.
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-25 10:31 ` Peter Zijlstra
@ 2008-08-25 15:12 ` Paul E. McKenney
2008-08-25 15:22 ` Peter Zijlstra
2008-08-25 15:44 ` [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu Christoph Lameter
0 siblings, 2 replies; 33+ messages in thread
From: Paul E. McKenney @ 2008-08-25 15:12 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Christoph Lameter, Pekka Enberg, Ingo Molnar, Jeremy Fitzhardinge,
Nick Piggin, Andi Kleen, Pallipadi, Venkatesh, Suresh Siddha,
Jens Axboe, Rusty Russell, Linux Kernel Mailing List
On Mon, Aug 25, 2008 at 12:31:31PM +0200, Peter Zijlstra wrote:
> On Fri, 2008-08-22 at 13:53 -0700, Paul E. McKenney wrote:
> > On Fri, Aug 22, 2008 at 03:03:13PM -0500, Christoph Lameter wrote:
> > > Paul E. McKenney wrote:
> > >
> > > > I was indeed thinking in terms of the free from RCU being specially marked.
> > >
> > > Isnt there some way to shorten the rcu periods significantly? Critical
> > > sections do not take that long after all.
> >
> > In theory, yes. However, the shorter the grace period, the greater the
> > per-update overhead of grace-period detection -- the general approach
> > is to use a per-CPU high-resolution timer to force RCU grace period
> > processing every 100 microseconds or so.
>
> You could of course also drive the rcu state machine from
> rcu_read_unlock().
True, and Jim Houston implemented something similar to this some years
back: http://marc.theaimsgroup.com/?l=linux-kernel&m=109387402400673&w=2
This of course greatly increases rcu_read_unlock() overhead. But
perhaps it is a good implementation for the workloads that Christoph is
thinking of.
> > Also, by definition, the RCU
> > grace period can be no shorter than the longest active RCU read-side
> > critical section. Nevertheless, I have designed my current hierarchical
> > RCU patch with expedited grace periods in mind, though more for the
> > purpose of reducing latency of long strings of operations that involve
> > synchronize_rcu() than for cache locality.
>
> Another thing that could be done is more often force a grace period by
> flipping the counters.
Yep. That is exactly what I was getting at with the high-resolution
timer point above. This seems to be a reasonable compromise, as it
allows someone to specify how quickly the grace periods happen
dynamically.
But I am not sure that this gets the grace periods to go fast enough to
cover Christoph's use case -- he seems to be in a "faster is better"
space rather than in an "at least this fast" space. Still, it would
likely help in some important cases.
Thanx, Paul
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-25 15:12 ` Paul E. McKenney
@ 2008-08-25 15:22 ` Peter Zijlstra
2008-08-25 15:46 ` Christoph Lameter
2008-08-25 15:44 ` [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu Christoph Lameter
1 sibling, 1 reply; 33+ messages in thread
From: Peter Zijlstra @ 2008-08-25 15:22 UTC (permalink / raw)
To: paulmck
Cc: Christoph Lameter, Pekka Enberg, Ingo Molnar, Jeremy Fitzhardinge,
Nick Piggin, Andi Kleen, Pallipadi, Venkatesh, Suresh Siddha,
Jens Axboe, Rusty Russell, Linux Kernel Mailing List
On Mon, 2008-08-25 at 08:12 -0700, Paul E. McKenney wrote:
> On Mon, Aug 25, 2008 at 12:31:31PM +0200, Peter Zijlstra wrote:
> > On Fri, 2008-08-22 at 13:53 -0700, Paul E. McKenney wrote:
> > > On Fri, Aug 22, 2008 at 03:03:13PM -0500, Christoph Lameter wrote:
> > > > Paul E. McKenney wrote:
> > > >
> > > > > I was indeed thinking in terms of the free from RCU being specially marked.
> > > >
> > > > Isnt there some way to shorten the rcu periods significantly? Critical
> > > > sections do not take that long after all.
> > >
> > > In theory, yes. However, the shorter the grace period, the greater the
> > > per-update overhead of grace-period detection -- the general approach
> > > is to use a per-CPU high-resolution timer to force RCU grace period
> > > processing every 100 microseconds or so.
> >
> > You could of course also drive the rcu state machine from
> > rcu_read_unlock().
>
> True, and Jim Houston implemented something similar to this some years
> back: http://marc.theaimsgroup.com/?l=linux-kernel&m=109387402400673&w=2
>
> This of course greatly increases rcu_read_unlock() overhead. But
> perhaps it is a good implementation for the workloads that Christoph is
> thinking of.
>
> > > Also, by definition, the RCU
> > > grace period can be no shorter than the longest active RCU read-side
> > > critical section. Nevertheless, I have designed my current hierarchical
> > > RCU patch with expedited grace periods in mind, though more for the
> > > purpose of reducing latency of long strings of operations that involve
> > > synchronize_rcu() than for cache locality.
> >
> > Another thing that could be done is more often force a grace period by
> > flipping the counters.
>
> Yep. That is exactly what I was getting at with the high-resolution
> timer point above. This seems to be a reasonable compromise, as it
> allows someone to specify how quickly the grace periods happen
> dynamically.
>
> But I am not sure that this gets the grace periods to go fast enough to
> cover Christoph's use case -- he seems to be in a "faster is better"
> space rather than in an "at least this fast" space. Still, it would
> likely help in some important cases.
If we combine these two cases, and flip the counter as soon as we've
enqueued one callback, unless we're already waiting for a grace period
to end - which gives us a longer window to collect callbacks.
And then the rcu_read_unlock() can do:
if (dec_and_zero(my_counter) && my_index == dying)
raise_softirq(RCU)
to fire off the callback stuff.
/me ponders - there must be something wrong with that...
Aaah, yes, the dec_and_zero is non-trivial due to the fact that it's a
distributed counter. Bugger..
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-25 15:22 ` Peter Zijlstra
@ 2008-08-25 15:46 ` Christoph Lameter
2008-08-25 15:51 ` Peter Zijlstra
` (2 more replies)
0 siblings, 3 replies; 33+ messages in thread
From: Christoph Lameter @ 2008-08-25 15:46 UTC (permalink / raw)
To: Peter Zijlstra
Cc: paulmck, Pekka Enberg, Ingo Molnar, Jeremy Fitzhardinge,
Nick Piggin, Andi Kleen, Pallipadi, Venkatesh, Suresh Siddha,
Jens Axboe, Rusty Russell, Linux Kernel Mailing List
Peter Zijlstra wrote:
>
> If we combine these two cases, and flip the counter as soon as we've
> enqueued one callback, unless we're already waiting for a grace period
> to end - which gives us a longer window to collect callbacks.
>
> And then the rcu_read_unlock() can do:
>
> if (dec_and_zero(my_counter) && my_index == dying)
> raise_softirq(RCU)
>
> to fire off the callback stuff.
>
> /me ponders - there must be something wrong with that...
>
> Aaah, yes, the dec_and_zero is non trivial due to the fact that its a
> distributed counter. Bugger..
Then lets make it per cpu. If we get the cpu ops in then dec_and_zero would be
very cheap.
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-25 15:46 ` Christoph Lameter
@ 2008-08-25 15:51 ` Peter Zijlstra
2008-08-26 13:43 ` Paul E. McKenney
2008-08-25 20:04 ` Paul E. McKenney
2008-08-26 5:13 ` Nick Piggin
2 siblings, 1 reply; 33+ messages in thread
From: Peter Zijlstra @ 2008-08-25 15:51 UTC (permalink / raw)
To: Christoph Lameter
Cc: paulmck, Pekka Enberg, Ingo Molnar, Jeremy Fitzhardinge,
Nick Piggin, Andi Kleen, Pallipadi, Venkatesh, Suresh Siddha,
Jens Axboe, Rusty Russell, Linux Kernel Mailing List
On Mon, 2008-08-25 at 10:46 -0500, Christoph Lameter wrote:
> Peter Zijlstra wrote:
> >
> > If we combine these two cases, and flip the counter as soon as we've
> > enqueued one callback, unless we're already waiting for a grace period
> > to end - which gives us a longer window to collect callbacks.
> >
> > And then the rcu_read_unlock() can do:
> >
> > if (dec_and_zero(my_counter) && my_index == dying)
> > raise_softirq(RCU)
> >
> > to fire off the callback stuff.
> >
> > /me ponders - there must be something wrong with that...
> >
> > Aaah, yes, the dec_and_zero is non trivial due to the fact that its a
> > distributed counter. Bugger..
>
> Then lets make it per cpu. If we get the cpu ops in then dec_and_zero would be
> very cheap.
Hmm, perhaps that might work for classic RCU, as that disables
preemption and thus the counters should always be balanced.
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-25 15:51 ` Peter Zijlstra
@ 2008-08-26 13:43 ` Paul E. McKenney
2008-08-26 14:07 ` Peter Zijlstra
0 siblings, 1 reply; 33+ messages in thread
From: Paul E. McKenney @ 2008-08-26 13:43 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Christoph Lameter, Pekka Enberg, Ingo Molnar, Jeremy Fitzhardinge,
Nick Piggin, Andi Kleen, Pallipadi, Venkatesh, Suresh Siddha,
Jens Axboe, Rusty Russell, Linux Kernel Mailing List
On Mon, Aug 25, 2008 at 05:51:32PM +0200, Peter Zijlstra wrote:
> On Mon, 2008-08-25 at 10:46 -0500, Christoph Lameter wrote:
> > Peter Zijlstra wrote:
> > >
> > > If we combine these two cases, and flip the counter as soon as we've
> > > enqueued one callback, unless we're already waiting for a grace period
> > > to end - which gives us a longer window to collect callbacks.
> > >
> > > And then the rcu_read_unlock() can do:
> > >
> > > if (dec_and_zero(my_counter) && my_index == dying)
> > > raise_softirq(RCU)
> > >
> > > to fire off the callback stuff.
> > >
> > > /me ponders - there must be something wrong with that...
> > >
> > > Aaah, yes, the dec_and_zero is non trivial due to the fact that its a
> > > distributed counter. Bugger..
> >
> > Then lets make it per cpu. If we get the cpu ops in then dec_and_zero would be
> > very cheap.
>
> Hmm, perhaps that might work for classic RCU, as that disables
> preemption and thus the counters should always be balanced.
Unless you use a pair of global counters (like QRCU), you will still
need to check a large number of counters for zero. I suppose that one
approach would be to do something like QRCU, but with some smallish
number of counter pairs, each of which is shared by a moderate group of
CPUs. For example, for 4,096 CPUs, use 64 pairs of counters, each
shared by 64 CPUs. My guess is that the rcu_read_lock() overhead would
make this be a case of "Holy overhead, Batman!!!", but then again, I
cannot claim to be an expert on 4,096-CPU machines.
Thanx, Paul
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-26 13:43 ` Paul E. McKenney
@ 2008-08-26 14:07 ` Peter Zijlstra
2008-08-27 15:16 ` Paul E. McKenney
0 siblings, 1 reply; 33+ messages in thread
From: Peter Zijlstra @ 2008-08-26 14:07 UTC (permalink / raw)
To: paulmck
Cc: Christoph Lameter, Pekka Enberg, Ingo Molnar, Jeremy Fitzhardinge,
Nick Piggin, Andi Kleen, Pallipadi, Venkatesh, Suresh Siddha,
Jens Axboe, Rusty Russell, Linux Kernel Mailing List
On Tue, 2008-08-26 at 06:43 -0700, Paul E. McKenney wrote:
> On Mon, Aug 25, 2008 at 05:51:32PM +0200, Peter Zijlstra wrote:
> > On Mon, 2008-08-25 at 10:46 -0500, Christoph Lameter wrote:
> > > Peter Zijlstra wrote:
> > > >
> > > > If we combine these two cases, and flip the counter as soon as we've
> > > > enqueued one callback, unless we're already waiting for a grace period
> > > > to end - which gives us a longer window to collect callbacks.
> > > >
> > > > And then the rcu_read_unlock() can do:
> > > >
> > > > if (dec_and_zero(my_counter) && my_index == dying)
> > > > raise_softirq(RCU)
> > > >
> > > > to fire off the callback stuff.
> > > >
> > > > /me ponders - there must be something wrong with that...
> > > >
> > > > Aaah, yes, the dec_and_zero is non trivial due to the fact that its a
> > > > distributed counter. Bugger..
> > >
> > > Then lets make it per cpu. If we get the cpu ops in then dec_and_zero would be
> > > very cheap.
> >
> > Hmm, perhaps that might work for classic RCU, as that disables
> > preemption and thus the counters should always be balanced.
>
> Unless you use a pair of global counters (like QRCU), you will still
> need to check a large number of counters for zero. I suppose that one
> approach would be to do something like QRCU, but with some smallish
> number of counter pairs, each of which is shared by a moderate group of
> CPUs. For example, for 4,096 CPUs, use 64 pairs of counters, each
> shared by 64 CPUs. My guess is that the rcu_read_lock() overhead would
> make this be a case of "Holy overhead, Batman!!!", but then again, I
> cannot claim to be an expert on 4,096-CPU machines.
right - while the local count will be balanced and will always end up on
zero, you have to check remote counts for zero as well.
But after a counter flip, the dying counter will only reach zero once
per cpu.
So each cpu gets to tickle a softirq once per cycle. That softirq can
then check all remote counters, and kick off the callback list when it
finds them all zero.
Of course, this scan is very expensive, n^2 at worst, each cpu
triggering a full scan, until finally the last cpu is done.
We could optimize this by keeping cpu masks of cpus found to have !0
counts - those who were found to have 0, will always stay zero, so we'll
not have to look at them again.
Another is making use of a scanning hierarchy.
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-26 14:07 ` Peter Zijlstra
@ 2008-08-27 15:16 ` Paul E. McKenney
0 siblings, 0 replies; 33+ messages in thread
From: Paul E. McKenney @ 2008-08-27 15:16 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Christoph Lameter, Pekka Enberg, Ingo Molnar, Jeremy Fitzhardinge,
Nick Piggin, Andi Kleen, Pallipadi, Venkatesh, Suresh Siddha,
Jens Axboe, Rusty Russell, Linux Kernel Mailing List
On Tue, Aug 26, 2008 at 04:07:00PM +0200, Peter Zijlstra wrote:
> On Tue, 2008-08-26 at 06:43 -0700, Paul E. McKenney wrote:
> > On Mon, Aug 25, 2008 at 05:51:32PM +0200, Peter Zijlstra wrote:
> > > On Mon, 2008-08-25 at 10:46 -0500, Christoph Lameter wrote:
> > > > Peter Zijlstra wrote:
> > > > >
> > > > > If we combine these two cases, and flip the counter as soon as we've
> > > > > enqueued one callback, unless we're already waiting for a grace period
> > > > > to end - which gives us a longer window to collect callbacks.
> > > > >
> > > > > And then the rcu_read_unlock() can do:
> > > > >
> > > > > if (dec_and_zero(my_counter) && my_index == dying)
> > > > > raise_softirq(RCU)
> > > > >
> > > > > to fire off the callback stuff.
> > > > >
> > > > > /me ponders - there must be something wrong with that...
> > > > >
> > > > > Aaah, yes, the dec_and_zero is non trivial due to the fact that its a
> > > > > distributed counter. Bugger..
> > > >
> > > > Then lets make it per cpu. If we get the cpu ops in then dec_and_zero would be
> > > > very cheap.
> > >
> > > Hmm, perhaps that might work for classic RCU, as that disables
> > > preemption and thus the counters should always be balanced.
> >
> > Unless you use a pair of global counters (like QRCU), you will still
> > need to check a large number of counters for zero. I suppose that one
> > approach would be to do something like QRCU, but with some smallish
> > number of counter pairs, each of which is shared by a moderate group of
> > CPUs. For example, for 4,096 CPUs, use 64 pairs of counters, each
> > shared by 64 CPUs. My guess is that the rcu_read_lock() overhead would
> > make this be a case of "Holy overhead, Batman!!!", but then again, I
> > cannot claim to be an expert on 4,096-CPU machines.
>
> right - while the local count will be balanced and will always end up on
> zero, you have to check remote counts for zero as well.
>
> But after a counter flip, the dying counter will only reach zero once
> per cpu.
Yep.
> So each cpu gets to tickle a softirq once per cycle. That softirq can
> then check all remote counters, and kick off the callback list when it
> finds them all zero.
Which might not be so good from a powertop viewpoint, since grace
periods only take a few milliseconds, and powertop likes to keep CPUs
sleeping for seconds. But one could designate a set of CPUs that
scanned nearby counters -- carefully chosen WRT the system in question
to avoid preventing cores from being powered down. :-/
> Of course, this scan is very expensive, n^2 at worst, each cpu
> triggering a full scan, until finally the last cpu is done.
One could keep an index of already-scanned counters, which would
help keep things down to a dull roar.
> We could optimize this by keeping cpu masks of cpus found to have !0
> counts - those who were found to have 0, will always stay zero, so we'll
> not have to look at them again.
OK, a mask would work, though an index would be faster.
> Another is making use of a scanning hierarchy.
You still have to scan all of the leaves... Though you can divide the
work over a set of CPUs, again, as long as those CPUs are chosen so as
to balance the need to power down whole cores with the need to maintain
good memory locality.
Thanx, Paul
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-25 15:46 ` Christoph Lameter
2008-08-25 15:51 ` Peter Zijlstra
@ 2008-08-25 20:04 ` Paul E. McKenney
2008-08-26 5:13 ` Nick Piggin
2 siblings, 0 replies; 33+ messages in thread
From: Paul E. McKenney @ 2008-08-25 20:04 UTC (permalink / raw)
To: Christoph Lameter
Cc: Peter Zijlstra, Pekka Enberg, Ingo Molnar, Jeremy Fitzhardinge,
Nick Piggin, Andi Kleen, Pallipadi, Venkatesh, Suresh Siddha,
Jens Axboe, Rusty Russell, Linux Kernel Mailing List
On Mon, Aug 25, 2008 at 10:46:06AM -0500, Christoph Lameter wrote:
> Peter Zijlstra wrote:
> >
> > If we combine these two cases, and flip the counter as soon as we've
> > enqueued one callback, unless we're already waiting for a grace period
> > to end - which gives us a longer window to collect callbacks.
> >
> > And then the rcu_read_unlock() can do:
> >
> > if (dec_and_zero(my_counter) && my_index == dying)
> > raise_softirq(RCU)
> >
> > to fire off the callback stuff.
> >
> > /me ponders - there must be something wrong with that...
> >
> > Aaah, yes, the dec_and_zero is non trivial due to the fact that its a
> > distributed counter. Bugger..
>
> Then lets make it per cpu. If we get the cpu ops in then dec_and_zero would be
> very cheap.
The problem is that we need dec_and_zero on the sum of the per-CPU
counters. Gets spendy. One can make a hierarchy, and propagate up.
But still lots of cache misses.
Thanx, Paul
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-25 15:46 ` Christoph Lameter
2008-08-25 15:51 ` Peter Zijlstra
2008-08-25 20:04 ` Paul E. McKenney
@ 2008-08-26 5:13 ` Nick Piggin
2008-08-26 13:40 ` [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu Paul E. McKenney
2 siblings, 1 reply; 33+ messages in thread
From: Nick Piggin @ 2008-08-26 5:13 UTC (permalink / raw)
To: Christoph Lameter
Cc: Peter Zijlstra, paulmck, Pekka Enberg, Ingo Molnar,
Jeremy Fitzhardinge, Andi Kleen, Pallipadi, Venkatesh,
Suresh Siddha, Jens Axboe, Rusty Russell,
Linux Kernel Mailing List
On Tuesday 26 August 2008 01:46, Christoph Lameter wrote:
> Peter Zijlstra wrote:
> > If we combine these two cases, and flip the counter as soon as we've
> > enqueued one callback, unless we're already waiting for a grace period
> > to end - which gives us a longer window to collect callbacks.
> >
> > And then the rcu_read_unlock() can do:
> >
> > if (dec_and_zero(my_counter) && my_index == dying)
> > raise_softirq(RCU)
> >
> > to fire off the callback stuff.
> >
> > /me ponders - there must be something wrong with that...
> >
> > Aaah, yes, the dec_and_zero is non trivial due to the fact that its a
> > distributed counter. Bugger..
>
> Then lets make it per cpu. If we get the cpu ops in then dec_and_zero would
> be very cheap.
Let's be very careful before making rcu read locks costly. Any reduction
in grace periods would be great, but IMO RCU should not be used in cases
where performance depends on the freed data remaining in cache.
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-26 5:13 ` Nick Piggin
@ 2008-08-26 13:40 ` Paul E. McKenney
0 siblings, 0 replies; 33+ messages in thread
From: Paul E. McKenney @ 2008-08-26 13:40 UTC (permalink / raw)
To: Nick Piggin
Cc: Christoph Lameter, Peter Zijlstra, Pekka Enberg, Ingo Molnar,
Jeremy Fitzhardinge, Andi Kleen, Pallipadi, Venkatesh,
Suresh Siddha, Jens Axboe, Rusty Russell,
Linux Kernel Mailing List
On Tue, Aug 26, 2008 at 03:13:40PM +1000, Nick Piggin wrote:
> On Tuesday 26 August 2008 01:46, Christoph Lameter wrote:
> > Peter Zijlstra wrote:
> > > If we combine these two cases, and flip the counter as soon as we've
> > > enqueued one callback, unless we're already waiting for a grace period
> > > to end - which gives us a longer window to collect callbacks.
> > >
> > > And then the rcu_read_unlock() can do:
> > >
> > > if (dec_and_zero(my_counter) && my_index == dying)
> > > raise_softirq(RCU)
> > >
> > > to fire off the callback stuff.
> > >
> > > /me ponders - there must be something wrong with that...
> > >
> > > Aaah, yes, the dec_and_zero is non trivial due to the fact that its a
> > > distributed counter. Bugger..
> >
> > Then lets make it per cpu. If we get the cpu ops in then dec_and_zero would
> > be very cheap.
>
> Let's be very careful before making rcu read locks costly. Any reduction
> in grace periods would be great, but IMO RCU should not be used in cases
> where performance depends on the freed data remaining in cache.
Indeed!
But if you were in a situation where read-side overhead was irrelevant
(perhaps a mythical machine with zero-cost atomics and cache misses),
then one approach would be to combine Oleg Nesterov's QRCU with the
callback processing from Andrea Arcangeli's implementation from the 2001
timeframe. Of course, if your cache misses really were zero cost,
then you wouldn't care about the data remaining in cache. So maybe
a machine where cache misses to other CPUs' caches are free, but misses
to main memory are horribly expensive?
Anyway, the trick would be to adapt QRCU (http://lkml.org/lkml/2007/2/25/18)
to store the index in the task structure (as opposed to returning it
from rcu_read_lock()), and have a single global queue of callbacks,
guarded by a global lock. Then rcu_read_unlock() can initiate callback
processing if the counter decrements down to zero, and call_rcu() would
also initiate a counter switch in the case where the non-current counter
was zero -- and this operation would be guarded by the same lock that
guards the callback queue.
But I doubt that this would be satisfactory on 4,096-CPU machines.
At least not in most cases. ;-)
Thanx, Paul
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-25 15:12 ` Paul E. McKenney
2008-08-25 15:22 ` Peter Zijlstra
@ 2008-08-25 15:44 ` Christoph Lameter
2008-08-25 20:05 ` Paul E. McKenney
1 sibling, 1 reply; 33+ messages in thread
From: Christoph Lameter @ 2008-08-25 15:44 UTC (permalink / raw)
To: paulmck
Cc: Peter Zijlstra, Pekka Enberg, Ingo Molnar, Jeremy Fitzhardinge,
Nick Piggin, Andi Kleen, Pallipadi, Venkatesh, Suresh Siddha,
Jens Axboe, Rusty Russell, Linux Kernel Mailing List
Paul E. McKenney wrote:
> But I am not sure that this gets the grace periods to go fast enough to
> cover Christoph's use case -- he seems to be in a "faster is better"
> space rather than in an "at least this fast" space. Still, it would
> likely help in some important cases.
I think there was an AIM9 regression in the close/open tests when the struct
file was switched to RCU. That test could be run with various intervals to
figure out if a shorter RCU period is beneficial and how short an RCU period
is needed to avoid the regression.
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu
2008-08-25 15:44 ` [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu Christoph Lameter
@ 2008-08-25 20:05 ` Paul E. McKenney
0 siblings, 0 replies; 33+ messages in thread
From: Paul E. McKenney @ 2008-08-25 20:05 UTC (permalink / raw)
To: Christoph Lameter
Cc: Peter Zijlstra, Pekka Enberg, Ingo Molnar, Jeremy Fitzhardinge,
Nick Piggin, Andi Kleen, Pallipadi, Venkatesh, Suresh Siddha,
Jens Axboe, Rusty Russell, Linux Kernel Mailing List
On Mon, Aug 25, 2008 at 10:44:03AM -0500, Christoph Lameter wrote:
> Paul E. McKenney wrote:
>
> > But I am not sure that this gets the grace periods to go fast enough to
> > cover Christoph's use case -- he seems to be in a "faster is better"
> > space rather than in an "at least this fast" space. Still, it would
> > likely help in some important cases.
>
> I think there was an AIM9 regression in the close/open tests when the struct
> file was switched to RCU. That test could be run with various intervals to
> figure out if a shorter RCU period is beneficial and how short an RCU period
> is needed to avoid the regression.
Well, with some luck, I can get an RCU implementation that allows the
grace-period duration to be varied, which would allow someone to check
the AIM9 regression.
Thanx, Paul
^ permalink raw reply [flat|nested] 33+ messages in thread
end of thread, other threads:[~2008-08-27 15:17 UTC | newest]
Thread overview: 33+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-08-22 0:29 [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu Jeremy Fitzhardinge
2008-08-22 1:53 ` Nick Piggin
2008-08-22 6:28 ` Ingo Molnar
2008-08-22 7:06 ` Pekka Enberg
2008-08-22 7:12 ` Ingo Molnar
2008-08-22 9:12 ` Nick Piggin
2008-08-22 14:01 ` Christoph Lameter
2008-08-22 15:11 ` Paul E. McKenney
2008-08-22 17:14 ` Christoph Lameter
2008-08-22 18:29 ` Paul E. McKenney
2008-08-22 18:33 ` Andi Kleen
2008-08-22 18:35 ` Jeremy Fitzhardinge
2008-08-23 7:34 ` Andi Kleen
2008-08-24 4:55 ` Jeremy Fitzhardinge
2008-08-24 9:01 ` Andi Kleen
2008-08-22 22:40 ` Paul E. McKenney
2008-08-22 18:36 ` Christoph Lameter
2008-08-22 19:52 ` Paul E. McKenney
2008-08-22 20:03 ` Christoph Lameter
2008-08-22 20:53 ` Paul E. McKenney
2008-08-25 10:31 ` Peter Zijlstra
2008-08-25 15:12 ` Paul E. McKenney
2008-08-25 15:22 ` Peter Zijlstra
2008-08-25 15:46 ` Christoph Lameter
2008-08-25 15:51 ` Peter Zijlstra
2008-08-26 13:43 ` Paul E. McKenney
2008-08-26 14:07 ` Peter Zijlstra
2008-08-27 15:16 ` Paul E. McKenney
2008-08-25 20:04 ` Paul E. McKenney
2008-08-26 5:13 ` Nick Piggin
2008-08-26 13:40 ` [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu Paul E. McKenney
2008-08-25 15:44 ` [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu Christoph Lameter
2008-08-25 20:05 ` Paul E. McKenney
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox