All of lore.kernel.org
 help / color / mirror / Atom feed
From: Lai Jiangshan <laijs@cn.fujitsu.com>
To: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: <linux-kernel@vger.kernel.org>, Tejun Heo <tj@kernel.org>,
	Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>,
	"Gu, Zheng" <guz.fnst@cn.fujitsu.com>,
	tangchen <tangchen@cn.fujitsu.com>
Subject: Re: [PATCH 4/4] workqueue: handle change in cpu-node relationship.
Date: Mon, 15 Dec 2014 10:12:24 +0800	[thread overview]
Message-ID: <548E4388.5090308@cn.fujitsu.com> (raw)
In-Reply-To: <548C6B72.5080302@jp.fujitsu.com>

On 12/14/2014 12:38 AM, Kamezawa Hiroyuki wrote:
> Although workqueue detects relationship between cpu<->node at boot,
> it is finally determined in cpu_up().
> This patch tries to update pool->node using online status of cpus.
> 
> 1. When a node goes down, clear per-cpu pool's node attr.
> 2. When a cpu comes up, update per-cpu pool's node attr.
> 3. When a cpu comes up, update possinle node cpumask workqueue is using for sched.
> 4. Detect the best node for unbound pool's cpumask using the latest info.
> 
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> ---
>  kernel/workqueue.c | 67 ++++++++++++++++++++++++++++++++++++++++++------------
>  1 file changed, 53 insertions(+), 14 deletions(-)
> 
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index 07b4eb5..259b3ba 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -266,7 +266,8 @@ struct workqueue_struct {
>  static struct kmem_cache *pwq_cache;
>  
>  static cpumask_var_t *wq_numa_possible_cpumask;
> -					/* possible CPUs of each node */
> +	/* possible CPUs of each node initialized with possible info at boot.
> +           but modified at cpu hotplug to be adjusted to real info.  */
>  
>  static bool wq_disable_numa;
>  module_param_named(disable_numa, wq_disable_numa, bool, 0444);
> @@ -3449,6 +3450,31 @@ static void put_unbound_pool(struct worker_pool *pool)
>  	call_rcu_sched(&pool->rcu, rcu_free_pool);
>  }
>  
> +/*
> + * detect best node for given cpumask.
> + */
> +static int pool_detect_best_node(const struct cpumask *cpumask)
> +{
> +	int node, best, match, selected;
> +	static struct cpumask andmask; /* we're under mutex */
> +
> +	/* Is any node okay ? */
> +	if (!wq_numa_enabled ||
> +	    cpumask_subset(cpu_online_mask, cpumask))
> +		return NUMA_NO_NODE;
> +	best = 0;
> +	selected = NUMA_NO_NODE;
> +	/* select a node which contains the most cpu of cpumask */
> +	for_each_node_state(node, N_ONLINE) {
> +		cpumask_and(&andmask, cpumask, cpumask_of_node(node));
> +		match = cpumask_weight(&andmask);
> +		if (match > best)
> +			selected = node;
> +	}
> +	return selected;
> +}
> +
> +
>  /**
>   * get_unbound_pool - get a worker_pool with the specified attributes
>   * @attrs: the attributes of the worker_pool to get
> @@ -3467,7 +3493,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
>  {
>  	u32 hash = wqattrs_hash(attrs);
>  	struct worker_pool *pool;
> -	int node;
>  
>  	lockdep_assert_held(&wq_pool_mutex);
>  
> @@ -3492,17 +3517,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
>  	 * 'struct workqueue_attrs' comments for detail.
>  	 */
>  	pool->attrs->no_numa = false;
> -
> -	/* if cpumask is contained inside a NUMA node, we belong to that node */
> -	if (wq_numa_enabled) {
> -		for_each_node(node) {
> -			if (cpumask_subset(pool->attrs->cpumask,
> -					   wq_numa_possible_cpumask[node])) {
> -				pool->node = node;
> -				break;
> -			}
> -		}
> -	}
> +	pool->node = pool_detect_best_node(pool->attrs->cpumask);
>  
>  	if (worker_pool_assign_id(pool) < 0)
>  		goto fail;
> @@ -4567,7 +4582,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
>  	int cpu = (unsigned long)hcpu;
>  	struct worker_pool *pool;
>  	struct workqueue_struct *wq;
> -	int pi;
> +	int pi, node;
>  
>  	switch (action & ~CPU_TASKS_FROZEN) {
>  	case CPU_UP_PREPARE:
> @@ -4583,6 +4598,16 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
>  	case CPU_ONLINE:
>  		mutex_lock(&wq_pool_mutex);
>  
> +		/* now cpu <-> node info is established, update the info. */
> +		if (!wq_disable_numa) {



> +			for_each_node_state(node, N_POSSIBLE)
> +				cpumask_clear_cpu(cpu,
> +					wq_numa_possible_cpumask[node]);

The wq code try to reuse the origin pwqs/pools when the node still have cpu online.
these 3 lines of code will cause the origin pwqs/pools be on the road of dying, and
create a new set of pwqs/pools.

> +			node = cpu_to_node(cpu);
> +			cpumask_set_cpu(cpu, wq_numa_possible_cpumask[node]);
> +		}
> +		for_each_cpu_worker_pool(pool, cpu)
> +			pool->node = cpu_to_node(cpu);
>  		for_each_pool(pool, pi) {
>  			mutex_lock(&pool->attach_mutex);
>  
> @@ -4951,7 +4976,21 @@ void workqueue_register_numanode(int nid)
>  void workqueue_unregister_numanode(int nid)
>  {
>  	struct workqueue_struct *wq;
> +	const struct cpumask *nodecpumask;
> +	struct worker_pool *pool;
> +	int cpu;
>  
> +	/* at this point, cpu-to-node relationship is not lost */
> +	nodecpumask = cpumask_of_node(nid);
> +	for_each_cpu(cpu, nodecpumask) {
> +		/*
> +		 * pool is allcated at boot and assumed to be persistent,
> +		 * we cannot free this.
> +		 * Update to be NUMA_NO_NODE. This will be fixed at ONLINE
> +		 */
> +		for_each_cpu_worker_pool(pool, cpu)
> +			pool->node = NUMA_NO_NODE;
> +	}
>  	mutex_lock(&wq_pool_mutex);
>  	list_for_each_entry(wq, &workqueues, list)
>  		wq_release_unbound_numa(wq, nid);


  reply	other threads:[~2014-12-15  2:08 UTC|newest]

Thread overview: 55+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-12-12 10:19 [PATCH 0/5] workqueue: fix bug when numa mapping is changed Lai Jiangshan
2014-12-12 10:19 ` [PATCH 1/5] workqueue: fix memory leak in wq_numa_init() Lai Jiangshan
2014-12-12 17:12   ` Tejun Heo
2014-12-15  5:25     ` Lai Jiangshan
2014-12-12 10:19 ` [PATCH 2/5] workqueue: update wq_numa_possible_cpumask Lai Jiangshan
2014-12-12 17:18   ` Tejun Heo
2014-12-15  2:02     ` Lai Jiangshan
2014-12-25 20:16       ` Tejun Heo
2014-12-18  2:22   ` Lai Jiangshan
2014-12-12 10:19 ` [PATCH 3/5] workqueue: fixup existing pool->node Lai Jiangshan
2014-12-12 17:25   ` Tejun Heo
2014-12-15  1:23     ` Lai Jiangshan
2014-12-25 20:14       ` Tejun Heo
2015-01-13  7:08         ` Lai Jiangshan
2015-01-13 15:24           ` Tejun Heo
2014-12-12 10:19 ` [PATCH 4/5] workqueue: update NUMA affinity for the node lost CPU Lai Jiangshan
2014-12-12 17:27   ` Tejun Heo
2014-12-15  1:28     ` Lai Jiangshan
2014-12-25 20:17       ` Tejun Heo
2014-12-12 10:19 ` [PATCH 5/5] workqueue: retry on NUMA_NO_NODE when create_worker() fails Lai Jiangshan
2014-12-12 16:05   ` KOSAKI Motohiro
2014-12-12 17:29     ` KOSAKI Motohiro
2014-12-12 17:29   ` Tejun Heo
2014-12-12 17:13 ` [PATCH 0/5] workqueue: fix bug when numa mapping is changed Yasuaki Ishimatsu
2014-12-15  1:34   ` Lai Jiangshan
2014-12-18  1:50     ` Yasuaki Ishimatsu
2014-12-13 16:27 ` [PATCH 0/4] workqueue: fix bug when numa mapping is changed v2 Kamezawa Hiroyuki
2014-12-13 16:30   ` [PATCH 1/4] workqueue: add a hook for node hotplug Kamezawa Hiroyuki
2014-12-13 16:33   ` [PATCH 2/4] workqueue: add warning if pool->node is offline Kamezawa Hiroyuki
2014-12-13 16:35   ` [PATCH 3/4] workqueue: remove per-node unbound pool when node goes offline Kamezawa Hiroyuki
2014-12-15  2:06     ` Lai Jiangshan
2014-12-15  2:06       ` Kamezawa Hiroyuki
2014-12-13 16:38   ` [PATCH 4/4] workqueue: handle change in cpu-node relationship Kamezawa Hiroyuki
2014-12-15  2:12     ` Lai Jiangshan [this message]
2014-12-15  2:20       ` Kamezawa Hiroyuki
2014-12-15  2:48         ` Lai Jiangshan
2014-12-15  2:55           ` Kamezawa Hiroyuki
2014-12-15  3:30             ` Lai Jiangshan
2014-12-15  3:34             ` Lai Jiangshan
2014-12-15  4:04               ` Kamezawa Hiroyuki
2014-12-15  5:19                 ` Lai Jiangshan
2014-12-15  5:33                   ` Kamezawa Hiroyuki
2014-12-15 11:11   ` [PATCH 0/4] workqueue: fix memory allocation after numa mapping is changed v3 Kamezawa Hiroyuki
2014-12-15 11:14     ` [PATCH 1/4] workqueue:Fix unbound workqueue's node affinity detection Kamezawa Hiroyuki
2014-12-16  5:30       ` Lai Jiangshan
2014-12-16  7:32         ` Kamezawa Hiroyuki
2014-12-16  7:54           ` Lai Jiangshan
2014-12-15 11:16     ` [PATCH 2/4] workqueue: update per-cpu workqueue's node affinity at,online-offline Kamezawa Hiroyuki
2014-12-16  5:32       ` Lai Jiangshan
2014-12-16  7:25         ` Kamezawa Hiroyuki
2014-12-15 11:18     ` [PATCH 3/4] workqueue: Update workqueue's possible cpumask when a new node, coming up Kamezawa Hiroyuki
2014-12-16  7:49       ` Lai Jiangshan
2014-12-16  8:10         ` Kamezawa Hiroyuki
2014-12-16  8:18           ` Kamezawa Hiroyuki
2014-12-15 11:22     ` [PATCH 4/4] workqueue: Handle cpu-node affinity change at CPU_ONLINE Kamezawa Hiroyuki

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=548E4388.5090308@cn.fujitsu.com \
    --to=laijs@cn.fujitsu.com \
    --cc=guz.fnst@cn.fujitsu.com \
    --cc=isimatu.yasuaki@jp.fujitsu.com \
    --cc=kamezawa.hiroyu@jp.fujitsu.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=tangchen@cn.fujitsu.com \
    --cc=tj@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.