[PATCH RESEND 05/12] xen: numa-sched: make space for per-vcpu node-affinity

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Dario Faggioli <dario.faggioli@citrix.com>
To: xen-devel@lists.xen.org
Cc: Marcus Granado <Marcus.Granado@eu.citrix.com>,
	Keir Fraser <keir@xen.org>,
	Ian Campbell <Ian.Campbell@citrix.com>,
	Li Yechen <lccycc123@gmail.com>,
	George Dunlap <george.dunlap@eu.citrix.com>,
	Andrew Cooper <Andrew.Cooper3@citrix.com>,
	Juergen Gross <juergen.gross@ts.fujitsu.com>,
	Ian Jackson <Ian.Jackson@eu.citrix.com>,
	Jan Beulich <JBeulich@suse.com>,
	Justin Weaver <jtweaver@hawaii.edu>,
	Daniel De Graaf <dgdegra@tycho.nsa.gov>,
	Matt Wilson <msw@amazon.com>,
	Elena Ufimtseva <ufimtseva@gmail.com>
Subject: [PATCH RESEND 05/12] xen: numa-sched: make space for per-vcpu node-affinity
Date: Tue, 05 Nov 2013 15:35:00 +0100	[thread overview]
Message-ID: <20131105143500.30446.9976.stgit@Solace> (raw)
In-Reply-To: <20131105142844.30446.78671.stgit@Solace>

Before this change, each vcpu had its own vcpu-affinity (also
called pinning), while the whole domain had a NUMA node-affinity.
Of course, as the (credit) scheduler schedules vcpus and not
whole domains, this means that all the vcpus of a domain had
the same NUMA node-affinity.

This change is the first step toward overcoming such limitation.
It adds the data structures for storing the node-affinity on a
per-vcpu basis (along with allocating and initializing it).

As far as this change only is concerned, there is no specific
way to change the node-affinity of a vcpu to something which
is not automatically computed (basing on its vcpu-affinity).
Such logic is being introduced in subsequent commits.

Also, now that each vcpu has its own node-affinity, and in
case the domain's node-affinity is set to 'automatically
computed', we build it up as the union of all the
node-affinities of all the vcpus of the domain.

Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
---
 xen/common/domain.c     |   39 ++++++++++++++++++++++++++++++++-----
 xen/common/keyhandler.c |    6 +++++-
 xen/common/schedule.c   |   50 +++++++++++++++++++++++++++++++++++++++++++++++
 xen/include/xen/sched.h |   10 +++++++++
 4 files changed, 99 insertions(+), 6 deletions(-)

diff --git a/xen/common/domain.c b/xen/common/domain.c
index af31ab4..8d2ff49 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -128,6 +128,7 @@ struct vcpu *alloc_vcpu(
     if ( !zalloc_cpumask_var(&v->cpu_affinity) ||
          !zalloc_cpumask_var(&v->cpu_affinity_tmp) ||
          !zalloc_cpumask_var(&v->cpu_affinity_saved) ||
+         !zalloc_cpumask_var(&v->node_affinity) ||
          !zalloc_cpumask_var(&v->vcpu_dirty_cpumask) )
         goto fail_free;
 
@@ -159,6 +160,7 @@ struct vcpu *alloc_vcpu(
         free_cpumask_var(v->cpu_affinity);
         free_cpumask_var(v->cpu_affinity_tmp);
         free_cpumask_var(v->cpu_affinity_saved);
+        free_cpumask_var(v->node_affinity);
         free_cpumask_var(v->vcpu_dirty_cpumask);
         free_vcpu_struct(v);
         return NULL;
@@ -353,7 +355,7 @@ void domain_update_node_affinity(struct domain *d)
     cpumask_var_t online_affinity;
     const cpumask_t *online;
     struct vcpu *v;
-    unsigned int node;
+    unsigned int cpu;
 
     if ( !zalloc_cpumask_var(&cpumask) )
         return;
@@ -367,9 +369,36 @@ void domain_update_node_affinity(struct domain *d)
 
     spin_lock(&d->node_affinity_lock);
 
+    /*
+     * Let's prepare the cpumask that will be used below to actually update
+     * the node-affinity of the whole domain. Each vcpu has a vcpu-affinity
+     * and a numa-affinity. What gets built in cpumask (and used below) is
+     * the union of all the (online) cpus in all the vcpu's numa-affinity
+     * masks.
+     *
+     * On its turn, the numa-affinity mask of the i-eth vcpu (say, 'v') is
+     * either derived directly from the vcpu's vcpu-affinity mask (in case
+     * v->auto_node_affinity is true) or has its own value, (potentially)
+     * completely independent from v->cpu_affinity. In the former case, it
+     * is here that we make sure the two affinity masks matches (since this
+     * function gets called in correspondence of each modification to
+     * v->cpu_affinity happening in vcpu_set_affinity() ); in the latter
+     * case, we just leave v->node_affinity alone.
+     */
     for_each_vcpu ( d, v )
     {
-        cpumask_and(online_affinity, v->cpu_affinity, online);
+        if ( v->auto_node_affinity )
+        {
+            cpumask_clear(v->node_affinity);
+            for_each_cpu ( cpu, v->cpu_affinity )
+                cpumask_or(v->node_affinity, v->node_affinity,
+                           &node_to_cpumask(cpu_to_node(cpu)));
+
+            cpumask_and(online_affinity, v->node_affinity, online);
+        }
+        else
+            cpumask_copy(online_affinity, v->node_affinity);
+
         cpumask_or(cpumask, cpumask, online_affinity);
     }
 
@@ -383,9 +412,8 @@ void domain_update_node_affinity(struct domain *d)
     if ( d->auto_node_affinity )
     {
         nodes_clear(d->node_affinity);
-        for_each_online_node ( node )
-            if ( cpumask_intersects(&node_to_cpumask(node), cpumask) )
-                node_set(node, d->node_affinity);
+        for_each_cpu ( cpu, cpumask )
+            node_set(cpu_to_node(cpu), d->node_affinity);
     }
 
     sched_set_node_affinity(d, &d->node_affinity);
@@ -734,6 +762,7 @@ static void complete_domain_destroy(struct rcu_head *head)
         {
             free_cpumask_var(v->cpu_affinity);
             free_cpumask_var(v->cpu_affinity_tmp);
+            free_cpumask_var(v->node_affinity);
             free_cpumask_var(v->vcpu_dirty_cpumask);
             free_vcpu_struct(v);
         }
diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c
index 8e4b3f8..8d5e8b2 100644
--- a/xen/common/keyhandler.c
+++ b/xen/common/keyhandler.c
@@ -297,7 +297,11 @@ static void dump_domains(unsigned char key)
             cpuset_print(tmpstr, sizeof(tmpstr), v->vcpu_dirty_cpumask);
             printk("dirty_cpus=%s ", tmpstr);
             cpuset_print(tmpstr, sizeof(tmpstr), v->cpu_affinity);
-            printk("cpu_affinity=%s\n", tmpstr);
+            printk("cpu_affinity=%s ", tmpstr);
+            cpuset_print(tmpstr, sizeof(tmpstr), v->node_affinity);
+            printk("node_affinity=%s%s\n",
+                   v->auto_node_affinity ? "(auto)" : "(manual)",
+                   tmpstr);
             printk("    pause_count=%d pause_flags=%lx\n",
                    atomic_read(&v->pause_count), v->pause_flags);
             arch_dump_vcpu_info(v);
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 0f45f07..b3966ad 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -198,6 +198,9 @@ int sched_init_vcpu(struct vcpu *v, unsigned int processor)
     else
         cpumask_setall(v->cpu_affinity);
 
+    v->auto_node_affinity = 1;
+    cpumask_copy(v->node_affinity, v->cpu_affinity);
+
     /* Initialise the per-vcpu timers. */
     init_timer(&v->periodic_timer, vcpu_periodic_timer_fn,
                v, v->processor);
@@ -684,6 +687,53 @@ int vcpu_set_affinity(struct vcpu *v, const cpumask_t *affinity)
     return 0;
 }
 
+int vcpu_set_node_affinity(struct vcpu *v, const nodemask_t *nodes)
+{
+    nodemask_t online_nodes;
+    int node;
+
+    nodes_and(online_nodes, node_online_map, *nodes);
+
+    /* Having no affinity at all is just wrong */
+    if ( nodes_empty(online_nodes) )
+        return -EINVAL;
+
+    spin_lock(&v->domain->node_affinity_lock);
+
+    /*
+     * Explicitly saying "all nodes" is not particularly useful here.
+     * Let's use it as the `reset numa-affinity to auto' command.
+     */
+    if ( nodes_full(*nodes) )
+    {
+        v->auto_node_affinity = 1;
+        goto out;
+    }
+
+    /*
+     * When someone asks for a specific numa-affinity for a vcpu we need to
+     * clear auto_node_affinity, convert the nodemask in online_nodes
+     * into a cpumask_t and store it in node_affinity.
+     */
+    v->auto_node_affinity = 0;
+
+    cpumask_clear(v->node_affinity);
+    for_each_node_mask( node, online_nodes )
+        cpumask_or(v->node_affinity, v->node_affinity,
+                   &node_to_cpumask(node));
+
+out:
+    spin_unlock(&v->domain->node_affinity_lock);
+
+    /*
+     * Changing the numa-affinity of a vcpu calls for an update
+     * of the node-affinity of the whole domain.
+     */
+    domain_update_node_affinity(v->domain);
+
+    return 0;
+}
+
 /* Block the currently-executing domain until a pertinent event occurs. */
 void vcpu_block(void)
 {
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 25bf637..732d6b6 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -172,6 +172,8 @@ struct vcpu
     /* VCPU need affinity restored */
     bool_t           affinity_broken;
 
+    /* is node_affinity (below) automatically computed from vcpu-affinity? */
+    bool_t           auto_node_affinity;
 
     /*
      * > 0: a single port is being polled;
@@ -197,6 +199,13 @@ struct vcpu
     /* Used to restore affinity across S3. */
     cpumask_var_t    cpu_affinity_saved;
 
+    /*
+     * Bitmask of CPUs on which this VCPU prefers to run. For both this
+     * and auto_node_affinity access is serialized against
+     * v->domain->node_affinity_lock.
+     */
+    cpumask_var_t    node_affinity;
+
     /* Bitmask of CPUs which are holding onto this VCPU's state. */
     cpumask_var_t    vcpu_dirty_cpumask;
 
@@ -740,6 +749,7 @@ int schedule_cpu_switch(unsigned int cpu, struct cpupool *c);
 void vcpu_force_reschedule(struct vcpu *v);
 int cpu_disable_scheduler(unsigned int cpu);
 int vcpu_set_affinity(struct vcpu *v, const cpumask_t *affinity);
+int vcpu_set_node_affinity(struct vcpu *v, const nodemask_t *nodes);
 void restore_vcpu_affinity(struct domain *d);
 
 void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate);

next prev parent reply	other threads:[~2013-11-05 14:35 UTC|newest]

Thread overview: 71+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-11-05 14:33 [PATCH RESEND 00/12] Implement per-vcpu NUMA node-affinity for credit1 Dario Faggioli
2013-11-05 14:34 ` [PATCH RESEND 01/12] xen: numa-sched: leave node-affinity alone if not in "auto" mode Dario Faggioli
2013-11-05 14:43   ` George Dunlap
2013-11-05 14:34 ` [PATCH RESEND 02/12] xl: allow for node-wise specification of vcpu pinning Dario Faggioli
2013-11-05 14:50   ` George Dunlap
2013-11-06  8:48     ` Dario Faggioli
2013-11-07 18:17   ` Ian Jackson
2013-11-08  9:24     ` Dario Faggioli
2013-11-08 15:20       ` Ian Jackson
2013-11-05 14:34 ` [PATCH RESEND 03/12] xl: implement and enable dryrun mode for `xl vcpu-pin' Dario Faggioli
2013-11-05 14:34 ` [PATCH RESEND 04/12] xl: test script for the cpumap parser (for vCPU pinning) Dario Faggioli
2013-11-05 14:35 ` Dario Faggioli [this message]
2013-11-05 14:52   ` [PATCH RESEND 05/12] xen: numa-sched: make space for per-vcpu node-affinity Jan Beulich
2013-11-05 15:03     ` George Dunlap
2013-11-05 15:11       ` Jan Beulich
2013-11-05 15:24         ` George Dunlap
2013-11-05 22:15         ` Dario Faggioli
2013-11-05 15:11       ` George Dunlap
2013-11-05 15:23         ` Jan Beulich
2013-11-05 15:39           ` George Dunlap
2013-11-05 16:56             ` George Dunlap
2013-11-05 17:16               ` George Dunlap
2013-11-05 17:30                 ` Jan Beulich
2013-11-05 23:12                   ` Dario Faggioli
2013-11-05 23:01                 ` Dario Faggioli
2013-11-06  9:39                 ` Dario Faggioli
2013-11-06  9:46                   ` Jan Beulich
2013-11-06 10:00                     ` Dario Faggioli
2013-11-06 11:44                       ` George Dunlap
2013-11-06 14:26                         ` Dario Faggioli
2013-11-06 14:56                           ` George Dunlap
2013-11-06 15:14                             ` Jan Beulich
2013-11-06 16:12                               ` George Dunlap
2013-11-06 16:22                                 ` Jan Beulich
2013-11-06 16:48                                 ` Dario Faggioli
2013-11-06 16:20                               ` Dario Faggioli
2013-11-06 16:23                             ` Dario Faggioli
2013-11-05 17:24               ` Jan Beulich
2013-11-05 17:31                 ` George Dunlap
2013-11-05 23:08               ` Dario Faggioli
2013-11-05 22:54             ` Dario Faggioli
2013-11-05 22:22         ` Dario Faggioli
2013-11-06 11:41         ` Dario Faggioli
2013-11-06 14:47           ` George Dunlap
2013-11-06 16:53             ` Dario Faggioli
2013-11-05 14:35 ` [PATCH RESEND 06/12] xen: numa-sched: domain node-affinity always comes from vcpu node-affinity Dario Faggioli
2013-11-05 14:35 ` [PATCH RESEND 07/12] xen: numa-sched: use per-vcpu node-affinity for actual scheduling Dario Faggioli
2013-11-05 16:20   ` George Dunlap
2013-11-06  9:15     ` Dario Faggioli
2013-11-05 14:35 ` [PATCH RESEND 08/12] xen: numa-sched: enable getting/specifying per-vcpu node-affinity Dario Faggioli
2013-11-05 14:35 ` [PATCH RESEND 09/12] libxc: " Dario Faggioli
2013-11-07 18:27   ` Ian Jackson
2013-11-12 16:01   ` Konrad Rzeszutek Wilk
2013-11-12 16:43     ` George Dunlap
2013-11-12 16:55       ` Konrad Rzeszutek Wilk
2013-11-12 18:40     ` Dario Faggioli
2013-11-12 19:13       ` Konrad Rzeszutek Wilk
2013-11-12 21:36         ` Dario Faggioli
2013-11-13 10:57         ` Dario Faggioli
2013-11-05 14:35 ` [PATCH RESEND 10/12] libxl: " Dario Faggioli
2013-11-07 18:29   ` Ian Jackson
2013-11-08  9:18     ` Dario Faggioli
2013-11-08 15:07       ` Ian Jackson
2013-11-05 14:36 ` [PATCH RESEND 11/12] xl: " Dario Faggioli
2013-11-07 18:33   ` Ian Jackson
2013-11-08  9:33     ` Dario Faggioli
2013-11-08 15:18       ` Ian Jackson
2013-11-05 14:36 ` [PATCH RESEND 12/12] xl: numa-sched: enable specifying node-affinity in VM config file Dario Faggioli
2013-11-07 18:35   ` Ian Jackson
2013-11-08  9:49     ` Dario Faggioli
2013-11-08 15:22       ` Ian Jackson

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:af31ab4 dfblob:8d2ff49 dfblob:8e4b3f8 dfblob:8d5e8b2
dfblob:0f45f07 dfblob:b3966ad dfblob:25bf637 dfblob:732d6b6 )
 OR (
bs:"[PATCH RESEND 05/12] xen: numa-sched: make space for per-vcpu node-affinity" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20131105143500.30446.9976.stgit@Solace \
    --to=dario.faggioli@citrix.com \
    --cc=Andrew.Cooper3@citrix.com \
    --cc=Ian.Campbell@citrix.com \
    --cc=Ian.Jackson@eu.citrix.com \
    --cc=JBeulich@suse.com \
    --cc=Marcus.Granado@eu.citrix.com \
    --cc=dgdegra@tycho.nsa.gov \
    --cc=george.dunlap@eu.citrix.com \
    --cc=jtweaver@hawaii.edu \
    --cc=juergen.gross@ts.fujitsu.com \
    --cc=keir@xen.org \
    --cc=lccycc123@gmail.com \
    --cc=msw@amazon.com \
    --cc=ufimtseva@gmail.com \
    --cc=xen-devel@lists.xen.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.