Re: [PATCH v2 5/7] xen: credit1: increase efficiency and scalability of load balancing.

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Dario Faggioli <dario.faggioli@citrix.com>
To: George Dunlap <george.dunlap@citrix.com>, xen-devel@lists.xenproject.org
Cc: George Dunlap <george.dunlap@eu.citrix.com>,
	Andrew Cooper <andrew.cooper3@citrix.com>,
	Anshul Makkar <anshul.makkar@citrix.com>
Subject: Re: [PATCH v2 5/7] xen: credit1: increase efficiency and scalability of load balancing.
Date: Fri, 7 Apr 2017 17:49:02 +0200	[thread overview]
Message-ID: <1491580142.3287.20.camel@citrix.com> (raw)
In-Reply-To: <1491578776.3287.18.camel@citrix.com>


[-- Attachment #1.1: Type: text/plain, Size: 9505 bytes --]

On Fri, 2017-04-07 at 17:26 +0200, Dario Faggioli wrote:
> On Fri, 2017-04-07 at 16:17 +0100, George Dunlap wrote:
> > It seems like trying to sort out most of the refcounting
> > inside fo those two functions (perhaps with runq_insert() which did
> > reference counting, and __runq_insert() that didn't, or
> > something  like
> > that) would be a better approach.
> > 
> Right. I've tried already, but without success, and I had to stop an
> resort to what's in this patch.
> 
> As I said, I am ok with this approach, so I just went for it. I can
> try
> to think harder at whether it is really possible to do something like
> you suggest above... lemme try.
> 
So, how about something like the below?

For almost all cases, all it's done inside runqueue remove and insert
helpers.

There still are two special cases, though, one where I must explicitly
call inc_nr_runnable(), the other where I need dec_nr_runnable().

I've added comments trying to explain why that is the case.

Let me know.

Thanks and Regards,
Dario
---
diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c
index 59b87f7..a0ad167 100644
--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -172,6 +172,7 @@ struct csched_pcpu {
     struct timer ticker;
     unsigned int tick;
     unsigned int idle_bias;
+    unsigned int nr_runnable;
 };
 
 /*
@@ -262,9 +263,26 @@ static inline bool_t is_runq_idle(unsigned int cpu)
 }
 
 static inline void
+inc_nr_runnable(unsigned int cpu)
+{
+    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));
+    CSCHED_PCPU(cpu)->nr_runnable++;
+
+}
+
+static inline void
+dec_nr_runnable(unsigned int cpu)
+{
+    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));
+    CSCHED_PCPU(cpu)->nr_runnable--;
+    ASSERT(CSCHED_PCPU(cpu)->nr_runnable >= 0);
+}
+
+static inline void
 __runq_insert(struct csched_vcpu *svc)
 {
-    const struct list_head * const runq = RUNQ(svc->vcpu->processor);
+    unsigned int cpu = svc->vcpu->processor;
+    const struct list_head * const runq = RUNQ(cpu);
     struct list_head *iter;
 
     BUG_ON( __vcpu_on_runq(svc) );
@@ -292,12 +310,25 @@ __runq_insert(struct csched_vcpu *svc)
 }
 
 static inline void
+runq_insert(struct csched_vcpu *svc)
+{
+    __runq_insert(svc);
+    inc_nr_runnable(svc->vcpu->processor);
+}
+
+static inline void
 __runq_remove(struct csched_vcpu *svc)
 {
     BUG_ON( !__vcpu_on_runq(svc) );
     list_del_init(&svc->runq_elem);
 }
 
+static inline void
+runq_remove(struct csched_vcpu *svc)
+{
+    dec_nr_runnable(svc->vcpu->processor);
+    __runq_remove(svc);
+}
 
 #define for_each_csched_balance_step(step) \
     for ( (step) = 0; (step) <= CSCHED_BALANCE_HARD_AFFINITY; (step)++ )
@@ -601,6 +632,7 @@ init_pdata(struct csched_private *prv, struct csched_pcpu *spc, int cpu)
     /* Start off idling... */
     BUG_ON(!is_idle_vcpu(curr_on_cpu(cpu)));
     cpumask_set_cpu(cpu, prv->idlers);
+    spc->nr_runnable = 0;
 }
 
 static void
@@ -1052,7 +1084,7 @@ csched_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
     lock = vcpu_schedule_lock_irq(vc);
 
     if ( !__vcpu_on_runq(svc) && vcpu_runnable(vc) && !vc->is_running )
-        __runq_insert(svc);
+        runq_insert(svc);
 
     vcpu_schedule_unlock_irq(lock, vc);
 
@@ -1117,7 +1149,7 @@ csched_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
         cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
     }
     else if ( __vcpu_on_runq(svc) )
-        __runq_remove(svc);
+        runq_remove(svc);
 }
 
 static void
@@ -1177,7 +1209,7 @@ csched_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
     }
 
     /* Put the VCPU on the runq and tickle CPUs */
-    __runq_insert(svc);
+    runq_insert(svc);
     __runq_tickle(svc);
 }
 
@@ -1679,8 +1711,14 @@ csched_runq_steal(int peer_cpu, int cpu, int pri, int balance_step)
             SCHED_VCPU_STAT_CRANK(speer, migrate_q);
             SCHED_STAT_CRANK(migrate_queued);
             WARN_ON(vc->is_urgent);
-            __runq_remove(speer);
+            runq_remove(speer);
             vc->processor = cpu;
+            /*
+             * speer will start executing directly on cpu, without having to
+             * go through runq_insert(). So we must update the runnable count
+             * for cpu here.
+             */
+            inc_nr_runnable(cpu);
             return speer;
         }
     }
@@ -1736,7 +1774,7 @@ csched_load_balance(struct csched_private *prv, int cpu,
         peer_node = node;
         do
         {
-            /* Find out what the !idle are in this node */
+            /* Select the pCPUs in this node that have work we can steal. */
             cpumask_andnot(&workers, online, prv->idlers);
             cpumask_and(&workers, &workers, &node_to_cpumask(peer_node));
             __cpumask_clear_cpu(cpu, &workers);
@@ -1746,6 +1784,40 @@ csched_load_balance(struct csched_private *prv, int cpu,
                 goto next_node;
             do
             {
+                spinlock_t *lock;
+
+                /*
+                 * If there is only one runnable vCPU on peer_cpu, it means
+                 * there's no one to be stolen in its runqueue, so skip it.
+                 *
+                 * Checking this without holding the lock is racy... But that's
+                 * the whole point of this optimization!
+                 *
+                 * In more details:
+                 * - if we race with dec_nr_runnable(), we may try to take the
+                 *   lock and call csched_runq_steal() for no reason. This is
+                 *   not a functional issue, and should be infrequent enough.
+                 *   And we can avoid that by re-checking nr_runnable after
+                 *   having grabbed the lock, if we want;
+                 * - if we race with inc_nr_runnable(), we skip a pCPU that may
+                 *   have runnable vCPUs in its runqueue, but that's not a
+                 *   problem because:
+                 *   + if racing with csched_vcpu_insert() or csched_vcpu_wake(),
+                 *     __runq_tickle() will be called afterwords, so the vCPU
+                 *     won't get stuck in the runqueue for too long;
+                 *   + if racing with csched_runq_steal(), it may be that a
+                 *     vCPU that we could have picked up, stays in a runqueue
+                 *     until someone else tries to steal it again. But this is
+                 *     no worse than what can happen already (without this
+                 *     optimization), it the pCPU would schedule right after we
+                 *     have taken the lock, and hence block on it.
+                 */
+                if ( CSCHED_PCPU(peer_cpu)->nr_runnable <= 1 )
+                {
+                    TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skipp'n */ 0);
+                    goto next_cpu;
+                }
+
                 /*
                  * Get ahold of the scheduler lock for this peer CPU.
                  *
@@ -1753,14 +1825,13 @@ csched_load_balance(struct csched_private *prv, int cpu,
                  * could cause a deadlock if the peer CPU is also load
                  * balancing and trying to lock this CPU.
                  */
-                spinlock_t *lock = pcpu_schedule_trylock(peer_cpu);
+                lock = pcpu_schedule_trylock(peer_cpu);
                 SCHED_STAT_CRANK(steal_trylock);
                 if ( !lock )
                 {
                     SCHED_STAT_CRANK(steal_trylock_failed);
                     TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skip */ 0);
-                    peer_cpu = cpumask_cycle(peer_cpu, &workers);
-                    continue;
+                    goto next_cpu;
                 }
 
                 TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* checked */ 1);
@@ -1777,6 +1848,7 @@ csched_load_balance(struct csched_private *prv, int cpu,
                     return speer;
                 }
 
+ next_cpu:
                 peer_cpu = cpumask_cycle(peer_cpu, &workers);
 
             } while( peer_cpu != cpumask_first(&workers) );
@@ -1907,7 +1979,11 @@ csched_schedule(
     if ( vcpu_runnable(current) )
         __runq_insert(scurr);
     else
+    {
         BUG_ON( is_idle_vcpu(current) || list_empty(runq) );
+        /* Current has blocked. Update the runnable counter for this cpu. */
+        dec_nr_runnable(cpu);
+    }
 
     snext = __runq_elem(runq->next);
     ret.migrated = 0;
@@ -2024,7 +2100,8 @@ csched_dump_pcpu(const struct scheduler *ops, int cpu)
     runq = &spc->runq;
 
     cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_mask, cpu));
-    printk("CPU[%02d] sort=%d, sibling=%s, ", cpu, spc->runq_sort_last, cpustr);
+    printk("CPU[%02d] nr_run=%d, sort=%d, sibling=%s, ",
+           cpu, spc->nr_runnable, spc->runq_sort_last, cpustr);
     cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_mask, cpu));
     printk("core=%s\n", cpustr);
 
-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

[-- Attachment #2: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

next prev parent reply	other threads:[~2017-04-07 15:49 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-04-06  8:16 [PATCH v2 0/7] xen: sched: improve scalability of Credit1, and optimize a bit both Credit1 and Credit2 Dario Faggioli
2017-04-06  8:16 ` [PATCH v2 1/7] xen: credit1: simplify csched_runq_steal() a little bit Dario Faggioli
2017-04-07 10:45   ` George Dunlap
2017-04-07 11:00     ` Dario Faggioli
2017-04-06  8:16 ` [PATCH v2 2/7] xen: credit: (micro) optimize csched_runq_steal() Dario Faggioli
2017-04-07 10:49   ` George Dunlap
2017-04-07 11:01     ` Dario Faggioli
2017-04-06  8:16 ` [PATCH v2 3/7] xen: credit: consider tickled pCPUs as busy Dario Faggioli
2017-04-07 10:56   ` George Dunlap
2017-04-07 12:53     ` Dario Faggioli
2017-04-06  8:16 ` [PATCH v2 4/7] xen/tools: tracing: add record for credit1 runqueue stealing Dario Faggioli
2017-04-07 11:01   ` George Dunlap
2017-04-07 13:06     ` Dario Faggioli
2017-04-06  8:16 ` [PATCH v2 5/7] xen: credit1: increase efficiency and scalability of load balancing Dario Faggioli
2017-04-07 14:38   ` George Dunlap
2017-04-07 14:49     ` Dario Faggioli
2017-04-07 15:17       ` George Dunlap
2017-04-07 15:26         ` Dario Faggioli
2017-04-07 15:49           ` Dario Faggioli [this message]
2017-04-07 16:25             ` Dario Faggioli
2017-04-06  8:16 ` [PATCH v2 6/7] xen: credit1: treat pCPUs more evenly during balancing Dario Faggioli
2017-04-07 14:44   ` George Dunlap
2017-04-06  8:17 ` [PATCH v2 7/7] xen: credit2: avoid cpumask_any() in pick_cpu() Dario Faggioli
2017-04-07 14:48   ` George Dunlap

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:59b87f7 dfblob:a0ad167 )
 OR (
bs:"Re: [PATCH v2 5/7] xen: credit1: increase efficiency and scalability of load balancing." )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1491580142.3287.20.camel@citrix.com \
    --to=dario.faggioli@citrix.com \
    --cc=andrew.cooper3@citrix.com \
    --cc=anshul.makkar@citrix.com \
    --cc=george.dunlap@citrix.com \
    --cc=george.dunlap@eu.citrix.com \
    --cc=xen-devel@lists.xenproject.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.