From mboxrd@z Thu Jan  1 00:00:00 1970
From: Dario Faggioli <dario.faggioli@citrix.com>
Subject: Re: [PATCH v2 5/7] xen: credit1: increase efficiency
 and scalability of load balancing.
Date: Fri, 7 Apr 2017 17:49:02 +0200
Message-ID: <1491580142.3287.20.camel@citrix.com>
References: <149146456487.21348.8554211499146017782.stgit@Solace.fritz.box>
 <149146660655.21348.13071925386790562321.stgit@Solace.fritz.box>
 <1568f464-29a4-6330-3902-a36f3eb62775@citrix.com>
 <1491576546.3287.16.camel@citrix.com>
 <6f4e282c-ee7d-0715-5177-999da533ae46@citrix.com>
 <1491578776.3287.18.camel@citrix.com>
Mime-Version: 1.0
Content-Type: multipart/mixed; boundary="===============6912804206734789983=="
Return-path: <xen-devel-bounces@lists.xen.org>
Received: from mail6.bemta5.messagelabs.com ([195.245.231.135])
 by lists.xenproject.org with esmtp (Exim 4.84_2)
 (envelope-from <prvs=2634eef2b=dario.faggioli@citrix.com>)
 id 1cwW8Q-0003P0-6i
 for xen-devel@lists.xenproject.org; Fri, 07 Apr 2017 15:49:14 +0000
In-Reply-To: <1491578776.3287.18.camel@citrix.com>
List-Unsubscribe: <https://lists.xen.org/cgi-bin/mailman/options/xen-devel>,
 <mailto:xen-devel-request@lists.xen.org?subject=unsubscribe>
List-Post: <mailto:xen-devel@lists.xen.org>
List-Help: <mailto:xen-devel-request@lists.xen.org?subject=help>
List-Subscribe: <https://lists.xen.org/cgi-bin/mailman/listinfo/xen-devel>,
 <mailto:xen-devel-request@lists.xen.org?subject=subscribe>
Errors-To: xen-devel-bounces@lists.xen.org
Sender: "Xen-devel" <xen-devel-bounces@lists.xen.org>
To: George Dunlap <george.dunlap@citrix.com>, xen-devel@lists.xenproject.org
Cc: George Dunlap <george.dunlap@eu.citrix.com>, Andrew Cooper <andrew.cooper3@citrix.com>, Anshul Makkar <anshul.makkar@citrix.com>
List-Id: xen-devel@lists.xenproject.org

--===============6912804206734789983==
Content-Type: multipart/signed; micalg=pgp-sha256;
	protocol="application/pgp-signature"; boundary="=-OeAwU+kutoCBFXw1cwOp"

--=-OeAwU+kutoCBFXw1cwOp
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable

On Fri, 2017-04-07 at 17:26 +0200, Dario Faggioli wrote:
> On Fri, 2017-04-07 at 16:17 +0100, George Dunlap wrote:
> > It seems like trying to sort out most of the refcounting
> > inside fo those two functions (perhaps with runq_insert() which did
> > reference counting, and __runq_insert() that didn't, or
> > something=C2=A0=C2=A0like
> > that) would be a better approach.
> >=20
> Right. I've tried already, but without success, and I had to stop an
> resort to what's in this patch.
>=20
> As I said, I am ok with this approach, so I just went for it. I can
> try
> to think harder at whether it is really possible to do something like
> you suggest above... lemme try.
>=20
So, how about something like the below?

For almost all cases, all it's done inside runqueue remove and insert
helpers.

There still are two special cases, though, one where I must explicitly
call inc_nr_runnable(), the other where I need dec_nr_runnable().

I've added comments trying to explain why that is the case.

Let me know.

Thanks and Regards,
Dario
---
diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c
index 59b87f7..a0ad167 100644
--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -172,6 +172,7 @@ struct csched_pcpu {
     struct timer ticker;
     unsigned int tick;
     unsigned int idle_bias;
+    unsigned int nr_runnable;
 };
=20
 /*
@@ -262,9 +263,26 @@ static inline bool_t is_runq_idle(unsigned int cpu)
 }
=20
 static inline void
+inc_nr_runnable(unsigned int cpu)
+{
+    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));
+    CSCHED_PCPU(cpu)->nr_runnable++;
+
+}
+
+static inline void
+dec_nr_runnable(unsigned int cpu)
+{
+    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));
+    CSCHED_PCPU(cpu)->nr_runnable--;
+    ASSERT(CSCHED_PCPU(cpu)->nr_runnable >=3D 0);
+}
+
+static inline void
 __runq_insert(struct csched_vcpu *svc)
 {
-    const struct list_head * const runq =3D RUNQ(svc->vcpu->processor);
+    unsigned int cpu =3D svc->vcpu->processor;
+    const struct list_head * const runq =3D RUNQ(cpu);
     struct list_head *iter;
=20
     BUG_ON( __vcpu_on_runq(svc) );
@@ -292,12 +310,25 @@ __runq_insert(struct csched_vcpu *svc)
 }
=20
 static inline void
+runq_insert(struct csched_vcpu *svc)
+{
+    __runq_insert(svc);
+    inc_nr_runnable(svc->vcpu->processor);
+}
+
+static inline void
 __runq_remove(struct csched_vcpu *svc)
 {
     BUG_ON( !__vcpu_on_runq(svc) );
     list_del_init(&svc->runq_elem);
 }
=20
+static inline void
+runq_remove(struct csched_vcpu *svc)
+{
+    dec_nr_runnable(svc->vcpu->processor);
+    __runq_remove(svc);
+}
=20
 #define for_each_csched_balance_step(step) \
     for ( (step) =3D 0; (step) <=3D CSCHED_BALANCE_HARD_AFFINITY; (step)++=
 )
@@ -601,6 +632,7 @@ init_pdata(struct csched_private *prv, struct csched_pc=
pu *spc, int cpu)
     /* Start off idling... */
     BUG_ON(!is_idle_vcpu(curr_on_cpu(cpu)));
     cpumask_set_cpu(cpu, prv->idlers);
+    spc->nr_runnable =3D 0;
 }
=20
 static void
@@ -1052,7 +1084,7 @@ csched_vcpu_insert(const struct scheduler *ops, struc=
t vcpu *vc)
     lock =3D vcpu_schedule_lock_irq(vc);
=20
     if ( !__vcpu_on_runq(svc) && vcpu_runnable(vc) && !vc->is_running )
-        __runq_insert(svc);
+        runq_insert(svc);
=20
     vcpu_schedule_unlock_irq(lock, vc);
=20
@@ -1117,7 +1149,7 @@ csched_vcpu_sleep(const struct scheduler *ops, struct=
 vcpu *vc)
         cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
     }
     else if ( __vcpu_on_runq(svc) )
-        __runq_remove(svc);
+        runq_remove(svc);
 }
=20
 static void
@@ -1177,7 +1209,7 @@ csched_vcpu_wake(const struct scheduler *ops, struct =
vcpu *vc)
     }
=20
     /* Put the VCPU on the runq and tickle CPUs */
-    __runq_insert(svc);
+    runq_insert(svc);
     __runq_tickle(svc);
 }
=20
@@ -1679,8 +1711,14 @@ csched_runq_steal(int peer_cpu, int cpu, int pri, in=
t balance_step)
             SCHED_VCPU_STAT_CRANK(speer, migrate_q);
             SCHED_STAT_CRANK(migrate_queued);
             WARN_ON(vc->is_urgent);
-            __runq_remove(speer);
+            runq_remove(speer);
             vc->processor =3D cpu;
+            /*
+             * speer will start executing directly on cpu, without having =
to
+             * go through runq_insert(). So we must update the runnable co=
unt
+             * for cpu here.
+             */
+            inc_nr_runnable(cpu);
             return speer;
         }
     }
@@ -1736,7 +1774,7 @@ csched_load_balance(struct csched_private *prv, int c=
pu,
         peer_node =3D node;
         do
         {
-            /* Find out what the !idle are in this node */
+            /* Select the pCPUs in this node that have work we can steal. =
*/
             cpumask_andnot(&workers, online, prv->idlers);
             cpumask_and(&workers, &workers, &node_to_cpumask(peer_node));
             __cpumask_clear_cpu(cpu, &workers);
@@ -1746,6 +1784,40 @@ csched_load_balance(struct csched_private *prv, int =
cpu,
                 goto next_node;
             do
             {
+                spinlock_t *lock;
+
+                /*
+                 * If there is only one runnable vCPU on peer_cpu, it mean=
s
+                 * there's no one to be stolen in its runqueue, so skip it=
.
+                 *
+                 * Checking this without holding the lock is racy... But t=
hat's
+                 * the whole point of this optimization!
+                 *
+                 * In more details:
+                 * - if we race with dec_nr_runnable(), we may try to take=
 the
+                 *   lock and call csched_runq_steal() for no reason. This=
 is
+                 *   not a functional issue, and should be infrequent enou=
gh.
+                 *   And we can avoid that by re-checking nr_runnable afte=
r
+                 *   having grabbed the lock, if we want;
+                 * - if we race with inc_nr_runnable(), we skip a pCPU tha=
t may
+                 *   have runnable vCPUs in its runqueue, but that's not a
+                 *   problem because:
+                 *   + if racing with csched_vcpu_insert() or csched_vcpu_=
wake(),
+                 *     __runq_tickle() will be called afterwords, so the v=
CPU
+                 *     won't get stuck in the runqueue for too long;
+                 *   + if racing with csched_runq_steal(), it may be that =
a
+                 *     vCPU that we could have picked up, stays in a runqu=
eue
+                 *     until someone else tries to steal it again. But thi=
s is
+                 *     no worse than what can happen already (without this
+                 *     optimization), it the pCPU would schedule right aft=
er we
+                 *     have taken the lock, and hence block on it.
+                 */
+                if ( CSCHED_PCPU(peer_cpu)->nr_runnable <=3D 1 )
+                {
+                    TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skipp'n =
*/ 0);
+                    goto next_cpu;
+                }
+
                 /*
                  * Get ahold of the scheduler lock for this peer CPU.
                  *
@@ -1753,14 +1825,13 @@ csched_load_balance(struct csched_private *prv, int=
 cpu,
                  * could cause a deadlock if the peer CPU is also load
                  * balancing and trying to lock this CPU.
                  */
-                spinlock_t *lock =3D pcpu_schedule_trylock(peer_cpu);
+                lock =3D pcpu_schedule_trylock(peer_cpu);
                 SCHED_STAT_CRANK(steal_trylock);
                 if ( !lock )
                 {
                     SCHED_STAT_CRANK(steal_trylock_failed);
                     TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skip */ =
0);
-                    peer_cpu =3D cpumask_cycle(peer_cpu, &workers);
-                    continue;
+                    goto next_cpu;
                 }
=20
                 TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* checked */ 1=
);
@@ -1777,6 +1848,7 @@ csched_load_balance(struct csched_private *prv, int c=
pu,
                     return speer;
                 }
=20
+ next_cpu:
                 peer_cpu =3D cpumask_cycle(peer_cpu, &workers);
=20
             } while( peer_cpu !=3D cpumask_first(&workers) );
@@ -1907,7 +1979,11 @@ csched_schedule(
     if ( vcpu_runnable(current) )
         __runq_insert(scurr);
     else
+    {
         BUG_ON( is_idle_vcpu(current) || list_empty(runq) );
+        /* Current has blocked. Update the runnable counter for this cpu. =
*/
+        dec_nr_runnable(cpu);
+    }
=20
     snext =3D __runq_elem(runq->next);
     ret.migrated =3D 0;
@@ -2024,7 +2100,8 @@ csched_dump_pcpu(const struct scheduler *ops, int cpu=
)
     runq =3D &spc->runq;
=20
     cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_mask, cp=
u));
-    printk("CPU[%02d] sort=3D%d, sibling=3D%s, ", cpu, spc->runq_sort_last=
, cpustr);
+    printk("CPU[%02d] nr_run=3D%d, sort=3D%d, sibling=3D%s, ",
+           cpu, spc->nr_runnable, spc->runq_sort_last, cpustr);
     cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_mask, cpu))=
;
     printk("core=3D%s\n", cpustr);
=20
--=20
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
--=-OeAwU+kutoCBFXw1cwOp
Content-Type: application/pgp-signature; name="signature.asc"
Content-Description: This is a digitally signed message part
Content-Transfer-Encoding: 7bit

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v2

iQIcBAABCAAGBQJY57TuAAoJEBZCeImluHPuh8YP/1+lLgVZdLAzej2MsMUMHpVE
kWpoHoMwM/DuWdrwqOJDtCdjHTLj7Z4GmMuAWU867EqTR5RahoskmIKSdRHsEYi7
hpSS9iaVZsYrhnF9HfnrWjar/pPoVFPYwHvmm3y+R0VyRv44WIUvPzWZqIOVQQzf
q8UfT3ILqLMOpVVnXJsAFW6p81SSslLS0Zwu7c4ZuuIbpagnq9BVEh9lg2iFJDd5
olqJ6LMU3y38sPz96UPFhkxvmw8DbuZFO6fdYIRVwhXet921VTMQ3rOmQ7p5qoPX
5uwcoemOpXMUXAAZyGYCRxDDRoeBM3WWnh7cSNX/t3cXIZ2knxyBzxa1u+blFSW1
RkjcTpiqwRB3T+abJx2yMAkyQtOvj2821UPavOgQqxPYXrFC1FbRs2zZa6/zGLeL
1dV/uCOynOXwRRmUYtXgeNO/kvDRhRn9zomlfNZTmJcaKT1DDqZTedohe2ThAD9V
koXJQrf1Yc33xIEBMBU5fppnCVqLOjyP9L9TwvuVFgaPRPIVoEoqLYvRhV4X7yC2
6SA3BtosLY5aCWTtbyf7JGy3lkHd84hku922nKex9ocLPHRzoDPzxCXtfYdujNud
AO8yMid98cNBFu8YAaUcE/CjJOQPO91Q9wKaz/ld7DLm3pF6Js0pa2o65ySY0X2y
SqcSW+3MYBS+YvynKkz4
=P0zi
-----END PGP SIGNATURE-----

--=-OeAwU+kutoCBFXw1cwOp--


--===============6912804206734789983==
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: base64
Content-Disposition: inline

X19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX18KWGVuLWRldmVs
IG1haWxpbmcgbGlzdApYZW4tZGV2ZWxAbGlzdHMueGVuLm9yZwpodHRwczovL2xpc3RzLnhlbi5v
cmcveGVuLWRldmVsCg==

--===============6912804206734789983==--