From mboxrd@z Thu Jan 1 00:00:00 1970 From: Dario Faggioli Subject: Re: [PATCH v2 5/7] xen: credit1: increase efficiency and scalability of load balancing. Date: Fri, 7 Apr 2017 17:49:02 +0200 Message-ID: <1491580142.3287.20.camel@citrix.com> References: <149146456487.21348.8554211499146017782.stgit@Solace.fritz.box> <149146660655.21348.13071925386790562321.stgit@Solace.fritz.box> <1568f464-29a4-6330-3902-a36f3eb62775@citrix.com> <1491576546.3287.16.camel@citrix.com> <6f4e282c-ee7d-0715-5177-999da533ae46@citrix.com> <1491578776.3287.18.camel@citrix.com> Mime-Version: 1.0 Content-Type: multipart/mixed; boundary="===============6912804206734789983==" Return-path: Received: from mail6.bemta5.messagelabs.com ([195.245.231.135]) by lists.xenproject.org with esmtp (Exim 4.84_2) (envelope-from ) id 1cwW8Q-0003P0-6i for xen-devel@lists.xenproject.org; Fri, 07 Apr 2017 15:49:14 +0000 In-Reply-To: <1491578776.3287.18.camel@citrix.com> List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Errors-To: xen-devel-bounces@lists.xen.org Sender: "Xen-devel" To: George Dunlap , xen-devel@lists.xenproject.org Cc: George Dunlap , Andrew Cooper , Anshul Makkar List-Id: xen-devel@lists.xenproject.org --===============6912804206734789983== Content-Type: multipart/signed; micalg=pgp-sha256; protocol="application/pgp-signature"; boundary="=-OeAwU+kutoCBFXw1cwOp" --=-OeAwU+kutoCBFXw1cwOp Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable On Fri, 2017-04-07 at 17:26 +0200, Dario Faggioli wrote: > On Fri, 2017-04-07 at 16:17 +0100, George Dunlap wrote: > > It seems like trying to sort out most of the refcounting > > inside fo those two functions (perhaps with runq_insert() which did > > reference counting, and __runq_insert() that didn't, or > > something=C2=A0=C2=A0like > > that) would be a better approach. > >=20 > Right. I've tried already, but without success, and I had to stop an > resort to what's in this patch. >=20 > As I said, I am ok with this approach, so I just went for it. I can > try > to think harder at whether it is really possible to do something like > you suggest above... lemme try. >=20 So, how about something like the below? For almost all cases, all it's done inside runqueue remove and insert helpers. There still are two special cases, though, one where I must explicitly call inc_nr_runnable(), the other where I need dec_nr_runnable(). I've added comments trying to explain why that is the case. Let me know. Thanks and Regards, Dario --- diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c index 59b87f7..a0ad167 100644 --- a/xen/common/sched_credit.c +++ b/xen/common/sched_credit.c @@ -172,6 +172,7 @@ struct csched_pcpu { struct timer ticker; unsigned int tick; unsigned int idle_bias; + unsigned int nr_runnable; }; =20 /* @@ -262,9 +263,26 @@ static inline bool_t is_runq_idle(unsigned int cpu) } =20 static inline void +inc_nr_runnable(unsigned int cpu) +{ + ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock)); + CSCHED_PCPU(cpu)->nr_runnable++; + +} + +static inline void +dec_nr_runnable(unsigned int cpu) +{ + ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock)); + CSCHED_PCPU(cpu)->nr_runnable--; + ASSERT(CSCHED_PCPU(cpu)->nr_runnable >=3D 0); +} + +static inline void __runq_insert(struct csched_vcpu *svc) { - const struct list_head * const runq =3D RUNQ(svc->vcpu->processor); + unsigned int cpu =3D svc->vcpu->processor; + const struct list_head * const runq =3D RUNQ(cpu); struct list_head *iter; =20 BUG_ON( __vcpu_on_runq(svc) ); @@ -292,12 +310,25 @@ __runq_insert(struct csched_vcpu *svc) } =20 static inline void +runq_insert(struct csched_vcpu *svc) +{ + __runq_insert(svc); + inc_nr_runnable(svc->vcpu->processor); +} + +static inline void __runq_remove(struct csched_vcpu *svc) { BUG_ON( !__vcpu_on_runq(svc) ); list_del_init(&svc->runq_elem); } =20 +static inline void +runq_remove(struct csched_vcpu *svc) +{ + dec_nr_runnable(svc->vcpu->processor); + __runq_remove(svc); +} =20 #define for_each_csched_balance_step(step) \ for ( (step) =3D 0; (step) <=3D CSCHED_BALANCE_HARD_AFFINITY; (step)++= ) @@ -601,6 +632,7 @@ init_pdata(struct csched_private *prv, struct csched_pc= pu *spc, int cpu) /* Start off idling... */ BUG_ON(!is_idle_vcpu(curr_on_cpu(cpu))); cpumask_set_cpu(cpu, prv->idlers); + spc->nr_runnable =3D 0; } =20 static void @@ -1052,7 +1084,7 @@ csched_vcpu_insert(const struct scheduler *ops, struc= t vcpu *vc) lock =3D vcpu_schedule_lock_irq(vc); =20 if ( !__vcpu_on_runq(svc) && vcpu_runnable(vc) && !vc->is_running ) - __runq_insert(svc); + runq_insert(svc); =20 vcpu_schedule_unlock_irq(lock, vc); =20 @@ -1117,7 +1149,7 @@ csched_vcpu_sleep(const struct scheduler *ops, struct= vcpu *vc) cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); } else if ( __vcpu_on_runq(svc) ) - __runq_remove(svc); + runq_remove(svc); } =20 static void @@ -1177,7 +1209,7 @@ csched_vcpu_wake(const struct scheduler *ops, struct = vcpu *vc) } =20 /* Put the VCPU on the runq and tickle CPUs */ - __runq_insert(svc); + runq_insert(svc); __runq_tickle(svc); } =20 @@ -1679,8 +1711,14 @@ csched_runq_steal(int peer_cpu, int cpu, int pri, in= t balance_step) SCHED_VCPU_STAT_CRANK(speer, migrate_q); SCHED_STAT_CRANK(migrate_queued); WARN_ON(vc->is_urgent); - __runq_remove(speer); + runq_remove(speer); vc->processor =3D cpu; + /* + * speer will start executing directly on cpu, without having = to + * go through runq_insert(). So we must update the runnable co= unt + * for cpu here. + */ + inc_nr_runnable(cpu); return speer; } } @@ -1736,7 +1774,7 @@ csched_load_balance(struct csched_private *prv, int c= pu, peer_node =3D node; do { - /* Find out what the !idle are in this node */ + /* Select the pCPUs in this node that have work we can steal. = */ cpumask_andnot(&workers, online, prv->idlers); cpumask_and(&workers, &workers, &node_to_cpumask(peer_node)); __cpumask_clear_cpu(cpu, &workers); @@ -1746,6 +1784,40 @@ csched_load_balance(struct csched_private *prv, int = cpu, goto next_node; do { + spinlock_t *lock; + + /* + * If there is only one runnable vCPU on peer_cpu, it mean= s + * there's no one to be stolen in its runqueue, so skip it= . + * + * Checking this without holding the lock is racy... But t= hat's + * the whole point of this optimization! + * + * In more details: + * - if we race with dec_nr_runnable(), we may try to take= the + * lock and call csched_runq_steal() for no reason. This= is + * not a functional issue, and should be infrequent enou= gh. + * And we can avoid that by re-checking nr_runnable afte= r + * having grabbed the lock, if we want; + * - if we race with inc_nr_runnable(), we skip a pCPU tha= t may + * have runnable vCPUs in its runqueue, but that's not a + * problem because: + * + if racing with csched_vcpu_insert() or csched_vcpu_= wake(), + * __runq_tickle() will be called afterwords, so the v= CPU + * won't get stuck in the runqueue for too long; + * + if racing with csched_runq_steal(), it may be that = a + * vCPU that we could have picked up, stays in a runqu= eue + * until someone else tries to steal it again. But thi= s is + * no worse than what can happen already (without this + * optimization), it the pCPU would schedule right aft= er we + * have taken the lock, and hence block on it. + */ + if ( CSCHED_PCPU(peer_cpu)->nr_runnable <=3D 1 ) + { + TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skipp'n = */ 0); + goto next_cpu; + } + /* * Get ahold of the scheduler lock for this peer CPU. * @@ -1753,14 +1825,13 @@ csched_load_balance(struct csched_private *prv, int= cpu, * could cause a deadlock if the peer CPU is also load * balancing and trying to lock this CPU. */ - spinlock_t *lock =3D pcpu_schedule_trylock(peer_cpu); + lock =3D pcpu_schedule_trylock(peer_cpu); SCHED_STAT_CRANK(steal_trylock); if ( !lock ) { SCHED_STAT_CRANK(steal_trylock_failed); TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skip */ = 0); - peer_cpu =3D cpumask_cycle(peer_cpu, &workers); - continue; + goto next_cpu; } =20 TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* checked */ 1= ); @@ -1777,6 +1848,7 @@ csched_load_balance(struct csched_private *prv, int c= pu, return speer; } =20 + next_cpu: peer_cpu =3D cpumask_cycle(peer_cpu, &workers); =20 } while( peer_cpu !=3D cpumask_first(&workers) ); @@ -1907,7 +1979,11 @@ csched_schedule( if ( vcpu_runnable(current) ) __runq_insert(scurr); else + { BUG_ON( is_idle_vcpu(current) || list_empty(runq) ); + /* Current has blocked. Update the runnable counter for this cpu. = */ + dec_nr_runnable(cpu); + } =20 snext =3D __runq_elem(runq->next); ret.migrated =3D 0; @@ -2024,7 +2100,8 @@ csched_dump_pcpu(const struct scheduler *ops, int cpu= ) runq =3D &spc->runq; =20 cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_mask, cp= u)); - printk("CPU[%02d] sort=3D%d, sibling=3D%s, ", cpu, spc->runq_sort_last= , cpustr); + printk("CPU[%02d] nr_run=3D%d, sort=3D%d, sibling=3D%s, ", + cpu, spc->nr_runnable, spc->runq_sort_last, cpustr); cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_mask, cpu))= ; printk("core=3D%s\n", cpustr); =20 --=20 <> (Raistlin Majere) ----------------------------------------------------------------- Dario Faggioli, Ph.D, http://about.me/dario.faggioli Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK) --=-OeAwU+kutoCBFXw1cwOp Content-Type: application/pgp-signature; name="signature.asc" Content-Description: This is a digitally signed message part Content-Transfer-Encoding: 7bit -----BEGIN PGP SIGNATURE----- Version: GnuPG v2 iQIcBAABCAAGBQJY57TuAAoJEBZCeImluHPuh8YP/1+lLgVZdLAzej2MsMUMHpVE kWpoHoMwM/DuWdrwqOJDtCdjHTLj7Z4GmMuAWU867EqTR5RahoskmIKSdRHsEYi7 hpSS9iaVZsYrhnF9HfnrWjar/pPoVFPYwHvmm3y+R0VyRv44WIUvPzWZqIOVQQzf q8UfT3ILqLMOpVVnXJsAFW6p81SSslLS0Zwu7c4ZuuIbpagnq9BVEh9lg2iFJDd5 olqJ6LMU3y38sPz96UPFhkxvmw8DbuZFO6fdYIRVwhXet921VTMQ3rOmQ7p5qoPX 5uwcoemOpXMUXAAZyGYCRxDDRoeBM3WWnh7cSNX/t3cXIZ2knxyBzxa1u+blFSW1 RkjcTpiqwRB3T+abJx2yMAkyQtOvj2821UPavOgQqxPYXrFC1FbRs2zZa6/zGLeL 1dV/uCOynOXwRRmUYtXgeNO/kvDRhRn9zomlfNZTmJcaKT1DDqZTedohe2ThAD9V koXJQrf1Yc33xIEBMBU5fppnCVqLOjyP9L9TwvuVFgaPRPIVoEoqLYvRhV4X7yC2 6SA3BtosLY5aCWTtbyf7JGy3lkHd84hku922nKex9ocLPHRzoDPzxCXtfYdujNud AO8yMid98cNBFu8YAaUcE/CjJOQPO91Q9wKaz/ld7DLm3pF6Js0pa2o65ySY0X2y SqcSW+3MYBS+YvynKkz4 =P0zi -----END PGP SIGNATURE----- --=-OeAwU+kutoCBFXw1cwOp-- --===============6912804206734789983== Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: base64 Content-Disposition: inline X19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX18KWGVuLWRldmVs IG1haWxpbmcgbGlzdApYZW4tZGV2ZWxAbGlzdHMueGVuLm9yZwpodHRwczovL2xpc3RzLnhlbi5v cmcveGVuLWRldmVsCg== --===============6912804206734789983==--