From mboxrd@z Thu Jan 1 00:00:00 1970 From: Dario Faggioli Subject: [PATCHv2 1 of 1] Rework locking for sched_adjust. Date: Wed, 07 Dec 2011 16:02:11 +0100 Message-ID: <1323270131.22009.18.camel@Solace> References: <1322060131.30168.15.camel@Abyss> <1322060876.30168.18.camel@Abyss> <1322065473.1005.151.camel@zakaz.uk.xensource.com> <1323269351.22009.15.camel@Solace> Mime-Version: 1.0 Content-Type: multipart/mixed; boundary="===============0144470413239263929==" Return-path: In-Reply-To: <1323269351.22009.15.camel@Solace> List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Sender: xen-devel-bounces@lists.xensource.com Errors-To: xen-devel-bounces@lists.xensource.com To: "xen-devel@lists.xensource.com" Cc: George Dunlap , Juergen Gross , "Keir (Xen.org)" , Ian Campbell List-Id: xen-devel@lists.xenproject.org --===============0144470413239263929== Content-Type: multipart/signed; micalg="pgp-sha1"; protocol="application/pgp-signature"; boundary="=-q3C0RM6deIxjXOsU2o/V" --=-q3C0RM6deIxjXOsU2o/V Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable The main idea is to move (as much as possible) locking logic from generic code to the various pluggable schedulers. While at it, the following is also accomplished: - pausing all the non-current VCPUs of a domain while changing its scheduling parameters is not effective in avoiding races and it is prone to deadlock, so that is removed. - sedf needs a global lock for preventing races while adjusting domains' scheduling parameters (as it is for credit and credit2), so that is added. Signed-off-by: Dario Faggioli diff -r 38eb74c01d9d xen/common/sched_credit.c --- a/xen/common/sched_credit.c Tue Dec 06 21:16:56 2011 +0000 +++ b/xen/common/sched_credit.c Wed Dec 07 15:45:55 2011 +0100 @@ -161,6 +161,7 @@ struct csched_dom { * System-wide private data */ struct csched_private { + /* lock for the whole pluggable scheduler, nests inside cpupool_lock *= / spinlock_t lock; struct list_head active_sdom; uint32_t ncpus; @@ -800,6 +801,10 @@ csched_dom_cntl( struct csched_private *prv =3D CSCHED_PRIV(ops); unsigned long flags; =20 + /* Protect both get and put branches with the pluggable scheduler + * lock. Runq lock not needed anywhere in here. */ + spin_lock_irqsave(&prv->lock, flags); + if ( op->cmd =3D=3D XEN_DOMCTL_SCHEDOP_getinfo ) { op->u.credit.weight =3D sdom->weight; @@ -809,8 +814,6 @@ csched_dom_cntl( { ASSERT(op->cmd =3D=3D XEN_DOMCTL_SCHEDOP_putinfo); =20 - spin_lock_irqsave(&prv->lock, flags); - if ( op->u.credit.weight !=3D 0 ) { if ( !list_empty(&sdom->active_sdom_elem) ) @@ -824,9 +827,10 @@ csched_dom_cntl( if ( op->u.credit.cap !=3D (uint16_t)~0U ) sdom->cap =3D op->u.credit.cap; =20 - spin_unlock_irqrestore(&prv->lock, flags); } =20 + spin_unlock_irqrestore(&prv->lock, flags); + return 0; } =20 diff -r 38eb74c01d9d xen/common/sched_credit2.c --- a/xen/common/sched_credit2.c Tue Dec 06 21:16:56 2011 +0000 +++ b/xen/common/sched_credit2.c Wed Dec 07 15:45:55 2011 +0100 @@ -1384,6 +1384,10 @@ csched_dom_cntl( struct csched_private *prv =3D CSCHED_PRIV(ops); unsigned long flags; =20 + /* Must hold csched_priv lock to read and update sdom, + * runq lock to update csvcs. */ + spin_lock_irqsave(&prv->lock, flags); + if ( op->cmd =3D=3D XEN_DOMCTL_SCHEDOP_getinfo ) { op->u.credit2.weight =3D sdom->weight; @@ -1397,10 +1401,6 @@ csched_dom_cntl( struct list_head *iter; int old_weight; =20 - /* Must hold csched_priv lock to update sdom, runq lock to - * update csvcs. */ - spin_lock_irqsave(&prv->lock, flags); - old_weight =3D sdom->weight; =20 sdom->weight =3D op->u.credit2.weight; @@ -1411,22 +1411,23 @@ csched_dom_cntl( struct csched_vcpu *svc =3D list_entry(iter, struct csched= _vcpu, sdom_elem); =20 /* NB: Locking order is important here. Because we grab t= his lock here, we - * must never lock csched_priv.lock if we're holding a run= queue - * lock. */ - vcpu_schedule_lock_irq(svc->vcpu); + * must never lock csched_priv.lock if we're holding a run= queue lock. + * Also, calling vcpu_schedule_lock() is enough, since IRQ= s have already + * been disabled. */ + vcpu_schedule_lock(svc->vcpu); =20 BUG_ON(svc->rqd !=3D RQD(ops, svc->vcpu->processor)); =20 svc->weight =3D sdom->weight; update_max_weight(svc->rqd, svc->weight, old_weight); =20 - vcpu_schedule_unlock_irq(svc->vcpu); + vcpu_schedule_unlock(svc->vcpu); } - - spin_unlock_irqrestore(&prv->lock, flags); } } =20 + spin_unlock_irqrestore(&prv->lock, flags); + return 0; } =20 diff -r 38eb74c01d9d xen/common/sched_sedf.c --- a/xen/common/sched_sedf.c Tue Dec 06 21:16:56 2011 +0000 +++ b/xen/common/sched_sedf.c Wed Dec 07 15:45:55 2011 +0100 @@ -61,6 +61,11 @@ struct sedf_dom_info { struct domain *domain; }; =20 +struct sedf_priv_info { + /* lock for the whole pluggable scheduler, nests inside cpupool_lock *= / + spinlock_t lock; +}; + struct sedf_vcpu_info { struct vcpu *vcpu; struct list_head list; @@ -115,6 +120,8 @@ struct sedf_cpu_info { s_time_t current_slice_expires; }; =20 +#define SEDF_PRIV(_ops) \ + ((struct sedf_priv_info *)((_ops)->sched_data)) #define EDOM_INFO(d) ((struct sedf_vcpu_info *)((d)->sched_priv)) #define CPU_INFO(cpu) \ ((struct sedf_cpu_info *)per_cpu(schedule_data, cpu).sched_priv) @@ -772,6 +779,31 @@ static struct task_slice sedf_do_extra_s } =20 =20 +static int sedf_init(struct scheduler *ops) +{ + struct sedf_priv_info *prv; + + prv =3D xzalloc(struct sedf_priv_info); + if ( prv =3D=3D NULL ) + return -ENOMEM; + + ops->sched_data =3D prv; + spin_lock_init(&prv->lock); + + return 0; +} + + +static void sedf_deinit(const struct scheduler *ops) +{ + struct sedf_priv_info *prv; + + prv =3D SEDF_PRIV(ops); + if ( prv !=3D NULL ) + xfree(prv); +} + + /* Main scheduling function Reasons for calling this function are: -timeslice for the current period used up @@ -1320,22 +1352,15 @@ static void sedf_dump_cpu_state(const st =20 =20 /* Adjusts periods and slices of the domains accordingly to their weights.= */ -static int sedf_adjust_weights(struct cpupool *c, struct xen_domctl_schedu= ler_op *cmd) +static int sedf_adjust_weights(struct cpupool *c, int nr_cpus, int *sumw, = s_time_t *sumt) { struct vcpu *p; struct domain *d; - unsigned int cpu, nr_cpus =3D cpumask_last(&cpu_online_map) + 1= ; - int *sumw =3D xzalloc_array(int, nr_cpus); - s_time_t *sumt =3D xzalloc_array(s_time_t, nr_cpus); + unsigned int cpu; =20 - if ( !sumw || !sumt ) - { - xfree(sumt); - xfree(sumw); - return -ENOMEM; - } - - /* Sum across all weights. */ + /* Sum across all weights. Notice that no runq locking is needed + * here: the caller holds sedf_priv_info.lock and we're not changing + * anything that is accessed during scheduling. */ rcu_read_lock(&domlist_read_lock); for_each_domain_in_cpupool( d, c ) { @@ -1365,7 +1390,9 @@ static int sedf_adjust_weights(struct cp } rcu_read_unlock(&domlist_read_lock); =20 - /* Adjust all slices (and periods) to the new weight. */ + /* Adjust all slices (and periods) to the new weight. Unlike above, we + * need to take thr runq lock for the various VCPUs: we're modyfing + * slice and period which are referenced during scheduling. */ rcu_read_lock(&domlist_read_lock); for_each_domain_in_cpupool( d, c ) { @@ -1375,20 +1402,20 @@ static int sedf_adjust_weights(struct cp continue; if ( EDOM_INFO(p)->weight ) { + /* Interrupts already off */ + vcpu_schedule_lock(p); EDOM_INFO(p)->period_orig =3D=20 EDOM_INFO(p)->period =3D WEIGHT_PERIOD; EDOM_INFO(p)->slice_orig =3D EDOM_INFO(p)->slice =3D=20 (EDOM_INFO(p)->weight * (WEIGHT_PERIOD - WEIGHT_SAFETY - sumt[cpu])) / sumw[c= pu]; + vcpu_schedule_unlock(p); } } } rcu_read_unlock(&domlist_read_lock); =20 - xfree(sumt); - xfree(sumw); - return 0; } =20 @@ -1396,19 +1423,45 @@ static int sedf_adjust_weights(struct cp /* set or fetch domain scheduling parameters */ static int sedf_adjust(const struct scheduler *ops, struct domain *p, stru= ct xen_domctl_scheduler_op *op) { + struct sedf_priv_info *prv =3D SEDF_PRIV(ops); + unsigned long flags; + unsigned int nr_cpus =3D cpumask_last(&cpu_online_map) + 1; + int *sumw =3D xzalloc_array(int, nr_cpus); + s_time_t *sumt =3D xzalloc_array(s_time_t, nr_cpus); struct vcpu *v; - int rc; + int rc =3D 0; =20 PRINT(2,"sedf_adjust was called, domain-id %i new period %"PRIu64" " "new slice %"PRIu64"\nlatency %"PRIu64" extra:%s\n", p->domain_id, op->u.sedf.period, op->u.sedf.slice, op->u.sedf.latency, (op->u.sedf.extratime)?"yes":"no"); =20 + /* Serialize against the pluggable scheduler lock to protect from + * concurrent updates. We need to take the runq lock for the VCPUs + * as well, since we are touching extraweight, weight, slice and + * period. As in sched_credit2.c, runq locks nest inside the + * pluggable scheduler lock. */ + spin_lock_irqsave(&prv->lock, flags); + if ( op->cmd =3D=3D XEN_DOMCTL_SCHEDOP_putinfo ) { + /* These are used in sedf_adjust_weights() but have to be allocate= d in + * this function, as we need to avoid nesting xmem_pool_alloc's lo= ck + * within our prv->lock. */ + if ( !sumw || !sumt ) + { + /* Check for errors here, the _getinfo branch doesn't care */ + rc =3D -ENOMEM; + goto out; + } + /* Check for sane parameters. */ if ( !op->u.sedf.period && !op->u.sedf.weight ) - return -EINVAL; + { + rc =3D -EINVAL; + goto out; + } + if ( op->u.sedf.weight ) { if ( (op->u.sedf.extratime & EXTRA_AWARE) && @@ -1417,59 +1470,78 @@ static int sedf_adjust(const struct sche /* Weight-driven domains with extratime only. */ for_each_vcpu ( p, v ) { + /* (Here and everywhere in the following) IRQs are alr= eady off, + * hence vcpu_spin_lock() is the one. */ + vcpu_schedule_lock(v); EDOM_INFO(v)->extraweight =3D op->u.sedf.weight; EDOM_INFO(v)->weight =3D 0; EDOM_INFO(v)->slice =3D 0; EDOM_INFO(v)->period =3D WEIGHT_PERIOD; + vcpu_schedule_unlock(v); } } else { /* Weight-driven domains with real-time execution. */ - for_each_vcpu ( p, v ) + for_each_vcpu ( p, v ) { + vcpu_schedule_lock(v); EDOM_INFO(v)->weight =3D op->u.sedf.weight; + vcpu_schedule_unlock(v); + } } } else { + /* + * Sanity checking: note that disabling extra weight requires + * that we set a non-zero slice. + */ + if ( (op->u.sedf.period > PERIOD_MAX) || + (op->u.sedf.period < PERIOD_MIN) || + (op->u.sedf.slice > op->u.sedf.period) || + (op->u.sedf.slice < SLICE_MIN) ) + { + rc =3D -EINVAL; + goto out; + } + /* Time-driven domains. */ for_each_vcpu ( p, v ) { - /* - * Sanity checking: note that disabling extra weight requi= res - * that we set a non-zero slice. - */ - if ( (op->u.sedf.period > PERIOD_MAX) || - (op->u.sedf.period < PERIOD_MIN) || - (op->u.sedf.slice > op->u.sedf.period) || - (op->u.sedf.slice < SLICE_MIN) ) - return -EINVAL; + vcpu_schedule_lock(v); EDOM_INFO(v)->weight =3D 0; EDOM_INFO(v)->extraweight =3D 0; EDOM_INFO(v)->period_orig =3D=20 EDOM_INFO(v)->period =3D op->u.sedf.period; EDOM_INFO(v)->slice_orig =3D=20 EDOM_INFO(v)->slice =3D op->u.sedf.slice; + vcpu_schedule_unlock(v); } } =20 - rc =3D sedf_adjust_weights(p->cpupool, op); + rc =3D sedf_adjust_weights(p->cpupool, nr_cpus, sumw, sumt); if ( rc ) - return rc; + goto out; =20 for_each_vcpu ( p, v ) { + vcpu_schedule_lock(v); EDOM_INFO(v)->status =3D=20 (EDOM_INFO(v)->status & ~EXTRA_AWARE) | (op->u.sedf.extratime & EXTRA_AWARE); EDOM_INFO(v)->latency =3D op->u.sedf.latency; extraq_check(v); + vcpu_schedule_unlock(v); } } else if ( op->cmd =3D=3D XEN_DOMCTL_SCHEDOP_getinfo ) { if ( p->vcpu[0] =3D=3D NULL ) - return -EINVAL; + { + rc =3D -EINVAL; + goto out; + } + op->u.sedf.period =3D EDOM_INFO(p->vcpu[0])->period; op->u.sedf.slice =3D EDOM_INFO(p->vcpu[0])->slice; op->u.sedf.extratime =3D EDOM_INFO(p->vcpu[0])->status & EXTRA_AWA= RE; @@ -1477,14 +1549,23 @@ static int sedf_adjust(const struct sche op->u.sedf.weight =3D EDOM_INFO(p->vcpu[0])->weight; } =20 - PRINT(2,"sedf_adjust_finished\n"); - return 0; +out: + spin_unlock_irqrestore(&prv->lock, flags); + + xfree(sumt); + xfree(sumw); + + PRINT(2,"sedf_adjust_finished with return code %d\n", rc); + return rc; } =20 +static struct sedf_priv_info _sedf_priv; + const struct scheduler sched_sedf_def =3D { - .name =3D "Simple EDF Scheduler", - .opt_name =3D "sedf", - .sched_id =3D XEN_SCHEDULER_SEDF, + .name =3D "Simple EDF Scheduler", + .opt_name =3D "sedf", + .sched_id =3D XEN_SCHEDULER_SEDF, + .sched_data =3D &_sedf_priv, =20 .init_domain =3D sedf_init_domain, .destroy_domain =3D sedf_destroy_domain, @@ -1498,6 +1579,9 @@ const struct scheduler sched_sedf_def =3D=20 .alloc_domdata =3D sedf_alloc_domdata, .free_domdata =3D sedf_free_domdata, =20 + .init =3D sedf_init, + .deinit =3D sedf_deinit, + .do_schedule =3D sedf_do_schedule, .pick_cpu =3D sedf_pick_cpu, .dump_cpu_state =3D sedf_dump_cpu_state, diff -r 38eb74c01d9d xen/common/schedule.c --- a/xen/common/schedule.c Tue Dec 06 21:16:56 2011 +0000 +++ b/xen/common/schedule.c Wed Dec 07 15:45:55 2011 +0100 @@ -1005,7 +1005,6 @@ int sched_id(void) /* Adjust scheduling parameter for a given domain. */ long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op) { - struct vcpu *v; long ret; =20 if ( (op->sched_id !=3D DOM2OP(d)->sched_id) || @@ -1013,40 +1012,11 @@ long sched_adjust(struct domain *d, stru (op->cmd !=3D XEN_DOMCTL_SCHEDOP_getinfo)) ) return -EINVAL; =20 - /* - * Most VCPUs we can simply pause. If we are adjusting this VCPU then - * we acquire the local schedule_lock to guard against concurrent upda= tes. - * - * We only acquire the local schedule lock after we have paused all ot= her - * VCPUs in this domain. There are two reasons for this: - * 1- We don't want to hold up interrupts as pausing a VCPU can - * trigger a tlb shootdown. - * 2- Pausing other VCPUs involves briefly locking the schedule - * lock of the CPU they are running on. This CPU could be the - * same as ours. - */ - - for_each_vcpu ( d, v ) - { - if ( v !=3D current ) - vcpu_pause(v); - } - - if ( d =3D=3D current->domain ) - vcpu_schedule_lock_irq(current); - + /* NB: the pluggable scheduler code needs to take care + * of locking by itself. */ if ( (ret =3D SCHED_OP(DOM2OP(d), adjust, d, op)) =3D=3D 0 ) TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id); =20 - if ( d =3D=3D current->domain ) - vcpu_schedule_unlock_irq(current); - - for_each_vcpu ( d, v ) - { - if ( v !=3D current ) - vcpu_unpause(v); - } - return ret; } =20 --=20 <> (Raistlin Majere) ------------------------------------------------------------------- Dario Faggioli, http://retis.sssup.it/people/faggioli Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK) PhD Candidate, ReTiS Lab, Scuola Superiore Sant'Anna, Pisa (Italy) --=-q3C0RM6deIxjXOsU2o/V Content-Type: application/pgp-signature; name="signature.asc" Content-Description: This is a digitally signed message part Content-Transfer-Encoding: 7bit -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.4.11 (GNU/Linux) iEYEABECAAYFAk7ff/MACgkQk4XaBE3IOsTMmQCfba0ErG7KDO1WJzflbOy289Z2 mZQAn01E9lHFuo6UD367Vv4Me2rW0fzz =raBP -----END PGP SIGNATURE----- --=-q3C0RM6deIxjXOsU2o/V-- --===============0144470413239263929== Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Content-Disposition: inline _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel --===============0144470413239263929==--