From mboxrd@z Thu Jan  1 00:00:00 1970
From: Dario Faggioli <raistlin@linux.it>
Subject: [PATCHv2 1 of 1] Rework locking for sched_adjust.
Date: Wed, 07 Dec 2011 16:02:11 +0100
Message-ID: <1323270131.22009.18.camel@Solace>
References: <1322060131.30168.15.camel@Abyss> <1322060876.30168.18.camel@Abyss>
	<1322065473.1005.151.camel@zakaz.uk.xensource.com>
	<1323269351.22009.15.camel@Solace>
Mime-Version: 1.0
Content-Type: multipart/mixed; boundary="===============0144470413239263929=="
Return-path: <xen-devel-bounces@lists.xensource.com>
In-Reply-To: <1323269351.22009.15.camel@Solace>
List-Unsubscribe: <http://lists.xensource.com/mailman/options/xen-devel>,
	<mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
List-Post: <mailto:xen-devel@lists.xensource.com>
List-Help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-Subscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>,
	<mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
Sender: xen-devel-bounces@lists.xensource.com
Errors-To: xen-devel-bounces@lists.xensource.com
To: "xen-devel@lists.xensource.com" <xen-devel@lists.xensource.com>
Cc: George Dunlap <George.Dunlap@eu.citrix.com>, Juergen Gross <juergen.gross@ts.fujitsu.com>, "Keir (Xen.org)" <keir@xen.org>, Ian Campbell <Ian.Campbell@citrix.com>
List-Id: xen-devel@lists.xenproject.org


--===============0144470413239263929==
Content-Type: multipart/signed; micalg="pgp-sha1"; protocol="application/pgp-signature";
	boundary="=-q3C0RM6deIxjXOsU2o/V"


--=-q3C0RM6deIxjXOsU2o/V
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable

The main idea is to move (as much as possible) locking logic
from generic code to the various pluggable schedulers.

While at it, the following is also accomplished:
 - pausing all the non-current VCPUs of a domain while changing its
   scheduling parameters is not effective in avoiding races and it is
   prone to deadlock, so that is removed.
 - sedf needs a global lock for preventing races while adjusting
   domains' scheduling parameters (as it is for credit and credit2),
   so that is added.

Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>

diff -r 38eb74c01d9d xen/common/sched_credit.c
--- a/xen/common/sched_credit.c	Tue Dec 06 21:16:56 2011 +0000
+++ b/xen/common/sched_credit.c	Wed Dec 07 15:45:55 2011 +0100
@@ -161,6 +161,7 @@ struct csched_dom {
  * System-wide private data
  */
 struct csched_private {
+    /* lock for the whole pluggable scheduler, nests inside cpupool_lock *=
/
     spinlock_t lock;
     struct list_head active_sdom;
     uint32_t ncpus;
@@ -800,6 +801,10 @@ csched_dom_cntl(
     struct csched_private *prv =3D CSCHED_PRIV(ops);
     unsigned long flags;
=20
+    /* Protect both get and put branches with the pluggable scheduler
+     * lock. Runq lock not needed anywhere in here. */
+    spin_lock_irqsave(&prv->lock, flags);
+
     if ( op->cmd =3D=3D XEN_DOMCTL_SCHEDOP_getinfo )
     {
         op->u.credit.weight =3D sdom->weight;
@@ -809,8 +814,6 @@ csched_dom_cntl(
     {
         ASSERT(op->cmd =3D=3D XEN_DOMCTL_SCHEDOP_putinfo);
=20
-        spin_lock_irqsave(&prv->lock, flags);
-
         if ( op->u.credit.weight !=3D 0 )
         {
             if ( !list_empty(&sdom->active_sdom_elem) )
@@ -824,9 +827,10 @@ csched_dom_cntl(
         if ( op->u.credit.cap !=3D (uint16_t)~0U )
             sdom->cap =3D op->u.credit.cap;
=20
-        spin_unlock_irqrestore(&prv->lock, flags);
     }
=20
+    spin_unlock_irqrestore(&prv->lock, flags);
+
     return 0;
 }
=20
diff -r 38eb74c01d9d xen/common/sched_credit2.c
--- a/xen/common/sched_credit2.c	Tue Dec 06 21:16:56 2011 +0000
+++ b/xen/common/sched_credit2.c	Wed Dec 07 15:45:55 2011 +0100
@@ -1384,6 +1384,10 @@ csched_dom_cntl(
     struct csched_private *prv =3D CSCHED_PRIV(ops);
     unsigned long flags;
=20
+    /* Must hold csched_priv lock to read and update sdom,
+     * runq lock to update csvcs. */
+    spin_lock_irqsave(&prv->lock, flags);
+
     if ( op->cmd =3D=3D XEN_DOMCTL_SCHEDOP_getinfo )
     {
         op->u.credit2.weight =3D sdom->weight;
@@ -1397,10 +1401,6 @@ csched_dom_cntl(
             struct list_head *iter;
             int old_weight;
=20
-            /* Must hold csched_priv lock to update sdom, runq lock to
-             * update csvcs. */
-            spin_lock_irqsave(&prv->lock, flags);
-
             old_weight =3D sdom->weight;
=20
             sdom->weight =3D op->u.credit2.weight;
@@ -1411,22 +1411,23 @@ csched_dom_cntl(
                 struct csched_vcpu *svc =3D list_entry(iter, struct csched=
_vcpu, sdom_elem);
=20
                 /* NB: Locking order is important here.  Because we grab t=
his lock here, we
-                 * must never lock csched_priv.lock if we're holding a run=
queue
-                 * lock. */
-                vcpu_schedule_lock_irq(svc->vcpu);
+                 * must never lock csched_priv.lock if we're holding a run=
queue lock.
+                 * Also, calling vcpu_schedule_lock() is enough, since IRQ=
s have already
+                 * been disabled. */
+                vcpu_schedule_lock(svc->vcpu);
=20
                 BUG_ON(svc->rqd !=3D RQD(ops, svc->vcpu->processor));
=20
                 svc->weight =3D sdom->weight;
                 update_max_weight(svc->rqd, svc->weight, old_weight);
=20
-                vcpu_schedule_unlock_irq(svc->vcpu);
+                vcpu_schedule_unlock(svc->vcpu);
             }
-
-            spin_unlock_irqrestore(&prv->lock, flags);
         }
     }
=20
+    spin_unlock_irqrestore(&prv->lock, flags);
+
     return 0;
 }
=20
diff -r 38eb74c01d9d xen/common/sched_sedf.c
--- a/xen/common/sched_sedf.c	Tue Dec 06 21:16:56 2011 +0000
+++ b/xen/common/sched_sedf.c	Wed Dec 07 15:45:55 2011 +0100
@@ -61,6 +61,11 @@ struct sedf_dom_info {
     struct domain  *domain;
 };
=20
+struct sedf_priv_info {
+    /* lock for the whole pluggable scheduler, nests inside cpupool_lock *=
/
+    spinlock_t lock;
+};
+
 struct sedf_vcpu_info {
     struct vcpu *vcpu;
     struct list_head list;
@@ -115,6 +120,8 @@ struct sedf_cpu_info {
     s_time_t         current_slice_expires;
 };
=20
+#define SEDF_PRIV(_ops) \
+    ((struct sedf_priv_info *)((_ops)->sched_data))
 #define EDOM_INFO(d)   ((struct sedf_vcpu_info *)((d)->sched_priv))
 #define CPU_INFO(cpu)  \
     ((struct sedf_cpu_info *)per_cpu(schedule_data, cpu).sched_priv)
@@ -772,6 +779,31 @@ static struct task_slice sedf_do_extra_s
 }
=20
=20
+static int sedf_init(struct scheduler *ops)
+{
+    struct sedf_priv_info *prv;
+
+    prv =3D xzalloc(struct sedf_priv_info);
+    if ( prv =3D=3D NULL )
+        return -ENOMEM;
+
+    ops->sched_data =3D prv;
+    spin_lock_init(&prv->lock);
+
+    return 0;
+}
+
+
+static void sedf_deinit(const struct scheduler *ops)
+{
+    struct sedf_priv_info *prv;
+
+    prv =3D SEDF_PRIV(ops);
+    if ( prv !=3D NULL )
+        xfree(prv);
+}
+
+
 /* Main scheduling function
    Reasons for calling this function are:
    -timeslice for the current period used up
@@ -1320,22 +1352,15 @@ static void sedf_dump_cpu_state(const st
=20
=20
 /* Adjusts periods and slices of the domains accordingly to their weights.=
 */
-static int sedf_adjust_weights(struct cpupool *c, struct xen_domctl_schedu=
ler_op *cmd)
+static int sedf_adjust_weights(struct cpupool *c, int nr_cpus, int *sumw, =
s_time_t *sumt)
 {
     struct vcpu *p;
     struct domain      *d;
-    unsigned int        cpu, nr_cpus =3D cpumask_last(&cpu_online_map) + 1=
;
-    int                *sumw =3D xzalloc_array(int, nr_cpus);
-    s_time_t           *sumt =3D xzalloc_array(s_time_t, nr_cpus);
+    unsigned int        cpu;
=20
-    if ( !sumw || !sumt )
-    {
-        xfree(sumt);
-        xfree(sumw);
-        return -ENOMEM;
-    }
-
-    /* Sum across all weights. */
+    /* Sum across all weights. Notice that no runq locking is needed
+     * here: the caller holds sedf_priv_info.lock and we're not changing
+     * anything that is accessed during scheduling. */
     rcu_read_lock(&domlist_read_lock);
     for_each_domain_in_cpupool( d, c )
     {
@@ -1365,7 +1390,9 @@ static int sedf_adjust_weights(struct cp
     }
     rcu_read_unlock(&domlist_read_lock);
=20
-    /* Adjust all slices (and periods) to the new weight. */
+    /* Adjust all slices (and periods) to the new weight. Unlike above, we
+     * need to take thr runq lock for the various VCPUs: we're modyfing
+     * slice and period which are referenced during scheduling. */
     rcu_read_lock(&domlist_read_lock);
     for_each_domain_in_cpupool( d, c )
     {
@@ -1375,20 +1402,20 @@ static int sedf_adjust_weights(struct cp
                 continue;
             if ( EDOM_INFO(p)->weight )
             {
+                /* Interrupts already off */
+                vcpu_schedule_lock(p);
                 EDOM_INFO(p)->period_orig =3D=20
                     EDOM_INFO(p)->period  =3D WEIGHT_PERIOD;
                 EDOM_INFO(p)->slice_orig  =3D
                     EDOM_INFO(p)->slice   =3D=20
                     (EDOM_INFO(p)->weight *
                      (WEIGHT_PERIOD - WEIGHT_SAFETY - sumt[cpu])) / sumw[c=
pu];
+                vcpu_schedule_unlock(p);
             }
         }
     }
     rcu_read_unlock(&domlist_read_lock);
=20
-    xfree(sumt);
-    xfree(sumw);
-
     return 0;
 }
=20
@@ -1396,19 +1423,45 @@ static int sedf_adjust_weights(struct cp
 /* set or fetch domain scheduling parameters */
 static int sedf_adjust(const struct scheduler *ops, struct domain *p, stru=
ct xen_domctl_scheduler_op *op)
 {
+    struct sedf_priv_info *prv =3D SEDF_PRIV(ops);
+    unsigned long flags;
+    unsigned int nr_cpus =3D cpumask_last(&cpu_online_map) + 1;
+    int *sumw =3D xzalloc_array(int, nr_cpus);
+    s_time_t *sumt =3D xzalloc_array(s_time_t, nr_cpus);
     struct vcpu *v;
-    int rc;
+    int rc =3D 0;
=20
     PRINT(2,"sedf_adjust was called, domain-id %i new period %"PRIu64" "
           "new slice %"PRIu64"\nlatency %"PRIu64" extra:%s\n",
           p->domain_id, op->u.sedf.period, op->u.sedf.slice,
           op->u.sedf.latency, (op->u.sedf.extratime)?"yes":"no");
=20
+    /* Serialize against the pluggable scheduler lock to protect from
+     * concurrent updates. We need to take the runq lock for the VCPUs
+     * as well, since we are touching extraweight, weight, slice and
+     * period. As in sched_credit2.c, runq locks nest inside the
+     * pluggable scheduler lock. */
+    spin_lock_irqsave(&prv->lock, flags);
+
     if ( op->cmd =3D=3D XEN_DOMCTL_SCHEDOP_putinfo )
     {
+        /* These are used in sedf_adjust_weights() but have to be allocate=
d in
+         * this function, as we need to avoid nesting xmem_pool_alloc's lo=
ck
+         * within our prv->lock. */
+        if ( !sumw || !sumt )
+        {
+            /* Check for errors here, the _getinfo branch doesn't care */
+            rc =3D -ENOMEM;
+            goto out;
+        }
+
         /* Check for sane parameters. */
         if ( !op->u.sedf.period && !op->u.sedf.weight )
-            return -EINVAL;
+        {
+            rc =3D -EINVAL;
+            goto out;
+        }
+
         if ( op->u.sedf.weight )
         {
             if ( (op->u.sedf.extratime & EXTRA_AWARE) &&
@@ -1417,59 +1470,78 @@ static int sedf_adjust(const struct sche
                 /* Weight-driven domains with extratime only. */
                 for_each_vcpu ( p, v )
                 {
+                    /* (Here and everywhere in the following) IRQs are alr=
eady off,
+                     * hence vcpu_spin_lock() is the one. */
+                    vcpu_schedule_lock(v);
                     EDOM_INFO(v)->extraweight =3D op->u.sedf.weight;
                     EDOM_INFO(v)->weight =3D 0;
                     EDOM_INFO(v)->slice =3D 0;
                     EDOM_INFO(v)->period =3D WEIGHT_PERIOD;
+                    vcpu_schedule_unlock(v);
                 }
             }
             else
             {
                 /* Weight-driven domains with real-time execution. */
-                for_each_vcpu ( p, v )
+                for_each_vcpu ( p, v ) {
+                    vcpu_schedule_lock(v);
                     EDOM_INFO(v)->weight =3D op->u.sedf.weight;
+                    vcpu_schedule_unlock(v);
+                }
             }
         }
         else
         {
+            /*
+             * Sanity checking: note that disabling extra weight requires
+             * that we set a non-zero slice.
+             */
+            if ( (op->u.sedf.period > PERIOD_MAX) ||
+                 (op->u.sedf.period < PERIOD_MIN) ||
+                 (op->u.sedf.slice  > op->u.sedf.period) ||
+                 (op->u.sedf.slice  < SLICE_MIN) )
+            {
+                rc =3D -EINVAL;
+                goto out;
+            }
+
             /* Time-driven domains. */
             for_each_vcpu ( p, v )
             {
-                /*
-                 * Sanity checking: note that disabling extra weight requi=
res
-                 * that we set a non-zero slice.
-                 */
-                if ( (op->u.sedf.period > PERIOD_MAX) ||
-                     (op->u.sedf.period < PERIOD_MIN) ||
-                     (op->u.sedf.slice  > op->u.sedf.period) ||
-                     (op->u.sedf.slice  < SLICE_MIN) )
-                    return -EINVAL;
+                vcpu_schedule_lock(v);
                 EDOM_INFO(v)->weight =3D 0;
                 EDOM_INFO(v)->extraweight =3D 0;
                 EDOM_INFO(v)->period_orig =3D=20
                     EDOM_INFO(v)->period  =3D op->u.sedf.period;
                 EDOM_INFO(v)->slice_orig  =3D=20
                     EDOM_INFO(v)->slice   =3D op->u.sedf.slice;
+                vcpu_schedule_unlock(v);
             }
         }
=20
-        rc =3D sedf_adjust_weights(p->cpupool, op);
+        rc =3D sedf_adjust_weights(p->cpupool, nr_cpus, sumw, sumt);
         if ( rc )
-            return rc;
+            goto out;
=20
         for_each_vcpu ( p, v )
         {
+            vcpu_schedule_lock(v);
             EDOM_INFO(v)->status  =3D=20
                 (EDOM_INFO(v)->status &
                  ~EXTRA_AWARE) | (op->u.sedf.extratime & EXTRA_AWARE);
             EDOM_INFO(v)->latency =3D op->u.sedf.latency;
             extraq_check(v);
+            vcpu_schedule_unlock(v);
         }
     }
     else if ( op->cmd =3D=3D XEN_DOMCTL_SCHEDOP_getinfo )
     {
         if ( p->vcpu[0] =3D=3D NULL )
-            return -EINVAL;
+        {
+            rc =3D -EINVAL;
+            goto out;
+        }
+
         op->u.sedf.period    =3D EDOM_INFO(p->vcpu[0])->period;
         op->u.sedf.slice     =3D EDOM_INFO(p->vcpu[0])->slice;
         op->u.sedf.extratime =3D EDOM_INFO(p->vcpu[0])->status & EXTRA_AWA=
RE;
@@ -1477,14 +1549,23 @@ static int sedf_adjust(const struct sche
         op->u.sedf.weight    =3D EDOM_INFO(p->vcpu[0])->weight;
     }
=20
-    PRINT(2,"sedf_adjust_finished\n");
-    return 0;
+out:
+    spin_unlock_irqrestore(&prv->lock, flags);
+
+    xfree(sumt);
+    xfree(sumw);
+
+    PRINT(2,"sedf_adjust_finished with return code %d\n", rc);
+    return rc;
 }
=20
+static struct sedf_priv_info _sedf_priv;
+
 const struct scheduler sched_sedf_def =3D {
-    .name     =3D "Simple EDF Scheduler",
-    .opt_name =3D "sedf",
-    .sched_id =3D XEN_SCHEDULER_SEDF,
+    .name           =3D "Simple EDF Scheduler",
+    .opt_name       =3D "sedf",
+    .sched_id       =3D XEN_SCHEDULER_SEDF,
+    .sched_data     =3D &_sedf_priv,
    =20
     .init_domain    =3D sedf_init_domain,
     .destroy_domain =3D sedf_destroy_domain,
@@ -1498,6 +1579,9 @@ const struct scheduler sched_sedf_def =3D=20
     .alloc_domdata  =3D sedf_alloc_domdata,
     .free_domdata   =3D sedf_free_domdata,
=20
+    .init           =3D sedf_init,
+    .deinit         =3D sedf_deinit,
+
     .do_schedule    =3D sedf_do_schedule,
     .pick_cpu       =3D sedf_pick_cpu,
     .dump_cpu_state =3D sedf_dump_cpu_state,
diff -r 38eb74c01d9d xen/common/schedule.c
--- a/xen/common/schedule.c	Tue Dec 06 21:16:56 2011 +0000
+++ b/xen/common/schedule.c	Wed Dec 07 15:45:55 2011 +0100
@@ -1005,7 +1005,6 @@ int sched_id(void)
 /* Adjust scheduling parameter for a given domain. */
 long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op)
 {
-    struct vcpu *v;
     long ret;
    =20
     if ( (op->sched_id !=3D DOM2OP(d)->sched_id) ||
@@ -1013,40 +1012,11 @@ long sched_adjust(struct domain *d, stru
           (op->cmd !=3D XEN_DOMCTL_SCHEDOP_getinfo)) )
         return -EINVAL;
=20
-    /*
-     * Most VCPUs we can simply pause. If we are adjusting this VCPU then
-     * we acquire the local schedule_lock to guard against concurrent upda=
tes.
-     *
-     * We only acquire the local schedule lock after we have paused all ot=
her
-     * VCPUs in this domain. There are two reasons for this:
-     * 1- We don't want to hold up interrupts as pausing a VCPU can
-     *    trigger a tlb shootdown.
-     * 2- Pausing other VCPUs involves briefly locking the schedule
-     *    lock of the CPU they are running on. This CPU could be the
-     *    same as ours.
-     */
-
-    for_each_vcpu ( d, v )
-    {
-        if ( v !=3D current )
-            vcpu_pause(v);
-    }
-
-    if ( d =3D=3D current->domain )
-        vcpu_schedule_lock_irq(current);
-
+    /* NB: the pluggable scheduler code needs to take care
+     * of locking by itself. */
     if ( (ret =3D SCHED_OP(DOM2OP(d), adjust, d, op)) =3D=3D 0 )
         TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);
=20
-    if ( d =3D=3D current->domain )
-        vcpu_schedule_unlock_irq(current);
-
-    for_each_vcpu ( d, v )
-    {
-        if ( v !=3D current )
-            vcpu_unpause(v);
-    }
-
     return ret;
 }
=20

--=20
<<This happens because I choose it to happen!>> (Raistlin Majere)
-------------------------------------------------------------------
Dario Faggioli, http://retis.sssup.it/people/faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
PhD Candidate, ReTiS Lab, Scuola Superiore Sant'Anna, Pisa (Italy)


--=-q3C0RM6deIxjXOsU2o/V
Content-Type: application/pgp-signature; name="signature.asc"
Content-Description: This is a digitally signed message part
Content-Transfer-Encoding: 7bit

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.11 (GNU/Linux)

iEYEABECAAYFAk7ff/MACgkQk4XaBE3IOsTMmQCfba0ErG7KDO1WJzflbOy289Z2
mZQAn01E9lHFuo6UD367Vv4Me2rW0fzz
=raBP
-----END PGP SIGNATURE-----

--=-q3C0RM6deIxjXOsU2o/V--


--===============0144470413239263929==
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Disposition: inline

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

--===============0144470413239263929==--